LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* Re: [PATCH v5 1/4] riscv: Move kernel mapping to vmalloc zone
From: Palmer Dabbelt @ 2020-07-09  5:05 UTC (permalink / raw)
  To: alex
  Cc: aou, alex, Anup Patel, linux-kernel, Atish Patra, paulus, zong.li,
	Paul Walmsley, linux-riscv, linuxppc-dev
In-Reply-To: <20200607075949.665-2-alex@ghiti.fr>

On Sun, 07 Jun 2020 00:59:46 PDT (-0700), alex@ghiti.fr wrote:
> This is a preparatory patch for relocatable kernel.
>
> The kernel used to be linked at PAGE_OFFSET address and used to be loaded
> physically at the beginning of the main memory. Therefore, we could use
> the linear mapping for the kernel mapping.
>
> But the relocated kernel base address will be different from PAGE_OFFSET
> and since in the linear mapping, two different virtual addresses cannot
> point to the same physical address, the kernel mapping needs to lie outside
> the linear mapping.

I know it's been a while, but I keep opening this up to review it and just
can't get over how ugly it is to put the kernel's linear map in the vmalloc
region.

I guess I don't understand why this is necessary at all.  Specifically: why
can't we just relocate the kernel within the linear map?  That would let the
bootloader put the kernel wherever it wants, modulo the physical memory size we
support.  We'd need to handle the regions that are coupled to the kernel's
execution address, but we could just put them in an explicit memory region
which is what we should probably be doing anyway.

> In addition, because modules and BPF must be close to the kernel (inside
> +-2GB window), the kernel is placed at the end of the vmalloc zone minus
> 2GB, which leaves room for modules and BPF. The kernel could not be
> placed at the beginning of the vmalloc zone since other vmalloc
> allocations from the kernel could get all the +-2GB window around the
> kernel which would prevent new modules and BPF programs to be loaded.

Well, that's not enough to make sure this doesn't happen -- it's just enough to
make sure it doesn't happen very quickily.  That's the same boat we're already
in, though, so it's not like it's worse.

> Signed-off-by: Alexandre Ghiti <alex@ghiti.fr>
> Reviewed-by: Zong Li <zong.li@sifive.com>
> ---
>  arch/riscv/boot/loader.lds.S     |  3 +-
>  arch/riscv/include/asm/page.h    | 10 +++++-
>  arch/riscv/include/asm/pgtable.h | 38 ++++++++++++++-------
>  arch/riscv/kernel/head.S         |  3 +-
>  arch/riscv/kernel/module.c       |  4 +--
>  arch/riscv/kernel/vmlinux.lds.S  |  3 +-
>  arch/riscv/mm/init.c             | 58 +++++++++++++++++++++++++-------
>  arch/riscv/mm/physaddr.c         |  2 +-
>  8 files changed, 88 insertions(+), 33 deletions(-)
>
> diff --git a/arch/riscv/boot/loader.lds.S b/arch/riscv/boot/loader.lds.S
> index 47a5003c2e28..62d94696a19c 100644
> --- a/arch/riscv/boot/loader.lds.S
> +++ b/arch/riscv/boot/loader.lds.S
> @@ -1,13 +1,14 @@
>  /* SPDX-License-Identifier: GPL-2.0 */
>
>  #include <asm/page.h>
> +#include <asm/pgtable.h>
>
>  OUTPUT_ARCH(riscv)
>  ENTRY(_start)
>
>  SECTIONS
>  {
> -	. = PAGE_OFFSET;
> +	. = KERNEL_LINK_ADDR;
>
>  	.payload : {
>  		*(.payload)
> diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h
> index 2d50f76efe48..48bb09b6a9b7 100644
> --- a/arch/riscv/include/asm/page.h
> +++ b/arch/riscv/include/asm/page.h
> @@ -90,18 +90,26 @@ typedef struct page *pgtable_t;
>
>  #ifdef CONFIG_MMU
>  extern unsigned long va_pa_offset;
> +extern unsigned long va_kernel_pa_offset;
>  extern unsigned long pfn_base;
>  #define ARCH_PFN_OFFSET		(pfn_base)
>  #else
>  #define va_pa_offset		0
> +#define va_kernel_pa_offset	0
>  #define ARCH_PFN_OFFSET		(PAGE_OFFSET >> PAGE_SHIFT)
>  #endif /* CONFIG_MMU */
>
>  extern unsigned long max_low_pfn;
>  extern unsigned long min_low_pfn;
> +extern unsigned long kernel_virt_addr;
>
>  #define __pa_to_va_nodebug(x)	((void *)((unsigned long) (x) + va_pa_offset))
> -#define __va_to_pa_nodebug(x)	((unsigned long)(x) - va_pa_offset)
> +#define linear_mapping_va_to_pa(x)	((unsigned long)(x) - va_pa_offset)
> +#define kernel_mapping_va_to_pa(x)	\
> +	((unsigned long)(x) - va_kernel_pa_offset)
> +#define __va_to_pa_nodebug(x)		\
> +	(((x) >= PAGE_OFFSET) ?		\
> +		linear_mapping_va_to_pa(x) : kernel_mapping_va_to_pa(x))
>
>  #ifdef CONFIG_DEBUG_VIRTUAL
>  extern phys_addr_t __virt_to_phys(unsigned long x);
> diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
> index 35b60035b6b0..94ef3b49dfb6 100644
> --- a/arch/riscv/include/asm/pgtable.h
> +++ b/arch/riscv/include/asm/pgtable.h
> @@ -11,23 +11,29 @@
>
>  #include <asm/pgtable-bits.h>
>
> -#ifndef __ASSEMBLY__
> -
> -/* Page Upper Directory not used in RISC-V */
> -#include <asm-generic/pgtable-nopud.h>
> -#include <asm/page.h>
> -#include <asm/tlbflush.h>
> -#include <linux/mm_types.h>
> -
> -#ifdef CONFIG_MMU
> +#ifndef CONFIG_MMU
> +#define KERNEL_VIRT_ADDR	PAGE_OFFSET
> +#define KERNEL_LINK_ADDR	PAGE_OFFSET
> +#else
> +/*
> + * Leave 2GB for modules and BPF that must lie within a 2GB range around
> + * the kernel.
> + */
> +#define KERNEL_VIRT_ADDR	(VMALLOC_END - SZ_2G + 1)
> +#define KERNEL_LINK_ADDR	KERNEL_VIRT_ADDR

At a bare minimum this is going to make a mess of the 32-bit port, as
non-relocatable kernels are now going to get linked at 1GiB which is where user
code is supposed to live.  That's an easy fix, though, as the 32-bit stuff
doesn't need any module address restrictions.

>  #define VMALLOC_SIZE     (KERN_VIRT_SIZE >> 1)
>  #define VMALLOC_END      (PAGE_OFFSET - 1)
>  #define VMALLOC_START    (PAGE_OFFSET - VMALLOC_SIZE)
>
>  #define BPF_JIT_REGION_SIZE	(SZ_128M)
> -#define BPF_JIT_REGION_START	(PAGE_OFFSET - BPF_JIT_REGION_SIZE)
> -#define BPF_JIT_REGION_END	(VMALLOC_END)
> +#define BPF_JIT_REGION_START	PFN_ALIGN((unsigned long)&_end)
> +#define BPF_JIT_REGION_END	(BPF_JIT_REGION_START + BPF_JIT_REGION_SIZE)
> +
> +#ifdef CONFIG_64BIT
> +#define VMALLOC_MODULE_START	BPF_JIT_REGION_END
> +#define VMALLOC_MODULE_END	(((unsigned long)&_start & PAGE_MASK) + SZ_2G)
> +#endif
>
>  /*
>   * Roughly size the vmemmap space to be large enough to fit enough
> @@ -57,9 +63,16 @@
>  #define FIXADDR_SIZE     PGDIR_SIZE
>  #endif
>  #define FIXADDR_START    (FIXADDR_TOP - FIXADDR_SIZE)
> -
>  #endif
>
> +#ifndef __ASSEMBLY__
> +
> +/* Page Upper Directory not used in RISC-V */
> +#include <asm-generic/pgtable-nopud.h>
> +#include <asm/page.h>
> +#include <asm/tlbflush.h>
> +#include <linux/mm_types.h>
> +
>  #ifdef CONFIG_64BIT
>  #include <asm/pgtable-64.h>
>  #else
> @@ -483,6 +496,7 @@ static inline void __kernel_map_pages(struct page *page, int numpages, int enabl
>
>  #define kern_addr_valid(addr)   (1) /* FIXME */
>
> +extern char _start[];
>  extern void *dtb_early_va;
>  void setup_bootmem(void);
>  void paging_init(void);
> diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S
> index 98a406474e7d..8f5bb7731327 100644
> --- a/arch/riscv/kernel/head.S
> +++ b/arch/riscv/kernel/head.S
> @@ -49,7 +49,8 @@ ENTRY(_start)
>  #ifdef CONFIG_MMU
>  relocate:
>  	/* Relocate return address */
> -	li a1, PAGE_OFFSET
> +	la a1, kernel_virt_addr
> +	REG_L a1, 0(a1)
>  	la a2, _start
>  	sub a1, a1, a2
>  	add ra, ra, a1
> diff --git a/arch/riscv/kernel/module.c b/arch/riscv/kernel/module.c
> index 8bbe5dbe1341..1a8fbe05accf 100644
> --- a/arch/riscv/kernel/module.c
> +++ b/arch/riscv/kernel/module.c
> @@ -392,12 +392,10 @@ int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab,
>  }
>
>  #if defined(CONFIG_MMU) && defined(CONFIG_64BIT)
> -#define VMALLOC_MODULE_START \
> -	 max(PFN_ALIGN((unsigned long)&_end - SZ_2G), VMALLOC_START)
>  void *module_alloc(unsigned long size)
>  {
>  	return __vmalloc_node_range(size, 1, VMALLOC_MODULE_START,
> -				    VMALLOC_END, GFP_KERNEL,
> +				    VMALLOC_MODULE_END, GFP_KERNEL,
>  				    PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
>  				    __builtin_return_address(0));
>  }
> diff --git a/arch/riscv/kernel/vmlinux.lds.S b/arch/riscv/kernel/vmlinux.lds.S
> index 0339b6bbe11a..a9abde62909f 100644
> --- a/arch/riscv/kernel/vmlinux.lds.S
> +++ b/arch/riscv/kernel/vmlinux.lds.S
> @@ -4,7 +4,8 @@
>   * Copyright (C) 2017 SiFive
>   */
>
> -#define LOAD_OFFSET PAGE_OFFSET
> +#include <asm/pgtable.h>
> +#define LOAD_OFFSET KERNEL_LINK_ADDR
>  #include <asm/vmlinux.lds.h>
>  #include <asm/page.h>
>  #include <asm/cache.h>
> diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
> index 736de6c8739f..71da78914645 100644
> --- a/arch/riscv/mm/init.c
> +++ b/arch/riscv/mm/init.c
> @@ -22,6 +22,9 @@
>
>  #include "../kernel/head.h"
>
> +unsigned long kernel_virt_addr = KERNEL_VIRT_ADDR;
> +EXPORT_SYMBOL(kernel_virt_addr);
> +
>  unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]
>  							__page_aligned_bss;
>  EXPORT_SYMBOL(empty_zero_page);
> @@ -178,8 +181,12 @@ void __init setup_bootmem(void)
>  }
>
>  #ifdef CONFIG_MMU
> +/* Offset between linear mapping virtual address and kernel load address */
>  unsigned long va_pa_offset;
>  EXPORT_SYMBOL(va_pa_offset);
> +/* Offset between kernel mapping virtual address and kernel load address */
> +unsigned long va_kernel_pa_offset;
> +EXPORT_SYMBOL(va_kernel_pa_offset);
>  unsigned long pfn_base;
>  EXPORT_SYMBOL(pfn_base);
>
> @@ -271,7 +278,7 @@ static phys_addr_t __init alloc_pmd(uintptr_t va)
>  	if (mmu_enabled)
>  		return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE);
>
> -	pmd_num = (va - PAGE_OFFSET) >> PGDIR_SHIFT;
> +	pmd_num = (va - kernel_virt_addr) >> PGDIR_SHIFT;
>  	BUG_ON(pmd_num >= NUM_EARLY_PMDS);
>  	return (uintptr_t)&early_pmd[pmd_num * PTRS_PER_PMD];
>  }
> @@ -372,14 +379,30 @@ static uintptr_t __init best_map_size(phys_addr_t base, phys_addr_t size)
>  #error "setup_vm() is called from head.S before relocate so it should not use absolute addressing."
>  #endif
>
> +static uintptr_t load_pa, load_sz;
> +
> +static void __init create_kernel_page_table(pgd_t *pgdir, uintptr_t map_size)
> +{
> +	uintptr_t va, end_va;
> +
> +	end_va = kernel_virt_addr + load_sz;
> +	for (va = kernel_virt_addr; va < end_va; va += map_size)
> +		create_pgd_mapping(pgdir, va,
> +				   load_pa + (va - kernel_virt_addr),
> +				   map_size, PAGE_KERNEL_EXEC);
> +}
> +
>  asmlinkage void __init setup_vm(uintptr_t dtb_pa)
>  {
>  	uintptr_t va, end_va;
> -	uintptr_t load_pa = (uintptr_t)(&_start);
> -	uintptr_t load_sz = (uintptr_t)(&_end) - load_pa;
>  	uintptr_t map_size = best_map_size(load_pa, MAX_EARLY_MAPPING_SIZE);
>
> +	load_pa = (uintptr_t)(&_start);
> +	load_sz = (uintptr_t)(&_end) - load_pa;
> +
>  	va_pa_offset = PAGE_OFFSET - load_pa;
> +	va_kernel_pa_offset = kernel_virt_addr - load_pa;
> +
>  	pfn_base = PFN_DOWN(load_pa);
>
>  	/*
> @@ -402,26 +425,22 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
>  	create_pmd_mapping(fixmap_pmd, FIXADDR_START,
>  			   (uintptr_t)fixmap_pte, PMD_SIZE, PAGE_TABLE);
>  	/* Setup trampoline PGD and PMD */
> -	create_pgd_mapping(trampoline_pg_dir, PAGE_OFFSET,
> +	create_pgd_mapping(trampoline_pg_dir, kernel_virt_addr,
>  			   (uintptr_t)trampoline_pmd, PGDIR_SIZE, PAGE_TABLE);
> -	create_pmd_mapping(trampoline_pmd, PAGE_OFFSET,
> +	create_pmd_mapping(trampoline_pmd, kernel_virt_addr,
>  			   load_pa, PMD_SIZE, PAGE_KERNEL_EXEC);
>  #else
>  	/* Setup trampoline PGD */
> -	create_pgd_mapping(trampoline_pg_dir, PAGE_OFFSET,
> +	create_pgd_mapping(trampoline_pg_dir, kernel_virt_addr,
>  			   load_pa, PGDIR_SIZE, PAGE_KERNEL_EXEC);
>  #endif
>
>  	/*
> -	 * Setup early PGD covering entire kernel which will allows
> +	 * Setup early PGD covering entire kernel which will allow
>  	 * us to reach paging_init(). We map all memory banks later
>  	 * in setup_vm_final() below.
>  	 */
> -	end_va = PAGE_OFFSET + load_sz;
> -	for (va = PAGE_OFFSET; va < end_va; va += map_size)
> -		create_pgd_mapping(early_pg_dir, va,
> -				   load_pa + (va - PAGE_OFFSET),
> -				   map_size, PAGE_KERNEL_EXEC);
> +	create_kernel_page_table(early_pg_dir, map_size);
>
>  	/* Create fixed mapping for early FDT parsing */
>  	end_va = __fix_to_virt(FIX_FDT) + FIX_FDT_SIZE;
> @@ -441,6 +460,7 @@ static void __init setup_vm_final(void)
>  	uintptr_t va, map_size;
>  	phys_addr_t pa, start, end;
>  	struct memblock_region *reg;
> +	static struct vm_struct vm_kernel = { 0 };
>
>  	/* Set mmu_enabled flag */
>  	mmu_enabled = true;
> @@ -467,10 +487,22 @@ static void __init setup_vm_final(void)
>  		for (pa = start; pa < end; pa += map_size) {
>  			va = (uintptr_t)__va(pa);
>  			create_pgd_mapping(swapper_pg_dir, va, pa,
> -					   map_size, PAGE_KERNEL_EXEC);
> +					   map_size, PAGE_KERNEL);
>  		}
>  	}
>
> +	/* Map the kernel */
> +	create_kernel_page_table(swapper_pg_dir, PMD_SIZE);
> +
> +	/* Reserve the vmalloc area occupied by the kernel */
> +	vm_kernel.addr = (void *)kernel_virt_addr;
> +	vm_kernel.phys_addr = load_pa;
> +	vm_kernel.size = (load_sz + PMD_SIZE - 1) & ~(PMD_SIZE - 1);
> +	vm_kernel.flags = VM_MAP | VM_NO_GUARD;
> +	vm_kernel.caller = __builtin_return_address(0);
> +
> +	vm_area_add_early(&vm_kernel);
> +
>  	/* Clear fixmap PTE and PMD mappings */
>  	clear_fixmap(FIX_PTE);
>  	clear_fixmap(FIX_PMD);
> diff --git a/arch/riscv/mm/physaddr.c b/arch/riscv/mm/physaddr.c
> index e8e4dcd39fed..35703d5ef5fd 100644
> --- a/arch/riscv/mm/physaddr.c
> +++ b/arch/riscv/mm/physaddr.c
> @@ -23,7 +23,7 @@ EXPORT_SYMBOL(__virt_to_phys);
>
>  phys_addr_t __phys_addr_symbol(unsigned long x)
>  {
> -	unsigned long kernel_start = (unsigned long)PAGE_OFFSET;
> +	unsigned long kernel_start = (unsigned long)kernel_virt_addr;
>  	unsigned long kernel_end = (unsigned long)_end;
>
>  	/*

^ permalink raw reply

* Re: [PATCH v2 2/3] powerpc/64s: remove PROT_SAO support
From: Paul Mackerras @ 2020-07-09  4:34 UTC (permalink / raw)
  To: Nicholas Piggin, David Gibson; +Cc: linux-mm, linuxppc-dev, kvm-ppc, linux-api
In-Reply-To: <20200703011958.1166620-3-npiggin@gmail.com>

On Fri, Jul 03, 2020 at 11:19:57AM +1000, Nicholas Piggin wrote:
> ISA v3.1 does not support the SAO storage control attribute required to
> implement PROT_SAO. PROT_SAO was used by specialised system software
> (Lx86) that has been discontinued for about 7 years, and is not thought
> to be used elsewhere, so removal should not cause problems.
> 
> We rather remove it than keep support for older processors, because
> live migrating guest partitions to newer processors may not be possible
> if SAO is in use (or worse allowed with silent races).

This is actually a real problem for KVM, because now we have the
capabilities of the host affecting the characteristics of the guest
virtual machine in a manner which userspace (e.g. QEMU) is unable to
control.

It would probably be better to disallow SAO on all machines than have
it available on some hosts and not others.  (Yes I know there is a
check on CPU_FTR_ARCH_206 in there, but that has been a no-op since we
removed the PPC970 KVM support.)

Solving this properly will probably require creating a new KVM host
capability and associated machine parameter in QEMU, along with a new
machine type.

[snip]

> diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
> index 9bb9bb370b53..fac39ff659d4 100644
> --- a/arch/powerpc/include/asm/kvm_book3s_64.h
> +++ b/arch/powerpc/include/asm/kvm_book3s_64.h
> @@ -398,9 +398,10 @@ static inline bool hpte_cache_flags_ok(unsigned long hptel, bool is_ci)
>  {
>  	unsigned int wimg = hptel & HPTE_R_WIMG;
>  
> -	/* Handle SAO */
> +	/* Handle SAO for POWER7,8,9 */
>  	if (wimg == (HPTE_R_W | HPTE_R_I | HPTE_R_M) &&
> -	    cpu_has_feature(CPU_FTR_ARCH_206))
> +	    cpu_has_feature(CPU_FTR_ARCH_206) &&
> +	    !cpu_has_feature(CPU_FTR_ARCH_31))
>  		wimg = HPTE_R_M;

Paul.

^ permalink raw reply

* [PATCH v2 2/5] powerpc/lib: Initialize a temporary mm for code patching
From: Christopher M. Riedl @ 2020-07-09  4:03 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: kernel-hardening
In-Reply-To: <20200709040316.12789-1-cmr@informatik.wtf>

When code patching a STRICT_KERNEL_RWX kernel the page containing the
address to be patched is temporarily mapped with permissive memory
protections. Currently, a per-cpu vmalloc patch area is used for this
purpose. While the patch area is per-cpu, the temporary page mapping is
inserted into the kernel page tables for the duration of the patching.
The mapping is exposed to CPUs other than the patching CPU - this is
undesirable from a hardening perspective.

Use the `poking_init` init hook to prepare a temporary mm and patching
address. Initialize the temporary mm by copying the init mm. Choose a
randomized patching address inside the temporary mm userspace address
portion. The next patch uses the temporary mm and patching address for
code patching.

Based on x86 implementation:

commit 4fc19708b165
("x86/alternatives: Initialize temporary mm for patching")

Signed-off-by: Christopher M. Riedl <cmr@informatik.wtf>
---
 arch/powerpc/lib/code-patching.c | 33 ++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index 0a051dfeb177..8ae1a9e5fe6e 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -11,6 +11,8 @@
 #include <linux/cpuhotplug.h>
 #include <linux/slab.h>
 #include <linux/uaccess.h>
+#include <linux/sched/task.h>
+#include <linux/random.h>
 
 #include <asm/tlbflush.h>
 #include <asm/page.h>
@@ -44,6 +46,37 @@ int raw_patch_instruction(struct ppc_inst *addr, struct ppc_inst instr)
 }
 
 #ifdef CONFIG_STRICT_KERNEL_RWX
+
+static struct mm_struct *patching_mm __ro_after_init;
+static unsigned long patching_addr __ro_after_init;
+
+void __init poking_init(void)
+{
+	spinlock_t *ptl; /* for protecting pte table */
+	pte_t *ptep;
+
+	/*
+	 * Some parts of the kernel (static keys for example) depend on
+	 * successful code patching. Code patching under STRICT_KERNEL_RWX
+	 * requires this setup - otherwise we cannot patch at all. We use
+	 * BUG_ON() here and later since an early failure is preferred to
+	 * buggy behavior and/or strange crashes later.
+	 */
+	patching_mm = copy_init_mm();
+	BUG_ON(!patching_mm);
+
+	/*
+	 * In hash we cannot go above DEFAULT_MAP_WINDOW easily.
+	 * XXX: Do we want additional bits of entropy for radix?
+	 */
+	patching_addr = (get_random_long() & PAGE_MASK) %
+		(DEFAULT_MAP_WINDOW - PAGE_SIZE);
+
+	ptep = get_locked_pte(patching_mm, patching_addr, &ptl);
+	BUG_ON(!ptep);
+	pte_unmap_unlock(ptep, ptl);
+}
+
 static DEFINE_PER_CPU(struct vm_struct *, text_poke_area);
 
 static int text_area_cpu_up(unsigned int cpu)
-- 
2.27.0


^ permalink raw reply related

* [PATCH v2 3/5] powerpc/lib: Use a temporary mm for code patching
From: Christopher M. Riedl @ 2020-07-09  4:03 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: kernel-hardening
In-Reply-To: <20200709040316.12789-1-cmr@informatik.wtf>

Currently, code patching a STRICT_KERNEL_RWX exposes the temporary
mappings to other CPUs. These mappings should be kept local to the CPU
doing the patching. Use the pre-initialized temporary mm and patching
address for this purpose. Also add a check after patching to ensure the
patch succeeded.

Use the KUAP functions on non-BOOKS3_64 platforms since the temporary
mapping for patching uses a userspace address (to keep the mapping
local). On BOOKS3_64 platforms hash does not implement KUAP and on radix
the use of PAGE_KERNEL sets EAA[0] for the PTE which means the AMR
(KUAP) protection is ignored (see PowerISA v3.0b, Fig, 35).

Based on x86 implementation:

commit b3fd8e83ada0
("x86/alternatives: Use temporary mm for text poking")

Signed-off-by: Christopher M. Riedl <cmr@informatik.wtf>
---
 arch/powerpc/lib/code-patching.c | 152 +++++++++++--------------------
 1 file changed, 54 insertions(+), 98 deletions(-)

diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index 8ae1a9e5fe6e..80fe3864f377 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -19,6 +19,7 @@
 #include <asm/code-patching.h>
 #include <asm/setup.h>
 #include <asm/inst.h>
+#include <asm/mmu_context.h>
 
 static int __patch_instruction(struct ppc_inst *exec_addr, struct ppc_inst instr,
 			       struct ppc_inst *patch_addr)
@@ -77,106 +78,57 @@ void __init poking_init(void)
 	pte_unmap_unlock(ptep, ptl);
 }
 
-static DEFINE_PER_CPU(struct vm_struct *, text_poke_area);
-
-static int text_area_cpu_up(unsigned int cpu)
-{
-	struct vm_struct *area;
-
-	area = get_vm_area(PAGE_SIZE, VM_ALLOC);
-	if (!area) {
-		WARN_ONCE(1, "Failed to create text area for cpu %d\n",
-			cpu);
-		return -1;
-	}
-	this_cpu_write(text_poke_area, area);
-
-	return 0;
-}
-
-static int text_area_cpu_down(unsigned int cpu)
-{
-	free_vm_area(this_cpu_read(text_poke_area));
-	return 0;
-}
-
-/*
- * Run as a late init call. This allows all the boot time patching to be done
- * simply by patching the code, and then we're called here prior to
- * mark_rodata_ro(), which happens after all init calls are run. Although
- * BUG_ON() is rude, in this case it should only happen if ENOMEM, and we judge
- * it as being preferable to a kernel that will crash later when someone tries
- * to use patch_instruction().
- */
-static int __init setup_text_poke_area(void)
-{
-	BUG_ON(!cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
-		"powerpc/text_poke:online", text_area_cpu_up,
-		text_area_cpu_down));
-
-	return 0;
-}
-late_initcall(setup_text_poke_area);
+struct patch_mapping {
+	spinlock_t *ptl; /* for protecting pte table */
+	pte_t *ptep;
+	struct temp_mm temp_mm;
+};
 
 /*
  * This can be called for kernel text or a module.
  */
-static int map_patch_area(void *addr, unsigned long text_poke_addr)
+static int map_patch(const void *addr, struct patch_mapping *patch_mapping)
 {
-	unsigned long pfn;
-	int err;
+	struct page *page;
+	pte_t pte;
+	pgprot_t pgprot;
 
 	if (is_vmalloc_addr(addr))
-		pfn = vmalloc_to_pfn(addr);
+		page = vmalloc_to_page(addr);
 	else
-		pfn = __pa_symbol(addr) >> PAGE_SHIFT;
+		page = virt_to_page(addr);
 
-	err = map_kernel_page(text_poke_addr, (pfn << PAGE_SHIFT), PAGE_KERNEL);
+	if (radix_enabled())
+		pgprot = PAGE_KERNEL;
+	else
+		pgprot = PAGE_SHARED;
 
-	pr_devel("Mapped addr %lx with pfn %lx:%d\n", text_poke_addr, pfn, err);
-	if (err)
+	patch_mapping->ptep = get_locked_pte(patching_mm, patching_addr,
+					     &patch_mapping->ptl);
+	if (unlikely(!patch_mapping->ptep)) {
+		pr_warn("map patch: failed to allocate pte for patching\n");
 		return -1;
+	}
+
+	pte = mk_pte(page, pgprot);
+	pte = pte_mkdirty(pte);
+	set_pte_at(patching_mm, patching_addr, patch_mapping->ptep, pte);
+
+	init_temp_mm(&patch_mapping->temp_mm, patching_mm);
+	use_temporary_mm(&patch_mapping->temp_mm);
 
 	return 0;
 }
 
-static inline int unmap_patch_area(unsigned long addr)
+static void unmap_patch(struct patch_mapping *patch_mapping)
 {
-	pte_t *ptep;
-	pmd_t *pmdp;
-	pud_t *pudp;
-	p4d_t *p4dp;
-	pgd_t *pgdp;
-
-	pgdp = pgd_offset_k(addr);
-	if (unlikely(!pgdp))
-		return -EINVAL;
-
-	p4dp = p4d_offset(pgdp, addr);
-	if (unlikely(!p4dp))
-		return -EINVAL;
-
-	pudp = pud_offset(p4dp, addr);
-	if (unlikely(!pudp))
-		return -EINVAL;
-
-	pmdp = pmd_offset(pudp, addr);
-	if (unlikely(!pmdp))
-		return -EINVAL;
-
-	ptep = pte_offset_kernel(pmdp, addr);
-	if (unlikely(!ptep))
-		return -EINVAL;
+	/* In hash, pte_clear flushes the tlb */
+	pte_clear(patching_mm, patching_addr, patch_mapping->ptep);
+	unuse_temporary_mm(&patch_mapping->temp_mm);
 
-	pr_devel("clearing mm %p, pte %p, addr %lx\n", &init_mm, ptep, addr);
-
-	/*
-	 * In hash, pte_clear flushes the tlb, in radix, we have to
-	 */
-	pte_clear(&init_mm, addr, ptep);
-	flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
-
-	return 0;
+	/* In radix, we have to explicitly flush the tlb (no-op in hash) */
+	local_flush_tlb_mm(patching_mm);
+	pte_unmap_unlock(patch_mapping->ptep, patch_mapping->ptl);
 }
 
 static int do_patch_instruction(struct ppc_inst *addr, struct ppc_inst instr)
@@ -184,32 +136,36 @@ static int do_patch_instruction(struct ppc_inst *addr, struct ppc_inst instr)
 	int err;
 	struct ppc_inst *patch_addr = NULL;
 	unsigned long flags;
-	unsigned long text_poke_addr;
-	unsigned long kaddr = (unsigned long)addr;
+	struct patch_mapping patch_mapping;
 
 	/*
-	 * During early early boot patch_instruction is called
-	 * when text_poke_area is not ready, but we still need
-	 * to allow patching. We just do the plain old patching
+	 * The patching_mm is initialized before calling mark_rodata_ro. Prior
+	 * to this, patch_instruction is called when we don't have (and don't
+	 * need) the patching_mm so just do plain old patching.
 	 */
-	if (!this_cpu_read(text_poke_area))
+	if (!patching_mm)
 		return raw_patch_instruction(addr, instr);
 
 	local_irq_save(flags);
 
-	text_poke_addr = (unsigned long)__this_cpu_read(text_poke_area)->addr;
-	if (map_patch_area(addr, text_poke_addr)) {
-		err = -1;
+	err = map_patch(addr, &patch_mapping);
+	if (err)
 		goto out;
-	}
 
-	patch_addr = (struct ppc_inst *)(text_poke_addr + (kaddr & ~PAGE_MASK));
+	patch_addr = (struct ppc_inst *)(patching_addr | offset_in_page(addr));
 
-	__patch_instruction(addr, instr, patch_addr);
+	if (!radix_enabled())
+		allow_write_to_user(patch_addr, ppc_inst_len(instr));
+	err = __patch_instruction(addr, instr, patch_addr);
+	if (!radix_enabled())
+		prevent_write_to_user(patch_addr, ppc_inst_len(instr));
 
-	err = unmap_patch_area(text_poke_addr);
-	if (err)
-		pr_warn("failed to unmap %lx\n", text_poke_addr);
+	unmap_patch(&patch_mapping);
+	/*
+	 * Something is wrong if what we just wrote doesn't match what we
+	 * think we just wrote.
+	 */
+	WARN_ON(!ppc_inst_equal(ppc_inst_read(addr), instr));
 
 out:
 	local_irq_restore(flags);
-- 
2.27.0


^ permalink raw reply related

* [PATCH v2 5/5] powerpc: Add LKDTM test to hijack a patch mapping
From: Christopher M. Riedl @ 2020-07-09  4:03 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: kernel-hardening
In-Reply-To: <20200709040316.12789-1-cmr@informatik.wtf>

When live patching with STRICT_KERNEL_RWX, the CPU doing the patching
must use a temporary mapping which allows for writing to kernel text.
During the entire window of time when this temporary mapping is in use,
another CPU could write to the same mapping and maliciously alter kernel
text. Implement a LKDTM test to attempt to exploit such a openings when
a CPU is patching under STRICT_KERNEL_RWX. The test is only implemented
on powerpc for now.

The LKDTM "hijack" test works as follows:

	1. A CPU executes an infinite loop to patch an instruction.
	   This is the "patching" CPU.
	2. Another CPU attempts to write to the address of the temporary
	   mapping used by the "patching" CPU. This other CPU is the
	   "hijacker" CPU. The hijack either fails with a segfault or
	   succeeds, in which case some kernel text is now overwritten.

How to run the test:

	mount -t debugfs none /sys/kernel/debug
	(echo HIJACK_PATCH > /sys/kernel/debug/provoke-crash/DIRECT)

Signed-off-by: Christopher M. Riedl <cmr@informatik.wtf>
---
 drivers/misc/lkdtm/core.c  |  1 +
 drivers/misc/lkdtm/lkdtm.h |  1 +
 drivers/misc/lkdtm/perms.c | 99 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 101 insertions(+)

diff --git a/drivers/misc/lkdtm/core.c b/drivers/misc/lkdtm/core.c
index a5e344df9166..482e72f6a1e1 100644
--- a/drivers/misc/lkdtm/core.c
+++ b/drivers/misc/lkdtm/core.c
@@ -145,6 +145,7 @@ static const struct crashtype crashtypes[] = {
 	CRASHTYPE(WRITE_RO),
 	CRASHTYPE(WRITE_RO_AFTER_INIT),
 	CRASHTYPE(WRITE_KERN),
+	CRASHTYPE(HIJACK_PATCH),
 	CRASHTYPE(REFCOUNT_INC_OVERFLOW),
 	CRASHTYPE(REFCOUNT_ADD_OVERFLOW),
 	CRASHTYPE(REFCOUNT_INC_NOT_ZERO_OVERFLOW),
diff --git a/drivers/misc/lkdtm/lkdtm.h b/drivers/misc/lkdtm/lkdtm.h
index 601a2156a0d4..bfcf3542370d 100644
--- a/drivers/misc/lkdtm/lkdtm.h
+++ b/drivers/misc/lkdtm/lkdtm.h
@@ -62,6 +62,7 @@ void lkdtm_EXEC_USERSPACE(void);
 void lkdtm_EXEC_NULL(void);
 void lkdtm_ACCESS_USERSPACE(void);
 void lkdtm_ACCESS_NULL(void);
+void lkdtm_HIJACK_PATCH(void);
 
 /* lkdtm_refcount.c */
 void lkdtm_REFCOUNT_INC_OVERFLOW(void);
diff --git a/drivers/misc/lkdtm/perms.c b/drivers/misc/lkdtm/perms.c
index 62f76d506f04..b7149daaeb6f 100644
--- a/drivers/misc/lkdtm/perms.c
+++ b/drivers/misc/lkdtm/perms.c
@@ -9,6 +9,7 @@
 #include <linux/vmalloc.h>
 #include <linux/mman.h>
 #include <linux/uaccess.h>
+#include <linux/kthread.h>
 #include <asm/cacheflush.h>
 
 /* Whether or not to fill the target memory area with do_nothing(). */
@@ -213,6 +214,104 @@ void lkdtm_ACCESS_NULL(void)
 	*ptr = tmp;
 }
 
+#if defined(CONFIG_PPC) && defined(CONFIG_STRICT_KERNEL_RWX)
+#include <include/asm/code-patching.h>
+
+static struct ppc_inst * const patch_site = (struct ppc_inst *)&do_nothing;
+
+static int lkdtm_patching_cpu(void *data)
+{
+	int err = 0;
+	struct ppc_inst insn = ppc_inst(0xdeadbeef);
+
+	pr_info("starting patching_cpu=%d\n", smp_processor_id());
+	do {
+		err = patch_instruction(patch_site, insn);
+	} while (ppc_inst_equal(ppc_inst_read(READ_ONCE(patch_site)), insn) &&
+			!err && !kthread_should_stop());
+
+	if (err)
+		pr_warn("patch_instruction returned error: %d\n", err);
+
+	set_current_state(TASK_INTERRUPTIBLE);
+	while (!kthread_should_stop()) {
+		schedule();
+		set_current_state(TASK_INTERRUPTIBLE);
+	}
+
+	return err;
+}
+
+void lkdtm_HIJACK_PATCH(void)
+{
+	struct task_struct *patching_kthrd;
+	struct ppc_inst original_insn;
+	int patching_cpu, hijacker_cpu, attempts;
+	unsigned long addr;
+	bool hijacked;
+
+	if (num_online_cpus() < 2) {
+		pr_warn("need at least two cpus\n");
+		return;
+	}
+
+	original_insn = ppc_inst_read(READ_ONCE(patch_site));
+
+	hijacker_cpu = smp_processor_id();
+	patching_cpu = cpumask_any_but(cpu_online_mask, hijacker_cpu);
+
+	patching_kthrd = kthread_create_on_node(&lkdtm_patching_cpu, NULL,
+						cpu_to_node(patching_cpu),
+						"lkdtm_patching_cpu");
+	kthread_bind(patching_kthrd, patching_cpu);
+	wake_up_process(patching_kthrd);
+
+	addr = offset_in_page(patch_site) | read_cpu_patching_addr(patching_cpu);
+
+	pr_info("starting hijacker_cpu=%d\n", hijacker_cpu);
+	for (attempts = 0; attempts < 100000; ++attempts) {
+		/* Use __put_user to catch faults without an Oops */
+		hijacked = !__put_user(0xbad00bad, (unsigned int *)addr);
+
+		if (hijacked) {
+			if (kthread_stop(patching_kthrd))
+				goto out;
+			break;
+		}
+	}
+	pr_info("hijack attempts: %d\n", attempts);
+
+	if (hijacked) {
+		if (*(unsigned int *)READ_ONCE(patch_site) == 0xbad00bad)
+			pr_err("overwrote kernel text\n");
+		/*
+		 * There are window conditions where the hijacker cpu manages to
+		 * write to the patch site but the site gets overwritten again by
+		 * the patching cpu. We still consider that a "successful" hijack
+		 * since the hijacker cpu did not fault on the write.
+		 */
+		pr_err("FAIL: wrote to another cpu's patching area\n");
+	} else {
+		kthread_stop(patching_kthrd);
+	}
+
+out:
+	/* Restore the original insn for any future lkdtm tests */
+	patch_instruction(patch_site, original_insn);
+}
+
+#else
+
+void lkdtm_HIJACK_PATCH(void)
+{
+	if (!IS_ENABLED(CONFIG_PPC))
+		pr_err("XFAIL: this test is powerpc-only\n");
+	if (!IS_ENABLED(CONFIG_STRICT_KERNEL_RWX))
+		pr_err("XFAIL: this test requires CONFIG_STRICT_KERNEL_RWX\n");
+}
+
+#endif /* CONFIG_PPC && CONFIG_STRICT_KERNEL_RWX */
+
 void __init lkdtm_perms_init(void)
 {
 	/* Make sure we can write to __ro_after_init values during __init */
-- 
2.27.0


^ permalink raw reply related

* Re: [PATCH 1/1] KVM/PPC: Fix typo on H_DISABLE_AND_GET hcall
From: Paul Mackerras @ 2020-07-09  4:09 UTC (permalink / raw)
  To: Leonardo Bras
  Cc: Mark Rutland, Sukadev Bhattiprolu, Peter Zijlstra, linuxppc-dev,
	Arnaldo Carvalho de Melo, Bharata B Rao, Alexander Shishkin,
	Ingo Molnar, kvm-ppc, Namhyung Kim, Vaibhav Jain, Jiri Olsa,
	linux-kernel
In-Reply-To: <20200707004812.190765-1-leobras.c@gmail.com>

On Mon, Jul 06, 2020 at 09:48:12PM -0300, Leonardo Bras wrote:
> On PAPR+ the hcall() on 0x1B0 is called H_DISABLE_AND_GET, but got
> defined as H_DISABLE_AND_GETC instead.
> 
> This define was introduced with a typo in commit <b13a96cfb055>
> ("[PATCH] powerpc: Extends HCALL interface for InfiniBand usage"), and was
> later used without having the typo noticed.
> 
> Signed-off-by: Leonardo Bras <leobras.c@gmail.com>

Acked-by: Paul Mackerras <paulus@ozlabs.org>

Since this hypercall is not implemented in KVM nor used by KVM guests,
I'll leave this one for Michael to pick up.

Paul.

^ permalink raw reply

* [PATCH 0/5] Use per-CPU temporary mappings for patching
From: Christopher M. Riedl @ 2020-07-09  4:03 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: kernel-hardening

When compiled with CONFIG_STRICT_KERNEL_RWX, the kernel must create
temporary mappings when patching itself. These mappings temporarily
override the strict RWX text protections to permit a write. Currently,
powerpc allocates a per-CPU VM area for patching. Patching occurs as
follows:

	1. Map page of text to be patched to per-CPU VM area w/
	   PAGE_KERNEL protection
	2. Patch text
	3. Remove the temporary mapping

While the VM area is per-CPU, the mapping is actually inserted into the
kernel page tables. Presumably, this could allow another CPU to access
the normally write-protected text - either malicously or accidentally -
via this same mapping if the address of the VM area is known. Ideally,
the mapping should be kept local to the CPU doing the patching (or any
other sensitive operations requiring temporarily overriding memory
protections) [0].

x86 introduced "temporary mm" structs which allow the creation of
mappings local to a particular CPU [1]. This series intends to bring the
notion of a temporary mm to powerpc and harden powerpc by using such a
mapping for patching a kernel with strict RWX permissions.

The first patch introduces the temporary mm struct and API for powerpc
along with a new function to retrieve a current hw breakpoint.

The second patch uses the `poking_init` init hook added by the x86
patches to initialize a temporary mm and patching address. The patching
address is randomized between 0 and DEFAULT_MAP_WINDOW-PAGE_SIZE. The
upper limit is necessary due to how the hash MMU operates - by default
the space above DEFAULT_MAP_WINDOW is not available. For now, both hash
and radix randomize inside this range. The number of possible random
addresses is dependent on PAGE_SIZE and limited by DEFAULT_MAP_WINDOW.

Bits of entropy with 64K page size on BOOK3S_64:

	bits of entropy = log2(DEFAULT_MAP_WINDOW_USER64 / PAGE_SIZE)

	PAGE_SIZE=64K, DEFAULT_MAP_WINDOW_USER64=128TB
	bits of entropy = log2(128TB / 64K)
	bits of entropy = 31

Randomization occurs only once during initialization at boot.

The third patch replaces the VM area with the temporary mm in the
patching code. The page for patching has to be mapped PAGE_SHARED with
the hash MMU since hash prevents the kernel from accessing userspace
pages with PAGE_PRIVILEGED bit set. On the radix MMU the page is mapped with
PAGE_KERNEL which has the added benefit that we can skip KUAP. 

The fourth and fifth patches implement an LKDTM test "proof-of-concept" which
exploits the previous vulnerability (ie. the mapping during patching is exposed
in kernel page tables and accessible by other CPUS). The LKDTM test is somewhat
"rough" in that it uses a brute-force approach - I am open to any suggestions
and/or ideas to improve this. Currently, the LKDTM test passes with this series
on POWER8 (hash) and POWER9 (radix, hash) and fails without this series (ie.
the temporary mapping for patching is exposed to CPUs other than the patching
CPU).

The test can be applied to a tree without this new series by applying
the last two patches of this series, and then fixing up in
/arch/powerpc/lib/code-patching.c:

 #ifdef CONFIG_STRICT_KERNEL_RWX
 static DEFINE_PER_CPU(struct vm_struct *, text_poke_area);

+#ifdef CONFIG_LKDTM
+unsigned long read_cpu_patching_addr(unsigned int cpu)
+{
+       return (unsigned long)(per_cpu(text_poke_area, cpu))->addr;
+}
+#endif
+
 static int text_area_cpu_up(unsigned int cpu)
 {
        struct vm_struct *area;

I also have a tree with just linuxppc/next and the LKDTM test here:
https://github.com/cmr-informatik/linux/commits/percpu-lkdtm-only

Tested on Blackbird (8-core POWER9) w/ Hash (`disable_radix`) and Radix
MMUs.

v2:	* Rebase on linuxppc/next:
	  commit 105fb38124a4 ("powerpc/8xx: Modify ptep_get()")
	* Always dirty pte when mapping patch
	* Use `ppc_inst_len` instead of `sizeof` on instructions
	* Declare LKDTM patching addr accessor in header where it
	  belongs	

v1:	* Rebase on linuxppc/next (4336b9337824)
	* Save and restore second hw watchpoint
	* Use new ppc_inst_* functions for patching check and in LKDTM
	  test

rfc-v2:	* Many fixes and improvements mostly based on extensive feedback and
          testing by Christophe Leroy (thanks!).
	* Make patching_mm and patching_addr static and mode '__ro_after_init'
	  to after the variable name (more common in other parts of the kernel)
	* Use 'asm/debug.h' header instead of 'asm/hw_breakpoint.h' to fix
	  PPC64e compile
	* Add comment explaining why we use BUG_ON() during the init call to
	  setup for patching later
	* Move ptep into patch_mapping to avoid walking page tables a second
	  time when unmapping the temporary mapping
	* Use KUAP under non-radix, also manually dirty the PTE for patch
	  mapping on non-BOOK3S_64 platforms
	* Properly return any error from __patch_instruction
	* Do not use 'memcmp' where a simple comparison is appropriate
	* Simplify expression for patch address by removing pointer maths
	* Add LKDTM test

[0]: https://github.com/linuxppc/issues/issues/224
[1]: https://lore.kernel.org/kernel-hardening/20190426232303.28381-1-nadav.amit@gmail.com/

Christopher M. Riedl (5):
  powerpc/mm: Introduce temporary mm
  powerpc/lib: Initialize a temporary mm for code patching
  powerpc/lib: Use a temporary mm for code patching
  powerpc/lib: Add LKDTM accessor for patching addr
  powerpc: Add LKDTM test to hijack a patch mapping

 arch/powerpc/include/asm/code-patching.h |   4 +
 arch/powerpc/include/asm/debug.h         |   1 +
 arch/powerpc/include/asm/mmu_context.h   |  64 +++++++++
 arch/powerpc/kernel/process.c            |   5 +
 arch/powerpc/lib/code-patching.c         | 176 +++++++++++------------
 drivers/misc/lkdtm/core.c                |   1 +
 drivers/misc/lkdtm/lkdtm.h               |   1 +
 drivers/misc/lkdtm/perms.c               |  99 +++++++++++++
 8 files changed, 261 insertions(+), 90 deletions(-)

-- 
2.27.0

^ permalink raw reply

* [PATCH v2 1/5] powerpc/mm: Introduce temporary mm
From: Christopher M. Riedl @ 2020-07-09  4:03 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: kernel-hardening
In-Reply-To: <20200709040316.12789-1-cmr@informatik.wtf>

x86 supports the notion of a temporary mm which restricts access to
temporary PTEs to a single CPU. A temporary mm is useful for situations
where a CPU needs to perform sensitive operations (such as patching a
STRICT_KERNEL_RWX kernel) requiring temporary mappings without exposing
said mappings to other CPUs. A side benefit is that other CPU TLBs do
not need to be flushed when the temporary mm is torn down.

Mappings in the temporary mm can be set in the userspace portion of the
address-space.

Interrupts must be disabled while the temporary mm is in use. HW
breakpoints, which may have been set by userspace as watchpoints on
addresses now within the temporary mm, are saved and disabled when
loading the temporary mm. The HW breakpoints are restored when unloading
the temporary mm. All HW breakpoints are indiscriminately disabled while
the temporary mm is in use.

Based on x86 implementation:

commit cefa929c034e
("x86/mm: Introduce temporary mm structs")

Signed-off-by: Christopher M. Riedl <cmr@informatik.wtf>
---
 arch/powerpc/include/asm/debug.h       |  1 +
 arch/powerpc/include/asm/mmu_context.h | 64 ++++++++++++++++++++++++++
 arch/powerpc/kernel/process.c          |  5 ++
 3 files changed, 70 insertions(+)

diff --git a/arch/powerpc/include/asm/debug.h b/arch/powerpc/include/asm/debug.h
index ec57daf87f40..827350c9bcf3 100644
--- a/arch/powerpc/include/asm/debug.h
+++ b/arch/powerpc/include/asm/debug.h
@@ -46,6 +46,7 @@ static inline int debugger_fault_handler(struct pt_regs *regs) { return 0; }
 #endif
 
 void __set_breakpoint(int nr, struct arch_hw_breakpoint *brk);
+void __get_breakpoint(int nr, struct arch_hw_breakpoint *brk);
 bool ppc_breakpoint_available(void);
 #ifdef CONFIG_PPC_ADV_DEBUG_REGS
 extern void do_send_trap(struct pt_regs *regs, unsigned long address,
diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
index 1a474f6b1992..9269c7c7b04e 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -10,6 +10,7 @@
 #include <asm/mmu.h>	
 #include <asm/cputable.h>
 #include <asm/cputhreads.h>
+#include <asm/debug.h>
 
 /*
  * Most if the context management is out of line
@@ -300,5 +301,68 @@ static inline int arch_dup_mmap(struct mm_struct *oldmm,
 	return 0;
 }
 
+struct temp_mm {
+	struct mm_struct *temp;
+	struct mm_struct *prev;
+	bool is_kernel_thread;
+	struct arch_hw_breakpoint brk[HBP_NUM_MAX];
+};
+
+static inline void init_temp_mm(struct temp_mm *temp_mm, struct mm_struct *mm)
+{
+	temp_mm->temp = mm;
+	temp_mm->prev = NULL;
+	temp_mm->is_kernel_thread = false;
+	memset(&temp_mm->brk, 0, sizeof(temp_mm->brk));
+}
+
+static inline void use_temporary_mm(struct temp_mm *temp_mm)
+{
+	lockdep_assert_irqs_disabled();
+
+	temp_mm->is_kernel_thread = current->mm == NULL;
+	if (temp_mm->is_kernel_thread)
+		temp_mm->prev = current->active_mm;
+	else
+		temp_mm->prev = current->mm;
+
+	/*
+	 * Hash requires a non-NULL current->mm to allocate a userspace address
+	 * when handling a page fault. Does not appear to hurt in Radix either.
+	 */
+	current->mm = temp_mm->temp;
+	switch_mm_irqs_off(NULL, temp_mm->temp, current);
+
+	if (ppc_breakpoint_available()) {
+		struct arch_hw_breakpoint null_brk = {0};
+		int i = 0;
+
+		for (; i < nr_wp_slots(); ++i) {
+			__get_breakpoint(i, &temp_mm->brk[i]);
+			if (temp_mm->brk[i].type != 0)
+				__set_breakpoint(i, &null_brk);
+		}
+	}
+}
+
+static inline void unuse_temporary_mm(struct temp_mm *temp_mm)
+{
+	lockdep_assert_irqs_disabled();
+
+	if (temp_mm->is_kernel_thread)
+		current->mm = NULL;
+	else
+		current->mm = temp_mm->prev;
+	switch_mm_irqs_off(NULL, temp_mm->prev, current);
+
+	if (ppc_breakpoint_available()) {
+		int i = 0;
+
+		for (; i < nr_wp_slots(); ++i)
+			if (temp_mm->brk[i].type != 0)
+				__set_breakpoint(i, &temp_mm->brk[i]);
+	}
+}
+
 #endif /* __KERNEL__ */
 #endif /* __ASM_POWERPC_MMU_CONTEXT_H */
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 4650b9bb217f..b6c123bf5edd 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -824,6 +824,11 @@ static inline int set_breakpoint_8xx(struct arch_hw_breakpoint *brk)
 	return 0;
 }
 
+void __get_breakpoint(int nr, struct arch_hw_breakpoint *brk)
+{
+	memcpy(brk, this_cpu_ptr(&current_brk[nr]), sizeof(*brk));
+}
+
 void __set_breakpoint(int nr, struct arch_hw_breakpoint *brk)
 {
 	memcpy(this_cpu_ptr(&current_brk[nr]), brk, sizeof(*brk));
-- 
2.27.0


^ permalink raw reply related

* [PATCH v2 4/5] powerpc/lib: Add LKDTM accessor for patching addr
From: Christopher M. Riedl @ 2020-07-09  4:03 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: kernel-hardening
In-Reply-To: <20200709040316.12789-1-cmr@informatik.wtf>

When live patching a STRICT_RWX kernel, a mapping is installed at a
"patching address" with temporary write permissions. Provide a
LKDTM-only accessor function for this address in preparation for a LKDTM
test which attempts to "hijack" this mapping by writing to it from
another CPU.

Signed-off-by: Christopher M. Riedl <cmr@informatik.wtf>
---
 arch/powerpc/include/asm/code-patching.h | 4 ++++
 arch/powerpc/lib/code-patching.c         | 7 +++++++
 2 files changed, 11 insertions(+)

diff --git a/arch/powerpc/include/asm/code-patching.h b/arch/powerpc/include/asm/code-patching.h
index eacc9102c251..ffc6dfdbbf8e 100644
--- a/arch/powerpc/include/asm/code-patching.h
+++ b/arch/powerpc/include/asm/code-patching.h
@@ -187,4 +187,8 @@ static inline unsigned long ppc_kallsyms_lookup_name(const char *name)
 				 ___PPC_RA(__REG_R1) | PPC_LR_STKOFF)
 #endif /* CONFIG_PPC64 */
 
+#ifdef CONFIG_LKDTM
+unsigned long read_cpu_patching_addr(unsigned int cpu);
+#endif
+
 #endif /* _ASM_POWERPC_CODE_PATCHING_H */
diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index 80fe3864f377..a12db2092947 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -51,6 +51,13 @@ int raw_patch_instruction(struct ppc_inst *addr, struct ppc_inst instr)
 static struct mm_struct *patching_mm __ro_after_init;
 static unsigned long patching_addr __ro_after_init;
 
+#ifdef CONFIG_LKDTM
+unsigned long read_cpu_patching_addr(unsigned int cpu)
+{
+	return patching_addr;
+}
+#endif
+
 void __init poking_init(void)
 {
 	spinlock_t *ptl; /* for protecting pte table */
-- 
2.27.0


^ permalink raw reply related

* Re: [PATCH v2 09/10] tools/perf: Add perf tools support for extended register capability in powerpc
From: Athira Rajeev @ 2020-07-09  3:10 UTC (permalink / raw)
  To: Michael Ellerman; +Cc: mikey, maddy, linuxppc-dev
In-Reply-To: <87pn962owo.fsf@mpe.ellerman.id.au>

[-- Attachment #1: Type: text/plain, Size: 6308 bytes --]



> On 08-Jul-2020, at 5:34 PM, Michael Ellerman <mpe@ellerman.id.au> wrote:
> 
> Athira Rajeev <atrajeev@linux.vnet.ibm.com <mailto:atrajeev@linux.vnet.ibm.com>> writes:
>> From: Anju T Sudhakar <anju@linux.vnet.ibm.com>
>> 
>> Add extended regs to sample_reg_mask in the tool side to use
>> with `-I?` option. Perf tools side uses extended mask to display
>> the platform supported register names (with -I? option) to the user
>> and also send this mask to the kernel to capture the extended registers
>> in each sample. Hence decide the mask value based on the processor
>> version.
>> 
>> Signed-off-by: Anju T Sudhakar <anju@linux.vnet.ibm.com>
>> [Decide extended mask at run time based on platform]
>> Signed-off-by: Athira Rajeev <atrajeev@linux.vnet.ibm.com>
>> Reviewed-by: Madhavan Srinivasan <maddy@linux.vnet.ibm.com>
> 
> Will need an ack from perf tools folks, who are not on Cc by the looks.
> 

Yes, my bad. Will make sure to add proper Cc 

>> diff --git a/tools/arch/powerpc/include/uapi/asm/perf_regs.h b/tools/arch/powerpc/include/uapi/asm/perf_regs.h
>> index f599064..485b1d5 100644
>> --- a/tools/arch/powerpc/include/uapi/asm/perf_regs.h
>> +++ b/tools/arch/powerpc/include/uapi/asm/perf_regs.h
>> @@ -48,6 +48,18 @@ enum perf_event_powerpc_regs {
>> 	PERF_REG_POWERPC_DSISR,
>> 	PERF_REG_POWERPC_SIER,
>> 	PERF_REG_POWERPC_MMCRA,
>> -	PERF_REG_POWERPC_MAX,
>> +	/* Extended registers */
>> +	PERF_REG_POWERPC_MMCR0,
>> +	PERF_REG_POWERPC_MMCR1,
>> +	PERF_REG_POWERPC_MMCR2,
>> +	/* Max regs without the extended regs */
>> +	PERF_REG_POWERPC_MAX = PERF_REG_POWERPC_MMCRA + 1,
> 
> I don't really understand this idea of a max that's not the max.
> 
>> };
>> +
>> +#define PERF_REG_PMU_MASK	((1ULL << PERF_REG_POWERPC_MAX) - 1)
>> +
>> +/* PERF_REG_EXTENDED_MASK value for CPU_FTR_ARCH_300 */
>> +#define PERF_REG_PMU_MASK_300   (((1ULL << (PERF_REG_POWERPC_MMCR2 + 1)) - 1) \
>> +				- PERF_REG_PMU_MASK)
>> +
>> #endif /* _UAPI_ASM_POWERPC_PERF_REGS_H */
>> diff --git a/tools/perf/arch/powerpc/include/perf_regs.h b/tools/perf/arch/powerpc/include/perf_regs.h
>> index e18a355..46ed00d 100644
>> --- a/tools/perf/arch/powerpc/include/perf_regs.h
>> +++ b/tools/perf/arch/powerpc/include/perf_regs.h
>> @@ -64,7 +64,10 @@
>> 	[PERF_REG_POWERPC_DAR] = "dar",
>> 	[PERF_REG_POWERPC_DSISR] = "dsisr",
>> 	[PERF_REG_POWERPC_SIER] = "sier",
>> -	[PERF_REG_POWERPC_MMCRA] = "mmcra"
>> +	[PERF_REG_POWERPC_MMCRA] = "mmcra",
>> +	[PERF_REG_POWERPC_MMCR0] = "mmcr0",
>> +	[PERF_REG_POWERPC_MMCR1] = "mmcr1",
>> +	[PERF_REG_POWERPC_MMCR2] = "mmcr2",
>> };
>> 
>> static inline const char *perf_reg_name(int id)
>> diff --git a/tools/perf/arch/powerpc/util/perf_regs.c b/tools/perf/arch/powerpc/util/perf_regs.c
>> index 0a52429..9179230 100644
>> --- a/tools/perf/arch/powerpc/util/perf_regs.c
>> +++ b/tools/perf/arch/powerpc/util/perf_regs.c
>> @@ -6,9 +6,14 @@
>> 
>> #include "../../../util/perf_regs.h"
>> #include "../../../util/debug.h"
>> +#include "../../../util/event.h"
>> +#include "../../../util/header.h"
>> +#include "../../../perf-sys.h"
>> 
>> #include <linux/kernel.h>
>> 
>> +#define PVR_POWER9		0x004E
>> +
>> const struct sample_reg sample_reg_masks[] = {
>> 	SMPL_REG(r0, PERF_REG_POWERPC_R0),
>> 	SMPL_REG(r1, PERF_REG_POWERPC_R1),
>> @@ -55,6 +60,9 @@
>> 	SMPL_REG(dsisr, PERF_REG_POWERPC_DSISR),
>> 	SMPL_REG(sier, PERF_REG_POWERPC_SIER),
>> 	SMPL_REG(mmcra, PERF_REG_POWERPC_MMCRA),
>> +	SMPL_REG(mmcr0, PERF_REG_POWERPC_MMCR0),
>> +	SMPL_REG(mmcr1, PERF_REG_POWERPC_MMCR1),
>> +	SMPL_REG(mmcr2, PERF_REG_POWERPC_MMCR2),
>> 	SMPL_REG_END
>> };
>> 
>> @@ -163,3 +171,50 @@ int arch_sdt_arg_parse_op(char *old_op, char **new_op)
>> 
>> 	return SDT_ARG_VALID;
>> }
>> +
>> +uint64_t arch__intr_reg_mask(void)
>> +{
>> +	struct perf_event_attr attr = {
>> +		.type                   = PERF_TYPE_HARDWARE,
>> +		.config                 = PERF_COUNT_HW_CPU_CYCLES,
>> +		.sample_type            = PERF_SAMPLE_REGS_INTR,
>> +		.precise_ip             = 1,
>> +		.disabled               = 1,
>> +		.exclude_kernel         = 1,
>> +	};
>> +	int fd, ret;
>> +	char buffer[64];
>> +	u32 version;
>> +	u64 extended_mask = 0;
>> +
>> +	/* Get the PVR value to set the extended
>> +	 * mask specific to platform
> 
> Comment format is wrong, and punctuation please.
> 
>> +	 */
>> +	get_cpuid(buffer, sizeof(buffer));
>> +	ret = sscanf(buffer, "%u,", &version);
> 
> This is powerpc specific code, why not just use mfspr(SPRN_PVR), rather
> than redirecting via printf/sscanf.

Hi Michael

For perf tools, defines for `mfspr` , `SPRN_PVR` are in arch/powerpc/util/header.c 
So I have re-used existing utility. Otherwise, we will need to include these defines here as well
Does that sounds good ?

> 
>> +
>> +	if (ret != 1) {
>> +		pr_debug("Failed to get the processor version, unable to output extended registers\n");
>> +		return PERF_REGS_MASK;
>> +	}
>> +
>> +	if (version == PVR_POWER9)
>> +		extended_mask = PERF_REG_PMU_MASK_300;
>> +	else
>> +		return PERF_REGS_MASK;
>> +
>> +	attr.sample_regs_intr = extended_mask;
>> +	attr.sample_period = 1;
>> +	event_attr_init(&attr);
>> +
>> +	/*
>> +	 * check if the pmu supports perf extended regs, before
>> +	 * returning the register mask to sample.
>> +	 */
>> +	fd = sys_perf_event_open(&attr, 0, -1, -1, 0);
>> +	if (fd != -1) {
>> +		close(fd);
>> +		return (extended_mask | PERF_REGS_MASK);
>> +	}
>> +	return PERF_REGS_MASK;
> 
> I think this would read a bit better like:
> 
> 	mask = PERF_REGS_MASK;
> 
> 	if (version == PVR_POWER9)
> 		extended_mask = PERF_REG_PMU_MASK_300;
>        else
>        	return mask;
> 
>        attr.sample_regs_intr = extended_mask;
>        attr.sample_period = 1;
>        event_attr_init(&attr);
> 
>        /*
>          * check if the pmu supports perf extended regs, before
>          * returning the register mask to sample.
>          */
>        fd = sys_perf_event_open(&attr, 0, -1, -1, 0);
>        if (fd != -1) {
>                close(fd);
>                mask |= extended_mask;
>        }
> 
> 	return mask;

Sure, I will try with this change

Thanks
Athira
> 
> 
> cheers


[-- Attachment #2: Type: text/html, Size: 43844 bytes --]

^ permalink raw reply

* Re: [PATCH v2 07/10] powerpc/perf: support BHRB disable bit and new filtering modes
From: Athira Rajeev @ 2020-07-09  2:43 UTC (permalink / raw)
  To: Michael Ellerman; +Cc: Michael Neuling, maddy, linuxppc-dev
In-Reply-To: <87v9iy2pyt.fsf@mpe.ellerman.id.au>

[-- Attachment #1: Type: text/plain, Size: 10270 bytes --]



> On 08-Jul-2020, at 5:12 PM, Michael Ellerman <mpe@ellerman.id.au> wrote:
> 
> Athira Rajeev <atrajeev@linux.vnet.ibm.com <mailto:atrajeev@linux.vnet.ibm.com>> writes:
> 
>> PowerISA v3.1 has few updates for the Branch History Rolling Buffer(BHRB).
>                   ^
>                   a
>> First is the addition of BHRB disable bit and second new filtering
>                                                      ^
>                                                      is
>> modes for BHRB.
>> 
>> BHRB disable is controlled via Monitor Mode Control Register A (MMCRA)
>> bit 26, namely "BHRB Recording Disable (BHRBRD)". This field controls
> 
> Most people call that bit 37.
> 
>> whether BHRB entries are written when BHRB recording is enabled by other
>> bits. Patch implements support for this BHRB disable bit.
>       ^
>       This
> 
>> Secondly PowerISA v3.1 introduce filtering support for
> 
> .. that should be in a separate patch please.
> 
>> PERF_SAMPLE_BRANCH_IND_CALL/COND. The patch adds BHRB filter support
>                                    ^
>                                    This
>> for "ind_call" and "cond" in power10_bhrb_filter_map().
>> 
>> 'commit bb19af816025 ("powerpc/perf: Prevent kernel address leak to userspace via BHRB buffer")'
> 
> That doesn't need single quotes, and should be wrapped at 72 columns
> like the rest of the text.
> 
>> added a check in bhrb_read() to filter the kernel address from BHRB buffer. Patch here modified
>> it to avoid that check for PowerISA v3.1 based processors, since PowerISA v3.1 allows
>> only MSR[PR]=1 address to be written to BHRB buffer.
> 
> And that should be a separate patch again please.

Sure, I will split these to separate patches

> 
>> Signed-off-by: Athira Rajeev <atrajeev@linux.vnet.ibm.com>
>> ---
>> arch/powerpc/perf/core-book3s.c       | 27 +++++++++++++++++++++------
>> arch/powerpc/perf/isa207-common.c     | 13 +++++++++++++
>> arch/powerpc/perf/power10-pmu.c       | 13 +++++++++++--
>> arch/powerpc/platforms/powernv/idle.c | 14 ++++++++++++++
>> 4 files changed, 59 insertions(+), 8 deletions(-)
>> 
>> diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
>> index fad5159..9709606 100644
>> --- a/arch/powerpc/perf/core-book3s.c
>> +++ b/arch/powerpc/perf/core-book3s.c
>> @@ -466,9 +466,13 @@ static void power_pmu_bhrb_read(struct perf_event *event, struct cpu_hw_events *
>> 			 * addresses at this point. Check the privileges before
>> 			 * exporting it to userspace (avoid exposure of regions
>> 			 * where we could have speculative execution)
>> +			 * Incase of ISA 310, BHRB will capture only user-space
>                           ^
>                           In case of ISA v3.1,

Ok, 
> 
>> +			 * address,hence include a check before filtering code
>                           ^                                                  ^
>                           addresses, hence                                   .
>> 			 */
>> -			if (is_kernel_addr(addr) && perf_allow_kernel(&event->attr) != 0)
>> -				continue;
>> +			if (!(ppmu->flags & PPMU_ARCH_310S))
>> +				if (is_kernel_addr(addr) &&
>> +				perf_allow_kernel(&event->attr) != 0)
>> +					continue;
> 
> The indentation is weird. You should just check all three conditions
> with &&.

Ok, will correct this.
> 
>> 
>> 			/* Branches are read most recent first (ie. mfbhrb 0 is
>> 			 * the most recent branch).
>> @@ -1212,7 +1216,7 @@ static void write_mmcr0(struct cpu_hw_events *cpuhw, unsigned long mmcr0)
>> static void power_pmu_disable(struct pmu *pmu)
>> {
>> 	struct cpu_hw_events *cpuhw;
>> -	unsigned long flags, mmcr0, val;
>> +	unsigned long flags, mmcr0, val, mmcra = 0;
> 
> You initialise it below.
> 
>> 	if (!ppmu)
>> 		return;
>> @@ -1245,12 +1249,23 @@ static void power_pmu_disable(struct pmu *pmu)
>> 		mb();
>> 		isync();
>> 
>> +		val = mmcra = cpuhw->mmcr[2];
>> +
> 
> For mmcr0 (above), val is the variable we mutate and mmcr0 is the
> original value. But here you've done the reverse, which is confusing.

Yes, I am altering mmcra here and using val as original value. I should have done it reverse.

> 
>> 		/*
>> 		 * Disable instruction sampling if it was enabled
>> 		 */
>> -		if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) {
>> -			mtspr(SPRN_MMCRA,
>> -			      cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
>> +		if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE)
>> +			mmcra = cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE;
> 
> You just loaded cpuhw->mmcr[2] into mmcra, use it rather than referring
> back to cpuhw->mmcr[2] over and over.
> 

Ok,
>> +
>> +		/* Disable BHRB via mmcra [:26] for p10 if needed */
>> +		if (!(cpuhw->mmcr[2] & MMCRA_BHRB_DISABLE))
> 
> You don't need to check that it's clear AFAICS. Just always set disable
> and the check against val below will catch the nop case.

My thought here was to avoid writing to MMCRA ( also avoid mb() and isync() ) if its not needed.
But as you suggested, since I am comparing against original value before writing, I may not need this check.
And I missed feature check here. Will correct it.
 
> 
>> +			mmcra |= MMCRA_BHRB_DISABLE;
>> +
>> +		/* Write SPRN_MMCRA if mmcra has either disabled
> 
> Comment format is wrong.
> 
>> +		 * instruction sampling or BHRB
> 
> Full stop please.

Sure
> 
>> +		 */
>> +		if (val != mmcra) {
>> +			mtspr(SPRN_MMCRA, mmcra);
>> 			mb();
>> 			isync();
>> 		}
>> diff --git a/arch/powerpc/perf/isa207-common.c b/arch/powerpc/perf/isa207-common.c
>> index 7d4839e..463d925 100644
>> --- a/arch/powerpc/perf/isa207-common.c
>> +++ b/arch/powerpc/perf/isa207-common.c
>> @@ -404,6 +404,12 @@ int isa207_compute_mmcr(u64 event[], int n_ev,
>> 
>> 	mmcra = mmcr1 = mmcr2 = mmcr3 = 0;
>> 
>> +	/* Disable bhrb unless explicitly requested
>> +	 * by setting MMCRA [:26] bit.
>> +	 */
> 
> Comment format again.
> 
>> +	if (cpu_has_feature(CPU_FTR_ARCH_31))
>> +		mmcra |= MMCRA_BHRB_DISABLE;
> 
> Here we do a feature check before setting MMCRA_BHRB_DISABLE, but you
> didn't above?
> 
>> +
>> 	/* Second pass: assign PMCs, set all MMCR1 fields */
>> 	for (i = 0; i < n_ev; ++i) {
>> 		pmc     = (event[i] >> EVENT_PMC_SHIFT) & EVENT_PMC_MASK;
>> @@ -475,10 +481,17 @@ int isa207_compute_mmcr(u64 event[], int n_ev,
>> 		}
>> 
>> 		if (event[i] & EVENT_WANTS_BHRB) {
>> +			/* set MMCRA[:26] to 0 for Power10 to enable BHRB */
> 
> "set MMCRA[:26] to 0" == "clear MMCRA[:26]”
> 
Ok

>> +			if (cpu_has_feature(CPU_FTR_ARCH_31))
>> +				mmcra &= ~MMCRA_BHRB_DISABLE;
> 
> Newline please.
> 
>> 			val = (event[i] >> EVENT_IFM_SHIFT) & EVENT_IFM_MASK;
>> 			mmcra |= val << MMCRA_IFM_SHIFT;
>> 		}
>> 
>> +		/* set MMCRA[:26] to 0 if there is user request for BHRB */
>> +		if (cpu_has_feature(CPU_FTR_ARCH_31) && has_branch_stack(pevents[i]))
>> +			mmcra &= ~MMCRA_BHRB_DISABLE;
>> +
> 
> I think it would be cleaner if you did a single test, eg:
> 
> 		if (cpu_has_feature(CPU_FTR_ARCH_31) &&
>                   (has_branch_stack(pevents[i]) || (event[i] & EVENT_WANTS_BHRB)))
> 			mmcra &= ~MMCRA_BHRB_DISABLE;

Sure Michael

Thanks for the review. I will address all these changes in the next version

Thanks
Athira 
> 
>> 		if (pevents[i]->attr.exclude_user)
>> 			mmcr2 |= MMCR2_FCP(pmc);
>> 
>> diff --git a/arch/powerpc/perf/power10-pmu.c b/arch/powerpc/perf/power10-pmu.c
>> index d64d69d..07fb919 100644
>> --- a/arch/powerpc/perf/power10-pmu.c
>> +++ b/arch/powerpc/perf/power10-pmu.c
>> @@ -82,6 +82,8 @@
>> 
>> /* MMCRA IFM bits - POWER10 */
>> #define POWER10_MMCRA_IFM1		0x0000000040000000UL
>> +#define POWER10_MMCRA_IFM2		0x0000000080000000UL
>> +#define POWER10_MMCRA_IFM3		0x00000000C0000000UL
>> #define POWER10_MMCRA_BHRB_MASK	0x00000000C0000000UL
>> 
>> /* Table of alternatives, sorted by column 0 */
>> @@ -233,8 +235,15 @@ static u64 power10_bhrb_filter_map(u64 branch_sample_type)
>> 	if (branch_sample_type & PERF_SAMPLE_BRANCH_ANY_RETURN)
>> 		return -1;
>> 
>> -	if (branch_sample_type & PERF_SAMPLE_BRANCH_IND_CALL)
>> -		return -1;
>> +	if (branch_sample_type & PERF_SAMPLE_BRANCH_IND_CALL) {
>> +		pmu_bhrb_filter |= POWER10_MMCRA_IFM2;
>> +		return pmu_bhrb_filter;
>> +	}
>> +
>> +	if (branch_sample_type & PERF_SAMPLE_BRANCH_COND) {
>> +		pmu_bhrb_filter |= POWER10_MMCRA_IFM3;
>> +		return pmu_bhrb_filter;
>> +	}
>> 
>> 	if (branch_sample_type & PERF_SAMPLE_BRANCH_CALL)
>> 		return -1;
>> diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c
>> index 2dd4673..7db99c7 100644
>> --- a/arch/powerpc/platforms/powernv/idle.c
>> +++ b/arch/powerpc/platforms/powernv/idle.c
>> @@ -611,6 +611,7 @@ static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on)
>> 	unsigned long srr1;
>> 	unsigned long pls;
>> 	unsigned long mmcr0 = 0;
>> +	unsigned long mmcra_bhrb = 0;
>> 	struct p9_sprs sprs = {}; /* avoid false used-uninitialised */
>> 	bool sprs_saved = false;
>> 
>> @@ -657,6 +658,15 @@ static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on)
>> 		  */
>> 		mmcr0		= mfspr(SPRN_MMCR0);
>> 	}
>> +
>> +	if (cpu_has_feature(CPU_FTR_ARCH_31)) {
>> +		/* POWER10 uses MMCRA[:26] as BHRB disable bit
> 
> Comment format.
> 
>> +		 * to disable BHRB logic when not used. Hence Save and
>> +		 * restore MMCRA after a state-loss idle.
>> +		 */
>> +		mmcra_bhrb		= mfspr(SPRN_MMCRA);
>> +	}
> 
> It's the whole mmcra it should be called mmcra?

Yes, we are saving the whole mmcra. 
> 
>> +
>> 	if ((psscr & PSSCR_RL_MASK) >= pnv_first_spr_loss_level) {
>> 		sprs.lpcr	= mfspr(SPRN_LPCR);
>> 		sprs.hfscr	= mfspr(SPRN_HFSCR);
>> @@ -721,6 +731,10 @@ static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on)
>> 			mtspr(SPRN_MMCR0, mmcr0);
>> 		}
>> 
>> +		/* Reload MMCRA to restore BHRB disable bit for POWER10 */
>> +		if (cpu_has_feature(CPU_FTR_ARCH_31))
>> +			mtspr(SPRN_MMCRA, mmcra_bhrb);
>> +
>> 		/*
>> 		 * DD2.2 and earlier need to set then clear bit 60 in MMCRA
>> 		 * to ensure the PMU starts running.
> 
> 
> cheers


[-- Attachment #2: Type: text/html, Size: 85847 bytes --]

^ permalink raw reply

* Re: [PATCH v2 07/10] powerpc/perf: support BHRB disable bit and new filtering modes
From: Athira Rajeev @ 2020-07-09  2:01 UTC (permalink / raw)
  To: ego; +Cc: Vaidyanathan Srinivasan, Michael Neuling, maddy, linuxppc-dev
In-Reply-To: <20200708074315.GA21370@in.ibm.com>

[-- Attachment #1: Type: text/plain, Size: 4609 bytes --]



> On 08-Jul-2020, at 1:13 PM, Gautham R Shenoy <ego@linux.vnet.ibm.com> wrote:
> 
> On Tue, Jul 07, 2020 at 05:17:55PM +1000, Michael Neuling wrote:
>> On Wed, 2020-07-01 at 05:20 -0400, Athira Rajeev wrote:
>>> PowerISA v3.1 has few updates for the Branch History Rolling Buffer(BHRB).
>>> First is the addition of BHRB disable bit and second new filtering
>>> modes for BHRB.
>>> 
>>> BHRB disable is controlled via Monitor Mode Control Register A (MMCRA)
>>> bit 26, namely "BHRB Recording Disable (BHRBRD)". This field controls
>>> whether BHRB entries are written when BHRB recording is enabled by other
>>> bits. Patch implements support for this BHRB disable bit.
>> 
>> Probably good to note here that this is backwards compatible. So if you have a
>> kernel that doesn't know about this bit, it'll clear it and hence you still get
>> BHRB. 
>> 
>> You should also note why you'd want to do disable this (ie. the core will run
>> faster).
>> 
>>> Secondly PowerISA v3.1 introduce filtering support for
>>> PERF_SAMPLE_BRANCH_IND_CALL/COND. The patch adds BHRB filter support
>>> for "ind_call" and "cond" in power10_bhrb_filter_map().
>>> 
>>> 'commit bb19af816025 ("powerpc/perf: Prevent kernel address leak to userspace
>>> via BHRB buffer")'
>>> added a check in bhrb_read() to filter the kernel address from BHRB buffer.
>>> Patch here modified
>>> it to avoid that check for PowerISA v3.1 based processors, since PowerISA v3.1
>>> allows
>>> only MSR[PR]=1 address to be written to BHRB buffer.
>>> 
>>> Signed-off-by: Athira Rajeev <atrajeev@linux.vnet.ibm.com>
>>> ---
>>> arch/powerpc/perf/core-book3s.c       | 27 +++++++++++++++++++++------
>>> arch/powerpc/perf/isa207-common.c     | 13 +++++++++++++
>>> arch/powerpc/perf/power10-pmu.c       | 13 +++++++++++--
>>> arch/powerpc/platforms/powernv/idle.c | 14 ++++++++++++++
>> 
>> This touches the idle code so we should get those guys on CC (adding Vaidy and
>> Ego).
>> 
>>> 4 files changed, 59 insertions(+), 8 deletions(-)
>>> 
> 
> [..snip..]
> 
> 
>>> diff --git a/arch/powerpc/platforms/powernv/idle.c
>>> b/arch/powerpc/platforms/powernv/idle.c
>>> index 2dd4673..7db99c7 100644
>>> --- a/arch/powerpc/platforms/powernv/idle.c
>>> +++ b/arch/powerpc/platforms/powernv/idle.c
>>> @@ -611,6 +611,7 @@ static unsigned long power9_idle_stop(unsigned long psscr,
>>> bool mmu_on)
>>> 	unsigned long srr1;
>>> 	unsigned long pls;
>>> 	unsigned long mmcr0 = 0;
>>> +	unsigned long mmcra_bhrb = 0;
> 
> We are saving the whole of MMCRA aren't we ? We might want to just
> name it mmcra in that case.
> 
>>> 	struct p9_sprs sprs = {}; /* avoid false used-uninitialised */
>>> 	bool sprs_saved = false;
>>> 
>>> @@ -657,6 +658,15 @@ static unsigned long power9_idle_stop(unsigned long
>>> psscr, bool mmu_on)
>>> 		  */
>>> 		mmcr0		= mfspr(SPRN_MMCR0);
>>> 	}
>>> +
>>> +	if (cpu_has_feature(CPU_FTR_ARCH_31)) {
>>> +		/* POWER10 uses MMCRA[:26] as BHRB disable bit
>>> +		 * to disable BHRB logic when not used. Hence Save and
>>> +		 * restore MMCRA after a state-loss idle.
>>> +		 */
> 
> Multi-line comment usually has the first line blank.

Hi Gautham

Thanks for checking. I will change the comment format
Yes, we are saving whole of MMCRA. 
> 
> 		/*
> 	         * Line 1
> 		 * Line 2
> 		 * .
> 		 * .
> 		 * .
> 		 * Line N
> 		 */
> 
>>> +		mmcra_bhrb		= mfspr(SPRN_MMCRA);
>> 
>> 
>> Why is the bhrb bit of mmcra special here?
> 
> The comment above could include the consequence of not saving and
> restoring MMCRA i.e
> 
> - If the user hasn't asked for the BHRB to be
>  written the value of MMCRA[BHRBD] = 1.
> 
> - On wakeup from stop, MMCRA[BHRBD] will be 0, since MMCRA is not a
>  previleged resource and will be lost.
> 
> - Thus, if we do not save and restore the MMCRA[BHRBD], the hardware
>  will be needlessly writing to the BHRB in the problem mode.
> 
>> 
>>> +	}
>>> +
>>> 	if ((psscr & PSSCR_RL_MASK) >= pnv_first_spr_loss_level) {
>>> 		sprs.lpcr	= mfspr(SPRN_LPCR);
>>> 		sprs.hfscr	= mfspr(SPRN_HFSCR);
>>> @@ -721,6 +731,10 @@ static unsigned long power9_idle_stop(unsigned long
>>> psscr, bool mmu_on)
>>> 			mtspr(SPRN_MMCR0, mmcr0);
>>> 		}
>>> 
>>> +		/* Reload MMCRA to restore BHRB disable bit for POWER10 */
>>> +		if (cpu_has_feature(CPU_FTR_ARCH_31))
>>> +			mtspr(SPRN_MMCRA, mmcra_bhrb);
>>> +
>>> 		/*
>>> 		 * DD2.2 and earlier need to set then clear bit 60 in MMCRA
>>> 		 * to ensure the PMU starts running.
>> 
> 
> --
> Thanks and Regards
> gautham.


[-- Attachment #2: Type: text/html, Size: 39451 bytes --]

^ permalink raw reply

* Re: [PATCH v2 03/10] powerpc/xmon: Add PowerISA v3.1 PMU SPRs
From: Athira Rajeev @ 2020-07-09  1:57 UTC (permalink / raw)
  To: Michael Ellerman; +Cc: Michael Neuling, maddy, linuxppc-dev
In-Reply-To: <871rlm469d.fsf@mpe.ellerman.id.au>

[-- Attachment #1: Type: text/plain, Size: 1428 bytes --]



> On 08-Jul-2020, at 4:34 PM, Michael Ellerman <mpe@ellerman.id.au> wrote:
> 
> Athira Rajeev <atrajeev@linux.vnet.ibm.com <mailto:atrajeev@linux.vnet.ibm.com>> writes:
>> From: Madhavan Srinivasan <maddy@linux.ibm.com>
>> 
>> PowerISA v3.1 added three new perfromance
>> monitoring unit (PMU) speical purpose register (SPR).
>> They are Monitor Mode Control Register 3 (MMCR3),
>> Sampled Instruction Event Register 2 (SIER2),
>> Sampled Instruction Event Register 3 (SIER3).
>> 
>> Patch here adds a new dump function dump_310_sprs
>> to print these SPR values.
>> 
>> Signed-off-by: Madhavan Srinivasan <maddy@linux.ibm.com>
>> ---
>> arch/powerpc/xmon/xmon.c | 15 +++++++++++++++
>> 1 file changed, 15 insertions(+)
>> 
>> diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
>> index 7efe4bc..8917fe8 100644
>> --- a/arch/powerpc/xmon/xmon.c
>> +++ b/arch/powerpc/xmon/xmon.c
>> @@ -2022,6 +2022,20 @@ static void dump_300_sprs(void)
>> #endif
>> }
>> 
>> +static void dump_310_sprs(void)
>> +{
>> +#ifdef CONFIG_PPC64
>> +	if (!cpu_has_feature(CPU_FTR_ARCH_31))
>> +		return;
>> +
>> +	printf("mmcr3  = %.16lx\n",
>> +		mfspr(SPRN_MMCR3));
>> +
>> +	printf("sier2  = %.16lx  sier3  = %.16lx\n",
>> +		mfspr(SPRN_SIER2), mfspr(SPRN_SIER3));
> 
> Why not all on one line like many of the others?

Sure, will change this to one line

Thanks
Athira
> 
> cheers


[-- Attachment #2: Type: text/html, Size: 6580 bytes --]

^ permalink raw reply

* Re: [PATCH V4 0/3] arm64: Enable vmemmap mapping from device memory
From: Anshuman Khandual @ 2020-07-09  3:30 UTC (permalink / raw)
  To: linux-mm, Andrew Morton
  Cc: Mark Rutland, Michal Hocko, linux-ia64, David Hildenbrand,
	Peter Zijlstra, catalin.marinas, Dave Hansen, Paul Mackerras,
	linux-riscv, Will Deacon, Thomas Gleixner, justin.he, x86,
	Matthew Wilcox (Oracle), Mike Rapoport, Ingo Molnar, Fenghua Yu,
	Pavel Tatashin, Andy Lutomirski, Paul Walmsley, Dan Williams,
	linux-arm-kernel, Tony Luck, linux-kernel, Palmer Dabbelt, akpm,
	linuxppc-dev, Kirill A. Shutemov
In-Reply-To: <1594004178-8861-1-git-send-email-anshuman.khandual@arm.com>



On 07/06/2020 08:26 AM, Anshuman Khandual wrote:
> This series enables vmemmap backing memory allocation from device memory
> ranges on arm64. But before that, it enables vmemmap_populate_basepages()
> and vmemmap_alloc_block_buf() to accommodate struct vmem_altmap based
> alocation requests.
> 
> This series applies on 5.8-rc4.
> 
> Changes in V4:
> 
> - Dropped 'fallback' from vmemmap_alloc_block_buf() per Catalin

Hello Andrew,

This series has been a long running one :) Now that all the three patches
here have been reviewed, could you please consider this series for merging
towards 5.9-rc1. Catalin had suggested earlier [1] that it should go via
the MM tree instead, as it touches multiple architecture. Thank you.

[1] https://patchwork.kernel.org/patch/11611103/

- Anshuman

^ permalink raw reply

* [PATCH v6 23/23] powerpc/book3s64/pkeys: Remove is_pkey_enabled()
From: Aneesh Kumar K.V @ 2020-07-09  3:29 UTC (permalink / raw)
  To: linuxppc-dev, mpe; +Cc: Aneesh Kumar K.V, linuxram, bauerman
In-Reply-To: <20200709032946.881753-1-aneesh.kumar@linux.ibm.com>

There is only one caller to this function and the function is wrongly
named. Avoid further confusion w.r.t name and open code this at the
only call site. Also remove read_uamor(). There are no users for
the same after this.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 arch/powerpc/mm/book3s64/pkeys.c | 31 +++++++++++--------------------
 1 file changed, 11 insertions(+), 20 deletions(-)

diff --git a/arch/powerpc/mm/book3s64/pkeys.c b/arch/powerpc/mm/book3s64/pkeys.c
index 480ae31fad52..a4768c694720 100644
--- a/arch/powerpc/mm/book3s64/pkeys.c
+++ b/arch/powerpc/mm/book3s64/pkeys.c
@@ -251,25 +251,6 @@ static inline void write_iamr(u64 value)
 	mtspr(SPRN_IAMR, value);
 }
 
-static inline u64 read_uamor(void)
-{
-	return mfspr(SPRN_UAMOR);
-}
-
-static bool is_pkey_enabled(int pkey)
-{
-	u64 uamor = read_uamor();
-	u64 pkey_bits = 0x3ul << pkeyshift(pkey);
-	u64 uamor_pkey_bits = (uamor & pkey_bits);
-
-	/*
-	 * Both the bits in UAMOR corresponding to the key should be set or
-	 * reset.
-	 */
-	WARN_ON(uamor_pkey_bits && (uamor_pkey_bits != pkey_bits));
-	return !!(uamor_pkey_bits);
-}
-
 static inline void init_amr(int pkey, u8 init_bits)
 {
 	u64 new_amr_bits = (((u64)init_bits & 0x3UL) << pkeyshift(pkey));
@@ -295,8 +276,18 @@ int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
 {
 	u64 new_amr_bits = 0x0ul;
 	u64 new_iamr_bits = 0x0ul;
+	u64 pkey_bits, uamor_pkey_bits;
 
-	if (!is_pkey_enabled(pkey))
+	/*
+	 * Check whether the key is disabled by UAMOR.
+	 */
+	pkey_bits = 0x3ul << pkeyshift(pkey);
+	uamor_pkey_bits = (default_uamor & pkey_bits);
+
+	/*
+	 * Both the bits in UAMOR corresponding to the key should be set
+	 */
+	if (uamor_pkey_bits != pkey_bits)
 		return -EINVAL;
 
 	if (init_val & PKEY_DISABLE_EXECUTE) {
-- 
2.26.2


^ permalink raw reply related

* [PATCH v6 22/23] powerpc/selftest/ptrace-pkey: Don't update expected UAMOR value
From: Aneesh Kumar K.V @ 2020-07-09  3:29 UTC (permalink / raw)
  To: linuxppc-dev, mpe; +Cc: Aneesh Kumar K.V, linuxram, bauerman
In-Reply-To: <20200709032946.881753-1-aneesh.kumar@linux.ibm.com>

with commit: 4a4a5e5d2aad ("powerpc/pkeys: key allocation/deallocation must not change pkey registers")
we are not updating UAMOR on key allocation. So don't update the
expected uamor value in the test.

Fixes: 4a4a5e5d2aad ("powerpc/pkeys: key allocation/deallocation must not change pkey registers")
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 tools/testing/selftests/powerpc/ptrace/ptrace-pkey.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-pkey.c b/tools/testing/selftests/powerpc/ptrace/ptrace-pkey.c
index bc33d748d95b..3694613f418f 100644
--- a/tools/testing/selftests/powerpc/ptrace/ptrace-pkey.c
+++ b/tools/testing/selftests/powerpc/ptrace/ptrace-pkey.c
@@ -101,15 +101,20 @@ static int child(struct shared_info *info)
 	 */
 	info->invalid_amr = info->amr2 | (~0x0UL & ~info->expected_uamor);
 
+	/*
+	 * if PKEY_DISABLE_EXECUTE succeeded we should update the expected_iamr
+	 */
 	if (disable_execute)
 		info->expected_iamr |= 1ul << pkeyshift(pkey1);
 	else
 		info->expected_iamr &= ~(1ul << pkeyshift(pkey1));
 
-	info->expected_iamr &= ~(1ul << pkeyshift(pkey2) | 1ul << pkeyshift(pkey3));
+	/*
+	 * We allocated pkey2 and pkey 3 above. Clear the IAMR bits.
+	 */
+	info->expected_iamr &= ~(1ul << pkeyshift(pkey2));
+	info->expected_iamr &= ~(1ul << pkeyshift(pkey3));
 
-	info->expected_uamor |= 3ul << pkeyshift(pkey1) |
-				3ul << pkeyshift(pkey2);
 	/*
 	 * Create an IAMR value different from expected value.
 	 * Kernel will reject an IAMR and UAMOR change.
-- 
2.26.2


^ permalink raw reply related

* [PATCH v6 21/23] powerpc/selftest/ptrace-pkey: Update the test to mark an invalid pkey correctly
From: Aneesh Kumar K.V @ 2020-07-09  3:29 UTC (permalink / raw)
  To: linuxppc-dev, mpe; +Cc: Aneesh Kumar K.V, linuxram, bauerman
In-Reply-To: <20200709032946.881753-1-aneesh.kumar@linux.ibm.com>

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 .../selftests/powerpc/ptrace/ptrace-pkey.c    | 30 ++++++++-----------
 1 file changed, 12 insertions(+), 18 deletions(-)

diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-pkey.c b/tools/testing/selftests/powerpc/ptrace/ptrace-pkey.c
index f9216c7a1829..bc33d748d95b 100644
--- a/tools/testing/selftests/powerpc/ptrace/ptrace-pkey.c
+++ b/tools/testing/selftests/powerpc/ptrace/ptrace-pkey.c
@@ -66,11 +66,6 @@ static int sys_pkey_alloc(unsigned long flags, unsigned long init_access_rights)
 	return syscall(__NR_pkey_alloc, flags, init_access_rights);
 }
 
-static int sys_pkey_free(int pkey)
-{
-	return syscall(__NR_pkey_free, pkey);
-}
-
 static int child(struct shared_info *info)
 {
 	unsigned long reg;
@@ -100,7 +95,11 @@ static int child(struct shared_info *info)
 
 	info->amr1 |= 3ul << pkeyshift(pkey1);
 	info->amr2 |= 3ul << pkeyshift(pkey2);
-	info->invalid_amr |= info->amr2 | 3ul << pkeyshift(pkey3);
+	/*
+	 * invalid amr value where we try to force write
+	 * things which are deined by a uamor setting.
+	 */
+	info->invalid_amr = info->amr2 | (~0x0UL & ~info->expected_uamor);
 
 	if (disable_execute)
 		info->expected_iamr |= 1ul << pkeyshift(pkey1);
@@ -111,17 +110,12 @@ static int child(struct shared_info *info)
 
 	info->expected_uamor |= 3ul << pkeyshift(pkey1) |
 				3ul << pkeyshift(pkey2);
-	info->invalid_iamr |= 1ul << pkeyshift(pkey1) | 1ul << pkeyshift(pkey2);
-	info->invalid_uamor |= 3ul << pkeyshift(pkey1);
-
 	/*
-	 * We won't use pkey3. We just want a plausible but invalid key to test
-	 * whether ptrace will let us write to AMR bits we are not supposed to.
-	 *
-	 * This also tests whether the kernel restores the UAMOR permissions
-	 * after a key is freed.
+	 * Create an IAMR value different from expected value.
+	 * Kernel will reject an IAMR and UAMOR change.
 	 */
-	sys_pkey_free(pkey3);
+	info->invalid_iamr = info->expected_iamr | (1ul << pkeyshift(pkey1) | 1ul << pkeyshift(pkey2));
+	info->invalid_uamor = info->expected_uamor & ~(0x3ul << pkeyshift(pkey1));
 
 	printf("%-30s AMR: %016lx pkey1: %d pkey2: %d pkey3: %d\n",
 	       user_write, info->amr1, pkey1, pkey2, pkey3);
@@ -196,9 +190,9 @@ static int parent(struct shared_info *info, pid_t pid)
 	PARENT_SKIP_IF_UNSUPPORTED(ret, &info->child_sync);
 	PARENT_FAIL_IF(ret, &info->child_sync);
 
-	info->amr1 = info->amr2 = info->invalid_amr = regs[0];
-	info->expected_iamr = info->invalid_iamr = regs[1];
-	info->expected_uamor = info->invalid_uamor = regs[2];
+	info->amr1 = info->amr2 = regs[0];
+	info->expected_iamr = regs[1];
+	info->expected_uamor = regs[2];
 
 	/* Wake up child so that it can set itself up. */
 	ret = prod_child(&info->child_sync);
-- 
2.26.2


^ permalink raw reply related

* [PATCH v6 19/23] powerpc/book3s64/kuap: Move UAMOR setup to key init function
From: Aneesh Kumar K.V @ 2020-07-09  3:29 UTC (permalink / raw)
  To: linuxppc-dev, mpe; +Cc: Aneesh Kumar K.V, linuxram, bauerman
In-Reply-To: <20200709032946.881753-1-aneesh.kumar@linux.ibm.com>

UAMOR values are not application-specific. The kernel initializes
its value based on different reserved keys. Remove the thread-specific
UAMOR value and don't switch the UAMOR on context switch.

Move UAMOR initialization to key initialization code and remove
thread_struct.uamor because it is not used anymore.

Before commit: 4a4a5e5d2aad ("powerpc/pkeys: key allocation/deallocation must not change pkey registers")
we used to update uamor based on key allocation and free.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 arch/powerpc/include/asm/book3s/64/pkeys.h |  2 ++
 arch/powerpc/include/asm/processor.h       |  1 -
 arch/powerpc/kernel/ptrace/ptrace-view.c   | 27 +++++++++++++++-----
 arch/powerpc/kernel/smp.c                  |  1 +
 arch/powerpc/mm/book3s64/hash_utils.c      |  4 +++
 arch/powerpc/mm/book3s64/pkeys.c           | 29 +++++++++-------------
 arch/powerpc/mm/book3s64/radix_pgtable.c   |  4 +++
 7 files changed, 44 insertions(+), 24 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/pkeys.h b/arch/powerpc/include/asm/book3s/64/pkeys.h
index 5b178139f3c0..b7d9f4267bcd 100644
--- a/arch/powerpc/include/asm/book3s/64/pkeys.h
+++ b/arch/powerpc/include/asm/book3s/64/pkeys.h
@@ -5,6 +5,8 @@
 
 #include <asm/book3s/64/hash-pkey.h>
 
+extern u64 __ro_after_init default_uamor;
+
 static inline u64 vmflag_to_pte_pkey_bits(u64 vm_flags)
 {
 	if (!mmu_has_feature(MMU_FTR_PKEY))
diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h
index 52a67835057a..6ac12168f1fe 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -237,7 +237,6 @@ struct thread_struct {
 #ifdef CONFIG_PPC_MEM_KEYS
 	unsigned long	amr;
 	unsigned long	iamr;
-	unsigned long	uamor;
 #endif
 #ifdef CONFIG_KVM_BOOK3S_32_HANDLER
 	void*		kvm_shadow_vcpu; /* KVM internal data */
diff --git a/arch/powerpc/kernel/ptrace/ptrace-view.c b/arch/powerpc/kernel/ptrace/ptrace-view.c
index caeb5822a8f4..ac7d480cb9c1 100644
--- a/arch/powerpc/kernel/ptrace/ptrace-view.c
+++ b/arch/powerpc/kernel/ptrace/ptrace-view.c
@@ -488,14 +488,21 @@ static int pkey_active(struct task_struct *target, const struct user_regset *reg
 static int pkey_get(struct task_struct *target, const struct user_regset *regset,
 		    unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf)
 {
+	int ret;
+
 	BUILD_BUG_ON(TSO(amr) + sizeof(unsigned long) != TSO(iamr));
-	BUILD_BUG_ON(TSO(iamr) + sizeof(unsigned long) != TSO(uamor));
 
 	if (!arch_pkeys_enabled())
 		return -ENODEV;
 
-	return user_regset_copyout(&pos, &count, &kbuf, &ubuf, &target->thread.amr,
-				   0, ELF_NPKEY * sizeof(unsigned long));
+	ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, &target->thread.amr,
+				  0, 2 * sizeof(unsigned long));
+	if (ret)
+		return ret;
+
+	ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, &default_uamor,
+				  2 * sizeof(unsigned long), 3 * sizeof(unsigned long));
+	return ret;
 }
 
 static int pkey_set(struct task_struct *target, const struct user_regset *regset,
@@ -517,9 +524,17 @@ static int pkey_set(struct task_struct *target, const struct user_regset *regset
 	if (ret)
 		return ret;
 
-	/* UAMOR determines which bits of the AMR can be set from userspace. */
-	target->thread.amr = (new_amr & target->thread.uamor) |
-			     (target->thread.amr & ~target->thread.uamor);
+	/*
+	 * UAMOR determines which bits of the AMR can be set from userspace.
+	 * UAMOR value 0b11 indicates that the AMR value can be modified
+	 * from userspace. If the kernel is using a specific key, we avoid
+	 * userspace modifying the AMR value for that key by masking them
+	 * via UAMOR 0b00.
+	 *
+	 * Pick the AMR values for the keys that kernel is using. This
+	 * will be indicated by the ~default_uamor bits.
+	 */
+	target->thread.amr = (new_amr & default_uamor) | (target->thread.amr & ~default_uamor);
 
 	return 0;
 }
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 73199470c265..8261999c7d52 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -59,6 +59,7 @@
 #include <asm/asm-prototypes.h>
 #include <asm/cpu_has_feature.h>
 #include <asm/ftrace.h>
+#include <asm/kup.h>
 
 #ifdef DEBUG
 #include <asm/udbg.h>
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c
index eec6f4e5e481..9dfb0ceed5e3 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -1110,6 +1110,10 @@ void hash__early_init_mmu_secondary(void)
 	if (cpu_has_feature(CPU_FTR_ARCH_206)
 			&& cpu_has_feature(CPU_FTR_HVMODE))
 		tlbiel_all();
+
+#ifdef CONFIG_PPC_MEM_KEYS
+	mtspr(SPRN_UAMOR, default_uamor);
+#endif
 }
 #endif /* CONFIG_SMP */
 
diff --git a/arch/powerpc/mm/book3s64/pkeys.c b/arch/powerpc/mm/book3s64/pkeys.c
index c682eefd3fc1..480ae31fad52 100644
--- a/arch/powerpc/mm/book3s64/pkeys.c
+++ b/arch/powerpc/mm/book3s64/pkeys.c
@@ -27,9 +27,7 @@ static u32 initial_allocation_mask __ro_after_init;
  */
 static u64 default_amr = ~0x0UL;
 static u64 default_iamr = 0x5555555555555555UL;
-
-/* Allow all keys to be modified by default */
-static u64 default_uamor = ~0x0UL;
+u64 default_uamor __ro_after_init;
 /*
  * Key used to implement PROT_EXEC mmap. Denies READ/WRITE
  * We pick key 2 because 0 is special key and 1 is reserved as per ISA.
@@ -122,10 +120,11 @@ void __init pkey_early_init_devtree(void)
 
 	/* scan the device tree for pkey feature */
 	pkeys_total = scan_pkey_feature();
-	if (!pkeys_total) {
-		/* No support for pkey. Mark it disabled */
-		return;
-	}
+	if (!pkeys_total)
+		goto out;
+
+	/* Allow all keys to be modified by default */
+	default_uamor = ~0x0UL;
 
 	cur_cpu_spec->mmu_features |= MMU_FTR_PKEY;
 
@@ -209,6 +208,12 @@ void __init pkey_early_init_devtree(void)
 	initial_allocation_mask |= reserved_allocation_mask;
 
 	pr_info("Enabling pkeys with max key count %d\n", num_pkey);
+out:
+	/*
+	 * Setup uamor on boot cpu
+	 */
+	mtspr(SPRN_UAMOR, default_uamor);
+
 	return;
 }
 
@@ -251,11 +256,6 @@ static inline u64 read_uamor(void)
 	return mfspr(SPRN_UAMOR);
 }
 
-static inline void write_uamor(u64 value)
-{
-	mtspr(SPRN_UAMOR, value);
-}
-
 static bool is_pkey_enabled(int pkey)
 {
 	u64 uamor = read_uamor();
@@ -326,7 +326,6 @@ void thread_pkey_regs_save(struct thread_struct *thread)
 	 */
 	thread->amr = read_amr();
 	thread->iamr = read_iamr();
-	thread->uamor = read_uamor();
 }
 
 void thread_pkey_regs_restore(struct thread_struct *new_thread,
@@ -339,8 +338,6 @@ void thread_pkey_regs_restore(struct thread_struct *new_thread,
 		write_amr(new_thread->amr);
 	if (old_thread->iamr != new_thread->iamr)
 		write_iamr(new_thread->iamr);
-	if (old_thread->uamor != new_thread->uamor)
-		write_uamor(new_thread->uamor);
 }
 
 void thread_pkey_regs_init(struct thread_struct *thread)
@@ -350,11 +347,9 @@ void thread_pkey_regs_init(struct thread_struct *thread)
 
 	thread->amr   = default_amr;
 	thread->iamr  = default_iamr;
-	thread->uamor = default_uamor;
 
 	write_amr(default_amr);
 	write_iamr(default_iamr);
-	write_uamor(default_uamor);
 }
 
 int execute_only_pkey(struct mm_struct *mm)
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
index 6d814c9bb4bf..ed24ba56d78a 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -546,6 +546,10 @@ void setup_kuap(bool disabled)
 
 	/* Make sure userspace can't change the AMR */
 	mtspr(SPRN_UAMOR, 0);
+
+	/*
+	 * Set the default kernel AMR values on all cpus.
+	 */
 	mtspr(SPRN_AMR, AMR_KUAP_BLOCKED);
 	isync();
 }
-- 
2.26.2


^ permalink raw reply related

* [PATCH v6 18/23] powerpc/book3s64/keys/kuap: Reset AMR/IAMR values on kexec
From: Aneesh Kumar K.V @ 2020-07-09  3:29 UTC (permalink / raw)
  To: linuxppc-dev, mpe; +Cc: Aneesh Kumar K.V, linuxram, bauerman
In-Reply-To: <20200709032946.881753-1-aneesh.kumar@linux.ibm.com>

As we kexec across kernels that use AMR/IAMR for different purposes
we need to ensure that new kernels get kexec'd with a reset value
of AMR/IAMR. For ex: the new kernel can use key 0 for kernel mapping and the old
AMR value prevents access to key 0.

This patch also removes reset if IAMR and AMOR in kexec_sequence. Reset of AMOR
is not needed and the IAMR reset is partial (it doesn't do the reset
on secondary cpus) and is redundant with this patch.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 arch/powerpc/include/asm/book3s/64/kexec.h | 23 ++++++++++++++++++++++
 arch/powerpc/include/asm/kexec.h           | 12 +++++++++++
 arch/powerpc/kernel/misc_64.S              | 14 -------------
 arch/powerpc/kexec/core_64.c               |  2 ++
 arch/powerpc/mm/book3s64/pgtable.c         |  3 +++
 5 files changed, 40 insertions(+), 14 deletions(-)
 create mode 100644 arch/powerpc/include/asm/book3s/64/kexec.h

diff --git a/arch/powerpc/include/asm/book3s/64/kexec.h b/arch/powerpc/include/asm/book3s/64/kexec.h
new file mode 100644
index 000000000000..6b5c3a248ba2
--- /dev/null
+++ b/arch/powerpc/include/asm/book3s/64/kexec.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _ASM_POWERPC_BOOK3S_64_KEXEC_H_
+#define _ASM_POWERPC_BOOK3S_64_KEXEC_H_
+
+
+#define reset_sprs reset_sprs
+static inline void reset_sprs(void)
+{
+	if (cpu_has_feature(CPU_FTR_ARCH_206)) {
+		mtspr(SPRN_AMR, 0);
+		mtspr(SPRN_UAMOR, 0);
+	}
+
+	if (cpu_has_feature(CPU_FTR_ARCH_207S)) {
+		mtspr(SPRN_IAMR, 0);
+	}
+
+	/*  Do we need isync()? We are going via a kexec reset */
+	isync();
+}
+
+#endif
diff --git a/arch/powerpc/include/asm/kexec.h b/arch/powerpc/include/asm/kexec.h
index c68476818753..89f7e3462292 100644
--- a/arch/powerpc/include/asm/kexec.h
+++ b/arch/powerpc/include/asm/kexec.h
@@ -150,6 +150,18 @@ static inline void crash_send_ipi(void (*crash_ipi_callback)(struct pt_regs *))
 }
 
 #endif /* CONFIG_KEXEC_CORE */
+
+#ifdef CONFIG_PPC_BOOK3S_64
+#include <asm/book3s/64/kexec.h>
+#endif
+
+#ifndef reset_sprs
+#define reset_sprs reset_sprs
+static inline void reset_sprs(void)
+{
+}
+#endif
+
 #endif /* ! __ASSEMBLY__ */
 #endif /* __KERNEL__ */
 #endif /* _ASM_POWERPC_KEXEC_H */
diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S
index 1864605eca29..7bb46ad98207 100644
--- a/arch/powerpc/kernel/misc_64.S
+++ b/arch/powerpc/kernel/misc_64.S
@@ -413,20 +413,6 @@ _GLOBAL(kexec_sequence)
 	li	r0,0
 	std	r0,16(r1)
 
-BEGIN_FTR_SECTION
-	/*
-	 * This is the best time to turn AMR/IAMR off.
-	 * key 0 is used in radix for supervisor<->user
-	 * protection, but on hash key 0 is reserved
-	 * ideally we want to enter with a clean state.
-	 * NOTE, we rely on r0 being 0 from above.
-	 */
-	mtspr	SPRN_IAMR,r0
-BEGIN_FTR_SECTION_NESTED(42)
-	mtspr	SPRN_AMOR,r0
-END_FTR_SECTION_NESTED_IFSET(CPU_FTR_HVMODE, 42)
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
-
 	/* save regs for local vars on new stack.
 	 * yes, we won't go back, but ...
 	 */
diff --git a/arch/powerpc/kexec/core_64.c b/arch/powerpc/kexec/core_64.c
index b4184092172a..8a449b2d8715 100644
--- a/arch/powerpc/kexec/core_64.c
+++ b/arch/powerpc/kexec/core_64.c
@@ -152,6 +152,8 @@ static void kexec_smp_down(void *arg)
 	if (ppc_md.kexec_cpu_down)
 		ppc_md.kexec_cpu_down(0, 1);
 
+	reset_sprs();
+
 	kexec_smp_wait();
 	/* NOTREACHED */
 }
diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c
index c58ad1049909..e63fcc00744c 100644
--- a/arch/powerpc/mm/book3s64/pgtable.c
+++ b/arch/powerpc/mm/book3s64/pgtable.c
@@ -15,6 +15,7 @@
 #include <asm/powernv.h>
 #include <asm/firmware.h>
 #include <asm/ultravisor.h>
+#include <asm/kexec.h>
 
 #include <mm/mmu_decl.h>
 #include <trace/events/thp.h>
@@ -165,6 +166,8 @@ void mmu_cleanup_all(void)
 		radix__mmu_cleanup_all();
 	else if (mmu_hash_ops.hpte_clear_all)
 		mmu_hash_ops.hpte_clear_all();
+
+	reset_sprs();
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
-- 
2.26.2


^ permalink raw reply related

* [PATCH v6 20/23] powerpc/selftest/ptrave-pkey: Rename variables to make it easier to follow code
From: Aneesh Kumar K.V @ 2020-07-09  3:29 UTC (permalink / raw)
  To: linuxppc-dev, mpe; +Cc: Aneesh Kumar K.V, linuxram, bauerman
In-Reply-To: <20200709032946.881753-1-aneesh.kumar@linux.ibm.com>

Rename variable to indicate that they are invalid values which we will use to
test ptrace update of pkeys.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 .../selftests/powerpc/ptrace/ptrace-pkey.c    | 26 +++++++++----------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-pkey.c b/tools/testing/selftests/powerpc/ptrace/ptrace-pkey.c
index bdbbbe8431e0..f9216c7a1829 100644
--- a/tools/testing/selftests/powerpc/ptrace/ptrace-pkey.c
+++ b/tools/testing/selftests/powerpc/ptrace/ptrace-pkey.c
@@ -44,7 +44,7 @@ struct shared_info {
 	unsigned long amr2;
 
 	/* AMR value that ptrace should refuse to write to the child. */
-	unsigned long amr3;
+	unsigned long invalid_amr;
 
 	/* IAMR value the parent expects to read from the child. */
 	unsigned long expected_iamr;
@@ -57,8 +57,8 @@ struct shared_info {
 	 * (even though they're valid ones) because userspace doesn't have
 	 * access to those registers.
 	 */
-	unsigned long new_iamr;
-	unsigned long new_uamor;
+	unsigned long invalid_iamr;
+	unsigned long invalid_uamor;
 };
 
 static int sys_pkey_alloc(unsigned long flags, unsigned long init_access_rights)
@@ -100,7 +100,7 @@ static int child(struct shared_info *info)
 
 	info->amr1 |= 3ul << pkeyshift(pkey1);
 	info->amr2 |= 3ul << pkeyshift(pkey2);
-	info->amr3 |= info->amr2 | 3ul << pkeyshift(pkey3);
+	info->invalid_amr |= info->amr2 | 3ul << pkeyshift(pkey3);
 
 	if (disable_execute)
 		info->expected_iamr |= 1ul << pkeyshift(pkey1);
@@ -111,8 +111,8 @@ static int child(struct shared_info *info)
 
 	info->expected_uamor |= 3ul << pkeyshift(pkey1) |
 				3ul << pkeyshift(pkey2);
-	info->new_iamr |= 1ul << pkeyshift(pkey1) | 1ul << pkeyshift(pkey2);
-	info->new_uamor |= 3ul << pkeyshift(pkey1);
+	info->invalid_iamr |= 1ul << pkeyshift(pkey1) | 1ul << pkeyshift(pkey2);
+	info->invalid_uamor |= 3ul << pkeyshift(pkey1);
 
 	/*
 	 * We won't use pkey3. We just want a plausible but invalid key to test
@@ -196,9 +196,9 @@ static int parent(struct shared_info *info, pid_t pid)
 	PARENT_SKIP_IF_UNSUPPORTED(ret, &info->child_sync);
 	PARENT_FAIL_IF(ret, &info->child_sync);
 
-	info->amr1 = info->amr2 = info->amr3 = regs[0];
-	info->expected_iamr = info->new_iamr = regs[1];
-	info->expected_uamor = info->new_uamor = regs[2];
+	info->amr1 = info->amr2 = info->invalid_amr = regs[0];
+	info->expected_iamr = info->invalid_iamr = regs[1];
+	info->expected_uamor = info->invalid_uamor = regs[2];
 
 	/* Wake up child so that it can set itself up. */
 	ret = prod_child(&info->child_sync);
@@ -234,10 +234,10 @@ static int parent(struct shared_info *info, pid_t pid)
 		return ret;
 
 	/* Write invalid AMR value in child. */
-	ret = ptrace_write_regs(pid, NT_PPC_PKEY, &info->amr3, 1);
+	ret = ptrace_write_regs(pid, NT_PPC_PKEY, &info->invalid_amr, 1);
 	PARENT_FAIL_IF(ret, &info->child_sync);
 
-	printf("%-30s AMR: %016lx\n", ptrace_write_running, info->amr3);
+	printf("%-30s AMR: %016lx\n", ptrace_write_running, info->invalid_amr);
 
 	/* Wake up child so that it can verify it didn't change. */
 	ret = prod_child(&info->child_sync);
@@ -249,7 +249,7 @@ static int parent(struct shared_info *info, pid_t pid)
 
 	/* Try to write to IAMR. */
 	regs[0] = info->amr1;
-	regs[1] = info->new_iamr;
+	regs[1] = info->invalid_iamr;
 	ret = ptrace_write_regs(pid, NT_PPC_PKEY, regs, 2);
 	PARENT_FAIL_IF(!ret, &info->child_sync);
 
@@ -257,7 +257,7 @@ static int parent(struct shared_info *info, pid_t pid)
 	       ptrace_write_running, regs[0], regs[1]);
 
 	/* Try to write to IAMR and UAMOR. */
-	regs[2] = info->new_uamor;
+	regs[2] = info->invalid_uamor;
 	ret = ptrace_write_regs(pid, NT_PPC_PKEY, regs, 3);
 	PARENT_FAIL_IF(!ret, &info->child_sync);
 
-- 
2.26.2


^ permalink raw reply related

* [PATCH v6 17/23] powerpc/book3s64/keys: Print information during boot.
From: Aneesh Kumar K.V @ 2020-07-09  3:29 UTC (permalink / raw)
  To: linuxppc-dev, mpe; +Cc: Aneesh Kumar K.V, linuxram, bauerman
In-Reply-To: <20200709032946.881753-1-aneesh.kumar@linux.ibm.com>

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 arch/powerpc/mm/book3s64/pkeys.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/mm/book3s64/pkeys.c b/arch/powerpc/mm/book3s64/pkeys.c
index f388c8d5359a..c682eefd3fc1 100644
--- a/arch/powerpc/mm/book3s64/pkeys.c
+++ b/arch/powerpc/mm/book3s64/pkeys.c
@@ -208,6 +208,7 @@ void __init pkey_early_init_devtree(void)
 	 */
 	initial_allocation_mask |= reserved_allocation_mask;
 
+	pr_info("Enabling pkeys with max key count %d\n", num_pkey);
 	return;
 }
 
-- 
2.26.2


^ permalink raw reply related

* [PATCH v6 16/23] powerpc/book3s64/pkeys: Use MMU_FTR_PKEY instead of pkey_disabled static key
From: Aneesh Kumar K.V @ 2020-07-09  3:29 UTC (permalink / raw)
  To: linuxppc-dev, mpe; +Cc: Aneesh Kumar K.V, linuxram, bauerman
In-Reply-To: <20200709032946.881753-1-aneesh.kumar@linux.ibm.com>

Instead of pkey_disabled static key use mmu feature MMU_FTR_PKEY.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 arch/powerpc/include/asm/book3s/64/pkeys.h |  2 +-
 arch/powerpc/include/asm/pkeys.h           | 14 ++++++--------
 arch/powerpc/mm/book3s64/pkeys.c           | 17 +++++++----------
 3 files changed, 14 insertions(+), 19 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/pkeys.h b/arch/powerpc/include/asm/book3s/64/pkeys.h
index 8174662a9173..5b178139f3c0 100644
--- a/arch/powerpc/include/asm/book3s/64/pkeys.h
+++ b/arch/powerpc/include/asm/book3s/64/pkeys.h
@@ -7,7 +7,7 @@
 
 static inline u64 vmflag_to_pte_pkey_bits(u64 vm_flags)
 {
-	if (static_branch_likely(&pkey_disabled))
+	if (!mmu_has_feature(MMU_FTR_PKEY))
 		return 0x0UL;
 
 	if (radix_enabled())
diff --git a/arch/powerpc/include/asm/pkeys.h b/arch/powerpc/include/asm/pkeys.h
index f44a14d64d47..a7951049e129 100644
--- a/arch/powerpc/include/asm/pkeys.h
+++ b/arch/powerpc/include/asm/pkeys.h
@@ -11,7 +11,6 @@
 #include <linux/jump_label.h>
 #include <asm/firmware.h>
 
-DECLARE_STATIC_KEY_FALSE(pkey_disabled);
 extern int num_pkey;
 extern u32 reserved_allocation_mask; /* bits set for reserved keys */
 
@@ -38,7 +37,7 @@ static inline u64 pkey_to_vmflag_bits(u16 pkey)
 
 static inline int vma_pkey(struct vm_area_struct *vma)
 {
-	if (static_branch_likely(&pkey_disabled))
+	if (!mmu_has_feature(MMU_FTR_PKEY))
 		return 0;
 	return (vma->vm_flags & ARCH_VM_PKEY_FLAGS) >> VM_PKEY_SHIFT;
 }
@@ -93,9 +92,8 @@ static inline int mm_pkey_alloc(struct mm_struct *mm)
 	u32 all_pkeys_mask = (u32)(~(0x0));
 	int ret;
 
-	if (static_branch_likely(&pkey_disabled))
+	if (!mmu_has_feature(MMU_FTR_PKEY))
 		return -1;
-
 	/*
 	 * Are we out of pkeys? We must handle this specially because ffz()
 	 * behavior is undefined if there are no zeros.
@@ -111,7 +109,7 @@ static inline int mm_pkey_alloc(struct mm_struct *mm)
 
 static inline int mm_pkey_free(struct mm_struct *mm, int pkey)
 {
-	if (static_branch_likely(&pkey_disabled))
+	if (!mmu_has_feature(MMU_FTR_PKEY))
 		return -1;
 
 	if (!mm_pkey_is_allocated(mm, pkey))
@@ -132,7 +130,7 @@ extern int __arch_override_mprotect_pkey(struct vm_area_struct *vma,
 static inline int arch_override_mprotect_pkey(struct vm_area_struct *vma,
 					      int prot, int pkey)
 {
-	if (static_branch_likely(&pkey_disabled))
+	if (!mmu_has_feature(MMU_FTR_PKEY))
 		return 0;
 
 	/*
@@ -150,7 +148,7 @@ extern int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
 static inline int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
 					    unsigned long init_val)
 {
-	if (static_branch_likely(&pkey_disabled))
+	if (!mmu_has_feature(MMU_FTR_PKEY))
 		return -EINVAL;
 
 	/*
@@ -167,7 +165,7 @@ static inline int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
 
 static inline bool arch_pkeys_enabled(void)
 {
-	return !static_branch_likely(&pkey_disabled);
+	return mmu_has_feature(MMU_FTR_PKEY);
 }
 
 extern void pkey_mm_init(struct mm_struct *mm);
diff --git a/arch/powerpc/mm/book3s64/pkeys.c b/arch/powerpc/mm/book3s64/pkeys.c
index db2d0d34f515..f388c8d5359a 100644
--- a/arch/powerpc/mm/book3s64/pkeys.c
+++ b/arch/powerpc/mm/book3s64/pkeys.c
@@ -12,8 +12,6 @@
 #include <linux/pkeys.h>
 #include <linux/of_fdt.h>
 
-
-DEFINE_STATIC_KEY_FALSE(pkey_disabled);
 int  num_pkey;		/* Max number of pkeys supported */
 /*
  *  Keys marked in the reservation list cannot be allocated by  userspace
@@ -126,7 +124,6 @@ void __init pkey_early_init_devtree(void)
 	pkeys_total = scan_pkey_feature();
 	if (!pkeys_total) {
 		/* No support for pkey. Mark it disabled */
-		static_branch_enable(&pkey_disabled);
 		return;
 	}
 
@@ -216,7 +213,7 @@ void __init pkey_early_init_devtree(void)
 
 void pkey_mm_init(struct mm_struct *mm)
 {
-	if (static_branch_likely(&pkey_disabled))
+	if (!mmu_has_feature(MMU_FTR_PKEY))
 		return;
 	mm_pkey_allocation_map(mm) = initial_allocation_mask;
 	mm->context.execute_only_pkey = execute_only_key;
@@ -320,7 +317,7 @@ int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
 
 void thread_pkey_regs_save(struct thread_struct *thread)
 {
-	if (static_branch_likely(&pkey_disabled))
+	if (!mmu_has_feature(MMU_FTR_PKEY))
 		return;
 
 	/*
@@ -334,7 +331,7 @@ void thread_pkey_regs_save(struct thread_struct *thread)
 void thread_pkey_regs_restore(struct thread_struct *new_thread,
 			      struct thread_struct *old_thread)
 {
-	if (static_branch_likely(&pkey_disabled))
+	if (!mmu_has_feature(MMU_FTR_PKEY))
 		return;
 
 	if (old_thread->amr != new_thread->amr)
@@ -347,7 +344,7 @@ void thread_pkey_regs_restore(struct thread_struct *new_thread,
 
 void thread_pkey_regs_init(struct thread_struct *thread)
 {
-	if (static_branch_likely(&pkey_disabled))
+	if (!mmu_has_feature(MMU_FTR_PKEY))
 		return;
 
 	thread->amr   = default_amr;
@@ -416,7 +413,7 @@ static bool pkey_access_permitted(int pkey, bool write, bool execute)
 
 bool arch_pte_access_permitted(u64 pte, bool write, bool execute)
 {
-	if (static_branch_likely(&pkey_disabled))
+	if (!mmu_has_feature(MMU_FTR_PKEY))
 		return true;
 
 	return pkey_access_permitted(pte_to_pkey_bits(pte), write, execute);
@@ -433,7 +430,7 @@ bool arch_pte_access_permitted(u64 pte, bool write, bool execute)
 bool arch_vma_access_permitted(struct vm_area_struct *vma, bool write,
 			       bool execute, bool foreign)
 {
-	if (static_branch_likely(&pkey_disabled))
+	if (!mmu_has_feature(MMU_FTR_PKEY))
 		return true;
 	/*
 	 * Do not enforce our key-permissions on a foreign vma.
@@ -446,7 +443,7 @@ bool arch_vma_access_permitted(struct vm_area_struct *vma, bool write,
 
 void arch_dup_pkeys(struct mm_struct *oldmm, struct mm_struct *mm)
 {
-	if (static_branch_likely(&pkey_disabled))
+	if (!mmu_has_feature(MMU_FTR_PKEY))
 		return;
 
 	/* Duplicate the oldmm pkey state in mm: */
-- 
2.26.2


^ permalink raw reply related

* [PATCH v6 15/23] powerpc/book3s64/pkeys: Use pkey_execute_disable_supported
From: Aneesh Kumar K.V @ 2020-07-09  3:29 UTC (permalink / raw)
  To: linuxppc-dev, mpe; +Cc: Aneesh Kumar K.V, linuxram, bauerman
In-Reply-To: <20200709032946.881753-1-aneesh.kumar@linux.ibm.com>

Use pkey_execute_disable_supported to check for execute key support instead
of pkey_disabled.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 arch/powerpc/include/asm/pkeys.h | 10 +---------
 arch/powerpc/mm/book3s64/pkeys.c |  6 +++---
 2 files changed, 4 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/include/asm/pkeys.h b/arch/powerpc/include/asm/pkeys.h
index dd32e30f6767..f44a14d64d47 100644
--- a/arch/powerpc/include/asm/pkeys.h
+++ b/arch/powerpc/include/asm/pkeys.h
@@ -126,15 +126,7 @@ static inline int mm_pkey_free(struct mm_struct *mm, int pkey)
  * Try to dedicate one of the protection keys to be used as an
  * execute-only protection key.
  */
-extern int __execute_only_pkey(struct mm_struct *mm);
-static inline int execute_only_pkey(struct mm_struct *mm)
-{
-	if (static_branch_likely(&pkey_disabled))
-		return -1;
-
-	return __execute_only_pkey(mm);
-}
-
+extern int execute_only_pkey(struct mm_struct *mm);
 extern int __arch_override_mprotect_pkey(struct vm_area_struct *vma,
 					 int prot, int pkey);
 static inline int arch_override_mprotect_pkey(struct vm_area_struct *vma,
diff --git a/arch/powerpc/mm/book3s64/pkeys.c b/arch/powerpc/mm/book3s64/pkeys.c
index c795a28c1964..db2d0d34f515 100644
--- a/arch/powerpc/mm/book3s64/pkeys.c
+++ b/arch/powerpc/mm/book3s64/pkeys.c
@@ -23,7 +23,6 @@ u32 reserved_allocation_mask __ro_after_init;
 /* Bits set for the initially allocated keys */
 static u32 initial_allocation_mask __ro_after_init;
 
-static bool pkey_execute_disable_supported;
 /*
  * Even if we allocate keys with sys_pkey_alloc(), we need to make sure
  * other thread still find the access denied using the same keys.
@@ -38,6 +37,7 @@ static u64 default_uamor = ~0x0UL;
  * We pick key 2 because 0 is special key and 1 is reserved as per ISA.
  */
 static int execute_only_key = 2;
+static bool pkey_execute_disable_supported;
 
 
 #define AMR_BITS_PER_PKEY 2
@@ -151,7 +151,7 @@ void __init pkey_early_init_devtree(void)
 	num_pkey = pkeys_total;
 #endif
 
-	if (unlikely(num_pkey <= execute_only_key)) {
+	if (unlikely(num_pkey <= execute_only_key) || !pkey_execute_disable_supported) {
 		/*
 		 * Insufficient number of keys to support
 		 * execute only key. Mark it unavailable.
@@ -359,7 +359,7 @@ void thread_pkey_regs_init(struct thread_struct *thread)
 	write_uamor(default_uamor);
 }
 
-int __execute_only_pkey(struct mm_struct *mm)
+int execute_only_pkey(struct mm_struct *mm)
 {
 	return mm->context.execute_only_pkey;
 }
-- 
2.26.2


^ permalink raw reply related

* [PATCH v6 14/23] powerpc/book3s64/kuep: Add MMU_FTR_KUEP
From: Aneesh Kumar K.V @ 2020-07-09  3:29 UTC (permalink / raw)
  To: linuxppc-dev, mpe; +Cc: Aneesh Kumar K.V, linuxram, bauerman
In-Reply-To: <20200709032946.881753-1-aneesh.kumar@linux.ibm.com>

This will be used to enable/disable Kernel Userspace Execution
Prevention (KUEP).

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 arch/powerpc/include/asm/mmu.h           | 8 ++++++++
 arch/powerpc/mm/book3s64/radix_pgtable.c | 4 +++-
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h
index 88aed01fad81..df767315ec8c 100644
--- a/arch/powerpc/include/asm/mmu.h
+++ b/arch/powerpc/include/asm/mmu.h
@@ -28,6 +28,10 @@
  * Individual features below.
  */
 
+/*
+ * Suppor for KUEP feature.
+ */
+#define MMU_FTR_KUEP			ASM_CONST(0x00000800)
 /*
  * Support for memory protection keys.
  */
@@ -184,6 +188,10 @@ enum {
 #ifdef CONFIG_PPC_MEM_KEYS
 	MMU_FTR_PKEY |
 #endif
+#ifdef CONFIG_PPC_KUEP
+	MMU_FTR_KUEP |
+#endif /* CONFIG_PPC_KUAP */
+
 		0,
 };
 
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
index bb00e0cba119..6d814c9bb4bf 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -519,8 +519,10 @@ void setup_kuep(bool disabled)
 	if (disabled || !early_radix_enabled())
 		return;
 
-	if (smp_processor_id() == boot_cpuid)
+	if (smp_processor_id() == boot_cpuid) {
 		pr_info("Activating Kernel Userspace Execution Prevention\n");
+		cur_cpu_spec->mmu_features |= MMU_FTR_KUEP;
+	}
 
 	/*
 	 * Radix always uses key0 of the IAMR to determine if an access is
-- 
2.26.2


^ permalink raw reply related

* [PATCH v6 13/23] powerpc/book3s64/pkeys: Add MMU_FTR_PKEY
From: Aneesh Kumar K.V @ 2020-07-09  3:29 UTC (permalink / raw)
  To: linuxppc-dev, mpe; +Cc: Aneesh Kumar K.V, linuxram, bauerman
In-Reply-To: <20200709032946.881753-1-aneesh.kumar@linux.ibm.com>

Parse storage keys related device tree entry in early_init_devtree
and enable MMU feature MMU_FTR_PKEY if pkeys are supported.

MMU feature is used instead of CPU feature because this enables us
to group MMU_FTR_KUAP and MMU_FTR_PKEY in asm feature fixup code.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 arch/powerpc/include/asm/book3s/64/mmu.h |  6 +++
 arch/powerpc/include/asm/mmu.h           |  9 ++++
 arch/powerpc/kernel/prom.c               |  5 +++
 arch/powerpc/mm/book3s64/pkeys.c         | 52 ++++++++++++++----------
 4 files changed, 50 insertions(+), 22 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h
index 5393a535240c..3371ea05b7d3 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu.h
@@ -209,6 +209,12 @@ extern int mmu_io_psize;
 void mmu_early_init_devtree(void);
 void hash__early_init_devtree(void);
 void radix__early_init_devtree(void);
+#ifdef CONFIG_PPC_MEM_KEYS
+void pkey_early_init_devtree(void);
+#else
+static inline void pkey_early_init_devtree(void) {}
+#endif
+
 extern void hash__early_init_mmu(void);
 extern void radix__early_init_mmu(void);
 static inline void __init early_init_mmu(void)
diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h
index f4ac25d4df05..88aed01fad81 100644
--- a/arch/powerpc/include/asm/mmu.h
+++ b/arch/powerpc/include/asm/mmu.h
@@ -28,6 +28,10 @@
  * Individual features below.
  */
 
+/*
+ * Support for memory protection keys.
+ */
+#define MMU_FTR_PKEY			ASM_CONST(0x00001000)
 /*
  * Support for 68 bit VA space. We added that from ISA 2.05
  */
@@ -177,6 +181,9 @@ enum {
 		MMU_FTR_RADIX_KUAP |
 #endif /* CONFIG_PPC_KUAP */
 #endif /* CONFIG_PPC_RADIX_MMU */
+#ifdef CONFIG_PPC_MEM_KEYS
+	MMU_FTR_PKEY |
+#endif
 		0,
 };
 
@@ -356,6 +363,8 @@ extern void setup_initial_memory_limit(phys_addr_t first_memblock_base,
 				       phys_addr_t first_memblock_size);
 static inline void mmu_early_init_devtree(void) { }
 
+static inline void pkey_early_init_devtree(void) {}
+
 extern void *abatron_pteptrs[2];
 #endif /* __ASSEMBLY__ */
 #endif
diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index 9cc49f265c86..4cb65fd5f532 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -815,6 +815,11 @@ void __init early_init_devtree(void *params)
 	/* Now try to figure out if we are running on LPAR and so on */
 	pseries_probe_fw_features();
 
+	/*
+	 * Initialize pkey features and default AMR/IAMR values
+	 */
+	pkey_early_init_devtree();
+
 #ifdef CONFIG_PPC_PS3
 	/* Identify PS3 firmware */
 	if (of_flat_dt_is_compatible(of_get_flat_dt_root(), "sony,ps3"))
diff --git a/arch/powerpc/mm/book3s64/pkeys.c b/arch/powerpc/mm/book3s64/pkeys.c
index ac272166e5b4..c795a28c1964 100644
--- a/arch/powerpc/mm/book3s64/pkeys.c
+++ b/arch/powerpc/mm/book3s64/pkeys.c
@@ -10,7 +10,8 @@
 #include <asm/mmu.h>
 #include <asm/setup.h>
 #include <linux/pkeys.h>
-#include <linux/of_device.h>
+#include <linux/of_fdt.h>
+
 
 DEFINE_STATIC_KEY_FALSE(pkey_disabled);
 int  num_pkey;		/* Max number of pkeys supported */
@@ -46,31 +47,38 @@ static int execute_only_key = 2;
 #define PKEY_REG_BITS (sizeof(u64) * 8)
 #define pkeyshift(pkey) (PKEY_REG_BITS - ((pkey+1) * AMR_BITS_PER_PKEY))
 
+static int __init dt_scan_storage_keys(unsigned long node,
+				       const char *uname, int depth,
+				       void *data)
+{
+	const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
+	const __be32 *prop;
+	int *pkeys_total = (int *) data;
+
+	/* We are scanning "cpu" nodes only */
+	if (type == NULL || strcmp(type, "cpu") != 0)
+		return 0;
+
+	prop = of_get_flat_dt_prop(node, "ibm,processor-storage-keys", NULL);
+	if (!prop)
+		return 0;
+	*pkeys_total = be32_to_cpu(prop[0]);
+	return 1;
+}
+
 static int scan_pkey_feature(void)
 {
-	u32 vals[2];
+	int ret;
 	int pkeys_total = 0;
-	struct device_node *cpu;
 
 	/*
 	 * Pkey is not supported with Radix translation.
 	 */
-	if (radix_enabled())
-		return 0;
-
-	cpu = of_find_node_by_type(NULL, "cpu");
-	if (!cpu)
+	if (early_radix_enabled())
 		return 0;
 
-	if (of_property_read_u32_array(cpu,
-				       "ibm,processor-storage-keys", vals, 2) == 0) {
-		/*
-		 * Since any pkey can be used for data or execute, we will
-		 * just treat all keys as equal and track them as one entity.
-		 */
-		pkeys_total = vals[0];
-	} else {
-
+	ret = of_scan_flat_dt(dt_scan_storage_keys, &pkeys_total);
+	if (ret == 0) {
 		/*
 		 * Let's assume 32 pkeys on P8/P9 bare metal, if its not defined by device
 		 * tree. We make this exception since some version of skiboot forgot to
@@ -94,7 +102,7 @@ static int scan_pkey_feature(void)
 	return pkeys_total;
 }
 
-static int pkey_initialize(void)
+void __init pkey_early_init_devtree(void)
 {
 	int pkeys_total, i;
 
@@ -119,9 +127,11 @@ static int pkey_initialize(void)
 	if (!pkeys_total) {
 		/* No support for pkey. Mark it disabled */
 		static_branch_enable(&pkey_disabled);
-		return 0;
+		return;
 	}
 
+	cur_cpu_spec->mmu_features |= MMU_FTR_PKEY;
+
 	/*
 	 * The device tree cannot be relied to indicate support for
 	 * execute_disable support. Instead we use a PVR check.
@@ -201,11 +211,9 @@ static int pkey_initialize(void)
 	 */
 	initial_allocation_mask |= reserved_allocation_mask;
 
-	return 0;
+	return;
 }
 
-arch_initcall(pkey_initialize);
-
 void pkey_mm_init(struct mm_struct *mm)
 {
 	if (static_branch_likely(&pkey_disabled))
-- 
2.26.2


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox