* [PATCH v2 1/3] powerpc: remove stale calc_vm_prot_bits comment
From: Nicholas Piggin @ 2020-07-03 1:19 UTC (permalink / raw)
To: linuxppc-dev; +Cc: linux-mm, kvm-ppc, Nicholas Piggin, linux-api
In-Reply-To: <20200703011958.1166620-1-npiggin@gmail.com>
This comment is wrong, we wouldn't use calc_vm_prot_bits here because
we are being called by calc_vm_prot_bits to modify its behaviour.
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
arch/powerpc/include/asm/mman.h | 4 ----
1 file changed, 4 deletions(-)
diff --git a/arch/powerpc/include/asm/mman.h b/arch/powerpc/include/asm/mman.h
index d610c2e07b28..4ba303ea27f5 100644
--- a/arch/powerpc/include/asm/mman.h
+++ b/arch/powerpc/include/asm/mman.h
@@ -13,10 +13,6 @@
#include <linux/pkeys.h>
#include <asm/cpu_has_feature.h>
-/*
- * This file is included by linux/mman.h, so we can't use cacl_vm_prot_bits()
- * here. How important is the optimization?
- */
static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot,
unsigned long pkey)
{
--
2.23.0
^ permalink raw reply related
* [PATCH v2 2/3] powerpc/64s: remove PROT_SAO support
From: Nicholas Piggin @ 2020-07-03 1:19 UTC (permalink / raw)
To: linuxppc-dev; +Cc: linux-mm, kvm-ppc, Nicholas Piggin, linux-api
In-Reply-To: <20200703011958.1166620-1-npiggin@gmail.com>
ISA v3.1 does not support the SAO storage control attribute required to
implement PROT_SAO. PROT_SAO was used by specialised system software
(Lx86) that has been discontinued for about 7 years, and is not thought
to be used elsewhere, so removal should not cause problems.
We would rather remove it than keep support for older processors, because
live migrating guest partitions to newer processors may not be possible
if SAO is in use (or worse, allowed with silent races).
- PROT_SAO stays in the uapi header so code using it would still build.
- arch_validate_prot() is removed, the generic version rejects PROT_SAO
so applications would get a failure at mmap() time.
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
arch/powerpc/include/asm/book3s/64/pgtable.h | 8 ++--
arch/powerpc/include/asm/cputable.h | 10 ++---
arch/powerpc/include/asm/kvm_book3s_64.h | 5 ++-
arch/powerpc/include/asm/mman.h | 26 ++----------
arch/powerpc/include/asm/nohash/64/pgtable.h | 2 -
arch/powerpc/include/uapi/asm/mman.h | 2 +-
arch/powerpc/kernel/dt_cpu_ftrs.c | 2 +-
arch/powerpc/mm/book3s64/hash_utils.c | 2 -
include/linux/mm.h | 2 -
include/trace/events/mmflags.h | 2 -
mm/ksm.c | 4 --
tools/testing/selftests/powerpc/mm/.gitignore | 1 -
tools/testing/selftests/powerpc/mm/Makefile | 4 +-
tools/testing/selftests/powerpc/mm/prot_sao.c | 42 -------------------
14 files changed, 20 insertions(+), 92 deletions(-)
delete mode 100644 tools/testing/selftests/powerpc/mm/prot_sao.c
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 25c3cb8272c0..8e9aca96143b 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -20,9 +20,13 @@
#define _PAGE_RW (_PAGE_READ | _PAGE_WRITE)
#define _PAGE_RWX (_PAGE_READ | _PAGE_WRITE | _PAGE_EXEC)
#define _PAGE_PRIVILEGED 0x00008 /* kernel access only */
-#define _PAGE_SAO 0x00010 /* Strong access order */
+
+#define _PAGE_CACHE_CTL 0x00030 /* Bits for the following cache modes */
+ /* No bits set is normal cacheable memory */
+ /* 0x00010 unused, is SAO bit on radix POWER9 */
#define _PAGE_NON_IDEMPOTENT 0x00020 /* non idempotent memory */
#define _PAGE_TOLERANT 0x00030 /* tolerant memory, cache inhibited */
+
#define _PAGE_DIRTY 0x00080 /* C: page changed */
#define _PAGE_ACCESSED 0x00100 /* R: page referenced */
/*
@@ -825,8 +829,6 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr,
return hash__set_pte_at(mm, addr, ptep, pte, percpu);
}
-#define _PAGE_CACHE_CTL (_PAGE_SAO | _PAGE_NON_IDEMPOTENT | _PAGE_TOLERANT)
-
#define pgprot_noncached pgprot_noncached
static inline pgprot_t pgprot_noncached(pgprot_t prot)
{
diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h
index bac2252c839e..87284750535d 100644
--- a/arch/powerpc/include/asm/cputable.h
+++ b/arch/powerpc/include/asm/cputable.h
@@ -191,7 +191,7 @@ static inline void cpu_feature_keys_init(void) { }
#define CPU_FTR_SPURR LONG_ASM_CONST(0x0000000001000000)
#define CPU_FTR_DSCR LONG_ASM_CONST(0x0000000002000000)
#define CPU_FTR_VSX LONG_ASM_CONST(0x0000000004000000)
-#define CPU_FTR_SAO LONG_ASM_CONST(0x0000000008000000)
+// Free LONG_ASM_CONST(0x0000000008000000)
#define CPU_FTR_CP_USE_DCBTZ LONG_ASM_CONST(0x0000000010000000)
#define CPU_FTR_UNALIGNED_LD_STD LONG_ASM_CONST(0x0000000020000000)
#define CPU_FTR_ASYM_SMT LONG_ASM_CONST(0x0000000040000000)
@@ -435,7 +435,7 @@ static inline void cpu_feature_keys_init(void) { }
CPU_FTR_MMCRA | CPU_FTR_SMT | \
CPU_FTR_COHERENT_ICACHE | \
CPU_FTR_PURR | CPU_FTR_SPURR | CPU_FTR_REAL_LE | \
- CPU_FTR_DSCR | CPU_FTR_SAO | CPU_FTR_ASYM_SMT | \
+ CPU_FTR_DSCR | CPU_FTR_ASYM_SMT | \
CPU_FTR_STCX_CHECKS_ADDRESS | CPU_FTR_POPCNTB | CPU_FTR_POPCNTD | \
CPU_FTR_CFAR | CPU_FTR_HVMODE | \
CPU_FTR_VMX_COPY | CPU_FTR_HAS_PPR | CPU_FTR_DABRX | CPU_FTR_PKEY)
@@ -444,7 +444,7 @@ static inline void cpu_feature_keys_init(void) { }
CPU_FTR_MMCRA | CPU_FTR_SMT | \
CPU_FTR_COHERENT_ICACHE | \
CPU_FTR_PURR | CPU_FTR_SPURR | CPU_FTR_REAL_LE | \
- CPU_FTR_DSCR | CPU_FTR_SAO | \
+ CPU_FTR_DSCR | \
CPU_FTR_STCX_CHECKS_ADDRESS | CPU_FTR_POPCNTB | CPU_FTR_POPCNTD | \
CPU_FTR_CFAR | CPU_FTR_HVMODE | CPU_FTR_VMX_COPY | \
CPU_FTR_DBELL | CPU_FTR_HAS_PPR | CPU_FTR_DAWR | \
@@ -455,7 +455,7 @@ static inline void cpu_feature_keys_init(void) { }
CPU_FTR_MMCRA | CPU_FTR_SMT | \
CPU_FTR_COHERENT_ICACHE | \
CPU_FTR_PURR | CPU_FTR_SPURR | CPU_FTR_REAL_LE | \
- CPU_FTR_DSCR | CPU_FTR_SAO | \
+ CPU_FTR_DSCR | \
CPU_FTR_STCX_CHECKS_ADDRESS | CPU_FTR_POPCNTB | CPU_FTR_POPCNTD | \
CPU_FTR_CFAR | CPU_FTR_HVMODE | CPU_FTR_VMX_COPY | \
CPU_FTR_DBELL | CPU_FTR_HAS_PPR | CPU_FTR_ARCH_207S | \
@@ -473,7 +473,7 @@ static inline void cpu_feature_keys_init(void) { }
CPU_FTR_MMCRA | CPU_FTR_SMT | \
CPU_FTR_COHERENT_ICACHE | \
CPU_FTR_PURR | CPU_FTR_SPURR | CPU_FTR_REAL_LE | \
- CPU_FTR_DSCR | CPU_FTR_SAO | \
+ CPU_FTR_DSCR | \
CPU_FTR_STCX_CHECKS_ADDRESS | CPU_FTR_POPCNTB | CPU_FTR_POPCNTD | \
CPU_FTR_CFAR | CPU_FTR_HVMODE | CPU_FTR_VMX_COPY | \
CPU_FTR_DBELL | CPU_FTR_HAS_PPR | CPU_FTR_ARCH_207S | \
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index 9bb9bb370b53..fac39ff659d4 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -398,9 +398,10 @@ static inline bool hpte_cache_flags_ok(unsigned long hptel, bool is_ci)
{
unsigned int wimg = hptel & HPTE_R_WIMG;
- /* Handle SAO */
+ /* Handle SAO for POWER7,8,9 */
if (wimg == (HPTE_R_W | HPTE_R_I | HPTE_R_M) &&
- cpu_has_feature(CPU_FTR_ARCH_206))
+ cpu_has_feature(CPU_FTR_ARCH_206) &&
+ !cpu_has_feature(CPU_FTR_ARCH_31))
wimg = HPTE_R_M;
if (!is_ci)
diff --git a/arch/powerpc/include/asm/mman.h b/arch/powerpc/include/asm/mman.h
index 4ba303ea27f5..7c07728af300 100644
--- a/arch/powerpc/include/asm/mman.h
+++ b/arch/powerpc/include/asm/mman.h
@@ -13,38 +13,20 @@
#include <linux/pkeys.h>
#include <asm/cpu_has_feature.h>
+#ifdef CONFIG_PPC_MEM_KEYS
static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot,
unsigned long pkey)
{
-#ifdef CONFIG_PPC_MEM_KEYS
- return (((prot & PROT_SAO) ? VM_SAO : 0) | pkey_to_vmflag_bits(pkey));
-#else
- return ((prot & PROT_SAO) ? VM_SAO : 0);
-#endif
+ return pkey_to_vmflag_bits(pkey);
}
#define arch_calc_vm_prot_bits(prot, pkey) arch_calc_vm_prot_bits(prot, pkey)
static inline pgprot_t arch_vm_get_page_prot(unsigned long vm_flags)
{
-#ifdef CONFIG_PPC_MEM_KEYS
- return (vm_flags & VM_SAO) ?
- __pgprot(_PAGE_SAO | vmflag_to_pte_pkey_bits(vm_flags)) :
- __pgprot(0 | vmflag_to_pte_pkey_bits(vm_flags));
-#else
- return (vm_flags & VM_SAO) ? __pgprot(_PAGE_SAO) : __pgprot(0);
-#endif
+ return __pgprot(vmflag_to_pte_pkey_bits(vm_flags));
}
#define arch_vm_get_page_prot(vm_flags) arch_vm_get_page_prot(vm_flags)
-
-static inline bool arch_validate_prot(unsigned long prot, unsigned long addr)
-{
- if (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC | PROT_SEM | PROT_SAO))
- return false;
- if ((prot & PROT_SAO) && !cpu_has_feature(CPU_FTR_SAO))
- return false;
- return true;
-}
-#define arch_validate_prot arch_validate_prot
+#endif
#endif /* CONFIG_PPC64 */
#endif /* _ASM_POWERPC_MMAN_H */
diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h
index 6cb8aa357191..59ee9fa4ae09 100644
--- a/arch/powerpc/include/asm/nohash/64/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/64/pgtable.h
@@ -82,8 +82,6 @@
*/
#include <asm/nohash/pte-book3e.h>
-#define _PAGE_SAO 0
-
#define PTE_RPN_MASK (~((1UL << PTE_RPN_SHIFT) - 1))
/*
diff --git a/arch/powerpc/include/uapi/asm/mman.h b/arch/powerpc/include/uapi/asm/mman.h
index c0c737215b00..3a700351feca 100644
--- a/arch/powerpc/include/uapi/asm/mman.h
+++ b/arch/powerpc/include/uapi/asm/mman.h
@@ -11,7 +11,7 @@
#include <asm-generic/mman-common.h>
-#define PROT_SAO 0x10 /* Strong Access Ordering */
+#define PROT_SAO 0x10 /* Unsupported since v5.9 */
#define MAP_RENAME MAP_ANONYMOUS /* In SunOS terminology */
#define MAP_NORESERVE 0x40 /* don't reserve swap pages */
diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c
index 3a409517c031..41412c198e70 100644
--- a/arch/powerpc/kernel/dt_cpu_ftrs.c
+++ b/arch/powerpc/kernel/dt_cpu_ftrs.c
@@ -622,7 +622,7 @@ static struct dt_cpu_feature_match __initdata
{"processor-control-facility-v3", feat_enable_dbell, CPU_FTR_DBELL},
{"processor-utilization-of-resources-register", feat_enable_purr, 0},
{"no-execute", feat_enable, 0},
- {"strong-access-ordering", feat_enable, CPU_FTR_SAO},
+ /* strong-access-ordering is unused */
{"cache-inhibited-large-page", feat_enable_large_ci, 0},
{"coprocessor-icswx", feat_enable, 0},
{"hypervisor-virtualization-interrupt", feat_enable_hvi, 0},
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c
index 468169e33c86..e35d8dae4f84 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -232,8 +232,6 @@ unsigned long htab_convert_pte_flags(unsigned long pteflags)
rflags |= HPTE_R_I;
else if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_NON_IDEMPOTENT)
rflags |= (HPTE_R_I | HPTE_R_G);
- else if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_SAO)
- rflags |= (HPTE_R_W | HPTE_R_I | HPTE_R_M);
else
/*
* Add memory coherence if cache inhibited is not set
diff --git a/include/linux/mm.h b/include/linux/mm.h
index dc7b87310c10..6c8333d6c991 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -317,8 +317,6 @@ extern unsigned int kobjsize(const void *objp);
#if defined(CONFIG_X86)
# define VM_PAT VM_ARCH_1 /* PAT reserves whole VMA at once (x86) */
-#elif defined(CONFIG_PPC)
-# define VM_SAO VM_ARCH_1 /* Strong Access Ordering (powerpc) */
#elif defined(CONFIG_PARISC)
# define VM_GROWSUP VM_ARCH_1
#elif defined(CONFIG_IA64)
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
index 5fb752034386..939092dbcb8b 100644
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -114,8 +114,6 @@ IF_HAVE_PG_IDLE(PG_idle, "idle" )
#if defined(CONFIG_X86)
#define __VM_ARCH_SPECIFIC_1 {VM_PAT, "pat" }
-#elif defined(CONFIG_PPC)
-#define __VM_ARCH_SPECIFIC_1 {VM_SAO, "sao" }
#elif defined(CONFIG_PARISC) || defined(CONFIG_IA64)
#define __VM_ARCH_SPECIFIC_1 {VM_GROWSUP, "growsup" }
#elif !defined(CONFIG_MMU)
diff --git a/mm/ksm.c b/mm/ksm.c
index 4102034cd55a..d1cfa18689b5 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -2452,10 +2452,6 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
if (vma_is_dax(vma))
return 0;
-#ifdef VM_SAO
- if (*vm_flags & VM_SAO)
- return 0;
-#endif
#ifdef VM_SPARC_ADI
if (*vm_flags & VM_SPARC_ADI)
return 0;
diff --git a/tools/testing/selftests/powerpc/mm/.gitignore b/tools/testing/selftests/powerpc/mm/.gitignore
index 2ca523255b1b..ff296c94f627 100644
--- a/tools/testing/selftests/powerpc/mm/.gitignore
+++ b/tools/testing/selftests/powerpc/mm/.gitignore
@@ -2,7 +2,6 @@
hugetlb_vs_thp_test
subpage_prot
tempfile
-prot_sao
segv_errors
wild_bctr
large_vm_fork_separation
diff --git a/tools/testing/selftests/powerpc/mm/Makefile b/tools/testing/selftests/powerpc/mm/Makefile
index b9103c4bb414..9b8a7b3069c5 100644
--- a/tools/testing/selftests/powerpc/mm/Makefile
+++ b/tools/testing/selftests/powerpc/mm/Makefile
@@ -2,7 +2,7 @@
noarg:
$(MAKE) -C ../
-TEST_GEN_PROGS := hugetlb_vs_thp_test subpage_prot prot_sao segv_errors wild_bctr \
+TEST_GEN_PROGS := hugetlb_vs_thp_test subpage_prot segv_errors wild_bctr \
large_vm_fork_separation bad_accesses
TEST_GEN_PROGS_EXTENDED := tlbie_test
TEST_GEN_FILES := tempfile
@@ -12,8 +12,6 @@ include ../../lib.mk
$(TEST_GEN_PROGS): ../harness.c
-$(OUTPUT)/prot_sao: ../utils.c
-
$(OUTPUT)/wild_bctr: CFLAGS += -m64
$(OUTPUT)/large_vm_fork_separation: CFLAGS += -m64
$(OUTPUT)/bad_accesses: CFLAGS += -m64
diff --git a/tools/testing/selftests/powerpc/mm/prot_sao.c b/tools/testing/selftests/powerpc/mm/prot_sao.c
deleted file mode 100644
index e2eed65b7735..000000000000
--- a/tools/testing/selftests/powerpc/mm/prot_sao.c
+++ /dev/null
@@ -1,42 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright 2016, Michael Ellerman, IBM Corp.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/mman.h>
-
-#include <asm/cputable.h>
-
-#include "utils.h"
-
-#define SIZE (64 * 1024)
-
-int test_prot_sao(void)
-{
- char *p;
-
- /* 2.06 or later should support SAO */
- SKIP_IF(!have_hwcap(PPC_FEATURE_ARCH_2_06));
-
- /*
- * Ensure we can ask for PROT_SAO.
- * We can't really verify that it does the right thing, but at least we
- * confirm the kernel will accept it.
- */
- p = mmap(NULL, SIZE, PROT_READ | PROT_WRITE | PROT_SAO,
- MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
- FAIL_IF(p == MAP_FAILED);
-
- /* Write to the mapping, to at least cause a fault */
- memset(p, 0xaa, SIZE);
-
- return 0;
-}
-
-int main(void)
-{
- return test_harness(test_prot_sao, "prot-sao");
-}
--
2.23.0
^ permalink raw reply related
* [PATCH v2 3/3] powerpc/64s/hash: disable subpage_prot syscall by default
From: Nicholas Piggin @ 2020-07-03 1:19 UTC (permalink / raw)
To: linuxppc-dev; +Cc: linux-mm, kvm-ppc, Nicholas Piggin, linux-api
In-Reply-To: <20200703011958.1166620-1-npiggin@gmail.com>
The subpage_prot syscall was added for specialised system software
(Lx86) that has been discontinued for about 7 years, and is not thought
to be used elsewhere, so disable it by default.
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
arch/powerpc/Kconfig | 7 +++++--
arch/powerpc/configs/powernv_defconfig | 1 -
arch/powerpc/configs/pseries_defconfig | 1 -
3 files changed, 5 insertions(+), 4 deletions(-)
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 9fa23eb320ff..04c6ca17661a 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -833,13 +833,16 @@ config FORCE_MAX_ZONEORDER
this in mind when choosing a value for this option.
config PPC_SUBPAGE_PROT
- bool "Support setting protections for 4k subpages"
+ bool "Support setting protections for 4k subpages (subpage_prot syscall)"
+ default n
depends on PPC_BOOK3S_64 && PPC_64K_PAGES
help
- This option adds support for a system call to allow user programs
+ This option adds support for system call to allow user programs
to set access permissions (read/write, readonly, or no access)
on the 4k subpages of each 64k page.
+ If unsure, say N here.
+
config PPC_COPRO_BASE
bool
diff --git a/arch/powerpc/configs/powernv_defconfig b/arch/powerpc/configs/powernv_defconfig
index 2de9aadf0f50..afc0dd73a1e6 100644
--- a/arch/powerpc/configs/powernv_defconfig
+++ b/arch/powerpc/configs/powernv_defconfig
@@ -64,7 +64,6 @@ CONFIG_HWPOISON_INJECT=m
CONFIG_TRANSPARENT_HUGEPAGE=y
CONFIG_DEFERRED_STRUCT_PAGE_INIT=y
CONFIG_PPC_64K_PAGES=y
-CONFIG_PPC_SUBPAGE_PROT=y
CONFIG_SCHED_SMT=y
CONFIG_PM=y
CONFIG_HOTPLUG_PCI=y
diff --git a/arch/powerpc/configs/pseries_defconfig b/arch/powerpc/configs/pseries_defconfig
index dfa4a726333b..894e8d85fb48 100644
--- a/arch/powerpc/configs/pseries_defconfig
+++ b/arch/powerpc/configs/pseries_defconfig
@@ -57,7 +57,6 @@ CONFIG_MEMORY_HOTREMOVE=y
CONFIG_KSM=y
CONFIG_TRANSPARENT_HUGEPAGE=y
CONFIG_PPC_64K_PAGES=y
-CONFIG_PPC_SUBPAGE_PROT=y
CONFIG_SCHED_SMT=y
CONFIG_HOTPLUG_PCI=y
CONFIG_HOTPLUG_PCI_RPA=m
--
2.23.0
^ permalink raw reply related
* [Bug 208181] BUG: KASAN: stack-out-of-bounds in strcmp+0x58/0xd8
From: bugzilla-daemon @ 2020-07-03 4:55 UTC (permalink / raw)
To: linuxppc-dev
In-Reply-To: <bug-208181-206035@https.bugzilla.kernel.org/>
https://bugzilla.kernel.org/show_bug.cgi?id=208181
--- Comment #15 from Christophe Leroy (christophe.leroy@csgroup.eu) ---
Ah yes, having init_text above the 24 bits limit might be a problem for
function calls. I'm surprised that the linker doesn't complain.
Anyway, it is not a problem in itself, and it's unrelated to this bug.
--
You are receiving this mail because:
You are watching the assignee of the bug.
^ permalink raw reply
* Re: objtool clac/stac handling change..
From: Christophe Leroy @ 2020-07-03 5:27 UTC (permalink / raw)
To: Michael Ellerman, Linus Torvalds, Al Viro
Cc: Peter Zijlstra, the arch/x86 maintainers,
linuxppc-dev@lists.ozlabs.org, Linux Kernel Mailing List,
Josh Poimboeuf
In-Reply-To: <87h7up70e5.fsf@mpe.ellerman.id.au>
Le 03/07/2020 à 05:17, Michael Ellerman a écrit :
> Christophe Leroy <christophe.leroy@csgroup.eu> writes:
>> Le 02/07/2020 à 15:34, Michael Ellerman a écrit :
>>> Linus Torvalds <torvalds@linux-foundation.org> writes:
>>>> On Wed, Jul 1, 2020 at 12:59 PM Al Viro <viro@zeniv.linux.org.uk> wrote:
>>>>> On Wed, Jul 01, 2020 at 12:04:36PM -0700, Linus Torvalds wrote:
>>>>>>
>>>>>> That's actually for the access granting. Shutting the access down ends
>>>>>> up always doing the same thing anyway..
>>>>>
>>>>> #define user_read_access_end prevent_current_read_from_user
>>>>> #define user_write_access_end prevent_current_write_to_user
>>>>> static inline void prevent_current_read_from_user(void)
>>>>> {
>>>>> prevent_user_access(NULL, NULL, ~0UL, KUAP_CURRENT_READ);
>>>>> }
>>>>>
>>>>> static inline void prevent_current_write_to_user(void)
>>>>> {
>>>>> prevent_user_access(NULL, NULL, ~0UL, KUAP_CURRENT_WRITE);
>>>>> }
>>>>>
>>>>> and prevent_user_access() has instances that do care about the direction...
>>>>
>>>> Go and look closer.
>>>>
>>>> There are three cases:
>>>>
>>>> (a) the 32-bit book3s case. It looks like it cares, but when you look
>>>> closer, it ends up not caring about the read side, and saving the
>>>> "which address to I allow user writes to" in current->thread.kuap
>>>>
>>>> (b) the nohash 32-bit case - doesn't care
>>>>
>>>> (c) the 64-bit books case - doesn't care
>>>>
>>>> So yes, in the (a) case it does make a difference between reads and
>>>> writes, but at least as far as I can tell, it ignores the read case,
>>>> and has code to avoid the unnecessary "disable user writes" case when
>>>> there was only a read enable done.
>>>
>>> Yeah that's my understanding too.
>>>
>>> Christophe is the expert on that code so I'll defer to him if I'm wrong.
>>>
>>>> Now, it's possible that I'm wrong, but the upshot of that is that even
>>>> on powerpc, I think that if we just made the rule be that "taking a
>>>> user exception should automatically do the 'user_access_end()' for us"
>>>> is trivial.
>>>
>>> I think we can do something to make it work.
>>>
>>> We don't have an equivalent of x86's ex_handler_uaccess(), so it's not
>>> quite as easy as whacking a user_access_end() in there.
>>
>> Isn't it something easy to do in bad_page_fault() ?
>
> We'd need to do it there at least.
>
> But I'm not convinced that's the only place we'd need to do it. We could
> theoretically take a machine check on a user access, and those are
> handled differently on each sub-(sub-sub)-platform, and I think all or
> most of them don't call bad_page_fault().
Indeed, it needs to be done everywhere we do
regs->nip = extable_fixup(entry)
There are half a dozen places that do that in addition to
bad_page_fault(): mainly machine checks, and also kprobe.
I think we can create a fixup_exception() function which takes regs and
entry as parameters and does the nip fixup and kuap closure.
>
>> Not exactly a call to user_access_end() but altering regs->kuap so that
>> user access is not restored on exception exit.
>
> Yes.
>
>>> Probably the simplest option for us is to just handle it in our
>>> unsafe_op_wrap(). I'll try and come up with something tomorrow.
>>
>> unsafe_op_wrap() is not used anymore for unsafe_put_user() as we are now
>> using asm goto.
>
> Sure, but we could change it back to use unsafe_op_wrap().
But the whole purpose of using goto in unsafe_???_user() is to allow the
use of asm goto. See explanations in commit
https://github.com/linuxppc/linux/commit/1bd4403d86a1c06cb6cc9ac87664a0c9d3413d51#diff-eba084de047bb8a9087dac10c06f44bc
>
> I did a quick hack to do that and see no difference in the generated
> code, but your commit adding put_user_goto() did show better code
> generation, so possibly it depends on compiler version, or my example
> wasn't complicated enough (filldir()).
Yes, as explained above, it should remove the error checking in the caller,
so your example was most likely too trivial.
Christophe
^ permalink raw reply
* [PATCH v3 0/3] Off-load TLB invalidations to host for !GTSE
From: Bharata B Rao @ 2020-07-03 5:36 UTC (permalink / raw)
To: linuxppc-dev; +Cc: aneesh.kumar, Bharata B Rao, npiggin
Hypervisor may choose not to enable Guest Translation Shootdown Enable
(GTSE) option for the guest. When GTSE isn't ON, the guest OS isn't
permitted to use instructions like tlbie and tlbsync directly, but is
expected to make hypervisor calls to get the TLB flushed.
This series enables the TLB flush routines in the radix code to
off-load TLB flushing to hypervisor via the newly proposed hcall
H_RPT_INVALIDATE.
To easily check the availability of GTSE, it is made an MMU feature.
The OV5 handling and H_REGISTER_PROC_TBL hcall are changed to
handle GTSE as an optionally available feature and to not assume GTSE
when radix support is available.
The actual hcall implementation for KVM isn't included in this
patchset and will be posted separately.
Changes in v3
=============
- Fixed a bug in the hcall wrapper code where we were missing setting
H_RPTI_TYPE_NESTED while retrying the failed flush request with
a full flush for the nested case.
- s/psize_to_h_rpti/psize_to_rpti_pgsize
v2: https://lore.kernel.org/linuxppc-dev/20200626131000.5207-1-bharata@linux.ibm.com/T/#t
Bharata B Rao (2):
powerpc/mm: Enable radix GTSE only if supported.
powerpc/pseries: H_REGISTER_PROC_TBL should ask for GTSE only if
enabled
Nicholas Piggin (1):
powerpc/mm/book3s64/radix: Off-load TLB invalidations to host when
!GTSE
.../include/asm/book3s/64/tlbflush-radix.h | 15 ++++
arch/powerpc/include/asm/hvcall.h | 34 +++++++-
arch/powerpc/include/asm/mmu.h | 4 +
arch/powerpc/include/asm/plpar_wrappers.h | 52 ++++++++++++
arch/powerpc/kernel/dt_cpu_ftrs.c | 1 +
arch/powerpc/kernel/prom_init.c | 13 +--
arch/powerpc/mm/book3s64/radix_tlb.c | 82 +++++++++++++++++--
arch/powerpc/mm/init_64.c | 5 +-
arch/powerpc/platforms/pseries/lpar.c | 8 +-
9 files changed, 197 insertions(+), 17 deletions(-)
--
2.21.3
^ permalink raw reply
* [PATCH v3 1/3] powerpc/mm: Enable radix GTSE only if supported.
From: Bharata B Rao @ 2020-07-03 5:36 UTC (permalink / raw)
To: linuxppc-dev; +Cc: aneesh.kumar, Bharata B Rao, npiggin
In-Reply-To: <20200703053608.12884-1-bharata@linux.ibm.com>
Make GTSE an MMU feature and enable it by default for radix.
However, for a guest, conditionally enable it only if the hypervisor
supports it via the OV5 vector. Let prom_init ask for radix GTSE only if
the support exists.
Having GTSE as an MMU feature will make it easy to enable radix
without GTSE. Currently radix assumes GTSE is enabled by default.
Signed-off-by: Bharata B Rao <bharata@linux.ibm.com>
Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
arch/powerpc/include/asm/mmu.h | 4 ++++
arch/powerpc/kernel/dt_cpu_ftrs.c | 1 +
arch/powerpc/kernel/prom_init.c | 13 ++++++++-----
arch/powerpc/mm/init_64.c | 5 ++++-
4 files changed, 17 insertions(+), 6 deletions(-)
diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h
index f4ac25d4df05..884d51995934 100644
--- a/arch/powerpc/include/asm/mmu.h
+++ b/arch/powerpc/include/asm/mmu.h
@@ -28,6 +28,9 @@
* Individual features below.
*/
+/* Guest Translation Shootdown Enable */
+#define MMU_FTR_GTSE ASM_CONST(0x00001000)
+
/*
* Support for 68 bit VA space. We added that from ISA 2.05
*/
@@ -173,6 +176,7 @@ enum {
#endif
#ifdef CONFIG_PPC_RADIX_MMU
MMU_FTR_TYPE_RADIX |
+ MMU_FTR_GTSE |
#ifdef CONFIG_PPC_KUAP
MMU_FTR_RADIX_KUAP |
#endif /* CONFIG_PPC_KUAP */
diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c
index a0edeb391e3e..ac650c233cd9 100644
--- a/arch/powerpc/kernel/dt_cpu_ftrs.c
+++ b/arch/powerpc/kernel/dt_cpu_ftrs.c
@@ -336,6 +336,7 @@ static int __init feat_enable_mmu_radix(struct dt_cpu_feature *f)
#ifdef CONFIG_PPC_RADIX_MMU
cur_cpu_spec->mmu_features |= MMU_FTR_TYPE_RADIX;
cur_cpu_spec->mmu_features |= MMU_FTRS_HASH_BASE;
+ cur_cpu_spec->mmu_features |= MMU_FTR_GTSE;
cur_cpu_spec->cpu_user_features |= PPC_FEATURE_HAS_MMU;
return 1;
diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
index 90c604d00b7d..cbc605cfdec0 100644
--- a/arch/powerpc/kernel/prom_init.c
+++ b/arch/powerpc/kernel/prom_init.c
@@ -1336,12 +1336,15 @@ static void __init prom_check_platform_support(void)
}
}
- if (supported.radix_mmu && supported.radix_gtse &&
- IS_ENABLED(CONFIG_PPC_RADIX_MMU)) {
- /* Radix preferred - but we require GTSE for now */
- prom_debug("Asking for radix with GTSE\n");
+ if (supported.radix_mmu && IS_ENABLED(CONFIG_PPC_RADIX_MMU)) {
+ /* Radix preferred - Check if GTSE is also supported */
+ prom_debug("Asking for radix\n");
ibm_architecture_vec.vec5.mmu = OV5_FEAT(OV5_MMU_RADIX);
- ibm_architecture_vec.vec5.radix_ext = OV5_FEAT(OV5_RADIX_GTSE);
+ if (supported.radix_gtse)
+ ibm_architecture_vec.vec5.radix_ext =
+ OV5_FEAT(OV5_RADIX_GTSE);
+ else
+ prom_debug("Radix GTSE isn't supported\n");
} else if (supported.hash_mmu) {
/* Default to hash mmu (if we can) */
prom_debug("Asking for hash\n");
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index bc73abf0bc25..152aa0200cef 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -407,12 +407,15 @@ static void __init early_check_vec5(void)
if (!(vec5[OV5_INDX(OV5_RADIX_GTSE)] &
OV5_FEAT(OV5_RADIX_GTSE))) {
pr_warn("WARNING: Hypervisor doesn't support RADIX with GTSE\n");
- }
+ cur_cpu_spec->mmu_features &= ~MMU_FTR_GTSE;
+ } else
+ cur_cpu_spec->mmu_features |= MMU_FTR_GTSE;
/* Do radix anyway - the hypervisor said we had to */
cur_cpu_spec->mmu_features |= MMU_FTR_TYPE_RADIX;
} else if (mmu_supported == OV5_FEAT(OV5_MMU_HASH)) {
/* Hypervisor only supports hash - disable radix */
cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX;
+ cur_cpu_spec->mmu_features &= ~MMU_FTR_GTSE;
}
}
--
2.21.3
^ permalink raw reply related
* [PATCH v3 2/3] powerpc/pseries: H_REGISTER_PROC_TBL should ask for GTSE only if enabled
From: Bharata B Rao @ 2020-07-03 5:36 UTC (permalink / raw)
To: linuxppc-dev; +Cc: aneesh.kumar, Bharata B Rao, npiggin
In-Reply-To: <20200703053608.12884-1-bharata@linux.ibm.com>
H_REGISTER_PROC_TBL asks for GTSE by default. GTSE flag bit should
be set only when GTSE is supported.
Signed-off-by: Bharata B Rao <bharata@linux.ibm.com>
Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
arch/powerpc/platforms/pseries/lpar.c | 8 +++++---
1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
index fd26f3d21d7b..f82569a505f1 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -1680,9 +1680,11 @@ static int pseries_lpar_register_process_table(unsigned long base,
if (table_size)
flags |= PROC_TABLE_NEW;
- if (radix_enabled())
- flags |= PROC_TABLE_RADIX | PROC_TABLE_GTSE;
- else
+ if (radix_enabled()) {
+ flags |= PROC_TABLE_RADIX;
+ if (mmu_has_feature(MMU_FTR_GTSE))
+ flags |= PROC_TABLE_GTSE;
+ } else
flags |= PROC_TABLE_HPT_SLB;
for (;;) {
rc = plpar_hcall_norets(H_REGISTER_PROC_TBL, flags, base,
--
2.21.3
^ permalink raw reply related
* [PATCH v3 3/3] powerpc/mm/book3s64/radix: Off-load TLB invalidations to host when !GTSE
From: Bharata B Rao @ 2020-07-03 5:36 UTC (permalink / raw)
To: linuxppc-dev; +Cc: aneesh.kumar, Bharata B Rao, npiggin
In-Reply-To: <20200703053608.12884-1-bharata@linux.ibm.com>
From: Nicholas Piggin <npiggin@gmail.com>
When platform doesn't support GTSE, let TLB invalidation requests
for radix guests be off-loaded to the host using H_RPT_INVALIDATE
hcall.
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Bharata B Rao <bharata@linux.ibm.com>
[hcall wrapper, error path handling and renames]
---
.../include/asm/book3s/64/tlbflush-radix.h | 15 ++++
arch/powerpc/include/asm/hvcall.h | 34 +++++++-
arch/powerpc/include/asm/plpar_wrappers.h | 52 ++++++++++++
arch/powerpc/mm/book3s64/radix_tlb.c | 82 +++++++++++++++++--
4 files changed, 175 insertions(+), 8 deletions(-)
diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
index ca8db193ae38..94439e0cefc9 100644
--- a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
+++ b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
@@ -2,10 +2,25 @@
#ifndef _ASM_POWERPC_TLBFLUSH_RADIX_H
#define _ASM_POWERPC_TLBFLUSH_RADIX_H
+#include <asm/hvcall.h>
+
struct vm_area_struct;
struct mm_struct;
struct mmu_gather;
+static inline u64 psize_to_rpti_pgsize(unsigned long psize)
+{
+ if (psize == MMU_PAGE_4K)
+ return H_RPTI_PAGE_4K;
+ if (psize == MMU_PAGE_64K)
+ return H_RPTI_PAGE_64K;
+ if (psize == MMU_PAGE_2M)
+ return H_RPTI_PAGE_2M;
+ if (psize == MMU_PAGE_1G)
+ return H_RPTI_PAGE_1G;
+ return H_RPTI_PAGE_ALL;
+}
+
static inline int mmu_get_ap(int psize)
{
return mmu_psize_defs[psize].ap;
diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h
index e90c073e437e..43486e773bd6 100644
--- a/arch/powerpc/include/asm/hvcall.h
+++ b/arch/powerpc/include/asm/hvcall.h
@@ -305,7 +305,8 @@
#define H_SCM_UNBIND_ALL 0x3FC
#define H_SCM_HEALTH 0x400
#define H_SCM_PERFORMANCE_STATS 0x418
-#define MAX_HCALL_OPCODE H_SCM_PERFORMANCE_STATS
+#define H_RPT_INVALIDATE 0x448
+#define MAX_HCALL_OPCODE H_RPT_INVALIDATE
/* Scope args for H_SCM_UNBIND_ALL */
#define H_UNBIND_SCOPE_ALL (0x1)
@@ -389,6 +390,37 @@
#define PROC_TABLE_RADIX 0x04
#define PROC_TABLE_GTSE 0x01
+/*
+ * Defines for
+ * H_RPT_INVALIDATE - Invalidate RPT translation lookaside information.
+ */
+
+/* Type of translation to invalidate (type) */
+#define H_RPTI_TYPE_NESTED 0x0001 /* Invalidate nested guest partition-scope */
+#define H_RPTI_TYPE_TLB 0x0002 /* Invalidate TLB */
+#define H_RPTI_TYPE_PWC 0x0004 /* Invalidate Page Walk Cache */
+/* Invalidate Process Table Entries if H_RPTI_TYPE_NESTED is clear */
+#define H_RPTI_TYPE_PRT 0x0008
+/* Invalidate Partition Table Entries if H_RPTI_TYPE_NESTED is set */
+#define H_RPTI_TYPE_PAT 0x0008
+#define H_RPTI_TYPE_ALL (H_RPTI_TYPE_TLB | H_RPTI_TYPE_PWC | \
+ H_RPTI_TYPE_PRT)
+#define H_RPTI_TYPE_NESTED_ALL (H_RPTI_TYPE_TLB | H_RPTI_TYPE_PWC | \
+ H_RPTI_TYPE_PAT)
+
+/* Invalidation targets (target) */
+#define H_RPTI_TARGET_CMMU 0x01 /* All virtual processors in the partition */
+#define H_RPTI_TARGET_CMMU_LOCAL 0x02 /* Current virtual processor */
+/* All nest/accelerator agents in use by the partition */
+#define H_RPTI_TARGET_NMMU 0x04
+
+/* Page size mask (page sizes) */
+#define H_RPTI_PAGE_4K 0x01
+#define H_RPTI_PAGE_64K 0x02
+#define H_RPTI_PAGE_2M 0x04
+#define H_RPTI_PAGE_1G 0x08
+#define H_RPTI_PAGE_ALL (-1UL)
+
#ifndef __ASSEMBLY__
#include <linux/types.h>
diff --git a/arch/powerpc/include/asm/plpar_wrappers.h b/arch/powerpc/include/asm/plpar_wrappers.h
index 4497c8afb573..4293c5d2ddf4 100644
--- a/arch/powerpc/include/asm/plpar_wrappers.h
+++ b/arch/powerpc/include/asm/plpar_wrappers.h
@@ -334,6 +334,51 @@ static inline long plpar_get_cpu_characteristics(struct h_cpu_char_result *p)
return rc;
}
+/*
+ * Wrapper to H_RPT_INVALIDATE hcall that handles return values appropriately
+ *
+ * - Returns H_SUCCESS on success
+ * - For H_BUSY return value, we retry the hcall.
+ * - For any other hcall failures, attempt a full flush once before
+ * resorting to BUG().
+ *
+ * Note: This hcall is expected to fail only very rarely. The correct
+ * error recovery of killing the process/guest will be eventually
+ * needed.
+ */
+static inline long pseries_rpt_invalidate(u32 pid, u64 target, u64 type,
+ u64 page_sizes, u64 start, u64 end)
+{
+ long rc;
+ unsigned long all;
+
+ while (true) {
+ rc = plpar_hcall_norets(H_RPT_INVALIDATE, pid, target, type,
+ page_sizes, start, end);
+ if (rc == H_BUSY) {
+ cpu_relax();
+ continue;
+ } else if (rc == H_SUCCESS)
+ return rc;
+
+ /* Flush request failed, try with a full flush once */
+ if (type & H_RPTI_TYPE_NESTED)
+ all = H_RPTI_TYPE_NESTED | H_RPTI_TYPE_NESTED_ALL;
+ else
+ all = H_RPTI_TYPE_ALL;
+retry:
+ rc = plpar_hcall_norets(H_RPT_INVALIDATE, pid, target,
+ all, page_sizes, 0, -1UL);
+ if (rc == H_BUSY) {
+ cpu_relax();
+ goto retry;
+ } else if (rc == H_SUCCESS)
+ return rc;
+
+ BUG();
+ }
+}
+
#else /* !CONFIG_PPC_PSERIES */
static inline long plpar_set_ciabr(unsigned long ciabr)
@@ -346,6 +391,13 @@ static inline long plpar_pte_read_4(unsigned long flags, unsigned long ptex,
{
return 0;
}
+
+static inline long pseries_rpt_invalidate(u32 pid, u64 target, u64 type,
+ u64 page_sizes, u64 start, u64 end)
+{
+ return 0;
+}
+
#endif /* CONFIG_PPC_PSERIES */
#endif /* _ASM_POWERPC_PLPAR_WRAPPERS_H */
diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
index b5cc9b23cf02..0d233763441f 100644
--- a/arch/powerpc/mm/book3s64/radix_tlb.c
+++ b/arch/powerpc/mm/book3s64/radix_tlb.c
@@ -16,6 +16,7 @@
#include <asm/tlbflush.h>
#include <asm/trace.h>
#include <asm/cputhreads.h>
+#include <asm/plpar_wrappers.h>
#define RIC_FLUSH_TLB 0
#define RIC_FLUSH_PWC 1
@@ -694,7 +695,14 @@ void radix__flush_tlb_mm(struct mm_struct *mm)
goto local;
}
- if (cputlb_use_tlbie()) {
+ if (!mmu_has_feature(MMU_FTR_GTSE)) {
+ unsigned long tgt = H_RPTI_TARGET_CMMU;
+
+ if (atomic_read(&mm->context.copros) > 0)
+ tgt |= H_RPTI_TARGET_NMMU;
+ pseries_rpt_invalidate(pid, tgt, H_RPTI_TYPE_TLB,
+ H_RPTI_PAGE_ALL, 0, -1UL);
+ } else if (cputlb_use_tlbie()) {
if (mm_needs_flush_escalation(mm))
_tlbie_pid(pid, RIC_FLUSH_ALL);
else
@@ -727,7 +735,16 @@ static void __flush_all_mm(struct mm_struct *mm, bool fullmm)
goto local;
}
}
- if (cputlb_use_tlbie())
+ if (!mmu_has_feature(MMU_FTR_GTSE)) {
+ unsigned long tgt = H_RPTI_TARGET_CMMU;
+ unsigned long type = H_RPTI_TYPE_TLB | H_RPTI_TYPE_PWC |
+ H_RPTI_TYPE_PRT;
+
+ if (atomic_read(&mm->context.copros) > 0)
+ tgt |= H_RPTI_TARGET_NMMU;
+ pseries_rpt_invalidate(pid, tgt, type,
+ H_RPTI_PAGE_ALL, 0, -1UL);
+ } else if (cputlb_use_tlbie())
_tlbie_pid(pid, RIC_FLUSH_ALL);
else
_tlbiel_pid_multicast(mm, pid, RIC_FLUSH_ALL);
@@ -760,7 +777,19 @@ void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
exit_flush_lazy_tlbs(mm);
goto local;
}
- if (cputlb_use_tlbie())
+ if (!mmu_has_feature(MMU_FTR_GTSE)) {
+ unsigned long tgt, pg_sizes, size;
+
+ tgt = H_RPTI_TARGET_CMMU;
+ pg_sizes = psize_to_rpti_pgsize(psize);
+ size = 1UL << mmu_psize_to_shift(psize);
+
+ if (atomic_read(&mm->context.copros) > 0)
+ tgt |= H_RPTI_TARGET_NMMU;
+ pseries_rpt_invalidate(pid, tgt, H_RPTI_TYPE_TLB,
+ pg_sizes, vmaddr,
+ vmaddr + size);
+ } else if (cputlb_use_tlbie())
_tlbie_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
else
_tlbiel_va_multicast(mm, vmaddr, pid, psize, RIC_FLUSH_TLB);
@@ -810,7 +839,14 @@ static inline void _tlbiel_kernel_broadcast(void)
*/
void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end)
{
- if (cputlb_use_tlbie())
+ if (!mmu_has_feature(MMU_FTR_GTSE)) {
+ unsigned long tgt = H_RPTI_TARGET_CMMU | H_RPTI_TARGET_NMMU;
+ unsigned long type = H_RPTI_TYPE_TLB | H_RPTI_TYPE_PWC |
+ H_RPTI_TYPE_PRT;
+
+ pseries_rpt_invalidate(0, tgt, type, H_RPTI_PAGE_ALL,
+ start, end);
+ } else if (cputlb_use_tlbie())
_tlbie_pid(0, RIC_FLUSH_ALL);
else
_tlbiel_kernel_broadcast();
@@ -864,7 +900,17 @@ static inline void __radix__flush_tlb_range(struct mm_struct *mm,
nr_pages > tlb_local_single_page_flush_ceiling);
}
- if (full) {
+ if (!mmu_has_feature(MMU_FTR_GTSE) && !local) {
+ unsigned long tgt = H_RPTI_TARGET_CMMU;
+ unsigned long pg_sizes = psize_to_rpti_pgsize(mmu_virtual_psize);
+
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
+ pg_sizes |= psize_to_rpti_pgsize(MMU_PAGE_2M);
+ if (atomic_read(&mm->context.copros) > 0)
+ tgt |= H_RPTI_TARGET_NMMU;
+ pseries_rpt_invalidate(pid, tgt, H_RPTI_TYPE_TLB, pg_sizes,
+ start, end);
+ } else if (full) {
if (local) {
_tlbiel_pid(pid, RIC_FLUSH_TLB);
} else {
@@ -1046,7 +1092,17 @@ static __always_inline void __radix__flush_tlb_range_psize(struct mm_struct *mm,
nr_pages > tlb_local_single_page_flush_ceiling);
}
- if (full) {
+ if (!mmu_has_feature(MMU_FTR_GTSE) && !local) {
+ unsigned long tgt = H_RPTI_TARGET_CMMU;
+ unsigned long type = H_RPTI_TYPE_TLB;
+ unsigned long pg_sizes = psize_to_rpti_pgsize(psize);
+
+ if (also_pwc)
+ type |= H_RPTI_TYPE_PWC;
+ if (atomic_read(&mm->context.copros) > 0)
+ tgt |= H_RPTI_TARGET_NMMU;
+ pseries_rpt_invalidate(pid, tgt, type, pg_sizes, start, end);
+ } else if (full) {
if (local) {
_tlbiel_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB);
} else {
@@ -1111,7 +1167,19 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
exit_flush_lazy_tlbs(mm);
goto local;
}
- if (cputlb_use_tlbie())
+ if (!mmu_has_feature(MMU_FTR_GTSE)) {
+ unsigned long tgt, type, pg_sizes;
+
+ tgt = H_RPTI_TARGET_CMMU;
+ type = H_RPTI_TYPE_TLB | H_RPTI_TYPE_PWC |
+ H_RPTI_TYPE_PRT;
+ pg_sizes = psize_to_rpti_pgsize(mmu_virtual_psize);
+
+ if (atomic_read(&mm->context.copros) > 0)
+ tgt |= H_RPTI_TARGET_NMMU;
+ pseries_rpt_invalidate(pid, tgt, type, pg_sizes,
+ addr, end);
+ } else if (cputlb_use_tlbie())
_tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
else
_tlbiel_va_range_multicast(mm,
--
2.21.3
^ permalink raw reply related
* [PATCH v3 0/6] Remove default DMA window before creating DDW
From: Leonardo Bras @ 2020-07-03 6:18 UTC (permalink / raw)
To: Michael Ellerman, Benjamin Herrenschmidt, Paul Mackerras,
Alexey Kardashevskiy, Leonardo Bras, Thiago Jung Bauermann,
Ram Pai
Cc: linuxppc-dev, linux-kernel
There are some devices for which a hypervisor may only allow 1 DMA window
to exist at a time, and in those cases, a DDW is never created for them,
since the default DMA window keeps using this resource.
LoPAR recommends this procedure:
1. Remove the default DMA window,
2. Query for which configs the DDW can be created,
3. Create a DDW.
Patch #1:
Create defines for outputs of ibm,ddw-applicable, so it's easier to
identify them.
Patch #2:
- After LoPAR level 2.8, there is an extension that can make
ibm,query-pe-dma-windows have 6 outputs instead of 5. This changes the
order of the outputs, and that can cause some trouble.
- query_ddw() was updated to check how many outputs the
ibm,query-pe-dma-windows is supposed to have, update the rtas_call() and
deal correctly with the outputs in both cases.
- This patch looks somewhat unrelated to the series, but it can avoid future
problems on DDW creation.
Patch #3 moves the window-removing code from remove_ddw() to
remove_dma_window(), creating a way to delete any DMA window, so it can be
used to delete the default DMA window.
Patch #4 makes use of the remove_dma_window() from patch #3 to remove the
default DMA window before query_ddw(). It also implements a new rtas call
to recover the default DMA window, in case anything fails after it was
removed, and a DDW couldn't be created.
Patch #5:
Instead of destroying the created DDW if it doesn't map the whole
partition, make use of it instead of the default DMA window as it improves
performance.
Patch #6:
Does some renaming of 'direct window' to 'dma window', given the DDW
created can now be also used in indirect mapping if direct mapping is not
available.
All patches were tested in an LPAR with an Ethernet VF:
4005:01:00.0 Ethernet controller: Mellanox Technologies MT27700 Family
[ConnectX-4 Virtual Function]
Patch #5 was tested with a 64GB DDW which did not map the whole
partition (128G). Performance improvement noticed by using the DDW instead
of the default DMA window:
64 thread write throughput: +203.0%
64 thread read throughput: +17.5%
1 thread write throughput: +20.5%
1 thread read throughput: +3.43%
Average write latency: -23.0%
Average read latency: -2.26%
---
Changes since v2:
- Change the way ibm,ddw-extensions is accessed, using a proper function
instead of doing this inline every time it's used.
- Remove previous patch #6, as it doesn't look like it would be useful.
- Add new patch, for changing names from direct* to dma*, as indirect
mapping can be used from now on.
- Fix some typos, correct some define usage.
- v2 link: http://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=185433&state=%2A&archive=both
Changes since v1:
- Add defines for ibm,ddw-applicable and ibm,ddw-extensions outputs
- Merge aux function query_ddw_out_sz() into query_ddw()
- Merge reset_dma_window() patch (prev. #2) into remove default DMA
window patch (#4).
- Keep device_node *np name instead of using pdn in remove_*()
- Rename 'device_node *pdn' into 'parent' in new functions
- Rename dfl_win to default_win
- Only remove the default DMA window if there is no window available
in first query.
- Check if default DMA window can be restored before removing it.
- Fix 'uninitialized use' (found by travis mpe:ci-test)
- New patches #5 and #6
- v1 link: http://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=184420&state=%2A&archive=both
Special thanks to Alexey Kardashevskiy and Oliver O'Halloran for
the feedback provided!
Leonardo Bras (6):
powerpc/pseries/iommu: Create defines for operations in
ibm,ddw-applicable
powerpc/pseries/iommu: Update call to ibm,query-pe-dma-windows
powerpc/pseries/iommu: Move window-removing part of remove_ddw into
remove_dma_window
powerpc/pseries/iommu: Remove default DMA window before creating DDW
powerpc/pseries/iommu: Make use of DDW even if it does not map the
partition
powerpc/pseries/iommu: Rename "direct window" to "dma window"
arch/powerpc/platforms/pseries/iommu.c | 379 ++++++++++++++++++-------
1 file changed, 269 insertions(+), 110 deletions(-)
--
2.25.4
^ permalink raw reply
* [PATCH v3 1/6] powerpc/pseries/iommu: Create defines for operations in ibm, ddw-applicable
From: Leonardo Bras @ 2020-07-03 6:18 UTC (permalink / raw)
To: Michael Ellerman, Benjamin Herrenschmidt, Paul Mackerras,
Alexey Kardashevskiy, Leonardo Bras, Thiago Jung Bauermann,
Ram Pai
Cc: linuxppc-dev, linux-kernel
In-Reply-To: <20200703061844.111865-1-leobras.c@gmail.com>
Create defines to help handling ibm,ddw-applicable values, avoiding
confusion about the index of given operations.
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
---
arch/powerpc/platforms/pseries/iommu.c | 43 ++++++++++++++++----------
1 file changed, 26 insertions(+), 17 deletions(-)
diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index 6d47b4a3ce39..ac0d6376bdad 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -39,6 +39,14 @@
#include "pseries.h"
+enum {
+ DDW_QUERY_PE_DMA_WIN = 0,
+ DDW_CREATE_PE_DMA_WIN = 1,
+ DDW_REMOVE_PE_DMA_WIN = 2,
+
+ DDW_APPLICABLE_SIZE
+};
+
static struct iommu_table_group *iommu_pseries_alloc_group(int node)
{
struct iommu_table_group *table_group;
@@ -771,12 +779,12 @@ static void remove_ddw(struct device_node *np, bool remove_prop)
{
struct dynamic_dma_window_prop *dwp;
struct property *win64;
- u32 ddw_avail[3];
+ u32 ddw_avail[DDW_APPLICABLE_SIZE];
u64 liobn;
int ret = 0;
ret = of_property_read_u32_array(np, "ibm,ddw-applicable",
- &ddw_avail[0], 3);
+ &ddw_avail[0], DDW_APPLICABLE_SIZE);
win64 = of_find_property(np, DIRECT64_PROPNAME, NULL);
if (!win64)
@@ -798,15 +806,15 @@ static void remove_ddw(struct device_node *np, bool remove_prop)
pr_debug("%pOF successfully cleared tces in window.\n",
np);
- ret = rtas_call(ddw_avail[2], 1, 1, NULL, liobn);
+ ret = rtas_call(ddw_avail[DDW_REMOVE_PE_DMA_WIN], 1, 1, NULL, liobn);
if (ret)
pr_warn("%pOF: failed to remove direct window: rtas returned "
"%d to ibm,remove-pe-dma-window(%x) %llx\n",
- np, ret, ddw_avail[2], liobn);
+ np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn);
else
pr_debug("%pOF: successfully removed direct window: rtas returned "
"%d to ibm,remove-pe-dma-window(%x) %llx\n",
- np, ret, ddw_avail[2], liobn);
+ np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn);
delprop:
if (remove_prop)
@@ -889,11 +897,11 @@ static int query_ddw(struct pci_dev *dev, const u32 *ddw_avail,
buid = pdn->phb->buid;
cfg_addr = ((pdn->busno << 16) | (pdn->devfn << 8));
- ret = rtas_call(ddw_avail[0], 3, 5, (u32 *)query,
- cfg_addr, BUID_HI(buid), BUID_LO(buid));
+ ret = rtas_call(ddw_avail[DDW_QUERY_PE_DMA_WIN], 3, 5, (u32 *)query,
+ cfg_addr, BUID_HI(buid), BUID_LO(buid));
dev_info(&dev->dev, "ibm,query-pe-dma-windows(%x) %x %x %x"
- " returned %d\n", ddw_avail[0], cfg_addr, BUID_HI(buid),
- BUID_LO(buid), ret);
+ " returned %d\n", ddw_avail[DDW_QUERY_PE_DMA_WIN], cfg_addr,
+ BUID_HI(buid), BUID_LO(buid), ret);
return ret;
}
@@ -920,15 +928,16 @@ static int create_ddw(struct pci_dev *dev, const u32 *ddw_avail,
do {
/* extra outputs are LIOBN and dma-addr (hi, lo) */
- ret = rtas_call(ddw_avail[1], 5, 4, (u32 *)create,
- cfg_addr, BUID_HI(buid), BUID_LO(buid),
- page_shift, window_shift);
+ ret = rtas_call(ddw_avail[DDW_CREATE_PE_DMA_WIN], 5, 4,
+ (u32 *)create, cfg_addr, BUID_HI(buid),
+ BUID_LO(buid), page_shift, window_shift);
} while (rtas_busy_delay(ret));
dev_info(&dev->dev,
"ibm,create-pe-dma-window(%x) %x %x %x %x %x returned %d "
- "(liobn = 0x%x starting addr = %x %x)\n", ddw_avail[1],
- cfg_addr, BUID_HI(buid), BUID_LO(buid), page_shift,
- window_shift, ret, create->liobn, create->addr_hi, create->addr_lo);
+ "(liobn = 0x%x starting addr = %x %x)\n",
+ ddw_avail[DDW_CREATE_PE_DMA_WIN], cfg_addr, BUID_HI(buid),
+ BUID_LO(buid), page_shift, window_shift, ret, create->liobn,
+ create->addr_hi, create->addr_lo);
return ret;
}
@@ -996,7 +1005,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
int page_shift;
u64 dma_addr, max_addr;
struct device_node *dn;
- u32 ddw_avail[3];
+ u32 ddw_avail[DDW_APPLICABLE_SIZE];
struct direct_window *window;
struct property *win64;
struct dynamic_dma_window_prop *ddwprop;
@@ -1029,7 +1038,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
* the property is actually in the parent, not the PE
*/
ret = of_property_read_u32_array(pdn, "ibm,ddw-applicable",
- &ddw_avail[0], 3);
+ &ddw_avail[0], DDW_APPLICABLE_SIZE);
if (ret)
goto out_failed;
--
2.25.4
^ permalink raw reply related
* [PATCH v3 2/6] powerpc/pseries/iommu: Update call to ibm, query-pe-dma-windows
From: Leonardo Bras @ 2020-07-03 6:18 UTC (permalink / raw)
To: Michael Ellerman, Benjamin Herrenschmidt, Paul Mackerras,
Alexey Kardashevskiy, Leonardo Bras, Thiago Jung Bauermann,
Ram Pai
Cc: linuxppc-dev, linux-kernel
In-Reply-To: <20200703061844.111865-1-leobras.c@gmail.com>
From LoPAR level 2.8, "ibm,ddw-extensions" index 3 can make the number of
outputs from "ibm,query-pe-dma-windows" go from 5 to 6.
This change of output size is meant to expand the address size of
largest_available_block PE TCE from 32-bit to 64-bit, which ends up
shifting page_size and migration_capable.
This ends up requiring the update of
ddw_query_response->largest_available_block from u32 to u64, and manually
assigning the values from the buffer into this struct, according to
output size.
Also, a routine was created to help read the ddw extensions as
suggested by LoPAR: first reading the size of the extension array from
index 0, checking if the property exists, and then returning its value.
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
---
arch/powerpc/platforms/pseries/iommu.c | 91 +++++++++++++++++++++++---
1 file changed, 81 insertions(+), 10 deletions(-)
diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index ac0d6376bdad..1a933c4e8bba 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -47,6 +47,12 @@ enum {
DDW_APPLICABLE_SIZE
};
+enum {
+ DDW_EXT_SIZE = 0,
+ DDW_EXT_RESET_DMA_WIN = 1,
+ DDW_EXT_QUERY_OUT_SIZE = 2
+};
+
static struct iommu_table_group *iommu_pseries_alloc_group(int node)
{
struct iommu_table_group *table_group;
@@ -342,7 +348,7 @@ struct direct_window {
/* Dynamic DMA Window support */
struct ddw_query_response {
u32 windows_available;
- u32 largest_available_block;
+ u64 largest_available_block;
u32 page_size;
u32 migration_capable;
};
@@ -877,14 +883,62 @@ static int find_existing_ddw_windows(void)
}
machine_arch_initcall(pseries, find_existing_ddw_windows);
+/**
+ * ddw_read_ext - Get the value of a DDW extension
+ * @np: device node from which the extension value is to be read.
+ * @extnum: index number of the extension.
+ * @value: pointer to return value, modified when extension is available.
+ *
+ * Checks if "ibm,ddw-extensions" exists for this node, and get the value
+ * on index 'extnum'.
+ * It can also be used just to check if an extension exists, passing value == NULL.
+ *
+ * Returns:
+ * 0 if extension successfully read
+ * -EINVAL if the "ibm,ddw-extensions" does not exist,
+ * -ENODATA if "ibm,ddw-extensions" does not have a value, and
+ * -EOVERFLOW if "ibm,ddw-extensions" does not contain this extension.
+ */
+static inline int ddw_read_ext(const struct device_node *np, int extnum,
+ u32 *value)
+{
+ static const char propname[] = "ibm,ddw-extensions";
+ u32 count;
+ int ret;
+
+ ret = of_property_read_u32_index(np, propname, DDW_EXT_SIZE, &count);
+ if (ret)
+ return ret;
+
+ if (count < extnum)
+ return -EOVERFLOW;
+
+ if (!value)
+ value = &count;
+
+ return of_property_read_u32_index(np, propname, extnum, value);
+}
+
static int query_ddw(struct pci_dev *dev, const u32 *ddw_avail,
- struct ddw_query_response *query)
+ struct ddw_query_response *query,
+ struct device_node *parent)
{
struct device_node *dn;
struct pci_dn *pdn;
- u32 cfg_addr;
+ u32 cfg_addr, ext_query, query_out[5];
u64 buid;
- int ret;
+ int ret, out_sz;
+
+ /*
+ * From LoPAR level 2.8, "ibm,ddw-extensions" index 3 can rule how many
+ * output parameters ibm,query-pe-dma-windows will have, ranging from
+ * 5 to 6.
+ */
+ ret = ddw_read_ext(parent, DDW_EXT_QUERY_OUT_SIZE, &ext_query);
+ if (!ret && ext_query == 1)
+ out_sz = 6;
+ else
+ out_sz = 5;
/*
* Get the config address and phb buid of the PE window.
@@ -897,11 +951,28 @@ static int query_ddw(struct pci_dev *dev, const u32 *ddw_avail,
buid = pdn->phb->buid;
cfg_addr = ((pdn->busno << 16) | (pdn->devfn << 8));
- ret = rtas_call(ddw_avail[DDW_QUERY_PE_DMA_WIN], 3, 5, (u32 *)query,
+ ret = rtas_call(ddw_avail[DDW_QUERY_PE_DMA_WIN], 3, out_sz, query_out,
cfg_addr, BUID_HI(buid), BUID_LO(buid));
- dev_info(&dev->dev, "ibm,query-pe-dma-windows(%x) %x %x %x"
- " returned %d\n", ddw_avail[DDW_QUERY_PE_DMA_WIN], cfg_addr,
- BUID_HI(buid), BUID_LO(buid), ret);
+ dev_info(&dev->dev, "ibm,query-pe-dma-windows(%x) %x %x %x returned %d\n",
+ ddw_avail[DDW_QUERY_PE_DMA_WIN], cfg_addr, BUID_HI(buid),
+ BUID_LO(buid), ret);
+
+ switch (out_sz) {
+ case 5:
+ query->windows_available = query_out[0];
+ query->largest_available_block = query_out[1];
+ query->page_size = query_out[2];
+ query->migration_capable = query_out[3];
+ break;
+ case 6:
+ query->windows_available = query_out[0];
+ query->largest_available_block = ((u64)query_out[1] << 32) |
+ query_out[2];
+ query->page_size = query_out[3];
+ query->migration_capable = query_out[4];
+ break;
+ }
+
return ret;
}
@@ -1049,7 +1120,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
* of page sizes: supported and supported for migrate-dma.
*/
dn = pci_device_to_OF_node(dev);
- ret = query_ddw(dev, ddw_avail, &query);
+ ret = query_ddw(dev, ddw_avail, &query, pdn);
if (ret != 0)
goto out_failed;
@@ -1077,7 +1148,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
/* check largest block * page size > max memory hotplug addr */
max_addr = ddw_memory_hotplug_max();
if (query.largest_available_block < (max_addr >> page_shift)) {
- dev_dbg(&dev->dev, "can't map partition max 0x%llx with %u "
+ dev_dbg(&dev->dev, "can't map partition max 0x%llx with %llu "
"%llu-sized pages\n", max_addr, query.largest_available_block,
1ULL << page_shift);
goto out_failed;
--
2.25.4
^ permalink raw reply related
* [PATCH v3 3/6] powerpc/pseries/iommu: Move window-removing part of remove_ddw into remove_dma_window
From: Leonardo Bras @ 2020-07-03 6:18 UTC (permalink / raw)
To: Michael Ellerman, Benjamin Herrenschmidt, Paul Mackerras,
Alexey Kardashevskiy, Leonardo Bras, Thiago Jung Bauermann,
Ram Pai
Cc: linuxppc-dev, linux-kernel
In-Reply-To: <20200703061844.111865-1-leobras.c@gmail.com>
Move the window-removing part of remove_ddw into a new function
(remove_dma_window), so it can be used to remove other DMA windows.
It's useful for removing DMA windows that don't create DIRECT64_PROPNAME
property, like the default DMA window from the device, which uses
"ibm,dma-window".
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
---
arch/powerpc/platforms/pseries/iommu.c | 45 +++++++++++++++-----------
1 file changed, 27 insertions(+), 18 deletions(-)
diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index 1a933c4e8bba..4e33147825cc 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -781,25 +781,14 @@ static int __init disable_ddw_setup(char *str)
early_param("disable_ddw", disable_ddw_setup);
-static void remove_ddw(struct device_node *np, bool remove_prop)
+static void remove_dma_window(struct device_node *np, u32 *ddw_avail,
+ struct property *win)
{
struct dynamic_dma_window_prop *dwp;
- struct property *win64;
- u32 ddw_avail[DDW_APPLICABLE_SIZE];
u64 liobn;
- int ret = 0;
-
- ret = of_property_read_u32_array(np, "ibm,ddw-applicable",
- &ddw_avail[0], DDW_APPLICABLE_SIZE);
-
- win64 = of_find_property(np, DIRECT64_PROPNAME, NULL);
- if (!win64)
- return;
-
- if (ret || win64->length < sizeof(*dwp))
- goto delprop;
+ int ret;
- dwp = win64->value;
+ dwp = win->value;
liobn = (u64)be32_to_cpu(dwp->liobn);
/* clear the whole window, note the arg is in kernel pages */
@@ -821,10 +810,30 @@ static void remove_ddw(struct device_node *np, bool remove_prop)
pr_debug("%pOF: successfully removed direct window: rtas returned "
"%d to ibm,remove-pe-dma-window(%x) %llx\n",
np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn);
+}
+
+static void remove_ddw(struct device_node *np, bool remove_prop)
+{
+ struct property *win;
+ u32 ddw_avail[DDW_APPLICABLE_SIZE];
+ int ret = 0;
+
+ ret = of_property_read_u32_array(np, "ibm,ddw-applicable",
+ &ddw_avail[0], DDW_APPLICABLE_SIZE);
+ if (ret)
+ return;
+
+ win = of_find_property(np, DIRECT64_PROPNAME, NULL);
+ if (!win)
+ return;
+
+ if (win->length >= sizeof(struct dynamic_dma_window_prop))
+ remove_dma_window(np, ddw_avail, win);
+
+ if (!remove_prop)
+ return;
-delprop:
- if (remove_prop)
- ret = of_remove_property(np, win64);
+ ret = of_remove_property(np, win);
if (ret)
pr_warn("%pOF: failed to remove direct window property: %d\n",
np, ret);
--
2.25.4
^ permalink raw reply related
* [PATCH v3 4/6] powerpc/pseries/iommu: Remove default DMA window before creating DDW
From: Leonardo Bras @ 2020-07-03 6:18 UTC (permalink / raw)
To: Michael Ellerman, Benjamin Herrenschmidt, Paul Mackerras,
Alexey Kardashevskiy, Leonardo Bras, Thiago Jung Bauermann,
Ram Pai
Cc: linuxppc-dev, linux-kernel
In-Reply-To: <20200703061844.111865-1-leobras.c@gmail.com>
On LoPAR "DMA Window Manipulation Calls", it's recommended to remove the
default DMA window for the device, before attempting to configure a DDW,
in order to make the maximum resources available for the next DDW to be
created.
This is a requirement for using DDW on devices in which hypervisor
allows only one DMA window.
If setting up a new DDW fails anywhere after the removal of this
default DMA window, the default DMA window needs to be restored.
For this, an implementation of ibm,reset-pe-dma-windows rtas call is
needed:
Platforms supporting the DDW option starting with LoPAR level 2.7 implement
ibm,ddw-extensions. The first extension available (index 2) carries the
token for ibm,reset-pe-dma-windows rtas call, which is used to restore
the default DMA window for a device, if it has been deleted.
It does so by resetting the TCE table allocation for the PE to its
boot time value, available in the "ibm,dma-window" device tree property.
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
---
arch/powerpc/platforms/pseries/iommu.c | 83 +++++++++++++++++++++-----
1 file changed, 69 insertions(+), 14 deletions(-)
diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index 4e33147825cc..5b520ac354c6 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -1066,6 +1066,38 @@ static phys_addr_t ddw_memory_hotplug_max(void)
return max_addr;
}
+/*
+ * Platforms supporting the DDW option starting with LoPAR level 2.7 implement
+ * ibm,ddw-extensions, which carries the rtas token for
+ * ibm,reset-pe-dma-windows.
+ * That rtas-call can be used to restore the default DMA window for the device.
+ */
+static void reset_dma_window(struct pci_dev *dev, struct device_node *par_dn)
+{
+ int ret;
+ u32 cfg_addr, reset_dma_win;
+ u64 buid;
+ struct device_node *dn;
+ struct pci_dn *pdn;
+
+ ret = ddw_read_ext(par_dn, DDW_EXT_RESET_DMA_WIN, &reset_dma_win);
+ if (ret)
+ return;
+
+ dn = pci_device_to_OF_node(dev);
+ pdn = PCI_DN(dn);
+ buid = pdn->phb->buid;
+ cfg_addr = ((pdn->busno << 16) | (pdn->devfn << 8));
+
+ ret = rtas_call(reset_dma_win, 3, 1, NULL, cfg_addr, BUID_HI(buid),
+ BUID_LO(buid));
+ if (ret)
+ dev_info(&dev->dev,
+ "ibm,reset-pe-dma-windows(%x) %x %x %x returned %d ",
+ reset_dma_win, cfg_addr, BUID_HI(buid), BUID_LO(buid),
+ ret);
+}
+
/*
* If the PE supports dynamic dma windows, and there is space for a table
* that can map all pages in a linear offset, then setup such a table,
@@ -1079,7 +1111,7 @@ static phys_addr_t ddw_memory_hotplug_max(void)
*/
static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
{
- int len, ret;
+ int len, ret, reset_win_ext;
struct ddw_query_response query;
struct ddw_create_response create;
int page_shift;
@@ -1087,7 +1119,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
struct device_node *dn;
u32 ddw_avail[DDW_APPLICABLE_SIZE];
struct direct_window *window;
- struct property *win64;
+ struct property *win64, *default_win = NULL;
struct dynamic_dma_window_prop *ddwprop;
struct failed_ddw_pdn *fpdn;
@@ -1122,7 +1154,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
if (ret)
goto out_failed;
- /*
+ /*
* Query if there is a second window of size to map the
* whole partition. Query returns number of windows, largest
* block assigned to PE (partition endpoint), and two bitmasks
@@ -1133,14 +1165,34 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
if (ret != 0)
goto out_failed;
+ /*
+ * If there is no window available, remove the default DMA window,
+ * if it's present. This will make all the resources available to the
+ * new DDW window.
+ * If anything fails after this, we need to restore it, so also check
+ * for extensions presence.
+ */
if (query.windows_available == 0) {
- /*
- * no additional windows are available for this device.
- * We might be able to reallocate the existing window,
- * trading in for a larger page size.
- */
- dev_dbg(&dev->dev, "no free dynamic windows");
- goto out_failed;
+ default_win = of_find_property(pdn, "ibm,dma-window", NULL);
+ if (!default_win)
+ goto out_failed;
+
+ reset_win_ext = ddw_read_ext(pdn, DDW_EXT_RESET_DMA_WIN, NULL);
+ if (reset_win_ext)
+ goto out_failed;
+
+ remove_dma_window(pdn, ddw_avail, default_win);
+
+ /* Query again, to check if the window is available */
+ ret = query_ddw(dev, ddw_avail, &query, pdn);
+ if (ret != 0)
+ goto out_restore_defwin;
+
+ if (query.windows_available == 0) {
+ /* no windows are available for this device. */
+ dev_dbg(&dev->dev, "no free dynamic windows");
+ goto out_restore_defwin;
+ }
}
if (query.page_size & 4) {
page_shift = 24; /* 16MB */
@@ -1151,7 +1203,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
} else {
dev_dbg(&dev->dev, "no supported direct page size in mask %x",
query.page_size);
- goto out_failed;
+ goto out_restore_defwin;
}
/* verify the window * number of ptes will map the partition */
/* check largest block * page size > max memory hotplug addr */
@@ -1160,14 +1212,14 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
dev_dbg(&dev->dev, "can't map partition max 0x%llx with %llu "
"%llu-sized pages\n", max_addr, query.largest_available_block,
1ULL << page_shift);
- goto out_failed;
+ goto out_restore_defwin;
}
len = order_base_2(max_addr);
win64 = kzalloc(sizeof(struct property), GFP_KERNEL);
if (!win64) {
dev_info(&dev->dev,
"couldn't allocate property for 64bit dma window\n");
- goto out_failed;
+ goto out_restore_defwin;
}
win64->name = kstrdup(DIRECT64_PROPNAME, GFP_KERNEL);
win64->value = ddwprop = kmalloc(sizeof(*ddwprop), GFP_KERNEL);
@@ -1230,8 +1282,11 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
kfree(win64->value);
kfree(win64);
-out_failed:
+out_restore_defwin:
+ if (default_win && reset_win_ext == 0)
+ reset_dma_window(dev, pdn);
+out_failed:
fpdn = kzalloc(sizeof(*fpdn), GFP_KERNEL);
if (!fpdn)
goto out_unlock;
--
2.25.4
^ permalink raw reply related
* [PATCH v3 5/6] powerpc/pseries/iommu: Make use of DDW even if it does not map the partition
From: Leonardo Bras @ 2020-07-03 6:18 UTC (permalink / raw)
To: Michael Ellerman, Benjamin Herrenschmidt, Paul Mackerras,
Alexey Kardashevskiy, Leonardo Bras, Thiago Jung Bauermann,
Ram Pai
Cc: linuxppc-dev, linux-kernel
In-Reply-To: <20200703061844.111865-1-leobras.c@gmail.com>
As of today, if the biggest DDW that can be created can't map the whole
partition, its creation is skipped and the default DMA window
"ibm,dma-window" is used instead.
Usually this DDW is bigger than the default DMA window, and it performs
better, so it would be nice to use it instead.
The ddw created will be used for direct mapping by default.
If it's not available, indirect mapping will be used instead.
As both mappings will never exist at the same time, the same property
name can be used for the created DDW.
So renaming
define DIRECT64_PROPNAME "linux,direct64-ddr-window-info"
to
define DMA64_PROPNAME "linux,dma64-ddr-window-info"
looks like the right thing to do.
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
---
arch/powerpc/platforms/pseries/iommu.c | 38 ++++++++++++++++----------
1 file changed, 24 insertions(+), 14 deletions(-)
diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index 5b520ac354c6..c652177de09c 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -364,7 +364,7 @@ static LIST_HEAD(direct_window_list);
static DEFINE_SPINLOCK(direct_window_list_lock);
/* protects initializing window twice for same device */
static DEFINE_MUTEX(direct_window_init_mutex);
-#define DIRECT64_PROPNAME "linux,direct64-ddr-window-info"
+#define DMA64_PROPNAME "linux,dma64-ddr-window-info"
static int tce_clearrange_multi_pSeriesLP(unsigned long start_pfn,
unsigned long num_pfn, const void *arg)
@@ -690,7 +690,7 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
struct iommu_table *tbl;
struct device_node *dn, *pdn;
struct pci_dn *ppci;
- const __be32 *dma_window = NULL;
+ const __be32 *dma_window = NULL, *alt_dma_window = NULL;
dn = pci_bus_to_OF_node(bus);
@@ -704,8 +704,13 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
break;
}
+ /* If there is a DDW available, use it instead */
+ alt_dma_window = of_get_property(pdn, DMA64_PROPNAME, NULL);
+ if (alt_dma_window)
+ dma_window = alt_dma_window;
+
if (dma_window == NULL) {
- pr_debug(" no ibm,dma-window property !\n");
+ pr_debug(" no ibm,dma-window nor linux,dma64-ddr-window-info property !\n");
return;
}
@@ -823,7 +828,7 @@ static void remove_ddw(struct device_node *np, bool remove_prop)
if (ret)
return;
- win = of_find_property(np, DIRECT64_PROPNAME, NULL);
+ win = of_find_property(np, DMA64_PROPNAME, NULL);
if (!win)
return;
@@ -869,8 +874,8 @@ static int find_existing_ddw_windows(void)
if (!firmware_has_feature(FW_FEATURE_LPAR))
return 0;
- for_each_node_with_property(pdn, DIRECT64_PROPNAME) {
- direct64 = of_get_property(pdn, DIRECT64_PROPNAME, &len);
+ for_each_node_with_property(pdn, DMA64_PROPNAME) {
+ direct64 = of_get_property(pdn, DMA64_PROPNAME, &len);
if (!direct64)
continue;
@@ -1205,23 +1210,26 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
query.page_size);
goto out_restore_defwin;
}
+
/* verify the window * number of ptes will map the partition */
- /* check largest block * page size > max memory hotplug addr */
max_addr = ddw_memory_hotplug_max();
if (query.largest_available_block < (max_addr >> page_shift)) {
- dev_dbg(&dev->dev, "can't map partition max 0x%llx with %llu "
- "%llu-sized pages\n", max_addr, query.largest_available_block,
- 1ULL << page_shift);
- goto out_restore_defwin;
+ dev_dbg(&dev->dev, "can't map partition max 0x%llx with %llu %llu-sized pages\n",
+ max_addr, query.largest_available_block,
+ 1ULL << page_shift);
+
+ len = order_base_2(query.largest_available_block << page_shift);
+ } else {
+ len = order_base_2(max_addr);
}
- len = order_base_2(max_addr);
+
win64 = kzalloc(sizeof(struct property), GFP_KERNEL);
if (!win64) {
dev_info(&dev->dev,
"couldn't allocate property for 64bit dma window\n");
goto out_restore_defwin;
}
- win64->name = kstrdup(DIRECT64_PROPNAME, GFP_KERNEL);
+ win64->name = kstrdup(DMA64_PROPNAME, GFP_KERNEL);
win64->value = ddwprop = kmalloc(sizeof(*ddwprop), GFP_KERNEL);
win64->length = sizeof(*ddwprop);
if (!win64->name || !win64->value) {
@@ -1268,7 +1276,9 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
list_add(&window->list, &direct_window_list);
spin_unlock(&direct_window_list_lock);
- dma_addr = be64_to_cpu(ddwprop->dma_base);
+ /* Only returns the dma_addr if DDW maps the whole partition */
+ if (len == order_base_2(max_addr))
+ dma_addr = be64_to_cpu(ddwprop->dma_base);
goto out_unlock;
out_free_window:
--
2.25.4
^ permalink raw reply related
* [PATCH v3 6/6] powerpc/pseries/iommu: Rename "direct window" to "dma window"
From: Leonardo Bras @ 2020-07-03 6:18 UTC (permalink / raw)
To: Michael Ellerman, Benjamin Herrenschmidt, Paul Mackerras,
Alexey Kardashevskiy, Leonardo Bras, Thiago Jung Bauermann,
Ram Pai
Cc: linuxppc-dev, linux-kernel
In-Reply-To: <20200703061844.111865-1-leobras.c@gmail.com>
A previous change introduced the usage of DDW as a bigger indirect DMA
mapping when the DDW available size does not map the whole partition.
As most of the code that manipulates direct mappings was reused for
indirect mappings, it's necessary to rename all names and debug/info
messages to reflect that it can be used for both kinds of mapping.
Also, define DEFAULT_DMA_WIN as "ibm,dma-window" to document that
it's the name of the default DMA window.
Those changes are not supposed to change how the code works in any
way, just adjust naming.
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
---
arch/powerpc/platforms/pseries/iommu.c | 101 +++++++++++++------------
1 file changed, 53 insertions(+), 48 deletions(-)
diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index c652177de09c..070b80efc43a 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -339,7 +339,7 @@ struct dynamic_dma_window_prop {
__be32 window_shift; /* ilog2(tce_window_size) */
};
-struct direct_window {
+struct dma_win {
struct device_node *device;
const struct dynamic_dma_window_prop *prop;
struct list_head list;
@@ -359,12 +359,13 @@ struct ddw_create_response {
u32 addr_lo;
};
-static LIST_HEAD(direct_window_list);
+static LIST_HEAD(dma_win_list);
/* prevents races between memory on/offline and window creation */
-static DEFINE_SPINLOCK(direct_window_list_lock);
+static DEFINE_SPINLOCK(dma_win_list_lock);
/* protects initializing window twice for same device */
-static DEFINE_MUTEX(direct_window_init_mutex);
+static DEFINE_MUTEX(dma_win_init_mutex);
#define DMA64_PROPNAME "linux,dma64-ddr-window-info"
+#define DEFAULT_DMA_WIN "ibm,dma-window"
static int tce_clearrange_multi_pSeriesLP(unsigned long start_pfn,
unsigned long num_pfn, const void *arg)
@@ -697,9 +698,12 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
pr_debug("pci_dma_bus_setup_pSeriesLP: setting up bus %pOF\n",
dn);
- /* Find nearest ibm,dma-window, walking up the device tree */
+ /*
+ * Find nearest ibm,dma-window (default DMA window), walking up the
+ * device tree
+ */
for (pdn = dn; pdn != NULL; pdn = pdn->parent) {
- dma_window = of_get_property(pdn, "ibm,dma-window", NULL);
+ dma_window = of_get_property(pdn, DEFAULT_DMA_WIN, NULL);
if (dma_window != NULL)
break;
}
@@ -710,7 +714,8 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
dma_window = alt_dma_window;
if (dma_window == NULL) {
- pr_debug(" no ibm,dma-window nor linux,dma64-ddr-window-info property !\n");
+ pr_debug(" no %s nor %s property !\n",
+ DEFAULT_DMA_WIN, DMA64_PROPNAME);
return;
}
@@ -808,11 +813,11 @@ static void remove_dma_window(struct device_node *np, u32 *ddw_avail,
ret = rtas_call(ddw_avail[DDW_REMOVE_PE_DMA_WIN], 1, 1, NULL, liobn);
if (ret)
- pr_warn("%pOF: failed to remove direct window: rtas returned "
+ pr_warn("%pOF: failed to remove dma window: rtas returned "
"%d to ibm,remove-pe-dma-window(%x) %llx\n",
np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn);
else
- pr_debug("%pOF: successfully removed direct window: rtas returned "
+ pr_debug("%pOF: successfully removed dma window: rtas returned "
"%d to ibm,remove-pe-dma-window(%x) %llx\n",
np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn);
}
@@ -840,26 +845,26 @@ static void remove_ddw(struct device_node *np, bool remove_prop)
ret = of_remove_property(np, win);
if (ret)
- pr_warn("%pOF: failed to remove direct window property: %d\n",
+ pr_warn("%pOF: failed to remove dma window property: %d\n",
np, ret);
}
static u64 find_existing_ddw(struct device_node *pdn)
{
- struct direct_window *window;
- const struct dynamic_dma_window_prop *direct64;
+ struct dma_win *window;
+ const struct dynamic_dma_window_prop *dma64;
u64 dma_addr = 0;
- spin_lock(&direct_window_list_lock);
+ spin_lock(&dma_win_list_lock);
/* check if we already created a window and dupe that config if so */
- list_for_each_entry(window, &direct_window_list, list) {
+ list_for_each_entry(window, &dma_win_list, list) {
if (window->device == pdn) {
- direct64 = window->prop;
- dma_addr = be64_to_cpu(direct64->dma_base);
+ dma64 = window->prop;
+ dma_addr = be64_to_cpu(dma64->dma_base);
break;
}
}
- spin_unlock(&direct_window_list_lock);
+ spin_unlock(&dma_win_list_lock);
return dma_addr;
}
@@ -868,15 +873,15 @@ static int find_existing_ddw_windows(void)
{
int len;
struct device_node *pdn;
- struct direct_window *window;
- const struct dynamic_dma_window_prop *direct64;
+ struct dma_win *window;
+ const struct dynamic_dma_window_prop *dma64;
if (!firmware_has_feature(FW_FEATURE_LPAR))
return 0;
for_each_node_with_property(pdn, DMA64_PROPNAME) {
- direct64 = of_get_property(pdn, DMA64_PROPNAME, &len);
- if (!direct64)
+ dma64 = of_get_property(pdn, DMA64_PROPNAME, &len);
+ if (!dma64)
continue;
window = kzalloc(sizeof(*window), GFP_KERNEL);
@@ -887,10 +892,10 @@ static int find_existing_ddw_windows(void)
}
window->device = pdn;
- window->prop = direct64;
- spin_lock(&direct_window_list_lock);
- list_add(&window->list, &direct_window_list);
- spin_unlock(&direct_window_list_lock);
+ window->prop = dma64;
+ spin_lock(&dma_win_list_lock);
+ list_add(&window->list, &dma_win_list);
+ spin_unlock(&dma_win_list_lock);
}
return 0;
@@ -1123,12 +1128,12 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
u64 dma_addr, max_addr;
struct device_node *dn;
u32 ddw_avail[DDW_APPLICABLE_SIZE];
- struct direct_window *window;
+ struct dma_win *window;
struct property *win64, *default_win = NULL;
struct dynamic_dma_window_prop *ddwprop;
struct failed_ddw_pdn *fpdn;
- mutex_lock(&direct_window_init_mutex);
+ mutex_lock(&dma_win_init_mutex);
dma_addr = find_existing_ddw(pdn);
if (dma_addr != 0)
@@ -1178,7 +1183,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
* for extensions presence.
*/
if (query.windows_available == 0) {
- default_win = of_find_property(pdn, "ibm,dma-window", NULL);
+ default_win = of_find_property(pdn, DEFAULT_DMA_WIN, NULL);
if (!default_win)
goto out_failed;
@@ -1206,8 +1211,8 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
} else if (query.page_size & 1) {
page_shift = 12; /* 4kB */
} else {
- dev_dbg(&dev->dev, "no supported direct page size in mask %x",
- query.page_size);
+ dev_dbg(&dev->dev, "no supported page size in mask %x",
+ query.page_size);
goto out_restore_defwin;
}
@@ -1258,7 +1263,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
ret = walk_system_ram_range(0, memblock_end_of_DRAM() >> PAGE_SHIFT,
win64->value, tce_setrange_multi_pSeriesLP_walk);
if (ret) {
- dev_info(&dev->dev, "failed to map direct window for %pOF: %d\n",
+ dev_info(&dev->dev, "failed to map DMA window for %pOF: %d\n",
dn, ret);
goto out_free_window;
}
@@ -1272,9 +1277,9 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
window->device = pdn;
window->prop = ddwprop;
- spin_lock(&direct_window_list_lock);
- list_add(&window->list, &direct_window_list);
- spin_unlock(&direct_window_list_lock);
+ spin_lock(&dma_win_list_lock);
+ list_add(&window->list, &dma_win_list);
+ spin_unlock(&dma_win_list_lock);
/* Only returns the dma_addr if DDW maps the whole partition */
if (len == order_base_2(max_addr))
@@ -1304,7 +1309,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
list_add(&fpdn->list, &failed_ddw_pdn_list);
out_unlock:
- mutex_unlock(&direct_window_init_mutex);
+ mutex_unlock(&dma_win_init_mutex);
return dma_addr;
}
@@ -1328,7 +1333,7 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->table_group;
pdn = pdn->parent) {
- dma_window = of_get_property(pdn, "ibm,dma-window", NULL);
+ dma_window = of_get_property(pdn, DEFAULT_DMA_WIN, NULL);
if (dma_window)
break;
}
@@ -1379,7 +1384,7 @@ static bool iommu_bypass_supported_pSeriesLP(struct pci_dev *pdev, u64 dma_mask)
*/
for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->table_group;
pdn = pdn->parent) {
- dma_window = of_get_property(pdn, "ibm,dma-window", NULL);
+ dma_window = of_get_property(pdn, DEFAULT_DMA_WIN, NULL);
if (dma_window)
break;
}
@@ -1396,29 +1401,29 @@ static bool iommu_bypass_supported_pSeriesLP(struct pci_dev *pdev, u64 dma_mask)
static int iommu_mem_notifier(struct notifier_block *nb, unsigned long action,
void *data)
{
- struct direct_window *window;
+ struct dma_win *window;
struct memory_notify *arg = data;
int ret = 0;
switch (action) {
case MEM_GOING_ONLINE:
- spin_lock(&direct_window_list_lock);
- list_for_each_entry(window, &direct_window_list, list) {
+ spin_lock(&dma_win_list_lock);
+ list_for_each_entry(window, &dma_win_list, list) {
ret |= tce_setrange_multi_pSeriesLP(arg->start_pfn,
arg->nr_pages, window->prop);
/* XXX log error */
}
- spin_unlock(&direct_window_list_lock);
+ spin_unlock(&dma_win_list_lock);
break;
case MEM_CANCEL_ONLINE:
case MEM_OFFLINE:
- spin_lock(&direct_window_list_lock);
- list_for_each_entry(window, &direct_window_list, list) {
+ spin_lock(&dma_win_list_lock);
+ list_for_each_entry(window, &dma_win_list, list) {
ret |= tce_clearrange_multi_pSeriesLP(arg->start_pfn,
arg->nr_pages, window->prop);
/* XXX log error */
}
- spin_unlock(&direct_window_list_lock);
+ spin_unlock(&dma_win_list_lock);
break;
default:
break;
@@ -1439,7 +1444,7 @@ static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long acti
struct of_reconfig_data *rd = data;
struct device_node *np = rd->dn;
struct pci_dn *pci = PCI_DN(np);
- struct direct_window *window;
+ struct dma_win *window;
switch (action) {
case OF_RECONFIG_DETACH_NODE:
@@ -1455,15 +1460,15 @@ static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long acti
iommu_pseries_free_group(pci->table_group,
np->full_name);
- spin_lock(&direct_window_list_lock);
- list_for_each_entry(window, &direct_window_list, list) {
+ spin_lock(&dma_win_list_lock);
+ list_for_each_entry(window, &dma_win_list, list) {
if (window->device == np) {
list_del(&window->list);
kfree(window);
break;
}
}
- spin_unlock(&direct_window_list_lock);
+ spin_unlock(&dma_win_list_lock);
break;
default:
err = NOTIFY_DONE;
--
2.25.4
^ permalink raw reply related
* Re: [PATCH v2 5/6] powerpc/pseries/iommu: Make use of DDW even if it does not map the partition
From: Leonardo Bras @ 2020-07-03 6:27 UTC (permalink / raw)
To: Alexey Kardashevskiy, Michael Ellerman, Benjamin Herrenschmidt,
Paul Mackerras, Thiago Jung Bauermann, Ram Pai
Cc: linuxppc-dev, linux-kernel
In-Reply-To: <2c5dc8d2-f379-5a5f-844a-f4eea233f265@ozlabs.ru>
On Thu, 2020-07-02 at 10:31 +1000, Alexey Kardashevskiy wrote:
>
> On 02/07/2020 09:48, Leonardo Bras wrote:
> > On Wed, 2020-07-01 at 16:57 -0300, Leonardo Bras wrote:
> > > > It is not necessarily "direct" anymore as the name suggests, you may
> > > > want to change that. DMA64_PROPNAME, may be. Thanks,
> > > >
> > >
> > > Yeah, you are right.
> > > I will change this for next version, also changing the string name to
> > > reflect this.
> > >
> > > -#define DIRECT64_PROPNAME "linux,direct64-ddr-window-info"
> > > +#define DMA64_PROPNAME "linux,dma64-ddr-window-info"
> > >
> > > Is that ok?
> > >
> > > Thank you for helping!
> >
> > In fact, there is a lot of places in this file where it's called direct
> > window. Should I replace everything?
> > Should it be in a separated patch?
>
> If it looks simple and you write a nice commit log explaining all that
> and why you are not reusing the existing ibm,dma-window property (to
> provide a clue what "reset" will reset to? is there any other reason?)
> for that - sure, do it :)
>
v3 available here:
http://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=187348&state=%2A&archive=both
Best regards,
Leonardo
^ permalink raw reply
* Re: [PATCH V3 (RESEND) 2/3] mm/sparsemem: Enable vmem_altmap support in vmemmap_alloc_block_buf()
From: Anshuman Khandual @ 2020-07-03 6:32 UTC (permalink / raw)
To: Catalin Marinas
Cc: x86, H. Peter Anvin, Peter Zijlstra, Dave Hansen, linuxppc-dev,
linux-kernel, linux-mm, Ingo Molnar, Paul Mackerras,
Andy Lutomirski, Borislav Petkov, Thomas Gleixner, Will Deacon,
Andrew Morton, linux-arm-kernel
In-Reply-To: <20200702140752.GF22241@gaia>
On 07/02/2020 07:37 PM, Catalin Marinas wrote:
> On Thu, Jun 18, 2020 at 06:45:29AM +0530, Anshuman Khandual wrote:
>> There are many instances where vmemap allocation is often switched between
>> regular memory and device memory just based on whether altmap is available
>> or not. vmemmap_alloc_block_buf() is used in various platforms to allocate
>> vmemmap mappings. Lets also enable it to handle altmap based device memory
>> allocation along with existing regular memory allocations. This will help
>> in avoiding the altmap based allocation switch in many places.
>>
>> While here also implement a regular memory allocation fallback mechanism
>> when the first preferred device memory allocation fails. This will ensure
>> preserving the existing semantics on powerpc platform. To summarize there
>> are three different methods to call vmemmap_alloc_block_buf().
>>
>> (., NULL, false) /* Allocate from system RAM */
>> (., altmap, false) /* Allocate from altmap without any fallback */
>> (., altmap, true) /* Allocate from altmap with fallback (system RAM) */
> [...]
>> diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
>> index bc73abf0bc25..01e25b56eccb 100644
>> --- a/arch/powerpc/mm/init_64.c
>> +++ b/arch/powerpc/mm/init_64.c
>> @@ -225,12 +225,12 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
>> * fall back to system memory if the altmap allocation fail.
>> */
>> if (altmap && !altmap_cross_boundary(altmap, start, page_size)) {
>> - p = altmap_alloc_block_buf(page_size, altmap);
>> - if (!p)
>> - pr_debug("altmap block allocation failed, falling back to system memory");
>> + p = vmemmap_alloc_block_buf(page_size, node,
>> + altmap, true);
>> + } else {
>> + p = vmemmap_alloc_block_buf(page_size, node,
>> + NULL, false);
>> }
>> - if (!p)
>> - p = vmemmap_alloc_block_buf(page_size, node);
>> if (!p)
>> return -ENOMEM;
>
> Is the fallback argument actually necessary. It may be cleaner to just
> leave the code as is with the choice between altmap and NULL. If an arch
> needs a fallback (only powerpc), they have the fallback in place
> already. I don't see the powerpc code any better after this change.
>
> I'm fine with the altmap argument though.
Okay. Will drop 'fallback' from vmemmap_alloc_block_buf() and update the
callers. There will also be a single change in the subsequent patch, i.e.
vmemmap_alloc_block_buf(PMD_SIZE, node, altmap).
^ permalink raw reply
* [PATCH] powerpc/perf: Add kernel support for new MSR[HV PR] bits in trace-imc.
From: Anju T Sudhakar @ 2020-07-03 6:36 UTC (permalink / raw)
To: mpe; +Cc: maddy, linuxppc-dev, anju
IMC trace-mode record has MSR[HV PR] bits added in the third DW.
These bits can be used to set the cpumode for the instruction pointer
captured in each sample.
Add support in kernel to use these bits to set the cpumode for
each sample.
Signed-off-by: Anju T Sudhakar <anju@linux.vnet.ibm.com>
---
arch/powerpc/include/asm/imc-pmu.h | 5 +++++
arch/powerpc/perf/imc-pmu.c | 29 ++++++++++++++++++++++++-----
2 files changed, 29 insertions(+), 5 deletions(-)
diff --git a/arch/powerpc/include/asm/imc-pmu.h b/arch/powerpc/include/asm/imc-pmu.h
index 4da4fcba0684..4f897993b710 100644
--- a/arch/powerpc/include/asm/imc-pmu.h
+++ b/arch/powerpc/include/asm/imc-pmu.h
@@ -99,6 +99,11 @@ struct trace_imc_data {
*/
#define IMC_TRACE_RECORD_TB1_MASK 0x3ffffffffffULL
+/*
+ * Bit 0:1 in third DW of IMC trace record
+ * specifies the MSR[HV PR] values.
+ */
+#define IMC_TRACE_RECORD_VAL_HVPR(x) ((x) >> 62)
/*
* Device tree parser code detects IMC pmu support and
diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index cb50a9e1fd2d..310922fed9eb 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -1178,11 +1178,30 @@ static int trace_imc_prepare_sample(struct trace_imc_data *mem,
header->size = sizeof(*header) + event->header_size;
header->misc = 0;
- if (is_kernel_addr(data->ip))
- header->misc |= PERF_RECORD_MISC_KERNEL;
- else
- header->misc |= PERF_RECORD_MISC_USER;
-
+ if (cpu_has_feature(CPU_FTRS_POWER9)) {
+ if (is_kernel_addr(data->ip))
+ header->misc |= PERF_RECORD_MISC_KERNEL;
+ else
+ header->misc |= PERF_RECORD_MISC_USER;
+ } else {
+ switch (IMC_TRACE_RECORD_VAL_HVPR(mem->val)) {
+ case 0:/* when MSR HV and PR not set in the trace-record */
+ header->misc |= PERF_RECORD_MISC_GUEST_KERNEL;
+ break;
+ case 1: /* MSR HV is 0 and PR is 1 */
+ header->misc |= PERF_RECORD_MISC_GUEST_USER;
+ break;
+ case 2: /* MSR Hv is 1 and PR is 0 */
+ header->misc |= PERF_RECORD_MISC_HYPERVISOR;
+ break;
+ case 3: /* MSR HV is 1 and PR is 1 */
+ header->misc |= PERF_RECORD_MISC_USER;
+ break;
+ default:
+ pr_info("IMC: Unable to set the flag based on MSR bits\n");
+ break;
+ }
+ }
perf_event_header__init_id(header, data, event);
return 0;
--
2.25.4
^ permalink raw reply related
* Re: [PATCH] powerpc/powernv: machine check handler for POWER10
From: kernel test robot @ 2020-07-03 6:40 UTC (permalink / raw)
To: Nicholas Piggin, linuxppc-dev
Cc: Mahesh Salgaonkar, kbuild-all, Nicholas Piggin
In-Reply-To: <20200702233343.1128026-1-npiggin@gmail.com>
[-- Attachment #1: Type: text/plain, Size: 2412 bytes --]
Hi Nicholas,
I love your patch! Perhaps something to improve:
[auto build test WARNING on powerpc/next]
[also build test WARNING on v5.8-rc3 next-20200702]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]
url: https://github.com/0day-ci/linux/commits/Nicholas-Piggin/powerpc-powernv-machine-check-handler-for-POWER10/20200703-073739
base: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next
config: powerpc-allyesconfig (attached as .config)
compiler: powerpc64-linux-gcc (GCC) 9.3.0
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=powerpc
If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>
All warnings (new ones prefixed by >>):
arch/powerpc/kernel/mce_power.c:709:6: warning: no previous prototype for '__machine_check_early_realmode_p7' [-Wmissing-prototypes]
709 | long __machine_check_early_realmode_p7(struct pt_regs *regs)
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
arch/powerpc/kernel/mce_power.c:717:6: warning: no previous prototype for '__machine_check_early_realmode_p8' [-Wmissing-prototypes]
717 | long __machine_check_early_realmode_p8(struct pt_regs *regs)
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
arch/powerpc/kernel/mce_power.c:722:6: warning: no previous prototype for '__machine_check_early_realmode_p9' [-Wmissing-prototypes]
722 | long __machine_check_early_realmode_p9(struct pt_regs *regs)
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
>> arch/powerpc/kernel/mce_power.c:740:6: warning: no previous prototype for '__machine_check_early_realmode_p10' [-Wmissing-prototypes]
740 | long __machine_check_early_realmode_p10(struct pt_regs *regs)
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
vim +/__machine_check_early_realmode_p10 +740 arch/powerpc/kernel/mce_power.c
739
> 740 long __machine_check_early_realmode_p10(struct pt_regs *regs)
---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 69688 bytes --]
^ permalink raw reply
* [PATCH v2 0/6] powerpc: queued spinlocks and rwlocks
From: Nicholas Piggin @ 2020-07-03 7:35 UTC (permalink / raw)
Cc: linux-arch, Peter Zijlstra, linuxppc-dev, Boqun Feng,
linux-kernel, Nicholas Piggin, virtualization, Ingo Molnar,
kvm-ppc, Waiman Long, Will Deacon
v2 is updated to account for feedback from Will, Peter, and
Waiman (thank you), and trims off a couple of RFC and unrelated
patches.
Thanks,
Nick
Nicholas Piggin (6):
powerpc/powernv: must include hvcall.h to get PAPR defines
powerpc/pseries: move some PAPR paravirt functions to their own file
powerpc: move spinlock implementation to simple_spinlock
powerpc/64s: implement queued spinlocks and rwlocks
powerpc/pseries: implement paravirt qspinlocks for SPLPAR
powerpc/qspinlock: optimised atomic_try_cmpxchg_lock that adds the
lock hint
arch/powerpc/Kconfig | 13 +
arch/powerpc/include/asm/Kbuild | 2 +
arch/powerpc/include/asm/atomic.h | 28 ++
arch/powerpc/include/asm/paravirt.h | 89 +++++
arch/powerpc/include/asm/qspinlock.h | 80 +++++
arch/powerpc/include/asm/qspinlock_paravirt.h | 5 +
arch/powerpc/include/asm/simple_spinlock.h | 292 +++++++++++++++++
.../include/asm/simple_spinlock_types.h | 21 ++
arch/powerpc/include/asm/spinlock.h | 308 +-----------------
arch/powerpc/include/asm/spinlock_types.h | 17 +-
arch/powerpc/lib/Makefile | 3 +
arch/powerpc/lib/locks.c | 12 +-
arch/powerpc/platforms/powernv/pci-ioda-tce.c | 1 +
arch/powerpc/platforms/pseries/Kconfig | 5 +
arch/powerpc/platforms/pseries/setup.c | 6 +-
include/asm-generic/qspinlock.h | 4 +
16 files changed, 564 insertions(+), 322 deletions(-)
create mode 100644 arch/powerpc/include/asm/paravirt.h
create mode 100644 arch/powerpc/include/asm/qspinlock.h
create mode 100644 arch/powerpc/include/asm/qspinlock_paravirt.h
create mode 100644 arch/powerpc/include/asm/simple_spinlock.h
create mode 100644 arch/powerpc/include/asm/simple_spinlock_types.h
--
2.23.0
^ permalink raw reply
* [PATCH v2 1/6] powerpc/powernv: must include hvcall.h to get PAPR defines
From: Nicholas Piggin @ 2020-07-03 7:35 UTC (permalink / raw)
Cc: linux-arch, Peter Zijlstra, linuxppc-dev, Boqun Feng,
linux-kernel, Nicholas Piggin, virtualization, Ingo Molnar,
kvm-ppc, Waiman Long, Will Deacon
In-Reply-To: <20200703073516.1354108-1-npiggin@gmail.com>
An include goes away in future patches which breaks compilation
without this.
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
arch/powerpc/platforms/powernv/pci-ioda-tce.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
index f923359d8afc..8eba6ece7808 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda-tce.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
@@ -15,6 +15,7 @@
#include <asm/iommu.h>
#include <asm/tce.h>
+#include <asm/hvcall.h> /* share error returns with PAPR */
#include "pci.h"
unsigned long pnv_ioda_parse_tce_sizes(struct pnv_phb *phb)
--
2.23.0
^ permalink raw reply related
* [PATCH v2 2/6] powerpc/pseries: move some PAPR paravirt functions to their own file
From: Nicholas Piggin @ 2020-07-03 7:35 UTC (permalink / raw)
Cc: linux-arch, Peter Zijlstra, linuxppc-dev, Boqun Feng,
linux-kernel, Nicholas Piggin, virtualization, Ingo Molnar,
kvm-ppc, Waiman Long, Will Deacon
In-Reply-To: <20200703073516.1354108-1-npiggin@gmail.com>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
arch/powerpc/include/asm/paravirt.h | 61 +++++++++++++++++++++++++++++
arch/powerpc/include/asm/spinlock.h | 24 +-----------
arch/powerpc/lib/locks.c | 12 +++---
3 files changed, 68 insertions(+), 29 deletions(-)
create mode 100644 arch/powerpc/include/asm/paravirt.h
diff --git a/arch/powerpc/include/asm/paravirt.h b/arch/powerpc/include/asm/paravirt.h
new file mode 100644
index 000000000000..7a8546660a63
--- /dev/null
+++ b/arch/powerpc/include/asm/paravirt.h
@@ -0,0 +1,61 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef __ASM_PARAVIRT_H
+#define __ASM_PARAVIRT_H
+#ifdef __KERNEL__
+
+#include <linux/jump_label.h>
+#include <asm/smp.h>
+#ifdef CONFIG_PPC64
+#include <asm/paca.h>
+#include <asm/hvcall.h>
+#endif
+
+#ifdef CONFIG_PPC_SPLPAR
+DECLARE_STATIC_KEY_FALSE(shared_processor);
+
+static inline bool is_shared_processor(void)
+{
+ return static_branch_unlikely(&shared_processor);
+}
+
+/* If bit 0 is set, the cpu has been preempted */
+static inline u32 yield_count_of(int cpu)
+{
+ __be32 yield_count = READ_ONCE(lppaca_of(cpu).yield_count);
+ return be32_to_cpu(yield_count);
+}
+
+static inline void yield_to_preempted(int cpu, u32 yield_count)
+{
+ plpar_hcall_norets(H_CONFER, get_hard_smp_processor_id(cpu), yield_count);
+}
+#else
+static inline bool is_shared_processor(void)
+{
+ return false;
+}
+
+static inline u32 yield_count_of(int cpu)
+{
+ return 0;
+}
+
+extern void ___bad_yield_to_preempted(void);
+static inline void yield_to_preempted(int cpu, u32 yield_count)
+{
+ ___bad_yield_to_preempted(); /* This would be a bug */
+}
+#endif
+
+#define vcpu_is_preempted vcpu_is_preempted
+static inline bool vcpu_is_preempted(int cpu)
+{
+ if (!is_shared_processor())
+ return false;
+ if (yield_count_of(cpu) & 1)
+ return true;
+ return false;
+}
+
+#endif /* __KERNEL__ */
+#endif /* __ASM_PARAVIRT_H */
diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h
index 2d620896cdae..79be9bb10bbb 100644
--- a/arch/powerpc/include/asm/spinlock.h
+++ b/arch/powerpc/include/asm/spinlock.h
@@ -15,11 +15,10 @@
*
* (the type definitions are in asm/spinlock_types.h)
*/
-#include <linux/jump_label.h>
#include <linux/irqflags.h>
+#include <asm/paravirt.h>
#ifdef CONFIG_PPC64
#include <asm/paca.h>
-#include <asm/hvcall.h>
#endif
#include <asm/synch.h>
#include <asm/ppc-opcode.h>
@@ -35,18 +34,6 @@
#define LOCK_TOKEN 1
#endif
-#ifdef CONFIG_PPC_PSERIES
-DECLARE_STATIC_KEY_FALSE(shared_processor);
-
-#define vcpu_is_preempted vcpu_is_preempted
-static inline bool vcpu_is_preempted(int cpu)
-{
- if (!static_branch_unlikely(&shared_processor))
- return false;
- return !!(be32_to_cpu(lppaca_of(cpu).yield_count) & 1);
-}
-#endif
-
static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock)
{
return lock.slock == 0;
@@ -110,15 +97,6 @@ static inline void splpar_spin_yield(arch_spinlock_t *lock) {};
static inline void splpar_rw_yield(arch_rwlock_t *lock) {};
#endif
-static inline bool is_shared_processor(void)
-{
-#ifdef CONFIG_PPC_SPLPAR
- return static_branch_unlikely(&shared_processor);
-#else
- return false;
-#endif
-}
-
static inline void spin_yield(arch_spinlock_t *lock)
{
if (is_shared_processor())
diff --git a/arch/powerpc/lib/locks.c b/arch/powerpc/lib/locks.c
index 6440d5943c00..04165b7a163f 100644
--- a/arch/powerpc/lib/locks.c
+++ b/arch/powerpc/lib/locks.c
@@ -27,14 +27,14 @@ void splpar_spin_yield(arch_spinlock_t *lock)
return;
holder_cpu = lock_value & 0xffff;
BUG_ON(holder_cpu >= NR_CPUS);
- yield_count = be32_to_cpu(lppaca_of(holder_cpu).yield_count);
+
+ yield_count = yield_count_of(holder_cpu);
if ((yield_count & 1) == 0)
return; /* virtual cpu is currently running */
rmb();
if (lock->slock != lock_value)
return; /* something has changed */
- plpar_hcall_norets(H_CONFER,
- get_hard_smp_processor_id(holder_cpu), yield_count);
+ yield_to_preempted(holder_cpu, yield_count);
}
EXPORT_SYMBOL_GPL(splpar_spin_yield);
@@ -53,13 +53,13 @@ void splpar_rw_yield(arch_rwlock_t *rw)
return; /* no write lock at present */
holder_cpu = lock_value & 0xffff;
BUG_ON(holder_cpu >= NR_CPUS);
- yield_count = be32_to_cpu(lppaca_of(holder_cpu).yield_count);
+
+ yield_count = yield_count_of(holder_cpu);
if ((yield_count & 1) == 0)
return; /* virtual cpu is currently running */
rmb();
if (rw->lock != lock_value)
return; /* something has changed */
- plpar_hcall_norets(H_CONFER,
- get_hard_smp_processor_id(holder_cpu), yield_count);
+ yield_to_preempted(holder_cpu, yield_count);
}
#endif
--
2.23.0
^ permalink raw reply related
* [PATCH v2 3/6] powerpc: move spinlock implementation to simple_spinlock
From: Nicholas Piggin @ 2020-07-03 7:35 UTC (permalink / raw)
Cc: linux-arch, Peter Zijlstra, linuxppc-dev, Boqun Feng,
linux-kernel, Nicholas Piggin, virtualization, Ingo Molnar,
kvm-ppc, Waiman Long, Will Deacon
In-Reply-To: <20200703073516.1354108-1-npiggin@gmail.com>
This prepares for queued spinlocks. It is a simple rename, except for
updating the preprocessor guard name and a file reference.
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
arch/powerpc/include/asm/simple_spinlock.h | 292 ++++++++++++++++++
.../include/asm/simple_spinlock_types.h | 21 ++
arch/powerpc/include/asm/spinlock.h | 285 +----------------
arch/powerpc/include/asm/spinlock_types.h | 12 +-
4 files changed, 315 insertions(+), 295 deletions(-)
create mode 100644 arch/powerpc/include/asm/simple_spinlock.h
create mode 100644 arch/powerpc/include/asm/simple_spinlock_types.h
diff --git a/arch/powerpc/include/asm/simple_spinlock.h b/arch/powerpc/include/asm/simple_spinlock.h
new file mode 100644
index 000000000000..e048c041c4a9
--- /dev/null
+++ b/arch/powerpc/include/asm/simple_spinlock.h
@@ -0,0 +1,292 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef __ASM_SIMPLE_SPINLOCK_H
+#define __ASM_SIMPLE_SPINLOCK_H
+#ifdef __KERNEL__
+
+/*
+ * Simple spin lock operations.
+ *
+ * Copyright (C) 2001-2004 Paul Mackerras <paulus@au.ibm.com>, IBM
+ * Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM
+ * Copyright (C) 2002 Dave Engebretsen <engebret@us.ibm.com>, IBM
+ * Rework to support virtual processors
+ *
+ * Type of int is used as a full 64b word is not necessary.
+ *
+ * (the type definitions are in asm/simple_spinlock_types.h)
+ */
+#include <linux/irqflags.h>
+#include <asm/paravirt.h>
+#ifdef CONFIG_PPC64
+#include <asm/paca.h>
+#endif
+#include <asm/synch.h>
+#include <asm/ppc-opcode.h>
+
+#ifdef CONFIG_PPC64
+/* use 0x800000yy when locked, where yy == CPU number */
+#ifdef __BIG_ENDIAN__
+#define LOCK_TOKEN (*(u32 *)(&get_paca()->lock_token))
+#else
+#define LOCK_TOKEN (*(u32 *)(&get_paca()->paca_index))
+#endif
+#else
+#define LOCK_TOKEN 1
+#endif
+
+static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock)
+{
+ return lock.slock == 0;
+}
+
+static inline int arch_spin_is_locked(arch_spinlock_t *lock)
+{
+ smp_mb();
+ return !arch_spin_value_unlocked(*lock);
+}
+
+/*
+ * This returns the old value in the lock, so we succeeded
+ * in getting the lock if the return value is 0.
+ */
+static inline unsigned long __arch_spin_trylock(arch_spinlock_t *lock)
+{
+ unsigned long tmp, token;
+
+ token = LOCK_TOKEN;
+ __asm__ __volatile__(
+"1: " PPC_LWARX(%0,0,%2,1) "\n\
+ cmpwi 0,%0,0\n\
+ bne- 2f\n\
+ stwcx. %1,0,%2\n\
+ bne- 1b\n"
+ PPC_ACQUIRE_BARRIER
+"2:"
+ : "=&r" (tmp)
+ : "r" (token), "r" (&lock->slock)
+ : "cr0", "memory");
+
+ return tmp;
+}
+
+static inline int arch_spin_trylock(arch_spinlock_t *lock)
+{
+ return __arch_spin_trylock(lock) == 0;
+}
+
+/*
+ * On a system with shared processors (that is, where a physical
+ * processor is multiplexed between several virtual processors),
+ * there is no point spinning on a lock if the holder of the lock
+ * isn't currently scheduled on a physical processor. Instead
+ * we detect this situation and ask the hypervisor to give the
+ * rest of our timeslice to the lock holder.
+ *
+ * So that we can tell which virtual processor is holding a lock,
+ * we put 0x80000000 | smp_processor_id() in the lock when it is
+ * held. Conveniently, we have a word in the paca that holds this
+ * value.
+ */
+
+#if defined(CONFIG_PPC_SPLPAR)
+/* We only yield to the hypervisor if we are in shared processor mode */
+void splpar_spin_yield(arch_spinlock_t *lock);
+void splpar_rw_yield(arch_rwlock_t *lock);
+#else /* SPLPAR */
+static inline void splpar_spin_yield(arch_spinlock_t *lock) {};
+static inline void splpar_rw_yield(arch_rwlock_t *lock) {};
+#endif
+
+static inline void spin_yield(arch_spinlock_t *lock)
+{
+ if (is_shared_processor())
+ splpar_spin_yield(lock);
+ else
+ barrier();
+}
+
+static inline void rw_yield(arch_rwlock_t *lock)
+{
+ if (is_shared_processor())
+ splpar_rw_yield(lock);
+ else
+ barrier();
+}
+
+static inline void arch_spin_lock(arch_spinlock_t *lock)
+{
+ while (1) {
+ if (likely(__arch_spin_trylock(lock) == 0))
+ break;
+ do {
+ HMT_low();
+ if (is_shared_processor())
+ splpar_spin_yield(lock);
+ } while (unlikely(lock->slock != 0));
+ HMT_medium();
+ }
+}
+
+static inline
+void arch_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags)
+{
+ unsigned long flags_dis;
+
+ while (1) {
+ if (likely(__arch_spin_trylock(lock) == 0))
+ break;
+ local_save_flags(flags_dis);
+ local_irq_restore(flags);
+ do {
+ HMT_low();
+ if (is_shared_processor())
+ splpar_spin_yield(lock);
+ } while (unlikely(lock->slock != 0));
+ HMT_medium();
+ local_irq_restore(flags_dis);
+ }
+}
+#define arch_spin_lock_flags arch_spin_lock_flags
+
+static inline void arch_spin_unlock(arch_spinlock_t *lock)
+{
+ __asm__ __volatile__("# arch_spin_unlock\n\t"
+ PPC_RELEASE_BARRIER: : :"memory");
+ lock->slock = 0;
+}
+
+/*
+ * Read-write spinlocks, allowing multiple readers
+ * but only one writer.
+ *
+ * NOTE! it is quite common to have readers in interrupts
+ * but no interrupt writers. For those circumstances we
+ * can "mix" irq-safe locks - any writer needs to get a
+ * irq-safe write-lock, but readers can get non-irqsafe
+ * read-locks.
+ */
+
+#ifdef CONFIG_PPC64
+#define __DO_SIGN_EXTEND "extsw %0,%0\n"
+#define WRLOCK_TOKEN LOCK_TOKEN /* it's negative */
+#else
+#define __DO_SIGN_EXTEND
+#define WRLOCK_TOKEN (-1)
+#endif
+
+/*
+ * This returns the old value in the lock + 1,
+ * so we got a read lock if the return value is > 0.
+ */
+static inline long __arch_read_trylock(arch_rwlock_t *rw)
+{
+ long tmp;
+
+ __asm__ __volatile__(
+"1: " PPC_LWARX(%0,0,%1,1) "\n"
+ __DO_SIGN_EXTEND
+" addic. %0,%0,1\n\
+ ble- 2f\n"
+" stwcx. %0,0,%1\n\
+ bne- 1b\n"
+ PPC_ACQUIRE_BARRIER
+"2:" : "=&r" (tmp)
+ : "r" (&rw->lock)
+ : "cr0", "xer", "memory");
+
+ return tmp;
+}
+
+/*
+ * This returns the old value in the lock,
+ * so we got the write lock if the return value is 0.
+ */
+static inline long __arch_write_trylock(arch_rwlock_t *rw)
+{
+ long tmp, token;
+
+ token = WRLOCK_TOKEN;
+ __asm__ __volatile__(
+"1: " PPC_LWARX(%0,0,%2,1) "\n\
+ cmpwi 0,%0,0\n\
+ bne- 2f\n"
+" stwcx. %1,0,%2\n\
+ bne- 1b\n"
+ PPC_ACQUIRE_BARRIER
+"2:" : "=&r" (tmp)
+ : "r" (token), "r" (&rw->lock)
+ : "cr0", "memory");
+
+ return tmp;
+}
+
+static inline void arch_read_lock(arch_rwlock_t *rw)
+{
+ while (1) {
+ if (likely(__arch_read_trylock(rw) > 0))
+ break;
+ do {
+ HMT_low();
+ if (is_shared_processor())
+ splpar_rw_yield(rw);
+ } while (unlikely(rw->lock < 0));
+ HMT_medium();
+ }
+}
+
+static inline void arch_write_lock(arch_rwlock_t *rw)
+{
+ while (1) {
+ if (likely(__arch_write_trylock(rw) == 0))
+ break;
+ do {
+ HMT_low();
+ if (is_shared_processor())
+ splpar_rw_yield(rw);
+ } while (unlikely(rw->lock != 0));
+ HMT_medium();
+ }
+}
+
+static inline int arch_read_trylock(arch_rwlock_t *rw)
+{
+ return __arch_read_trylock(rw) > 0;
+}
+
+static inline int arch_write_trylock(arch_rwlock_t *rw)
+{
+ return __arch_write_trylock(rw) == 0;
+}
+
+static inline void arch_read_unlock(arch_rwlock_t *rw)
+{
+ long tmp;
+
+ __asm__ __volatile__(
+ "# read_unlock\n\t"
+ PPC_RELEASE_BARRIER
+"1: lwarx %0,0,%1\n\
+ addic %0,%0,-1\n"
+" stwcx. %0,0,%1\n\
+ bne- 1b"
+ : "=&r"(tmp)
+ : "r"(&rw->lock)
+ : "cr0", "xer", "memory");
+}
+
+static inline void arch_write_unlock(arch_rwlock_t *rw)
+{
+ __asm__ __volatile__("# write_unlock\n\t"
+ PPC_RELEASE_BARRIER: : :"memory");
+ rw->lock = 0;
+}
+
+#define arch_spin_relax(lock) spin_yield(lock)
+#define arch_read_relax(lock) rw_yield(lock)
+#define arch_write_relax(lock) rw_yield(lock)
+
+/* See include/linux/spinlock.h */
+#define smp_mb__after_spinlock() smp_mb()
+
+#endif /* __KERNEL__ */
+#endif /* __ASM_SIMPLE_SPINLOCK_H */
diff --git a/arch/powerpc/include/asm/simple_spinlock_types.h b/arch/powerpc/include/asm/simple_spinlock_types.h
new file mode 100644
index 000000000000..7c2b48ce62dc
--- /dev/null
+++ b/arch/powerpc/include/asm/simple_spinlock_types.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_POWERPC_SIMPLE_SPINLOCK_TYPES_H
+#define _ASM_POWERPC_SIMPLE_SPINLOCK_TYPES_H
+
+#ifndef __LINUX_SPINLOCK_TYPES_H
+# error "please don't include this file directly"
+#endif
+
+typedef struct {
+ volatile unsigned int slock;
+} arch_spinlock_t;
+
+#define __ARCH_SPIN_LOCK_UNLOCKED { 0 }
+
+typedef struct {
+ volatile signed int lock;
+} arch_rwlock_t;
+
+#define __ARCH_RW_LOCK_UNLOCKED { 0 }
+
+#endif
diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h
index 79be9bb10bbb..21357fe05fe0 100644
--- a/arch/powerpc/include/asm/spinlock.h
+++ b/arch/powerpc/include/asm/spinlock.h
@@ -3,290 +3,7 @@
#define __ASM_SPINLOCK_H
#ifdef __KERNEL__
-/*
- * Simple spin lock operations.
- *
- * Copyright (C) 2001-2004 Paul Mackerras <paulus@au.ibm.com>, IBM
- * Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM
- * Copyright (C) 2002 Dave Engebretsen <engebret@us.ibm.com>, IBM
- * Rework to support virtual processors
- *
- * Type of int is used as a full 64b word is not necessary.
- *
- * (the type definitions are in asm/spinlock_types.h)
- */
-#include <linux/irqflags.h>
-#include <asm/paravirt.h>
-#ifdef CONFIG_PPC64
-#include <asm/paca.h>
-#endif
-#include <asm/synch.h>
-#include <asm/ppc-opcode.h>
-
-#ifdef CONFIG_PPC64
-/* use 0x800000yy when locked, where yy == CPU number */
-#ifdef __BIG_ENDIAN__
-#define LOCK_TOKEN (*(u32 *)(&get_paca()->lock_token))
-#else
-#define LOCK_TOKEN (*(u32 *)(&get_paca()->paca_index))
-#endif
-#else
-#define LOCK_TOKEN 1
-#endif
-
-static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock)
-{
- return lock.slock == 0;
-}
-
-static inline int arch_spin_is_locked(arch_spinlock_t *lock)
-{
- smp_mb();
- return !arch_spin_value_unlocked(*lock);
-}
-
-/*
- * This returns the old value in the lock, so we succeeded
- * in getting the lock if the return value is 0.
- */
-static inline unsigned long __arch_spin_trylock(arch_spinlock_t *lock)
-{
- unsigned long tmp, token;
-
- token = LOCK_TOKEN;
- __asm__ __volatile__(
-"1: " PPC_LWARX(%0,0,%2,1) "\n\
- cmpwi 0,%0,0\n\
- bne- 2f\n\
- stwcx. %1,0,%2\n\
- bne- 1b\n"
- PPC_ACQUIRE_BARRIER
-"2:"
- : "=&r" (tmp)
- : "r" (token), "r" (&lock->slock)
- : "cr0", "memory");
-
- return tmp;
-}
-
-static inline int arch_spin_trylock(arch_spinlock_t *lock)
-{
- return __arch_spin_trylock(lock) == 0;
-}
-
-/*
- * On a system with shared processors (that is, where a physical
- * processor is multiplexed between several virtual processors),
- * there is no point spinning on a lock if the holder of the lock
- * isn't currently scheduled on a physical processor. Instead
- * we detect this situation and ask the hypervisor to give the
- * rest of our timeslice to the lock holder.
- *
- * So that we can tell which virtual processor is holding a lock,
- * we put 0x80000000 | smp_processor_id() in the lock when it is
- * held. Conveniently, we have a word in the paca that holds this
- * value.
- */
-
-#if defined(CONFIG_PPC_SPLPAR)
-/* We only yield to the hypervisor if we are in shared processor mode */
-void splpar_spin_yield(arch_spinlock_t *lock);
-void splpar_rw_yield(arch_rwlock_t *lock);
-#else /* SPLPAR */
-static inline void splpar_spin_yield(arch_spinlock_t *lock) {};
-static inline void splpar_rw_yield(arch_rwlock_t *lock) {};
-#endif
-
-static inline void spin_yield(arch_spinlock_t *lock)
-{
- if (is_shared_processor())
- splpar_spin_yield(lock);
- else
- barrier();
-}
-
-static inline void rw_yield(arch_rwlock_t *lock)
-{
- if (is_shared_processor())
- splpar_rw_yield(lock);
- else
- barrier();
-}
-
-static inline void arch_spin_lock(arch_spinlock_t *lock)
-{
- while (1) {
- if (likely(__arch_spin_trylock(lock) == 0))
- break;
- do {
- HMT_low();
- if (is_shared_processor())
- splpar_spin_yield(lock);
- } while (unlikely(lock->slock != 0));
- HMT_medium();
- }
-}
-
-static inline
-void arch_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags)
-{
- unsigned long flags_dis;
-
- while (1) {
- if (likely(__arch_spin_trylock(lock) == 0))
- break;
- local_save_flags(flags_dis);
- local_irq_restore(flags);
- do {
- HMT_low();
- if (is_shared_processor())
- splpar_spin_yield(lock);
- } while (unlikely(lock->slock != 0));
- HMT_medium();
- local_irq_restore(flags_dis);
- }
-}
-#define arch_spin_lock_flags arch_spin_lock_flags
-
-static inline void arch_spin_unlock(arch_spinlock_t *lock)
-{
- __asm__ __volatile__("# arch_spin_unlock\n\t"
- PPC_RELEASE_BARRIER: : :"memory");
- lock->slock = 0;
-}
-
-/*
- * Read-write spinlocks, allowing multiple readers
- * but only one writer.
- *
- * NOTE! it is quite common to have readers in interrupts
- * but no interrupt writers. For those circumstances we
- * can "mix" irq-safe locks - any writer needs to get a
- * irq-safe write-lock, but readers can get non-irqsafe
- * read-locks.
- */
-
-#ifdef CONFIG_PPC64
-#define __DO_SIGN_EXTEND "extsw %0,%0\n"
-#define WRLOCK_TOKEN LOCK_TOKEN /* it's negative */
-#else
-#define __DO_SIGN_EXTEND
-#define WRLOCK_TOKEN (-1)
-#endif
-
-/*
- * This returns the old value in the lock + 1,
- * so we got a read lock if the return value is > 0.
- */
-static inline long __arch_read_trylock(arch_rwlock_t *rw)
-{
- long tmp;
-
- __asm__ __volatile__(
-"1: " PPC_LWARX(%0,0,%1,1) "\n"
- __DO_SIGN_EXTEND
-" addic. %0,%0,1\n\
- ble- 2f\n"
-" stwcx. %0,0,%1\n\
- bne- 1b\n"
- PPC_ACQUIRE_BARRIER
-"2:" : "=&r" (tmp)
- : "r" (&rw->lock)
- : "cr0", "xer", "memory");
-
- return tmp;
-}
-
-/*
- * This returns the old value in the lock,
- * so we got the write lock if the return value is 0.
- */
-static inline long __arch_write_trylock(arch_rwlock_t *rw)
-{
- long tmp, token;
-
- token = WRLOCK_TOKEN;
- __asm__ __volatile__(
-"1: " PPC_LWARX(%0,0,%2,1) "\n\
- cmpwi 0,%0,0\n\
- bne- 2f\n"
-" stwcx. %1,0,%2\n\
- bne- 1b\n"
- PPC_ACQUIRE_BARRIER
-"2:" : "=&r" (tmp)
- : "r" (token), "r" (&rw->lock)
- : "cr0", "memory");
-
- return tmp;
-}
-
-static inline void arch_read_lock(arch_rwlock_t *rw)
-{
- while (1) {
- if (likely(__arch_read_trylock(rw) > 0))
- break;
- do {
- HMT_low();
- if (is_shared_processor())
- splpar_rw_yield(rw);
- } while (unlikely(rw->lock < 0));
- HMT_medium();
- }
-}
-
-static inline void arch_write_lock(arch_rwlock_t *rw)
-{
- while (1) {
- if (likely(__arch_write_trylock(rw) == 0))
- break;
- do {
- HMT_low();
- if (is_shared_processor())
- splpar_rw_yield(rw);
- } while (unlikely(rw->lock != 0));
- HMT_medium();
- }
-}
-
-static inline int arch_read_trylock(arch_rwlock_t *rw)
-{
- return __arch_read_trylock(rw) > 0;
-}
-
-static inline int arch_write_trylock(arch_rwlock_t *rw)
-{
- return __arch_write_trylock(rw) == 0;
-}
-
-static inline void arch_read_unlock(arch_rwlock_t *rw)
-{
- long tmp;
-
- __asm__ __volatile__(
- "# read_unlock\n\t"
- PPC_RELEASE_BARRIER
-"1: lwarx %0,0,%1\n\
- addic %0,%0,-1\n"
-" stwcx. %0,0,%1\n\
- bne- 1b"
- : "=&r"(tmp)
- : "r"(&rw->lock)
- : "cr0", "xer", "memory");
-}
-
-static inline void arch_write_unlock(arch_rwlock_t *rw)
-{
- __asm__ __volatile__("# write_unlock\n\t"
- PPC_RELEASE_BARRIER: : :"memory");
- rw->lock = 0;
-}
-
-#define arch_spin_relax(lock) spin_yield(lock)
-#define arch_read_relax(lock) rw_yield(lock)
-#define arch_write_relax(lock) rw_yield(lock)
-
-/* See include/linux/spinlock.h */
-#define smp_mb__after_spinlock() smp_mb()
+#include <asm/simple_spinlock.h>
#endif /* __KERNEL__ */
#endif /* __ASM_SPINLOCK_H */
diff --git a/arch/powerpc/include/asm/spinlock_types.h b/arch/powerpc/include/asm/spinlock_types.h
index 87adaf13b7e8..3906f52dae65 100644
--- a/arch/powerpc/include/asm/spinlock_types.h
+++ b/arch/powerpc/include/asm/spinlock_types.h
@@ -6,16 +6,6 @@
# error "please don't include this file directly"
#endif
-typedef struct {
- volatile unsigned int slock;
-} arch_spinlock_t;
-
-#define __ARCH_SPIN_LOCK_UNLOCKED { 0 }
-
-typedef struct {
- volatile signed int lock;
-} arch_rwlock_t;
-
-#define __ARCH_RW_LOCK_UNLOCKED { 0 }
+#include <asm/simple_spinlock_types.h>
#endif
--
2.23.0
^ permalink raw reply related
* [PATCH v2 4/6] powerpc/64s: implement queued spinlocks and rwlocks
From: Nicholas Piggin @ 2020-07-03 7:35 UTC (permalink / raw)
Cc: linux-arch, Peter Zijlstra, linuxppc-dev, Boqun Feng,
linux-kernel, Nicholas Piggin, virtualization, Ingo Molnar,
kvm-ppc, Waiman Long, Will Deacon
In-Reply-To: <20200703073516.1354108-1-npiggin@gmail.com>
These have shown significantly improved performance and fairness when
spinlock contention is moderate to high on very large systems.
[ Numbers hopefully forthcoming after more testing, but initial
results look good ]
Thanks to the fast path, single-threaded performance is not noticeably
hurt.
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
arch/powerpc/Kconfig | 13 ++++++++++++
arch/powerpc/include/asm/Kbuild | 2 ++
arch/powerpc/include/asm/qspinlock.h | 25 +++++++++++++++++++++++
arch/powerpc/include/asm/spinlock.h | 5 +++++
arch/powerpc/include/asm/spinlock_types.h | 5 +++++
arch/powerpc/lib/Makefile | 3 +++
include/asm-generic/qspinlock.h | 2 ++
7 files changed, 55 insertions(+)
create mode 100644 arch/powerpc/include/asm/qspinlock.h
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 9fa23eb320ff..b17575109876 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -145,6 +145,8 @@ config PPC
select ARCH_SUPPORTS_ATOMIC_RMW
select ARCH_USE_BUILTIN_BSWAP
select ARCH_USE_CMPXCHG_LOCKREF if PPC64
+ select ARCH_USE_QUEUED_RWLOCKS if PPC_QUEUED_SPINLOCKS
+ select ARCH_USE_QUEUED_SPINLOCKS if PPC_QUEUED_SPINLOCKS
select ARCH_WANT_IPC_PARSE_VERSION
select ARCH_WEAK_RELEASE_ACQUIRE
select BINFMT_ELF
@@ -490,6 +492,17 @@ config HOTPLUG_CPU
Say N if you are unsure.
+config PPC_QUEUED_SPINLOCKS
+ bool "Queued spinlocks"
+ depends on SMP
+ default "y" if PPC_BOOK3S_64
+ help
+ Say Y here to use queued spinlocks, which are more complex
+ but give better scalability and fairness on large SMP and NUMA
+ systems.
+
+ If unsure, say "Y" if you have lots of cores, otherwise "N".
+
config ARCH_CPU_PROBE_RELEASE
def_bool y
depends on HOTPLUG_CPU
diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild
index dadbcf3a0b1e..1dd8b6adff5e 100644
--- a/arch/powerpc/include/asm/Kbuild
+++ b/arch/powerpc/include/asm/Kbuild
@@ -6,5 +6,7 @@ generated-y += syscall_table_spu.h
generic-y += export.h
generic-y += local64.h
generic-y += mcs_spinlock.h
+generic-y += qrwlock.h
+generic-y += qspinlock.h
generic-y += vtime.h
generic-y += early_ioremap.h
diff --git a/arch/powerpc/include/asm/qspinlock.h b/arch/powerpc/include/asm/qspinlock.h
new file mode 100644
index 000000000000..c49e33e24edd
--- /dev/null
+++ b/arch/powerpc/include/asm/qspinlock.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_POWERPC_QSPINLOCK_H
+#define _ASM_POWERPC_QSPINLOCK_H
+
+#include <asm-generic/qspinlock_types.h>
+
+#define _Q_PENDING_LOOPS (1 << 9) /* not tuned */
+
+#define smp_mb__after_spinlock() smp_mb()
+
+static __always_inline int queued_spin_is_locked(struct qspinlock *lock)
+{
+ /*
+ * This barrier was added to simple spinlocks by commit 51d7d5205d338,
+ * but it should now be possible to remove it, asm arm64 has done with
+ * commit c6f5d02b6a0f.
+ */
+ smp_mb();
+ return atomic_read(&lock->val);
+}
+#define queued_spin_is_locked queued_spin_is_locked
+
+#include <asm-generic/qspinlock.h>
+
+#endif /* _ASM_POWERPC_QSPINLOCK_H */
diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h
index 21357fe05fe0..434615f1d761 100644
--- a/arch/powerpc/include/asm/spinlock.h
+++ b/arch/powerpc/include/asm/spinlock.h
@@ -3,7 +3,12 @@
#define __ASM_SPINLOCK_H
#ifdef __KERNEL__
+#ifdef CONFIG_PPC_QUEUED_SPINLOCKS
+#include <asm/qspinlock.h>
+#include <asm/qrwlock.h>
+#else
#include <asm/simple_spinlock.h>
+#endif
#endif /* __KERNEL__ */
#endif /* __ASM_SPINLOCK_H */
diff --git a/arch/powerpc/include/asm/spinlock_types.h b/arch/powerpc/include/asm/spinlock_types.h
index 3906f52dae65..c5d742f18021 100644
--- a/arch/powerpc/include/asm/spinlock_types.h
+++ b/arch/powerpc/include/asm/spinlock_types.h
@@ -6,6 +6,11 @@
# error "please don't include this file directly"
#endif
+#ifdef CONFIG_PPC_QUEUED_SPINLOCKS
+#include <asm-generic/qspinlock_types.h>
+#include <asm-generic/qrwlock_types.h>
+#else
#include <asm/simple_spinlock_types.h>
+#endif
#endif
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index 5e994cda8e40..d66a645503eb 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -41,7 +41,10 @@ obj-$(CONFIG_PPC_BOOK3S_64) += copyuser_power7.o copypage_power7.o \
obj64-y += copypage_64.o copyuser_64.o mem_64.o hweight_64.o \
memcpy_64.o memcpy_mcsafe_64.o
+ifndef CONFIG_PPC_QUEUED_SPINLOCKS
obj64-$(CONFIG_SMP) += locks.o
+endif
+
obj64-$(CONFIG_ALTIVEC) += vmx-helper.o
obj64-$(CONFIG_KPROBES_SANITY_TEST) += test_emulate_step.o \
test_emulate_step_exec_instr.o
diff --git a/include/asm-generic/qspinlock.h b/include/asm-generic/qspinlock.h
index fde943d180e0..fb0a814d4395 100644
--- a/include/asm-generic/qspinlock.h
+++ b/include/asm-generic/qspinlock.h
@@ -12,6 +12,7 @@
#include <asm-generic/qspinlock_types.h>
+#ifndef queued_spin_is_locked
/**
* queued_spin_is_locked - is the spinlock locked?
* @lock: Pointer to queued spinlock structure
@@ -25,6 +26,7 @@ static __always_inline int queued_spin_is_locked(struct qspinlock *lock)
*/
return atomic_read(&lock->val);
}
+#endif
/**
* queued_spin_value_unlocked - is the spinlock structure unlocked?
--
2.23.0
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox