LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v4 10/23] powerpc/irq: Add stub irq_soft_mask_return() for PPC32
From: Christophe Leroy @ 2021-01-25 14:48 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman, npiggin,
	msuchanek
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <cover.1611585031.git.christophe.leroy@csgroup.eu>

To allow building syscall_64.c smoothly on PPC32, add stub version
of irq_soft_mask_return().

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/include/asm/hw_irq.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h
index 4739f61e632c..56a98936a6a9 100644
--- a/arch/powerpc/include/asm/hw_irq.h
+++ b/arch/powerpc/include/asm/hw_irq.h
@@ -330,6 +330,11 @@ static inline void irq_soft_mask_regs_set_state(struct pt_regs *regs, unsigned l
 }
 #else /* CONFIG_PPC64 */
 
+static inline notrace unsigned long irq_soft_mask_return(void)
+{
+	return 0;
+}
+
 static inline unsigned long arch_local_save_flags(void)
 {
 	return mfmsr();
-- 
2.25.0


^ permalink raw reply related

* [PATCH v4 11/23] powerpc/syscall: Rename syscall_64.c into syscall.c
From: Christophe Leroy @ 2021-01-25 14:48 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman, npiggin,
	msuchanek
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <cover.1611585031.git.christophe.leroy@csgroup.eu>

syscall_64.c will be reused almost as is for PPC32.

Rename it syscall.c

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/kernel/Makefile                    | 2 +-
 arch/powerpc/kernel/{syscall_64.c => syscall.c} | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename arch/powerpc/kernel/{syscall_64.c => syscall.c} (100%)

diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index fe2ef598e2ea..1cbc51fc82fd 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -49,7 +49,7 @@ obj-y				:= cputable.o syscalls.o \
 				   hw_breakpoint_constraints.o
 obj-y				+= ptrace/
 obj-$(CONFIG_PPC64)		+= setup_64.o \
-				   paca.o nvram_64.o note.o syscall_64.o
+				   paca.o nvram_64.o note.o syscall.o
 obj-$(CONFIG_COMPAT)		+= sys_ppc32.o signal_32.o
 obj-$(CONFIG_VDSO32)		+= vdso32/
 obj-$(CONFIG_PPC_WATCHDOG)	+= watchdog.o
diff --git a/arch/powerpc/kernel/syscall_64.c b/arch/powerpc/kernel/syscall.c
similarity index 100%
rename from arch/powerpc/kernel/syscall_64.c
rename to arch/powerpc/kernel/syscall.c
-- 
2.25.0


^ permalink raw reply related

* [PATCH v4 12/23] powerpc/syscall: Make syscall.c buildable on PPC32
From: Christophe Leroy @ 2021-01-25 14:48 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman, npiggin,
	msuchanek
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <cover.1611585031.git.christophe.leroy@csgroup.eu>

ifdef out specific PPC64 stuff to allow building
syscall.c on PPC32.

Modify Makefile to always build syscall.o

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/kernel/Makefile  | 4 ++--
 arch/powerpc/kernel/syscall.c | 9 +++++----
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 1cbc51fc82fd..23c127db0d0c 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -46,10 +46,10 @@ obj-y				:= cputable.o syscalls.o \
 				   prom.o traps.o setup-common.o \
 				   udbg.o misc.o io.o misc_$(BITS).o \
 				   of_platform.o prom_parse.o firmware.o \
-				   hw_breakpoint_constraints.o
+				   hw_breakpoint_constraints.o syscall.o
 obj-y				+= ptrace/
 obj-$(CONFIG_PPC64)		+= setup_64.o \
-				   paca.o nvram_64.o note.o syscall.o
+				   paca.o nvram_64.o note.o
 obj-$(CONFIG_COMPAT)		+= sys_ppc32.o signal_32.o
 obj-$(CONFIG_VDSO32)		+= vdso32/
 obj-$(CONFIG_PPC_WATCHDOG)	+= watchdog.o
diff --git a/arch/powerpc/kernel/syscall.c b/arch/powerpc/kernel/syscall.c
index b627a6384029..bf9bf4b5bc41 100644
--- a/arch/powerpc/kernel/syscall.c
+++ b/arch/powerpc/kernel/syscall.c
@@ -39,7 +39,7 @@ notrace long system_call_exception(long r3, long r4, long r5,
 		BUG_ON(!(regs->msr & MSR_RI));
 	BUG_ON(!(regs->msr & MSR_PR));
 	BUG_ON(!FULL_REGS(regs));
-	BUG_ON(regs->softe != IRQS_ENABLED);
+	BUG_ON(arch_irq_disabled_regs(regs));
 
 #ifdef CONFIG_PPC_PKEY
 	if (mmu_has_feature(MMU_FTR_PKEY)) {
@@ -77,7 +77,7 @@ notrace long system_call_exception(long r3, long r4, long r5,
 	 * frame, or if the unwinder was taught the first stack frame always
 	 * returns to user with IRQS_ENABLED, this store could be avoided!
 	 */
-	regs->softe = IRQS_ENABLED;
+	irq_soft_mask_regs_set_state(regs, IRQS_ENABLED);
 
 	local_irq_enable();
 
@@ -147,6 +147,7 @@ static notrace inline bool prep_irq_for_enabled_exit(bool clear_ri)
 		__hard_EE_RI_disable();
 	else
 		__hard_irq_disable();
+#ifdef CONFIG_PPC64
 	if (unlikely(lazy_irq_pending_nocheck())) {
 		/* Took an interrupt, may have more exit work to do. */
 		if (clear_ri)
@@ -158,7 +159,7 @@ static notrace inline bool prep_irq_for_enabled_exit(bool clear_ri)
 	}
 	local_paca->irq_happened = 0;
 	irq_soft_mask_set(IRQS_ENABLED);
-
+#endif
 	return true;
 }
 
@@ -281,7 +282,7 @@ notrace unsigned long syscall_exit_prepare(unsigned long r3,
 	return ret;
 }
 
-#ifdef CONFIG_PPC_BOOK3S /* BOOK3E not yet using this */
+#ifdef CONFIG_PPC_BOOK3S_64 /* BOOK3E not yet using this */
 notrace unsigned long interrupt_exit_user_prepare(struct pt_regs *regs, unsigned long msr)
 {
 #ifdef CONFIG_PPC_BOOK3E
-- 
2.25.0


^ permalink raw reply related

* [PATCH v4 13/23] powerpc/syscall: Use is_compat_task()
From: Christophe Leroy @ 2021-01-25 14:48 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman, npiggin,
	msuchanek
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <cover.1611585031.git.christophe.leroy@csgroup.eu>

Instead of hard comparing task flags with _TIF_32BIT, use
is_compat_task(). The advantage is that it returns 0 on PPC32
allthough _TIF_32BIT is always set.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/kernel/syscall.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/syscall.c b/arch/powerpc/kernel/syscall.c
index bf9bf4b5bc41..cb415170b8f2 100644
--- a/arch/powerpc/kernel/syscall.c
+++ b/arch/powerpc/kernel/syscall.c
@@ -2,6 +2,8 @@
 
 #include <linux/context_tracking.h>
 #include <linux/err.h>
+#include <linux/compat.h>
+
 #include <asm/asm-prototypes.h>
 #include <asm/kup.h>
 #include <asm/cputime.h>
@@ -116,7 +118,7 @@ notrace long system_call_exception(long r3, long r4, long r5,
 	/* May be faster to do array_index_nospec? */
 	barrier_nospec();
 
-	if (unlikely(is_32bit_task())) {
+	if (unlikely(is_compat_task())) {
 		f = (void *)compat_sys_call_table[r0];
 
 		r3 &= 0x00000000ffffffffULL;
-- 
2.25.0


^ permalink raw reply related

* [PATCH v4 14/23] powerpc/syscall: Save r3 in regs->orig_r3
From: Christophe Leroy @ 2021-01-25 14:48 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman, npiggin,
	msuchanek
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <cover.1611585031.git.christophe.leroy@csgroup.eu>

Save r3 in regs->orig_r3 in system_call_exception()

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/kernel/entry_64.S | 1 -
 arch/powerpc/kernel/syscall.c  | 2 ++
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index aa1af139d947..a562a4240aa6 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -278,7 +278,6 @@ END_BTB_FLUSH_SECTION
 	std	r10,_LINK(r1)
 	std	r11,_TRAP(r1)
 	std	r12,_CCR(r1)
-	std	r3,ORIG_GPR3(r1)
 	addi	r10,r1,STACK_FRAME_OVERHEAD
 	ld	r11,exception_marker@toc(r2)
 	std	r11,-16(r10)		/* "regshere" marker */
diff --git a/arch/powerpc/kernel/syscall.c b/arch/powerpc/kernel/syscall.c
index cb415170b8f2..b66cfcbcb755 100644
--- a/arch/powerpc/kernel/syscall.c
+++ b/arch/powerpc/kernel/syscall.c
@@ -29,6 +29,8 @@ notrace long system_call_exception(long r3, long r4, long r5,
 {
 	syscall_fn f;
 
+	regs->orig_gpr3 = r3;
+
 	if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG))
 		BUG_ON(irq_soft_mask_return() != IRQS_ALL_DISABLED);
 
-- 
2.25.0


^ permalink raw reply related

* [PATCH v4 15/23] powerpc/syscall: Change condition to check MSR_RI
From: Christophe Leroy @ 2021-01-25 14:48 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman, npiggin,
	msuchanek
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <cover.1611585031.git.christophe.leroy@csgroup.eu>

In system_call_exception(), MSR_RI also needs to be checked on 8xx.
Only booke and 40x doesn't have MSR_RI.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/kernel/syscall.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/syscall.c b/arch/powerpc/kernel/syscall.c
index b66cfcbcb755..a3510fa4e641 100644
--- a/arch/powerpc/kernel/syscall.c
+++ b/arch/powerpc/kernel/syscall.c
@@ -39,7 +39,7 @@ notrace long system_call_exception(long r3, long r4, long r5,
 
 	trace_hardirqs_off(); /* finish reconciling */
 
-	if (IS_ENABLED(CONFIG_PPC_BOOK3S))
+	if (!IS_ENABLED(CONFIG_BOOKE) && !IS_ENABLED(CONFIG_40x))
 		BUG_ON(!(regs->msr & MSR_RI));
 	BUG_ON(!(regs->msr & MSR_PR));
 	BUG_ON(!FULL_REGS(regs));
-- 
2.25.0


^ permalink raw reply related

* [PATCH v4 16/23] powerpc/32: Always save non volatile GPRs at syscall entry
From: Christophe Leroy @ 2021-01-25 14:48 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman, npiggin,
	msuchanek
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <cover.1611585031.git.christophe.leroy@csgroup.eu>

In preparation for porting syscall entry/exit to C, inconditionally
save non volatile general purpose registers.

Commit 965dd3ad3076 ("powerpc/64/syscall: Remove non-volatile GPR save
optimisation") provides detailed explanation.

This increases the number of cycles by 24 cycles on 8xx with
null_syscall benchmark (280 => 304 cycles)

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/kernel/entry_32.S           | 46 +-----------------------
 arch/powerpc/kernel/head_32.h            |  2 +-
 arch/powerpc/kernel/head_booke.h         |  2 +-
 arch/powerpc/kernel/syscalls/syscall.tbl | 20 +++--------
 4 files changed, 8 insertions(+), 62 deletions(-)

diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index b1e36602c013..97dc28a68465 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -351,6 +351,7 @@ trace_syscall_entry_irq_off:
 
 	.globl	transfer_to_syscall
 transfer_to_syscall:
+	SAVE_NVGPRS(r1)
 #ifdef CONFIG_PPC_BOOK3S_32
 	kuep_lock r11, r12
 #endif
@@ -614,51 +615,6 @@ ret_from_kernel_syscall:
 #endif
 _ASM_NOKPROBE_SYMBOL(ret_from_kernel_syscall)
 
-/*
- * The fork/clone functions need to copy the full register set into
- * the child process. Therefore we need to save all the nonvolatile
- * registers (r13 - r31) before calling the C code.
- */
-	.globl	ppc_fork
-ppc_fork:
-	SAVE_NVGPRS(r1)
-	lwz	r0,_TRAP(r1)
-	rlwinm	r0,r0,0,0,30		/* clear LSB to indicate full */
-	stw	r0,_TRAP(r1)		/* register set saved */
-	b	sys_fork
-
-	.globl	ppc_vfork
-ppc_vfork:
-	SAVE_NVGPRS(r1)
-	lwz	r0,_TRAP(r1)
-	rlwinm	r0,r0,0,0,30		/* clear LSB to indicate full */
-	stw	r0,_TRAP(r1)		/* register set saved */
-	b	sys_vfork
-
-	.globl	ppc_clone
-ppc_clone:
-	SAVE_NVGPRS(r1)
-	lwz	r0,_TRAP(r1)
-	rlwinm	r0,r0,0,0,30		/* clear LSB to indicate full */
-	stw	r0,_TRAP(r1)		/* register set saved */
-	b	sys_clone
-
-	.globl	ppc_clone3
-ppc_clone3:
-	SAVE_NVGPRS(r1)
-	lwz	r0,_TRAP(r1)
-	rlwinm	r0,r0,0,0,30		/* clear LSB to indicate full */
-	stw	r0,_TRAP(r1)		/* register set saved */
-	b	sys_clone3
-
-	.globl	ppc_swapcontext
-ppc_swapcontext:
-	SAVE_NVGPRS(r1)
-	lwz	r0,_TRAP(r1)
-	rlwinm	r0,r0,0,0,30		/* clear LSB to indicate full */
-	stw	r0,_TRAP(r1)		/* register set saved */
-	b	sys_swapcontext
-
 /*
  * Top-level page fault handling.
  * This is in assembler because if do_page_fault tells us that
diff --git a/arch/powerpc/kernel/head_32.h b/arch/powerpc/kernel/head_32.h
index 24dc326e0d56..7b12736ec546 100644
--- a/arch/powerpc/kernel/head_32.h
+++ b/arch/powerpc/kernel/head_32.h
@@ -148,7 +148,7 @@
 	stw	r2,GPR2(r11)
 	addi	r10,r10,STACK_FRAME_REGS_MARKER@l
 	stw	r9,_MSR(r11)
-	li	r2, \trapno + 1
+	li	r2, \trapno
 	stw	r10,8(r11)
 	stw	r2,_TRAP(r11)
 	SAVE_GPR(0, r11)
diff --git a/arch/powerpc/kernel/head_booke.h b/arch/powerpc/kernel/head_booke.h
index b3c502c503a0..626e716576ce 100644
--- a/arch/powerpc/kernel/head_booke.h
+++ b/arch/powerpc/kernel/head_booke.h
@@ -124,7 +124,7 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_EMB_HV)
 	stw	r2,GPR2(r11)
 	addi	r12, r12, STACK_FRAME_REGS_MARKER@l
 	stw	r9,_MSR(r11)
-	li	r2, \trapno + 1
+	li	r2, \trapno
 	stw	r12, 8(r11)
 	stw	r2,_TRAP(r11)
 	SAVE_GPR(0, r11)
diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
index f744eb5cba88..96b2157f0371 100644
--- a/arch/powerpc/kernel/syscalls/syscall.tbl
+++ b/arch/powerpc/kernel/syscalls/syscall.tbl
@@ -9,9 +9,7 @@
 #
 0	nospu	restart_syscall			sys_restart_syscall
 1	nospu	exit				sys_exit
-2	32	fork				ppc_fork			sys_fork
-2	64	fork				sys_fork
-2	spu	fork				sys_ni_syscall
+2	nospu	fork				sys_fork
 3	common	read				sys_read
 4	common	write				sys_write
 5	common	open				sys_open			compat_sys_open
@@ -160,9 +158,7 @@
 119	32	sigreturn			sys_sigreturn			compat_sys_sigreturn
 119	64	sigreturn			sys_ni_syscall
 119	spu	sigreturn			sys_ni_syscall
-120	32	clone				ppc_clone			sys_clone
-120	64	clone				sys_clone
-120	spu	clone				sys_ni_syscall
+120	nospu	clone				sys_clone
 121	common	setdomainname			sys_setdomainname
 122	common	uname				sys_newuname
 123	common	modify_ldt			sys_ni_syscall
@@ -244,9 +240,7 @@
 186	spu	sendfile			sys_sendfile64
 187	common	getpmsg				sys_ni_syscall
 188	common 	putpmsg				sys_ni_syscall
-189	32	vfork				ppc_vfork			sys_vfork
-189	64	vfork				sys_vfork
-189	spu	vfork				sys_ni_syscall
+189	nospu	vfork				sys_vfork
 190	common	ugetrlimit			sys_getrlimit			compat_sys_getrlimit
 191	common	readahead			sys_readahead			compat_sys_readahead
 192	32	mmap2				sys_mmap2			compat_sys_mmap2
@@ -322,9 +316,7 @@
 248	32	clock_nanosleep			sys_clock_nanosleep_time32
 248	64	clock_nanosleep			sys_clock_nanosleep
 248	spu	clock_nanosleep			sys_clock_nanosleep
-249	32	swapcontext			ppc_swapcontext			compat_sys_swapcontext
-249	64	swapcontext			sys_swapcontext
-249	spu	swapcontext			sys_ni_syscall
+249	nospu	swapcontext			sys_swapcontext			compat_sys_swapcontext
 250	common	tgkill				sys_tgkill
 251	32	utimes				sys_utimes_time32
 251	64	utimes				sys_utimes
@@ -522,9 +514,7 @@
 432	common	fsmount				sys_fsmount
 433	common	fspick				sys_fspick
 434	common	pidfd_open			sys_pidfd_open
-435	32	clone3				ppc_clone3			sys_clone3
-435	64	clone3				sys_clone3
-435	spu	clone3				sys_ni_syscall
+435	nospu	clone3				sys_clone3
 436	common	close_range			sys_close_range
 437	common	openat2				sys_openat2
 438	common	pidfd_getfd			sys_pidfd_getfd
-- 
2.25.0


^ permalink raw reply related

* [PATCH v4 17/23] powerpc/syscall: implement system call entry/exit logic in C for PPC32
From: Christophe Leroy @ 2021-01-25 14:48 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman, npiggin,
	msuchanek
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <cover.1611585031.git.christophe.leroy@csgroup.eu>

That's port of PPC64 syscall entry/exit logic in C to PPC32.

Performancewise on 8xx:
Before : 304 cycles on null_syscall
After  : 348 cycles on null_syscall

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/kernel/entry_32.S   | 227 ++++---------------------------
 arch/powerpc/kernel/head_32.h    |  16 ---
 arch/powerpc/kernel/head_booke.h |  15 --
 3 files changed, 29 insertions(+), 229 deletions(-)

diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index 97dc28a68465..ce5fdb23ed7c 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -329,117 +329,22 @@ stack_ovf:
 _ASM_NOKPROBE_SYMBOL(stack_ovf)
 #endif
 
-#ifdef CONFIG_TRACE_IRQFLAGS
-trace_syscall_entry_irq_off:
-	/*
-	 * Syscall shouldn't happen while interrupts are disabled,
-	 * so let's do a warning here.
-	 */
-0:	trap
-	EMIT_BUG_ENTRY 0b,__FILE__,__LINE__, BUGFLAG_WARNING
-	bl	trace_hardirqs_on
-
-	/* Now enable for real */
-	LOAD_REG_IMMEDIATE(r10, MSR_KERNEL | MSR_EE)
-	mtmsr	r10
-
-	REST_GPR(0, r1)
-	REST_4GPRS(3, r1)
-	REST_2GPRS(7, r1)
-	b	DoSyscall
-#endif /* CONFIG_TRACE_IRQFLAGS */
-
 	.globl	transfer_to_syscall
 transfer_to_syscall:
 	SAVE_NVGPRS(r1)
 #ifdef CONFIG_PPC_BOOK3S_32
 	kuep_lock r11, r12
 #endif
-#ifdef CONFIG_TRACE_IRQFLAGS
-	andi.	r12,r9,MSR_EE
-	beq-	trace_syscall_entry_irq_off
-#endif /* CONFIG_TRACE_IRQFLAGS */
 
-/*
- * Handle a system call.
- */
-	.stabs	"arch/powerpc/kernel/",N_SO,0,0,0f
-	.stabs	"entry_32.S",N_SO,0,0,0f
-0:
-
-_GLOBAL(DoSyscall)
-	stw	r3,ORIG_GPR3(r1)
-	li	r12,0
-	stw	r12,RESULT(r1)
-#ifdef CONFIG_TRACE_IRQFLAGS
-	/* Make sure interrupts are enabled */
-	mfmsr	r11
-	andi.	r12,r11,MSR_EE
-	/* We came in with interrupts disabled, we WARN and mark them enabled
-	 * for lockdep now */
-0:	tweqi	r12, 0
-	EMIT_BUG_ENTRY 0b,__FILE__,__LINE__, BUGFLAG_WARNING
-#endif /* CONFIG_TRACE_IRQFLAGS */
-	lwz	r11,TI_FLAGS(r2)
-	andi.	r11,r11,_TIF_SYSCALL_DOTRACE
-	bne-	syscall_dotrace
-syscall_dotrace_cont:
-	cmplwi	0,r0,NR_syscalls
-	lis	r10,sys_call_table@h
-	ori	r10,r10,sys_call_table@l
-	slwi	r0,r0,2
-	bge-	66f
-
-	barrier_nospec_asm
-	/*
-	 * Prevent the load of the handler below (based on the user-passed
-	 * system call number) being speculatively executed until the test
-	 * against NR_syscalls and branch to .66f above has
-	 * committed.
-	 */
+	/* Calling convention has r9 = orig r0, r10 = regs */
+	mr	r9,r0
+	addi	r10,r1,STACK_FRAME_OVERHEAD
+	bl	system_call_exception
 
-	lwzx	r10,r10,r0	/* Fetch system call handler [ptr] */
-	mtlr	r10
-	addi	r9,r1,STACK_FRAME_OVERHEAD
-	PPC440EP_ERR42
-	blrl			/* Call handler */
-	.globl	ret_from_syscall
 ret_from_syscall:
-#ifdef CONFIG_DEBUG_RSEQ
-	/* Check whether the syscall is issued inside a restartable sequence */
-	stw	r3,GPR3(r1)
-	addi    r3,r1,STACK_FRAME_OVERHEAD
-	bl      rseq_syscall
-	lwz	r3,GPR3(r1)
-#endif
-	mr	r6,r3
-	/* disable interrupts so current_thread_info()->flags can't change */
-	LOAD_REG_IMMEDIATE(r10,MSR_KERNEL)	/* doesn't include MSR_EE */
-	/* Note: We don't bother telling lockdep about it */
-	mtmsr	r10
-	lwz	r9,TI_FLAGS(r2)
-	li	r8,-MAX_ERRNO
-	andi.	r0,r9,(_TIF_SYSCALL_DOTRACE|_TIF_SINGLESTEP|_TIF_USER_WORK_MASK|_TIF_PERSYSCALL_MASK)
-	bne-	syscall_exit_work
-	cmplw	0,r3,r8
-	blt+	syscall_exit_cont
-	lwz	r11,_CCR(r1)			/* Load CR */
-	neg	r3,r3
-	oris	r11,r11,0x1000	/* Set SO bit in CR */
-	stw	r11,_CCR(r1)
-syscall_exit_cont:
-	lwz	r8,_MSR(r1)
-#ifdef CONFIG_TRACE_IRQFLAGS
-	/* If we are going to return from the syscall with interrupts
-	 * off, we trace that here. It shouldn't normally happen.
-	 */
-	andi.	r10,r8,MSR_EE
-	bne+	1f
-	stw	r3,GPR3(r1)
-	bl      trace_hardirqs_off
-	lwz	r3,GPR3(r1)
-1:
-#endif /* CONFIG_TRACE_IRQFLAGS */
+	addi    r4,r1,STACK_FRAME_OVERHEAD
+	li	r5,0
+	bl	syscall_exit_prepare
 #if defined(CONFIG_4xx) || defined(CONFIG_BOOKE)
 	/* If the process has its own DBCR0 value, load it up.  The internal
 	   debug mode bit tells us that dbcr0 should be loaded. */
@@ -453,34 +358,39 @@ syscall_exit_cont:
 	cmplwi	cr0,r5,0
 	bne-	2f
 #endif /* CONFIG_PPC_47x */
-1:
-BEGIN_FTR_SECTION
-	lwarx	r7,0,r1
-END_FTR_SECTION_IFSET(CPU_FTR_NEED_PAIRED_STWCX)
-	stwcx.	r0,0,r1			/* to clear the reservation */
-	ACCOUNT_CPU_USER_EXIT(r2, r5, r7)
 #ifdef CONFIG_PPC_BOOK3S_32
 	kuep_unlock r5, r7
 #endif
-	kuap_check r2, r4
 	lwz	r4,_LINK(r1)
 	lwz	r5,_CCR(r1)
 	mtlr	r4
-	mtcr	r5
 	lwz	r7,_NIP(r1)
-	lwz	r2,GPR2(r1)
-	lwz	r1,GPR1(r1)
+	lwz	r8,_MSR(r1)
+	cmpwi	r3,0
+	lwz	r3,GPR3(r1)
 syscall_exit_finish:
-#if defined(CONFIG_PPC_8xx) && defined(CONFIG_PERF_EVENTS)
-	mtspr	SPRN_NRI, r0
-#endif
 	mtspr	SPRN_SRR0,r7
 	mtspr	SPRN_SRR1,r8
+
+	bne	3f
+	mtcr	r5
+
+1:	lwz	r2,GPR2(r1)
+	lwz	r1,GPR1(r1)
 	rfi
-#ifdef CONFIG_40x
-	b .	/* Prevent prefetch past rfi */
-#endif
-_ASM_NOKPROBE_SYMBOL(syscall_exit_finish)
+
+3:	mtcr	r5
+	lwz	r4,_CTR(r1)
+	lwz	r5,_XER(r1)
+	REST_NVGPRS(r1)
+	mtctr	r4
+	mtxer	r5
+	lwz	r0,GPR0(r1)
+	lwz	r3,GPR3(r1)
+	REST_8GPRS(4,r1)
+	lwz	r12,GPR12(r1)
+	b	1b
+
 #ifdef CONFIG_44x
 2:	li	r7,0
 	iccci	r0,r0
@@ -488,9 +398,6 @@ _ASM_NOKPROBE_SYMBOL(syscall_exit_finish)
 	b	1b
 #endif  /* CONFIG_44x */
 
-66:	li	r3,-ENOSYS
-	b	ret_from_syscall
-
 	.globl	ret_from_fork
 ret_from_fork:
 	REST_NVGPRS(r1)
@@ -509,82 +416,6 @@ ret_from_kernel_thread:
 	li	r3,0
 	b	ret_from_syscall
 
-/* Traced system call support */
-syscall_dotrace:
-	SAVE_NVGPRS(r1)
-	li	r0,0xc00
-	stw	r0,_TRAP(r1)
-	addi	r3,r1,STACK_FRAME_OVERHEAD
-	bl	do_syscall_trace_enter
-	/*
-	 * Restore argument registers possibly just changed.
-	 * We use the return value of do_syscall_trace_enter
-	 * for call number to look up in the table (r0).
-	 */
-	mr	r0,r3
-	lwz	r3,GPR3(r1)
-	lwz	r4,GPR4(r1)
-	lwz	r5,GPR5(r1)
-	lwz	r6,GPR6(r1)
-	lwz	r7,GPR7(r1)
-	lwz	r8,GPR8(r1)
-	REST_NVGPRS(r1)
-
-	cmplwi	r0,NR_syscalls
-	/* Return code is already in r3 thanks to do_syscall_trace_enter() */
-	bge-	ret_from_syscall
-	b	syscall_dotrace_cont
-
-syscall_exit_work:
-	andi.	r0,r9,_TIF_RESTOREALL
-	beq+	0f
-	REST_NVGPRS(r1)
-	b	2f
-0:	cmplw	0,r3,r8
-	blt+	1f
-	andi.	r0,r9,_TIF_NOERROR
-	bne-	1f
-	lwz	r11,_CCR(r1)			/* Load CR */
-	neg	r3,r3
-	oris	r11,r11,0x1000	/* Set SO bit in CR */
-	stw	r11,_CCR(r1)
-
-1:	stw	r6,RESULT(r1)	/* Save result */
-	stw	r3,GPR3(r1)	/* Update return value */
-2:	andi.	r0,r9,(_TIF_PERSYSCALL_MASK)
-	beq	4f
-
-	/* Clear per-syscall TIF flags if any are set.  */
-
-	li	r11,_TIF_PERSYSCALL_MASK
-	addi	r12,r2,TI_FLAGS
-3:	lwarx	r8,0,r12
-	andc	r8,r8,r11
-	stwcx.	r8,0,r12
-	bne-	3b
-	
-4:	/* Anything which requires enabling interrupts? */
-	andi.	r0,r9,(_TIF_SYSCALL_DOTRACE|_TIF_SINGLESTEP)
-	beq	ret_from_except
-
-	/* Re-enable interrupts. There is no need to trace that with
-	 * lockdep as we are supposed to have IRQs on at this point
-	 */
-	ori	r10,r10,MSR_EE
-	mtmsr	r10
-
-	/* Save NVGPRS if they're not saved already */
-	lwz	r4,_TRAP(r1)
-	andi.	r4,r4,1
-	beq	5f
-	SAVE_NVGPRS(r1)
-	li	r4,0xc00
-	stw	r4,_TRAP(r1)
-5:
-	addi	r3,r1,STACK_FRAME_OVERHEAD
-	bl	do_syscall_trace_leave
-	b	ret_from_except_full
-
 	/*
 	 * System call was called from kernel. We get here with SRR1 in r9.
 	 * Mark the exception as recoverable once we have retrieved SRR0,
diff --git a/arch/powerpc/kernel/head_32.h b/arch/powerpc/kernel/head_32.h
index 7b12736ec546..c2aa0d8f1f63 100644
--- a/arch/powerpc/kernel/head_32.h
+++ b/arch/powerpc/kernel/head_32.h
@@ -162,9 +162,6 @@
 	   internal debug mode bit to do this. */
 	lwz	r12,THREAD_DBCR0(r12)
 	andis.	r12,r12,DBCR0_IDM@h
-#endif
-	ACCOUNT_CPU_USER_ENTRY(r2, r11, r12)
-#if defined(CONFIG_40x)
 	beq+	3f
 	/* From user and task is ptraced - load up global dbcr0 */
 	li	r12,-1			/* clear all pending debug events */
@@ -176,21 +173,8 @@
 	lwz	r12,4(r11)
 	addi	r12,r12,-1
 	stw	r12,4(r11)
-#endif
-
 3:
-#ifdef CONFIG_TRACE_IRQFLAGS
-	/*
-	 * If MSR is changing we need to keep interrupts disabled at this point
-	 * otherwise we might risk taking an interrupt before we tell lockdep
-	 * they are enabled.
-	 */
-	LOAD_REG_IMMEDIATE(r10, MSR_KERNEL)
-	rlwimi	r10, r9, 0, MSR_EE
-#else
-	LOAD_REG_IMMEDIATE(r10, MSR_KERNEL | MSR_EE)
 #endif
-	mtmsr	r10
 	b	transfer_to_syscall		/* jump to handler */
 99:	b	ret_from_kernel_syscall
 .endm
diff --git a/arch/powerpc/kernel/head_booke.h b/arch/powerpc/kernel/head_booke.h
index 626e716576ce..faff094b650e 100644
--- a/arch/powerpc/kernel/head_booke.h
+++ b/arch/powerpc/kernel/head_booke.h
@@ -138,7 +138,6 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_EMB_HV)
 	   internal debug mode bit to do this. */
 	lwz	r12,THREAD_DBCR0(r10)
 	andis.	r12,r12,DBCR0_IDM@h
-	ACCOUNT_CPU_USER_ENTRY(r2, r11, r12)
 	beq+	3f
 	/* From user and task is ptraced - load up global dbcr0 */
 	li	r12,-1			/* clear all pending debug events */
@@ -157,20 +156,6 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_EMB_HV)
 	stw	r12,4(r11)
 
 3:
-#ifdef CONFIG_TRACE_IRQFLAGS
-	/*
-	 * If MSR is changing we need to keep interrupts disabled at this point
-	 * otherwise we might risk taking an interrupt before we tell lockdep
-	 * they are enabled.
-	 */
-	lis	r10, MSR_KERNEL@h
-	ori	r10, r10, MSR_KERNEL@l
-	rlwimi	r10, r9, 0, MSR_EE
-#else
-	lis	r10, (MSR_KERNEL | MSR_EE)@h
-	ori	r10, r10, (MSR_KERNEL | MSR_EE)@l
-#endif
-	mtmsr	r10
 	b	transfer_to_syscall	/* jump to handler */
 99:	b	ret_from_kernel_syscall
 .endm
-- 
2.25.0


^ permalink raw reply related

* [PATCH v4 18/23] powerpc/32: Remove verification of MSR_PR on syscall in the ASM entry
From: Christophe Leroy @ 2021-01-25 14:48 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman, npiggin,
	msuchanek
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <cover.1611585031.git.christophe.leroy@csgroup.eu>

system_call_exception() checks MSR_PR and BUGs if a syscall
is issued from kernel mode.

No need to handle it anymore from the ASM entry code.

null_syscall reduction 2 cycles (348 => 346 cycles)

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/kernel/entry_32.S   | 30 ------------------------------
 arch/powerpc/kernel/head_32.h    |  3 ---
 arch/powerpc/kernel/head_booke.h |  3 ---
 3 files changed, 36 deletions(-)

diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index ce5fdb23ed7c..9922a04650f7 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -416,36 +416,6 @@ ret_from_kernel_thread:
 	li	r3,0
 	b	ret_from_syscall
 
-	/*
-	 * System call was called from kernel. We get here with SRR1 in r9.
-	 * Mark the exception as recoverable once we have retrieved SRR0,
-	 * trap a warning and return ENOSYS with CR[SO] set.
-	 */
-	.globl	ret_from_kernel_syscall
-ret_from_kernel_syscall:
-	mfspr	r9, SPRN_SRR0
-	mfspr	r10, SPRN_SRR1
-#if !defined(CONFIG_4xx) && !defined(CONFIG_BOOKE)
-	LOAD_REG_IMMEDIATE(r11, MSR_KERNEL & ~(MSR_IR|MSR_DR))
-	mtmsr	r11
-#endif
-
-0:	trap
-	EMIT_BUG_ENTRY 0b,__FILE__,__LINE__, BUGFLAG_WARNING
-
-	li	r3, ENOSYS
-	crset	so
-#if defined(CONFIG_PPC_8xx) && defined(CONFIG_PERF_EVENTS)
-	mtspr	SPRN_NRI, r0
-#endif
-	mtspr	SPRN_SRR0, r9
-	mtspr	SPRN_SRR1, r10
-	rfi
-#ifdef CONFIG_40x
-	b .	/* Prevent prefetch past rfi */
-#endif
-_ASM_NOKPROBE_SYMBOL(ret_from_kernel_syscall)
-
 /*
  * Top-level page fault handling.
  * This is in assembler because if do_page_fault tells us that
diff --git a/arch/powerpc/kernel/head_32.h b/arch/powerpc/kernel/head_32.h
index c2aa0d8f1f63..c0de4acbe3f8 100644
--- a/arch/powerpc/kernel/head_32.h
+++ b/arch/powerpc/kernel/head_32.h
@@ -118,8 +118,6 @@
 .macro SYSCALL_ENTRY trapno
 	mfspr	r9, SPRN_SRR1
 	mfspr	r10, SPRN_SRR0
-	andi.	r11, r9, MSR_PR
-	beq-	99f
 	LOAD_REG_IMMEDIATE(r11, MSR_KERNEL)		/* can take exceptions */
 	lis	r12, 1f@h
 	ori	r12, r12, 1f@l
@@ -176,7 +174,6 @@
 3:
 #endif
 	b	transfer_to_syscall		/* jump to handler */
-99:	b	ret_from_kernel_syscall
 .endm
 
 .macro save_dar_dsisr_on_stack reg1, reg2, sp
diff --git a/arch/powerpc/kernel/head_booke.h b/arch/powerpc/kernel/head_booke.h
index faff094b650e..7af84e1e717b 100644
--- a/arch/powerpc/kernel/head_booke.h
+++ b/arch/powerpc/kernel/head_booke.h
@@ -106,10 +106,8 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_EMB_HV)
 #endif
 	mfspr	r9, SPRN_SRR1
 	BOOKE_CLEAR_BTB(r11)
-	andi.	r11, r9, MSR_PR
 	lwz	r11, TASK_STACK - THREAD(r10)
 	rlwinm	r12,r12,0,4,2	/* Clear SO bit in CR */
-	beq-	99f
 	ALLOC_STACK_FRAME(r11, THREAD_SIZE - INT_FRAME_SIZE)
 	stw	r12, _CCR(r11)		/* save various registers */
 	mflr	r12
@@ -157,7 +155,6 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_EMB_HV)
 
 3:
 	b	transfer_to_syscall	/* jump to handler */
-99:	b	ret_from_kernel_syscall
 .endm
 
 /* To handle the additional exception priority levels on 40x and Book-E
-- 
2.25.0


^ permalink raw reply related

* [PATCH v4 19/23] powerpc/syscall: Avoid stack frame in likely part of system_call_exception()
From: Christophe Leroy @ 2021-01-25 14:48 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman, npiggin,
	msuchanek
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <cover.1611585031.git.christophe.leroy@csgroup.eu>

When r3 is not modified, reload it from regs->orig_r3 to free
volatile registers. This avoids a stack frame for the likely part
of system_call_exception()

Before the patch:

c000b4d4 <system_call_exception>:
c000b4d4:	7c 08 02 a6 	mflr    r0
c000b4d8:	94 21 ff e0 	stwu    r1,-32(r1)
c000b4dc:	93 e1 00 1c 	stw     r31,28(r1)
c000b4e0:	90 01 00 24 	stw     r0,36(r1)
c000b4e4:	90 6a 00 88 	stw     r3,136(r10)
c000b4e8:	81 6a 00 84 	lwz     r11,132(r10)
c000b4ec:	69 6b 00 02 	xori    r11,r11,2
c000b4f0:	55 6b ff fe 	rlwinm  r11,r11,31,31,31
c000b4f4:	0f 0b 00 00 	twnei   r11,0
c000b4f8:	81 6a 00 a0 	lwz     r11,160(r10)
c000b4fc:	55 6b 07 fe 	clrlwi  r11,r11,31
c000b500:	0f 0b 00 00 	twnei   r11,0
c000b504:	7c 0c 42 e6 	mftb    r0
c000b508:	83 e2 00 08 	lwz     r31,8(r2)
c000b50c:	81 82 00 28 	lwz     r12,40(r2)
c000b510:	90 02 00 24 	stw     r0,36(r2)
c000b514:	7d 8c f8 50 	subf    r12,r12,r31
c000b518:	7c 0c 02 14 	add     r0,r12,r0
c000b51c:	90 02 00 08 	stw     r0,8(r2)
c000b520:	7c 10 13 a6 	mtspr   80,r0
c000b524:	81 62 00 70 	lwz     r11,112(r2)
c000b528:	71 60 86 91 	andi.   r0,r11,34449
c000b52c:	40 82 00 34 	bne     c000b560 <system_call_exception+0x8c>
c000b530:	2b 89 01 b6 	cmplwi  cr7,r9,438
c000b534:	41 9d 00 64 	bgt     cr7,c000b598 <system_call_exception+0xc4>
c000b538:	3d 40 c0 5c 	lis     r10,-16292
c000b53c:	55 29 10 3a 	rlwinm  r9,r9,2,0,29
c000b540:	39 4a 41 e8 	addi    r10,r10,16872
c000b544:	80 01 00 24 	lwz     r0,36(r1)
c000b548:	7d 2a 48 2e 	lwzx    r9,r10,r9
c000b54c:	7c 08 03 a6 	mtlr    r0
c000b550:	7d 29 03 a6 	mtctr   r9
c000b554:	83 e1 00 1c 	lwz     r31,28(r1)
c000b558:	38 21 00 20 	addi    r1,r1,32
c000b55c:	4e 80 04 20 	bctr

After the patch:

c000b4d4 <system_call_exception>:
c000b4d4:	81 6a 00 84 	lwz     r11,132(r10)
c000b4d8:	90 6a 00 88 	stw     r3,136(r10)
c000b4dc:	69 6b 00 02 	xori    r11,r11,2
c000b4e0:	55 6b ff fe 	rlwinm  r11,r11,31,31,31
c000b4e4:	0f 0b 00 00 	twnei   r11,0
c000b4e8:	80 6a 00 a0 	lwz     r3,160(r10)
c000b4ec:	54 63 07 fe 	clrlwi  r3,r3,31
c000b4f0:	0f 03 00 00 	twnei   r3,0
c000b4f4:	7d 6c 42 e6 	mftb    r11
c000b4f8:	81 82 00 08 	lwz     r12,8(r2)
c000b4fc:	80 02 00 28 	lwz     r0,40(r2)
c000b500:	91 62 00 24 	stw     r11,36(r2)
c000b504:	7c 00 60 50 	subf    r0,r0,r12
c000b508:	7d 60 5a 14 	add     r11,r0,r11
c000b50c:	91 62 00 08 	stw     r11,8(r2)
c000b510:	7c 10 13 a6 	mtspr   80,r0
c000b514:	80 62 00 70 	lwz     r3,112(r2)
c000b518:	70 6b 86 91 	andi.   r11,r3,34449
c000b51c:	40 82 00 28 	bne     c000b544 <system_call_exception+0x70>
c000b520:	2b 89 01 b6 	cmplwi  cr7,r9,438
c000b524:	41 9d 00 84 	bgt     cr7,c000b5a8 <system_call_exception+0xd4>
c000b528:	80 6a 00 88 	lwz     r3,136(r10)
c000b52c:	3d 40 c0 5c 	lis     r10,-16292
c000b530:	55 29 10 3a 	rlwinm  r9,r9,2,0,29
c000b534:	39 4a 41 e4 	addi    r10,r10,16868
c000b538:	7d 2a 48 2e 	lwzx    r9,r10,r9
c000b53c:	7d 29 03 a6 	mtctr   r9
c000b540:	4e 80 04 20 	bctr

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/kernel/syscall.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/powerpc/kernel/syscall.c b/arch/powerpc/kernel/syscall.c
index a3510fa4e641..476909b11051 100644
--- a/arch/powerpc/kernel/syscall.c
+++ b/arch/powerpc/kernel/syscall.c
@@ -115,6 +115,9 @@ notrace long system_call_exception(long r3, long r4, long r5,
 			return regs->gpr[3];
 		}
 		return -ENOSYS;
+	} else {
+		/* Restore r3 from orig_gpr3 to free up a volatile reg */
+		r3 = regs->orig_gpr3;
 	}
 
 	/* May be faster to do array_index_nospec? */
-- 
2.25.0


^ permalink raw reply related

* [PATCH v4 20/23] powerpc/syscall: Do not check unsupported scv vector on PPC32
From: Christophe Leroy @ 2021-01-25 14:48 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman, npiggin,
	msuchanek
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <cover.1611585031.git.christophe.leroy@csgroup.eu>

Only PPC64 has scv. No need to check the 0x7ff0 trap on PPC32.

And ignore the scv parameter in syscall_exit_prepare (Save 14 cycles
346 => 332 cycles)

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/kernel/entry_32.S | 1 -
 arch/powerpc/kernel/syscall.c  | 7 +++++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index 9922a04650f7..6ae9c7bcb06c 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -343,7 +343,6 @@ transfer_to_syscall:
 
 ret_from_syscall:
 	addi    r4,r1,STACK_FRAME_OVERHEAD
-	li	r5,0
 	bl	syscall_exit_prepare
 #if defined(CONFIG_4xx) || defined(CONFIG_BOOKE)
 	/* If the process has its own DBCR0 value, load it up.  The internal
diff --git a/arch/powerpc/kernel/syscall.c b/arch/powerpc/kernel/syscall.c
index 476909b11051..30f8a397a522 100644
--- a/arch/powerpc/kernel/syscall.c
+++ b/arch/powerpc/kernel/syscall.c
@@ -86,7 +86,7 @@ notrace long system_call_exception(long r3, long r4, long r5,
 	local_irq_enable();
 
 	if (unlikely(current_thread_info()->flags & _TIF_SYSCALL_DOTRACE)) {
-		if (unlikely(regs->trap == 0x7ff0)) {
+		if (IS_ENABLED(CONFIG_PPC64) && unlikely(regs->trap == 0x7ff0)) {
 			/* Unsupported scv vector */
 			_exception(SIGILL, regs, ILL_ILLOPC, regs->nip);
 			return regs->gpr[3];
@@ -109,7 +109,7 @@ notrace long system_call_exception(long r3, long r4, long r5,
 		r8 = regs->gpr[8];
 
 	} else if (unlikely(r0 >= NR_syscalls)) {
-		if (unlikely(regs->trap == 0x7ff0)) {
+		if (IS_ENABLED(CONFIG_PPC64) && unlikely(regs->trap == 0x7ff0)) {
 			/* Unsupported scv vector */
 			_exception(SIGILL, regs, ILL_ILLOPC, regs->nip);
 			return regs->gpr[3];
@@ -187,6 +187,9 @@ notrace unsigned long syscall_exit_prepare(unsigned long r3,
 	unsigned long ti_flags;
 	unsigned long ret = 0;
 
+	if (IS_ENABLED(CONFIG_PPC32))
+		scv = 0;
+
 	CT_WARN_ON(ct_state() == CONTEXT_USER);
 
 	kuap_check();
-- 
2.25.0


^ permalink raw reply related

* [PATCH v4 21/23] powerpc/syscall: Remove FULL_REGS verification in system_call_exception
From: Christophe Leroy @ 2021-01-25 14:48 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman, npiggin,
	msuchanek
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <cover.1611585031.git.christophe.leroy@csgroup.eu>

For book3s/64, FULL_REGS() is 'true' at all time, so the test voids.
For others, non volatile registers are saved inconditionally.

So the verification is pointless.

Should one fail to do it, it would anyway be caught by the
CHECK_FULL_REGS() in copy_thread() as we have removed the
special versions ppc_fork() and friends.

null_syscall benchmark reduction 4 cycles (332 => 328 cycles)

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/kernel/syscall.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/powerpc/kernel/syscall.c b/arch/powerpc/kernel/syscall.c
index 30f8a397a522..a40775daa88b 100644
--- a/arch/powerpc/kernel/syscall.c
+++ b/arch/powerpc/kernel/syscall.c
@@ -42,7 +42,6 @@ notrace long system_call_exception(long r3, long r4, long r5,
 	if (!IS_ENABLED(CONFIG_BOOKE) && !IS_ENABLED(CONFIG_40x))
 		BUG_ON(!(regs->msr & MSR_RI));
 	BUG_ON(!(regs->msr & MSR_PR));
-	BUG_ON(!FULL_REGS(regs));
 	BUG_ON(arch_irq_disabled_regs(regs));
 
 #ifdef CONFIG_PPC_PKEY
-- 
2.25.0


^ permalink raw reply related

* [PATCH v4 22/23] powerpc/syscall: Optimise checks in beginning of system_call_exception()
From: Christophe Leroy @ 2021-01-25 14:48 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman, npiggin,
	msuchanek
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <cover.1611585031.git.christophe.leroy@csgroup.eu>

Combine all tests of regs->msr into a single logical one.

Before the patch:

   0:	81 6a 00 84 	lwz     r11,132(r10)
   4:	90 6a 00 88 	stw     r3,136(r10)
   8:	69 60 00 02 	xori    r0,r11,2
   c:	54 00 ff fe 	rlwinm  r0,r0,31,31,31
  10:	0f 00 00 00 	twnei   r0,0
  14:	69 63 40 00 	xori    r3,r11,16384
  18:	54 63 97 fe 	rlwinm  r3,r3,18,31,31
  1c:	0f 03 00 00 	twnei   r3,0
  20:	69 6b 80 00 	xori    r11,r11,32768
  24:	55 6b 8f fe 	rlwinm  r11,r11,17,31,31
  28:	0f 0b 00 00 	twnei   r11,0

After the patch:

   0:	81 6a 00 84 	lwz     r11,132(r10)
   4:	90 6a 00 88 	stw     r3,136(r10)
   8:	7d 6b 58 f8 	not     r11,r11
   c:	71 6b c0 02 	andi.   r11,r11,49154
  10:	0f 0b 00 00 	twnei   r11,0

6 cycles less on powerpc 8xx (328 => 322 cycles).

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/kernel/syscall.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kernel/syscall.c b/arch/powerpc/kernel/syscall.c
index a40775daa88b..47ae55f94d1c 100644
--- a/arch/powerpc/kernel/syscall.c
+++ b/arch/powerpc/kernel/syscall.c
@@ -28,6 +28,7 @@ notrace long system_call_exception(long r3, long r4, long r5,
 				   unsigned long r0, struct pt_regs *regs)
 {
 	syscall_fn f;
+	unsigned long expected_msr;
 
 	regs->orig_gpr3 = r3;
 
@@ -39,10 +40,13 @@ notrace long system_call_exception(long r3, long r4, long r5,
 
 	trace_hardirqs_off(); /* finish reconciling */
 
+	expected_msr = MSR_PR;
 	if (!IS_ENABLED(CONFIG_BOOKE) && !IS_ENABLED(CONFIG_40x))
-		BUG_ON(!(regs->msr & MSR_RI));
-	BUG_ON(!(regs->msr & MSR_PR));
-	BUG_ON(arch_irq_disabled_regs(regs));
+		expected_msr |= MSR_RI;
+	if (IS_ENABLED(CONFIG_PPC32))
+		expected_msr |= MSR_EE;
+	BUG_ON((regs->msr & expected_msr) ^ expected_msr);
+	BUG_ON(IS_ENABLED(CONFIG_PPC64) && arch_irq_disabled_regs(regs));
 
 #ifdef CONFIG_PPC_PKEY
 	if (mmu_has_feature(MMU_FTR_PKEY)) {
-- 
2.25.0


^ permalink raw reply related

* [PATCH v4 23/23] powerpc/syscall: Avoid storing 'current' in another pointer
From: Christophe Leroy @ 2021-01-25 14:48 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman, npiggin,
	msuchanek
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <cover.1611585031.git.christophe.leroy@csgroup.eu>

By saving the pointer pointing to thread_info.flags, gcc copies r2
in a non-volatile register.

We know 'current' doesn't change, so avoid that intermediaite pointer.

Reduces null_syscall benchmark by 2 cycles (322 => 320 cycles)

On PPC64, gcc seems to know that 'current' is not changing, and it keeps
it in a non volatile register to avoid multiple read of 'current' in paca.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/kernel/syscall.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/kernel/syscall.c b/arch/powerpc/kernel/syscall.c
index 47ae55f94d1c..72e0b18b88d8 100644
--- a/arch/powerpc/kernel/syscall.c
+++ b/arch/powerpc/kernel/syscall.c
@@ -186,7 +186,6 @@ notrace unsigned long syscall_exit_prepare(unsigned long r3,
 					   struct pt_regs *regs,
 					   long scv)
 {
-	unsigned long *ti_flagsp = &current_thread_info()->flags;
 	unsigned long ti_flags;
 	unsigned long ret = 0;
 
@@ -202,7 +201,7 @@ notrace unsigned long syscall_exit_prepare(unsigned long r3,
 	/* Check whether the syscall is issued inside a restartable sequence */
 	rseq_syscall(regs);
 
-	ti_flags = *ti_flagsp;
+	ti_flags = current_thread_info()->flags;
 
 	if (unlikely(r3 >= (unsigned long)-MAX_ERRNO) && !scv) {
 		if (likely(!(ti_flags & (_TIF_NOERROR | _TIF_RESTOREALL)))) {
@@ -216,7 +215,7 @@ notrace unsigned long syscall_exit_prepare(unsigned long r3,
 			ret = _TIF_RESTOREALL;
 		else
 			regs->gpr[3] = r3;
-		clear_bits(_TIF_PERSYSCALL_MASK, ti_flagsp);
+		clear_bits(_TIF_PERSYSCALL_MASK, &current_thread_info()->flags);
 	} else {
 		regs->gpr[3] = r3;
 	}
@@ -228,7 +227,7 @@ notrace unsigned long syscall_exit_prepare(unsigned long r3,
 
 again:
 	local_irq_disable();
-	ti_flags = READ_ONCE(*ti_flagsp);
+	ti_flags = READ_ONCE(current_thread_info()->flags);
 	while (unlikely(ti_flags & (_TIF_USER_WORK_MASK & ~_TIF_RESTORE_TM))) {
 		local_irq_enable();
 		if (ti_flags & _TIF_NEED_RESCHED) {
@@ -244,7 +243,7 @@ notrace unsigned long syscall_exit_prepare(unsigned long r3,
 			do_notify_resume(regs, ti_flags);
 		}
 		local_irq_disable();
-		ti_flags = READ_ONCE(*ti_flagsp);
+		ti_flags = READ_ONCE(current_thread_info()->flags);
 	}
 
 	if (IS_ENABLED(CONFIG_PPC_BOOK3S) && IS_ENABLED(CONFIG_PPC_FPU)) {
-- 
2.25.0


^ permalink raw reply related

* Re: [PATCH] PCI: dwc: layerscape: convert to builtin_platform_driver()
From: Lorenzo Pieralisi @ 2021-01-25 16:50 UTC (permalink / raw)
  To: Michael Walle
  Cc: Rob Herring, Saravana Kannan, Roy Zang, PCI, linux-kernel,
	Minghuan Lian, Mingkai Hu, Greg Kroah-Hartman, Bjorn Helgaas,
	linuxppc-dev, linux-arm-kernel
In-Reply-To: <c3e35b90e173b15870a859fd7001a712@walle.cc>

On Wed, Jan 20, 2021 at 08:28:36PM +0100, Michael Walle wrote:
> [RESEND, fat-fingered the buttons of my mail client and converted
> all CCs to BCCs :(]
> 
> Am 2021-01-20 20:02, schrieb Saravana Kannan:
> > On Wed, Jan 20, 2021 at 6:24 AM Rob Herring <robh@kernel.org> wrote:
> > > 
> > > On Wed, Jan 20, 2021 at 4:53 AM Michael Walle <michael@walle.cc>
> > > wrote:
> > > >
> > > > fw_devlink will defer the probe until all suppliers are ready. We can't
> > > > use builtin_platform_driver_probe() because it doesn't retry after probe
> > > > deferral. Convert it to builtin_platform_driver().
> > > 
> > > If builtin_platform_driver_probe() doesn't work with fw_devlink, then
> > > shouldn't it be fixed or removed?
> > 
> > I was actually thinking about this too. The problem with fixing
> > builtin_platform_driver_probe() to behave like
> > builtin_platform_driver() is that these probe functions could be
> > marked with __init. But there are also only 20 instances of
> > builtin_platform_driver_probe() in the kernel:
> > $ git grep ^builtin_platform_driver_probe | wc -l
> > 20
> > 
> > So it might be easier to just fix them to not use
> > builtin_platform_driver_probe().
> > 
> > Michael,
> > 
> > Any chance you'd be willing to help me by converting all these to
> > builtin_platform_driver() and delete builtin_platform_driver_probe()?
> 
> If it just moving the probe function to the _driver struct and
> remove the __init annotations. I could look into that.

Can I drop this patch then ?

Thanks,
Lorenzo

^ permalink raw reply

* Re: [PATCH] PCI: dwc: layerscape: convert to builtin_platform_driver()
From: Saravana Kannan @ 2021-01-25 18:58 UTC (permalink / raw)
  To: Lorenzo Pieralisi
  Cc: Rob Herring, Roy Zang, PCI, LKML, Minghuan Lian, Michael Walle,
	Mingkai Hu, Greg Kroah-Hartman, Bjorn Helgaas, linuxppc-dev,
	linux-arm-kernel
In-Reply-To: <20210125165041.GA5979@e121166-lin.cambridge.arm.com>

On Mon, Jan 25, 2021 at 8:50 AM Lorenzo Pieralisi
<lorenzo.pieralisi@arm.com> wrote:
>
> On Wed, Jan 20, 2021 at 08:28:36PM +0100, Michael Walle wrote:
> > [RESEND, fat-fingered the buttons of my mail client and converted
> > all CCs to BCCs :(]
> >
> > Am 2021-01-20 20:02, schrieb Saravana Kannan:
> > > On Wed, Jan 20, 2021 at 6:24 AM Rob Herring <robh@kernel.org> wrote:
> > > >
> > > > On Wed, Jan 20, 2021 at 4:53 AM Michael Walle <michael@walle.cc>
> > > > wrote:
> > > > >
> > > > > fw_devlink will defer the probe until all suppliers are ready. We can't
> > > > > use builtin_platform_driver_probe() because it doesn't retry after probe
> > > > > deferral. Convert it to builtin_platform_driver().
> > > >
> > > > If builtin_platform_driver_probe() doesn't work with fw_devlink, then
> > > > shouldn't it be fixed or removed?
> > >
> > > I was actually thinking about this too. The problem with fixing
> > > builtin_platform_driver_probe() to behave like
> > > builtin_platform_driver() is that these probe functions could be
> > > marked with __init. But there are also only 20 instances of
> > > builtin_platform_driver_probe() in the kernel:
> > > $ git grep ^builtin_platform_driver_probe | wc -l
> > > 20
> > >
> > > So it might be easier to just fix them to not use
> > > builtin_platform_driver_probe().
> > >
> > > Michael,
> > >
> > > Any chance you'd be willing to help me by converting all these to
> > > builtin_platform_driver() and delete builtin_platform_driver_probe()?
> >
> > If it just moving the probe function to the _driver struct and
> > remove the __init annotations. I could look into that.
>
> Can I drop this patch then ?

No, please pick it up. Michael and I were talking about doing similar
changes for other drivers.

-Saravana

^ permalink raw reply

* Re: [PATCH] PCI: dwc: layerscape: convert to builtin_platform_driver()
From: Michael Walle @ 2021-01-25 19:44 UTC (permalink / raw)
  To: Saravana Kannan
  Cc: Rob Herring, Lorenzo Pieralisi, Roy Zang, PCI, LKML,
	Minghuan Lian, Mingkai Hu, Greg Kroah-Hartman, Bjorn Helgaas,
	linuxppc-dev, linux-arm-kernel
In-Reply-To: <CAGETcx-P-MxM+49XdUGBmg5YgnHS=fmz8uewywXvLSFKj=MqRQ@mail.gmail.com>

Am 2021-01-25 19:58, schrieb Saravana Kannan:
> On Mon, Jan 25, 2021 at 8:50 AM Lorenzo Pieralisi
> <lorenzo.pieralisi@arm.com> wrote:
>> 
>> On Wed, Jan 20, 2021 at 08:28:36PM +0100, Michael Walle wrote:
>> > [RESEND, fat-fingered the buttons of my mail client and converted
>> > all CCs to BCCs :(]
>> >
>> > Am 2021-01-20 20:02, schrieb Saravana Kannan:
>> > > On Wed, Jan 20, 2021 at 6:24 AM Rob Herring <robh@kernel.org> wrote:
>> > > >
>> > > > On Wed, Jan 20, 2021 at 4:53 AM Michael Walle <michael@walle.cc>
>> > > > wrote:
>> > > > >
>> > > > > fw_devlink will defer the probe until all suppliers are ready. We can't
>> > > > > use builtin_platform_driver_probe() because it doesn't retry after probe
>> > > > > deferral. Convert it to builtin_platform_driver().
>> > > >
>> > > > If builtin_platform_driver_probe() doesn't work with fw_devlink, then
>> > > > shouldn't it be fixed or removed?
>> > >
>> > > I was actually thinking about this too. The problem with fixing
>> > > builtin_platform_driver_probe() to behave like
>> > > builtin_platform_driver() is that these probe functions could be
>> > > marked with __init. But there are also only 20 instances of
>> > > builtin_platform_driver_probe() in the kernel:
>> > > $ git grep ^builtin_platform_driver_probe | wc -l
>> > > 20
>> > >
>> > > So it might be easier to just fix them to not use
>> > > builtin_platform_driver_probe().
>> > >
>> > > Michael,
>> > >
>> > > Any chance you'd be willing to help me by converting all these to
>> > > builtin_platform_driver() and delete builtin_platform_driver_probe()?
>> >
>> > If it just moving the probe function to the _driver struct and
>> > remove the __init annotations. I could look into that.
>> 
>> Can I drop this patch then ?
> 
> No, please pick it up. Michael and I were talking about doing similar
> changes for other drivers.

Yes please, I was just about to answer, but Saravana beat me.

-michael

^ permalink raw reply

* Re: [PATCH] PCI: dwc: layerscape: convert to builtin_platform_driver()
From: Michael Walle @ 2021-01-25 19:49 UTC (permalink / raw)
  To: Geert Uytterhoeven
  Cc: Roy Zang, Lorenzo Pieralisi, Saravana Kannan, PCI, LKML,
	Minghuan Lian, linux-arm-kernel, Greg Kroah-Hartman,
	Bjorn Helgaas, linuxppc-dev, Mingkai Hu
In-Reply-To: <CAMuHMdWvFej-6vkaLM44t7eX2LpkDSXu4_7VH-X-3XRueXTO=A@mail.gmail.com>

Am 2021-01-21 12:01, schrieb Geert Uytterhoeven:
> Hi Saravana,
> 
> On Thu, Jan 21, 2021 at 1:05 AM Saravana Kannan <saravanak@google.com> 
> wrote:
>> On Wed, Jan 20, 2021 at 3:53 PM Michael Walle <michael@walle.cc> 
>> wrote:
>> > Am 2021-01-20 20:47, schrieb Saravana Kannan:
>> > > On Wed, Jan 20, 2021 at 11:28 AM Michael Walle <michael@walle.cc>
>> > > wrote:
>> > >>
>> > >> [RESEND, fat-fingered the buttons of my mail client and converted
>> > >> all CCs to BCCs :(]
>> > >>
>> > >> Am 2021-01-20 20:02, schrieb Saravana Kannan:
>> > >> > On Wed, Jan 20, 2021 at 6:24 AM Rob Herring <robh@kernel.org> wrote:
>> > >> >>
>> > >> >> On Wed, Jan 20, 2021 at 4:53 AM Michael Walle <michael@walle.cc>
>> > >> >> wrote:
>> > >> >> >
>> > >> >> > fw_devlink will defer the probe until all suppliers are ready. We can't
>> > >> >> > use builtin_platform_driver_probe() because it doesn't retry after probe
>> > >> >> > deferral. Convert it to builtin_platform_driver().
>> > >> >>
>> > >> >> If builtin_platform_driver_probe() doesn't work with fw_devlink, then
>> > >> >> shouldn't it be fixed or removed?
>> > >> >
>> > >> > I was actually thinking about this too. The problem with fixing
>> > >> > builtin_platform_driver_probe() to behave like
>> > >> > builtin_platform_driver() is that these probe functions could be
>> > >> > marked with __init. But there are also only 20 instances of
>> > >> > builtin_platform_driver_probe() in the kernel:
>> > >> > $ git grep ^builtin_platform_driver_probe | wc -l
>> > >> > 20
>> > >> >
>> > >> > So it might be easier to just fix them to not use
>> > >> > builtin_platform_driver_probe().
>> > >> >
>> > >> > Michael,
>> > >> >
>> > >> > Any chance you'd be willing to help me by converting all these to
>> > >> > builtin_platform_driver() and delete builtin_platform_driver_probe()?
>> > >>
>> > >> If it just moving the probe function to the _driver struct and
>> > >> remove the __init annotations. I could look into that.
>> > >
>> > > Yup. That's pretty much it AFAICT.
>> > >
>> > > builtin_platform_driver_probe() also makes sure the driver doesn't ask
>> > > for async probe, etc. But I doubt anyone is actually setting async
>> > > flags and still using builtin_platform_driver_probe().
>> >
>> > Hasn't module_platform_driver_probe() the same problem? And there
>> > are ~80 drivers which uses that.
>> 
>> Yeah. The biggest problem with all of these is the __init markers.
>> Maybe some familiar with coccinelle can help?
> 
> And dropping them will increase memory usage.

Although I do have the changes for the builtin_platform_driver_probe()
ready, I don't think it makes much sense to send these unless we agree
on the increased memory footprint. While there are just a few
builtin_platform_driver_probe() and memory increase _might_ be
negligible, there are many more module_platform_driver_probe().

-michael

^ permalink raw reply

* Re: [PATCH] PCI: dwc: layerscape: convert to builtin_platform_driver()
From: Saravana Kannan @ 2021-01-25 22:41 UTC (permalink / raw)
  To: Michael Walle
  Cc: Roy Zang, Lorenzo Pieralisi, PCI, LKML, Minghuan Lian,
	Geert Uytterhoeven, linux-arm-kernel, Greg Kroah-Hartman,
	Bjorn Helgaas, linuxppc-dev, Mingkai Hu
In-Reply-To: <a24391e62b107040435766fff52bdd31@walle.cc>

On Mon, Jan 25, 2021 at 11:49 AM Michael Walle <michael@walle.cc> wrote:
>
> Am 2021-01-21 12:01, schrieb Geert Uytterhoeven:
> > Hi Saravana,
> >
> > On Thu, Jan 21, 2021 at 1:05 AM Saravana Kannan <saravanak@google.com>
> > wrote:
> >> On Wed, Jan 20, 2021 at 3:53 PM Michael Walle <michael@walle.cc>
> >> wrote:
> >> > Am 2021-01-20 20:47, schrieb Saravana Kannan:
> >> > > On Wed, Jan 20, 2021 at 11:28 AM Michael Walle <michael@walle.cc>
> >> > > wrote:
> >> > >>
> >> > >> [RESEND, fat-fingered the buttons of my mail client and converted
> >> > >> all CCs to BCCs :(]
> >> > >>
> >> > >> Am 2021-01-20 20:02, schrieb Saravana Kannan:
> >> > >> > On Wed, Jan 20, 2021 at 6:24 AM Rob Herring <robh@kernel.org> wrote:
> >> > >> >>
> >> > >> >> On Wed, Jan 20, 2021 at 4:53 AM Michael Walle <michael@walle.cc>
> >> > >> >> wrote:
> >> > >> >> >
> >> > >> >> > fw_devlink will defer the probe until all suppliers are ready. We can't
> >> > >> >> > use builtin_platform_driver_probe() because it doesn't retry after probe
> >> > >> >> > deferral. Convert it to builtin_platform_driver().
> >> > >> >>
> >> > >> >> If builtin_platform_driver_probe() doesn't work with fw_devlink, then
> >> > >> >> shouldn't it be fixed or removed?
> >> > >> >
> >> > >> > I was actually thinking about this too. The problem with fixing
> >> > >> > builtin_platform_driver_probe() to behave like
> >> > >> > builtin_platform_driver() is that these probe functions could be
> >> > >> > marked with __init. But there are also only 20 instances of
> >> > >> > builtin_platform_driver_probe() in the kernel:
> >> > >> > $ git grep ^builtin_platform_driver_probe | wc -l
> >> > >> > 20
> >> > >> >
> >> > >> > So it might be easier to just fix them to not use
> >> > >> > builtin_platform_driver_probe().
> >> > >> >
> >> > >> > Michael,
> >> > >> >
> >> > >> > Any chance you'd be willing to help me by converting all these to
> >> > >> > builtin_platform_driver() and delete builtin_platform_driver_probe()?
> >> > >>
> >> > >> If it just moving the probe function to the _driver struct and
> >> > >> remove the __init annotations. I could look into that.
> >> > >
> >> > > Yup. That's pretty much it AFAICT.
> >> > >
> >> > > builtin_platform_driver_probe() also makes sure the driver doesn't ask
> >> > > for async probe, etc. But I doubt anyone is actually setting async
> >> > > flags and still using builtin_platform_driver_probe().
> >> >
> >> > Hasn't module_platform_driver_probe() the same problem? And there
> >> > are ~80 drivers which uses that.
> >>
> >> Yeah. The biggest problem with all of these is the __init markers.
> >> Maybe some familiar with coccinelle can help?
> >
> > And dropping them will increase memory usage.
>
> Although I do have the changes for the builtin_platform_driver_probe()
> ready, I don't think it makes much sense to send these unless we agree
> on the increased memory footprint. While there are just a few
> builtin_platform_driver_probe() and memory increase _might_ be
> negligible, there are many more module_platform_driver_probe().

While it's good to drop code that'll not be used past kernel init, the
module_platform_driver_probe() is going even more extreme. It doesn't
even allow deferred probe (well before kernel init is done). I don't
think that behavior is right and that's why we should delete it. Also,
I doubt if any of these probe functions even take up 4KB of memory.

-Saravana

^ permalink raw reply

* Re: [PATCH v2 2/2] memblock: do not start bottom-up allocations with kernel_end
From: Thiago Jung Bauermann @ 2021-01-26  0:30 UTC (permalink / raw)
  To: Mike Rapoport
  Cc: riel, kernel-team, Ram Pai, linux-kernel, guro, linux-mm,
	Satheesh Rajendran, Konrad Rzeszutek Wilk, iamjoonsoo.kim, mhocko,
	linuxppc-dev, Andrew Morton
In-Reply-To: <20210124073421.GG6332@kernel.org>


Mike Rapoport <rppt@kernel.org> writes:

> On Sat, Jan 23, 2021 at 06:09:11PM -0800, Andrew Morton wrote:
>> On Fri, 22 Jan 2021 01:37:14 -0300 Thiago Jung Bauermann <bauerman@linux.ibm.com> wrote:
>> 
>> > Mike Rapoport <rppt@kernel.org> writes:
>> > 
>> > > > Signed-off-by: Roman Gushchin <guro@fb.com>
>> > > 
>> > > Reviewed-by: Mike Rapoport <rppt@linux.ibm.com>
>> > 
>> > I've seen a couple of spurious triggers of the WARN_ONCE() removed by this
>> > patch. This happens on some ppc64le bare metal (powernv) server machines with
>> > CONFIG_SWIOTLB=y and crashkernel=4G, as described in a candidate patch I posted
>> > to solve this issue in a different way:
>> > 
>> > https://lore.kernel.org/linuxppc-dev/20201218062103.76102-1-bauerman@linux.ibm.com/
>> > 
>> > Since this patch solves that problem, is it possible to include it in the next
>> > feasible v5.11-rcX, with the following tag?
>> 
>> We could do this,

Thanks!

>> if we're confident that this patch doesn't depend on
>> [1/2] "mm: cma: allocate cma areas bottom-up"?  I think it is...
>
> A think it does not depend on cma bottom-up allocation, it's rather the other
> way around: without this CMA bottom-up allocation could fail with KASLR
> enabled.

I agree. Conceptually, this could have been patch 1 in this series.

> Still, this patch may need updates to the way x86 does early reservations:
>
> https://lore.kernel.org/lkml/20210115083255.12744-1-rppt@kernel.org

Ah, I wasn't aware of this. Thanks for fixing those issues. That series
seems to be well accepted.

>> > Fixes: 8fabc623238e ("powerpc: Ensure that swiotlb buffer is allocated from low memory")
>> 
>> I added that.

Thanks!
-- 
Thiago Jung Bauermann
IBM Linux Technology Center

^ permalink raw reply

* Re: [PATCH] powerpc/mm: Limit allocation of SWIOTLB on server machines
From: Thiago Jung Bauermann @ 2021-01-26  1:02 UTC (permalink / raw)
  To: Konrad Rzeszutek Wilk
  Cc: linuxppc-dev, Satheesh Rajendran, Ram Pai, linux-kernel
In-Reply-To: <20210121155448.GA97019@fedora>


Konrad Rzeszutek Wilk <konrad@darnok.org> writes:

> On Fri, Jan 08, 2021 at 09:27:01PM -0300, Thiago Jung Bauermann wrote:
>> 
>> Ram Pai <linuxram@us.ibm.com> writes:
>> 
>> > On Wed, Dec 23, 2020 at 09:06:01PM -0300, Thiago Jung Bauermann wrote:
>> >> 
>> >> Hi Ram,
>> >> 
>> >> Thanks for reviewing this patch.
>> >> 
>> >> Ram Pai <linuxram@us.ibm.com> writes:
>> >> 
>> >> > On Fri, Dec 18, 2020 at 03:21:03AM -0300, Thiago Jung Bauermann wrote:
>> >> >> On server-class POWER machines, we don't need the SWIOTLB unless we're a
>> >> >> secure VM. Nevertheless, if CONFIG_SWIOTLB is enabled we unconditionally
>> >> >> allocate it.
>> >> >> 
>> >> >> In most cases this is harmless, but on a few machine configurations (e.g.,
>> >> >> POWER9 powernv systems with 4 GB area reserved for crashdump kernel) it can
>> >> >> happen that memblock can't find a 64 MB chunk of memory for the SWIOTLB and
>> >> >> fails with a scary-looking WARN_ONCE:
>> >> >> 
>> >> >>  ------------[ cut here ]------------
>> >> >>  memblock: bottom-up allocation failed, memory hotremove may be affected
>> >> >>  WARNING: CPU: 0 PID: 0 at mm/memblock.c:332 memblock_find_in_range_node+0x328/0x340
>> >> >>  Modules linked in:
>> >> >>  CPU: 0 PID: 0 Comm: swapper Not tainted 5.10.0-rc2-orig+ #6
>> >> >>  NIP:  c000000000442f38 LR: c000000000442f34 CTR: c0000000001e0080
>> >> >>  REGS: c000000001def900 TRAP: 0700   Not tainted  (5.10.0-rc2-orig+)
>> >> >>  MSR:  9000000002021033 <SF,HV,VEC,ME,IR,DR,RI,LE>  CR: 28022222  XER: 20040000
>> >> >>  CFAR: c00000000014b7b4 IRQMASK: 1
>> >> >>  GPR00: c000000000442f34 c000000001defba0 c000000001deff00 0000000000000047
>> >> >>  GPR04: 00000000ffff7fff c000000001def828 c000000001def820 0000000000000000
>> >> >>  GPR08: 0000001ffc3e0000 c000000001b75478 c000000001b75478 0000000000000001
>> >> >>  GPR12: 0000000000002000 c000000002030000 0000000000000000 0000000000000000
>> >> >>  GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000002030000
>> >> >>  GPR20: 0000000000000000 0000000000010000 0000000000010000 c000000001defc10
>> >> >>  GPR24: c000000001defc08 c000000001c91868 c000000001defc18 c000000001c91890
>> >> >>  GPR28: 0000000000000000 ffffffffffffffff 0000000004000000 00000000ffffffff
>> >> >>  NIP [c000000000442f38] memblock_find_in_range_node+0x328/0x340
>> >> >>  LR [c000000000442f34] memblock_find_in_range_node+0x324/0x340
>> >> >>  Call Trace:
>> >> >>  [c000000001defba0] [c000000000442f34] memblock_find_in_range_node+0x324/0x340 (unreliable)
>> >> >>  [c000000001defc90] [c0000000015ac088] memblock_alloc_range_nid+0xec/0x1b0
>> >> >>  [c000000001defd40] [c0000000015ac1f8] memblock_alloc_internal+0xac/0x110
>> >> >>  [c000000001defda0] [c0000000015ac4d0] memblock_alloc_try_nid+0x94/0xcc
>> >> >>  [c000000001defe30] [c00000000159c3c8] swiotlb_init+0x78/0x104
>> >> >>  [c000000001defea0] [c00000000158378c] mem_init+0x4c/0x98
>> >> >>  [c000000001defec0] [c00000000157457c] start_kernel+0x714/0xac8
>> >> >>  [c000000001deff90] [c00000000000d244] start_here_common+0x1c/0x58
>> >> >>  Instruction dump:
>> >> >>  2c230000 4182ffd4 ea610088 ea810090 4bfffe84 39200001 3d42fff4 3c62ff60
>> >> >>  3863c560 992a8bfc 4bd0881d 60000000 <0fe00000> ea610088 4bfffd94 60000000
>> >> >>  random: get_random_bytes called from __warn+0x128/0x184 with crng_init=0
>> >> >>  ---[ end trace 0000000000000000 ]---
>> >> >>  software IO TLB: Cannot allocate buffer
>> >> >> 
>> >> >> Unless this is a secure VM the message can actually be ignored, because the
>> >> >> SWIOTLB isn't needed. Therefore, let's avoid the SWIOTLB in those cases.
>> >> >
>> >> > The above warn_on is conveying a genuine warning. Should it be silenced?
>> >> 
>> >> Not sure I understand your point. This patch doesn't silence the
>> >> warning, it avoids the problem it is warning about.
>> >
>> > Sorry, I should have explained it better. My point is...  
>> >
>> > 	If CONFIG_SWIOTLB is enabled, it means that the kernel is
>> > 	promising the bounce buffering capability. I know, currently we
>> > 	do not have any kernel subsystems that use bounce buffers on
>> > 	non-secure-pseries-kernel or powernv-kernel.  But that does not
>> > 	mean, there wont be any. In case there is such a third-party
>> > 	module needing bounce buffering, it wont be able to operate,
>> > 	because of the proposed change in your patch.
>> >
>> > 	Is that a good thing or a bad thing, I do not know. I will let
>> > 	the experts opine.
>> 
>> Ping? Does anyone else has an opinion on this? The other option I can
>> think of is changing the crashkernel code to not reserve so much memory
>> below 4 GB. Other people are considering this option, but it's not
>> planned for the near future.
>
> That seems a more suitable solution regardless, but there is always
> the danger of not being enough or being too big.
>
> There was some autocrashkernel allocation patches going around
> for x86 and ARM that perhaps could be re-used?

Do you mean this?

https://lore.kernel.org/lkml/20201118232431.21832-1-saeed.mirzamohammadi@oracle.com/

I agree it would be great to have crashkernel=auto upstream.

>> Also, there's a patch currently in linux-next which removes the scary
>> warning because of unrelated reasons:
>> 
>> https://lore.kernel.org/lkml/20201217201214.3414100-2-guro@fb.com
>> 
>> So assuming that the patch above goes in and keeping the assumption that
>> the swiotlb won't be needed in the powernv machines where I've seen the
>> warning happen, we can just leave things as they are now.
>
> If that solves the problem, then that is OK.

Awesome! Today Mike Rapoport mentioned that the patch above uncovers a
few issues with late memory reservations on on x86 that he is fixing:

https://lore.kernel.org/lkml/20210115083255.12744-1-rppt@kernel.org/

So that would be the last piece of the puzzle.

-- 
Thiago Jung Bauermann
IBM Linux Technology Center

^ permalink raw reply

* [PATCH v2] tpm: ibmvtpm: fix error return code in tpm_ibmvtpm_probe()
From: Stefan Berger @ 2021-01-26  1:47 UTC (permalink / raw)
  To: jarkko, linux-integrity
  Cc: Wang Hai, linux-kernel, Hulk Robot, paulus, linuxppc-dev,
	Stefan Berger

From: Stefan Berger <stefanb@linux.ibm.com>

Return error code -ETIMEDOUT rather than '0' when waiting for the
rtce_buf to be set has timed out.

Fixes: d8d74ea3c002 ("tpm: ibmvtpm: Wait for buffer to be set before proceeding")
Reported-by: Hulk Robot <hulkci@huawei.com>
Signed-off-by: Wang Hai <wanghai38@huawei.com>
Signed-off-by: Stefan Berger <stefanb@linux.ibm.com>
---
 drivers/char/tpm/tpm_ibmvtpm.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/char/tpm/tpm_ibmvtpm.c b/drivers/char/tpm/tpm_ibmvtpm.c
index 994385bf37c0..813eb2cac0ce 100644
--- a/drivers/char/tpm/tpm_ibmvtpm.c
+++ b/drivers/char/tpm/tpm_ibmvtpm.c
@@ -687,6 +687,7 @@ static int tpm_ibmvtpm_probe(struct vio_dev *vio_dev,
 				ibmvtpm->rtce_buf != NULL,
 				HZ)) {
 		dev_err(dev, "CRQ response timed out\n");
+		rc = -ETIMEDOUT;
 		goto init_irq_cleanup;
 	}
 
-- 
2.25.4


^ permalink raw reply related

* [PATCH v11 00/13] huge vmalloc mappings
From: Nicholas Piggin @ 2021-01-26  4:44 UTC (permalink / raw)
  To: linux-mm, Andrew Morton
  Cc: linux-arch, Ding Tianhong, linux-kernel, Nicholas Piggin,
	Christoph Hellwig, Jonathan Cameron, Rick Edgecombe, linuxppc-dev

I think I ended up implementing all Christoph's comments because
they turned out better in the end. Cleanups coming in another
series though.

Thanks,
Nick

Since v10:
- Fixed code style, most > 80 colums, tweak patch titles, etc [thanks Christoph]
- Made huge vmalloc code and data structure compile away if unselected
  [Christoph]
- Archs only have to provide arch_vmap_p?d_supported for levels they
  implement [Christoph]

Since v9:
- Fixed intermediate build breakage on x86-32 !PAE [thanks Ding]
- Fixed small page fallback case vm_struct double-free [thanks Ding]

Since v8:
- Fixed nommu compile.
- Added Kconfig option help text
- Added VM_NOHUGE which should help archs implement it [suggested by Rick]

Since v7:
- Rebase, added some acks, compile fix
- Removed "order=" from vmallocinfo, it's a bit confusing (nr_pages
  is in small page size for compatibility).
- Added arch_vmap_pmd_supported() test before starting to allocate
  the large page, rather than only testing it when doing the map, to
  avoid unsupported configs trying to allocate huge pages for no
  reason.

Since v6:
- Fixed a false positive warning introduced in patch 2, found by
  kbuild test robot.

Since v5:
- Split arch changes out better and make the constant folding work
- Avoid most of the 80 column wrap, fix a reference to lib/ioremap.c
- Fix compile error on some archs

Since v4:
- Fixed an off-by-page-order bug in v4
- Several minor cleanups.
- Added page order to /proc/vmallocinfo
- Added hugepage to alloc_large_system_hage output.
- Made an architecture config option, powerpc only for now.

Since v3:
- Fixed an off-by-one bug in a loop
- Fix !CONFIG_HAVE_ARCH_HUGE_VMAP build fail

Nicholas Piggin (13):
  mm/vmalloc: fix HUGE_VMAP regression by enabling huge pages in
    vmalloc_to_page
  mm: apply_to_pte_range warn and fail if a large pte is encountered
  mm/vmalloc: rename vmap_*_range vmap_pages_*_range
  mm/ioremap: rename ioremap_*_range to vmap_*_range
  mm: HUGE_VMAP arch support cleanup
  powerpc: inline huge vmap supported functions
  arm64: inline huge vmap supported functions
  x86: inline huge vmap supported functions
  mm/vmalloc: provide fallback arch huge vmap support functions
  mm: Move vmap_range from mm/ioremap.c to mm/vmalloc.c
  mm/vmalloc: add vmap_range_noflush variant
  mm/vmalloc: Hugepage vmalloc mappings
  powerpc/64s/radix: Enable huge vmalloc mappings

 .../admin-guide/kernel-parameters.txt         |   2 +
 arch/Kconfig                                  |  11 +
 arch/arm64/include/asm/vmalloc.h              |  24 +
 arch/arm64/mm/mmu.c                           |  26 -
 arch/powerpc/Kconfig                          |   1 +
 arch/powerpc/include/asm/vmalloc.h            |  20 +
 arch/powerpc/kernel/module.c                  |  21 +-
 arch/powerpc/mm/book3s64/radix_pgtable.c      |  21 -
 arch/x86/include/asm/vmalloc.h                |  20 +
 arch/x86/mm/ioremap.c                         |  19 -
 arch/x86/mm/pgtable.c                         |  13 -
 include/linux/io.h                            |   9 -
 include/linux/vmalloc.h                       |  46 ++
 init/main.c                                   |   1 -
 mm/ioremap.c                                  | 225 +-------
 mm/memory.c                                   |  66 ++-
 mm/page_alloc.c                               |   5 +-
 mm/vmalloc.c                                  | 484 +++++++++++++++---
 18 files changed, 614 insertions(+), 400 deletions(-)

-- 
2.23.0


^ permalink raw reply

* [PATCH v11 01/13] mm/vmalloc: fix HUGE_VMAP regression by enabling huge pages in vmalloc_to_page
From: Nicholas Piggin @ 2021-01-26  4:44 UTC (permalink / raw)
  To: linux-mm, Andrew Morton
  Cc: linux-arch, Ding Tianhong, linux-kernel, Nicholas Piggin,
	Christoph Hellwig, Jonathan Cameron, Rick Edgecombe, linuxppc-dev,
	Christoph Hellwig
In-Reply-To: <20210126044510.2491820-1-npiggin@gmail.com>

vmalloc_to_page returns NULL for addresses mapped by larger pages[*].
Whether or not a vmap is huge depends on the architecture details,
alignments, boot options, etc., which the caller can not be expected
to know. Therefore HUGE_VMAP is a regression for vmalloc_to_page.

This change teaches vmalloc_to_page about larger pages, and returns
the struct page that corresponds to the offset within the large page.
This makes the API agnostic to mapping implementation details.

[*] As explained by commit 029c54b095995 ("mm/vmalloc.c: huge-vmap:
    fail gracefully on unexpected huge vmap mappings")

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 mm/vmalloc.c | 41 ++++++++++++++++++++++++++---------------
 1 file changed, 26 insertions(+), 15 deletions(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index e6f352bf0498..62372f9e0167 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -34,7 +34,7 @@
 #include <linux/bitops.h>
 #include <linux/rbtree_augmented.h>
 #include <linux/overflow.h>
-
+#include <linux/pgtable.h>
 #include <linux/uaccess.h>
 #include <asm/tlbflush.h>
 #include <asm/shmparam.h>
@@ -343,7 +343,9 @@ int is_vmalloc_or_module_addr(const void *x)
 }
 
 /*
- * Walk a vmap address to the struct page it maps.
+ * Walk a vmap address to the struct page it maps. Huge vmap mappings will
+ * return the tail page that corresponds to the base page address, which
+ * matches small vmap mappings.
  */
 struct page *vmalloc_to_page(const void *vmalloc_addr)
 {
@@ -363,25 +365,33 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
 
 	if (pgd_none(*pgd))
 		return NULL;
+	if (WARN_ON_ONCE(pgd_leaf(*pgd)))
+		return NULL; /* XXX: no allowance for huge pgd */
+	if (WARN_ON_ONCE(pgd_bad(*pgd)))
+		return NULL;
+
 	p4d = p4d_offset(pgd, addr);
 	if (p4d_none(*p4d))
 		return NULL;
-	pud = pud_offset(p4d, addr);
+	if (p4d_leaf(*p4d))
+		return p4d_page(*p4d) + ((addr & ~P4D_MASK) >> PAGE_SHIFT);
+	if (WARN_ON_ONCE(p4d_bad(*p4d)))
+		return NULL;
 
-	/*
-	 * Don't dereference bad PUD or PMD (below) entries. This will also
-	 * identify huge mappings, which we may encounter on architectures
-	 * that define CONFIG_HAVE_ARCH_HUGE_VMAP=y. Such regions will be
-	 * identified as vmalloc addresses by is_vmalloc_addr(), but are
-	 * not [unambiguously] associated with a struct page, so there is
-	 * no correct value to return for them.
-	 */
-	WARN_ON_ONCE(pud_bad(*pud));
-	if (pud_none(*pud) || pud_bad(*pud))
+	pud = pud_offset(p4d, addr);
+	if (pud_none(*pud))
+		return NULL;
+	if (pud_leaf(*pud))
+		return pud_page(*pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
+	if (WARN_ON_ONCE(pud_bad(*pud)))
 		return NULL;
+
 	pmd = pmd_offset(pud, addr);
-	WARN_ON_ONCE(pmd_bad(*pmd));
-	if (pmd_none(*pmd) || pmd_bad(*pmd))
+	if (pmd_none(*pmd))
+		return NULL;
+	if (pmd_leaf(*pmd))
+		return pmd_page(*pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
+	if (WARN_ON_ONCE(pmd_bad(*pmd)))
 		return NULL;
 
 	ptep = pte_offset_map(pmd, addr);
@@ -389,6 +399,7 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
 	if (pte_present(pte))
 		page = pte_page(pte);
 	pte_unmap(ptep);
+
 	return page;
 }
 EXPORT_SYMBOL(vmalloc_to_page);
-- 
2.23.0


^ permalink raw reply related

* [PATCH v11 02/13] mm: apply_to_pte_range warn and fail if a large pte is encountered
From: Nicholas Piggin @ 2021-01-26  4:44 UTC (permalink / raw)
  To: linux-mm, Andrew Morton
  Cc: linux-arch, Ding Tianhong, linux-kernel, Nicholas Piggin,
	Christoph Hellwig, Jonathan Cameron, Rick Edgecombe, linuxppc-dev,
	Christoph Hellwig
In-Reply-To: <20210126044510.2491820-1-npiggin@gmail.com>

apply_to_pte_range might mistake a large pte for bad, or treat it as a
page table, resulting in a crash or corruption. Add a test to warn and
return error if large entries are found.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 mm/memory.c | 66 +++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 49 insertions(+), 17 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index feff48e1465a..672e39a72788 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2440,13 +2440,21 @@ static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
 	}
 	do {
 		next = pmd_addr_end(addr, end);
-		if (create || !pmd_none_or_clear_bad(pmd)) {
-			err = apply_to_pte_range(mm, pmd, addr, next, fn, data,
-						 create, mask);
-			if (err)
-				break;
+		if (pmd_none(*pmd) && !create)
+			continue;
+		if (WARN_ON_ONCE(pmd_leaf(*pmd)))
+			return -EINVAL;
+		if (!pmd_none(*pmd) && WARN_ON_ONCE(pmd_bad(*pmd))) {
+			if (!create)
+				continue;
+			pmd_clear_bad(pmd);
 		}
+		err = apply_to_pte_range(mm, pmd, addr, next,
+					 fn, data, create, mask);
+		if (err)
+			break;
 	} while (pmd++, addr = next, addr != end);
+
 	return err;
 }
 
@@ -2468,13 +2476,21 @@ static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,
 	}
 	do {
 		next = pud_addr_end(addr, end);
-		if (create || !pud_none_or_clear_bad(pud)) {
-			err = apply_to_pmd_range(mm, pud, addr, next, fn, data,
-						 create, mask);
-			if (err)
-				break;
+		if (pud_none(*pud) && !create)
+			continue;
+		if (WARN_ON_ONCE(pud_leaf(*pud)))
+			return -EINVAL;
+		if (!pud_none(*pud) && WARN_ON_ONCE(pud_bad(*pud))) {
+			if (!create)
+				continue;
+			pud_clear_bad(pud);
 		}
+		err = apply_to_pmd_range(mm, pud, addr, next,
+					 fn, data, create, mask);
+		if (err)
+			break;
 	} while (pud++, addr = next, addr != end);
+
 	return err;
 }
 
@@ -2496,13 +2512,21 @@ static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd,
 	}
 	do {
 		next = p4d_addr_end(addr, end);
-		if (create || !p4d_none_or_clear_bad(p4d)) {
-			err = apply_to_pud_range(mm, p4d, addr, next, fn, data,
-						 create, mask);
-			if (err)
-				break;
+		if (p4d_none(*p4d) && !create)
+			continue;
+		if (WARN_ON_ONCE(p4d_leaf(*p4d)))
+			return -EINVAL;
+		if (!p4d_none(*p4d) && WARN_ON_ONCE(p4d_bad(*p4d))) {
+			if (!create)
+				continue;
+			p4d_clear_bad(p4d);
 		}
+		err = apply_to_pud_range(mm, p4d, addr, next,
+					 fn, data, create, mask);
+		if (err)
+			break;
 	} while (p4d++, addr = next, addr != end);
+
 	return err;
 }
 
@@ -2522,9 +2546,17 @@ static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr,
 	pgd = pgd_offset(mm, addr);
 	do {
 		next = pgd_addr_end(addr, end);
-		if (!create && pgd_none_or_clear_bad(pgd))
+		if (pgd_none(*pgd) && !create)
 			continue;
-		err = apply_to_p4d_range(mm, pgd, addr, next, fn, data, create, &mask);
+		if (WARN_ON_ONCE(pgd_leaf(*pgd)))
+			return -EINVAL;
+		if (!pgd_none(*pgd) && WARN_ON_ONCE(pgd_bad(*pgd))) {
+			if (!create)
+				continue;
+			pgd_clear_bad(pgd);
+		}
+		err = apply_to_p4d_range(mm, pgd, addr, next,
+					 fn, data, create, &mask);
 		if (err)
 			break;
 	} while (pgd++, addr = next, addr != end);
-- 
2.23.0


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox