LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH v7 36/42] powerpc/64s: reconcile interrupts in C
From: Nicholas Piggin @ 2021-01-30 13:08 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Athira Rajeev, Nicholas Piggin
In-Reply-To: <20210130130852.2952424-1-npiggin@gmail.com>

There is no need for this to be in asm, use the new intrrupt entry wrapper.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/include/asm/interrupt.h | 15 +++++++++++----
 arch/powerpc/kernel/exceptions-64s.S | 26 --------------------------
 2 files changed, 11 insertions(+), 30 deletions(-)

diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h
index 1c966e47b36f..e96d215f518a 100644
--- a/arch/powerpc/include/asm/interrupt.h
+++ b/arch/powerpc/include/asm/interrupt.h
@@ -14,11 +14,14 @@ struct interrupt_state {
 
 static inline void interrupt_enter_prepare(struct pt_regs *regs, struct interrupt_state *state)
 {
-#ifdef CONFIG_PPC_BOOK3E_64
-	state->ctx_state = exception_enter();
-#endif
-
+	/*
+	 * Book3E reconciles irq soft mask in asm
+	 */
 #ifdef CONFIG_PPC_BOOK3S_64
+	if (irq_soft_mask_set_return(IRQS_ALL_DISABLED) == IRQS_ENABLED)
+		trace_hardirqs_off();
+	local_paca->irq_happened |= PACA_IRQ_HARD_DIS;
+
 	if (user_mode(regs)) {
 		CT_WARN_ON(ct_state() != CONTEXT_USER);
 		user_exit_irqoff();
@@ -31,6 +34,10 @@ static inline void interrupt_enter_prepare(struct pt_regs *regs, struct interrup
 			CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
 	}
 #endif
+
+#ifdef CONFIG_PPC_BOOK3E_64
+	state->ctx_state = exception_enter();
+#endif
 }
 
 /*
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index fe33197ea8fb..39630b3f78b0 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -139,7 +139,6 @@ name:
 #define IKVM_VIRT	.L_IKVM_VIRT_\name\()	/* Virt entry tests KVM */
 #define ISTACK		.L_ISTACK_\name\()	/* Set regular kernel stack */
 #define __ISTACK(name)	.L_ISTACK_ ## name
-#define IRECONCILE	.L_IRECONCILE_\name\()	/* Do RECONCILE_IRQ_STATE */
 #define IKUAP		.L_IKUAP_\name\()	/* Do KUAP lock */
 
 #define INT_DEFINE_BEGIN(n)						\
@@ -203,9 +202,6 @@ do_define_int n
 	.ifndef ISTACK
 		ISTACK=1
 	.endif
-	.ifndef IRECONCILE
-		IRECONCILE=1
-	.endif
 	.ifndef IKUAP
 		IKUAP=1
 	.endif
@@ -653,10 +649,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
 	.if ISTACK
 	ACCOUNT_STOLEN_TIME
 	.endif
-
-	.if IRECONCILE
-	RECONCILE_IRQ_STATE(r10, r11)
-	.endif
 .endm
 
 /*
@@ -935,7 +927,6 @@ INT_DEFINE_BEGIN(system_reset)
 	 */
 	ISET_RI=0
 	ISTACK=0
-	IRECONCILE=0
 	IKVM_REAL=1
 INT_DEFINE_END(system_reset)
 
@@ -1123,7 +1114,6 @@ INT_DEFINE_BEGIN(machine_check_early)
 	ISTACK=0
 	IDAR=1
 	IDSISR=1
-	IRECONCILE=0
 	IKUAP=0 /* We don't touch AMR here, we never go to virtual mode */
 INT_DEFINE_END(machine_check_early)
 
@@ -1483,7 +1473,6 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX)
 INT_DEFINE_BEGIN(data_access_slb)
 	IVEC=0x380
 	IAREA=PACA_EXSLB
-	IRECONCILE=0
 	IDAR=1
 	IKVM_SKIP=1
 	IKVM_REAL=1
@@ -1510,7 +1499,6 @@ MMU_FTR_SECTION_ELSE
 	li	r3,-EFAULT
 ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX)
 	std	r3,RESULT(r1)
-	RECONCILE_IRQ_STATE(r10, r11)
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	bl	do_bad_slb_fault
 	b	interrupt_return
@@ -1568,7 +1556,6 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX)
 INT_DEFINE_BEGIN(instruction_access_slb)
 	IVEC=0x480
 	IAREA=PACA_EXSLB
-	IRECONCILE=0
 	IISIDE=1
 	IDAR=1
 #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
@@ -1597,7 +1584,6 @@ MMU_FTR_SECTION_ELSE
 	li	r3,-EFAULT
 ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX)
 	std	r3,RESULT(r1)
-	RECONCILE_IRQ_STATE(r10, r11)
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	bl	do_bad_slb_fault
 	b	interrupt_return
@@ -1757,7 +1743,6 @@ EXC_COMMON_BEGIN(program_check_common)
  */
 INT_DEFINE_BEGIN(fp_unavailable)
 	IVEC=0x800
-	IRECONCILE=0
 #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
 	IKVM_REAL=1
 #endif
@@ -1772,7 +1757,6 @@ EXC_VIRT_END(fp_unavailable, 0x4800, 0x100)
 EXC_COMMON_BEGIN(fp_unavailable_common)
 	GEN_COMMON fp_unavailable
 	bne	1f			/* if from user, just load it up */
-	RECONCILE_IRQ_STATE(r10, r11)
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	bl	kernel_fp_unavailable_exception
 0:	trap
@@ -1791,7 +1775,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_TM)
 	b	fast_interrupt_return
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 2:	/* User process was in a transaction */
-	RECONCILE_IRQ_STATE(r10, r11)
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	bl	fp_unavailable_tm
 	b	interrupt_return
@@ -1856,7 +1839,6 @@ INT_DEFINE_BEGIN(hdecrementer)
 	IVEC=0x980
 	IHSRR=1
 	ISTACK=0
-	IRECONCILE=0
 	IKVM_REAL=1
 	IKVM_VIRT=1
 INT_DEFINE_END(hdecrementer)
@@ -2230,7 +2212,6 @@ INT_DEFINE_BEGIN(hmi_exception_early)
 	IHSRR=1
 	IREALMODE_COMMON=1
 	ISTACK=0
-	IRECONCILE=0
 	IKUAP=0 /* We don't touch AMR here, we never go to virtual mode */
 	IKVM_REAL=1
 INT_DEFINE_END(hmi_exception_early)
@@ -2404,7 +2385,6 @@ EXC_COMMON_BEGIN(performance_monitor_common)
  */
 INT_DEFINE_BEGIN(altivec_unavailable)
 	IVEC=0xf20
-	IRECONCILE=0
 #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
 	IKVM_REAL=1
 #endif
@@ -2434,7 +2414,6 @@ BEGIN_FTR_SECTION
 	b	fast_interrupt_return
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 2:	/* User process was in a transaction */
-	RECONCILE_IRQ_STATE(r10, r11)
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	bl	altivec_unavailable_tm
 	b	interrupt_return
@@ -2442,7 +2421,6 @@ BEGIN_FTR_SECTION
 1:
 END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
 #endif
-	RECONCILE_IRQ_STATE(r10, r11)
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	bl	altivec_unavailable_exception
 	b	interrupt_return
@@ -2458,7 +2436,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
  */
 INT_DEFINE_BEGIN(vsx_unavailable)
 	IVEC=0xf40
-	IRECONCILE=0
 #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
 	IKVM_REAL=1
 #endif
@@ -2487,7 +2464,6 @@ BEGIN_FTR_SECTION
 	b	load_up_vsx
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 2:	/* User process was in a transaction */
-	RECONCILE_IRQ_STATE(r10, r11)
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	bl	vsx_unavailable_tm
 	b	interrupt_return
@@ -2495,7 +2471,6 @@ BEGIN_FTR_SECTION
 1:
 END_FTR_SECTION_IFSET(CPU_FTR_VSX)
 #endif
-	RECONCILE_IRQ_STATE(r10, r11)
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	bl	vsx_unavailable_exception
 	b	interrupt_return
@@ -2830,7 +2805,6 @@ EXC_VIRT_NONE(0x5800, 0x100)
 INT_DEFINE_BEGIN(soft_nmi)
 	IVEC=0x900
 	ISTACK=0
-	IRECONCILE=0	/* Soft-NMI may fire under local_irq_disable */
 INT_DEFINE_END(soft_nmi)
 
 /*
-- 
2.23.0


^ permalink raw reply related

* [PATCH v7 35/42] powerpc/64s: move context tracking exit to interrupt exit path
From: Nicholas Piggin @ 2021-01-30 13:08 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Athira Rajeev, Nicholas Piggin
In-Reply-To: <20210130130852.2952424-1-npiggin@gmail.com>

The interrupt handler wrapper functions are not the ideal place to
maintain context tracking because after they return, the low level exit
code must then determine if there are interrupts to replay, or if the
task should be preempted, etc. Those paths (e.g., schedule_user) include
their own exception_enter/exit pairs to fix this up but it's a bit hacky
(see schedule_user() comments).

Ideally context tracking will go to user mode only when there are no
more interrupts or context switches or other exit processing work to
handle.

64e can not do this because it does not use the C interrupt exit code.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/include/asm/interrupt.h | 34 +++++++++++++++++++++++++---
 arch/powerpc/kernel/syscall_64.c     | 18 +++++++++++++--
 2 files changed, 47 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h
index 5a1395499508..1c966e47b36f 100644
--- a/arch/powerpc/include/asm/interrupt.h
+++ b/arch/powerpc/include/asm/interrupt.h
@@ -7,16 +7,30 @@
 #include <asm/ftrace.h>
 
 struct interrupt_state {
-#ifdef CONFIG_PPC64
+#ifdef CONFIG_PPC_BOOK3E_64
 	enum ctx_state ctx_state;
 #endif
 };
 
 static inline void interrupt_enter_prepare(struct pt_regs *regs, struct interrupt_state *state)
 {
-#ifdef CONFIG_PPC64
+#ifdef CONFIG_PPC_BOOK3E_64
 	state->ctx_state = exception_enter();
 #endif
+
+#ifdef CONFIG_PPC_BOOK3S_64
+	if (user_mode(regs)) {
+		CT_WARN_ON(ct_state() != CONTEXT_USER);
+		user_exit_irqoff();
+	} else {
+		/*
+		 * CT_WARN_ON comes here via program_check_exception,
+		 * so avoid recursion.
+		 */
+		if (TRAP(regs) != 0x700)
+			CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
+	}
+#endif
 }
 
 /*
@@ -35,9 +49,23 @@ static inline void interrupt_enter_prepare(struct pt_regs *regs, struct interrup
  */
 static inline void interrupt_exit_prepare(struct pt_regs *regs, struct interrupt_state *state)
 {
-#ifdef CONFIG_PPC64
+#ifdef CONFIG_PPC_BOOK3E_64
 	exception_exit(state->ctx_state);
 #endif
+
+	/*
+	 * Book3S exits to user via interrupt_exit_user_prepare(), which does
+	 * context tracking, which is a cleaner way to handle PREEMPT=y
+	 * and avoid context entry/exit in e.g., preempt_schedule_irq()),
+	 * which is likely to be where the core code wants to end up.
+	 *
+	 * The above comment explains why we can't do the
+	 *
+	 *     if (user_mode(regs))
+	 *         user_exit_irqoff();
+	 *
+	 * sequence here.
+	 */
 }
 
 static inline void interrupt_async_enter_prepare(struct pt_regs *regs, struct interrupt_state *state)
diff --git a/arch/powerpc/kernel/syscall_64.c b/arch/powerpc/kernel/syscall_64.c
index 45c4420fe339..a2102e7a2713 100644
--- a/arch/powerpc/kernel/syscall_64.c
+++ b/arch/powerpc/kernel/syscall_64.c
@@ -255,9 +255,9 @@ notrace unsigned long syscall_exit_prepare(unsigned long r3,
 		ret |= _TIF_RESTOREALL;
 	}
 
-again:
 	local_irq_disable();
 
+again:
 	ti_flags = READ_ONCE(*ti_flagsp);
 	while (unlikely(ti_flags & (_TIF_USER_WORK_MASK & ~_TIF_RESTORE_TM))) {
 		local_irq_enable();
@@ -307,6 +307,7 @@ notrace unsigned long syscall_exit_prepare(unsigned long r3,
 	if (unlikely(!__prep_irq_for_enabled_exit(!scv))) {
 		user_exit_irqoff();
 		local_irq_enable();
+		local_irq_disable();
 		goto again;
 	}
 
@@ -341,6 +342,7 @@ notrace unsigned long interrupt_exit_user_prepare(struct pt_regs *regs, unsigned
 	BUG_ON(!(regs->msr & MSR_PR));
 	BUG_ON(!FULL_REGS(regs));
 	BUG_ON(regs->softe != IRQS_ENABLED);
+	CT_WARN_ON(ct_state() == CONTEXT_USER);
 
 	/*
 	 * We don't need to restore AMR on the way back to userspace for KUAP.
@@ -383,8 +385,14 @@ notrace unsigned long interrupt_exit_user_prepare(struct pt_regs *regs, unsigned
 		}
 	}
 
-	if (unlikely(!prep_irq_for_enabled_exit(true, !irqs_disabled_flags(flags))))
+	user_enter_irqoff();
+
+	if (unlikely(!__prep_irq_for_enabled_exit(true))) {
+		user_exit_irqoff();
+		local_irq_enable();
+		local_irq_disable();
 		goto again;
+	}
 
 #ifdef CONFIG_PPC_BOOK3E
 	if (unlikely(ts->debug.dbcr0 & DBCR0_IDM)) {
@@ -425,6 +433,12 @@ notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs, unsign
 		unrecoverable_exception(regs);
 	BUG_ON(regs->msr & MSR_PR);
 	BUG_ON(!FULL_REGS(regs));
+	/*
+	 * CT_WARN_ON comes here via program_check_exception,
+	 * so avoid recursion.
+	 */
+	if (TRAP(regs) != 0x700)
+		CT_WARN_ON(ct_state() == CONTEXT_USER);
 
 	amr = kuap_get_and_check_amr();
 
-- 
2.23.0


^ permalink raw reply related

* [PATCH v7 37/42] powerpc/64: move account_stolen_time into its own function
From: Nicholas Piggin @ 2021-01-30 13:08 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Athira Rajeev, Nicholas Piggin
In-Reply-To: <20210130130852.2952424-1-npiggin@gmail.com>

This will be used by interrupt entry as well.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/include/asm/cputime.h | 14 ++++++++++++++
 arch/powerpc/kernel/syscall_64.c   | 10 +---------
 2 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/include/asm/cputime.h b/arch/powerpc/include/asm/cputime.h
index ed75d1c318e3..504f7fe6711a 100644
--- a/arch/powerpc/include/asm/cputime.h
+++ b/arch/powerpc/include/asm/cputime.h
@@ -87,6 +87,17 @@ static notrace inline void account_cpu_user_exit(void)
 	acct->starttime_user = tb;
 }
 
+static notrace inline void account_stolen_time(void)
+{
+#ifdef CONFIG_PPC_SPLPAR
+	if (firmware_has_feature(FW_FEATURE_SPLPAR)) {
+		struct lppaca *lp = local_paca->lppaca_ptr;
+
+		if (unlikely(local_paca->dtl_ridx != be64_to_cpu(lp->dtl_idx)))
+			accumulate_stolen_time();
+	}
+#endif
+}
 
 #endif /* __KERNEL__ */
 #else /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
@@ -96,5 +107,8 @@ static inline void account_cpu_user_entry(void)
 static inline void account_cpu_user_exit(void)
 {
 }
+static notrace inline void account_stolen_time(void)
+{
+}
 #endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
 #endif /* __POWERPC_CPUTIME_H */
diff --git a/arch/powerpc/kernel/syscall_64.c b/arch/powerpc/kernel/syscall_64.c
index a2102e7a2713..d6be4f9a67e5 100644
--- a/arch/powerpc/kernel/syscall_64.c
+++ b/arch/powerpc/kernel/syscall_64.c
@@ -69,15 +69,7 @@ notrace long system_call_exception(long r3, long r4, long r5,
 
 	account_cpu_user_entry();
 
-#ifdef CONFIG_PPC_SPLPAR
-	if (IS_ENABLED(CONFIG_VIRT_CPU_ACCOUNTING_NATIVE) &&
-	    firmware_has_feature(FW_FEATURE_SPLPAR)) {
-		struct lppaca *lp = local_paca->lppaca_ptr;
-
-		if (unlikely(local_paca->dtl_ridx != be64_to_cpu(lp->dtl_idx)))
-			accumulate_stolen_time();
-	}
-#endif
+	account_stolen_time();
 
 	/*
 	 * This is not required for the syscall exit path, but makes the
-- 
2.23.0


^ permalink raw reply related

* [PATCH v7 38/42] powerpc/64: entry cpu time accounting in C
From: Nicholas Piggin @ 2021-01-30 13:08 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Athira Rajeev, Nicholas Piggin
In-Reply-To: <20210130130852.2952424-1-npiggin@gmail.com>

There is no need for this to be in asm, use the new interrupt entry wrapper.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/include/asm/interrupt.h |  6 ++++++
 arch/powerpc/include/asm/ppc_asm.h   | 24 ------------------------
 arch/powerpc/kernel/exceptions-64e.S |  1 -
 arch/powerpc/kernel/exceptions-64s.S |  5 -----
 4 files changed, 6 insertions(+), 30 deletions(-)

diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h
index e96d215f518a..ca8e08b18a16 100644
--- a/arch/powerpc/include/asm/interrupt.h
+++ b/arch/powerpc/include/asm/interrupt.h
@@ -4,6 +4,7 @@
 
 #include <linux/context_tracking.h>
 #include <linux/hardirq.h>
+#include <asm/cputime.h>
 #include <asm/ftrace.h>
 
 struct interrupt_state {
@@ -25,6 +26,9 @@ static inline void interrupt_enter_prepare(struct pt_regs *regs, struct interrup
 	if (user_mode(regs)) {
 		CT_WARN_ON(ct_state() != CONTEXT_USER);
 		user_exit_irqoff();
+
+		account_cpu_user_entry();
+		account_stolen_time();
 	} else {
 		/*
 		 * CT_WARN_ON comes here via program_check_exception,
@@ -37,6 +41,8 @@ static inline void interrupt_enter_prepare(struct pt_regs *regs, struct interrup
 
 #ifdef CONFIG_PPC_BOOK3E_64
 	state->ctx_state = exception_enter();
+	if (user_mode(regs))
+		account_cpu_user_entry();
 #endif
 }
 
diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h
index cc1bca571332..3dceb64fc9af 100644
--- a/arch/powerpc/include/asm/ppc_asm.h
+++ b/arch/powerpc/include/asm/ppc_asm.h
@@ -25,7 +25,6 @@
 #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
 #define ACCOUNT_CPU_USER_ENTRY(ptr, ra, rb)
 #define ACCOUNT_CPU_USER_EXIT(ptr, ra, rb)
-#define ACCOUNT_STOLEN_TIME
 #else
 #define ACCOUNT_CPU_USER_ENTRY(ptr, ra, rb)				\
 	MFTB(ra);			/* get timebase */		\
@@ -44,29 +43,6 @@
 	PPC_LL	ra, ACCOUNT_SYSTEM_TIME(ptr);				\
 	add	ra,ra,rb;		/* add on to system time */	\
 	PPC_STL	ra, ACCOUNT_SYSTEM_TIME(ptr)
-
-#ifdef CONFIG_PPC_SPLPAR
-#define ACCOUNT_STOLEN_TIME						\
-BEGIN_FW_FTR_SECTION;							\
-	beq	33f;							\
-	/* from user - see if there are any DTL entries to process */	\
-	ld	r10,PACALPPACAPTR(r13);	/* get ptr to VPA */		\
-	ld	r11,PACA_DTL_RIDX(r13);	/* get log read index */	\
-	addi	r10,r10,LPPACA_DTLIDX;					\
-	LDX_BE	r10,0,r10;		/* get log write index */	\
-	cmpd	cr1,r11,r10;						\
-	beq+	cr1,33f;						\
-	bl	accumulate_stolen_time;				\
-	ld	r12,_MSR(r1);						\
-	andi.	r10,r12,MSR_PR;		/* Restore cr0 (coming from user) */ \
-33:									\
-END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR)
-
-#else  /* CONFIG_PPC_SPLPAR */
-#define ACCOUNT_STOLEN_TIME
-
-#endif /* CONFIG_PPC_SPLPAR */
-
 #endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
 
 /*
diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S
index 003999c7836c..e8eb9992a270 100644
--- a/arch/powerpc/kernel/exceptions-64e.S
+++ b/arch/powerpc/kernel/exceptions-64e.S
@@ -398,7 +398,6 @@ exc_##n##_common:							    \
 	std	r10,_NIP(r1);		/* save SRR0 to stackframe */	    \
 	std	r11,_MSR(r1);		/* save SRR1 to stackframe */	    \
 	beq	2f;			/* if from kernel mode */	    \
-	ACCOUNT_CPU_USER_ENTRY(r13,r10,r11);/* accounting (uses cr0+eq) */  \
 2:	ld	r3,excf+EX_R10(r13);	/* get back r10 */		    \
 	ld	r4,excf+EX_R11(r13);	/* get back r11 */		    \
 	mfspr	r5,scratch;		/* get back r13 */		    \
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 39630b3f78b0..94b89ea123f3 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -577,7 +577,6 @@ DEFINE_FIXED_SYMBOL(\name\()_common_real)
 	kuap_save_amr_and_lock r9, r10, cr1, cr0
 	.endif
 	beq	101f			/* if from kernel mode		*/
-	ACCOUNT_CPU_USER_ENTRY(r13, r9, r10)
 BEGIN_FTR_SECTION
 	ld	r9,IAREA+EX_PPR(r13)	/* Read PPR from paca		*/
 	std	r9,_PPR(r1)
@@ -645,10 +644,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
 	ld	r11,exception_marker@toc(r2)
 	std	r10,RESULT(r1)		/* clear regs->result		*/
 	std	r11,STACK_FRAME_OVERHEAD-16(r1) /* mark the frame	*/
-
-	.if ISTACK
-	ACCOUNT_STOLEN_TIME
-	.endif
 .endm
 
 /*
-- 
2.23.0


^ permalink raw reply related

* [PATCH v7 39/42] powerpc: move NMI entry/exit code into wrapper
From: Nicholas Piggin @ 2021-01-30 13:08 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Athira Rajeev, Nicholas Piggin
In-Reply-To: <20210130130852.2952424-1-npiggin@gmail.com>

This moves the common NMI entry and exit code into the interrupt handler
wrappers.

This changes the behaviour of soft-NMI (watchdog) and HMI interrupts, and
also MCE interrupts on 64e, by adding missing parts of the NMI entry to
them.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/include/asm/interrupt.h | 28 ++++++++++++++++++++++
 arch/powerpc/kernel/mce.c            | 11 ---------
 arch/powerpc/kernel/traps.c          | 35 +++++-----------------------
 arch/powerpc/kernel/watchdog.c       | 10 ++++----
 4 files changed, 38 insertions(+), 46 deletions(-)

diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h
index ca8e08b18a16..cd819d42573c 100644
--- a/arch/powerpc/include/asm/interrupt.h
+++ b/arch/powerpc/include/asm/interrupt.h
@@ -94,14 +94,42 @@ static inline void interrupt_async_exit_prepare(struct pt_regs *regs, struct int
 }
 
 struct interrupt_nmi_state {
+#ifdef CONFIG_PPC64
+	u8 ftrace_enabled;
+#endif
 };
 
 static inline void interrupt_nmi_enter_prepare(struct pt_regs *regs, struct interrupt_nmi_state *state)
 {
+#ifdef CONFIG_PPC64
+	/* Allow DEC and PMI to be traced when they are soft-NMI */
+	if (TRAP(regs) != 0x900 && TRAP(regs) != 0xf00 && TRAP(regs) != 0x260) {
+		state->ftrace_enabled = this_cpu_get_ftrace_enabled();
+		this_cpu_set_ftrace_enabled(0);
+	}
+#endif
+
+	/*
+	 * Do not use nmi_enter() for pseries hash guest taking a real-mode
+	 * NMI because not everything it touches is within the RMA limit.
+	 */
+	if (!IS_ENABLED(CONFIG_PPC_BOOK3S_64) ||
+			!firmware_has_feature(FW_FEATURE_LPAR) ||
+			radix_enabled() || (mfmsr() & MSR_DR))
+		nmi_enter();
 }
 
 static inline void interrupt_nmi_exit_prepare(struct pt_regs *regs, struct interrupt_nmi_state *state)
 {
+	if (!IS_ENABLED(CONFIG_PPC_BOOK3S_64) ||
+			!firmware_has_feature(FW_FEATURE_LPAR) ||
+			radix_enabled() || (mfmsr() & MSR_DR))
+		nmi_exit();
+
+#ifdef CONFIG_PPC64
+	if (TRAP(regs) != 0x900 && TRAP(regs) != 0xf00 && TRAP(regs) != 0x260)
+		this_cpu_set_ftrace_enabled(state->ftrace_enabled);
+#endif
 }
 
 /**
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 54269947113d..51456217ec40 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -592,12 +592,6 @@ EXPORT_SYMBOL_GPL(machine_check_print_event_info);
 DEFINE_INTERRUPT_HANDLER_NMI(machine_check_early)
 {
 	long handled = 0;
-	u8 ftrace_enabled = this_cpu_get_ftrace_enabled();
-
-	this_cpu_set_ftrace_enabled(0);
-	/* Do not use nmi_enter/exit for pseries hpte guest */
-	if (radix_enabled() || !firmware_has_feature(FW_FEATURE_LPAR))
-		nmi_enter();
 
 	hv_nmi_check_nonrecoverable(regs);
 
@@ -607,11 +601,6 @@ DEFINE_INTERRUPT_HANDLER_NMI(machine_check_early)
 	if (ppc_md.machine_check_early)
 		handled = ppc_md.machine_check_early(regs);
 
-	if (radix_enabled() || !firmware_has_feature(FW_FEATURE_LPAR))
-		nmi_exit();
-
-	this_cpu_set_ftrace_enabled(ftrace_enabled);
-
 	return handled;
 }
 
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 1623400a0a7b..e61391f6948f 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -435,11 +435,6 @@ DEFINE_INTERRUPT_HANDLER_NMI(system_reset_exception)
 {
 	unsigned long hsrr0, hsrr1;
 	bool saved_hsrrs = false;
-	u8 ftrace_enabled = this_cpu_get_ftrace_enabled();
-
-	this_cpu_set_ftrace_enabled(0);
-
-	nmi_enter();
 
 	/*
 	 * System reset can interrupt code where HSRRs are live and MSR[RI]=1.
@@ -514,10 +509,6 @@ DEFINE_INTERRUPT_HANDLER_NMI(system_reset_exception)
 		mtspr(SPRN_HSRR1, hsrr1);
 	}
 
-	nmi_exit();
-
-	this_cpu_set_ftrace_enabled(ftrace_enabled);
-
 	/* What should we do here? We could issue a shutdown or hard reset. */
 
 	return 0;
@@ -809,6 +800,12 @@ void die_mce(const char *str, struct pt_regs *regs, long err)
 }
 NOKPROBE_SYMBOL(die_mce);
 
+/*
+ * BOOK3S_64 does not call this handler as a non-maskable interrupt
+ * (it uses its own early real-mode handler to handle the MCE proper
+ * and then raises irq_work to call this handler when interrupts are
+ * enabled).
+ */
 #ifdef CONFIG_PPC_BOOK3S_64
 DEFINE_INTERRUPT_HANDLER_ASYNC(machine_check_exception)
 #else
@@ -817,20 +814,6 @@ DEFINE_INTERRUPT_HANDLER_NMI(machine_check_exception)
 {
 	int recover = 0;
 
-	/*
-	 * BOOK3S_64 does not call this handler as a non-maskable interrupt
-	 * (it uses its own early real-mode handler to handle the MCE proper
-	 * and then raises irq_work to call this handler when interrupts are
-	 * enabled).
-	 *
-	 * This is silly. The BOOK3S_64 should just call a different function
-	 * rather than expecting semantics to magically change. Something
-	 * like 'non_nmi_machine_check_exception()', perhaps?
-	 */
-	const bool nmi = !IS_ENABLED(CONFIG_PPC_BOOK3S_64);
-
-	if (nmi) nmi_enter();
-
 	__this_cpu_inc(irq_stat.mce_exceptions);
 
 	add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
@@ -862,8 +845,6 @@ DEFINE_INTERRUPT_HANDLER_NMI(machine_check_exception)
 	if (!(regs->msr & MSR_RI))
 		die_mce("Unrecoverable Machine check", regs, SIGBUS);
 
-	if (nmi) nmi_exit();
-
 #ifdef CONFIG_PPC_BOOK3S_64
 	return;
 #else
@@ -1886,14 +1867,10 @@ DEFINE_INTERRUPT_HANDLER(vsx_unavailable_tm)
 DECLARE_INTERRUPT_HANDLER_NMI(performance_monitor_exception_nmi);
 DEFINE_INTERRUPT_HANDLER_NMI(performance_monitor_exception_nmi)
 {
-	nmi_enter();
-
 	__this_cpu_inc(irq_stat.pmu_irqs);
 
 	perf_irq(regs);
 
-	nmi_exit();
-
 	return 0;
 }
 #endif
diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c
index 824b9376ac35..dc39534836a3 100644
--- a/arch/powerpc/kernel/watchdog.c
+++ b/arch/powerpc/kernel/watchdog.c
@@ -254,11 +254,12 @@ DEFINE_INTERRUPT_HANDLER_NMI(soft_nmi_interrupt)
 	int cpu = raw_smp_processor_id();
 	u64 tb;
 
+	/* should only arrive from kernel, with irqs disabled */
+	WARN_ON_ONCE(!arch_irq_disabled_regs(regs));
+
 	if (!cpumask_test_cpu(cpu, &wd_cpus_enabled))
 		return 0;
 
-	nmi_enter();
-
 	__this_cpu_inc(irq_stat.soft_nmi_irqs);
 
 	tb = get_tb();
@@ -266,7 +267,7 @@ DEFINE_INTERRUPT_HANDLER_NMI(soft_nmi_interrupt)
 		wd_smp_lock(&flags);
 		if (cpumask_test_cpu(cpu, &wd_smp_cpus_stuck)) {
 			wd_smp_unlock(&flags);
-			goto out;
+			return 0;
 		}
 		set_cpu_stuck(cpu, tb);
 
@@ -290,9 +291,6 @@ DEFINE_INTERRUPT_HANDLER_NMI(soft_nmi_interrupt)
 	if (wd_panic_timeout_tb < 0x7fffffff)
 		mtspr(SPRN_DEC, wd_panic_timeout_tb);
 
-out:
-	nmi_exit();
-
 	return 0;
 }
 
-- 
2.23.0


^ permalink raw reply related

* [PATCH v7 40/42] powerpc/64s: move NMI soft-mask handling to C
From: Nicholas Piggin @ 2021-01-30 13:08 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Athira Rajeev, Nicholas Piggin
In-Reply-To: <20210130130852.2952424-1-npiggin@gmail.com>

Saving and restoring soft-mask state can now be done in C using the
interrupt handler wrapper functions.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/include/asm/interrupt.h | 25 ++++++++++++
 arch/powerpc/kernel/exceptions-64s.S | 60 ----------------------------
 2 files changed, 25 insertions(+), 60 deletions(-)

diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h
index cd819d42573c..72a585084066 100644
--- a/arch/powerpc/include/asm/interrupt.h
+++ b/arch/powerpc/include/asm/interrupt.h
@@ -95,6 +95,10 @@ static inline void interrupt_async_exit_prepare(struct pt_regs *regs, struct int
 
 struct interrupt_nmi_state {
 #ifdef CONFIG_PPC64
+#ifdef CONFIG_PPC_BOOK3S_64
+	u8 irq_soft_mask;
+	u8 irq_happened;
+#endif
 	u8 ftrace_enabled;
 #endif
 };
@@ -102,6 +106,20 @@ struct interrupt_nmi_state {
 static inline void interrupt_nmi_enter_prepare(struct pt_regs *regs, struct interrupt_nmi_state *state)
 {
 #ifdef CONFIG_PPC64
+#ifdef CONFIG_PPC_BOOK3S_64
+	state->irq_soft_mask = local_paca->irq_soft_mask;
+	state->irq_happened = local_paca->irq_happened;
+
+	/*
+	 * Set IRQS_ALL_DISABLED unconditionally so irqs_disabled() does
+	 * the right thing, and set IRQ_HARD_DIS. We do not want to reconcile
+	 * because that goes through irq tracing which we don't want in NMI.
+	 */
+	local_paca->irq_soft_mask = IRQS_ALL_DISABLED;
+	local_paca->irq_happened |= PACA_IRQ_HARD_DIS;
+
+	/* Don't do any per-CPU operations until interrupt state is fixed */
+#endif
 	/* Allow DEC and PMI to be traced when they are soft-NMI */
 	if (TRAP(regs) != 0x900 && TRAP(regs) != 0xf00 && TRAP(regs) != 0x260) {
 		state->ftrace_enabled = this_cpu_get_ftrace_enabled();
@@ -129,6 +147,13 @@ static inline void interrupt_nmi_exit_prepare(struct pt_regs *regs, struct inter
 #ifdef CONFIG_PPC64
 	if (TRAP(regs) != 0x900 && TRAP(regs) != 0xf00 && TRAP(regs) != 0x260)
 		this_cpu_set_ftrace_enabled(state->ftrace_enabled);
+
+#ifdef CONFIG_PPC_BOOK3S_64
+	/* Check we didn't change the pending interrupt mask. */
+	WARN_ON_ONCE((state->irq_happened | PACA_IRQ_HARD_DIS) != local_paca->irq_happened);
+	local_paca->irq_happened = state->irq_happened;
+	local_paca->irq_soft_mask = state->irq_soft_mask;
+#endif
 #endif
 }
 
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 94b89ea123f3..2fca2bad6b02 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -1008,20 +1008,6 @@ EXC_COMMON_BEGIN(system_reset_common)
 	ld	r1,PACA_NMI_EMERG_SP(r13)
 	subi	r1,r1,INT_FRAME_SIZE
 	__GEN_COMMON_BODY system_reset
-	/*
-	 * Set IRQS_ALL_DISABLED unconditionally so irqs_disabled() does
-	 * the right thing. We do not want to reconcile because that goes
-	 * through irq tracing which we don't want in NMI.
-	 *
-	 * Save PACAIRQHAPPENED to RESULT (otherwise unused), and set HARD_DIS
-	 * as we are running with MSR[EE]=0.
-	 */
-	li	r10,IRQS_ALL_DISABLED
-	stb	r10,PACAIRQSOFTMASK(r13)
-	lbz	r10,PACAIRQHAPPENED(r13)
-	std	r10,RESULT(r1)
-	ori	r10,r10,PACA_IRQ_HARD_DIS
-	stb	r10,PACAIRQHAPPENED(r13)
 
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	bl	system_reset_exception
@@ -1037,14 +1023,6 @@ EXC_COMMON_BEGIN(system_reset_common)
 	subi	r10,r10,1
 	sth	r10,PACA_IN_NMI(r13)
 
-	/*
-	 * Restore soft mask settings.
-	 */
-	ld	r10,RESULT(r1)
-	stb	r10,PACAIRQHAPPENED(r13)
-	ld	r10,SOFTE(r1)
-	stb	r10,PACAIRQSOFTMASK(r13)
-
 	kuap_kernel_restore r9, r10
 	EXCEPTION_RESTORE_REGS
 	RFI_TO_USER_OR_KERNEL
@@ -1190,30 +1168,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
 	li	r10,MSR_RI
 	mtmsrd	r10,1
 
-	/*
-	 * Set IRQS_ALL_DISABLED and save PACAIRQHAPPENED (see
-	 * system_reset_common)
-	 */
-	li	r10,IRQS_ALL_DISABLED
-	stb	r10,PACAIRQSOFTMASK(r13)
-	lbz	r10,PACAIRQHAPPENED(r13)
-	std	r10,RESULT(r1)
-	ori	r10,r10,PACA_IRQ_HARD_DIS
-	stb	r10,PACAIRQHAPPENED(r13)
-
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	bl	machine_check_early
 	std	r3,RESULT(r1)	/* Save result */
 	ld	r12,_MSR(r1)
 
-	/*
-	 * Restore soft mask settings.
-	 */
-	ld	r10,RESULT(r1)
-	stb	r10,PACAIRQHAPPENED(r13)
-	ld	r10,SOFTE(r1)
-	stb	r10,PACAIRQSOFTMASK(r13)
-
 #ifdef CONFIG_PPC_P7_NAP
 	/*
 	 * Check if thread was in power saving mode. We come here when any
@@ -2818,17 +2777,6 @@ EXC_COMMON_BEGIN(soft_nmi_common)
 	subi	r1,r1,INT_FRAME_SIZE
 	__GEN_COMMON_BODY soft_nmi
 
-	/*
-	 * Set IRQS_ALL_DISABLED and save PACAIRQHAPPENED (see
-	 * system_reset_common)
-	 */
-	li	r10,IRQS_ALL_DISABLED
-	stb	r10,PACAIRQSOFTMASK(r13)
-	lbz	r10,PACAIRQHAPPENED(r13)
-	std	r10,RESULT(r1)
-	ori	r10,r10,PACA_IRQ_HARD_DIS
-	stb	r10,PACAIRQHAPPENED(r13)
-
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	bl	soft_nmi_interrupt
 
@@ -2836,14 +2784,6 @@ EXC_COMMON_BEGIN(soft_nmi_common)
 	li	r9,0
 	mtmsrd	r9,1
 
-	/*
-	 * Restore soft mask settings.
-	 */
-	ld	r10,RESULT(r1)
-	stb	r10,PACAIRQHAPPENED(r13)
-	ld	r10,SOFTE(r1)
-	stb	r10,PACAIRQSOFTMASK(r13)
-
 	kuap_kernel_restore r9, r10
 	EXCEPTION_RESTORE_REGS hsrr=0
 	RFI_TO_KERNEL
-- 
2.23.0


^ permalink raw reply related

* [PATCH v7 41/42] powerpc/64s: runlatch interrupt handling in C
From: Nicholas Piggin @ 2021-01-30 13:08 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Athira Rajeev, Nicholas Piggin
In-Reply-To: <20210130130852.2952424-1-npiggin@gmail.com>

There is no need for this to be in asm, use the new intrrupt entry wrapper.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/include/asm/interrupt.h |  7 +++++++
 arch/powerpc/kernel/exceptions-64s.S | 18 ------------------
 2 files changed, 7 insertions(+), 18 deletions(-)

diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h
index 72a585084066..4badb3e51c19 100644
--- a/arch/powerpc/include/asm/interrupt.h
+++ b/arch/powerpc/include/asm/interrupt.h
@@ -6,6 +6,7 @@
 #include <linux/hardirq.h>
 #include <asm/cputime.h>
 #include <asm/ftrace.h>
+#include <asm/runlatch.h>
 
 struct interrupt_state {
 #ifdef CONFIG_PPC_BOOK3E_64
@@ -83,6 +84,12 @@ static inline void interrupt_exit_prepare(struct pt_regs *regs, struct interrupt
 
 static inline void interrupt_async_enter_prepare(struct pt_regs *regs, struct interrupt_state *state)
 {
+#ifdef CONFIG_PPC_BOOK3S_64
+	if (cpu_has_feature(CPU_FTR_CTRL) &&
+	    !test_thread_local_flags(_TLF_RUNLATCH))
+		__ppc64_runlatch_on();
+#endif
+
 	interrupt_enter_prepare(regs, state);
 	irq_enter();
 }
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 2fca2bad6b02..27351276c54b 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -692,14 +692,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
 	ld	r1,GPR1(r1)
 .endm
 
-#define RUNLATCH_ON				\
-BEGIN_FTR_SECTION				\
-	ld	r3, PACA_THREAD_INFO(r13);	\
-	ld	r4,TI_LOCAL_FLAGS(r3);		\
-	andi.	r0,r4,_TLF_RUNLATCH;		\
-	beql	ppc64_runlatch_on_trampoline;	\
-END_FTR_SECTION_IFSET(CPU_FTR_CTRL)
-
 /*
  * When the idle code in power4_idle puts the CPU into NAP mode,
  * it has to do so in a loop, and relies on the external interrupt
@@ -1585,7 +1577,6 @@ EXC_VIRT_END(hardware_interrupt, 0x4500, 0x100)
 EXC_COMMON_BEGIN(hardware_interrupt_common)
 	GEN_COMMON hardware_interrupt
 	FINISH_NAP
-	RUNLATCH_ON
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	bl	do_IRQ
 	b	interrupt_return
@@ -1771,7 +1762,6 @@ EXC_VIRT_END(decrementer, 0x4900, 0x80)
 EXC_COMMON_BEGIN(decrementer_common)
 	GEN_COMMON decrementer
 	FINISH_NAP
-	RUNLATCH_ON
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	bl	timer_interrupt
 	b	interrupt_return
@@ -1857,7 +1847,6 @@ EXC_VIRT_END(doorbell_super, 0x4a00, 0x100)
 EXC_COMMON_BEGIN(doorbell_super_common)
 	GEN_COMMON doorbell_super
 	FINISH_NAP
-	RUNLATCH_ON
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 #ifdef CONFIG_PPC_DOORBELL
 	bl	doorbell_exception
@@ -2212,7 +2201,6 @@ EXC_COMMON_BEGIN(hmi_exception_early_common)
 EXC_COMMON_BEGIN(hmi_exception_common)
 	GEN_COMMON hmi_exception
 	FINISH_NAP
-	RUNLATCH_ON
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	bl	handle_hmi_exception
 	b	interrupt_return
@@ -2242,7 +2230,6 @@ EXC_VIRT_END(h_doorbell, 0x4e80, 0x20)
 EXC_COMMON_BEGIN(h_doorbell_common)
 	GEN_COMMON h_doorbell
 	FINISH_NAP
-	RUNLATCH_ON
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 #ifdef CONFIG_PPC_DOORBELL
 	bl	doorbell_exception
@@ -2276,7 +2263,6 @@ EXC_VIRT_END(h_virt_irq, 0x4ea0, 0x20)
 EXC_COMMON_BEGIN(h_virt_irq_common)
 	GEN_COMMON h_virt_irq
 	FINISH_NAP
-	RUNLATCH_ON
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	bl	do_IRQ
 	b	interrupt_return
@@ -2323,7 +2309,6 @@ EXC_VIRT_END(performance_monitor, 0x4f00, 0x20)
 EXC_COMMON_BEGIN(performance_monitor_common)
 	GEN_COMMON performance_monitor
 	FINISH_NAP
-	RUNLATCH_ON
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	bl	performance_monitor_exception
 	b	interrupt_return
@@ -3057,9 +3042,6 @@ kvmppc_skip_Hinterrupt:
 	 * come here.
 	 */
 
-EXC_COMMON_BEGIN(ppc64_runlatch_on_trampoline)
-	b	__ppc64_runlatch_on
-
 USE_FIXED_SECTION(virt_trampolines)
 	/*
 	 * All code below __end_interrupts is treated as soft-masked. If
-- 
2.23.0


^ permalink raw reply related

* [PATCH v7 42/42] powerpc/64s: power4 nap fixup in C
From: Nicholas Piggin @ 2021-01-30 13:08 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Athira Rajeev, Nicholas Piggin
In-Reply-To: <20210130130852.2952424-1-npiggin@gmail.com>

There is no need for this to be in asm, use the new intrrupt entry wrapper.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/include/asm/interrupt.h   | 15 +++++++++
 arch/powerpc/include/asm/processor.h   |  1 +
 arch/powerpc/include/asm/thread_info.h |  6 ++++
 arch/powerpc/kernel/exceptions-64s.S   | 45 --------------------------
 arch/powerpc/kernel/idle_book3s.S      |  4 +++
 5 files changed, 26 insertions(+), 45 deletions(-)

diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h
index 4badb3e51c19..25f2420b1965 100644
--- a/arch/powerpc/include/asm/interrupt.h
+++ b/arch/powerpc/include/asm/interrupt.h
@@ -8,6 +8,16 @@
 #include <asm/ftrace.h>
 #include <asm/runlatch.h>
 
+static inline void nap_adjust_return(struct pt_regs *regs)
+{
+#ifdef CONFIG_PPC_970_NAP
+	if (unlikely(test_thread_local_flags(_TLF_NAPPING))) {
+		clear_thread_local_flags(_TLF_NAPPING);
+		regs->nip = (unsigned long)power4_idle_nap_return;
+	}
+#endif
+}
+
 struct interrupt_state {
 #ifdef CONFIG_PPC_BOOK3E_64
 	enum ctx_state ctx_state;
@@ -98,6 +108,9 @@ static inline void interrupt_async_exit_prepare(struct pt_regs *regs, struct int
 {
 	irq_exit();
 	interrupt_exit_prepare(regs, state);
+
+	/* Adjust at exit so the main handler sees the true NIA */
+	nap_adjust_return(regs);
 }
 
 struct interrupt_nmi_state {
@@ -151,6 +164,8 @@ static inline void interrupt_nmi_exit_prepare(struct pt_regs *regs, struct inter
 			radix_enabled() || (mfmsr() & MSR_DR))
 		nmi_exit();
 
+	nap_adjust_return(regs);
+
 #ifdef CONFIG_PPC64
 	if (TRAP(regs) != 0x900 && TRAP(regs) != 0xf00 && TRAP(regs) != 0x260)
 		this_cpu_set_ftrace_enabled(state->ftrace_enabled);
diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h
index 8acc3590c971..eedc3c775141 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -393,6 +393,7 @@ extern unsigned long isa300_idle_stop_mayloss(unsigned long psscr_val);
 extern unsigned long isa206_idle_insn_mayloss(unsigned long type);
 #ifdef CONFIG_PPC_970_NAP
 extern void power4_idle_nap(void);
+void power4_idle_nap_return(void);
 #endif
 
 extern unsigned long cpuidle_disable;
diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h
index 386d576673a1..bf137151100b 100644
--- a/arch/powerpc/include/asm/thread_info.h
+++ b/arch/powerpc/include/asm/thread_info.h
@@ -152,6 +152,12 @@ void arch_setup_new_exec(void);
 
 #ifndef __ASSEMBLY__
 
+static inline void clear_thread_local_flags(unsigned int flags)
+{
+	struct thread_info *ti = current_thread_info();
+	ti->local_flags &= ~flags;
+}
+
 static inline bool test_thread_local_flags(unsigned int flags)
 {
 	struct thread_info *ti = current_thread_info();
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 27351276c54b..5478ffa85603 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -692,25 +692,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
 	ld	r1,GPR1(r1)
 .endm
 
-/*
- * When the idle code in power4_idle puts the CPU into NAP mode,
- * it has to do so in a loop, and relies on the external interrupt
- * and decrementer interrupt entry code to get it out of the loop.
- * It sets the _TLF_NAPPING bit in current_thread_info()->local_flags
- * to signal that it is in the loop and needs help to get out.
- */
-#ifdef CONFIG_PPC_970_NAP
-#define FINISH_NAP				\
-BEGIN_FTR_SECTION				\
-	ld	r11, PACA_THREAD_INFO(r13);	\
-	ld	r9,TI_LOCAL_FLAGS(r11);		\
-	andi.	r10,r9,_TLF_NAPPING;		\
-	bnel	power4_fixup_nap;		\
-END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP)
-#else
-#define FINISH_NAP
-#endif
-
 /*
  * There are a few constraints to be concerned with.
  * - Real mode exceptions code/data must be located at their physical location.
@@ -1248,7 +1229,6 @@ EXC_COMMON_BEGIN(machine_check_common)
 	 */
 	GEN_COMMON machine_check
 
-	FINISH_NAP
 	/* Enable MSR_RI when finished with PACA_EXMC */
 	li	r10,MSR_RI
 	mtmsrd 	r10,1
@@ -1576,7 +1556,6 @@ EXC_VIRT_BEGIN(hardware_interrupt, 0x4500, 0x100)
 EXC_VIRT_END(hardware_interrupt, 0x4500, 0x100)
 EXC_COMMON_BEGIN(hardware_interrupt_common)
 	GEN_COMMON hardware_interrupt
-	FINISH_NAP
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	bl	do_IRQ
 	b	interrupt_return
@@ -1761,7 +1740,6 @@ EXC_VIRT_BEGIN(decrementer, 0x4900, 0x80)
 EXC_VIRT_END(decrementer, 0x4900, 0x80)
 EXC_COMMON_BEGIN(decrementer_common)
 	GEN_COMMON decrementer
-	FINISH_NAP
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	bl	timer_interrupt
 	b	interrupt_return
@@ -1846,7 +1824,6 @@ EXC_VIRT_BEGIN(doorbell_super, 0x4a00, 0x100)
 EXC_VIRT_END(doorbell_super, 0x4a00, 0x100)
 EXC_COMMON_BEGIN(doorbell_super_common)
 	GEN_COMMON doorbell_super
-	FINISH_NAP
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 #ifdef CONFIG_PPC_DOORBELL
 	bl	doorbell_exception
@@ -2200,7 +2177,6 @@ EXC_COMMON_BEGIN(hmi_exception_early_common)
 
 EXC_COMMON_BEGIN(hmi_exception_common)
 	GEN_COMMON hmi_exception
-	FINISH_NAP
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	bl	handle_hmi_exception
 	b	interrupt_return
@@ -2229,7 +2205,6 @@ EXC_VIRT_BEGIN(h_doorbell, 0x4e80, 0x20)
 EXC_VIRT_END(h_doorbell, 0x4e80, 0x20)
 EXC_COMMON_BEGIN(h_doorbell_common)
 	GEN_COMMON h_doorbell
-	FINISH_NAP
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 #ifdef CONFIG_PPC_DOORBELL
 	bl	doorbell_exception
@@ -2262,7 +2237,6 @@ EXC_VIRT_BEGIN(h_virt_irq, 0x4ea0, 0x20)
 EXC_VIRT_END(h_virt_irq, 0x4ea0, 0x20)
 EXC_COMMON_BEGIN(h_virt_irq_common)
 	GEN_COMMON h_virt_irq
-	FINISH_NAP
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	bl	do_IRQ
 	b	interrupt_return
@@ -2308,7 +2282,6 @@ EXC_VIRT_BEGIN(performance_monitor, 0x4f00, 0x20)
 EXC_VIRT_END(performance_monitor, 0x4f00, 0x20)
 EXC_COMMON_BEGIN(performance_monitor_common)
 	GEN_COMMON performance_monitor
-	FINISH_NAP
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	bl	performance_monitor_exception
 	b	interrupt_return
@@ -3059,24 +3032,6 @@ USE_FIXED_SECTION(virt_trampolines)
 __end_interrupts:
 DEFINE_FIXED_SYMBOL(__end_interrupts)
 
-#ifdef CONFIG_PPC_970_NAP
-	/*
-	 * Called by exception entry code if _TLF_NAPPING was set, this clears
-	 * the NAPPING flag, and redirects the exception exit to
-	 * power4_fixup_nap_return.
-	 */
-	.globl power4_fixup_nap
-EXC_COMMON_BEGIN(power4_fixup_nap)
-	andc	r9,r9,r10
-	std	r9,TI_LOCAL_FLAGS(r11)
-	LOAD_REG_ADDR(r10, power4_idle_nap_return)
-	std	r10,_NIP(r1)
-	blr
-
-power4_idle_nap_return:
-	blr
-#endif
-
 CLOSE_FIXED_SECTION(real_vectors);
 CLOSE_FIXED_SECTION(real_trampolines);
 CLOSE_FIXED_SECTION(virt_vectors);
diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S
index 22f249b6f58d..27d2e6a72ec9 100644
--- a/arch/powerpc/kernel/idle_book3s.S
+++ b/arch/powerpc/kernel/idle_book3s.S
@@ -201,4 +201,8 @@ _GLOBAL(power4_idle_nap)
 	mtmsrd	r7
 	isync
 	b	1b
+
+	.globl power4_idle_nap_return
+power4_idle_nap_return:
+	blr
 #endif
-- 
2.23.0


^ permalink raw reply related

* Re: [PATCH] powerpc64/idle: Fix SP offsets when saving GPRs
From: Nicholas Piggin @ 2021-01-30 13:44 UTC (permalink / raw)
  To: Christopher M. Riedl, linuxppc-dev, Michael Ellerman
In-Reply-To: <87o8h6d5jg.fsf@mpe.ellerman.id.au>

Excerpts from Michael Ellerman's message of January 30, 2021 9:32 pm:
> "Christopher M. Riedl" <cmr@codefail.de> writes:
>> The idle entry/exit code saves/restores GPRs in the stack "red zone"
>> (Protected Zone according to PowerPC64 ELF ABI v2). However, the offset
>> used for the first GPR is incorrect and overwrites the back chain - the
>> Protected Zone actually starts below the current SP. In practice this is
>> probably not an issue, but it's still incorrect so fix it.
> 
> Nice catch.
> 
> Corrupting the back chain means you can't backtrace from there, which
> could be confusing for debugging one day.

Yeah, we seem to have got away without noticing because the CPU will 
wake up and return out of here before it tries to unwind the stack,
but if you tried to walk it by hand if the CPU got stuck in idle or 
something, then we'd get confused.

> It does make me wonder why we don't just create a stack frame and use
> the normal macros? It would use a bit more stack space, but we shouldn't
> be short of stack space when going idle.
> 
> Nick, was there a particular reason for using the red zone?

I don't recall a particular reason, I think a normal stack frame is 
probably a good idea.

Thanks,
Nick

^ permalink raw reply

* Re: [PATCH] powerpc/fault: fix wrong KUAP fault for IO_URING
From: Nicholas Piggin @ 2021-01-30 13:54 UTC (permalink / raw)
  To: Aneesh Kumar K.V, Christophe Leroy, Michael Ellerman, Zorro Lang
  Cc: Jens Axboe, linuxppc-dev
In-Reply-To: <87r1m2d5z4.fsf@mpe.ellerman.id.au>

Excerpts from Michael Ellerman's message of January 30, 2021 9:22 pm:
> Christophe Leroy <christophe.leroy@csgroup.eu> writes:
>> +Aneesh
>>
>> Le 29/01/2021 à 07:52, Zorro Lang a écrit :
> ..
>>> [   96.200296] ------------[ cut here ]------------
>>> [   96.200304] Bug: Read fault blocked by KUAP!
>>> [   96.200309] WARNING: CPU: 3 PID: 1876 at arch/powerpc/mm/fault.c:229 bad_kernel_fault+0x180/0x310
>>
>>> [   96.200734] NIP [c000000000849424] fault_in_pages_readable+0x104/0x350
>>> [   96.200741] LR [c00000000084952c] fault_in_pages_readable+0x20c/0x350
>>> [   96.200747] --- interrupt: 300
>>
>>
>> Problem happens in a section where userspace access is supposed to be granted, so the patch you 
>> proposed is definitely not the right fix.
>>
>> c000000000849408:	2c 01 00 4c 	isync
>> c00000000084940c:	a6 03 3d 7d 	mtspr   29,r9  <== granting userspace access permission
>> c000000000849410:	2c 01 00 4c 	isync
>> c000000000849414:	00 00 36 e9 	ld      r9,0(r22)
>> c000000000849418:	20 00 29 81 	lwz     r9,32(r9)
>> c00000000084941c:	00 02 29 71 	andi.   r9,r9,512
>> c000000000849420:	78 d3 5e 7f 	mr      r30,r26
>> ==> c000000000849424:	00 00 bf 8b 	lbz     r29,0(r31)  <== accessing userspace
>> c000000000849428:	10 00 82 41 	beq     c000000000849438 <fault_in_pages_readable+0x118>
>> c00000000084942c:	2c 01 00 4c 	isync
>> c000000000849430:	a6 03 bd 7e 	mtspr   29,r21  <== clearing userspace access permission
>> c000000000849434:	2c 01 00 4c 	isync
>>
>> My first guess is that the problem is linked to the following function, see the comment
>>
>> /*
>>   * For kernel thread that doesn't have thread.regs return
>>   * default AMR/IAMR values.
>>   */
>> static inline u64 current_thread_amr(void)
>> {
>> 	if (current->thread.regs)
>> 		return current->thread.regs->amr;
>> 	return AMR_KUAP_BLOCKED;
>> }
>>
>> Above function was introduced by commit 48a8ab4eeb82 ("powerpc/book3s64/pkeys: Don't update SPRN_AMR 
>> when in kernel mode")
> 
> Yeah that's a bit of a curly one.
> 
> At some point io_uring did kthread_use_mm(), which is supposed to mean
> the kthread can operate on behalf of the original process that submitted
> the IO.
> 
> But because KUAP is implemented using memory protection keys, it depends
> on the value of the AMR register, which is not part of the mm, it's in
> thread.regs->amr.
> 
> And what's worse by the time we're in kthread_use_mm() we no longer have
> access to the thread.regs->amr of the original process that submitted
> the IO.
> 
> We also can't simply move the AMR into the mm, precisely because it's
> per thread, not per mm.
> 
> So TBH I don't know how we're going to fix this.
> 
> I guess we could return AMR=unblocked for kernel threads, but that's
> arguably a bug because it allows a process to circumvent memory keys by
> asking the kernel to do the access.

We shouldn't need to inherit AMR should we? We only need it to be locked 
for kernel threads until it's explicitly unlocked -- nothing mm specific 
there. I think current_thread_amr could return 0 for kernel threads? Or
I would even avoid using that function for allow_user_access and open
code the kthread case and remove it from current_thread_amr().

Thanks,
Nick

^ permalink raw reply

* Re: [PATCH v2] tpm: ibmvtpm: fix error return code in tpm_ibmvtpm_probe()
From: Jarkko Sakkinen @ 2021-01-30 21:14 UTC (permalink / raw)
  To: Stefan Berger, Stefan Berger
  Cc: Wang Hai, linux-kernel, Hulk Robot, paulus, linux-integrity,
	linuxppc-dev
In-Reply-To: <262c3095-d4c0-f124-c8bc-e99685cbe572@linux.ibm.com>

On Fri, 2021-01-29 at 13:57 -0500, Stefan Berger wrote:
> On 1/29/21 12:35 PM, Jarkko Sakkinen wrote:
> > On Mon, Jan 25, 2021 at 08:47:53PM -0500, Stefan Berger wrote:
> > > From: Stefan Berger <stefanb@linux.ibm.com>
> > > 
> > > Return error code -ETIMEDOUT rather than '0' when waiting for the
> > > rtce_buf to be set has timed out.
> > > 
> > > Fixes: d8d74ea3c002 ("tpm: ibmvtpm: Wait for buffer to be set before proceeding")
> > > Reported-by: Hulk Robot <hulkci@huawei.com>
> > > Signed-off-by: Wang Hai <wanghai38@huawei.com>
> > > Signed-off-by: Stefan Berger <stefanb@linux.ibm.com>
> > > ---
> > Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
> > 
> > Thanks! Should I add
> > 
> > Cc: stable@vger.kernel.org to this?
> 
> Yes, that would be good! Thank you!


OK, it's applied.


>     Stefan

/Jarkko

^ permalink raw reply

* [PATCH 1/2] powerpc/Kconfig: Fix unmet direct dependency on NET
From: Florian Fainelli @ 2021-01-30 22:55 UTC (permalink / raw)
  To: linux-kernel, arndb
  Cc: Christophe Leroy, Florian Fainelli, kbuild-all, kernel test robot,
	Arnd Bergmann, Michal Simek, Paul Mackerras,
	open list:LINUX FOR POWERPC EMBEDDED PPC4XX

The kbuild test robot was able to generate a configuration where
ETHERNET and NETDEVICES was selected by the Akenobo platform but not
NET, which resulted in various build failures and these Kconfig
warnings:

WARNING: unmet direct dependencies detected for NETDEVICES
  Depends on [n]: NET [=n]
  Selected by [y]:
  - AKEBONO [=y] && PPC_47x [=y]

WARNING: unmet direct dependencies detected for ETHERNET
  Depends on [n]: NETDEVICES [=y] && NET [=n]
  Selected by [y]:
  - AKEBONO [=y] && PPC_47x [=y]

Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
 arch/powerpc/platforms/44x/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/platforms/44x/Kconfig b/arch/powerpc/platforms/44x/Kconfig
index 78ac6d67a935..68bd647c878f 100644
--- a/arch/powerpc/platforms/44x/Kconfig
+++ b/arch/powerpc/platforms/44x/Kconfig
@@ -206,6 +206,7 @@ config AKEBONO
 	select PPC4xx_HSTA_MSI
 	select I2C
 	select I2C_IBM_IIC
+	select NET
 	select NETDEVICES
 	select ETHERNET
 	select NET_VENDOR_IBM
-- 
2.25.1


^ permalink raw reply related

* Re: [PATCH] powerpc: fix AKEBONO build failures
From: Randy Dunlap @ 2021-01-30 23:21 UTC (permalink / raw)
  To: Michael Ellerman, Yury Norov, linuxppc-dev,
	Linux Kernel Mailing List
  Cc: Paul Mackerras
In-Reply-To: <875z3prcwg.fsf@mpe.ellerman.id.au>

On 1/21/21 5:14 PM, Michael Ellerman wrote:
> Randy Dunlap <rdunlap@infradead.org> writes:
>> On 1/20/21 1:29 PM, Yury Norov wrote:
>>> Hi all,
>>>
>>> I found the power pc build broken on today's
>>> linux-next (647060f3b592).
>>
>> Darn, I was building linux-5.11-rc4.
>>
>> I'll try linux-next after I send this.
>>
>> ---
>> From: Randy Dunlap <rdunlap@infradead.org>
>>
>> Fulfill AKEBONO Kconfig requirements.
>>
>> Fixes these Kconfig warnings (and more) and fixes the subsequent
>> build errors:
>>
>> WARNING: unmet direct dependencies detected for NETDEVICES
>>    Depends on [n]: NET [=n]
>>    Selected by [y]:
>>    - AKEBONO [=y] && PPC_47x [=y]
>>
>> WARNING: unmet direct dependencies detected for MMC_SDHCI
>>    Depends on [n]: MMC [=n] && HAS_DMA [=y]
>>    Selected by [y]:
>>    - AKEBONO [=y] && PPC_47x [=y]
>>
>> Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
>> Cc: Michael Ellerman <mpe@ellerman.id.au>
>> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
>> Cc: Paul Mackerras <paulus@samba.org>
>> Cc: linuxppc-dev@lists.ozlabs.org
>> Cc: Yury Norov <yury.norov@gmail.com>
>> ---
>>   arch/powerpc/platforms/44x/Kconfig |    2 ++
>>   1 file changed, 2 insertions(+)
>>
>> --- lnx-511-rc4.orig/arch/powerpc/platforms/44x/Kconfig
>> +++ lnx-511-rc4/arch/powerpc/platforms/44x/Kconfig
>> @@ -206,6 +206,7 @@ config AKEBONO
>>   	select PPC4xx_HSTA_MSI
>>   	select I2C
>>   	select I2C_IBM_IIC
>> +	select NET
>>   	select NETDEVICES
>>   	select ETHERNET
>>   	select NET_VENDOR_IBM
> 
> I think the problem here is too much use of select, for things that
> should instead be in the defconfig.
> 
> The patch below results in the same result for make
> 44x/akebono_defconfig. Does it fix the original issue?

Hi Michael,
Sorry for the delay.

Changing the akebono_defconfig doesn't cause the missing symbols
to be set -- the defconfig is not being used here.

I guess that if you have users who set CONFIG_AKEBONO and expect
it to build cleanly, you will need something like my patch or the
patch that Florian just posted.

Changing the akebono_defconfig also would not help 'make randconfig'
builds to build cleanly if they had happened to enable AKEBONO.


> We don't need to add ETHERNET or NET_VENDOR_IBM to the defconfig because
> they're both default y.
> 
> cheers


^ permalink raw reply

* [PATCH] cxl: Simplify bool conversion
From: Yang Li @ 2021-01-29  8:25 UTC (permalink / raw)
  To: fbarrat; +Cc: ajd, arnd, gregkh, linux-kernel, Yang Li, linuxppc-dev

Fix the following coccicheck warning:
./drivers/misc/cxl/sysfs.c:181:48-53: WARNING: conversion to bool not
needed here

Reported-by: Abaci Robot <abaci@linux.alibaba.com>
Signed-off-by: Yang Li <yang.lee@linux.alibaba.com>
---
 drivers/misc/cxl/sysfs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/misc/cxl/sysfs.c b/drivers/misc/cxl/sysfs.c
index d97a243..c173a5e 100644
--- a/drivers/misc/cxl/sysfs.c
+++ b/drivers/misc/cxl/sysfs.c
@@ -178,7 +178,7 @@ static ssize_t perst_reloads_same_image_store(struct device *device,
 	if ((rc != 1) || !(val == 1 || val == 0))
 		return -EINVAL;
 
-	adapter->perst_same_image = (val == 1 ? true : false);
+	adapter->perst_same_image = (val == 1);
 	return count;
 }
 
-- 
1.8.3.1


^ permalink raw reply related

* [RFC 00/20] TLB batching consolidation and enhancements
From: Nadav Amit @ 2021-01-31  0:11 UTC (permalink / raw)
  To: linux-mm, linux-kernel
  Cc: Andrea Arcangeli, linux-s390, x86, Yu Zhao, Will Deacon,
	Peter Zijlstra, Mel Gorman, Dave Hansen, linux-csky, Nadav Amit,
	Nick Piggin, Andy Lutomirski, Andrew Morton, linuxppc-dev,
	Thomas Gleixner

From: Nadav Amit <namit@vmware.com>

There are currently (at least?) 5 different TLB batching schemes in the
kernel:

1. Using mmu_gather (e.g., zap_page_range()).

2. Using {inc|dec}_tlb_flush_pending() to inform other threads on the
   ongoing deferred TLB flush and flushing the entire range eventually
   (e.g., change_protection_range()).

3. arch_{enter|leave}_lazy_mmu_mode() for sparc and powerpc (and Xen?).

4. Batching per-table flushes (move_ptes()).

5. By setting a flag on that a deferred TLB flush operation takes place,
   flushing when (try_to_unmap_one() on x86).

It seems that (1)-(4) can be consolidated. In addition, it seems that
(5) is racy. It also seems there can be many redundant TLB flushes, and
potentially TLB-shootdown storms, for instance during batched
reclamation (using try_to_unmap_one()) if at the same time mmu_gather
defers TLB flushes.

More aggressive TLB batching may be possible, but this patch-set does
not add such batching. The proposed changes would enable such batching
in a later time.

Admittedly, I do not understand how things are not broken today, which
frightens me to make further batching before getting things in order.
For instance, why is ok for zap_pte_range() to batch dirty-PTE flushes
for each page-table (but not in greater granularity). Can't
ClearPageDirty() be called before the flush, causing writes after
ClearPageDirty() and before the flush to be lost?

This patch-set therefore performs the following changes:

1. Change mprotect, task_mmu and mapping_dirty_helpers to use mmu_gather
   instead of {inc|dec}_tlb_flush_pending().

2. Avoid TLB flushes if PTE permission is not demoted.

3. Cleans up mmu_gather to be less arch-dependant.

4. Uses mm's generations to track in finer granularity, either per-VMA
   or per page-table, whether a pending mmu_gather operation is
   outstanding. This should allow to avoid some TLB flushes when KSM or
   memory reclamation takes place while another operation such as
   munmap() or mprotect() is running.

5. Changes try_to_unmap_one() flushing scheme, as the current seems
   broken to track in a bitmap which CPUs have outstanding TLB flushes
   instead of having a flag.

Further optimizations are possible, such as changing move_ptes() to use
mmu_gather.

The patches were very very lightly tested. I am looking forward for your
feedback regarding the overall approaches, and whether to split them
into multiple patch-sets.

Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: linux-csky@vger.kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
Cc: linux-s390@vger.kernel.org
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Nick Piggin <npiggin@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will@kernel.org>
Cc: x86@kernel.org
Cc: Yu Zhao <yuzhao@google.com>

Nadav Amit (20):
  mm/tlb: fix fullmm semantics
  mm/mprotect: use mmu_gather
  mm/mprotect: do not flush on permission promotion
  mm/mapping_dirty_helpers: use mmu_gather
  mm/tlb: move BATCHED_UNMAP_TLB_FLUSH to tlb.h
  fs/task_mmu: use mmu_gather interface of clear-soft-dirty
  mm: move x86 tlb_gen to generic code
  mm: store completed TLB generation
  mm: create pte/pmd_tlb_flush_pending()
  mm: add pte_to_page()
  mm/tlb: remove arch-specific tlb_start/end_vma()
  mm/tlb: save the VMA that is flushed during tlb_start_vma()
  mm/tlb: introduce tlb_start_ptes() and tlb_end_ptes()
  mm: move inc/dec_tlb_flush_pending() to mmu_gather.c
  mm: detect deferred TLB flushes in vma granularity
  mm/tlb: per-page table generation tracking
  mm/tlb: updated completed deferred TLB flush conditionally
  mm: make mm_cpumask() volatile
  lib/cpumask: introduce cpumask_atomic_or()
  mm/rmap: avoid potential races

 arch/arm/include/asm/bitops.h         |   4 +-
 arch/arm/include/asm/pgtable.h        |   4 +-
 arch/arm64/include/asm/pgtable.h      |   4 +-
 arch/csky/Kconfig                     |   1 +
 arch/csky/include/asm/tlb.h           |  12 --
 arch/powerpc/Kconfig                  |   1 +
 arch/powerpc/include/asm/tlb.h        |   2 -
 arch/s390/Kconfig                     |   1 +
 arch/s390/include/asm/tlb.h           |   3 -
 arch/sparc/Kconfig                    |   1 +
 arch/sparc/include/asm/pgtable_64.h   |   9 +-
 arch/sparc/include/asm/tlb_64.h       |   2 -
 arch/sparc/mm/init_64.c               |   2 +-
 arch/x86/Kconfig                      |   3 +
 arch/x86/hyperv/mmu.c                 |   2 +-
 arch/x86/include/asm/mmu.h            |  10 -
 arch/x86/include/asm/mmu_context.h    |   1 -
 arch/x86/include/asm/paravirt_types.h |   2 +-
 arch/x86/include/asm/pgtable.h        |  24 +--
 arch/x86/include/asm/tlb.h            |  21 +-
 arch/x86/include/asm/tlbbatch.h       |  15 --
 arch/x86/include/asm/tlbflush.h       |  61 ++++--
 arch/x86/mm/tlb.c                     |  52 +++--
 arch/x86/xen/mmu_pv.c                 |   2 +-
 drivers/firmware/efi/efi.c            |   1 +
 fs/proc/task_mmu.c                    |  29 ++-
 include/asm-generic/bitops/find.h     |   8 +-
 include/asm-generic/tlb.h             | 291 +++++++++++++++++++++-----
 include/linux/bitmap.h                |  21 +-
 include/linux/cpumask.h               |  40 ++--
 include/linux/huge_mm.h               |   3 +-
 include/linux/mm.h                    |  29 ++-
 include/linux/mm_types.h              | 166 ++++++++++-----
 include/linux/mm_types_task.h         |  13 --
 include/linux/pgtable.h               |   2 +-
 include/linux/smp.h                   |   6 +-
 init/Kconfig                          |  21 ++
 kernel/fork.c                         |   2 +
 kernel/smp.c                          |   8 +-
 lib/bitmap.c                          |  33 ++-
 lib/cpumask.c                         |   8 +-
 lib/find_bit.c                        |  10 +-
 mm/huge_memory.c                      |   6 +-
 mm/init-mm.c                          |   1 +
 mm/internal.h                         |  16 --
 mm/ksm.c                              |   2 +-
 mm/madvise.c                          |   6 +-
 mm/mapping_dirty_helpers.c            |  52 +++--
 mm/memory.c                           |   2 +
 mm/mmap.c                             |   1 +
 mm/mmu_gather.c                       |  59 +++++-
 mm/mprotect.c                         |  55 ++---
 mm/mremap.c                           |   2 +-
 mm/pgtable-generic.c                  |   2 +-
 mm/rmap.c                             |  42 ++--
 mm/vmscan.c                           |   1 +
 56 files changed, 803 insertions(+), 374 deletions(-)
 delete mode 100644 arch/x86/include/asm/tlbbatch.h

-- 
2.25.1

^ permalink raw reply

* [RFC 11/20] mm/tlb: remove arch-specific tlb_start/end_vma()
From: Nadav Amit @ 2021-01-31  0:11 UTC (permalink / raw)
  To: linux-mm, linux-kernel
  Cc: Andrea Arcangeli, linux-s390, x86, Yu Zhao, Peter Zijlstra,
	linuxppc-dev, Dave Hansen, Nick Piggin, Nadav Amit, linux-csky,
	Andy Lutomirski, Andrew Morton, Will Deacon, Thomas Gleixner
In-Reply-To: <20210131001132.3368247-1-namit@vmware.com>

From: Nadav Amit <namit@vmware.com>

Architecture-specific tlb_start_vma() and tlb_end_vma() seem
unnecessary. They are currently used for:

1. Avoid per-VMA TLB flushes. This can be determined by introducing
   a new config option.

2. Avoid saving information on the vma that is being flushed. Saving
   this information, even for architectures that do not need it, is
   cheap and we will need it for per-VMA deferred TLB flushing.

3. Avoid calling flush_cache_range().

Remove the architecture specific tlb_start_vma() and tlb_end_vma() in
the following manner, corresponding to the previous requirements:

1. Introduce a new config option -
   ARCH_WANT_AGGRESSIVE_TLB_FLUSH_BATCHING - to allow architectures to
   define whether they want aggressive TLB flush batching (instead of
   flushing mappings of each VMA separately).

2. Save information on the vma regardless of architecture. Saving this
   information should have negligible overhead, and they will be
   needed for fine granularity TLB flushes.

3. flush_cache_range() is anyhow not defined for the architectures that
   implement tlb_start/end_vma().

No functional change intended.

Signed-off-by: Nadav Amit <namit@vmware.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will@kernel.org>
Cc: Yu Zhao <yuzhao@google.com>
Cc: Nick Piggin <npiggin@gmail.com>
Cc: linux-csky@vger.kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
Cc: linux-s390@vger.kernel.org
Cc: x86@kernel.org
---
 arch/csky/Kconfig               |  1 +
 arch/csky/include/asm/tlb.h     | 12 ------------
 arch/powerpc/Kconfig            |  1 +
 arch/powerpc/include/asm/tlb.h  |  2 --
 arch/s390/Kconfig               |  1 +
 arch/s390/include/asm/tlb.h     |  3 ---
 arch/sparc/Kconfig              |  1 +
 arch/sparc/include/asm/tlb_64.h |  2 --
 arch/x86/Kconfig                |  1 +
 arch/x86/include/asm/tlb.h      |  3 ---
 include/asm-generic/tlb.h       | 15 +++++----------
 init/Kconfig                    |  8 ++++++++
 12 files changed, 18 insertions(+), 32 deletions(-)

diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig
index 89dd2fcf38fa..924ff5721240 100644
--- a/arch/csky/Kconfig
+++ b/arch/csky/Kconfig
@@ -8,6 +8,7 @@ config CSKY
 	select ARCH_HAS_SYNC_DMA_FOR_DEVICE
 	select ARCH_USE_BUILTIN_BSWAP
 	select ARCH_USE_QUEUED_RWLOCKS if NR_CPUS>2
+	select ARCH_WANT_AGGRESSIVE_TLB_FLUSH_BATCHING
 	select ARCH_WANT_FRAME_POINTERS if !CPU_CK610
 	select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
 	select COMMON_CLK
diff --git a/arch/csky/include/asm/tlb.h b/arch/csky/include/asm/tlb.h
index fdff9b8d70c8..8130a5f09a6b 100644
--- a/arch/csky/include/asm/tlb.h
+++ b/arch/csky/include/asm/tlb.h
@@ -6,18 +6,6 @@
 
 #include <asm/cacheflush.h>
 
-#define tlb_start_vma(tlb, vma) \
-	do { \
-		if (!(tlb)->fullmm) \
-			flush_cache_range(vma, (vma)->vm_start, (vma)->vm_end); \
-	}  while (0)
-
-#define tlb_end_vma(tlb, vma) \
-	do { \
-		if (!(tlb)->fullmm) \
-			flush_tlb_range(vma, (vma)->vm_start, (vma)->vm_end); \
-	}  while (0)
-
 #define tlb_flush(tlb) flush_tlb_mm((tlb)->mm)
 
 #include <asm-generic/tlb.h>
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 107bb4319e0e..d9761b6f192a 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -151,6 +151,7 @@ config PPC
 	select ARCH_USE_CMPXCHG_LOCKREF		if PPC64
 	select ARCH_USE_QUEUED_RWLOCKS		if PPC_QUEUED_SPINLOCKS
 	select ARCH_USE_QUEUED_SPINLOCKS	if PPC_QUEUED_SPINLOCKS
+	select ARCH_WANT_AGGRESSIVE_TLB_FLUSH_BATCHING
 	select ARCH_WANT_IPC_PARSE_VERSION
 	select ARCH_WANT_IRQS_OFF_ACTIVATE_MM
 	select ARCH_WANT_LD_ORPHAN_WARN
diff --git a/arch/powerpc/include/asm/tlb.h b/arch/powerpc/include/asm/tlb.h
index 160422a439aa..880b7daf904e 100644
--- a/arch/powerpc/include/asm/tlb.h
+++ b/arch/powerpc/include/asm/tlb.h
@@ -19,8 +19,6 @@
 
 #include <linux/pagemap.h>
 
-#define tlb_start_vma(tlb, vma)	do { } while (0)
-#define tlb_end_vma(tlb, vma)	do { } while (0)
 #define __tlb_remove_tlb_entry	__tlb_remove_tlb_entry
 
 #define tlb_flush tlb_flush
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index c72874f09741..5b3dc5ca9873 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -113,6 +113,7 @@ config S390
 	select ARCH_USE_BUILTIN_BSWAP
 	select ARCH_USE_CMPXCHG_LOCKREF
 	select ARCH_WANTS_DYNAMIC_TASK_STRUCT
+	select ARCH_WANT_AGGRESSIVE_TLB_FLUSH_BATCHING
 	select ARCH_WANT_DEFAULT_BPF_JIT
 	select ARCH_WANT_IPC_PARSE_VERSION
 	select BUILDTIME_TABLE_SORT
diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h
index 954fa8ca6cbd..03f31d59f97c 100644
--- a/arch/s390/include/asm/tlb.h
+++ b/arch/s390/include/asm/tlb.h
@@ -27,9 +27,6 @@ static inline void tlb_flush(struct mmu_gather *tlb);
 static inline bool __tlb_remove_page_size(struct mmu_gather *tlb,
 					  struct page *page, int page_size);
 
-#define tlb_start_vma(tlb, vma)			do { } while (0)
-#define tlb_end_vma(tlb, vma)			do { } while (0)
-
 #define tlb_flush tlb_flush
 #define pte_free_tlb pte_free_tlb
 #define pmd_free_tlb pmd_free_tlb
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index c9c34dc52b7d..fb46e1b6f177 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -51,6 +51,7 @@ config SPARC
 	select NEED_DMA_MAP_STATE
 	select NEED_SG_DMA_LENGTH
 	select SET_FS
+	select ARCH_WANT_AGGRESSIVE_TLB_FLUSH_BATCHING
 
 config SPARC32
 	def_bool !64BIT
diff --git a/arch/sparc/include/asm/tlb_64.h b/arch/sparc/include/asm/tlb_64.h
index 779a5a0f0608..3037187482db 100644
--- a/arch/sparc/include/asm/tlb_64.h
+++ b/arch/sparc/include/asm/tlb_64.h
@@ -22,8 +22,6 @@ void smp_flush_tlb_mm(struct mm_struct *mm);
 void __flush_tlb_pending(unsigned long, unsigned long, unsigned long *);
 void flush_tlb_pending(void);
 
-#define tlb_start_vma(tlb, vma) do { } while (0)
-#define tlb_end_vma(tlb, vma)	do { } while (0)
 #define tlb_flush(tlb)	flush_tlb_pending()
 
 /*
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 6bd4d626a6b3..d56b0f5cb00c 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -101,6 +101,7 @@ config X86
 	select ARCH_USE_QUEUED_RWLOCKS
 	select ARCH_USE_QUEUED_SPINLOCKS
 	select ARCH_USE_SYM_ANNOTATIONS
+	select ARCH_WANT_AGGRESSIVE_TLB_FLUSH_BATCHING
 	select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
 	select ARCH_WANT_DEFAULT_BPF_JIT	if X86_64
 	select ARCH_WANTS_DYNAMIC_TASK_STRUCT
diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h
index 1bfe979bb9bc..580636cdc257 100644
--- a/arch/x86/include/asm/tlb.h
+++ b/arch/x86/include/asm/tlb.h
@@ -2,9 +2,6 @@
 #ifndef _ASM_X86_TLB_H
 #define _ASM_X86_TLB_H
 
-#define tlb_start_vma(tlb, vma) do { } while (0)
-#define tlb_end_vma(tlb, vma) do { } while (0)
-
 #define tlb_flush tlb_flush
 static inline void tlb_flush(struct mmu_gather *tlb);
 
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 427bfcc6cdec..b97136b7010b 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -334,8 +334,8 @@ static inline void __tlb_reset_range(struct mmu_gather *tlb)
 
 #ifdef CONFIG_MMU_GATHER_NO_RANGE
 
-#if defined(tlb_flush) || defined(tlb_start_vma) || defined(tlb_end_vma)
-#error MMU_GATHER_NO_RANGE relies on default tlb_flush(), tlb_start_vma() and tlb_end_vma()
+#if defined(tlb_flush)
+#error MMU_GATHER_NO_RANGE relies on default tlb_flush()
 #endif
 
 /*
@@ -362,10 +362,6 @@ static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vm
 
 #ifndef tlb_flush
 
-#if defined(tlb_start_vma) || defined(tlb_end_vma)
-#error Default tlb_flush() relies on default tlb_start_vma() and tlb_end_vma()
-#endif
-
 /*
  * When an architecture does not provide its own tlb_flush() implementation
  * but does have a reasonably efficient flush_vma_range() implementation
@@ -486,7 +482,6 @@ static inline unsigned long tlb_get_unmap_size(struct mmu_gather *tlb)
  * case where we're doing a full MM flush.  When we're doing a munmap,
  * the vmas are adjusted to only cover the region to be torn down.
  */
-#ifndef tlb_start_vma
 static inline void tlb_start_vma(struct mmu_gather *tlb, struct vm_area_struct *vma)
 {
 	if (tlb->fullmm)
@@ -495,14 +490,15 @@ static inline void tlb_start_vma(struct mmu_gather *tlb, struct vm_area_struct *
 	tlb_update_vma_flags(tlb, vma);
 	flush_cache_range(vma, vma->vm_start, vma->vm_end);
 }
-#endif
 
-#ifndef tlb_end_vma
 static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma)
 {
 	if (tlb->fullmm)
 		return;
 
+	if (IS_ENABLED(CONFIG_ARCH_WANT_AGGRESSIVE_TLB_FLUSH_BATCHING))
+		return;
+
 	/*
 	 * Do a TLB flush and reset the range at VMA boundaries; this avoids
 	 * the ranges growing with the unused space between consecutive VMAs,
@@ -511,7 +507,6 @@ static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vm
 	 */
 	tlb_flush_mmu_tlbonly(tlb);
 }
-#endif
 
 #ifdef CONFIG_ARCH_HAS_TLB_GENERATIONS
 
diff --git a/init/Kconfig b/init/Kconfig
index 3d11a0f7c8cc..14a599a48738 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -849,6 +849,14 @@ config ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
 config ARCH_HAS_TLB_GENERATIONS
 	bool
 
+#
+# For architectures that prefer to batch TLB flushes aggressively, i.e.,
+# not to flush after changing or removing each VMA. The architecture must
+# provide its own tlb_flush() function.
+config ARCH_WANT_AGGRESSIVE_TLB_FLUSH_BATCHING
+	bool
+	depends on !CONFIG_MMU_GATHER_NO_GATHER
+
 config CC_HAS_INT128
 	def_bool !$(cc-option,$(m64-flag) -D__SIZEOF_INT128__=0) && 64BIT
 
-- 
2.25.1


^ permalink raw reply related

* Re: [RFC 00/20] TLB batching consolidation and enhancements
From: Andy Lutomirski @ 2021-01-31  0:39 UTC (permalink / raw)
  To: Nadav Amit
  Cc: Andrea Arcangeli, linux-s390, X86 ML, Yu Zhao, Will Deacon,
	Peter Zijlstra, Mel Gorman, Dave Hansen, LKML, linux-csky,
	Linux-MM, Nadav Amit, Nick Piggin, Andy Lutomirski, Andrew Morton,
	linuxppc-dev, Thomas Gleixner
In-Reply-To: <20210131001132.3368247-1-namit@vmware.com>

On Sat, Jan 30, 2021 at 4:16 PM Nadav Amit <nadav.amit@gmail.com> wrote:
>
> From: Nadav Amit <namit@vmware.com>
>
> There are currently (at least?) 5 different TLB batching schemes in the
> kernel:
>
> 1. Using mmu_gather (e.g., zap_page_range()).
>
> 2. Using {inc|dec}_tlb_flush_pending() to inform other threads on the
>    ongoing deferred TLB flush and flushing the entire range eventually
>    (e.g., change_protection_range()).
>
> 3. arch_{enter|leave}_lazy_mmu_mode() for sparc and powerpc (and Xen?).
>
> 4. Batching per-table flushes (move_ptes()).
>
> 5. By setting a flag on that a deferred TLB flush operation takes place,
>    flushing when (try_to_unmap_one() on x86).

Are you referring to the arch_tlbbatch_add_mm/flush mechanism?

^ permalink raw reply

* Re: [RFC 00/20] TLB batching consolidation and enhancements
From: Nadav Amit @ 2021-01-31  1:08 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Andrea Arcangeli, linux-s390, X86 ML, Yu Zhao, Will Deacon,
	Peter Zijlstra, Mel Gorman, Dave Hansen, LKML,
	linux-csky@vger.kernel.org, Linux-MM, Nick Piggin, Andrew Morton,
	linuxppc-dev, Thomas Gleixner
In-Reply-To: <CALCETrUUe_iijSoTYMjibqxUveaYrG3sVTWawbi3HWCjx_ySYg@mail.gmail.com>

> On Jan 30, 2021, at 4:39 PM, Andy Lutomirski <luto@kernel.org> wrote:
> 
> On Sat, Jan 30, 2021 at 4:16 PM Nadav Amit <nadav.amit@gmail.com> wrote:
>> From: Nadav Amit <namit@vmware.com>
>> 
>> There are currently (at least?) 5 different TLB batching schemes in the
>> kernel:
>> 
>> 1. Using mmu_gather (e.g., zap_page_range()).
>> 
>> 2. Using {inc|dec}_tlb_flush_pending() to inform other threads on the
>>   ongoing deferred TLB flush and flushing the entire range eventually
>>   (e.g., change_protection_range()).
>> 
>> 3. arch_{enter|leave}_lazy_mmu_mode() for sparc and powerpc (and Xen?).
>> 
>> 4. Batching per-table flushes (move_ptes()).
>> 
>> 5. By setting a flag on that a deferred TLB flush operation takes place,
>>   flushing when (try_to_unmap_one() on x86).
> 
> Are you referring to the arch_tlbbatch_add_mm/flush mechanism?

Yes.

^ permalink raw reply

* Re: [RFC 00/20] TLB batching consolidation and enhancements
From: Nicholas Piggin @ 2021-01-31  3:30 UTC (permalink / raw)
  To: linux-kernel, linux-mm, Nadav Amit
  Cc: Andrea Arcangeli, linux-s390, x86, Yu Zhao, Will Deacon,
	Peter Zijlstra, Mel Gorman, Dave Hansen, linux-csky, Nadav Amit,
	Andy Lutomirski, Andrew Morton, linuxppc-dev, Thomas Gleixner
In-Reply-To: <20210131001132.3368247-1-namit@vmware.com>

Excerpts from Nadav Amit's message of January 31, 2021 10:11 am:
> From: Nadav Amit <namit@vmware.com>
> 
> There are currently (at least?) 5 different TLB batching schemes in the
> kernel:
> 
> 1. Using mmu_gather (e.g., zap_page_range()).
> 
> 2. Using {inc|dec}_tlb_flush_pending() to inform other threads on the
>    ongoing deferred TLB flush and flushing the entire range eventually
>    (e.g., change_protection_range()).
> 
> 3. arch_{enter|leave}_lazy_mmu_mode() for sparc and powerpc (and Xen?).
> 
> 4. Batching per-table flushes (move_ptes()).
> 
> 5. By setting a flag on that a deferred TLB flush operation takes place,
>    flushing when (try_to_unmap_one() on x86).
> 
> It seems that (1)-(4) can be consolidated. In addition, it seems that
> (5) is racy. It also seems there can be many redundant TLB flushes, and
> potentially TLB-shootdown storms, for instance during batched
> reclamation (using try_to_unmap_one()) if at the same time mmu_gather
> defers TLB flushes.
> 
> More aggressive TLB batching may be possible, but this patch-set does
> not add such batching. The proposed changes would enable such batching
> in a later time.
> 
> Admittedly, I do not understand how things are not broken today, which
> frightens me to make further batching before getting things in order.
> For instance, why is ok for zap_pte_range() to batch dirty-PTE flushes
> for each page-table (but not in greater granularity). Can't
> ClearPageDirty() be called before the flush, causing writes after
> ClearPageDirty() and before the flush to be lost?

Because it's holding the page table lock which stops page_mkclean from 
cleaning the page. Or am I misunderstanding the question?

I'll go through the patches a bit more closely when they all come 
through. Sparc and powerpc of course need the arch lazy mode to get 
per-page/pte information for operations that are not freeing pages, 
which is what mmu gather is designed for.

I wouldn't mind using a similar API so it's less of a black box when 
reading generic code, but it might not quite fit the mmu gather API
exactly (most of these paths don't want a full mmu_gather on stack).

> 
> This patch-set therefore performs the following changes:
> 
> 1. Change mprotect, task_mmu and mapping_dirty_helpers to use mmu_gather
>    instead of {inc|dec}_tlb_flush_pending().
> 
> 2. Avoid TLB flushes if PTE permission is not demoted.
> 
> 3. Cleans up mmu_gather to be less arch-dependant.
> 
> 4. Uses mm's generations to track in finer granularity, either per-VMA
>    or per page-table, whether a pending mmu_gather operation is
>    outstanding. This should allow to avoid some TLB flushes when KSM or
>    memory reclamation takes place while another operation such as
>    munmap() or mprotect() is running.
> 
> 5. Changes try_to_unmap_one() flushing scheme, as the current seems
>    broken to track in a bitmap which CPUs have outstanding TLB flushes
>    instead of having a flag.

Putting fixes first, and cleanups and independent patches (like #2) next
would help with getting stuff merged and backported.

Thanks,
Nick

^ permalink raw reply

* Re: [RFC 00/20] TLB batching consolidation and enhancements
From: Nadav Amit @ 2021-01-31  7:57 UTC (permalink / raw)
  To: Nicholas Piggin
  Cc: Andrea Arcangeli, linux-s390, X86 ML, Yu Zhao, Will Deacon,
	Peter Zijlstra, Mel Gorman, Dave Hansen, LKML,
	linux-csky@vger.kernel.org, Linux-MM, Andy Lutomirski,
	Andrew Morton, linuxppc-dev, Thomas Gleixner
In-Reply-To: <1612063149.2awdsvvmhj.astroid@bobo.none>

> On Jan 30, 2021, at 7:30 PM, Nicholas Piggin <npiggin@gmail.com> wrote:
> 
> Excerpts from Nadav Amit's message of January 31, 2021 10:11 am:
>> From: Nadav Amit <namit@vmware.com>
>> 
>> There are currently (at least?) 5 different TLB batching schemes in the
>> kernel:
>> 
>> 1. Using mmu_gather (e.g., zap_page_range()).
>> 
>> 2. Using {inc|dec}_tlb_flush_pending() to inform other threads on the
>>   ongoing deferred TLB flush and flushing the entire range eventually
>>   (e.g., change_protection_range()).
>> 
>> 3. arch_{enter|leave}_lazy_mmu_mode() for sparc and powerpc (and Xen?).
>> 
>> 4. Batching per-table flushes (move_ptes()).
>> 
>> 5. By setting a flag on that a deferred TLB flush operation takes place,
>>   flushing when (try_to_unmap_one() on x86).
>> 
>> It seems that (1)-(4) can be consolidated. In addition, it seems that
>> (5) is racy. It also seems there can be many redundant TLB flushes, and
>> potentially TLB-shootdown storms, for instance during batched
>> reclamation (using try_to_unmap_one()) if at the same time mmu_gather
>> defers TLB flushes.
>> 
>> More aggressive TLB batching may be possible, but this patch-set does
>> not add such batching. The proposed changes would enable such batching
>> in a later time.
>> 
>> Admittedly, I do not understand how things are not broken today, which
>> frightens me to make further batching before getting things in order.
>> For instance, why is ok for zap_pte_range() to batch dirty-PTE flushes
>> for each page-table (but not in greater granularity). Can't
>> ClearPageDirty() be called before the flush, causing writes after
>> ClearPageDirty() and before the flush to be lost?
> 
> Because it's holding the page table lock which stops page_mkclean from 
> cleaning the page. Or am I misunderstanding the question?

Thanks. I understood this part. Looking again at the code, I now understand
my confusion: I forgot that the reverse mapping is removed after the PTE is
zapped.

Makes me wonder whether it is ok to defer the TLB flush to tlb_finish_mmu(),
by performing set_page_dirty() for the batched pages when needed in
tlb_finish_mmu() [ i.e., by marking for each batched page whether
set_page_dirty() should be issued for that page while collecting them ].

> I'll go through the patches a bit more closely when they all come 
> through. Sparc and powerpc of course need the arch lazy mode to get 
> per-page/pte information for operations that are not freeing pages, 
> which is what mmu gather is designed for.

IIUC you mean any PTE change requires a TLB flush. Even setting up a new PTE
where no previous PTE was set, right?

> I wouldn't mind using a similar API so it's less of a black box when 
> reading generic code, but it might not quite fit the mmu gather API
> exactly (most of these paths don't want a full mmu_gather on stack).

I see your point. It may be possible to create two mmu_gather structs: a
small one that only holds the flush information and another that also holds
the pages. 

>> This patch-set therefore performs the following changes:
>> 
>> 1. Change mprotect, task_mmu and mapping_dirty_helpers to use mmu_gather
>>   instead of {inc|dec}_tlb_flush_pending().
>> 
>> 2. Avoid TLB flushes if PTE permission is not demoted.
>> 
>> 3. Cleans up mmu_gather to be less arch-dependant.
>> 
>> 4. Uses mm's generations to track in finer granularity, either per-VMA
>>   or per page-table, whether a pending mmu_gather operation is
>>   outstanding. This should allow to avoid some TLB flushes when KSM or
>>   memory reclamation takes place while another operation such as
>>   munmap() or mprotect() is running.
>> 
>> 5. Changes try_to_unmap_one() flushing scheme, as the current seems
>>   broken to track in a bitmap which CPUs have outstanding TLB flushes
>>   instead of having a flag.
> 
> Putting fixes first, and cleanups and independent patches (like #2) next
> would help with getting stuff merged and backported.

I tried to do it mostly this way. There are some theoretical races which
I did not manage (or try hard enough) to create, so I did not include in
the “fixes” section. I will restructure the patch-set according to the
feedback.

Thanks,
Nadav

^ permalink raw reply

* Re: [RFC 00/20] TLB batching consolidation and enhancements
From: Nadav Amit @ 2021-01-31  8:14 UTC (permalink / raw)
  To: Nadav Amit
  Cc: Andrea Arcangeli, linux-s390, X86 ML, Yu Zhao, Will Deacon,
	Peter Zijlstra, Mel Gorman, Dave Hansen, LKML, Nicholas Piggin,
	Linux-MM, linux-csky@vger.kernel.org, Andy Lutomirski,
	Andrew Morton, linuxppc-dev, Thomas Gleixner
In-Reply-To: <A1589669-34AE-4E15-8358-79BAD7C72520@vmware.com>

> On Jan 30, 2021, at 11:57 PM, Nadav Amit <namit@vmware.com> wrote:
> 
>> On Jan 30, 2021, at 7:30 PM, Nicholas Piggin <npiggin@gmail.com> wrote:
>> 
>> Excerpts from Nadav Amit's message of January 31, 2021 10:11 am:
>>> From: Nadav Amit <namit@vmware.com>
>>> 
>>> There are currently (at least?) 5 different TLB batching schemes in the
>>> kernel:
>>> 
>>> 1. Using mmu_gather (e.g., zap_page_range()).
>>> 
>>> 2. Using {inc|dec}_tlb_flush_pending() to inform other threads on the
>>>  ongoing deferred TLB flush and flushing the entire range eventually
>>>  (e.g., change_protection_range()).
>>> 
>>> 3. arch_{enter|leave}_lazy_mmu_mode() for sparc and powerpc (and Xen?).
>>> 
>>> 4. Batching per-table flushes (move_ptes()).
>>> 
>>> 5. By setting a flag on that a deferred TLB flush operation takes place,
>>>  flushing when (try_to_unmap_one() on x86).
>>> 
>>> It seems that (1)-(4) can be consolidated. In addition, it seems that
>>> (5) is racy. It also seems there can be many redundant TLB flushes, and
>>> potentially TLB-shootdown storms, for instance during batched
>>> reclamation (using try_to_unmap_one()) if at the same time mmu_gather
>>> defers TLB flushes.
>>> 
>>> More aggressive TLB batching may be possible, but this patch-set does
>>> not add such batching. The proposed changes would enable such batching
>>> in a later time.
>>> 
>>> Admittedly, I do not understand how things are not broken today, which
>>> frightens me to make further batching before getting things in order.
>>> For instance, why is ok for zap_pte_range() to batch dirty-PTE flushes
>>> for each page-table (but not in greater granularity). Can't
>>> ClearPageDirty() be called before the flush, causing writes after
>>> ClearPageDirty() and before the flush to be lost?
>> 
>> Because it's holding the page table lock which stops page_mkclean from 
>> cleaning the page. Or am I misunderstanding the question?
> 
> Thanks. I understood this part. Looking again at the code, I now understand
> my confusion: I forgot that the reverse mapping is removed after the PTE is
> zapped.
> 
> Makes me wonder whether it is ok to defer the TLB flush to tlb_finish_mmu(),
> by performing set_page_dirty() for the batched pages when needed in
> tlb_finish_mmu() [ i.e., by marking for each batched page whether
> set_page_dirty() should be issued for that page while collecting them ].

Correcting myself (I hope): no we cannot do so, since the buffers might be
remove from the page at that point.


^ permalink raw reply

* Re: [GIT PULL] Please pull powerpc/linux.git powerpc-5.11-6 tag
From: Linus Torvalds @ 2021-01-31 19:38 UTC (permalink / raw)
  To: Michael Ellerman; +Cc: linuxppc-dev, Linux Kernel Mailing List, Nick Piggin
In-Reply-To: <87lfc9cnuw.fsf@mpe.ellerman.id.au>

On Sun, Jan 31, 2021 at 4:06 AM Michael Ellerman <mpe@ellerman.id.au> wrote:
>
> Please pull another powerpc fix for 5.11:

Manual pr-tracker-bot says thanks,

           Linus

^ permalink raw reply

* Re: [PATCH v7 19/42] powerpc/perf: move perf irq/nmi handling details into traps.c
From: Athira Rajeev @ 2021-01-31 12:30 UTC (permalink / raw)
  To: Nicholas Piggin; +Cc: linuxppc-dev
In-Reply-To: <20210130130852.2952424-20-npiggin@gmail.com>

[-- Attachment #1: Type: text/html, Size: 13382 bytes --]

^ permalink raw reply

* Re: [PATCH] powerpc: fix AKEBONO build failures
From: Michael Ellerman @ 2021-02-01  1:24 UTC (permalink / raw)
  To: Randy Dunlap, Yury Norov, linuxppc-dev, Linux Kernel Mailing List
  Cc: Paul Mackerras
In-Reply-To: <4572579a-7208-628d-cbe2-b70a74a84ae7@infradead.org>

Randy Dunlap <rdunlap@infradead.org> writes:
> On 1/21/21 5:14 PM, Michael Ellerman wrote:
>> Randy Dunlap <rdunlap@infradead.org> writes:
>>> On 1/20/21 1:29 PM, Yury Norov wrote:
>>>> Hi all,
>>>>
>>>> I found the power pc build broken on today's
>>>> linux-next (647060f3b592).
>>>
>>> Darn, I was building linux-5.11-rc4.
>>>
>>> I'll try linux-next after I send this.
>>>
>>> ---
>>> From: Randy Dunlap <rdunlap@infradead.org>
>>>
>>> Fulfill AKEBONO Kconfig requirements.
>>>
>>> Fixes these Kconfig warnings (and more) and fixes the subsequent
>>> build errors:
>>>
>>> WARNING: unmet direct dependencies detected for NETDEVICES
>>>    Depends on [n]: NET [=n]
>>>    Selected by [y]:
>>>    - AKEBONO [=y] && PPC_47x [=y]
>>>
>>> WARNING: unmet direct dependencies detected for MMC_SDHCI
>>>    Depends on [n]: MMC [=n] && HAS_DMA [=y]
>>>    Selected by [y]:
>>>    - AKEBONO [=y] && PPC_47x [=y]
>>>
>>> Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
>>> Cc: Michael Ellerman <mpe@ellerman.id.au>
>>> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
>>> Cc: Paul Mackerras <paulus@samba.org>
>>> Cc: linuxppc-dev@lists.ozlabs.org
>>> Cc: Yury Norov <yury.norov@gmail.com>
>>> ---
>>>   arch/powerpc/platforms/44x/Kconfig |    2 ++
>>>   1 file changed, 2 insertions(+)
>>>
>>> --- lnx-511-rc4.orig/arch/powerpc/platforms/44x/Kconfig
>>> +++ lnx-511-rc4/arch/powerpc/platforms/44x/Kconfig
>>> @@ -206,6 +206,7 @@ config AKEBONO
>>>   	select PPC4xx_HSTA_MSI
>>>   	select I2C
>>>   	select I2C_IBM_IIC
>>> +	select NET
>>>   	select NETDEVICES
>>>   	select ETHERNET
>>>   	select NET_VENDOR_IBM
>> 
>> I think the problem here is too much use of select, for things that
>> should instead be in the defconfig.
>> 
>> The patch below results in the same result for make
>> 44x/akebono_defconfig. Does it fix the original issue?
>
> Hi Michael,
> Sorry for the delay.
>
> Changing the akebono_defconfig doesn't cause the missing symbols
> to be set -- the defconfig is not being used here.

Yep, but that's OK. None of those selected symbols are hard dependencies
of AKEBONO, they're just things you probably want in your kernel to
actually boot on an akebono board.

> I guess that if you have users who set CONFIG_AKEBONO and expect
> it to build cleanly, you will need something like my patch or the
> patch that Florian just posted.

It will build cleanly, it just won't necessarily boot on a real board.
Users who enable AKEBONO manually need to know what they're doing, or
they should just use the defconfig.

> Changing the akebono_defconfig also would not help 'make randconfig'
> builds to build cleanly if they had happened to enable AKEBONO.

Changing the defconfig doesn't help randconfig, but dropping the selects
does.

Anyway I'll send a proper version of my patch, which I'm pretty
confident will fix all the issues.

cheers

^ permalink raw reply

* [PATCH] powerpc/akebono: Fix unmet dependency errors
From: Michael Ellerman @ 2021-02-01  1:25 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: f.fainelli, rdunlap, yury.norov

The AKEBONO config has various selects under it, including some with
user-selectable dependencies, which means those dependencies can be
disabled. This leads to warnings from Kconfig.

This can be seen with eg:

  $ make allnoconfig
  $ ./scripts/config --file build~/.config -k -e CONFIG_44x -k -e CONFIG_PPC_47x -e CONFIG_AKEBONO
  $ make olddefconfig

  WARNING: unmet direct dependencies detected for ATA
    Depends on [n]: HAS_IOMEM [=y] && BLOCK [=n]
    Selected by [y]:
    - AKEBONO [=y] && PPC_47x [=y]

  WARNING: unmet direct dependencies detected for NETDEVICES
    Depends on [n]: NET [=n]
    Selected by [y]:
    - AKEBONO [=y] && PPC_47x [=y]

  WARNING: unmet direct dependencies detected for ETHERNET
    Depends on [n]: NETDEVICES [=y] && NET [=n]
    Selected by [y]:
    - AKEBONO [=y] && PPC_47x [=y]

  WARNING: unmet direct dependencies detected for MMC_SDHCI
    Depends on [n]: MMC [=n] && HAS_DMA [=y]
    Selected by [y]:
    - AKEBONO [=y] && PPC_47x [=y]

  WARNING: unmet direct dependencies detected for MMC_SDHCI_PLTFM
    Depends on [n]: MMC [=n] && MMC_SDHCI [=y]
    Selected by [y]:
    - AKEBONO [=y] && PPC_47x [=y]

The problem is that AKEBONO is using select to enable things that are
not true dependencies, but rather things you probably want enabled in
an AKEBONO kernel. That is what a defconfig is for.

So drop those selects and instead move those symbols into the
defconfig. This fixes all the kconfig warnings, and the result of make
44x/akebono_defconfig is the same before and after the patch.

Reported-by: Yury Norov <yury.norov@gmail.com>
Reported-by: Randy Dunlap <rdunlap@infradead.org>
Reported-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/configs/44x/akebono_defconfig | 5 +++++
 arch/powerpc/platforms/44x/Kconfig         | 7 -------
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/configs/44x/akebono_defconfig b/arch/powerpc/configs/44x/akebono_defconfig
index 3894ba8f8ffc..ae9b7beefdd6 100644
--- a/arch/powerpc/configs/44x/akebono_defconfig
+++ b/arch/powerpc/configs/44x/akebono_defconfig
@@ -21,6 +21,7 @@ CONFIG_IRQ_ALL_CPUS=y
 # CONFIG_COMPACTION is not set
 # CONFIG_SUSPEND is not set
 CONFIG_NET=y
+CONFIG_NETDEVICES=y
 CONFIG_PACKET=y
 CONFIG_UNIX=y
 CONFIG_INET=y
@@ -41,7 +42,9 @@ CONFIG_BLK_DEV_RAM_SIZE=35000
 # CONFIG_SCSI_PROC_FS is not set
 CONFIG_BLK_DEV_SD=y
 # CONFIG_SCSI_LOWLEVEL is not set
+CONFIG_ATA=y
 # CONFIG_SATA_PMP is not set
+CONFIG_SATA_AHCI_PLATFORM=y
 # CONFIG_ATA_SFF is not set
 # CONFIG_NET_VENDOR_3COM is not set
 # CONFIG_NET_VENDOR_ADAPTEC is not set
@@ -98,6 +101,8 @@ CONFIG_USB_OHCI_HCD=y
 # CONFIG_USB_OHCI_HCD_PCI is not set
 CONFIG_USB_STORAGE=y
 CONFIG_MMC=y
+CONFIG_MMC_SDHCI=y
+CONFIG_MMC_SDHCI_PLTFM=y
 CONFIG_RTC_CLASS=y
 CONFIG_RTC_DRV_M41T80=y
 CONFIG_EXT2_FS=y
diff --git a/arch/powerpc/platforms/44x/Kconfig b/arch/powerpc/platforms/44x/Kconfig
index 78ac6d67a935..7d41e9264510 100644
--- a/arch/powerpc/platforms/44x/Kconfig
+++ b/arch/powerpc/platforms/44x/Kconfig
@@ -206,17 +206,10 @@ config AKEBONO
 	select PPC4xx_HSTA_MSI
 	select I2C
 	select I2C_IBM_IIC
-	select NETDEVICES
-	select ETHERNET
-	select NET_VENDOR_IBM
 	select IBM_EMAC_EMAC4 if IBM_EMAC
 	select USB if USB_SUPPORT
 	select USB_OHCI_HCD_PLATFORM if USB_OHCI_HCD
 	select USB_EHCI_HCD_PLATFORM if USB_EHCI_HCD
-	select MMC_SDHCI
-	select MMC_SDHCI_PLTFM
-	select ATA
-	select SATA_AHCI_PLATFORM
 	help
 	  This option enables support for the IBM Akebono (476gtr) evaluation board
 
-- 
2.25.1


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox