LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2 46/52] powerpc/64s/exception: fix machine check early should not set AMR
From: Nicholas Piggin @ 2019-06-20  5:14 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Nicholas Piggin
In-Reply-To: <20190620051459.29573-1-npiggin@gmail.com>

The early machine check runs in real mode, so locking is unnecessary.
Worse, the windup does not restore AMR, so this can result in a false
KUAP fault after a recoverable machine check hits inside a user copy
operation.

Fix this similarly to HMI by just avoiding the kuap lock in the
early machine check handler (it will be set by the late handler that
runs in virtual mode if that runs).

Fixes: 890274c2dc4c0 ("powerpc/64s: Implement KUAP for Radix MMU")
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/kernel/exceptions-64s.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index ab22af2509d8..8ed787dc579c 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -1012,7 +1012,7 @@ TRAMP_REAL_BEGIN(machine_check_common_early)
 	mfspr	r11,SPRN_DSISR		/* Save DSISR */
 	std	r11,_DSISR(r1)
 	std	r9,_CCR(r1)		/* Save CR in stackframe */
-	kuap_save_amr_and_lock r9, r10, cr1
+	/* We don't touch AMR here, we never go to virtual mode */
 	/* Save r9 through r13 from EXMC save area to stack frame. */
 	EXCEPTION_PROLOG_COMMON_2(PACA_EXMC)
 	mfmsr	r11			/* get MSR value */
-- 
2.20.1


^ permalink raw reply related

* [PATCH v2 47/52] powerpc/64s/exception: machine check restructure handler to be more regular
From: Nicholas Piggin @ 2019-06-20  5:14 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Nicholas Piggin
In-Reply-To: <20190620051459.29573-1-npiggin@gmail.com>

Follow the pattern of sreset and HMI handlers more closely, in using
EXCEPTION_PROLOG_COMMON_1 rather than open-coding it. Run the handler
at the relocated location.

This will help with simplification and code sharing.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/kernel/exceptions-64s.S | 86 ++++++++++++++--------------
 1 file changed, 42 insertions(+), 44 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 8ed787dc579c..384f591ef078 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -958,17 +958,34 @@ BEGIN_FTR_SECTION
 	b	machine_check_pseries
 END_FTR_SECTION_IFCLR(CPU_FTR_HVMODE)
 #endif
-	b	machine_check_common_early
+	EXCEPTION_PROLOG_1 EXC_STD, PACA_EXMC, 0, 0x200, 1, 1, 0
+	mfctr	r10		/* save ctr */
+	BRANCH_TO_C000(r11, machine_check_early_common)
+	/*
+	 * MSR_RI is not enabled, because PACA_EXMC is being used, so a
+	 * nested machine check corrupts it. machine_check_common enables
+	 * MSR_RI.
+	 */
 EXC_REAL_END(machine_check, 0x200, 0x100)
 EXC_VIRT_NONE(0x4200, 0x100)
-TRAMP_REAL_BEGIN(machine_check_common_early)
-	EXCEPTION_PROLOG_1 EXC_STD, PACA_EXMC, 0, 0x200, 0, 0, 0
+
+#ifdef CONFIG_PPC_PSERIES
+TRAMP_REAL_BEGIN(machine_check_fwnmi)
+	/* See comment at machine_check exception, don't turn on RI */
+	EXCEPTION_PROLOG_0 PACA_EXMC
+machine_check_pseries:
+	EXCEPTION_PROLOG_1 EXC_STD, PACA_EXMC, 1, 0x200, 1, 1, 0
+	EXCEPTION_PROLOG_2_REAL machine_check_common, EXC_STD, 0
+#endif
+
+TRAMP_KVM_SKIP(PACA_EXMC, 0x200)
+
+EXC_COMMON_BEGIN(machine_check_early_common)
+	mtctr	r10			/* Restore ctr */
+	mfspr	r11,SPRN_SRR0
+	mfspr	r12,SPRN_SRR1
+
 	/*
-	 * Register contents:
-	 * R13		= PACA
-	 * R9		= CR
-	 * Original R9 to R13 is saved on PACA_EXMC
-	 *
 	 * Switch to mc_emergency stack and handle re-entrancy (we limit
 	 * the nested MCE upto level 4 to avoid stack overflow).
 	 * Save MCE registers srr1, srr0, dar and dsisr and then set ME=1
@@ -989,32 +1006,30 @@ TRAMP_REAL_BEGIN(machine_check_common_early)
 	 * the machine check is handled then the idle wakeup code is called
 	 * to restore state.
 	 */
-	mr	r11,r1			/* Save r1 */
 	lhz	r10,PACA_IN_MCE(r13)
 	cmpwi	r10,0			/* Are we in nested machine check */
-	bne	0f			/* Yes, we are. */
-	/* First machine check entry */
-	ld	r1,PACAMCEMERGSP(r13)	/* Use MC emergency stack */
-0:	subi	r1,r1,INT_FRAME_SIZE	/* alloc stack frame */
+	cmpwi	cr1,r10,MAX_MCE_DEPTH	/* Are we at maximum nesting */
 	addi	r10,r10,1		/* increment paca->in_mce */
 	sth	r10,PACA_IN_MCE(r13)
+
+	mr	r10,r1			/* Save r1 */
+	bne	1f
+	/* First machine check entry */
+	ld	r1,PACAMCEMERGSP(r13)	/* Use MC emergency stack */
+1:	subi	r1,r1,INT_FRAME_SIZE	/* alloc stack frame */
 	/* Limit nested MCE to level 4 to avoid stack overflow */
-	cmpwi	r10,MAX_MCE_DEPTH
-	bgt	2f			/* Check if we hit limit of 4 */
-	std	r11,GPR1(r1)		/* Save r1 on the stack. */
-	std	r11,0(r1)		/* make stack chain pointer */
-	mfspr	r11,SPRN_SRR0		/* Save SRR0 */
-	std	r11,_NIP(r1)
-	mfspr	r11,SPRN_SRR1		/* Save SRR1 */
-	std	r11,_MSR(r1)
-	mfspr	r11,SPRN_DAR		/* Save DAR */
-	std	r11,_DAR(r1)
-	mfspr	r11,SPRN_DSISR		/* Save DSISR */
-	std	r11,_DSISR(r1)
-	std	r9,_CCR(r1)		/* Save CR in stackframe */
+	bge	cr1,2f			/* Check if we hit limit of 4 */
+
+	EXCEPTION_PROLOG_COMMON_1()
 	/* We don't touch AMR here, we never go to virtual mode */
-	/* Save r9 through r13 from EXMC save area to stack frame. */
 	EXCEPTION_PROLOG_COMMON_2(PACA_EXMC)
+	EXCEPTION_PROLOG_COMMON_3(0x200)
+
+	ld	r3,PACA_EXMC+EX_DAR(r13)
+	lwz	r4,PACA_EXMC+EX_DSISR(r13)
+	std	r3,_DAR(r1)
+	std	r4,_DSISR(r1)
+
 	mfmsr	r11			/* get MSR value */
 	ori	r11,r11,MSR_ME|MSR_RI	/* turn on ME, RI */
 	LOAD_HANDLER(r12, machine_check_handle_early)
@@ -1035,21 +1050,6 @@ TRAMP_REAL_BEGIN(machine_check_common_early)
 	b	1b
 	b	.	/* prevent speculative execution */
 
-#ifdef CONFIG_PPC_PSERIES
-TRAMP_REAL_BEGIN(machine_check_fwnmi)
-	EXCEPTION_PROLOG_0 PACA_EXMC
-machine_check_pseries:
-	EXCEPTION_PROLOG_1 EXC_STD, PACA_EXMC, 1, 0x200, 1, 1, 0
-	EXCEPTION_PROLOG_2_REAL machine_check_common, EXC_STD, 0
-	/*
-	 * MSR_RI is not enabled, because PACA_EXMC is being used, so a
-	 * nested machine check corrupts it. machine_check_common enables
-	 * MSR_RI.
-	 */
-#endif
-
-TRAMP_KVM_SKIP(PACA_EXMC, 0x200)
-
 EXC_COMMON_BEGIN(machine_check_common)
 	/*
 	 * Machine check is different because we use a different
@@ -1116,8 +1116,6 @@ EXC_COMMON_BEGIN(machine_check_idle_common)
 	 * ME=1, MMU (IR=0 and DR=0) off and using MC emergency stack.
 	 */
 EXC_COMMON_BEGIN(machine_check_handle_early)
-	std	r0,GPR0(r1)	/* Save r0 */
-	EXCEPTION_PROLOG_COMMON_3(0x200)
 	bl	save_nvgprs
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	bl	machine_check_early
-- 
2.20.1


^ permalink raw reply related

* [PATCH v2 48/52] powerpc/64s/exception: simplify machine check early path
From: Nicholas Piggin @ 2019-06-20  5:14 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Nicholas Piggin
In-Reply-To: <20190620051459.29573-1-npiggin@gmail.com>

machine_check_early_common can reach machine_check_handle_early
directly now that it runs at the relocated address. The only reason to
do the rfi sequence is to enable MSR[ME]. Move that into a helper
function to make the normal code path a bit easier to read.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/kernel/exceptions-64s.S | 30 ++++++++++++++++++++--------
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 384f591ef078..be83a4e71814 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -1030,13 +1030,12 @@ EXC_COMMON_BEGIN(machine_check_early_common)
 	std	r3,_DAR(r1)
 	std	r4,_DSISR(r1)
 
-	mfmsr	r11			/* get MSR value */
-	ori	r11,r11,MSR_ME|MSR_RI	/* turn on ME, RI */
-	LOAD_HANDLER(r12, machine_check_handle_early)
-1:	mtspr	SPRN_SRR0,r12
-	mtspr	SPRN_SRR1,r11
-	RFI_TO_KERNEL
-	b	.	/* prevent speculative execution */
+	li	r10,MSR_RI
+	mtmsrd	r10,1
+
+	bl	enable_machine_check
+	b	machine_check_handle_early
+
 2:
 	/* Stack overflow. Stay on emergency stack and panic.
 	 * Keep the ME bit off while panic-ing, so that if we hit
@@ -1047,7 +1046,9 @@ EXC_COMMON_BEGIN(machine_check_early_common)
 	LOAD_HANDLER(r12, unrecover_mce)
 	li	r10,MSR_ME
 	andc	r11,r11,r10		/* Turn off MSR_ME */
-	b	1b
+	mtspr	SPRN_SRR0,r12
+	mtspr	SPRN_SRR1,r11
+	RFI_TO_KERNEL
 	b	.	/* prevent speculative execution */
 
 EXC_COMMON_BEGIN(machine_check_common)
@@ -2283,6 +2284,19 @@ CLOSE_FIXED_SECTION(virt_trampolines);
 
 USE_TEXT_SECTION()
 
+enable_machine_check:
+	mflr	r0
+	bcl	20,31,$+4
+0:	mflr	r3
+	addi	r3,r3,(1f - 0b)
+	mtspr	SPRN_SRR0,r3
+	mfmsr	r3
+	ori	r3,r3,MSR_ME
+	mtspr	SPRN_SRR1,r3
+	RFI_TO_KERNEL
+1:	mtlr	r0
+	blr
+
 /*
  * Hash table stuff
  */
-- 
2.20.1


^ permalink raw reply related

* [PATCH v2 49/52] powerpc/64s/exceptions: machine check move unrecoverable handling out of line
From: Nicholas Piggin @ 2019-06-20  5:14 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Nicholas Piggin
In-Reply-To: <20190620051459.29573-1-npiggin@gmail.com>

Similarly to the previous patch, move unrecoverable handling out of
line, which makes the regular path less cluttered and easier to
follow.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/kernel/exceptions-64s.S | 83 +++++++++++++---------------
 1 file changed, 39 insertions(+), 44 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index be83a4e71814..e8f644d6f310 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -1016,9 +1016,9 @@ EXC_COMMON_BEGIN(machine_check_early_common)
 	bne	1f
 	/* First machine check entry */
 	ld	r1,PACAMCEMERGSP(r13)	/* Use MC emergency stack */
-1:	subi	r1,r1,INT_FRAME_SIZE	/* alloc stack frame */
-	/* Limit nested MCE to level 4 to avoid stack overflow */
-	bge	cr1,2f			/* Check if we hit limit of 4 */
+1:	/* Limit nested MCE to level 4 to avoid stack overflow */
+	bgt	cr1,unrecoverable_mce	/* Check if we hit limit of 4 */
+	subi	r1,r1,INT_FRAME_SIZE	/* alloc stack frame */
 
 	EXCEPTION_PROLOG_COMMON_1()
 	/* We don't touch AMR here, we never go to virtual mode */
@@ -1032,25 +1032,9 @@ EXC_COMMON_BEGIN(machine_check_early_common)
 
 	li	r10,MSR_RI
 	mtmsrd	r10,1
-
 	bl	enable_machine_check
 	b	machine_check_handle_early
 
-2:
-	/* Stack overflow. Stay on emergency stack and panic.
-	 * Keep the ME bit off while panic-ing, so that if we hit
-	 * another machine check we checkstop.
-	 */
-	addi	r1,r1,INT_FRAME_SIZE	/* go back to previous stack frame */
-	ld	r11,PACAKMSR(r13)
-	LOAD_HANDLER(r12, unrecover_mce)
-	li	r10,MSR_ME
-	andc	r11,r11,r10		/* Turn off MSR_ME */
-	mtspr	SPRN_SRR0,r12
-	mtspr	SPRN_SRR1,r11
-	RFI_TO_KERNEL
-	b	.	/* prevent speculative execution */
-
 EXC_COMMON_BEGIN(machine_check_common)
 	/*
 	 * Machine check is different because we use a different
@@ -1166,32 +1150,15 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 	 * If yes, then stay on emergency stack and panic.
 	 */
 	andi.	r11,r12,MSR_RI
-	bne	2f
-1:	mfspr	r11,SPRN_SRR0
-	LOAD_HANDLER(r10,unrecover_mce)
-	mtspr	SPRN_SRR0,r10
-	ld	r10,PACAKMSR(r13)
-	/*
-	 * We are going down. But there are chances that we might get hit by
-	 * another MCE during panic path and we may run into unstable state
-	 * with no way out. Hence, turn ME bit off while going down, so that
-	 * when another MCE is hit during panic path, system will checkstop
-	 * and hypervisor will get restarted cleanly by SP.
-	 */
-	li	r3,MSR_ME
-	andc	r10,r10,r3		/* Turn off MSR_ME */
-	mtspr	SPRN_SRR1,r10
-	RFI_TO_KERNEL
-	b	.
-2:
+	beq	unrecoverable_mce
+
 	/*
 	 * Check if we have successfully handled/recovered from error, if not
 	 * then stay on emergency stack and panic.
 	 */
 	ld	r3,RESULT(r1)	/* Load result */
 	cmpdi	r3,0		/* see if we handled MCE successfully */
-
-	beq	1b		/* if !handled then panic */
+	beq	unrecoverable_mce /* if !handled then panic */
 
 	/*
 	 * Return from MC interrupt.
@@ -1213,17 +1180,31 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
 	EXCEPTION_PROLOG_1 EXC_STD, PACA_EXMC, 1, 0x200, 1, 1, 0
 	EXCEPTION_PROLOG_2_REAL machine_check_common, EXC_STD, 0
 
-EXC_COMMON_BEGIN(unrecover_mce)
+EXC_COMMON_BEGIN(unrecoverable_mce)
+	/*
+	 * We are going down. But there are chances that we might get hit by
+	 * another MCE during panic path and we may run into unstable state
+	 * with no way out. Hence, turn ME bit off while going down, so that
+	 * when another MCE is hit during panic path, system will checkstop
+	 * and hypervisor will get restarted cleanly by SP.
+	 */
+	bl	disable_machine_check
+	ld	r10,PACAKMSR(r13)
+	li	r3,MSR_ME
+	andc	r10,r10,r3
+	mtmsrd	r10
+
 	/* Invoke machine_check_exception to print MCE event and panic. */
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	bl	machine_check_exception
+
 	/*
-	 * We will not reach here. Even if we did, there is no way out. Call
-	 * unrecoverable_exception and die.
+	 * We will not reach here. Even if we did, there is no way out.
+	 * Call unrecoverable_exception and die.
 	 */
-1:	addi	r3,r1,STACK_FRAME_OVERHEAD
+	addi	r3,r1,STACK_FRAME_OVERHEAD
 	bl	unrecoverable_exception
-	b	1b
+	b	.
 
 
 EXC_REAL_BEGIN(data_access, 0x300, 0x80)
@@ -2297,6 +2278,20 @@ enable_machine_check:
 1:	mtlr	r0
 	blr
 
+disable_machine_check:
+	mflr	r0
+	bcl	20,31,$+4
+0:	mflr	r3
+	addi	r3,r3,(1f - 0b)
+	mtspr	SPRN_SRR0,r3
+	mfmsr	r3
+	li	r4,MSR_ME
+	andc	r3,r3,r4
+	mtspr	SPRN_SRR1,r3
+	RFI_TO_KERNEL
+1:	mtlr	r0
+	blr
+
 /*
  * Hash table stuff
  */
-- 
2.20.1


^ permalink raw reply related

* [PATCH v2 50/52] powerpc/64s/exception: untangle early machine check handler
From: Nicholas Piggin @ 2019-06-20  5:14 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Nicholas Piggin
In-Reply-To: <20190620051459.29573-1-npiggin@gmail.com>

machine_check_early_common now branches to machine_check_handle_early
and is its only caller, yet the two are separated by a bunch of
unrelated code, which makes no sense.

This patch moves that other code out of the way, and removes the
branch instruction.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/kernel/exceptions-64s.S | 129 +++++++++++++--------------
 1 file changed, 62 insertions(+), 67 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index e8f644d6f310..793d611fa937 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -980,6 +980,16 @@ machine_check_pseries:
 
 TRAMP_KVM_SKIP(PACA_EXMC, 0x200)
 
+#define MACHINE_CHECK_HANDLER_WINDUP			\
+	/* Clear MSR_RI before setting SRR0 and SRR1. */\
+	li	r9,0;					\
+	mtmsrd	r9,1;		/* Clear MSR_RI */	\
+	/* Decrement paca->in_mce now RI is clear. */	\
+	lhz	r12,PACA_IN_MCE(r13);			\
+	subi	r12,r12,1;				\
+	sth	r12,PACA_IN_MCE(r13);			\
+	EXCEPTION_RESTORE_REGS EXC_STD
+
 EXC_COMMON_BEGIN(machine_check_early_common)
 	mtctr	r10			/* Restore ctr */
 	mfspr	r11,SPRN_SRR0
@@ -1033,74 +1043,7 @@ EXC_COMMON_BEGIN(machine_check_early_common)
 	li	r10,MSR_RI
 	mtmsrd	r10,1
 	bl	enable_machine_check
-	b	machine_check_handle_early
 
-EXC_COMMON_BEGIN(machine_check_common)
-	/*
-	 * Machine check is different because we use a different
-	 * save area: PACA_EXMC instead of PACA_EXGEN.
-	 */
-	EXCEPTION_COMMON(PACA_EXMC, 0x200)
-	FINISH_NAP
-	RECONCILE_IRQ_STATE(r10, r11)
-	ld	r3,PACA_EXMC+EX_DAR(r13)
-	lwz	r4,PACA_EXMC+EX_DSISR(r13)
-	/* Enable MSR_RI when finished with PACA_EXMC */
-	li	r10,MSR_RI
-	mtmsrd 	r10,1
-	std	r3,_DAR(r1)
-	std	r4,_DSISR(r1)
-	bl	save_nvgprs
-	addi	r3,r1,STACK_FRAME_OVERHEAD
-	bl	machine_check_exception
-	b	ret_from_except
-
-#define MACHINE_CHECK_HANDLER_WINDUP			\
-	/* Clear MSR_RI before setting SRR0 and SRR1. */\
-	li	r9,0;					\
-	mtmsrd	r9,1;		/* Clear MSR_RI */	\
-	/* Decrement paca->in_mce now RI is clear. */	\
-	lhz	r12,PACA_IN_MCE(r13);			\
-	subi	r12,r12,1;				\
-	sth	r12,PACA_IN_MCE(r13);			\
-	EXCEPTION_RESTORE_REGS EXC_STD
-
-#ifdef CONFIG_PPC_P7_NAP
-/*
- * This is an idle wakeup. Low level machine check has already been
- * done. Queue the event then call the idle code to do the wake up.
- */
-EXC_COMMON_BEGIN(machine_check_idle_common)
-	bl	machine_check_queue_event
-
-	/*
-	 * We have not used any non-volatile GPRs here, and as a rule
-	 * most exception code including machine check does not.
-	 * Therefore PACA_NAPSTATELOST does not need to be set. Idle
-	 * wakeup will restore volatile registers.
-	 *
-	 * Load the original SRR1 into r3 for pnv_powersave_wakeup_mce.
-	 *
-	 * Then decrement MCE nesting after finishing with the stack.
-	 */
-	ld	r3,_MSR(r1)
-	ld	r4,_LINK(r1)
-
-	lhz	r11,PACA_IN_MCE(r13)
-	subi	r11,r11,1
-	sth	r11,PACA_IN_MCE(r13)
-
-	mtlr	r4
-	rlwinm	r10,r3,47-31,30,31
-	cmpwi	cr1,r10,2
-	bltlr	cr1	/* no state loss, return to idle caller */
-	b	idle_return_gpr_loss
-#endif
-	/*
-	 * Handle machine check early in real mode. We come here with
-	 * ME=1, MMU (IR=0 and DR=0) off and using MC emergency stack.
-	 */
-EXC_COMMON_BEGIN(machine_check_handle_early)
 	bl	save_nvgprs
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	bl	machine_check_early
@@ -1180,6 +1123,58 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
 	EXCEPTION_PROLOG_1 EXC_STD, PACA_EXMC, 1, 0x200, 1, 1, 0
 	EXCEPTION_PROLOG_2_REAL machine_check_common, EXC_STD, 0
 
+EXC_COMMON_BEGIN(machine_check_common)
+	/*
+	 * Machine check is different because we use a different
+	 * save area: PACA_EXMC instead of PACA_EXGEN.
+	 */
+	EXCEPTION_COMMON(PACA_EXMC, 0x200)
+	FINISH_NAP
+	RECONCILE_IRQ_STATE(r10, r11)
+	ld	r3,PACA_EXMC+EX_DAR(r13)
+	lwz	r4,PACA_EXMC+EX_DSISR(r13)
+	/* Enable MSR_RI when finished with PACA_EXMC */
+	li	r10,MSR_RI
+	mtmsrd 	r10,1
+	std	r3,_DAR(r1)
+	std	r4,_DSISR(r1)
+	bl	save_nvgprs
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	machine_check_exception
+	b	ret_from_except
+
+#ifdef CONFIG_PPC_P7_NAP
+/*
+ * This is an idle wakeup. Low level machine check has already been
+ * done. Queue the event then call the idle code to do the wake up.
+ */
+EXC_COMMON_BEGIN(machine_check_idle_common)
+	bl	machine_check_queue_event
+
+	/*
+	 * We have not used any non-volatile GPRs here, and as a rule
+	 * most exception code including machine check does not.
+	 * Therefore PACA_NAPSTATELOST does not need to be set. Idle
+	 * wakeup will restore volatile registers.
+	 *
+	 * Load the original SRR1 into r3 for pnv_powersave_wakeup_mce.
+	 *
+	 * Then decrement MCE nesting after finishing with the stack.
+	 */
+	ld	r3,_MSR(r1)
+	ld	r4,_LINK(r1)
+
+	lhz	r11,PACA_IN_MCE(r13)
+	subi	r11,r11,1
+	sth	r11,PACA_IN_MCE(r13)
+
+	mtlr	r4
+	rlwinm	r10,r3,47-31,30,31
+	cmpwi	cr1,r10,2
+	bltlr	cr1	/* no state loss, return to idle caller */
+	b	idle_return_gpr_loss
+#endif
+
 EXC_COMMON_BEGIN(unrecoverable_mce)
 	/*
 	 * We are going down. But there are chances that we might get hit by
-- 
2.20.1


^ permalink raw reply related

* [PATCH v2 51/52] powerpc/64s/exception: machine check improve branch labels
From: Nicholas Piggin @ 2019-06-20  5:14 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Nicholas Piggin
In-Reply-To: <20190620051459.29573-1-npiggin@gmail.com>

Short forward and backward branches can be given number labels,
but larger significant divergences in code path are more readable
if they're given descriptive names.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/kernel/exceptions-64s.S | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 793d611fa937..e21bf047156d 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -1071,11 +1071,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 	 * continue in host kernel in V mode to deliver the MC event.
 	 */
 	rldicl.	r11,r12,4,63		/* See if MC hit while in HV mode. */
-	beq	5f
+	beq	1f
 	andi.	r11,r12,MSR_PR		/* See if coming from user. */
-	bne	9f			/* continue in V mode if we are. */
+	bne	deliver_mce		/* continue in V mode if we are. */
+1:
 
-5:
 #ifdef CONFIG_KVM_BOOK3S_64_HANDLER
 	/*
 	 * We are coming from kernel context. Check if we are coming from
@@ -1084,7 +1084,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 	 */
 	lbz	r11,HSTATE_IN_GUEST(r13)
 	cmpwi	r11,0			/* Check if coming from guest */
-	bne	9f			/* continue if we are. */
+	bne	deliver_mce		/* continue if we are. */
 #endif
 	/*
 	 * At this point we are not sure about what context we come from.
@@ -1112,7 +1112,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 	MACHINE_CHECK_HANDLER_WINDUP
 	RFI_TO_USER_OR_KERNEL
 
-9:
+deliver_mce:
 	/* Deliver the machine check to host kernel in V mode. */
 BEGIN_FTR_SECTION
 	ld	r10,ORIG_GPR3(r1)
-- 
2.20.1


^ permalink raw reply related

* [PATCH v2 52/52] powerpc/64s/exception: add missing branch to self after RFI
From: Nicholas Piggin @ 2019-06-20  5:14 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Nicholas Piggin
In-Reply-To: <20190620051459.29573-1-npiggin@gmail.com>

For consistency. These may not be required on modern processors,
and they don't quite fit with the RFI_TO macros, but they should
all be removed in that case.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/kernel/exceptions-64s.S | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index e21bf047156d..cdf7d7ef0c0e 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -944,6 +944,7 @@ EXC_COMMON_BEGIN(system_reset_common)
 
 	EXCEPTION_RESTORE_REGS EXC_STD
 	RFI_TO_USER_OR_KERNEL
+	b	.
 
 
 EXC_REAL_BEGIN(machine_check, 0x200, 0x100)
@@ -1111,6 +1112,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 	bl	machine_check_queue_event
 	MACHINE_CHECK_HANDLER_WINDUP
 	RFI_TO_USER_OR_KERNEL
+	b	.
 
 deliver_mce:
 	/* Deliver the machine check to host kernel in V mode. */
@@ -1686,6 +1688,7 @@ TRAMP_REAL_BEGIN(hmi_exception_early)
 
 	EXCEPTION_RESTORE_REGS EXC_HV
 	HRFI_TO_USER_OR_KERNEL
+	b	.
 
 1:
 	/*
-- 
2.20.1


^ permalink raw reply related

* Re: [PATCH v2] ocxl: Allow contexts to be attached with a NULL mm
From: Nicholas Piggin @ 2019-06-20  5:25 UTC (permalink / raw)
  To: Alastair D'Silva, alastair
  Cc: Andrew Donnellan, Arnd Bergmann, linux-kernel, Masahiro Yamada,
	Paul Mackerras, Suraj Jitindar Singh, Greg Kroah-Hartman,
	Frederic Barrat, Andrew Morton, linuxppc-dev, Thomas Gleixner
In-Reply-To: <20190620041203.12274-1-alastair@au1.ibm.com>

Alastair D'Silva's on June 20, 2019 2:12 pm:
> From: Alastair D'Silva <alastair@d-silva.org>
> 
> If an OpenCAPI context is to be used directly by a kernel driver, there
> may not be a suitable mm to use.
> 
> The patch makes the mm parameter to ocxl_context_attach optional.
> 
> Signed-off-by: Alastair D'Silva <alastair@d-silva.org>

Yeah I don't think you need to manage a kernel context explicitly
because it will always be flushed with tlbie, comment helps. For
the powerpc/mm bit,

Acked-by: Nicholas Piggin <npiggin@gmail.com>


^ permalink raw reply

* Re: [PATCH v2 30/52] powerpc/64s/exception: optimise system_reset for idle, clean up non-idle case
From: Nicholas Piggin @ 2019-06-20  5:41 UTC (permalink / raw)
  To: linuxppc-dev
In-Reply-To: <20190620051459.29573-31-npiggin@gmail.com>

Nicholas Piggin's on June 20, 2019 3:14 pm:
> The idle wake up code in the system reset interrupt is not very
> optimal. There are two requirements: perform idle wake up quickly;
> and save everything including CFAR for non-idle interrupts, with
> no performance requirement.
> 
> The problem with placing the idle test in the middle of the handler
> and using the normal handler code to save CFAR, is that it's quite
> costly (e.g., mfcfar is serialising, speculative workarounds get
> applied, SRR1 has to be reloaded, etc). It also prevents the standard
> interrupt handler boilerplate being used.
> 
> This pain can be avoided by using a dedicated idle interrupt handler
> at the start of the interrupt handler, which restores all registers
> back to the way they were in case it was not an idle wake up. CFAR
> is preserved without saving it before the non-idle case by making that
> the fall-through, and idle is a taken branch.
> 
> Performance seems to be in the noise, but possibly around 0.5% faster,
> the executed instructions certainly look better. The bigger benefit is
> being able to drop in standard interrupt handlers after the idle code,
> which helps with subsequent cleanup and consolidation.
> 
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
> ---
>  arch/powerpc/kernel/exceptions-64s.S | 89 ++++++++++++++--------------
>  1 file changed, 44 insertions(+), 45 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
> index e0492912ea79..f582ae30f3f7 100644
> --- a/arch/powerpc/kernel/exceptions-64s.S
> +++ b/arch/powerpc/kernel/exceptions-64s.S
> @@ -241,7 +241,7 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
>   * load KBASE for a slight optimisation.
>   */
>  #define BRANCH_TO_C000(reg, label)					\
> -	__LOAD_HANDLER(reg, label);					\
> +	__LOAD_FAR_HANDLER(reg, label);					\
>  	mtctr	reg;							\
>  	bctr
>  
> @@ -784,16 +784,6 @@ EXC_VIRT_NONE(0x4000, 0x100)
>  
>  
>  EXC_REAL_BEGIN(system_reset, 0x100, 0x100)
> -	SET_SCRATCH0(r13)
> -	EXCEPTION_PROLOG_0 PACA_EXNMI
> -
> -	/* This is EXCEPTION_PROLOG_1 with the idle feature section added */
> -	OPT_SAVE_REG_TO_PACA(PACA_EXNMI+EX_PPR, r9, CPU_FTR_HAS_PPR)
> -	OPT_SAVE_REG_TO_PACA(PACA_EXNMI+EX_CFAR, r10, CPU_FTR_CFAR)
> -	INTERRUPT_TO_KERNEL
> -	SAVE_CTR(r10, PACA_EXNMI)
> -	mfcr	r9
> -
>  #ifdef CONFIG_PPC_P7_NAP
>  	/*
>  	 * If running native on arch 2.06 or later, check if we are waking up
> @@ -801,45 +791,67 @@ EXC_REAL_BEGIN(system_reset, 0x100, 0x100)
>  	 * bits 46:47. A non-0 value indicates that we are coming from a power
>  	 * saving state. The idle wakeup handler initially runs in real mode,
>  	 * but we branch to the 0xc000... address so we can turn on relocation
> -	 * with mtmsr.
> +	 * with mtmsrd later, after SPRs are restored.
> +	 *
> +	 * Careful to minimise cost for the fast path (idle wakeup) while
> +	 * also avoiding clobbering CFAR for the non-idle case. Once we know
> +	 * it is an idle wake, volatiles don't matter, which is why we use
> +	 * those here, and then re-do the entry in case of non-idle (without
> +	 * branching for the non-idle case, to keep CFAR).
>  	 */
>  BEGIN_FTR_SECTION
> -	mfspr	r10,SPRN_SRR1
> -	rlwinm.	r10,r10,47-31,30,31
> -	beq-	1f
> -	cmpwi	cr1,r10,2
> +	SET_SCRATCH0(r13)
> +	GET_PACA(r13)
> +	std	r3,PACA_EXNMI+0*8(r13)
> +	std	r4,PACA_EXNMI+1*8(r13)
> +	std	r5,PACA_EXNMI+2*8(r13)
>  	mfspr	r3,SPRN_SRR1
> -	bltlr	cr1	/* no state loss, return to idle caller */
> -	BRANCH_TO_C000(r10, system_reset_idle_common)
> -1:
> +	mfocrf	r4,0x80
> +	rlwinm.	r5,r3,47-31,30,31
> +	bne+	system_reset_idle_wake
> +	/* Not powersave wakeup. Restore regs for regular interrupt handler. */
> +	mtocrf	0x80,r4
> +	ld	r12,PACA_EXNMI+0*8(r13)
> +	ld	r4,PACA_EXNMI+1*8(r13)
> +	ld	r5,PACA_EXNMI+2*8(r13)
> +	GET_SCRATCH0(r13)

For the love of... that should be 'ld r3', not 'ld r12', sorry.

Thanks,
Nick

^ permalink raw reply

* [PATCH] KVM: PPC: Book3S HV: Fix CR0 setting in TM emulation
From: Michael Neuling @ 2019-06-20  6:00 UTC (permalink / raw)
  To: mpe; +Cc: mikey, linuxppc-dev, sjitindarsingh, kvm-ppc

When emulating tsr, treclaim and trechkpt, we incorrectly set CR0. The
code currently sets:
    CR0 <- 00 || MSR[TS]
but according to the ISA it should be:
    CR0 <-  0 || MSR[TS] || 0

This fixes the bit shift to put the bits in the correct location.

Tested-by: Suraj Jitindar Singh <sjitindarsingh@gmail.com>
Signed-off-by: Michael Neuling <mikey@neuling.org>
---
 arch/powerpc/kvm/book3s_hv_tm.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_tm.c b/arch/powerpc/kvm/book3s_hv_tm.c
index 888e2609e3..31cd0f327c 100644
--- a/arch/powerpc/kvm/book3s_hv_tm.c
+++ b/arch/powerpc/kvm/book3s_hv_tm.c
@@ -131,7 +131,7 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu)
 		}
 		/* Set CR0 to indicate previous transactional state */
 		vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) |
-			(((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 28);
+			(((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 29);
 		/* L=1 => tresume, L=0 => tsuspend */
 		if (instr & (1 << 21)) {
 			if (MSR_TM_SUSPENDED(msr))
@@ -175,7 +175,7 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu)
 
 		/* Set CR0 to indicate previous transactional state */
 		vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) |
-			(((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 28);
+			(((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 29);
 		vcpu->arch.shregs.msr &= ~MSR_TS_MASK;
 		return RESUME_GUEST;
 
@@ -205,7 +205,7 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu)
 
 		/* Set CR0 to indicate previous transactional state */
 		vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) |
-			(((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 28);
+			(((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 29);
 		vcpu->arch.shregs.msr = msr | MSR_TS_S;
 		return RESUME_GUEST;
 	}
-- 
2.21.0


^ permalink raw reply related

* Re: [PATCH v4 1/4] lib/scatterlist: Fix mapping iterator when sg->offset is greater than PAGE_SIZE
From: Herbert Xu @ 2019-06-20  6:02 UTC (permalink / raw)
  To: Christophe Leroy
  Cc: horia.geanta, Imre Deak, linux-kernel, linux-crypto, linuxppc-dev,
	David S. Miller
In-Reply-To: <f28c6b0e2f9510f42ca934f19c4315084e668c21.1560805614.git.christophe.leroy@c-s.fr>

On Mon, Jun 17, 2019 at 09:15:02PM +0000, Christophe Leroy wrote:
> All mapping iterator logic is based on the assumption that sg->offset
> is always lower than PAGE_SIZE.
> 
> But there are situations where sg->offset is such that the SG item
> is on the second page. In that case sg_copy_to_buffer() fails
> properly copying the data into the buffer. One of the reason is
> that the data will be outside the kmapped area used to access that
> data.
> 
> This patch fixes the issue by adjusting the mapping iterator
> offset and pgoffset fields such that offset is always lower than
> PAGE_SIZE.
> 
> Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
> Fixes: 4225fc8555a9 ("lib/scatterlist: use page iterator in the mapping iterator")
> Cc: stable@vger.kernel.org
> ---
>  lib/scatterlist.c | 9 +++++++--
>  1 file changed, 7 insertions(+), 2 deletions(-)

Good catch.

> @@ -686,7 +686,12 @@ static bool sg_miter_get_next_page(struct sg_mapping_iter *miter)
>  		sg = miter->piter.sg;
>  		pgoffset = miter->piter.sg_pgoffset;
>  
> -		miter->__offset = pgoffset ? 0 : sg->offset;
> +		offset = pgoffset ? 0 : sg->offset;
> +		while (offset >= PAGE_SIZE) {
> +			miter->piter.sg_pgoffset = ++pgoffset;
> +			offset -= PAGE_SIZE;
> +		}

How about

	miter->piter.sg_pgoffset += offset >> PAGE_SHIFT;
	offset &= PAGE_SIZE - 1;

Thanks,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* Re: [PATCH 3/4] powerpc/powernv: remove dead NPU DMA code
From: Christoph Hellwig @ 2019-06-20  6:03 UTC (permalink / raw)
  To: Linus Torvalds, Alexey Kardashevskiy
  Cc: linux-kernel, Paul Mackerras, linuxppc-dev, Christoph Hellwig
In-Reply-To: <b0ce7d72-5de7-63d3-cb4e-ea78342cb3fa@ozlabs.ru>

Hi Linus,

this goes back to the discussion at last year's kernel summit, where
we had the discussion on removing code never used by any in-kernel
user and with no prospects of one.  The IBM folks are unfortunately still
dragging their feet on the powerpc side.  Can we revisit this discussion?

The use case here is an IBM-specific bus for which they only have an
out-of-tree driver that their partner doesn't want to submit for mainline,
but keep insisting on keeping the code around (which is also built
unconditionally for the platform).

I hope we had settled that argument back then, but it seems like Big
Blue insists they are special.

On Thu, Jun 20, 2019 at 11:45:42AM +1000, Alexey Kardashevskiy wrote:
> 
> 
> On 19/06/2019 17:28, Christoph Hellwig wrote:
> > On Wed, Jun 19, 2019 at 10:34:54AM +1000, Alexey Kardashevskiy wrote:
> >>
> >>
> >> On 23/05/2019 17:49, Christoph Hellwig wrote:
> >>> None of these routines were ever used since they were added to the
> >>> kernel.
> >>
> >>
> >> It is still being used exactly in the way as it was explained before in
> >> previous respins. Thanks.
> > 
> > Please point to the in-kernel user, because that is the only relevant
> > one.  This is not just my opinion but we had a clear discussion on that
> > at last year's kernel summit.
> 
> 
> There is no in-kernel user which still does not mean that the code is
> dead. If it is irrelevant - put this to the commit log instead of saying
> it is dead; also if there was a clear outcome from that discussion, then
> please point me to that, I do not get to attend these discussions. Thanks,
> 
> 
> -- 
> Alexey
---end quoted text---

^ permalink raw reply

* Re: [PATCH 3/4] powerpc/powernv: remove dead NPU DMA code
From: Alexey Kardashevskiy @ 2019-06-20  6:20 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: linuxppc-dev, linux-kernel, Paul Mackerras, Linus Torvalds
In-Reply-To: <20190620060354.GA20279@lst.de>



On 20/06/2019 16:03, Christoph Hellwig wrote:
> Hi Linus,
> 
> this goes back to the discussion at last years kernel summit, where
> we had the discussion on removing code never used by any in-kernel
> user an no prospects of one.  The IBM folks are unfortunately still
> dragging their feet on the powerpc side.  Can we revise this discussion?
> 
> The use case here is a IBM specific bus for which they only have an
> out of tree driver that their partner doesn't want to submit for mainline,
> but keep insisting on keeping the code around (which is also built
> unconditionally for the platform).


I personally keep insisting on correct commit logs, i.e. not calling
working code dead and providing actual reasons for the change. Thanks,


> 
> I hope we had settled that argument back then, but it seems like Big
> Blue insists they are special.
> 
> On Thu, Jun 20, 2019 at 11:45:42AM +1000, Alexey Kardashevskiy wrote:
>>
>>
>> On 19/06/2019 17:28, Christoph Hellwig wrote:
>>> On Wed, Jun 19, 2019 at 10:34:54AM +1000, Alexey Kardashevskiy wrote:
>>>>
>>>>
>>>> On 23/05/2019 17:49, Christoph Hellwig wrote:
>>>>> None of these routines were ever used since they were added to the
>>>>> kernel.
>>>>
>>>>
>>>> It is still being used exactly in the way as it was explained before in
>>>> previous respins. Thanks.
>>>
>>> Please point to the in-kernel user, because that is the only relevant
>>> one.  This is not just my opinion but we had a clear discussion on that
>>> at least years kernel summit.
>>
>>
>> There is no in-kernel user which still does not mean that the code is
>> dead. If it is irrelevant - put this to the commit log instead of saying
>> it is dead; also if there was a clear outcome from that discussion, then
>> please point me to that, I do not get to attend these discussions. Thanks,


-- 
Alexey

^ permalink raw reply

* Re: [PATCH 3/4] powerpc/powernv: remove dead NPU DMA code
From: Christoph Hellwig @ 2019-06-20  6:21 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: linux-kernel, Linus Torvalds, Paul Mackerras, linuxppc-dev,
	Christoph Hellwig
In-Reply-To: <309781b5-108f-c219-2cda-49179dca6b13@ozlabs.ru>

On Thu, Jun 20, 2019 at 04:20:08PM +1000, Alexey Kardashevskiy wrote:
> 
> 
> On 20/06/2019 16:03, Christoph Hellwig wrote:
> > Hi Linus,
> > 
> > this goes back to the discussion at last years kernel summit, where
> > we had the discussion on removing code never used by any in-kernel
> > user an no prospects of one.  The IBM folks are unfortunately still
> > dragging their feet on the powerpc side.  Can we revise this discussion?
> > 
> > The use case here is a IBM specific bus for which they only have an
> > out of tree driver that their partner doesn't want to submit for mainline,
> > but keep insisting on keeping the code around (which is also built
> > unconditionally for the platform).
> 
> 
> I personally keep insisting on correct commit logs, i.e. not calling
> working code dead and providing actual reasons for the change. Thanks,

If that is the only thing you are complaining about I can clarify it
a little of course.  But it didn't sound like that was the actual
problem.

^ permalink raw reply

* Re: [PATCH 3/4] powerpc/powernv: remove dead NPU DMA code
From: Alexey Kardashevskiy @ 2019-06-20  6:48 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: linuxppc-dev, linux-kernel, Paul Mackerras, Linus Torvalds
In-Reply-To: <20190620062126.GA20765@lst.de>



On 20/06/2019 16:21, Christoph Hellwig wrote:
> On Thu, Jun 20, 2019 at 04:20:08PM +1000, Alexey Kardashevskiy wrote:
>>
>>
>> On 20/06/2019 16:03, Christoph Hellwig wrote:
>>> Hi Linus,
>>>
>>> this goes back to the discussion at last years kernel summit, where
>>> we had the discussion on removing code never used by any in-kernel
>>> user an no prospects of one.  The IBM folks are unfortunately still
>>> dragging their feet on the powerpc side.  Can we revise this discussion?
>>>
>>> The use case here is a IBM specific bus for which they only have an
>>> out of tree driver that their partner doesn't want to submit for mainline,
>>> but keep insisting on keeping the code around (which is also built
>>> unconditionally for the platform).
>>
>>
>> I personally keep insisting on correct commit logs, i.e. not calling
>> working code dead and providing actual reasons for the change. Thanks,
> 
> If that is the only thing you are complaining about I can clarify it
> a little of course.

Please do so. For me the problem is that if a maintainer decides to pull
that then so be it but I want that person to know exactly what is
happening. As it is now - the code may seem dead as nobody complains so
- I complained.


> But it didn't sound like that was the actual
> problem.

I'd like to see some formal statement in a written form about where we
stand in regard to the out-of-tree drivers support^wacceptability
(cannot pick the right word). Thanks,


-- 
Alexey

^ permalink raw reply

* Re: [PATCH 2/3] KVM: PPC: Book3S HV: Signed extend decrementer value if not using large decr
From: Laurent Vivier @ 2019-06-20  7:56 UTC (permalink / raw)
  To: Suraj Jitindar Singh, linuxppc-dev; +Cc: clg, kvm-ppc
In-Reply-To: <20190620014651.7645-2-sjitindarsingh@gmail.com>

On 20/06/2019 03:46, Suraj Jitindar Singh wrote:
> On POWER9 the decrementer can operate in large decrementer mode where
> the decrementer is 56 bits and signed extended to 64 bits. When not
> operating in this mode the decrementer behaves as a 32 bit decrementer
> which is NOT signed extended (as on POWER8).
> 
> Currently when reading a guest decrementer value we don't take into
> account whether the large decrementer is enabled or not, and this means
> the value will be incorrect when the guest is not using the large
> decrementer. Fix this by sign extending the value read when the guest
> isn't using the large decrementer.
> 
> Fixes: 95a6432ce903 "KVM: PPC: Book3S HV: Streamlined guest entry/exit path on P9 for radix guests"
> 
> Signed-off-by: Suraj Jitindar Singh <sjitindarsingh@gmail.com>
> ---
>  arch/powerpc/kvm/book3s_hv.c | 2 ++
>  1 file changed, 2 insertions(+)
> 
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index d3684509da35..719fd2529eec 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -3607,6 +3607,8 @@ int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
>  
>  	vcpu->arch.slb_max = 0;
>  	dec = mfspr(SPRN_DEC);
> +	if (!(lpcr & LPCR_LD)) /* Sign extend if not using large decrementer */
> +		dec = (s32) dec;
>  	tb = mftb();
>  	vcpu->arch.dec_expires = dec + tb;
>  	vcpu->cpu = -1;
> 

Patches 2 and 3: tested I can boot and run an L2 nested guest with qemu
v4.0.0 and caps-large-decr=on in the case we have had a hang previously.

Tested-by: Laurent Vivier <lvivier@redhat.com>

^ permalink raw reply

* Re: [PATCH 3/3] KVM: PPC: Book3S HV: Clear pending decr exceptions on nested guest entry
From: Laurent Vivier @ 2019-06-20  7:57 UTC (permalink / raw)
  To: Suraj Jitindar Singh, linuxppc-dev; +Cc: clg, kvm-ppc
In-Reply-To: <20190620014651.7645-3-sjitindarsingh@gmail.com>

On 20/06/2019 03:46, Suraj Jitindar Singh wrote:
> If we enter an L1 guest with a pending decrementer exception then this
> is cleared on guest exit if the guest has written a positive value into
> the decrementer (indicating that it handled the decrementer exception)
> since there is no other way to detect that the guest has handled the
> pending exception and that it should be dequeued. In the event that the
> L1 guest tries to run a nested (L2) guest immediately after this and the
> L2 guest decrementer is negative (which is loaded by L1 before making
> the H_ENTER_NESTED hcall), then the pending decrementer exception
> isn't cleared and the L2 entry is blocked since L1 has a pending
> exception, even though L1 may have already handled the exception and
> written a positive value for it's decrementer. This results in a loop of
> L1 trying to enter the L2 guest and L0 blocking the entry since L1 has
> an interrupt pending with the outcome being that L2 never gets to run
> and hangs.
> 
> Fix this by clearing any pending decrementer exceptions when L1 makes
> the H_ENTER_NESTED hcall since it won't do this if it's decrementer has
> gone negative, and anyway it's decrementer has been communicated to L0
> in the hdec_expires field and L0 will return control to L1 when this
> goes negative by delivering an H_DECREMENTER exception.
> 
> Fixes: 95a6432ce903 "KVM: PPC: Book3S HV: Streamlined guest entry/exit path on P9 for radix guests"
> 
> Signed-off-by: Suraj Jitindar Singh <sjitindarsingh@gmail.com>
> ---
>  arch/powerpc/kvm/book3s_hv.c | 11 +++++++++--
>  1 file changed, 9 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index 719fd2529eec..4a5eb29b952f 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -4128,8 +4128,15 @@ int kvmhv_run_single_vcpu(struct kvm_run *kvm_run,
>  
>  	preempt_enable();
>  
> -	/* cancel pending decrementer exception if DEC is now positive */
> -	if (get_tb() < vcpu->arch.dec_expires && kvmppc_core_pending_dec(vcpu))
> +	/*
> +	 * cancel pending decrementer exception if DEC is now positive, or if
> +	 * entering a nested guest in which case the decrementer is now owned
> +	 * by L2 and the L1 decrementer is provided in hdec_expires
> +	 */
> +	if (kvmppc_core_pending_dec(vcpu) &&
> +			((get_tb() < vcpu->arch.dec_expires) ||
> +			 (trap == BOOK3S_INTERRUPT_SYSCALL &&
> +			  kvmppc_get_gpr(vcpu, 3) == H_ENTER_NESTED)))
>  		kvmppc_core_dequeue_dec(vcpu);
>  
>  	trace_kvm_guest_exit(vcpu);
> 

Patches 2 and 3: tested I can boot and run an L2 nested guest with qemu
v4.0.0 and caps-large-decr=on in the case we have had a hang previously.

Tested-by: Laurent Vivier <lvivier@redhat.com>


^ permalink raw reply

* Re: [PATCH] crypto: vmx - Document CTR mode counter width quirks
From: Herbert Xu @ 2019-06-20  8:06 UTC (permalink / raw)
  To: Daniel Axtens
  Cc: leo.barbosa, Stephan Mueller, nayna, omosnacek, ebiggers, leitao,
	pfsmorigo, linux-crypto, marcelo.cerri, gcwilson, linuxppc-dev
In-Reply-To: <20190611015431.26772-1-dja@axtens.net>

On Tue, Jun 11, 2019 at 11:54:31AM +1000, Daniel Axtens wrote:
> The CTR code comes from OpenSSL, where it does a 32-bit counter.
> The kernel has a 128-bit counter. This difference has led to
> issues.
> 
> Document it.
> 
> Signed-off-by: Daniel Axtens <dja@axtens.net>
> ---
>  drivers/crypto/vmx/aesp8-ppc.pl | 22 ++++++++++++++++++++--
>  1 file changed, 20 insertions(+), 2 deletions(-)

Patch applied.  Thanks.
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* Re: [PATCH] crypto: talitos - fix max key size for sha384 and sha512
From: Herbert Xu @ 2019-06-20  8:06 UTC (permalink / raw)
  To: Christophe Leroy
  Cc: linux-kernel, linuxppc-dev, David S. Miller, horia.geanta,
	linux-crypto
In-Reply-To: <5f1004d33b2347dcfbc677551bafc9d469bb079e.1560318544.git.christophe.leroy@c-s.fr>

On Wed, Jun 12, 2019 at 05:49:50AM +0000, Christophe Leroy wrote:
> Below commit came with a typo in the CONFIG_ symbol, leading
> to a permanently reduced max key size regardless of the driver
> capabilities.
> 
> Reported-by: Horia Geantă <horia.geanta@nxp.com>
> Fixes: b8fbdc2bc4e7 ("crypto: talitos - reduce max key size for SEC1")
> Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
> ---
>  drivers/crypto/talitos.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)

Patch applied.  Thanks.
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* Re: [PATCH v2] crypto: nx: no need to check return value of debugfs_create functions
From: Herbert Xu @ 2019-06-20  8:07 UTC (permalink / raw)
  To: Greg Kroah-Hartman
  Cc: nayna, pfsmorigo, linux-crypto, leitao, paulus, linuxppc-dev
In-Reply-To: <20190614142904.GA11066@kroah.com>

Greg Kroah-Hartman <gregkh@linuxfoundation.org> wrote:
> When calling debugfs functions, there is no need to ever check the
> return value.  The function can work or not, but the code logic should
> never do something different based on this.
> 
> Also, there is no need to store the individual debugfs file names,
> especially as the whole directory is deleted at once, so remove the
> unneeded structure entirely.
> 
> Cc: "Breno Leitão" <leitao@debian.org>
> Cc: Nayna Jain <nayna@linux.ibm.com>
> Cc: Paulo Flabiano Smorigo <pfsmorigo@gmail.com>
> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
> Cc: Paul Mackerras <paulus@samba.org>
> Cc: Michael Ellerman <mpe@ellerman.id.au>
> Cc: Herbert Xu <herbert@gondor.apana.org.au>
> Cc: "David S. Miller" <davem@davemloft.net>
> Cc: linux-crypto@vger.kernel.org
> Cc: linuxppc-dev@lists.ozlabs.org
> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
> ---
> v2: fixed build error found by kbuild
> 
> drivers/crypto/nx/nx.c         |  4 +-
> drivers/crypto/nx/nx.h         | 12 +-----
> drivers/crypto/nx/nx_debugfs.c | 71 +++++++++++-----------------------
> 3 files changed, 26 insertions(+), 61 deletions(-)

Patch applied.  Thanks.
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* Re: [PATCH v12 00/31] Speculative page faults
From: Haiyan Song @ 2019-06-20  8:19 UTC (permalink / raw)
  To: Laurent Dufour
  Cc: jack, sergey.senozhatsky.work, peterz, Will Deacon, mhocko,
	linux-mm, paulus, Punit Agrawal, hpa, Michel Lespinasse,
	Alexei Starovoitov, Andrea Arcangeli, ak, Minchan Kim,
	aneesh.kumar, x86, Matthew Wilcox, Daniel Jordan, Ingo Molnar,
	David Rientjes, paulmck, npiggin, sj38.park, Jerome Glisse, dave,
	kemi.wang, kirill, Thomas Gleixner, zhong jiang, Ganesh Mahendran,
	Yang Shi, Mike Rapoport, linuxppc-dev, linux-kernel,
	Sergey Senozhatsky, vinayak menon, akpm, Tim Chen, haren
In-Reply-To: <1c412ebe-c213-ee67-d261-c70ddcd34b79@linux.ibm.com>

[-- Attachment #1: Type: text/plain, Size: 1253 bytes --]

Hi Laurent,

I downloaded your script and ran it on an Intel 2s Skylake platform with the spf-v12 patch
series.

Here attached the output results of this script.

The following comparison result is statistics from the script outputs.

a). Enable THP
                                            SPF_0          change       SPF_1
will-it-scale.page_fault2.per_thread_ops    2664190.8      -11.7%       2353637.6      
will-it-scale.page_fault3.per_thread_ops    4480027.2      -14.7%       3819331.9     


b). Disable THP
                                            SPF_0           change      SPF_1
will-it-scale.page_fault2.per_thread_ops    2653260.7       -10%        2385165.8
will-it-scale.page_fault3.per_thread_ops    4436330.1       -12.4%      3886734.2 


Thanks,
Haiyan Song


On Fri, Jun 14, 2019 at 10:44:47AM +0200, Laurent Dufour wrote:
> Le 14/06/2019 à 10:37, Laurent Dufour a écrit :
> > Please find attached the script I run to get these numbers.
> > This would be nice if you could give it a try on your victim node and share the result.
> 
> It sounds like the Intel mail filtering system doesn't like the attached shell script.
> Please find it there: https://gist.github.com/ldu4/a5cc1a93f293108ea387d43d5d5e7f44
> 
> Thanks,
> Laurent.
> 

[-- Attachment #2: page_fault2_threads.5.1.0-rc4-mm1-00300-g02c5a1f.out --]
[-- Type: text/plain, Size: 715 bytes --]

#### THP always
#### SPF 0
average:2628818
average:2732209
average:2728392
average:2550695
average:2689873
average:2691963
average:2627612
average:2558295
average:2707877
average:2726174
#### SPF 1
average:2426260
average:2145674
average:2117769
average:2292502
average:2350403
average:2483327
average:2467324
average:2335393
average:2437859
average:2479865
#### THP never
#### SPF 0
average:2712575
average:2711447
average:2672362
average:2701981
average:2668073
average:2579296
average:2662048
average:2637422
average:2579143
average:2608260
#### SPF 1
average:2348782
average:2203349
average:2312960
average:2402995
average:2318914
average:2543129
average:2390337
average:2490178
average:2416798
average:2424216

[-- Attachment #3: page_fault3_threads.5.1.0-rc4-mm1-00300-g02c5a1f.out --]
[-- Type: text/plain, Size: 715 bytes --]

#### THP always
#### SPF 0
average:4370143
average:4245754
average:4678884
average:4665759
average:4665809
average:4639132
average:4210755
average:4330552
average:4290469
average:4703015
#### SPF 1
average:3810608
average:3918890
average:3758003
average:3965024
average:3578151
average:3822748
average:3687293
average:3998701
average:3915771
average:3738130
#### THP never
#### SPF 0
average:4505598
average:4672023
average:4701787
average:4355885
average:4338397
average:4446350
average:4360811
average:4653767
average:4016352
average:4312331
#### SPF 1
average:3685383
average:4029413
average:4051615
average:3747588
average:4058557
average:4042340
average:3971295
average:3752943
average:3750626
average:3777582

^ permalink raw reply

* Re: [PATCH 3/3] KVM: PPC: Book3S HV: Clear pending decr exceptions on nested guest entry
From: Cédric Le Goater @ 2019-06-20  8:19 UTC (permalink / raw)
  To: Laurent Vivier, Suraj Jitindar Singh, linuxppc-dev; +Cc: kvm-ppc
In-Reply-To: <30c02f09-8376-3dd0-e463-94d396df0240@redhat.com>

On 20/06/2019 09:57, Laurent Vivier wrote:
> On 20/06/2019 03:46, Suraj Jitindar Singh wrote:
>> If we enter an L1 guest with a pending decrementer exception then this
>> is cleared on guest exit if the guest has written a positive value into
>> the decrementer (indicating that it handled the decrementer exception)
>> since there is no other way to detect that the guest has handled the
>> pending exception and that it should be dequeued. In the event that the
>> L1 guest tries to run a nested (L2) guest immediately after this and the
>> L2 guest decrementer is negative (which is loaded by L1 before making
>> the H_ENTER_NESTED hcall), then the pending decrementer exception
>> isn't cleared and the L2 entry is blocked since L1 has a pending
>> exception, even though L1 may have already handled the exception and
>> written a positive value for it's decrementer. This results in a loop of
>> L1 trying to enter the L2 guest and L0 blocking the entry since L1 has
>> an interrupt pending with the outcome being that L2 never gets to run
>> and hangs.
>>
>> Fix this by clearing any pending decrementer exceptions when L1 makes
>> the H_ENTER_NESTED hcall since it won't do this if it's decrementer has
>> gone negative, and anyway it's decrementer has been communicated to L0
>> in the hdec_expires field and L0 will return control to L1 when this
>> goes negative by delivering an H_DECREMENTER exception.
>>
>> Fixes: 95a6432ce903 "KVM: PPC: Book3S HV: Streamlined guest entry/exit path on P9 for radix guests"
>>
>> Signed-off-by: Suraj Jitindar Singh <sjitindarsingh@gmail.com>
>> ---
>>  arch/powerpc/kvm/book3s_hv.c | 11 +++++++++--
>>  1 file changed, 9 insertions(+), 2 deletions(-)
>>
>> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
>> index 719fd2529eec..4a5eb29b952f 100644
>> --- a/arch/powerpc/kvm/book3s_hv.c
>> +++ b/arch/powerpc/kvm/book3s_hv.c
>> @@ -4128,8 +4128,15 @@ int kvmhv_run_single_vcpu(struct kvm_run *kvm_run,
>>  
>>  	preempt_enable();
>>  
>> -	/* cancel pending decrementer exception if DEC is now positive */
>> -	if (get_tb() < vcpu->arch.dec_expires && kvmppc_core_pending_dec(vcpu))
>> +	/*
>> +	 * cancel pending decrementer exception if DEC is now positive, or if
>> +	 * entering a nested guest in which case the decrementer is now owned
>> +	 * by L2 and the L1 decrementer is provided in hdec_expires
>> +	 */
>> +	if (kvmppc_core_pending_dec(vcpu) &&
>> +			((get_tb() < vcpu->arch.dec_expires) ||
>> +			 (trap == BOOK3S_INTERRUPT_SYSCALL &&
>> +			  kvmppc_get_gpr(vcpu, 3) == H_ENTER_NESTED)))
>>  		kvmppc_core_dequeue_dec(vcpu);
>>  
>>  	trace_kvm_guest_exit(vcpu);
>>
> 
> Patches 2 and 3: tested I can boot and run an L2 nested guest with qemu
> v4.0.0 and caps-large-decr=on in the case we have had a hang previously.
> 
> Tested-by: Laurent Vivier <lvivier@redhat.com>

You beat me to it. All works fine on L0, L1, L2.

  Tested-by: Cédric Le Goater <clg@kaod.org>

With a QEMU-4.1. In this configuration, L2 runs with the XIVE (emulated) 
interrupt mode by default now (kernel_irqchip=allowed, ic-mode=dual).

Thanks,

C.



^ permalink raw reply

* [PATCH v4 0/6] Fixes related namespace alignment/page size/big endian
From: Aneesh Kumar K.V @ 2019-06-20  9:16 UTC (permalink / raw)
  To: dan.j.williams; +Cc: linux-mm, linuxppc-dev, Aneesh Kumar K.V, linux-nvdimm

This series handles configs where hugepage support is not enabled by default.
Also, we update some of the informational messages to make sure we use PAGE_SIZE instead
of SZ_4K. We now store the page size and struct page size in pfn_sb and do an extra check
before enabling the namespace. There is also an endianness fix.

The patch series is on top of subsection v10 patchset

http://lore.kernel.org/linux-mm/156092349300.979959.17603710711957735135.stgit@dwillia2-desk3.amr.corp.intel.com

Changes from V3:
* Dropped the change related PFN_MIN_VERSION
* for pfn_sb minor version < 4, we default page_size to PAGE_SIZE instead of SZ_4k.

Aneesh Kumar K.V (6):
  nvdimm: Consider probe return -EOPNOTSUPP as success
  mm/nvdimm: Add page size and struct page size to pfn superblock
  mm/nvdimm: Use correct #defines instead of open coding
  mm/nvdimm: Pick the right alignment default when creating dax devices
  mm/nvdimm: Use correct alignment when looking at first pfn from a
    region
  mm/nvdimm: Fix endian conversion issues 

 arch/powerpc/include/asm/libnvdimm.h |  9 ++++
 arch/powerpc/mm/Makefile             |  1 +
 arch/powerpc/mm/nvdimm.c             | 34 +++++++++++++++
 arch/x86/include/asm/libnvdimm.h     | 19 +++++++++
 drivers/nvdimm/btt.c                 |  8 ++--
 drivers/nvdimm/bus.c                 |  4 +-
 drivers/nvdimm/label.c               |  2 +-
 drivers/nvdimm/namespace_devs.c      | 13 +++---
 drivers/nvdimm/nd-core.h             |  3 +-
 drivers/nvdimm/nd.h                  |  6 ---
 drivers/nvdimm/pfn.h                 |  5 ++-
 drivers/nvdimm/pfn_devs.c            | 62 ++++++++++++++++++++++++++--
 drivers/nvdimm/pmem.c                | 26 ++++++++++--
 drivers/nvdimm/region_devs.c         | 27 ++++++++----
 include/linux/huge_mm.h              |  7 +++-
 kernel/memremap.c                    |  8 ++--
 16 files changed, 194 insertions(+), 40 deletions(-)
 create mode 100644 arch/powerpc/include/asm/libnvdimm.h
 create mode 100644 arch/powerpc/mm/nvdimm.c
 create mode 100644 arch/x86/include/asm/libnvdimm.h

-- 
2.21.0


^ permalink raw reply

* [PATCH v4 1/6] nvdimm: Consider probe return -EOPNOTSUPP as success
From: Aneesh Kumar K.V @ 2019-06-20  9:16 UTC (permalink / raw)
  To: dan.j.williams; +Cc: linux-mm, linuxppc-dev, Aneesh Kumar K.V, linux-nvdimm
In-Reply-To: <20190620091626.31824-1-aneesh.kumar@linux.ibm.com>

This patch adds -EOPNOTSUPP as a return value from the probe callback to
indicate we were not able to initialize a namespace due to a pfn superblock
feature/version mismatch. We want to consider this a probe success so that
we can create a new namespace seed and thereby avoid marking the failed
namespace as the seed namespace.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 drivers/nvdimm/bus.c         |  4 ++--
 drivers/nvdimm/nd-core.h     |  3 ++-
 drivers/nvdimm/pmem.c        | 26 ++++++++++++++++++++++----
 drivers/nvdimm/region_devs.c | 19 +++++++++++++++----
 4 files changed, 41 insertions(+), 11 deletions(-)

diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index 2dca3034fee0..3b8ffb3966ab 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -92,8 +92,8 @@ static int nvdimm_bus_probe(struct device *dev)
 
 	nvdimm_bus_probe_start(nvdimm_bus);
 	rc = nd_drv->probe(dev);
-	if (rc == 0)
-		nd_region_probe_success(nvdimm_bus, dev);
+	if (rc == 0 || rc == -EOPNOTSUPP)
+		nd_region_probe_success(nvdimm_bus, dev, rc);
 	else
 		nd_region_disable(nvdimm_bus, dev);
 	nvdimm_bus_probe_end(nvdimm_bus);
diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h
index 391e88de3a29..4e6ffa0d89bb 100644
--- a/drivers/nvdimm/nd-core.h
+++ b/drivers/nvdimm/nd-core.h
@@ -126,7 +126,8 @@ int __init nvdimm_bus_init(void);
 void nvdimm_bus_exit(void);
 void nvdimm_devs_exit(void);
 void nd_region_devs_exit(void);
-void nd_region_probe_success(struct nvdimm_bus *nvdimm_bus, struct device *dev);
+void nd_region_probe_success(struct nvdimm_bus *nvdimm_bus,
+			     struct device *dev, int ret);
 struct nd_region;
 void nd_region_create_ns_seed(struct nd_region *nd_region);
 void nd_region_create_btt_seed(struct nd_region *nd_region);
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 24d7fe7c74ed..422b11c01301 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -497,6 +497,7 @@ static int pmem_attach_disk(struct device *dev,
 
 static int nd_pmem_probe(struct device *dev)
 {
+	int ret;
 	struct nd_namespace_common *ndns;
 
 	ndns = nvdimm_namespace_common_probe(dev);
@@ -512,12 +513,29 @@ static int nd_pmem_probe(struct device *dev)
 	if (is_nd_pfn(dev))
 		return pmem_attach_disk(dev, ndns);
 
-	/* if we find a valid info-block we'll come back as that personality */
-	if (nd_btt_probe(dev, ndns) == 0 || nd_pfn_probe(dev, ndns) == 0
-			|| nd_dax_probe(dev, ndns) == 0)
+	ret = nd_btt_probe(dev, ndns);
+	if (ret == 0)
 		return -ENXIO;
+	else if (ret == -EOPNOTSUPP)
+		return ret;
 
-	/* ...otherwise we're just a raw pmem device */
+	ret = nd_pfn_probe(dev, ndns);
+	if (ret == 0)
+		return -ENXIO;
+	else if (ret == -EOPNOTSUPP)
+		return ret;
+
+	ret = nd_dax_probe(dev, ndns);
+	if (ret == 0)
+		return -ENXIO;
+	else if (ret == -EOPNOTSUPP)
+		return ret;
+	/*
+	 * We have two failure conditions here, there is no
+	 * info reserver block or we found a valid info reserve block
+	 * but failed to initialize the pfn superblock.
+	 * Don't create a raw pmem disk for the second case.
+	 */
 	return pmem_attach_disk(dev, ndns);
 }
 
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index 4fed9ce9c2fe..1e74a1c9fdac 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -715,7 +715,7 @@ void nd_mapping_free_labels(struct nd_mapping *nd_mapping)
  * disable the region.
  */
 static void nd_region_notify_driver_action(struct nvdimm_bus *nvdimm_bus,
-		struct device *dev, bool probe)
+					   struct device *dev, bool probe, int ret)
 {
 	struct nd_region *nd_region;
 
@@ -745,6 +745,16 @@ static void nd_region_notify_driver_action(struct nvdimm_bus *nvdimm_bus,
 			nd_region_create_ns_seed(nd_region);
 		nvdimm_bus_unlock(dev);
 	}
+
+	if (dev->parent && is_nd_region(dev->parent) &&
+	    !probe && (ret == -EOPNOTSUPP)) {
+		nd_region = to_nd_region(dev->parent);
+		nvdimm_bus_lock(dev);
+		if (nd_region->ns_seed == dev)
+			nd_region_create_ns_seed(nd_region);
+		nvdimm_bus_unlock(dev);
+	}
+
 	if (is_nd_btt(dev) && probe) {
 		struct nd_btt *nd_btt = to_nd_btt(dev);
 
@@ -780,14 +790,15 @@ static void nd_region_notify_driver_action(struct nvdimm_bus *nvdimm_bus,
 	}
 }
 
-void nd_region_probe_success(struct nvdimm_bus *nvdimm_bus, struct device *dev)
+void nd_region_probe_success(struct nvdimm_bus *nvdimm_bus,
+			     struct device *dev, int ret)
 {
-	nd_region_notify_driver_action(nvdimm_bus, dev, true);
+	nd_region_notify_driver_action(nvdimm_bus, dev, true, ret);
 }
 
 void nd_region_disable(struct nvdimm_bus *nvdimm_bus, struct device *dev)
 {
-	nd_region_notify_driver_action(nvdimm_bus, dev, false);
+	nd_region_notify_driver_action(nvdimm_bus, dev, false, 0);
 }
 
 static ssize_t mappingN(struct device *dev, char *buf, int n)
-- 
2.21.0


^ permalink raw reply related

* [PATCH v4 4/6] mm/nvdimm: Pick the right alignment default when creating dax devices
From: Aneesh Kumar K.V @ 2019-06-20  9:16 UTC (permalink / raw)
  To: dan.j.williams; +Cc: linux-mm, linuxppc-dev, Aneesh Kumar K.V, linux-nvdimm
In-Reply-To: <20190620091626.31824-1-aneesh.kumar@linux.ibm.com>

Allow the arch to provide the supported alignments, and use hugepage alignment
only if hugepages are actually supported. Right now we depend on compile-time
configs, whereas this patch switches to runtime discovery.

Architectures like ppc64 can have THP enabled in code, but then can have
hugepage size disabled by the hypervisor. This allows us to create dax devices
with PAGE_SIZE alignment in this case.

Existing dax namespaces with an alignment larger than PAGE_SIZE will fail to
initialize in this specific case. We still allow fsdax namespace initialization.

With respect to deciding whether to enable hugepage faults for a dax device:
if THP is enabled at compile time, we default to taking a hugepage fault, and
in the dax fault handler, if we find that the fault size > alignment, we retry
with a PAGE_SIZE fault size.

This also addresses the below failure scenario on ppc64

ndctl create-namespace --mode=devdax  | grep align
 "align":16777216,
 "align":16777216

cat /sys/devices/ndbus0/region0/dax0.0/supported_alignments
 65536 16777216

daxio.static-debug  -z -o /dev/dax0.0
  Bus error (core dumped)

  $ dmesg | tail
   lpar: Failed hash pte insert with error -4
   hash-mmu: mm: Hashing failure ! EA=0x7fff17000000 access=0x8000000000000006 current=daxio
   hash-mmu:     trap=0x300 vsid=0x22cb7a3 ssize=1 base psize=2 psize 10 pte=0xc000000501002b86
   daxio[3860]: bus error (7) at 7fff17000000 nip 7fff973c007c lr 7fff973bff34 code 2 in libpmem.so.1.0.0[7fff973b0000+20000]
   daxio[3860]: code: 792945e4 7d494b78 e95f0098 7d494b78 f93f00a0 4800012c e93f0088 f93f0120
   daxio[3860]: code: e93f00a0 f93f0128 e93f0120 e95f0128 <f9490000> e93f0088 39290008 f93f0110

The failure was due to guest kernel using wrong page size.

The namespaces created with 16M alignment will appear as below on a config with
16M page size disabled.

$ ndctl list -Ni
[
  {
    "dev":"namespace0.1",
    "mode":"fsdax",
    "map":"dev",
    "size":5351931904,
    "uuid":"fc6e9667-461a-4718-82b4-69b24570bddb",
    "align":16777216,
    "blockdev":"pmem0.1",
    "supported_alignments":[
      65536
    ]
  },
  {
    "dev":"namespace0.0",
    "mode":"fsdax",    <==== devdax 16M alignment marked disabled.
    "map":"mem",
    "size":5368709120,
    "uuid":"a4bdf81a-f2ee-4bc6-91db-7b87eddd0484",
    "state":"disabled"
  }
]

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 arch/powerpc/include/asm/libnvdimm.h |  9 ++++++++
 arch/powerpc/mm/Makefile             |  1 +
 arch/powerpc/mm/nvdimm.c             | 34 ++++++++++++++++++++++++++++
 arch/x86/include/asm/libnvdimm.h     | 19 ++++++++++++++++
 drivers/nvdimm/nd.h                  |  6 -----
 drivers/nvdimm/pfn_devs.c            | 32 +++++++++++++++++++++++++-
 include/linux/huge_mm.h              |  7 +++++-
 7 files changed, 100 insertions(+), 8 deletions(-)
 create mode 100644 arch/powerpc/include/asm/libnvdimm.h
 create mode 100644 arch/powerpc/mm/nvdimm.c
 create mode 100644 arch/x86/include/asm/libnvdimm.h

diff --git a/arch/powerpc/include/asm/libnvdimm.h b/arch/powerpc/include/asm/libnvdimm.h
new file mode 100644
index 000000000000..d35fd7f48603
--- /dev/null
+++ b/arch/powerpc/include/asm/libnvdimm.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_POWERPC_LIBNVDIMM_H
+#define _ASM_POWERPC_LIBNVDIMM_H
+
+#define nd_pfn_supported_alignments nd_pfn_supported_alignments
+extern unsigned long *nd_pfn_supported_alignments(void);
+extern unsigned long nd_pfn_default_alignment(void);
+
+#endif
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index 0f499db315d6..42e4a399ba5d 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -20,3 +20,4 @@ obj-$(CONFIG_HIGHMEM)		+= highmem.o
 obj-$(CONFIG_PPC_COPRO_BASE)	+= copro_fault.o
 obj-$(CONFIG_PPC_PTDUMP)	+= ptdump/
 obj-$(CONFIG_KASAN)		+= kasan/
+obj-$(CONFIG_NVDIMM_PFN)		+= nvdimm.o
diff --git a/arch/powerpc/mm/nvdimm.c b/arch/powerpc/mm/nvdimm.c
new file mode 100644
index 000000000000..a29a4510715e
--- /dev/null
+++ b/arch/powerpc/mm/nvdimm.c
@@ -0,0 +1,34 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <asm/pgtable.h>
+#include <asm/page.h>
+
+#include <linux/mm.h>
+/*
+ * We support only pte and pmd mappings for now.
+ */
+const unsigned long *nd_pfn_supported_alignments(void)
+{
+	static unsigned long supported_alignments[3];
+
+	supported_alignments[0] = PAGE_SIZE;
+
+	if (has_transparent_hugepage())
+		supported_alignments[1] = HPAGE_PMD_SIZE;
+	else
+		supported_alignments[1] = 0;
+
+	supported_alignments[2] = 0;
+	return supported_alignments;
+}
+
+/*
+ * Use pmd mapping if supported as default alignment
+ */
+unsigned long nd_pfn_default_alignment(void)
+{
+
+	if (has_transparent_hugepage())
+		return HPAGE_PMD_SIZE;
+	return PAGE_SIZE;
+}
diff --git a/arch/x86/include/asm/libnvdimm.h b/arch/x86/include/asm/libnvdimm.h
new file mode 100644
index 000000000000..3d5361db9164
--- /dev/null
+++ b/arch/x86/include/asm/libnvdimm.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_LIBNVDIMM_H
+#define _ASM_X86_LIBNVDIMM_H
+
+static inline unsigned long nd_pfn_default_alignment(void)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	return HPAGE_PMD_SIZE;
+#else
+	return PAGE_SIZE;
+#endif
+}
+
+static inline unsigned long nd_altmap_align_size(unsigned long nd_align)
+{
+	return PMD_SIZE;
+}
+
+#endif
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index d24304c0e6d7..e2fbb51fb361 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -288,12 +288,6 @@ static inline struct device *nd_btt_create(struct nd_region *nd_region)
 struct nd_pfn *to_nd_pfn(struct device *dev);
 #if IS_ENABLED(CONFIG_NVDIMM_PFN)
 
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#define PFN_DEFAULT_ALIGNMENT HPAGE_PMD_SIZE
-#else
-#define PFN_DEFAULT_ALIGNMENT PAGE_SIZE
-#endif
-
 int nd_pfn_probe(struct device *dev, struct nd_namespace_common *ndns);
 bool is_nd_pfn(struct device *dev);
 struct device *nd_pfn_create(struct nd_region *nd_region);
diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index 9410d2692913..29bb46ca92f2 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -10,6 +10,7 @@
 #include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/mm.h>
+#include <asm/libnvdimm.h>
 #include "nd-core.h"
 #include "pfn.h"
 #include "nd.h"
@@ -103,6 +104,8 @@ static ssize_t align_show(struct device *dev,
 	return sprintf(buf, "%ld\n", nd_pfn->align);
 }
 
+#ifndef nd_pfn_supported_alignments
+#define nd_pfn_supported_alignments nd_pfn_supported_alignments
 static const unsigned long *nd_pfn_supported_alignments(void)
 {
 	/*
@@ -125,6 +128,7 @@ static const unsigned long *nd_pfn_supported_alignments(void)
 
 	return data;
 }
+#endif
 
 static ssize_t align_store(struct device *dev,
 		struct device_attribute *attr, const char *buf, size_t len)
@@ -302,7 +306,7 @@ struct device *nd_pfn_devinit(struct nd_pfn *nd_pfn,
 		return NULL;
 
 	nd_pfn->mode = PFN_MODE_NONE;
-	nd_pfn->align = PFN_DEFAULT_ALIGNMENT;
+	nd_pfn->align = nd_pfn_default_alignment();
 	dev = &nd_pfn->dev;
 	device_initialize(&nd_pfn->dev);
 	if (ndns && !__nd_attach_ndns(&nd_pfn->dev, ndns, &nd_pfn->ndns)) {
@@ -412,6 +416,20 @@ static int nd_pfn_clear_memmap_errors(struct nd_pfn *nd_pfn)
 	return 0;
 }
 
+static bool nd_supported_alignment(unsigned long align)
+{
+	int i;
+	const unsigned long *supported = nd_pfn_supported_alignments();
+
+	if (align == 0)
+		return false;
+
+	for (i = 0; supported[i]; i++)
+		if (align == supported[i])
+			return true;
+	return false;
+}
+
 /**
  * nd_pfn_validate - read and validate info-block
  * @nd_pfn: fsdax namespace runtime state / properties
@@ -498,6 +516,18 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig)
 		return -EOPNOTSUPP;
 	}
 
+	/*
+	 * Check whether we support the alignment. For Dax, if the
+	 * superblock alignment is not supported, we won't initialize
+	 * the device.
+	 */
+	if (!nd_supported_alignment(align) &&
+	    !memcmp(pfn_sb->signature, DAX_SIG, PFN_SIG_LEN)) {
+		dev_err(&nd_pfn->dev, "init failed, alignment mismatch: "
+			"%ld:%ld\n", nd_pfn->align, align);
+		return -EOPNOTSUPP;
+	}
+
 	if (!nd_pfn->uuid) {
 		/*
 		 * When probing a namepace via nd_pfn_probe() the uuid
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 7cd5c150c21d..64d16794bb27 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -108,7 +108,12 @@ static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma)
 
 	if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_FLAG))
 		return true;
-
+	/*
+	 * For dax let's try to do hugepage fault always. If we don't support
+	 * hugepages we will not have enabled namespaces with hugepage alignment.
+	 * This also means we try to handle hugepage fault on device with
+	 * smaller alignment. But then we will return with VM_FAULT_FALLBACK
+	 */
 	if (vma_is_dax(vma))
 		return true;
 
-- 
2.21.0


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox