LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH] [PATCH] powerpc/sstep: Check ISA 3.0 instruction validity before emulation
From: Ananth N Mavinakayanahalli @ 2021-01-20  9:46 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: naveen.n.rao, ravi.bangoria, dja

We currently unconditionally try to newer emulate instructions on older
Power versions that could cause issues. Gate it.

Signed-off-by: Ananth N Mavinakayanahalli <ananth@linux.ibm.com>
---
 arch/powerpc/lib/sstep.c |   40 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index bf7a7d62ae8b..ed119858e5e9 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -1528,6 +1528,8 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
 		goto compute_done;
 
 	case 19:
+		if (!cpu_has_feature(CPU_FTR_ARCH_300))
+			return -1;
 		if (((word >> 1) & 0x1f) == 2) {
 			/* addpcis */
 			imm = (short) (word & 0xffc1);	/* d0 + d2 fields */
@@ -2439,6 +2441,8 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
 			break;
 
 		case 268:	/* lxvx */
+			if (!cpu_has_feature(CPU_FTR_ARCH_300))
+				return -1;
 			op->reg = rd | ((word & 1) << 5);
 			op->type = MKOP(LOAD_VSX, 0, 16);
 			op->element_size = 16;
@@ -2448,6 +2452,8 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
 		case 269:	/* lxvl */
 		case 301: {	/* lxvll */
 			int nb;
+			if (!cpu_has_feature(CPU_FTR_ARCH_300))
+				return -1;
 			op->reg = rd | ((word & 1) << 5);
 			op->ea = ra ? regs->gpr[ra] : 0;
 			nb = regs->gpr[rb] & 0xff;
@@ -2475,6 +2481,8 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
 			break;
 
 		case 364:	/* lxvwsx */
+			if (!cpu_has_feature(CPU_FTR_ARCH_300))
+				return -1;
 			op->reg = rd | ((word & 1) << 5);
 			op->type = MKOP(LOAD_VSX, 0, 4);
 			op->element_size = 4;
@@ -2482,6 +2490,8 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
 			break;
 
 		case 396:	/* stxvx */
+			if (!cpu_has_feature(CPU_FTR_ARCH_300))
+				return -1;
 			op->reg = rd | ((word & 1) << 5);
 			op->type = MKOP(STORE_VSX, 0, 16);
 			op->element_size = 16;
@@ -2491,6 +2501,8 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
 		case 397:	/* stxvl */
 		case 429: {	/* stxvll */
 			int nb;
+			if (!cpu_has_feature(CPU_FTR_ARCH_300))
+				return -1;
 			op->reg = rd | ((word & 1) << 5);
 			op->ea = ra ? regs->gpr[ra] : 0;
 			nb = regs->gpr[rb] & 0xff;
@@ -2542,6 +2554,8 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
 			break;
 
 		case 781:	/* lxsibzx */
+			if (!cpu_has_feature(CPU_FTR_ARCH_300))
+				return -1;
 			op->reg = rd | ((word & 1) << 5);
 			op->type = MKOP(LOAD_VSX, 0, 1);
 			op->element_size = 8;
@@ -2549,6 +2563,8 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
 			break;
 
 		case 812:	/* lxvh8x */
+			if (!cpu_has_feature(CPU_FTR_ARCH_300))
+				return -1;
 			op->reg = rd | ((word & 1) << 5);
 			op->type = MKOP(LOAD_VSX, 0, 16);
 			op->element_size = 2;
@@ -2556,6 +2572,8 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
 			break;
 
 		case 813:	/* lxsihzx */
+			if (!cpu_has_feature(CPU_FTR_ARCH_300))
+				return -1;
 			op->reg = rd | ((word & 1) << 5);
 			op->type = MKOP(LOAD_VSX, 0, 2);
 			op->element_size = 8;
@@ -2569,6 +2587,8 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
 			break;
 
 		case 876:	/* lxvb16x */
+			if (!cpu_has_feature(CPU_FTR_ARCH_300))
+				return -1;
 			op->reg = rd | ((word & 1) << 5);
 			op->type = MKOP(LOAD_VSX, 0, 16);
 			op->element_size = 1;
@@ -2582,6 +2602,8 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
 			break;
 
 		case 909:	/* stxsibx */
+			if (!cpu_has_feature(CPU_FTR_ARCH_300))
+				return -1;
 			op->reg = rd | ((word & 1) << 5);
 			op->type = MKOP(STORE_VSX, 0, 1);
 			op->element_size = 8;
@@ -2589,6 +2611,8 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
 			break;
 
 		case 940:	/* stxvh8x */
+			if (!cpu_has_feature(CPU_FTR_ARCH_300))
+				return -1;
 			op->reg = rd | ((word & 1) << 5);
 			op->type = MKOP(STORE_VSX, 0, 16);
 			op->element_size = 2;
@@ -2596,6 +2620,8 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
 			break;
 
 		case 941:	/* stxsihx */
+			if (!cpu_has_feature(CPU_FTR_ARCH_300))
+				return -1;
 			op->reg = rd | ((word & 1) << 5);
 			op->type = MKOP(STORE_VSX, 0, 2);
 			op->element_size = 8;
@@ -2609,6 +2635,8 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
 			break;
 
 		case 1004:	/* stxvb16x */
+			if (!cpu_has_feature(CPU_FTR_ARCH_300))
+				return -1;
 			op->reg = rd | ((word & 1) << 5);
 			op->type = MKOP(STORE_VSX, 0, 16);
 			op->element_size = 1;
@@ -2717,12 +2745,16 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
 			op->type = MKOP(LOAD_FP, 0, 16);
 			break;
 		case 2:		/* lxsd */
+			if (!cpu_has_feature(CPU_FTR_ARCH_300))
+				return -1;
 			op->reg = rd + 32;
 			op->type = MKOP(LOAD_VSX, 0, 8);
 			op->element_size = 8;
 			op->vsx_flags = VSX_CHECK_VEC;
 			break;
 		case 3:		/* lxssp */
+			if (!cpu_has_feature(CPU_FTR_ARCH_300))
+				return -1;
 			op->reg = rd + 32;
 			op->type = MKOP(LOAD_VSX, 0, 4);
 			op->element_size = 8;
@@ -2775,6 +2807,8 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
 			break;
 
 		case 1:		/* lxv */
+			if (!cpu_has_feature(CPU_FTR_ARCH_300))
+				return -1;
 			op->ea = dqform_ea(word, regs);
 			if (word & 8)
 				op->reg = rd + 32;
@@ -2785,6 +2819,8 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
 
 		case 2:		/* stxsd with LSB of DS field = 0 */
 		case 6:		/* stxsd with LSB of DS field = 1 */
+			if (!cpu_has_feature(CPU_FTR_ARCH_300))
+				return -1;
 			op->ea = dsform_ea(word, regs);
 			op->reg = rd + 32;
 			op->type = MKOP(STORE_VSX, 0, 8);
@@ -2794,6 +2830,8 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
 
 		case 3:		/* stxssp with LSB of DS field = 0 */
 		case 7:		/* stxssp with LSB of DS field = 1 */
+			if (!cpu_has_feature(CPU_FTR_ARCH_300))
+				return -1;
 			op->ea = dsform_ea(word, regs);
 			op->reg = rd + 32;
 			op->type = MKOP(STORE_VSX, 0, 4);
@@ -2802,6 +2840,8 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
 			break;
 
 		case 5:		/* stxv */
+			if (!cpu_has_feature(CPU_FTR_ARCH_300))
+				return -1;
 			op->ea = dqform_ea(word, regs);
 			if (word & 8)
 				op->reg = rd + 32;



^ permalink raw reply related

* RE: [PATCH 2/2] powerpc/44x: Remove STDBINUTILS kconfig option
From: David Laight @ 2021-01-20  9:33 UTC (permalink / raw)
  To: 'Christophe Leroy', Benjamin Herrenschmidt,
	Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev@lists.ozlabs.org, linux-kernel@vger.kernel.org
In-Reply-To: <f9981e819009aa121a998dc483052ec76f78f991.1611128938.git.christophe.leroy@csgroup.eu>

From: Christophe Leroy
> Sent: 20 January 2021 07:49
> 
> STDBINUTILS is just a toggle to allow 256k page size
> to appear in the possible page sizes list for the 44x.
> 
> Make 256k page size appear all the time with an
> explicit warning on binutils, and remove this unneccessary
> STDBINUTILS config option.
> 
> Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
> ---
>  arch/powerpc/Kconfig | 27 +++++++--------------------
>  1 file changed, 7 insertions(+), 20 deletions(-)
> 
> diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
> index a685e42d3993..3e29995540a7 100644
> --- a/arch/powerpc/Kconfig
> +++ b/arch/powerpc/Kconfig
> @@ -720,18 +720,6 @@ config ARCH_MEMORY_PROBE
>  	def_bool y
>  	depends on MEMORY_HOTPLUG
> 
> -config STDBINUTILS
> -	bool "Using standard binutils settings"
> -	depends on 44x
> -	default y
> -	help
> -	  Turning this option off allows you to select 256KB PAGE_SIZE on 44x.
> -	  Note, that kernel will be able to run only those applications,
> -	  which had been compiled using binutils later than 2.17.50.0.3 with
> -	  '-zmax-page-size' set to 256K (the default is 64K). Or, if using
> -	  the older binutils, you can patch them with a trivial patch, which
> -	  changes the ELF_MAXPAGESIZE definition from 0x10000 to 0x40000.
> -
>  choice
>  	prompt "Page size"
>  	default PPC_4K_PAGES
> @@ -771,17 +759,16 @@ config PPC_64K_PAGES
>  	select HAVE_ARCH_SOFT_DIRTY if PPC_BOOK3S_64
> 
>  config PPC_256K_PAGES
> -	bool "256k page size"
> -	depends on 44x && !STDBINUTILS && !PPC_47x
> +	bool "256k page size (Requires non-standard binutils settings)"
> +	depends on 44x && !PPC_47x
>  	help
>  	  Make the page size 256k.
> 
> -	  As the ELF standard only requires alignment to support page
> -	  sizes up to 64k, you will need to compile all of your user
> -	  space applications with a non-standard binutils settings
> -	  (see the STDBINUTILS description for details).
> -
> -	  Say N unless you know what you are doing.
> +	  That kernel will be able to run only those applications,
> +	  which had been compiled using binutils later than 2.17.50.0.3 with
> +	  '-zmax-page-size' set to 256K (the default is 64K). Or, if using
> +	  the older binutils, you can patch them with a trivial patch, which
> +	  changes the ELF_MAXPAGESIZE definition from 0x10000 to 0x40000.


The kernel will only be able to run applications that have been
compiled with '-zmax-page-size' set to 256K (the default is 64K)
using binutils later than 2.17.50.0.3, or by patching the
ELF_MAXPAGESIZE definition from 0x10000 to 0x40000 in older versions.

> 
>  endchoice
> 
> --
> 2.25.0

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)


^ permalink raw reply

* [PATCH] powerpc/64: prevent replayed interrupt handlers from running softirqs
From: Nicholas Piggin @ 2021-01-20  7:50 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Nicholas Piggin

Running softirqs enables interrupts, which can then end up recursing
into the irq soft-mask code we're adjusting, including replaying
interrupts itself, which might be theoretically unbounded.

This abridged trace shows how this can occur:

! NIP replay_soft_interrupts
  LR  interrupt_exit_kernel_prepare
  Call Trace:
    interrupt_exit_kernel_prepare (unreliable)
    interrupt_return
  --- interrupt: ea0 at __rb_reserve_next
  NIP __rb_reserve_next
  LR __rb_reserve_next
  Call Trace:
    ring_buffer_lock_reserve
    trace_function
    function_trace_call
    ftrace_call
    __do_softirq
    irq_exit
    timer_interrupt
!   replay_soft_interrupts
    interrupt_exit_kernel_prepare
    interrupt_return
  --- interrupt: ea0 at arch_local_irq_restore

Fix this by disabling bhs (softirqs) around the interrupt replay.

I don't know that commit 3282a3da25bd ("powerpc/64: Implement soft
interrupt replay in C") actually introduced the problem. Prior to that
change, the interrupt replay seems like it should still be subect to
this recusion, however it's done after all the irq state has been fixed
up at the end of the replay, so it seems reasonable to fix back to this
commit.

Fixes: 3282a3da25bd ("powerpc/64: Implement soft interrupt replay in C")
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/kernel/irq.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 6b1eca53e36c..7064135f9dc3 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -188,6 +188,18 @@ void replay_soft_interrupts(void)
 	unsigned char happened = local_paca->irq_happened;
 	struct pt_regs regs;
 
+	/*
+	 * Prevent softirqs from being run when an interrupt handler returns
+	 * and calls irq_exit(), because softirq processing enables interrupts.
+	 * If an interrupt is taken, it may then call replay_soft_interrupts
+	 * on its way out, which gets messy and recursive.
+	 *
+	 * softirqs created by replayed interrupts will be run at the end of
+	 * this function when bhs are enabled (if they were enabled in our
+	 * caller).
+	 */
+	local_bh_disable();
+
 	ppc_save_regs(&regs);
 	regs.softe = IRQS_ENABLED;
 
@@ -263,6 +275,8 @@ void replay_soft_interrupts(void)
 		trace_hardirqs_off();
 		goto again;
 	}
+
+	local_bh_enable();
 }
 
 notrace void arch_local_irq_restore(unsigned long mask)
-- 
2.23.0


^ permalink raw reply related

* [PATCH 2/2] powerpc/44x: Remove STDBINUTILS kconfig option
From: Christophe Leroy @ 2021-01-20  7:49 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <2fed79b1154c872194f98bac4422c23918325e61.1611128938.git.christophe.leroy@csgroup.eu>

STDBINUTILS is just a toggle to allow 256k page size
to appear in the possible page sizes list for the 44x.

Make 256k page size appear all the time with an
explicit warning on binutils, and remove this unneccessary
STDBINUTILS config option.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/Kconfig | 27 +++++++--------------------
 1 file changed, 7 insertions(+), 20 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index a685e42d3993..3e29995540a7 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -720,18 +720,6 @@ config ARCH_MEMORY_PROBE
 	def_bool y
 	depends on MEMORY_HOTPLUG
 
-config STDBINUTILS
-	bool "Using standard binutils settings"
-	depends on 44x
-	default y
-	help
-	  Turning this option off allows you to select 256KB PAGE_SIZE on 44x.
-	  Note, that kernel will be able to run only those applications,
-	  which had been compiled using binutils later than 2.17.50.0.3 with
-	  '-zmax-page-size' set to 256K (the default is 64K). Or, if using
-	  the older binutils, you can patch them with a trivial patch, which
-	  changes the ELF_MAXPAGESIZE definition from 0x10000 to 0x40000.
-
 choice
 	prompt "Page size"
 	default PPC_4K_PAGES
@@ -771,17 +759,16 @@ config PPC_64K_PAGES
 	select HAVE_ARCH_SOFT_DIRTY if PPC_BOOK3S_64
 
 config PPC_256K_PAGES
-	bool "256k page size"
-	depends on 44x && !STDBINUTILS && !PPC_47x
+	bool "256k page size (Requires non-standard binutils settings)"
+	depends on 44x && !PPC_47x
 	help
 	  Make the page size 256k.
 
-	  As the ELF standard only requires alignment to support page
-	  sizes up to 64k, you will need to compile all of your user
-	  space applications with a non-standard binutils settings
-	  (see the STDBINUTILS description for details).
-
-	  Say N unless you know what you are doing.
+	  That kernel will be able to run only those applications,
+	  which had been compiled using binutils later than 2.17.50.0.3 with
+	  '-zmax-page-size' set to 256K (the default is 64K). Or, if using
+	  the older binutils, you can patch them with a trivial patch, which
+	  changes the ELF_MAXPAGESIZE definition from 0x10000 to 0x40000.
 
 endchoice
 
-- 
2.25.0


^ permalink raw reply related

* [PATCH 1/2] powerpc/47x: Disable 256k page size
From: Christophe Leroy @ 2021-01-20  7:49 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev, linux-kernel

PPC47x_TLBE_SIZE isn't defined for 256k pages, so
this size of page shall not be selected for 47x.

Reported-by: kernel test robot <lkp@intel.com>
Fixes: e7f75ad01d59 ("powerpc/47x: Base ppc476 support")
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 107bb4319e0e..a685e42d3993 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -772,7 +772,7 @@ config PPC_64K_PAGES
 
 config PPC_256K_PAGES
 	bool "256k page size"
-	depends on 44x && !STDBINUTILS
+	depends on 44x && !STDBINUTILS && !PPC_47x
 	help
 	  Make the page size 256k.
 
-- 
2.25.0


^ permalink raw reply related

* [PATCH 2/2] powerpc/32s: Unroll kuep_lock and kuep_unlock macros
From: Christophe Leroy @ 2021-01-20  7:34 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <00e462bd8b271bed0fec62dc9b25ffd4e0ec19cf.1611128021.git.christophe.leroy@csgroup.eu>

Unroll the loops in kuep_lock and kuep_unlock.

Benchmarked on an mpc 8321 with a standard kernel having a
3M/1M user/kernel memory split, i.e. 12 segments for user.

Without KUEP, null_syscall benchmark is 220 cycles.
With KUEP, null_syscall benchmark is 439 cycles.

Once loops are unrolled, null_syscall benchmark is 366 cycles.
This is almost 17% reduction.

It is assumed that userspace covers at least 4 segments and
at most 14 segments.

The isync is removed, it saves 8 cycles. For kuep_unlock, the rfi
will do the synchronisation. For kuep_lock, we get a small window
during which exec is still possible, but is won't last more than a
few instructions.

Both macros are called two times so the size increase is in
the noise (approx 120 instructions).

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/include/asm/book3s/32/kup.h | 67 ++++++++++++++++++------
 1 file changed, 52 insertions(+), 15 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/32/kup.h b/arch/powerpc/include/asm/book3s/32/kup.h
index a0117a9d5b06..e800b515ac02 100644
--- a/arch/powerpc/include/asm/book3s/32/kup.h
+++ b/arch/powerpc/include/asm/book3s/32/kup.h
@@ -7,21 +7,61 @@
 
 #ifdef __ASSEMBLY__
 
-.macro kuep_update_sr	gpr1, gpr2		/* NEVER use r0 as gpr2 due to addis */
-101:	mtsrin	\gpr1, \gpr2
-	addi	\gpr1, \gpr1, 0x111		/* next VSID */
-	rlwinm	\gpr1, \gpr1, 0, 0xf0ffffff	/* clear VSID overflow */
-	addis	\gpr2, \gpr2, 0x1000		/* address of next segment */
-	bdnz	101b
-	isync
+.macro kuep_increment gpr1, gpr2
+	addi	\gpr1, \gpr1, 0x222		/* Next second VSID */
+	addi	\gpr2, \gpr2, 0x222		/* Next second VSID */
+	rlwinm	\gpr1, \gpr1, 0, 0xf0ffffff	/* Clear VSID overflow */
+	rlwinm	\gpr2, \gpr2, 0, 0xf0ffffff	/* Clear VSID overflow */
+.endm
+
+.macro kuep_update_sr gpr1, gpr2		/* NEVER use r0 as gpr1 or gpr2 due to addi */
+	addi	\gpr2, \gpr1, 0x111		/* Next VSID */
+	rlwinm	\gpr2, \gpr2, 0, 0xf0ffffff	/* Clear VSID overflow */
+	mtsr	0, \gpr1
+	mtsr	1, \gpr2
+	kuep_increment \gpr1, \gpr2
+	mtsr	2, \gpr1
+	mtsr	3, \gpr2
+#if NUM_USER_SEGMENTS > 4
+	kuep_increment \gpr1, \gpr2
+	mtsr	4, \gpr1
+#if NUM_USER_SEGMENTS > 5
+	mtsr	5, \gpr2
+#if NUM_USER_SEGMENTS > 6
+	kuep_increment \gpr1, \gpr2
+	mtsr	6, \gpr1
+#if NUM_USER_SEGMENTS > 7
+	mtsr	7, \gpr2
+#if NUM_USER_SEGMENTS > 8
+	kuep_increment \gpr1, \gpr2
+	mtsr	8, \gpr1
+#if NUM_USER_SEGMENTS > 9
+	mtsr	9, \gpr2
+#if NUM_USER_SEGMENTS > 10
+	kuep_increment \gpr1, \gpr2
+	mtsr	10, \gpr1
+#if NUM_USER_SEGMENTS > 11
+	mtsr	11, \gpr2
+#if NUM_USER_SEGMENTS > 12
+	kuep_increment \gpr1, \gpr2
+	mtsr	12, \gpr1
+#if NUM_USER_SEGMENTS > 13
+	mtsr	13, \gpr2
+#endif
+#endif
+#endif
+#endif
+#endif
+#endif
+#endif
+#endif
+#endif
+#endif
 .endm
 
 .macro kuep_lock	gpr1, gpr2
 #ifdef CONFIG_PPC_KUEP
-	li	\gpr1, NUM_USER_SEGMENTS
-	li	\gpr2, 0
-	mtctr	\gpr1
-	mfsrin	\gpr1, \gpr2
+	mfsr	\gpr1, 0
 	oris	\gpr1, \gpr1, SR_NX@h		/* set Nx */
 	kuep_update_sr \gpr1, \gpr2
 #endif
@@ -29,10 +69,7 @@
 
 .macro kuep_unlock	gpr1, gpr2
 #ifdef CONFIG_PPC_KUEP
-	li	\gpr1, NUM_USER_SEGMENTS
-	li	\gpr2, 0
-	mtctr	\gpr1
-	mfsrin	\gpr1, \gpr2
+	mfsr	\gpr1, 0
 	rlwinm	\gpr1, \gpr1, 0, ~SR_NX		/* Clear Nx */
 	kuep_update_sr \gpr1, \gpr2
 #endif
-- 
2.25.0


^ permalink raw reply related

* [PATCH 1/2] powerpc/32s: Add missing call to kuep_lock on syscall entry
From: Christophe Leroy @ 2021-01-20  7:33 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev, linux-kernel

Userspace Execution protection and fast syscall entry were implemented
independently from each other and were both merged in kernel 5.2,
leading to syscall entry missing userspace execution protection.

On syscall entry, execution of user space memory must be
locked in the same way as on exception entry.

Fixes: b86fb88855ea ("powerpc/32: implement fast entry for syscalls on non BOOKE")
Cc: stable@vger.kernel.org
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/kernel/entry_32.S | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index 1c9b0ccc2172..9bc4e7dd0bee 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -356,6 +356,9 @@ trace_syscall_entry_irq_off:
 
 	.globl	transfer_to_syscall
 transfer_to_syscall:
+#ifdef CONFIG_PPC_BOOK3S_32
+	kuep_lock r11, r12
+#endif
 #ifdef CONFIG_TRACE_IRQFLAGS
 	andi.	r12,r9,MSR_EE
 	beq-	trace_syscall_entry_irq_off
-- 
2.25.0


^ permalink raw reply related

* Re: [PATCH] powerpc/47x: Disable 256k page size
From: Christophe Leroy @ 2021-01-20  6:31 UTC (permalink / raw)
  To: Michael Ellerman, Benjamin Herrenschmidt, Paul Mackerras
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <87h7ncqhz1.fsf@mpe.ellerman.id.au>



Le 20/01/2021 à 06:45, Michael Ellerman a écrit :
> Christophe Leroy <christophe.leroy@csgroup.eu> writes:
>> PPC47x_TLBE_SIZE isn't defined for 256k pages, so
>> this size of page shall not be selected for 47x.
>>
>> Reported-by: kernel test robot <lkp@intel.com>
>> Fixes: e7f75ad01d59 ("powerpc/47x: Base ppc476 support")
>> Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
>> ---
>>   arch/powerpc/Kconfig | 2 +-
>>   1 file changed, 1 insertion(+), 1 deletion(-)
>>
>> diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
>> index 107bb4319e0e..a685e42d3993 100644
>> --- a/arch/powerpc/Kconfig
>> +++ b/arch/powerpc/Kconfig
>> @@ -772,7 +772,7 @@ config PPC_64K_PAGES
>>   
>>   config PPC_256K_PAGES
>>   	bool "256k page size"
>> -	depends on 44x && !STDBINUTILS
>> +	depends on 44x && !STDBINUTILS && !PPC_47x
> 
> Do we still need this STDBINUTILS thing?
> 
> It's pretty gross, and I notice we have zero defconfigs which disable
> it, meaning it's only randconfig builds that will ever test 256K pages.
> 
> Can we just drop it and say if you enable 256K pages you need to know
> what you're doing?

I guess we can, yes. I'll send a patch for that.

Christophe

> 

^ permalink raw reply

* Re: [PATCH] powerpc/47x: Disable 256k page size
From: Michael Ellerman @ 2021-01-20  5:45 UTC (permalink / raw)
  To: Christophe Leroy, Benjamin Herrenschmidt, Paul Mackerras
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <2fed79b1154c872194f98bac4422c23918325e61.1611039590.git.christophe.leroy@csgroup.eu>

Christophe Leroy <christophe.leroy@csgroup.eu> writes:
> PPC47x_TLBE_SIZE isn't defined for 256k pages, so
> this size of page shall not be selected for 47x.
>
> Reported-by: kernel test robot <lkp@intel.com>
> Fixes: e7f75ad01d59 ("powerpc/47x: Base ppc476 support")
> Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
> ---
>  arch/powerpc/Kconfig | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
> index 107bb4319e0e..a685e42d3993 100644
> --- a/arch/powerpc/Kconfig
> +++ b/arch/powerpc/Kconfig
> @@ -772,7 +772,7 @@ config PPC_64K_PAGES
>  
>  config PPC_256K_PAGES
>  	bool "256k page size"
> -	depends on 44x && !STDBINUTILS
> +	depends on 44x && !STDBINUTILS && !PPC_47x

Do we still need this STDBINUTILS thing?

It's pretty gross, and I notice we have zero defconfigs which disable
it, meaning it's only randconfig builds that will ever test 256K pages.

Can we just drop it and say if you enable 256K pages you need to know
what you're doing?

cheers

^ permalink raw reply

* Re: [PATCH v3 1/8] powerpc/uaccess: Add unsafe_copy_from_user
From: Christopher M. Riedl @ 2021-01-20  5:08 UTC (permalink / raw)
  To: Christophe Leroy, Michael Ellerman, linuxppc-dev
In-Reply-To: <1e6309a4-58fe-c6a4-6e47-d8659177846c@csgroup.eu>

On Tue Jan 19, 2021 at 11:27 AM CST, Christophe Leroy wrote:
>
>
> Le 19/01/2021 à 18:02, Christopher M. Riedl a écrit :
> > On Tue Jan 19, 2021 at 6:33 AM CST, Christophe Leroy wrote:
> >>
> >>
> >> Le 19/01/2021 à 03:11, Michael Ellerman a écrit :
> >>> "Christopher M. Riedl" <cmr@codefail.de> writes:
> >>>> On Mon Jan 11, 2021 at 7:22 AM CST, Christophe Leroy wrote:
> >>>>> Le 09/01/2021 à 04:25, Christopher M. Riedl a écrit :
> >>>>>> Implement raw_copy_from_user_allowed() which assumes that userspace read
> >>>>>> access is open. Use this new function to implement raw_copy_from_user().
> >>>>>> Finally, wrap the new function to follow the usual "unsafe_" convention
> >>>>>> of taking a label argument.
> >>>>>
> >>>>> I think there is no point implementing raw_copy_from_user_allowed(), see
> >>>>> https://github.com/linuxppc/linux/commit/4b842e4e25b1 and
> >>>>> https://patchwork.ozlabs.org/project/linuxppc-dev/patch/8c74fc9ce8131cabb10b3e95dc0e430f396ee83e.1610369143.git.christophe.leroy@csgroup.eu/
> >>>>>
> >>>>> You should simply do:
> >>>>>
> >>>>> #define unsafe_copy_from_user(d, s, l, e) \
> >>>>> unsafe_op_wrap(__copy_tofrom_user((__force void __user *)d, s, l), e)
> >>>>>
> >>>>
> >>>> I gave this a try and the signal ops decreased by ~8K. Now, to be
> >>>> honest, I am not sure what an "acceptable" benchmark number here
> >>>> actually is - so maybe this is ok? Same loss with both radix and hash:
> >>>>
> >>>> 	|                                      | hash   | radix  |
> >>>> 	| ------------------------------------ | ------ | ------ |
> >>>> 	| linuxppc/next                        | 118693 | 133296 |
> >>>> 	| linuxppc/next w/o KUAP+KUEP          | 228911 | 228654 |
> >>>> 	| unsafe-signal64                      | 200480 | 234067 |
> >>>> 	| unsafe-signal64 (__copy_tofrom_user) | 192467 | 225119 |
> >>>>
> >>>> To put this into perspective, prior to KUAP and uaccess flush, signal
> >>>> performance in this benchmark was ~290K on hash.
> >>>
> >>> If I'm doing the math right 8K is ~4% of the best number.
> >>>
> >>> It seems like 4% is worth a few lines of code to handle these constant
> >>> sizes. It's not like we have performance to throw away.
> >>>
> >>> Or, we should chase down where the call sites are that are doing small
> >>> constant copies with copy_to/from_user() and change them to use
> >>> get/put_user().
> >>>
> >>
> >> Christopher, when you say you gave it a try, is I my series or only the
> >> following ?
> >>
> >> #define unsafe_copy_from_user(d, s, l, e) \
> >> unsafe_op_wrap(__copy_tofrom_user((__force void __user *)d, s, l), e)
> >>
> > 
> > I only used the above to replace this patch in my series (so none of my
> > changes implementing raw_copy_from_user_allowed() are included).
>
> Then I see no reason why the performance would be different, because you
> only call
> unsafe_copy_from_user() with non trivial lengthes.
>

Ok I made a mistake - this actually included the other improvements from
your feedback on this series; specifically moving the unsafe_get_user()
call to read the msr regs value into the #ifdef. My first pass resulted
in another __get_user() call which explains some of the perf loss.

With that mistake fixed, the benchmark performance is still only ~197k
on hash (consistent). I agree that there are no places where
unsafe_copy_from_user() is called with trivial lengths so the only
conclusion I can draw is that the changes in this patch marginally
speed-up the other __copy_from_user() calls. I started comparing the
disassembly but nothing immediately obvious stands out to me. In fact, I
observe the speed-up even if I keep this patch _and_ apply this change:

#define unsafe_copy_from_user(d, s, l, e) \
unsafe_op_wrap(__copy_tofrom_user((__force void __user *)d, s, l), e)

Related to that, I think sigset_t is always 8B on ppc64 so these will
probably need a wrapper if we pick-up your series to get rid of trivial
size optimizations:

__copy_from_user(&set, &uc->uc_sigmask, sizeof(set))

I can bring performance up to ~200K on hash again by replacing the two
__copy_from_user(&set, ...) with direct calls to __get_user_size() (it's
kind of hacky I think). See the last commit in this tree:

https://git.sr.ht/~cmr/linux/log/unsafe-signal64-v4

All my comments apply to performance on radix as well.

> > 
> >>
> >> Because I see no use of unsafe_copy_from_user() that would explain that.
> >>
> >> Christophe


^ permalink raw reply

* Re: [PATCH 5/6] powerpc/rtas: rename RTAS_RMOBUF_MAX to RTAS_USER_REGION_SIZE
From: Alexey Kardashevskiy @ 2021-01-20  5:05 UTC (permalink / raw)
  To: Nathan Lynch, linuxppc-dev; +Cc: tyreld, brking, ajd, aneesh.kumar
In-Reply-To: <87y2gowgo6.fsf@linux.ibm.com>



On 20/01/2021 12:17, Nathan Lynch wrote:
> Alexey Kardashevskiy <aik@ozlabs.ru> writes:
>> On 16/01/2021 02:56, Nathan Lynch wrote:
>>> Alexey Kardashevskiy <aik@ozlabs.ru> writes:
>>>> On 15/01/2021 09:00, Nathan Lynch wrote:
>>>>> diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h
>>>>> index 332e1000ca0f..1aa7ab1cbc84 100644
>>>>> --- a/arch/powerpc/include/asm/rtas.h
>>>>> +++ b/arch/powerpc/include/asm/rtas.h
>>>>> @@ -19,8 +19,11 @@
>>>>>     #define RTAS_UNKNOWN_SERVICE (-1)
>>>>>     #define RTAS_INSTANTIATE_MAX (1ULL<<30) /* Don't instantiate rtas at/above this value */
>>>>>     
>>>>> -/* Buffer size for ppc_rtas system call. */
>>>>> -#define RTAS_RMOBUF_MAX (64 * 1024)
>>>>> +/* Work areas shared with RTAS must be 4K, naturally aligned. */
>>>>
>>>> Why exactly 4K and not (for example) PAGE_SIZE?
>>>
>>> 4K is a platform requirement and isn't related to Linux's configured
>>> page size. See the PAPR specification for RTAS functions such as
>>> ibm,configure-connector, ibm,update-nodes, ibm,update-properties.
>>
>> Good, since we are documenting things here - add to the comment ("per
>> PAPR")?
> 
> But almost every constant in this header relates to a specification or
> requirement in PAPR.


Yup, "almost".

> 
>>> There are other calls with work area parameters where alignment isn't
>>> specified (e.g. ibm,get-system-parameter) but 4KB alignment is a safe
>>> choice for those.
>>>
>>>>> +#define RTAS_WORK_AREA_SIZE   4096
>>>>> +
>>>>> +/* Work areas allocated for user space access. */
>>>>> +#define RTAS_USER_REGION_SIZE (RTAS_WORK_AREA_SIZE * 16)
>>>>
>>>> This is still 64K but no clarity why. There is 16 of something, what
>>>> is it?
>>>
>>> There are 16 4KB work areas in the region. I can name it
>>> RTAS_NR_USER_WORK_AREAS or similar.
>>
>>
>> Why 16? PAPR (then add "per PAPR") or we just like 16 ("should be
>> enough")?
> 
> PAPR doesn't know anything about the user region; it's a Linux
> construct. It's been 64KB since pre-git days and I'm not sure what the
> original reason is. At this point, maintaining a kernel-user ABI seems
> like enough justification for the value.

I am not arguing keeping the numbers but you are replacing one magic 
number with another and for neither it is horribly obvious where they 
came from. Is 16 the max number of concurrently running sys_rtas system 
calls? Does the userspace ensure there is no more than 16? btw where is 
that userspace code? I thought 
https://github.com/power-ras/ppc64-diag.git but no. Thanks,



-- 
Alexey

^ permalink raw reply

* Re: [PATCH 6/6] powerpc/rtas: constrain user region allocation to RMA
From: Alexey Kardashevskiy @ 2021-01-20  4:49 UTC (permalink / raw)
  To: Nathan Lynch, linuxppc-dev; +Cc: tyreld, brking, ajd, aneesh.kumar
In-Reply-To: <871regxwzh.fsf@linux.ibm.com>



On 20/01/2021 11:39, Nathan Lynch wrote:
> Alexey Kardashevskiy <aik@ozlabs.ru> writes:
>> On 16/01/2021 02:38, Nathan Lynch wrote:
>>> Alexey Kardashevskiy <aik@ozlabs.ru> writes:
>>>> On 15/01/2021 09:00, Nathan Lynch wrote:
>>>>> Memory locations passed as arguments from the OS to RTAS usually need
>>>>> to be addressable in 32-bit mode and must reside in the Real Mode
>>>>> Area. On PAPR guests, the RMA starts at logical address 0 and is the
>>>>> first logical memory block reported in the LPAR’s device tree.
>>>>>
>>>>> On powerpc targets with RTAS, Linux makes available to user space a
>>>>> region of memory suitable for arguments to be passed to RTAS via
>>>>> sys_rtas(). This region (rtas_rmo_buf) is allocated via the memblock
>>>>> API during boot in order to ensure that it satisfies the requirements
>>>>> described above.
>>>>>
>>>>> With radix MMU, the upper limit supplied to the memblock allocation
>>>>> can exceed the bounds of the first logical memory block, since
>>>>> ppc64_rma_size is ULONG_MAX and RTAS_INSTANTIATE_MAX is 1GB. (512MB is
>>>>> a common size of the first memory block according to a small sample of
>>>>> LPARs I have checked.) This leads to failures when user space invokes
>>>>> an RTAS function that uses a work area, such as
>>>>> ibm,configure-connector.
>>>>>
>>>>> Alter the determination of the upper limit for rtas_rmo_buf's
>>>>> allocation to consult the device tree directly, ensuring placement
>>>>> within the RMA regardless of the MMU in use.
>>>>
>>>> Can we tie this with RTAS (which also needs to be in RMA) and simply add
>>>> extra 64K in prom_instantiate_rtas() and advertise this address
>>>> (ALIGH_UP(rtas-base + rtas-size, PAGE_SIZE)) to the user space? We do
>>>> not need this RMO area before that point.
>>>
>>> Can you explain more about what advantage that would bring? I'm not
>>> seeing it. It's a more significant change than what I've written
>>> here.
>>
>>
>> We already allocate space for RTAS and (like RMO) it needs to be in RMA,
>> and RMO is useless without RTAS. We can reuse RTAS allocation code for
>> RMO like this:
> 
> When you say RMO I assume you are referring to rtas_rmo_buf? (I don't
> think it is well-named.)
> 
> 
>> ===
>> diff --git a/arch/powerpc/kernel/prom_init.c
>> b/arch/powerpc/kernel/prom_init.c
>> index e9d4eb6144e1..d9527d3e01d2 100644
>> --- a/arch/powerpc/kernel/prom_init.c
>> +++ b/arch/powerpc/kernel/prom_init.c
>> @@ -1821,7 +1821,8 @@ static void __init prom_instantiate_rtas(void)
>>           if (size == 0)
>>                   return;
>>
>> -       base = alloc_down(size, PAGE_SIZE, 0);
>> +       /* One page for RTAS, one for RMO */
> 
> One page for RTAS? RTAS is ~20MB on LPARs I've checked:
> 
> # lsprop /proc/device-tree/rtas/{rtas-size,linux,rtas-base}
> /proc/device-tree/rtas/rtas-size
> 		 01370000 (20381696)

You are right, I did not sleep well when replied, sorry about that :) I 
tried it with KVM where RTAS is just a few KBs (20 constant bytes + MCE 
log, depends on cpu number) so it worked for me.


> 
>> +       base = alloc_down(size, PAGE_SIZE + PAGE_SIZE, 0);
> 
> This changes the alignment but not the size of the allocation.


Should be:

base = alloc_down(ALIGN_UP(size, PAGE_SIZE) + PAGE_SIZE, PAGE_SIZE, 0);

> 
> 
>>           if (base == 0)
>>                   prom_panic("Could not allocate memory for RTAS\n");
>>
>> diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
>> index d126d71ea5bd..885d95cf4ed3 100644
>> --- a/arch/powerpc/kernel/rtas.c
>> +++ b/arch/powerpc/kernel/rtas.c
>> @@ -1186,6 +1186,7 @@ void __init rtas_initialize(void)
>>           rtas.size = size;
>>           no_entry = of_property_read_u32(rtas.dev, "linux,rtas-entry",
>> &entry);
>>           rtas.entry = no_entry ? rtas.base : entry;
>> +       rtas_rmo_buf = rtas.base + PAGE_SIZE;
> 
> I think this would overlay the user region on top of the RTAS private
> data area, allowing user space to corrupt it.


Right, my bad. Should be:

rtas_rmo_buf = ALIGN_UP(rtas.base + rtas.size, PAGE_SIZE);


> 
>>
>>           /* If RTAS was found, allocate the RMO buffer for it and look for
>>            * the stop-self token if any
>> @@ -1196,11 +1197,6 @@ void __init rtas_initialize(void)
>>                   ibm_suspend_me_token = rtas_token("ibm,suspend-me");
>>           }
>>    #endif
>> -       rtas_rmo_buf = memblock_phys_alloc_range(RTAS_RMOBUF_MAX, PAGE_SIZE,
>> -                                                0, rtas_region);
>> -       if (!rtas_rmo_buf)
>> -               panic("ERROR: RTAS: Failed to allocate %lx bytes below
>> %pa\n",
>> -                     PAGE_SIZE, &rtas_region);
>> ===
>>
>> May be store in the FDT as "linux,rmo-base" next to "linux,rtas-base",
>> for clarity, as sharing symbols between prom and main kernel is a bit
>> tricky.
>>
>> The benefit is that we do not do the same thing   (== find 64K in RMA)
>> in 2 different ways and if the RMO allocated my way is broken - we'll
>> know it much sooner as RTAS itself will break too.
> 
> Implementation details aside... I'll grant that combining the
> allocations into one in prom_init reduces some duplication in the sense
> that both are subject to the same constraints (mostly - the RTAS data
> area must not cross a 256MB boundary, while the user region may). But
> they really are distinct concerns. The RTAS private data area is
> specified in the platform architecture, the OS is obligated to allocate
> it and pass it to instantiate-rtas, etc etc. However the user region
> (rtas_rmo_buf) is purely a Linux construct which is there to support
> sys_rtas.

Not purely - it should be an address which RTAS accepts. Cannot argue 
with the rest though, it all sounds correct.

> Now, there are multiple sites in the kernel proper that must allocate
> memory suitable for passing to RTAS. Obviously there is value in
> consolidating the logic for that purpose in one place, so I'll work on
> adding that in v2. OK?

Sure.


-- 
Alexey

^ permalink raw reply

* Re: [PATCH] selftests/powerpc: Fix exit status of pkey tests
From: Michael Ellerman @ 2021-01-20  4:44 UTC (permalink / raw)
  To: mpe, Sandipan Das; +Cc: harish, aneesh.kumar, efuller, linuxppc-dev
In-Reply-To: <20210118093145.10134-1-sandipan@linux.ibm.com>

On Mon, 18 Jan 2021 15:01:45 +0530, Sandipan Das wrote:
> Since main() does not return a value explicitly, the
> return values from FAIL_IF() conditions are ignored
> and the tests can still pass irrespective of failures.
> This makes sure that we always explicitly return the
> correct test exit status.

Applied to powerpc/fixes.

[1/1] selftests/powerpc: Fix exit status of pkey tests
      https://git.kernel.org/powerpc/c/92a5e1fdb286851d5bd0eb966b8d075be27cf5ee

cheers

^ permalink raw reply

* Re: [PATCH] selftests/powerpc: Only test lwm/stmw on big endian
From: Michael Ellerman @ 2021-01-20  4:44 UTC (permalink / raw)
  To: linuxppc-dev, Michael Ellerman; +Cc: msuchanek, lpechacek
In-Reply-To: <20210119041800.3093047-1-mpe@ellerman.id.au>

On Tue, 19 Jan 2021 15:18:00 +1100, Michael Ellerman wrote:
> Newer binutils (>= 2.36) refuse to assemble lmw/stmw when building in
> little endian mode. That breaks compilation of our alignment handler
> test:
> 
>   /tmp/cco4l14N.s: Assembler messages:
>   /tmp/cco4l14N.s:1440: Error: `lmw' invalid when little-endian
>   /tmp/cco4l14N.s:1814: Error: `stmw' invalid when little-endian
>   make[2]: *** [../../lib.mk:139: /output/kselftest/powerpc/alignment/alignment_handler] Error 1
> 
> [...]

Applied to powerpc/fixes.

[1/1] selftests/powerpc: Only test lwm/stmw on big endian
      https://git.kernel.org/powerpc/c/dd3a44c06f7b4f14e90065bf05d62c255b20005f

cheers

^ permalink raw reply

* Re: [PATCH v6 14/39] powerpc/perf: move perf irq/nmi handling details into traps.c
From: Nicholas Piggin @ 2021-01-20  4:21 UTC (permalink / raw)
  To: Athira Rajeev; +Cc: linuxppc-dev
In-Reply-To: <1611108829.0isdl3z9na.astroid@bobo.none>

Excerpts from Nicholas Piggin's message of January 20, 2021 1:09 pm:
> Excerpts from Athira Rajeev's message of January 19, 2021 8:24 pm:
>> 
>> [  883.900762] watchdog: BUG: soft lockup - CPU#0 stuck for 23s! [swapper/0:0]
>> [  883.901381] CPU: 0 PID: 0 Comm: swapper/0 Tainted: G           OE     5.11.0-rc3+ #34
>> --
>> [  883.901999] NIP [c0000000000168d0] replay_soft_interrupts+0x70/0x2f0
>> [  883.902032] LR [c00000000003b2b8] interrupt_exit_kernel_prepare+0x1e8/0x240
>> [  883.902063] Call Trace:
>> [  883.902085] [c000000001c96f50] [c00000000003b2b8] interrupt_exit_kernel_prepare+0x1e8/0x240 (unreliable)
>> [  883.902139] [c000000001c96fb0] [c00000000000fd88] interrupt_return+0x158/0x200
>> [  883.902185] --- interrupt: ea0 at __rb_reserve_next+0xc0/0x5b0
>> [  883.902224] NIP:  c0000000002d8980 LR: c0000000002d897c CTR: c0000000001aad90
>> [  883.902262] REGS: c000000001c97020 TRAP: 0ea0   Tainted: G           OE      (5.11.0-rc3+)
>> [  883.902301] MSR:  9000000000009033 <SF,HV,EE,ME,IR,DR,RI,LE>  CR: 28000484  XER: 20040000
>> [  883.902387] CFAR: c00000000000fe00 IRQMASK: 0 
>> --
>> [  883.902757] NIP [c0000000002d8980] __rb_reserve_next+0xc0/0x5b0
>> [  883.902786] LR [c0000000002d897c] __rb_reserve_next+0xbc/0x5b0
>> [  883.902824] --- interrupt: ea0
>> [  883.902848] [c000000001c97360] [c0000000002d8fcc] ring_buffer_lock_reserve+0x15c/0x580
>> [  883.902894] [c000000001c973f0] [c0000000002e82fc] trace_function+0x4c/0x1c0
>> [  883.902930] [c000000001c97440] [c0000000002f6f50] function_trace_call+0x140/0x190
>> [  883.902976] [c000000001c97470] [c00000000007d6f8] ftrace_call+0x4/0x44
>> [  883.903021] [c000000001c97660] [c000000000dcf70c] __do_softirq+0x15c/0x3d4
>> [  883.903066] [c000000001c97750] [c00000000015fc68] irq_exit+0x198/0x1b0
>> [  883.903102] [c000000001c97780] [c000000000dc1790] timer_interrupt+0x170/0x3b0
>> [  883.903148] [c000000001c977e0] [c000000000016994] replay_soft_interrupts+0x134/0x2f0
>> [  883.903193] [c000000001c979d0] [c00000000003b2b8] interrupt_exit_kernel_prepare+0x1e8/0x240
>> [  883.903240] [c000000001c97a30] [c00000000000fd88] interrupt_return+0x158/0x200
>> [  883.903276] --- interrupt: ea0 at arch_local_irq_restore+0x70/0xc0
> 
> You got a 0xea0 interrupt in the ftrace code. I wonder where it is 
> looping. Do you see more soft lockup messages?

We should probably fix this recursion too. I was vaguely aware of it and 
thought it might have existed with the old interrupt exit and replay 
code as well and was pretty well bounded, but I'm not entirely sure it's
okay. And now that I've thought about it a bit harder, I think there is
actualy a simple way to fix it -

[PATCH] powerpc/64: prevent replayed interrupt handlers from running
 softirqs

Running softirqs enables interrupts, which can then end up recursing
into the irq soft-mask code we're trying to adjust, including replaying
interrupts itself which may not be bounded. This abridged trace shows
how this can occur:

  NIP replay_soft_interrupts
  LR  interrupt_exit_kernel_prepare
  Call Trace:
    interrupt_exit_kernel_prepare (unreliable)
    interrupt_return
  --- interrupt: ea0 at __rb_reserve_next
  NIP __rb_reserve_next
  LR __rb_reserve_next
  Call Trace:
    ring_buffer_lock_reserve
    trace_function
    function_trace_call
    ftrace_call
    __do_softirq
    irq_exit
   timer_interrupt
   replay_soft_interrupts
   interrupt_exit_kernel_prepare
   interrupt_return
  --- interrupt: ea0 at arch_local_irq_restore

Fix this by disabling bhs (softirqs) around the interrupt replay.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/kernel/irq.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 681abb7c0507..bb0d4fc8df89 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -189,6 +189,18 @@ void replay_soft_interrupts(void)
 	unsigned char happened = local_paca->irq_happened;
 	struct pt_regs regs;
 
+	/*
+	 * Prevent softirqs from being run when an interrupt handler returns
+	 * and calls irq_exit(), because softirq processing enables interrupts.
+	 * If an interrupt is taken, it may then call replay_soft_interrupts
+	 * on its way out, which gets messy and recursive.
+	 *
+	 * softirqs created by replayed interrupts will be run at the end of
+	 * this function when bhs are enabled (if they were enabled in our
+	 * caller).
+	 */
+	local_bh_disable();
+
 	ppc_save_regs(&regs);
 	regs.softe = IRQS_ENABLED;
 
@@ -264,6 +276,8 @@ void replay_soft_interrupts(void)
 		trace_hardirqs_off();
 		goto again;
 	}
+
+	local_bh_enable();
 }
 
 notrace void arch_local_irq_restore(unsigned long mask)
-- 
2.23.0


^ permalink raw reply related

* Re: [PATCH v6 14/39] powerpc/perf: move perf irq/nmi handling details into traps.c
From: Nicholas Piggin @ 2021-01-20  3:09 UTC (permalink / raw)
  To: Athira Rajeev; +Cc: linuxppc-dev
In-Reply-To: <AB6E725D-225E-4FBD-B484-4C8FA627D096@linux.vnet.ibm.com>

Excerpts from Athira Rajeev's message of January 19, 2021 8:24 pm:
> 
> 
>> On 15-Jan-2021, at 10:19 PM, Nicholas Piggin <npiggin@gmail.com> wrote:
>> 
>> This is required in order to allow more significant differences between
>> NMI type interrupt handlers and regular asynchronous handlers.
>> 
>> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
>> ---
>> arch/powerpc/kernel/traps.c      | 31 +++++++++++++++++++++++++++-
>> arch/powerpc/perf/core-book3s.c  | 35 ++------------------------------
>> arch/powerpc/perf/core-fsl-emb.c | 25 -----------------------
>> 3 files changed, 32 insertions(+), 59 deletions(-)
> 
> Hi Nick,
> 
> Reviewed this perf patch which moves the nmi_enter/irq_enter to traps.c and code-wise changes
> for perf looks fine to me. Further, I was trying to test this by picking the whole Patch series on top
> of 5.11.0-rc3 kernel and using below test scenario:
> 
> Intention of testcase is to check whether the perf nmi and asynchronous interrupts are getting
> captured as expected. My test kernel module below tries to create one of performance monitor
> counter ( PMC6 ) overflow between local_irq_save/local_irq_restore.
> [ Here interrupts are disabled and has IRQS_DISABLED as regs->softe ].
> I am expecting the PMI to come as an NMI in this case. I am also configuring ftrace so that I
> can confirm whether it comes as an NMI or a replayed interrupt from the trace.
> 
> Environment :One CPU online
> prerequisite for ftrace:
> # cd /sys/kernel/debug/tracing
> # echo 100 > buffer_percent
> # echo 200000 > buffer_size_kb 
> # echo ppc-tb > trace_clock
> # echo function > current_tracer
> 
> Part of sample kernel test module to trigger a PMI between 
> local_irq_save and local_irq_restore:
> 
> <<>>
> static ulong delay = 1;
> static void busy_wait(ulong time)
> {
>         udelay(delay);
> }
> static __always_inline void irq_test(void)
> {
>         unsigned long flags;
>         local_irq_save(flags);
>         trace_printk("IN IRQ TEST\n");
>         mtspr(SPRN_MMCR0, 0x80000000);
>         mtspr(SPRN_PMC6, 0x80000000 - 100);
>         mtspr(SPRN_MMCR0, 0x6004000);
>         busy_wait(delay);
>         trace_printk("IN IRQ TEST DONE\n");
>         local_irq_restore(flags);
>         mtspr(SPRN_MMCR0, 0x80000000);
>         mtspr(SPRN_PMC6, 0);
> }
> <<>>
> 
> But this resulted in soft lockup, Adding a snippet of call-trace below:

I'm not getting problems with your test case, but I am testing in a VM 
so may not be getting device interrupts so much (your 0xea0 interrupt).
I'll try test on bare metal next. Does it reproduce easily, and 
unpatched kernel definitely does not have the problem?

A different issue, after my series, I don't see the perf "NMI" interrupt 
in any traces under local_irq_disable, because it's disabling ftrace the
same as the other NMI interrupts, so your test wouldn't see them.

I don't know if this is exactly right. Can tracing cope with such NMIs
okay even if it's interrupted in the middle of the tracing code? Machine
check at least has to disable tracing because it's in real-mode, machine
check and sreset also want to disable tracing because something is going
wrong and we don't want to make it worse (e.g., to get a cleaner crash).
Should we still permit tracing of perf NMIs?

> 
> [  883.900762] watchdog: BUG: soft lockup - CPU#0 stuck for 23s! [swapper/0:0]
> [  883.901381] CPU: 0 PID: 0 Comm: swapper/0 Tainted: G           OE     5.11.0-rc3+ #34
> --
> [  883.901999] NIP [c0000000000168d0] replay_soft_interrupts+0x70/0x2f0
> [  883.902032] LR [c00000000003b2b8] interrupt_exit_kernel_prepare+0x1e8/0x240
> [  883.902063] Call Trace:
> [  883.902085] [c000000001c96f50] [c00000000003b2b8] interrupt_exit_kernel_prepare+0x1e8/0x240 (unreliable)
> [  883.902139] [c000000001c96fb0] [c00000000000fd88] interrupt_return+0x158/0x200
> [  883.902185] --- interrupt: ea0 at __rb_reserve_next+0xc0/0x5b0
> [  883.902224] NIP:  c0000000002d8980 LR: c0000000002d897c CTR: c0000000001aad90
> [  883.902262] REGS: c000000001c97020 TRAP: 0ea0   Tainted: G           OE      (5.11.0-rc3+)
> [  883.902301] MSR:  9000000000009033 <SF,HV,EE,ME,IR,DR,RI,LE>  CR: 28000484  XER: 20040000
> [  883.902387] CFAR: c00000000000fe00 IRQMASK: 0 
> --
> [  883.902757] NIP [c0000000002d8980] __rb_reserve_next+0xc0/0x5b0
> [  883.902786] LR [c0000000002d897c] __rb_reserve_next+0xbc/0x5b0
> [  883.902824] --- interrupt: ea0
> [  883.902848] [c000000001c97360] [c0000000002d8fcc] ring_buffer_lock_reserve+0x15c/0x580
> [  883.902894] [c000000001c973f0] [c0000000002e82fc] trace_function+0x4c/0x1c0
> [  883.902930] [c000000001c97440] [c0000000002f6f50] function_trace_call+0x140/0x190
> [  883.902976] [c000000001c97470] [c00000000007d6f8] ftrace_call+0x4/0x44
> [  883.903021] [c000000001c97660] [c000000000dcf70c] __do_softirq+0x15c/0x3d4
> [  883.903066] [c000000001c97750] [c00000000015fc68] irq_exit+0x198/0x1b0
> [  883.903102] [c000000001c97780] [c000000000dc1790] timer_interrupt+0x170/0x3b0
> [  883.903148] [c000000001c977e0] [c000000000016994] replay_soft_interrupts+0x134/0x2f0
> [  883.903193] [c000000001c979d0] [c00000000003b2b8] interrupt_exit_kernel_prepare+0x1e8/0x240
> [  883.903240] [c000000001c97a30] [c00000000000fd88] interrupt_return+0x158/0x200
> [  883.903276] --- interrupt: ea0 at arch_local_irq_restore+0x70/0xc0

You got a 0xea0 interrupt in the ftrace code. I wonder where it is 
looping. Do you see more soft lockup messages?

Thanks,
Nick


> 
> Thanks
> Athira
>> 
>> diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
>> index 738370519937..bd55f201115b 100644
>> --- a/arch/powerpc/kernel/traps.c
>> +++ b/arch/powerpc/kernel/traps.c
>> @@ -1892,11 +1892,40 @@ void vsx_unavailable_tm(struct pt_regs *regs)
>> }
>> #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
>> 
>> -void performance_monitor_exception(struct pt_regs *regs)
>> +static void performance_monitor_exception_nmi(struct pt_regs *regs)
>> +{
>> +	nmi_enter();
>> +
>> +	__this_cpu_inc(irq_stat.pmu_irqs);
>> +
>> +	perf_irq(regs);
>> +
>> +	nmi_exit();
>> +}
>> +
>> +static void performance_monitor_exception_async(struct pt_regs *regs)
>> {
>> +	irq_enter();
>> +
>> 	__this_cpu_inc(irq_stat.pmu_irqs);
>> 
>> 	perf_irq(regs);
>> +
>> +	irq_exit();
>> +}
>> +
>> +void performance_monitor_exception(struct pt_regs *regs)
>> +{
>> +	/*
>> +	 * On 64-bit, if perf interrupts hit in a local_irq_disable
>> +	 * (soft-masked) region, we consider them as NMIs. This is required to
>> +	 * prevent hash faults on user addresses when reading callchains (and
>> +	 * looks better from an irq tracing perspective).
>> +	 */
>> +	if (IS_ENABLED(CONFIG_PPC64) && unlikely(arch_irq_disabled_regs(regs)))
>> +		performance_monitor_exception_nmi(regs);
>> +	else
>> +		performance_monitor_exception_async(regs);
>> }
>> 
>> #ifdef CONFIG_PPC_ADV_DEBUG_REGS
>> diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
>> index 28206b1fe172..9fd06010e8b6 100644
>> --- a/arch/powerpc/perf/core-book3s.c
>> +++ b/arch/powerpc/perf/core-book3s.c
>> @@ -110,10 +110,6 @@ static inline void perf_read_regs(struct pt_regs *regs)
>> {
>> 	regs->result = 0;
>> }
>> -static inline int perf_intr_is_nmi(struct pt_regs *regs)
>> -{
>> -	return 0;
>> -}
>> 
>> static inline int siar_valid(struct pt_regs *regs)
>> {
>> @@ -353,15 +349,6 @@ static inline void perf_read_regs(struct pt_regs *regs)
>> 	regs->result = use_siar;
>> }
>> 
>> -/*
>> - * If interrupts were soft-disabled when a PMU interrupt occurs, treat
>> - * it as an NMI.
>> - */
>> -static inline int perf_intr_is_nmi(struct pt_regs *regs)
>> -{
>> -	return (regs->softe & IRQS_DISABLED);
>> -}
>> -
>> /*
>>  * On processors like P7+ that have the SIAR-Valid bit, marked instructions
>>  * must be sampled only if the SIAR-valid bit is set.
>> @@ -2279,7 +2266,6 @@ static void __perf_event_interrupt(struct pt_regs *regs)
>> 	struct perf_event *event;
>> 	unsigned long val[8];
>> 	int found, active;
>> -	int nmi;
>> 
>> 	if (cpuhw->n_limited)
>> 		freeze_limited_counters(cpuhw, mfspr(SPRN_PMC5),
>> @@ -2287,18 +2273,6 @@ static void __perf_event_interrupt(struct pt_regs *regs)
>> 
>> 	perf_read_regs(regs);
>> 
>> -	/*
>> -	 * If perf interrupts hit in a local_irq_disable (soft-masked) region,
>> -	 * we consider them as NMIs. This is required to prevent hash faults on
>> -	 * user addresses when reading callchains. See the NMI test in
>> -	 * do_hash_page.
>> -	 */
>> -	nmi = perf_intr_is_nmi(regs);
>> -	if (nmi)
>> -		nmi_enter();
>> -	else
>> -		irq_enter();
>> -
>> 	/* Read all the PMCs since we'll need them a bunch of times */
>> 	for (i = 0; i < ppmu->n_counter; ++i)
>> 		val[i] = read_pmc(i + 1);
>> @@ -2344,8 +2318,8 @@ static void __perf_event_interrupt(struct pt_regs *regs)
>> 			}
>> 		}
>> 	}
>> -	if (!found && !nmi && printk_ratelimit())
>> -		printk(KERN_WARNING "Can't find PMC that caused IRQ\n");
>> +	if (unlikely(!found) && !arch_irq_disabled_regs(regs))
>> +		printk_ratelimited(KERN_WARNING "Can't find PMC that caused IRQ\n");
>> 
>> 	/*
>> 	 * Reset MMCR0 to its normal value.  This will set PMXE and
>> @@ -2355,11 +2329,6 @@ static void __perf_event_interrupt(struct pt_regs *regs)
>> 	 * we get back out of this interrupt.
>> 	 */
>> 	write_mmcr0(cpuhw, cpuhw->mmcr.mmcr0);
>> -
>> -	if (nmi)
>> -		nmi_exit();
>> -	else
>> -		irq_exit();
>> }
>> 
>> static void perf_event_interrupt(struct pt_regs *regs)
>> diff --git a/arch/powerpc/perf/core-fsl-emb.c b/arch/powerpc/perf/core-fsl-emb.c
>> index e0e7e276bfd2..ee721f420a7b 100644
>> --- a/arch/powerpc/perf/core-fsl-emb.c
>> +++ b/arch/powerpc/perf/core-fsl-emb.c
>> @@ -31,19 +31,6 @@ static atomic_t num_events;
>> /* Used to avoid races in calling reserve/release_pmc_hardware */
>> static DEFINE_MUTEX(pmc_reserve_mutex);
>> 
>> -/*
>> - * If interrupts were soft-disabled when a PMU interrupt occurs, treat
>> - * it as an NMI.
>> - */
>> -static inline int perf_intr_is_nmi(struct pt_regs *regs)
>> -{
>> -#ifdef __powerpc64__
>> -	return (regs->softe & IRQS_DISABLED);
>> -#else
>> -	return 0;
>> -#endif
>> -}
>> -
>> static void perf_event_interrupt(struct pt_regs *regs);
>> 
>> /*
>> @@ -659,13 +646,6 @@ static void perf_event_interrupt(struct pt_regs *regs)
>> 	struct perf_event *event;
>> 	unsigned long val;
>> 	int found = 0;
>> -	int nmi;
>> -
>> -	nmi = perf_intr_is_nmi(regs);
>> -	if (nmi)
>> -		nmi_enter();
>> -	else
>> -		irq_enter();
>> 
>> 	for (i = 0; i < ppmu->n_counter; ++i) {
>> 		event = cpuhw->event[i];
>> @@ -690,11 +670,6 @@ static void perf_event_interrupt(struct pt_regs *regs)
>> 	mtmsr(mfmsr() | MSR_PMM);
>> 	mtpmr(PMRN_PMGC0, PMGC0_PMIE | PMGC0_FCECE);
>> 	isync();
>> -
>> -	if (nmi)
>> -		nmi_exit();
>> -	else
>> -		irq_exit();
>> }
>> 
>> void hw_perf_event_setup(int cpu)
>> -- 
>> 2.23.0
>> 
> 
> 

^ permalink raw reply

* Re: [PATCH 1/2] dt-bindings: powerpc: Add a schema for the 'sleep' property
From: Rob Herring @ 2021-01-20  1:44 UTC (permalink / raw)
  To: Johan Jonker
  Cc: devicetree, Heiko Stuebner, linux-kernel,
	open list:ARM/Rockchip SoC..., Paul Mackerras, linuxppc-dev
In-Reply-To: <752e9355-defb-6d3c-248b-f626247d4cee@gmail.com>

On Sun, Jan 17, 2021 at 05:10:03PM +0100, Johan Jonker wrote:
> Hi Rob,
> 
> This patch generates notifications in the Rockchip ARM and arm64 tree.
> Could you limit the scope to PowerPC only.
> 
> Kind regards,
> 
> Johan Jonker
> 
> make ARCH=arm dtbs_check
> DT_SCHEMA_FILES=Documentation/devicetree/bindings/powerpc/sleep.yaml
> 
> make ARCH=arm64 dtbs_check
> DT_SCHEMA_FILES=Documentation/devicetree/bindings/powerpc/sleep.yaml
> 
> Example:
> 
> /arch/arm64/boot/dts/rockchip/rk3399pro-rock-pi-n10.dt.yaml: pinctrl:
> sleep: {'ddrio-pwroff': {'rockchip,pins': [[0, 1, 1, 168]]},
> 'ap-pwroff': {'rockchip,pins': [[1, 5, 1, 168]]}} is not of type 'array'
> 	From schema: /Documentation/devicetree/bindings/powerpc/sleep.yaml

IMO, the node name should be changed or just removed. The grouping 
doesn't serve any purpose and changing wouldn't break the ABI.

Rob

^ permalink raw reply

* Re: [PATCH 5/6] powerpc/rtas: rename RTAS_RMOBUF_MAX to RTAS_USER_REGION_SIZE
From: Nathan Lynch @ 2021-01-20  1:17 UTC (permalink / raw)
  To: Alexey Kardashevskiy, linuxppc-dev; +Cc: tyreld, brking, ajd, aneesh.kumar
In-Reply-To: <6905c3d2-e524-b6d8-036f-7812ea3f8b85@ozlabs.ru>

Alexey Kardashevskiy <aik@ozlabs.ru> writes:
> On 16/01/2021 02:56, Nathan Lynch wrote:
>> Alexey Kardashevskiy <aik@ozlabs.ru> writes:
>>> On 15/01/2021 09:00, Nathan Lynch wrote:
>>>> diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h
>>>> index 332e1000ca0f..1aa7ab1cbc84 100644
>>>> --- a/arch/powerpc/include/asm/rtas.h
>>>> +++ b/arch/powerpc/include/asm/rtas.h
>>>> @@ -19,8 +19,11 @@
>>>>    #define RTAS_UNKNOWN_SERVICE (-1)
>>>>    #define RTAS_INSTANTIATE_MAX (1ULL<<30) /* Don't instantiate rtas at/above this value */
>>>>    
>>>> -/* Buffer size for ppc_rtas system call. */
>>>> -#define RTAS_RMOBUF_MAX (64 * 1024)
>>>> +/* Work areas shared with RTAS must be 4K, naturally aligned. */
>>>
>>> Why exactly 4K and not (for example) PAGE_SIZE?
>> 
>> 4K is a platform requirement and isn't related to Linux's configured
>> page size. See the PAPR specification for RTAS functions such as
>> ibm,configure-connector, ibm,update-nodes, ibm,update-properties.
>
> Good, since we are documenting things here - add to the comment ("per 
> PAPR")?

But almost every constant in this header relates to a specification or
requirement in PAPR.


>> There are other calls with work area parameters where alignment isn't
>> specified (e.g. ibm,get-system-parameter) but 4KB alignment is a safe
>> choice for those.
>> 
>>>> +#define RTAS_WORK_AREA_SIZE   4096
>>>> +
>>>> +/* Work areas allocated for user space access. */
>>>> +#define RTAS_USER_REGION_SIZE (RTAS_WORK_AREA_SIZE * 16)
>>>
>>> This is still 64K but no clarity why. There is 16 of something, what
>>> is it?
>> 
>> There are 16 4KB work areas in the region. I can name it
>> RTAS_NR_USER_WORK_AREAS or similar.
>
>
> Why 16? PAPR (then add "per PAPR") or we just like 16 ("should be
> enough")?

PAPR doesn't know anything about the user region; it's a Linux
construct. It's been 64KB since pre-git days and I'm not sure what the
original reason is. At this point, maintaining a kernel-user ABI seems
like enough justification for the value.

^ permalink raw reply

* Re: [PATCH] selftests/powerpc: Only test lwm/stmw on big endian
From: Segher Boessenkool @ 2021-01-20  1:12 UTC (permalink / raw)
  To: Michael Ellerman; +Cc: linuxppc-dev, msuchanek, lpechacek
In-Reply-To: <20210119041800.3093047-1-mpe@ellerman.id.au>

Hi!

On Tue, Jan 19, 2021 at 03:18:00PM +1100, Michael Ellerman wrote:
> Newer binutils (>= 2.36) refuse to assemble lmw/stmw when building in
> little endian mode. That breaks compilation of our alignment handler
> test:
> 
>   /tmp/cco4l14N.s: Assembler messages:
>   /tmp/cco4l14N.s:1440: Error: `lmw' invalid when little-endian
>   /tmp/cco4l14N.s:1814: Error: `stmw' invalid when little-endian
>   make[2]: *** [../../lib.mk:139: /output/kselftest/powerpc/alignment/alignment_handler] Error 1
> 
> These tests do pass on little endian machines, as the kernel will
> still emulate those instructions even when running little
> endian (which is arguably a kernel bug).

The opposite: in older ISAs it is *required* to.  On all very old ISA
versions, and when not on the Server Environment on everything before
ISA 2.07.

Many older implementations did an alignment interrupt, but that was an
implementation detail (they could still be compliant with proper system
software support, e.g. kernel emulation handlers).  Nowadays that
interrupt is required, so you can still support it like that.

(The patch is fine of course.)


Segher

^ permalink raw reply

* Re: [PATCH 6/6] powerpc/rtas: constrain user region allocation to RMA
From: Nathan Lynch @ 2021-01-20  0:39 UTC (permalink / raw)
  To: Alexey Kardashevskiy, linuxppc-dev; +Cc: tyreld, brking, ajd, aneesh.kumar
In-Reply-To: <3c5141d5-ee78-3771-3410-37635d423945@ozlabs.ru>

Alexey Kardashevskiy <aik@ozlabs.ru> writes:
> On 16/01/2021 02:38, Nathan Lynch wrote:
>> Alexey Kardashevskiy <aik@ozlabs.ru> writes:
>>> On 15/01/2021 09:00, Nathan Lynch wrote:
>>>> Memory locations passed as arguments from the OS to RTAS usually need
>>>> to be addressable in 32-bit mode and must reside in the Real Mode
>>>> Area. On PAPR guests, the RMA starts at logical address 0 and is the
>>>> first logical memory block reported in the LPAR’s device tree.
>>>>
>>>> On powerpc targets with RTAS, Linux makes available to user space a
>>>> region of memory suitable for arguments to be passed to RTAS via
>>>> sys_rtas(). This region (rtas_rmo_buf) is allocated via the memblock
>>>> API during boot in order to ensure that it satisfies the requirements
>>>> described above.
>>>>
>>>> With radix MMU, the upper limit supplied to the memblock allocation
>>>> can exceed the bounds of the first logical memory block, since
>>>> ppc64_rma_size is ULONG_MAX and RTAS_INSTANTIATE_MAX is 1GB. (512MB is
>>>> a common size of the first memory block according to a small sample of
>>>> LPARs I have checked.) This leads to failures when user space invokes
>>>> an RTAS function that uses a work area, such as
>>>> ibm,configure-connector.
>>>>
>>>> Alter the determination of the upper limit for rtas_rmo_buf's
>>>> allocation to consult the device tree directly, ensuring placement
>>>> within the RMA regardless of the MMU in use.
>>>
>>> Can we tie this with RTAS (which also needs to be in RMA) and simply add
>>> extra 64K in prom_instantiate_rtas() and advertise this address
>>> (ALIGH_UP(rtas-base + rtas-size, PAGE_SIZE)) to the user space? We do
>>> not need this RMO area before that point.
>> 
>> Can you explain more about what advantage that would bring? I'm not
>> seeing it. It's a more significant change than what I've written
>> here.
>
>
> We already allocate space for RTAS and (like RMO) it needs to be in RMA, 
> and RMO is useless without RTAS. We can reuse RTAS allocation code for 
> RMO like this:

When you say RMO I assume you are referring to rtas_rmo_buf? (I don't
think it is well-named.)


> ===
> diff --git a/arch/powerpc/kernel/prom_init.c 
> b/arch/powerpc/kernel/prom_init.c
> index e9d4eb6144e1..d9527d3e01d2 100644
> --- a/arch/powerpc/kernel/prom_init.c
> +++ b/arch/powerpc/kernel/prom_init.c
> @@ -1821,7 +1821,8 @@ static void __init prom_instantiate_rtas(void)
>          if (size == 0)
>                  return;
>
> -       base = alloc_down(size, PAGE_SIZE, 0);
> +       /* One page for RTAS, one for RMO */

One page for RTAS? RTAS is ~20MB on LPARs I've checked:

# lsprop /proc/device-tree/rtas/{rtas-size,linux,rtas-base}
/proc/device-tree/rtas/rtas-size
		 01370000 (20381696)

> +       base = alloc_down(size, PAGE_SIZE + PAGE_SIZE, 0);

This changes the alignment but not the size of the allocation.


>          if (base == 0)
>                  prom_panic("Could not allocate memory for RTAS\n");
>
> diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
> index d126d71ea5bd..885d95cf4ed3 100644
> --- a/arch/powerpc/kernel/rtas.c
> +++ b/arch/powerpc/kernel/rtas.c
> @@ -1186,6 +1186,7 @@ void __init rtas_initialize(void)
>          rtas.size = size;
>          no_entry = of_property_read_u32(rtas.dev, "linux,rtas-entry", 
> &entry);
>          rtas.entry = no_entry ? rtas.base : entry;
> +       rtas_rmo_buf = rtas.base + PAGE_SIZE;

I think this would overlay the user region on top of the RTAS private
data area, allowing user space to corrupt it.


>
>          /* If RTAS was found, allocate the RMO buffer for it and look for
>           * the stop-self token if any
> @@ -1196,11 +1197,6 @@ void __init rtas_initialize(void)
>                  ibm_suspend_me_token = rtas_token("ibm,suspend-me");
>          }
>   #endif
> -       rtas_rmo_buf = memblock_phys_alloc_range(RTAS_RMOBUF_MAX, PAGE_SIZE,
> -                                                0, rtas_region);
> -       if (!rtas_rmo_buf)
> -               panic("ERROR: RTAS: Failed to allocate %lx bytes below 
> %pa\n",
> -                     PAGE_SIZE, &rtas_region);
> ===
>
> May be store in the FDT as "linux,rmo-base" next to "linux,rtas-base", 
> for clarity, as sharing symbols between prom and main kernel is a bit 
> tricky.
>
> The benefit is that we do not do the same thing   (== find 64K in RMA) 
> in 2 different ways and if the RMO allocated my way is broken - we'll 
> know it much sooner as RTAS itself will break too.

Implementation details aside... I'll grant that combining the
allocations into one in prom_init reduces some duplication in the sense
that both are subject to the same constraints (mostly - the RTAS data
area must not cross a 256MB boundary, while the user region may). But
they really are distinct concerns. The RTAS private data area is
specified in the platform architecture, the OS is obligated to allocate
it and pass it to instantiate-rtas, etc etc. However the user region
(rtas_rmo_buf) is purely a Linux construct which is there to support
sys_rtas.

Now, there are multiple sites in the kernel proper that must allocate
memory suitable for passing to RTAS. Obviously there is value in
consolidating the logic for that purpose in one place, so I'll work on
adding that in v2. OK?

^ permalink raw reply

* Re: [PATCH 1/4] KVM: PPC: Book3S HV: Remove support for running HPT guest on RPT host without mixed mode support
From: Nicholas Piggin @ 2021-01-20  0:05 UTC (permalink / raw)
  To: Fabiano Rosas, kvm-ppc; +Cc: linuxppc-dev
In-Reply-To: <87a6t4bpp2.fsf@linux.ibm.com>

Excerpts from Fabiano Rosas's message of January 20, 2021 7:07 am:
> Nicholas Piggin <npiggin@gmail.com> writes:
> 
>> Excerpts from Fabiano Rosas's message of January 19, 2021 11:46 am:
>>> Resending because the previous got spam-filtered:
>>> 
>>> Nicholas Piggin <npiggin@gmail.com> writes:
>>> 
>>>> This reverts much of commit c01015091a770 ("KVM: PPC: Book3S HV: Run HPT
>>>> guests on POWER9 radix hosts"), which was required to run HPT guests on
>>>> RPT hosts on early POWER9 CPUs without support for "mixed mode", which
>>>> meant the host could not run with MMU on while guests were running.
>>>>
>>>> This code has some corner case bugs, e.g., when the guest hits a machine
>>>> check or HMI the primary locks up waiting for secondaries to switch LPCR
>>>> to host, which they never do. This could all be fixed in software, but
>>>> most CPUs in production have mixed mode support, and those that don't
>>>> are believed to be all in installations that don't use this capability.
>>>> So simplify things and remove support.
>>> 
>>> With this patch in a DD2.1 machine + indep_threads_mode=N +
>>> disable_radix, QEMU aborts and dumps registers, is that intended?
>>
>> Yes. That configuration is hanging handling MCEs in the guest with some 
>> threads waiting forever to synchronize. Paul suggested it was never a
>> supported configuration so we might just remove it.
>>
> 
> OK, so:
> 
> Tested-by: Fabiano Rosas <farosas@linux.ibm.com>
> 
>>> Could we use the 'no_mixing_hpt_and_radix' logic in check_extension to
>>> advertise only KVM_CAP_PPC_MMU_RADIX to the guest via OV5 so it doesn't
>>> try to run hash?
>>> 
>>> For instance, if I hack QEMU's 'spapr_dt_ov5_platform_support' from
>>> OV5_MMU_BOTH to OV5_MMU_RADIX_300 then it boots succesfuly, but the
>>> guest turns into radix, due to this code in prom_init:
>>> 
>>> prom_parse_mmu_model:
>>> 
>>> case OV5_FEAT(OV5_MMU_RADIX): /* Only Radix */
>>> 	prom_debug("MMU - radix only\n");
>>> 	if (prom_radix_disable) {
>>> 		/*
>>> 		 * If we __have__ to do radix, we're better off ignoring
>>> 		 * the command line rather than not booting.
>>> 		 */
>>> 		prom_printf("WARNING: Ignoring cmdline option disable_radix\n");
>>> 	}
>>> 	support->radix_mmu = true;
>>> 	break;
>>> 
>>> It seems we could explicitly say that the host does not support hash and
>>> that would align with the above code.
>>
>> I'm not sure, sounds like you could, on the other hand these aborts seem 
>> like the prefered failure mode for these kinds of configuration issues, 
>> I don't know what the policy is, is reverting back to radix acceptable?
>>
> 
> Yeah, I couldn't find documentation about why we're reverting back to
> radix. I personally dislike it, but there is already a precedent so I'm
> not sure. A radix guest on a hash host does the same transparent
> conversion AFAICT.
> 
> But despite that, this patch removes support for hash MMU in this
> particular scenario. I don't see why continuing to tell the guest we
> support hash.
> 
> Anyway, here's a patch if you decide to go that way (tested w/ DD2.1 &
> 2.3 machines):

Thanks, I don't mind it, have to see if the maintainer will take it :)

You could add a small changelog / SOB and I could putit after my patch?

> 
> ---
> diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
> index 0a056c64c317b..53743555676d6 100644
> --- a/arch/powerpc/include/asm/kvm_ppc.h
> +++ b/arch/powerpc/include/asm/kvm_ppc.h
> @@ -314,6 +314,7 @@ struct kvmppc_ops {
>  			      int size);
>  	int (*enable_svm)(struct kvm *kvm);
>  	int (*svm_off)(struct kvm *kvm);
> +	bool (*hash_possible)(void);
>  };
>  
>  extern struct kvmppc_ops *kvmppc_hv_ops;
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index 6f612d240392f..2d1e8aba22b85 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -5599,6 +5599,15 @@ static int kvmhv_svm_off(struct kvm *kvm)
>  	return ret;
>  }
>  
> +static bool kvmppc_hash_possible(void)
> +{
> +	if (radix_enabled() && no_mixing_hpt_and_radix)
> +		return false;
> +
> +	return cpu_has_feature(CPU_FTR_ARCH_300) &&
> +		cpu_has_feature(CPU_FTR_HVMODE);
> +}

Just be careful, it's hash_v3 specifically. Either make this return true 
for arch < 300 add the ARCH_300 check in the ioctl, or rename to include
v3.

> +
>  static struct kvmppc_ops kvm_ops_hv = {
>  	.get_sregs = kvm_arch_vcpu_ioctl_get_sregs_hv,
>  	.set_sregs = kvm_arch_vcpu_ioctl_set_sregs_hv,
> @@ -5642,6 +5651,7 @@ static struct kvmppc_ops kvm_ops_hv = {
>  	.store_to_eaddr = kvmhv_store_to_eaddr,
>  	.enable_svm = kvmhv_enable_svm,
>  	.svm_off = kvmhv_svm_off,
> +	.hash_possible = kvmppc_hash_possible,
>  };
>  

How about adding an op which can check extensions? It could return false
if it wasn't checked and so default to the generic checks in 
kvm_vm_ioctl_check_extension, and take a pointer to 'r' to set.

>  static int kvm_init_subcore_bitmap(void)
> diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
> index cf52d26f49cd7..99ced6c570e74 100644
> --- a/arch/powerpc/kvm/powerpc.c
> +++ b/arch/powerpc/kvm/powerpc.c
> @@ -611,8 +611,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
>  		r = !!(hv_enabled && radix_enabled());
>  		break;
>  	case KVM_CAP_PPC_MMU_HASH_V3:
> -		r = !!(hv_enabled && cpu_has_feature(CPU_FTR_ARCH_300) &&
> -		       cpu_has_feature(CPU_FTR_HVMODE));
> +		r = !!(hv_enabled && kvmppc_hv_ops->hash_possible());
>  		break;
>  	case KVM_CAP_PPC_NESTED_HV:
>  		r = !!(hv_enabled && kvmppc_hv_ops->enable_nested &&

Thanks,
Nick

^ permalink raw reply

* Re: [PATCH 1/4] KVM: PPC: Book3S HV: Remove support for running HPT guest on RPT host without mixed mode support
From: Fabiano Rosas @ 2021-01-19 21:07 UTC (permalink / raw)
  To: Nicholas Piggin, kvm-ppc; +Cc: linuxppc-dev
In-Reply-To: <1611025782.s66bkxjtqz.astroid@bobo.none>

Nicholas Piggin <npiggin@gmail.com> writes:

> Excerpts from Fabiano Rosas's message of January 19, 2021 11:46 am:
>> Resending because the previous got spam-filtered:
>> 
>> Nicholas Piggin <npiggin@gmail.com> writes:
>> 
>>> This reverts much of commit c01015091a770 ("KVM: PPC: Book3S HV: Run HPT
>>> guests on POWER9 radix hosts"), which was required to run HPT guests on
>>> RPT hosts on early POWER9 CPUs without support for "mixed mode", which
>>> meant the host could not run with MMU on while guests were running.
>>>
>>> This code has some corner case bugs, e.g., when the guest hits a machine
>>> check or HMI the primary locks up waiting for secondaries to switch LPCR
>>> to host, which they never do. This could all be fixed in software, but
>>> most CPUs in production have mixed mode support, and those that don't
>>> are believed to be all in installations that don't use this capability.
>>> So simplify things and remove support.
>> 
>> With this patch in a DD2.1 machine + indep_threads_mode=N +
>> disable_radix, QEMU aborts and dumps registers, is that intended?
>
> Yes. That configuration is hanging handling MCEs in the guest with some 
> threads waiting forever to synchronize. Paul suggested it was never a
> supported configuration so we might just remove it.
>

OK, so:

Tested-by: Fabiano Rosas <farosas@linux.ibm.com>

>> Could we use the 'no_mixing_hpt_and_radix' logic in check_extension to
>> advertise only KVM_CAP_PPC_MMU_RADIX to the guest via OV5 so it doesn't
>> try to run hash?
>> 
>> For instance, if I hack QEMU's 'spapr_dt_ov5_platform_support' from
>> OV5_MMU_BOTH to OV5_MMU_RADIX_300 then it boots succesfuly, but the
>> guest turns into radix, due to this code in prom_init:
>> 
>> prom_parse_mmu_model:
>> 
>> case OV5_FEAT(OV5_MMU_RADIX): /* Only Radix */
>> 	prom_debug("MMU - radix only\n");
>> 	if (prom_radix_disable) {
>> 		/*
>> 		 * If we __have__ to do radix, we're better off ignoring
>> 		 * the command line rather than not booting.
>> 		 */
>> 		prom_printf("WARNING: Ignoring cmdline option disable_radix\n");
>> 	}
>> 	support->radix_mmu = true;
>> 	break;
>> 
>> It seems we could explicitly say that the host does not support hash and
>> that would align with the above code.
>
> I'm not sure, sounds like you could, on the other hand these aborts seem 
> like the prefered failure mode for these kinds of configuration issues, 
> I don't know what the policy is, is reverting back to radix acceptable?
>

Yeah, I couldn't find documentation about why we're reverting back to
radix. I personally dislike it, but there is already a precedent so I'm
not sure. A radix guest on a hash host does the same transparent
conversion AFAICT.

But despite that, this patch removes support for hash MMU in this
particular scenario. I don't see why continuing to tell the guest we
support hash.

Anyway, here's a patch if you decide to go that way (tested w/ DD2.1 &
2.3 machines):

---
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 0a056c64c317b..53743555676d6 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -314,6 +314,7 @@ struct kvmppc_ops {
 			      int size);
 	int (*enable_svm)(struct kvm *kvm);
 	int (*svm_off)(struct kvm *kvm);
+	bool (*hash_possible)(void);
 };
 
 extern struct kvmppc_ops *kvmppc_hv_ops;
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 6f612d240392f..2d1e8aba22b85 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -5599,6 +5599,15 @@ static int kvmhv_svm_off(struct kvm *kvm)
 	return ret;
 }
 
+static bool kvmppc_hash_possible(void)
+{
+	if (radix_enabled() && no_mixing_hpt_and_radix)
+		return false;
+
+	return cpu_has_feature(CPU_FTR_ARCH_300) &&
+		cpu_has_feature(CPU_FTR_HVMODE);
+}
+
 static struct kvmppc_ops kvm_ops_hv = {
 	.get_sregs = kvm_arch_vcpu_ioctl_get_sregs_hv,
 	.set_sregs = kvm_arch_vcpu_ioctl_set_sregs_hv,
@@ -5642,6 +5651,7 @@ static struct kvmppc_ops kvm_ops_hv = {
 	.store_to_eaddr = kvmhv_store_to_eaddr,
 	.enable_svm = kvmhv_enable_svm,
 	.svm_off = kvmhv_svm_off,
+	.hash_possible = kvmppc_hash_possible,
 };
 
 static int kvm_init_subcore_bitmap(void)
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index cf52d26f49cd7..99ced6c570e74 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -611,8 +611,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 		r = !!(hv_enabled && radix_enabled());
 		break;
 	case KVM_CAP_PPC_MMU_HASH_V3:
-		r = !!(hv_enabled && cpu_has_feature(CPU_FTR_ARCH_300) &&
-		       cpu_has_feature(CPU_FTR_HVMODE));
+		r = !!(hv_enabled && kvmppc_hv_ops->hash_possible());
 		break;
 	case KVM_CAP_PPC_NESTED_HV:
 		r = !!(hv_enabled && kvmppc_hv_ops->enable_nested &&
---

https://github.com/farosas/linux/commit/0ae6c65ba9592c23215899c473acf392bd6e36d5.patch


> Thanks,
> Nick

^ permalink raw reply related

* Re: [PATCH 6/6] powerpc/rtas: constrain user region allocation to RMA
From: Nathan Lynch @ 2021-01-19 21:00 UTC (permalink / raw)
  To: Michael Ellerman, linuxppc-dev; +Cc: aik, tyreld, brking, ajd, aneesh.kumar
In-Reply-To: <87mtx5qp1g.fsf@mpe.ellerman.id.au>

Michael Ellerman <mpe@ellerman.id.au> writes:
> Nathan Lynch <nathanl@linux.ibm.com> writes:
>> Memory locations passed as arguments from the OS to RTAS usually need
>> to be addressable in 32-bit mode and must reside in the Real Mode
>> Area. On PAPR guests, the RMA starts at logical address 0 and is the
>> first logical memory block reported in the LPAR’s device tree.
>>
>> On powerpc targets with RTAS, Linux makes available to user space a
>> region of memory suitable for arguments to be passed to RTAS via
>> sys_rtas(). This region (rtas_rmo_buf) is allocated via the memblock
>> API during boot in order to ensure that it satisfies the requirements
>> described above.
>>
>> With radix MMU, the upper limit supplied to the memblock allocation
>> can exceed the bounds of the first logical memory block, since
>> ppc64_rma_size is ULONG_MAX and RTAS_INSTANTIATE_MAX is 1GB.
>
> Why does the size of the first memory block matter for radix?

Here is my understanding: in the platform architecture, the size of the
first memory block equals the RMA, regardless of the MMU mode. It just
so happens that when using radix, Linux can pass ibm,configure-connector
a work area address outside of the RMA because the allocation
constraints for the work area are computed differently. It would be
wrong of the OS to pass RTAS arguments outside of this region with hash
MMU as well.

> The 1GB limit is sufficient to make it accessible by 32-bit code.

But the requirement is that memory arguments passed to RTAS reside in
the RMA, which may be less than 1GB.


>> (512MB is a common size of the first memory block according to a small sample of
>> LPARs I have checked.)
>
> That's the minimum we request, see prom_init.c:
>
> 	/* option vector 2: Open Firmware options supported */
> 	.vec2 = {
> 		.byte1 = OV2_REAL_MODE,
> 		.reserved = 0,
> 		.real_base = cpu_to_be32(0xffffffff),
> 		.real_size = cpu_to_be32(0xffffffff),
> 		.virt_base = cpu_to_be32(0xffffffff),
> 		.virt_size = cpu_to_be32(0xffffffff),
> 		.load_base = cpu_to_be32(0xffffffff),
> 		.min_rma = cpu_to_be32(512),		/* 512MB min RMA */
>
> Since v4.12 in 2017.

Thanks for the pointer, I had been trying to find this without success.

^ permalink raw reply

* Re: [PATCH] selftests/powerpc: Only test lwm/stmw on big endian
From: Libor Pechacek @ 2021-01-19 20:01 UTC (permalink / raw)
  To: Michael Ellerman; +Cc: linuxppc-dev, msuchanek
In-Reply-To: <20210119041800.3093047-1-mpe@ellerman.id.au>

On Út 19-01-21 15:18:00, Michael Ellerman wrote:
> Newer binutils (>= 2.36) refuse to assemble lmw/stmw when building in
> little endian mode. That breaks compilation of our alignment handler
> test:
> 
>   /tmp/cco4l14N.s: Assembler messages:
>   /tmp/cco4l14N.s:1440: Error: `lmw' invalid when little-endian
>   /tmp/cco4l14N.s:1814: Error: `stmw' invalid when little-endian
>   make[2]: *** [../../lib.mk:139: /output/kselftest/powerpc/alignment/alignment_handler] Error 1
> 
> These tests do pass on little endian machines, as the kernel will
> still emulate those instructions even when running little
> endian (which is arguably a kernel bug).
> 
> But we don't really need to test that case, so ifdef those
> instructions out to get the alignment test building again.
> 
> Reported-by: Libor Pechacek <lpechacek@suse.com>
> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

Tested-by: Libor Pechacek <lpechacek@suse.com>

Thanks, Michael, for the fix!

Libor

> ---
>  .../testing/selftests/powerpc/alignment/alignment_handler.c  | 5 ++++-
>  1 file changed, 4 insertions(+), 1 deletion(-)
> 
> diff --git a/tools/testing/selftests/powerpc/alignment/alignment_handler.c b/tools/testing/selftests/powerpc/alignment/alignment_handler.c
> index cb53a8b777e6..c25cf7cd45e9 100644
> --- a/tools/testing/selftests/powerpc/alignment/alignment_handler.c
> +++ b/tools/testing/selftests/powerpc/alignment/alignment_handler.c
> @@ -443,7 +443,6 @@ int test_alignment_handler_integer(void)
>  	LOAD_DFORM_TEST(ldu);
>  	LOAD_XFORM_TEST(ldx);
>  	LOAD_XFORM_TEST(ldux);
> -	LOAD_DFORM_TEST(lmw);
>  	STORE_DFORM_TEST(stb);
>  	STORE_XFORM_TEST(stbx);
>  	STORE_DFORM_TEST(stbu);
> @@ -462,7 +461,11 @@ int test_alignment_handler_integer(void)
>  	STORE_XFORM_TEST(stdx);
>  	STORE_DFORM_TEST(stdu);
>  	STORE_XFORM_TEST(stdux);
> +
> +#ifdef __BIG_ENDIAN__
> +	LOAD_DFORM_TEST(lmw);
>  	STORE_DFORM_TEST(stmw);
> +#endif
>  
>  	return rc;
>  }
> -- 
> 2.25.1
> 

-- 
Libor Pechacek
SUSE Labs                                Remember to have fun...

^ permalink raw reply

* Re: [PATCH net RFC] ibmvnic: device remove has higher precedence over reset
From: Lijun Pan @ 2021-01-19 20:08 UTC (permalink / raw)
  To: Uwe Kleine-König
  Cc: gregkh, julietk, netdev, Jakub Kicinski, Lijun Pan, kernel, drt,
	paulus, sukadev, linuxppc-dev, davem
In-Reply-To: <20210119194959.a67nlfbngx4drvyz@pengutronix.de>

On Tue, Jan 19, 2021 at 1:56 PM Uwe Kleine-König
<u.kleine-koenig@pengutronix.de> wrote:
>
> On Tue, Jan 19, 2021 at 01:33:13PM -0600, Lijun Pan wrote:
> > Returning -EBUSY in ibmvnic_remove() does not actually hold the
> > removal procedure since driver core doesn't care for the return
> > value (see __device_release_driver() in drivers/base/dd.c
> > calling dev->bus->remove()) though vio_bus_remove
> > (in arch/powerpc/platforms/pseries/vio.c) records the
> > return value and passes it on. [1]
> >
> > During the device removal precedure, we should not schedule
> > any new reset, and rely on the flush_work and flush_delayed_work
> > to complete the pending resets, and specifically we need to
> > let __ibmvnic_reset() keep running while in REMOVING state since
> > flush_work and flush_delayed_work shall call __ibmvnic_reset finally.
> >
> > [1] https://lore.kernel.org/linuxppc-dev/20210117101242.dpwayq6wdgfdzirl@pengutronix.de/T/#m48f5befd96bc9842ece2a3ad14f4c27747206a53
> > Reported-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
> > Fixes: 7d7195a026ba ("ibmvnic: Do not process device remove during device reset")
> > Signed-off-by: Lijun Pan <ljp@linux.ibm.com>
> > ---
> >  drivers/net/ethernet/ibm/ibmvnic.c | 8 +-------
> >  1 file changed, 1 insertion(+), 7 deletions(-)
> >
> > diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
> > index aed985e08e8a..11f28fd03057 100644
> > --- a/drivers/net/ethernet/ibm/ibmvnic.c
> > +++ b/drivers/net/ethernet/ibm/ibmvnic.c
> > @@ -2235,8 +2235,7 @@ static void __ibmvnic_reset(struct work_struct *work)
> >       while (rwi) {
> >               spin_lock_irqsave(&adapter->state_lock, flags);
> >
> > -             if (adapter->state == VNIC_REMOVING ||
> > -                 adapter->state == VNIC_REMOVED) {
> > +             if (adapter->state == VNIC_REMOVED) {
>
> I think you need to keep the check for VNIC_REMOVING. Otherwise you
> don't prevent that a new reset being queued after ibmvnic_remove() set
> the state to VNIC_REMOVING. Am I missing something?

I leave the checking for REMOVING there in ibmvnic_reset, which is the
function to schedule/queue up the resets.
Here I delete the REMOVING in __ibmvnic_reset to let it run and finish.
Otherwise flush_work will not do anything if __ibmvnic_reset() just return
doing nothing.

I can explain it in the commit message.

>
> >                       spin_unlock_irqrestore(&adapter->state_lock, flags);
> >                       kfree(rwi);
> >                       rc = EBUSY;
> > @@ -5372,11 +5371,6 @@ static int ibmvnic_remove(struct vio_dev *dev)
> >       unsigned long flags;
> >
> >       spin_lock_irqsave(&adapter->state_lock, flags);
> > -     if (test_bit(0, &adapter->resetting)) {
> > -             spin_unlock_irqrestore(&adapter->state_lock, flags);
> > -             return -EBUSY;
> > -     }
> > -
> >       adapter->state = VNIC_REMOVING;
> >       spin_unlock_irqrestore(&adapter->state_lock, flags);
>
> Best regards
> Uwe
>
> --
> Pengutronix e.K.                           | Uwe Kleine-König            |
> Industrial Linux Solutions                 | https://www.pengutronix.de/ |

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox