LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH AUTOSEL 5.7 219/388] powerpc/64s/exceptions: Machine check reconcile irq state
From: Sasha Levin @ 2020-06-18  1:05 UTC (permalink / raw)
  To: linux-kernel, stable; +Cc: Sasha Levin, linuxppc-dev, Nicholas Piggin
In-Reply-To: <20200618010805.600873-1-sashal@kernel.org>

From: Nicholas Piggin <npiggin@gmail.com>

[ Upstream commit f0fd9dd3c213c947dfb5bc2cad3ef5e30d3258ec ]

pseries fwnmi machine check code pops the soft-irq checks in rtas_call
(after the next patch to remove rtas_token from this call path).
Rather than play whack a mole with these and forever having fragile
code, it seems better to have the early machine check handler perform
the same kind of reconcile as the other NMI interrupts.

  WARNING: CPU: 0 PID: 493 at arch/powerpc/kernel/irq.c:343
  CPU: 0 PID: 493 Comm: a Tainted: G        W
  NIP:  c00000000001ed2c LR: c000000000042c40 CTR: 0000000000000000
  REGS: c0000001fffd38b0 TRAP: 0700   Tainted: G        W
  MSR:  8000000000021003 <SF,ME,RI,LE>  CR: 28000488  XER: 00000000
  CFAR: c00000000001ec90 IRQMASK: 0
  GPR00: c000000000043820 c0000001fffd3b40 c0000000012ba300 0000000000000000
  GPR04: 0000000048000488 0000000000000000 0000000000000000 00000000deadbeef
  GPR08: 0000000000000080 0000000000000000 0000000000000000 0000000000001001
  GPR12: 0000000000000000 c0000000014a0000 0000000000000000 0000000000000000
  GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
  GPR20: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
  GPR24: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
  GPR28: 0000000000000000 0000000000000001 c000000001360810 0000000000000000
  NIP [c00000000001ed2c] arch_local_irq_restore.part.0+0xac/0x100
  LR [c000000000042c40] unlock_rtas+0x30/0x90
  Call Trace:
  [c0000001fffd3b40] [c000000001360810] 0xc000000001360810 (unreliable)
  [c0000001fffd3b60] [c000000000043820] rtas_call+0x1c0/0x280
  [c0000001fffd3bb0] [c0000000000dc328] fwnmi_release_errinfo+0x38/0x70
  [c0000001fffd3c10] [c0000000000dcd8c] pseries_machine_check_realmode+0x1dc/0x540
  [c0000001fffd3cd0] [c00000000003fe04] machine_check_early+0x54/0x70
  [c0000001fffd3d00] [c000000000008384] machine_check_early_common+0x134/0x1f0
  --- interrupt: 200 at 0x13f1307c8
      LR = 0x7fff888b8528
  Instruction dump:
  60000000 7d2000a6 71298000 41820068 39200002 7d210164 4bffff9c 60000000
  60000000 7d2000a6 71298000 4c820020 <0fe00000> 4e800020 60000000 60000000

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200508043408.886394-5-npiggin@gmail.com
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 arch/powerpc/kernel/exceptions-64s.S | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 463372046169..d3e19934cca9 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -1117,11 +1117,30 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
 	li	r10,MSR_RI
 	mtmsrd	r10,1
 
+	/*
+	 * Set IRQS_ALL_DISABLED and save PACAIRQHAPPENED (see
+	 * system_reset_common)
+	 */
+	li	r10,IRQS_ALL_DISABLED
+	stb	r10,PACAIRQSOFTMASK(r13)
+	lbz	r10,PACAIRQHAPPENED(r13)
+	std	r10,RESULT(r1)
+	ori	r10,r10,PACA_IRQ_HARD_DIS
+	stb	r10,PACAIRQHAPPENED(r13)
+
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	bl	machine_check_early
 	std	r3,RESULT(r1)	/* Save result */
 	ld	r12,_MSR(r1)
 
+	/*
+	 * Restore soft mask settings.
+	 */
+	ld	r10,RESULT(r1)
+	stb	r10,PACAIRQHAPPENED(r13)
+	ld	r10,SOFTE(r1)
+	stb	r10,PACAIRQSOFTMASK(r13)
+
 #ifdef CONFIG_PPC_P7_NAP
 	/*
 	 * Check if thread was in power saving mode. We come here when any
-- 
2.25.1


^ permalink raw reply related

* [PATCH AUTOSEL 5.7 218/388] powerpc/64s/exception: Fix machine check no-loss idle wakeup
From: Sasha Levin @ 2020-06-18  1:05 UTC (permalink / raw)
  To: linux-kernel, stable; +Cc: Sasha Levin, linuxppc-dev, Nicholas Piggin
In-Reply-To: <20200618010805.600873-1-sashal@kernel.org>

From: Nicholas Piggin <npiggin@gmail.com>

[ Upstream commit 8a5054d8cbbe03c68dcb0957c291c942132e4101 ]

The architecture allows for machine check exceptions to cause idle
wakeups which resume at the 0x200 address which has to return via
the idle wakeup code, but the early machine check handler is run
first.

The case of a no state-loss sleep is broken because the early
handler uses non-volatile register r1 , which is needed for the wakeup
protocol, but it is not restored.

Fix this by loading r1 from the MCE exception frame before returning
to the idle wakeup code. Also update the comment which has become
stale since the idle rewrite in C.

This crash was found and fix confirmed with a machine check injection
test in qemu powernv model (which is not upstream in qemu yet).

Fixes: 10d91611f426d ("powerpc/64s: Reimplement book3s idle code in C")
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200508043408.886394-2-npiggin@gmail.com
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 arch/powerpc/kernel/exceptions-64s.S | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index ebeebab74b56..463372046169 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -1225,17 +1225,19 @@ EXC_COMMON_BEGIN(machine_check_idle_common)
 	bl	machine_check_queue_event
 
 	/*
-	 * We have not used any non-volatile GPRs here, and as a rule
-	 * most exception code including machine check does not.
-	 * Therefore PACA_NAPSTATELOST does not need to be set. Idle
-	 * wakeup will restore volatile registers.
+	 * GPR-loss wakeups are relatively straightforward, because the
+	 * idle sleep code has saved all non-volatile registers on its
+	 * own stack, and r1 in PACAR1.
 	 *
-	 * Load the original SRR1 into r3 for pnv_powersave_wakeup_mce.
+	 * For no-loss wakeups the r1 and lr registers used by the
+	 * early machine check handler have to be restored first. r2 is
+	 * the kernel TOC, so no need to restore it.
 	 *
 	 * Then decrement MCE nesting after finishing with the stack.
 	 */
 	ld	r3,_MSR(r1)
 	ld	r4,_LINK(r1)
+	ld	r1,GPR1(r1)
 
 	lhz	r11,PACA_IN_MCE(r13)
 	subi	r11,r11,1
@@ -1244,7 +1246,7 @@ EXC_COMMON_BEGIN(machine_check_idle_common)
 	mtlr	r4
 	rlwinm	r10,r3,47-31,30,31
 	cmpwi	cr1,r10,2
-	bltlr	cr1	/* no state loss, return to idle caller */
+	bltlr	cr1	/* no state loss, return to idle caller with r3=SRR1 */
 	b	idle_return_gpr_loss
 #endif
 
-- 
2.25.1


^ permalink raw reply related

* [PATCH AUTOSEL 5.7 205/388] powerpc/64: Don't initialise init_task->thread.regs
From: Sasha Levin @ 2020-06-18  1:05 UTC (permalink / raw)
  To: linux-kernel, stable; +Cc: Sasha Levin, linuxppc-dev, Aneesh Kumar K . V
In-Reply-To: <20200618010805.600873-1-sashal@kernel.org>

From: Michael Ellerman <mpe@ellerman.id.au>

[ Upstream commit 7ffa8b7dc11752827329e4e84a574ea6aaf24716 ]

Aneesh increased the size of struct pt_regs by 16 bytes and started
seeing this WARN_ON:

  smp: Bringing up secondary CPUs ...
  ------------[ cut here ]------------
  WARNING: CPU: 0 PID: 0 at arch/powerpc/kernel/process.c:455 giveup_all+0xb4/0x110
  Modules linked in:
  CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.7.0-rc2-gcc-8.2.0-1.g8f6a41f-default+ #318
  NIP:  c00000000001a2b4 LR: c00000000001a29c CTR: c0000000031d0000
  REGS: c0000000026d3980 TRAP: 0700   Not tainted  (5.7.0-rc2-gcc-8.2.0-1.g8f6a41f-default+)
  MSR:  800000000282b033 <SF,VEC,VSX,EE,FP,ME,IR,DR,RI,LE>  CR: 48048224  XER: 00000000
  CFAR: c000000000019cc8 IRQMASK: 1
  GPR00: c00000000001a264 c0000000026d3c20 c0000000026d7200 800000000280b033
  GPR04: 0000000000000001 0000000000000000 0000000000000077 30206d7372203164
  GPR08: 0000000000002000 0000000002002000 800000000280b033 3230303030303030
  GPR12: 0000000000008800 c0000000031d0000 0000000000800050 0000000002000066
  GPR16: 000000000309a1a0 000000000309a4b0 000000000309a2d8 000000000309a890
  GPR20: 00000000030d0098 c00000000264da40 00000000fd620000 c0000000ff798080
  GPR24: c00000000264edf0 c0000001007469f0 00000000fd620000 c0000000020e5e90
  GPR28: c00000000264edf0 c00000000264d200 000000001db60000 c00000000264d200
  NIP [c00000000001a2b4] giveup_all+0xb4/0x110
  LR [c00000000001a29c] giveup_all+0x9c/0x110
  Call Trace:
  [c0000000026d3c20] [c00000000001a264] giveup_all+0x64/0x110 (unreliable)
  [c0000000026d3c90] [c00000000001ae34] __switch_to+0x104/0x480
  [c0000000026d3cf0] [c000000000e0b8a0] __schedule+0x320/0x970
  [c0000000026d3dd0] [c000000000e0c518] schedule_idle+0x38/0x70
  [c0000000026d3df0] [c00000000019c7c8] do_idle+0x248/0x3f0
  [c0000000026d3e70] [c00000000019cbb8] cpu_startup_entry+0x38/0x40
  [c0000000026d3ea0] [c000000000011bb0] rest_init+0xe0/0xf8
  [c0000000026d3ed0] [c000000002004820] start_kernel+0x990/0x9e0
  [c0000000026d3f90] [c00000000000c49c] start_here_common+0x1c/0x400

Which was unexpected. The warning is checking the thread.regs->msr
value of the task we are switching from:

  usermsr = tsk->thread.regs->msr;
  ...
  WARN_ON((usermsr & MSR_VSX) && !((usermsr & MSR_FP) && (usermsr & MSR_VEC)));

ie. if MSR_VSX is set then both of MSR_FP and MSR_VEC are also set.

Dumping tsk->thread.regs->msr we see that it's: 0x1db60000

Which is not a normal looking MSR, in fact the only valid bit is
MSR_VSX, all the other bits are reserved in the current definition of
the MSR.

We can see from the oops that it was swapper/0 that we were switching
from when we hit the warning, ie. init_task. So its thread.regs points
to the base (high addresses) in init_stack.

Dumping the content of init_task->thread.regs, with the members of
pt_regs annotated (the 16 bytes larger version), we see:

  0000000000000000 c000000002780080    gpr[0]     gpr[1]
  0000000000000000 c000000002666008    gpr[2]     gpr[3]
  c0000000026d3ed0 0000000000000078    gpr[4]     gpr[5]
  c000000000011b68 c000000002780080    gpr[6]     gpr[7]
  0000000000000000 0000000000000000    gpr[8]     gpr[9]
  c0000000026d3f90 0000800000002200    gpr[10]    gpr[11]
  c000000002004820 c0000000026d7200    gpr[12]    gpr[13]
  000000001db60000 c0000000010aabe8    gpr[14]    gpr[15]
  c0000000010aabe8 c0000000010aabe8    gpr[16]    gpr[17]
  c00000000294d598 0000000000000000    gpr[18]    gpr[19]
  0000000000000000 0000000000001ff8    gpr[20]    gpr[21]
  0000000000000000 c00000000206d608    gpr[22]    gpr[23]
  c00000000278e0cc 0000000000000000    gpr[24]    gpr[25]
  000000002fff0000 c000000000000000    gpr[26]    gpr[27]
  0000000002000000 0000000000000028    gpr[28]    gpr[29]
  000000001db60000 0000000004750000    gpr[30]    gpr[31]
  0000000002000000 000000001db60000    nip        msr
  0000000000000000 0000000000000000    orig_r3    ctr
  c00000000000c49c 0000000000000000    link       xer
  0000000000000000 0000000000000000    ccr        softe
  0000000000000000 0000000000000000    trap       dar
  0000000000000000 0000000000000000    dsisr      result
  0000000000000000 0000000000000000    ppr        kuap
  0000000000000000 0000000000000000    pad[2]     pad[3]

This looks suspiciously like stack frames, not a pt_regs. If we look
closely we can see return addresses from the stack trace above,
c000000002004820 (start_kernel) and c00000000000c49c (start_here_common).

init_task->thread.regs is setup at build time in processor.h:

  #define INIT_THREAD  { \
  	.ksp = INIT_SP, \
  	.regs = (struct pt_regs *)INIT_SP - 1, /* XXX bogus, I think */ \

The early boot code where we setup the initial stack is:

  LOAD_REG_ADDR(r3,init_thread_union)

  /* set up a stack pointer */
  LOAD_REG_IMMEDIATE(r1,THREAD_SIZE)
  add	r1,r3,r1
  li	r0,0
  stdu	r0,-STACK_FRAME_OVERHEAD(r1)

Which creates a stack frame of size 112 bytes (STACK_FRAME_OVERHEAD).
Which is far too small to contain a pt_regs.

So the result is init_task->thread.regs is pointing at some stack
frames on the init stack, not at a pt_regs.

We have gotten away with this for so long because with pt_regs at its
current size the MSR happens to point into the first frame, at a
location that is not written to by the early asm. With the 16 byte
expansion the MSR falls into the second frame, which is used by the
compiler, and collides with a saved register that tends to be
non-zero.

As far as I can see this has been wrong since the original merge of
64-bit ppc support, back in 2002.

Conceptually swapper should have no regs, it never entered from
userspace, and in fact that's what we do on 32-bit. It's also
presumably what the "bogus" comment is referring to.

So I think the right fix is to just not-initialise regs at all. I'm
slightly worried this will break some code that isn't prepared for a
NULL regs, but we'll have to see.

Remove the comment in head_64.S which refers to us setting up the
regs (even though we never did), and is otherwise not really accurate
any more.

Reported-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200428123130.73078-1-mpe@ellerman.id.au
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 arch/powerpc/include/asm/processor.h | 1 -
 arch/powerpc/kernel/head_64.S        | 9 +--------
 2 files changed, 1 insertion(+), 9 deletions(-)

diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h
index eedcbfb9a6ff..c220cb9eccad 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -301,7 +301,6 @@ struct thread_struct {
 #else
 #define INIT_THREAD  { \
 	.ksp = INIT_SP, \
-	.regs = (struct pt_regs *)INIT_SP - 1, /* XXX bogus, I think */ \
 	.addr_limit = KERNEL_DS, \
 	.fpexc_mode = 0, \
 	.fscr = FSCR_TAR | FSCR_EBB \
diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S
index ddfbd02140d9..0e05a9a47a4b 100644
--- a/arch/powerpc/kernel/head_64.S
+++ b/arch/powerpc/kernel/head_64.S
@@ -947,15 +947,8 @@ start_here_multiplatform:
 	std	r0,0(r4)
 #endif
 
-	/* The following gets the stack set up with the regs */
-	/* pointing to the real addr of the kernel stack.  This is   */
-	/* all done to support the C function call below which sets  */
-	/* up the htab.  This is done because we have relocated the  */
-	/* kernel but are still running in real mode. */
-
-	LOAD_REG_ADDR(r3,init_thread_union)
-
 	/* set up a stack pointer */
+	LOAD_REG_ADDR(r3,init_thread_union)
 	LOAD_REG_IMMEDIATE(r1,THREAD_SIZE)
 	add	r1,r3,r1
 	li	r0,0
-- 
2.25.1


^ permalink raw reply related

* Re: [PATCH] mm: Move p?d_alloc_track to separate header file
From: Andrew Morton @ 2020-06-18  1:12 UTC (permalink / raw)
  To: Joerg Roedel
  Cc: linux-arch, Stephen Rothwell, jroedel, linux-mm, peterz,
	Linus Torvalds, linuxppc-dev, Steven Rostedt, Mike Rapoport,
	Abdul Haleem, linux-next, Satheesh Rajendran, Andy Lutomirski,
	manvanth, hch, linux-kernel
In-Reply-To: <20200609120533.25867-1-joro@8bytes.org>

On Tue,  9 Jun 2020 14:05:33 +0200 Joerg Roedel <joro@8bytes.org> wrote:

> From: Joerg Roedel <jroedel@suse.de>
> 
> The functions are only used in two source files, so there is no need
> for them to be in the global <linux/mm.h> header. Move them to the new
> <linux/pgalloc-track.h> header and include it only where needed.
> 
> ...
>
> new file mode 100644
> index 000000000000..1dcc865029a2
> --- /dev/null
> +++ b/include/linux/pgalloc-track.h
> @@ -0,0 +1,51 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _LINUX_PGALLLC_TRACK_H
> +#define _LINUX_PGALLLC_TRACK_H

hm, no #includes.  I guess this is OK, given the limited use.

But it does make one wonder whether ioremap.c should be moved from lib/
to mm/ and this file should be moved from include/linux/ to mm/.

Oh well.

^ permalink raw reply

* [PATCH AUTOSEL 5.7 146/388] tty: hvc: Fix data abort due to race in hvc_open
From: Sasha Levin @ 2020-06-18  1:04 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Raghavendra Rao Ananta, Greg Kroah-Hartman, linuxppc-dev,
	Sasha Levin
In-Reply-To: <20200618010805.600873-1-sashal@kernel.org>

From: Raghavendra Rao Ananta <rananta@codeaurora.org>

[ Upstream commit e2bd1dcbe1aa34ff5570b3427c530e4332ecf0fe ]

Potentially, hvc_open() can be called in parallel when two tasks calls
open() on /dev/hvcX. In such a scenario, if the hp->ops->notifier_add()
callback in the function fails, where it sets the tty->driver_data to
NULL, the parallel hvc_open() can see this NULL and cause a memory abort.
Hence, serialize hvc_open and check if tty->private_data is NULL before
proceeding ahead.

The issue can be easily reproduced by launching two tasks simultaneously
that does nothing but open() and close() on /dev/hvcX.
For example:
$ ./simple_open_close /dev/hvc0 & ./simple_open_close /dev/hvc0 &

Signed-off-by: Raghavendra Rao Ananta <rananta@codeaurora.org>
Link: https://lore.kernel.org/r/20200428032601.22127-1-rananta@codeaurora.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/tty/hvc/hvc_console.c | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/drivers/tty/hvc/hvc_console.c b/drivers/tty/hvc/hvc_console.c
index cdcc64ea2554..f8e43a6faea9 100644
--- a/drivers/tty/hvc/hvc_console.c
+++ b/drivers/tty/hvc/hvc_console.c
@@ -75,6 +75,8 @@ static LIST_HEAD(hvc_structs);
  */
 static DEFINE_MUTEX(hvc_structs_mutex);
 
+/* Mutex to serialize hvc_open */
+static DEFINE_MUTEX(hvc_open_mutex);
 /*
  * This value is used to assign a tty->index value to a hvc_struct based
  * upon order of exposure via hvc_probe(), when we can not match it to
@@ -346,16 +348,24 @@ static int hvc_install(struct tty_driver *driver, struct tty_struct *tty)
  */
 static int hvc_open(struct tty_struct *tty, struct file * filp)
 {
-	struct hvc_struct *hp = tty->driver_data;
+	struct hvc_struct *hp;
 	unsigned long flags;
 	int rc = 0;
 
+	mutex_lock(&hvc_open_mutex);
+
+	hp = tty->driver_data;
+	if (!hp) {
+		rc = -EIO;
+		goto out;
+	}
+
 	spin_lock_irqsave(&hp->port.lock, flags);
 	/* Check and then increment for fast path open. */
 	if (hp->port.count++ > 0) {
 		spin_unlock_irqrestore(&hp->port.lock, flags);
 		hvc_kick();
-		return 0;
+		goto out;
 	} /* else count == 0 */
 	spin_unlock_irqrestore(&hp->port.lock, flags);
 
@@ -383,6 +393,8 @@ static int hvc_open(struct tty_struct *tty, struct file * filp)
 	/* Force wakeup of the polling thread */
 	hvc_kick();
 
+out:
+	mutex_unlock(&hvc_open_mutex);
 	return rc;
 }
 
-- 
2.25.1


^ permalink raw reply related

* [PATCH AUTOSEL 5.7 110/388] ibmvnic: Flush existing work items before device removal
From: Sasha Levin @ 2020-06-18  1:03 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Sasha Levin, netdev, Thomas Falcon, linuxppc-dev,
	David S . Miller
In-Reply-To: <20200618010805.600873-1-sashal@kernel.org>

From: Thomas Falcon <tlfalcon@linux.ibm.com>

[ Upstream commit 6954a9e4192b86d778fb52b525fd7b62d51b1147 ]

Ensure that all scheduled work items have completed before continuing
with device removal and after further event scheduling has been
halted. This patch fixes a bug where a scheduled driver reset event
is processed following device removal.

Signed-off-by: Thomas Falcon <tlfalcon@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/net/ethernet/ibm/ibmvnic.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index 197dc5b2c090..1b4d04e4474b 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -5184,6 +5184,9 @@ static int ibmvnic_remove(struct vio_dev *dev)
 	adapter->state = VNIC_REMOVING;
 	spin_unlock_irqrestore(&adapter->state_lock, flags);
 
+	flush_work(&adapter->ibmvnic_reset);
+	flush_delayed_work(&adapter->ibmvnic_delayed_reset);
+
 	rtnl_lock();
 	unregister_netdevice(netdev);
 
-- 
2.25.1


^ permalink raw reply related

* [PATCH AUTOSEL 5.7 107/388] scsi: ibmvscsi: Don't send host info in adapter info MAD after LPM
From: Sasha Levin @ 2020-06-18  1:03 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Sasha Levin, Tyrel Datwyler, linuxppc-dev, linux-scsi,
	Martin K . Petersen
In-Reply-To: <20200618010805.600873-1-sashal@kernel.org>

From: Tyrel Datwyler <tyreld@linux.ibm.com>

[ Upstream commit 4919b33b63c8b69d8dcf2b867431d0e3b6dc6d28 ]

The adapter info MAD is used to send the client info and receive the host
info as a response. A persistent buffer is used and as such the client info
is overwritten after the response. During the course of a normal adapter
reset the client info is refreshed in the buffer in preparation for sending
the adapter info MAD.

However, in the special case of LPM where we reenable the CRQ instead of a
full CRQ teardown and reset we fail to refresh the client info in the
adapter info buffer. As a result, after Live Partition Migration (LPM) we
erroneously report the host's info as our own.

[mkp: typos]

Link: https://lore.kernel.org/r/20200603203632.18426-1-tyreld@linux.ibm.com
Signed-off-by: Tyrel Datwyler <tyreld@linux.ibm.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/scsi/ibmvscsi/ibmvscsi.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/scsi/ibmvscsi/ibmvscsi.c b/drivers/scsi/ibmvscsi/ibmvscsi.c
index 59f0f1030c54..c5711c659b51 100644
--- a/drivers/scsi/ibmvscsi/ibmvscsi.c
+++ b/drivers/scsi/ibmvscsi/ibmvscsi.c
@@ -415,6 +415,8 @@ static int ibmvscsi_reenable_crq_queue(struct crq_queue *queue,
 	int rc = 0;
 	struct vio_dev *vdev = to_vio_dev(hostdata->dev);
 
+	set_adapter_info(hostdata);
+
 	/* Re-enable the CRQ */
 	do {
 		if (rc)
-- 
2.25.1


^ permalink raw reply related

* [PATCH AUTOSEL 5.7 086/388] powerpc/crashkernel: Take "mem=" option into account
From: Sasha Levin @ 2020-06-18  1:03 UTC (permalink / raw)
  To: linux-kernel, stable; +Cc: Sasha Levin, linuxppc-dev, Hari Bathini, Pingfan Liu
In-Reply-To: <20200618010805.600873-1-sashal@kernel.org>

From: Pingfan Liu <kernelfans@gmail.com>

[ Upstream commit be5470e0c285a68dc3afdea965032f5ddc8269d7 ]

'mem=" option is an easy way to put high pressure on memory during
some test. Hence after applying the memory limit, instead of total
mem, the actual usable memory should be considered when reserving mem
for crashkernel. Otherwise the boot up may experience OOM issue.

E.g. it would reserve 4G prior to the change and 512M afterward, if
passing
crashkernel="2G-4G:384M,4G-16G:512M,16G-64G:1G,64G-128G:2G,128G-:4G",
and mem=5G on a 256G machine.

This issue is powerpc specific because it puts higher priority on
fadump and kdump reservation than on "mem=". Referring the following
code:
    if (fadump_reserve_mem() == 0)
            reserve_crashkernel();
    ...
    /* Ensure that total memory size is page-aligned. */
    limit = ALIGN(memory_limit ?: memblock_phys_mem_size(), PAGE_SIZE);
    memblock_enforce_memory_limit(limit);

While on other arches, the effect of "mem=" takes a higher priority
and pass through memblock_phys_mem_size() before calling
reserve_crashkernel().

Signed-off-by: Pingfan Liu <kernelfans@gmail.com>
Reviewed-by: Hari Bathini <hbathini@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/1585749644-4148-1-git-send-email-kernelfans@gmail.com
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 arch/powerpc/kexec/core.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kexec/core.c b/arch/powerpc/kexec/core.c
index 078fe3d76feb..56da5eb2b923 100644
--- a/arch/powerpc/kexec/core.c
+++ b/arch/powerpc/kexec/core.c
@@ -115,11 +115,12 @@ void machine_kexec(struct kimage *image)
 
 void __init reserve_crashkernel(void)
 {
-	unsigned long long crash_size, crash_base;
+	unsigned long long crash_size, crash_base, total_mem_sz;
 	int ret;
 
+	total_mem_sz = memory_limit ? memory_limit : memblock_phys_mem_size();
 	/* use common parsing */
-	ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
+	ret = parse_crashkernel(boot_command_line, total_mem_sz,
 			&crash_size, &crash_base);
 	if (ret == 0 && crash_size > 0) {
 		crashk_res.start = crash_base;
@@ -178,6 +179,7 @@ void __init reserve_crashkernel(void)
 	/* Crash kernel trumps memory limit */
 	if (memory_limit && memory_limit <= crashk_res.end) {
 		memory_limit = crashk_res.end + 1;
+		total_mem_sz = memory_limit;
 		printk("Adjusted memory limit for crashkernel, now 0x%llx\n",
 		       memory_limit);
 	}
@@ -186,7 +188,7 @@ void __init reserve_crashkernel(void)
 			"for crashkernel (System RAM: %ldMB)\n",
 			(unsigned long)(crash_size >> 20),
 			(unsigned long)(crashk_res.start >> 20),
-			(unsigned long)(memblock_phys_mem_size() >> 20));
+			(unsigned long)(total_mem_sz >> 20));
 
 	if (!memblock_is_region_memory(crashk_res.start, crash_size) ||
 	    memblock_reserve(crashk_res.start, crash_size)) {
-- 
2.25.1


^ permalink raw reply related

* [PATCH AUTOSEL 5.7 080/388] powerpc/perf/hv-24x7: Fix inconsistent output values incase multiple hv-24x7 events run
From: Sasha Levin @ 2020-06-18  1:02 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Sasha Levin, Madhavan Srinivasan, Kajol Jain, Sukadev Bhattiprolu,
	linuxppc-dev
In-Reply-To: <20200618010805.600873-1-sashal@kernel.org>

From: Kajol Jain <kjain@linux.ibm.com>

[ Upstream commit b4ac18eead28611ff470d0f47a35c4e0ac080d9c ]

Commit 2b206ee6b0df ("powerpc/perf/hv-24x7: Display change in counter
values")' added to print _change_ in the counter value rather then raw
value for 24x7 counters. Incase of transactions, the event count
is set to 0 at the beginning of the transaction. It also sets
the event's prev_count to the raw value at the time of initialization.
Because of setting event count to 0, we are seeing some weird behaviour,
whenever we run multiple 24x7 events at a time.

For example:

command#: ./perf stat -e "{hv_24x7/PM_MCS01_128B_RD_DISP_PORT01,chip=0/,
			   hv_24x7/PM_MCS01_128B_RD_DISP_PORT01,chip=1/}"
	  		   -C 0 -I 1000 sleep 100

     1.000121704                120 hv_24x7/PM_MCS01_128B_RD_DISP_PORT01,chip=0/
     1.000121704                  5 hv_24x7/PM_MCS01_128B_RD_DISP_PORT01,chip=1/
     2.000357733                  8 hv_24x7/PM_MCS01_128B_RD_DISP_PORT01,chip=0/
     2.000357733                 10 hv_24x7/PM_MCS01_128B_RD_DISP_PORT01,chip=1/
     3.000495215 18,446,744,073,709,551,616 hv_24x7/PM_MCS01_128B_RD_DISP_PORT01,chip=0/
     3.000495215 18,446,744,073,709,551,616 hv_24x7/PM_MCS01_128B_RD_DISP_PORT01,chip=1/
     4.000641884                 56 hv_24x7/PM_MCS01_128B_RD_DISP_PORT01,chip=0/
     4.000641884 18,446,744,073,709,551,616 hv_24x7/PM_MCS01_128B_RD_DISP_PORT01,chip=1/
     5.000791887 18,446,744,073,709,551,616 hv_24x7/PM_MCS01_128B_RD_DISP_PORT01,chip=0/

Getting these large values in case we do -I.

As we are setting event_count to 0, for interval case, overall event_count is not
coming in incremental order. As we may can get new delta lesser then previous count.
Because of which when we print intervals, we are getting negative value which create
these large values.

This patch removes part where we set event_count to 0 in function
'h_24x7_event_read'. There won't be much impact as we do set event->hw.prev_count
to the raw value at the time of initialization to print change value.

With this patch
In power9 platform

command#: ./perf stat -e "{hv_24x7/PM_MCS01_128B_RD_DISP_PORT01,chip=0/,
		           hv_24x7/PM_MCS01_128B_RD_DISP_PORT01,chip=1/}"
			   -C 0 -I 1000 sleep 100

     1.000117685                 93 hv_24x7/PM_MCS01_128B_RD_DISP_PORT01,chip=0/
     1.000117685                  1 hv_24x7/PM_MCS01_128B_RD_DISP_PORT01,chip=1/
     2.000349331                 98 hv_24x7/PM_MCS01_128B_RD_DISP_PORT01,chip=0/
     2.000349331                  2 hv_24x7/PM_MCS01_128B_RD_DISP_PORT01,chip=1/
     3.000495900                131 hv_24x7/PM_MCS01_128B_RD_DISP_PORT01,chip=0/
     3.000495900                  4 hv_24x7/PM_MCS01_128B_RD_DISP_PORT01,chip=1/
     4.000645920                204 hv_24x7/PM_MCS01_128B_RD_DISP_PORT01,chip=0/
     4.000645920                 61 hv_24x7/PM_MCS01_128B_RD_DISP_PORT01,chip=1/
     4.284169997                 22 hv_24x7/PM_MCS01_128B_RD_DISP_PORT01,chip=0/

Suggested-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Signed-off-by: Kajol Jain <kjain@linux.ibm.com>
Tested-by: Madhavan Srinivasan <maddy@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200525104308.9814-2-kjain@linux.ibm.com
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 arch/powerpc/perf/hv-24x7.c | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/arch/powerpc/perf/hv-24x7.c b/arch/powerpc/perf/hv-24x7.c
index 573e0b309c0c..48e8f4b17b91 100644
--- a/arch/powerpc/perf/hv-24x7.c
+++ b/arch/powerpc/perf/hv-24x7.c
@@ -1400,16 +1400,6 @@ static void h_24x7_event_read(struct perf_event *event)
 			h24x7hw = &get_cpu_var(hv_24x7_hw);
 			h24x7hw->events[i] = event;
 			put_cpu_var(h24x7hw);
-			/*
-			 * Clear the event count so we can compute the _change_
-			 * in the 24x7 raw counter value at the end of the txn.
-			 *
-			 * Note that we could alternatively read the 24x7 value
-			 * now and save its value in event->hw.prev_count. But
-			 * that would require issuing a hcall, which would then
-			 * defeat the purpose of using the txn interface.
-			 */
-			local64_set(&event->count, 0);
 		}
 
 		put_cpu_var(hv_24x7_reqb);
-- 
2.25.1


^ permalink raw reply related

* [PATCH AUTOSEL 5.7 072/388] powerpc/ptdump: Add _PAGE_COHERENT flag
From: Sasha Levin @ 2020-06-18  1:02 UTC (permalink / raw)
  To: linux-kernel, stable; +Cc: Sasha Levin, linuxppc-dev
In-Reply-To: <20200618010805.600873-1-sashal@kernel.org>

From: Christophe Leroy <christophe.leroy@csgroup.eu>

[ Upstream commit 3af4786eb429b2df76cbd7ce3bae21467ac3e4fb ]

For platforms using shared.c (4xx, Book3e, Book3s/32), also handle the
_PAGE_COHERENT flag which corresponds to the M bit of the WIMG flags.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
[mpe: Make it more verbose, use "coherent" rather than "m"]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/324c3d860717e8e91fca3bb6c0f8b23e1644a404.1589866984.git.christophe.leroy@csgroup.eu
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 arch/powerpc/mm/ptdump/shared.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/arch/powerpc/mm/ptdump/shared.c b/arch/powerpc/mm/ptdump/shared.c
index f7ed2f187cb0..784f8df17f73 100644
--- a/arch/powerpc/mm/ptdump/shared.c
+++ b/arch/powerpc/mm/ptdump/shared.c
@@ -30,6 +30,11 @@ static const struct flag_info flag_array[] = {
 		.val	= _PAGE_PRESENT,
 		.set	= "present",
 		.clear	= "       ",
+	}, {
+		.mask	= _PAGE_COHERENT,
+		.val	= _PAGE_COHERENT,
+		.set	= "coherent",
+		.clear	= "        ",
 	}, {
 		.mask	= _PAGE_GUARDED,
 		.val	= _PAGE_GUARDED,
-- 
2.25.1


^ permalink raw reply related

* [PATCH AUTOSEL 5.7 065/388] powerpc/book3s64/radix/tlb: Determine hugepage flush correctly
From: Sasha Levin @ 2020-06-18  1:02 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Sasha Levin, Aneesh Kumar K.V, Nicholas Piggin, Bharata B Rao,
	linuxppc-dev
In-Reply-To: <20200618010805.600873-1-sashal@kernel.org>

From: "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com>

[ Upstream commit 8f53f9c0f68ab2168f637494b9e24034899c1310 ]

With a 64K page size flush with start and end:

  (start, end) = (721f680d0000, 721f680e0000)

results in:

  (hstart, hend) = (721f68200000, 721f68000000)

ie. hstart is above hend, which indicates no huge page flush is
needed.

However the current logic incorrectly sets hflush = true in this case,
because hstart != hend.

That causes us to call __tlbie_va_range() passing hstart/hend, to do a
huge page flush even though we don't need to. __tlbie_va_range() will
skip the actual tlbie operation for start > end. But it will still end
up calling fixup_tlbie_va_range() and doing the TLB fixups in there,
which is harmless but unnecessary work.

Reported-by: Bharata B Rao <bharata@linux.ibm.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Reviewed-by: Nicholas Piggin <npiggin@gmail.com>
[mpe: Drop else case, hflush is already false, flesh out change log]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200513030616.152288-1-aneesh.kumar@linux.ibm.com
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 arch/powerpc/mm/book3s64/radix_tlb.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
index 758ade2c2b6e..b5cc9b23cf02 100644
--- a/arch/powerpc/mm/book3s64/radix_tlb.c
+++ b/arch/powerpc/mm/book3s64/radix_tlb.c
@@ -884,9 +884,7 @@ static inline void __radix__flush_tlb_range(struct mm_struct *mm,
 		if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
 			hstart = (start + PMD_SIZE - 1) & PMD_MASK;
 			hend = end & PMD_MASK;
-			if (hstart == hend)
-				hflush = false;
-			else
+			if (hstart < hend)
 				hflush = true;
 		}
 
-- 
2.25.1


^ permalink raw reply related

* [PATCH AUTOSEL 5.7 058/388] ps3disk: use the default segment boundary
From: Sasha Levin @ 2020-06-18  1:02 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Sasha Levin, Emmanuel Nicolet, Geoff Levand, linux-block,
	linuxppc-dev
In-Reply-To: <20200618010805.600873-1-sashal@kernel.org>

From: Emmanuel Nicolet <emmanuel.nicolet@gmail.com>

[ Upstream commit 720bc316690bd27dea9d71510b50f0cd698ffc32 ]

Since commit dcebd755926b ("block: use bio_for_each_bvec() to compute
multi-page bvec count"), the kernel will bug_on on the PS3 because
bio_split() is called with sectors == 0:

  kernel BUG at block/bio.c:1853!
  Oops: Exception in kernel mode, sig: 5 [#1]
  BE PAGE_SIZE=4K MMU=Hash PREEMPT SMP NR_CPUS=8 NUMA PS3
  Modules linked in: firewire_sbp2 rtc_ps3(+) soundcore ps3_gelic(+) \
  ps3rom(+) firewire_core ps3vram(+) usb_common crc_itu_t
  CPU: 0 PID: 97 Comm: blkid Not tainted 5.3.0-rc4 #1
  NIP:  c00000000027d0d0 LR: c00000000027d0b0 CTR: 0000000000000000
  REGS: c00000000135ae90 TRAP: 0700   Not tainted  (5.3.0-rc4)
  MSR:  8000000000028032 <SF,EE,IR,DR,RI>  CR: 44008240  XER: 20000000
  IRQMASK: 0
  GPR00: c000000000289368 c00000000135b120 c00000000084a500 c000000004ff8300
  GPR04: 0000000000000c00 c000000004c905e0 c000000004c905e0 000000000000ffff
  GPR08: 0000000000000000 0000000000000001 0000000000000000 000000000000ffff
  GPR12: 0000000000000000 c0000000008ef000 000000000000003e 0000000000080001
  GPR16: 0000000000000100 000000000000ffff 0000000000000000 0000000000000004
  GPR20: c00000000062fd7e 0000000000000001 000000000000ffff 0000000000000080
  GPR24: c000000000781788 c00000000135b350 0000000000000080 c000000004c905e0
  GPR28: c00000000135b348 c000000004ff8300 0000000000000000 c000000004c90000
  NIP [c00000000027d0d0] .bio_split+0x28/0xac
  LR [c00000000027d0b0] .bio_split+0x8/0xac
  Call Trace:
  [c00000000135b120] [c00000000027d130] .bio_split+0x88/0xac (unreliable)
  [c00000000135b1b0] [c000000000289368] .__blk_queue_split+0x11c/0x53c
  [c00000000135b2d0] [c00000000028f614] .blk_mq_make_request+0x80/0x7d4
  [c00000000135b3d0] [c000000000283a8c] .generic_make_request+0x118/0x294
  [c00000000135b4b0] [c000000000283d34] .submit_bio+0x12c/0x174
  [c00000000135b580] [c000000000205a44] .mpage_bio_submit+0x3c/0x4c
  [c00000000135b600] [c000000000206184] .mpage_readpages+0xa4/0x184
  [c00000000135b750] [c0000000001ff8fc] .blkdev_readpages+0x24/0x38
  [c00000000135b7c0] [c0000000001589f0] .read_pages+0x6c/0x1a8
  [c00000000135b8b0] [c000000000158c74] .__do_page_cache_readahead+0x118/0x184
  [c00000000135b9b0] [c0000000001591a8] .force_page_cache_readahead+0xe4/0xe8
  [c00000000135ba50] [c00000000014fc24] .generic_file_read_iter+0x1d8/0x830
  [c00000000135bb50] [c0000000001ffadc] .blkdev_read_iter+0x40/0x5c
  [c00000000135bbc0] [c0000000001b9e00] .new_sync_read+0x144/0x1a0
  [c00000000135bcd0] [c0000000001bc454] .vfs_read+0xa0/0x124
  [c00000000135bd70] [c0000000001bc7a4] .ksys_read+0x70/0xd8
  [c00000000135be20] [c00000000000a524] system_call+0x5c/0x70
  Instruction dump:
  7fe3fb78 482e30dc 7c0802a6 482e3085 7c9e2378 f821ff71 7ca42b78 7d3e00d0
  7c7d1b78 79290fe0 7cc53378 69290001 <0b090000> 81230028 7bca0020 7929ba62
  [ end trace 313fec760f30aa1f ]---

The problem originates from setting the segment boundary of the
request queue to -1UL. This makes get_max_segment_size() return zero
when offset is zero, whatever the max segment size. The test with
BLK_SEG_BOUNDARY_MASK fails and 'mask - (mask & offset) + 1' overflows
to zero in the return statement.

Not setting the segment boundary and using the default
value (BLK_SEG_BOUNDARY_MASK) fixes the problem.

Signed-off-by: Emmanuel Nicolet <emmanuel.nicolet@gmail.com>
Signed-off-by: Geoff Levand <geoff@infradead.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/060a416c43138f45105c0540eff1a45539f7e2fc.1589049250.git.geoff@infradead.org
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/block/ps3disk.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c
index c5c6487a19d5..7b55811c2a81 100644
--- a/drivers/block/ps3disk.c
+++ b/drivers/block/ps3disk.c
@@ -454,7 +454,6 @@ static int ps3disk_probe(struct ps3_system_bus_device *_dev)
 	queue->queuedata = dev;
 
 	blk_queue_max_hw_sectors(queue, dev->bounce_size >> 9);
-	blk_queue_segment_boundary(queue, -1UL);
 	blk_queue_dma_alignment(queue, dev->blk_size-1);
 	blk_queue_logical_block_size(queue, dev->blk_size);
 
-- 
2.25.1


^ permalink raw reply related

* [PATCH AUTOSEL 5.7 032/388] powerpc/kasan: Fix stack overflow by increasing THREAD_SHIFT
From: Sasha Levin @ 2020-06-18  1:02 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Christophe Leroy, erhard_f, linuxppc-dev, Sasha Levin
In-Reply-To: <20200618010805.600873-1-sashal@kernel.org>

From: Christophe Leroy <christophe.leroy@c-s.fr>

[ Upstream commit edbadaf0671072298e506074128b64e003c5812c ]

When CONFIG_KASAN is selected, the stack usage is increased.

In the same way as x86 and arm64 architectures, increase
THREAD_SHIFT when CONFIG_KASAN is selected.

Fixes: 2edb16efc899 ("powerpc/32: Add KASAN support")
Reported-by: <erhard_f@mailbox.org>
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://bugzilla.kernel.org/show_bug.cgi?id=207129
Link: https://lore.kernel.org/r/2c50f3b1c9bbaa4217c9a98f3044bd2a36c46a4f.1586361277.git.christophe.leroy@c-s.fr
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 arch/powerpc/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index b29d7cb38368..51a074c26793 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -773,6 +773,7 @@ config THREAD_SHIFT
 	range 13 15
 	default "15" if PPC_256K_PAGES
 	default "14" if PPC64
+	default "14" if KASAN
 	default "13"
 	help
 	  Used to define the stack size. The default is almost always what you
-- 
2.25.1


^ permalink raw reply related

* [PATCH AUTOSEL 5.7 013/388] ASoC: fsl_esai: Disable exception interrupt before scheduling tasklet
From: Sasha Levin @ 2020-06-18  1:01 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Sasha Levin, alsa-devel, Shengjiu Wang, Nicolin Chen, Mark Brown,
	linuxppc-dev
In-Reply-To: <20200618010805.600873-1-sashal@kernel.org>

From: Shengjiu Wang <shengjiu.wang@nxp.com>

[ Upstream commit 1fecbb71fe0e46b886f84e3b6decca6643c3af6d ]

Disable exception interrupt before scheduling tasklet, otherwise if
the tasklet isn't handled immediately, there will be endless xrun
interrupt.

Fixes: 7ccafa2b3879 ("ASoC: fsl_esai: recover the channel swap after xrun")
Signed-off-by: Shengjiu Wang <shengjiu.wang@nxp.com>
Acked-by: Nicolin Chen <nicoleotsuka@gmail.com>
Link: https://lore.kernel.org/r/a8f2ad955aac9e52587beedc1133b3efbe746895.1587968824.git.shengjiu.wang@nxp.com
Signed-off-by: Mark Brown <broonie@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 sound/soc/fsl/fsl_esai.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/sound/soc/fsl/fsl_esai.c b/sound/soc/fsl/fsl_esai.c
index c7a49d03463a..84290be778f0 100644
--- a/sound/soc/fsl/fsl_esai.c
+++ b/sound/soc/fsl/fsl_esai.c
@@ -87,6 +87,10 @@ static irqreturn_t esai_isr(int irq, void *devid)
 	if ((saisr & (ESAI_SAISR_TUE | ESAI_SAISR_ROE)) &&
 	    esai_priv->reset_at_xrun) {
 		dev_dbg(&pdev->dev, "reset module for xrun\n");
+		regmap_update_bits(esai_priv->regmap, REG_ESAI_TCR,
+				   ESAI_xCR_xEIE_MASK, 0);
+		regmap_update_bits(esai_priv->regmap, REG_ESAI_RCR,
+				   ESAI_xCR_xEIE_MASK, 0);
 		tasklet_schedule(&esai_priv->task);
 	}
 
-- 
2.25.1


^ permalink raw reply related

* Re: [PATCH 3/3] powerpc/8xx: Provide ptep_get() with 16k pages
From: Michael Ellerman @ 2020-06-18  1:00 UTC (permalink / raw)
  To: Christophe Leroy, Peter Zijlstra
  Cc: Will Deacon, linux-kernel, linux-mm, Paul Mackerras,
	Andrew Morton, linuxppc-dev
In-Reply-To: <0bb024ce-11aa-80dc-c7d8-d5acc5329f25@csgroup.eu>

Christophe Leroy <christophe.leroy@csgroup.eu> writes:
> Le 17/06/2020 à 16:38, Peter Zijlstra a écrit :
>> On Thu, Jun 18, 2020 at 12:21:22AM +1000, Michael Ellerman wrote:
>>> Peter Zijlstra <peterz@infradead.org> writes:
>>>> On Mon, Jun 15, 2020 at 12:57:59PM +0000, Christophe Leroy wrote:
>> 
>>>>> +#if defined(CONFIG_PPC_8xx) && defined(CONFIG_PPC_16K_PAGES)
>>>>> +#define __HAVE_ARCH_PTEP_GET
>>>>> +static inline pte_t ptep_get(pte_t *ptep)
>>>>> +{
>>>>> +	pte_t pte = {READ_ONCE(ptep->pte), 0, 0, 0};
>>>>> +
>>>>> +	return pte;
>>>>> +}
>>>>> +#endif
>>>>
>>>> Would it make sense to have a comment with this magic? The casual reader
>>>> might wonder WTH just happened when he stumbles on this :-)
>>>
>>> I tried writing a helpful comment but it's too late for my brain to form
>>> sensible sentences.
>>>
>>> Christophe can you send a follow-up with a comment explaining it? In
>>> particular the zero entries stand out, it's kind of subtle that those
>>> entries are only populated with the right value when we write to the
>>> page table.
>> 
>> static inline pte_t ptep_get(pte_t *ptep)
>> {
>> 	unsigned long val = READ_ONCE(ptep->pte);
>> 	/* 16K pages have 4 identical value 4K entries */
>> 	pte_t pte = {val, val, val, val);
>> 	return pte;
>> }
>> 
>> Maybe something like that?
>
> This should work as well. Indeed nobody cares about what's in the other 
> three. They are only there to ensure that ptep++ increases the ptep 
> pointer by 16 bytes. Only the HW require 4 identical values, that's 
> taken care of in set_pte_at() and pte_update().

Right, but it seems less error-prone to have the in-memory
representation match what we have in the page table (well that's
in-memory too but you know what I mean).

> So we should use the most efficient. Thinking once more, maybe what you 
> propose is the most efficient as there is no need to load another 
> register with value 0 in order to write it in the stack.

On 64-bit I'd say it makes zero difference, the only thing that's going
to matter is the load from ptep->pte. I don't know whether that's true
on the 8xx cores though.

cheers

^ permalink raw reply

* Re: [PATCH 3/3] powerpc/8xx: Provide ptep_get() with 16k pages
From: Michael Ellerman @ 2020-06-18  0:58 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linuxppc-dev, linux-kernel, linux-mm, Paul Mackerras,
	Andrew Morton, Will Deacon
In-Reply-To: <20200617143826.GJ2531@hirez.programming.kicks-ass.net>

Peter Zijlstra <peterz@infradead.org> writes:
> On Thu, Jun 18, 2020 at 12:21:22AM +1000, Michael Ellerman wrote:
>> Peter Zijlstra <peterz@infradead.org> writes:
>> > On Mon, Jun 15, 2020 at 12:57:59PM +0000, Christophe Leroy wrote:
>
>> >> +#if defined(CONFIG_PPC_8xx) && defined(CONFIG_PPC_16K_PAGES)
>> >> +#define __HAVE_ARCH_PTEP_GET
>> >> +static inline pte_t ptep_get(pte_t *ptep)
>> >> +{
>> >> +	pte_t pte = {READ_ONCE(ptep->pte), 0, 0, 0};
>> >> +
>> >> +	return pte;
>> >> +}
>> >> +#endif
>> >
>> > Would it make sense to have a comment with this magic? The casual reader
>> > might wonder WTH just happened when he stumbles on this :-)
>> 
>> I tried writing a helpful comment but it's too late for my brain to form
>> sensible sentences.
>> 
>> Christophe can you send a follow-up with a comment explaining it? In
>> particular the zero entries stand out, it's kind of subtle that those
>> entries are only populated with the right value when we write to the
>> page table.
>
> static inline pte_t ptep_get(pte_t *ptep)
> {
> 	unsigned long val = READ_ONCE(ptep->pte);
> 	/* 16K pages have 4 identical value 4K entries */
> 	pte_t pte = {val, val, val, val);
> 	return pte;
> }
>
> Maybe something like that?

I think val wants to be pte_basic_t, but otherwise yeah I like that much
better.

cheers

^ permalink raw reply

* Re: [PATCH v5 01/13] powerpc: Remove Xilinx PPC405/PPC440 support
From: Michael Ellerman @ 2020-06-18  0:48 UTC (permalink / raw)
  To: Nick Desaulniers
  Cc: Arnd Bergmann, Michal Simek, LKML, clang-built-linux,
	Paul Mackerras, Nathan Chancellor, linuxppc-dev
In-Reply-To: <CAKwvOdnkcjLGay0jdQ77kHTmKhE56F9jvzh01XWwEE8rjbhLAA@mail.gmail.com>

Nick Desaulniers <ndesaulniers@google.com> writes:
> On Wed, Jun 17, 2020 at 3:20 AM Michael Ellerman <mpe@ellerman.id.au> wrote:
>> Michael Ellerman <mpe@ellerman.id.au> writes:
>> > Michal Simek <michal.simek@xilinx.com> writes:
>> <snip>
>>
>> >> Or if bamboo requires uImage to be built by default you can do it via
>> >> Kconfig.
>> >>
>> >> diff --git a/arch/powerpc/platforms/44x/Kconfig
>> >> b/arch/powerpc/platforms/44x/Kconfig
>> >> index 39e93d23fb38..300864d7b8c9 100644
>> >> --- a/arch/powerpc/platforms/44x/Kconfig
>> >> +++ b/arch/powerpc/platforms/44x/Kconfig
>> >> @@ -13,6 +13,7 @@ config BAMBOO
>> >>         select PPC44x_SIMPLE
>> >>         select 440EP
>> >>         select FORCE_PCI
>> >> +       select DEFAULT_UIMAGE
>> >>         help
>> >>           This option enables support for the IBM PPC440EP evaluation board.
>> >
>> > Who knows what the actual bamboo board used. But I'd be happy to take a
>> > SOB'ed patch to do the above, because these days the qemu emulation is
>> > much more likely to be used than the actual board.
>>
>> I just went to see why my CI boot of 44x didn't catch this, and it's
>> because I don't use the uImage, I just boot the vmlinux directly:
>>
>>   $ qemu-system-ppc -M bamboo -m 128m -display none -kernel build~/vmlinux -append "console=ttyS0" -display none -nodefaults -serial mon:stdio
>>   Linux version 5.8.0-rc1-00118-g69119673bd50 (michael@alpine1-p1) (gcc (Ubuntu 9.3.0-10ubuntu2) 9.3.0, GNU ld (GNU Binutils for Ubuntu) 2.34) #4 Wed Jun 17 20:19:22 AEST 2020
>>   Using PowerPC 44x Platform machine description
>>   ioremap() called early from find_legacy_serial_ports+0x690/0x770. Use early_ioremap() instead
>>   printk: bootconsole [udbg0] enabled
>>
>>
>> So that's probably the simplest solution?
>
> If the uImage or zImage self decompresses, I would prefer to test that as well.

The uImage is decompressed by qemu AIUI.

>> That means previously arch/powerpc/boot/zImage was just a hardlink to
>> the uImage:
>
> It sounds like we can just boot the zImage, or is that no longer
> created with the uImage?

The zImage won't boot on bamboo.

Because of the vagaries of the arch/powerpc/boot/Makefile the zImage
ends up pointing to treeImage.ebony, which is for a different board.

The zImage link is made to the first item in $(image-y):

$(obj)/zImage:		$(addprefix $(obj)/, $(image-y))
	$(Q)rm -f $@; ln $< $@
                         ^
                         first preqrequisite

Which for this defconfig happens to be:

image-$(CONFIG_EBONY)			+= treeImage.ebony cuImage.ebony

If you turned off CONFIG_EBONY then the zImage will be a link to
treeImage.bamboo, but qemu can't boot that either.

It's kind of nuts that the zImage points to some arbitrary image
depending on what's configured and the order of things in the Makefile.
But I'm not sure how we make it less nuts without risking breaking
people's existing setups.

cheers

^ permalink raw reply

* Re: [PATCH] scsi: target/sbp: remove firewire SBP target driver
From: Finn Thain @ 2020-06-18  0:40 UTC (permalink / raw)
  To: Martin K. Petersen
  Cc: Bart Van Assche, linux-scsi@vger.kernel.org, Johannes Thumshirn,
	Chuhong Yuan, linux-kernel@vger.kernel.org, Nicholas Bellinger,
	James Bottomley, target-devel@vger.kernel.org, Chris Boot,
	linux1394-devel@lists.sourceforge.net,
	linuxppc-dev@lists.ozlabs.org, Stefan Richter
In-Reply-To: <yq1366ul8o4.fsf@ca-mkp.ca.oracle.com>

On Tue, 16 Jun 2020, Martin K. Petersen wrote:

> 
> However, keeping code around is not free.

Right. And removing code isn't free either, if it forces people to find 
workarounds.

> Core interfaces change frequently.  Nobody enjoys having to tweak host 
> templates for 50 devices they have never even heard about.

And yet some people seem to enjoy writing patches that are as trivial as 
they are invasive...

You seem to be making an argument for more automation here, not an 
argument for less code. Or is there some upper bound to the size of the 
kernel, that might be lifted by adding maintainers? (Can you deliver a 
better product by adding more developers to your project?)

> Also, we now live in a reality where there is a constant barrage of 
> build bots and code analyzers sending mail. So the effective cost of 
> keeping code around in the tree is going up.

But if maintenance cost rises due to good analysis, the value of the code 
should rise too. So what's the problem? It seems to me that the real 
problem is too many analyses that generate pedantic noise and no tangible 
improvement in code quality or value.

> I get a substantial amount of code analysis mail about drivers nobody 
> have touched in a decade or more.
> 

When stable, mature code fails analysis, the analysis is also questionable 
(in the absence of real examples).

> Consequently, I am much more inclined to remove drivers than I have been 
> in the past. But I am also very happy to bring them back if somebody 
> uses them or - even better - are willing to step up and maintain them.
> 

You seem to be saying that 1) a driver should be removed when it no longer 
meets the present threshold for code quality and 2) that a low quality 
driver is eligible for re-addition because someone wants to use it.
I don't think you can have it both ways.

> I don't particularly like the notion of a driver being orphaned because 
> all that really means is that the driver transitions from being (at 
> least partially) somebody else's problem to being mine and mine alone.
> 

Yes it's your problem but only on a best-effort basis.

Many issues detected by automatic analyzers can be fixed with automatic 
code transformation tools. This kind of solution works tree-wide, so even 
if some defect in your driver is "yours and yours alone", the solution 
will probably come from others.

This email, like yours, is just hand-waving. So feel free to ignore it or 
(preferably) provide evidence of real defects.

^ permalink raw reply

* Re: [PATCH v4 0/3] mm, treewide: Rename kzfree() to kfree_sensitive()
From: Denis Efremov @ 2020-06-17 21:31 UTC (permalink / raw)
  To: Joe Perches, Waiman Long, Andrew Morton, David Howells,
	Jarkko Sakkinen, James Morris, Serge E. Hallyn, Linus Torvalds,
	Matthew Wilcox, David Rientjes
  Cc: Jason A . Donenfeld, Michal Hocko, linux-btrfs, linux-kernel,
	linux-mm, linux-sctp, target-devel, linux-stm32, devel,
	linux-cifs, linux-scsi, kasan-dev, linux-wpan, Dan Carpenter,
	linux-pm, ecryptfs, linux-fscrypt, linux-mediatek, linux-amlogic,
	virtualization, linux-nfs, netdev, linux-wireless, David Sterba,
	linux-bluetooth, linux-security-module, keyrings, tipc-discussion,
	linux-crypto, Johannes Weiner, linux-integrity, linuxppc-dev,
	wireguard, linux-ppp
In-Reply-To: <fe3b9a437be4aeab3bac68f04193cb6daaa5bee4.camel@perches.com>



On 6/16/20 9:53 PM, Joe Perches wrote:
> On Mon, 2020-06-15 at 21:57 -0400, Waiman Long wrote:
>>  v4:
>>   - Break out the memzero_explicit() change as suggested by Dan Carpenter
>>     so that it can be backported to stable.
>>   - Drop the "crypto: Remove unnecessary memzero_explicit()" patch for
>>     now as there can be a bit more discussion on what is best. It will be
>>     introduced as a separate patch later on after this one is merged.
> 
> To this larger audience and last week without reply:
> https://lore.kernel.org/lkml/573b3fbd5927c643920e1364230c296b23e7584d.camel@perches.com/
> 
> Are there _any_ fastpath uses of kfree or vfree?
> 
> Many patches have been posted recently to fix mispairings
> of specific types of alloc and free functions.

I've prepared a coccinelle script to highlight these mispairings in a function
a couple of days ago: https://lkml.org/lkml/2020/6/5/953
I've listed all the fixes in the commit message. 

Not so many mispairings actually, and most of them are harmless like:
kmalloc(E) -> kvfree(E)

However, coccinelle script can't detect cross-functions mispairings, i.e.
allocation in one function, free in another funtion.

Thanks,
Denis

^ permalink raw reply

* Re: [PATCH v4 0/3] mm, treewide: Rename kzfree() to kfree_sensitive()
From: Joe Perches @ 2020-06-17 23:12 UTC (permalink / raw)
  To: Denis Efremov, Waiman Long, Andrew Morton, David Howells,
	Jarkko Sakkinen, James Morris, Serge E. Hallyn, Linus Torvalds,
	Matthew Wilcox, David Rientjes
  Cc: Jason A . Donenfeld, Michal Hocko, linux-btrfs, linux-kernel,
	linux-mm, linux-sctp, target-devel, linux-stm32, devel,
	linux-cifs, linux-scsi, kasan-dev, linux-wpan, Dan Carpenter,
	linux-pm, ecryptfs, linux-fscrypt, linux-mediatek, linux-amlogic,
	virtualization, linux-nfs, netdev, linux-wireless, David Sterba,
	linux-bluetooth, linux-security-module, keyrings, tipc-discussion,
	linux-crypto, Johannes Weiner, linux-integrity, linuxppc-dev,
	wireguard, linux-ppp
In-Reply-To: <17e4fede-bab0-d93c-6964-69decc889d7d@ispras.ru>

On Thu, 2020-06-18 at 00:31 +0300, Denis Efremov wrote:
> 
> On 6/16/20 9:53 PM, Joe Perches wrote:
> > On Mon, 2020-06-15 at 21:57 -0400, Waiman Long wrote:
> > >  v4:
> > >   - Break out the memzero_explicit() change as suggested by Dan Carpenter
> > >     so that it can be backported to stable.
> > >   - Drop the "crypto: Remove unnecessary memzero_explicit()" patch for
> > >     now as there can be a bit more discussion on what is best. It will be
> > >     introduced as a separate patch later on after this one is merged.
> > 
> > To this larger audience and last week without reply:
> > https://lore.kernel.org/lkml/573b3fbd5927c643920e1364230c296b23e7584d.camel@perches.com/
> > 
> > Are there _any_ fastpath uses of kfree or vfree?
> > 
> > Many patches have been posted recently to fix mispairings
> > of specific types of alloc and free functions.
> 
> I've prepared a coccinelle script to highlight these mispairings in a function
> a couple of days ago: https://lkml.org/lkml/2020/6/5/953
> I've listed all the fixes in the commit message. 
> 
> Not so many mispairings actually, and most of them are harmless like:
> kmalloc(E) -> kvfree(E)
> 
> However, coccinelle script can't detect cross-functions mispairings, i.e.
> allocation in one function, free in another funtion.

Hey Denis, thanks for those patches.

If possible, it's probably better to not require these pairings
and use a single standard kfree/free function.

Given the existing ifs in kfree in slab/slob/slub, it seems
likely that adding a few more wouldn't have much impact.



^ permalink raw reply

* Re: [v1 PATCH 2/2] Add Documentation regarding the ima-kexec-buffer node in the chosen node documentation
From: Rob Herring @ 2020-06-17 20:43 UTC (permalink / raw)
  To: Prakhar Srivastava
  Cc: kstewart, mark.rutland, gregkh, bhsharma, tao.li, zohar, paulus,
	vincenzo.frascino, will, nramas, frowand.list, masahiroy, jmorris,
	takahiro.akashi, linux-arm-kernel, catalin.marinas, serge,
	devicetree, pasha.tatashin, hsinyi, tusharsu, tglx, allison,
	christophe.leroy, mbrugger, balajib, dmitry.kasatkin,
	linux-kernel, linux-security-module, james.morse, linux-integrity,
	linuxppc-dev
In-Reply-To: <20200607233323.22375-3-prsriva@linux.microsoft.com>

On Sun, Jun 07, 2020 at 04:33:23PM -0700, Prakhar Srivastava wrote:
> Add Documentation regarding the ima-kexec-buffer node in
>  the chosen node documentation

Run 'git log --oneline Documentation/devicetree/bindings/chosen.txt' and 
write $subject using the dominate format used.

For the commit message, answer why you need the change, not what the 
change is. I can read the diff for that.

>  
> Signed-off-by: Prakhar Srivastava <prsriva@linux.microsoft.com>
> ---
>  Documentation/devicetree/bindings/chosen.txt | 17 +++++++++++++++++
>  1 file changed, 17 insertions(+)

This file has moved to a schema here[1]. You need to update it.

> 
> diff --git a/Documentation/devicetree/bindings/chosen.txt b/Documentation/devicetree/bindings/chosen.txt
> index 45e79172a646..a15f70c007ef 100644
> --- a/Documentation/devicetree/bindings/chosen.txt
> +++ b/Documentation/devicetree/bindings/chosen.txt
> @@ -135,3 +135,20 @@ e.g.
>  		linux,initrd-end = <0x82800000>;
>  	};
>  };
> +
> +linux,ima-kexec-buffer
> +----------------------
> +
> +This property(currently used by powerpc, arm64) holds the memory range,
> +the address and the size, of the IMA measurement logs that are being carried
> +over to the kexec session.

What's IMA? 

> +
> +/ {
> +	chosen {
> +		linux,ima-kexec-buffer = <0x9 0x82000000 0x0 0x00008000>;
> +	};
> +};
> +
> +This porperty does not represent real hardware, but the memory allocated for

typo

> +carrying the IMA measurement logs. The address and the suze are expressed in

typo

> +#address-cells and #size-cells, respectively of the root node.
> -- 
> 2.25.1
> 


[1] https://github.com/devicetree-org/dt-schema/blob/master/schemas/chosen.yaml


^ permalink raw reply

* Re: [PATCH v5 01/13] powerpc: Remove Xilinx PPC405/PPC440 support
From: Nick Desaulniers @ 2020-06-17 18:16 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: Arnd Bergmann, Michal Simek, LKML, clang-built-linux,
	Paul Mackerras, Nathan Chancellor, linuxppc-dev
In-Reply-To: <878sgmdmcv.fsf@mpe.ellerman.id.au>

On Wed, Jun 17, 2020 at 3:20 AM Michael Ellerman <mpe@ellerman.id.au> wrote:
>
> Michael Ellerman <mpe@ellerman.id.au> writes:
> > Michal Simek <michal.simek@xilinx.com> writes:
> <snip>
>
> >> Or if bamboo requires uImage to be built by default you can do it via
> >> Kconfig.
> >>
> >> diff --git a/arch/powerpc/platforms/44x/Kconfig
> >> b/arch/powerpc/platforms/44x/Kconfig
> >> index 39e93d23fb38..300864d7b8c9 100644
> >> --- a/arch/powerpc/platforms/44x/Kconfig
> >> +++ b/arch/powerpc/platforms/44x/Kconfig
> >> @@ -13,6 +13,7 @@ config BAMBOO
> >>         select PPC44x_SIMPLE
> >>         select 440EP
> >>         select FORCE_PCI
> >> +       select DEFAULT_UIMAGE
> >>         help
> >>           This option enables support for the IBM PPC440EP evaluation board.
> >
> > Who knows what the actual bamboo board used. But I'd be happy to take a
> > SOB'ed patch to do the above, because these days the qemu emulation is
> > much more likely to be used than the actual board.
>
> I just went to see why my CI boot of 44x didn't catch this, and it's
> because I don't use the uImage, I just boot the vmlinux directly:
>
>   $ qemu-system-ppc -M bamboo -m 128m -display none -kernel build~/vmlinux -append "console=ttyS0" -display none -nodefaults -serial mon:stdio
>   Linux version 5.8.0-rc1-00118-g69119673bd50 (michael@alpine1-p1) (gcc (Ubuntu 9.3.0-10ubuntu2) 9.3.0, GNU ld (GNU Binutils for Ubuntu) 2.34) #4 Wed Jun 17 20:19:22 AEST 2020
>   Using PowerPC 44x Platform machine description
>   ioremap() called early from find_legacy_serial_ports+0x690/0x770. Use early_ioremap() instead
>   printk: bootconsole [udbg0] enabled
>
>
> So that's probably the simplest solution?

If the uImage or zImage self decompresses, I would prefer to test that as well.

> That means previously arch/powerpc/boot/zImage was just a hardlink to
> the uImage:

It sounds like we can just boot the zImage, or is that no longer
created with the uImage?
-- 
Thanks,
~Nick Desaulniers

^ permalink raw reply

* [PATCH v2 0/2] powerpc/pci: unmap interrupts when a PHB is removed
From: Cédric Le Goater @ 2020-06-17 16:29 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: linuxppc-dev, Oliver O'Halloran, Cédric Le Goater

Hello,

When a passthrough IO adapter is removed from a pseries machine using
hash MMU and the XIVE interrupt mode, the POWER hypervisor expects the
guest OS to clear all page table entries related to the adapter. If
some are still present, the RTAS call which isolates the PCI slot
returns error 9001 "valid outstanding translations" and the removal of
the IO adapter fails. This is because when the PHBs are scanned, Linux
maps automatically some interrupts in the Linux interrupt number space
but these are never removed.

To solve this problem, we introduce a PPC platform specific
pcibios_remove_bus() routine which clears all interrupt mappings when
the bus is removed. This also clears the associated page table entries
of the ESB pages when using XIVE.

For this purpose, we record the logical interrupt numbers of the
mapped interrupt under the PHB structure and let pcibios_remove_bus()
do the clean up.

Tested on :

  - PowerNV with PCI, OpenCAPI, CAPI and GPU adapters. I don't know
    how to inject a failure on a PHB but that would be a good test.
  - KVM P8+P9 guests with passthrough PCI adapters, but PHBs can not
    be removed under QEMU/KVM.   
  - PowerVM with passthrough PCI adapters (main target)
  
Thanks,

C.

Changes since v1:

 - extended the removal to interrupts other than the legacy INTx.

Cédric Le Goater (2):
  powerpc/pci: unmap legacy INTx interrupts when a PHB is removed
  powerpc/pci: unmap all interrupts when a PHB is removed

 arch/powerpc/include/asm/pci-bridge.h |   6 ++
 arch/powerpc/kernel/pci-common.c      | 114 ++++++++++++++++++++++++++
 2 files changed, 120 insertions(+)

-- 
2.25.4


^ permalink raw reply

* [PATCH v2 2/2] powerpc/pci: unmap all interrupts when a PHB is removed
From: Cédric Le Goater @ 2020-06-17 16:29 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: linuxppc-dev, Oliver O'Halloran, Cédric Le Goater
In-Reply-To: <20200617162938.743439-1-clg@kaod.org>

Some PCI adapters, like GPUs, use the "interrupt-map" property to
describe interrupt mappings other than the legacy INTx interrupts.
There can be more than 4 mappings.

To clear all interrupts when a PHB is removed, we need to increase the
'irq_map' array in which mappings are recorded. Compute the number of
interrupt mappings from the "interrupt-map" property and allocate a
bigger 'irq_map' array.

Signed-off-by: Cédric Le Goater <clg@kaod.org>
---
 arch/powerpc/kernel/pci-common.c | 49 +++++++++++++++++++++++++++++++-
 1 file changed, 48 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
index 515480a4bac6..deb831f0ae13 100644
--- a/arch/powerpc/kernel/pci-common.c
+++ b/arch/powerpc/kernel/pci-common.c
@@ -353,9 +353,56 @@ struct pci_controller *pci_find_controller_for_domain(int domain_nr)
 	return NULL;
 }
 
+/*
+ * Assumption is made on the interrupt parent. All interrupt-map
+ * entries are considered to have the same parent.
+ */
+static int pcibios_irq_map_count(struct pci_controller *phb)
+{
+	const __be32 *imap;
+	int imaplen;
+	struct device_node *parent;
+	u32 intsize, addrsize, parintsize, paraddrsize;
+
+	if (of_property_read_u32(phb->dn, "#interrupt-cells", &intsize))
+		return 0;
+	if (of_property_read_u32(phb->dn, "#address-cells", &addrsize))
+		return 0;
+
+	imap = of_get_property(phb->dn, "interrupt-map", &imaplen);
+	if (!imap) {
+		pr_debug("%pOF : no interrupt-map\n", phb->dn);
+		return 0;
+	}
+	imaplen /= sizeof(u32);
+	pr_debug("%pOF : imaplen=%d\n", phb->dn, imaplen);
+
+	if (imaplen < (addrsize + intsize + 1))
+		return 0;
+
+	imap += intsize + addrsize;
+	parent = of_find_node_by_phandle(be32_to_cpup(imap));
+	if (!parent) {
+		pr_debug("%pOF : no imap parent found !\n", phb->dn);
+		return 0;
+	}
+
+	if (of_property_read_u32(parent, "#interrupt-cells", &parintsize)) {
+		pr_debug("%pOF : parent lacks #interrupt-cells!\n", phb->dn);
+		return 0;
+	}
+
+	if (of_property_read_u32(parent, "#address-cells", &paraddrsize))
+		paraddrsize = 0;
+
+	return imaplen / (addrsize + intsize + 1 + paraddrsize + parintsize);
+}
+
 static void pcibios_irq_map_init(struct pci_controller *phb)
 {
-	phb->irq_count = PCI_NUM_INTX;
+	phb->irq_count = pcibios_irq_map_count(phb);
+	if (phb->irq_count < PCI_NUM_INTX)
+		phb->irq_count = PCI_NUM_INTX;
 
 	pr_debug("%pOF : interrupt map #%d\n", phb->dn, phb->irq_count);
 
-- 
2.25.4


^ permalink raw reply related

* [PATCH v2 1/2] powerpc/pci: unmap legacy INTx interrupts when a PHB is removed
From: Cédric Le Goater @ 2020-06-17 16:29 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: linuxppc-dev, Oliver O'Halloran, Cédric Le Goater
In-Reply-To: <20200617162938.743439-1-clg@kaod.org>

When a passthrough IO adapter is removed from a pseries machine using
hash MMU and the XIVE interrupt mode, the POWER hypervisor expects the
guest OS to clear all page table entries related to the adapter. If
some are still present, the RTAS call which isolates the PCI slot
returns error 9001 "valid outstanding translations" and the removal of
the IO adapter fails. This is because when the PHBs are scanned, Linux
maps automatically the INTx interrupts in the Linux interrupt number
space but these are never removed.

To solve this problem, we introduce a PPC platform specific
pcibios_remove_bus() routine which clears all interrupt mappings when
the bus is removed. This also clears the associated page table entries
of the ESB pages when using XIVE.

For this purpose, we record the logical interrupt numbers of the
mapped interrupt under the PHB structure and let pcibios_remove_bus()
do the clean up.

Signed-off-by: Cédric Le Goater <clg@kaod.org>
---
 arch/powerpc/include/asm/pci-bridge.h |  6 +++
 arch/powerpc/kernel/pci-common.c      | 67 +++++++++++++++++++++++++++
 2 files changed, 73 insertions(+)

diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
index b92e81b256e5..ca75cf264ddf 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -48,6 +48,9 @@ struct pci_controller_ops {
 
 /*
  * Structure of a PCI controller (host bridge)
+ *
+ * @irq_count: number of interrupt mappings
+ * @irq_map: interrupt mappings
  */
 struct pci_controller {
 	struct pci_bus *bus;
@@ -127,6 +130,9 @@ struct pci_controller {
 
 	void *private_data;
 	struct npu *npu;
+
+	unsigned int irq_count;
+	unsigned int *irq_map;
 };
 
 /* These are used for config access before all the PCI probing
diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
index be108616a721..515480a4bac6 100644
--- a/arch/powerpc/kernel/pci-common.c
+++ b/arch/powerpc/kernel/pci-common.c
@@ -353,6 +353,68 @@ struct pci_controller *pci_find_controller_for_domain(int domain_nr)
 	return NULL;
 }
 
+static void pcibios_irq_map_init(struct pci_controller *phb)
+{
+	phb->irq_count = PCI_NUM_INTX;
+
+	pr_debug("%pOF : interrupt map #%d\n", phb->dn, phb->irq_count);
+
+	phb->irq_map = kcalloc(phb->irq_count, sizeof(unsigned int),
+			       GFP_KERNEL);
+}
+
+static void pci_irq_map_register(struct pci_dev *pdev, unsigned int virq)
+{
+	struct pci_controller *phb = pci_bus_to_host(pdev->bus);
+	int i;
+
+	if (!phb->irq_map)
+		return;
+
+	for (i = 0; i < phb->irq_count; i++) {
+		/*
+		 * Look for an empty or an equivalent slot, as INTx
+		 * interrupts can be shared between adapters.
+		 */
+		if (phb->irq_map[i] == virq || !phb->irq_map[i]) {
+			phb->irq_map[i] = virq;
+			break;
+		}
+	}
+
+	if (i == phb->irq_count)
+		pr_err("PCI:%s all platform interrupts mapped\n",
+		       pci_name(pdev));
+}
+
+/*
+ * Clearing the mapped interrupts will also clear the underlying
+ * mappings of the ESB pages of the interrupts when under XIVE. It is
+ * a requirement of PowerVM to clear all memory mappings before
+ * removing a PHB.
+ */
+static void pci_irq_map_dispose(struct pci_bus *bus)
+{
+	struct pci_controller *phb = pci_bus_to_host(bus);
+	int i;
+
+	if (!phb->irq_map)
+		return;
+
+	pr_debug("PCI: Clearing interrupt mappings for PHB %04x:%02x...\n",
+		 pci_domain_nr(bus), bus->number);
+	for (i = 0; i < phb->irq_count; i++)
+		irq_dispose_mapping(phb->irq_map[i]);
+
+	kfree(phb->irq_map);
+}
+
+void pcibios_remove_bus(struct pci_bus *bus)
+{
+	pci_irq_map_dispose(bus);
+}
+EXPORT_SYMBOL_GPL(pcibios_remove_bus);
+
 /*
  * Reads the interrupt pin to determine if interrupt is use by card.
  * If the interrupt is used, then gets the interrupt line from the
@@ -401,6 +463,8 @@ static int pci_read_irq_line(struct pci_dev *pci_dev)
 
 	pci_dev->irq = virq;
 
+	/* Record all interrut mappings for later removal of a PHB */
+	pci_irq_map_register(pci_dev, virq);
 	return 0;
 }
 
@@ -1554,6 +1618,9 @@ void pcibios_scan_phb(struct pci_controller *hose)
 
 	pr_debug("PCI: Scanning PHB %pOF\n", node);
 
+	/* Allocate interrupt mappings array */
+	pcibios_irq_map_init(hose);
+
 	/* Get some IO space for the new PHB */
 	pcibios_setup_phb_io_space(hose);
 
-- 
2.25.4


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox