LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v1 02/11] powerpc: Rename PPC_NATIVE to PPC_HASH_MMU_NATIVE
From: Nicholas Piggin @ 2021-10-15 15:46 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Nicholas Piggin
In-Reply-To: <20211015154624.922960-1-npiggin@gmail.com>

PPC_NATIVE now only controls the native HPT code, so rename it to be
more descriptive. Restrict it to Book3S only.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/mm/book3s64/Makefile          | 2 +-
 arch/powerpc/mm/book3s64/hash_utils.c      | 2 +-
 arch/powerpc/platforms/52xx/Kconfig        | 2 +-
 arch/powerpc/platforms/Kconfig             | 4 ++--
 arch/powerpc/platforms/cell/Kconfig        | 2 +-
 arch/powerpc/platforms/chrp/Kconfig        | 2 +-
 arch/powerpc/platforms/embedded6xx/Kconfig | 2 +-
 arch/powerpc/platforms/maple/Kconfig       | 2 +-
 arch/powerpc/platforms/microwatt/Kconfig   | 2 +-
 arch/powerpc/platforms/pasemi/Kconfig      | 2 +-
 arch/powerpc/platforms/powermac/Kconfig    | 2 +-
 arch/powerpc/platforms/powernv/Kconfig     | 2 +-
 arch/powerpc/platforms/pseries/Kconfig     | 2 +-
 13 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/mm/book3s64/Makefile b/arch/powerpc/mm/book3s64/Makefile
index 1b56d3af47d4..319f4b7f3357 100644
--- a/arch/powerpc/mm/book3s64/Makefile
+++ b/arch/powerpc/mm/book3s64/Makefile
@@ -6,7 +6,7 @@ CFLAGS_REMOVE_slb.o = $(CC_FLAGS_FTRACE)
 
 obj-y				+= hash_pgtable.o hash_utils.o slb.o \
 				   mmu_context.o pgtable.o hash_tlb.o
-obj-$(CONFIG_PPC_NATIVE)	+= hash_native.o
+obj-$(CONFIG_PPC_HASH_MMU_NATIVE)	+= hash_native.o
 obj-$(CONFIG_PPC_RADIX_MMU)	+= radix_pgtable.o radix_tlb.o
 obj-$(CONFIG_PPC_4K_PAGES)	+= hash_4k.o
 obj-$(CONFIG_PPC_64K_PAGES)	+= hash_64k.o
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c
index c145776d3ae5..ebe3044711ce 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -1091,7 +1091,7 @@ void __init hash__early_init_mmu(void)
 		ps3_early_mm_init();
 	else if (firmware_has_feature(FW_FEATURE_LPAR))
 		hpte_init_pseries();
-	else if (IS_ENABLED(CONFIG_PPC_NATIVE))
+	else if (IS_ENABLED(CONFIG_PPC_HASH_MMU_NATIVE))
 		hpte_init_native();
 
 	if (!mmu_hash_ops.hpte_insert)
diff --git a/arch/powerpc/platforms/52xx/Kconfig b/arch/powerpc/platforms/52xx/Kconfig
index 99d60acc20c8..b72ed2950ca8 100644
--- a/arch/powerpc/platforms/52xx/Kconfig
+++ b/arch/powerpc/platforms/52xx/Kconfig
@@ -34,7 +34,7 @@ config PPC_EFIKA
 	bool "bPlan Efika 5k2. MPC5200B based computer"
 	depends on PPC_MPC52xx
 	select PPC_RTAS
-	select PPC_NATIVE
+	select PPC_HASH_MMU_NATIVE
 
 config PPC_LITE5200
 	bool "Freescale Lite5200 Eval Board"
diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig
index e02d29a9d12f..d41dad227de8 100644
--- a/arch/powerpc/platforms/Kconfig
+++ b/arch/powerpc/platforms/Kconfig
@@ -40,9 +40,9 @@ config EPAPR_PARAVIRT
 
 	  In case of doubt, say Y
 
-config PPC_NATIVE
+config PPC_HASH_MMU_NATIVE
 	bool
-	depends on PPC_BOOK3S_32 || PPC64
+	depends on PPC_BOOK3S
 	help
 	  Support for running natively on the hardware, i.e. without
 	  a hypervisor. This option is not user-selectable but should
diff --git a/arch/powerpc/platforms/cell/Kconfig b/arch/powerpc/platforms/cell/Kconfig
index cb70c5f25bc6..db4465c51b56 100644
--- a/arch/powerpc/platforms/cell/Kconfig
+++ b/arch/powerpc/platforms/cell/Kconfig
@@ -8,7 +8,7 @@ config PPC_CELL_COMMON
 	select PPC_DCR_MMIO
 	select PPC_INDIRECT_PIO
 	select PPC_INDIRECT_MMIO
-	select PPC_NATIVE
+	select PPC_HASH_MMU_NATIVE
 	select PPC_RTAS
 	select IRQ_EDGE_EOI_HANDLER
 
diff --git a/arch/powerpc/platforms/chrp/Kconfig b/arch/powerpc/platforms/chrp/Kconfig
index 9b5c5505718a..ff30ed579a39 100644
--- a/arch/powerpc/platforms/chrp/Kconfig
+++ b/arch/powerpc/platforms/chrp/Kconfig
@@ -11,6 +11,6 @@ config PPC_CHRP
 	select RTAS_ERROR_LOGGING
 	select PPC_MPC106
 	select PPC_UDBG_16550
-	select PPC_NATIVE
+	select PPC_HASH_MMU_NATIVE
 	select FORCE_PCI
 	default y
diff --git a/arch/powerpc/platforms/embedded6xx/Kconfig b/arch/powerpc/platforms/embedded6xx/Kconfig
index 4c6d703a4284..c54786f8461e 100644
--- a/arch/powerpc/platforms/embedded6xx/Kconfig
+++ b/arch/powerpc/platforms/embedded6xx/Kconfig
@@ -55,7 +55,7 @@ config MVME5100
 	select FORCE_PCI
 	select PPC_INDIRECT_PCI
 	select PPC_I8259
-	select PPC_NATIVE
+	select PPC_HASH_MMU_NATIVE
 	select PPC_UDBG_16550
 	help
 	  This option enables support for the Motorola (now Emerson) MVME5100
diff --git a/arch/powerpc/platforms/maple/Kconfig b/arch/powerpc/platforms/maple/Kconfig
index 86ae210bee9a..7fd84311ade5 100644
--- a/arch/powerpc/platforms/maple/Kconfig
+++ b/arch/powerpc/platforms/maple/Kconfig
@@ -9,7 +9,7 @@ config PPC_MAPLE
 	select GENERIC_TBSYNC
 	select PPC_UDBG_16550
 	select PPC_970_NAP
-	select PPC_NATIVE
+	select PPC_HASH_MMU_NATIVE
 	select PPC_RTAS
 	select MMIO_NVRAM
 	select ATA_NONSTANDARD if ATA
diff --git a/arch/powerpc/platforms/microwatt/Kconfig b/arch/powerpc/platforms/microwatt/Kconfig
index 8f6a81978461..62b51e37fc05 100644
--- a/arch/powerpc/platforms/microwatt/Kconfig
+++ b/arch/powerpc/platforms/microwatt/Kconfig
@@ -5,7 +5,7 @@ config PPC_MICROWATT
 	select PPC_XICS
 	select PPC_ICS_NATIVE
 	select PPC_ICP_NATIVE
-	select PPC_NATIVE
+	select PPC_HASH_MMU_NATIVE
 	select PPC_UDBG_16550
 	select ARCH_RANDOM
 	help
diff --git a/arch/powerpc/platforms/pasemi/Kconfig b/arch/powerpc/platforms/pasemi/Kconfig
index c52731a7773f..bc7137353a7f 100644
--- a/arch/powerpc/platforms/pasemi/Kconfig
+++ b/arch/powerpc/platforms/pasemi/Kconfig
@@ -5,7 +5,7 @@ config PPC_PASEMI
 	select MPIC
 	select FORCE_PCI
 	select PPC_UDBG_16550
-	select PPC_NATIVE
+	select PPC_HASH_MMU_NATIVE
 	select MPIC_BROKEN_REGREAD
 	help
 	  This option enables support for PA Semi's PWRficient line
diff --git a/arch/powerpc/platforms/powermac/Kconfig b/arch/powerpc/platforms/powermac/Kconfig
index b97bf12801eb..2b56df145b82 100644
--- a/arch/powerpc/platforms/powermac/Kconfig
+++ b/arch/powerpc/platforms/powermac/Kconfig
@@ -6,7 +6,7 @@ config PPC_PMAC
 	select FORCE_PCI
 	select PPC_INDIRECT_PCI if PPC32
 	select PPC_MPC106 if PPC32
-	select PPC_NATIVE
+	select PPC_HASH_MMU_NATIVE
 	select ZONE_DMA if PPC32
 	default y
 
diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig
index 043eefbbdd28..cd754e116184 100644
--- a/arch/powerpc/platforms/powernv/Kconfig
+++ b/arch/powerpc/platforms/powernv/Kconfig
@@ -2,7 +2,7 @@
 config PPC_POWERNV
 	depends on PPC64 && PPC_BOOK3S
 	bool "IBM PowerNV (Non-Virtualized) platform support"
-	select PPC_NATIVE
+	select PPC_HASH_MMU_NATIVE
 	select PPC_XICS
 	select PPC_ICP_NATIVE
 	select PPC_XIVE_NATIVE
diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig
index 5e037df2a3a1..69a1ff8c079b 100644
--- a/arch/powerpc/platforms/pseries/Kconfig
+++ b/arch/powerpc/platforms/pseries/Kconfig
@@ -17,7 +17,7 @@ config PPC_PSERIES
 	select PPC_RTAS_DAEMON
 	select RTAS_ERROR_LOGGING
 	select PPC_UDBG_16550
-	select PPC_NATIVE
+	select PPC_HASH_MMU_NATIVE
 	select PPC_DOORBELL
 	select HOTPLUG_CPU
 	select ARCH_RANDOM
-- 
2.23.0


^ permalink raw reply related

* [PATCH v1 00/11] powerpc: Make hash MMU code build configurable
From: Nicholas Piggin @ 2021-10-15 15:46 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Nicholas Piggin

Now that there's a platform that can make good use of it, here's
a series that can prevent the hash MMU code being built for 64s
platforms that don't need it.

Thanks Christophe and Michael for reviews of the RFC, I hope I
got all the issues raised.

Since RFC:
- Split out large code movement from other changes.
- Used mmu ftr test constant folding rather than adding new constant
  true/false for radix_enabled().
- Restore tlbie trace point that had to be commented out in the
  previous.
- Avoid minor (probably unreachable) behaviour change in machine check
  handler when hash was not compiled.
- Fix microwatt updates so !HASH is not enforced.
- Rebase, build fixes.

Thanks,
Nick

Nicholas Piggin (11):
  powerpc: Remove unused FW_FEATURE_NATIVE references
  powerpc: Rename PPC_NATIVE to PPC_HASH_MMU_NATIVE
  powerpc/pseries: Stop selecting PPC_HASH_MMU_NATIVE
  powerpc/64s: Move and rename do_bad_slb_fault as it is not hash
    specific
  powerpc/pseries: move pseries_lpar_register_process_table() out from
    hash specific code
  powerpc/pseries: lparcfg don't include slb_size line in radix mode
  powerpc/64s: move THP trace point creation out of hash specific file
  powerpc/64s: Make flush_and_reload_slb a no-op when radix is enabled
  powerpc/64s: Make hash MMU code build configurable
  powerpc/configs/microwatt: add POWER9_CPU
  powerpc/microwatt: Don't select the hash MMU code

 arch/powerpc/Kconfig                          |   1 +
 arch/powerpc/configs/microwatt_defconfig      |   2 +-
 arch/powerpc/include/asm/book3s/64/mmu.h      |  22 +++-
 .../include/asm/book3s/64/tlbflush-hash.h     |   7 ++
 arch/powerpc/include/asm/book3s/64/tlbflush.h |   4 -
 arch/powerpc/include/asm/book3s/pgtable.h     |   4 +
 arch/powerpc/include/asm/firmware.h           |   8 --
 arch/powerpc/include/asm/interrupt.h          |   2 +-
 arch/powerpc/include/asm/mmu.h                |  14 ++-
 arch/powerpc/include/asm/mmu_context.h        |   2 +
 arch/powerpc/include/asm/paca.h               |   8 ++
 arch/powerpc/kernel/asm-offsets.c             |   2 +
 arch/powerpc/kernel/dt_cpu_ftrs.c             |   8 +-
 arch/powerpc/kernel/entry_64.S                |   4 +-
 arch/powerpc/kernel/exceptions-64s.S          |  20 ++-
 arch/powerpc/kernel/mce.c                     |   2 +-
 arch/powerpc/kernel/mce_power.c               |  16 ++-
 arch/powerpc/kernel/paca.c                    |  18 ++-
 arch/powerpc/kernel/process.c                 |  13 +-
 arch/powerpc/kernel/prom.c                    |   2 +
 arch/powerpc/kernel/setup_64.c                |   4 +
 arch/powerpc/kexec/core_64.c                  |   4 +-
 arch/powerpc/kexec/ranges.c                   |   4 +
 arch/powerpc/kvm/Kconfig                      |   1 +
 arch/powerpc/mm/book3s64/Makefile             |  19 +--
 arch/powerpc/mm/book3s64/hash_native.c        | 104 ----------------
 arch/powerpc/mm/book3s64/hash_pgtable.c       |   1 -
 arch/powerpc/mm/book3s64/hash_utils.c         | 116 ++++++++++++++++--
 .../{hash_hugetlbpage.c => hugetlbpage.c}     |   6 +
 arch/powerpc/mm/book3s64/mmu_context.c        |  16 +++
 arch/powerpc/mm/book3s64/pgtable.c            |  13 ++
 arch/powerpc/mm/book3s64/radix_pgtable.c      |   4 +
 arch/powerpc/mm/book3s64/slb.c                |  16 ---
 arch/powerpc/mm/book3s64/trace.c              |   8 ++
 arch/powerpc/mm/copro_fault.c                 |   2 +
 arch/powerpc/mm/fault.c                       |  17 +++
 arch/powerpc/mm/pgtable.c                     |  10 +-
 arch/powerpc/platforms/52xx/Kconfig           |   2 +-
 arch/powerpc/platforms/Kconfig                |   4 +-
 arch/powerpc/platforms/Kconfig.cputype        |  21 +++-
 arch/powerpc/platforms/cell/Kconfig           |   3 +-
 arch/powerpc/platforms/chrp/Kconfig           |   2 +-
 arch/powerpc/platforms/embedded6xx/Kconfig    |   2 +-
 arch/powerpc/platforms/maple/Kconfig          |   3 +-
 arch/powerpc/platforms/microwatt/Kconfig      |   1 -
 arch/powerpc/platforms/pasemi/Kconfig         |   3 +-
 arch/powerpc/platforms/powermac/Kconfig       |   3 +-
 arch/powerpc/platforms/powernv/Kconfig        |   2 +-
 arch/powerpc/platforms/powernv/idle.c         |   2 +
 arch/powerpc/platforms/powernv/setup.c        |   2 +
 arch/powerpc/platforms/pseries/Kconfig        |   1 -
 arch/powerpc/platforms/pseries/lpar.c         |  67 +++++-----
 arch/powerpc/platforms/pseries/lparcfg.c      |   5 +-
 arch/powerpc/platforms/pseries/mobility.c     |   6 +
 arch/powerpc/platforms/pseries/ras.c          |   2 +
 arch/powerpc/platforms/pseries/reconfig.c     |   2 +
 arch/powerpc/platforms/pseries/setup.c        |   6 +-
 arch/powerpc/xmon/xmon.c                      |   8 +-
 58 files changed, 410 insertions(+), 241 deletions(-)
 rename arch/powerpc/mm/book3s64/{hash_hugetlbpage.c => hugetlbpage.c} (95%)
 create mode 100644 arch/powerpc/mm/book3s64/trace.c

-- 
2.23.0


^ permalink raw reply

* [PATCH v1 01/11] powerpc: Remove unused FW_FEATURE_NATIVE references
From: Nicholas Piggin @ 2021-10-15 15:46 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Nicholas Piggin
In-Reply-To: <20211015154624.922960-1-npiggin@gmail.com>

FW_FEATURE_NATIVE_ALWAYS and FW_FEATURE_NATIVE_POSSIBLE are always
zero and never do anything. Remove them.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/include/asm/firmware.h | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/arch/powerpc/include/asm/firmware.h b/arch/powerpc/include/asm/firmware.h
index 97a3bd9ffeb9..9b702d2b80fb 100644
--- a/arch/powerpc/include/asm/firmware.h
+++ b/arch/powerpc/include/asm/firmware.h
@@ -80,8 +80,6 @@ enum {
 	FW_FEATURE_POWERNV_ALWAYS = 0,
 	FW_FEATURE_PS3_POSSIBLE = FW_FEATURE_LPAR | FW_FEATURE_PS3_LV1,
 	FW_FEATURE_PS3_ALWAYS = FW_FEATURE_LPAR | FW_FEATURE_PS3_LV1,
-	FW_FEATURE_NATIVE_POSSIBLE = 0,
-	FW_FEATURE_NATIVE_ALWAYS = 0,
 	FW_FEATURE_POSSIBLE =
 #ifdef CONFIG_PPC_PSERIES
 		FW_FEATURE_PSERIES_POSSIBLE |
@@ -91,9 +89,6 @@ enum {
 #endif
 #ifdef CONFIG_PPC_PS3
 		FW_FEATURE_PS3_POSSIBLE |
-#endif
-#ifdef CONFIG_PPC_NATIVE
-		FW_FEATURE_NATIVE_ALWAYS |
 #endif
 		0,
 	FW_FEATURE_ALWAYS =
@@ -105,9 +100,6 @@ enum {
 #endif
 #ifdef CONFIG_PPC_PS3
 		FW_FEATURE_PS3_ALWAYS &
-#endif
-#ifdef CONFIG_PPC_NATIVE
-		FW_FEATURE_NATIVE_ALWAYS &
 #endif
 		FW_FEATURE_POSSIBLE,
 
-- 
2.23.0


^ permalink raw reply related

* Re: [PATCH] powerpc/smp: do not decrement idle task preempt count in CPU offline
From: Valentin Schneider @ 2021-10-15 15:36 UTC (permalink / raw)
  To: Nathan Lynch, linuxppc-dev; +Cc: srikar, peterz, linux-kernel, clg, mingo
In-Reply-To: <20211015145548.2269836-1-nathanl@linux.ibm.com>

On 15/10/21 09:55, Nathan Lynch wrote:
> With PREEMPT_COUNT=y, when a CPU is offlined and then onlined again, we
> get:
>
> BUG: scheduling while atomic: swapper/1/0/0x00000000
> no locks held by swapper/1/0.
> CPU: 1 PID: 0 Comm: swapper/1 Not tainted 5.15.0-rc2+ #100
> Call Trace:
>  dump_stack_lvl+0xac/0x108
>  __schedule_bug+0xac/0xe0
>  __schedule+0xcf8/0x10d0
>  schedule_idle+0x3c/0x70
>  do_idle+0x2d8/0x4a0
>  cpu_startup_entry+0x38/0x40
>  start_secondary+0x2ec/0x3a0
>  start_secondary_prolog+0x10/0x14
>
> This is because powerpc's arch_cpu_idle_dead() decrements the idle task's
> preempt count, for reasons explained in commit a7c2bb8279d2 ("powerpc:
> Re-enable preemption before cpu_die()"), specifically "start_secondary()
> expects a preempt_count() of 0."
>
> However, since commit 2c669ef6979c ("powerpc/preempt: Don't touch the idle
> task's preempt_count during hotplug") and commit f1a0a376ca0c ("sched/core:
> Initialize the idle task with preemption disabled"), that justification no
> longer holds.
>
> The idle task isn't supposed to re-enable preemption, so remove the
> vestigial preempt_enable() from the CPU offline path.
>

Humph, I got confused because 2c669ef6979c explicitly mentions hotplug,
but that's the hotplug machinery which is already involved for bringing up
the secondaries at boot time.

IIUC your issue here is the preempt_count being messed up when
hot-unplugging a CPU, which leads to fireworks during hotplug (IOW I didn't
test my last patch against hotplug - my bad!)

Reviewed-by: Valentin Schneider <valentin.schneider@arm.com>

> Tested with pseries and powernv in qemu, and pseries on PowerVM.
>
> Fixes: 2c669ef6979c ("powerpc/preempt: Don't touch the idle task's preempt_count during hotplug")
> Fixes: f1a0a376ca0c ("sched/core: Initialize the idle task with preemption disabled")

I think only the first Fixes: is needed.

> Signed-off-by: Nathan Lynch <nathanl@linux.ibm.com>
> ---
>  arch/powerpc/kernel/smp.c | 2 --
>  1 file changed, 2 deletions(-)
>
> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> index 9cc7d3dbf439..605bab448f84 100644
> --- a/arch/powerpc/kernel/smp.c
> +++ b/arch/powerpc/kernel/smp.c
> @@ -1730,8 +1730,6 @@ void __cpu_die(unsigned int cpu)
>
>  void arch_cpu_idle_dead(void)
>  {
> -	sched_preempt_enable_no_resched();
> -
>       /*
>        * Disable on the down path. This will be re-enabled by
>        * start_secondary() via start_secondary_resume() below
> --
> 2.31.1

^ permalink raw reply

* [PATCH] tracing: Have all levels of checks prevent recursion
From: Steven Rostedt @ 2021-10-15 15:00 UTC (permalink / raw)
  To: LKML
  Cc: 王贇, Peter Zijlstra (Intel), Paul Walmsley,
	James E.J. Bottomley, Paul Mackerras, Jisheng Zhang,
	H. Peter Anvin, live-patching, linux-riscv, Miroslav Benes,
	Joe Lawrence, Helge Deller, x86, linux-csky, Ingo Molnar,
	Petr Mladek, Albert Ou, Jiri Kosina, Nicholas Piggin,
	Borislav Petkov, Josh Poimboeuf, Thomas Gleixner, linux-parisc,
	Palmer Dabbelt, Masami Hiramatsu, Guo Ren, Colin Ian King,
	linuxppc-dev

From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>

While writing an email explaining the "bit = 0" logic for a discussion on
making ftrace_test_recursion_trylock() disable preemption, I discovered a
path that makes the "not do the logic if bit is zero" unsafe.

The recursion logic is done in hot paths like the function tracer. Thus,
any code executed causes noticeable overhead. Thus, tricks are done to try
to limit the amount of code executed. This included the recursion testing
logic.

Having recursion testing is important, as there are many paths that can
end up in an infinite recursion cycle when tracing every function in the
kernel. Thus protection is needed to prevent that from happening.

Because it is OK to recurse due to different running context levels (e.g.
an interrupt preempts a trace, and then a trace occurs in the interrupt
handler), a set of bits are used to know which context one is in (normal,
softirq, irq and NMI). If a recursion occurs in the same level, it is
prevented*.

Then there are infrastructure levels of recursion as well. When more than
one callback is attached to the same function to trace, it calls a loop
function to iterate over all the callbacks. Both the callbacks and the
loop function have recursion protection. The callbacks use the
"ftrace_test_recursion_trylock()" which has a "function" set of context
bits to test, and the loop function calls the internal
trace_test_and_set_recursion() directly, with an "internal" set of bits.

If an architecture does not implement all the features supported by ftrace
then the callbacks are never called directly, and the loop function is
called instead, which will implement the features of ftrace.

Since both the loop function and the callbacks do recursion protection, it
was seemed unnecessary to do it in both locations. Thus, a trick was made
to have the internal set of recursion bits at a more significant bit
location than the function bits. Then, if any of the higher bits were set,
the logic of the function bits could be skipped, as any new recursion
would first have to go through the loop function.

This is true for architectures that do not support all the ftrace
features, because all functions being traced must first go through the
loop function before going to the callbacks. But this is not true for
architectures that support all the ftrace features. That's because the
loop function could be called due to two callbacks attached to the same
function, but then a recursion function inside the callback could be
called that does not share any other callback, and it will be called
directly.

i.e.

 traced_function_1: [ more than one callback tracing it ]
   call loop_func

 loop_func:
   trace_recursion set internal bit
   call callback

 callback:
   trace_recursion [ skipped because internal bit is set, return 0 ]
   call traced_function_2

 traced_function_2: [ only traced by above callback ]
   call callback

 callback:
   trace_recursion [ skipped because internal bit is set, return 0 ]
   call traced_function_2

 [ wash, rinse, repeat, BOOM! out of shampoo! ]

Thus, the "bit == 0 skip" trick is not safe, unless the loop function is
call for all functions.

Since we want to encourage architectures to implement all ftrace features,
having them slow down due to this extra logic may encourage the
maintainers to update to the latest ftrace features. And because this
logic is only safe for them, remove it completely.

 [*] There is on layer of recursion that is allowed, and that is to allow
     for the transition between interrupt context (normal -> softirq ->
     irq -> NMI), because a trace may occur before the context update is
     visible to the trace recursion logic.

Link: https://lore.kernel.org/all/609b565a-ed6e-a1da-f025-166691b5d994@linux.alibaba.com/

Cc: stable@vger.kernel.org
Fixes: edc15cafcbfa3 ("tracing: Avoid unnecessary multiple recursion checks")
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/trace_recursion.h | 41 +++++----------------------------
 1 file changed, 6 insertions(+), 35 deletions(-)

diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h
index a9f9c5714e65..168fdf07419a 100644
--- a/include/linux/trace_recursion.h
+++ b/include/linux/trace_recursion.h
@@ -16,23 +16,8 @@
  *  When function tracing occurs, the following steps are made:
  *   If arch does not support a ftrace feature:
  *    call internal function (uses INTERNAL bits) which calls...
- *   If callback is registered to the "global" list, the list
- *    function is called and recursion checks the GLOBAL bits.
- *    then this function calls...
  *   The function callback, which can use the FTRACE bits to
  *    check for recursion.
- *
- * Now if the arch does not support a feature, and it calls
- * the global list function which calls the ftrace callback
- * all three of these steps will do a recursion protection.
- * There's no reason to do one if the previous caller already
- * did. The recursion that we are protecting against will
- * go through the same steps again.
- *
- * To prevent the multiple recursion checks, if a recursion
- * bit is set that is higher than the MAX bit of the current
- * check, then we know that the check was made by the previous
- * caller, and we can skip the current check.
  */
 enum {
 	/* Function recursion bits */
@@ -40,12 +25,14 @@ enum {
 	TRACE_FTRACE_NMI_BIT,
 	TRACE_FTRACE_IRQ_BIT,
 	TRACE_FTRACE_SIRQ_BIT,
+	TRACE_FTRACE_TRANSITION_BIT,
 
-	/* INTERNAL_BITs must be greater than FTRACE_BITs */
+	/* Internal use recursion bits */
 	TRACE_INTERNAL_BIT,
 	TRACE_INTERNAL_NMI_BIT,
 	TRACE_INTERNAL_IRQ_BIT,
 	TRACE_INTERNAL_SIRQ_BIT,
+	TRACE_INTERNAL_TRANSITION_BIT,
 
 	TRACE_BRANCH_BIT,
 /*
@@ -86,12 +73,6 @@ enum {
 	 */
 	TRACE_GRAPH_NOTRACE_BIT,
 
-	/*
-	 * When transitioning between context, the preempt_count() may
-	 * not be correct. Allow for a single recursion to cover this case.
-	 */
-	TRACE_TRANSITION_BIT,
-
 	/* Used to prevent recursion recording from recursing. */
 	TRACE_RECORD_RECURSION_BIT,
 };
@@ -132,6 +113,7 @@ enum {
 	TRACE_CTX_IRQ,
 	TRACE_CTX_SOFTIRQ,
 	TRACE_CTX_NORMAL,
+	TRACE_CTX_TRANSITION,
 };
 
 static __always_inline int trace_get_context_bit(void)
@@ -165,40 +147,29 @@ static __always_inline int trace_test_and_set_recursion(unsigned long ip, unsign
 	unsigned int val = READ_ONCE(current->trace_recursion);
 	int bit;
 
-	/* A previous recursion check was made */
-	if ((val & TRACE_CONTEXT_MASK) > max)
-		return 0;
-
 	bit = trace_get_context_bit() + start;
 	if (unlikely(val & (1 << bit))) {
 		/*
 		 * It could be that preempt_count has not been updated during
 		 * a switch between contexts. Allow for a single recursion.
 		 */
-		bit = TRACE_TRANSITION_BIT;
+		bit = TRACE_CTX_TRANSITION + start;
 		if (val & (1 << bit)) {
 			do_ftrace_record_recursion(ip, pip);
 			return -1;
 		}
-	} else {
-		/* Normal check passed, clear the transition to allow it again */
-		val &= ~(1 << TRACE_TRANSITION_BIT);
 	}
 
 	val |= 1 << bit;
 	current->trace_recursion = val;
 	barrier();
 
-	return bit + 1;
+	return bit;
 }
 
 static __always_inline void trace_clear_recursion(int bit)
 {
-	if (!bit)
-		return;
-
 	barrier();
-	bit--;
 	trace_recursion_clear(bit);
 }
 
-- 
2.31.1


^ permalink raw reply related

* [PATCH] powerpc/smp: do not decrement idle task preempt count in CPU offline
From: Nathan Lynch @ 2021-10-15 14:55 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: srikar, peterz, linux-kernel, valentin.schneider, clg, mingo

With PREEMPT_COUNT=y, when a CPU is offlined and then onlined again, we
get:

BUG: scheduling while atomic: swapper/1/0/0x00000000
no locks held by swapper/1/0.
CPU: 1 PID: 0 Comm: swapper/1 Not tainted 5.15.0-rc2+ #100
Call Trace:
 dump_stack_lvl+0xac/0x108
 __schedule_bug+0xac/0xe0
 __schedule+0xcf8/0x10d0
 schedule_idle+0x3c/0x70
 do_idle+0x2d8/0x4a0
 cpu_startup_entry+0x38/0x40
 start_secondary+0x2ec/0x3a0
 start_secondary_prolog+0x10/0x14

This is because powerpc's arch_cpu_idle_dead() decrements the idle task's
preempt count, for reasons explained in commit a7c2bb8279d2 ("powerpc:
Re-enable preemption before cpu_die()"), specifically "start_secondary()
expects a preempt_count() of 0."

However, since commit 2c669ef6979c ("powerpc/preempt: Don't touch the idle
task's preempt_count during hotplug") and commit f1a0a376ca0c ("sched/core:
Initialize the idle task with preemption disabled"), that justification no
longer holds.

The idle task isn't supposed to re-enable preemption, so remove the
vestigial preempt_enable() from the CPU offline path.

Tested with pseries and powernv in qemu, and pseries on PowerVM.

Fixes: 2c669ef6979c ("powerpc/preempt: Don't touch the idle task's preempt_count during hotplug")
Fixes: f1a0a376ca0c ("sched/core: Initialize the idle task with preemption disabled")
Signed-off-by: Nathan Lynch <nathanl@linux.ibm.com>
---
 arch/powerpc/kernel/smp.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 9cc7d3dbf439..605bab448f84 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1730,8 +1730,6 @@ void __cpu_die(unsigned int cpu)
 
 void arch_cpu_idle_dead(void)
 {
-	sched_preempt_enable_no_resched();
-
 	/*
 	 * Disable on the down path. This will be re-enabled by
 	 * start_secondary() via start_secondary_resume() below
-- 
2.31.1


^ permalink raw reply related

* [PATCH v2 19/24] PCI/DPC: Use RESPONSE_IS_PCI_ERROR() to check read from hardware
From: Naveen Naidu @ 2021-10-15 14:39 UTC (permalink / raw)
  To: bhelgaas
  Cc: linux-pci, open list:PCI ENHANCED ERROR HANDLING EEH FOR POWERPC,
	linux-kernel, Naveen Naidu, Oliver O'Halloran,
	linux-kernel-mentees
In-Reply-To: <cover.1634306198.git.naveennaidu479@gmail.com>

An MMIO read from a PCI device that doesn't exist or doesn't respond
causes a PCI error.  There's no real data to return to satisfy the
CPU read, so most hardware fabricates ~0 data.

Use RESPONSE_IS_PCI_ERROR() to check the response we get when we read
data from hardware.

This helps unify PCI error response checking and make error checks
consistent and easier to find.

Compile tested only.

Signed-off-by: Naveen Naidu <naveennaidu479@gmail.com>
---
 drivers/pci/pcie/dpc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
index c556e7beafe3..561c44d9429c 100644
--- a/drivers/pci/pcie/dpc.c
+++ b/drivers/pci/pcie/dpc.c
@@ -79,7 +79,7 @@ static bool dpc_completed(struct pci_dev *pdev)
 	u16 status;
 
 	pci_read_config_word(pdev, pdev->dpc_cap + PCI_EXP_DPC_STATUS, &status);
-	if ((status != 0xffff) && (status & PCI_EXP_DPC_STATUS_TRIGGER))
+	if ((!RESPONSE_IS_PCI_ERROR(&status)) && (status & PCI_EXP_DPC_STATUS_TRIGGER))
 		return false;
 
 	if (test_bit(PCI_DPC_RECOVERING, &pdev->priv_flags))
@@ -312,7 +312,7 @@ static irqreturn_t dpc_irq(int irq, void *context)
 
 	pci_read_config_word(pdev, cap + PCI_EXP_DPC_STATUS, &status);
 
-	if (!(status & PCI_EXP_DPC_STATUS_INTERRUPT) || status == (u16)(~0))
+	if (!(status & PCI_EXP_DPC_STATUS_INTERRUPT) || RESPONSE_IS_PCI_ERROR(&status))
 		return IRQ_NONE;
 
 	pci_write_config_word(pdev, cap + PCI_EXP_DPC_STATUS,
-- 
2.25.1


^ permalink raw reply related

* Re: [PATCH v2 00/24] Unify PCI error response checking
From: Naveen Naidu @ 2021-10-15 14:38 UTC (permalink / raw)
  To: bhelgaas
  Cc: Rob Herring, linux-hyperv, linux-samsung-soc, Pali Rohár,
	linux-pci, linuxppc-dev, linux-kernel, linux-renesas-soc,
	linux-rockchip, linux-mediatek, linux-kernel-mentees,
	linux-arm-kernel
In-Reply-To: <cover.1634300660.git.naveennaidu479@gmail.com>

On 15/10, Naveen Naidu wrote:
> An MMIO read from a PCI device that doesn't exist or doesn't respond
> causes a PCI error.  There's no real data to return to satisfy the 
> CPU read, so most hardware fabricates ~0 data.
> 
> This patch series adds PCI_ERROR_RESPONSE definition and other helper
> definition SET_PCI_ERROR_RESPONSE and RESPONSE_IS_PCI_ERROR and uses it
> where appropriate to make these checks consistent and easier to find.
> 
> This helps unify PCI error response checking and make error check
> consistent and easier to find.
> 
> This series also ensures that the error response fabrication now happens
> in the PCI_OP_READ and PCI_USER_READ_CONFIG. This removes the
> responsibility from controller drivers to do the error response setting. 
> 
> Patch 1:
>     - Adds the PCI_ERROR_RESPONSE and other related defintions
>     - All other patches are dependent on this patch. This patch needs to
>       be applied first, before the others
> 
> Patch 2:
>     - Error fabrication happens in PCI_OP_READ and PCI_USER_READ_CONFIG
>       whenever the data read via the controller driver fails.
>     - This patch needs to be applied before, Patch 4/24 to Patch 15/24 are
>       applied.
> 
> Patch 3:
>     - Uses SET_PCI_ERROR_RESPONSE() when device is not found and
>       RESPONSE_IS_PCI_ERROR() to check hardware read from the hardware.
> 
> Patch 4 - 15:
>     - Removes redundant error fabrication that happens in controller 
>       drivers when the read from a PCI device fails.
>     - These patches are dependent on Patch 2/24 of the series.
>     - These can be applied in any order.
> 
> Patch 16 - 22:
>     - Uses RESPONSE_IS_PCI_ERROR() to check the reads from hardware
>     - Patches can be applied in any order.
> 
> Patch 23 - 24:
>     - Edits the comments to include PCI_ERROR_RESPONSE alsong with
>       0xFFFFFFFF, so that it becomes easier to grep for faulty 
>       hardware reads.
> 
> Changelog
> =========
> 
> v2:
>     - Instead of using SET_PCI_ERROR_RESPONSE in all controller drivers
>       to fabricate error response, only use them in PCI_OP_READ and
>       PCI_USER_READ_CONFIG
> 
> Naveen Naidu (24):
>  [PATCH 1/24] PCI: Add PCI_ERROR_RESPONSE and it's related definitions
>  [PATCH 2/24] PCI: Set error response in config access defines when ops->read() fails
>  [PATCH 3/24] PCI: Unify PCI error response checking
>  [PATCH 4/24] PCI: Remove redundant error fabrication when device read fails
>  [PATCH 5/24] PCI: thunder: Remove redundant error fabrication when device read fails
>  [PATCH 6/24] PCI: iproc: Remove redundant error fabrication when device read fails
>  [PATCH 7/24] PCI: mediatek: Remove redundant error fabrication when device read fails
>  [PATCH 8/24] PCI: exynos: Remove redundant error fabrication when device read fails
>  [PATCH 9/24] PCI: histb: Remove redundant error fabrication when device read fails
>  [PATCH 10/24] PCI: kirin: Remove redundant error fabrication when device read fails
>  [PATCH 11/24] PCI: aardvark: Remove redundant error fabrication when device read fails
>  [PATCH 12/24] PCI: mvebu: Remove redundant error fabrication when device read fails
>  [PATCH 13/24] PCI: altera: Remove redundant error fabrication when device read fails
>  [PATCH 14/24] PCI: rcar: Remove redundant error fabrication when device read fails
>  [PATCH 15/24] PCI: rockchip: Remove redundant error fabrication when device read fails
>  [PATCH 16/24] PCI/ERR: Use RESPONSE_IS_PCI_ERROR() to check read from hardware
>  [PATCH 17/24] PCI: vmd: Use RESPONSE_IS_PCI_ERROR() to check read from hardware
>  [PATCH 18/24] PCI: pciehp: Use RESPONSE_IS_PCI_ERROR() to check read from hardware
>  [PATCH 19/24] PCI/DPC: Use RESPONSE_IS_PCI_ERROR() to check read from hardware
>  [PATCH 20/24] PCI/PME: Use RESPONSE_IS_PCI_ERROR() to check read from hardware
>  [PATCH 21/24] PCI: cpqphp: Use RESPONSE_IS_PCI_ERROR() to check read from hardware
>  [PATCH 22/24] PCI: keystone: Use PCI_ERROR_RESPONSE to specify hardware error
>  [PATCH 23/24] PCI: hv: Use PCI_ERROR_RESPONSE to specify hardware read error
>  [PATCH 24/24] PCI: xgene: Use PCI_ERROR_RESPONSE to specify hardware error
> 
>  drivers/pci/access.c                        | 36 ++++++++--------
>  drivers/pci/controller/dwc/pci-exynos.c     |  4 +-
>  drivers/pci/controller/dwc/pci-keystone.c   |  4 +-
>  drivers/pci/controller/dwc/pcie-histb.c     |  4 +-
>  drivers/pci/controller/dwc/pcie-kirin.c     |  4 +-
>  drivers/pci/controller/pci-aardvark.c       | 10 +----
>  drivers/pci/controller/pci-hyperv.c         |  2 +-
>  drivers/pci/controller/pci-mvebu.c          |  8 +---
>  drivers/pci/controller/pci-thunder-ecam.c   | 46 +++++++--------------
>  drivers/pci/controller/pci-thunder-pem.c    |  4 +-
>  drivers/pci/controller/pci-xgene.c          |  8 ++--
>  drivers/pci/controller/pcie-altera.c        |  4 +-
>  drivers/pci/controller/pcie-iproc.c         |  4 +-
>  drivers/pci/controller/pcie-mediatek.c      | 11 +----
>  drivers/pci/controller/pcie-rcar-host.c     |  4 +-
>  drivers/pci/controller/pcie-rockchip-host.c |  4 +-
>  drivers/pci/controller/vmd.c                |  2 +-
>  drivers/pci/hotplug/cpqphp_ctrl.c           |  4 +-
>  drivers/pci/hotplug/pciehp_hpc.c            | 10 ++---
>  drivers/pci/pci.c                           | 10 ++---
>  drivers/pci/pcie/dpc.c                      |  4 +-
>  drivers/pci/pcie/pme.c                      |  4 +-
>  drivers/pci/probe.c                         | 10 ++---
>  include/linux/pci.h                         |  9 ++++
>  24 files changed, 87 insertions(+), 123 deletions(-)
> 
> -- 
> 2.25.1
>

Please ignore this stray cover letter. I had a wrong message ID written
for it. Apologies for the inconvenience caused.


^ permalink raw reply

* [PATCH v2 00/24] Unify PCI error response checking
From: Naveen Naidu @ 2021-10-15 14:35 UTC (permalink / raw)
  To: bhelgaas
  Cc: Rob Herring, linux-hyperv, linux-samsung-soc, Pali Rohár,
	linux-rockchip, linux-pci, linuxppc-dev, linux-kernel,
	linux-renesas-soc, Naveen Naidu, linux-mediatek,
	linux-kernel-mentees, linux-arm-kernel

An MMIO read from a PCI device that doesn't exist or doesn't respond
causes a PCI error.  There's no real data to return to satisfy the 
CPU read, so most hardware fabricates ~0 data.

This patch series adds PCI_ERROR_RESPONSE definition and other helper
definition SET_PCI_ERROR_RESPONSE and RESPONSE_IS_PCI_ERROR and uses it
where appropriate to make these checks consistent and easier to find.

This helps unify PCI error response checking and make error check
consistent and easier to find.

This series also ensures that the error response fabrication now happens
in the PCI_OP_READ and PCI_USER_READ_CONFIG. This removes the
responsibility from controller drivers to do the error response setting. 

Patch 1:
    - Adds the PCI_ERROR_RESPONSE and other related defintions
    - All other patches are dependent on this patch. This patch needs to
      be applied first, before the others

Patch 2:
    - Error fabrication happens in PCI_OP_READ and PCI_USER_READ_CONFIG
      whenever the data read via the controller driver fails.
    - This patch needs to be applied before, Patch 4/24 to Patch 15/24 are
      applied.

Patch 3:
    - Uses SET_PCI_ERROR_RESPONSE() when device is not found and
      RESPONSE_IS_PCI_ERROR() to check hardware read from the hardware.

Patch 4 - 15:
    - Removes redundant error fabrication that happens in controller 
      drivers when the read from a PCI device fails.
    - These patches are dependent on Patch 2/24 of the series.
    - These can be applied in any order.

Patch 16 - 22:
    - Uses RESPONSE_IS_PCI_ERROR() to check the reads from hardware
    - Patches can be applied in any order.

Patch 23 - 24:
    - Edits the comments to include PCI_ERROR_RESPONSE alsong with
      0xFFFFFFFF, so that it becomes easier to grep for faulty 
      hardware reads.

Changelog
=========

v2:
    - Instead of using SET_PCI_ERROR_RESPONSE in all controller drivers
      to fabricate error response, only use them in PCI_OP_READ and
      PCI_USER_READ_CONFIG

Naveen Naidu (24):
 [PATCH 1/24] PCI: Add PCI_ERROR_RESPONSE and it's related definitions
 [PATCH 2/24] PCI: Set error response in config access defines when ops->read() fails
 [PATCH 3/24] PCI: Unify PCI error response checking
 [PATCH 4/24] PCI: Remove redundant error fabrication when device read fails
 [PATCH 5/24] PCI: thunder: Remove redundant error fabrication when device read fails
 [PATCH 6/24] PCI: iproc: Remove redundant error fabrication when device read fails
 [PATCH 7/24] PCI: mediatek: Remove redundant error fabrication when device read fails
 [PATCH 8/24] PCI: exynos: Remove redundant error fabrication when device read fails
 [PATCH 9/24] PCI: histb: Remove redundant error fabrication when device read fails
 [PATCH 10/24] PCI: kirin: Remove redundant error fabrication when device read fails
 [PATCH 11/24] PCI: aardvark: Remove redundant error fabrication when device read fails
 [PATCH 12/24] PCI: mvebu: Remove redundant error fabrication when device read fails
 [PATCH 13/24] PCI: altera: Remove redundant error fabrication when device read fails
 [PATCH 14/24] PCI: rcar: Remove redundant error fabrication when device read fails
 [PATCH 15/24] PCI: rockchip: Remove redundant error fabrication when device read fails
 [PATCH 16/24] PCI/ERR: Use RESPONSE_IS_PCI_ERROR() to check read from hardware
 [PATCH 17/24] PCI: vmd: Use RESPONSE_IS_PCI_ERROR() to check read from hardware
 [PATCH 18/24] PCI: pciehp: Use RESPONSE_IS_PCI_ERROR() to check read from hardware
 [PATCH 19/24] PCI/DPC: Use RESPONSE_IS_PCI_ERROR() to check read from hardware
 [PATCH 20/24] PCI/PME: Use RESPONSE_IS_PCI_ERROR() to check read from hardware
 [PATCH 21/24] PCI: cpqphp: Use RESPONSE_IS_PCI_ERROR() to check read from hardware
 [PATCH 22/24] PCI: keystone: Use PCI_ERROR_RESPONSE to specify hardware error
 [PATCH 23/24] PCI: hv: Use PCI_ERROR_RESPONSE to specify hardware read error
 [PATCH 24/24] PCI: xgene: Use PCI_ERROR_RESPONSE to specify hardware error

 drivers/pci/access.c                        | 36 ++++++++--------
 drivers/pci/controller/dwc/pci-exynos.c     |  4 +-
 drivers/pci/controller/dwc/pci-keystone.c   |  4 +-
 drivers/pci/controller/dwc/pcie-histb.c     |  4 +-
 drivers/pci/controller/dwc/pcie-kirin.c     |  4 +-
 drivers/pci/controller/pci-aardvark.c       | 10 +----
 drivers/pci/controller/pci-hyperv.c         |  2 +-
 drivers/pci/controller/pci-mvebu.c          |  8 +---
 drivers/pci/controller/pci-thunder-ecam.c   | 46 +++++++--------------
 drivers/pci/controller/pci-thunder-pem.c    |  4 +-
 drivers/pci/controller/pci-xgene.c          |  8 ++--
 drivers/pci/controller/pcie-altera.c        |  4 +-
 drivers/pci/controller/pcie-iproc.c         |  4 +-
 drivers/pci/controller/pcie-mediatek.c      | 11 +----
 drivers/pci/controller/pcie-rcar-host.c     |  4 +-
 drivers/pci/controller/pcie-rockchip-host.c |  4 +-
 drivers/pci/controller/vmd.c                |  2 +-
 drivers/pci/hotplug/cpqphp_ctrl.c           |  4 +-
 drivers/pci/hotplug/pciehp_hpc.c            | 10 ++---
 drivers/pci/pci.c                           | 10 ++---
 drivers/pci/pcie/dpc.c                      |  4 +-
 drivers/pci/pcie/pme.c                      |  4 +-
 drivers/pci/probe.c                         | 10 ++---
 include/linux/pci.h                         |  9 ++++
 24 files changed, 87 insertions(+), 123 deletions(-)

-- 
2.25.1


^ permalink raw reply

* [PATCH v2 00/24] Unify PCI error response checking
From: Naveen Naidu @ 2021-10-15 14:26 UTC (permalink / raw)
  To: bhelgaas
  Cc: Rob Herring, linux-hyperv, linux-samsung-soc, Pali Rohár,
	linux-rockchip, linux-pci, linuxppc-dev, linux-kernel,
	linux-renesas-soc, Naveen Naidu, linux-mediatek,
	linux-kernel-mentees, linux-arm-kernel

An MMIO read from a PCI device that doesn't exist or doesn't respond
causes a PCI error.  There's no real data to return to satisfy the 
CPU read, so most hardware fabricates ~0 data.

This patch series adds PCI_ERROR_RESPONSE definition and other helper
definition SET_PCI_ERROR_RESPONSE and RESPONSE_IS_PCI_ERROR and uses it
where appropriate to make these checks consistent and easier to find.

This helps unify PCI error response checking and make error check
consistent and easier to find.

This series also ensures that the error response fabrication now happens
in the PCI_OP_READ and PCI_USER_READ_CONFIG. This removes the
responsibility from controller drivers to do the error response setting. 

Patch 1:
    - Adds the PCI_ERROR_RESPONSE and other related defintions
    - All other patches are dependent on this patch. This patch needs to
      be applied first, before the others

Patch 2:
    - Error fabrication happens in PCI_OP_READ and PCI_USER_READ_CONFIG
      whenever the data read via the controller driver fails.
    - This patch needs to be applied before, Patch 4/24 to Patch 15/24 are
      applied.

Patch 3:
    - Uses SET_PCI_ERROR_RESPONSE() when device is not found and
      RESPONSE_IS_PCI_ERROR() to check hardware read from the hardware.

Patch 4 - 15:
    - Removes redundant error fabrication that happens in controller 
      drivers when the read from a PCI device fails.
    - These patches are dependent on Patch 2/24 of the series.
    - These can be applied in any order.

Patch 16 - 22:
    - Uses RESPONSE_IS_PCI_ERROR() to check the reads from hardware
    - Patches can be applied in any order.

Patch 23 - 24:
    - Edits the comments to include PCI_ERROR_RESPONSE alsong with
      0xFFFFFFFF, so that it becomes easier to grep for faulty 
      hardware reads.

Changelog
=========

v2:
    - Instead of using SET_PCI_ERROR_RESPONSE in all controller drivers
      to fabricate error response, only use them in PCI_OP_READ and
      PCI_USER_READ_CONFIG

Naveen Naidu (24):
 [PATCH 1/24] PCI: Add PCI_ERROR_RESPONSE and it's related definitions
 [PATCH 2/24] PCI: Set error response in config access defines when ops->read() fails
 [PATCH 3/24] PCI: Unify PCI error response checking
 [PATCH 4/24] PCI: Remove redundant error fabrication when device read fails
 [PATCH 5/24] PCI: thunder: Remove redundant error fabrication when device read fails
 [PATCH 6/24] PCI: iproc: Remove redundant error fabrication when device read fails
 [PATCH 7/24] PCI: mediatek: Remove redundant error fabrication when device read fails
 [PATCH 8/24] PCI: exynos: Remove redundant error fabrication when device read fails
 [PATCH 9/24] PCI: histb: Remove redundant error fabrication when device read fails
 [PATCH 10/24] PCI: kirin: Remove redundant error fabrication when device read fails
 [PATCH 11/24] PCI: aardvark: Remove redundant error fabrication when device read fails
 [PATCH 12/24] PCI: mvebu: Remove redundant error fabrication when device read fails
 [PATCH 13/24] PCI: altera: Remove redundant error fabrication when device read fails
 [PATCH 14/24] PCI: rcar: Remove redundant error fabrication when device read fails
 [PATCH 15/24] PCI: rockchip: Remove redundant error fabrication when device read fails
 [PATCH 16/24] PCI/ERR: Use RESPONSE_IS_PCI_ERROR() to check read from hardware
 [PATCH 17/24] PCI: vmd: Use RESPONSE_IS_PCI_ERROR() to check read from hardware
 [PATCH 18/24] PCI: pciehp: Use RESPONSE_IS_PCI_ERROR() to check read from hardware
 [PATCH 19/24] PCI/DPC: Use RESPONSE_IS_PCI_ERROR() to check read from hardware
 [PATCH 20/24] PCI/PME: Use RESPONSE_IS_PCI_ERROR() to check read from hardware
 [PATCH 21/24] PCI: cpqphp: Use RESPONSE_IS_PCI_ERROR() to check read from hardware
 [PATCH 22/24] PCI: keystone: Use PCI_ERROR_RESPONSE to specify hardware error
 [PATCH 23/24] PCI: hv: Use PCI_ERROR_RESPONSE to specify hardware read error
 [PATCH 24/24] PCI: xgene: Use PCI_ERROR_RESPONSE to specify hardware error

 drivers/pci/access.c                        | 36 ++++++++--------
 drivers/pci/controller/dwc/pci-exynos.c     |  4 +-
 drivers/pci/controller/dwc/pci-keystone.c   |  4 +-
 drivers/pci/controller/dwc/pcie-histb.c     |  4 +-
 drivers/pci/controller/dwc/pcie-kirin.c     |  4 +-
 drivers/pci/controller/pci-aardvark.c       | 10 +----
 drivers/pci/controller/pci-hyperv.c         |  2 +-
 drivers/pci/controller/pci-mvebu.c          |  8 +---
 drivers/pci/controller/pci-thunder-ecam.c   | 46 +++++++--------------
 drivers/pci/controller/pci-thunder-pem.c    |  4 +-
 drivers/pci/controller/pci-xgene.c          |  8 ++--
 drivers/pci/controller/pcie-altera.c        |  4 +-
 drivers/pci/controller/pcie-iproc.c         |  4 +-
 drivers/pci/controller/pcie-mediatek.c      | 11 +----
 drivers/pci/controller/pcie-rcar-host.c     |  4 +-
 drivers/pci/controller/pcie-rockchip-host.c |  4 +-
 drivers/pci/controller/vmd.c                |  2 +-
 drivers/pci/hotplug/cpqphp_ctrl.c           |  4 +-
 drivers/pci/hotplug/pciehp_hpc.c            | 10 ++---
 drivers/pci/pci.c                           | 10 ++---
 drivers/pci/pcie/dpc.c                      |  4 +-
 drivers/pci/pcie/pme.c                      |  4 +-
 drivers/pci/probe.c                         | 10 ++---
 include/linux/pci.h                         |  9 ++++
 24 files changed, 87 insertions(+), 123 deletions(-)

-- 
2.25.1


^ permalink raw reply

* Re: [PATCH] soc: fsl: dpio: protect smp_processor_id when get processor id
From: Robin Murphy @ 2021-10-15 13:39 UTC (permalink / raw)
  To: Meng.Li, Roy.Pledge, leoyang.li, ruxandra.radulescu, horia.geanta
  Cc: linuxppc-dev, linux-kernel, linux-arm-kernel
In-Reply-To: <20211015063601.23303-1-Meng.Li@windriver.com>

On 2021-10-15 07:36, Meng.Li@windriver.com wrote:
> From: Meng Li <meng.li@windriver.com>
> 
> When enable debug kernel configs,there will be calltrace as below:
> 
> BUG: using smp_processor_id() in preemptible [00000000] code: swapper/0/1
> caller is debug_smp_processor_id+0x20/0x30
> CPU: 6 PID: 1 Comm: swapper/0 Not tainted 5.10.63-yocto-standard #1
> Hardware name: NXP Layerscape LX2160ARDB (DT)
> Call trace:
>   dump_backtrace+0x0/0x1a0
>   show_stack+0x24/0x30
>   dump_stack+0xf0/0x13c
>   check_preemption_disabled+0x100/0x110
>   debug_smp_processor_id+0x20/0x30
>   dpaa2_io_query_fq_count+0xdc/0x154
>   dpaa2_eth_stop+0x144/0x314
>   __dev_close_many+0xdc/0x160
>   __dev_change_flags+0xe8/0x220
>   dev_change_flags+0x30/0x70
>   ic_close_devs+0x50/0x78
>   ip_auto_config+0xed0/0xf10
>   do_one_initcall+0xac/0x460
>   kernel_init_freeable+0x30c/0x378
>   kernel_init+0x20/0x128
>   ret_from_fork+0x10/0x38
> 
> Because smp_processor_id() should be invoked in preempt disable status.
> So, add preempt_disable/enable() to protect smp_processor_id().

If preemption doesn't matter anyway, as the comment in the context 
implies, then it probably makes more sense just to use 
raw_smp_processor_id() instead.

Robin.

> Fixes: c89105c9b390 ("staging: fsl-mc: Move DPIO from staging to drivers/soc/fsl")
> Cc: stable@vger.kernel.org
> Signed-off-by: Meng Li <Meng.Li@windriver.com>
> ---
>   drivers/soc/fsl/dpio/dpio-service.c | 7 +++++--
>   1 file changed, 5 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/soc/fsl/dpio/dpio-service.c b/drivers/soc/fsl/dpio/dpio-service.c
> index 19f47ea9dab0..afc3b89b0fc5 100644
> --- a/drivers/soc/fsl/dpio/dpio-service.c
> +++ b/drivers/soc/fsl/dpio/dpio-service.c
> @@ -58,8 +58,11 @@ static inline struct dpaa2_io *service_select_by_cpu(struct dpaa2_io *d,
>   	 * If cpu == -1, choose the current cpu, with no guarantees about
>   	 * potentially being migrated away.
>   	 */
> -	if (cpu < 0)
> -		cpu = smp_processor_id();
> +        if (cpu < 0) {
> +                preempt_disable();
> +                cpu = smp_processor_id();
> +                preempt_enable();
> +        }
>   
>   	/* If a specific cpu was requested, pick it up immediately */
>   	return dpio_by_cpu[cpu];
> 

^ permalink raw reply

* [PATCH 2/2] KVM: PPC: Book3S HV: Make idle_kvm_start_guest() return 0 if it went to guest
From: Michael Ellerman @ 2021-10-15 13:39 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: kvm-ppc, npiggin
In-Reply-To: <20211015133929.832061-1-mpe@ellerman.id.au>

We call idle_kvm_start_guest() from power7_offline() if the thread has
been requested to enter KVM. We pass it the SRR1 value that was returned
from power7_idle_insn() which tells us what sort of wakeup we're
processing.

Depending on the SRR1 value we pass in, the KVM code might enter the
guest, or it might return to us to do some host action if the wakeup
requires it.

If idle_kvm_start_guest() is able to handle the wakeup, and enter the
guest it is supposed to indicate that by returning a zero SRR1 value to
us.

That was the behaviour prior to commit 10d91611f426 ("powerpc/64s:
Reimplement book3s idle code in C"), however in that commit the
handling of SRR1 was reworked, and the zeroing behaviour was lost.

Returning from idle_kvm_start_guest() without zeroing the SRR1 value can
confuse the host offline code, causing the guest to crash and other
weirdness.

Fixes: 10d91611f426 ("powerpc/64s: Reimplement book3s idle code in C")
Cc: stable@vger.kernel.org # v5.2+
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index ec57952b60b0..eb776d0c5d8e 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -264,6 +264,7 @@ _GLOBAL(idle_kvm_start_guest)
 	stdu	r1, -SWITCH_FRAME_SIZE(r4)
 	// Switch to new frame on emergency stack
 	mr	r1, r4
+	std	r3, 32(r1)	// Save SRR1 wakeup value
 	SAVE_NVGPRS(r1)
 
 	/*
@@ -315,6 +316,10 @@ _GLOBAL(idle_kvm_start_guest)
 
 kvm_secondary_got_guest:
 
+	// About to go to guest, clear saved SRR1
+	li	r0, 0
+	std	r0, 32(r1)
+
 	/* Set HSTATE_DSCR(r13) to something sensible */
 	ld	r6, PACA_DSCR_DEFAULT(r13)
 	std	r6, HSTATE_DSCR(r13)
@@ -394,8 +399,8 @@ _GLOBAL(idle_kvm_start_guest)
 	mfspr	r4, SPRN_LPCR
 	rlwimi	r4, r3, 0, LPCR_PECE0 | LPCR_PECE1
 	mtspr	SPRN_LPCR, r4
-	/* set up r3 for return */
-	mfspr	r3,SPRN_SRR1
+	// Return SRR1 wakeup value, or 0 if we went into the guest
+	ld	r3, 32(r1)
 	REST_NVGPRS(r1)
 	ld	r1, 0(r1)	// Switch back to caller stack
 	ld	r0, 16(r1)	// Reload LR
-- 
2.25.1


^ permalink raw reply related

* [PATCH 1/2] KVM: PPC: Book3S HV: Fix stack handling in idle_kvm_start_guest()
From: Michael Ellerman @ 2021-10-15 13:39 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: kvm-ppc, npiggin

In commit 10d91611f426 ("powerpc/64s: Reimplement book3s idle code in
C") kvm_start_guest() became idle_kvm_start_guest(). The old code
allocated a stack frame on the emergency stack, but didn't use the
frame to store anything, and also didn't store anything in its caller's
frame.

idle_kvm_start_guest() on the other hand is written more like a normal C
function, it creates a frame on entry, and also stores CR/LR into its
callers frame (per the ABI). The problem is that there is no caller
frame on the emergency stack.

The emergency stack for a given CPU is allocated with:

  paca_ptrs[i]->emergency_sp = alloc_stack(limit, i) + THREAD_SIZE;

So emergency_sp actually points to the first address above the emergency
stack allocation for a given CPU, we must not store above it without
first decrementing it to create a frame. This is different to the
regular kernel stack, paca->kstack, which is initialised to point at an
initial frame that is ready to use.

idle_kvm_start_guest() stores the backchain, CR and LR all of which
write outside the allocation for the emergency stack. It then creates a
stack frame and saves the non-volatile registers. Unfortunately the
frame it creates is not large enough to fit the non-volatiles, and so
the saving of the non-volatile registers also writes outside the
emergency stack allocation.

The end result is that we corrupt whatever is at 0-24 bytes, and 112-248
bytes above the emergency stack allocation.

In practice this has gone unnoticed because the memory immediately above
the emergency stack happens to be used for other stack allocations,
either another CPUs mc_emergency_sp or an IRQ stack. See the order of
calls to irqstack_early_init() and emergency_stack_init().

The low addresses of another stack are the top of that stack, and so are
only used if that stack is under extreme pressue, which essentially
never happens in practice - and if it did there's a high likelyhood we'd
crash due to that stack overflowing.

Still, we shouldn't be corrupting someone else's stack, and it is purely
luck that we aren't corrupting something else.

To fix it we save CR/LR into the caller's frame using the existing r1 on
entry, we then create a SWITCH_FRAME_SIZE frame (which has space for
pt_regs) on the emergency stack with the backchain pointing to the
existing stack, and then finally we switch to the new frame on the
emergency stack.

Fixes: 10d91611f426 ("powerpc/64s: Reimplement book3s idle code in C")
Cc: stable@vger.kernel.org # v5.2+
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 90484425a1e6..ec57952b60b0 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -255,13 +255,15 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
  * r3 contains the SRR1 wakeup value, SRR1 is trashed.
  */
 _GLOBAL(idle_kvm_start_guest)
-	ld	r4,PACAEMERGSP(r13)
 	mfcr	r5
 	mflr	r0
-	std	r1,0(r4)
-	std	r5,8(r4)
-	std	r0,16(r4)
-	subi	r1,r4,STACK_FRAME_OVERHEAD
+	std	r5, 8(r1)	// Save CR in caller's frame
+	std	r0, 16(r1)	// Save LR in caller's frame
+	// Create frame on emergency stack
+	ld	r4, PACAEMERGSP(r13)
+	stdu	r1, -SWITCH_FRAME_SIZE(r4)
+	// Switch to new frame on emergency stack
+	mr	r1, r4
 	SAVE_NVGPRS(r1)
 
 	/*
@@ -395,10 +397,9 @@ _GLOBAL(idle_kvm_start_guest)
 	/* set up r3 for return */
 	mfspr	r3,SPRN_SRR1
 	REST_NVGPRS(r1)
-	addi	r1, r1, STACK_FRAME_OVERHEAD
-	ld	r0, 16(r1)
-	ld	r5, 8(r1)
-	ld	r1, 0(r1)
+	ld	r1, 0(r1)	// Switch back to caller stack
+	ld	r0, 16(r1)	// Reload LR
+	ld	r5, 8(r1)	// Reload CR
 	mtlr	r0
 	mtcr	r5
 	blr
-- 
2.25.1


^ permalink raw reply related

* Re: [PATCH v3 1/2] ftrace: disable preemption between ftrace_test_recursion_trylock/unlock()
From: Steven Rostedt @ 2021-10-15 13:35 UTC (permalink / raw)
  To: 王贇
  Cc: Peter Zijlstra (Intel), Paul Walmsley, James E.J. Bottomley,
	Guo Ren, Jisheng Zhang, H. Peter Anvin, live-patching,
	linux-riscv, Miroslav Benes, Paul Mackerras, Joe Lawrence,
	Helge Deller, x86, linux-csky, Ingo Molnar, Petr Mladek,
	Albert Ou, Jiri Kosina, Nicholas Piggin, Borislav Petkov,
	Josh Poimboeuf, Thomas Gleixner, linux-parisc, linux-kernel,
	Palmer Dabbelt, Masami Hiramatsu, Colin Ian King, linuxppc-dev
In-Reply-To: <3c87e825-e907-cba0-e95f-28878356fc71@linux.alibaba.com>

On Fri, 15 Oct 2021 17:12:26 +0800
王贇 <yun.wang@linux.alibaba.com> wrote:

> Maybe take some example would be easier to understand...
> 
> Currently there are two way of using ftrace_test_recursion_trylock(),
> one with TRACE_FTRACE_XXX we mark as A, one with TRACE_LIST_XXX we mark
> as B, then:
> 
> A followed by B on same context got bit > 0
> B followed by A on any context got bit 0
> A followed by A on same context got bit > 0
> A followed by A followed by A on same context got bit -1
> B followed by B on same context got bit > 0
> B followed by B followed by B on same context got bit -1
> 
> If we get rid of the TRACE_TRANSITION_BIT which allowed recursion for
> onetime, then it would be:

We can't get rid of the transition bit. It's there to fix a bug. Or at
least until we finish the "noinst" issue which may be true now? I have to
go revisit it.

The reason for the transition bit, is because we were dropping function
traces, that people relied on being there. The problem is that the
recursion protection allows for nested context. That is, it will not detect
recursion if we an interrupt triggers during a trace (while the recursion
lock is held) and then that interrupt does a trace. It is allowed to call
the ftrace_test_recursion_trylock() again.

But, what happens if the trace occurs after the interrupt triggers, but
before the preempt_count states that we are now in interrupt context? As
preempt_count is used by this code to determine what context we are in, if
a trace happens in this "transition" state, without the transition bit, a
recursion is detected (false positive) and the event is dropped. People are
then confused on how an interrupt happened, but the entry of the interrupt
never triggered (according to the trace).

The transition bit allows one level of recursion to handle this case.

The "noinst" work may also remove all locations that can be traced before
the context is updated. I need to see if that is now the case, and if it
is, we can remove the transition bit.

This is not the design issue I mentioned earlier. I'm still working on that
one.

-- Steve


> 
> A followed by B on same context got bit > 0
> B followed by A on any context got bit 0
> A followed by A on same context got bit -1
> B followed by B on same context got bit -1
> 
> So as long as no continuously AAA it's safe?


^ permalink raw reply

* Re: [PATCH v11 2/3] tty: hvc: pass DMA capable memory to put_chars()
From: Xianting Tian @ 2021-10-15 12:59 UTC (permalink / raw)
  To: gregkh, jirislaby, amit, arnd, osandov
  Cc: shile.zhang, linuxppc-dev, linux-kernel, virtualization
In-Reply-To: <20211015024658.1353987-3-xianting.tian@linux.alibaba.com>

Hi Greg and experts

Is this version ok for you?  thanks

在 2021/10/15 上午10:46, Xianting Tian 写道:
> As well known, hvc backend can register its opertions to hvc backend.
> the operations contain put_chars(), get_chars() and so on.
>
> Some hvc backend may do dma in its operations. eg, put_chars() of
> virtio-console. But in the code of hvc framework, it may pass DMA
> incapable memory to put_chars() under a specific configuration, which
> is explained in commit c4baad5029(virtio-console: avoid DMA from stack):
> 1, c[] is on stack,
>     hvc_console_print():
>          char c[N_OUTBUF] __ALIGNED__;
>          cons_ops[index]->put_chars(vtermnos[index], c, i);
> 2, ch is on stack,
>     static void hvc_poll_put_char(,,char ch)
>     {
>          struct tty_struct *tty = driver->ttys[0];
>          struct hvc_struct *hp = tty->driver_data;
>          int n;
>
>          do {
>                  n = hp->ops->put_chars(hp->vtermno, &ch, 1);
>          } while (n <= 0);
>     }
>
> Commit c4baad5029 is just the fix to avoid DMA from stack memory, which
> is passed to virtio-console by hvc framework in above code. But I think
> the fix is aggressive, it directly uses kmemdup() to alloc new buffer
> from kmalloc area and do memcpy no matter the memory is in kmalloc area
> or not. But most importantly, it should better be fixed in the hvc
> framework, by changing it to never pass stack memory to the put_chars()
> function in the first place. Otherwise, we still face the same issue if
> a new hvc backend using dma added in the furture.
>
> In this patch, add 'char cons_outbuf[]' as part of 'struct hvc_struct',
> so hp->cons_outbuf is no longer the stack memory, we can use it in above
> cases safely. We also add lock to protect cons_outbuf instead of using
> the global lock of hvc.
>
> Introduce another array(cons_hvcs[]) for hvc pointers next to the
> cons_ops[] and vtermnos[] arrays. With the array, we can easily find
> hvc's cons_outbuf and its lock.
>
> With the patch, we can revert the fix c4baad5029.
>
> Signed-off-by: Xianting Tian <xianting.tian@linux.alibaba.com>
> Signed-off-by: Shile Zhang <shile.zhang@linux.alibaba.com>
> ---
>   drivers/tty/hvc/hvc_console.c | 36 ++++++++++++++++++++---------------
>   drivers/tty/hvc/hvc_console.h | 21 +++++++++++++++++++-
>   2 files changed, 41 insertions(+), 16 deletions(-)
>
> diff --git a/drivers/tty/hvc/hvc_console.c b/drivers/tty/hvc/hvc_console.c
> index 5957ab728..11f2463a1 100644
> --- a/drivers/tty/hvc/hvc_console.c
> +++ b/drivers/tty/hvc/hvc_console.c
> @@ -41,16 +41,6 @@
>    */
>   #define HVC_CLOSE_WAIT (HZ/100) /* 1/10 of a second */
>   
> -/*
> - * These sizes are most efficient for vio, because they are the
> - * native transfer size. We could make them selectable in the
> - * future to better deal with backends that want other buffer sizes.
> - */
> -#define N_OUTBUF	16
> -#define N_INBUF		16
> -
> -#define __ALIGNED__ __attribute__((__aligned__(L1_CACHE_BYTES)))
> -
>   static struct tty_driver *hvc_driver;
>   static struct task_struct *hvc_task;
>   
> @@ -142,6 +132,7 @@ static int hvc_flush(struct hvc_struct *hp)
>   static const struct hv_ops *cons_ops[MAX_NR_HVC_CONSOLES];
>   static uint32_t vtermnos[MAX_NR_HVC_CONSOLES] =
>   	{[0 ... MAX_NR_HVC_CONSOLES - 1] = -1};
> +static struct hvc_struct *cons_hvcs[MAX_NR_HVC_CONSOLES];
>   
>   /*
>    * Console APIs, NOT TTY.  These APIs are available immediately when
> @@ -151,9 +142,11 @@ static uint32_t vtermnos[MAX_NR_HVC_CONSOLES] =
>   static void hvc_console_print(struct console *co, const char *b,
>   			      unsigned count)
>   {
> -	char c[N_OUTBUF] __ALIGNED__;
> +	char *c;
>   	unsigned i = 0, n = 0;
>   	int r, donecr = 0, index = co->index;
> +	unsigned long flags;
> +	struct hvc_struct *hp;
>   
>   	/* Console access attempt outside of acceptable console range. */
>   	if (index >= MAX_NR_HVC_CONSOLES)
> @@ -163,6 +156,13 @@ static void hvc_console_print(struct console *co, const char *b,
>   	if (vtermnos[index] == -1)
>   		return;
>   
> +	hp = cons_hvcs[index];
> +	if (!hp)
> +		return;
> +
> +	c = hp->cons_outbuf;
> +
> +	spin_lock_irqsave(&hp->cons_outbuf_lock, flags);
>   	while (count > 0 || i > 0) {
>   		if (count > 0 && i < sizeof(c)) {
>   			if (b[n] == '\n' && !donecr) {
> @@ -191,6 +191,7 @@ static void hvc_console_print(struct console *co, const char *b,
>   			}
>   		}
>   	}
> +	spin_unlock_irqrestore(&hp->cons_outbuf_lock, flags);
>   	hvc_console_flush(cons_ops[index], vtermnos[index]);
>   }
>   
> @@ -878,9 +879,13 @@ static void hvc_poll_put_char(struct tty_driver *driver, int line, char ch)
>   	struct tty_struct *tty = driver->ttys[0];
>   	struct hvc_struct *hp = tty->driver_data;
>   	int n;
> +	unsigned long flags;
>   
>   	do {
> -		n = hp->ops->put_chars(hp->vtermno, &ch, 1);
> +		spin_lock_irqsave(&hp->cons_outbuf_lock, flags);
> +		hp->cons_outbuf[0] = ch;
> +		n = hp->ops->put_chars(hp->vtermno, &hp->cons_outbuf[0], 1);
> +		spin_unlock_irqrestore(&hp->cons_outbuf_lock, flags);
>   	} while (n <= 0);
>   }
>   #endif
> @@ -922,8 +927,7 @@ struct hvc_struct *hvc_alloc(uint32_t vtermno, int data,
>   			return ERR_PTR(err);
>   	}
>   
> -	hp = kzalloc(ALIGN(sizeof(*hp), sizeof(long)) + outbuf_size,
> -			GFP_KERNEL);
> +	hp = kzalloc(struct_size(hp, outbuf, outbuf_size), GFP_KERNEL);
>   	if (!hp)
>   		return ERR_PTR(-ENOMEM);
>   
> @@ -931,13 +935,13 @@ struct hvc_struct *hvc_alloc(uint32_t vtermno, int data,
>   	hp->data = data;
>   	hp->ops = ops;
>   	hp->outbuf_size = outbuf_size;
> -	hp->outbuf = &((char *)hp)[ALIGN(sizeof(*hp), sizeof(long))];
>   
>   	tty_port_init(&hp->port);
>   	hp->port.ops = &hvc_port_ops;
>   
>   	INIT_WORK(&hp->tty_resize, hvc_set_winsz);
>   	spin_lock_init(&hp->lock);
> +	spin_lock_init(&hp->cons_outbuf_lock);
>   	mutex_lock(&hvc_structs_mutex);
>   
>   	/*
> @@ -964,6 +968,7 @@ struct hvc_struct *hvc_alloc(uint32_t vtermno, int data,
>   	if (i < MAX_NR_HVC_CONSOLES) {
>   		cons_ops[i] = ops;
>   		vtermnos[i] = vtermno;
> +		cons_hvcs[i] = hp;
>   	}
>   
>   	list_add_tail(&(hp->next), &hvc_structs);
> @@ -988,6 +993,7 @@ int hvc_remove(struct hvc_struct *hp)
>   	if (hp->index < MAX_NR_HVC_CONSOLES) {
>   		vtermnos[hp->index] = -1;
>   		cons_ops[hp->index] = NULL;
> +		cons_hvcs[hp->index] = NULL;
>   	}
>   
>   	/* Don't whack hp->irq because tty_hangup() will need to free the irq. */
> diff --git a/drivers/tty/hvc/hvc_console.h b/drivers/tty/hvc/hvc_console.h
> index 18d005814..2c32ab67b 100644
> --- a/drivers/tty/hvc/hvc_console.h
> +++ b/drivers/tty/hvc/hvc_console.h
> @@ -32,12 +32,21 @@
>    */
>   #define HVC_ALLOC_TTY_ADAPTERS	8
>   
> +/*
> + * These sizes are most efficient for vio, because they are the
> + * native transfer size. We could make them selectable in the
> + * future to better deal with backends that want other buffer sizes.
> + */
> +#define N_OUTBUF	16
> +#define N_INBUF		16
> +
> +#define __ALIGNED__ __attribute__((__aligned__(L1_CACHE_BYTES)))
> +
>   struct hvc_struct {
>   	struct tty_port port;
>   	spinlock_t lock;
>   	int index;
>   	int do_wakeup;
> -	char *outbuf;
>   	int outbuf_size;
>   	int n_outbuf;
>   	uint32_t vtermno;
> @@ -48,6 +57,16 @@ struct hvc_struct {
>   	struct work_struct tty_resize;
>   	struct list_head next;
>   	unsigned long flags;
> +
> +	/*
> +	 * the buf and its lock are used in hvc console api for putting chars,
> +	 * and also used in hvc_poll_put_char() for putting single char.
> +	 */
> +	spinlock_t cons_outbuf_lock;
> +	char cons_outbuf[N_OUTBUF] __ALIGNED__;
> +
> +	/* the buf is used for putting chars to tty */
> +	char outbuf[] __ALIGNED__;
>   };
>   
>   /* implemented by a low level driver */

^ permalink raw reply

* Re: [PATCH 2/2] kbuild: use more subdir- for visiting subdirectories while cleaning
From: Michael Ellerman @ 2021-10-15 12:09 UTC (permalink / raw)
  To: Masahiro Yamada, linux-kbuild; +Cc: linuxppc-dev
In-Reply-To: <20211013063622.548590-2-masahiroy@kernel.org>

Masahiro Yamada <masahiroy@kernel.org> writes:
>
...
> diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
> index aa6808e70647..b61d8be3c226 100644
> --- a/arch/powerpc/Makefile
> +++ b/arch/powerpc/Makefile
> @@ -411,9 +409,6 @@ install:
>  	sh -x $(srctree)/$(boot)/install.sh "$(KERNELRELEASE)" vmlinux \
>  	System.map "$(INSTALL_PATH)"
>  
> -archclean:
> -	$(Q)$(MAKE) $(clean)=$(boot)
> -
>  ifeq ($(KBUILD_EXTMOD),)
>  # We need to generate vdso-offsets.h before compiling certain files in kernel/.
>  # In order to do that, we should use the archprepare target, but we can't since

Seems to work.

Acked-by: Michael Ellerman <mpe@ellerman.id.au> (powerpc)

cheers

^ permalink raw reply

* Re: [PATCH v2 06/13] asm-generic: Use HAVE_FUNCTION_DESCRIPTORS to define associated stubs
From: Nicholas Piggin @ 2021-10-15 11:52 UTC (permalink / raw)
  To: Andrew Morton, Arnd Bergmann, Benjamin Herrenschmidt,
	Christophe Leroy, Helge Deller, Greg Kroah-Hartman,
	James E.J. Bottomley, Kees Cook, Michael Ellerman, Paul Mackerras
  Cc: linux-arch, linux-ia64, linux-parisc, linux-kernel, linux-mm,
	linuxppc-dev
In-Reply-To: <1634284464.kd8scm0ckz.astroid@bobo.none>

Excerpts from Nicholas Piggin's message of October 15, 2021 6:02 pm:
> Excerpts from Christophe Leroy's message of October 15, 2021 4:24 pm:
>> 
>> 
>> Le 15/10/2021 à 08:16, Nicholas Piggin a écrit :
>>> Excerpts from Christophe Leroy's message of October 14, 2021 3:49 pm:
>>>> Replace HAVE_DEREFERENCE_FUNCTION_DESCRIPTOR by
>>>> HAVE_FUNCTION_DESCRIPTORS and use it instead of
>>>> 'dereference_function_descriptor' macro to know
>>>> whether an arch has function descriptors.
>>>>
>>>> To limit churn in one of the following patches, use
>>>> an #ifdef/#else construct with empty first part
>>>> instead of an #ifndef in asm-generic/sections.h
>>> 
>>> Is it worth putting this into Kconfig if you're going to
>>> change it? In any case
>> 
>> That was what I wanted to do in the begining but how can I do that in 
>> Kconfig ?
>> 
>> #ifdef __powerpc64__
>> #if defined(_CALL_ELF) && _CALL_ELF == 2
>> #define PPC64_ELF_ABI_v2
>> #else
>> #define PPC64_ELF_ABI_v1
>> #endif
>> #endif /* __powerpc64__ */
>> 
>> #ifdef PPC64_ELF_ABI_v1
>> #define HAVE_DEREFERENCE_FUNCTION_DESCRIPTOR 1
> 
> We have ELFv2 ABI / function descriptors iff big-endian so you could 
> just select based on that.

Of course that should read ELFv1. To be clearer: BE is ELFv1 ABI and
LE is ELFv2 ABI.

Thanks,
Nick

^ permalink raw reply

* Re: [PATCH v2] powerpc/s64: Clarify that radix lacks DEBUG_PAGEALLOC
From: Michael Ellerman @ 2021-10-15 10:41 UTC (permalink / raw)
  To: Christophe Leroy, Joel Stanley, Jordan Niethe; +Cc: linuxppc-dev
In-Reply-To: <6fb9e0ec-8ac8-1cc3-38de-78749ffee623@csgroup.eu>

Christophe Leroy <christophe.leroy@csgroup.eu> writes:
> Le 13/10/2021 à 23:34, Joel Stanley a écrit :
>> The page_alloc.c code will call into __kernel_map_pages when
>> DEBUG_PAGEALLOC is configured and enabled.
>> 
>> As the implementation assumes hash, this should crash spectacularly if
>> not for a bit of luck in __kernel_map_pages. In this function
>> linear_map_hash_count is always zero, the for loop exits without doing
>> any damage.
>> 
>> There are no other platforms that determine if they support
>> debug_pagealloc at runtime. Instead of adding code to mm/page_alloc.c to
>> do that, this change turns the map/unmap into a noop when in radix
>> mode and prints a warning once.
>> 
>> Signed-off-by: Joel Stanley <joel@jms.id.au>
>> ---
>> v2: Put __kernel_map_pages in pgtable.h
>> 
>>   arch/powerpc/include/asm/book3s/64/hash.h    |  2 ++
>>   arch/powerpc/include/asm/book3s/64/pgtable.h | 11 +++++++++++
>>   arch/powerpc/include/asm/book3s/64/radix.h   |  3 +++
>>   arch/powerpc/mm/book3s64/hash_utils.c        |  2 +-
>>   arch/powerpc/mm/book3s64/radix_pgtable.c     |  7 +++++++
>>   5 files changed, 24 insertions(+), 1 deletion(-)
>> 
>> diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h
>> index d959b0195ad9..674fe0e890dc 100644
>> --- a/arch/powerpc/include/asm/book3s/64/hash.h
>> +++ b/arch/powerpc/include/asm/book3s/64/hash.h
>> @@ -255,6 +255,8 @@ int hash__create_section_mapping(unsigned long start, unsigned long end,
>>   				 int nid, pgprot_t prot);
>>   int hash__remove_section_mapping(unsigned long start, unsigned long end);
>>   
>> +void hash__kernel_map_pages(struct page *page, int numpages, int enable);
>> +
>>   #endif /* !__ASSEMBLY__ */
>>   #endif /* __KERNEL__ */
>>   #endif /* _ASM_POWERPC_BOOK3S_64_HASH_H */
>> diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
>> index 5d34a8646f08..265661ded238 100644
>> --- a/arch/powerpc/include/asm/book3s/64/pgtable.h
>> +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
>> @@ -1101,6 +1101,17 @@ static inline void vmemmap_remove_mapping(unsigned long start,
>>   }
>>   #endif
>>   
>> +#ifdef CONFIG_DEBUG_PAGEALLOC
>> +static inline void __kernel_map_pages(struct page *page, int numpages, int enable)
>> +{
>> +	if (radix_enabled()) {
>> +		radix__kernel_map_pages(page, numpages, enable);
>> +		return;
>> +	}
>> +	hash__kernel_map_pages(page, numpages, enable);
>
> I'd have prefered something like below
>
> 	if (radix_enabled())
> 		radix__kernel_map_pages(page, numpages, enable);
> 	else
> 		hash__kernel_map_pages(page, numpages, enable);

I did that when applying.

cheers

^ permalink raw reply

* Re: [PATCH v3 1/2] ftrace: disable preemption between ftrace_test_recursion_trylock/unlock()
From: Petr Mladek @ 2021-10-15  7:28 UTC (permalink / raw)
  To: 王贇
  Cc: Peter Zijlstra (Intel), Paul Walmsley, James E.J. Bottomley,
	Guo Ren, Jisheng Zhang, H. Peter Anvin, live-patching,
	linux-riscv, Miroslav Benes, Paul Mackerras, Joe Lawrence,
	Helge Deller, x86, linux-csky, Ingo Molnar, Albert Ou,
	Jiri Kosina, Steven Rostedt, Borislav Petkov, Nicholas Piggin,
	Josh Poimboeuf, Thomas Gleixner, linux-parisc, linux-kernel,
	Palmer Dabbelt, Masami Hiramatsu, Colin Ian King, linuxppc-dev
In-Reply-To: <5e907ed3-806b-b0e5-518d-d2f3b265377f@linux.alibaba.com>

On Fri 2021-10-15 11:13:08, 王贇 wrote:
> 
> 
> On 2021/10/14 下午11:14, Petr Mladek wrote:
> [snip]
> >> -	return trace_test_and_set_recursion(ip, parent_ip, TRACE_FTRACE_START, TRACE_FTRACE_MAX);
> >> +	int bit;
> >> +
> >> +	bit = trace_test_and_set_recursion(ip, parent_ip, TRACE_FTRACE_START, TRACE_FTRACE_MAX);
> >> +	/*
> >> +	 * The zero bit indicate we are nested
> >> +	 * in another trylock(), which means the
> >> +	 * preemption already disabled.
> >> +	 */
> >> +	if (bit > 0)
> >> +		preempt_disable_notrace();
> > 
> > Is this safe? The preemption is disabled only when
> > trace_test_and_set_recursion() was called by ftrace_test_recursion_trylock().
> > 
> > We must either always disable the preemtion when bit >= 0.
> > Or we have to disable the preemtion already in
> > trace_test_and_set_recursion().
> 
> Internal calling of trace_test_and_set_recursion() will disable preemption
> on succeed, it should be safe.

trace_test_and_set_recursion() does _not_ disable preemtion!
It works only because all callers disable the preemption.

It means that the comment is wrong. It is not guarantted that the
preemption will be disabled. It works only by chance.


> We can also consider move the logical into trace_test_and_set_recursion()
> and trace_clear_recursion(), but I'm not very sure about that... ftrace
> internally already make sure preemption disabled

How? Because it calls trace_test_and_set_recursion() via the trylock() API?


> , what uncovered is those users who call API trylock/unlock, isn't
> it?

And this is exactly the problem. trace_test_and_set_recursion() is in
a public header. Anyone could use it. And if anyone uses it in the
future without the trylock() and does not disable the preemtion
explicitely then we are lost again.

And it is even more dangerous. The original code disabled the
preemtion on various layers. As a result, the preemtion was disabled
several times for sure. It means that the deeper layers were
always on the safe side.

With this patch, if the first trace_test_and_set_recursion() caller
does not disable preemtion then trylock() will not disable it either
and the entire code is procceed with preemtion enabled.


> > Finally, the comment confused me a lot. The difference between nesting and
> > recursion is far from clear. And the code is tricky liky like hell :-)
> > I propose to add some comments, see below for a proposal.
> The comments do confusing, I'll make it something like:
> 
> The zero bit indicate trace recursion happened, whatever
> the recursively call was made by ftrace handler or ftrace
> itself, the preemption already disabled.

I am sorry but it is still confusing. We need to find a better way
how to clearly explain the difference between the safe and
unsafe recursion.

My understanding is that the recursion is:

  + "unsafe" when the trace code recursively enters the same trace point.

  + "safe" when ftrace_test_recursion_trylock() is called recursivelly
    while still processing the same trace entry.

> >> +
> >> +	return bit;
> >>  }
> >>  /**
> >> @@ -222,9 +233,13 @@ static __always_inline int ftrace_test_recursion_trylock(unsigned long ip,
> >>   * @bit: The return of a successful ftrace_test_recursion_trylock()
> >>   *
> >>   * This is used at the end of a ftrace callback.
> >> + *
> >> + * Preemption will be enabled (if it was previously enabled).
> >>   */
> >>  static __always_inline void ftrace_test_recursion_unlock(int bit)
> >>  {
> >> +	if (bit)
> > 
> > This is not symetric with trylock(). It should be:
> > 
> > 	if (bit > 0)
> > 
> > Anyway, trace_clear_recursion() quiently ignores bit != 0
> 
> Yes, bit == 0 should not happen in here.

Yes, it "should" not happen. My point is that we could make the API
more safe. We could do the right thing when
ftrace_test_recursion_unlock() is called with negative @bit.
Ideally, we should also warn about the mis-use.


Anyway, let's wait for Steven. It seems that he found another problem
with the API that should be solved first. The fix will probably
also help to better understand the "safe" vs "unsafe" recursion.

Best Regards,
Petr

^ permalink raw reply

* [PATCH v1 8/8] powerpc/fsl_booke: Enable STRICT_KERNEL_RWX
From: Christophe Leroy @ 2021-10-15 10:02 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <d1ad9fdd9b27da3fdfa16510bb542ed51fa6e134.1634292136.git.christophe.leroy@csgroup.eu>

Enable STRICT_KERNEL_RWX on fsl_booke.

For that, we need additional TLBCAMs dedicated to linear mapping,
based on the alignment of _sinittext.

By default, up to 768 Mbytes of memory are mapped.
It uses 3 TLBCAMs of size 256 Mbytes.

With a data alignment of 16, we need up to 9 TLBCAMs:
  16/16/16/16/64/64/64/256/256

With a data alignment of 4, we need up to 12 TLBCAMs:
  4/4/4/4/16/16/16/64/64/64/256/256

With a data alignment of 1, we need up to 15 TLBCAMs:
  1/1/1/1/4/4/4/16/16/16/64/64/64/256/256

By default, set a 16 Mbytes alignment as a compromise between memory
usage and number of TLBCAMs. This can be adjusted manually when needed.

For the time being, it doens't work when the base is randomised.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/Kconfig | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 6b9f523882c5..939a47642a9c 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -139,6 +139,7 @@ config PPC
 	select ARCH_HAS_SCALED_CPUTIME		if VIRT_CPU_ACCOUNTING_NATIVE && PPC_BOOK3S_64
 	select ARCH_HAS_SET_MEMORY
 	select ARCH_HAS_STRICT_KERNEL_RWX	if (PPC_BOOK3S || PPC_8xx || 40x) && !HIBERNATION
+	select ARCH_HAS_STRICT_KERNEL_RWX	if FSL_BOOKE && !HIBERNATION && !RANDOMIZE_BASE
 	select ARCH_HAS_STRICT_MODULE_RWX	if ARCH_HAS_STRICT_KERNEL_RWX && !PPC_BOOK3S_32
 	select ARCH_HAS_TICK_BROADCAST		if GENERIC_CLOCKEVENTS_BROADCAST
 	select ARCH_HAS_UACCESS_FLUSHCACHE
@@ -778,7 +779,8 @@ config DATA_SHIFT_BOOL
 	bool "Set custom data alignment"
 	depends on ADVANCED_OPTIONS
 	depends on STRICT_KERNEL_RWX || DEBUG_PAGEALLOC || KFENCE
-	depends on PPC_BOOK3S_32 || (PPC_8xx && !PIN_TLB_DATA && !STRICT_KERNEL_RWX)
+	depends on PPC_BOOK3S_32 || (PPC_8xx && !PIN_TLB_DATA && !STRICT_KERNEL_RWX) || \
+		   FSL_BOOKE
 	help
 	  This option allows you to set the kernel data alignment. When
 	  RAM is mapped by blocks, the alignment needs to fit the size and
@@ -791,11 +793,13 @@ config DATA_SHIFT
 	default 24 if STRICT_KERNEL_RWX && PPC64
 	range 17 28 if (STRICT_KERNEL_RWX || DEBUG_PAGEALLOC || KFENCE) && PPC_BOOK3S_32
 	range 19 23 if (STRICT_KERNEL_RWX || DEBUG_PAGEALLOC || KFENCE) && PPC_8xx
+	range 20 24 if (STRICT_KERNEL_RWX || DEBUG_PAGEALLOC || KFENCE) && PPC_FSL_BOOKE
 	default 22 if STRICT_KERNEL_RWX && PPC_BOOK3S_32
 	default 18 if (DEBUG_PAGEALLOC || KFENCE) && PPC_BOOK3S_32
 	default 23 if STRICT_KERNEL_RWX && PPC_8xx
 	default 23 if (DEBUG_PAGEALLOC || KFENCE) && PPC_8xx && PIN_TLB_DATA
 	default 19 if (DEBUG_PAGEALLOC || KFENCE) && PPC_8xx
+	default 24 if STRICT_KERNEL_RWX && FSL_BOOKE
 	default PPC_PAGE_SHIFT
 	help
 	  On Book3S 32 (603+), DBATs are used to map kernel text and rodata RO.
@@ -1123,7 +1127,10 @@ config LOWMEM_CAM_NUM_BOOL
 config LOWMEM_CAM_NUM
 	depends on FSL_BOOKE
 	int "Number of CAMs to use to map low memory" if LOWMEM_CAM_NUM_BOOL
-	default 3
+	default 3 if !STRICT_KERNEL_RWX
+	default 9 if DATA_SHIFT >= 24
+	default 12 if DATA_SHIFT >= 22
+	default 15
 
 config DYNAMIC_MEMSTART
 	bool "Enable page aligned dynamic load address for kernel"
-- 
2.31.1


^ permalink raw reply related

* [PATCH v1 5/8] powerpc/fsl_booke: Tell map_mem_in_cams() if init is done
From: Christophe Leroy @ 2021-10-15 10:02 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <d1ad9fdd9b27da3fdfa16510bb542ed51fa6e134.1634292136.git.christophe.leroy@csgroup.eu>

In order to be able to call map_mem_in_cams() once more
after init for STRICT_KERNEL_RWX, add an argument.

For now, map_mem_in_cams() is always called only during init.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/mm/mmu_decl.h           |  2 +-
 arch/powerpc/mm/nohash/fsl_book3e.c  | 12 ++++++------
 arch/powerpc/mm/nohash/kaslr_booke.c |  2 +-
 arch/powerpc/mm/nohash/tlb.c         |  4 ++--
 4 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h
index dd1cabc2ea0f..e13a3c0caa02 100644
--- a/arch/powerpc/mm/mmu_decl.h
+++ b/arch/powerpc/mm/mmu_decl.h
@@ -126,7 +126,7 @@ unsigned long mmu_mapin_ram(unsigned long base, unsigned long top);
 
 #ifdef CONFIG_PPC_FSL_BOOK3E
 extern unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx,
-				     bool dryrun);
+				     bool dryrun, bool init);
 extern unsigned long calc_cam_sz(unsigned long ram, unsigned long virt,
 				 phys_addr_t phys);
 #ifdef CONFIG_PPC32
diff --git a/arch/powerpc/mm/nohash/fsl_book3e.c b/arch/powerpc/mm/nohash/fsl_book3e.c
index fdf1029e62f0..375b2b8238c1 100644
--- a/arch/powerpc/mm/nohash/fsl_book3e.c
+++ b/arch/powerpc/mm/nohash/fsl_book3e.c
@@ -167,7 +167,7 @@ unsigned long calc_cam_sz(unsigned long ram, unsigned long virt,
 
 static unsigned long map_mem_in_cams_addr(phys_addr_t phys, unsigned long virt,
 					unsigned long ram, int max_cam_idx,
-					bool dryrun)
+					bool dryrun, bool init)
 {
 	int i;
 	unsigned long amount_mapped = 0;
@@ -202,12 +202,12 @@ static unsigned long map_mem_in_cams_addr(phys_addr_t phys, unsigned long virt,
 	return amount_mapped;
 }
 
-unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx, bool dryrun)
+unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx, bool dryrun, bool init)
 {
 	unsigned long virt = PAGE_OFFSET;
 	phys_addr_t phys = memstart_addr;
 
-	return map_mem_in_cams_addr(phys, virt, ram, max_cam_idx, dryrun);
+	return map_mem_in_cams_addr(phys, virt, ram, max_cam_idx, dryrun, init);
 }
 
 #ifdef CONFIG_PPC32
@@ -248,7 +248,7 @@ void __init adjust_total_lowmem(void)
 	ram = min((phys_addr_t)__max_low_memory, (phys_addr_t)total_lowmem);
 
 	i = switch_to_as1();
-	__max_low_memory = map_mem_in_cams(ram, CONFIG_LOWMEM_CAM_NUM, false);
+	__max_low_memory = map_mem_in_cams(ram, CONFIG_LOWMEM_CAM_NUM, false, true);
 	restore_to_as0(i, 0, 0, 1);
 
 	pr_info("Memory CAM mapping: ");
@@ -319,11 +319,11 @@ notrace void __init relocate_init(u64 dt_ptr, phys_addr_t start)
 		/* map a 64M area for the second relocation */
 		if (memstart_addr > start)
 			map_mem_in_cams(0x4000000, CONFIG_LOWMEM_CAM_NUM,
-					false);
+					false, true);
 		else
 			map_mem_in_cams_addr(start, PAGE_OFFSET + offset,
 					0x4000000, CONFIG_LOWMEM_CAM_NUM,
-					false);
+					false, true);
 		restore_to_as0(n, offset, __va(dt_ptr), 1);
 		/* We should never reach here */
 		panic("Relocation error");
diff --git a/arch/powerpc/mm/nohash/kaslr_booke.c b/arch/powerpc/mm/nohash/kaslr_booke.c
index 4c74e8a5482b..8fc49b1b4a91 100644
--- a/arch/powerpc/mm/nohash/kaslr_booke.c
+++ b/arch/powerpc/mm/nohash/kaslr_booke.c
@@ -314,7 +314,7 @@ static unsigned long __init kaslr_choose_location(void *dt_ptr, phys_addr_t size
 		pr_warn("KASLR: No safe seed for randomizing the kernel base.\n");
 
 	ram = min_t(phys_addr_t, __max_low_memory, size);
-	ram = map_mem_in_cams(ram, CONFIG_LOWMEM_CAM_NUM, true);
+	ram = map_mem_in_cams(ram, CONFIG_LOWMEM_CAM_NUM, true, false);
 	linear_sz = min_t(unsigned long, ram, SZ_512M);
 
 	/* If the linear size is smaller than 64M, do not randmize */
diff --git a/arch/powerpc/mm/nohash/tlb.c b/arch/powerpc/mm/nohash/tlb.c
index 5872f69141d5..fc195b9f524b 100644
--- a/arch/powerpc/mm/nohash/tlb.c
+++ b/arch/powerpc/mm/nohash/tlb.c
@@ -643,7 +643,7 @@ static void early_init_this_mmu(void)
 
 		if (map)
 			linear_map_top = map_mem_in_cams(linear_map_top,
-							 num_cams, false);
+							 num_cams, true, true);
 	}
 #endif
 
@@ -764,7 +764,7 @@ void setup_initial_memory_limit(phys_addr_t first_memblock_base,
 		num_cams = (mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) / 4;
 
 		linear_sz = map_mem_in_cams(first_memblock_size, num_cams,
-					    true);
+					    false, true);
 
 		ppc64_rma_size = min_t(u64, linear_sz, 0x40000000);
 	} else
-- 
2.31.1


^ permalink raw reply related

* [PATCH v1 7/8] powerpc/fsl_booke: Update of TLBCAMs after init
From: Christophe Leroy @ 2021-10-15 10:02 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <d1ad9fdd9b27da3fdfa16510bb542ed51fa6e134.1634292136.git.christophe.leroy@csgroup.eu>

After init, set readonly memory as ROX and set readwrite
memory as RWX, if STRICT_KERNEL_RWX is enabled.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/mm/mmu_decl.h          |  2 +-
 arch/powerpc/mm/nohash/fsl_book3e.c | 32 +++++++++++++++++++++++++----
 2 files changed, 29 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h
index e13a3c0caa02..0dd4c18f8363 100644
--- a/arch/powerpc/mm/mmu_decl.h
+++ b/arch/powerpc/mm/mmu_decl.h
@@ -168,7 +168,7 @@ static inline phys_addr_t v_block_mapped(unsigned long va) { return 0; }
 static inline unsigned long p_block_mapped(phys_addr_t pa) { return 0; }
 #endif
 
-#if defined(CONFIG_PPC_BOOK3S_32) || defined(CONFIG_PPC_8xx)
+#if defined(CONFIG_PPC_BOOK3S_32) || defined(CONFIG_PPC_8xx) || defined(CONFIG_PPC_FSL_BOOK3E)
 void mmu_mark_initmem_nx(void);
 void mmu_mark_rodata_ro(void);
 #else
diff --git a/arch/powerpc/mm/nohash/fsl_book3e.c b/arch/powerpc/mm/nohash/fsl_book3e.c
index c1bc11f46344..978e0bcdfa2c 100644
--- a/arch/powerpc/mm/nohash/fsl_book3e.c
+++ b/arch/powerpc/mm/nohash/fsl_book3e.c
@@ -181,7 +181,7 @@ static unsigned long map_mem_in_cams_addr(phys_addr_t phys, unsigned long virt,
 	/* Calculate CAM values */
 	for (i = 0; boundary && i < max_cam_idx; i++) {
 		unsigned long cam_sz;
-		pgprot_t prot = PAGE_KERNEL_X;
+		pgprot_t prot = init ? PAGE_KERNEL_X : PAGE_KERNEL_ROX;
 
 		cam_sz = calc_cam_sz(boundary, virt, phys);
 		if (!dryrun)
@@ -194,7 +194,7 @@ static unsigned long map_mem_in_cams_addr(phys_addr_t phys, unsigned long virt,
 	}
 	for (ram -= amount_mapped; ram && i < max_cam_idx; i++) {
 		unsigned long cam_sz;
-		pgprot_t prot = PAGE_KERNEL_X;
+		pgprot_t prot = init ? PAGE_KERNEL_X : PAGE_KERNEL;
 
 		cam_sz = calc_cam_sz(ram, virt, phys);
 		if (!dryrun)
@@ -209,8 +209,13 @@ static unsigned long map_mem_in_cams_addr(phys_addr_t phys, unsigned long virt,
 	if (dryrun)
 		return amount_mapped;
 
-	loadcam_multi(0, i, max_cam_idx);
-	tlbcam_index = i;
+	if (init) {
+		loadcam_multi(0, i, max_cam_idx);
+		tlbcam_index = i;
+	} else {
+		loadcam_multi(0, i, 0);
+		WARN_ON(i > tlbcam_index);
+	}
 
 #ifdef CONFIG_PPC64
 	get_paca()->tcd.esel_next = i;
@@ -279,6 +284,25 @@ void __init adjust_total_lowmem(void)
 	memblock_set_current_limit(memstart_addr + __max_low_memory);
 }
 
+#ifdef CONFIG_STRICT_KERNEL_RWX
+void mmu_mark_rodata_ro(void)
+{
+	/* Everything is done in mmu_mark_initmem_nx() */
+}
+#endif
+
+void mmu_mark_initmem_nx(void)
+{
+	unsigned long remapped;
+
+	if (!strict_kernel_rwx_enabled())
+		return;
+
+	remapped = map_mem_in_cams(__max_low_memory, CONFIG_LOWMEM_CAM_NUM, false, false);
+
+	WARN_ON(__max_low_memory != remapped);
+}
+
 void setup_initial_memory_limit(phys_addr_t first_memblock_base,
 				phys_addr_t first_memblock_size)
 {
-- 
2.31.1


^ permalink raw reply related

* [PATCH v1 1/8] powerpc/booke: Disable STRICT_KERNEL_RWX, DEBUG_PAGEALLOC and KFENCE
From: Christophe Leroy @ 2021-10-15 10:02 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev, linux-kernel

fsl_booke and 44x are not able to map kernel linear memory with
pages, so they can't support DEBUG_PAGEALLOC and KFENCE, and
STRICT_KERNEL_RWX is also a problem for now.

Enable those only on book3s (both 32 and 64 except KFENCE), 8xx and 40x.

Fixes: 88df6e90fa97 ("[POWERPC] DEBUG_PAGEALLOC for 32-bit")
Fixes: 95902e6c8864 ("powerpc/mm: Implement STRICT_KERNEL_RWX on PPC32")
Fixes: 90cbac0e995d ("powerpc: Enable KFENCE for PPC32")
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/Kconfig | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index ba5b66189358..6b9f523882c5 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -138,7 +138,7 @@ config PPC
 	select ARCH_HAS_PTE_SPECIAL
 	select ARCH_HAS_SCALED_CPUTIME		if VIRT_CPU_ACCOUNTING_NATIVE && PPC_BOOK3S_64
 	select ARCH_HAS_SET_MEMORY
-	select ARCH_HAS_STRICT_KERNEL_RWX	if ((PPC_BOOK3S_64 || PPC32) && !HIBERNATION)
+	select ARCH_HAS_STRICT_KERNEL_RWX	if (PPC_BOOK3S || PPC_8xx || 40x) && !HIBERNATION
 	select ARCH_HAS_STRICT_MODULE_RWX	if ARCH_HAS_STRICT_KERNEL_RWX && !PPC_BOOK3S_32
 	select ARCH_HAS_TICK_BROADCAST		if GENERIC_CLOCKEVENTS_BROADCAST
 	select ARCH_HAS_UACCESS_FLUSHCACHE
@@ -150,7 +150,7 @@ config PPC
 	select ARCH_OPTIONAL_KERNEL_RWX		if ARCH_HAS_STRICT_KERNEL_RWX
 	select ARCH_STACKWALK
 	select ARCH_SUPPORTS_ATOMIC_RMW
-	select ARCH_SUPPORTS_DEBUG_PAGEALLOC	if PPC32 || PPC_BOOK3S_64
+	select ARCH_SUPPORTS_DEBUG_PAGEALLOC	if PPC_BOOK3S || PPC_8xx || 40x
 	select ARCH_USE_BUILTIN_BSWAP
 	select ARCH_USE_CMPXCHG_LOCKREF		if PPC64
 	select ARCH_USE_MEMTEST
@@ -190,7 +190,7 @@ config PPC
 	select HAVE_ARCH_JUMP_LABEL_RELATIVE
 	select HAVE_ARCH_KASAN			if PPC32 && PPC_PAGE_SHIFT <= 14
 	select HAVE_ARCH_KASAN_VMALLOC		if PPC32 && PPC_PAGE_SHIFT <= 14
-	select HAVE_ARCH_KFENCE			if PPC32
+	select HAVE_ARCH_KFENCE			if PPC_BOOK3S_32 || PPC_8xx || 40x
 	select HAVE_ARCH_KGDB
 	select HAVE_ARCH_MMAP_RND_BITS
 	select HAVE_ARCH_MMAP_RND_COMPAT_BITS	if COMPAT
-- 
2.31.1


^ permalink raw reply related

* [PATCH v1 4/8] powerpc/fsl_booke: Enable reloading of TLBCAM without switching to AS1
From: Christophe Leroy @ 2021-10-15 10:02 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <d1ad9fdd9b27da3fdfa16510bb542ed51fa6e134.1634292136.git.christophe.leroy@csgroup.eu>

Avoid switching to AS1 when reloading TLBCAM after init for
STRICT_KERNEL_RWX.

When we setup AS1 we expect the entire accessible memory to be mapped
through one entry, this is not the case anymore at the end of init.

We are not changing the size of TLBCAMs, only flags, so no need to
switch to AS1.

So change loadcam_multi() to not switch to AS1 when the given
temporary tlb entry in 0.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/mm/nohash/tlb_low.S | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/mm/nohash/tlb_low.S b/arch/powerpc/mm/nohash/tlb_low.S
index 5add4a51e51f..dd39074de9af 100644
--- a/arch/powerpc/mm/nohash/tlb_low.S
+++ b/arch/powerpc/mm/nohash/tlb_low.S
@@ -369,7 +369,7 @@ _GLOBAL(_tlbivax_bcast)
  * extern void loadcam_entry(unsigned int index)
  *
  * Load TLBCAM[index] entry in to the L2 CAM MMU
- * Must preserve r7, r8, r9, r10 and r11
+ * Must preserve r7, r8, r9, r10, r11, r12
  */
 _GLOBAL(loadcam_entry)
 	mflr	r5
@@ -401,7 +401,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_BIG_PHYS)
  *
  * r3 = first entry to write
  * r4 = number of entries to write
- * r5 = temporary tlb entry
+ * r5 = temporary tlb entry (0 means no switch to AS1)
  */
 _GLOBAL(loadcam_multi)
 	mflr	r8
@@ -409,6 +409,8 @@ _GLOBAL(loadcam_multi)
 	mfmsr	r11
 	andi.	r11,r11,MSR_IS
 	bne	10f
+	mr.	r12, r5
+	beq	10f
 
 	/*
 	 * Set up temporary TLB entry that is the same as what we're
@@ -446,6 +448,8 @@ _GLOBAL(loadcam_multi)
 	/* Don't return to AS=0 if we were in AS=1 at function start */
 	andi.	r11,r11,MSR_IS
 	bne	3f
+	cmpwi	r12, 0
+	beq	3f
 
 	/* Return to AS=0 and clear the temporary entry */
 	mfmsr	r6
-- 
2.31.1


^ permalink raw reply related

* [PATCH v1 2/8] powerpc/fsl_booke: Rename fsl_booke.c to fsl_book3e.c
From: Christophe Leroy @ 2021-10-15 10:02 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <d1ad9fdd9b27da3fdfa16510bb542ed51fa6e134.1634292136.git.christophe.leroy@csgroup.eu>

We have a myriad of CONFIG symbols around different variants
of BOOKEs, which would be worth tidying up one day.

But at least, make file names and CONFIG option match:

We have CONFIG_FSL_BOOKE and CONFIG_PPC_FSL_BOOK3E.

fsl_booke.c is selected by and only by CONFIG_PPC_FSL_BOOK3E.
So rename it fsl_book3e to reduce confusion.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/mm/nohash/Makefile                      | 4 ++--
 arch/powerpc/mm/nohash/{fsl_booke.c => fsl_book3e.c} | 0
 2 files changed, 2 insertions(+), 2 deletions(-)
 rename arch/powerpc/mm/nohash/{fsl_booke.c => fsl_book3e.c} (100%)

diff --git a/arch/powerpc/mm/nohash/Makefile b/arch/powerpc/mm/nohash/Makefile
index 0424f6ce5bd8..b1f630d423d8 100644
--- a/arch/powerpc/mm/nohash/Makefile
+++ b/arch/powerpc/mm/nohash/Makefile
@@ -7,7 +7,7 @@ obj-$(CONFIG_PPC_BOOK3E_64)  	+= tlb_low_64e.o book3e_pgtable.o
 obj-$(CONFIG_40x)		+= 40x.o
 obj-$(CONFIG_44x)		+= 44x.o
 obj-$(CONFIG_PPC_8xx)		+= 8xx.o
-obj-$(CONFIG_PPC_FSL_BOOK3E)	+= fsl_booke.o
+obj-$(CONFIG_PPC_FSL_BOOK3E)	+= fsl_book3e.o
 obj-$(CONFIG_RANDOMIZE_BASE)	+= kaslr_booke.o
 ifdef CONFIG_HUGETLB_PAGE
 obj-$(CONFIG_PPC_FSL_BOOK3E)	+= book3e_hugetlbpage.o
@@ -16,4 +16,4 @@ endif
 # Disable kcov instrumentation on sensitive code
 # This is necessary for booting with kcov enabled on book3e machines
 KCOV_INSTRUMENT_tlb.o := n
-KCOV_INSTRUMENT_fsl_booke.o := n
+KCOV_INSTRUMENT_fsl_book3e.o := n
diff --git a/arch/powerpc/mm/nohash/fsl_booke.c b/arch/powerpc/mm/nohash/fsl_book3e.c
similarity index 100%
rename from arch/powerpc/mm/nohash/fsl_booke.c
rename to arch/powerpc/mm/nohash/fsl_book3e.c
-- 
2.31.1


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox