LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 11/15] selftests/seccomp: Remove SYSCALL_NUM_RET_SHARE_REG in favor of SYSCALL_RET_SET
From: Kees Cook @ 2020-09-12 11:08 UTC (permalink / raw)
  To: linux-kernel
  Cc: Thadeu Lima de Souza Cascardo, Will Drewry, Kees Cook,
	linux-xtensa, linux-mips, Andy Lutomirski, Max Filippov,
	linux-arm-kernel, linux-kselftest, linuxppc-dev,
	Christian Brauner
In-Reply-To: <20200912110820.597135-1-keescook@chromium.org>

Instead of special-casing the specific case of shared registers, create
a default SYSCALL_RET_SET() macro (mirroring SYSCALL_NUM_SET()), that
writes to the SYSCALL_RET register. For architectures that can't set the
return value (for whatever reason), they can define SYSCALL_RET_SET()
without an associated SYSCALL_RET() macro. This also paves the way for
architectures that need to do special things to set the return value
(e.g. powerpc).

Signed-off-by: Kees Cook <keescook@chromium.org>
---
 tools/testing/selftests/seccomp/seccomp_bpf.c | 33 +++++++++++++------
 1 file changed, 23 insertions(+), 10 deletions(-)

diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
index 2790d9cd50f4..623953a53032 100644
--- a/tools/testing/selftests/seccomp/seccomp_bpf.c
+++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
@@ -1753,8 +1753,8 @@ TEST_F(TRACE_poke, getpid_runs_normally)
 #elif defined(__s390__)
 # define ARCH_REGS		s390_regs
 # define SYSCALL_NUM(_regs)	(_regs).gprs[2]
-# define SYSCALL_RET(_regs)	(_regs).gprs[2]
-# define SYSCALL_NUM_RET_SHARE_REG
+# define SYSCALL_RET_SET(_regs, _val)			\
+		TH_LOG("Can't modify syscall return on this architecture")
 #elif defined(__mips__)
 # include <asm/unistd_nr_n32.h>
 # include <asm/unistd_nr_n64.h>
@@ -1776,8 +1776,8 @@ TEST_F(TRACE_poke, getpid_runs_normally)
 		else					\
 			(_regs).regs[2] = _nr;		\
 	} while (0)
-# define SYSCALL_RET(_regs)	(_regs).regs[2]
-# define SYSCALL_NUM_RET_SHARE_REG
+# define SYSCALL_RET_SET(_regs, _val)			\
+		TH_LOG("Can't modify syscall return on this architecture")
 #elif defined(__xtensa__)
 # define ARCH_REGS		struct user_pt_regs
 # define SYSCALL_NUM(_regs)	(_regs).syscall
@@ -1804,9 +1804,26 @@ TEST_F(TRACE_poke, getpid_runs_normally)
 		SYSCALL_NUM(_regs) = (_nr);	\
 	} while (0)
 #endif
+/*
+ * Most architectures can change the syscall return value by just
+ * writing to the SYSCALL_RET register. This is the default if not
+ * defined above. If an architecture cannot set the return value
+ * (for example when the syscall and return value register is
+ * shared), report it with TH_LOG() in an arch-specific definition
+ * of SYSCALL_RET_SET() above, and leave SYSCALL_RET undefined.
+ */
+#if !defined(SYSCALL_RET) && !defined(SYSCALL_RET_SET)
+# error "One of SYSCALL_RET or SYSCALL_RET_SET is needed for this arch"
+#endif
+#ifndef SYSCALL_RET_SET
+# define SYSCALL_RET_SET(_regs, _val)		\
+	do {					\
+		SYSCALL_RET(_regs) = (_val);	\
+	} while (0)
+#endif
 
 /* When the syscall return can't be changed, stub out the tests for it. */
-#ifdef SYSCALL_NUM_RET_SHARE_REG
+#ifndef SYSCALL_RET
 # define EXPECT_SYSCALL_RETURN(val, action)	EXPECT_EQ(-1, action)
 #else
 # define EXPECT_SYSCALL_RETURN(val, action)		\
@@ -1870,11 +1887,7 @@ void change_syscall(struct __test_metadata *_metadata,
 
 	/* If syscall is skipped, change return value. */
 	if (syscall == -1)
-#ifdef SYSCALL_NUM_RET_SHARE_REG
-		TH_LOG("Can't modify syscall return on this architecture");
-#else
-		SYSCALL_RET(regs) = result;
-#endif
+		SYSCALL_RET_SET(regs, result);
 
 	/* Flush any register changes made. */
 	if (memcmp(&orig, &regs, sizeof(orig)) != 0)
-- 
2.25.1


^ permalink raw reply related

* [PATCH 14/15] selftests/clone3: Avoid OS-defined clone_args
From: Kees Cook @ 2020-09-12 11:08 UTC (permalink / raw)
  To: linux-kernel
  Cc: Thadeu Lima de Souza Cascardo, Will Drewry, Kees Cook,
	linux-xtensa, linux-mips, Andy Lutomirski, Max Filippov,
	linux-arm-kernel, linux-kselftest, linuxppc-dev,
	Christian Brauner
In-Reply-To: <20200912110820.597135-1-keescook@chromium.org>

As the UAPI headers start to appear in distros, we need to avoid
outdated versions of struct clone_args to be able to test modern
features. Additionally pull in the syscall numbers correctly.

Signed-off-by: Kees Cook <keescook@chromium.org>
---
I needed to fix this to get MIPS to build the seccomp selftests.
---
 .../testing/selftests/clone3/clone3_selftests.h  | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/clone3/clone3_selftests.h b/tools/testing/selftests/clone3/clone3_selftests.h
index 91c1a78ddb39..bc0f34e37ae1 100644
--- a/tools/testing/selftests/clone3/clone3_selftests.h
+++ b/tools/testing/selftests/clone3/clone3_selftests.h
@@ -4,11 +4,19 @@
 #define _CLONE3_SELFTESTS_H
 
 #define _GNU_SOURCE
+
+/* Pull in syscall numbers. */
+#include <unistd.h>
+#include <sys/syscall.h>
+
+/* Avoid old OS versions of "struct clone_args". */
+#define clone_args old_clone_args
 #include <sched.h>
 #include <linux/sched.h>
+#undef clone_args
+
 #include <linux/types.h>
 #include <stdint.h>
-#include <syscall.h>
 #include <sys/wait.h>
 
 #include "../kselftest.h"
@@ -25,6 +33,7 @@
 
 #ifndef __NR_clone3
 #define __NR_clone3 -1
+#endif
 struct clone_args {
 	__aligned_u64 flags;
 	__aligned_u64 pidfd;
@@ -34,13 +43,16 @@ struct clone_args {
 	__aligned_u64 stack;
 	__aligned_u64 stack_size;
 	__aligned_u64 tls;
+#ifndef CLONE_ARGS_SIZE_VER1
 #define CLONE_ARGS_SIZE_VER1 80
+#endif
 	__aligned_u64 set_tid;
 	__aligned_u64 set_tid_size;
+#ifndef CLONE_ARGS_SIZE_VER2
 #define CLONE_ARGS_SIZE_VER2 88
+#endif
 	__aligned_u64 cgroup;
 };
-#endif /* __NR_clone3 */
 
 static pid_t sys_clone3(struct clone_args *args, size_t size)
 {
-- 
2.25.1


^ permalink raw reply related

* [PATCH 15/15] selftests/seccomp: Use __NR_mknodat instead of __NR_mknod
From: Kees Cook @ 2020-09-12 11:08 UTC (permalink / raw)
  To: linux-kernel
  Cc: Thadeu Lima de Souza Cascardo, Will Drewry, Kees Cook,
	linux-xtensa, linux-mips, Andy Lutomirski, Max Filippov,
	linux-arm-kernel, linux-kselftest, linuxppc-dev,
	Christian Brauner
In-Reply-To: <20200912110820.597135-1-keescook@chromium.org>

The __NR_mknod syscall doesn't exist on arm64 (only __NR_mknodat).
Switch to the modern syscall.

Fixes: ad5682184a81 ("selftests/seccomp: Check for EPOLLHUP for user_notif")
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 tools/testing/selftests/seccomp/seccomp_bpf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
index 26c712c6a575..b34ede28f314 100644
--- a/tools/testing/selftests/seccomp/seccomp_bpf.c
+++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
@@ -3773,7 +3773,7 @@ TEST(user_notification_filter_empty)
 	if (pid == 0) {
 		int listener;
 
-		listener = user_notif_syscall(__NR_mknod, SECCOMP_FILTER_FLAG_NEW_LISTENER);
+		listener = user_notif_syscall(__NR_mknodat, SECCOMP_FILTER_FLAG_NEW_LISTENER);
 		if (listener < 0)
 			_exit(EXIT_FAILURE);
 
-- 
2.25.1


^ permalink raw reply related

* Re: [PATCH v5 20/23] powerpc/book3s64/hash/kuap: Enable kuap on hash
From: Michael Ellerman @ 2020-09-12 11:54 UTC (permalink / raw)
  To: Aneesh Kumar K.V, linuxppc-dev; +Cc: Aneesh Kumar K.V
In-Reply-To: <20200827040931.297759-21-aneesh.kumar@linux.ibm.com>

"Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> ---
>  arch/powerpc/mm/book3s64/pkeys.c | 7 ++++++-
>  1 file changed, 6 insertions(+), 1 deletion(-)
>
> diff --git a/arch/powerpc/mm/book3s64/pkeys.c b/arch/powerpc/mm/book3s64/pkeys.c
> index 391230f93da2..16ea0b2f0ea5 100644
> --- a/arch/powerpc/mm/book3s64/pkeys.c
> +++ b/arch/powerpc/mm/book3s64/pkeys.c
> @@ -257,7 +257,12 @@ void __init setup_kuep(bool disabled)
>  #ifdef CONFIG_PPC_KUAP
>  void __init setup_kuap(bool disabled)
>  {
> -	if (disabled || !early_radix_enabled())
> +	if (disabled)
> +		return;
> +	/*
> +	 * On hash if PKEY feature is not enabled, disable KUAP too.
> +	 */
> +	if (!early_radix_enabled() && !early_mmu_has_feature(MMU_FTR_PKEY))
>  		return;
>  
>  	if (smp_processor_id() == boot_cpuid) {

I'm seeing userspace crashes, bisect points at this commit. But
obviously this is just the commit that enables the code.

It seems to be TM related, a lot of the TM selftests are failing with
seemingly random segfaults.

eg:

  ./tm-tmspr
  test: tm_tmspr
  tags: git_version:v5.9-rc2-146-g928c664a7f4e
  !! child died by signal 11
  failure: tm_tmspr

And this warning fires:

  [  308.825455] ------------[ cut here ]------------
  [  308.825485] WARNING: CPU: 5 PID: 105478 at arch/powerpc/include/asm/book3s/64/kup.h:287 syscall_exit_prepare+0x350/0x390
  [  308.825495] Modules linked in: iptable_mangle xt_MASQUERADE iptable_nat nf_nat xt_conntrack nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 ipt_REJECT nf_reject_ipv4 xt_tcpudp tun bridge stp llc ip6table_filter ip6_tables iptable_filter fuse kvm_hv kvm binfmt_misc squashfs mlx4_ib ib_uverbs dm_multipath scsi_dh_rdac scsi_dh_alua ib_core mlx4_en bnx2x lpfc mlx4_core crc_t10dif sr_mod crct10dif_generic cdrom vmx_crypto scsi_transport_fc sg mdio gf128mul crct10dif_vpmsum crct10dif_common powernv_rng crc32c_vpmsum rng_core leds_powernv powernv_op_panel led_class sunrpc ip_tables x_tables autofs4
  [  308.825596] CPU: 5 PID: 105478 Comm: tm-tmspr Tainted: P                  5.9.0-rc2-00146-g928c664a7f4e #1
  [  308.825607] NIP:  c000000000038640 LR: c00000000000ddcc CTR: c0000000003c32d0
  [  308.825616] REGS: c000001e41877b00 TRAP: 0700   Tainted: P                   (5.9.0-rc2-00146-g928c664a7f4e)
  [  308.825625] MSR:  900000010282b033 <SF,HV,VEC,VSX,EE,FP,ME,IR,DR,RI,LE,TM[E]>  CR: 44004484  XER: 20000000
  [  308.825649] CFAR: c000000000038358 IRQMASK: 0 
                 GPR00: c00000000000ddcc c000001e41877da0 c00000000143f000 0000000000000000 
                 GPR04: c000001e41877e80 0000000000000000 00007ffec6ed0000 0000000000000008 
                 GPR08: 0000000000000002 3cffffffffffffff fcffffffffffffff 00000000f5638000 
                 GPR12: 0000000000004400 c0000007ffffa680 0000000000000000 00007ffff7ff0000 
                 GPR16: 00007ffff7f54410 00007ffff7f50320 00007ffec6eff240 00007ffff7f54420 
                 GPR20: 00000000000004ba 00000001000016c0 0000000100034840 0000000000000000 
                 GPR24: 0000000000000000 00007ffec6eff8f0 0000000000000000 00007ffec6efea80 
                 GPR28: 00007ffffffff38f c000001e2e05d600 00007ffec6eff180 c000001e41877e80 
  [  308.825724] NIP [c000000000038640] syscall_exit_prepare+0x350/0x390
  [  308.825734] LR [c00000000000ddcc] system_call_common+0xfc/0x27c
  [  308.825741] Call Trace:
  [  308.825751] [c000001e41877da0] [c000001e41877e10] 0xc000001e41877e10 (unreliable)
  [  308.825763] [c000001e41877e10] [c00000000000ddcc] system_call_common+0xfc/0x27c
  [  308.825771] Instruction dump:
  [  308.825778] 713c0800 4082004c f87f0018 395d0080 39001800 7ce050a8 7ce74078 7ce051ad 
  [  308.825794] 40c2fff4 4bfffd54 60000000 60000000 <0fe00000> 4bfffd18 60000000 60000000 
  [  308.825811] ---[ end trace 57dae589ab37c54c ]---


That's on power8, both bare metal and LPAR.

cheers

^ permalink raw reply

* Re: [PATCH v5 05/10] powerpc/smp: Dont assume l2-cache to be superset of sibling
From: Michael Ellerman @ 2020-09-13  1:46 UTC (permalink / raw)
  To: Srikar Dronamraju
  Cc: Nathan Lynch, Gautham R Shenoy, Michael Neuling, Peter Zijlstra,
	Jordan Niethe, LKML, Nicholas Piggin, Valentin Schneider,
	Oliver O'Halloran, linuxppc-dev, Ingo Molnar
In-Reply-To: <20200912044603.GA11808@linux.vnet.ibm.com>

Srikar Dronamraju <srikar@linux.vnet.ibm.com> writes:
> * Michael Ellerman <mpe@ellerman.id.au> [2020-09-11 21:55:23]:
>
>> Srikar Dronamraju <srikar@linux.vnet.ibm.com> writes:
>> > Current code assumes that cpumask of cpus sharing a l2-cache mask will
>> > always be a superset of cpu_sibling_mask.
>> >
>> > Lets stop that assumption. cpu_l2_cache_mask is a superset of
>> > cpu_sibling_mask if and only if shared_caches is set.
>> 
>> I'm seeing oopses with this:
>> 
>> [    0.117392][    T1] smp: Bringing up secondary CPUs ...
>> [    0.156515][    T1] smp: Brought up 2 nodes, 2 CPUs
>> [    0.158265][    T1] numa: Node 0 CPUs: 0
>> [    0.158520][    T1] numa: Node 1 CPUs: 1
>> [    0.167453][    T1] BUG: Unable to handle kernel data access on read at 0x8000000041228298
>> [    0.167992][    T1] Faulting instruction address: 0xc00000000018c128
>> [    0.168817][    T1] Oops: Kernel access of bad area, sig: 11 [#1]
>> [    0.168964][    T1] LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries
>> [    0.169417][    T1] Modules linked in:
>> [    0.170047][    T1] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.9.0-rc2-00095-g7430ad5aa700 #209
>> [    0.170305][    T1] NIP:  c00000000018c128 LR: c00000000018c0cc CTR: c00000000004dce0
>> [    0.170498][    T1] REGS: c00000007e343880 TRAP: 0380   Not tainted  (5.9.0-rc2-00095-g7430ad5aa700)
>> [    0.170602][    T1] MSR:  8000000002009033 <SF,VEC,EE,ME,IR,DR,RI,LE>  CR: 44002222  XER: 00000000
>> [    0.170985][    T1] CFAR: c00000000018c288 IRQMASK: 0
>> [    0.170985][    T1] GPR00: 0000000000000000 c00000007e343b10 c00000000173e400 0000000000004000
>> [    0.170985][    T1] GPR04: 0000000000000000 0000000000000800 0000000000000800 0000000000000000
>> [    0.170985][    T1] GPR08: 0000000000000000 c00000000122c298 c00000003fffc000 c00000007fd05ce8
>> [    0.170985][    T1] GPR12: c00000007e0119f8 c000000001930000 00000000ffff8ade 0000000000000000
>> [    0.170985][    T1] GPR16: c00000007e3c0640 0000000000000917 c00000007e3c0658 0000000000000008
>> [    0.170985][    T1] GPR20: c0000000015d0bb8 00000000ffff8ade c000000000f57400 c000000001817c28
>> [    0.170985][    T1] GPR24: c00000000176dc80 c00000007e3c0890 c00000007e3cfe00 0000000000000000
>> [    0.170985][    T1] GPR28: c000000001772310 c00000007e011900 c00000007e3c0800 0000000000000001
>> [    0.172750][    T1] NIP [c00000000018c128] build_sched_domains+0x808/0x14b0
>> [    0.172900][    T1] LR [c00000000018c0cc] build_sched_domains+0x7ac/0x14b0
>> [    0.173186][    T1] Call Trace:
>> [    0.173484][    T1] [c00000007e343b10] [c00000000018bfe8] build_sched_domains+0x6c8/0x14b0 (unreliable)
>> [    0.173821][    T1] [c00000007e343c50] [c00000000018dcdc] sched_init_domains+0xec/0x130
>> [    0.174037][    T1] [c00000007e343ca0] [c0000000010d59d8] sched_init_smp+0x50/0xc4
>> [    0.174207][    T1] [c00000007e343cd0] [c0000000010b45c4] kernel_init_freeable+0x1b4/0x378
>> [    0.174378][    T1] [c00000007e343db0] [c0000000000129fc] kernel_init+0x24/0x158
>> [    0.174740][    T1] [c00000007e343e20] [c00000000000d9d0] ret_from_kernel_thread+0x5c/0x6c
>> [    0.175050][    T1] Instruction dump:
>> [    0.175626][    T1] 554905ee 71480040 7d2907b4 4182016c 2c290000 3920006e 913e002c 41820034
>> [    0.175841][    T1] 7c6307b4 e9300020 78631f24 7d58182a <7d2a482a> f93e0080 7d404828 314a0001
>> [    0.178340][    T1] ---[ end trace 6876b88dd1d4b3bb ]---
>> [    0.178512][    T1]
>> [    1.180458][    T1] Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b
>> 
>> That's qemu:
>> 
>> qemu-system-ppc64 -nographic -vga none -M pseries -cpu POWER8 \
>>   -kernel build~/vmlinux \
>>   -m 2G,slots=2,maxmem=4G \
>>   -object memory-backend-ram,size=1G,id=m0 \
>>   -object memory-backend-ram,size=1G,id=m1 \
>>   -numa node,nodeid=0,memdev=m0 \
>>   -numa node,nodeid=1,memdev=m1 \
>>   -smp 2,sockets=2,maxcpus=2  \
>> 
>
> Thanks Michael for the report and also for identifying the patch and also
> giving an easy reproducer. That made my task easy. (My only problem was all
> my PowerKVM hosts had a old compiler that refuse to compile never kernels.)
>
> So in this setup, CPU doesn't have a l2-cache. And in that scenario, we
> miss updating the l2-cache domain. Actually the initial patch had this
> exact code. However it was my mistake. I should have reassessed it before
> making changes suggested by Gautham.
>
> Patch below. Do let me know if you want me to send the patch separately.

>> On mambo I get:
>> 
>> [    0.005069][    T1] smp: Bringing up secondary CPUs ...
>> [    0.011656][    T1] smp: Brought up 2 nodes, 8 CPUs
>> [    0.011682][    T1] numa: Node 0 CPUs: 0-3
>> [    0.011709][    T1] numa: Node 1 CPUs: 4-7
>> [    0.012015][    T1] BUG: arch topology borken
>> [    0.012040][    T1]      the SMT domain not a subset of the CACHE domain
>> [    0.012107][    T1] BUG: Unable to handle kernel data access on read at 0x80000001012e7398
>> [    0.012142][    T1] Faulting instruction address: 0xc0000000001aa4f0
>> [    0.012174][    T1] Oops: Kernel access of bad area, sig: 11 [#1]
>> [    0.012206][    T1] LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA PowerNV
>> [    0.012236][    T1] Modules linked in:
>> [    0.012264][    T1] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.9.0-rc2-00095-g7430ad5aa700 #1
>> [    0.012304][    T1] NIP:  c0000000001aa4f0 LR: c0000000001aa498 CTR: 0000000000000000
>> [    0.012341][    T1] REGS: c0000000ef583880 TRAP: 0380   Not tainted  (5.9.0-rc2-00095-g7430ad5aa700)
>> [    0.012379][    T1] MSR:  9000000002009033 <SF,HV,VEC,EE,ME,IR,DR,RI,LE>  CR: 44002222  XER: 00040000
>> [    0.012439][    T1] CFAR: c0000000000101b0 IRQMASK: 0
>> [    0.012439][    T1] GPR00: 0000000000000000 c0000000ef583b10 c0000000017fd000 0000000000004000
>> [    0.012439][    T1] GPR04: 0000000000000000 0000000000000800 0000000000000000 0000000000000000
>> [    0.012439][    T1] GPR08: 0000000000000000 c0000000012eb398 c0000000ffffc000 0000000000000000
>> [    0.012439][    T1] GPR12: 0000000000000020 c0000000019f0000 00000000ffff8ad1 0000000000000000
>> [    0.012439][    T1] GPR16: c0000000ef068658 c0000000018d7ba8 0000000000000008 c000000001690bb8
>> [    0.012439][    T1] GPR20: c00000000182dc80 c0000000ef06be90 00000000ffff8ad1 c000000001014aa8
>> [    0.012439][    T1] GPR24: 0000000000000917 c0000000ef068e00 0000000000000000 c0000000ef06be00
>> [    0.012439][    T1] GPR28: 0000000000000001 c0000000ef068640 c0000000ef4a1800 c000000001832310
>> [    0.012774][    T1] NIP [c0000000001aa4f0] build_sched_domains+0x5c0/0x14f0
>> [    0.012812][    T1] LR [c0000000001aa498] build_sched_domains+0x568/0x14f0
>> [    0.012842][    T1] Call Trace:
>> [    0.012872][    T1] [c0000000ef583b10] [c0000000001aa3b4] build_sched_domains+0x484/0x14f0 (unreliable)
>> [    0.012922][    T1] [c0000000ef583c50] [c0000000001ac3d8] sched_init_domains+0xd8/0x120
>> [    0.012966][    T1] [c0000000ef583ca0] [c0000000011962d0] sched_init_smp+0x50/0xc4
>> [    0.013008][    T1] [c0000000ef583cd0] [c00000000117451c] kernel_init_freeable+0x1b4/0x378
>> [    0.013051][    T1] [c0000000ef583db0] [c000000000012994] kernel_init+0x2c/0x158
>> [    0.013092][    T1] [c0000000ef583e20] [c00000000000d9d0] ret_from_kernel_thread+0x5c/0x6c
>> [    0.013128][    T1] Instruction dump:
>> [    0.013151][    T1] e93b003a 712a0040 552a05ee 418203c4 2c2a0000 3920006e 913b002c 41820034
>> [    0.013206][    T1] 7c6307b4 e93d0020 78631f24 7d54182a <7d2a482a> f93b0080 7d404828 314a0001
>> [    0.013267][    T1] ---[ end trace 1bf5f6f38a9fd096 ]---
>> 
>
> I haven't tried Mambo. But the problem report looks similar.

The patch fixes qemu, and I don't see the crash on mambo, but I still
see:

  [    0.010536] smp: Bringing up secondary CPUs ...
  [    0.019189] smp: Brought up 2 nodes, 8 CPUs
  [    0.019210] numa: Node 0 CPUs: 0-3
  [    0.019235] numa: Node 1 CPUs: 4-7
  [    0.023669] BUG: arch topology borken
  [    0.023690]      the SMT domain not a subset of the CACHE domain
  [    0.023726] BUG: arch topology borken
  [    0.023747]      the CACHE domain not a subset of the MC domain
  [    0.023808] BUG: arch topology borken
  [    0.023829]      the SMT domain not a subset of the CACHE domain
  [    0.023865] BUG: arch topology borken
  [    0.023887]      the CACHE domain not a subset of the MC domain
  [    0.023948] BUG: arch topology borken
  [    0.023969]      the SMT domain not a subset of the CACHE domain
  [    0.024005] BUG: arch topology borken
  [    0.024026]      the CACHE domain not a subset of the MC domain
  [    0.024087] BUG: arch topology borken
  [    0.024108]      the SMT domain not a subset of the CACHE domain
  [    0.024144] BUG: arch topology borken
  [    0.024165]      the CACHE domain not a subset of the MC domain
  [    0.024227] BUG: arch topology borken
  [    0.024248]      the SMT domain not a subset of the CACHE domain
  [    0.024284] BUG: arch topology borken
  [    0.024305]      the CACHE domain not a subset of the MC domain
  [    0.024366] BUG: arch topology borken
  [    0.024387]      the SMT domain not a subset of the CACHE domain
  [    0.024423] BUG: arch topology borken
  [    0.024444]      the CACHE domain not a subset of the MC domain
  [    0.024505] BUG: arch topology borken
  [    0.024527]      the SMT domain not a subset of the CACHE domain
  [    0.024563] BUG: arch topology borken
  [    0.024584]      the CACHE domain not a subset of the MC domain
  [    0.024645] BUG: arch topology borken
  [    0.024666]      the SMT domain not a subset of the CACHE domain
  [    0.024702] BUG: arch topology borken
  [    0.024723]      the CACHE domain not a subset of the MC domain

That's the p9 mambo model, using skiboot.tcl from skiboot, with CPUS=2,
THREADS=4 and MAMBO_NUMA=1.

Node layout is:

[    0.000000] Early memory node ranges
[    0.000000]   node   0: [mem 0x0000000000000000-0x00000000ffffffff]
[    0.000000]   node   1: [mem 0x0000200000000000-0x00002000ffffffff]
[    0.000000] Initmem setup node 0 [mem 0x0000000000000000-0x00000000ffffffff]
[    0.000000] On node 0 totalpages: 65536
[    0.000000] Initmem setup node 1 [mem 0x0000200000000000-0x00002000ffffffff]
[    0.000000] On node 1 totalpages: 65536


There aren't any l2-cache properties in the device-tree under cpus.

I'll try and have a closer look tonight.

cheers

^ permalink raw reply

* Re: [PATCH v2] selftests/seccomp: fix ptrace tests on powerpc
From: Michael Ellerman @ 2020-09-13  7:35 UTC (permalink / raw)
  To: Kees Cook, Thadeu Lima de Souza Cascardo
  Cc: linuxppc-dev, Shuah Khan, Oleg Nesterov, linux-kselftest
In-Reply-To: <202009111550.07017FE49@keescook>

Kees Cook <keescook@chromium.org> writes:
> On Fri, Sep 11, 2020 at 03:10:12PM -0300, Thadeu Lima de Souza Cascardo wrote:
...
>> diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
>> index 7a6d40286a42..0ddc0846e9c0 100644
>> --- a/tools/testing/selftests/seccomp/seccomp_bpf.c
>> +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
>> @@ -1916,10 +1957,15 @@ void tracer_ptrace(struct __test_metadata *_metadata, pid_t tracee,
>>  	EXPECT_EQ(entry ? PTRACE_EVENTMSG_SYSCALL_ENTRY
>>  			: PTRACE_EVENTMSG_SYSCALL_EXIT, msg);
>>  
>> -	if (!entry)
>> +	if (!entry && !variant)
>>  		return;
>>  
>> -	nr = get_syscall(_metadata, tracee);
>> +	if (entry)
>> +		nr = get_syscall(_metadata, tracee);
>> +	else if (variant)
>> +		nr = variant->syscall_nr;
>> +	if (variant)
>> +		variant->syscall_nr = nr;
>
> So, to be clear this is _only_ an issue for the ptrace side of things,
> yes? i.e. seccomp's setting of the return value will correct stick?

Yes. There's a comment which (hopefully) explains the difference here:

  https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/powerpc/kernel/ptrace/ptrace.c?commit=ab29a807a7ddaa7c84d2f4cb8d29e74e33759072#n239

Which says:

static int do_seccomp(struct pt_regs *regs)
{
	if (!test_thread_flag(TIF_SECCOMP))
		return 0;

	/*
	 * The ABI we present to seccomp tracers is that r3 contains
	 * the syscall return value and orig_gpr3 contains the first
	 * syscall parameter. This is different to the ptrace ABI where
	 * both r3 and orig_gpr3 contain the first syscall parameter.
	 */
	regs->gpr[3] = -ENOSYS;


cheers

^ permalink raw reply

* [PATCH] mm/debug_vm_pgtable: Avoid doing memory allocation with pgtable_t mapped.
From: Aneesh Kumar K.V @ 2020-09-13 11:03 UTC (permalink / raw)
  To: linux-mm, akpm
  Cc: kernel test robot, linuxppc-dev, Aneesh Kumar K.V,
	Anshuman Khandual
In-Reply-To: <20200902114222.181353-1-aneesh.kumar@linux.ibm.com>

With highmem, pte_alloc_map() keep the level4 page table mapped using
kmap_atomic(). Avoid doing new memory allocation with page table
mapped like above.

[    9.409233] BUG: sleeping function called from invalid context at mm/page_alloc.c:4822
[    9.410557] in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 1, name: swapper
[    9.411932] no locks held by swapper/1.
[    9.412595] CPU: 0 PID: 1 Comm: swapper Not tainted 5.9.0-rc3-00323-gc50eb1ed654b5 #2
[    9.413824] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-1 04/01/2014
[    9.415207] Call Trace:
[    9.415651]  ? ___might_sleep.cold+0xa7/0xcc
[    9.416367]  ? __alloc_pages_nodemask+0x14c/0x5b0
[    9.417055]  ? swap_migration_tests+0x50/0x293
[    9.417704]  ? debug_vm_pgtable+0x4bc/0x708
[    9.418287]  ? swap_migration_tests+0x293/0x293
[    9.418911]  ? do_one_initcall+0x82/0x3cb
[    9.419465]  ? parse_args+0x1bd/0x280
[    9.419983]  ? rcu_read_lock_sched_held+0x36/0x60
[    9.420673]  ? trace_initcall_level+0x1f/0xf3
[    9.421279]  ? trace_initcall_level+0xbd/0xf3
[    9.421881]  ? do_basic_setup+0x9d/0xdd
[    9.422410]  ? do_basic_setup+0xc3/0xdd
[    9.422938]  ? kernel_init_freeable+0x72/0xa3
[    9.423539]  ? rest_init+0x134/0x134
[    9.424055]  ? kernel_init+0x5/0x12c
[    9.424574]  ? ret_from_fork+0x19/0x30

Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 mm/debug_vm_pgtable.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index d12bde82ae95..612c665a1136 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -994,7 +994,13 @@ static int __init debug_vm_pgtable(void)
 	p4dp = p4d_alloc(mm, pgdp, vaddr);
 	pudp = pud_alloc(mm, p4dp, vaddr);
 	pmdp = pmd_alloc(mm, pudp, vaddr);
-	ptep = pte_alloc_map(mm, pmdp, vaddr);
+	/*
+	 * Allocate pgtable_t
+	 */
+	if (pte_alloc(mm, pmdp)) {
+		pr_err("pgtable allocation failed\n");
+		return 1;
+	}
 
 	/*
 	 * Save all the page table page addresses as the page table
@@ -1048,8 +1054,7 @@ static int __init debug_vm_pgtable(void)
 	 * proper page table lock.
 	 */
 
-	ptl = pte_lockptr(mm, pmdp);
-	spin_lock(ptl);
+	ptep = pte_offset_map_lock(mm, pmdp, vaddr, &ptl);
 	pte_clear_tests(mm, ptep, pte_aligned, vaddr, prot);
 	pte_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
 	pte_unmap_unlock(ptep, ptl);
-- 
2.26.2


^ permalink raw reply related

* Re: [PATCH] selftests/seccomp: fix ptrace tests on powerpc
From: Michael Ellerman @ 2020-09-13 12:34 UTC (permalink / raw)
  To: Thadeu Lima de Souza Cascardo, Kees Cook
  Cc: linuxppc-dev, Shuah Khan, Oleg Nesterov, linux-kselftest
In-Reply-To: <20200911180637.GI4002@mussarela>

Thadeu Lima de Souza Cascardo <cascardo@canonical.com> writes:
> On Tue, Sep 08, 2020 at 04:18:17PM -0700, Kees Cook wrote:
>> On Tue, Jun 30, 2020 at 01:47:39PM -0300, Thadeu Lima de Souza Cascardo wrote:
...
>> > @@ -1809,10 +1818,15 @@ void tracer_ptrace(struct __test_metadata *_metadata, pid_t tracee,
>> >  	EXPECT_EQ(entry ? PTRACE_EVENTMSG_SYSCALL_ENTRY
>> >  			: PTRACE_EVENTMSG_SYSCALL_EXIT, msg);
>> >  
>> > -	if (!entry)
>> > +	if (!entry && !syscall_nr)
>> >  		return;
>> >  
>> > -	nr = get_syscall(_metadata, tracee);
>> > +	if (entry)
>> > +		nr = get_syscall(_metadata, tracee);
>> > +	else
>> > +		nr = *syscall_nr;
>> 
>> This is weird? Shouldn't get_syscall() be modified to do the right thing
>> here instead of depending on the extra arg?
>> 
>
> R0 might be clobered. Same documentation mentions it as volatile. So, during
> syscall exit, we can't tell for sure that R0 will have the original syscall
> number. So, we need to grab it during syscall enter, save it somewhere and
> reuse it. I used the test context/args for that.

The user r0 (in regs->gpr[0]) shouldn't be clobbered.

But it is modified if the tracer skips the syscall, by setting the
syscall number to -1. Or if the tracer changes the syscall number.

So if you need the original syscall number in the exit path then I think
you do need to save it at entry.

cheers

^ permalink raw reply

* Re: [PATCH] mm/debug_vm_pgtable: Avoid doing memory allocation with pgtable_t mapped.
From: Matthew Wilcox @ 2020-09-13 13:42 UTC (permalink / raw)
  To: Aneesh Kumar K.V
  Cc: kernel test robot, Anshuman Khandual, linux-mm, akpm,
	linuxppc-dev
In-Reply-To: <20200913110327.645310-1-aneesh.kumar@linux.ibm.com>

On Sun, Sep 13, 2020 at 04:33:27PM +0530, Aneesh Kumar K.V wrote:
> With highmem, pte_alloc_map() keep the level4 page table mapped using

.noitcerid etisoppo eht ni selbat egap eht srebmun ygolonimret xuniL


^ permalink raw reply

* RE: [PATCHv7 10/12] arm64: dts: layerscape: Add PCIe EP node for ls1088a
From: Z.q. Hou @ 2020-09-13 16:26 UTC (permalink / raw)
  To: Rob Herring
  Cc: devicetree@vger.kernel.org, lorenzo.pieralisi@arm.com,
	Xiaowei Bao, Roy Zang, linux-pci@vger.kernel.org,
	linux-kernel@vger.kernel.org, Leo Li, M.h. Lian,
	jingoohan1@gmail.com, andrew.murray@arm.com, Mingkai Hu,
	gustavo.pimentel@synopsys.com, bhelgaas@google.com,
	shawnguo@kernel.org, kishon@ti.com, linuxppc-dev@lists.ozlabs.org,
	linux-arm-kernel@lists.infradead.org
In-Reply-To: <20200910164751.GA501845@bogus>

Hi Rob,

Thanks a lot for your comments!

> -----Original Message-----
> From: Rob Herring <robh@kernel.org>
> Sent: 2020年9月11日 0:48
> To: Z.q. Hou <zhiqiang.hou@nxp.com>
> Cc: linux-pci@vger.kernel.org; devicetree@vger.kernel.org;
> linux-kernel@vger.kernel.org; linux-arm-kernel@lists.infradead.org;
> linuxppc-dev@lists.ozlabs.org; bhelgaas@google.com;
> lorenzo.pieralisi@arm.com; shawnguo@kernel.org; Leo Li
> <leoyang.li@nxp.com>; kishon@ti.com; gustavo.pimentel@synopsys.com;
> Roy Zang <roy.zang@nxp.com>; jingoohan1@gmail.com;
> andrew.murray@arm.com; Mingkai Hu <mingkai.hu@nxp.com>; M.h. Lian
> <minghuan.lian@nxp.com>; Xiaowei Bao <xiaowei.bao@nxp.com>
> Subject: Re: [PATCHv7 10/12] arm64: dts: layerscape: Add PCIe EP node for
> ls1088a
> 
> On Tue, Aug 11, 2020 at 05:54:39PM +0800, Zhiqiang Hou wrote:
> > From: Xiaowei Bao <xiaowei.bao@nxp.com>
> >
> > Add PCIe EP node for ls1088a to support EP mode.
> >
> > Signed-off-by: Xiaowei Bao <xiaowei.bao@nxp.com>
> > Reviewed-by: Andrew Murray <andrew.murray@arm.com>
> > Signed-off-by: Hou Zhiqiang <Zhiqiang.Hou@nxp.com>
> > ---
> > V7:
> >  - Rebase the patch without functionality change.
> >
> >  .../arm64/boot/dts/freescale/fsl-ls1088a.dtsi | 31
> +++++++++++++++++++
> >  1 file changed, 31 insertions(+)
> >
> > diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi
> b/arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi
> > index 169f4742ae3b..915592141f1b 100644
> > --- a/arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi
> > +++ b/arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi
> > @@ -499,6 +499,17 @@
> >  			status = "disabled";
> >  		};
> >
> > +		pcie_ep@3400000 {
> 
> pci-ep@...

The DT node name must be "xxx-xx" style? If yes, the LS1046A EP node also has the name "pcie_ep", it also need to be fixed.

Thanks,
Zhiqiang

> 
> > +			compatible = "fsl,ls1088a-pcie-ep","fsl,ls-pcie-ep";
> > +			reg = <0x00 0x03400000 0x0 0x00100000
> > +			       0x20 0x00000000 0x8 0x00000000>;
> > +			reg-names = "regs", "addr_space";
> > +			num-ib-windows = <24>;
> > +			num-ob-windows = <128>;
> > +			max-functions = /bits/ 8 <2>;
> > +			status = "disabled";
> > +		};
> > +
> >  		pcie@3500000 {
> >  			compatible = "fsl,ls1088a-pcie";
> >  			reg = <0x00 0x03500000 0x0 0x00100000   /* controller
> registers */
> > @@ -525,6 +536,16 @@
> >  			status = "disabled";
> >  		};
> >
> > +		pcie_ep@3500000 {
> > +			compatible = "fsl,ls1088a-pcie-ep","fsl,ls-pcie-ep";
> > +			reg = <0x00 0x03500000 0x0 0x00100000
> > +			       0x28 0x00000000 0x8 0x00000000>;
> > +			reg-names = "regs", "addr_space";
> > +			num-ib-windows = <6>;
> > +			num-ob-windows = <8>;
> > +			status = "disabled";
> > +		};
> > +
> >  		pcie@3600000 {
> >  			compatible = "fsl,ls1088a-pcie";
> >  			reg = <0x00 0x03600000 0x0 0x00100000   /* controller
> registers */
> > @@ -551,6 +572,16 @@
> >  			status = "disabled";
> >  		};
> >
> > +		pcie_ep@3600000 {
> > +			compatible = "fsl,ls1088a-pcie-ep","fsl,ls-pcie-ep";
> > +			reg = <0x00 0x03600000 0x0 0x00100000
> > +			       0x30 0x00000000 0x8 0x00000000>;
> > +			reg-names = "regs", "addr_space";
> > +			num-ib-windows = <6>;
> > +			num-ob-windows = <8>;
> > +			status = "disabled";
> > +		};
> > +
> >  		smmu: iommu@5000000 {
> >  			compatible = "arm,mmu-500";
> >  			reg = <0 0x5000000 0 0x800000>;
> > --
> > 2.17.1
> >

^ permalink raw reply

* RE: [PATCHv7 02/12] PCI: designware-ep: Add the doorbell mode of MSI-X in EP mode
From: Z.q. Hou @ 2020-09-13 16:28 UTC (permalink / raw)
  To: Rob Herring
  Cc: devicetree@vger.kernel.org, lorenzo.pieralisi@arm.com,
	Xiaowei Bao, Roy Zang, linux-pci@vger.kernel.org,
	linux-kernel@vger.kernel.org, Leo Li, M.h. Lian,
	jingoohan1@gmail.com, andrew.murray@arm.com, Mingkai Hu,
	gustavo.pimentel@synopsys.com, bhelgaas@google.com,
	shawnguo@kernel.org, kishon@ti.com, linuxppc-dev@lists.ozlabs.org,
	linux-arm-kernel@lists.infradead.org
In-Reply-To: <20200910175804.GA592152@bogus>

Hi Rob,

Thanks a lot for your comments!

> -----Original Message-----
> From: Rob Herring <robh@kernel.org>
> Sent: 2020年9月11日 1:58
> To: Z.q. Hou <zhiqiang.hou@nxp.com>
> Cc: linux-pci@vger.kernel.org; devicetree@vger.kernel.org;
> linux-kernel@vger.kernel.org; linux-arm-kernel@lists.infradead.org;
> linuxppc-dev@lists.ozlabs.org; bhelgaas@google.com;
> lorenzo.pieralisi@arm.com; shawnguo@kernel.org; Leo Li
> <leoyang.li@nxp.com>; kishon@ti.com; gustavo.pimentel@synopsys.com;
> Roy Zang <roy.zang@nxp.com>; jingoohan1@gmail.com;
> andrew.murray@arm.com; Mingkai Hu <mingkai.hu@nxp.com>; M.h. Lian
> <minghuan.lian@nxp.com>; Xiaowei Bao <xiaowei.bao@nxp.com>
> Subject: Re: [PATCHv7 02/12] PCI: designware-ep: Add the doorbell mode of
> MSI-X in EP mode
> 
> On Tue, Aug 11, 2020 at 05:54:31PM +0800, Zhiqiang Hou wrote:
> > From: Xiaowei Bao <xiaowei.bao@nxp.com>
> >
> > Add the doorbell mode of MSI-X in DWC EP driver.
> >
> > Signed-off-by: Xiaowei Bao <xiaowei.bao@nxp.com>
> > Reviewed-by: Andrew Murray <andrew.murray@arm.com>
> > Signed-off-by: Hou Zhiqiang <Zhiqiang.Hou@nxp.com>
> > ---
> > V7:
> >  - Rebase the patch without functionality change.
> >
> >  drivers/pci/controller/dwc/pcie-designware-ep.c | 14 ++++++++++++++
> >  drivers/pci/controller/dwc/pcie-designware.h    | 12 ++++++++++++
> >  2 files changed, 26 insertions(+)
> >
> > diff --git a/drivers/pci/controller/dwc/pcie-designware-ep.c
> > b/drivers/pci/controller/dwc/pcie-designware-ep.c
> > index e5bd3a5ef380..e76b504ed465 100644
> > --- a/drivers/pci/controller/dwc/pcie-designware-ep.c
> > +++ b/drivers/pci/controller/dwc/pcie-designware-ep.c
> > @@ -471,6 +471,20 @@ int dw_pcie_ep_raise_msi_irq(struct dw_pcie_ep
> *ep, u8 func_no,
> >  	return 0;
> >  }
> >
> > +int dw_pcie_ep_raise_msix_irq_doorbell(struct dw_pcie_ep *ep, u8
> > +func_no,
> 
> return void. It never has an error.
> 
> It could also just be an inline function.

Yes, make sense and will change in next version.

Thanks,
Zhiqiang

> 
> > +				       u16 interrupt_num)
> > +{
> > +	struct dw_pcie *pci = to_dw_pcie_from_ep(ep);
> > +	u32 msg_data;
> > +
> > +	msg_data = (func_no << PCIE_MSIX_DOORBELL_PF_SHIFT) |
> > +		   (interrupt_num - 1);
> > +
> > +	dw_pcie_writel_dbi(pci, PCIE_MSIX_DOORBELL, msg_data);
> > +
> > +	return 0;
> > +}
> > +
> >  int dw_pcie_ep_raise_msix_irq(struct dw_pcie_ep *ep, u8 func_no,
> >  			      u16 interrupt_num)
> >  {
> > diff --git a/drivers/pci/controller/dwc/pcie-designware.h
> > b/drivers/pci/controller/dwc/pcie-designware.h
> > index 89f8271ec5ee..745b4938225a 100644
> > --- a/drivers/pci/controller/dwc/pcie-designware.h
> > +++ b/drivers/pci/controller/dwc/pcie-designware.h
> > @@ -97,6 +97,9 @@
> >  #define PCIE_MISC_CONTROL_1_OFF		0x8BC
> >  #define PCIE_DBI_RO_WR_EN		BIT(0)
> >
> > +#define PCIE_MSIX_DOORBELL		0x948
> > +#define PCIE_MSIX_DOORBELL_PF_SHIFT	24
> > +
> >  #define PCIE_PL_CHK_REG_CONTROL_STATUS			0xB20
> >  #define PCIE_PL_CHK_REG_CHK_REG_START			BIT(0)
> >  #define PCIE_PL_CHK_REG_CHK_REG_CONTINUOUS		BIT(1)
> > @@ -434,6 +437,8 @@ int dw_pcie_ep_raise_msi_irq(struct dw_pcie_ep
> *ep, u8 func_no,
> >  			     u8 interrupt_num);
> >  int dw_pcie_ep_raise_msix_irq(struct dw_pcie_ep *ep, u8 func_no,
> >  			     u16 interrupt_num);
> > +int dw_pcie_ep_raise_msix_irq_doorbell(struct dw_pcie_ep *ep, u8
> func_no,
> > +				       u16 interrupt_num);
> >  void dw_pcie_ep_reset_bar(struct dw_pcie *pci, enum pci_barno bar);
> > #else  static inline void dw_pcie_ep_linkup(struct dw_pcie_ep *ep) @@
> > -475,6 +480,13 @@ static inline int dw_pcie_ep_raise_msix_irq(struct
> dw_pcie_ep *ep, u8 func_no,
> >  	return 0;
> >  }
> >
> > +static inline int dw_pcie_ep_raise_msix_irq_doorbell(struct dw_pcie_ep
> *ep,
> > +						     u8 func_no,
> > +						     u16 interrupt_num)
> > +{
> > +	return 0;
> > +}
> > +
> >  static inline void dw_pcie_ep_reset_bar(struct dw_pcie *pci, enum
> > pci_barno bar)  {  }
> > --
> > 2.17.1
> >

^ permalink raw reply

* Re: [PATCH v5 05/10] powerpc/smp: Dont assume l2-cache to be superset of sibling
From: Srikar Dronamraju @ 2020-09-13 16:53 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: Nathan Lynch, Gautham R Shenoy, Michael Neuling, Peter Zijlstra,
	Jordan Niethe, LKML, Nicholas Piggin, Ingo Molnar,
	Oliver O'Halloran, linuxppc-dev, Valentin Schneider
In-Reply-To: <87imciqwhq.fsf@mpe.ellerman.id.au>

* Michael Ellerman <mpe@ellerman.id.au> [2020-09-13 11:46:41]:

> Srikar Dronamraju <srikar@linux.vnet.ibm.com> writes:
> > * Michael Ellerman <mpe@ellerman.id.au> [2020-09-11 21:55:23]:
> >
> >> Srikar Dronamraju <srikar@linux.vnet.ibm.com> writes:
> >> > Current code assumes that cpumask of cpus sharing a l2-cache mask will
> >> > always be a superset of cpu_sibling_mask.
> >> >
> >> > Lets stop that assumption. cpu_l2_cache_mask is a superset of
> >> > cpu_sibling_mask if and only if shared_caches is set.
> >> 
> >> I'm seeing oopses with this:
> >> 
> 
> The patch fixes qemu, and I don't see the crash on mambo, but I still
> see:
>   [    0.010536] smp: Bringing up secondary CPUs ...
>   [    0.019189] smp: Brought up 2 nodes, 8 CPUs
>   [    0.019210] numa: Node 0 CPUs: 0-3
>   [    0.019235] numa: Node 1 CPUs: 4-7
>   [    0.024444]      the CACHE domain not a subset of the MC domain
>   [    0.024505] BUG: arch topology borken
>   [    0.024527]      the SMT domain not a subset of the CACHE domain
>   [    0.024563] BUG: arch topology borken
>   [    0.024584]      the CACHE domain not a subset of the MC domain
>   [    0.024645] BUG: arch topology borken
>   [    0.024666]      the SMT domain not a subset of the CACHE domain
>   [    0.024702] BUG: arch topology borken
>   [    0.024723]      the CACHE domain not a subset of the MC domain
> 
> That's the p9 mambo model, using skiboot.tcl from skiboot, with CPUS=2,
> THREADS=4 and MAMBO_NUMA=1.
> 

I was able to reproduce with
 qemu-system-ppc64 -nographic -vga none -M pseries -cpu POWER8 \
   -kernel build~/vmlinux \
   -m 2G,slots=2,maxmem=4G \
   -object memory-backend-ram,size=1G,id=m0 \
   -object memory-backend-ram,size=1G,id=m1 \
   -numa node,nodeid=0,memdev=m0 \
   -numa node,nodeid=1,memdev=m1 \
   -smp 8,threads=4,sockets=2,maxcpus=8  \


If the CPU doesn't have a l2-cache element, then CPU not only has to set
itself in the cpu_l2_cache but also the siblings. Otherwise it will so
happen that the Siblings will have 4 Cpus set, and the Cache domain will
have just one cpu set, leading to this BUG message.

Patch follows this mail.

> Node layout is:
> 
> [    0.000000] Early memory node ranges
> [    0.000000]   node   0: [mem 0x0000000000000000-0x00000000ffffffff]
> [    0.000000]   node   1: [mem 0x0000200000000000-0x00002000ffffffff]
> [    0.000000] Initmem setup node 0 [mem 0x0000000000000000-0x00000000ffffffff]
> [    0.000000] On node 0 totalpages: 65536
> [    0.000000] Initmem setup node 1 [mem 0x0000200000000000-0x00002000ffffffff]
> [    0.000000] On node 1 totalpages: 65536
> 
> 
> There aren't any l2-cache properties in the device-tree under cpus.
> 
> I'll try and have a closer look tonight.
> 
> cheers

-- 
Thanks and Regards
Srikar Dronamraju

^ permalink raw reply

* Re: [PATCH v5 05/10] powerpc/smp: Dont assume l2-cache to be superset of sibling
From: Srikar Dronamraju @ 2020-09-13 17:10 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: Nathan Lynch, Gautham R Shenoy, Michael Neuling, Peter Zijlstra,
	Jordan Niethe, LKML, Nicholas Piggin, Valentin Schneider,
	Oliver O'Halloran, linuxppc-dev, Ingo Molnar
In-Reply-To: <87imciqwhq.fsf@mpe.ellerman.id.au>

Fix to make it work where CPUs dont have a l2-cache element.

------------>8-----------------------------------------8<---------------------

From b25d47b01b7195b1df19083a4043fa6a87a901a3 Mon Sep 17 00:00:00 2001
From: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Date: Thu, 9 Jul 2020 13:33:38 +0530
Subject: [PATCH v5.2 05/10] powerpc/smp: Dont assume l2-cache to be superset of
 sibling

Current code assumes that cpumask of cpus sharing a l2-cache mask will
always be a superset of cpu_sibling_mask.

Lets stop that assumption. cpu_l2_cache_mask is a superset of
cpu_sibling_mask if and only if shared_caches is set.

Cc: linuxppc-dev <linuxppc-dev@lists.ozlabs.org>
Cc: LKML <linux-kernel@vger.kernel.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Anton Blanchard <anton@ozlabs.org>
Cc: Oliver O'Halloran <oohall@gmail.com>
Cc: Nathan Lynch <nathanl@linux.ibm.com>
Cc: Michael Neuling <mikey@neuling.org>
Cc: Gautham R Shenoy <ego@linux.vnet.ibm.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Valentin Schneider <valentin.schneider@arm.com>
Cc: Jordan Niethe <jniethe5@gmail.com>
Cc: Vaidyanathan Srinivasan <svaidy@linux.ibm.com>
Reviewed-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>
Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
---
Changelog v1 -> v2:
	Set cpumask after verifying l2-cache. (Gautham)

Changelog v5 -> v5.2:
	If cpu has no l2-cache set cpumask as per its
                 sibling mask. (Michael Ellerman)

 arch/powerpc/kernel/smp.c | 43 +++++++++++++++++++++++++++++--------------
 1 file changed, 29 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 9f4333d..168532e 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1186,9 +1186,23 @@ static bool update_mask_by_l2(int cpu, struct cpumask *(*mask_fn)(int))
 	int i;
 
 	l2_cache = cpu_to_l2cache(cpu);
-	if (!l2_cache)
+	if (!l2_cache) {
+		struct cpumask *(*sibling_mask)(int) = cpu_sibling_mask;
+
+		/*
+		 * If no l2cache for this CPU, assume all siblings to share
+		 * cache with this CPU.
+		 */
+		if (has_big_cores)
+			sibling_mask = cpu_smallcore_mask;
+
+		for_each_cpu(i, sibling_mask(cpu))
+			set_cpus_related(cpu, i, cpu_l2_cache_mask);
+
 		return false;
+	}
 
+	cpumask_set_cpu(cpu, mask_fn(cpu));
 	for_each_cpu(i, cpu_online_mask) {
 		/*
 		 * when updating the marks the current CPU has not been marked
@@ -1271,29 +1285,30 @@ static void add_cpu_to_masks(int cpu)
 	 * add it to it's own thread sibling mask.
 	 */
 	cpumask_set_cpu(cpu, cpu_sibling_mask(cpu));
+	cpumask_set_cpu(cpu, cpu_core_mask(cpu));
 
 	for (i = first_thread; i < first_thread + threads_per_core; i++)
 		if (cpu_online(i))
 			set_cpus_related(i, cpu, cpu_sibling_mask);
 
 	add_cpu_to_smallcore_masks(cpu);
-	/*
-	 * Copy the thread sibling mask into the cache sibling mask
-	 * and mark any CPUs that share an L2 with this CPU.
-	 */
-	for_each_cpu(i, cpu_sibling_mask(cpu))
-		set_cpus_related(cpu, i, cpu_l2_cache_mask);
 	update_mask_by_l2(cpu, cpu_l2_cache_mask);
 
-	/*
-	 * Copy the cache sibling mask into core sibling mask and mark
-	 * any CPUs on the same chip as this CPU.
-	 */
-	for_each_cpu(i, cpu_l2_cache_mask(cpu))
-		set_cpus_related(cpu, i, cpu_core_mask);
+	if (pkg_id == -1) {
+		struct cpumask *(*mask)(int) = cpu_sibling_mask;
+
+		/*
+		 * Copy the sibling mask into core sibling mask and
+		 * mark any CPUs on the same chip as this CPU.
+		 */
+		if (shared_caches)
+			mask = cpu_l2_cache_mask;
+
+		for_each_cpu(i, mask(cpu))
+			set_cpus_related(cpu, i, cpu_core_mask);
 
-	if (pkg_id == -1)
 		return;
+	}
 
 	for_each_cpu(i, cpu_online_mask)
 		if (get_physical_package_id(i) == pkg_id)
-- 
2.17.1


^ permalink raw reply related

* RE: [PATCHv7 04/12] PCI: designware-ep: Modify MSI and MSIX CAP way of finding
From: Z.q. Hou @ 2020-09-13 17:24 UTC (permalink / raw)
  To: Rob Herring
  Cc: devicetree@vger.kernel.org, lorenzo.pieralisi@arm.com,
	Xiaowei Bao, Roy Zang, linux-pci@vger.kernel.org,
	linux-kernel@vger.kernel.org, Leo Li, M.h. Lian,
	jingoohan1@gmail.com, andrew.murray@arm.com, Mingkai Hu,
	gustavo.pimentel@synopsys.com, bhelgaas@google.com,
	shawnguo@kernel.org, kishon@ti.com, linuxppc-dev@lists.ozlabs.org,
	linux-arm-kernel@lists.infradead.org
In-Reply-To: <20200910181009.GB592152@bogus>

Hi Rob,

Thanks a lot for your comments!

> -----Original Message-----
> From: Rob Herring <robh@kernel.org>
> Sent: 2020年9月11日 2:10
> To: Z.q. Hou <zhiqiang.hou@nxp.com>
> Cc: linux-pci@vger.kernel.org; devicetree@vger.kernel.org;
> linux-kernel@vger.kernel.org; linux-arm-kernel@lists.infradead.org;
> linuxppc-dev@lists.ozlabs.org; bhelgaas@google.com;
> lorenzo.pieralisi@arm.com; shawnguo@kernel.org; Leo Li
> <leoyang.li@nxp.com>; kishon@ti.com; gustavo.pimentel@synopsys.com;
> Roy Zang <roy.zang@nxp.com>; jingoohan1@gmail.com;
> andrew.murray@arm.com; Mingkai Hu <mingkai.hu@nxp.com>; M.h. Lian
> <minghuan.lian@nxp.com>; Xiaowei Bao <xiaowei.bao@nxp.com>
> Subject: Re: [PATCHv7 04/12] PCI: designware-ep: Modify MSI and MSIX CAP
> way of finding
> 
> On Tue, Aug 11, 2020 at 05:54:33PM +0800, Zhiqiang Hou wrote:
> > From: Xiaowei Bao <xiaowei.bao@nxp.com>
> >
> > Each PF of EP device should have its own MSI or MSIX capabitily
> > struct, so create a dw_pcie_ep_func struct and move the msi_cap and
> > msix_cap to this struct from dw_pcie_ep, and manage the PFs via a
> > list.
> >
> > Signed-off-by: Xiaowei Bao <xiaowei.bao@nxp.com>
> > Signed-off-by: Hou Zhiqiang <Zhiqiang.Hou@nxp.com>
> > ---
> > V7:
> >  - Rebase the patch without functionality change.
> >
> >  .../pci/controller/dwc/pcie-designware-ep.c   | 139
> +++++++++++++++---
> >  drivers/pci/controller/dwc/pcie-designware.h  |  18 ++-
> >  2 files changed, 136 insertions(+), 21 deletions(-)
> >
> > diff --git a/drivers/pci/controller/dwc/pcie-designware-ep.c
> > b/drivers/pci/controller/dwc/pcie-designware-ep.c
> > index 56bd1cd71f16..4680a51c49c0 100644
> > --- a/drivers/pci/controller/dwc/pcie-designware-ep.c
> > +++ b/drivers/pci/controller/dwc/pcie-designware-ep.c
> > @@ -28,6 +28,19 @@ void dw_pcie_ep_init_notify(struct dw_pcie_ep
> *ep)
> > }  EXPORT_SYMBOL_GPL(dw_pcie_ep_init_notify);
> >
> > +struct dw_pcie_ep_func *
> > +dw_pcie_ep_get_func_from_ep(struct dw_pcie_ep *ep, u8 func_no) {
> > +	struct dw_pcie_ep_func *ep_func;
> > +
> > +	list_for_each_entry(ep_func, &ep->func_list, list) {
> > +		if (ep_func->func_no == func_no)
> > +			return ep_func;
> > +	}
> > +
> > +	return NULL;
> > +}
> > +
> >  static unsigned int dw_pcie_ep_func_select(struct dw_pcie_ep *ep, u8
> > func_no)  {
> >  	unsigned int func_offset = 0;
> > @@ -68,6 +81,47 @@ void dw_pcie_ep_reset_bar(struct dw_pcie *pci,
> enum pci_barno bar)
> >  		__dw_pcie_ep_reset_bar(pci, func_no, bar, 0);  }
> >
> > +static u8 __dw_pcie_ep_find_next_cap(struct dw_pcie_ep *ep, u8
> func_no,
> > +		u8 cap_ptr, u8 cap)
> > +{
> > +	struct dw_pcie *pci = to_dw_pcie_from_ep(ep);
> > +	unsigned int func_offset = 0;
> > +	u8 cap_id, next_cap_ptr;
> > +	u16 reg;
> > +
> > +	if (!cap_ptr)
> > +		return 0;
> > +
> > +	func_offset = dw_pcie_ep_func_select(ep, func_no);
> > +
> > +	reg = dw_pcie_readw_dbi(pci, func_offset + cap_ptr);
> > +	cap_id = (reg & 0x00ff);
> > +
> > +	if (cap_id > PCI_CAP_ID_MAX)
> > +		return 0;
> > +
> > +	if (cap_id == cap)
> > +		return cap_ptr;
> > +
> > +	next_cap_ptr = (reg & 0xff00) >> 8;
> > +	return __dw_pcie_ep_find_next_cap(ep, func_no, next_cap_ptr, cap); }
> > +
> > +static u8 dw_pcie_ep_find_capability(struct dw_pcie_ep *ep, u8
> > +func_no, u8 cap) {
> > +	struct dw_pcie *pci = to_dw_pcie_from_ep(ep);
> > +	unsigned int func_offset = 0;
> > +	u8 next_cap_ptr;
> > +	u16 reg;
> > +
> > +	func_offset = dw_pcie_ep_func_select(ep, func_no);
> > +
> > +	reg = dw_pcie_readw_dbi(pci, func_offset + PCI_CAPABILITY_LIST);
> > +	next_cap_ptr = (reg & 0x00ff);
> > +
> > +	return __dw_pcie_ep_find_next_cap(ep, func_no, next_cap_ptr, cap); }
> 
> These are almost the same as __dw_pcie_find_next_cap and
> dw_pcie_find_capability. Please modify them to take a function offset and
> work for both host and endpoints.

Yes, will rework in next version.

> 
> > +
> >  static int dw_pcie_ep_write_header(struct pci_epc *epc, u8 func_no,
> >  				   struct pci_epf_header *hdr)
> >  {
> > @@ -257,13 +311,18 @@ static int dw_pcie_ep_get_msi(struct pci_epc
> *epc, u8 func_no)
> >  	struct dw_pcie *pci = to_dw_pcie_from_ep(ep);
> >  	u32 val, reg;
> >  	unsigned int func_offset = 0;
> > +	struct dw_pcie_ep_func *ep_func;
> >
> > -	if (!ep->msi_cap)
> > +	ep_func = dw_pcie_ep_get_func_from_ep(ep, func_no);
> > +	if (!ep_func)
> > +		return -EINVAL;
> > +
> > +	if (!ep_func->msi_cap)
> >  		return -EINVAL;
> >
> >  	func_offset = dw_pcie_ep_func_select(ep, func_no);
> >
> > -	reg = ep->msi_cap + func_offset + PCI_MSI_FLAGS;
> > +	reg = ep_func->msi_cap + func_offset + PCI_MSI_FLAGS;
> >  	val = dw_pcie_readw_dbi(pci, reg);
> >  	if (!(val & PCI_MSI_FLAGS_ENABLE))
> >  		return -EINVAL;
> > @@ -279,13 +338,18 @@ static int dw_pcie_ep_set_msi(struct pci_epc
> *epc, u8 func_no, u8 interrupts)
> >  	struct dw_pcie *pci = to_dw_pcie_from_ep(ep);
> >  	u32 val, reg;
> >  	unsigned int func_offset = 0;
> > +	struct dw_pcie_ep_func *ep_func;
> > +
> > +	ep_func = dw_pcie_ep_get_func_from_ep(ep, func_no);
> > +	if (!ep_func)
> > +		return -EINVAL;
> >
> > -	if (!ep->msi_cap)
> > +	if (!ep_func->msi_cap)
> >  		return -EINVAL;
> >
> >  	func_offset = dw_pcie_ep_func_select(ep, func_no);
> >
> > -	reg = ep->msi_cap + func_offset + PCI_MSI_FLAGS;
> > +	reg = ep_func->msi_cap + func_offset + PCI_MSI_FLAGS;
> 
> If msi_cap is now per function, then shouldn't it already include
> 'func_offset'?

Good suggestion, will rework in next version.

Regards,
Zhiqiang

> 
> >  	val = dw_pcie_readw_dbi(pci, reg);
> >  	val &= ~PCI_MSI_FLAGS_QMASK;
> >  	val |= (interrupts << 1) & PCI_MSI_FLAGS_QMASK; @@ -302,13 +366,18
> > @@ static int dw_pcie_ep_get_msix(struct pci_epc *epc, u8 func_no)
> >  	struct dw_pcie *pci = to_dw_pcie_from_ep(ep);
> >  	u32 val, reg;
> >  	unsigned int func_offset = 0;
> > +	struct dw_pcie_ep_func *ep_func;
> > +
> > +	ep_func = dw_pcie_ep_get_func_from_ep(ep, func_no);
> > +	if (!ep_func)
> > +		return -EINVAL;
> >
> > -	if (!ep->msix_cap)
> > +	if (!ep_func->msix_cap)
> >  		return -EINVAL;
> >
> >  	func_offset = dw_pcie_ep_func_select(ep, func_no);
> >
> > -	reg = ep->msix_cap + func_offset + PCI_MSIX_FLAGS;
> > +	reg = ep_func->msix_cap + func_offset + PCI_MSIX_FLAGS;
> >  	val = dw_pcie_readw_dbi(pci, reg);
> >  	if (!(val & PCI_MSIX_FLAGS_ENABLE))
> >  		return -EINVAL;
> > @@ -325,25 +394,30 @@ static int dw_pcie_ep_set_msix(struct pci_epc
> *epc, u8 func_no, u16 interrupts,
> >  	struct dw_pcie *pci = to_dw_pcie_from_ep(ep);
> >  	u32 val, reg;
> >  	unsigned int func_offset = 0;
> > +	struct dw_pcie_ep_func *ep_func;
> >
> > -	if (!ep->msix_cap)
> > +	ep_func = dw_pcie_ep_get_func_from_ep(ep, func_no);
> > +	if (!ep_func)
> > +		return -EINVAL;
> > +
> > +	if (!ep_func->msix_cap)
> >  		return -EINVAL;
> >
> >  	dw_pcie_dbi_ro_wr_en(pci);
> >
> >  	func_offset = dw_pcie_ep_func_select(ep, func_no);
> >
> > -	reg = ep->msix_cap + func_offset + PCI_MSIX_FLAGS;
> > +	reg = ep_func->msix_cap + func_offset + PCI_MSIX_FLAGS;
> >  	val = dw_pcie_readw_dbi(pci, reg);
> >  	val &= ~PCI_MSIX_FLAGS_QSIZE;
> >  	val |= interrupts;
> >  	dw_pcie_writew_dbi(pci, reg, val);
> >
> > -	reg = ep->msix_cap + func_offset + PCI_MSIX_TABLE;
> > +	reg = ep_func->msix_cap + func_offset + PCI_MSIX_TABLE;
> >  	val = offset | bir;
> >  	dw_pcie_writel_dbi(pci, reg, val);
> >
> > -	reg = ep->msix_cap + func_offset + PCI_MSIX_PBA;
> > +	reg = ep_func->msix_cap + func_offset + PCI_MSIX_PBA;
> >  	val = (offset + (interrupts * PCI_MSIX_ENTRY_SIZE)) | bir;
> >  	dw_pcie_writel_dbi(pci, reg, val);
> >
> > @@ -426,6 +500,7 @@ int dw_pcie_ep_raise_msi_irq(struct dw_pcie_ep
> *ep, u8 func_no,
> >  			     u8 interrupt_num)
> >  {
> >  	struct dw_pcie *pci = to_dw_pcie_from_ep(ep);
> > +	struct dw_pcie_ep_func *ep_func;
> >  	struct pci_epc *epc = ep->epc;
> >  	unsigned int aligned_offset;
> >  	unsigned int func_offset = 0;
> > @@ -435,25 +510,29 @@ int dw_pcie_ep_raise_msi_irq(struct
> dw_pcie_ep *ep, u8 func_no,
> >  	bool has_upper;
> >  	int ret;
> >
> > -	if (!ep->msi_cap)
> > +	ep_func = dw_pcie_ep_get_func_from_ep(ep, func_no);
> > +	if (!ep_func)
> > +		return -EINVAL;
> > +
> > +	if (!ep_func->msi_cap)
> >  		return -EINVAL;
> >
> >  	func_offset = dw_pcie_ep_func_select(ep, func_no);
> >
> >  	/* Raise MSI per the PCI Local Bus Specification Revision 3.0, 6.8.1. */
> > -	reg = ep->msi_cap + func_offset + PCI_MSI_FLAGS;
> > +	reg = ep_func->msi_cap + func_offset + PCI_MSI_FLAGS;
> >  	msg_ctrl = dw_pcie_readw_dbi(pci, reg);
> >  	has_upper = !!(msg_ctrl & PCI_MSI_FLAGS_64BIT);
> > -	reg = ep->msi_cap + func_offset + PCI_MSI_ADDRESS_LO;
> > +	reg = ep_func->msi_cap + func_offset + PCI_MSI_ADDRESS_LO;
> >  	msg_addr_lower = dw_pcie_readl_dbi(pci, reg);
> >  	if (has_upper) {
> > -		reg = ep->msi_cap + func_offset + PCI_MSI_ADDRESS_HI;
> > +		reg = ep_func->msi_cap + func_offset + PCI_MSI_ADDRESS_HI;
> >  		msg_addr_upper = dw_pcie_readl_dbi(pci, reg);
> > -		reg = ep->msi_cap + func_offset + PCI_MSI_DATA_64;
> > +		reg = ep_func->msi_cap + func_offset + PCI_MSI_DATA_64;
> >  		msg_data = dw_pcie_readw_dbi(pci, reg);
> >  	} else {
> >  		msg_addr_upper = 0;
> > -		reg = ep->msi_cap + func_offset + PCI_MSI_DATA_32;
> > +		reg = ep_func->msi_cap + func_offset + PCI_MSI_DATA_32;
> >  		msg_data = dw_pcie_readw_dbi(pci, reg);
> >  	}
> >  	aligned_offset = msg_addr_lower & (epc->mem->window.page_size -
> 1);
> > @@ -489,6 +568,7 @@ int dw_pcie_ep_raise_msix_irq(struct dw_pcie_ep
> *ep, u8 func_no,
> >  			      u16 interrupt_num)
> >  {
> >  	struct dw_pcie *pci = to_dw_pcie_from_ep(ep);
> > +	struct dw_pcie_ep_func *ep_func;
> >  	struct pci_epf_msix_tbl *msix_tbl;
> >  	struct pci_epc *epc = ep->epc;
> >  	unsigned int func_offset = 0;
> > @@ -499,9 +579,16 @@ int dw_pcie_ep_raise_msix_irq(struct
> dw_pcie_ep *ep, u8 func_no,
> >  	int ret;
> >  	u8 bir;
> >
> > +	ep_func = dw_pcie_ep_get_func_from_ep(ep, func_no);
> > +	if (!ep_func)
> > +		return -EINVAL;
> > +
> > +	if (!ep_func->msix_cap)
> > +		return -EINVAL;
> > +
> >  	func_offset = dw_pcie_ep_func_select(ep, func_no);
> >
> > -	reg = ep->msix_cap + func_offset + PCI_MSIX_TABLE;
> > +	reg = ep_func->msix_cap + func_offset + PCI_MSIX_TABLE;
> >  	tbl_offset = dw_pcie_readl_dbi(pci, reg);
> >  	bir = (tbl_offset & PCI_MSIX_TABLE_BIR);
> >  	tbl_offset &= PCI_MSIX_TABLE_OFFSET; @@ -596,11 +683,15 @@ int
> > dw_pcie_ep_init(struct dw_pcie_ep *ep)  {
> >  	int ret;
> >  	void *addr;
> > +	u8 func_no;
> >  	struct pci_epc *epc;
> >  	struct dw_pcie *pci = to_dw_pcie_from_ep(ep);
> >  	struct device *dev = pci->dev;
> >  	struct device_node *np = dev->of_node;
> >  	const struct pci_epc_features *epc_features;
> > +	struct dw_pcie_ep_func *ep_func;
> > +
> > +	INIT_LIST_HEAD(&ep->func_list);
> >
> >  	if (!pci->dbi_base || !pci->dbi_base2) {
> >  		dev_err(dev, "dbi_base/dbi_base2 is not populated\n"); @@ -660,9
> > +751,19 @@ int dw_pcie_ep_init(struct dw_pcie_ep *ep)
> >  	if (ret < 0)
> >  		epc->max_functions = 1;
> >
> > -	ep->msi_cap = dw_pcie_find_capability(pci, PCI_CAP_ID_MSI);
> > +	for (func_no = 0; func_no < epc->max_functions; func_no++) {
> > +		ep_func = devm_kzalloc(dev, sizeof(*ep_func), GFP_KERNEL);
> > +		if (!ep_func)
> > +			return -ENOMEM;
> >
> > -	ep->msix_cap = dw_pcie_find_capability(pci, PCI_CAP_ID_MSIX);
> > +		ep_func->func_no = func_no;
> > +		ep_func->msi_cap = dw_pcie_ep_find_capability(ep, func_no,
> > +							      PCI_CAP_ID_MSI);
> > +		ep_func->msix_cap = dw_pcie_ep_find_capability(ep, func_no,
> > +							       PCI_CAP_ID_MSIX);
> > +
> > +		list_add_tail(&ep_func->list, &ep->func_list);
> > +	}
> >
> >  	if (ep->ops->ep_init)
> >  		ep->ops->ep_init(ep);
> > diff --git a/drivers/pci/controller/dwc/pcie-designware.h
> > b/drivers/pci/controller/dwc/pcie-designware.h
> > index 745b4938225a..19c4ba486239 100644
> > --- a/drivers/pci/controller/dwc/pcie-designware.h
> > +++ b/drivers/pci/controller/dwc/pcie-designware.h
> > @@ -230,8 +230,16 @@ struct dw_pcie_ep_ops {
> >  	unsigned int (*func_conf_select)(struct dw_pcie_ep *ep, u8 func_no);
> > };
> >
> > +struct dw_pcie_ep_func {
> > +	struct list_head	list;
> > +	u8			func_no;
> > +	u8			msi_cap;	/* MSI capability offset */
> > +	u8			msix_cap;	/* MSI-X capability offset */
> > +};
> > +
> >  struct dw_pcie_ep {
> >  	struct pci_epc		*epc;
> > +	struct list_head	func_list;
> >  	const struct dw_pcie_ep_ops *ops;
> >  	phys_addr_t		phys_base;
> >  	size_t			addr_size;
> > @@ -244,8 +252,6 @@ struct dw_pcie_ep {
> >  	u32			num_ob_windows;
> >  	void __iomem		*msi_mem;
> >  	phys_addr_t		msi_mem_phys;
> > -	u8			msi_cap;	/* MSI capability offset */
> > -	u8			msix_cap;	/* MSI-X capability offset */
> >  	struct pci_epf_bar	*epf_bar[PCI_STD_NUM_BARS];
> >  };
> >
> > @@ -440,6 +446,8 @@ int dw_pcie_ep_raise_msix_irq(struct dw_pcie_ep
> > *ep, u8 func_no,  int dw_pcie_ep_raise_msix_irq_doorbell(struct
> dw_pcie_ep *ep, u8 func_no,
> >  				       u16 interrupt_num);
> >  void dw_pcie_ep_reset_bar(struct dw_pcie *pci, enum pci_barno bar);
> > +struct dw_pcie_ep_func *
> > +dw_pcie_ep_get_func_from_ep(struct dw_pcie_ep *ep, u8 func_no);
> >  #else
> >  static inline void dw_pcie_ep_linkup(struct dw_pcie_ep *ep)  { @@
> > -490,5 +498,11 @@ static inline int
> > dw_pcie_ep_raise_msix_irq_doorbell(struct dw_pcie_ep *ep,  static
> > inline void dw_pcie_ep_reset_bar(struct dw_pcie *pci, enum pci_barno
> > bar)  {  }
> > +
> > +static inline struct dw_pcie_ep_func *
> > +dw_pcie_ep_get_func_from_ep(struct dw_pcie_ep *ep, u8 func_no) {
> > +	return NULL;
> > +}
> >  #endif
> >  #endif /* _PCIE_DESIGNWARE_H */
> > --
> > 2.17.1
> >

^ permalink raw reply

* RE: [PATCHv7 12/12] misc: pci_endpoint_test: Add driver data for Layerscape PCIe controllers
From: Z.q. Hou @ 2020-09-13 17:24 UTC (permalink / raw)
  To: Rob Herring
  Cc: Roy Zang, lorenzo.pieralisi@arm.com, jingoohan1@gmail.com,
	linuxppc-dev@lists.ozlabs.org, linux-kernel@vger.kernel.org,
	Leo Li, kishon@ti.com, M.h. Lian, devicetree@vger.kernel.org,
	robh+dt@kernel.org, Mingkai Hu, linux-pci@vger.kernel.org,
	bhelgaas@google.com, andrew.murray@arm.com,
	gustavo.pimentel@synopsys.com, shawnguo@kernel.org,
	linux-arm-kernel@lists.infradead.org
In-Reply-To: <20200910181756.GA622331@bogus>

Hi Rob,

Thanks a lot for your review and ack!

Regards,
Zhiqiang

> -----Original Message-----
> From: Rob Herring <robh@kernel.org>
> Sent: 2020年9月11日 2:18
> To: Z.q. Hou <zhiqiang.hou@nxp.com>
> Cc: bhelgaas@google.com; shawnguo@kernel.org; M.h. Lian
> <minghuan.lian@nxp.com>; Leo Li <leoyang.li@nxp.com>;
> linuxppc-dev@lists.ozlabs.org; robh+dt@kernel.org;
> linux-arm-kernel@lists.infradead.org; Roy Zang <roy.zang@nxp.com>;
> andrew.murray@arm.com; linux-pci@vger.kernel.org;
> lorenzo.pieralisi@arm.com; gustavo.pimentel@synopsys.com; Mingkai Hu
> <mingkai.hu@nxp.com>; linux-kernel@vger.kernel.org;
> jingoohan1@gmail.com; kishon@ti.com; devicetree@vger.kernel.org
> Subject: Re: [PATCHv7 12/12] misc: pci_endpoint_test: Add driver data for
> Layerscape PCIe controllers
> 
> On Tue, 11 Aug 2020 17:54:41 +0800, Zhiqiang Hou wrote:
> > From: Hou Zhiqiang <Zhiqiang.Hou@nxp.com>
> >
> > The commit 0a121f9bc3f5 ("misc: pci_endpoint_test: Use streaming DMA
> > APIs for buffer allocation") changed to use streaming DMA APIs,
> > however,
> > dma_map_single() might not return a 4KB aligned address, so add the
> > default_data as driver data for Layerscape PCIe controllers to make it
> > 4KB aligned.
> >
> > Signed-off-by: Hou Zhiqiang <Zhiqiang.Hou@nxp.com>
> > ---
> > V7:
> >  - New patch.
> >
> >  drivers/misc/pci_endpoint_test.c | 8 ++++++--
> >  1 file changed, 6 insertions(+), 2 deletions(-)
> >
> 
> Acked-by: Rob Herring <robh@kernel.org>

^ permalink raw reply

* Re: [PATCH v5 05/10] powerpc/smp: Dont assume l2-cache to be superset of sibling
From: Satheesh Rajendran @ 2020-09-13 17:42 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: Nathan Lynch, Gautham R Shenoy, Michael Neuling,
	Srikar Dronamraju, Peter Zijlstra, Jordan Niethe, LKML,
	Nicholas Piggin, Ingo Molnar, Oliver O'Halloran, linuxppc-dev,
	Valentin Schneider
In-Reply-To: <87y2lgr0ic.fsf@mpe.ellerman.id.au>

On Fri, Sep 11, 2020 at 09:55:23PM +1000, Michael Ellerman wrote:
> Srikar Dronamraju <srikar@linux.vnet.ibm.com> writes:
> > Current code assumes that cpumask of cpus sharing a l2-cache mask will
> > always be a superset of cpu_sibling_mask.
> >
> > Lets stop that assumption. cpu_l2_cache_mask is a superset of
> > cpu_sibling_mask if and only if shared_caches is set.
> 
> I'm seeing oopses with this:
> 
> [    0.117392][    T1] smp: Bringing up secondary CPUs ...
> [    0.156515][    T1] smp: Brought up 2 nodes, 2 CPUs
> [    0.158265][    T1] numa: Node 0 CPUs: 0
> [    0.158520][    T1] numa: Node 1 CPUs: 1
> [    0.167453][    T1] BUG: Unable to handle kernel data access on read at 0x8000000041228298
> [    0.167992][    T1] Faulting instruction address: 0xc00000000018c128
> [    0.168817][    T1] Oops: Kernel access of bad area, sig: 11 [#1]
> [    0.168964][    T1] LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries
> [    0.169417][    T1] Modules linked in:
> [    0.170047][    T1] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.9.0-rc2-00095-g7430ad5aa700 #209
> [    0.170305][    T1] NIP:  c00000000018c128 LR: c00000000018c0cc CTR: c00000000004dce0
> [    0.170498][    T1] REGS: c00000007e343880 TRAP: 0380   Not tainted  (5.9.0-rc2-00095-g7430ad5aa700)
> [    0.170602][    T1] MSR:  8000000002009033 <SF,VEC,EE,ME,IR,DR,RI,LE>  CR: 44002222  XER: 00000000
> [    0.170985][    T1] CFAR: c00000000018c288 IRQMASK: 0
> [    0.170985][    T1] GPR00: 0000000000000000 c00000007e343b10 c00000000173e400 0000000000004000
> [    0.170985][    T1] GPR04: 0000000000000000 0000000000000800 0000000000000800 0000000000000000
> [    0.170985][    T1] GPR08: 0000000000000000 c00000000122c298 c00000003fffc000 c00000007fd05ce8
> [    0.170985][    T1] GPR12: c00000007e0119f8 c000000001930000 00000000ffff8ade 0000000000000000
> [    0.170985][    T1] GPR16: c00000007e3c0640 0000000000000917 c00000007e3c0658 0000000000000008
> [    0.170985][    T1] GPR20: c0000000015d0bb8 00000000ffff8ade c000000000f57400 c000000001817c28
> [    0.170985][    T1] GPR24: c00000000176dc80 c00000007e3c0890 c00000007e3cfe00 0000000000000000
> [    0.170985][    T1] GPR28: c000000001772310 c00000007e011900 c00000007e3c0800 0000000000000001
> [    0.172750][    T1] NIP [c00000000018c128] build_sched_domains+0x808/0x14b0
> [    0.172900][    T1] LR [c00000000018c0cc] build_sched_domains+0x7ac/0x14b0
> [    0.173186][    T1] Call Trace:
> [    0.173484][    T1] [c00000007e343b10] [c00000000018bfe8] build_sched_domains+0x6c8/0x14b0 (unreliable)
> [    0.173821][    T1] [c00000007e343c50] [c00000000018dcdc] sched_init_domains+0xec/0x130
> [    0.174037][    T1] [c00000007e343ca0] [c0000000010d59d8] sched_init_smp+0x50/0xc4
> [    0.174207][    T1] [c00000007e343cd0] [c0000000010b45c4] kernel_init_freeable+0x1b4/0x378
> [    0.174378][    T1] [c00000007e343db0] [c0000000000129fc] kernel_init+0x24/0x158
> [    0.174740][    T1] [c00000007e343e20] [c00000000000d9d0] ret_from_kernel_thread+0x5c/0x6c
> [    0.175050][    T1] Instruction dump:
> [    0.175626][    T1] 554905ee 71480040 7d2907b4 4182016c 2c290000 3920006e 913e002c 41820034
> [    0.175841][    T1] 7c6307b4 e9300020 78631f24 7d58182a <7d2a482a> f93e0080 7d404828 314a0001
> [    0.178340][    T1] ---[ end trace 6876b88dd1d4b3bb ]---
> [    0.178512][    T1]
> [    1.180458][    T1] Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b
> 
> That's qemu:
> 
> qemu-system-ppc64 -nographic -vga none -M pseries -cpu POWER8 \
>   -kernel build~/vmlinux \
>   -m 2G,slots=2,maxmem=4G \
>   -object memory-backend-ram,size=1G,id=m0 \
>   -object memory-backend-ram,size=1G,id=m1 \
>   -numa node,nodeid=0,memdev=m0 \
>   -numa node,nodeid=1,memdev=m1 \
>   -smp 2,sockets=2,maxcpus=2  \

PowerKVM guest vCPUs does not yet have L2 and L3 cache elements
I had this bug raised some time ago, probably related?
https://bugs.launchpad.net/qemu/+bug/1774605


Regards,
-Satheesh.
> 
> 
> On mambo I get:
> 
> [    0.005069][    T1] smp: Bringing up secondary CPUs ...
> [    0.011656][    T1] smp: Brought up 2 nodes, 8 CPUs
> [    0.011682][    T1] numa: Node 0 CPUs: 0-3
> [    0.011709][    T1] numa: Node 1 CPUs: 4-7
> [    0.012015][    T1] BUG: arch topology borken
> [    0.012040][    T1]      the SMT domain not a subset of the CACHE domain
> [    0.012107][    T1] BUG: Unable to handle kernel data access on read at 0x80000001012e7398
> [    0.012142][    T1] Faulting instruction address: 0xc0000000001aa4f0
> [    0.012174][    T1] Oops: Kernel access of bad area, sig: 11 [#1]
> [    0.012206][    T1] LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA PowerNV
> [    0.012236][    T1] Modules linked in:
> [    0.012264][    T1] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.9.0-rc2-00095-g7430ad5aa700 #1
> [    0.012304][    T1] NIP:  c0000000001aa4f0 LR: c0000000001aa498 CTR: 0000000000000000
> [    0.012341][    T1] REGS: c0000000ef583880 TRAP: 0380   Not tainted  (5.9.0-rc2-00095-g7430ad5aa700)
> [    0.012379][    T1] MSR:  9000000002009033 <SF,HV,VEC,EE,ME,IR,DR,RI,LE>  CR: 44002222  XER: 00040000
> [    0.012439][    T1] CFAR: c0000000000101b0 IRQMASK: 0
> [    0.012439][    T1] GPR00: 0000000000000000 c0000000ef583b10 c0000000017fd000 0000000000004000
> [    0.012439][    T1] GPR04: 0000000000000000 0000000000000800 0000000000000000 0000000000000000
> [    0.012439][    T1] GPR08: 0000000000000000 c0000000012eb398 c0000000ffffc000 0000000000000000
> [    0.012439][    T1] GPR12: 0000000000000020 c0000000019f0000 00000000ffff8ad1 0000000000000000
> [    0.012439][    T1] GPR16: c0000000ef068658 c0000000018d7ba8 0000000000000008 c000000001690bb8
> [    0.012439][    T1] GPR20: c00000000182dc80 c0000000ef06be90 00000000ffff8ad1 c000000001014aa8
> [    0.012439][    T1] GPR24: 0000000000000917 c0000000ef068e00 0000000000000000 c0000000ef06be00
> [    0.012439][    T1] GPR28: 0000000000000001 c0000000ef068640 c0000000ef4a1800 c000000001832310
> [    0.012774][    T1] NIP [c0000000001aa4f0] build_sched_domains+0x5c0/0x14f0
> [    0.012812][    T1] LR [c0000000001aa498] build_sched_domains+0x568/0x14f0
> [    0.012842][    T1] Call Trace:
> [    0.012872][    T1] [c0000000ef583b10] [c0000000001aa3b4] build_sched_domains+0x484/0x14f0 (unreliable)
> [    0.012922][    T1] [c0000000ef583c50] [c0000000001ac3d8] sched_init_domains+0xd8/0x120
> [    0.012966][    T1] [c0000000ef583ca0] [c0000000011962d0] sched_init_smp+0x50/0xc4
> [    0.013008][    T1] [c0000000ef583cd0] [c00000000117451c] kernel_init_freeable+0x1b4/0x378
> [    0.013051][    T1] [c0000000ef583db0] [c000000000012994] kernel_init+0x2c/0x158
> [    0.013092][    T1] [c0000000ef583e20] [c00000000000d9d0] ret_from_kernel_thread+0x5c/0x6c
> [    0.013128][    T1] Instruction dump:
> [    0.013151][    T1] e93b003a 712a0040 552a05ee 418203c4 2c2a0000 3920006e 913b002c 41820034
> [    0.013206][    T1] 7c6307b4 e93d0020 78631f24 7d54182a <7d2a482a> f93b0080 7d404828 314a0001
> [    0.013267][    T1] ---[ end trace 1bf5f6f38a9fd096 ]---
> 
> 
> Did I miss a lead-up patch?
> 
> See here for what I have applied:
>   https://github.com/linuxppc/linux/commits/next-test
> 
> cheers

^ permalink raw reply

* [PATCH] powerpc/papr_scm: Add PAPR command family to pass-through command-set
From: Vaibhav Jain @ 2020-09-13 21:19 UTC (permalink / raw)
  To: linuxppc-dev, linux-nvdimm
  Cc: Santosh Sivaraj, Oliver O'Halloran, Aneesh Kumar K . V,
	Vaibhav Jain, Dan Williams, Ira Weiny

Add NVDIMM_FAMILY_PAPR to the list of valid 'dimm_family_mask'
acceptable by papr_scm. This is needed as since commit
92fe2aa859f5 ("libnvdimm: Validate command family indices") libnvdimm
performs a validation of 'nd_cmd_pkg.nd_family' received as part of
ND_CMD_CALL processing to ensure only known command families can use
the general ND_CMD_CALL pass-through functionality.

Without this change the ND_CMD_CALL pass-through targeting
NVDIMM_FAMILY_PAPR error out with -EINVAL.

Fixes: 92fe2aa859f5 ("libnvdimm: Validate command family indices")
Signed-off-by: Vaibhav Jain <vaibhav@linux.ibm.com>
---
 arch/powerpc/platforms/pseries/papr_scm.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c
index 5493bc847bd08..27268370dee00 100644
--- a/arch/powerpc/platforms/pseries/papr_scm.c
+++ b/arch/powerpc/platforms/pseries/papr_scm.c
@@ -898,6 +898,9 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
 	p->bus_desc.of_node = p->pdev->dev.of_node;
 	p->bus_desc.provider_name = kstrdup(p->pdev->name, GFP_KERNEL);
 
+	/* Set the dimm command family mask to accept PDSMs */
+	set_bit(NVDIMM_FAMILY_PAPR, &p->bus_desc.dimm_family_mask);
+
 	if (!p->bus_desc.provider_name)
 		return -ENOMEM;
 
-- 
2.26.2


^ permalink raw reply related

* [PATCH] powerpc/papr_scm: Support dynamic enable/disable of performance statistics
From: Vaibhav Jain @ 2020-09-13 21:21 UTC (permalink / raw)
  To: linuxppc-dev, linux-nvdimm
  Cc: Santosh Sivaraj, Oliver O'Halloran, Aneesh Kumar K . V,
	Vaibhav Jain, Dan Williams, Ira Weiny

Collection of performance statistics of an NVDIMM can be dynamically
enabled/disabled from the Hypervisor Management Console even when the
guest lpar is running. The current implementation however will check if
the performance statistics collection is supported during NVDIMM probe
and if yes will assume that to be the case afterwards.

Hence we update papr_scm to remove this assumption from the code by
eliminating the 'stat_buffer_len' member from 'struct papr_scm_priv'
that was used to cache the max buffer size needed to fetch NVDIMM
performance stats from PHYP. With that struct member gone, various
functions that depended on it are updated. Specifically
perf_stats_show() is updated to query the PHYP first for the size of
buffer needed to hold all performance statistics instead of relying on
'stat_buffer_len'

Signed-off-by: Vaibhav Jain <vaibhav@linux.ibm.com>
---
 arch/powerpc/platforms/pseries/papr_scm.c | 53 +++++++++++------------
 1 file changed, 25 insertions(+), 28 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c
index 27268370dee00..6697e1c3b9ebe 100644
--- a/arch/powerpc/platforms/pseries/papr_scm.c
+++ b/arch/powerpc/platforms/pseries/papr_scm.c
@@ -112,9 +112,6 @@ struct papr_scm_priv {
 
 	/* Health information for the dimm */
 	u64 health_bitmap;
-
-	/* length of the stat buffer as expected by phyp */
-	size_t stat_buffer_len;
 };
 
 static LIST_HEAD(papr_nd_regions);
@@ -230,14 +227,15 @@ static int drc_pmem_query_n_bind(struct papr_scm_priv *p)
  * - If buff_stats == NULL the return value is the size in byes of the buffer
  * needed to hold all supported performance-statistics.
  * - If buff_stats != NULL and num_stats == 0 then we copy all known
- * performance-statistics to 'buff_stat' and expect to be large enough to
- * hold them.
+ * performance-statistics to 'buff_stat' and expect it to be large enough to
+ * hold them. The 'buff_size' args contains the size of the 'buff_stats'
  * - if buff_stats != NULL and num_stats > 0 then copy the requested
  * performance-statistics to buff_stats.
  */
 static ssize_t drc_pmem_query_stats(struct papr_scm_priv *p,
 				    struct papr_scm_perf_stats *buff_stats,
-				    unsigned int num_stats)
+				    unsigned int num_stats,
+				    size_t buff_size)
 {
 	unsigned long ret[PLPAR_HCALL_BUFSIZE];
 	size_t size;
@@ -261,12 +259,18 @@ static ssize_t drc_pmem_query_stats(struct papr_scm_priv *p,
 			size = sizeof(struct papr_scm_perf_stats) +
 				num_stats * sizeof(struct papr_scm_perf_stat);
 		else
-			size = p->stat_buffer_len;
+			size = buff_size;
 	} else {
 		/* In case of no out buffer ignore the size */
 		size = 0;
 	}
 
+	/* verify that the buffer size needed is sufficient */
+	if (size > buff_size) {
+		__WARN();
+		return -EINVAL;
+	}
+
 	/* Do the HCALL asking PHYP for info */
 	rc = plpar_hcall(H_SCM_PERFORMANCE_STATS, ret, p->drc_index,
 			 buff_stats ? virt_to_phys(buff_stats) : 0,
@@ -277,6 +281,10 @@ static ssize_t drc_pmem_query_stats(struct papr_scm_priv *p,
 		dev_err(&p->pdev->dev,
 			"Unknown performance stats, Err:0x%016lX\n", ret[0]);
 		return -ENOENT;
+	} else if (rc == H_AUTHORITY) {
+		dev_dbg(&p->pdev->dev,
+			"Performance stats in-accessible\n");
+		return -EPERM;
 	} else if (rc != H_SUCCESS) {
 		dev_err(&p->pdev->dev,
 			"Failed to query performance stats, Err:%lld\n", rc);
@@ -526,10 +534,6 @@ static int papr_pdsm_fuel_gauge(struct papr_scm_priv *p,
 	struct papr_scm_perf_stat *stat;
 	struct papr_scm_perf_stats *stats;
 
-	/* Silently fail if fetching performance metrics isn't  supported */
-	if (!p->stat_buffer_len)
-		return 0;
-
 	/* Allocate request buffer enough to hold single performance stat */
 	size = sizeof(struct papr_scm_perf_stats) +
 		sizeof(struct papr_scm_perf_stat);
@@ -543,9 +547,11 @@ static int papr_pdsm_fuel_gauge(struct papr_scm_priv *p,
 	stat->stat_val = 0;
 
 	/* Fetch the fuel gauge and populate it in payload */
-	rc = drc_pmem_query_stats(p, stats, 1);
+	rc = drc_pmem_query_stats(p, stats, 1, size);
 	if (rc < 0) {
 		dev_dbg(&p->pdev->dev, "Err(%d) fetching fuel gauge\n", rc);
+		/* Silently fail if unable to fetch performance metric */
+		rc = 0;
 		goto free_stats;
 	}
 
@@ -786,23 +792,25 @@ static ssize_t perf_stats_show(struct device *dev,
 			       struct device_attribute *attr, char *buf)
 {
 	int index;
-	ssize_t rc;
+	ssize_t rc, buff_len;
 	struct seq_buf s;
 	struct papr_scm_perf_stat *stat;
 	struct papr_scm_perf_stats *stats;
 	struct nvdimm *dimm = to_nvdimm(dev);
 	struct papr_scm_priv *p = nvdimm_provider_data(dimm);
 
-	if (!p->stat_buffer_len)
-		return -ENOENT;
+	/* fetch the length of buffer needed to get all stats */
+	buff_len = drc_pmem_query_stats(p, NULL, 0, 0);
+	if (buff_len <= 0)
+		return buff_len;
 
 	/* Allocate the buffer for phyp where stats are written */
-	stats = kzalloc(p->stat_buffer_len, GFP_KERNEL);
+	stats = kzalloc(buff_len, GFP_KERNEL);
 	if (!stats)
 		return -ENOMEM;
 
 	/* Ask phyp to return all dimm perf stats */
-	rc = drc_pmem_query_stats(p, stats, 0);
+	rc = drc_pmem_query_stats(p, stats, 0, buff_len);
 	if (rc)
 		goto free_stats;
 	/*
@@ -891,7 +899,6 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
 	struct nd_region_desc ndr_desc;
 	unsigned long dimm_flags;
 	int target_nid, online_nid;
-	ssize_t stat_size;
 
 	p->bus_desc.ndctl = papr_scm_ndctl;
 	p->bus_desc.module = THIS_MODULE;
@@ -962,16 +969,6 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
 	list_add_tail(&p->region_list, &papr_nd_regions);
 	mutex_unlock(&papr_ndr_lock);
 
-	/* Try retriving the stat buffer and see if its supported */
-	stat_size = drc_pmem_query_stats(p, NULL, 0);
-	if (stat_size > 0) {
-		p->stat_buffer_len = stat_size;
-		dev_dbg(&p->pdev->dev, "Max perf-stat size %lu-bytes\n",
-			p->stat_buffer_len);
-	} else {
-		dev_info(&p->pdev->dev, "Dimm performance stats unavailable\n");
-	}
-
 	return 0;
 
 err:	nvdimm_bus_unregister(p->bus);
-- 
2.26.2


^ permalink raw reply related

* Re: [PATCH] spi: fsl-espi: Only process interrupts for expected events
From: Chris Packham @ 2020-09-13 22:03 UTC (permalink / raw)
  To: broonie@kernel.org, npiggin@gmail.com, hkallweit1@gmail.com,
	Joakim Tjernlund
  Cc: linuxppc-dev@lists.ozlabs.org, linux-kernel@vger.kernel.org,
	stable@vger.kernel.org, linux-spi@vger.kernel.org
In-Reply-To: <20200904002812.7300-1-chris.packham@alliedtelesis.co.nz>

Hi All,

On 4/09/20 12:28 pm, Chris Packham wrote:
> The SPIE register contains counts for the TX FIFO so any time the irq
> handler was invoked we would attempt to process the RX/TX fifos. Use the
> SPIM value to mask the events so that we only process interrupts that
> were expected.
>
> This was a latent issue exposed by commit 3282a3da25bd ("powerpc/64:
> Implement soft interrupt replay in C").
>
> Signed-off-by: Chris Packham <chris.packham@alliedtelesis.co.nz>
> Cc: stable@vger.kernel.org
> ---
ping?
>
> Notes:
>      I've tested this on a T2080RDB and a custom board using the T2081 SoC. With
>      this change I don't see any spurious instances of the "Transfer done but
>      SPIE_DON isn't set!" or "Transfer done but rx/tx fifo's aren't empty!" messages
>      and the updates to spi flash are successful.
>      
>      I think this should go into the stable trees that contain 3282a3da25bd but I
>      haven't added a Fixes: tag because I think 3282a3da25bd exposed the issue as
>      opposed to causing it.
>
>   drivers/spi/spi-fsl-espi.c | 5 +++--
>   1 file changed, 3 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/spi/spi-fsl-espi.c b/drivers/spi/spi-fsl-espi.c
> index 7e7c92cafdbb..cb120b68c0e2 100644
> --- a/drivers/spi/spi-fsl-espi.c
> +++ b/drivers/spi/spi-fsl-espi.c
> @@ -574,13 +574,14 @@ static void fsl_espi_cpu_irq(struct fsl_espi *espi, u32 events)
>   static irqreturn_t fsl_espi_irq(s32 irq, void *context_data)
>   {
>   	struct fsl_espi *espi = context_data;
> -	u32 events;
> +	u32 events, mask;
>   
>   	spin_lock(&espi->lock);
>   
>   	/* Get interrupt events(tx/rx) */
>   	events = fsl_espi_read_reg(espi, ESPI_SPIE);
> -	if (!events) {
> +	mask = fsl_espi_read_reg(espi, ESPI_SPIM);
> +	if (!(events & mask)) {
>   		spin_unlock(&espi->lock);
>   		return IRQ_NONE;
>   	}

^ permalink raw reply

* Re: [PATCH v1 4/5] powerpc/fault: Avoid heavy search_exception_tables() verification
From: Nicholas Piggin @ 2020-09-14  2:23 UTC (permalink / raw)
  To: Aneesh Kumar K.V, Benjamin Herrenschmidt, Christophe Leroy,
	Michael Ellerman, Paul Mackerras
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <1ded5e11-a9e0-a98f-295c-c623e0a5ed36@csgroup.eu>

Excerpts from Christophe Leroy's message of September 9, 2020 4:20 pm:
> 
> 
> Le 09/09/2020 à 08:04, Aneesh Kumar K.V a écrit :
>> Christophe Leroy <christophe.leroy@csgroup.eu> writes:
>> 
>>> search_exception_tables() is an heavy operation, we have to avoid it.
>>> When KUAP is selected, we'll know the fault has been blocked by KUAP.
>>> Otherwise, it behaves just as if the address was already in the TLBs
>>> and no fault was generated.
>>>
>>> Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
>>> ---
>>>   arch/powerpc/mm/fault.c | 20 +++++---------------
>>>   1 file changed, 5 insertions(+), 15 deletions(-)
>>>
>>> diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
>>> index 525e0c2b5406..edde169ba3a6 100644
>>> --- a/arch/powerpc/mm/fault.c
>>> +++ b/arch/powerpc/mm/fault.c
>>> @@ -214,24 +214,14 @@ static bool bad_kernel_fault(struct pt_regs *regs, unsigned long error_code,
>>>   	if (address >= TASK_SIZE)
>>>   		return true;
>>>   
>>> -	if (!is_exec && (error_code & DSISR_PROTFAULT) &&
>>> -	    !search_exception_tables(regs->nip)) {
>>> +	// Read/write fault blocked by KUAP is bad, it can never succeed.
>>> +	if (bad_kuap_fault(regs, address, is_write)) {
>>>   		pr_crit_ratelimited("Kernel attempted to access user page (%lx) - exploit attempt? (uid: %d)\n",
>>> -				    address,
>>> -				    from_kuid(&init_user_ns, current_uid()));
>>> -	}
>>> -
>>> -	// Fault on user outside of certain regions (eg. copy_tofrom_user()) is bad
>>> -	if (!search_exception_tables(regs->nip))
>>> -		return true;
>> 
>> We still need to keep this ? Without that we detect the lack of
>> exception tables pretty late.
> 
> Is that a problem at all to detect the lack of exception tables late ?
> That case is very unlikely and will lead to failure anyway. So, is it 
> worth impacting performance of the likely case which will always have an 
> exception table and where we expect the exception to run as fast as 
> possible ?
> 
> The other architectures I have looked at (arm64 and x86) only have the 
> exception table search together with the down_read_trylock(&mm->mmap_sem).

Yeah I don't see how it'd be a problem. User could arrange for page 
table to already be at this address and avoid the fault so it's not the 
right way to stop an attacker, KUAP is.

Thanks,
Nick

^ permalink raw reply

* Re: [PATCH] spi: fsl-espi: Only process interrupts for expected events
From: Nicholas Piggin @ 2020-09-14  2:28 UTC (permalink / raw)
  To: broonie@kernel.org
  Cc: linuxppc-dev@lists.ozlabs.org, linux-kernel@vger.kernel.org,
	stable@vger.kernel.org, linux-spi@vger.kernel.org
In-Reply-To: <ecedc71d-100a-7d7a-ff7f-ef1a3086dd74@alliedtelesis.co.nz>

Excerpts from Chris Packham's message of September 14, 2020 8:03 am:
> Hi All,
> 
> On 4/09/20 12:28 pm, Chris Packham wrote:
>> The SPIE register contains counts for the TX FIFO so any time the irq
>> handler was invoked we would attempt to process the RX/TX fifos. Use the
>> SPIM value to mask the events so that we only process interrupts that
>> were expected.
>>
>> This was a latent issue exposed by commit 3282a3da25bd ("powerpc/64:
>> Implement soft interrupt replay in C").
>>
>> Signed-off-by: Chris Packham <chris.packham@alliedtelesis.co.nz>
>> Cc: stable@vger.kernel.org
>> ---
> ping?

I don't know the code/hardware but thanks for tracking this down.

Was there anything more to be done with Jocke's observations, or would 
that be a follow-up patch if anything?

If this patch fixes your problem it should probably go in, unless there 
are any objections.

Thanks,
Nick

>>
>> Notes:
>>      I've tested this on a T2080RDB and a custom board using the T2081 SoC. With
>>      this change I don't see any spurious instances of the "Transfer done but
>>      SPIE_DON isn't set!" or "Transfer done but rx/tx fifo's aren't empty!" messages
>>      and the updates to spi flash are successful.
>>      
>>      I think this should go into the stable trees that contain 3282a3da25bd but I
>>      haven't added a Fixes: tag because I think 3282a3da25bd exposed the issue as
>>      opposed to causing it.
>>
>>   drivers/spi/spi-fsl-espi.c | 5 +++--
>>   1 file changed, 3 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/spi/spi-fsl-espi.c b/drivers/spi/spi-fsl-espi.c
>> index 7e7c92cafdbb..cb120b68c0e2 100644
>> --- a/drivers/spi/spi-fsl-espi.c
>> +++ b/drivers/spi/spi-fsl-espi.c
>> @@ -574,13 +574,14 @@ static void fsl_espi_cpu_irq(struct fsl_espi *espi, u32 events)
>>   static irqreturn_t fsl_espi_irq(s32 irq, void *context_data)
>>   {
>>   	struct fsl_espi *espi = context_data;
>> -	u32 events;
>> +	u32 events, mask;
>>   
>>   	spin_lock(&espi->lock);
>>   
>>   	/* Get interrupt events(tx/rx) */
>>   	events = fsl_espi_read_reg(espi, ESPI_SPIE);
>> -	if (!events) {
>> +	mask = fsl_espi_read_reg(espi, ESPI_SPIM);
>> +	if (!(events & mask)) {
>>   		spin_unlock(&espi->lock);
>>   		return IRQ_NONE;
>>   	}

^ permalink raw reply

* Re: [PATCH 12/15] selftests/seccomp: powerpc: Fix seccomp return value testing
From: Michael Ellerman @ 2020-09-14  3:38 UTC (permalink / raw)
  To: Kees Cook, linux-kernel
  Cc: Thadeu Lima de Souza Cascardo, Will Drewry, Kees Cook,
	linux-xtensa, linux-mips, Andy Lutomirski, Max Filippov,
	linux-arm-kernel, linux-kselftest, linuxppc-dev,
	Christian Brauner
In-Reply-To: <20200912110820.597135-13-keescook@chromium.org>

Kees Cook <keescook@chromium.org> writes:
> On powerpc, the errno is not inverted, and depends on ccr.so being
> set. Add this to a powerpc definition of SYSCALL_RET_SET().
>
> Co-developed-by: Thadeu Lima de Souza Cascardo <cascardo@canonical.com>
> Signed-off-by: Thadeu Lima de Souza Cascardo <cascardo@canonical.com>
> Link: https://lore.kernel.org/linux-kselftest/20200911181012.171027-1-cascardo@canonical.com/
> Fixes: 5d83c2b37d43 ("selftests/seccomp: Add powerpc support")
> Signed-off-by: Kees Cook <keescook@chromium.org>
> ---
>  tools/testing/selftests/seccomp/seccomp_bpf.c | 15 +++++++++++++++
>  1 file changed, 15 insertions(+)

This looks right to me, and matches what strace does AFAICS.

Reviewed-by: Michael Ellerman <mpe@ellerman.id.au>

cheers

> diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
> index 623953a53032..bbab2420d708 100644
> --- a/tools/testing/selftests/seccomp/seccomp_bpf.c
> +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
> @@ -1750,6 +1750,21 @@ TEST_F(TRACE_poke, getpid_runs_normally)
>  # define ARCH_REGS		struct pt_regs
>  # define SYSCALL_NUM(_regs)	(_regs).gpr[0]
>  # define SYSCALL_RET(_regs)	(_regs).gpr[3]
> +# define SYSCALL_RET_SET(_regs, _val)				\
> +	do {							\
> +		typeof(_val) _result = (_val);			\
> +		/*						\
> +		 * A syscall error is signaled by CR0 SO bit	\
> +		 * and the code is stored as a positive value.	\
> +		 */						\
> +		if (_result < 0) {				\
> +			SYSCALL_RET(_regs) = -result;		\
> +			(_regs).ccr |= 0x10000000;		\
> +		} else {					\
> +			SYSCALL_RET(_regs) = result;		\
> +			(_regs).ccr &= ~0x10000000;		\
> +		}						\
> +	} while (0)
>  #elif defined(__s390__)
>  # define ARCH_REGS		s390_regs
>  # define SYSCALL_NUM(_regs)	(_regs).gprs[2]
> -- 
> 2.25.1

^ permalink raw reply

* [PATCH -next] soc/qman: convert to use be32_add_cpu()
From: Liu Shixin @ 2020-09-14  4:17 UTC (permalink / raw)
  To: Li Yang; +Cc: Liu Shixin, linuxppc-dev, linux-kernel, linux-arm-kernel

Signed-off-by: Liu Shixin <liushixin2@huawei.com>
drivers/soc/fsl/qbman/qman_test_api.c---
 drivers/soc/fsl/qbman/qman_test_api.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/soc/fsl/qbman/qman_test_api.c b/drivers/soc/fsl/qbman/qman_test_api.c
index 2895d062cf51..7066b2f1467c 100644
--- a/drivers/soc/fsl/qbman/qman_test_api.c
+++ b/drivers/soc/fsl/qbman/qman_test_api.c
@@ -86,7 +86,7 @@ static void fd_inc(struct qm_fd *fd)
 	len--;
 	qm_fd_set_param(fd, fmt, off, len);
 
-	fd->cmd = cpu_to_be32(be32_to_cpu(fd->cmd) + 1);
+	be32_add_cpu(&fd->cmd, 1);
 }
 
 /* The only part of the 'fd' we can't memcmp() is the ppid */
-- 
2.25.1


^ permalink raw reply related

* [PATCH v2 0/4] more mm switching vs TLB shootdown and lazy tlb fixes
From: Nicholas Piggin @ 2020-09-14  4:52 UTC (permalink / raw)
  To: linux-mm @ kvack . org
  Cc: linux-arch, Jens Axboe, Peter Zijlstra, Aneesh Kumar K . V,
	linux-kernel, Nicholas Piggin, sparclinux, Andrew Morton,
	linuxppc-dev, David S . Miller

This is an attempt to fix a few different related issues around
switching mm, TLB flushing, and lazy tlb mm handling.

This will require all architectures to eventually move to disabling
irqs over activate_mm, but it's possible we could add another arch
call after irqs are re-enabled for those few which can't do their
entire activation with irqs disabled.

Testing so far indicates this has fixed a mm refcounting bug that
powerpc was running into (via distro report and backport). I haven't
had any real feedback on this series outside powerpc (and it doesn't
really affect other archs), so I propose patches 1,2,4 go via the
powerpc tree.

There is no dependency between them and patch 3, I put it there only
because it follows the history of the code (powerpc code was written
using the sparc64 logic), but I guess they have to go via different arch
trees. Dave, I'll leave patch 3 with you.

Thanks,
Nick

Since v1:
- Updates from Michael Ellerman's review comments.

Nicholas Piggin (4):
  mm: fix exec activate_mm vs TLB shootdown and lazy tlb switching race
  powerpc: select ARCH_WANT_IRQS_OFF_ACTIVATE_MM
  sparc64: remove mm_cpumask clearing to fix kthread_use_mm race
  powerpc/64s/radix: Fix mm_cpumask trimming race vs kthread_use_mm

 arch/Kconfig                           |  7 +++
 arch/powerpc/Kconfig                   |  1 +
 arch/powerpc/include/asm/mmu_context.h |  2 +-
 arch/powerpc/include/asm/tlb.h         | 13 ------
 arch/powerpc/mm/book3s64/radix_tlb.c   | 23 ++++++---
 arch/sparc/kernel/smp_64.c             | 65 ++++++--------------------
 fs/exec.c                              | 17 ++++++-
 7 files changed, 54 insertions(+), 74 deletions(-)

-- 
2.23.0


^ permalink raw reply

* [PATCH v2 1/4] mm: fix exec activate_mm vs TLB shootdown and lazy tlb switching race
From: Nicholas Piggin @ 2020-09-14  4:52 UTC (permalink / raw)
  To: linux-mm @ kvack . org
  Cc: linux-arch, Jens Axboe, Peter Zijlstra, Aneesh Kumar K . V,
	linux-kernel, Nicholas Piggin, sparclinux, Andrew Morton,
	linuxppc-dev, David S . Miller
In-Reply-To: <20200914045219.3736466-1-npiggin@gmail.com>

Reading and modifying current->mm and current->active_mm and switching
mm should be done with irqs off, to prevent races seeing an intermediate
state.

This is similar to commit 38cf307c1f20 ("mm: fix kthread_use_mm() vs TLB
invalidate"). At exec-time when the new mm is activated, the old one
should usually be single-threaded and no longer used, unless something
else is holding an mm_users reference (which may be possible).

Absent other mm_users, there is also a race with preemption and lazy tlb
switching. Consider the kernel_execve case where the current thread is
using a lazy tlb active mm:

  call_usermodehelper()
    kernel_execve()
      old_mm = current->mm;
      active_mm = current->active_mm;
      *** preempt *** -------------------->  schedule()
                                               prev->active_mm = NULL;
                                               mmdrop(prev active_mm);
                                             ...
                      <--------------------  schedule()
      current->mm = mm;
      current->active_mm = mm;
      if (!old_mm)
          mmdrop(active_mm);

If we switch back to the kernel thread from a different mm, there is a
double free of the old active_mm, and a missing free of the new one.

Closing this race only requires interrupts to be disabled while ->mm
and ->active_mm are being switched, but the TLB problem requires also
holding interrupts off over activate_mm. Unfortunately not all archs
can do that yet, e.g., arm defers the switch if irqs are disabled and
expects finish_arch_post_lock_switch() to be called to complete the
flush; um takes a blocking lock in activate_mm().

So as a first step, disable interrupts across the mm/active_mm updates
to close the lazy tlb preempt race, and provide an arch option to
extend that to activate_mm which allows architectures doing IPI based
TLB shootdowns to close the second race.

This is a bit ugly, but in the interest of fixing the bug and backporting
before all architectures are converted this is a compromise.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/Kconfig |  7 +++++++
 fs/exec.c    | 17 +++++++++++++++--
 2 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index af14a567b493..94821e3f94d1 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -414,6 +414,13 @@ config MMU_GATHER_NO_GATHER
 	bool
 	depends on MMU_GATHER_TABLE_FREE
 
+config ARCH_WANT_IRQS_OFF_ACTIVATE_MM
+	bool
+	help
+	  Temporary select until all architectures can be converted to have
+	  irqs disabled over activate_mm. Architectures that do IPI based TLB
+	  shootdowns should enable this.
+
 config ARCH_HAVE_NMI_SAFE_CMPXCHG
 	bool
 
diff --git a/fs/exec.c b/fs/exec.c
index a91003e28eaa..d4fb18baf1fb 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1130,11 +1130,24 @@ static int exec_mmap(struct mm_struct *mm)
 	}
 
 	task_lock(tsk);
-	active_mm = tsk->active_mm;
 	membarrier_exec_mmap(mm);
-	tsk->mm = mm;
+
+	local_irq_disable();
+	active_mm = tsk->active_mm;
 	tsk->active_mm = mm;
+	tsk->mm = mm;
+	/*
+	 * This prevents preemption while active_mm is being loaded and
+	 * it and mm are being updated, which could cause problems for
+	 * lazy tlb mm refcounting when these are updated by context
+	 * switches. Not all architectures can handle irqs off over
+	 * activate_mm yet.
+	 */
+	if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
+		local_irq_enable();
 	activate_mm(active_mm, mm);
+	if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
+		local_irq_enable();
 	tsk->mm->vmacache_seqnum = 0;
 	vmacache_flush(tsk);
 	task_unlock(tsk);
-- 
2.23.0


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox