LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH v2 5/5] KVM: selftests: Remove __NR_userfaultfd syscall fallback
From: Sean Christopherson @ 2021-08-20 22:50 UTC (permalink / raw)
  To: Russell King, Catalin Marinas, Will Deacon, Guo Ren,
	Thomas Bogendoerfer, Michael Ellerman, Heiko Carstens,
	Vasily Gorbik, Christian Borntraeger, Steven Rostedt, Ingo Molnar,
	Oleg Nesterov, Thomas Gleixner, Peter Zijlstra, Andy Lutomirski,
	Mathieu Desnoyers, Paul E. McKenney, Boqun Feng, Paolo Bonzini,
	Shuah Khan
  Cc: linux-s390, kvm, Ben Gardon, linux-kernel, linux-csky, linux-mips,
	Peter Foley, Paul Mackerras, linux-kselftest, Sean Christopherson,
	Shakeel Butt, linuxppc-dev, linux-arm-kernel
In-Reply-To: <20210820225002.310652-1-seanjc@google.com>

Revert the __NR_userfaultfd syscall fallback added for KVM selftests now
that x86's unistd_{32,63}.h overrides are under uapi/ and thus not in
KVM sefltests' search path, i.e. now that KVM gets x86 syscall numbers
from the installed kernel headers.

No functional change intended.

Cc: Ben Gardon <bgardon@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/arch/x86/include/uapi/asm/unistd_64.h | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/tools/arch/x86/include/uapi/asm/unistd_64.h b/tools/arch/x86/include/uapi/asm/unistd_64.h
index 4205ed4158bf..cb52a3a8b8fc 100644
--- a/tools/arch/x86/include/uapi/asm/unistd_64.h
+++ b/tools/arch/x86/include/uapi/asm/unistd_64.h
@@ -1,7 +1,4 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __NR_userfaultfd
-#define __NR_userfaultfd 282
-#endif
 #ifndef __NR_perf_event_open
 # define __NR_perf_event_open 298
 #endif
-- 
2.33.0.rc2.250.ged5fa647cd-goog


^ permalink raw reply related

* [PATCH v2 4/5] KVM: selftests: Add a test for KVM_RUN+rseq to detect task migration bugs
From: Sean Christopherson @ 2021-08-20 22:50 UTC (permalink / raw)
  To: Russell King, Catalin Marinas, Will Deacon, Guo Ren,
	Thomas Bogendoerfer, Michael Ellerman, Heiko Carstens,
	Vasily Gorbik, Christian Borntraeger, Steven Rostedt, Ingo Molnar,
	Oleg Nesterov, Thomas Gleixner, Peter Zijlstra, Andy Lutomirski,
	Mathieu Desnoyers, Paul E. McKenney, Boqun Feng, Paolo Bonzini,
	Shuah Khan
  Cc: linux-s390, kvm, Ben Gardon, linux-kernel, linux-csky, linux-mips,
	Peter Foley, Paul Mackerras, linux-kselftest, Sean Christopherson,
	Shakeel Butt, linuxppc-dev, linux-arm-kernel
In-Reply-To: <20210820225002.310652-1-seanjc@google.com>

Add a test to verify an rseq's CPU ID is updated correctly if the task is
migrated while the kernel is handling KVM_RUN.  This is a regression test
for a bug introduced by commit 72c3c0fe54a3 ("x86/kvm: Use generic xfer
to guest work function"), where TIF_NOTIFY_RESUME would be cleared by KVM
without updating rseq, leading to a stale CPU ID and other badness.

Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/.gitignore  |   1 +
 tools/testing/selftests/kvm/Makefile    |   3 +
 tools/testing/selftests/kvm/rseq_test.c | 154 ++++++++++++++++++++++++
 3 files changed, 158 insertions(+)
 create mode 100644 tools/testing/selftests/kvm/rseq_test.c

diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore
index 0709af0144c8..6d031ff6b68e 100644
--- a/tools/testing/selftests/kvm/.gitignore
+++ b/tools/testing/selftests/kvm/.gitignore
@@ -47,6 +47,7 @@
 /kvm_page_table_test
 /memslot_modification_stress_test
 /memslot_perf_test
+/rseq_test
 /set_memory_region_test
 /steal_time
 /kvm_binary_stats_test
diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index 5832f510a16c..0756e79cb513 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -80,6 +80,7 @@ TEST_GEN_PROGS_x86_64 += kvm_create_max_vcpus
 TEST_GEN_PROGS_x86_64 += kvm_page_table_test
 TEST_GEN_PROGS_x86_64 += memslot_modification_stress_test
 TEST_GEN_PROGS_x86_64 += memslot_perf_test
+TEST_GEN_PROGS_x86_64 += rseq_test
 TEST_GEN_PROGS_x86_64 += set_memory_region_test
 TEST_GEN_PROGS_x86_64 += steal_time
 TEST_GEN_PROGS_x86_64 += kvm_binary_stats_test
@@ -92,6 +93,7 @@ TEST_GEN_PROGS_aarch64 += dirty_log_test
 TEST_GEN_PROGS_aarch64 += dirty_log_perf_test
 TEST_GEN_PROGS_aarch64 += kvm_create_max_vcpus
 TEST_GEN_PROGS_aarch64 += kvm_page_table_test
+TEST_GEN_PROGS_aarch64 += rseq_test
 TEST_GEN_PROGS_aarch64 += set_memory_region_test
 TEST_GEN_PROGS_aarch64 += steal_time
 TEST_GEN_PROGS_aarch64 += kvm_binary_stats_test
@@ -103,6 +105,7 @@ TEST_GEN_PROGS_s390x += demand_paging_test
 TEST_GEN_PROGS_s390x += dirty_log_test
 TEST_GEN_PROGS_s390x += kvm_create_max_vcpus
 TEST_GEN_PROGS_s390x += kvm_page_table_test
+TEST_GEN_PROGS_s390x += rseq_test
 TEST_GEN_PROGS_s390x += set_memory_region_test
 TEST_GEN_PROGS_s390x += kvm_binary_stats_test
 
diff --git a/tools/testing/selftests/kvm/rseq_test.c b/tools/testing/selftests/kvm/rseq_test.c
new file mode 100644
index 000000000000..d28d7ba1a64a
--- /dev/null
+++ b/tools/testing/selftests/kvm/rseq_test.c
@@ -0,0 +1,154 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <errno.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <signal.h>
+#include <syscall.h>
+#include <sys/ioctl.h>
+#include <asm/atomic.h>
+#include <asm/barrier.h>
+#include <linux/rseq.h>
+#include <linux/unistd.h>
+
+#include "kvm_util.h"
+#include "processor.h"
+#include "test_util.h"
+
+#define VCPU_ID 0
+
+static __thread volatile struct rseq __rseq = {
+	.cpu_id = RSEQ_CPU_ID_UNINITIALIZED,
+};
+
+#define RSEQ_SIG 0xdeadbeef
+
+static pthread_t migration_thread;
+static cpu_set_t possible_mask;
+static bool done;
+
+static atomic_t seq_cnt;
+
+static void guest_code(void)
+{
+	for (;;)
+		GUEST_SYNC(0);
+}
+
+static void sys_rseq(int flags)
+{
+	int r;
+
+	r = syscall(__NR_rseq, &__rseq, sizeof(__rseq), flags, RSEQ_SIG);
+	TEST_ASSERT(!r, "rseq failed, errno = %d (%s)", errno, strerror(errno));
+}
+
+static void *migration_worker(void *ign)
+{
+	cpu_set_t allowed_mask;
+	int r, i, nr_cpus, cpu;
+
+	CPU_ZERO(&allowed_mask);
+
+	nr_cpus = CPU_COUNT(&possible_mask);
+
+	for (i = 0; i < 20000; i++) {
+		cpu = i % nr_cpus;
+		if (!CPU_ISSET(cpu, &possible_mask))
+			continue;
+
+		CPU_SET(cpu, &allowed_mask);
+
+		/*
+		 * Bump the sequence count twice to allow the reader to detect
+		 * that a migration may have occurred in between rseq and sched
+		 * CPU ID reads.  An odd sequence count indicates a migration
+		 * is in-progress, while a completely different count indicates
+		 * a migration occurred since the count was last read.
+		 */
+		atomic_inc(&seq_cnt);
+		r = sched_setaffinity(0, sizeof(allowed_mask), &allowed_mask);
+		TEST_ASSERT(!r, "sched_setaffinity failed, errno = %d (%s)",
+			    errno, strerror(errno));
+		atomic_inc(&seq_cnt);
+
+		CPU_CLR(cpu, &allowed_mask);
+
+		/*
+		 * Let the read-side get back into KVM_RUN to improve the odds
+		 * of task migration coinciding with KVM's run loop.
+		 */
+		usleep(1);
+	}
+	done = true;
+	return NULL;
+}
+
+int main(int argc, char *argv[])
+{
+	struct kvm_vm *vm;
+	u32 cpu, rseq_cpu;
+	int r, snapshot;
+
+	/* Tell stdout not to buffer its content */
+	setbuf(stdout, NULL);
+
+	r = sched_getaffinity(0, sizeof(possible_mask), &possible_mask);
+	TEST_ASSERT(!r, "sched_getaffinity failed, errno = %d (%s)", errno,
+		    strerror(errno));
+
+	if (CPU_COUNT(&possible_mask) < 2) {
+		print_skip("Only one CPU, task migration not possible\n");
+		exit(KSFT_SKIP);
+	}
+
+	sys_rseq(0);
+
+	/*
+	 * Create and run a dummy VM that immediately exits to userspace via
+	 * GUEST_SYNC, while concurrently migrating the process by setting its
+	 * CPU affinity.
+	 */
+	vm = vm_create_default(VCPU_ID, 0, guest_code);
+
+	pthread_create(&migration_thread, NULL, migration_worker, 0);
+
+	while (!done) {
+		vcpu_run(vm, VCPU_ID);
+		TEST_ASSERT(get_ucall(vm, VCPU_ID, NULL) == UCALL_SYNC,
+			    "Guest failed?");
+
+		/*
+		 * Verify rseq's CPU matches sched's CPU.  Ensure migration
+		 * doesn't occur between sched_getcpu() and reading the rseq
+		 * cpu_id by rereading both if the sequence count changes, or
+		 * if the count is odd (migration in-progress).
+		 */
+		do {
+			/*
+			 * Drop bit 0 to force a mismatch if the count is odd,
+			 * i.e. if a migration is in-progress.
+			 */
+			snapshot = atomic_read(&seq_cnt) & ~1;
+			smp_rmb();
+			cpu = sched_getcpu();
+			rseq_cpu = READ_ONCE(__rseq.cpu_id);
+			smp_rmb();
+		} while (snapshot != atomic_read(&seq_cnt));
+
+		TEST_ASSERT(rseq_cpu == cpu,
+			    "rseq CPU = %d, sched CPU = %d\n", rseq_cpu, cpu);
+	}
+
+	pthread_join(migration_thread, NULL);
+
+	kvm_vm_free(vm);
+
+	sys_rseq(RSEQ_FLAG_UNREGISTER);
+
+	return 0;
+}
-- 
2.33.0.rc2.250.ged5fa647cd-goog


^ permalink raw reply related

* [PATCH v2 3/5] tools: Move x86 syscall number fallbacks to .../uapi/
From: Sean Christopherson @ 2021-08-20 22:50 UTC (permalink / raw)
  To: Russell King, Catalin Marinas, Will Deacon, Guo Ren,
	Thomas Bogendoerfer, Michael Ellerman, Heiko Carstens,
	Vasily Gorbik, Christian Borntraeger, Steven Rostedt, Ingo Molnar,
	Oleg Nesterov, Thomas Gleixner, Peter Zijlstra, Andy Lutomirski,
	Mathieu Desnoyers, Paul E. McKenney, Boqun Feng, Paolo Bonzini,
	Shuah Khan
  Cc: linux-s390, kvm, Ben Gardon, linux-kernel, linux-csky, linux-mips,
	Peter Foley, Paul Mackerras, linux-kselftest, Sean Christopherson,
	Shakeel Butt, linuxppc-dev, linux-arm-kernel
In-Reply-To: <20210820225002.310652-1-seanjc@google.com>

Move unistd_{32,64}.h from x86/include/asm to x86/include/uapi/asm so
that tools/selftests that install kernel headers, e.g. KVM selftests, can
include non-uapi tools headers, e.g. to get 'struct list_head', without
effectively overriding the installed non-tool uapi headers.

Swapping KVM's search order, e.g. to search the kernel headers before
tool headers, is not a viable option as doing results in linux/type.h and
other core headers getting pulled from the kernel headers, which do not
have the kernel-internal typedefs that are used through tools, including
many files outside of selftests/kvm's control.

Prior to commit cec07f53c398 ("perf tools: Move syscall number fallbacks
from perf-sys.h to tools/arch/x86/include/asm/"), the handcoded numbers
were actual fallbacks, i.e. overriding unistd_{32,64}.h from the kernel
headers was unintentional.

Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/arch/x86/include/{ => uapi}/asm/unistd_32.h | 0
 tools/arch/x86/include/{ => uapi}/asm/unistd_64.h | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 rename tools/arch/x86/include/{ => uapi}/asm/unistd_32.h (100%)
 rename tools/arch/x86/include/{ => uapi}/asm/unistd_64.h (100%)

diff --git a/tools/arch/x86/include/asm/unistd_32.h b/tools/arch/x86/include/uapi/asm/unistd_32.h
similarity index 100%
rename from tools/arch/x86/include/asm/unistd_32.h
rename to tools/arch/x86/include/uapi/asm/unistd_32.h
diff --git a/tools/arch/x86/include/asm/unistd_64.h b/tools/arch/x86/include/uapi/asm/unistd_64.h
similarity index 100%
rename from tools/arch/x86/include/asm/unistd_64.h
rename to tools/arch/x86/include/uapi/asm/unistd_64.h
-- 
2.33.0.rc2.250.ged5fa647cd-goog

^ permalink raw reply

* [PATCH v2 2/5] entry: rseq: Call rseq_handle_notify_resume() in tracehook_notify_resume()
From: Sean Christopherson @ 2021-08-20 22:49 UTC (permalink / raw)
  To: Russell King, Catalin Marinas, Will Deacon, Guo Ren,
	Thomas Bogendoerfer, Michael Ellerman, Heiko Carstens,
	Vasily Gorbik, Christian Borntraeger, Steven Rostedt, Ingo Molnar,
	Oleg Nesterov, Thomas Gleixner, Peter Zijlstra, Andy Lutomirski,
	Mathieu Desnoyers, Paul E. McKenney, Boqun Feng, Paolo Bonzini,
	Shuah Khan
  Cc: linux-s390, kvm, Ben Gardon, linux-kernel, linux-csky, linux-mips,
	Peter Foley, Paul Mackerras, linux-kselftest, Sean Christopherson,
	Shakeel Butt, linuxppc-dev, linux-arm-kernel
In-Reply-To: <20210820225002.310652-1-seanjc@google.com>

Invoke rseq_handle_notify_resume() from tracehook_notify_resume() now
that the two function are always called back-to-back by architectures
that have rseq.  The rseq helper is stubbed out for architectures that
don't support rseq, i.e. this is a nop across the board.

Note, tracehook_notify_resume() is horribly named and arguably does not
belong in tracehook.h as literally every line of code in it has nothing
to do with tracing.  But, that's been true since commit a42c6ded827d
("move key_repace_session_keyring() into tracehook_notify_resume()")
first usurped tracehook_notify_resume() back in 2012.  Punt cleaning that
mess up to future patches.

No functional change intended.

Acked-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/arm/kernel/signal.c     | 1 -
 arch/arm64/kernel/signal.c   | 1 -
 arch/csky/kernel/signal.c    | 4 +---
 arch/mips/kernel/signal.c    | 4 +---
 arch/powerpc/kernel/signal.c | 4 +---
 arch/s390/kernel/signal.c    | 1 -
 include/linux/tracehook.h    | 2 ++
 kernel/entry/common.c        | 4 +---
 kernel/entry/kvm.c           | 4 +---
 9 files changed, 7 insertions(+), 18 deletions(-)

diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
index a3a38d0a4c85..9df68d139965 100644
--- a/arch/arm/kernel/signal.c
+++ b/arch/arm/kernel/signal.c
@@ -670,7 +670,6 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall)
 				uprobe_notify_resume(regs);
 			} else {
 				tracehook_notify_resume(regs);
-				rseq_handle_notify_resume(NULL, regs);
 			}
 		}
 		local_irq_disable();
diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
index 23036334f4dc..22b55db13da6 100644
--- a/arch/arm64/kernel/signal.c
+++ b/arch/arm64/kernel/signal.c
@@ -951,7 +951,6 @@ asmlinkage void do_notify_resume(struct pt_regs *regs,
 
 			if (thread_flags & _TIF_NOTIFY_RESUME) {
 				tracehook_notify_resume(regs);
-				rseq_handle_notify_resume(NULL, regs);
 
 				/*
 				 * If we reschedule after checking the affinity
diff --git a/arch/csky/kernel/signal.c b/arch/csky/kernel/signal.c
index 312f046d452d..bc4238b9f709 100644
--- a/arch/csky/kernel/signal.c
+++ b/arch/csky/kernel/signal.c
@@ -260,8 +260,6 @@ asmlinkage void do_notify_resume(struct pt_regs *regs,
 	if (thread_info_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
 		do_signal(regs);
 
-	if (thread_info_flags & _TIF_NOTIFY_RESUME) {
+	if (thread_info_flags & _TIF_NOTIFY_RESUME)
 		tracehook_notify_resume(regs);
-		rseq_handle_notify_resume(NULL, regs);
-	}
 }
diff --git a/arch/mips/kernel/signal.c b/arch/mips/kernel/signal.c
index f1e985109da0..c9b2a75563e1 100644
--- a/arch/mips/kernel/signal.c
+++ b/arch/mips/kernel/signal.c
@@ -906,10 +906,8 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, void *unused,
 	if (thread_info_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
 		do_signal(regs);
 
-	if (thread_info_flags & _TIF_NOTIFY_RESUME) {
+	if (thread_info_flags & _TIF_NOTIFY_RESUME)
 		tracehook_notify_resume(regs);
-		rseq_handle_notify_resume(NULL, regs);
-	}
 
 	user_enter();
 }
diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c
index e600764a926c..b93b87df499d 100644
--- a/arch/powerpc/kernel/signal.c
+++ b/arch/powerpc/kernel/signal.c
@@ -293,10 +293,8 @@ void do_notify_resume(struct pt_regs *regs, unsigned long thread_info_flags)
 		do_signal(current);
 	}
 
-	if (thread_info_flags & _TIF_NOTIFY_RESUME) {
+	if (thread_info_flags & _TIF_NOTIFY_RESUME)
 		tracehook_notify_resume(regs);
-		rseq_handle_notify_resume(NULL, regs);
-	}
 }
 
 static unsigned long get_tm_stackpointer(struct task_struct *tsk)
diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c
index 78ef53b29958..b307db26bf2d 100644
--- a/arch/s390/kernel/signal.c
+++ b/arch/s390/kernel/signal.c
@@ -537,5 +537,4 @@ void arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal)
 void do_notify_resume(struct pt_regs *regs)
 {
 	tracehook_notify_resume(regs);
-	rseq_handle_notify_resume(NULL, regs);
 }
diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index 3e80c4bc66f7..2564b7434b4d 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -197,6 +197,8 @@ static inline void tracehook_notify_resume(struct pt_regs *regs)
 
 	mem_cgroup_handle_over_high();
 	blkcg_maybe_throttle_current();
+
+	rseq_handle_notify_resume(NULL, regs);
 }
 
 /*
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index bf16395b9e13..d5a61d565ad5 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -171,10 +171,8 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
 		if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
 			handle_signal_work(regs, ti_work);
 
-		if (ti_work & _TIF_NOTIFY_RESUME) {
+		if (ti_work & _TIF_NOTIFY_RESUME)
 			tracehook_notify_resume(regs);
-			rseq_handle_notify_resume(NULL, regs);
-		}
 
 		/* Architecture specific TIF work */
 		arch_exit_to_user_mode_work(regs, ti_work);
diff --git a/kernel/entry/kvm.c b/kernel/entry/kvm.c
index 049fd06b4c3d..49972ee99aff 100644
--- a/kernel/entry/kvm.c
+++ b/kernel/entry/kvm.c
@@ -19,10 +19,8 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work)
 		if (ti_work & _TIF_NEED_RESCHED)
 			schedule();
 
-		if (ti_work & _TIF_NOTIFY_RESUME) {
+		if (ti_work & _TIF_NOTIFY_RESUME)
 			tracehook_notify_resume(NULL);
-			rseq_handle_notify_resume(NULL, NULL);
-		}
 
 		ret = arch_xfer_to_guest_mode_handle_work(vcpu, ti_work);
 		if (ret)
-- 
2.33.0.rc2.250.ged5fa647cd-goog


^ permalink raw reply related

* [PATCH v2 1/5] KVM: rseq: Update rseq when processing NOTIFY_RESUME on xfer to KVM guest
From: Sean Christopherson @ 2021-08-20 22:49 UTC (permalink / raw)
  To: Russell King, Catalin Marinas, Will Deacon, Guo Ren,
	Thomas Bogendoerfer, Michael Ellerman, Heiko Carstens,
	Vasily Gorbik, Christian Borntraeger, Steven Rostedt, Ingo Molnar,
	Oleg Nesterov, Thomas Gleixner, Peter Zijlstra, Andy Lutomirski,
	Mathieu Desnoyers, Paul E. McKenney, Boqun Feng, Paolo Bonzini,
	Shuah Khan
  Cc: linux-s390, kvm, Ben Gardon, linux-kernel, linux-csky, linux-mips,
	Peter Foley, Paul Mackerras, linux-kselftest, Sean Christopherson,
	Shakeel Butt, linuxppc-dev, linux-arm-kernel
In-Reply-To: <20210820225002.310652-1-seanjc@google.com>

Invoke rseq's NOTIFY_RESUME handler when processing the flag prior to
transferring to a KVM guest, which is roughly equivalent to an exit to
userspace and processes many of the same pending actions.  While the task
cannot be in an rseq critical section as the KVM path is reachable only
by via ioctl(KVM_RUN), the side effects that apply to rseq outside of a
critical section still apply, e.g. the current CPU needs to be updated if
the task is migrated.

Clearing TIF_NOTIFY_RESUME without informing rseq can lead to segfaults
and other badness in userspace VMMs that use rseq in combination with KVM,
e.g. due to the CPU ID being stale after task migration.

Fixes: 72c3c0fe54a3 ("x86/kvm: Use generic xfer to guest work function")
Reported-by: Peter Foley <pefoley@google.com>
Bisected-by: Doug Evans <dje@google.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@vger.kernel.org
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 kernel/entry/kvm.c |  4 +++-
 kernel/rseq.c      | 14 +++++++++++---
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/kernel/entry/kvm.c b/kernel/entry/kvm.c
index 49972ee99aff..049fd06b4c3d 100644
--- a/kernel/entry/kvm.c
+++ b/kernel/entry/kvm.c
@@ -19,8 +19,10 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work)
 		if (ti_work & _TIF_NEED_RESCHED)
 			schedule();
 
-		if (ti_work & _TIF_NOTIFY_RESUME)
+		if (ti_work & _TIF_NOTIFY_RESUME) {
 			tracehook_notify_resume(NULL);
+			rseq_handle_notify_resume(NULL, NULL);
+		}
 
 		ret = arch_xfer_to_guest_mode_handle_work(vcpu, ti_work);
 		if (ret)
diff --git a/kernel/rseq.c b/kernel/rseq.c
index 35f7bd0fced0..6d45ac3dae7f 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -282,9 +282,17 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
 
 	if (unlikely(t->flags & PF_EXITING))
 		return;
-	ret = rseq_ip_fixup(regs);
-	if (unlikely(ret < 0))
-		goto error;
+
+	/*
+	 * regs is NULL if and only if the caller is in a syscall path.  Skip
+	 * fixup and leave rseq_cs as is so that rseq_sycall() will detect and
+	 * kill a misbehaving userspace on debug kernels.
+	 */
+	if (regs) {
+		ret = rseq_ip_fixup(regs);
+		if (unlikely(ret < 0))
+			goto error;
+	}
 	if (unlikely(rseq_update_cpu_id(t)))
 		goto error;
 	return;
-- 
2.33.0.rc2.250.ged5fa647cd-goog


^ permalink raw reply related

* [PATCH v2 0/5] KVM: rseq: Fix and a test for a KVM+rseq bug
From: Sean Christopherson @ 2021-08-20 22:49 UTC (permalink / raw)
  To: Russell King, Catalin Marinas, Will Deacon, Guo Ren,
	Thomas Bogendoerfer, Michael Ellerman, Heiko Carstens,
	Vasily Gorbik, Christian Borntraeger, Steven Rostedt, Ingo Molnar,
	Oleg Nesterov, Thomas Gleixner, Peter Zijlstra, Andy Lutomirski,
	Mathieu Desnoyers, Paul E. McKenney, Boqun Feng, Paolo Bonzini,
	Shuah Khan
  Cc: linux-s390, kvm, Ben Gardon, linux-kernel, linux-csky, linux-mips,
	Peter Foley, Paul Mackerras, linux-kselftest, Sean Christopherson,
	Shakeel Butt, linuxppc-dev, linux-arm-kernel

Patch 1 fixes a KVM+rseq bug where KVM's handling of TIF_NOTIFY_RESUME,
e.g. for task migration, clears the flag without informing rseq and leads
to stale data in userspace's rseq struct.

Patch 2 is a cleanup to try and make future bugs less likely.  It's also
a baby step towards moving and renaming tracehook_notify_resume() since
it has nothing to do with tracing.

Patch 3 is a fix/cleanup to stop overriding x86's unistd_{32,64}.h when
the include path (intentionally) omits tools' uapi headers.  KVM's
selftests do exactly that so that they can pick up the uapi headers from
the installed kernel headers, and still use various tools/ headers that
mirror kernel code, e.g. linux/types.h.  This allows the new test in
patch 4 to reference __NR_rseq without having to manually define it.

Patch 4 is a regression test for the KVM+rseq bug.

Patch 5 is a cleanup made possible by patch 3.

v2:
  - Don't touch rseq_cs when handling KVM case so that rseq_syscall() will
    still detect a naughty userspace. [Mathieu]
  - Use a sequence counter + retry in the test to ensure the process isn't
    migrated between sched_getcpu() and reading rseq.cpu_id, i.e. to
    avoid a flaky test. [Mathieu]
  - Add Mathieu's ack for patch 2.
  - Add more comments in the test.

v1: https://lkml.kernel.org/r/20210818001210.4073390-1-seanjc@google.com

Sean Christopherson (5):
  KVM: rseq: Update rseq when processing NOTIFY_RESUME on xfer to KVM
    guest
  entry: rseq: Call rseq_handle_notify_resume() in
    tracehook_notify_resume()
  tools: Move x86 syscall number fallbacks to .../uapi/
  KVM: selftests: Add a test for KVM_RUN+rseq to detect task migration
    bugs
  KVM: selftests: Remove __NR_userfaultfd syscall fallback

 arch/arm/kernel/signal.c                      |   1 -
 arch/arm64/kernel/signal.c                    |   1 -
 arch/csky/kernel/signal.c                     |   4 +-
 arch/mips/kernel/signal.c                     |   4 +-
 arch/powerpc/kernel/signal.c                  |   4 +-
 arch/s390/kernel/signal.c                     |   1 -
 include/linux/tracehook.h                     |   2 +
 kernel/entry/common.c                         |   4 +-
 kernel/rseq.c                                 |  14 +-
 .../x86/include/{ => uapi}/asm/unistd_32.h    |   0
 .../x86/include/{ => uapi}/asm/unistd_64.h    |   3 -
 tools/testing/selftests/kvm/.gitignore        |   1 +
 tools/testing/selftests/kvm/Makefile          |   3 +
 tools/testing/selftests/kvm/rseq_test.c       | 154 ++++++++++++++++++
 14 files changed, 175 insertions(+), 21 deletions(-)
 rename tools/arch/x86/include/{ => uapi}/asm/unistd_32.h (100%)
 rename tools/arch/x86/include/{ => uapi}/asm/unistd_64.h (83%)
 create mode 100644 tools/testing/selftests/kvm/rseq_test.c

-- 
2.33.0.rc2.250.ged5fa647cd-goog


^ permalink raw reply

* Re: [PATCH v3] PCI: Move pci_dev_is/assign_added() to pci.h
From: Bjorn Helgaas @ 2021-08-20 22:37 UTC (permalink / raw)
  To: Niklas Schnelle
  Cc: linux-arch, linux-s390, linux-kernel, Paul Mackerras, linux-pci,
	Bjorn Helgaas, linuxppc-dev
In-Reply-To: <20210720150145.640727-1-schnelle@linux.ibm.com>

On Tue, Jul 20, 2021 at 05:01:45PM +0200, Niklas Schnelle wrote:
> The helper function pci_dev_is_added() from drivers/pci/pci.h is used in
> PCI arch code of both s390 and powerpc leading to awkward relative
> includes. Move it to the global include/linux/pci.h and get rid of these
> includes just for that one function.

I agree the includes are awkward.

But the arch code *using* pci_dev_is_added() seems awkward, too.

AFAICS, in powerpc, pci_dev_is_added() is only used by
pnv_pci_ioda_fixup_iov() and pseries_pci_fixup_iov_resources().  Those
are only called from pcibios_add_device(), which is only called from
pci_device_add().

Is it even possible for pci_dev_is_added() to be true in that path?

s390 uses pci_dev_is_added() in recover_store(), but I don't know what
that is (looks like a sysfs file, but it's not documented) or why s390
is the only arch that does this.

Maybe we should make powerpc and s390 less special?

> Signed-off-by: Niklas Schnelle <schnelle@linux.ibm.com>
> ---
> Since v1 (and bad v2):
> - Fixed accidental removal of PCI_DPC_RECOVERED, PCI_DPC_RECOVERING
>   defines and also move these to include/linux/pci.h
> 
>  arch/powerpc/platforms/powernv/pci-sriov.c |  3 ---
>  arch/powerpc/platforms/pseries/setup.c     |  1 -
>  arch/s390/pci/pci_sysfs.c                  |  2 --
>  drivers/pci/hotplug/acpiphp_glue.c         |  1 -
>  drivers/pci/pci.h                          | 15 ---------------
>  include/linux/pci.h                        | 15 +++++++++++++++
>  6 files changed, 15 insertions(+), 22 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/powernv/pci-sriov.c b/arch/powerpc/platforms/powernv/pci-sriov.c
> index 28aac933a439..2e0ca5451e85 100644
> --- a/arch/powerpc/platforms/powernv/pci-sriov.c
> +++ b/arch/powerpc/platforms/powernv/pci-sriov.c
> @@ -9,9 +9,6 @@
>  
>  #include "pci.h"
>  
> -/* for pci_dev_is_added() */
> -#include "../../../../drivers/pci/pci.h"
> -
>  /*
>   * The majority of the complexity in supporting SR-IOV on PowerNV comes from
>   * the need to put the MMIO space for each VF into a separate PE. Internally
> diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
> index 631a0d57b6cd..17585ec9f955 100644
> --- a/arch/powerpc/platforms/pseries/setup.c
> +++ b/arch/powerpc/platforms/pseries/setup.c
> @@ -74,7 +74,6 @@
>  #include <asm/hvconsole.h>
>  
>  #include "pseries.h"
> -#include "../../../../drivers/pci/pci.h"
>  
>  DEFINE_STATIC_KEY_FALSE(shared_processor);
>  EXPORT_SYMBOL_GPL(shared_processor);
> diff --git a/arch/s390/pci/pci_sysfs.c b/arch/s390/pci/pci_sysfs.c
> index 6e2450c2b9c1..8dbe54ef8f8e 100644
> --- a/arch/s390/pci/pci_sysfs.c
> +++ b/arch/s390/pci/pci_sysfs.c
> @@ -13,8 +13,6 @@
>  #include <linux/stat.h>
>  #include <linux/pci.h>
>  
> -#include "../../../drivers/pci/pci.h"
> -
>  #include <asm/sclp.h>
>  
>  #define zpci_attr(name, fmt, member)					\
> diff --git a/drivers/pci/hotplug/acpiphp_glue.c b/drivers/pci/hotplug/acpiphp_glue.c
> index f031302ad401..4cb963f88183 100644
> --- a/drivers/pci/hotplug/acpiphp_glue.c
> +++ b/drivers/pci/hotplug/acpiphp_glue.c
> @@ -38,7 +38,6 @@
>  #include <linux/slab.h>
>  #include <linux/acpi.h>
>  
> -#include "../pci.h"
>  #include "acpiphp.h"
>  
>  static LIST_HEAD(bridge_list);
> diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
> index 93dcdd431072..a159cd0f6f05 100644
> --- a/drivers/pci/pci.h
> +++ b/drivers/pci/pci.h
> @@ -383,21 +383,6 @@ static inline bool pci_dev_is_disconnected(const struct pci_dev *dev)
>  	return dev->error_state == pci_channel_io_perm_failure;
>  }
>  
> -/* pci_dev priv_flags */
> -#define PCI_DEV_ADDED 0
> -#define PCI_DPC_RECOVERED 1
> -#define PCI_DPC_RECOVERING 2
> -
> -static inline void pci_dev_assign_added(struct pci_dev *dev, bool added)
> -{
> -	assign_bit(PCI_DEV_ADDED, &dev->priv_flags, added);
> -}
> -
> -static inline bool pci_dev_is_added(const struct pci_dev *dev)
> -{
> -	return test_bit(PCI_DEV_ADDED, &dev->priv_flags);
> -}
> -
>  #ifdef CONFIG_PCIEAER
>  #include <linux/aer.h>
>  
> diff --git a/include/linux/pci.h b/include/linux/pci.h
> index 540b377ca8f6..ea0e23dbc8ec 100644
> --- a/include/linux/pci.h
> +++ b/include/linux/pci.h
> @@ -507,6 +507,21 @@ struct pci_dev {
>  	unsigned long	priv_flags;	/* Private flags for the PCI driver */
>  };
>  
> +/* pci_dev priv_flags */
> +#define PCI_DEV_ADDED 0
> +#define PCI_DPC_RECOVERED 1
> +#define PCI_DPC_RECOVERING 2
> +
> +static inline void pci_dev_assign_added(struct pci_dev *dev, bool added)
> +{
> +	assign_bit(PCI_DEV_ADDED, &dev->priv_flags, added);
> +}
> +
> +static inline bool pci_dev_is_added(const struct pci_dev *dev)
> +{
> +	return test_bit(PCI_DEV_ADDED, &dev->priv_flags);
> +}
> +
>  static inline struct pci_dev *pci_physfn(struct pci_dev *dev)
>  {
>  #ifdef CONFIG_PCI_IOV
> -- 
> 2.25.1
> 

^ permalink raw reply

* Re: [PATCH 1/5] KVM: rseq: Update rseq when processing NOTIFY_RESUME on xfer to KVM guest
From: Sean Christopherson @ 2021-08-20 22:26 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: KVM list, Peter Zijlstra, linux-kernel, Will Deacon, Guo Ren,
	linux-kselftest, Ben Gardon, shuah, Paul Mackerras, linux-s390,
	gor, Russell King, ARM Linux, linux-csky, Christian Borntraeger,
	Ingo Molnar, Catalin Marinas, linux-mips, Boqun Feng, paulmck,
	Heiko Carstens, rostedt, Shakeel Butt, Andy Lutomirski,
	Thomas Gleixner, Peter Foley, linux-arm-kernel,
	Thomas Bogendoerfer, Oleg Nesterov, Paolo Bonzini, linuxppc-dev
In-Reply-To: <1872633041.20290.1629485463253.JavaMail.zimbra@efficios.com>

On Fri, Aug 20, 2021, Mathieu Desnoyers wrote:
> Without the lazy clear scheme, a rseq c.s. would look like:
> 
>  *                     init(rseq_cs)
>  *                     cpu = TLS->rseq::cpu_id_start
>  *   [1]               TLS->rseq::rseq_cs = rseq_cs
>  *   [start_ip]        ----------------------------
>  *   [2]               if (cpu != TLS->rseq::cpu_id)
>  *                             goto abort_ip;
>  *   [3]               <last_instruction_in_cs>
>  *   [post_commit_ip]  ----------------------------
>  *   [4]               TLS->rseq::rseq_cs = NULL
> 
> But as a fast-path optimization, [4] is not entirely needed because the rseq_cs
> descriptor contains information about the instruction pointer range of the critical
> section. Therefore, userspace can omit [4], but if the kernel never clears it, it
> means that it will have to re-read the rseq_cs descriptor's content each time it
> needs to check it to confirm that it is not nested over a rseq c.s..
> 
> So making the kernel lazily clear the rseq_cs pointer is just an optimization which
> ensures that the kernel won't do useless work the next time it needs to check
> rseq_cs, given that it has already validated that the userspace code is currently
> not within the rseq c.s. currently advertised by the rseq_cs field.

Thanks for the explanation, much appreciated!

^ permalink raw reply

* Re: [PATCH 4/5] KVM: selftests: Add a test for KVM_RUN+rseq to detect task migration bugs
From: Sean Christopherson @ 2021-08-20 22:25 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: KVM list, Peter Zijlstra, linux-kernel, Will Deacon, Guo Ren,
	linux-kselftest, Ben Gardon, shuah, Paul Mackerras, linux-s390,
	gor, Russell King, ARM Linux, linux-csky, Christian Borntraeger,
	Ingo Molnar, Catalin Marinas, linux-mips, Boqun Feng, paulmck,
	Heiko Carstens, rostedt, Shakeel Butt, Andy Lutomirski,
	Thomas Gleixner, Peter Foley, linux-arm-kernel,
	Thomas Bogendoerfer, Oleg Nesterov, Paolo Bonzini, linuxppc-dev
In-Reply-To: <407716135.20250.1629484298288.JavaMail.zimbra@efficios.com>

On Fri, Aug 20, 2021, Mathieu Desnoyers wrote:
> I still really hate flakiness in tests, because then people stop caring when they
> fail once in a while. And with the nature of rseq, a once-in-a-while failure is a
> big deal. Let's see if we can use other tricks to ensure stability of the cpu id
> without changing timings too much.

Yeah, zero agrument regarding flaky tests.

> One idea would be to use a seqcount lock.

A sequence counter did the trick!  Thanks much!

> But even if we use that, I'm concerned that the very long writer critical
> section calling sched_setaffinity would need to be alternated with a sleep to
> ensure the read-side progresses. The sleep delay could be relatively small
> compared to the duration of the sched_setaffinity call, e.g. ratio 1:10.

I already had an arbitrary usleep(10) to let the reader make progress between
sched_setaffinity() calls.  Dropping it down to 1us didn't affect reproducibility,
so I went with that to shave those precious cycles :-)  Eliminating the delay
entirely did result in no repro, which was a nice confirmation that it's needed
to let the reader get back into KVM_RUN.

Thanks again!

^ permalink raw reply

* Re: [PATCH 1/5] KVM: rseq: Update rseq when processing NOTIFY_RESUME on xfer to KVM guest
From: Mathieu Desnoyers @ 2021-08-20 18:51 UTC (permalink / raw)
  To: Sean Christopherson
  Cc: KVM list, Peter Zijlstra, linux-kernel, Will Deacon, Guo Ren,
	linux-kselftest, Ben Gardon, shuah, Paul Mackerras, linux-s390,
	gor, Russell King, ARM Linux, linux-csky, Christian Borntraeger,
	Ingo Molnar, Catalin Marinas, linux-mips, Boqun Feng, paulmck,
	Heiko Carstens, rostedt, Shakeel Butt, Andy Lutomirski,
	Thomas Gleixner, Peter Foley, linux-arm-kernel,
	Thomas Bogendoerfer, Oleg Nesterov, Paolo Bonzini, linuxppc-dev
In-Reply-To: <YR7tzZ98XC6OV2vu@google.com>

----- On Aug 19, 2021, at 7:48 PM, Sean Christopherson seanjc@google.com wrote:

> On Thu, Aug 19, 2021, Mathieu Desnoyers wrote:
>> ----- On Aug 17, 2021, at 8:12 PM, Sean Christopherson seanjc@google.com wrote:
>> > @@ -250,7 +250,7 @@ static int rseq_ip_fixup(struct pt_regs *regs)
>> > 	 * If not nested over a rseq critical section, restart is useless.
>> > 	 * Clear the rseq_cs pointer and return.
>> > 	 */
>> > -	if (!in_rseq_cs(ip, &rseq_cs))
>> > +	if (!regs || !in_rseq_cs(ip, &rseq_cs))
>> 
>> I think clearing the thread's rseq_cs unconditionally here when regs is NULL
>> is not the behavior we want when this is called from xfer_to_guest_mode_work.
>> 
>> If we have a scenario where userspace ends up calling this ioctl(KVM_RUN)
>> from within a rseq c.s., we really want a CONFIG_DEBUG_RSEQ=y kernel to
>> kill this application in the rseq_syscall handler when exiting back to usermode
>> when the ioctl eventually returns.
>> 
>> However, clearing the thread's rseq_cs will prevent this from happening.
>> 
>> So I would favor an approach where we simply do:
>> 
>> if (!regs)
>>      return 0;
>> 
>> Immediately at the beginning of rseq_ip_fixup, before getting the instruction
>> pointer, so effectively skip all side-effects of the ip fixup code. Indeed, it
>> is not relevant to do any fixup here, because it is nested in a ioctl system
>> call.
>> 
>> Effectively, this would preserve the SIGSEGV behavior when this ioctl is
>> erroneously called by user-space from a rseq critical section.
> 
> Ha, that's effectively what I implemented first, but I changed it because of the
> comment in clear_rseq_cs() that says:
> 
>  The rseq_cs field is set to NULL on preemption or signal delivery ... as well
>  as well as on top of code outside of the rseq assembly block.
> 
> Which makes it sound like something might rely on clearing rseq_cs?

This comment is describing succinctly the lazy clear scheme for rseq_cs.

Without the lazy clear scheme, a rseq c.s. would look like:

 *                     init(rseq_cs)
 *                     cpu = TLS->rseq::cpu_id_start
 *   [1]               TLS->rseq::rseq_cs = rseq_cs
 *   [start_ip]        ----------------------------
 *   [2]               if (cpu != TLS->rseq::cpu_id)
 *                             goto abort_ip;
 *   [3]               <last_instruction_in_cs>
 *   [post_commit_ip]  ----------------------------
 *   [4]               TLS->rseq::rseq_cs = NULL

But as a fast-path optimization, [4] is not entirely needed because the rseq_cs
descriptor contains information about the instruction pointer range of the critical
section. Therefore, userspace can omit [4], but if the kernel never clears it, it
means that it will have to re-read the rseq_cs descriptor's content each time it
needs to check it to confirm that it is not nested over a rseq c.s..

So making the kernel lazily clear the rseq_cs pointer is just an optimization which
ensures that the kernel won't do useless work the next time it needs to check
rseq_cs, given that it has already validated that the userspace code is currently
not within the rseq c.s. currently advertised by the rseq_cs field.

> 
> Ah, or is it the case that rseq_cs is non-NULL if and only if userspace is in an
> rseq critical section, and because syscalls in critical sections are illegal, by
> definition clearing rseq_cs is a nop unless userspace is misbehaving.

Not quite, as I described above. But we want it to stay set so the CONFIG_DEBUG_RSEQ
code executed when returning from ioctl to userspace will be able to validate that
it is not nested within a rseq critical section.

> 
> If that's true, what about explicitly checking that at NOTIFY_RESUME?  Or is it
> not worth the extra code to detect an error that will likely be caught anyways?

The error will indeed already be caught on return from ioctl to userspace, so I
don't see any added value in duplicating this check.

Thanks,

Mathieu

> 
> diff --git a/kernel/rseq.c b/kernel/rseq.c
> index 35f7bd0fced0..28b8342290b0 100644
> --- a/kernel/rseq.c
> +++ b/kernel/rseq.c
> @@ -282,6 +282,13 @@ void __rseq_handle_notify_resume(struct ksignal *ksig,
> struct pt_regs *regs)
> 
>        if (unlikely(t->flags & PF_EXITING))
>                return;
> +       if (!regs) {
> +#ifdef CONFIG_DEBUG_RSEQ
> +               if (t->rseq && rseq_get_rseq_cs(t, &rseq_cs))
> +                       goto error;
> +#endif
> +               return;
> +       }
>        ret = rseq_ip_fixup(regs);
>        if (unlikely(ret < 0))
>                goto error;
> 
>> Thanks for looking into this !
>> 
>> Mathieu
>> 
>> > 		return clear_rseq_cs(t);
>> > 	ret = rseq_need_restart(t, rseq_cs.flags);
>> > 	if (ret <= 0)
>> > --
>> > 2.33.0.rc1.237.g0d66db33f3-goog
>> 
>> --
>> Mathieu Desnoyers
>> EfficiOS Inc.
> > http://www.efficios.com

-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com

^ permalink raw reply

* Re: [PATCH 4/5] KVM: selftests: Add a test for KVM_RUN+rseq to detect task migration bugs
From: Mathieu Desnoyers @ 2021-08-20 18:31 UTC (permalink / raw)
  To: Sean Christopherson
  Cc: KVM list, Peter Zijlstra, linux-kernel, Will Deacon, Guo Ren,
	linux-kselftest, Ben Gardon, shuah, Paul Mackerras, linux-s390,
	gor, Russell King, ARM Linux, linux-csky, Christian Borntraeger,
	Ingo Molnar, Catalin Marinas, linux-mips, Boqun Feng, paulmck,
	Heiko Carstens, rostedt, Shakeel Butt, Andy Lutomirski,
	Thomas Gleixner, Peter Foley, linux-arm-kernel,
	Thomas Bogendoerfer, Oleg Nesterov, Paolo Bonzini, linuxppc-dev
In-Reply-To: <YR7qXvnI/AQM10gU@google.com>

----- On Aug 19, 2021, at 7:33 PM, Sean Christopherson seanjc@google.com wrote:

> On Thu, Aug 19, 2021, Mathieu Desnoyers wrote:
>> ----- On Aug 17, 2021, at 8:12 PM, Sean Christopherson seanjc@google.com wrote:
>> 
>> > Add a test to verify an rseq's CPU ID is updated correctly if the task is
>> > migrated while the kernel is handling KVM_RUN.  This is a regression test
>> > for a bug introduced by commit 72c3c0fe54a3 ("x86/kvm: Use generic xfer
>> > to guest work function"), where TIF_NOTIFY_RESUME would be cleared by KVM
>> > without updating rseq, leading to a stale CPU ID and other badness.
>> > 
>> > Signed-off-by: Sean Christopherson <seanjc@google.com>
>> > ---
>> 
>> [...]
>> 
>> > +	while (!done) {
>> > +		vcpu_run(vm, VCPU_ID);
>> > +		TEST_ASSERT(get_ucall(vm, VCPU_ID, NULL) == UCALL_SYNC,
>> > +			    "Guest failed?");
>> > +
>> > +		cpu = sched_getcpu();
>> > +		rseq_cpu = READ_ONCE(__rseq.cpu_id);
>> > +
>> > +		/*
>> > +		 * Verify rseq's CPU matches sched's CPU, and that sched's CPU
>> > +		 * is stable.  This doesn't handle the case where the task is
>> > +		 * migrated between sched_getcpu() and reading rseq, and again
>> > +		 * between reading rseq and sched_getcpu(), but in practice no
>> > +		 * false positives have been observed, while on the other hand
>> > +		 * blocking migration while this thread reads CPUs messes with
>> > +		 * the timing and prevents hitting failures on a buggy kernel.
>> > +		 */
>> 
>> I think you could get a stable cpu id between sched_getcpu and __rseq_abi.cpu_id
>> if you add a pthread mutex to protect:
>> 
>> sched_getcpu and __rseq_abi.cpu_id  reads
>> 
>> vs
>> 
>> sched_setaffinity calls within the migration thread.
>> 
>> Thoughts ?
> 
> I tried that and couldn't reproduce the bug.  That's what I attempted to call
> out
> in the blurb "blocking migration while this thread reads CPUs ... prevents
> hitting
> failures on a buggy kernel".
> 
> I considered adding arbitrary delays around the mutex to try and hit the bug,
> but
> I was worried that even if I got it "working" for this bug, the test would be
> too
> tailored to this bug and potentially miss future regression.  Letting the two
> threads run wild seemed like it would provide the best coverage, at the cost of
> potentially causing to false failures.

OK, so your point is that using mutual exclusion to ensure stability of the cpu id
changes the timings too much, to a point where the issues don't reproduce. I understand
that this mutex ties the migration thread timings to the vcpu thread's use of the mutex,
which will reduce timings randomness, which is unwanted here.

I still really hate flakiness in tests, because then people stop caring when they
fail once in a while. And with the nature of rseq, a once-in-a-while failure is a
big deal. Let's see if we can use other tricks to ensure stability of the cpu id
without changing timings too much.

One idea would be to use a seqcount lock. But even if we use that, I'm concerned that
the very long writer critical section calling sched_setaffinity would need to be
alternated with a sleep to ensure the read-side progresses. The sleep delay could be
relatively small compared to the duration of the sched_setaffinity call, e.g. ratio
1:10.

static volatile uint64_t seqcnt;

The thread responsible for setting the affinity would do something like:

for (;;) {
  atomic_inc_seq_cst(&seqcnt);
  sched_setaffinity(..., n++ % nr_cpus);
  atomic_inc_seq_cst(&seqcnt);
  usleep(1);  /* this is where read-side is allowed to progress. */
}

And the thread reading the rseq cpu id and calling sched_getcpu():

uint64_t snapshot;

do {
  snapshot = atomic_load(&seqcnt) & ~1; /* force retry if odd */
  smp_rmb();
  cpu = sched_getcpu();
  rseq_cpu = READ_ONCE(__rseq.cpu_id);
  smp_rmb();
} while (snapshot != atomic_load(&seqcnt));

So the reader retry the cpu id reads whenever sched_setaffinity is being
called by the migration thread, and whenever it is preempted for more
than one migration thread loop.

That should achieve our goal of providing cpu id stability without significantly
changing the timings of the migration thread, given that it never blocks waiting
for the reader.

Thoughts ?

Thanks,

Mathieu

-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com

^ permalink raw reply

* Re: [PATCH] powerpc/32: indirect function call use bctrl rather than blrl in ret_from_kernel_thread
From: Segher Boessenkool @ 2021-08-20 18:12 UTC (permalink / raw)
  To: Michael Ellerman; +Cc: linuxppc-dev, Paul Mackerras, linux-kernel
In-Reply-To: <20210820171845.GS1583@gate.crashing.org>

On Fri, Aug 20, 2021 at 12:18:45PM -0500, Segher Boessenkool wrote:
> On Fri, Aug 20, 2021 at 10:15:11PM +1000, Michael Ellerman wrote:
> > Christophe Leroy <christophe.leroy@csgroup.eu> writes:
> > > Copied from commit 89bbe4c798bc ("powerpc/64: indirect function call
> > > use bctrl rather than blrl in ret_from_kernel_thread")
> > >
> > > blrl is not recommended to use as an indirect function call, as it may
> > > corrupt the link stack predictor.
> > 
> > Do we know if any 32-bit CPUs have a link stack predictor or similar?
> 
> 74xx do.

7450 and later, that is.  Will I ever get that right, sigh.


Segher

^ permalink raw reply

* Re: [PATCH] powerpc/32: indirect function call use bctrl rather than blrl in ret_from_kernel_thread
From: Segher Boessenkool @ 2021-08-20 17:18 UTC (permalink / raw)
  To: Michael Ellerman; +Cc: Paul Mackerras, linuxppc-dev, linux-kernel
In-Reply-To: <871r6oe2i8.fsf@mpe.ellerman.id.au>

On Fri, Aug 20, 2021 at 10:15:11PM +1000, Michael Ellerman wrote:
> Christophe Leroy <christophe.leroy@csgroup.eu> writes:
> > Copied from commit 89bbe4c798bc ("powerpc/64: indirect function call
> > use bctrl rather than blrl in ret_from_kernel_thread")
> >
> > blrl is not recommended to use as an indirect function call, as it may
> > corrupt the link stack predictor.
> 
> Do we know if any 32-bit CPUs have a link stack predictor or similar?

74xx do.


Segher

^ permalink raw reply

* [Bug 213079] [bisected] IRQ problems and crashes on a PowerMac G5 with 5.12.3
From: bugzilla-daemon @ 2021-08-20 16:39 UTC (permalink / raw)
  To: linuxppc-dev
In-Reply-To: <bug-213079-206035@https.bugzilla.kernel.org/>

https://bugzilla.kernel.org/show_bug.cgi?id=213079

--- Comment #18 from Erhard F. (erhard_f@mailbox.org) ---
The 'hackfix for MSI init' patch also applies on top of v5.14-rc6.

But unchanged the G5 runs later into bug #213837.

-- 
You may reply to this email to add a comment.

You are receiving this mail because:
You are watching the assignee of the bug.

^ permalink raw reply

* [Bug 213837] "Kernel panic - not syncing: corrupted stack end detected inside scheduler" at building via distcc on a G5
From: bugzilla-daemon @ 2021-08-20 16:35 UTC (permalink / raw)
  To: linuxppc-dev
In-Reply-To: <bug-213837-206035@https.bugzilla.kernel.org/>

https://bugzilla.kernel.org/show_bug.cgi?id=213837

--- Comment #3 from Erhard F. (erhard_f@mailbox.org) ---
Created attachment 298395
  --> https://bugzilla.kernel.org/attachment.cgi?id=298395&action=edit
kernel .config (5.14-rc6, PowerMac G5 11,2)

-- 
You may reply to this email to add a comment.

You are receiving this mail because:
You are watching someone on the CC list of the bug.

^ permalink raw reply

* [Bug 213837] "Kernel panic - not syncing: corrupted stack end detected inside scheduler" at building via distcc on a G5
From: bugzilla-daemon @ 2021-08-20 16:34 UTC (permalink / raw)
  To: linuxppc-dev
In-Reply-To: <bug-213837-206035@https.bugzilla.kernel.org/>

https://bugzilla.kernel.org/show_bug.cgi?id=213837

--- Comment #2 from Erhard F. (erhard_f@mailbox.org) ---
Created attachment 298393
  --> https://bugzilla.kernel.org/attachment.cgi?id=298393&action=edit
dmesg (5.14-rc6, PowerMac G5 11,2)

Happens also on 5.14-rc6:

[...]
Kernel panic - not syncing: corrupted stack end detected inside scheduler
CPU: 1 PID: 32354 Comm: powerpc64-unkno Tainted: G        W        
5.14.0-rc6-PowerMacG5+ #3
Call Trace:
[c000000062feb4c0] [c00000000054de04] .dump_stack_lvl+0x98/0xe0 (unreliable)
[c000000062feb550] [c000000000068f4c] .panic+0x160/0x40c
[c000000062feb600] [c000000000818d3c] .__schedule+0x7c/0x840
[c000000062feb6d0] [c00000000081964c] .preempt_schedule_common+0x28/0x48
[c000000062feb750] [c00000000081969c] .__cond_resched+0x30/0x4c
[c000000062feb7d0] [c0000000004ee8f8] .copy_page_to_iter+0xbc/0x32c
[c000000062feb8a0] [c0000000001c9c20] .filemap_read+0x574/0x618
[c000000062feba60] [c000000000333c18] .ext4_file_read_iter+0xb8/0x11c
[c000000062febb00] [c000000000275488] .new_sync_read+0x94/0xe0
[c000000062febc00] [c000000000276c2c] .vfs_read+0x128/0x12c
[c000000062febca0] [c000000000276fc4] .ksys_read+0x78/0xc4
[c000000062febd60] [c000000000022808] .system_call_exception+0x1a4/0x1dc
[c000000062febe10] [c00000000000b4cc] system_call_common+0xec/0x250
--- interrupt: c00 at 0x3fff999e0cd0
NIP:  00003fff999e0cd0 LR: 00000001039d3660 CTR: 0000000000000000
REGS: c000000062febe80 TRAP: 0c00   Tainted: G        W         
(5.14.0-rc6-PowerMacG5+)
MSR:  900000000200f032 <SF,HV,VEC,EE,PR,FP,ME,IR,DR,RI>  CR: 24000442  XER:
00000000
IRQMASK: 0 
GPR00: 0000000000000003 00003ffff4e23690 00003fff99a0df00 0000000000000004 
GPR04: 00003fff99699010 00000000000583ca 00003fff999c1320 0000000000000000 
GPR08: 00003fff999c12e0 0000000000000000 0000000000000000 0000000000000000 
GPR12: 0000000000000000 00003fff99a93c20 000000011fbcf950 000000017149a1d0 
GPR16: 00000001039dec38 00003ffff4e23b78 00000001039deb28 00003ffff4e239c8 
GPR20: 00003ffff4e23d80 ffffffffffffffff 000000011fbce540 0000000000000000 
GPR24: 000000011fbcf930 000000011fbcfa90 0000000000000005 00003ffff4e238e0 
GPR28: 0000000103a268e8 0000000000000004 00003fff99699010 00000000000583ca 
NIP [00003fff999e0cd0] 0x3fff999e0cd0
LR [00000001039d3660] 0x1039d3660
--- interrupt: c00
Rebooting in 40 seconds..

-- 
You may reply to this email to add a comment.

You are receiving this mail because:
You are watching someone on the CC list of the bug.

^ permalink raw reply

* Re: [PATCH v2 57/63] powerpc/signal32: Use struct_group() to zero spe regs
From: Kees Cook @ 2021-08-20 15:55 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: kernel test robot, Rasmus Villemoes, Greg Kroah-Hartman,
	linux-staging, linux-wireless, linux-kernel, Gustavo A. R. Silva,
	linux-block, clang-built-linux, netdev, Paul Mackerras, dri-devel,
	Sudeep Holla, Andrew Morton, linuxppc-dev, linux-kbuild,
	linux-hardening
In-Reply-To: <877dggeesw.fsf@mpe.ellerman.id.au>

On Fri, Aug 20, 2021 at 05:49:35PM +1000, Michael Ellerman wrote:
> Kees Cook <keescook@chromium.org> writes:
> > In preparation for FORTIFY_SOURCE performing compile-time and run-time
> > field bounds checking for memset(), avoid intentionally writing across
> > neighboring fields.
> >
> > Add a struct_group() for the spe registers so that memset() can correctly reason
> > about the size:
> >
> >    In function 'fortify_memset_chk',
> >        inlined from 'restore_user_regs.part.0' at arch/powerpc/kernel/signal_32.c:539:3:
> >>> include/linux/fortify-string.h:195:4: error: call to '__write_overflow_field' declared with attribute warning: detected write beyond size of field (1st parameter); maybe use struct_group()? [-Werror=attribute-warning]
> >      195 |    __write_overflow_field();
> >          |    ^~~~~~~~~~~~~~~~~~~~~~~~
> >
> > Cc: Michael Ellerman <mpe@ellerman.id.au>
> > Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
> > Cc: Paul Mackerras <paulus@samba.org>
> > Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
> > Cc: Sudeep Holla <sudeep.holla@arm.com>
> > Cc: linuxppc-dev@lists.ozlabs.org
> > Reported-by: kernel test robot <lkp@intel.com>
> > Signed-off-by: Kees Cook <keescook@chromium.org>
> > ---
> >  arch/powerpc/include/asm/processor.h | 6 ++++--
> >  arch/powerpc/kernel/signal_32.c      | 6 +++---
> >  2 files changed, 7 insertions(+), 5 deletions(-)
> >
> > diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h
> > index f348e564f7dd..05dc567cb9a8 100644
> > --- a/arch/powerpc/include/asm/processor.h
> > +++ b/arch/powerpc/include/asm/processor.h
> > @@ -191,8 +191,10 @@ struct thread_struct {
> >  	int		used_vsr;	/* set if process has used VSX */
> >  #endif /* CONFIG_VSX */
> >  #ifdef CONFIG_SPE
> > -	unsigned long	evr[32];	/* upper 32-bits of SPE regs */
> > -	u64		acc;		/* Accumulator */
> > +	struct_group(spe,
> > +		unsigned long	evr[32];	/* upper 32-bits of SPE regs */
> > +		u64		acc;		/* Accumulator */
> > +	);
> >  	unsigned long	spefscr;	/* SPE & eFP status */
> >  	unsigned long	spefscr_last;	/* SPEFSCR value on last prctl
> >  					   call or trap return */
> > diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
> > index 0608581967f0..77b86caf5c51 100644
> > --- a/arch/powerpc/kernel/signal_32.c
> > +++ b/arch/powerpc/kernel/signal_32.c
> > @@ -532,11 +532,11 @@ static long restore_user_regs(struct pt_regs *regs,
> >  	regs_set_return_msr(regs, regs->msr & ~MSR_SPE);
> >  	if (msr & MSR_SPE) {
> >  		/* restore spe registers from the stack */
> > -		unsafe_copy_from_user(current->thread.evr, &sr->mc_vregs,
> > -				      ELF_NEVRREG * sizeof(u32), failed);
> > +		unsafe_copy_from_user(&current->thread.spe, &sr->mc_vregs,
> > +				      sizeof(current->thread.spe), failed);
> 
> This makes me nervous, because the ABI is that we copy ELF_NEVRREG *
> sizeof(u32) bytes, not whatever sizeof(current->thread.spe) happens to
> be.
> 
> ie. if we use sizeof an inadvertent change to the fields in
> thread_struct could change how many bytes we copy out to userspace,
> which would be an ABI break.
> 
> And that's not that hard to do, because it's not at all obvious that the
> size and layout of fields in thread_struct affects the user ABI.
> 
> At the same time we don't want to copy the right number of bytes but
> the wrong content, so from that point of view using sizeof is good :)
> 
> The way we handle it in ptrace is to have BUILD_BUG_ON()s to verify that
> things match up, so maybe we should do that here too.
> 
> ie. add:
> 
> 	BUILD_BUG_ON(sizeof(current->thread.spe) == ELF_NEVRREG * sizeof(u32));
> 
> Not sure if you are happy doing that as part of this patch. I can always
> do it later if not.

Sounds good to me; I did that in a few other cases in the series where
the relationships between things seemed tenuous. :) I'll add this (as
!=) in v3.

Thanks!

-- 
Kees Cook

^ permalink raw reply

* Re: [PATCH v8 2/3] tty: hvc: pass DMA capable memory to put_chars()
From: Michael Ellerman @ 2021-08-20 12:34 UTC (permalink / raw)
  To: Daniel Axtens, Xianting Tian, gregkh, jirislaby, amit, arnd,
	osandov
  Cc: Xianting Tian, shile.zhang, linuxppc-dev, linux-kernel,
	virtualization
In-Reply-To: <87pmu8ehkk.fsf@linkitivity.dja.id.au>

Daniel Axtens <dja@axtens.net> writes:
> Xianting Tian <xianting.tian@linux.alibaba.com> writes:
>
>> As well known, hvc backend driver(eg, virtio-console) can register its
>> operations to hvc framework. The operations can contain put_chars(),
>> get_chars() and so on.
>>
>> Some hvc backend may do dma in its operations. eg, put_chars() of
>> virtio-console. But in the code of hvc framework, it may pass DMA
>> incapable memory to put_chars() under a specific configuration, which
>> is explained in commit c4baad5029(virtio-console: avoid DMA from stack):
>
> We could also run into issues on powerpc where Andrew is working on
> adding vmap-stack but the opal hvc driver assumes that it is passed a
> buffer which is not in vmalloc space but in the linear mapping.

The right fix for that is our code that calls opal has to be careful
that it's not passing vmalloc addresses.

We have many cases where we pass stack variables to opal, they'll all
have to be fixed to pass the underlying phyiscal/linear map address. The
opal hvc code will just be one more case of that.

cheers

^ permalink raw reply

* Re: [PATCH 6/6] powerpc/compat_sys: Declare syscalls
From: Michael Ellerman @ 2021-08-20 12:25 UTC (permalink / raw)
  To: Cédric Le Goater, linuxppc-dev
  Cc: Christophe Leroy, Cédric Le Goater
In-Reply-To: <20210819125656.14498-7-clg@kaod.org>

Cédric Le Goater <clg@kaod.org> writes:
> This fixes a compile error with W=1.
>
> Cc: Christophe Leroy <christophe.leroy@c-s.fr>
> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> ---
>  arch/powerpc/include/asm/syscalls.h | 31 +++++++++++++++++++++++++++++
>  1 file changed, 31 insertions(+)
>
> diff --git a/arch/powerpc/include/asm/syscalls.h b/arch/powerpc/include/asm/syscalls.h
> index 398171fdcd9f..1d5f2abaa38a 100644
> --- a/arch/powerpc/include/asm/syscalls.h
> +++ b/arch/powerpc/include/asm/syscalls.h
> @@ -6,6 +6,7 @@
>  #include <linux/compiler.h>
>  #include <linux/linkage.h>
>  #include <linux/types.h>
> +#include <linux/compat.h>
>  
>  struct rtas_args;
>  
> @@ -18,5 +19,35 @@ asmlinkage long sys_mmap2(unsigned long addr, size_t len,
>  asmlinkage long ppc64_personality(unsigned long personality);
>  asmlinkage long sys_rtas(struct rtas_args __user *uargs);
>  
> +#ifdef CONFIG_COMPAT
> +unsigned long compat_sys_mmap2(unsigned long addr, size_t len,
> +			       unsigned long prot, unsigned long flags,
> +			       unsigned long fd, unsigned long pgoff);
> +
> +compat_ssize_t compat_sys_pread64(unsigned int fd, char __user *ubuf, compat_size_t count,
> +				  u32 reg6, u32 pos1, u32 pos2);
> +
> +compat_ssize_t compat_sys_pwrite64(unsigned int fd, const char __user *ubuf, compat_size_t count,
> +				   u32 reg6, u32 pos1, u32 pos2);
> +
> +compat_ssize_t compat_sys_readahead(int fd, u32 r4, u32 offset1, u32 offset2, u32 count);
> +
> +asmlinkage int compat_sys_truncate64(const char __user *path, u32 reg4,
> +				     unsigned long len1, unsigned long len2);

asmlinkage does nothing, ie. it's an empty macro. Let's not add new uses
of it.

cheers

^ permalink raw reply

* Re: [PATCH 3/6] KVM: PPC: Book3S PR: Declare kvmppc_handle_exit_pr()
From: Michael Ellerman @ 2021-08-20 12:22 UTC (permalink / raw)
  To: Christophe Leroy, Cédric Le Goater, linuxppc-dev; +Cc: Christophe Leroy
In-Reply-To: <59d11f91-c235-6b7d-6d9a-de8d852aba35@csgroup.eu>

Christophe Leroy <christophe.leroy@csgroup.eu> writes:
> Le 19/08/2021 à 14:56, Cédric Le Goater a écrit :
>> This fixes a compile error with W=1.
>> 
>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
>> ---
>>   arch/powerpc/kvm/book3s.h | 1 +
>>   1 file changed, 1 insertion(+)
>> 
>> diff --git a/arch/powerpc/kvm/book3s.h b/arch/powerpc/kvm/book3s.h
>> index 740e51def5a5..c08f93b7f523 100644
>> --- a/arch/powerpc/kvm/book3s.h
>> +++ b/arch/powerpc/kvm/book3s.h
>> @@ -24,6 +24,7 @@ extern int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu,
>>   					int sprn, ulong *spr_val);
>>   extern int kvmppc_book3s_init_pr(void);
>>   extern void kvmppc_book3s_exit_pr(void);
>> +extern int kvmppc_handle_exit_pr(struct kvm_vcpu *vcpu, unsigned int exit_nr);
>
> Don't add new 'extern' keywords, they are useless and pointless for functions prototypes.

I dropped it when applying.

cheers

^ permalink raw reply

* Re: [PATCH 1/6] powerpc/prom: Introduce early_reserve_mem_old()
From: Michael Ellerman @ 2021-08-20 12:22 UTC (permalink / raw)
  To: Christophe Leroy, Cédric Le Goater, linuxppc-dev; +Cc: Christophe Leroy
In-Reply-To: <d79611e0-b42b-e74b-d628-5db718e6ebfa@csgroup.eu>

Christophe Leroy <christophe.leroy@csgroup.eu> writes:
> Le 19/08/2021 à 14:56, Cédric Le Goater a écrit :
>> and condition its call with IS_ENABLED(CONFIG_PPC32). This fixes a
>> compile error with W=1.
>> 
>> arch/powerpc/kernel/prom.c: In function ‘early_reserve_mem’:
>> arch/powerpc/kernel/prom.c:625:10: error: variable ‘reserve_map’ set but not used [-Werror=unused-but-set-variable]
>>    __be64 *reserve_map;
>>            ^~~~~~~~~~~
>> cc1: all warnings being treated as errors
>> 
>> Cc: Christophe Leroy <christophe.leroy@c-s.fr>
>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
>> ---
>> 
>>   Christophe, I think you had comments on this one ? Yes, I am being a bit lazy.
>
>
> Yeah, my comment was to leave thing almost as is, just drop the #ifdef CONFIG_PPC32 and instead put 
> something like:
>
> 	if (!IS_ENABLED(CONFIG_PPC32))
> 		return;

Yeah that's cleaner, let's do that.

cheers

^ permalink raw reply

* Re: [PATCH] powerpc/32: indirect function call use bctrl rather than blrl in ret_from_kernel_thread
From: Michael Ellerman @ 2021-08-20 12:15 UTC (permalink / raw)
  To: Christophe Leroy, Benjamin Herrenschmidt, Paul Mackerras
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <91b1d242525307ceceec7ef6e832bfbacdd4501b.1629436472.git.christophe.leroy@csgroup.eu>

Christophe Leroy <christophe.leroy@csgroup.eu> writes:
> Copied from commit 89bbe4c798bc ("powerpc/64: indirect function call
> use bctrl rather than blrl in ret_from_kernel_thread")
>
> blrl is not recommended to use as an indirect function call, as it may
> corrupt the link stack predictor.

Do we know if any 32-bit CPUs have a link stack predictor or similar?

cheers

> This is not a performance critical path but this should be fixed for
> consistency.
>
> Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
> ---
>  arch/powerpc/kernel/entry_32.S | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
> index 0273a1349006..61fdd53cdd9a 100644
> --- a/arch/powerpc/kernel/entry_32.S
> +++ b/arch/powerpc/kernel/entry_32.S
> @@ -161,10 +161,10 @@ ret_from_fork:
>  ret_from_kernel_thread:
>  	REST_NVGPRS(r1)
>  	bl	schedule_tail
> -	mtlr	r14
> +	mtctr	r14
>  	mr	r3,r15
>  	PPC440EP_ERR42
> -	blrl
> +	bctrl
>  	li	r3,0
>  	b	ret_from_syscall
>  
> -- 
> 2.25.0

^ permalink raw reply

* Re: [PATCH v2 57/63] powerpc/signal32: Use struct_group() to zero spe regs
From: Michael Ellerman @ 2021-08-20 12:13 UTC (permalink / raw)
  To: Christophe Leroy, Kees Cook, linux-kernel
  Cc: kernel test robot, Rasmus Villemoes, Greg Kroah-Hartman,
	linux-staging, linux-wireless, Gustavo A. R. Silva, dri-devel,
	linux-block, clang-built-linux, netdev, Paul Mackerras,
	linux-hardening, Sudeep Holla, Andrew Morton, linuxppc-dev,
	linux-kbuild
In-Reply-To: <0f6e508e-62b6-3840-5ff4-eb5a77635bd1@csgroup.eu>

Christophe Leroy <christophe.leroy@csgroup.eu> writes:
> Le 20/08/2021 à 09:49, Michael Ellerman a écrit :
>> Kees Cook <keescook@chromium.org> writes:
>>> In preparation for FORTIFY_SOURCE performing compile-time and run-time
>>> field bounds checking for memset(), avoid intentionally writing across
>>> neighboring fields.
>>>
>>> Add a struct_group() for the spe registers so that memset() can correctly reason
>>> about the size:
>>>
>>>     In function 'fortify_memset_chk',
>>>         inlined from 'restore_user_regs.part.0' at arch/powerpc/kernel/signal_32.c:539:3:
>>>>> include/linux/fortify-string.h:195:4: error: call to '__write_overflow_field' declared with attribute warning: detected write beyond size of field (1st parameter); maybe use struct_group()? [-Werror=attribute-warning]
>>>       195 |    __write_overflow_field();
>>>           |    ^~~~~~~~~~~~~~~~~~~~~~~~
>>>
>>> Cc: Michael Ellerman <mpe@ellerman.id.au>
>>> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
>>> Cc: Paul Mackerras <paulus@samba.org>
>>> Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
>>> Cc: Sudeep Holla <sudeep.holla@arm.com>
>>> Cc: linuxppc-dev@lists.ozlabs.org
>>> Reported-by: kernel test robot <lkp@intel.com>
>>> Signed-off-by: Kees Cook <keescook@chromium.org>
>>> ---
>>>   arch/powerpc/include/asm/processor.h | 6 ++++--
>>>   arch/powerpc/kernel/signal_32.c      | 6 +++---
>>>   2 files changed, 7 insertions(+), 5 deletions(-)
>>>
>>> diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h
>>> index f348e564f7dd..05dc567cb9a8 100644
>>> --- a/arch/powerpc/include/asm/processor.h
>>> +++ b/arch/powerpc/include/asm/processor.h
>>> @@ -191,8 +191,10 @@ struct thread_struct {
>>>   	int		used_vsr;	/* set if process has used VSX */
>>>   #endif /* CONFIG_VSX */
>>>   #ifdef CONFIG_SPE
>>> -	unsigned long	evr[32];	/* upper 32-bits of SPE regs */
>>> -	u64		acc;		/* Accumulator */
>>> +	struct_group(spe,
>>> +		unsigned long	evr[32];	/* upper 32-bits of SPE regs */
>>> +		u64		acc;		/* Accumulator */
>>> +	);
>>>   	unsigned long	spefscr;	/* SPE & eFP status */
>>>   	unsigned long	spefscr_last;	/* SPEFSCR value on last prctl
>>>   					   call or trap return */
>>> diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
>>> index 0608581967f0..77b86caf5c51 100644
>>> --- a/arch/powerpc/kernel/signal_32.c
>>> +++ b/arch/powerpc/kernel/signal_32.c
>>> @@ -532,11 +532,11 @@ static long restore_user_regs(struct pt_regs *regs,
>>>   	regs_set_return_msr(regs, regs->msr & ~MSR_SPE);
>>>   	if (msr & MSR_SPE) {
>>>   		/* restore spe registers from the stack */
>>> -		unsafe_copy_from_user(current->thread.evr, &sr->mc_vregs,
>>> -				      ELF_NEVRREG * sizeof(u32), failed);
>>> +		unsafe_copy_from_user(&current->thread.spe, &sr->mc_vregs,
>>> +				      sizeof(current->thread.spe), failed);
>> 
>> This makes me nervous, because the ABI is that we copy ELF_NEVRREG *
>> sizeof(u32) bytes, not whatever sizeof(current->thread.spe) happens to
>> be.
>> 
>> ie. if we use sizeof an inadvertent change to the fields in
>> thread_struct could change how many bytes we copy out to userspace,
>> which would be an ABI break.
>> 
>> And that's not that hard to do, because it's not at all obvious that the
>> size and layout of fields in thread_struct affects the user ABI.
>> 
>> At the same time we don't want to copy the right number of bytes but
>> the wrong content, so from that point of view using sizeof is good :)
>> 
>> The way we handle it in ptrace is to have BUILD_BUG_ON()s to verify that
>> things match up, so maybe we should do that here too.
>> 
>> ie. add:
>> 
>> 	BUILD_BUG_ON(sizeof(current->thread.spe) == ELF_NEVRREG * sizeof(u32));
>
> You mean != I guess ?

Gah. Yes I do :)

cheers

^ permalink raw reply

* [PATCH v1] powerpc/64s: Fix scv implicit soft-mask table for relocated kernels
From: Nicholas Piggin @ 2021-08-20 10:34 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Hari Bathini, Nicholas Piggin

The implict soft-mask table addresses get relocated if they use a
relative symbol like a label. This is right for code that runs relocated
but not for unrelocated. The scv interrupt vectors run unrelocated, so
absolute addresses are required for their soft-mask table entry.

This fixes crashing with relocated kernels, usually an asynchronous
interrupt hitting in the scv handler, then hitting the trap that checks
whether r1 is in userspace.

Cc: Hari Bathini <hbathini@linux.ibm.com>
Fixes: 325678fd0522 ("powerpc/64s: add a table of implicit soft-masked addresses")
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/kernel/exceptions-64s.S | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 4aec59a77d4c..37859e62a8dc 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -812,7 +812,6 @@ __start_interrupts:
  * syscall register convention is in Documentation/powerpc/syscall64-abi.rst
  */
 EXC_VIRT_BEGIN(system_call_vectored, 0x3000, 0x1000)
-1:
 	/* SCV 0 */
 	mr	r9,r13
 	GET_PACA(r13)
@@ -842,10 +841,12 @@ EXC_VIRT_BEGIN(system_call_vectored, 0x3000, 0x1000)
 	b	system_call_vectored_sigill
 #endif
 	.endr
-2:
 EXC_VIRT_END(system_call_vectored, 0x3000, 0x1000)
 
-SOFT_MASK_TABLE(1b, 2b) // Treat scv vectors as soft-masked, see comment above.
+// Treat scv vectors as soft-masked, see comment above.
+// Use absolute values rather than labels here, so they don't get relocated,
+// because this code runs unrelocated.
+SOFT_MASK_TABLE(0xc000000000003000, 0xc000000000004000)
 
 #ifdef CONFIG_RELOCATABLE
 TRAMP_VIRT_BEGIN(system_call_vectored_tramp)
-- 
2.23.0


^ permalink raw reply related

* [PATCH] powerpc/audit: Simplify syscall_get_arch()
From: Christophe Leroy @ 2021-08-20  9:39 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev, linux-kernel

Make use of is_32bit_task() and CONFIG_CPU_LITTLE_ENDIAN
to simplify syscall_get_arch().

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/include/asm/syscall.h | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/include/asm/syscall.h b/arch/powerpc/include/asm/syscall.h
index ba0f88f3a30d..ac766037e8a1 100644
--- a/arch/powerpc/include/asm/syscall.h
+++ b/arch/powerpc/include/asm/syscall.h
@@ -116,16 +116,11 @@ static inline void syscall_set_arguments(struct task_struct *task,
 
 static inline int syscall_get_arch(struct task_struct *task)
 {
-	int arch;
-
-	if (IS_ENABLED(CONFIG_PPC64) && !test_tsk_thread_flag(task, TIF_32BIT))
-		arch = AUDIT_ARCH_PPC64;
+	if (is_32bit_task())
+		return AUDIT_ARCH_PPC;
+	else if (IS_ENABLED(CONFIG_CPU_LITTLE_ENDIAN))
+		return AUDIT_ARCH_PPC64LE;
 	else
-		arch = AUDIT_ARCH_PPC;
-
-#ifdef __LITTLE_ENDIAN__
-	arch |= __AUDIT_ARCH_LE;
-#endif
-	return arch;
+		return AUDIT_ARCH_PPC64;
 }
 #endif	/* _ASM_SYSCALL_H */
-- 
2.25.0


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox