The Linux Kernel Mailing List
 help / color / mirror / Atom feed
* [RFC] entry: Untangle the return value of syscall_enter_from_user_mode from syscall NR
@ 2026-07-01 17:42 Michal Suchánek
  2026-07-01 18:29 ` H. Peter Anvin
                   ` (2 more replies)
  0 siblings, 3 replies; 10+ messages in thread
From: Michal Suchánek @ 2026-07-01 17:42 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Jonathan Corbet, Shuah Khan, Huacai Chen, WANG Xuerui,
	Madhavan Srinivasan, Michael Ellerman, Nicholas Piggin,
	Christophe Leroy (CS GROUP), Paul Walmsley, Palmer Dabbelt,
	Albert Ou, Alexandre Ghiti, Heiko Carstens, Vasily Gorbik,
	Alexander Gordeev, Christian Borntraeger, Sven Schnelle,
	Andy Lutomirski, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, H. Peter Anvin, Andrew Donnellan, Mark Rutland,
	Michal Suchánek, Arnd Bergmann, Jiaxun Yang, Ryan Roberts,
	Greg Kroah-Hartman, Mukesh Kumar Chaurasiya, Shrikanth Hegde,
	Zong Li, Nam Cao, Deepak Gupta, Lukas Gerlach, Rui Qi, Kees Cook,
	linux-doc, linux-kernel, loongarch, linuxppc-dev, linux-riscv,
	linux-s390

The return value of syscall_enter_from_user_mode is used both for the
adjusted syscall number and the indicator that a syscall should be
skipped.

As seccomp can be invoked on any syscall, including invalid ones this
somewhat undermines seccomp.

While the seccomp variants that terminate the process do not need to
care about this for the filter that sets the syscall return value this
disctinction is required.

Pass the syscall number as a pointer to the inline entry functions, and
use the return value exclusively for the indication that the syscall is
already handled.

This should avoid the need for the s390 PIF_SYSCALL_RET_SET which is the
workaround for exactly this deficiency.

If this is desirable the patch could be split into some series that
adjusts the code flow where needed so that the final change is mostly
mechanical.

There is also another way to handle this problem.

With x86 using bit 30 to denote compatibility syscall it sounds like
declaring syscall number a 30bit quantity would work.

Then bit 31 could be used to denote an invalid syscall that can never be
executed, and the -1 returned from syscall_enter_from_user_mode would
then be inherently invalid.

That is so long as no architectures use syscall numbers outside of this
range so far, and the limitation is considered fine.

Signed-off-by: Michal Suchánek <msuchanek@suse.de>
---
 Documentation/core-api/entry.rst | 12 ++++++----
 arch/loongarch/kernel/syscall.c  |  6 ++---
 arch/powerpc/kernel/syscall.c    |  3 ++-
 arch/riscv/kernel/traps.c        |  6 ++---
 arch/s390/kernel/syscall.c       |  6 ++---
 arch/x86/entry/syscall_32.c      | 39 +++++++++++++++-----------------
 arch/x86/entry/syscall_64.c      | 19 ++++++++--------
 include/linux/entry-common.h     | 38 ++++++++++++++-----------------
 8 files changed, 63 insertions(+), 66 deletions(-)

diff --git a/Documentation/core-api/entry.rst b/Documentation/core-api/entry.rst
index 71d8eedc0549..b0bfae31fe7c 100644
--- a/Documentation/core-api/entry.rst
+++ b/Documentation/core-api/entry.rst
@@ -68,12 +68,14 @@ invoked from low-level assembly code looks like this:
   noinstr void syscall(struct pt_regs *regs, int nr)
   {
 	arch_syscall_enter(regs);
-	nr = syscall_enter_from_user_mode(regs, nr);
 
-	instrumentation_begin();
-	if (!invoke_syscall(regs, nr) && nr != -1)
-	 	result_reg(regs) = __sys_ni_syscall(regs);
-	instrumentation_end();
+	/* Skip syscall when -1 is returned */
+	if (!syscall_enter_from_user_mode(regs, &nr)) {
+		instrumentation_begin();
+		if (!invoke_syscall(regs, nr) && nr != -1)
+			result_reg(regs) = __sys_ni_syscall(regs);
+		instrumentation_end();
+	}
 
 	syscall_exit_to_user_mode(regs);
   }
diff --git a/arch/loongarch/kernel/syscall.c b/arch/loongarch/kernel/syscall.c
index 94c1c3b5b0b5..fc18ac56b91c 100644
--- a/arch/loongarch/kernel/syscall.c
+++ b/arch/loongarch/kernel/syscall.c
@@ -58,7 +58,7 @@ typedef long (*sys_call_fn)(unsigned long, unsigned long,
 
 void noinstr __no_stack_protector do_syscall(struct pt_regs *regs)
 {
-	unsigned long nr;
+	unsigned long nr, ret;
 	sys_call_fn syscall_fn;
 
 	nr = regs->regs[11];
@@ -70,11 +70,11 @@ void noinstr __no_stack_protector do_syscall(struct pt_regs *regs)
 	regs->orig_a0 = regs->regs[4];
 	regs->regs[4] = -ENOSYS;
 
-	nr = syscall_enter_from_user_mode(regs, nr);
+	ret = syscall_enter_from_user_mode(regs, &nr);
 
 	add_random_kstack_offset();
 
-	if (nr < NR_syscalls) {
+	if (nr < NR_syscalls && !ret) {
 		syscall_fn = sys_call_table[array_index_nospec(nr, NR_syscalls)];
 		regs->regs[4] = syscall_fn(regs->orig_a0, regs->regs[5], regs->regs[6],
 					   regs->regs[7], regs->regs[8], regs->regs[9]);
diff --git a/arch/powerpc/kernel/syscall.c b/arch/powerpc/kernel/syscall.c
index a9da2af6efa8..45d11d518c51 100644
--- a/arch/powerpc/kernel/syscall.c
+++ b/arch/powerpc/kernel/syscall.c
@@ -20,7 +20,8 @@ notrace long system_call_exception(struct pt_regs *regs, unsigned long r0)
 	syscall_fn f;
 
 	add_random_kstack_offset();
-	r0 = syscall_enter_from_user_mode(regs, r0);
+	if (unlikely(syscall_enter_from_user_mode(regs, &r0)))
+		return syscall_get_error(current, regs);
 
 	if (unlikely(r0 >= NR_syscalls)) {
 		if (unlikely(trap_is_unsupported_scv(regs))) {
diff --git a/arch/riscv/kernel/traps.c b/arch/riscv/kernel/traps.c
index 8c62c771a656..9326a4a50696 100644
--- a/arch/riscv/kernel/traps.c
+++ b/arch/riscv/kernel/traps.c
@@ -325,7 +325,7 @@ asmlinkage __visible __trap_section  __no_stack_protector
 void do_trap_ecall_u(struct pt_regs *regs)
 {
 	if (user_mode(regs)) {
-		long syscall = regs->a7;
+		long ret, syscall = regs->a7;
 
 		regs->epc += 4;
 		regs->orig_a0 = regs->a0;
@@ -333,11 +333,11 @@ void do_trap_ecall_u(struct pt_regs *regs)
 
 		riscv_v_vstate_discard(regs);
 
-		syscall = syscall_enter_from_user_mode(regs, syscall);
+		ret = syscall_enter_from_user_mode(regs, &syscall);
 
 		add_random_kstack_offset();
 
-		if (syscall >= 0 && syscall < NR_syscalls) {
+		if (syscall >= 0 && syscall < NR_syscalls && !ret) {
 			syscall = array_index_nospec(syscall, NR_syscalls);
 			syscall_handler(regs, syscall);
 		}
diff --git a/arch/s390/kernel/syscall.c b/arch/s390/kernel/syscall.c
index 75d5a3cab14e..9e5b873c011d 100644
--- a/arch/s390/kernel/syscall.c
+++ b/arch/s390/kernel/syscall.c
@@ -95,7 +95,7 @@ SYSCALL_DEFINE0(ni_syscall)
 
 void noinstr __do_syscall(struct pt_regs *regs, int per_trap)
 {
-	unsigned long nr;
+	unsigned long nr, ret;
 
 	enter_from_user_mode(regs);
 	add_random_kstack_offset();
@@ -121,7 +121,7 @@ void noinstr __do_syscall(struct pt_regs *regs, int per_trap)
 		regs->psw.addr = current->restart_block.arch_data;
 		current->restart_block.arch_data = 1;
 	}
-	nr = syscall_enter_from_user_mode_work(regs, nr);
+	ret = syscall_enter_from_user_mode_work(regs, &nr);
 	/*
 	 * In the s390 ptrace ABI, both the syscall number and the return value
 	 * use gpr2. However, userspace puts the syscall number either in the
@@ -129,7 +129,7 @@ void noinstr __do_syscall(struct pt_regs *regs, int per_trap)
 	 * work, the ptrace code sets PIF_SYSCALL_RET_SET, which is checked here
 	 * and if set, the syscall will be skipped.
 	 */
-	if (unlikely(test_and_clear_pt_regs_flag(regs, PIF_SYSCALL_RET_SET)))
+	if (unlikely(test_and_clear_pt_regs_flag(regs, PIF_SYSCALL_RET_SET) || ret))
 		goto out;
 	regs->gprs[2] = -ENOSYS;
 	if (likely(nr < NR_syscalls)) {
diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c
index 31b9492fe851..525e99691b31 100644
--- a/arch/x86/entry/syscall_32.c
+++ b/arch/x86/entry/syscall_32.c
@@ -128,7 +128,7 @@ static __always_inline bool int80_is_external(void)
  */
 __visible noinstr void do_int80_emulation(struct pt_regs *regs)
 {
-	int nr;
+	long nr;
 
 	/* Kernel does not use INT $0x80! */
 	if (unlikely(!user_mode(regs))) {
@@ -168,8 +168,7 @@ __visible noinstr void do_int80_emulation(struct pt_regs *regs)
 	nr = syscall_32_enter(regs);
 
 	local_irq_enable();
-	nr = syscall_enter_from_user_mode_work(regs, nr);
-	do_syscall_32_irqs_on(regs, nr);
+	syscall_enter_from_user_mode_work(regs, &nr);
 
 	instrumentation_end();
 	syscall_exit_to_user_mode(regs);
@@ -208,7 +207,7 @@ __visible noinstr void do_int80_emulation(struct pt_regs *regs)
  */
 DEFINE_FREDENTRY_RAW(int80_emulation)
 {
-	int nr;
+	long nr;
 
 	enter_from_user_mode(regs);
 
@@ -232,8 +231,10 @@ DEFINE_FREDENTRY_RAW(int80_emulation)
 	nr = syscall_32_enter(regs);
 
 	local_irq_enable();
-	nr = syscall_enter_from_user_mode_work(regs, nr);
-	do_syscall_32_irqs_on(regs, nr);
+	if (!syscall_enter_from_user_mode_work(regs, &nr)) {
+		nr &= GENMASK(31, 0);
+		do_syscall_32_irqs_on(regs, nr);
+	}
 
 	instrumentation_end();
 	syscall_exit_to_user_mode(regs);
@@ -245,20 +246,17 @@ DEFINE_FREDENTRY_RAW(int80_emulation)
 /* Handles int $0x80 on a 32bit kernel */
 __visible noinstr void do_int80_syscall_32(struct pt_regs *regs)
 {
-	int nr = syscall_32_enter(regs);
-
-	/*
-	 * Subtlety here: if ptrace pokes something larger than 2^31-1 into
-	 * orig_ax, the int return value truncates it. This matches
-	 * the semantics of syscall_get_nr().
-	 */
-	nr = syscall_enter_from_user_mode(regs, nr);
-	instrumentation_begin();
+	long nr = syscall_32_enter(regs);
 
 	add_random_kstack_offset();
-	do_syscall_32_irqs_on(regs, nr);
+	if (!syscall_enter_from_user_mode(regs, &nr)) {
+		instrumentation_begin();
 
-	instrumentation_end();
+		nr &= & GENMASK(31, 0);
+		do_syscall_32_irqs_on(regs, nr);
+
+		instrumentation_end();
+	}
 	syscall_exit_to_user_mode(regs);
 }
 #endif /* !CONFIG_IA32_EMULATION */
@@ -301,10 +299,9 @@ static noinstr bool __do_fast_syscall_32(struct pt_regs *regs)
 		return false;
 	}
 
-	nr = syscall_enter_from_user_mode_work(regs, nr);
-
-	/* Now this is just like a normal syscall. */
-	do_syscall_32_irqs_on(regs, nr);
+	if (!syscall_enter_from_user_mode_work(regs, &nr))
+		/* Now this is just like a normal syscall. */
+		do_syscall_32_irqs_on(regs, nr);
 
 	instrumentation_end();
 	syscall_exit_to_user_mode(regs);
diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c
index 71f032504e73..3400c2f43a62 100644
--- a/arch/x86/entry/syscall_64.c
+++ b/arch/x86/entry/syscall_64.c
@@ -84,19 +84,20 @@ static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr)
 }
 
 /* Returns true to return using SYSRET, or false to use IRET */
-__visible noinstr bool do_syscall_64(struct pt_regs *regs, int nr)
+__visible noinstr bool do_syscall_64(struct pt_regs *regs, long nr)
 {
-	nr = syscall_enter_from_user_mode(regs, nr);
-
-	instrumentation_begin();
 	add_random_kstack_offset();
+	if (!syscall_enter_from_user_mode(regs, &nr)) {
 
-	if (!do_syscall_x64(regs, nr) && !do_syscall_x32(regs, nr) && nr != -1) {
-		/* Invalid system call, but still a system call. */
-		regs->ax = __x64_sys_ni_syscall(regs);
-	}
+		instrumentation_begin();
 
-	instrumentation_end();
+		if (!do_syscall_x64(regs, nr) && !do_syscall_x32(regs, nr)) {
+			/* Invalid system call, but still a system call. */
+			regs->ax = __x64_sys_ni_syscall(regs);
+		}
+
+		instrumentation_end();
+	}
 	syscall_exit_to_user_mode(regs);
 
 	/*
diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index 416a3352261f..4991071d01fe 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -69,10 +69,8 @@ static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
 	}
 }
 
-static __always_inline long syscall_trace_enter(struct pt_regs *regs, unsigned long work)
+static __always_inline long syscall_trace_enter(struct pt_regs *regs, unsigned long work, unsigned long *syscall)
 {
-	long syscall, ret = 0;
-
 	/*
 	 * Handle Syscall User Dispatch.  This must comes first, since
 	 * the ABI here can be something that doesn't make sense for
@@ -93,27 +91,25 @@ static __always_inline long syscall_trace_enter(struct pt_regs *regs, unsigned l
 
 	/* Handle ptrace */
 	if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) {
-		ret = arch_ptrace_report_syscall_entry(regs);
-		if (ret || (work & SYSCALL_WORK_SYSCALL_EMU))
+		if (arch_ptrace_report_syscall_entry(regs) || (work & SYSCALL_WORK_SYSCALL_EMU))
 			return -1L;
 	}
 
 	/* Do seccomp after ptrace, to catch any tracer changes. */
 	if (work & SYSCALL_WORK_SECCOMP) {
-		ret = __secure_computing();
-		if (ret == -1L)
-			return ret;
+		if (__secure_computing())
+			return -1L;
 	}
 
 	/* Either of the above might have changed the syscall number */
-	syscall = syscall_get_nr(current, regs);
+	*syscall = syscall_get_nr(current, regs);
 
 	if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT))
-		syscall = trace_syscall_enter(regs, syscall);
+		*syscall = trace_syscall_enter(regs, *syscall);
 
-	syscall_enter_audit(regs, syscall);
+	syscall_enter_audit(regs, *syscall);
 
-	return ret ? : syscall;
+	return 0;
 }
 
 /**
@@ -126,12 +122,12 @@ static __always_inline long syscall_trace_enter(struct pt_regs *regs, unsigned l
  * enabled after invoking enter_from_user_mode(), enabling interrupts and
  * extra architecture specific work.
  *
- * Returns: The original or a modified syscall number
+ * Returns: The original or a modified syscall number as syscall
  *
- * If the returned syscall number is -1 then the syscall should be
- * skipped. In this case the caller may invoke syscall_set_error() or
- * syscall_set_return_value() first.  If neither of those are called and -1
- * is returned, then the syscall will fail with ENOSYS.
+ * If the returned value is -1 then the syscall should be skipped. In this case
+ * the caller may invoke syscall_set_error() or syscall_set_return_value()
+ * first.  If neither of those are called and -1 is returned, then the syscall
+ * will fail with ENOSYS.
  *
  * It handles the following work items:
  *
@@ -139,14 +135,14 @@ static __always_inline long syscall_trace_enter(struct pt_regs *regs, unsigned l
  *     ptrace_report_syscall_entry(), __secure_computing(), trace_sys_enter()
  *  2) Invocation of audit_syscall_entry()
  */
-static __always_inline long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall)
+static __always_inline long syscall_enter_from_user_mode_work(struct pt_regs *regs, long *syscall)
 {
 	unsigned long work = READ_ONCE(current_thread_info()->syscall_work);
 
 	if (work & SYSCALL_WORK_ENTER)
-		syscall = syscall_trace_enter(regs, work);
+		return syscall_trace_enter(regs, work, syscall);
 
-	return syscall;
+	return 0;
 }
 
 /**
@@ -167,7 +163,7 @@ static __always_inline long syscall_enter_from_user_mode_work(struct pt_regs *re
  * Returns: The original or a modified syscall number. See
  * syscall_enter_from_user_mode_work() for further explanation.
  */
-static __always_inline long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
+static __always_inline long syscall_enter_from_user_mode(struct pt_regs *regs, long *syscall)
 {
 	long ret;
 
-- 
2.51.0


^ permalink raw reply related	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2026-07-02 20:45 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-07-01 17:42 [RFC] entry: Untangle the return value of syscall_enter_from_user_mode from syscall NR Michal Suchánek
2026-07-01 18:29 ` H. Peter Anvin
2026-07-02  9:30   ` Michal Suchánek
2026-07-02  8:12 ` Sven Schnelle
2026-07-02  9:12   ` Michal Suchánek
2026-07-02 12:01     ` Sven Schnelle
2026-07-02 12:13       ` Michal Suchánek
2026-07-02 11:24 ` Thomas Gleixner
2026-07-02 11:45   ` Michal Suchánek
2026-07-02 20:45     ` Thomas Gleixner

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox