The Linux Kernel Mailing List
 help / color / mirror / Atom feed
From: Peter Zijlstra <peterz@infradead.org>
To: Linus Torvalds <torvalds@linux-foundation.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>,
	tglx@kernel.org, mingo@redhat.com, bp@alien8.de,
	Nathan Chancellor <nathan@kernel.org>,
	Calvin Owens <calvin@wbinvd.org>,
	Dave Hansen <dave.hansen@linux.intel.com>,
	x86-ML <x86@kernel.org>, LKML <linux-kernel@vger.kernel.org>
Subject: Re: 8aeb879baf12 - significant system call latency regression, bisected
Date: Fri, 19 Jun 2026 10:14:27 +0200	[thread overview]
Message-ID: <20260619081427.GD49529@noisy.programming.kicks-ass.net> (raw)
In-Reply-To: <20260617123718.GM49951@noisy.programming.kicks-ass.net>

On Wed, Jun 17, 2026 at 02:37:18PM +0200, Peter Zijlstra wrote:
>  - makes -fno-jump-tables unconditional
>  - removes array_index_nospec() from the syscall dispatch

FWIW, this also allows making all SYSCALLs __noendbr, very much
including the 'legacy' sys_call_table :-)

Compile tested with IA32_EMULATION=n and reliably yields:

vmlinux.o: warning: objtool: sys_call_table+0x0: data relocation to !ENDBR: __x64_sys_read+0x0
...
vmlinux.o: warning: objtool: sys_call_table+0xeb0: data relocation to !ENDBR: __x64_sys_listns+0x0

(which is just one little objtool patch away from being fixed)

and boots fine (in kvm).

---
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 598f178102ee..b154a2a20eb2 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -90,17 +90,8 @@ CC_FLAGS_FPU += -mhard-float
 endif
 
 ifeq ($(CONFIG_X86_KERNEL_IBT),y)
-#
-# Kernel IBT has S_CET.NOTRACK_EN=0, as such the compilers must not generate
-# NOTRACK prefixes. Current generation compilers unconditionally employ NOTRACK
-# for jump-tables, as such, disable jump-tables for now.
-#
-# (jump-tables are implicitly disabled by RETPOLINE)
-#
-#   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104816
-#
-KBUILD_CFLAGS += $(call cc-option,-fcf-protection=branch -fno-jump-tables)
-KBUILD_RUSTFLAGS += -Zcf-protection=branch $(if $(call rustc-min-version,109300),-Cjump-tables=n,-Zno-jump-tables)
+KBUILD_CFLAGS += $(call cc-option,-fcf-protection=branch)
+KBUILD_RUSTFLAGS += -Zcf-protection=branch
 else
 KBUILD_CFLAGS += $(call cc-option,-fcf-protection=none)
 endif
@@ -173,6 +164,13 @@ endif
         KBUILD_RUSTFLAGS += -Ccode-model=kernel
 
         percpu_seg := gs
+
+        # Due to retpolines and cf-protection=branch's implicit NOTRACK usage
+        # for jump-tables, blanket disable jump-tables for all x86_64 builds to
+        # get a consistent behaviour across configurations. This allows
+        # removing some array_index_nospec() usage.
+        KBUILD_CFLAGS += -fno-jump-tables
+        KBUILD_RUSTFLAGS += $(if $(call rustc-min-version,109300),-Cjump-tables=n,-Zno-jump-tables)
 endif
 
 ifeq ($(CONFIG_STACKPROTECTOR),y)
@@ -209,15 +207,6 @@ KBUILD_CFLAGS += -fno-asynchronous-unwind-tables
 ifdef CONFIG_MITIGATION_RETPOLINE
   KBUILD_CFLAGS += $(RETPOLINE_CFLAGS)
   KBUILD_RUSTFLAGS += $(RETPOLINE_RUSTFLAGS)
-  # Additionally, avoid generating expensive indirect jumps which
-  # are subject to retpolines for small number of switch cases.
-  # LLVM turns off jump table generation by default when under
-  # retpoline builds, however, gcc does not for x86. This has
-  # only been fixed starting from gcc stable version 8.4.0 and
-  # onwards, but not for older ones. See gcc bug #86952.
-  ifndef CONFIG_CC_IS_CLANG
-    KBUILD_CFLAGS += -fno-jump-tables
-  endif
 endif
 
 ifdef CONFIG_MITIGATION_SLS
diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c
index 71f032504e73..7e87947e12be 100644
--- a/arch/x86/entry/syscall_64.c
+++ b/arch/x86/entry/syscall_64.c
@@ -8,9 +8,10 @@
 #include <linux/entry-common.h>
 #include <linux/nospec.h>
 #include <asm/syscall.h>
+#include <asm/ibt.h>
 
-#define __SYSCALL(nr, sym) extern long __x64_##sym(const struct pt_regs *);
-#define __SYSCALL_NORETURN(nr, sym) extern long __noreturn __x64_##sym(const struct pt_regs *);
+#define __SYSCALL(nr, sym) extern __noendbr long __x64_##sym(const struct pt_regs *);
+#define __SYSCALL_NORETURN(nr, sym) extern __noendbr long __noreturn __x64_##sym(const struct pt_regs *);
 #include <asm/syscalls_64.h>
 #ifdef CONFIG_X86_X32_ABI
 #include <asm/syscalls_x32.h>
@@ -25,30 +26,47 @@
  * kernel/trace/trace_syscalls.c still wants to know the system
  * call address.
  */
-#define __SYSCALL(nr, sym) __x64_##sym,
+#define __SYSCALL(nr, sym) (void *)&__x64_##sym,
 const sys_call_ptr_t sys_call_table[] = {
 #include <asm/syscalls_64.h>
 };
 #undef  __SYSCALL
 
 #define __SYSCALL(nr, sym) case nr: return __x64_##sym(regs);
-long x64_sys_call(const struct pt_regs *regs, unsigned int nr)
+static noinstr long x64_sys_call(const struct pt_regs *regs, unsigned int nr)
 {
+	/*
+	 * Because -fno-jump-tables, this compiles into a binary branch tree
+	 * rather than a jump-table. As such @nr is not used as an array
+	 * index. Additionally, this is an out-of-line function on purpose,
+	 * such that all the actual syscall function calls are tail-calls,
+	 * returning to our caller for the common bits.
+	 */
+	instrumentation_begin();
 	switch (nr) {
 	#include <asm/syscalls_64.h>
 	default: return __x64_sys_ni_syscall(regs);
 	}
+	instrumentation_end();
 }
 
 #ifdef CONFIG_X86_X32_ABI
-long x32_sys_call(const struct pt_regs *regs, unsigned int nr)
+static noinstr long x32_sys_call(const struct pt_regs *regs, unsigned int nr)
 {
+	instrumentation_begin();
 	switch (nr) {
 	#include <asm/syscalls_x32.h>
 	default: return __x64_sys_ni_syscall(regs);
 	}
+	instrumentation_end();
+}
+#else
+static __always_inline long x32_sys_call(const struct pt_regs *regs, unsigned int nr)
+{
+	return __x64_sys_ni_syscall(regs);
 }
 #endif
+#undef  __SYSCALL
 
 static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr)
 {
@@ -59,7 +77,6 @@ static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr)
 	unsigned int unr = nr;
 
 	if (likely(unr < NR_syscalls)) {
-		unr = array_index_nospec(unr, NR_syscalls);
 		regs->ax = x64_sys_call(regs, unr);
 		return true;
 	}
@@ -76,7 +93,6 @@ static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr)
 	unsigned int xnr = nr - __X32_SYSCALL_BIT;
 
 	if (IS_ENABLED(CONFIG_X86_X32_ABI) && likely(xnr < X32_NR_syscalls)) {
-		xnr = array_index_nospec(xnr, X32_NR_syscalls);
 		regs->ax = x32_sys_call(regs, xnr);
 		return true;
 	}
@@ -84,7 +100,7 @@ static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr)
 }
 
 /* Returns true to return using SYSRET, or false to use IRET */
-__visible noinstr bool do_syscall_64(struct pt_regs *regs, int nr)
+__visible noinstr bool do_syscall_64(struct pt_regs *regs, int nr)
 {
 	nr = syscall_enter_from_user_mode(regs, nr);
 
diff --git a/arch/x86/include/asm/syscall_wrapper.h b/arch/x86/include/asm/syscall_wrapper.h
index 7e88705e907f..1639fbc02680 100644
--- a/arch/x86/include/asm/syscall_wrapper.h
+++ b/arch/x86/include/asm/syscall_wrapper.h
@@ -7,9 +7,10 @@
 #define _ASM_X86_SYSCALL_WRAPPER_H
 
 #include <asm/ptrace.h>
+#include <asm/ibt.h>
 
-extern long __x64_sys_ni_syscall(const struct pt_regs *regs);
-extern long __ia32_sys_ni_syscall(const struct pt_regs *regs);
+extern __noendbr long __x64_sys_ni_syscall(const struct pt_regs *regs);
+extern __noendbr long __ia32_sys_ni_syscall(const struct pt_regs *regs);
 
 /*
  * Instead of the generic __SYSCALL_DEFINEx() definition, the x86 version takes
@@ -83,15 +84,15 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs);
 			__MAP(x, __SC_TYPE, __VA_ARGS__))		\
 
 #define __SYS_STUB0(abi, name)						\
-	long __##abi##_##name(const struct pt_regs *regs);		\
+	long __noendbr __##abi##_##name(const struct pt_regs *regs);		\
 	ALLOW_ERROR_INJECTION(__##abi##_##name, ERRNO);			\
-	long __##abi##_##name(const struct pt_regs *regs)		\
+	long __noendbr __##abi##_##name(const struct pt_regs *regs)		\
 		__alias(__do_##name);
 
 #define __SYS_STUBx(abi, name, ...)					\
-	long __##abi##_##name(const struct pt_regs *regs);		\
+	long __noendbr __##abi##_##name(const struct pt_regs *regs);		\
 	ALLOW_ERROR_INJECTION(__##abi##_##name, ERRNO);			\
-	long __##abi##_##name(const struct pt_regs *regs)		\
+	long __noendbr __##abi##_##name(const struct pt_regs *regs)		\
 	{								\
 		return __se_##name(__VA_ARGS__);			\
 	}
@@ -257,8 +258,8 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs);
  * For VSYSCALLS, we need to declare these three syscalls with the new
  * pt_regs-based calling convention for in-kernel use.
  */
-long __x64_sys_getcpu(const struct pt_regs *regs);
-long __x64_sys_gettimeofday(const struct pt_regs *regs);
-long __x64_sys_time(const struct pt_regs *regs);
+long __noendbr __x64_sys_getcpu(const struct pt_regs *regs);
+long __noendbr __x64_sys_gettimeofday(const struct pt_regs *regs);
+long __noendbr __x64_sys_time(const struct pt_regs *regs);
 
 #endif /* _ASM_X86_SYSCALL_WRAPPER_H */

  parent reply	other threads:[~2026-06-19  8:14 UTC|newest]

Thread overview: 40+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-06-13  1:45 8aeb879baf12 - significant system call latency regression, bisected "H. Peter Anvin" (Intel)
2026-06-13  8:59 ` Peter Zijlstra
2026-06-13 20:34   ` H. Peter Anvin
2026-06-13 23:52     ` H. Peter Anvin
2026-06-14  1:50       ` H. Peter Anvin
2026-06-14 18:08         ` Xin Li
2026-06-14 18:31           ` H. Peter Anvin
2026-06-15  0:19         ` H. Peter Anvin
2026-06-15  2:07           ` H. Peter Anvin
2026-06-15  3:41             ` Linus Torvalds
2026-06-15 18:30               ` H. Peter Anvin
2026-06-16  7:12                 ` Peter Zijlstra
2026-06-16  7:38             ` Peter Zijlstra
2026-06-16  7:53             ` Peter Zijlstra
2026-06-18 23:05               ` H. Peter Anvin
2026-06-19  7:50                 ` Peter Zijlstra
2026-06-19 10:22                   ` H. Peter Anvin
2026-06-16  8:28         ` Peter Zijlstra
2026-06-16  8:46           ` Linus Torvalds
2026-06-16  9:51             ` Ingo Molnar
2026-06-16 17:44               ` H. Peter Anvin
2026-06-17  9:54                 ` Ingo Molnar
2026-06-17 10:05                   ` Ingo Molnar
2026-06-17 12:37             ` Peter Zijlstra
2026-06-18 22:40               ` H. Peter Anvin
2026-06-19  1:11                 ` H. Peter Anvin
2026-06-19  2:08                   ` Linus Torvalds
2026-06-19  2:11                     ` Linus Torvalds
2026-06-19  4:32                       ` H. Peter Anvin
2026-06-19  7:35                         ` Peter Zijlstra
2026-06-19  2:11                     ` H. Peter Anvin
2026-06-19  7:31                 ` Peter Zijlstra
2026-06-19  8:14               ` Peter Zijlstra [this message]
2026-06-19 10:23                 ` H. Peter Anvin
2026-06-19 11:18                   ` Peter Zijlstra
2026-06-19 21:53                     ` syscall path improvements (was: syscall performance regression, debunked) H. Peter Anvin
2026-06-16 13:53           ` 8aeb879baf12 - significant system call latency regression, bisected David Laight
2026-06-18 23:03             ` H. Peter Anvin
2026-06-14  2:11       ` Calvin Owens
2026-06-14  2:14         ` Calvin Owens

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260619081427.GD49529@noisy.programming.kicks-ass.net \
    --to=peterz@infradead.org \
    --cc=bp@alien8.de \
    --cc=calvin@wbinvd.org \
    --cc=dave.hansen@linux.intel.com \
    --cc=hpa@zytor.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@redhat.com \
    --cc=nathan@kernel.org \
    --cc=tglx@kernel.org \
    --cc=torvalds@linux-foundation.org \
    --cc=x86@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox