All of lore.kernel.org
 help / color / mirror / Atom feed
From: Peter Zijlstra <peterz@infradead.org>
To: Linus Torvalds <torvalds@linux-foundation.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>,
	tglx@kernel.org, mingo@redhat.com, bp@alien8.de,
	Nathan Chancellor <nathan@kernel.org>,
	Calvin Owens <calvin@wbinvd.org>,
	Dave Hansen <dave.hansen@linux.intel.com>,
	x86-ML <x86@kernel.org>, LKML <linux-kernel@vger.kernel.org>
Subject: Re: 8aeb879baf12 - significant system call latency regression, bisected
Date: Wed, 17 Jun 2026 14:37:18 +0200	[thread overview]
Message-ID: <20260617123718.GM49951@noisy.programming.kicks-ass.net> (raw)
In-Reply-To: <CAHk-=wi2tNnbZ+w8kr1LHKJNFdQTXyA4wbhzd4cSi-W5cDpuhA@mail.gmail.com>

On Tue, Jun 16, 2026 at 02:16:41PM +0530, Linus Torvalds wrote:

> So 64-byte alignment would always be the best option if you only look
> at a *particular* piece of code.
> 
> But it obviously is very wasteful and hurts when there is code around
> it that could be loaded into the cache at the same time.
> 
> So almost certainly not a good idea in general.
> 
> But 64-byte alignment is probably what things like interrupt and
> system call entrypoints should use, because those things would make
> sense to look at as isolated things, not part of a bigger load". And
> they are quite likely to start from a fairly cold-cache situation.
> 
> So *not* some general compiler option in a config file, but maybe a
> special "entry point alignment" macro?

This builds with kcfi on and seems to do more or less do what is expected.

I've not actually tried performance measurements on my IDT based system.

Obviously this would want splitting into a few patches, but it does:

 - makes -fno-jump-tables unconditional
 - removes array_index_nospec() from the syscall dispatch
 - makes x{32,64}_sys_call() 'static noinstr'
 - adds align_entry attribute that aligns on cacheline boundaries
   and disallows taking address
 - sprinkles align_entry on the noinstr syscall path

---
 arch/x86/Makefile              | 29 +++++++++--------------------
 arch/x86/entry/entry_fred.c    | 12 ++++++------
 arch/x86/entry/syscall_64.c    | 26 +++++++++++++++++++++-----
 arch/x86/include/asm/fred.h    |  5 +++--
 arch/x86/include/asm/syscall.h | 23 ++++++++++++++++++++---
 5 files changed, 59 insertions(+), 36 deletions(-)

diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 598f178102ee..b154a2a20eb2 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -90,17 +90,8 @@ CC_FLAGS_FPU += -mhard-float
 endif
 
 ifeq ($(CONFIG_X86_KERNEL_IBT),y)
-#
-# Kernel IBT has S_CET.NOTRACK_EN=0, as such the compilers must not generate
-# NOTRACK prefixes. Current generation compilers unconditionally employ NOTRACK
-# for jump-tables, as such, disable jump-tables for now.
-#
-# (jump-tables are implicitly disabled by RETPOLINE)
-#
-#   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104816
-#
-KBUILD_CFLAGS += $(call cc-option,-fcf-protection=branch -fno-jump-tables)
-KBUILD_RUSTFLAGS += -Zcf-protection=branch $(if $(call rustc-min-version,109300),-Cjump-tables=n,-Zno-jump-tables)
+KBUILD_CFLAGS += $(call cc-option,-fcf-protection=branch)
+KBUILD_RUSTFLAGS += -Zcf-protection=branch
 else
 KBUILD_CFLAGS += $(call cc-option,-fcf-protection=none)
 endif
@@ -173,6 +164,13 @@ endif
         KBUILD_RUSTFLAGS += -Ccode-model=kernel
 
         percpu_seg := gs
+
+	# Due to retpolines and cf-protection=branch's implicit NOTRACK usage
+	# for jump-tables, blanked disable jump-tables for all x86_64 builds to
+	# get a consistent behaviour across configurations. This allows
+	# removing some array_index_nospec() usage.
+	KBUILD_CFLAGS += -fno-jump-tables
+	KBUILD_RISTFLAGS += $(if $(call rustc-min-version,109300),-Cjump-tables=n,-Zno-jump-tables)
 endif
 
 ifeq ($(CONFIG_STACKPROTECTOR),y)
@@ -209,15 +207,6 @@ KBUILD_CFLAGS += -fno-asynchronous-unwind-tables
 ifdef CONFIG_MITIGATION_RETPOLINE
   KBUILD_CFLAGS += $(RETPOLINE_CFLAGS)
   KBUILD_RUSTFLAGS += $(RETPOLINE_RUSTFLAGS)
-  # Additionally, avoid generating expensive indirect jumps which
-  # are subject to retpolines for small number of switch cases.
-  # LLVM turns off jump table generation by default when under
-  # retpoline builds, however, gcc does not for x86. This has
-  # only been fixed starting from gcc stable version 8.4.0 and
-  # onwards, but not for older ones. See gcc bug #86952.
-  ifndef CONFIG_CC_IS_CLANG
-    KBUILD_CFLAGS += -fno-jump-tables
-  endif
 endif
 
 ifdef CONFIG_MITIGATION_SLS
diff --git a/arch/x86/entry/entry_fred.c b/arch/x86/entry/entry_fred.c
index fb3594ddf731..740fdf9bb08a 100644
--- a/arch/x86/entry/entry_fred.c
+++ b/arch/x86/entry/entry_fred.c
@@ -51,7 +51,7 @@ static noinstr void fred_bad_type(struct pt_regs *regs, unsigned long error_code
 	irqentry_nmi_exit(regs, irq_state);
 }
 
-static noinstr void fred_intx(struct pt_regs *regs)
+static noinstr align_entry void fred_intx(struct pt_regs *regs)
 {
 	switch (regs->fred_ss.vector) {
 	/* Opcode 0xcd, 0x3, NOT INT3 (opcode 0xcc) */
@@ -157,7 +157,7 @@ void __init fred_complete_exception_setup(void)
 	fred_setup_done = true;
 }
 
-static noinstr void fred_extint(struct pt_regs *regs)
+static noinstr align_entry void fred_extint(struct pt_regs *regs)
 {
 	unsigned int vector = regs->fred_ss.vector;
 
@@ -177,7 +177,7 @@ static noinstr void fred_extint(struct pt_regs *regs)
 	}
 }
 
-static noinstr void fred_hwexc(struct pt_regs *regs, unsigned long error_code)
+static noinstr align_entry void fred_hwexc(struct pt_regs *regs, unsigned long error_code)
 {
 	/* Optimize for #PF. That's the only exception which matters performance wise */
 	if (likely(regs->fred_ss.vector == X86_TRAP_PF))
@@ -216,7 +216,7 @@ static noinstr void fred_hwexc(struct pt_regs *regs, unsigned long error_code)
 
 }
 
-static noinstr void fred_swexc(struct pt_regs *regs, unsigned long error_code)
+static noinstr align_entry void fred_swexc(struct pt_regs *regs, unsigned long error_code)
 {
 	switch (regs->fred_ss.vector) {
 	case X86_TRAP_BP: return exc_int3(regs);
@@ -225,7 +225,7 @@ static noinstr void fred_swexc(struct pt_regs *regs, unsigned long error_code)
 	}
 }
 
-__visible noinstr void fred_entry_from_user(struct pt_regs *regs)
+__visible noinstr align_entry void fred_entry_from_user(struct pt_regs *regs)
 {
 	unsigned long error_code = regs->orig_ax;
 
@@ -257,7 +257,7 @@ __visible noinstr void fred_entry_from_user(struct pt_regs *regs)
 	return fred_bad_type(regs, error_code);
 }
 
-__visible noinstr void fred_entry_from_kernel(struct pt_regs *regs)
+__visible noinstr align_entry void fred_entry_from_kernel(struct pt_regs *regs)
 {
 	unsigned long error_code = regs->orig_ax;
 
diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c
index 71f032504e73..10654c12dd36 100644
--- a/arch/x86/entry/syscall_64.c
+++ b/arch/x86/entry/syscall_64.c
@@ -8,6 +8,7 @@
 #include <linux/entry-common.h>
 #include <linux/nospec.h>
 #include <asm/syscall.h>
+#include <asm/ibt.h>
 
 #define __SYSCALL(nr, sym) extern long __x64_##sym(const struct pt_regs *);
 #define __SYSCALL_NORETURN(nr, sym) extern long __noreturn __x64_##sym(const struct pt_regs *);
@@ -32,23 +33,40 @@ const sys_call_ptr_t sys_call_table[] = {
 #undef  __SYSCALL
 
 #define __SYSCALL(nr, sym) case nr: return __x64_##sym(regs);
-long x64_sys_call(const struct pt_regs *regs, unsigned int nr)
+static noinstr align_entry long x64_sys_call(const struct pt_regs *regs, unsigned int nr)
 {
+	/*
+	 * Because -fno-jump-tables, this compiles into a binary branch tree
+	 * rather than a jump-table. As such @nr is not used as an array
+	 * index. Additionally, this is an out-of-line function on purpose,
+	 * such that all the actual syscall function calls are tail-calls,
+	 * returning to our caller for the common bits.
+	 */
+	instrumentation_begin();
 	switch (nr) {
 	#include <asm/syscalls_64.h>
 	default: return __x64_sys_ni_syscall(regs);
 	}
+	instrumentation_end();
 }
 
 #ifdef CONFIG_X86_X32_ABI
-long x32_sys_call(const struct pt_regs *regs, unsigned int nr)
+static noinstr align_entry long x32_sys_call(const struct pt_regs *regs, unsigned int nr)
 {
+	instrumentation_begin();
 	switch (nr) {
 	#include <asm/syscalls_x32.h>
 	default: return __x64_sys_ni_syscall(regs);
 	}
+	instrumentation_end();
+}
+#else
+static __always_inline long x32_sys_call(const struct pt_regs *regs, unsigned int nr)
+{
+	return __x64_sys_ni_syscall(regs);
 }
 #endif
+#undef  __SYSCALL
 
 static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr)
 {
@@ -59,7 +77,6 @@ static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr)
 	unsigned int unr = nr;
 
 	if (likely(unr < NR_syscalls)) {
-		unr = array_index_nospec(unr, NR_syscalls);
 		regs->ax = x64_sys_call(regs, unr);
 		return true;
 	}
@@ -76,7 +93,6 @@ static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr)
 	unsigned int xnr = nr - __X32_SYSCALL_BIT;
 
 	if (IS_ENABLED(CONFIG_X86_X32_ABI) && likely(xnr < X32_NR_syscalls)) {
-		xnr = array_index_nospec(xnr, X32_NR_syscalls);
 		regs->ax = x32_sys_call(regs, xnr);
 		return true;
 	}
@@ -84,7 +100,7 @@ static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr)
 }
 
 /* Returns true to return using SYSRET, or false to use IRET */
-__visible noinstr bool do_syscall_64(struct pt_regs *regs, int nr)
+__visible noinstr align_entry bool do_syscall_64(struct pt_regs *regs, int nr)
 {
 	nr = syscall_enter_from_user_mode(regs, nr);
 
diff --git a/arch/x86/include/asm/fred.h b/arch/x86/include/asm/fred.h
index 18a2f811c358..10b8d73e4088 100644
--- a/arch/x86/include/asm/fred.h
+++ b/arch/x86/include/asm/fred.h
@@ -11,6 +11,7 @@
 #include <asm/asm.h>
 #include <asm/msr.h>
 #include <asm/trapnr.h>
+#include <asm/syscall.h>
 
 /*
  * FRED event return instruction opcodes for ERET{S,U}; supported in
@@ -67,8 +68,8 @@ void asm_fred_entrypoint_user(void);
 void asm_fred_entrypoint_kernel(void);
 void asm_fred_entry_from_kvm(struct fred_ss);
 
-__visible void fred_entry_from_user(struct pt_regs *regs);
-__visible void fred_entry_from_kernel(struct pt_regs *regs);
+__visible align_entry void fred_entry_from_user(struct pt_regs *regs);
+__visible align_entry void fred_entry_from_kernel(struct pt_regs *regs);
 __visible void __fred_entry_from_kvm(struct pt_regs *regs);
 
 /* Can be called from noinstr code, thus __always_inline */
diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h
index c10dbb74cd00..624e7d6f30a3 100644
--- a/arch/x86/include/asm/syscall.h
+++ b/arch/x86/include/asm/syscall.h
@@ -20,13 +20,30 @@
 typedef long (*sys_call_ptr_t)(const struct pt_regs *);
 extern const sys_call_ptr_t sys_call_table[];
 
+/*
+ * When changing patchable_function_entry for a function, the kCFI ABI is
+ * affected, therefore combine this with __noendbr, which disallows indirect
+ * calls and generates compiler warnings when the address is taken of such a
+ * function.
+ *
+ * This will effectively waste a full cacheline per align_entry user.
+ */
+#ifdef CONFIG_CALL_PADDING
+#define __pfe(x)	__attribute__((patchable_function_entry(x,x))) __noendbr
+#else
+#define __pfe(x)	__noendbr
+#endif
+
+#define __align_entry(x) __aligned(x) \
+	__pfe(x-CONFIG_FUNCTION_ALIGNMENT+CONFIG_FUNCTION_PADDING_BYTES)
+
+#define align_entry	__align_entry(SMP_CACHE_BYTES)
+
 /*
  * These may not exist, but still put the prototypes in so we
  * can use IS_ENABLED().
  */
 extern long ia32_sys_call(const struct pt_regs *, unsigned int nr);
-extern long x32_sys_call(const struct pt_regs *, unsigned int nr);
-extern long x64_sys_call(const struct pt_regs *, unsigned int nr);
 
 /*
  * Only the low 32 bits of orig_ax are meaningful, so we return int.
@@ -172,7 +189,7 @@ static inline int syscall_get_arch(struct task_struct *task)
 		? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64;
 }
 
-bool do_syscall_64(struct pt_regs *regs, int nr);
+align_entry bool do_syscall_64(struct pt_regs *regs, int nr);
 void do_int80_emulation(struct pt_regs *regs);
 
 #endif	/* CONFIG_X86_32 */

  parent reply	other threads:[~2026-06-17 12:37 UTC|newest]

Thread overview: 24+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-06-13  1:45 8aeb879baf12 - significant system call latency regression, bisected "H. Peter Anvin" (Intel)
2026-06-13  8:59 ` Peter Zijlstra
2026-06-13 20:34   ` H. Peter Anvin
2026-06-13 23:52     ` H. Peter Anvin
2026-06-14  1:50       ` H. Peter Anvin
2026-06-14 18:08         ` Xin Li
2026-06-14 18:31           ` H. Peter Anvin
2026-06-15  0:19         ` H. Peter Anvin
2026-06-15  2:07           ` H. Peter Anvin
2026-06-15  3:41             ` Linus Torvalds
2026-06-15 18:30               ` H. Peter Anvin
2026-06-16  7:12                 ` Peter Zijlstra
2026-06-16  7:38             ` Peter Zijlstra
2026-06-16  7:53             ` Peter Zijlstra
2026-06-16  8:28         ` Peter Zijlstra
2026-06-16  8:46           ` Linus Torvalds
2026-06-16  9:51             ` Ingo Molnar
2026-06-16 17:44               ` H. Peter Anvin
2026-06-17  9:54                 ` Ingo Molnar
2026-06-17 10:05                   ` Ingo Molnar
2026-06-17 12:37             ` Peter Zijlstra [this message]
2026-06-16 13:53           ` David Laight
2026-06-14  2:11       ` Calvin Owens
2026-06-14  2:14         ` Calvin Owens

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260617123718.GM49951@noisy.programming.kicks-ass.net \
    --to=peterz@infradead.org \
    --cc=bp@alien8.de \
    --cc=calvin@wbinvd.org \
    --cc=dave.hansen@linux.intel.com \
    --cc=hpa@zytor.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@redhat.com \
    --cc=nathan@kernel.org \
    --cc=tglx@kernel.org \
    --cc=torvalds@linux-foundation.org \
    --cc=x86@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.