public inbox for linux-arm-kernel@lists.infradead.org
 help / color / mirror / Atom feed
* [PATCH v7] arm64: implement support for static call trampolines
@ 2026-03-13  6:18 Carlos Llamas
  2026-03-13  8:52 ` Peter Zijlstra
                   ` (2 more replies)
  0 siblings, 3 replies; 8+ messages in thread
From: Carlos Llamas @ 2026-03-13  6:18 UTC (permalink / raw)
  To: linux-arm-kernel
  Cc: Sami Tolvanen, Catalin Marinas, Will Deacon, Peter Zijlstra,
	Josh Poimboeuf, Ard Biesheuvel, Mark Rutland, Kees Cook,
	Quentin Perret, Steven Rostedt, Will McVicker,
	Sean Christopherson, kernel-team, linux-kernel, Carlos Llamas

From: Ard Biesheuvel <ardb@kernel.org>

Implement arm64 support for the 'unoptimized' static call variety, which
routes all calls through a single trampoline that is patched to perform a
tail call to the selected function.

Since static call targets may be located in modules loaded out of direct
branching range, we need to use an ADRP/ADD pair to load the branch target
into R16 and use a branch-to-register (BR) instruction to perform an
indirect call. Unlike on x86, there is no pressing need on arm64 to avoid
indirect calls at all cost, but hiding it from the compiler as is done
here does have some benefits:
- the literal is located in .rodata, which gives us the same robustness
  advantage that code patching does;
- no performance hit on CFI enabled Clang builds that decorate compiler
  emitted indirect calls with branch target validity checks.

Cc: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Carlos Llamas <cmllamas@google.com>
---
v7:
  - Took Ard's v3 patch (as it leaves the code patching logic out) and
    rebased it on top of mainline 7.0-rc3.
  - Dropped the changes to arch/arm64/lib/insn.c and instead switched to
    the (now) existing aarch64_insn_write_literal_u64().
  - Added the RET0 trampoline define which points to the generic stub
    __static_call_return0.
  - Made the HAVE_STATIC_CALL conditional on CFI as suggested by Ard.
  - Added .type and .size sections to the trampoline definition to
    support ABI tools.

 arch/arm64/Kconfig                   |  1 +
 arch/arm64/include/asm/static_call.h | 33 ++++++++++++++++++++++++++++
 arch/arm64/kernel/Makefile           |  1 +
 arch/arm64/kernel/static_call.c      | 20 +++++++++++++++++
 arch/arm64/kernel/vmlinux.lds.S      |  1 +
 5 files changed, 56 insertions(+)
 create mode 100644 arch/arm64/include/asm/static_call.h
 create mode 100644 arch/arm64/kernel/static_call.c

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 38dba5f7e4d2..9ea19b74b6c3 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -252,6 +252,7 @@ config ARM64
 	select HAVE_RSEQ
 	select HAVE_RUST if RUSTC_SUPPORTS_ARM64
 	select HAVE_STACKPROTECTOR
+	select HAVE_STATIC_CALL if CFI
 	select HAVE_SYSCALL_TRACEPOINTS
 	select HAVE_KPROBES
 	select HAVE_KRETPROBES
diff --git a/arch/arm64/include/asm/static_call.h b/arch/arm64/include/asm/static_call.h
new file mode 100644
index 000000000000..331580542fd4
--- /dev/null
+++ b/arch/arm64/include/asm/static_call.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_STATIC_CALL_H
+#define _ASM_STATIC_CALL_H
+
+#define __ARCH_DEFINE_STATIC_CALL_TRAMP(name, target)			    \
+	asm("	.pushsection .static_call.text, \"ax\"			\n" \
+	    "	.align	3						\n" \
+	    "	.globl	" STATIC_CALL_TRAMP_STR(name) "			\n" \
+	    STATIC_CALL_TRAMP_STR(name) ":				\n" \
+	    "	hint	34	/* BTI C */				\n" \
+	    "	adrp	x16, 1f						\n" \
+	    "	ldr	x16, [x16, :lo12:1f]				\n" \
+	    "	cbz	x16, 0f						\n" \
+	    "	br	x16						\n" \
+	    "0:	ret							\n" \
+	    "	.type	" STATIC_CALL_TRAMP_STR(name) ", %function	\n" \
+	    "	.size	" STATIC_CALL_TRAMP_STR(name) ", . - " STATIC_CALL_TRAMP_STR(name) " \n" \
+	    "	.popsection						\n" \
+	    "	.pushsection .rodata, \"a\"				\n" \
+	    "	.align	3						\n" \
+	    "1:	.quad	" target "					\n" \
+	    "	.popsection						\n")
+
+#define ARCH_DEFINE_STATIC_CALL_TRAMP(name, func)			\
+	__ARCH_DEFINE_STATIC_CALL_TRAMP(name, #func)
+
+#define ARCH_DEFINE_STATIC_CALL_NULL_TRAMP(name)			\
+	__ARCH_DEFINE_STATIC_CALL_TRAMP(name, "0x0")
+
+#define ARCH_DEFINE_STATIC_CALL_RET0_TRAMP(name)			\
+	ARCH_DEFINE_STATIC_CALL_TRAMP(name, __static_call_return0)
+
+#endif /* _ASM_STATIC_CALL_H */
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
index 76f32e424065..fe627100d199 100644
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -46,6 +46,7 @@ obj-$(CONFIG_MODULES)			+= module.o module-plts.o
 obj-$(CONFIG_PERF_EVENTS)		+= perf_regs.o perf_callchain.o
 obj-$(CONFIG_HARDLOCKUP_DETECTOR_PERF)	+= watchdog_hld.o
 obj-$(CONFIG_HAVE_HW_BREAKPOINT)	+= hw_breakpoint.o
+obj-$(CONFIG_HAVE_STATIC_CALL)		+= static_call.o
 obj-$(CONFIG_CPU_PM)			+= sleep.o suspend.o
 obj-$(CONFIG_KGDB)			+= kgdb.o
 obj-$(CONFIG_EFI)			+= efi.o efi-rt-wrapper.o
diff --git a/arch/arm64/kernel/static_call.c b/arch/arm64/kernel/static_call.c
new file mode 100644
index 000000000000..944ecabb821f
--- /dev/null
+++ b/arch/arm64/kernel/static_call.c
@@ -0,0 +1,20 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/static_call.h>
+#include <linux/memory.h>
+#include <asm/text-patching.h>
+
+void arch_static_call_transform(void *site, void *tramp, void *func, bool tail)
+{
+	u64 literal;
+	int ret;
+
+	/* decode the instructions to discover the literal address */
+	literal = ALIGN_DOWN((u64)tramp + 4, SZ_4K) +
+		  aarch64_insn_adrp_get_offset(le32_to_cpup(tramp + 4)) +
+		  8 * aarch64_insn_decode_immediate(AARCH64_INSN_IMM_12,
+						    le32_to_cpup(tramp + 8));
+
+	ret = aarch64_insn_write_literal_u64((void *)literal, (u64)func);
+	WARN_ON_ONCE(ret);
+}
+EXPORT_SYMBOL_GPL(arch_static_call_transform);
diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S
index 2964aad0362e..2d1e75263f03 100644
--- a/arch/arm64/kernel/vmlinux.lds.S
+++ b/arch/arm64/kernel/vmlinux.lds.S
@@ -191,6 +191,7 @@ SECTIONS
 			LOCK_TEXT
 			KPROBES_TEXT
 			HYPERVISOR_TEXT
+			STATIC_CALL_TEXT
 			*(.gnu.warning)
 	}
 
-- 
2.53.0.880.g73c4285caa-goog



^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH v7] arm64: implement support for static call trampolines
  2026-03-13  6:18 [PATCH v7] arm64: implement support for static call trampolines Carlos Llamas
@ 2026-03-13  8:52 ` Peter Zijlstra
  2026-03-13 16:48 ` Sami Tolvanen
  2026-03-17 10:59 ` Ard Biesheuvel
  2 siblings, 0 replies; 8+ messages in thread
From: Peter Zijlstra @ 2026-03-13  8:52 UTC (permalink / raw)
  To: Carlos Llamas
  Cc: linux-arm-kernel, Sami Tolvanen, Catalin Marinas, Will Deacon,
	Josh Poimboeuf, Ard Biesheuvel, Mark Rutland, Kees Cook,
	Quentin Perret, Steven Rostedt, Will McVicker,
	Sean Christopherson, kernel-team, linux-kernel

On Fri, Mar 13, 2026 at 06:18:52AM +0000, Carlos Llamas wrote:
> From: Ard Biesheuvel <ardb@kernel.org>
> 
> Implement arm64 support for the 'unoptimized' static call variety, which
> routes all calls through a single trampoline that is patched to perform a
> tail call to the selected function.
> 
> Since static call targets may be located in modules loaded out of direct
> branching range, we need to use a ADRP/ADD pair to load the branch target
> into R16 and use a branch-to-register (BR) instruction to perform an
> indirect call. Unlike on x86, there is no pressing need on arm64 to avoid
> indirect calls at all cost, but hiding it from the compiler as is done
> here does have some benefits:
> - the literal is located in .rodata, which gives us the same robustness
>   advantage that code patching does;
> - no performance hit on CFI enabled Clang builds that decorate compiler
>   emitted indirect calls with branch target validity checks.
> 
> Cc: Peter Zijlstra (Intel) <peterz@infradead.org>
> Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
> Signed-off-by: Carlos Llamas <cmllamas@google.com>

Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v7] arm64: implement support for static call trampolines
  2026-03-13  6:18 [PATCH v7] arm64: implement support for static call trampolines Carlos Llamas
  2026-03-13  8:52 ` Peter Zijlstra
@ 2026-03-13 16:48 ` Sami Tolvanen
  2026-03-13 17:15   ` Carlos Llamas
  2026-03-17 10:59 ` Ard Biesheuvel
  2 siblings, 1 reply; 8+ messages in thread
From: Sami Tolvanen @ 2026-03-13 16:48 UTC (permalink / raw)
  To: Carlos Llamas
  Cc: linux-arm-kernel, Catalin Marinas, Will Deacon, Peter Zijlstra,
	Josh Poimboeuf, Ard Biesheuvel, Mark Rutland, Kees Cook,
	Quentin Perret, Steven Rostedt, Will McVicker,
	Sean Christopherson, kernel-team, linux-kernel

On Fri, Mar 13, 2026 at 06:18:52AM +0000, Carlos Llamas wrote:
> From: Ard Biesheuvel <ardb@kernel.org>
> 
> Implement arm64 support for the 'unoptimized' static call variety, which
> routes all calls through a single trampoline that is patched to perform a
> tail call to the selected function.
> 
> Since static call targets may be located in modules loaded out of direct
> branching range, we need to use a ADRP/ADD pair to load the branch target
> into R16 and use a branch-to-register (BR) instruction to perform an
> indirect call. Unlike on x86, there is no pressing need on arm64 to avoid
> indirect calls at all cost, but hiding it from the compiler as is done
> here does have some benefits:
> - the literal is located in .rodata, which gives us the same robustness
>   advantage that code patching does;
> - no performance hit on CFI enabled Clang builds that decorate compiler
>   emitted indirect calls with branch target validity checks.
> 
> Cc: Peter Zijlstra (Intel) <peterz@infradead.org>
> Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
> Signed-off-by: Carlos Llamas <cmllamas@google.com>

Does this need a Co-developed-by tag as well?

> ---
> v7:
>   - Took Ard's v3 patch (as it leaves the code patching logic out) and
>     rebased it on top  of mainline 7.0-rc3.
>   - Dropped the changes to arch/arm64/lib/insn.c and instead switched to
>     the (now) existing aarch64_insn_write_literal_u64().
>   - Added the RET0 trampoline define which points to the generic stub
>     __static_call_return0.
>   - Made the HAVE_STATIC_CALL conditional on CFI as suggested by Ard.
>   - Added .type and .size sections to the trampoline definition to
>     support ABI tools.
> 
>  arch/arm64/Kconfig                   |  1 +
>  arch/arm64/include/asm/static_call.h | 33 ++++++++++++++++++++++++++++
>  arch/arm64/kernel/Makefile           |  1 +
>  arch/arm64/kernel/static_call.c      | 20 +++++++++++++++++
>  arch/arm64/kernel/vmlinux.lds.S      |  1 +
>  5 files changed, 56 insertions(+)
>  create mode 100644 arch/arm64/include/asm/static_call.h
>  create mode 100644 arch/arm64/kernel/static_call.c
> 
> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> index 38dba5f7e4d2..9ea19b74b6c3 100644
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -252,6 +252,7 @@ config ARM64
>  	select HAVE_RSEQ
>  	select HAVE_RUST if RUSTC_SUPPORTS_ARM64
>  	select HAVE_STACKPROTECTOR
> +	select HAVE_STATIC_CALL if CFI
>  	select HAVE_SYSCALL_TRACEPOINTS
>  	select HAVE_KPROBES
>  	select HAVE_KRETPROBES
> diff --git a/arch/arm64/include/asm/static_call.h b/arch/arm64/include/asm/static_call.h
> new file mode 100644
> index 000000000000..331580542fd4
> --- /dev/null
> +++ b/arch/arm64/include/asm/static_call.h
> @@ -0,0 +1,33 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _ASM_STATIC_CALL_H
> +#define _ASM_STATIC_CALL_H
> +
> +#define __ARCH_DEFINE_STATIC_CALL_TRAMP(name, target)			    \
> +	asm("	.pushsection .static_call.text, \"ax\"			\n" \
> +	    "	.align	3						\n" \
> +	    "	.globl	" STATIC_CALL_TRAMP_STR(name) "			\n" \
> +	    STATIC_CALL_TRAMP_STR(name) ":				\n" \
> +	    "	hint	34	/* BTI C */				\n" \

It doesn't really matter either way, but do we still support toolchains
that don't understand "bti c"?

Otherwise looks good to me, and definitely a much better way to solve
this issue:

Reviewed-by: Sami Tolvanen <samitolvanen@google.com>

Sami


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v7] arm64: implement support for static call trampolines
  2026-03-13 16:48 ` Sami Tolvanen
@ 2026-03-13 17:15   ` Carlos Llamas
  0 siblings, 0 replies; 8+ messages in thread
From: Carlos Llamas @ 2026-03-13 17:15 UTC (permalink / raw)
  To: Sami Tolvanen
  Cc: linux-arm-kernel, Catalin Marinas, Will Deacon, Peter Zijlstra,
	Josh Poimboeuf, Ard Biesheuvel, Mark Rutland, Kees Cook,
	Quentin Perret, Steven Rostedt, Will McVicker,
	Sean Christopherson, kernel-team, linux-kernel

On Fri, Mar 13, 2026 at 04:48:56PM +0000, Sami Tolvanen wrote:
> On Fri, Mar 13, 2026 at 06:18:52AM +0000, Carlos Llamas wrote:
> > From: Ard Biesheuvel <ardb@kernel.org>
> > 
> > Implement arm64 support for the 'unoptimized' static call variety, which
> > routes all calls through a single trampoline that is patched to perform a
> > tail call to the selected function.
> > 
> > Since static call targets may be located in modules loaded out of direct
> > branching range, we need to use a ADRP/ADD pair to load the branch target
> > into R16 and use a branch-to-register (BR) instruction to perform an
> > indirect call. Unlike on x86, there is no pressing need on arm64 to avoid
> > indirect calls at all cost, but hiding it from the compiler as is done
> > here does have some benefits:
> > - the literal is located in .rodata, which gives us the same robustness
> >   advantage that code patching does;
> > - no performance hit on CFI enabled Clang builds that decorate compiler
> >   emitted indirect calls with branch target validity checks.
> > 
> > Cc: Peter Zijlstra (Intel) <peterz@infradead.org>
> > Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
> > Signed-off-by: Carlos Llamas <cmllamas@google.com>
> 
> Does this need a Co-developed-by tag as well?

The bulk of modifications came from the rebase itself and suggestions
from Ard. I skipped the Co-developed-by tag for this reason.

> 
> > ---
> > v7:
> >   - Took Ard's v3 patch (as it leaves the code patching logic out) and
> >     rebased it on top  of mainline 7.0-rc3.
> >   - Dropped the changes to arch/arm64/lib/insn.c and instead switched to
> >     the (now) existing aarch64_insn_write_literal_u64().
> >   - Added the RET0 trampoline define which points to the generic stub
> >     __static_call_return0.
> >   - Made the HAVE_STATIC_CALL conditional on CFI as suggested by Ard.
> >   - Added .type and .size sections to the trampoline definition to
> >     support ABI tools.
> > 
> >  arch/arm64/Kconfig                   |  1 +
> >  arch/arm64/include/asm/static_call.h | 33 ++++++++++++++++++++++++++++
> >  arch/arm64/kernel/Makefile           |  1 +
> >  arch/arm64/kernel/static_call.c      | 20 +++++++++++++++++
> >  arch/arm64/kernel/vmlinux.lds.S      |  1 +
> >  5 files changed, 56 insertions(+)
> >  create mode 100644 arch/arm64/include/asm/static_call.h
> >  create mode 100644 arch/arm64/kernel/static_call.c
> > 
> > diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> > index 38dba5f7e4d2..9ea19b74b6c3 100644
> > --- a/arch/arm64/Kconfig
> > +++ b/arch/arm64/Kconfig
> > @@ -252,6 +252,7 @@ config ARM64
> >  	select HAVE_RSEQ
> >  	select HAVE_RUST if RUSTC_SUPPORTS_ARM64
> >  	select HAVE_STACKPROTECTOR
> > +	select HAVE_STATIC_CALL if CFI
> >  	select HAVE_SYSCALL_TRACEPOINTS
> >  	select HAVE_KPROBES
> >  	select HAVE_KRETPROBES
> > diff --git a/arch/arm64/include/asm/static_call.h b/arch/arm64/include/asm/static_call.h
> > new file mode 100644
> > index 000000000000..331580542fd4
> > --- /dev/null
> > +++ b/arch/arm64/include/asm/static_call.h
> > @@ -0,0 +1,33 @@
> > +/* SPDX-License-Identifier: GPL-2.0 */
> > +#ifndef _ASM_STATIC_CALL_H
> > +#define _ASM_STATIC_CALL_H
> > +
> > +#define __ARCH_DEFINE_STATIC_CALL_TRAMP(name, target)			    \
> > +	asm("	.pushsection .static_call.text, \"ax\"			\n" \
> > +	    "	.align	3						\n" \
> > +	    "	.globl	" STATIC_CALL_TRAMP_STR(name) "			\n" \
> > +	    STATIC_CALL_TRAMP_STR(name) ":				\n" \
> > +	    "	hint	34	/* BTI C */				\n" \
> 
> It doesn't really matter either way, but do we still support toolchains
> that don't understand "bti c"?

I tested with this early. A 8.2 gcc from the arm archives and it failed
to build:

   Error: selected processor does not support `bti c'

However, this was _before_ I added the conditional on CFI. Maybe clang
doesn't have this problem and technically speaking CFI would gate the
"bti c" usage? Because you can't have CFI and "old" gcc.

The CFI conditional might be later removed though so maybe lets keep it
as is?

--
Carlos Llamas


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v7] arm64: implement support for static call trampolines
  2026-03-13  6:18 [PATCH v7] arm64: implement support for static call trampolines Carlos Llamas
  2026-03-13  8:52 ` Peter Zijlstra
  2026-03-13 16:48 ` Sami Tolvanen
@ 2026-03-17 10:59 ` Ard Biesheuvel
  2026-03-17 11:24   ` Peter Zijlstra
  2 siblings, 1 reply; 8+ messages in thread
From: Ard Biesheuvel @ 2026-03-17 10:59 UTC (permalink / raw)
  To: Carlos Llamas, linux-arm-kernel
  Cc: Sami Tolvanen, Catalin Marinas, Will Deacon, Peter Zijlstra,
	Josh Poimboeuf, Mark Rutland, Kees Cook, Quentin Perret,
	Steven Rostedt, Will McVicker, Sean Christopherson, kernel-team,
	linux-kernel

Hi Carlos,

On Fri, 13 Mar 2026, at 07:18, Carlos Llamas wrote:
> From: Ard Biesheuvel <ardb@kernel.org>
>
> Implement arm64 support for the 'unoptimized' static call variety, which
> routes all calls through a single trampoline that is patched to perform a
> tail call to the selected function.
>
> Since static call targets may be located in modules loaded out of direct
> branching range, we need to use a ADRP/ADD pair to load the branch target
> into R16 and use a branch-to-register (BR) instruction to perform an
> indirect call. Unlike on x86, there is no pressing need on arm64 to avoid
> indirect calls at all cost, but hiding it from the compiler as is done
> here does have some benefits:
> - the literal is located in .rodata, which gives us the same robustness
>   advantage that code patching does;
> - no performance hit on CFI enabled Clang builds that decorate compiler
>   emitted indirect calls with branch target validity checks.
>

It was pointed out to me that this claim is unsubstantiated: IIRC this patch was written before kcfi was introduced, but even if it wasn't, it might be better to call out the actual difference here.

kCFI conditionally performs an indirect call to address 'x', after loading the u32 located at x-4 and comparing it with a compile time constant that encodes the function prototype expected by the call site.

The static call trampoline involves two branches: one direct branch to the trampoline, and an indirect one to the target function. (We can drop the conditional branch and the ret here, see below).

If there is any measurable difference, it will likely be highly dependent on micro-architectural details and the nature of the workload, and neither one is obviously more efficient.

TL;DR maybe just drop the bullet point? But at least drop the claim that it speeds up static call dispatch with CFI enabled.


> Cc: Peter Zijlstra (Intel) <peterz@infradead.org>
> Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
> Signed-off-by: Carlos Llamas <cmllamas@google.com>
> ---
> v7:
>   - Took Ard's v3 patch (as it leaves the code patching logic out) and
>     rebased it on top  of mainline 7.0-rc3.
>   - Dropped the changes to arch/arm64/lib/insn.c and instead switched to
>     the (now) existing aarch64_insn_write_literal_u64().
>   - Added the RET0 trampoline define which points to the generic stub
>     __static_call_return0.
>   - Made the HAVE_STATIC_CALL conditional on CFI as suggested by Ard.
>   - Added .type and .size sections to the trampoline definition to
>     support ABI tools.
>
>  arch/arm64/Kconfig                   |  1 +
>  arch/arm64/include/asm/static_call.h | 33 ++++++++++++++++++++++++++++
>  arch/arm64/kernel/Makefile           |  1 +
>  arch/arm64/kernel/static_call.c      | 20 +++++++++++++++++
>  arch/arm64/kernel/vmlinux.lds.S      |  1 +
>  5 files changed, 56 insertions(+)
>  create mode 100644 arch/arm64/include/asm/static_call.h
>  create mode 100644 arch/arm64/kernel/static_call.c
>
> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> index 38dba5f7e4d2..9ea19b74b6c3 100644
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -252,6 +252,7 @@ config ARM64
>  	select HAVE_RSEQ
>  	select HAVE_RUST if RUSTC_SUPPORTS_ARM64
>  	select HAVE_STACKPROTECTOR
> +	select HAVE_STATIC_CALL if CFI
>  	select HAVE_SYSCALL_TRACEPOINTS
>  	select HAVE_KPROBES
>  	select HAVE_KRETPROBES
> diff --git a/arch/arm64/include/asm/static_call.h 
> b/arch/arm64/include/asm/static_call.h
> new file mode 100644
> index 000000000000..331580542fd4
> --- /dev/null
> +++ b/arch/arm64/include/asm/static_call.h
> @@ -0,0 +1,33 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _ASM_STATIC_CALL_H
> +#define _ASM_STATIC_CALL_H
> +
> +#define __ARCH_DEFINE_STATIC_CALL_TRAMP(name, target)			    \
> +	asm("	.pushsection .static_call.text, \"ax\"			\n" \
> +	    "	.align	3						\n" \
> +	    "	.globl	" STATIC_CALL_TRAMP_STR(name) "			\n" \
> +	    STATIC_CALL_TRAMP_STR(name) ":				\n" \
> +	    "	hint	34	/* BTI C */				\n" \
> +	    "	adrp	x16, 1f						\n" \
> +	    "	ldr	x16, [x16, :lo12:1f]				\n" \
> +	    "	cbz	x16, 0f						\n" \
> +	    "	br	x16						\n" \
> +	    "0:	ret							\n" \
> +	    "	.type	" STATIC_CALL_TRAMP_STR(name) ", %function	\n" \
> +	    "	.size	" STATIC_CALL_TRAMP_STR(name) ", . - " 
> STATIC_CALL_TRAMP_STR(name) " \n" \
> +	    "	.popsection						\n" \
> +	    "	.pushsection .rodata, \"a\"				\n" \
> +	    "	.align	3						\n" \
> +	    "1:	.quad	" target "					\n" \
> +	    "	.popsection						\n")
> +
> +#define ARCH_DEFINE_STATIC_CALL_TRAMP(name, func)			\
> +	__ARCH_DEFINE_STATIC_CALL_TRAMP(name, #func)
> +
> +#define ARCH_DEFINE_STATIC_CALL_NULL_TRAMP(name)			\
> +	__ARCH_DEFINE_STATIC_CALL_TRAMP(name, "0x0")
> +

We could use either __static_call_return0 or __static_call_nop here, rather than 0x0, and do the same in the implementation of arch_static_call_transform(). That way, we can drop the cbz and ret instructions from the trampoline.

(__static_call_return0 is perfectly acceptable as a NOP, given that R0 is clobbered in any case after a function returning void returns, so just do whatever is easiest)

> +#define ARCH_DEFINE_STATIC_CALL_RET0_TRAMP(name)			\
> +	ARCH_DEFINE_STATIC_CALL_TRAMP(name, __static_call_return0)
> +
> +#endif /* _ASM_STATIC_CALL_H */
> diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
> index 76f32e424065..fe627100d199 100644
> --- a/arch/arm64/kernel/Makefile
> +++ b/arch/arm64/kernel/Makefile
> @@ -46,6 +46,7 @@ obj-$(CONFIG_MODULES)			+= module.o module-plts.o
>  obj-$(CONFIG_PERF_EVENTS)		+= perf_regs.o perf_callchain.o
>  obj-$(CONFIG_HARDLOCKUP_DETECTOR_PERF)	+= watchdog_hld.o
>  obj-$(CONFIG_HAVE_HW_BREAKPOINT)	+= hw_breakpoint.o
> +obj-$(CONFIG_HAVE_STATIC_CALL)		+= static_call.o
>  obj-$(CONFIG_CPU_PM)			+= sleep.o suspend.o
>  obj-$(CONFIG_KGDB)			+= kgdb.o
>  obj-$(CONFIG_EFI)			+= efi.o efi-rt-wrapper.o
> diff --git a/arch/arm64/kernel/static_call.c 
> b/arch/arm64/kernel/static_call.c
> new file mode 100644
> index 000000000000..944ecabb821f
> --- /dev/null
> +++ b/arch/arm64/kernel/static_call.c
> @@ -0,0 +1,20 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#include <linux/static_call.h>
> +#include <linux/memory.h>
> +#include <asm/text-patching.h>
> +
> +void arch_static_call_transform(void *site, void *tramp, void *func, 
> bool tail)
> +{
> +	u64 literal;
> +	int ret;
> +

Here, set func to &__static_call_return0 if it is NULL.

> +	/* decode the instructions to discover the literal address */
> +	literal = ALIGN_DOWN((u64)tramp + 4, SZ_4K) +
> +		  aarch64_insn_adrp_get_offset(le32_to_cpup(tramp + 4)) +
> +		  8 * aarch64_insn_decode_immediate(AARCH64_INSN_IMM_12,
> +						    le32_to_cpup(tramp + 8));
> +
> +	ret = aarch64_insn_write_literal_u64((void *)literal, (u64)func);
> +	WARN_ON_ONCE(ret);
> +}
> +EXPORT_SYMBOL_GPL(arch_static_call_transform);
> diff --git a/arch/arm64/kernel/vmlinux.lds.S 
> b/arch/arm64/kernel/vmlinux.lds.S
> index 2964aad0362e..2d1e75263f03 100644
> --- a/arch/arm64/kernel/vmlinux.lds.S
> +++ b/arch/arm64/kernel/vmlinux.lds.S
> @@ -191,6 +191,7 @@ SECTIONS
>  			LOCK_TEXT
>  			KPROBES_TEXT
>  			HYPERVISOR_TEXT
> +			STATIC_CALL_TEXT
>  			*(.gnu.warning)
>  	}
> 
> -- 
> 2.53.0.880.g73c4285caa-goog


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v7] arm64: implement support for static call trampolines
  2026-03-17 10:59 ` Ard Biesheuvel
@ 2026-03-17 11:24   ` Peter Zijlstra
  2026-03-17 11:31     ` Ard Biesheuvel
  0 siblings, 1 reply; 8+ messages in thread
From: Peter Zijlstra @ 2026-03-17 11:24 UTC (permalink / raw)
  To: Ard Biesheuvel
  Cc: Carlos Llamas, linux-arm-kernel, Sami Tolvanen, Catalin Marinas,
	Will Deacon, Josh Poimboeuf, Mark Rutland, Kees Cook,
	Quentin Perret, Steven Rostedt, Will McVicker,
	Sean Christopherson, kernel-team, linux-kernel

On Tue, Mar 17, 2026 at 11:59:49AM +0100, Ard Biesheuvel wrote:

> > +#define __ARCH_DEFINE_STATIC_CALL_TRAMP(name, target)			    \
> > +	asm("	.pushsection .static_call.text, \"ax\"			\n" \
> > +	    "	.align	3						\n" \
> > +	    "	.globl	" STATIC_CALL_TRAMP_STR(name) "			\n" \
> > +	    STATIC_CALL_TRAMP_STR(name) ":				\n" \
> > +	    "	hint	34	/* BTI C */				\n" \
> > +	    "	adrp	x16, 1f						\n" \
> > +	    "	ldr	x16, [x16, :lo12:1f]				\n" \
> > +	    "	cbz	x16, 0f						\n" \
> > +	    "	br	x16						\n" \
> > +	    "0:	ret							\n" \
> > +	    "	.type	" STATIC_CALL_TRAMP_STR(name) ", %function	\n" \
> > +	    "	.size	" STATIC_CALL_TRAMP_STR(name) ", . - " 
> > STATIC_CALL_TRAMP_STR(name) " \n" \
> > +	    "	.popsection						\n" \
> > +	    "	.pushsection .rodata, \"a\"				\n" \
> > +	    "	.align	3						\n" \
> > +	    "1:	.quad	" target "					\n" \
> > +	    "	.popsection						\n")

> > +void arch_static_call_transform(void *site, void *tramp, void *func, 
> > bool tail)
> > +{
> > +	u64 literal;
> > +	int ret;
> > +
> 
> Here, set func to &__static_call_return0 if it is NULL.

I'm confused. NULL is for the static_call_cond() case, where we NO-OP.
And the trampoline above does that cbz 0f to ret. So far so good.

But ret0 should return 0, and IIRC arm64 uses x0 for the return value.
But I don't see the above clearing it. Hmm?

> > +	/* decode the instructions to discover the literal address */
> > +	literal = ALIGN_DOWN((u64)tramp + 4, SZ_4K) +
> > +		  aarch64_insn_adrp_get_offset(le32_to_cpup(tramp + 4)) +
> > +		  8 * aarch64_insn_decode_immediate(AARCH64_INSN_IMM_12,
> > +						    le32_to_cpup(tramp + 8));
> > +
> > +	ret = aarch64_insn_write_literal_u64((void *)literal, (u64)func);
> > +	WARN_ON_ONCE(ret);
> > +}
> > +EXPORT_SYMBOL_GPL(arch_static_call_transform);


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v7] arm64: implement support for static call trampolines
  2026-03-17 11:24   ` Peter Zijlstra
@ 2026-03-17 11:31     ` Ard Biesheuvel
  2026-03-17 11:34       ` Peter Zijlstra
  0 siblings, 1 reply; 8+ messages in thread
From: Ard Biesheuvel @ 2026-03-17 11:31 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Carlos Llamas, linux-arm-kernel, Sami Tolvanen, Catalin Marinas,
	Will Deacon, Josh Poimboeuf, Mark Rutland, Kees Cook,
	Quentin Perret, Steven Rostedt, Will McVicker,
	Sean Christopherson, kernel-team, linux-kernel


On Tue, 17 Mar 2026, at 12:24, Peter Zijlstra wrote:
> On Tue, Mar 17, 2026 at 11:59:49AM +0100, Ard Biesheuvel wrote:
>
>> > +#define __ARCH_DEFINE_STATIC_CALL_TRAMP(name, target)			    \
>> > +	asm("	.pushsection .static_call.text, \"ax\"			\n" \
>> > +	    "	.align	3						\n" \
>> > +	    "	.globl	" STATIC_CALL_TRAMP_STR(name) "			\n" \
>> > +	    STATIC_CALL_TRAMP_STR(name) ":				\n" \
>> > +	    "	hint	34	/* BTI C */				\n" \
>> > +	    "	adrp	x16, 1f						\n" \
>> > +	    "	ldr	x16, [x16, :lo12:1f]				\n" \
>> > +	    "	cbz	x16, 0f						\n" \
>> > +	    "	br	x16						\n" \
>> > +	    "0:	ret							\n" \
>> > +	    "	.type	" STATIC_CALL_TRAMP_STR(name) ", %function	\n" \
>> > +	    "	.size	" STATIC_CALL_TRAMP_STR(name) ", . - " 
>> > STATIC_CALL_TRAMP_STR(name) " \n" \
>> > +	    "	.popsection						\n" \
>> > +	    "	.pushsection .rodata, \"a\"				\n" \
>> > +	    "	.align	3						\n" \
>> > +	    "1:	.quad	" target "					\n" \
>> > +	    "	.popsection						\n")
>
>> > +void arch_static_call_transform(void *site, void *tramp, void *func, 
>> > bool tail)
>> > +{
>> > +	u64 literal;
>> > +	int ret;
>> > +
>> 
>> Here, set func to &__static_call_return0 if it is NULL.
>
> I'm confused. NULL is for the static_call_cond() case, where we NO-OP.
> And the trampoline above does that cbz 0f to ret. So far so good.
>
> But ret0 should return 0, and IIRC arm64 uses x0 for the return value.
> But I don't see the above clearing it. Hmm?
>

The RET0 case will tail call __static_call_return0() which will take care of this.

I am just saying that the NULL case could just do the same, rather than have a conditional branch in the trampoline, as even in that case, the surrounding code must assume that X0 is clobbered. Alternatively, we could tail call __static_call_nop(), which would do just the 'ret'.

IOW, if we guarantee that the target is always set to something appropriate, we can elide the NULL check, and __static_call_return0() and __static_call_nop() are equally appropriate for the NULL case.


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v7] arm64: implement support for static call trampolines
  2026-03-17 11:31     ` Ard Biesheuvel
@ 2026-03-17 11:34       ` Peter Zijlstra
  0 siblings, 0 replies; 8+ messages in thread
From: Peter Zijlstra @ 2026-03-17 11:34 UTC (permalink / raw)
  To: Ard Biesheuvel
  Cc: Carlos Llamas, linux-arm-kernel, Sami Tolvanen, Catalin Marinas,
	Will Deacon, Josh Poimboeuf, Mark Rutland, Kees Cook,
	Quentin Perret, Steven Rostedt, Will McVicker,
	Sean Christopherson, kernel-team, linux-kernel

On Tue, Mar 17, 2026 at 12:31:51PM +0100, Ard Biesheuvel wrote:
> 
> On Tue, 17 Mar 2026, at 12:24, Peter Zijlstra wrote:
> > On Tue, Mar 17, 2026 at 11:59:49AM +0100, Ard Biesheuvel wrote:
> >
> >> > +#define __ARCH_DEFINE_STATIC_CALL_TRAMP(name, target)			    \
> >> > +	asm("	.pushsection .static_call.text, \"ax\"			\n" \
> >> > +	    "	.align	3						\n" \
> >> > +	    "	.globl	" STATIC_CALL_TRAMP_STR(name) "			\n" \
> >> > +	    STATIC_CALL_TRAMP_STR(name) ":				\n" \
> >> > +	    "	hint	34	/* BTI C */				\n" \
> >> > +	    "	adrp	x16, 1f						\n" \
> >> > +	    "	ldr	x16, [x16, :lo12:1f]				\n" \
> >> > +	    "	cbz	x16, 0f						\n" \
> >> > +	    "	br	x16						\n" \
> >> > +	    "0:	ret							\n" \
> >> > +	    "	.type	" STATIC_CALL_TRAMP_STR(name) ", %function	\n" \
> >> > +	    "	.size	" STATIC_CALL_TRAMP_STR(name) ", . - " 
> >> > STATIC_CALL_TRAMP_STR(name) " \n" \
> >> > +	    "	.popsection						\n" \
> >> > +	    "	.pushsection .rodata, \"a\"				\n" \
> >> > +	    "	.align	3						\n" \
> >> > +	    "1:	.quad	" target "					\n" \
> >> > +	    "	.popsection						\n")
> >
> >> > +void arch_static_call_transform(void *site, void *tramp, void *func, 
> >> > bool tail)
> >> > +{
> >> > +	u64 literal;
> >> > +	int ret;
> >> > +
> >> 
> >> Here, set func to &__static_call_return0 if it is NULL.
> >
> > I'm confused. NULL is for the static_call_cond() case, where we NO-OP.
> > And the trampoline above does that cbz 0f to ret. So far so good.
> >
> > But ret0 should return 0, and IIRC arm64 uses x0 for the return value.
> > But I don't see the above clearing it. Hmm?
> >

(your MUA seems busted and generates these silly long lines, let me
reflow again)

> The RET0 case will tail call __static_call_return0() which will take
> care of this.
> 
> I am just saying that the NULL case could just do the same, rather
> than have a conditional branch in the trampoline, as even in that
> case, the surrounding code must assume that X0 is clobbered.
> Alternatively, we could tail call __static_call_nop(), which would do
> just the 'ret'.
> 
> IOW, if we guarantee that the target is always set to something
> appropriate, we can elide the NULL check, and __static_call_return0()
> and __static_call_nop() are equally appropriate for the NULL case.

Ah, yes that is possible. Trade that cbz for an unconditional branch to
a function. That works fine.

Do whatever is best for the uarch etc..


^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2026-03-17 11:34 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-03-13  6:18 [PATCH v7] arm64: implement support for static call trampolines Carlos Llamas
2026-03-13  8:52 ` Peter Zijlstra
2026-03-13 16:48 ` Sami Tolvanen
2026-03-13 17:15   ` Carlos Llamas
2026-03-17 10:59 ` Ard Biesheuvel
2026-03-17 11:24   ` Peter Zijlstra
2026-03-17 11:31     ` Ard Biesheuvel
2026-03-17 11:34       ` Peter Zijlstra

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox