public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: Carlos Llamas <cmllamas@google.com>
To: Ard Biesheuvel <ardb@kernel.org>
Cc: linux-arm-kernel@lists.infradead.org,
	linux-kernel@vger.kernel.org, Mark Rutland <mark.rutland@arm.com>,
	Quentin Perret <qperret@google.com>,
	Catalin Marinas <catalin.marinas@arm.com>,
	James Morse <james.morse@arm.com>, Will Deacon <will@kernel.org>,
	Frederic Weisbecker <frederic@kernel.org>,
	Peter Zijlstra <peterz@infradead.org>,
	Kees Cook <keescook@chromium.org>,
	Sami Tolvanen <samitolvanen@google.com>,
	Andy Lutomirski <luto@kernel.org>,
	Josh Poimboeuf <jpoimboe@redhat.com>,
	Steven Rostedt <rostedt@goodmis.org>
Subject: Re: [PATCH v6 2/2] arm64: implement support for static call trampolines
Date: Thu, 12 Mar 2026 18:02:34 +0000	[thread overview]
Message-ID: <abL_ulvlC9JT7Zgw@google.com> (raw)
In-Reply-To: <20211105145917.2828911-3-ardb@kernel.org>

On Fri, Nov 05, 2021 at 03:59:17PM +0100, Ard Biesheuvel wrote:
> Implement arm64 support for the 'unoptimized' static call variety, which
> routes all calls through a single trampoline that is patched to perform a
> tail call to the selected function.
> 
> It is expected that the direct branch instruction will be able to cover
> the common case. However, given that static call targets may be located
> in modules loaded out of direct branching range, we need a fallback path
> that loads the address into R16 and uses a branch-to-register (BR)
> instruction to perform an indirect call.
> 
> Unlike on x86, there is no pressing need on arm64 to avoid indirect
> calls at all cost, but hiding it from the compiler as is done here does
> have some benefits:
> - the literal is located in .text, which gives us the same robustness
>   advantage that code patching does;
> - no performance hit on CFI enabled Clang builds that decorate compiler
>   emitted indirect calls with branch target validity checks.
> 
> Acked-by: Peter Zijlstra <peterz@infradead.org>
> Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
> ---

I'm starting to test this out on top of 7.0-rc3...


>  arch/arm64/Kconfig                   |  2 +
>  arch/arm64/include/asm/static_call.h | 40 ++++++++++
>  arch/arm64/kernel/patching.c         | 77 +++++++++++++++++++-
>  arch/arm64/kernel/vmlinux.lds.S      |  1 +
>  4 files changed, 117 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> index 176d6fddc4f2..ccc33b85769c 100644
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -193,6 +193,8 @@ config ARM64
>  	select HAVE_PERF_USER_STACK_DUMP
>  	select HAVE_REGS_AND_STACK_ACCESS_API
>  	select HAVE_POSIX_CPU_TIMERS_TASK_WORK
> +	# https://github.com/ClangBuiltLinux/linux/issues/1354
> +	select HAVE_STATIC_CALL if !LTO_CLANG_THIN || CLANG_VERSION >= 130000

I got a circular dependency error on this...

  error: recursive dependency detected!
	symbol GCOV_KERNEL depends on DEBUG_FS
	symbol DEBUG_FS is selected by GPIO_VIRTUSER
	symbol GPIO_VIRTUSER depends on GPIOLIB
	symbol GPIOLIB is selected by CEC_GPIO
	symbol CEC_GPIO depends on PREEMPTION
	symbol PREEMPTION is selected by PREEMPT_BUILD
	symbol PREEMPT_BUILD is selected by PREEMPT_DYNAMIC
	symbol PREEMPT_DYNAMIC depends on HAVE_PREEMPT_DYNAMIC
	symbol HAVE_PREEMPT_DYNAMIC is selected by HAVE_PREEMPT_DYNAMIC_CALL
	symbol HAVE_PREEMPT_DYNAMIC_CALL depends on HAVE_STATIC_CALL
	symbol HAVE_STATIC_CALL is selected by LTO_CLANG_THIN
	symbol LTO_CLANG_THIN is part of choice block at arch/Kconfig:817
	symbol <choice> unknown is visible depending on LTO_CLANG_FULL
	symbol LTO_CLANG_FULL prompt is visible depending on HAS_LTO_CLANG
	symbol HAS_LTO_CLANG depends on GCOV_KERNEL

...so I just dropped the checks altogether for now.


>  	select HAVE_FUNCTION_ARG_ACCESS_API
>  	select HAVE_FUTEX_CMPXCHG if FUTEX
>  	select MMU_GATHER_RCU_TABLE_FREE
> diff --git a/arch/arm64/include/asm/static_call.h b/arch/arm64/include/asm/static_call.h
> new file mode 100644
> index 000000000000..6ee918991510
> --- /dev/null
> +++ b/arch/arm64/include/asm/static_call.h
> @@ -0,0 +1,40 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _ASM_STATIC_CALL_H
> +#define _ASM_STATIC_CALL_H
> +
> +/*
> + * The sequence below is laid out in a way that guarantees that the literal and
> + * the instruction are always covered by the same cacheline, and can be updated
> + * using a single store-pair instruction (provided that we rewrite the BTI C
> + * instruction as well). This means the literal and the instruction are always
> + * in sync when observed via the D-side.
> + *
> + * However, this does not guarantee that the I-side will catch up immediately
> + * as well: until the I-cache maintenance completes, CPUs may branch to the old
> + * target, or execute a stale NOP or RET. We deal with this by writing the
> + * literal unconditionally, even if it is 0x0 or the branch is in range. That
> + * way, a stale NOP will fall through and call the new target via an indirect
> + * call. Stale RETs or Bs will be taken as before, and branch to the old
> + * target.
> + */
> +#define __ARCH_DEFINE_STATIC_CALL_TRAMP(name, insn)			    \
> +	asm("	.pushsection	.static_call.text, \"ax\"		\n" \
> +	    "	.align		4					\n" \
> +	    "	.globl		" STATIC_CALL_TRAMP_STR(name) "		\n" \
> +	    "0:	.quad		0x0					\n" \
> +	    STATIC_CALL_TRAMP_STR(name) ":				\n" \
> +	    "	hint 		34	/* BTI C */			\n" \
> +		insn "							\n" \
> +	    "	ldr		x16, 0b					\n" \
> +	    "	cbz		x16, 1f					\n" \
> +	    "	br		x16					\n" \
> +	    "1:	ret							\n" \
> +	    "	.popsection						\n")
> +
> +#define ARCH_DEFINE_STATIC_CALL_TRAMP(name, func)			\
> +	__ARCH_DEFINE_STATIC_CALL_TRAMP(name, "b " #func)
> +
> +#define ARCH_DEFINE_STATIC_CALL_NULL_TRAMP(name)			\
> +	__ARCH_DEFINE_STATIC_CALL_TRAMP(name, "ret")
> +

Now that we have the RET0 implementation I added:

--- a/arch/arm64/include/asm/static_call.h
+++ b/arch/arm64/include/asm/static_call.h
@@ -37,4 +37,7 @@
 #define ARCH_DEFINE_STATIC_CALL_NULL_TRAMP(name)			\
 	__ARCH_DEFINE_STATIC_CALL_TRAMP(name, "ret")
 
+#define ARCH_DEFINE_STATIC_CALL_RET0_TRAMP(name)			\
+	__ARCH_DEFINE_STATIC_CALL_TRAMP(name, "mov x0, xzr")
+
 #endif /* _ASM_STATIC_CALL_H */


> +#endif /* _ASM_STATIC_CALL_H */
> diff --git a/arch/arm64/kernel/patching.c b/arch/arm64/kernel/patching.c
> index 771f543464e0..a265a87d4d9e 100644
> --- a/arch/arm64/kernel/patching.c
> +++ b/arch/arm64/kernel/patching.c
> @@ -3,6 +3,7 @@
>  #include <linux/mm.h>
>  #include <linux/smp.h>
>  #include <linux/spinlock.h>
> +#include <linux/static_call.h>
>  #include <linux/stop_machine.h>
>  #include <linux/uaccess.h>
>  
> @@ -66,7 +67,7 @@ int __kprobes aarch64_insn_read(void *addr, u32 *insnp)
>  	return ret;
>  }
>  
> -static int __kprobes __aarch64_insn_write(void *addr, __le32 insn)
> +static int __kprobes __aarch64_insn_write(void *addr, void *insn, int size)
>  {
>  	void *waddr = addr;
>  	unsigned long flags = 0;
> @@ -75,7 +76,7 @@ static int __kprobes __aarch64_insn_write(void *addr, __le32 insn)
>  	raw_spin_lock_irqsave(&patch_lock, flags);
>  	waddr = patch_map(addr, FIX_TEXT_POKE0);
>  
> -	ret = copy_to_kernel_nofault(waddr, &insn, AARCH64_INSN_SIZE);
> +	ret = copy_to_kernel_nofault(waddr, insn, size);
>  
>  	patch_unmap(FIX_TEXT_POKE0);
>  	raw_spin_unlock_irqrestore(&patch_lock, flags);
> @@ -85,7 +86,77 @@ static int __kprobes __aarch64_insn_write(void *addr, __le32 insn)
>  
>  int __kprobes aarch64_insn_write(void *addr, u32 insn)
>  {
> -	return __aarch64_insn_write(addr, cpu_to_le32(insn));
> +	__le32 i = cpu_to_le32(insn);
> +
> +	return __aarch64_insn_write(addr, &i, AARCH64_INSN_SIZE);
> +}
> +
> +static void *strip_cfi_jt(void *addr)
> +{
> +	if (IS_ENABLED(CONFIG_CFI_CLANG)) {

I believe this is now just "CONFIG_CFI".

> +		void *p = addr;
> +		u32 insn;
> +
> +		/*
> +		 * Taking the address of a function produces the address of the
> +		 * jump table entry when Clang CFI is enabled. Such entries are
> +		 * ordinary jump instructions, preceded by a BTI C instruction
> +		 * if BTI is enabled for the kernel.
> +		 */
> +		if (IS_ENABLED(CONFIG_ARM64_BTI_KERNEL))
> +			p += 4;
> +
> +		insn = le32_to_cpup(p);
> +		if (aarch64_insn_is_b(insn))
> +			return p + aarch64_get_branch_offset(insn);
> +
> +		WARN_ON(1);
> +	}
> +	return addr;
> +}
> +
> +void arch_static_call_transform(void *site, void *tramp, void *func, bool tail)
> +{
> +	/*
> +	 * -0x8	<literal>
> +	 *  0x0	bti c		<--- trampoline entry point
> +	 *  0x4	<branch or nop>
> +	 *  0x8	ldr x16, <literal>
> +	 *  0xc	cbz x16, 20
> +	 * 0x10	br x16
> +	 * 0x14	ret
> +	 */
> +	struct {
> +		u64	literal;
> +		__le32	insn[2];
> +	} insns;
> +	u32 insn;
> +	int ret;
> +
> +	insn = aarch64_insn_gen_hint(AARCH64_INSN_HINT_BTIC);
> +	insns.literal = (u64)func;
> +	insns.insn[0] = cpu_to_le32(insn);
> +
> +	if (!func) {
> +		insn = aarch64_insn_gen_branch_reg(AARCH64_INSN_REG_LR,
> +						   AARCH64_INSN_BRANCH_RETURN);
> +	} else {
> +		insn = aarch64_insn_gen_branch_imm((u64)tramp + 4,
> +						   (u64)strip_cfi_jt(func),
> +						   AARCH64_INSN_BRANCH_NOLINK);
> +
> +		/*
> +		 * Use a NOP if the branch target is out of range, and rely on
> +		 * the indirect call instead.
> +		 */
> +		if (insn == AARCH64_BREAK_FAULT)
> +			insn = aarch64_insn_gen_hint(AARCH64_INSN_HINT_NOP);
> +	}
> +	insns.insn[1] = cpu_to_le32(insn);
> +
> +	ret = __aarch64_insn_write(tramp - 8, &insns, sizeof(insns));
> +	if (!WARN_ON(ret))
> +		caches_clean_inval_pou((u64)tramp - 8, sizeof(insns));
>  }
>  
>  int __kprobes aarch64_insn_patch_text_nosync(void *addr, u32 insn)
> diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S
> index 50bab186c49b..e16860a14eaf 100644
> --- a/arch/arm64/kernel/vmlinux.lds.S
> +++ b/arch/arm64/kernel/vmlinux.lds.S
> @@ -173,6 +173,7 @@ SECTIONS
>  			HIBERNATE_TEXT
>  			KEXEC_TEXT
>  			TRAMP_TEXT
> +			STATIC_CALL_TEXT
>  			*(.gnu.warning)
>  		. = ALIGN(16);
>  		*(.got)			/* Global offset table		*/
> -- 
> 2.30.2
> 

A CFI crash was reported in [1]. Also, I was able to reproduce a CFI
failure with just a simple "linux-perf" command. With this patchset I no
longer see these issues. Awesome!

[1] https://lore.kernel.org/all/YfrQzoIWyv9lNljh@google.com/

--
Carlos Llamas

  parent reply	other threads:[~2026-03-12 18:02 UTC|newest]

Thread overview: 15+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-11-05 14:59 [PATCH v6 0/2] static call support for arm64 Ard Biesheuvel
2021-11-05 14:59 ` [PATCH v6 1/2] static_call: use non-function types to refer to the trampolines Ard Biesheuvel
2021-11-08 10:08   ` Peter Zijlstra
2021-11-05 14:59 ` [PATCH v6 2/2] arm64: implement support for static call trampolines Ard Biesheuvel
2021-11-08 10:23   ` Peter Zijlstra
2021-11-08 11:29     ` Ard Biesheuvel
2021-11-08 11:52       ` Peter Zijlstra
2021-11-09 17:55   ` Mark Rutland
2021-11-09 18:09     ` Ard Biesheuvel
2021-11-09 19:02       ` Quentin Perret
2021-11-10 11:09         ` Mark Rutland
2021-11-10 12:05           ` Quentin Perret
2026-03-12 18:02   ` Carlos Llamas [this message]
2026-03-12 18:35     ` Ard Biesheuvel
2026-03-12 22:10       ` Carlos Llamas

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=abL_ulvlC9JT7Zgw@google.com \
    --to=cmllamas@google.com \
    --cc=ardb@kernel.org \
    --cc=catalin.marinas@arm.com \
    --cc=frederic@kernel.org \
    --cc=james.morse@arm.com \
    --cc=jpoimboe@redhat.com \
    --cc=keescook@chromium.org \
    --cc=linux-arm-kernel@lists.infradead.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=luto@kernel.org \
    --cc=mark.rutland@arm.com \
    --cc=peterz@infradead.org \
    --cc=qperret@google.com \
    --cc=rostedt@goodmis.org \
    --cc=samitolvanen@google.com \
    --cc=will@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox