All of lore.kernel.org
 help / color / mirror / Atom feed
From: Carlos Llamas <cmllamas@google.com>
To: Ard Biesheuvel <ardb@kernel.org>
Cc: linux-arm-kernel@lists.infradead.org,
	linux-kernel@vger.kernel.org, Mark Rutland <mark.rutland@arm.com>,
	Quentin Perret <qperret@google.com>,
	Catalin Marinas <catalin.marinas@arm.com>,
	James Morse <james.morse@arm.com>, Will Deacon <will@kernel.org>,
	Frederic Weisbecker <frederic@kernel.org>,
	Peter Zijlstra <peterz@infradead.org>,
	Kees Cook <keescook@chromium.org>,
	Sami Tolvanen <samitolvanen@google.com>,
	Andy Lutomirski <luto@kernel.org>,
	Josh Poimboeuf <jpoimboe@redhat.com>,
	Steven Rostedt <rostedt@goodmis.org>
Subject: Re: [PATCH v6 2/2] arm64: implement support for static call trampolines
Date: Thu, 12 Mar 2026 18:02:34 +0000	[thread overview]
Message-ID: <abL_ulvlC9JT7Zgw@google.com> (raw)
In-Reply-To: <20211105145917.2828911-3-ardb@kernel.org>

On Fri, Nov 05, 2021 at 03:59:17PM +0100, Ard Biesheuvel wrote:
> Implement arm64 support for the 'unoptimized' static call variety, which
> routes all calls through a single trampoline that is patched to perform a
> tail call to the selected function.
> 
> It is expected that the direct branch instruction will be able to cover
> the common case. However, given that static call targets may be located
> in modules loaded out of direct branching range, we need a fallback path
> that loads the address into R16 and uses a branch-to-register (BR)
> instruction to perform an indirect call.
> 
> Unlike on x86, there is no pressing need on arm64 to avoid indirect
> calls at all cost, but hiding it from the compiler as is done here does
> have some benefits:
> - the literal is located in .text, which gives us the same robustness
>   advantage that code patching does;
> - no performance hit on CFI enabled Clang builds that decorate compiler
>   emitted indirect calls with branch target validity checks.
> 
> Acked-by: Peter Zijlstra <peterz@infradead.org>
> Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
> ---

I'm starting to test this out on top of 7.0-rc3...


>  arch/arm64/Kconfig                   |  2 +
>  arch/arm64/include/asm/static_call.h | 40 ++++++++++
>  arch/arm64/kernel/patching.c         | 77 +++++++++++++++++++-
>  arch/arm64/kernel/vmlinux.lds.S      |  1 +
>  4 files changed, 117 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> index 176d6fddc4f2..ccc33b85769c 100644
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -193,6 +193,8 @@ config ARM64
>  	select HAVE_PERF_USER_STACK_DUMP
>  	select HAVE_REGS_AND_STACK_ACCESS_API
>  	select HAVE_POSIX_CPU_TIMERS_TASK_WORK
> +	# https://github.com/ClangBuiltLinux/linux/issues/1354
> +	select HAVE_STATIC_CALL if !LTO_CLANG_THIN || CLANG_VERSION >= 130000

I got a circular dependency error on this...

  error: recursive dependency detected!
	symbol GCOV_KERNEL depends on DEBUG_FS
	symbol DEBUG_FS is selected by GPIO_VIRTUSER
	symbol GPIO_VIRTUSER depends on GPIOLIB
	symbol GPIOLIB is selected by CEC_GPIO
	symbol CEC_GPIO depends on PREEMPTION
	symbol PREEMPTION is selected by PREEMPT_BUILD
	symbol PREEMPT_BUILD is selected by PREEMPT_DYNAMIC
	symbol PREEMPT_DYNAMIC depends on HAVE_PREEMPT_DYNAMIC
	symbol HAVE_PREEMPT_DYNAMIC is selected by HAVE_PREEMPT_DYNAMIC_CALL
	symbol HAVE_PREEMPT_DYNAMIC_CALL depends on HAVE_STATIC_CALL
	symbol HAVE_STATIC_CALL is selected by LTO_CLANG_THIN
	symbol LTO_CLANG_THIN is part of choice block at arch/Kconfig:817
	symbol <choice> unknown is visible depending on LTO_CLANG_FULL
	symbol LTO_CLANG_FULL prompt is visible depending on HAS_LTO_CLANG
	symbol HAS_LTO_CLANG depends on GCOV_KERNEL

...so I just dropped the checks altogether for now.


>  	select HAVE_FUNCTION_ARG_ACCESS_API
>  	select HAVE_FUTEX_CMPXCHG if FUTEX
>  	select MMU_GATHER_RCU_TABLE_FREE
> diff --git a/arch/arm64/include/asm/static_call.h b/arch/arm64/include/asm/static_call.h
> new file mode 100644
> index 000000000000..6ee918991510
> --- /dev/null
> +++ b/arch/arm64/include/asm/static_call.h
> @@ -0,0 +1,40 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _ASM_STATIC_CALL_H
> +#define _ASM_STATIC_CALL_H
> +
> +/*
> + * The sequence below is laid out in a way that guarantees that the literal and
> + * the instruction are always covered by the same cacheline, and can be updated
> + * using a single store-pair instruction (provided that we rewrite the BTI C
> + * instruction as well). This means the literal and the instruction are always
> + * in sync when observed via the D-side.
> + *
> + * However, this does not guarantee that the I-side will catch up immediately
> + * as well: until the I-cache maintenance completes, CPUs may branch to the old
> + * target, or execute a stale NOP or RET. We deal with this by writing the
> + * literal unconditionally, even if it is 0x0 or the branch is in range. That
> + * way, a stale NOP will fall through and call the new target via an indirect
> + * call. Stale RETs or Bs will be taken as before, and branch to the old
> + * target.
> + */
> +#define __ARCH_DEFINE_STATIC_CALL_TRAMP(name, insn)			    \
> +	asm("	.pushsection	.static_call.text, \"ax\"		\n" \
> +	    "	.align		4					\n" \
> +	    "	.globl		" STATIC_CALL_TRAMP_STR(name) "		\n" \
> +	    "0:	.quad		0x0					\n" \
> +	    STATIC_CALL_TRAMP_STR(name) ":				\n" \
> +	    "	hint 		34	/* BTI C */			\n" \
> +		insn "							\n" \
> +	    "	ldr		x16, 0b					\n" \
> +	    "	cbz		x16, 1f					\n" \
> +	    "	br		x16					\n" \
> +	    "1:	ret							\n" \
> +	    "	.popsection						\n")
> +
> +#define ARCH_DEFINE_STATIC_CALL_TRAMP(name, func)			\
> +	__ARCH_DEFINE_STATIC_CALL_TRAMP(name, "b " #func)
> +
> +#define ARCH_DEFINE_STATIC_CALL_NULL_TRAMP(name)			\
> +	__ARCH_DEFINE_STATIC_CALL_TRAMP(name, "ret")
> +

Now that we have the RET0 implementation, I added:

--- a/arch/arm64/include/asm/static_call.h
+++ b/arch/arm64/include/asm/static_call.h
@@ -37,4 +37,7 @@
 #define ARCH_DEFINE_STATIC_CALL_NULL_TRAMP(name)			\
 	__ARCH_DEFINE_STATIC_CALL_TRAMP(name, "ret")
 
+#define ARCH_DEFINE_STATIC_CALL_RET0_TRAMP(name)			\
+	__ARCH_DEFINE_STATIC_CALL_TRAMP(name, "mov x0, xzr")
+
 #endif /* _ASM_STATIC_CALL_H */


> +#endif /* _ASM_STATIC_CALL_H */
> diff --git a/arch/arm64/kernel/patching.c b/arch/arm64/kernel/patching.c
> index 771f543464e0..a265a87d4d9e 100644
> --- a/arch/arm64/kernel/patching.c
> +++ b/arch/arm64/kernel/patching.c
> @@ -3,6 +3,7 @@
>  #include <linux/mm.h>
>  #include <linux/smp.h>
>  #include <linux/spinlock.h>
> +#include <linux/static_call.h>
>  #include <linux/stop_machine.h>
>  #include <linux/uaccess.h>
>  
> @@ -66,7 +67,7 @@ int __kprobes aarch64_insn_read(void *addr, u32 *insnp)
>  	return ret;
>  }
>  
> -static int __kprobes __aarch64_insn_write(void *addr, __le32 insn)
> +static int __kprobes __aarch64_insn_write(void *addr, void *insn, int size)
>  {
>  	void *waddr = addr;
>  	unsigned long flags = 0;
> @@ -75,7 +76,7 @@ static int __kprobes __aarch64_insn_write(void *addr, __le32 insn)
>  	raw_spin_lock_irqsave(&patch_lock, flags);
>  	waddr = patch_map(addr, FIX_TEXT_POKE0);
>  
> -	ret = copy_to_kernel_nofault(waddr, &insn, AARCH64_INSN_SIZE);
> +	ret = copy_to_kernel_nofault(waddr, insn, size);
>  
>  	patch_unmap(FIX_TEXT_POKE0);
>  	raw_spin_unlock_irqrestore(&patch_lock, flags);
> @@ -85,7 +86,77 @@ static int __kprobes __aarch64_insn_write(void *addr, __le32 insn)
>  
>  int __kprobes aarch64_insn_write(void *addr, u32 insn)
>  {
> -	return __aarch64_insn_write(addr, cpu_to_le32(insn));
> +	__le32 i = cpu_to_le32(insn);
> +
> +	return __aarch64_insn_write(addr, &i, AARCH64_INSN_SIZE);
> +}
> +
> +static void *strip_cfi_jt(void *addr)
> +{
> +	if (IS_ENABLED(CONFIG_CFI_CLANG)) {

I believe this is now just "CONFIG_CFI".

> +		void *p = addr;
> +		u32 insn;
> +
> +		/*
> +		 * Taking the address of a function produces the address of the
> +		 * jump table entry when Clang CFI is enabled. Such entries are
> +		 * ordinary jump instructions, preceded by a BTI C instruction
> +		 * if BTI is enabled for the kernel.
> +		 */
> +		if (IS_ENABLED(CONFIG_ARM64_BTI_KERNEL))
> +			p += 4;
> +
> +		insn = le32_to_cpup(p);
> +		if (aarch64_insn_is_b(insn))
> +			return p + aarch64_get_branch_offset(insn);
> +
> +		WARN_ON(1);
> +	}
> +	return addr;
> +}
> +
> +void arch_static_call_transform(void *site, void *tramp, void *func, bool tail)
> +{
> +	/*
> +	 * -0x8	<literal>
> +	 *  0x0	bti c		<--- trampoline entry point
> +	 *  0x4	<branch or nop>
> +	 *  0x8	ldr x16, <literal>
> +	 *  0xc	cbz x16, 20
> +	 * 0x10	br x16
> +	 * 0x14	ret
> +	 */
> +	struct {
> +		u64	literal;
> +		__le32	insn[2];
> +	} insns;
> +	u32 insn;
> +	int ret;
> +
> +	insn = aarch64_insn_gen_hint(AARCH64_INSN_HINT_BTIC);
> +	insns.literal = (u64)func;
> +	insns.insn[0] = cpu_to_le32(insn);
> +
> +	if (!func) {
> +		insn = aarch64_insn_gen_branch_reg(AARCH64_INSN_REG_LR,
> +						   AARCH64_INSN_BRANCH_RETURN);
> +	} else {
> +		insn = aarch64_insn_gen_branch_imm((u64)tramp + 4,
> +						   (u64)strip_cfi_jt(func),
> +						   AARCH64_INSN_BRANCH_NOLINK);
> +
> +		/*
> +		 * Use a NOP if the branch target is out of range, and rely on
> +		 * the indirect call instead.
> +		 */
> +		if (insn == AARCH64_BREAK_FAULT)
> +			insn = aarch64_insn_gen_hint(AARCH64_INSN_HINT_NOP);
> +	}
> +	insns.insn[1] = cpu_to_le32(insn);
> +
> +	ret = __aarch64_insn_write(tramp - 8, &insns, sizeof(insns));
> +	if (!WARN_ON(ret))
> +		caches_clean_inval_pou((u64)tramp - 8, sizeof(insns));
>  }
>  
>  int __kprobes aarch64_insn_patch_text_nosync(void *addr, u32 insn)
> diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S
> index 50bab186c49b..e16860a14eaf 100644
> --- a/arch/arm64/kernel/vmlinux.lds.S
> +++ b/arch/arm64/kernel/vmlinux.lds.S
> @@ -173,6 +173,7 @@ SECTIONS
>  			HIBERNATE_TEXT
>  			KEXEC_TEXT
>  			TRAMP_TEXT
> +			STATIC_CALL_TEXT
>  			*(.gnu.warning)
>  		. = ALIGN(16);
>  		*(.got)			/* Global offset table		*/
> -- 
> 2.30.2
> 

A CFI crash was reported in [1]. Also, I was able to reproduce a CFI
failure with just a simple "linux-perf" command. With this patchset I no
longer see these issues. Awesome!

[1] https://lore.kernel.org/all/YfrQzoIWyv9lNljh@google.com/

--
Carlos Llamas


  parent reply	other threads:[~2026-03-12 18:02 UTC|newest]

Thread overview: 27+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-11-05 14:59 [PATCH v6 0/2] static call support for arm64 Ard Biesheuvel
2021-11-05 14:59 ` Ard Biesheuvel
2021-11-05 14:59 ` [PATCH v6 1/2] static_call: use non-function types to refer to the trampolines Ard Biesheuvel
2021-11-05 14:59   ` Ard Biesheuvel
2021-11-08 10:08   ` Peter Zijlstra
2021-11-08 10:08     ` Peter Zijlstra
2021-11-05 14:59 ` [PATCH v6 2/2] arm64: implement support for static call trampolines Ard Biesheuvel
2021-11-05 14:59   ` Ard Biesheuvel
2021-11-08 10:23   ` Peter Zijlstra
2021-11-08 10:23     ` Peter Zijlstra
2021-11-08 11:29     ` Ard Biesheuvel
2021-11-08 11:29       ` Ard Biesheuvel
2021-11-08 11:52       ` Peter Zijlstra
2021-11-08 11:52         ` Peter Zijlstra
2021-11-09 17:55   ` Mark Rutland
2021-11-09 17:55     ` Mark Rutland
2021-11-09 18:09     ` Ard Biesheuvel
2021-11-09 18:09       ` Ard Biesheuvel
2021-11-09 19:02       ` Quentin Perret
2021-11-09 19:02         ` Quentin Perret
2021-11-10 11:09         ` Mark Rutland
2021-11-10 11:09           ` Mark Rutland
2021-11-10 12:05           ` Quentin Perret
2021-11-10 12:05             ` Quentin Perret
2026-03-12 18:02   ` Carlos Llamas [this message]
2026-03-12 18:35     ` Ard Biesheuvel
2026-03-12 22:10       ` Carlos Llamas

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=abL_ulvlC9JT7Zgw@google.com \
    --to=cmllamas@google.com \
    --cc=ardb@kernel.org \
    --cc=catalin.marinas@arm.com \
    --cc=frederic@kernel.org \
    --cc=james.morse@arm.com \
    --cc=jpoimboe@redhat.com \
    --cc=keescook@chromium.org \
    --cc=linux-arm-kernel@lists.infradead.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=luto@kernel.org \
    --cc=mark.rutland@arm.com \
    --cc=peterz@infradead.org \
    --cc=qperret@google.com \
    --cc=rostedt@goodmis.org \
    --cc=samitolvanen@google.com \
    --cc=will@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.