linux-arm-kernel.lists.infradead.org archive mirror
* [PATCH] tracing: Enable kprobe tracing for Arm64 asm functions
@ 2025-10-27 18:17 Ben Niu
  2025-10-29 14:33 ` Mark Rutland
  0 siblings, 1 reply; 8+ messages in thread
From: Ben Niu @ 2025-10-27 18:17 UTC (permalink / raw)
  To: Catalin Marinas
  Cc: Will Deacon, tytso, Jason, Ben Niu, Ben Niu, linux-arm-kernel,
	linux-kernel

Currently, Arm64 assembly functions always have a bti c
instruction inserted before the prologue. When ftrace is enabled,
no padding nops are inserted at all.

This breaks kprobe tracing for asm functions, which assumes that
proper nops are added before and within functions (when ftrace is
enabled) and bti c is only present when CONFIG_ARM64_BTI_KERNEL is
defined.

The patch fixes the bug by inserting nops and bti c into Arm64 asm
functions in the same way the compiler does for C code.
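
For illustration, with CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS=y and
CONFIG_ARM64_BTI_KERNEL=y, SYM_FUNC_START(foo) is intended to expand to
roughly the following (foo is just a placeholder name, and the SYM_START
internals are elided):

	.balign	CONFIG_FUNCTION_ALIGNMENT		// ALIGN
	nops	CONFIG_FUNCTION_ALIGNMENT / 4 - 2	// pre-function padding
	.pushsection __patchable_function_entries, "awo", @progbits, .text
	.p2align 3
	.8byte	1f		// patch-site address, recorded for ftrace_init()
	.popsection
1:
	nops	2		// pre-entry nops (room for the CALL_OPS literal)
	.balign	CONFIG_FUNCTION_ALIGNMENT		// SYM_A_ALIGN from SYM_START
foo:
	bti	c		// BTI_C, only with CONFIG_ARM64_BTI_KERNEL=y
	nops	2		// PRE_PROLOGUE_NOPS, the in-function patch site
	// ... function body ...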

Note: although this patch unblocks kprobe tracing, fentry is still
broken because no BTF info gets generated from assembly files. A
separate patch is needed to fix that.

I built this patch with different combos of the following features
and confirmed kprobe tracing for asm function __arch_copy_to_user
worked in all cases:

CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS
CONFIG_DYNAMIC_FTRACE_WITH_ARGS
CONFIG_ARM64_BTI_KERNEL

Signed-off-by: Ben Niu <benniu@meta.com>
---
 arch/arm64/include/asm/linkage.h           | 103 ++++++++++++++++-----
 arch/arm64/kernel/vdso/vgetrandom-chacha.S |   2 +-
 2 files changed, 81 insertions(+), 24 deletions(-)

diff --git a/arch/arm64/include/asm/linkage.h b/arch/arm64/include/asm/linkage.h
index d3acd9c87509..f3f3bc168162 100644
--- a/arch/arm64/include/asm/linkage.h
+++ b/arch/arm64/include/asm/linkage.h
@@ -5,8 +5,47 @@
 #include <asm/assembler.h>
 #endif
 
-#define __ALIGN		.balign CONFIG_FUNCTION_ALIGNMENT
-#define __ALIGN_STR	".balign " #CONFIG_FUNCTION_ALIGNMENT
+#define __ALIGN .balign CONFIG_FUNCTION_ALIGNMENT
+#define __ALIGN_STR ".balign " #CONFIG_FUNCTION_ALIGNMENT
+
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS
+
+#define PRE_FUNCTION_NOPS                                                   \
+	ALIGN;                                                              \
+	nops CONFIG_FUNCTION_ALIGNMENT / 4 - 2;                             \
+	.pushsection __patchable_function_entries, "awo", @progbits, .text; \
+	.p2align 3;                                                         \
+	.8byte 1f;                                                          \
+	.popsection;                                                        \
+	1 :;                                                                \
+	nops 2;
+
+#define PRE_PROLOGUE_NOPS nops 2;
+
+#elif defined(CONFIG_DYNAMIC_FTRACE_WITH_ARGS)
+
+#define PRE_FUNCTION_NOPS
+
+#define PRE_PROLOGUE_NOPS                                                   \
+	.pushsection __patchable_function_entries, "awo", @progbits, .text; \
+	.p2align 3;                                                         \
+	.8byte 1f;                                                          \
+	.popsection;                                                        \
+	1 :;                                                                \
+	nops 2;
+
+#else
+
+#define PRE_FUNCTION_NOPS
+#define PRE_PROLOGUE_NOPS
+
+#endif
+
+#ifdef CONFIG_ARM64_BTI_KERNEL
+#define BTI_C bti c;
+#else
+#define BTI_C
+#endif
 
 /*
  * When using in-kernel BTI we need to ensure that PCS-conformant
@@ -15,32 +54,50 @@
  * everything, the override is done unconditionally so we're more
  * likely to notice any drift from the overridden definitions.
  */
-#define SYM_FUNC_START(name)				\
-	SYM_START(name, SYM_L_GLOBAL, SYM_A_ALIGN)	\
-	bti c ;
+#define SYM_FUNC_START(name)                       \
+	PRE_FUNCTION_NOPS                          \
+	SYM_START(name, SYM_L_GLOBAL, SYM_A_ALIGN) \
+	BTI_C                                      \
+	PRE_PROLOGUE_NOPS
+
+#define SYM_FUNC_START_NOTRACE(name)               \
+	SYM_START(name, SYM_L_GLOBAL, SYM_A_ALIGN) \
+	BTI_C
 
-#define SYM_FUNC_START_NOALIGN(name)			\
-	SYM_START(name, SYM_L_GLOBAL, SYM_A_NONE)	\
-	bti c ;
+#define SYM_FUNC_START_NOALIGN(name)              \
+	PRE_FUNCTION_NOPS                         \
+	SYM_START(name, SYM_L_GLOBAL, SYM_A_NONE) \
+	BTI_C                                     \
+	PRE_PROLOGUE_NOPS
 
-#define SYM_FUNC_START_LOCAL(name)			\
-	SYM_START(name, SYM_L_LOCAL, SYM_A_ALIGN)	\
-	bti c ;
+#define SYM_FUNC_START_LOCAL(name)                \
+	PRE_FUNCTION_NOPS                         \
+	SYM_START(name, SYM_L_LOCAL, SYM_A_ALIGN) \
+	BTI_C                                     \
+	PRE_PROLOGUE_NOPS
 
-#define SYM_FUNC_START_LOCAL_NOALIGN(name)		\
-	SYM_START(name, SYM_L_LOCAL, SYM_A_NONE)	\
-	bti c ;
+#define SYM_FUNC_START_LOCAL_NOALIGN(name)       \
+	PRE_FUNCTION_NOPS                        \
+	SYM_START(name, SYM_L_LOCAL, SYM_A_NONE) \
+	BTI_C                                    \
+	PRE_PROLOGUE_NOPS
 
-#define SYM_FUNC_START_WEAK(name)			\
-	SYM_START(name, SYM_L_WEAK, SYM_A_ALIGN)	\
-	bti c ;
+#define SYM_FUNC_START_WEAK(name)                \
+	PRE_FUNCTION_NOPS                        \
+	SYM_START(name, SYM_L_WEAK, SYM_A_ALIGN) \
+	BTI_C                                    \
+	PRE_PROLOGUE_NOPS
 
-#define SYM_FUNC_START_WEAK_NOALIGN(name)		\
-	SYM_START(name, SYM_L_WEAK, SYM_A_NONE)		\
-	bti c ;
+#define SYM_FUNC_START_WEAK_NOALIGN(name)       \
+	PRE_FUNCTION_NOPS                       \
+	SYM_START(name, SYM_L_WEAK, SYM_A_NONE) \
+	BTI_C                                   \
+	PRE_PROLOGUE_NOPS
 
-#define SYM_TYPED_FUNC_START(name)				\
-	SYM_TYPED_START(name, SYM_L_GLOBAL, SYM_A_ALIGN)	\
-	bti c ;
+#define SYM_TYPED_FUNC_START(name)                       \
+	PRE_FUNCTION_NOPS                                \
+	SYM_TYPED_START(name, SYM_L_GLOBAL, SYM_A_ALIGN) \
+	BTI_C                                            \
+	PRE_PROLOGUE_NOPS
 
 #endif
diff --git a/arch/arm64/kernel/vdso/vgetrandom-chacha.S b/arch/arm64/kernel/vdso/vgetrandom-chacha.S
index 67890b445309..21c27b64cf9f 100644
--- a/arch/arm64/kernel/vdso/vgetrandom-chacha.S
+++ b/arch/arm64/kernel/vdso/vgetrandom-chacha.S
@@ -40,7 +40,7 @@
  *	x2: 8-byte counter input/output
  *	x3: number of 64-byte block to write to output
  */
-SYM_FUNC_START(__arch_chacha20_blocks_nostack)
+SYM_FUNC_START_NOTRACE(__arch_chacha20_blocks_nostack)
 
 	/* copy0 = "expand 32-byte k" */
 	mov_q		x8, 0x3320646e61707865
-- 
2.47.3




* Re: [PATCH] tracing: Enable kprobe tracing for Arm64 asm functions
  2025-10-27 18:17 [PATCH] tracing: Enable kprobe tracing for Arm64 asm functions Ben Niu
@ 2025-10-29 14:33 ` Mark Rutland
  2025-10-29 23:53   ` Ben Niu
  0 siblings, 1 reply; 8+ messages in thread
From: Mark Rutland @ 2025-10-29 14:33 UTC (permalink / raw)
  To: Ben Niu
  Cc: Catalin Marinas, Will Deacon, tytso, Jason, Ben Niu,
	linux-arm-kernel, linux-kernel

On Mon, Oct 27, 2025 at 11:17:49AM -0700, Ben Niu wrote:
> Currently, Arm64 assembly functions always have a bti c
> instruction inserted before the prologue. When ftrace is enabled,
> no padding nops are inserted at all.
> 
> This breaks kprobe tracing for asm functions, which assumes that
> proper nops are added before and within functions (when ftrace is
> enabled) and bti c is only present when CONFIG_ARM64_BTI_KERNEL is
> defined.

What exactly do you mean by "breaks kprobe tracing"?

The kprobes code knows NOTHING about those ftrace NOPs, so I cannot see
how those are relevant.

The patch adds entries to __patchable_function_entries, which is owned
by ftrace, and has NOTHING to do with kprobes.

> The patch fixes the bug by inserting nops and bti c in Arm64 asm
> in the same way as compiled C code.

As it stands, NAK to this change.

I'm not averse to making (some) assembly functions traceable by ftrace,
and hence giving those NOPs. However, that's not safe generally (e.g.
due to noinstr requirements), and so special care will need to be taken.

The rationale above does not make sense; it conflates distinct things,
and I think a more complete explanation is necessary.

Mark.

> Note: although this patch unblocks kprobe tracing, fentry is still
> broken because no BTF info gets generated from assembly files. A
> separate patch is needed to fix that.
> 
> I built this patch with different combos of the following features
> and confirmed kprobe tracing for asm function __arch_copy_to_user
> worked in all cases:
> 
> CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS
> CONFIG_DYNAMIC_FTRACE_WITH_ARGS
> CONFIG_ARM64_BTI_KERNEL
> 
> Signed-off-by: Ben Niu <benniu@meta.com>
> ---
>  arch/arm64/include/asm/linkage.h           | 103 ++++++++++++++++-----
>  arch/arm64/kernel/vdso/vgetrandom-chacha.S |   2 +-
>  2 files changed, 81 insertions(+), 24 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/linkage.h b/arch/arm64/include/asm/linkage.h
> index d3acd9c87509..f3f3bc168162 100644
> --- a/arch/arm64/include/asm/linkage.h
> +++ b/arch/arm64/include/asm/linkage.h
> @@ -5,8 +5,47 @@
>  #include <asm/assembler.h>
>  #endif
>  
> -#define __ALIGN		.balign CONFIG_FUNCTION_ALIGNMENT
> -#define __ALIGN_STR	".balign " #CONFIG_FUNCTION_ALIGNMENT
> +#define __ALIGN .balign CONFIG_FUNCTION_ALIGNMENT
> +#define __ALIGN_STR ".balign " #CONFIG_FUNCTION_ALIGNMENT
> +
> +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS
> +
> +#define PRE_FUNCTION_NOPS                                                   \
> +	ALIGN;                                                              \
> +	nops CONFIG_FUNCTION_ALIGNMENT / 4 - 2;                             \
> +	.pushsection __patchable_function_entries, "awo", @progbits, .text; \
> +	.p2align 3;                                                         \
> +	.8byte 1f;                                                          \
> +	.popsection;                                                        \
> +	1 :;                                                                \
> +	nops 2;
> +
> +#define PRE_PROLOGUE_NOPS nops 2;
> +
> +#elif defined(CONFIG_DYNAMIC_FTRACE_WITH_ARGS)
> +
> +#define PRE_FUNCTION_NOPS
> +
> +#define PRE_PROLOGUE_NOPS                                                   \
> +	.pushsection __patchable_function_entries, "awo", @progbits, .text; \
> +	.p2align 3;                                                         \
> +	.8byte 1f;                                                          \
> +	.popsection;                                                        \
> +	1 :;                                                                \
> +	nops 2;
> +
> +#else
> +
> +#define PRE_FUNCTION_NOPS
> +#define PRE_PROLOGUE_NOPS
> +
> +#endif
> +
> +#ifdef CONFIG_ARM64_BTI_KERNEL
> +#define BTI_C bti c;
> +#else
> +#define BTI_C
> +#endif
>  
>  /*
>   * When using in-kernel BTI we need to ensure that PCS-conformant
> @@ -15,32 +54,50 @@
>   * everything, the override is done unconditionally so we're more
>   * likely to notice any drift from the overridden definitions.
>   */
> -#define SYM_FUNC_START(name)				\
> -	SYM_START(name, SYM_L_GLOBAL, SYM_A_ALIGN)	\
> -	bti c ;
> +#define SYM_FUNC_START(name)                       \
> +	PRE_FUNCTION_NOPS                          \
> +	SYM_START(name, SYM_L_GLOBAL, SYM_A_ALIGN) \
> +	BTI_C                                      \
> +	PRE_PROLOGUE_NOPS
> +
> +#define SYM_FUNC_START_NOTRACE(name)               \
> +	SYM_START(name, SYM_L_GLOBAL, SYM_A_ALIGN) \
> +	BTI_C
>  
> -#define SYM_FUNC_START_NOALIGN(name)			\
> -	SYM_START(name, SYM_L_GLOBAL, SYM_A_NONE)	\
> -	bti c ;
> +#define SYM_FUNC_START_NOALIGN(name)              \
> +	PRE_FUNCTION_NOPS                         \
> +	SYM_START(name, SYM_L_GLOBAL, SYM_A_NONE) \
> +	BTI_C                                     \
> +	PRE_PROLOGUE_NOPS
>  
> -#define SYM_FUNC_START_LOCAL(name)			\
> -	SYM_START(name, SYM_L_LOCAL, SYM_A_ALIGN)	\
> -	bti c ;
> +#define SYM_FUNC_START_LOCAL(name)                \
> +	PRE_FUNCTION_NOPS                         \
> +	SYM_START(name, SYM_L_LOCAL, SYM_A_ALIGN) \
> +	BTI_C                                     \
> +	PRE_PROLOGUE_NOPS
>  
> -#define SYM_FUNC_START_LOCAL_NOALIGN(name)		\
> -	SYM_START(name, SYM_L_LOCAL, SYM_A_NONE)	\
> -	bti c ;
> +#define SYM_FUNC_START_LOCAL_NOALIGN(name)       \
> +	PRE_FUNCTION_NOPS                        \
> +	SYM_START(name, SYM_L_LOCAL, SYM_A_NONE) \
> +	BTI_C                                    \
> +	PRE_PROLOGUE_NOPS
>  
> -#define SYM_FUNC_START_WEAK(name)			\
> -	SYM_START(name, SYM_L_WEAK, SYM_A_ALIGN)	\
> -	bti c ;
> +#define SYM_FUNC_START_WEAK(name)                \
> +	PRE_FUNCTION_NOPS                        \
> +	SYM_START(name, SYM_L_WEAK, SYM_A_ALIGN) \
> +	BTI_C                                    \
> +	PRE_PROLOGUE_NOPS
>  
> -#define SYM_FUNC_START_WEAK_NOALIGN(name)		\
> -	SYM_START(name, SYM_L_WEAK, SYM_A_NONE)		\
> -	bti c ;
> +#define SYM_FUNC_START_WEAK_NOALIGN(name)       \
> +	PRE_FUNCTION_NOPS                       \
> +	SYM_START(name, SYM_L_WEAK, SYM_A_NONE) \
> +	BTI_C                                   \
> +	PRE_PROLOGUE_NOPS
>  
> -#define SYM_TYPED_FUNC_START(name)				\
> -	SYM_TYPED_START(name, SYM_L_GLOBAL, SYM_A_ALIGN)	\
> -	bti c ;
> +#define SYM_TYPED_FUNC_START(name)                       \
> +	PRE_FUNCTION_NOPS                                \
> +	SYM_TYPED_START(name, SYM_L_GLOBAL, SYM_A_ALIGN) \
> +	BTI_C                                            \
> +	PRE_PROLOGUE_NOPS
>  
>  #endif
> diff --git a/arch/arm64/kernel/vdso/vgetrandom-chacha.S b/arch/arm64/kernel/vdso/vgetrandom-chacha.S
> index 67890b445309..21c27b64cf9f 100644
> --- a/arch/arm64/kernel/vdso/vgetrandom-chacha.S
> +++ b/arch/arm64/kernel/vdso/vgetrandom-chacha.S
> @@ -40,7 +40,7 @@
>   *	x2: 8-byte counter input/output
>   *	x3: number of 64-byte block to write to output
>   */
> -SYM_FUNC_START(__arch_chacha20_blocks_nostack)
> +SYM_FUNC_START_NOTRACE(__arch_chacha20_blocks_nostack)
>  
>  	/* copy0 = "expand 32-byte k" */
>  	mov_q		x8, 0x3320646e61707865
> -- 
> 2.47.3
> 
> 



* Re: [PATCH] tracing: Enable kprobe tracing for Arm64 asm functions
  2025-10-29 14:33 ` Mark Rutland
@ 2025-10-29 23:53   ` Ben Niu
  2025-10-30 12:35     ` Mark Rutland
  0 siblings, 1 reply; 8+ messages in thread
From: Ben Niu @ 2025-10-29 23:53 UTC (permalink / raw)
  To: Mark Rutland
  Cc: catalin.marinas, will, tytso, Jason, linux-arm-kernel, niuben003,
	linux-kernel, benniu

On Wed, Oct 29, 2025 at 02:33:26PM +0000, Mark Rutland wrote:
> On Mon, Oct 27, 2025 at 11:17:49AM -0700, Ben Niu wrote:
> > Currently, Arm64 assembly functions always have a bti c
> > instruction inserted before the prologue. When ftrace is enabled,
> > no padding nops are inserted at all.
> > 
> > This breaks kprobe tracing for asm functions, which assumes that
> > proper nops are added before and within functions (when ftrace is
> > enabled) and bti c is only present when CONFIG_ARM64_BTI_KERNEL is
> > defined.
> 
> What exactly do you mean by "breaks kprobe tracing"?
> 
> The kprobes code knows NOTHING about those ftrace NOPs, so I cannot see
> how those are relevant.
> 
> The patch adds entries to __patchable_function_entries, which is owned
> by ftrace, and has NOTHING to do with kprobes.

Thanks for reviewing my patch, Mark. This is only the second Linux kernel patch
I've ever sent out, so I'm still learning how much detail I should put in a
patch for reviewers. Clearly, not enough detail was included in this one; sorry
about that.

kprobe and ftrace seem like two separate things, but kprobe actually checks
whether a function is notrace based on whether the function is padded with the
proper nops for ftrace when ftrace is enabled. See the following call stack for
more details (obtained by debugging a CentOS 10 arm64 Linux kernel):

#0  lookup_rec (start=18446603338384966592, end=18446603338384967175) at kernel/trace/ftrace.c:1591
#1  ftrace_location_range (start=18446603338384966592, end=18446603338384967175) at kernel/trace/ftrace.c:1626
#2  0xffff8000802bd070 [PAC] in __within_notrace_func (addr=<optimized out>, addr@entry=18446603338384966592) at kernel/trace/trace_kprobe.c:456
#3  0xffff8000802bd13c [PAC] in within_notrace_func (tk=0xffff0000cb5b0400) at kernel/trace/trace_kprobe.c:464
#4  0xffff8000802bd534 [PAC] in __register_trace_kprobe (tk=tk@entry=0xffff0000cb5b0400) at kernel/trace/trace_kprobe.c:496
#5  0xffff8000802bf7a0 [PAC] in __register_trace_kprobe (tk=0xffff0000cb5b0400) at kernel/trace/trace_kprobe.c:490
#6  create_local_trace_kprobe (func=func@entry=0xffff0000cabb19c0 "__arch_copy_to_user", addr=<optimized out>, offs=<optimized out>, is_return=is_return@entry=false)
    at kernel/trace/trace_kprobe.c:1935
#7  0xffff80008029ef8c [PAC] in perf_kprobe_init (p_event=p_event@entry=0xffff0000c6d89a40, is_retprobe=false) at kernel/trace/trace_event_perf.c:267
#8  0xffff800080364ec4 [PAC] in perf_kprobe_event_init (event=0xffff0000c6d89a40) at kernel/events/core.c:11102
#9  0xffff800080369740 [PAC] in perf_try_init_event (pmu=0xffff8000829347a8 <perf_kprobe>, event=0xffff0000c6d89a40) at kernel/events/core.c:12595
#10 0xffff8000803699dc [PAC] in perf_init_event (event=event@entry=0xffff0000c6d89a40) at kernel/events/core.c:12693
#11 0xffff80008036d91c [PAC] in perf_event_alloc (attr=attr@entry=0xffff80008785b970, cpu=cpu@entry=0, task=task@entry=0x0, group_leader=0xffff0000c6d89a40, 
    group_leader@entry=0x0, parent_event=parent_event@entry=0x0, overflow_handler=<optimized out>, overflow_handler@entry=0x0, context=<optimized out>, context@entry=0x0, 
    cgroup_fd=cgroup_fd@entry=-1) at kernel/events/core.c:12968
#12 0xffff800080373244 [PAC] in __do_sys_perf_event_open (attr_uptr=<optimized out>, pid=<optimized out>, cpu=0, group_fd=<optimized out>, flags=<optimized out>)
    at kernel/events/core.c:13485
#13 0xffff800080379aa0 [PAC] in __se_sys_perf_event_open (attr_uptr=<optimized out>, pid=<optimized out>, cpu=<optimized out>, group_fd=<optimized out>, flags=<optimized out>)
    at kernel/events/core.c:13367
#14 __arm64_sys_perf_event_open (regs=<optimized out>) at kernel/events/core.c:13367
#15 0xffff80008004997c [PAC] in __invoke_syscall (regs=0xffff80008785beb0, syscall_fn=<optimized out>) at arch/arm64/kernel/syscall.c:35
#16 invoke_syscall (regs=regs@entry=0xffff80008785beb0, scno=<optimized out>, sc_nr=sc_nr@entry=463, syscall_table=<optimized out>) at arch/arm64/kernel/syscall.c:49
#17 0xffff800080049a88 [PAC] in el0_svc_common (sc_nr=463, syscall_table=<optimized out>, regs=0xffff80008785beb0, scno=<optimized out>) at arch/arm64/kernel/syscall.c:132
#18 do_el0_svc (regs=regs@entry=0xffff80008785beb0) at arch/arm64/kernel/syscall.c:151
#19 0xffff800080fba654 [PAC] in el0_svc (regs=0xffff80008785beb0) at arch/arm64/kernel/entry-common.c:875
#20 0xffff800080fbab10 [PAC] in el0t_64_sync_handler (regs=<optimized out>) at arch/arm64/kernel/entry-common.c:894
#21 0xffff800080011684 [PAC] in el0t_64_sync () at arch/arm64/kernel/entry.S:600

within_notrace_func is only defined when defined(CONFIG_DYNAMIC_FTRACE) &&
!defined(CONFIG_KPROBE_EVENTS_ON_NOTRACE); it searches a list of records
describing which functions are ftrace-able. That list of records is built
by ftrace_init at boot time.
 
> > The patch fixes the bug by inserting nops and bti c in Arm64 asm
> > in the same way as compiled C code.

That's correct. I added __patchable_function_entries entries in the assembly
files so that their addresses land between __start_mcount_loc and
__stop_mcount_loc at link time, where they are then picked up by ftrace_init.
The nop and bti c instructions are emitted in the assembly functions exactly as
if they had been compiled from C code. This is crucial to satisfy ftrace's
layout requirements and its nop patching at boot time; otherwise ftrace_bug
will be hit and the assembly function will be neither ftrace-able nor
kprobe-traceable.
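
To make that concrete, here is my rough understanding of the entry layout
with CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS, before and after ftrace arms a
function (foo and <ftrace_ops> are placeholders; the exact sequence is owned
by arch/arm64/kernel/ftrace.c, so please correct me if I got any detail wrong):

	// As emitted, while tracing of foo is disabled:
		nop				// pre-entry padding, reserved for
		nop				// the associated ftrace_ops pointer
	foo:	bti	c
		nop				// two-instruction patch site,
		nop				// validated by ftrace_init()

	// After ftrace enables tracing of foo (roughly):
		.8byte	<ftrace_ops>		// literal written into the padding
	foo:	bti	c
		mov	x9, x30			// preserve the original return address
		bl	ftrace_caller		// branch to the ftrace trampoline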

> As it stands, NAK to this change.
> 
> I'm not averse to making (some) assembly functions traceable by ftrace,
> and hence giving those NOPs. However, that's not safe generally (e.g.
> due to noinstr requirements), and so special care will need to be taken.

Assembly functions that are not traceable should use SYM_CODE_START or the new
macro SYM_FUNC_START_NOTRACE. If an assembly function starts with
SYM_FUNC_START, it is assumed to be traceable by default. I dumped all of the
assembly files under arch/arm64, and the majority of them are utility functions
(e.g., crypto, memset/memcpy, etc.), which seemed safe to trace. There are a few
functions in the following files (and maybe others) that might not be
traceable, so please advise:
./kernel/entry-ftrace.S
./kernel/entry.S
./mm/proc.S
./kernel/head.S

We can either have SYM_FUNC_START be traceable by default, or flip the polarity
and introduce SYM_FUNC_START_TRACEABLE to explicitly mark those functions
deemed safe to trace.
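
Just to make the second option concrete, a minimal sketch could look like the
following (SYM_FUNC_START_TRACEABLE is a made-up name and this is untested;
PRE_FUNCTION_NOPS, BTI_C and PRE_PROLOGUE_NOPS are the helpers from this patch):

#define SYM_FUNC_START_TRACEABLE(name)             \
	PRE_FUNCTION_NOPS                          \
	SYM_START(name, SYM_L_GLOBAL, SYM_A_ALIGN) \
	BTI_C                                      \
	PRE_PROLOGUE_NOPS

/* SYM_FUNC_START itself would then stay nop-free, as it is today: */
#define SYM_FUNC_START(name)                       \
	SYM_START(name, SYM_L_GLOBAL, SYM_A_ALIGN) \
	BTI_C

Functions deemed safe, like __arch_copy_to_user, would then be switched over to
SYM_FUNC_START_TRACEABLE explicitly, and everything else would keep today's
behavior.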

> The rationale above does not make sense; it conflates distinct things,
> and I think a more complete explanation is necessary.

Please see my explanation above. If you have any questions, please let me know.

> Mark.
> 
> > Note: although this patch unblocks kprobe tracing, fentry is still
> > broken because no BTF info gets generated from assembly files. A
> > separate patch is needed to fix that.
> > 
> > I built this patch with different combos of the following features
> > and confirmed kprobe tracing for asm function __arch_copy_to_user
> > worked in all cases:
> > 
> > CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS
> > CONFIG_DYNAMIC_FTRACE_WITH_ARGS
> > CONFIG_ARM64_BTI_KERNEL
> > 
> > Signed-off-by: Ben Niu <benniu@meta.com>
> > ---
> >  arch/arm64/include/asm/linkage.h           | 103 ++++++++++++++++-----
> >  arch/arm64/kernel/vdso/vgetrandom-chacha.S |   2 +-
> >  2 files changed, 81 insertions(+), 24 deletions(-)
> > 
> > diff --git a/arch/arm64/include/asm/linkage.h b/arch/arm64/include/asm/linkage.h
> > index d3acd9c87509..f3f3bc168162 100644
> > --- a/arch/arm64/include/asm/linkage.h
> > +++ b/arch/arm64/include/asm/linkage.h
> > @@ -5,8 +5,47 @@
> >  #include <asm/assembler.h>
> >  #endif
> >  
> > -#define __ALIGN		.balign CONFIG_FUNCTION_ALIGNMENT
> > -#define __ALIGN_STR	".balign " #CONFIG_FUNCTION_ALIGNMENT
> > +#define __ALIGN .balign CONFIG_FUNCTION_ALIGNMENT
> > +#define __ALIGN_STR ".balign " #CONFIG_FUNCTION_ALIGNMENT
> > +
> > +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS
> > +
> > +#define PRE_FUNCTION_NOPS                                                   \
> > +	ALIGN;                                                              \
> > +	nops CONFIG_FUNCTION_ALIGNMENT / 4 - 2;                             \
> > +	.pushsection __patchable_function_entries, "awo", @progbits, .text; \
> > +	.p2align 3;                                                         \
> > +	.8byte 1f;                                                          \
> > +	.popsection;                                                        \
> > +	1 :;                                                                \
> > +	nops 2;
> > +
> > +#define PRE_PROLOGUE_NOPS nops 2;
> > +
> > +#elif defined(CONFIG_DYNAMIC_FTRACE_WITH_ARGS)
> > +
> > +#define PRE_FUNCTION_NOPS
> > +
> > +#define PRE_PROLOGUE_NOPS                                                   \
> > +	.pushsection __patchable_function_entries, "awo", @progbits, .text; \
> > +	.p2align 3;                                                         \
> > +	.8byte 1f;                                                          \
> > +	.popsection;                                                        \
> > +	1 :;                                                                \
> > +	nops 2;
> > +
> > +#else
> > +
> > +#define PRE_FUNCTION_NOPS
> > +#define PRE_PROLOGUE_NOPS
> > +
> > +#endif
> > +
> > +#ifdef CONFIG_ARM64_BTI_KERNEL
> > +#define BTI_C bti c;
> > +#else
> > +#define BTI_C
> > +#endif
> >  
> >  /*
> >   * When using in-kernel BTI we need to ensure that PCS-conformant
> > @@ -15,32 +54,50 @@
> >   * everything, the override is done unconditionally so we're more
> >   * likely to notice any drift from the overridden definitions.
> >   */
> > -#define SYM_FUNC_START(name)				\
> > -	SYM_START(name, SYM_L_GLOBAL, SYM_A_ALIGN)	\
> > -	bti c ;
> > +#define SYM_FUNC_START(name)                       \
> > +	PRE_FUNCTION_NOPS                          \
> > +	SYM_START(name, SYM_L_GLOBAL, SYM_A_ALIGN) \
> > +	BTI_C                                      \
> > +	PRE_PROLOGUE_NOPS
> > +
> > +#define SYM_FUNC_START_NOTRACE(name)               \
> > +	SYM_START(name, SYM_L_GLOBAL, SYM_A_ALIGN) \
> > +	BTI_C
> >  
> > -#define SYM_FUNC_START_NOALIGN(name)			\
> > -	SYM_START(name, SYM_L_GLOBAL, SYM_A_NONE)	\
> > -	bti c ;
> > +#define SYM_FUNC_START_NOALIGN(name)              \
> > +	PRE_FUNCTION_NOPS                         \
> > +	SYM_START(name, SYM_L_GLOBAL, SYM_A_NONE) \
> > +	BTI_C                                     \
> > +	PRE_PROLOGUE_NOPS
> >  
> > -#define SYM_FUNC_START_LOCAL(name)			\
> > -	SYM_START(name, SYM_L_LOCAL, SYM_A_ALIGN)	\
> > -	bti c ;
> > +#define SYM_FUNC_START_LOCAL(name)                \
> > +	PRE_FUNCTION_NOPS                         \
> > +	SYM_START(name, SYM_L_LOCAL, SYM_A_ALIGN) \
> > +	BTI_C                                     \
> > +	PRE_PROLOGUE_NOPS
> >  
> > -#define SYM_FUNC_START_LOCAL_NOALIGN(name)		\
> > -	SYM_START(name, SYM_L_LOCAL, SYM_A_NONE)	\
> > -	bti c ;
> > +#define SYM_FUNC_START_LOCAL_NOALIGN(name)       \
> > +	PRE_FUNCTION_NOPS                        \
> > +	SYM_START(name, SYM_L_LOCAL, SYM_A_NONE) \
> > +	BTI_C                                    \
> > +	PRE_PROLOGUE_NOPS
> >  
> > -#define SYM_FUNC_START_WEAK(name)			\
> > -	SYM_START(name, SYM_L_WEAK, SYM_A_ALIGN)	\
> > -	bti c ;
> > +#define SYM_FUNC_START_WEAK(name)                \
> > +	PRE_FUNCTION_NOPS                        \
> > +	SYM_START(name, SYM_L_WEAK, SYM_A_ALIGN) \
> > +	BTI_C                                    \
> > +	PRE_PROLOGUE_NOPS
> >  
> > -#define SYM_FUNC_START_WEAK_NOALIGN(name)		\
> > -	SYM_START(name, SYM_L_WEAK, SYM_A_NONE)		\
> > -	bti c ;
> > +#define SYM_FUNC_START_WEAK_NOALIGN(name)       \
> > +	PRE_FUNCTION_NOPS                       \
> > +	SYM_START(name, SYM_L_WEAK, SYM_A_NONE) \
> > +	BTI_C                                   \
> > +	PRE_PROLOGUE_NOPS
> >  
> > -#define SYM_TYPED_FUNC_START(name)				\
> > -	SYM_TYPED_START(name, SYM_L_GLOBAL, SYM_A_ALIGN)	\
> > -	bti c ;
> > +#define SYM_TYPED_FUNC_START(name)                       \
> > +	PRE_FUNCTION_NOPS                                \
> > +	SYM_TYPED_START(name, SYM_L_GLOBAL, SYM_A_ALIGN) \
> > +	BTI_C                                            \
> > +	PRE_PROLOGUE_NOPS
> >  
> >  #endif
> > diff --git a/arch/arm64/kernel/vdso/vgetrandom-chacha.S b/arch/arm64/kernel/vdso/vgetrandom-chacha.S
> > index 67890b445309..21c27b64cf9f 100644
> > --- a/arch/arm64/kernel/vdso/vgetrandom-chacha.S
> > +++ b/arch/arm64/kernel/vdso/vgetrandom-chacha.S
> > @@ -40,7 +40,7 @@
> >   *	x2: 8-byte counter input/output
> >   *	x3: number of 64-byte block to write to output
> >   */
> > -SYM_FUNC_START(__arch_chacha20_blocks_nostack)
> > +SYM_FUNC_START_NOTRACE(__arch_chacha20_blocks_nostack)
> >  
> >  	/* copy0 = "expand 32-byte k" */
> >  	mov_q		x8, 0x3320646e61707865
> > -- 
> > 2.47.3
> > 
> > 



* Re: [PATCH] tracing: Enable kprobe tracing for Arm64 asm functions
  2025-10-29 23:53   ` Ben Niu
@ 2025-10-30 12:35     ` Mark Rutland
  2025-10-30 18:07       ` Ben Niu
  0 siblings, 1 reply; 8+ messages in thread
From: Mark Rutland @ 2025-10-30 12:35 UTC (permalink / raw)
  To: Ben Niu
  Cc: catalin.marinas, will, tytso, Jason, linux-arm-kernel, niuben003,
	linux-kernel

On Wed, Oct 29, 2025 at 04:53:48PM -0700, Ben Niu wrote:
> On Wed, Oct 29, 2025 at 02:33:26PM +0000, Mark Rutland wrote:
> > On Mon, Oct 27, 2025 at 11:17:49AM -0700, Ben Niu wrote:
> > > Currently, Arm64 assembly functions always have a bti c
> > > instruction inserted before the prologue. When ftrace is enabled,
> > > no padding nops are inserted at all.
> > > 
> > > This breaks kprobe tracing for asm functions, which assumes that
> > > proper nops are added before and within functions (when ftrace is
> > > enabled) and bti c is only present when CONFIG_ARM64_BTI_KERNEL is
> > > defined.
> > 
> > What exactly do you mean by "breaks kprobe tracing"?
> > 
> > The kprobes code knows NOTHING about those ftrace NOPs, so I cannot see
> > how those are relevant.
> > 
> > The patch adds entries to __patchable_function_entries, which is owned
> > by ftrace, and has NOTHING to do with kprobes.
> 
> Thanks for reviewing my patch, Mark. This is the second ever Linux kernel patch
> I've sent out so I'm still learning how much detail I should put in a patch for
> reviewers. Clearly, not enough details were added for this patch, and sorry
> about that.

Hi Ben,

Thanks, and sorry for the overly strong tone of my initial reply.

My key concern above was that "breaks kprobe tracing" sounds like
something is seriously wrong, whereas IIUC the problem you're trying to
address is "it isn't currently possible to place a trace kprobe on an
assembly function" (i.e. there's a limitation, not a bug).

> kprobe and ftrace seem two seprate things, but kprobe actually checks if a
> function is notrace based off whether the function is padded with proper nops
> for ftrace when ftrace is enabled. See the following call stack for more
> details (obtained by debugging centos 10 arm64 linux kernel):
> 
> #0  lookup_rec (start=18446603338384966592, end=18446603338384967175) at kernel/trace/ftrace.c:1591
> #1  ftrace_location_range (start=18446603338384966592, end=18446603338384967175) at kernel/trace/ftrace.c:1626
> #2  0xffff8000802bd070 [PAC] in __within_notrace_func (addr=<optimized out>, addr@entry=18446603338384966592) at kernel/trace/trace_kprobe.c:456
> #3  0xffff8000802bd13c [PAC] in within_notrace_func (tk=0xffff0000cb5b0400) at kernel/trace/trace_kprobe.c:464
> #4  0xffff8000802bd534 [PAC] in __register_trace_kprobe (tk=tk@entry=0xffff0000cb5b0400) at kernel/trace/trace_kprobe.c:496
> #5  0xffff8000802bf7a0 [PAC] in __register_trace_kprobe (tk=0xffff0000cb5b0400) at kernel/trace/trace_kprobe.c:490
> #6  create_local_trace_kprobe (func=func@entry=0xffff0000cabb19c0 "__arch_copy_to_user", addr=<optimized out>, offs=<optimized out>, is_return=is_return@entry=false)
>     at kernel/trace/trace_kprobe.c:1935
> #7  0xffff80008029ef8c [PAC] in perf_kprobe_init (p_event=p_event@entry=0xffff0000c6d89a40, is_retprobe=false) at kernel/trace/trace_event_perf.c:267
> #8  0xffff800080364ec4 [PAC] in perf_kprobe_event_init (event=0xffff0000c6d89a40) at kernel/events/core.c:11102
> #9  0xffff800080369740 [PAC] in perf_try_init_event (pmu=0xffff8000829347a8 <perf_kprobe>, event=0xffff0000c6d89a40) at kernel/events/core.c:12595
> #10 0xffff8000803699dc [PAC] in perf_init_event (event=event@entry=0xffff0000c6d89a40) at kernel/events/core.c:12693
> #11 0xffff80008036d91c [PAC] in perf_event_alloc (attr=attr@entry=0xffff80008785b970, cpu=cpu@entry=0, task=task@entry=0x0, group_leader=0xffff0000c6d89a40, 
>     group_leader@entry=0x0, parent_event=parent_event@entry=0x0, overflow_handler=<optimized out>, overflow_handler@entry=0x0, context=<optimized out>, context@entry=0x0, 
>     cgroup_fd=cgroup_fd@entry=-1) at kernel/events/core.c:12968
> #12 0xffff800080373244 [PAC] in __do_sys_perf_event_open (attr_uptr=<optimized out>, pid=<optimized out>, cpu=0, group_fd=<optimized out>, flags=<optimized out>)
>     at kernel/events/core.c:13485
> #13 0xffff800080379aa0 [PAC] in __se_sys_perf_event_open (attr_uptr=<optimized out>, pid=<optimized out>, cpu=<optimized out>, group_fd=<optimized out>, flags=<optimized out>)
>     at kernel/events/core.c:13367
> #14 __arm64_sys_perf_event_open (regs=<optimized out>) at kernel/events/core.c:13367
> #15 0xffff80008004997c [PAC] in __invoke_syscall (regs=0xffff80008785beb0, syscall_fn=<optimized out>) at arch/arm64/kernel/syscall.c:35
> #16 invoke_syscall (regs=regs@entry=0xffff80008785beb0, scno=<optimized out>, sc_nr=sc_nr@entry=463, syscall_table=<optimized out>) at arch/arm64/kernel/syscall.c:49
> #17 0xffff800080049a88 [PAC] in el0_svc_common (sc_nr=463, syscall_table=<optimized out>, regs=0xffff80008785beb0, scno=<optimized out>) at arch/arm64/kernel/syscall.c:132
> #18 do_el0_svc (regs=regs@entry=0xffff80008785beb0) at arch/arm64/kernel/syscall.c:151
> #19 0xffff800080fba654 [PAC] in el0_svc (regs=0xffff80008785beb0) at arch/arm64/kernel/entry-common.c:875
> #20 0xffff800080fbab10 [PAC] in el0t_64_sync_handler (regs=<optimized out>) at arch/arm64/kernel/entry-common.c:894
> #21 0xffff800080011684 [PAC] in el0t_64_sync () at arch/arm64/kernel/entry.S:600

Sorry, I had forgotten that trace kprobes do apply this restriction;
thanks for explaining this.

Please note that basic kprobes (i.e. those opened directly by
register_kprobe()) aren't subject to this restriction; the checks in
check_kprobe_address_safe() do not include !within_notrace_func(). This
restriction only applies to trace kprobes (which use the tracing
infrastructure).

> within_notrace_func is only defined if defined(CONFIG_DYNAMIC_FTRACE) && \
> !defined(CONFIG_KPROBE_EVENTS_ON_NOTRACE), which searches a list of records
> about whether a function is ftrace-able or not. The list of records is built
> by ftrace_init at boot time.
>  
> > > The patch fixes the bug by inserting nops and bti c in Arm64 asm
> > > in the same way as compiled C code.
> 
> That's correct. I added __patchable_function_entries in assemblies so that
> their addresses are added between __start_mcount_loc and __stop_mcount_loc
> at link time, which is then picked up by ftrace_init. Proper nop and bti c
> instructions are added to assemblies in the way as if the assemblies are
> compiled from C code. This is crucial to satisfy ftrace requirements and nop
> patching at boot time. Otherwise, ftrace_bug will be hit and the assembly
> function won't be ftrace-able and also kprobe-traceable.

Do you actually need these functions to be ftrace traceable, or do you
just care that they can be traced via trace kprobes?

Why does this matter specifically for arm64, and not for other
architectures? AFAICT x86 doesn't do anything to make assembly functions
traceable by ftrace either.

Is there something specific you want to trace, but cannot currently
trace (on arm64)?

> > As it stands, NAK to this change.
> > 
> > I'm not averse to making (some) assembly functions traceable by ftrace,
> > and hence giving those NOPs. However, that's not safe generally (e.g.
> > due to noinstr requirements), and so special care will need to be taken.
> 
> Assemblies that are not traceable should use SYM_CODE_START or the new macro
> SYM_FUNC_START_NOTRACE. If an assembly function starts with SYM_FUNC_START,
> by default, it's assumed to be traceable. I dumped all assemblies under
> arch/arm64, and majority of the assemblies are utility functions (e.g., crypto,
> memset/memcpy, etc.), which seemed safe to trace. 

For better or worse, that is not the case. For example, it is not safe
to trace memset() or memcpy(), because those can be used by noinstr
code. Similarly those can be used early during handling of a kprobe,
resulting in a recursive exception (which will cause a fatal stack
overflow).

> There are a few functions
> in the following (and maybe others) that might not be traceable, so please
> advise.
> ./kernel/entry-ftrace.S
> ./kernel/entry.S
> ./mm/proc.S
> ./kernel/head.S

Practically nothing in those files is safe to trace.

There are plenty of other assembly which must not be traced, e.g.
anything with a "__pi_" prefix.

> We can either have SYM_FUNC_START be traceable by default, or flip the polarity
> and introduce SYM_FUNC_START_TRACEABLE to explicitly mark those functions
> deemed safe to trace.

I would be much happier with the latter (or something to that effect).
However, as with my question about x86 above, I'd like a better
explanation as to why you need to trace assembly functions in the first
place.

The existing restrictions could do with some cleanup, and we might be
able to make it possible to use a kprobe tracepoint on functions/code
*without* requiring that it's possible to place an ftrace tracepoint.

Mark.

> > The rationale above does not make sense; it conflates distinct things,
> > and I think a more complete explanation is necessary.
> 
> Please see my explanation above. If you have any questions, please let me know.
> 
> > Mark.
> > 
> > > Note: although this patch unblocks kprobe tracing, fentry is still
> > > broken because no BTF info gets generated from assembly files. A
> > > separate patch is needed to fix that.
> > > 
> > > I built this patch with different combos of the following features
> > > and confirmed kprobe tracing for asm function __arch_copy_to_user
> > > worked in all cases:
> > > 
> > > CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS
> > > CONFIG_DYNAMIC_FTRACE_WITH_ARGS
> > > CONFIG_ARM64_BTI_KERNEL
> > > 
> > > Signed-off-by: Ben Niu <benniu@meta.com>
> > > ---
> > >  arch/arm64/include/asm/linkage.h           | 103 ++++++++++++++++-----
> > >  arch/arm64/kernel/vdso/vgetrandom-chacha.S |   2 +-
> > >  2 files changed, 81 insertions(+), 24 deletions(-)
> > > 
> > > diff --git a/arch/arm64/include/asm/linkage.h b/arch/arm64/include/asm/linkage.h
> > > index d3acd9c87509..f3f3bc168162 100644
> > > --- a/arch/arm64/include/asm/linkage.h
> > > +++ b/arch/arm64/include/asm/linkage.h
> > > @@ -5,8 +5,47 @@
> > >  #include <asm/assembler.h>
> > >  #endif
> > >  
> > > -#define __ALIGN		.balign CONFIG_FUNCTION_ALIGNMENT
> > > -#define __ALIGN_STR	".balign " #CONFIG_FUNCTION_ALIGNMENT
> > > +#define __ALIGN .balign CONFIG_FUNCTION_ALIGNMENT
> > > +#define __ALIGN_STR ".balign " #CONFIG_FUNCTION_ALIGNMENT
> > > +
> > > +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS
> > > +
> > > +#define PRE_FUNCTION_NOPS                                                   \
> > > +	ALIGN;                                                              \
> > > +	nops CONFIG_FUNCTION_ALIGNMENT / 4 - 2;                             \
> > > +	.pushsection __patchable_function_entries, "awo", @progbits, .text; \
> > > +	.p2align 3;                                                         \
> > > +	.8byte 1f;                                                          \
> > > +	.popsection;                                                        \
> > > +	1 :;                                                                \
> > > +	nops 2;
> > > +
> > > +#define PRE_PROLOGUE_NOPS nops 2;
> > > +
> > > +#elif defined(CONFIG_DYNAMIC_FTRACE_WITH_ARGS)
> > > +
> > > +#define PRE_FUNCTION_NOPS
> > > +
> > > +#define PRE_PROLOGUE_NOPS                                                   \
> > > +	.pushsection __patchable_function_entries, "awo", @progbits, .text; \
> > > +	.p2align 3;                                                         \
> > > +	.8byte 1f;                                                          \
> > > +	.popsection;                                                        \
> > > +	1 :;                                                                \
> > > +	nops 2;
> > > +
> > > +#else
> > > +
> > > +#define PRE_FUNCTION_NOPS
> > > +#define PRE_PROLOGUE_NOPS
> > > +
> > > +#endif
> > > +
> > > +#ifdef CONFIG_ARM64_BTI_KERNEL
> > > +#define BTI_C bti c;
> > > +#else
> > > +#define BTI_C
> > > +#endif
> > >  
> > >  /*
> > >   * When using in-kernel BTI we need to ensure that PCS-conformant
> > > @@ -15,32 +54,50 @@
> > >   * everything, the override is done unconditionally so we're more
> > >   * likely to notice any drift from the overridden definitions.
> > >   */
> > > -#define SYM_FUNC_START(name)				\
> > > -	SYM_START(name, SYM_L_GLOBAL, SYM_A_ALIGN)	\
> > > -	bti c ;
> > > +#define SYM_FUNC_START(name)                       \
> > > +	PRE_FUNCTION_NOPS                          \
> > > +	SYM_START(name, SYM_L_GLOBAL, SYM_A_ALIGN) \
> > > +	BTI_C                                      \
> > > +	PRE_PROLOGUE_NOPS
> > > +
> > > +#define SYM_FUNC_START_NOTRACE(name)               \
> > > +	SYM_START(name, SYM_L_GLOBAL, SYM_A_ALIGN) \
> > > +	BTI_C
> > >  
> > > -#define SYM_FUNC_START_NOALIGN(name)			\
> > > -	SYM_START(name, SYM_L_GLOBAL, SYM_A_NONE)	\
> > > -	bti c ;
> > > +#define SYM_FUNC_START_NOALIGN(name)              \
> > > +	PRE_FUNCTION_NOPS                         \
> > > +	SYM_START(name, SYM_L_GLOBAL, SYM_A_NONE) \
> > > +	BTI_C                                     \
> > > +	PRE_PROLOGUE_NOPS
> > >  
> > > -#define SYM_FUNC_START_LOCAL(name)			\
> > > -	SYM_START(name, SYM_L_LOCAL, SYM_A_ALIGN)	\
> > > -	bti c ;
> > > +#define SYM_FUNC_START_LOCAL(name)                \
> > > +	PRE_FUNCTION_NOPS                         \
> > > +	SYM_START(name, SYM_L_LOCAL, SYM_A_ALIGN) \
> > > +	BTI_C                                     \
> > > +	PRE_PROLOGUE_NOPS
> > >  
> > > -#define SYM_FUNC_START_LOCAL_NOALIGN(name)		\
> > > -	SYM_START(name, SYM_L_LOCAL, SYM_A_NONE)	\
> > > -	bti c ;
> > > +#define SYM_FUNC_START_LOCAL_NOALIGN(name)       \
> > > +	PRE_FUNCTION_NOPS                        \
> > > +	SYM_START(name, SYM_L_LOCAL, SYM_A_NONE) \
> > > +	BTI_C                                    \
> > > +	PRE_PROLOGUE_NOPS
> > >  
> > > -#define SYM_FUNC_START_WEAK(name)			\
> > > -	SYM_START(name, SYM_L_WEAK, SYM_A_ALIGN)	\
> > > -	bti c ;
> > > +#define SYM_FUNC_START_WEAK(name)                \
> > > +	PRE_FUNCTION_NOPS                        \
> > > +	SYM_START(name, SYM_L_WEAK, SYM_A_ALIGN) \
> > > +	BTI_C                                    \
> > > +	PRE_PROLOGUE_NOPS
> > >  
> > > -#define SYM_FUNC_START_WEAK_NOALIGN(name)		\
> > > -	SYM_START(name, SYM_L_WEAK, SYM_A_NONE)		\
> > > -	bti c ;
> > > +#define SYM_FUNC_START_WEAK_NOALIGN(name)       \
> > > +	PRE_FUNCTION_NOPS                       \
> > > +	SYM_START(name, SYM_L_WEAK, SYM_A_NONE) \
> > > +	BTI_C                                   \
> > > +	PRE_PROLOGUE_NOPS
> > >  
> > > -#define SYM_TYPED_FUNC_START(name)				\
> > > -	SYM_TYPED_START(name, SYM_L_GLOBAL, SYM_A_ALIGN)	\
> > > -	bti c ;
> > > +#define SYM_TYPED_FUNC_START(name)                       \
> > > +	PRE_FUNCTION_NOPS                                \
> > > +	SYM_TYPED_START(name, SYM_L_GLOBAL, SYM_A_ALIGN) \
> > > +	BTI_C                                            \
> > > +	PRE_PROLOGUE_NOPS
> > >  
> > >  #endif
> > > diff --git a/arch/arm64/kernel/vdso/vgetrandom-chacha.S b/arch/arm64/kernel/vdso/vgetrandom-chacha.S
> > > index 67890b445309..21c27b64cf9f 100644
> > > --- a/arch/arm64/kernel/vdso/vgetrandom-chacha.S
> > > +++ b/arch/arm64/kernel/vdso/vgetrandom-chacha.S
> > > @@ -40,7 +40,7 @@
> > >   *	x2: 8-byte counter input/output
> > >   *	x3: number of 64-byte block to write to output
> > >   */
> > > -SYM_FUNC_START(__arch_chacha20_blocks_nostack)
> > > +SYM_FUNC_START_NOTRACE(__arch_chacha20_blocks_nostack)
> > >  
> > >  	/* copy0 = "expand 32-byte k" */
> > >  	mov_q		x8, 0x3320646e61707865
> > > -- 
> > > 2.47.3
> > > 
> > > 



* Re: [PATCH] tracing: Enable kprobe tracing for Arm64 asm functions
  2025-10-30 12:35     ` Mark Rutland
@ 2025-10-30 18:07       ` Ben Niu
  2025-11-17 10:34         ` Mark Rutland
  0 siblings, 1 reply; 8+ messages in thread
From: Ben Niu @ 2025-10-30 18:07 UTC (permalink / raw)
  To: Mark Rutland
  Cc: catalin.marinas, will, tytso, Jason, linux-arm-kernel, niuben003,
	linux-kernel

On Thu, Oct 30, 2025 at 12:35:25PM +0000, Mark Rutland wrote:
> On Wed, Oct 29, 2025 at 04:53:48PM -0700, Ben Niu wrote:
> > On Wed, Oct 29, 2025 at 02:33:26PM +0000, Mark Rutland wrote:
> > > On Mon, Oct 27, 2025 at 11:17:49AM -0700, Ben Niu wrote:
> > > > Currently, Arm64 assembly functions always have a bti c
> > > > instruction inserted before the prologue. When ftrace is enabled,
> > > > no padding nops are inserted at all.
> > > > 
> > > > This breaks kprobe tracing for asm functions, which assumes that
> > > > proper nops are added before and within functions (when ftrace is
> > > > enabled) and bti c is only present when CONFIG_ARM64_BTI_KERNEL is
> > > > defined.
> > > 
> > > What exactly do you mean by "breaks kprobe tracing"?
> > > 
> > > The kprobes code knows NOTHING about those ftrace NOPs, so I cannot see
> > > how those are relevant.
> > > 
> > > The patch adds entries to __patchable_function_entries, which is owned
> > > by ftrace, and has NOTHING to do with kprobes.
> > 
> > Thanks for reviewing my patch, Mark. This is the second ever Linux kernel patch
> > I've sent out so I'm still learning how much detail I should put in a patch for
> > reviewers. Clearly, not enough details were added for this patch, and sorry
> > about that.
> 
> Hi Ben,
> 
> Thanks, and sorry for the overly strong tone of my initial reply.
> 
> My key concern above was that "breaks kprobe tracing" sounds like
> something is seriously wrong, whereas IIUC the problem you're trying to
> address is "it isn't currently possible to place a trace kprobe on an
> assembly function" (i.e. there's a limitation, not a bug).

Ok, sounds good. If it's a limitation, I shouldn't have used "breaks kprobe
tracing". :)

> > kprobe and ftrace seem two seprate things, but kprobe actually checks if a
> > function is notrace based off whether the function is padded with proper nops
> > for ftrace when ftrace is enabled. See the following call stack for more
> > details (obtained by debugging centos 10 arm64 linux kernel):
> > 
> > #0  lookup_rec (start=18446603338384966592, end=18446603338384967175) at kernel/trace/ftrace.c:1591
> > #1  ftrace_location_range (start=18446603338384966592, end=18446603338384967175) at kernel/trace/ftrace.c:1626
> > #2  0xffff8000802bd070 [PAC] in __within_notrace_func (addr=<optimized out>, addr@entry=18446603338384966592) at kernel/trace/trace_kprobe.c:456
> > #3  0xffff8000802bd13c [PAC] in within_notrace_func (tk=0xffff0000cb5b0400) at kernel/trace/trace_kprobe.c:464
> > #4  0xffff8000802bd534 [PAC] in __register_trace_kprobe (tk=tk@entry=0xffff0000cb5b0400) at kernel/trace/trace_kprobe.c:496
> > #5  0xffff8000802bf7a0 [PAC] in __register_trace_kprobe (tk=0xffff0000cb5b0400) at kernel/trace/trace_kprobe.c:490
> > #6  create_local_trace_kprobe (func=func@entry=0xffff0000cabb19c0 "__arch_copy_to_user", addr=<optimized out>, offs=<optimized out>, is_return=is_return@entry=false)
> >     at kernel/trace/trace_kprobe.c:1935
> > #7  0xffff80008029ef8c [PAC] in perf_kprobe_init (p_event=p_event@entry=0xffff0000c6d89a40, is_retprobe=false) at kernel/trace/trace_event_perf.c:267
> > #8  0xffff800080364ec4 [PAC] in perf_kprobe_event_init (event=0xffff0000c6d89a40) at kernel/events/core.c:11102
> > #9  0xffff800080369740 [PAC] in perf_try_init_event (pmu=0xffff8000829347a8 <perf_kprobe>, event=0xffff0000c6d89a40) at kernel/events/core.c:12595
> > #10 0xffff8000803699dc [PAC] in perf_init_event (event=event@entry=0xffff0000c6d89a40) at kernel/events/core.c:12693
> > #11 0xffff80008036d91c [PAC] in perf_event_alloc (attr=attr@entry=0xffff80008785b970, cpu=cpu@entry=0, task=task@entry=0x0, group_leader=0xffff0000c6d89a40, 
> >     group_leader@entry=0x0, parent_event=parent_event@entry=0x0, overflow_handler=<optimized out>, overflow_handler@entry=0x0, context=<optimized out>, context@entry=0x0, 
> >     cgroup_fd=cgroup_fd@entry=-1) at kernel/events/core.c:12968
> > #12 0xffff800080373244 [PAC] in __do_sys_perf_event_open (attr_uptr=<optimized out>, pid=<optimized out>, cpu=0, group_fd=<optimized out>, flags=<optimized out>)
> >     at kernel/events/core.c:13485
> > #13 0xffff800080379aa0 [PAC] in __se_sys_perf_event_open (attr_uptr=<optimized out>, pid=<optimized out>, cpu=<optimized out>, group_fd=<optimized out>, flags=<optimized out>)
> >     at kernel/events/core.c:13367
> > #14 __arm64_sys_perf_event_open (regs=<optimized out>) at kernel/events/core.c:13367
> > #15 0xffff80008004997c [PAC] in __invoke_syscall (regs=0xffff80008785beb0, syscall_fn=<optimized out>) at arch/arm64/kernel/syscall.c:35
> > #16 invoke_syscall (regs=regs@entry=0xffff80008785beb0, scno=<optimized out>, sc_nr=sc_nr@entry=463, syscall_table=<optimized out>) at arch/arm64/kernel/syscall.c:49
> > #17 0xffff800080049a88 [PAC] in el0_svc_common (sc_nr=463, syscall_table=<optimized out>, regs=0xffff80008785beb0, scno=<optimized out>) at arch/arm64/kernel/syscall.c:132
> > #18 do_el0_svc (regs=regs@entry=0xffff80008785beb0) at arch/arm64/kernel/syscall.c:151
> > #19 0xffff800080fba654 [PAC] in el0_svc (regs=0xffff80008785beb0) at arch/arm64/kernel/entry-common.c:875
> > #20 0xffff800080fbab10 [PAC] in el0t_64_sync_handler (regs=<optimized out>) at arch/arm64/kernel/entry-common.c:894
> > #21 0xffff800080011684 [PAC] in el0t_64_sync () at arch/arm64/kernel/entry.S:600
> 
> Sorry, I had forgotten that trace kprobes do apply this restriction;
> thanks for explaining this.
> 
> Please note that basic kprobes (i.e. those opened directly by
> register_kprobe()) aren't subject to this restriction; the checks in
> check_kprobe_address_safe() do not include !within_notrace_func(). This
> restriction only applies to trace kprobes (which use the tracing
> infrastructure).

Right, within_notrace_func is just (false) when ftrace is not enabled.

> > within_notrace_func is only defined if defined(CONFIG_DYNAMIC_FTRACE) && \
> > !defined(CONFIG_KPROBE_EVENTS_ON_NOTRACE), which searches a list of records
> > about whether a function is ftrace-able or not. The list of records is built
> > by ftrace_init at boot time.
> >  
> > > > The patch fixes the bug by inserting nops and bti c in Arm64 asm
> > > > in the same way as compiled C code.
> > 
> > That's correct. I added __patchable_function_entries in assemblies so that
> > their addresses are added between __start_mcount_loc and __stop_mcount_loc
> > at link time, which is then picked up by ftrace_init. Proper nop and bti c
> > instructions are added to assemblies in the way as if the assemblies are
> > compiled from C code. This is crucial to satisfy ftrace requirements and nop
> > patching at boot time. Otherwise, ftrace_bug will be hit and the assembly
> > function won't be ftrace-able and also kprobe-traceable.
> 
> Do you actually need these functions to be ftrace traceable, or do you
> just care that they can be traced via trace kprobes?

We don't care whether they are traced via ftrace or via kprobes. Actually, as I
mentioned in the original email, even with my patch, fentry still does not work
for those assembly functions because no BTF info is generated for assembly
files. Adding those nops only unblocks kprobes, and that's good enough for us.
 
> Why does this matter specifically for arm64, and not for other
> architectures? AFAICT x86 doesn't do anything to make assembly functions
> traceable by ftrace either.
>
> Is there something specific you want to trace, but cannot currently
> trace (on arm64)?

For some reason, we only saw the Arm64 Linux asm functions __arch_copy_to_user
and __arch_copy_from_user being hot in our workloads, not their counterpart
functions on x86, so we are trying to understand and improve the performance of
those Arm64 asm functions. This patch mainly helps us understand the workloads,
and the following patch tries to improve the performance. Enabling tracing
will help us tune the asm functions. In addition, future-generation Arm64 CPUs
will support FEAT_MOPS, which includes instructions for copying data between
user space and the kernel. With tracing support, we may also be able to help
Arm tune their FEAT_MOPS implementations.

20251018052237.1368504-2-benniu@meta.com

Although I would also love to trace x86 asm functions, that's not my focus
right now and we don't have any demand for it. Plus, fixing the assembly for
different architectures would involve folks from different companies in the
same email thread, which may increase the communication cost.
 
> > > As it stands, NAK to this change.
> > > 
> > > I'm not averse to making (some) assembly functions traceable by ftrace,
> > > and hence giving those NOPs. However, that's not safe generally (e.g.
> > > due to noinstr requirements), and so special care will need to be taken.
> > 
> > Assemblies that are not traceable should use SYM_CODE_START or the new macro
> > SYM_FUNC_START_NOTRACE. If an assembly function starts with SYM_FUNC_START,
> > by default, it's assumed to be traceable. I dumped all assemblies under
> > arch/arm64, and majority of the assemblies are utility functions (e.g., crypto,
> > memset/memcpy, etc.), which seemed safe to trace. 
> 
> For better or worse, that is not the case. For example, it is not safe
> to trace memset() or memcpy(), because those can be used by noinstr
> code. Similarly those can be used early during handling of a kprobe,
> resulting in a recursive exception (which will cause a fatal stack
> overflow).
> 
> > There are a few functions
> > in the following (and maybe others) that might not be traceable, so please
> > advise.
> > ./kernel/entry-ftrace.S
> > ./kernel/entry.S
> > ./mm/proc.S
> > ./kernel/head.S
> 
> Practically nothing in those files is safe to trace.
> 
> There are plenty of other assembly which must not be traced, e.g.
> anything with a "__pi_" prefix. 

Ok, thanks, this is critical info.

> > We can either have SYM_FUNC_START be traceable by default, or flip the polarity
> > and introduce SYM_FUNC_START_TRACEABLE to explicitly mark those functions
> > deemed safe to trace.
> 
> I would be much happier with the latter (or something to that effect).
> However, as with my question about x86 above, I'd like a better
> explanation as to why you need to trace assembly functions in the first
> place.

Please see my explanation above. In short, we do need tracing support for
__arch_copy_from_user and __arch_copy_to_user to better understand our workloads
and how to tune the new implementations for those two asm functions.

> The existing restrictions could do with some cleanup, and we might be
> able to make it possible use a kprobe tracepoint on functions/code
> *without* requiring that it's possible to place an ftrace tracepoint.

Right, that's another way to enable tracing for asm functions. I thought about
it but didn't pursue it, because adding the nops makes the asm functions look
like compiled C code, which unifies the code generation and makes it possible
to enable ftrace for asm functions in the future. Furthermore, it limits the
scope of the changes to Arm64 only, without touching the kprobe/ftrace
infrastructure. However, if the kprobe/ftrace maintainers are OK with adding
new logic so that kprobes do not consult within_notrace_func for asm functions,
I'm also happy to go that route.

Ben

> Mark.
> 
> > > The rationale above does not make sense; it conflates distinct things,
> > > and I think a more complete explanation is necessary.
> > 
> > Please see my explanation above. If you have any questions, please let me know.
> > 
> > > Mark.
> > > 
> > > > Note: although this patch unblocks kprobe tracing, fentry is still
> > > > broken because no BTF info gets generated from assembly files. A
> > > > separate patch is needed to fix that.
> > > > 
> > > > I built this patch with different combos of the following features
> > > > and confirmed kprobe tracing for asm function __arch_copy_to_user
> > > > worked in all cases:
> > > > 
> > > > CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS
> > > > CONFIG_DYNAMIC_FTRACE_WITH_ARGS
> > > > CONFIG_ARM64_BTI_KERNEL
> > > > 
> > > > Signed-off-by: Ben Niu <benniu@meta.com>
> > > > ---
> > > >  arch/arm64/include/asm/linkage.h           | 103 ++++++++++++++++-----
> > > >  arch/arm64/kernel/vdso/vgetrandom-chacha.S |   2 +-
> > > >  2 files changed, 81 insertions(+), 24 deletions(-)
> > > > 
> > > > diff --git a/arch/arm64/include/asm/linkage.h b/arch/arm64/include/asm/linkage.h
> > > > index d3acd9c87509..f3f3bc168162 100644
> > > > --- a/arch/arm64/include/asm/linkage.h
> > > > +++ b/arch/arm64/include/asm/linkage.h
> > > > @@ -5,8 +5,47 @@
> > > >  #include <asm/assembler.h>
> > > >  #endif
> > > >  
> > > > -#define __ALIGN		.balign CONFIG_FUNCTION_ALIGNMENT
> > > > -#define __ALIGN_STR	".balign " #CONFIG_FUNCTION_ALIGNMENT
> > > > +#define __ALIGN .balign CONFIG_FUNCTION_ALIGNMENT
> > > > +#define __ALIGN_STR ".balign " #CONFIG_FUNCTION_ALIGNMENT
> > > > +
> > > > +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS
> > > > +
> > > > +#define PRE_FUNCTION_NOPS                                                   \
> > > > +	ALIGN;                                                              \
> > > > +	nops CONFIG_FUNCTION_ALIGNMENT / 4 - 2;                             \
> > > > +	.pushsection __patchable_function_entries, "awo", @progbits, .text; \
> > > > +	.p2align 3;                                                         \
> > > > +	.8byte 1f;                                                          \
> > > > +	.popsection;                                                        \
> > > > +	1 :;                                                                \
> > > > +	nops 2;
> > > > +
> > > > +#define PRE_PROLOGUE_NOPS nops 2;
> > > > +
> > > > +#elif defined(CONFIG_DYNAMIC_FTRACE_WITH_ARGS)
> > > > +
> > > > +#define PRE_FUNCTION_NOPS
> > > > +
> > > > +#define PRE_PROLOGUE_NOPS                                                   \
> > > > +	.pushsection __patchable_function_entries, "awo", @progbits, .text; \
> > > > +	.p2align 3;                                                         \
> > > > +	.8byte 1f;                                                          \
> > > > +	.popsection;                                                        \
> > > > +	1 :;                                                                \
> > > > +	nops 2;
> > > > +
> > > > +#else
> > > > +
> > > > +#define PRE_FUNCTION_NOPS
> > > > +#define PRE_PROLOGUE_NOPS
> > > > +
> > > > +#endif
> > > > +
> > > > +#ifdef CONFIG_ARM64_BTI_KERNEL
> > > > +#define BTI_C bti c;
> > > > +#else
> > > > +#define BTI_C
> > > > +#endif
> > > >  
> > > >  /*
> > > >   * When using in-kernel BTI we need to ensure that PCS-conformant
> > > > @@ -15,32 +54,50 @@
> > > >   * everything, the override is done unconditionally so we're more
> > > >   * likely to notice any drift from the overridden definitions.
> > > >   */
> > > > -#define SYM_FUNC_START(name)				\
> > > > -	SYM_START(name, SYM_L_GLOBAL, SYM_A_ALIGN)	\
> > > > -	bti c ;
> > > > +#define SYM_FUNC_START(name)                       \
> > > > +	PRE_FUNCTION_NOPS                          \
> > > > +	SYM_START(name, SYM_L_GLOBAL, SYM_A_ALIGN) \
> > > > +	BTI_C                                      \
> > > > +	PRE_PROLOGUE_NOPS
> > > > +
> > > > +#define SYM_FUNC_START_NOTRACE(name)               \
> > > > +	SYM_START(name, SYM_L_GLOBAL, SYM_A_ALIGN) \
> > > > +	BTI_C
> > > >  
> > > > -#define SYM_FUNC_START_NOALIGN(name)			\
> > > > -	SYM_START(name, SYM_L_GLOBAL, SYM_A_NONE)	\
> > > > -	bti c ;
> > > > +#define SYM_FUNC_START_NOALIGN(name)              \
> > > > +	PRE_FUNCTION_NOPS                         \
> > > > +	SYM_START(name, SYM_L_GLOBAL, SYM_A_NONE) \
> > > > +	BTI_C                                     \
> > > > +	PRE_PROLOGUE_NOPS
> > > >  
> > > > -#define SYM_FUNC_START_LOCAL(name)			\
> > > > -	SYM_START(name, SYM_L_LOCAL, SYM_A_ALIGN)	\
> > > > -	bti c ;
> > > > +#define SYM_FUNC_START_LOCAL(name)                \
> > > > +	PRE_FUNCTION_NOPS                         \
> > > > +	SYM_START(name, SYM_L_LOCAL, SYM_A_ALIGN) \
> > > > +	BTI_C                                     \
> > > > +	PRE_PROLOGUE_NOPS
> > > >  
> > > > -#define SYM_FUNC_START_LOCAL_NOALIGN(name)		\
> > > > -	SYM_START(name, SYM_L_LOCAL, SYM_A_NONE)	\
> > > > -	bti c ;
> > > > +#define SYM_FUNC_START_LOCAL_NOALIGN(name)       \
> > > > +	PRE_FUNCTION_NOPS                        \
> > > > +	SYM_START(name, SYM_L_LOCAL, SYM_A_NONE) \
> > > > +	BTI_C                                    \
> > > > +	PRE_PROLOGUE_NOPS
> > > >  
> > > > -#define SYM_FUNC_START_WEAK(name)			\
> > > > -	SYM_START(name, SYM_L_WEAK, SYM_A_ALIGN)	\
> > > > -	bti c ;
> > > > +#define SYM_FUNC_START_WEAK(name)                \
> > > > +	PRE_FUNCTION_NOPS                        \
> > > > +	SYM_START(name, SYM_L_WEAK, SYM_A_ALIGN) \
> > > > +	BTI_C                                    \
> > > > +	PRE_PROLOGUE_NOPS
> > > >  
> > > > -#define SYM_FUNC_START_WEAK_NOALIGN(name)		\
> > > > -	SYM_START(name, SYM_L_WEAK, SYM_A_NONE)		\
> > > > -	bti c ;
> > > > +#define SYM_FUNC_START_WEAK_NOALIGN(name)       \
> > > > +	PRE_FUNCTION_NOPS                       \
> > > > +	SYM_START(name, SYM_L_WEAK, SYM_A_NONE) \
> > > > +	BTI_C                                   \
> > > > +	PRE_PROLOGUE_NOPS
> > > >  
> > > > -#define SYM_TYPED_FUNC_START(name)				\
> > > > -	SYM_TYPED_START(name, SYM_L_GLOBAL, SYM_A_ALIGN)	\
> > > > -	bti c ;
> > > > +#define SYM_TYPED_FUNC_START(name)                       \
> > > > +	PRE_FUNCTION_NOPS                                \
> > > > +	SYM_TYPED_START(name, SYM_L_GLOBAL, SYM_A_ALIGN) \
> > > > +	BTI_C                                            \
> > > > +	PRE_PROLOGUE_NOPS
> > > >  
> > > >  #endif
> > > > diff --git a/arch/arm64/kernel/vdso/vgetrandom-chacha.S b/arch/arm64/kernel/vdso/vgetrandom-chacha.S
> > > > index 67890b445309..21c27b64cf9f 100644
> > > > --- a/arch/arm64/kernel/vdso/vgetrandom-chacha.S
> > > > +++ b/arch/arm64/kernel/vdso/vgetrandom-chacha.S
> > > > @@ -40,7 +40,7 @@
> > > >   *	x2: 8-byte counter input/output
> > > >   *	x3: number of 64-byte block to write to output
> > > >   */
> > > > -SYM_FUNC_START(__arch_chacha20_blocks_nostack)
> > > > +SYM_FUNC_START_NOTRACE(__arch_chacha20_blocks_nostack)
> > > >  
> > > >  	/* copy0 = "expand 32-byte k" */
> > > >  	mov_q		x8, 0x3320646e61707865
> > > > -- 
> > > > 2.47.3
> > > > 
> > > > 



* Re: [PATCH] tracing: Enable kprobe tracing for Arm64 asm functions
  2025-10-30 18:07       ` Ben Niu
@ 2025-11-17 10:34         ` Mark Rutland
  2025-12-10 20:16           ` Withdraw " Ben Niu
  0 siblings, 1 reply; 8+ messages in thread
From: Mark Rutland @ 2025-11-17 10:34 UTC (permalink / raw)
  To: Ben Niu
  Cc: catalin.marinas, will, tytso, Jason, linux-arm-kernel, niuben003,
	linux-kernel

On Thu, Oct 30, 2025 at 11:07:51AM -0700, Ben Niu wrote:
> On Thu, Oct 30, 2025 at 12:35:25PM +0000, Mark Rutland wrote:
> > Is there something specific you want to trace, but cannot currently
> > trace (on arm64)?
> 
> For some reason, we only saw the Arm64 asm functions __arch_copy_to_user and
> __arch_copy_from_user being hot in our workloads, not their x86 counterparts,
> so we are trying to understand and improve the performance of those Arm64 asm
> functions.

Are you sure that's not an artifact of those being out-of-line on arm64,
but inline on x86? On x86, the out-of-line forms are only used when the
CPU doesn't have FSRM, and when the CPU *does* have FSRM, the logic gets
inlined. See raw_copy_from_user(), raw_copy_to_user(), and
copy_user_generic() in arch/x86/include/asm/uaccess_64.h.

Have you checked that inlining is not skewing your results, and
artificially making those look hotter on arm64 by virtue of centralizing
samples to the same IP/PC range?

Can you share any information on those workloads? e.g. which callchains
were hot?

Mark.



* Withdraw [PATCH] tracing: Enable kprobe tracing for Arm64 asm functions
  2025-11-17 10:34         ` Mark Rutland
@ 2025-12-10 20:16           ` Ben Niu
  2025-12-12 16:33             ` Mark Rutland
  0 siblings, 1 reply; 8+ messages in thread
From: Ben Niu @ 2025-12-10 20:16 UTC (permalink / raw)
  To: Mark Rutland
  Cc: catalin.marinas, will, tytso, Jason, linux-arm-kernel, niuben003,
	benniu, linux-kernel

On Mon, Nov 17, 2025 at 10:34:22AM +0000, Mark Rutland wrote:
> On Thu, Oct 30, 2025 at 11:07:51AM -0700, Ben Niu wrote:
> > On Thu, Oct 30, 2025 at 12:35:25PM +0000, Mark Rutland wrote:
> > > Is there something specific you want to trace, but cannot currently
> > > trace (on arm64)?
> > 
> > For some reason, we only saw the Arm64 asm functions __arch_copy_to_user and
> > __arch_copy_from_user being hot in our workloads, not their x86 counterparts,
> > so we are trying to understand and improve the performance of those Arm64 asm
> > functions.
> 
> Are you sure that's not an artifact of those being out-of-line on arm64,
> but inline on x86? On x86, the out-of-line forms are only used when the
> CPU doesn't have FSRM, and when the CPU *does* have FSRM, the logic gets
> inlined. See raw_copy_from_user(), raw_copy_to_user(), and
> copy_user_generic() in arch/x86/include/asm/uaccess_64.h.

On x86, INLINE_COPY_TO_USER is not defined in either the latest Linux kernel or
our internal branch, so _copy_to_user is always an out-of-line (extern)
function, which in turn inlines copy_user_generic. copy_user_generic executes
an FSRM "rep movs" if the CPU supports it (our case); otherwise it calls
rep_movs_alternative, which issues plain movs to copy memory.
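
For readers following along, the shape of that x86-64 fast path is roughly the
following. This is a simplified sketch from memory, with the uaccess
enable/disable and exception-table plumbing dropped, so treat it as an
illustration rather than a quote of arch/x86/include/asm/uaccess_64.h:

	static __always_inline unsigned long
	copy_user_generic(void *to, const void *from, unsigned long len)
	{
		/*
		 * With FSRM the whole copy is one inlined "rep movsb";
		 * without it, the alternative patches in a call to
		 * rep_movs_alternative instead.
		 */
		asm volatile(
			ALTERNATIVE("rep movsb",
				    "call rep_movs_alternative",
				    ALT_NOT(X86_FEATURE_FSRM))
			: "+c" (len), "+D" (to), "+S" (from)
			: : "memory");
		return len;	/* bytes left uncopied */
	}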

The FSRM "rep movs" is accelerated internally by the CPU, and Arm's FEAT_MOPS
appears to provide something similar, but Neoverse V2 does not support
FEAT_MOPS, so the kernel falls back to copying with discrete sttr stores.
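
To make the contrast concrete, the arm64 fallback boils down to loops of loads
and unprivileged stores. The fragment below is purely illustrative (no fault
handling, and the real routine around arch/arm64/lib/copy_template.S is written
in assembly with much deeper unrolling); it just shows the sttr pattern used
when FEAT_MOPS is unavailable:

	/* Copy 16 bytes from a kernel buffer to user memory via sttr. */
	static inline void copy16_to_user_sketch(void *to_user, const void *from)
	{
		unsigned long t0, t1;

		asm volatile(
			"ldp  %0, %1, [%2]\n\t"
			"sttr %0, [%3]\n\t"
			"sttr %1, [%3, #8]"
			: "=&r" (t0), "=&r" (t1)
			: "r" (from), "r" (to_user)
			: "memory");
	}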

> Have you checked that inlining is not skewing your results, and
> artificially making those look hotter on arm64 by virtue of centralizing
> samples to the same IP/PC range?

As mentioned above, _copy_to_user is not inlined on x86.

> Can you share any information on those workloads? e.g. which callchains
> were hot?

Please reach out to James Greenhalgh and Chris Goodyer at Arm for more details
about those workloads, which I can't share in a public channel.

Using bpftrace, we found that most __arch_copy_to_user calls are for sizes in
the 2K-4K range. I wrote a simple microbenchmark to compare the performance of
copy_to_user on x86-64 and arm64; see
https://github.com/mcfi/benchmark/tree/main/ku_copy
(a minimal sketch of the userspace loop appears after the table below). The
data collected is shown below; note that arm64 kernel/user copy throughput
dips relative to x86-64 when the size is in the 2K-16K range.

Routine		Size (bytes)	Arm64 bytes/s / x86-64 bytes/s
copy_to_user	8	203.4%
copy_from_user	8	248.5%
copy_to_user	16	194.6%
copy_from_user	16	240.6%
copy_to_user	32	195.0%
copy_from_user	32	236.5%
copy_to_user	64	193.2%
copy_from_user	64	240.4%
copy_to_user	128	211.0%
copy_from_user	128	273.0%
copy_to_user	256	185.3%
copy_from_user	256	229.3%
copy_to_user	512	152.9%
copy_from_user	512	182.4%
copy_to_user	1024	121.0%
copy_from_user	1024	141.9%
copy_to_user	2048	95.1%
copy_from_user	2048	108.7%
copy_to_user	4096	78.9%
copy_from_user	4096	85.7%
copy_to_user	8192	69.7%
copy_from_user	8192	73.5%
copy_to_user	16384	65.9%
copy_from_user	16384	68.8%
copy_to_user	32768	117.3%
copy_from_user	32768	118.6%
copy_to_user	65536	115.5%
copy_from_user	65536	117.4%
copy_to_user	131072	114.6%
copy_from_user	131072	113.3%
copy_to_user	262144	114.4%
copy_from_user	262144	109.3%
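
For reference, the userspace side of such a measurement can be as small as the
loop below. This is a minimal sketch, not the harness from the repo linked
above; it assumes a pre-filled file at /dev/shm/ku_copy_src (a hypothetical
path) so that read() stays in the page cache and the kernel-side cost is
dominated by __arch_copy_to_user:

	#include <fcntl.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <time.h>
	#include <unistd.h>

	int main(int argc, char **argv)
	{
		size_t size = argc > 1 ? strtoul(argv[1], NULL, 0) : 4096;
		long iters = 1 << 20;
		char *buf = malloc(size);
		/* Source file must exist and be at least 'size' bytes long. */
		int fd = open("/dev/shm/ku_copy_src", O_RDONLY);
		struct timespec t0, t1;

		if (!buf || fd < 0)
			return 1;

		clock_gettime(CLOCK_MONOTONIC, &t0);
		for (long i = 0; i < iters; i++)
			if (pread(fd, buf, size, 0) != (ssize_t)size)
				return 1;
		clock_gettime(CLOCK_MONOTONIC, &t1);

		double secs = (t1.tv_sec - t0.tv_sec) +
			      (t1.tv_nsec - t0.tv_nsec) / 1e9;
		printf("%zu bytes: %.1f MB/s\n", size,
		       size * (double)iters / secs / 1e6);
		return 0;
	}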

I sent out a patch for faster __arch_copy_from_user/__arch_copy_to_user;
please see message ID 20251018052237.1368504-2-benniu@meta.com for more
details.

> Mark.

In addition, I'm withdrawing this patch because we found a workaround for
tracing __arch_copy_to_user/__arch_copy_from_user via watchpoints; message ID
aTjpzfqTEGPcird5@meta.com has all the details. Thank you for reviewing this
patch and for all your helpful comments.

Ben



* Re: Withdraw [PATCH] tracing: Enable kprobe tracing for Arm64 asm functions
  2025-12-10 20:16           ` Withdraw " Ben Niu
@ 2025-12-12 16:33             ` Mark Rutland
  0 siblings, 0 replies; 8+ messages in thread
From: Mark Rutland @ 2025-12-12 16:33 UTC (permalink / raw)
  To: Ben Niu
  Cc: catalin.marinas, will, tytso, Jason, linux-arm-kernel, niuben003,
	linux-kernel

On Wed, Dec 10, 2025 at 12:16:17PM -0800, Ben Niu wrote:
> On Mon, Nov 17, 2025 at 10:34:22AM +0000, Mark Rutland wrote:
> > On Thu, Oct 30, 2025 at 11:07:51AM -0700, Ben Niu wrote:
> > > On Thu, Oct 30, 2025 at 12:35:25PM +0000, Mark Rutland wrote:
> > > > Is there something specific you want to trace, but cannot currently
> > > > trace (on arm64)?
> > > 
> > > For some reason, we only saw the Arm64 asm functions __arch_copy_to_user and
> > > __arch_copy_from_user being hot in our workloads, not their x86 counterparts,
> > > so we are trying to understand and improve the performance of those Arm64 asm
> > > functions.
> > 
> > Are you sure that's not an artifact of those being out-of-line on arm64,
> > but inline on x86? On x86, the out-of-line forms are only used when the
> > CPU doesn't have FSRM, and when the CPU *does* have FSRM, the logic gets
> > inlined. See raw_copy_from_user(), raw_copy_to_user(), and
> > copy_user_generic() in arch/x86/include/asm/uaccess_64.h.
> 
> On x86, INLINE_COPY_TO_USER is not defined in either the latest Linux kernel or
> our internal branch, so _copy_to_user is always an out-of-line (extern)
> function, which in turn inlines copy_user_generic. copy_user_generic executes
> an FSRM "rep movs" if the CPU supports it (our case); otherwise it calls
> rep_movs_alternative, which issues plain movs to copy memory.

> > Have you checked that inlining is not skewing your results, and
> > artificially making those look hotter on arm64 by virtue of centralizing
> > samples to the same IP/PC range?
> 
> As mentioned above, _copy_to_user is not inlined on x86.

Thanks for confirming!

> > Can you share any information on those workloads? e.g. which callchains
> > were hot?
> 
> Please reach out to James Greenhalgh and Chris Goodyer at Arm for more details
> about those workloads, which I can't share in a public channel.

If you can't share this info publicly, that's fair enough.

Please note that upstream it's hard to justify changing things based on
confidential information. Sharing information with me in private isn't
all that helpful as it would not be clear what I could subsequently
share in public.

The reason I've asked is that it would be very interesting to know
whether there's a specific subsystem, driver, or code path that's
hitting this hard, because a better option might be "don't do that" and
avoid the uaccesses entirely (e.g. by accessing a kernel alias obtained
via get_user_pages()).
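
For anyone curious what that pattern looks like, a rough sketch is below. It
uses pin_user_pages_fast() as the modern pinning variant, assumes the range
fits within a single page, and elides all error handling, so it's an
illustration of the idea rather than something to copy verbatim:

	#include <linux/highmem.h>
	#include <linux/mm.h>
	#include <linux/string.h>

	/* Pin the user page once, then copy through a kernel mapping
	 * instead of issuing a uaccess copy per transfer. */
	static int copy_to_pinned_user(unsigned long user_addr,
				       const void *src, size_t len)
	{
		struct page *page;
		void *kaddr;

		if (pin_user_pages_fast(user_addr, 1, FOLL_WRITE, &page) != 1)
			return -EFAULT;

		kaddr = kmap_local_page(page);
		memcpy(kaddr + offset_in_page(user_addr), src, len);
		kunmap_local(kaddr);

		unpin_user_pages_dirty_lock(&page, 1, true);
		return 0;
	}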

If there's anything that you can share about that, it'd be very helpful.

Mark.



end of thread

Thread overview: 8+ messages
2025-10-27 18:17 [PATCH] tracing: Enable kprobe tracing for Arm64 asm functions Ben Niu
2025-10-29 14:33 ` Mark Rutland
2025-10-29 23:53   ` Ben Niu
2025-10-30 12:35     ` Mark Rutland
2025-10-30 18:07       ` Ben Niu
2025-11-17 10:34         ` Mark Rutland
2025-12-10 20:16           ` Withdraw " Ben Niu
2025-12-12 16:33             ` Mark Rutland
