* [PATCH RFC] RISC-V: Fix a register to store the percpu offset
@ 2025-07-04  8:45 Yunhui Cui
  2025-07-07  7:55 ` Clément Léger
  2025-07-07 12:50 ` [PATCH] RISC-V: store percpu offset in CSR_SCRATCH Radim Krčmář
  0 siblings, 2 replies; 11+ messages in thread
From: Yunhui Cui @ 2025-07-04  8:45 UTC (permalink / raw)
  To: masahiroy, nathan, nicolas.schier, dennis, tj, cl, paul.walmsley,
	palmer, aou, alex, andybnac, bjorn, cyrilbur, rostedt, puranjay,
	ben.dooks, zhangchunyan, ruanjinjie, jszhang, charlie, cleger,
	antonb, ajones, debug, cuiyunhui, haibo1.xu, samuel.holland,
	linux-kbuild, linux-kernel, linux-mm, linux-riscv

The following data was collected from tests conducted on the
Spacemit(R) X60 using the fixed register method:

No modifications:
6.77, 6.791, 6.792, 6.826, 6.784, 6.839, 6.776, 6.733, 6.795, 6.763
Geometric mean: 6.786839305

ffix-x27:
7.106, 7.035, 7.362, 7.141, 7.096, 7.182, 7.109, 7.126, 7.186, 7.12
Geometric mean: 7.145826539

ffix-x27 + x27(s11) used for offset optimization:
7.059, 6.951, 6.961, 6.985, 6.93, 6.964, 6.977, 6.907, 6.983, 6.944
Geometric mean: 6.965993173

Analysis:
The fixed register method reduced performance by 5.29%.
The per-CPU offset optimization improved performance by 2.52%.
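
(For reference: 7.145826539 / 6.786839305 ≈ 1.0529, i.e. 5.29% slower than
the unmodified kernel, and (7.145826539 - 6.965993173) / 7.145826539 ≈
0.0252, i.e. keeping the offset in s11 wins back 2.52%.)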

Issues with the fixed register method (beyond code size):
Performance degradation due to the loss of one general-purpose register.
Each handle_exception() call requires loading the per-CPU offset into the
fixed register.
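
Roughly, the trade-off at a per-CPU access site looks like this (an
illustrative sketch, not actual compiler output; my_counter stands for any
per-CPU variable):

	/* generic path: index __per_cpu_offset[] on every access */
	REG_L t0, TASK_TI_CPU(tp)
	slli t0, t0, 3
	la t1, __per_cpu_offset
	add t1, t1, t0
	REG_L t1, 0(t1)
	la t0, my_counter
	add t0, t0, t1
	REG_L a0, 0(t0)

	/* with the per-CPU offset pinned in s11 */
	la t0, my_counter
	add t0, t0, s11
	REG_L a0, 0(t0)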

Signed-off-by: Yunhui Cui <cuiyunhui@bytedance.com>
---
 Makefile                        |  4 ++--
 arch/riscv/include/asm/percpu.h | 22 ++++++++++++++++++++++
 arch/riscv/kernel/asm-offsets.c |  1 +
 arch/riscv/kernel/entry.S       | 11 +++++++++--
 arch/riscv/kernel/smpboot.c     |  3 +++
 5 files changed, 37 insertions(+), 4 deletions(-)
 create mode 100644 arch/riscv/include/asm/percpu.h

diff --git a/Makefile b/Makefile
index b7d5f2f0def0..e291f865adc4 100644
--- a/Makefile
+++ b/Makefile
@@ -1026,8 +1026,8 @@ include $(addprefix $(srctree)/, $(include-y))
 
 # Add user supplied CPPFLAGS, AFLAGS, CFLAGS and RUSTFLAGS as the last assignments
 KBUILD_CPPFLAGS += $(KCPPFLAGS)
-KBUILD_AFLAGS   += $(KAFLAGS)
-KBUILD_CFLAGS   += $(KCFLAGS)
+KBUILD_AFLAGS   += $(KAFLAGS) -ffixed-x27
+KBUILD_CFLAGS   += $(KCFLAGS) -ffixed-x27
 KBUILD_RUSTFLAGS += $(KRUSTFLAGS)
 
 KBUILD_LDFLAGS_MODULE += --build-id=sha1
diff --git a/arch/riscv/include/asm/percpu.h b/arch/riscv/include/asm/percpu.h
new file mode 100644
index 000000000000..5d6b109cfab7
--- /dev/null
+++ b/arch/riscv/include/asm/percpu.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef __ASM_PERCPU_H
+#define __ASM_PERCPU_H
+
+static inline void set_my_cpu_offset(unsigned long off)
+{
+	asm volatile("addi s11, %0, 0" :: "r" (off));
+}
+
+static inline unsigned long __kern_my_cpu_offset(void)
+{
+	unsigned long off;
+	asm ("mv %0, s11" :"=r" (off) :);
+	return off;
+}
+
+#define __my_cpu_offset __kern_my_cpu_offset()
+
+#include <asm-generic/percpu.h>
+
+#endif
diff --git a/arch/riscv/kernel/asm-offsets.c b/arch/riscv/kernel/asm-offsets.c
index a03129f40c46..0ce96f30bf32 100644
--- a/arch/riscv/kernel/asm-offsets.c
+++ b/arch/riscv/kernel/asm-offsets.c
@@ -35,6 +35,7 @@ void asm_offsets(void)
 	OFFSET(TASK_THREAD_S9, task_struct, thread.s[9]);
 	OFFSET(TASK_THREAD_S10, task_struct, thread.s[10]);
 	OFFSET(TASK_THREAD_S11, task_struct, thread.s[11]);
+	OFFSET(TASK_TI_CPU, task_struct, thread_info.cpu);
 	OFFSET(TASK_TI_FLAGS, task_struct, thread_info.flags);
 	OFFSET(TASK_TI_PREEMPT_COUNT, task_struct, thread_info.preempt_count);
 	OFFSET(TASK_TI_KERNEL_SP, task_struct, thread_info.kernel_sp);
diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S
index 9d1a305d5508..529d6576265e 100644
--- a/arch/riscv/kernel/entry.S
+++ b/arch/riscv/kernel/entry.S
@@ -77,6 +77,13 @@ SYM_CODE_START(handle_exception)
 	 */
 	csrw CSR_SCRATCH, x0
 
+	/* Load __per_cpu_offset[cpu] into s11 */
+	REG_L t6, TASK_TI_CPU(tp)
+	slli t6, t6, 3
+	la s11, __per_cpu_offset
+	add s11, s11, t6
+	REG_L s11, 0(s11)
+
 	/* Load the global pointer */
 	load_global_pointer
 
@@ -298,7 +305,7 @@ SYM_FUNC_START(__switch_to)
 	REG_S s8,  TASK_THREAD_S8_RA(a3)
 	REG_S s9,  TASK_THREAD_S9_RA(a3)
 	REG_S s10, TASK_THREAD_S10_RA(a3)
-	REG_S s11, TASK_THREAD_S11_RA(a3)
+	/* REG_S s11, TASK_THREAD_S11_RA(a3) */
 	/* Save the kernel shadow call stack pointer */
 	scs_save_current
 	/* Restore context from next->thread */
@@ -315,7 +322,7 @@ SYM_FUNC_START(__switch_to)
 	REG_L s8,  TASK_THREAD_S8_RA(a4)
 	REG_L s9,  TASK_THREAD_S9_RA(a4)
 	REG_L s10, TASK_THREAD_S10_RA(a4)
-	REG_L s11, TASK_THREAD_S11_RA(a4)
+	/* REG_L s11, TASK_THREAD_S11_RA(a4) */
 	/* The offset of thread_info in task_struct is zero. */
 	move tp, a1
 	/* Switch to the next shadow call stack */
diff --git a/arch/riscv/kernel/smpboot.c b/arch/riscv/kernel/smpboot.c
index fb6ab7f8bfbd..6fa12cc84523 100644
--- a/arch/riscv/kernel/smpboot.c
+++ b/arch/riscv/kernel/smpboot.c
@@ -43,6 +43,7 @@ static DECLARE_COMPLETION(cpu_running);
 
 void __init smp_prepare_boot_cpu(void)
 {
+	set_my_cpu_offset(per_cpu_offset(smp_processor_id()));
 }
 
 void __init smp_prepare_cpus(unsigned int max_cpus)
@@ -240,6 +241,8 @@ asmlinkage __visible void smp_callin(void)
 	mmgrab(mm);
 	current->active_mm = mm;
 
+	set_my_cpu_offset(per_cpu_offset(curr_cpuid));
+
 	store_cpu_topology(curr_cpuid);
 	notify_cpu_starting(curr_cpuid);
 
-- 
2.43.0



* Re: [PATCH RFC] RISC-V: Fix a register to store the percpu offset
  2025-07-04  8:45 [PATCH RFC] RISC-V: Fix a register to store the percpu offset Yunhui Cui
@ 2025-07-07  7:55 ` Clément Léger
  2025-07-07 12:50 ` [PATCH] RISC-V: store percpu offset in CSR_SCRATCH Radim Krčmář
  1 sibling, 0 replies; 11+ messages in thread
From: Clément Léger @ 2025-07-07  7:55 UTC (permalink / raw)
  To: Yunhui Cui, masahiroy, nathan, nicolas.schier, dennis, tj, cl,
	paul.walmsley, palmer, aou, alex, andybnac, bjorn, cyrilbur,
	rostedt, puranjay, ben.dooks, zhangchunyan, ruanjinjie, jszhang,
	charlie, antonb, ajones, debug, haibo1.xu, samuel.holland,
	linux-kbuild, linux-kernel, linux-mm, linux-riscv



On 04/07/2025 10:45, Yunhui Cui wrote:
> The following data was collected from tests conducted on the
> Spacemit(R) X60 using the fixed register method:
> 
> No modifications:
> 6.77, 6.791, 6.792, 6.826, 6.784, 6.839, 6.776, 6.733, 6.795, 6.763
> Geometric mean: 6.786839305
> 
> ffix-x27:
> 7.106, 7.035, 7.362, 7.141, 7.096, 7.182, 7.109, 7.126, 7.186, 7.12
> Geometric mean: 7.145826539
> 
> ffix-x27 + x27(s11) used for offset optimization:
> 7.059, 6.951, 6.961, 6.985, 6.93, 6.964, 6.977, 6.907, 6.983, 6.944
> Geometric mean: 6.965993173

Hi Yunhui,

Out of curiosity, did you try using a register different from x27?

Thanks,

Clément

> [...]


* [PATCH] RISC-V: store percpu offset in CSR_SCRATCH
  2025-07-04  8:45 [PATCH RFC] RISC-V: Fix a register to store the percpu offset Yunhui Cui
  2025-07-07  7:55 ` Clément Léger
@ 2025-07-07 12:50 ` Radim Krčmář
  2025-07-08 10:07   ` [External] " yunhui cui
  1 sibling, 1 reply; 11+ messages in thread
From: Radim Krčmář @ 2025-07-07 12:50 UTC (permalink / raw)
  To: Yunhui Cui, masahiroy, nathan, nicolas.schier, dennis, tj, cl,
	paul.walmsley, palmer, aou, alex, andybnac, bjorn, cyrilbur,
	rostedt, puranjay, ben.dooks, zhangchunyan, ruanjinjie, jszhang,
	charlie, cleger, antonb, ajones, debug, haibo1.xu, samuel.holland,
	linux-kbuild, linux-kernel, linux-mm, linux-riscv
  Cc: linux-riscv

2025-07-04T16:45:00+08:00, Yunhui Cui <cuiyunhui@bytedance.com>:
> The following data was collected from tests conducted on the
> Spacemit(R) X60 using the fixed register method:
> [...]
> The fixed register method reduced performance by 5.29%.
> The per-CPU offset optimization improved performance by 2.52%.

What is the performance if you use the scratch register?

The patch below is completely unoptimized as I didn't want to shuffle
code around too much, but it could give a rough idea.

Thanks.

---8<---
The scratch register currently denotes the mode before exception, but we
can just use two different exception entry points to provide the same
information, which frees the scratch register for the percpu offset.

The user/kernel entry paths need a more thorough rewrite, because they are
terribly wasteful right now.
---
Applies on top of d7b8f8e20813f0179d8ef519541a3527e7661d3a (v6.16-rc5)

 arch/riscv/include/asm/percpu.h | 13 ++++++++++
 arch/riscv/kernel/entry.S       | 46 ++++++++++++++++++++-------------
 arch/riscv/kernel/head.S        |  7 +----
 arch/riscv/kernel/smpboot.c     |  7 +++++
 arch/riscv/kernel/stacktrace.c  |  4 +--
 5 files changed, 51 insertions(+), 26 deletions(-)
 create mode 100644 arch/riscv/include/asm/percpu.h

diff --git a/arch/riscv/include/asm/percpu.h b/arch/riscv/include/asm/percpu.h
new file mode 100644
index 000000000000..2c838514e3ea
--- /dev/null
+++ b/arch/riscv/include/asm/percpu.h
@@ -0,0 +1,13 @@
+#ifndef __ASM_PERCPU_H
+#define __ASM_PERCPU_H
+
+static inline void set_my_cpu_offset(unsigned long off)
+{
+	csr_write(CSR_SCRATCH, off);
+}
+
+#define __my_cpu_offset csr_read(CSR_SCRATCH)
+
+#include <asm-generic/percpu.h>
+
+#endif
diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S
index 75656afa2d6b..e48c553d6779 100644
--- a/arch/riscv/kernel/entry.S
+++ b/arch/riscv/kernel/entry.S
@@ -91,18 +91,8 @@
 	REG_L	a0, TASK_TI_A0(tp)
 .endm
 
-
-SYM_CODE_START(handle_exception)
-	/*
-	 * If coming from userspace, preserve the user thread pointer and load
-	 * the kernel thread pointer.  If we came from the kernel, the scratch
-	 * register will contain 0, and we should continue on the current TP.
-	 */
-	csrrw tp, CSR_SCRATCH, tp
-	bnez tp, .Lsave_context
-
-.Lrestore_kernel_tpsp:
-	csrr tp, CSR_SCRATCH
+SYM_CODE_START(handle_kernel_exception)
+	csrw CSR_SCRATCH, tp
 
 #ifdef CONFIG_64BIT
 	/*
@@ -126,8 +116,22 @@ SYM_CODE_START(handle_exception)
 	bnez sp, handle_kernel_stack_overflow
 	REG_L sp, TASK_TI_KERNEL_SP(tp)
 #endif
+	j handle_exception
+ASM_NOKPROBE(handle_kernel_exception)
+SYM_CODE_END(handle_kernel_exception)
 
-.Lsave_context:
+SYM_CODE_START(handle_user_exception)
+	/*
+	 * If coming from userspace, preserve the user thread pointer and load
+	 * the kernel thread pointer.
+	 */
+	csrrw tp, CSR_SCRATCH, tp
+	j handle_exception
+
+SYM_CODE_END(handle_user_exception)
+ASM_NOKPROBE(handle_user_exception)
+
+SYM_CODE_START_NOALIGN(handle_exception)
 	REG_S sp, TASK_TI_USER_SP(tp)
 	REG_L sp, TASK_TI_KERNEL_SP(tp)
 	addi sp, sp, -(PT_SIZE_ON_STACK)
@@ -158,11 +162,15 @@ SYM_CODE_START(handle_exception)
 	REG_S s4, PT_CAUSE(sp)
 	REG_S s5, PT_TP(sp)
 
-	/*
-	 * Set the scratch register to 0, so that if a recursive exception
-	 * occurs, the exception vector knows it came from the kernel
-	 */
-	csrw CSR_SCRATCH, x0
+	REG_L s0, TASK_TI_CPU(tp)
+	slli s0, s0, 3
+	la s1, __per_cpu_offset
+	add s1, s1, s0
+	REG_L s1, 0(s1)
+
+	csrw CSR_SCRATCH, s1
+	la s1, handle_kernel_exception
+	csrw CSR_TVEC, s1
 
 	/* Load the global pointer */
 	load_global_pointer
@@ -236,6 +244,8 @@ SYM_CODE_START_NOALIGN(ret_from_exception)
 	 * structures again.
 	 */
 	csrw CSR_SCRATCH, tp
+	la a0, handle_user_exception
+	csrw CSR_TVEC, a0
 1:
 #ifdef CONFIG_RISCV_ISA_V_PREEMPTIVE
 	move a0, sp
diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S
index bdf3352acf4c..d8858334af2d 100644
--- a/arch/riscv/kernel/head.S
+++ b/arch/riscv/kernel/head.S
@@ -188,14 +188,9 @@ secondary_start_sbi:
 .align 2
 .Lsetup_trap_vector:
 	/* Set trap vector to exception handler */
-	la a0, handle_exception
+	la a0, handle_kernel_exception
 	csrw CSR_TVEC, a0
 
-	/*
-	 * Set sup0 scratch register to 0, indicating to exception vector that
-	 * we are presently executing in kernel.
-	 */
-	csrw CSR_SCRATCH, zero
 	ret
 
 SYM_CODE_END(_start)
diff --git a/arch/riscv/kernel/smpboot.c b/arch/riscv/kernel/smpboot.c
index 601a321e0f17..2db44b10bedb 100644
--- a/arch/riscv/kernel/smpboot.c
+++ b/arch/riscv/kernel/smpboot.c
@@ -41,6 +41,11 @@
 
 static DECLARE_COMPLETION(cpu_running);
 
+void __init smp_prepare_boot_cpu(void)
+{
+	set_my_cpu_offset(per_cpu_offset(smp_processor_id()));
+}
+
 void __init smp_prepare_cpus(unsigned int max_cpus)
 {
 	int cpuid;
@@ -225,6 +230,8 @@ asmlinkage __visible void smp_callin(void)
 	mmgrab(mm);
 	current->active_mm = mm;
 
+	set_my_cpu_offset(per_cpu_offset(curr_cpuid));
+
 	store_cpu_topology(curr_cpuid);
 	notify_cpu_starting(curr_cpuid);
 
diff --git a/arch/riscv/kernel/stacktrace.c b/arch/riscv/kernel/stacktrace.c
index 3fe9e6edef8f..69b2f390a2d4 100644
--- a/arch/riscv/kernel/stacktrace.c
+++ b/arch/riscv/kernel/stacktrace.c
@@ -16,7 +16,7 @@
 
 #ifdef CONFIG_FRAME_POINTER
 
-extern asmlinkage void handle_exception(void);
+extern asmlinkage void handle_kernel_exception(void);
 extern unsigned long ret_from_exception_end;
 
 static inline int fp_is_valid(unsigned long fp, unsigned long sp)
@@ -72,7 +72,7 @@ void notrace walk_stackframe(struct task_struct *task, struct pt_regs *regs,
 			fp = frame->fp;
 			pc = ftrace_graph_ret_addr(current, &graph_idx, frame->ra,
 						   &frame->ra);
-			if (pc >= (unsigned long)handle_exception &&
+			if (pc >= (unsigned long)handle_kernel_exception &&
 			    pc < (unsigned long)&ret_from_exception_end) {
 				if (unlikely(!fn(arg, pc)))
 					break;


* Re: [External] [PATCH] RISC-V: store percpu offset in CSR_SCRATCH
  2025-07-07 12:50 ` [PATCH] RISC-V: store percpu offset in CSR_SCRATCH Radim Krčmář
@ 2025-07-08 10:07   ` yunhui cui
  2025-07-08 11:10     ` Radim Krčmář
  0 siblings, 1 reply; 11+ messages in thread
From: yunhui cui @ 2025-07-08 10:07 UTC (permalink / raw)
  To: Radim Krčmář
  Cc: masahiroy, nathan, nicolas.schier, dennis, tj, cl, paul.walmsley,
	palmer, aou, alex, andybnac, bjorn, cyrilbur, rostedt, puranjay,
	ben.dooks, zhangchunyan, ruanjinjie, jszhang, charlie, cleger,
	antonb, ajones, debug, haibo1.xu, samuel.holland, linux-kbuild,
	linux-kernel, linux-mm, linux-riscv, linux-riscv, wangziang.ok

Hi Radim,

On Mon, Jul 7, 2025 at 8:50 PM Radim Krčmář <rkrcmar@ventanamicro.com> wrote:
>
> 2025-07-04T16:45:00+08:00, Yunhui Cui <cuiyunhui@bytedance.com>:
> > The following data was collected from tests conducted on the
> > Spacemit(R) X60 using the fixed register method:
> > [...]
> > The fixed register method reduced performance by 5.29%.
> > The per-CPU offset optimization improved performance by 2.52%.
>
> What is the performance if you use the scratch register?
>
> The patch below is completely unoptimized as I didn't want to shuffle
> code around too much, but it could give a rough idea.
>
> Thanks.
>
> ---8<---
> The scratch register currently denotes the mode before exception, but we
> can just use two different exception entry points to provide the same
> information, which frees the scratch register for the percpu offset.
>
> The user/kernel entry paths need a more thorough rewrite, because they are
> terribly wasteful right now.
> ---
> Applies on top of d7b8f8e20813f0179d8ef519541a3527e7661d3a (v6.16-rc5)
>
>  arch/riscv/include/asm/percpu.h | 13 ++++++++++
>  arch/riscv/kernel/entry.S       | 46 ++++++++++++++++++++-------------
>  arch/riscv/kernel/head.S        |  7 +----
>  arch/riscv/kernel/smpboot.c     |  7 +++++
>  arch/riscv/kernel/stacktrace.c  |  4 +--
>  5 files changed, 51 insertions(+), 26 deletions(-)
>  create mode 100644 arch/riscv/include/asm/percpu.h
>
> diff --git a/arch/riscv/include/asm/percpu.h b/arch/riscv/include/asm/percpu.h
> new file mode 100644
> index 000000000000..2c838514e3ea
> --- /dev/null
> +++ b/arch/riscv/include/asm/percpu.h
> @@ -0,0 +1,13 @@
> +#ifndef __ASM_PERCPU_H
> +#define __ASM_PERCPU_H
> +
> +static inline void set_my_cpu_offset(unsigned long off)
> +{
> +       csr_write(CSR_SCRATCH, off);
> +}
> +
> +#define __my_cpu_offset csr_read(CSR_SCRATCH)
> +
> +#include <asm-generic/percpu.h>
> +
> +#endif
> diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S
> index 75656afa2d6b..e48c553d6779 100644
> --- a/arch/riscv/kernel/entry.S
> +++ b/arch/riscv/kernel/entry.S
> @@ -91,18 +91,8 @@
>         REG_L   a0, TASK_TI_A0(tp)
>  .endm
>
> -
> -SYM_CODE_START(handle_exception)
> -       /*
> -        * If coming from userspace, preserve the user thread pointer and load
> -        * the kernel thread pointer.  If we came from the kernel, the scratch
> -        * register will contain 0, and we should continue on the current TP.
> -        */
> -       csrrw tp, CSR_SCRATCH, tp
> -       bnez tp, .Lsave_context
> -
> -.Lrestore_kernel_tpsp:
> -       csrr tp, CSR_SCRATCH
> +SYM_CODE_START(handle_kernel_exception)
> +       csrw CSR_SCRATCH, tp
>
>  #ifdef CONFIG_64BIT
>         /*
> @@ -126,8 +116,22 @@ SYM_CODE_START(handle_exception)
>         bnez sp, handle_kernel_stack_overflow
>         REG_L sp, TASK_TI_KERNEL_SP(tp)
>  #endif
> +       j handle_exception
> +ASM_NOKPROBE(handle_kernel_exception)
> +SYM_CODE_END(handle_kernel_exception)
>
> -.Lsave_context:
> +SYM_CODE_START(handle_user_exception)
> +       /*
> +        * If coming from userspace, preserve the user thread pointer and load
> +        * the kernel thread pointer.
> +        */
> +       csrrw tp, CSR_SCRATCH, tp
> +       j handle_exception
> +
> +SYM_CODE_END(handle_user_exception)
> +ASM_NOKPROBE(handle_user_exception)
> +
> +SYM_CODE_START_NOALIGN(handle_exception)
>         REG_S sp, TASK_TI_USER_SP(tp)
>         REG_L sp, TASK_TI_KERNEL_SP(tp)
>         addi sp, sp, -(PT_SIZE_ON_STACK)
> @@ -158,11 +162,15 @@ SYM_CODE_START(handle_exception)
>         REG_S s4, PT_CAUSE(sp)
>         REG_S s5, PT_TP(sp)
>
> -       /*
> -        * Set the scratch register to 0, so that if a recursive exception
> -        * occurs, the exception vector knows it came from the kernel
> -        */
> -       csrw CSR_SCRATCH, x0
> +       REG_L s0, TASK_TI_CPU(tp)
> +       slli s0, s0, 3
> +       la s1, __per_cpu_offset
> +       add s1, s1, s0
> +       REG_L s1, 0(s1)
> +
> +       csrw CSR_SCRATCH, s1

This patch cleverly differentiates whether an exception originates
from user mode or kernel mode. However, there's still an issue with
using CSR_SCRATCH: each time handle_exception() is called, the
following instructions must be executed:

REG_L s0, TASK_TI_CPU(tp)
slli s0, s0, 3
la s1, __per_cpu_offset
add s1, s1, s0
REG_L s1, 0(s1)
csrw CSR_SCRATCH, s1

Should we consider adding a dedicated CSR (e.g., CSR_SCRATCH2) to
store the percpu offset instead?
See: https://lists.riscv.org/g/tech-privileged/topic/113437553#msg2506
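
The offset would then live in the dedicated CSR permanently and
handle_exception() would not need to reload it at all; percpu.h would
reduce to roughly the following (a sketch only, CSR_SCRATCH2 being the
proposed, not-yet-ratified name):

static inline void set_my_cpu_offset(unsigned long off)
{
	csr_write(CSR_SCRATCH2, off);
}

#define __my_cpu_offset csr_read(CSR_SCRATCH2)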


> [...]

Thanks,
Yunhui


* Re: [External] [PATCH] RISC-V: store percpu offset in CSR_SCRATCH
  2025-07-08 10:07   ` [External] " yunhui cui
@ 2025-07-08 11:10     ` Radim Krčmář
  2025-07-09 11:42       ` yunhui cui
  0 siblings, 1 reply; 11+ messages in thread
From: Radim Krčmář @ 2025-07-08 11:10 UTC (permalink / raw)
  To: yunhui cui
  Cc: masahiroy, nathan, nicolas.schier, dennis, tj, cl, paul.walmsley,
	palmer, aou, alex, andybnac, bjorn, cyrilbur, rostedt, puranjay,
	ben.dooks, zhangchunyan, ruanjinjie, jszhang, charlie, cleger,
	antonb, ajones, debug, haibo1.xu, samuel.holland, linux-kbuild,
	linux-kernel, linux-mm, linux-riscv, linux-riscv, wangziang.ok

2025-07-08T18:07:27+08:00, yunhui cui <cuiyunhui@bytedance.com>:
> This patch cleverly differentiates whether an exception originates
> from user mode or kernel mode. However, there's still an issue with
> using CSR_SCRATCH: each time handle_exception() is called, the
> following instructions must be executed:
>
> REG_L s0, TASK_TI_CPU(tp)
> slli s0, s0, 3
> la s1, __per_cpu_offset
> add s1, s1, s0
> REG_L s1, 0(s1)
> csrw CSR_SCRATCH, s1

We can minimize the cost at exception entry by storing the precomputed
offset in thread_info, which bloats the struct, and also incurs update
cost on cpu migration, but should still be a net performance gain.

The minimal code at exception entry would be:

  REG_L s0, TASK_TI_PERCPU_OFFSET(tp)
  csrw CSR_SCRATCH, s0

> Should we consider adding a dedicated CSR (e.g., CSR_SCRATCH2) to
> store the percpu offset instead?
> See: https://lists.riscv.org/g/tech-privileged/topic/113437553#msg2506

It would be nice to gather more data on the CSR_SCRATCH approach.
Basically, the overhead of "REG_L s0, TASK_TI_PERCPU_OFFSET(tp)".
(Or the longer sequence if we think it is worth it.)

Can you benchmark the patch after reverting percpu.h, so we include the
overhead of switching CSR_SCRATCH, but without any benefits provided by
the per-cpu offset?
The baseline would be the patch with reverted percpu.h, and reverted the
sequence that sets the CSR_SCRATCH in handle_exception, so we roughly
estimate the benefit of adding CSR_SCRATCH2.

The CSR_SCRATCH2 does add overhead to hardware, and to domain context
switches, and we also have to do something else for a few years anyway,
because it's not even ratified...  It's possible we might not benefit
enough from CSR_SCRATCH2 to make a good case for it.

Thanks.


* Re: [External] [PATCH] RISC-V: store percpu offset in CSR_SCRATCH
  2025-07-08 11:10     ` Radim Krčmář
@ 2025-07-09 11:42       ` yunhui cui
  2025-07-09 14:20         ` Radim Krčmář
  0 siblings, 1 reply; 11+ messages in thread
From: yunhui cui @ 2025-07-09 11:42 UTC (permalink / raw)
  To: Radim Krčmář
  Cc: masahiroy, nathan, nicolas.schier, dennis, tj, cl, paul.walmsley,
	palmer, aou, alex, andybnac, bjorn, cyrilbur, rostedt, puranjay,
	ben.dooks, zhangchunyan, ruanjinjie, jszhang, charlie, cleger,
	antonb, ajones, debug, haibo1.xu, samuel.holland, linux-kbuild,
	linux-kernel, linux-mm, linux-riscv, linux-riscv, wangziang.ok

Hi Radim,

On Tue, Jul 8, 2025 at 7:10 PM Radim Krčmář <rkrcmar@ventanamicro.com> wrote:
>
> 2025-07-08T18:07:27+08:00, yunhui cui <cuiyunhui@bytedance.com>:
> > This patch cleverly differentiates whether an exception originates
> > from user mode or kernel mode. However, there's still an issue with
> > using CSR_SCRATCH: each time handle_exception() is called, the
> > following instructions must be executed:
> >
> > REG_L s0, TASK_TI_CPU(tp)
> > slli s0, s0, 3
> > la s1, __per_cpu_offset
> > add s1, s1, s0
> > REG_L s1, 0(s1)
> > csrw CSR_SCRATCH, s1
>
> We can minimize the cost at exception entry by storing the precomputed
> offset in thread_info, which bloats the struct, and also incurs update
> cost on cpu migration, but should still be a net performance gain.
>
> The minimal code at exception entry would be:
>
>   REG_L s0, TASK_TI_PERCPU_OFFSET(tp)
>   csrw CSR_SCRATCH, s0
>
> > Should we consider adding a dedicated CSR (e.g., CSR_SCRATCH2) to
> > store the percpu offset instead?
> > See: https://lists.riscv.org/g/tech-privileged/topic/113437553#msg2506
>
> It would be nice to gather more data on the CSR_SCRATCH approach.
> Basically, the overhead of "REG_L s0, TASK_TI_PERCPU_OFFSET(tp)".
> (Or the longer sequence if we think it is worth it.)
>
> Can you benchmark the patch after reverting percpu.h, so we include the
> overhead of switching CSR_SCRATCH, but without any benefits provided by
> the per-cpu offset?
> The baseline would be the patch with reverted percpu.h, and reverted the
> sequence that sets the CSR_SCRATCH in handle_exception, so we roughly
> estimate the benefit of adding CSR_SCRATCH2.
>
> The CSR_SCRATCH2 does add overhead to hardware, and to domain context
> switches, and we also have to do something else for a few years anyway,
> because it's not even ratified...  It's possible we might not benefit
> enough from CSR_SCRATCH2 to make a good case for it.
>
> Thanks.

Bench platform: Spacemit(R) X60
No changes:
6.77, 6.791, 6.792, 6.826, 6.784, 6.839, 6.776, 6.733, 6.795, 6.763
Geometric mean: 6.786839305
Reusing the current scratch:
7.085, 7.09, 7.021, 7.089, 7.068, 7.034, 7.06, 7.062, 7.065, 7.051
Geometric mean: 7.062466876
A degradation of approximately 4.06% is observed. The likely cause is
that CSR_TVEC is rewritten on every kernel/user exception entry and
exit.
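
(7.062466876 / 6.786839305 ≈ 1.0406, hence the 4.06% figure.)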

The following patch omits the percpu optimization; it measures only the
overhead of splitting exception handling into separate kernel and user
entry points.

---
 arch/riscv/kernel/entry.S | 39 ++++++++++++++++++++++-----------------
 arch/riscv/kernel/head.S  |  7 +------
 2 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S
index 9d1a305d5508..cc2fd4cd54a0 100644
--- a/arch/riscv/kernel/entry.S
+++ b/arch/riscv/kernel/entry.S
@@ -19,17 +19,8 @@

 	.section .irqentry.text, "ax"

-SYM_CODE_START(handle_exception)
-	/*
-	 * If coming from userspace, preserve the user thread pointer and load
-	 * the kernel thread pointer.  If we came from the kernel, the scratch
-	 * register will contain 0, and we should continue on the current TP.
-	 */
-	csrrw tp, CSR_SCRATCH, tp
-	bnez tp, .Lsave_context
-
-.Lrestore_kernel_tpsp:
-	csrr tp, CSR_SCRATCH
+SYM_CODE_START(handle_kernel_exception)
+	csrw CSR_SCRATCH, tp
 	REG_S sp, TASK_TI_KERNEL_SP(tp)

 #ifdef CONFIG_VMAP_STACK
@@ -40,7 +31,20 @@ SYM_CODE_START(handle_exception)
 	REG_L sp, TASK_TI_KERNEL_SP(tp)
 #endif

-.Lsave_context:
+	j handle_exception
+SYM_CODE_END(handle_kernel_exception)
+
+SYM_CODE_START(handle_user_exception)
+	/*
+	 * If coming from userspace, preserve the user thread pointer and load
+	 * the kernel thread pointer.
+	 */
+	csrrw tp, CSR_SCRATCH, tp
+	j handle_exception
+
+SYM_CODE_END(handle_user_exception)
+
+SYM_CODE_START_NOALIGN(handle_exception)
 	REG_S sp, TASK_TI_USER_SP(tp)
 	REG_L sp, TASK_TI_KERNEL_SP(tp)
 	addi sp, sp, -(PT_SIZE_ON_STACK)
@@ -71,11 +75,8 @@ SYM_CODE_START(handle_exception)
 	REG_S s4, PT_CAUSE(sp)
 	REG_S s5, PT_TP(sp)

-	/*
-	 * Set the scratch register to 0, so that if a recursive exception
-	 * occurs, the exception vector knows it came from the kernel
-	 */
-	csrw CSR_SCRATCH, x0
+	la s1, handle_kernel_exception
+	csrw CSR_TVEC, s1

 	/* Load the global pointer */
 	load_global_pointer
@@ -141,6 +142,10 @@ SYM_CODE_START_NOALIGN(ret_from_exception)
 	 * structures again.
 	 */
 	csrw CSR_SCRATCH, tp
+
+	la a0, handle_user_exception
+	csrw CSR_TVEC, a0
+
 1:
 #ifdef CONFIG_RISCV_ISA_V_PREEMPTIVE
 	move a0, sp
diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S
index a2e2f0dd3899..992acec3bc87 100644
--- a/arch/riscv/kernel/head.S
+++ b/arch/riscv/kernel/head.S
@@ -172,14 +172,9 @@ secondary_start_sbi:
 .align 2
 .Lsetup_trap_vector:
 	/* Set trap vector to exception handler */
-	la a0, handle_exception
+	la a0, handle_kernel_exception
 	csrw CSR_TVEC, a0

-	/*
-	 * Set sup0 scratch register to 0, indicating to exception vector that
-	 * we are presently executing in kernel.
-	 */
-	csrw CSR_SCRATCH, zero
 	ret

 .align 2
--
2.43.0


Thanks,
Yunhui


* Re: [External] [PATCH] RISC-V: store percpu offset in CSR_SCRATCH
  2025-07-09 11:42       ` yunhui cui
@ 2025-07-09 14:20         ` Radim Krčmář
  2025-07-10  3:45           ` yunhui cui
  0 siblings, 1 reply; 11+ messages in thread
From: Radim Krčmář @ 2025-07-09 14:20 UTC (permalink / raw)
  To: yunhui cui
  Cc: masahiroy, nathan, nicolas.schier, dennis, tj, cl, paul.walmsley,
	palmer, aou, alex, andybnac, bjorn, cyrilbur, rostedt, puranjay,
	ben.dooks, zhangchunyan, ruanjinjie, jszhang, charlie, cleger,
	antonb, ajones, debug, haibo1.xu, samuel.holland, linux-kbuild,
	linux-kernel, linux-mm, linux-riscv, linux-riscv, wangziang.ok

2025-07-09T19:42:26+08:00, yunhui cui <cuiyunhui@bytedance.com>:
> Bench platform: Spacemit(R) X60
> No changes:
> 6.77, 6.791, 6.792, 6.826, 6.784, 6.839, 6.776, 6.733, 6.795, 6.763
> Geometric mean: 6.786839305
> Reusing the current scratch:
> 7.085, 7.09, 7.021, 7.089, 7.068, 7.034, 7.06, 7.062, 7.065, 7.051
> Geometric mean: 7.062466876

Great results.

> A degradation of approximately 4.06% is observed. The likely cause is
> that CSR_TVEC is rewritten on every kernel/user exception entry and
> exit.

I assume the same.

> The following patch omits the percpu optimization; it measures only the
> overhead of splitting exception handling into separate kernel and user
> entry points.

Is the overhead above with this patch?  And when we then use the
CSR_SCRATCH for percpu, does it degrade even further?

Thanks.


* Re: [External] [PATCH] RISC-V: store percpu offset in CSR_SCRATCH
  2025-07-09 14:20         ` Radim Krčmář
@ 2025-07-10  3:45           ` yunhui cui
  2025-07-10  6:35             ` Radim Krčmář
  0 siblings, 1 reply; 11+ messages in thread
From: yunhui cui @ 2025-07-10  3:45 UTC (permalink / raw)
  To: Radim Krčmář
  Cc: masahiroy, nathan, nicolas.schier, dennis, tj, cl, paul.walmsley,
	palmer, aou, alex, andybnac, bjorn, cyrilbur, rostedt, puranjay,
	ben.dooks, zhangchunyan, ruanjinjie, jszhang, charlie, cleger,
	antonb, ajones, debug, haibo1.xu, samuel.holland, linux-kbuild,
	linux-kernel, linux-mm, linux-riscv, linux-riscv, wangziang.ok

Hi Radim,

On Wed, Jul 9, 2025 at 10:20 PM Radim Krčmář <rkrcmar@ventanamicro.com> wrote:
>
> 2025-07-09T19:42:26+08:00, yunhui cui <cuiyunhui@bytedance.com>:
> > Bench platform: Spacemit(R) X60
> > No changes:
> > 6.77, 6.791, 6.792, 6.826, 6.784, 6.839, 6.776, 6.733, 6.795, 6.763
> > Geometric mean: 6.786839305
> > Reusing the current scratch:
> > 7.085, 7.09, 7.021, 7.089, 7.068, 7.034, 7.06, 7.062, 7.065, 7.051
> > Geometric mean: 7.062466876
>
> Great results.
>
> > A degradation of approximately 4.06% is observed. The likely cause is
> > that CSR_TVEC is rewritten on every kernel/user exception entry and
> > exit.
>
> I assume the same.
>
> > The following patch omits the percpu optimization; it measures only the
> > overhead of splitting exception handling into separate kernel and user
> > entry points.
>
> Is the overhead above with this patch?  And when we then use the
> CSR_SCRATCH for percpu, does it degrade even further?
>
> Thanks.


From the fixed-register method we already know that the per-CPU offset
optimization is worth around 2.5%, so we can take 2.5% as the expected
gain. Given that, is it still necessary to add the per-CPU optimization
logic on top of the scratch patch for testing?

Reference: https://lists.riscv.org/g/tech-privileged/message/2485


Thanks,
Yunhui


* Re: [External] [PATCH] RISC-V: store percpu offset in CSR_SCRATCH
  2025-07-10  3:45           ` yunhui cui
@ 2025-07-10  6:35             ` Radim Krčmář
  2025-07-10 11:47               ` yunhui cui
  0 siblings, 1 reply; 11+ messages in thread
From: Radim Krčmář @ 2025-07-10  6:35 UTC (permalink / raw)
  To: yunhui cui
  Cc: masahiroy, nathan, nicolas.schier, dennis, tj, cl, paul.walmsley,
	palmer, aou, alex, andybnac, bjorn, cyrilbur, rostedt, puranjay,
	ben.dooks, zhangchunyan, ruanjinjie, jszhang, charlie, cleger,
	antonb, ajones, debug, haibo1.xu, samuel.holland, linux-kbuild,
	linux-kernel, linux-mm, linux-riscv, linux-riscv, wangziang.ok

2025-07-10T11:45:06+08:00, yunhui cui <cuiyunhui@bytedance.com>:
> On Wed, Jul 9, 2025 at 10:20 PM Radim Krčmář <rkrcmar@ventanamicro.com> wrote:
>> Is the overhead above with this patch?  And when we then use the
>> CSR_SCRATCH for percpu, does it degrade even further?
>
> From the fixed-register method we already know that the per-CPU offset
> optimization is worth around 2.5%, so we can take 2.5% as the expected
> gain. Given that, is it still necessary to add the per-CPU optimization
> logic on top of the scratch patch for testing?
>
> Reference: https://lists.riscv.org/g/tech-privileged/message/2485

That is when the value is in a GPR, though, and we don't know the
performance of a CSR_SCRATCH access.
We can hope that it's not much worse than a GPR, but an implementation
might choose to be very slow with CSR_SCRATCH.

I have in mind another method where we can use the current CSR_SCRATCH
without changing CSR_TVEC, but I don't really want to spend time on it
if reading the CSR doesn't give any benefit.

It would be to store the percpu offset in CSR_SCRATCH permanently, do
the early exception register shuffling through a per-CPU save area, and
load the thread pointer from there as well.
That method would also eliminate writing CSR_SCRATCH on every exception
entry+exit, so maybe it makes sense to try it even if CSRs are slow...
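
For example, the entry path could shuffle registers through a per-CPU save
area along these lines (a rough sketch only; it assumes CSR_SCRATCH holds
the address of this CPU's save slots, so the offset itself is one load
away, and the PCPU_SAVE_* offsets are hypothetical):

	csrrw tp, CSR_SCRATCH, tp           /* tp = this CPU's save area */
	REG_S s0, PCPU_SAVE_S0(tp)          /* spill s0 to get a work register */
	csrrw s0, CSR_SCRATCH, tp           /* s0 = interrupted tp, SCRATCH restored */
	REG_S s0, PCPU_SAVE_TP(tp)          /* stash the interrupted tp */
	REG_L tp, PCPU_SAVE_KERNEL_TP(tp)   /* kernel tp, kept current on switch */
	/* ... save pt_regs as before; s0 is restored from the save area later ... */

Because the third instruction already puts the save-area address back into
SCRATCH, the exit path would not have to write the CSR at all.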

Thanks.


* Re: [External] [PATCH] RISC-V: store percpu offset in CSR_SCRATCH
  2025-07-10  6:35             ` Radim Krčmář
@ 2025-07-10 11:47               ` yunhui cui
  2025-07-10 16:40                 ` [PATCH] RISC-V: store precomputed percpu_offset in the task struct Radim Krčmář
  0 siblings, 1 reply; 11+ messages in thread
From: yunhui cui @ 2025-07-10 11:47 UTC (permalink / raw)
  To: Radim Krčmář
  Cc: masahiroy, nathan, nicolas.schier, dennis, tj, cl, paul.walmsley,
	palmer, aou, alex, andybnac, bjorn, cyrilbur, rostedt, puranjay,
	ben.dooks, zhangchunyan, ruanjinjie, jszhang, charlie, cleger,
	antonb, ajones, debug, haibo1.xu, samuel.holland, linux-kbuild,
	linux-kernel, linux-mm, linux-riscv, linux-riscv, wangziang.ok

Hi Radim,

On Thu, Jul 10, 2025 at 2:35 PM Radim Krčmář <rkrcmar@ventanamicro.com> wrote:
>
> 2025-07-10T11:45:06+08:00, yunhui cui <cuiyunhui@bytedance.com>:
> > On Wed, Jul 9, 2025 at 10:20 PM Radim Krčmář <rkrcmar@ventanamicro.com> wrote:
> >> Is the overhead above with this patch?  And when we then use the
> >> CSR_SCRATCH for percpu, does it degrade even further?
> >
> > From the fixed-register method we already know that the per-CPU offset
> > optimization is worth around 2.5%, so we can take 2.5% as the expected
> > gain. Given that, is it still necessary to add the per-CPU optimization
> > logic on top of the scratch patch for testing?
> >
> > Reference: https://lists.riscv.org/g/tech-privileged/message/2485
>
> That is when the value is in a GPR, though, and we don't know the
> performance of a CSR_SCRATCH access.
> We can hope that it's not much worse than a GPR, but an implementation
> might choose to be very slow with CSR_SCRATCH.
>
> I have in mind another method where we can use the current CSR_SCRATCH
> without changing CSR_TVEC, but I don't really want to spend time on it
> if reading the CSR doesn't give any benefit.
>
> It would be to store the percpu offset in CSR_SCRATCH permanently, do
> the early exception register shuffling through a per-CPU save area, and
> load the thread pointer from there as well.
> That method would also eliminate writing CSR_SCRATCH on every exception
> entry+exit, so maybe it makes sense to try it even if CSRs are slow...
>
> Thanks.


On top of that patch, the per-CPU offset optimization has been added,
with the following data:
6.989 7.046 6.976 6.986 7.001 7.017 7.007 7.064 7.008 7.039
Geometric mean: 7.013248303
Compared to reusing the scratch register alone, performance improves by
approximately 0.7%.
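
((7.062466876 - 7.013248303) / 7.062466876 ≈ 0.0070, hence the 0.7% figure.)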

If more optimizations can be made to the scratch register, there
should be further performance improvements.

Patch:
---
 arch/riscv/include/asm/percpu.h | 14 ++++++++++++++
 arch/riscv/kernel/asm-offsets.c |  1 +
 arch/riscv/kernel/entry.S       |  7 +++++++
 arch/riscv/kernel/smpboot.c     |  3 +++
 4 files changed, 25 insertions(+)
 create mode 100644 arch/riscv/include/asm/percpu.h

diff --git a/arch/riscv/include/asm/percpu.h b/arch/riscv/include/asm/percpu.h
new file mode 100644
index 000000000000..1fbfcb108f84
--- /dev/null
+++ b/arch/riscv/include/asm/percpu.h
@@ -0,0 +1,14 @@
+#ifndef __ASM_PERCPU_H
+#define __ASM_PERCPU_H
+
+static inline void set_my_cpu_offset(unsigned long off)
+{
+	csr_write(CSR_SCRATCH, off);
+}
+
+#define __my_cpu_offset csr_read(CSR_SCRATCH)
+
+#include <asm-generic/percpu.h>
+
+#endif
+
diff --git a/arch/riscv/kernel/asm-offsets.c b/arch/riscv/kernel/asm-offsets.c
index a03129f40c46..0ce96f30bf32 100644
--- a/arch/riscv/kernel/asm-offsets.c
+++ b/arch/riscv/kernel/asm-offsets.c
@@ -35,6 +35,7 @@ void asm_offsets(void)
 	OFFSET(TASK_THREAD_S9, task_struct, thread.s[9]);
 	OFFSET(TASK_THREAD_S10, task_struct, thread.s[10]);
 	OFFSET(TASK_THREAD_S11, task_struct, thread.s[11]);
+	OFFSET(TASK_TI_CPU, task_struct, thread_info.cpu);
 	OFFSET(TASK_TI_FLAGS, task_struct, thread_info.flags);
 	OFFSET(TASK_TI_PREEMPT_COUNT, task_struct, thread_info.preempt_count);
 	OFFSET(TASK_TI_KERNEL_SP, task_struct, thread_info.kernel_sp);
diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S
index cc2fd4cd54a0..82caeee91c15 100644
--- a/arch/riscv/kernel/entry.S
+++ b/arch/riscv/kernel/entry.S
@@ -75,6 +75,13 @@ SYM_CODE_START_NOALIGN(handle_exception)
 	REG_S s4, PT_CAUSE(sp)
 	REG_S s5, PT_TP(sp)

+	REG_L s0, TASK_TI_CPU(tp)
+	slli s0, s0, 3
+	la s1, __per_cpu_offset
+	add s1, s1, s0
+	REG_L s1, 0(s1)
+	csrw CSR_SCRATCH, s1
+
 	la s1, handle_kernel_exception
 	csrw CSR_TVEC, s1

diff --git a/arch/riscv/kernel/smpboot.c b/arch/riscv/kernel/smpboot.c
index fb6ab7f8bfbd..6fa12cc84523 100644
--- a/arch/riscv/kernel/smpboot.c
+++ b/arch/riscv/kernel/smpboot.c
@@ -43,6 +43,7 @@ static DECLARE_COMPLETION(cpu_running);

 void __init smp_prepare_boot_cpu(void)
 {
+	set_my_cpu_offset(per_cpu_offset(smp_processor_id()));
 }

 void __init smp_prepare_cpus(unsigned int max_cpus)
@@ -240,6 +241,8 @@ asmlinkage __visible void smp_callin(void)
 	mmgrab(mm);
 	current->active_mm = mm;

+	set_my_cpu_offset(per_cpu_offset(curr_cpuid));
+
 	store_cpu_topology(curr_cpuid);
 	notify_cpu_starting(curr_cpuid);

--
2.43.0

Thanks,
Yunhui


* [PATCH] RISC-V: store precomputed percpu_offset in the task struct
  2025-07-10 11:47               ` yunhui cui
@ 2025-07-10 16:40                 ` Radim Krčmář
  0 siblings, 0 replies; 11+ messages in thread
From: Radim Krčmář @ 2025-07-10 16:40 UTC (permalink / raw)
  To: yunhui cui
  Cc: masahiroy, nathan, nicolas.schier, dennis, tj, cl, paul.walmsley,
	palmer, aou, alex, andybnac, bjorn, cyrilbur, rostedt, puranjay,
	ben.dooks, zhangchunyan, ruanjinjie, jszhang, charlie, cleger,
	antonb, ajones, debug, haibo1.xu, samuel.holland, linux-kbuild,
	linux-kernel, linux-mm, linux-riscv, linux-riscv, wangziang.ok

2025-07-10T19:47:27+08:00, yunhui cui <cuiyunhui@bytedance.com>:
> On Thu, Jul 10, 2025 at 2:35 PM Radim Krčmář <rkrcmar@ventanamicro.com> wrote:
>> It would be to store the percpu offset in CSR_SCRATCH permanently, do
>> the early exception register shuffling through a per-CPU save area, and
>> load the thread pointer from there as well.
>> That method would also eliminate writing CSR_SCRATCH on every exception
>> entry+exit, so maybe it makes sense to try it even if CSRs are slow...
>
> On top of that patch, the per-CPU offset optimization has been added,
> with the following data:
> 6.989 7.046 6.976 6.986 7.001 7.017 7.007 7.064 7.008 7.039
> Geometric mean: 7.013248303
> Compared to reusing the scratch register alone, performance improves by
> approximately 0.7%.

Nice, thanks.  The CSR_SCRATCH accesses seem much slower than GPRs, and
possibly even slower than L1 hit -- we might gain more by storing the
precomputed offset in the task struct.

Can you check this patch as well?

(It should be compared against a variant of CSR_SCRATCH that uses the
 TASK_TI_PERCPU_OFFSET optimizations, but we can try to interpolate. :])

---8<---
RISC-V: store precomputed percpu_offset in the task struct

Exploring the memoization trade-off... hoping that __set_task_cpu covers
everything. :)

I didn't put any thought into where the percpu_offset should live, and
the naive approach is to put it next to cpu.
This needs more work to not break build on other arches, because I
directly added RISC-V specific code to __set_task_cpu, to save time
figuring out where else it could be.
---
 arch/riscv/include/asm/asm.h         | 6 +-----
 arch/riscv/include/asm/percpu.h      | 8 ++++++++
 arch/riscv/include/asm/thread_info.h | 3 ++-
 arch/riscv/kernel/asm-offsets.c      | 1 +
 arch/riscv/kernel/smpboot.c          | 6 ++++++
 kernel/sched/sched.h                 | 1 +
 6 files changed, 19 insertions(+), 6 deletions(-)
 create mode 100644 arch/riscv/include/asm/percpu.h

diff --git a/arch/riscv/include/asm/asm.h b/arch/riscv/include/asm/asm.h
index a8a2af6dfe9d..2a6b831d9cdf 100644
--- a/arch/riscv/include/asm/asm.h
+++ b/arch/riscv/include/asm/asm.h
@@ -91,11 +91,7 @@
 #endif
 
 .macro asm_per_cpu dst sym tmp
-	REG_L \tmp, TASK_TI_CPU_NUM(tp)
-	slli  \tmp, \tmp, PER_CPU_OFFSET_SHIFT
-	la    \dst, __per_cpu_offset
-	add   \dst, \dst, \tmp
-	REG_L \tmp, 0(\dst)
+	REG_L \tmp, TASK_TI_PERCPU_OFFSET(tp)
 	la    \dst, \sym
 	add   \dst, \dst, \tmp
 .endm
diff --git a/arch/riscv/include/asm/percpu.h b/arch/riscv/include/asm/percpu.h
new file mode 100644
index 000000000000..c37a0fce6ebc
--- /dev/null
+++ b/arch/riscv/include/asm/percpu.h
@@ -0,0 +1,8 @@
+#ifndef __ASM_PERCPU_H
+#define __ASM_PERCPU_H
+
+#define __my_cpu_offset (current_thread_info()->percpu_offset)
+
+#include <asm-generic/percpu.h>
+
+#endif
diff --git a/arch/riscv/include/asm/thread_info.h b/arch/riscv/include/asm/thread_info.h
index f5916a70879a..da776b7a1d02 100644
--- a/arch/riscv/include/asm/thread_info.h
+++ b/arch/riscv/include/asm/thread_info.h
@@ -60,8 +60,9 @@ struct thread_info {
 	 */
 	long			kernel_sp;	/* Kernel stack pointer */
 	long			user_sp;	/* User stack pointer */
-	int			cpu;
+	int			cpu;		// TODO: could be packed better
 	unsigned long		syscall_work;	/* SYSCALL_WORK_ flags */
+	unsigned long		percpu_offset;	// XXX: randomly placed here
 #ifdef CONFIG_SHADOW_CALL_STACK
 	void			*scs_base;
 	void			*scs_sp;
diff --git a/arch/riscv/kernel/asm-offsets.c b/arch/riscv/kernel/asm-offsets.c
index 6e8c0d6feae9..9c7bb4d7e3b3 100644
--- a/arch/riscv/kernel/asm-offsets.c
+++ b/arch/riscv/kernel/asm-offsets.c
@@ -50,6 +50,7 @@ void asm_offsets(void)
 #endif
 
 	OFFSET(TASK_TI_CPU_NUM, task_struct, thread_info.cpu);
+	OFFSET(TASK_TI_PERCPU_OFFSET, task_struct, thread_info.percpu_offset);
 	OFFSET(TASK_THREAD_F0,  task_struct, thread.fstate.f[0]);
 	OFFSET(TASK_THREAD_F1,  task_struct, thread.fstate.f[1]);
 	OFFSET(TASK_THREAD_F2,  task_struct, thread.fstate.f[2]);
diff --git a/arch/riscv/kernel/smpboot.c b/arch/riscv/kernel/smpboot.c
index 601a321e0f17..3c09c8f3e30c 100644
--- a/arch/riscv/kernel/smpboot.c
+++ b/arch/riscv/kernel/smpboot.c
@@ -41,6 +41,11 @@
 
 static DECLARE_COMPLETION(cpu_running);
 
+void __init smp_prepare_boot_cpu(void)
+{
+	current_thread_info()->percpu_offset = per_cpu_offset(smp_processor_id());
+}
+
 void __init smp_prepare_cpus(unsigned int max_cpus)
 {
 	int cpuid;
@@ -183,6 +188,7 @@ int __cpu_up(unsigned int cpu, struct task_struct *tidle)
 {
 	int ret = 0;
 	tidle->thread_info.cpu = cpu;
+	tidle->thread_info.percpu_offset = per_cpu_offset(cpu);
 
 	ret = start_secondary_cpu(cpu, tidle);
 	if (!ret) {
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 475bb5998295..2180a85b1403 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2199,6 +2199,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
 	 */
 	smp_wmb();
 	WRITE_ONCE(task_thread_info(p)->cpu, cpu);
+	WRITE_ONCE(task_thread_info(p)->percpu_offset, per_cpu_offset(cpu));
 	p->wake_cpu = cpu;
 #endif
 }


Thread overview: 11+ messages
2025-07-04  8:45 [PATCH RFC] RISC-V: Fix a register to store the percpu offset Yunhui Cui
2025-07-07  7:55 ` Clément Léger
2025-07-07 12:50 ` [PATCH] RISC-V: store percpu offset in CSR_SCRATCH Radim Krčmář
2025-07-08 10:07   ` [External] " yunhui cui
2025-07-08 11:10     ` Radim Krčmář
2025-07-09 11:42       ` yunhui cui
2025-07-09 14:20         ` Radim Krčmář
2025-07-10  3:45           ` yunhui cui
2025-07-10  6:35             ` Radim Krčmář
2025-07-10 11:47               ` yunhui cui
2025-07-10 16:40                 ` [PATCH] RISC-V: store precomputed percpu_offset in the task struct Radim Krčmář
