linux-kernel.vger.kernel.org archive mirror
* [PATCH RFC 1/2] riscv: remove irqflags.h inclusion in asm/bitops.h
@ 2025-06-18  3:43 Yunhui Cui
  2025-06-18  3:43 ` [PATCH RFC 2/2] riscv: introduce percpu.h into include/asm Yunhui Cui
  2025-06-21 14:33 ` [PATCH RFC 1/2] riscv: remove irqflags.h inclusion in asm/bitops.h Yury Norov
  0 siblings, 2 replies; 10+ messages in thread
From: Yunhui Cui @ 2025-06-18  3:43 UTC (permalink / raw)
  To: yury.norov, linux, paul.walmsley, palmer, aou, alex, linux-riscv,
	linux-kernel, dennis, tj, cl, linux-mm
  Cc: Yunhui Cui

arch/riscv/include/asm/bitops.h does not functionally require
including <linux/irqflags.h>. Additionally, adding
arch/riscv/include/asm/percpu.h causes a circular inclusion:
kernel/bounds.c
->include/linux/log2.h
->include/linux/bitops.h
->arch/riscv/include/asm/bitops.h
->include/linux/irqflags.h
->include/linux/find.h
->return val ? __ffs(val) : size;
->arch/riscv/include/asm/bitops.h

The compilation log is as follows:
CC      kernel/bounds.s
In file included from ./include/linux/bitmap.h:11,
               from ./include/linux/cpumask.h:12,
               from ./arch/riscv/include/asm/processor.h:55,
               from ./arch/riscv/include/asm/thread_info.h:42,
               from ./include/linux/thread_info.h:60,
               from ./include/asm-generic/preempt.h:5,
               from ./arch/riscv/include/generated/asm/preempt.h:1,
               from ./include/linux/preempt.h:79,
               from ./arch/riscv/include/asm/percpu.h:8,
               from ./include/linux/irqflags.h:19,
               from ./arch/riscv/include/asm/bitops.h:14,
               from ./include/linux/bitops.h:68,
               from ./include/linux/log2.h:12,
               from kernel/bounds.c:13:
./include/linux/find.h: In function 'find_next_bit':
./include/linux/find.h:66:30: error: implicit declaration of function '__ffs' [-Wimplicit-function-declaration]
   66 |                 return val ? __ffs(val) : size;
      |                              ^~~~~

Signed-off-by: Yunhui Cui <cuiyunhui@bytedance.com>
---
 arch/riscv/include/asm/bitops.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/riscv/include/asm/bitops.h b/arch/riscv/include/asm/bitops.h
index d59310f74c2ba..d9837b3cf05fe 100644
--- a/arch/riscv/include/asm/bitops.h
+++ b/arch/riscv/include/asm/bitops.h
@@ -11,7 +11,6 @@
 #endif /* _LINUX_BITOPS_H */
 
 #include <linux/compiler.h>
-#include <linux/irqflags.h>
 #include <asm/barrier.h>
 #include <asm/bitsperlong.h>
 
-- 
2.39.5


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH RFC 2/2] riscv: introduce percpu.h into include/asm
  2025-06-18  3:43 [PATCH RFC 1/2] riscv: remove irqflags.h inclusion in asm/bitops.h Yunhui Cui
@ 2025-06-18  3:43 ` Yunhui Cui
  2025-07-17 13:04   ` Alexandre Ghiti
  2025-06-21 14:33 ` [PATCH RFC 1/2] riscv: remove irqflags.h inclusion in asm/bitops.h Yury Norov
  1 sibling, 1 reply; 10+ messages in thread
From: Yunhui Cui @ 2025-06-18  3:43 UTC (permalink / raw)
  To: yury.norov, linux, paul.walmsley, palmer, aou, alex, linux-riscv,
	linux-kernel, dennis, tj, cl, linux-mm
  Cc: Yunhui Cui

Current percpu operations rely on the generic implementations, where
raw_local_irq_save() introduces substantial overhead. Optimize them by
using AMO instructions and disabling preemption instead of interrupts.
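
As a rough illustration (not part of the diff below; kernel context and
RV64 assumed, the helper names here are made up), the difference between
the two approaches for a 64-bit per-CPU add looks like this:

/* Generic fallback: a plain read-modify-write under IRQ-off. */
static inline void generic_style_this_cpu_add(u64 *pcp, u64 val)
{
	unsigned long flags;

	raw_local_irq_save(flags);	/* costly CSR round trip */
	*pcp += val;
	raw_local_irq_restore(flags);
}

/* Roughly what this patch does: a single AMO, with only preemption
 * disabled so the access stays on the owning CPU while interrupts
 * remain enabled. */
static inline void amo_style_this_cpu_add(u64 *pcp, u64 val)
{
	preempt_disable_notrace();
	asm volatile("amoadd.d zero, %1, %0"
		     : "+A" (*pcp)
		     : "r" (val)
		     : "memory");
	preempt_enable_notrace();
}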

Signed-off-by: Yunhui Cui <cuiyunhui@bytedance.com>
---
 arch/riscv/include/asm/percpu.h | 138 ++++++++++++++++++++++++++++++++
 1 file changed, 138 insertions(+)
 create mode 100644 arch/riscv/include/asm/percpu.h

diff --git a/arch/riscv/include/asm/percpu.h b/arch/riscv/include/asm/percpu.h
new file mode 100644
index 0000000000000..423c0d01f874c
--- /dev/null
+++ b/arch/riscv/include/asm/percpu.h
@@ -0,0 +1,138 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef __ASM_PERCPU_H
+#define __ASM_PERCPU_H
+
+#include <linux/preempt.h>
+
+#define PERCPU_RW_OPS(sz)						\
+static inline unsigned long __percpu_read_##sz(void *ptr)		\
+{									\
+	return READ_ONCE(*(u##sz *)ptr);				\
+}									\
+									\
+static inline void __percpu_write_##sz(void *ptr, unsigned long val)	\
+{									\
+	WRITE_ONCE(*(u##sz *)ptr, (u##sz)val);				\
+}
+
+#define __PERCPU_AMO_OP_CASE(sfx, name, sz, amo_insn)			\
+static inline void							\
+__percpu_##name##_amo_case_##sz(void *ptr, unsigned long val)		\
+{									\
+	asm volatile (							\
+	"amo" #amo_insn #sfx " zero, %[val], %[ptr]"			\
+	: [ptr] "+A" (*(u##sz *)ptr)					\
+	: [val] "r" ((u##sz)(val))					\
+	: "memory");							\
+}
+
+#define __PERCPU_AMO_RET_OP_CASE(sfx, name, sz, amo_insn)		\
+static inline u##sz							\
+__percpu_##name##_return_amo_case_##sz(void *ptr, unsigned long val)	\
+{									\
+	register u##sz ret;						\
+									\
+	asm volatile (							\
+	"amo" #amo_insn #sfx " %[ret], %[val], %[ptr]"			\
+	: [ptr] "+A" (*(u##sz *)ptr), [ret] "=r" (ret)			\
+	: [val] "r" ((u##sz)(val))					\
+	: "memory");							\
+									\
+	return ret + val;						\
+}
+
+#define PERCPU_OP(name, amo_insn)					\
+	__PERCPU_AMO_OP_CASE(.b, name, 8, amo_insn)			\
+	__PERCPU_AMO_OP_CASE(.h, name, 16, amo_insn)			\
+	__PERCPU_AMO_OP_CASE(.w, name, 32, amo_insn)			\
+	__PERCPU_AMO_OP_CASE(.d, name, 64, amo_insn)			\
+
+#define PERCPU_RET_OP(name, amo_insn)					\
+	__PERCPU_AMO_RET_OP_CASE(.b, name, 8, amo_insn)			\
+	__PERCPU_AMO_RET_OP_CASE(.h, name, 16, amo_insn)		\
+	__PERCPU_AMO_RET_OP_CASE(.w, name, 32, amo_insn)		\
+	__PERCPU_AMO_RET_OP_CASE(.d, name, 64, amo_insn)
+
+PERCPU_RW_OPS(8)
+PERCPU_RW_OPS(16)
+PERCPU_RW_OPS(32)
+PERCPU_RW_OPS(64)
+
+PERCPU_OP(add, add)
+PERCPU_OP(andnot, and)
+PERCPU_OP(or, or)
+PERCPU_RET_OP(add, add)
+
+#undef PERCPU_RW_OPS
+#undef __PERCPU_AMO_OP_CASE
+#undef __PERCPU_AMO_RET_OP_CASE
+#undef PERCPU_OP
+#undef PERCPU_RET_OP
+
+#define _pcp_protect(op, pcp, ...)					\
+({									\
+	preempt_disable_notrace();					\
+	op(raw_cpu_ptr(&(pcp)), __VA_ARGS__);				\
+	preempt_enable_notrace();					\
+})
+
+#define _pcp_protect_return(op, pcp, args...)				\
+({									\
+	typeof(pcp) __retval;						\
+	preempt_disable_notrace();					\
+	__retval = (typeof(pcp))op(raw_cpu_ptr(&(pcp)), ##args);	\
+	preempt_enable_notrace();					\
+	__retval;							\
+})
+
+#define this_cpu_read_1(pcp)		_pcp_protect_return(__percpu_read_8, pcp)
+#define this_cpu_read_2(pcp)		_pcp_protect_return(__percpu_read_16, pcp)
+#define this_cpu_read_4(pcp)		_pcp_protect_return(__percpu_read_32, pcp)
+#define this_cpu_read_8(pcp)		_pcp_protect_return(__percpu_read_64, pcp)
+
+#define this_cpu_write_1(pcp, val)	_pcp_protect(__percpu_write_8, pcp, (unsigned long)val)
+#define this_cpu_write_2(pcp, val)	_pcp_protect(__percpu_write_16, pcp, (unsigned long)val)
+#define this_cpu_write_4(pcp, val)	_pcp_protect(__percpu_write_32, pcp, (unsigned long)val)
+#define this_cpu_write_8(pcp, val)	_pcp_protect(__percpu_write_64, pcp, (unsigned long)val)
+
+#define this_cpu_add_1(pcp, val)	_pcp_protect(__percpu_add_amo_case_8, pcp, val)
+#define this_cpu_add_2(pcp, val)	_pcp_protect(__percpu_add_amo_case_16, pcp, val)
+#define this_cpu_add_4(pcp, val)	_pcp_protect(__percpu_add_amo_case_32, pcp, val)
+#define this_cpu_add_8(pcp, val)	_pcp_protect(__percpu_add_amo_case_64, pcp, val)
+
+#define this_cpu_add_return_1(pcp, val)		\
+_pcp_protect_return(__percpu_add_return_amo_case_8, pcp, val)
+
+#define this_cpu_add_return_2(pcp, val)		\
+_pcp_protect_return(__percpu_add_return_amo_case_16, pcp, val)
+
+#define this_cpu_add_return_4(pcp, val)		\
+_pcp_protect_return(__percpu_add_return_amo_case_32, pcp, val)
+
+#define this_cpu_add_return_8(pcp, val)		\
+_pcp_protect_return(__percpu_add_return_amo_case_64, pcp, val)
+
+#define this_cpu_and_1(pcp, val)	_pcp_protect(__percpu_andnot_amo_case_8, pcp, ~val)
+#define this_cpu_and_2(pcp, val)	_pcp_protect(__percpu_andnot_amo_case_16, pcp, ~val)
+#define this_cpu_and_4(pcp, val)	_pcp_protect(__percpu_andnot_amo_case_32, pcp, ~val)
+#define this_cpu_and_8(pcp, val)	_pcp_protect(__percpu_andnot_amo_case_64, pcp, ~val)
+
+#define this_cpu_or_1(pcp, val)	_pcp_protect(__percpu_or_amo_case_8, pcp, val)
+#define this_cpu_or_2(pcp, val)	_pcp_protect(__percpu_or_amo_case_16, pcp, val)
+#define this_cpu_or_4(pcp, val)	_pcp_protect(__percpu_or_amo_case_32, pcp, val)
+#define this_cpu_or_8(pcp, val)	_pcp_protect(__percpu_or_amo_case_64, pcp, val)
+
+#define this_cpu_xchg_1(pcp, val)	_pcp_protect_return(xchg_relaxed, pcp, val)
+#define this_cpu_xchg_2(pcp, val)	_pcp_protect_return(xchg_relaxed, pcp, val)
+#define this_cpu_xchg_4(pcp, val)	_pcp_protect_return(xchg_relaxed, pcp, val)
+#define this_cpu_xchg_8(pcp, val)	_pcp_protect_return(xchg_relaxed, pcp, val)
+
+#define this_cpu_cmpxchg_1(pcp, o, n)	_pcp_protect_return(cmpxchg_relaxed, pcp, o, n)
+#define this_cpu_cmpxchg_2(pcp, o, n)	_pcp_protect_return(cmpxchg_relaxed, pcp, o, n)
+#define this_cpu_cmpxchg_4(pcp, o, n)	_pcp_protect_return(cmpxchg_relaxed, pcp, o, n)
+#define this_cpu_cmpxchg_8(pcp, o, n)	_pcp_protect_return(cmpxchg_relaxed, pcp, o, n)
+
+#include <asm-generic/percpu.h>
+
+#endif /* __ASM_PERCPU_H */
-- 
2.39.5


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH RFC 1/2] riscv: remove irqflags.h inclusion in asm/bitops.h
  2025-06-18  3:43 [PATCH RFC 1/2] riscv: remove irqflags.h inclusion in asm/bitops.h Yunhui Cui
  2025-06-18  3:43 ` [PATCH RFC 2/2] riscv: introduce percpu.h into include/asm Yunhui Cui
@ 2025-06-21 14:33 ` Yury Norov
  1 sibling, 0 replies; 10+ messages in thread
From: Yury Norov @ 2025-06-21 14:33 UTC (permalink / raw)
  To: Yunhui Cui
  Cc: linux, paul.walmsley, palmer, aou, alex, linux-riscv,
	linux-kernel, dennis, tj, cl, linux-mm

On Wed, Jun 18, 2025 at 11:43:27AM +0800, Yunhui Cui wrote:
> The arch/riscv/include/asm/bitops.h does not functionally require
> including /linux/irqflags.h. Additionally, adding
> arch/riscv/include/asm/percpu.h causes a circular inclusion:
> kernel/bounds.c
> ->include/linux/log2.h
> ->include/linux/bitops.h
> ->arch/riscv/include/asm/bitops.h
> ->include/linux/irqflags.h
> ->include/linux/find.h
> ->return val ? __ffs(val) : size;
> ->arch/riscv/include/asm/bitops.h
> 
> The compilation log is as follows:
> CC      kernel/bounds.s
> In file included from ./include/linux/bitmap.h:11,
>                from ./include/linux/cpumask.h:12,
>                from ./arch/riscv/include/asm/processor.h:55,
>                from ./arch/riscv/include/asm/thread_info.h:42,
>                from ./include/linux/thread_info.h:60,
>                from ./include/asm-generic/preempt.h:5,
>                from ./arch/riscv/include/generated/asm/preempt.h:1,
>                from ./include/linux/preempt.h:79,
>                from ./arch/riscv/include/asm/percpu.h:8,
>                from ./include/linux/irqflags.h:19,
>                from ./arch/riscv/include/asm/bitops.h:14,
>                from ./include/linux/bitops.h:68,
>                from ./include/linux/log2.h:12,
>                from kernel/bounds.c:13:
> ./include/linux/find.h: In function 'find_next_bit':
> ./include/linux/find.h:66:30: error: implicit declaration of function '__ffs' [-Wimplicit-function-declaration]
>    66 |                 return val ? __ffs(val) : size;
>       |                              ^~~~~
> 
> Signed-off-by: Yunhui Cui <cuiyunhui@bytedance.com>

Is this patch a prerequisite for #2, or does it fix the build on master
for you?

If it fixes a broken build, I'll take it and send an -rc pull request.
Otherwise, please work with the RISC-V maintainers to move the two
patches together.

Thanks,
Yury

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH RFC 2/2] riscv: introduce percpu.h into include/asm
  2025-06-18  3:43 ` [PATCH RFC 2/2] riscv: introduce percpu.h into include/asm Yunhui Cui
@ 2025-07-17 13:04   ` Alexandre Ghiti
  2025-07-17 13:05     ` Alexandre Ghiti
  0 siblings, 1 reply; 10+ messages in thread
From: Alexandre Ghiti @ 2025-07-17 13:04 UTC (permalink / raw)
  To: Yunhui Cui, yury.norov, linux, paul.walmsley, palmer, aou,
	linux-riscv, linux-kernel, dennis, tj, cl, linux-mm

Hi Yunhui,

On 6/18/25 05:43, Yunhui Cui wrote:
> Current percpu operations rely on generic implementations, where
> raw_local_irq_save() introduces substantial overhead. Optimization
> is achieved through atomic operations and preemption disabling.
>
> Signed-off-by: Yunhui Cui <cuiyunhui@bytedance.com>
> ---
>   arch/riscv/include/asm/percpu.h | 138 ++++++++++++++++++++++++++++++++
>   1 file changed, 138 insertions(+)
>   create mode 100644 arch/riscv/include/asm/percpu.h
>
> diff --git a/arch/riscv/include/asm/percpu.h b/arch/riscv/include/asm/percpu.h
> new file mode 100644
> index 0000000000000..423c0d01f874c
> --- /dev/null
> +++ b/arch/riscv/include/asm/percpu.h
> @@ -0,0 +1,138 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +
> +#ifndef __ASM_PERCPU_H
> +#define __ASM_PERCPU_H
> +
> +#include <linux/preempt.h>
> +
> +#define PERCPU_RW_OPS(sz)						\
> +static inline unsigned long __percpu_read_##sz(void *ptr)		\
> +{									\
> +	return READ_ONCE(*(u##sz *)ptr);				\
> +}									\
> +									\
> +static inline void __percpu_write_##sz(void *ptr, unsigned long val)	\
> +{									\
> +	WRITE_ONCE(*(u##sz *)ptr, (u##sz)val);				\
> +}
> +
> +#define __PERCPU_AMO_OP_CASE(sfx, name, sz, amo_insn)			\
> +static inline void							\
> +__percpu_##name##_amo_case_##sz(void *ptr, unsigned long val)		\
> +{									\
> +	asm volatile (							\
> +	"amo" #amo_insn #sfx " zero, %[val], %[ptr]"			\
> +	: [ptr] "+A" (*(u##sz *)ptr)					\
> +	: [val] "r" ((u##sz)(val))					\
> +	: "memory");							\
> +}
> +
> +#define __PERCPU_AMO_RET_OP_CASE(sfx, name, sz, amo_insn)		\
> +static inline u##sz							\
> +__percpu_##name##_return_amo_case_##sz(void *ptr, unsigned long val)	\
> +{									\
> +	register u##sz ret;						\
> +									\
> +	asm volatile (							\
> +	"amo" #amo_insn #sfx " %[ret], %[val], %[ptr]"			\
> +	: [ptr] "+A" (*(u##sz *)ptr), [ret] "=r" (ret)			\
> +	: [val] "r" ((u##sz)(val))					\
> +	: "memory");							\
> +									\
> +	return ret + val;						\
> +}
> +
> +#define PERCPU_OP(name, amo_insn)					\
> +	__PERCPU_AMO_OP_CASE(.b, name, 8, amo_insn)			\
> +	__PERCPU_AMO_OP_CASE(.h, name, 16, amo_insn)			\
> +	__PERCPU_AMO_OP_CASE(.w, name, 32, amo_insn)			\
> +	__PERCPU_AMO_OP_CASE(.d, name, 64, amo_insn)			\
> +
> +#define PERCPU_RET_OP(name, amo_insn)					\
> +	__PERCPU_AMO_RET_OP_CASE(.b, name, 8, amo_insn)			\
> +	__PERCPU_AMO_RET_OP_CASE(.h, name, 16, amo_insn)		\
> +	__PERCPU_AMO_RET_OP_CASE(.w, name, 32, amo_insn)		\
> +	__PERCPU_AMO_RET_OP_CASE(.d, name, 64, amo_insn)
> +
> +PERCPU_RW_OPS(8)
> +PERCPU_RW_OPS(16)
> +PERCPU_RW_OPS(32)
> +PERCPU_RW_OPS(64)
> +
> +PERCPU_OP(add, add)
> +PERCPU_OP(andnot, and)
> +PERCPU_OP(or, or)
> +PERCPU_RET_OP(add, add)
> +
> +#undef PERCPU_RW_OPS
> +#undef __PERCPU_AMO_OP_CASE
> +#undef __PERCPU_AMO_RET_OP_CASE
> +#undef PERCPU_OP
> +#undef PERCPU_RET_OP
> +
> +#define _pcp_protect(op, pcp, ...)					\
> +({									\
> +	preempt_disable_notrace();					\
> +	op(raw_cpu_ptr(&(pcp)), __VA_ARGS__);				\
> +	preempt_enable_notrace();					\
> +})
> +
> +#define _pcp_protect_return(op, pcp, args...)				\
> +({									\
> +	typeof(pcp) __retval;						\
> +	preempt_disable_notrace();					\
> +	__retval = (typeof(pcp))op(raw_cpu_ptr(&(pcp)), ##args);	\
> +	preempt_enable_notrace();					\
> +	__retval;							\
> +})
> +
> +#define this_cpu_read_1(pcp)		_pcp_protect_return(__percpu_read_8, pcp)
> +#define this_cpu_read_2(pcp)		_pcp_protect_return(__percpu_read_16, pcp)
> +#define this_cpu_read_4(pcp)		_pcp_protect_return(__percpu_read_32, pcp)
> +#define this_cpu_read_8(pcp)		_pcp_protect_return(__percpu_read_64, pcp)
> +
> +#define this_cpu_write_1(pcp, val)	_pcp_protect(__percpu_write_8, pcp, (unsigned long)val)
> +#define this_cpu_write_2(pcp, val)	_pcp_protect(__percpu_write_16, pcp, (unsigned long)val)
> +#define this_cpu_write_4(pcp, val)	_pcp_protect(__percpu_write_32, pcp, (unsigned long)val)
> +#define this_cpu_write_8(pcp, val)	_pcp_protect(__percpu_write_64, pcp, (unsigned long)val)
> +
> +#define this_cpu_add_1(pcp, val)	_pcp_protect(__percpu_add_amo_case_8, pcp, val)
> +#define this_cpu_add_2(pcp, val)	_pcp_protect(__percpu_add_amo_case_16, pcp, val)
> +#define this_cpu_add_4(pcp, val)	_pcp_protect(__percpu_add_amo_case_32, pcp, val)
> +#define this_cpu_add_8(pcp, val)	_pcp_protect(__percpu_add_amo_case_64, pcp, val)
> +
> +#define this_cpu_add_return_1(pcp, val)		\
> +_pcp_protect_return(__percpu_add_return_amo_case_8, pcp, val)
> +
> +#define this_cpu_add_return_2(pcp, val)		\
> +_pcp_protect_return(__percpu_add_return_amo_case_16, pcp, val)
> +
> +#define this_cpu_add_return_4(pcp, val)		\
> +_pcp_protect_return(__percpu_add_return_amo_case_32, pcp, val)
> +
> +#define this_cpu_add_return_8(pcp, val)		\
> +_pcp_protect_return(__percpu_add_return_amo_case_64, pcp, val)
> +
> +#define this_cpu_and_1(pcp, val)	_pcp_protect(__percpu_andnot_amo_case_8, pcp, ~val)
> +#define this_cpu_and_2(pcp, val)	_pcp_protect(__percpu_andnot_amo_case_16, pcp, ~val)
> +#define this_cpu_and_4(pcp, val)	_pcp_protect(__percpu_andnot_amo_case_32, pcp, ~val)
> +#define this_cpu_and_8(pcp, val)	_pcp_protect(__percpu_andnot_amo_case_64, pcp, ~val)


Why do we define __percpu_andnot based on amoand, and use 
__percpu_andnot with ~val here? Can't we just define __percpu_and?
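
For illustration, the direct variant would only need something like this
on top of the patch (a sketch only, untested):

PERCPU_OP(and, and)	/* generates __percpu_and_amo_case_{8,16,32,64}() */

#define this_cpu_and_1(pcp, val)	_pcp_protect(__percpu_and_amo_case_8, pcp, val)
#define this_cpu_and_2(pcp, val)	_pcp_protect(__percpu_and_amo_case_16, pcp, val)
#define this_cpu_and_4(pcp, val)	_pcp_protect(__percpu_and_amo_case_32, pcp, val)
#define this_cpu_and_8(pcp, val)	_pcp_protect(__percpu_and_amo_case_64, pcp, val)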


> +
> +#define this_cpu_or_1(pcp, val)	_pcp_protect(__percpu_or_amo_case_8, pcp, val)
> +#define this_cpu_or_2(pcp, val)	_pcp_protect(__percpu_or_amo_case_16, pcp, val)
> +#define this_cpu_or_4(pcp, val)	_pcp_protect(__percpu_or_amo_case_32, pcp, val)
> +#define this_cpu_or_8(pcp, val)	_pcp_protect(__percpu_or_amo_case_64, pcp, val)
> +
> +#define this_cpu_xchg_1(pcp, val)	_pcp_protect_return(xchg_relaxed, pcp, val)
> +#define this_cpu_xchg_2(pcp, val)	_pcp_protect_return(xchg_relaxed, pcp, val)
> +#define this_cpu_xchg_4(pcp, val)	_pcp_protect_return(xchg_relaxed, pcp, val)
> +#define this_cpu_xchg_8(pcp, val)	_pcp_protect_return(xchg_relaxed, pcp, val)
> +
> +#define this_cpu_cmpxchg_1(pcp, o, n)	_pcp_protect_return(cmpxchg_relaxed, pcp, o, n)
> +#define this_cpu_cmpxchg_2(pcp, o, n)	_pcp_protect_return(cmpxchg_relaxed, pcp, o, n)
> +#define this_cpu_cmpxchg_4(pcp, o, n)	_pcp_protect_return(cmpxchg_relaxed, pcp, o, n)
> +#define this_cpu_cmpxchg_8(pcp, o, n)	_pcp_protect_return(cmpxchg_relaxed, pcp, o, n)
> +
> +#include <asm-generic/percpu.h>
> +
> +#endif /* __ASM_PERCPU_H */


It all looks good to me, just one thing, can you also implement 
this_cpu_cmpxchg64/128()?
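
For the 64-bit one, mirroring arm64 on top of this patch would probably
be enough (a sketch; the 128-bit variant would additionally need a
cmpxchg128() primitive):

#define this_cpu_cmpxchg64(pcp, o, n)	this_cpu_cmpxchg_8(pcp, o, n)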

And since this is almost a copy/paste from arm64, either mention it at 
the top of the file or (better) merge both implementations somewhere to 
avoid redefining existing code :) But up to you.

Reviewed-by: Alexandre Ghiti <alexghiti@rivosinc.com>

Thanks,

Alex



^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH RFC 2/2] riscv: introduce percpu.h into include/asm
  2025-07-17 13:04   ` Alexandre Ghiti
@ 2025-07-17 13:05     ` Alexandre Ghiti
  2025-07-18  6:40       ` [External] " yunhui cui
  0 siblings, 1 reply; 10+ messages in thread
From: Alexandre Ghiti @ 2025-07-17 13:05 UTC (permalink / raw)
  To: Yunhui Cui, yury.norov, linux, paul.walmsley, palmer, aou,
	linux-riscv, linux-kernel, dennis, tj, cl, linux-mm

On 7/17/25 15:04, Alexandre Ghiti wrote:
> Hi Yunhui,
>
> On 6/18/25 05:43, Yunhui Cui wrote:
>> Current percpu operations rely on generic implementations, where
>> raw_local_irq_save() introduces substantial overhead. Optimization
>> is achieved through atomic operations and preemption disabling.
>>
>> Signed-off-by: Yunhui Cui <cuiyunhui@bytedance.com>
>> ---
>>   arch/riscv/include/asm/percpu.h | 138 ++++++++++++++++++++++++++++++++
>>   1 file changed, 138 insertions(+)
>>   create mode 100644 arch/riscv/include/asm/percpu.h
>>
>> diff --git a/arch/riscv/include/asm/percpu.h 
>> b/arch/riscv/include/asm/percpu.h
>> new file mode 100644
>> index 0000000000000..423c0d01f874c
>> --- /dev/null
>> +++ b/arch/riscv/include/asm/percpu.h
>> @@ -0,0 +1,138 @@
>> +/* SPDX-License-Identifier: GPL-2.0-only */
>> +
>> +#ifndef __ASM_PERCPU_H
>> +#define __ASM_PERCPU_H
>> +
>> +#include <linux/preempt.h>
>> +
>> +#define PERCPU_RW_OPS(sz)                        \
>> +static inline unsigned long __percpu_read_##sz(void *ptr)        \
>> +{                                    \
>> +    return READ_ONCE(*(u##sz *)ptr);                \
>> +}                                    \
>> +                                    \
>> +static inline void __percpu_write_##sz(void *ptr, unsigned long 
>> val)    \
>> +{                                    \
>> +    WRITE_ONCE(*(u##sz *)ptr, (u##sz)val);                \
>> +}
>> +
>> +#define __PERCPU_AMO_OP_CASE(sfx, name, sz, amo_insn)            \
>> +static inline void                            \
>> +__percpu_##name##_amo_case_##sz(void *ptr, unsigned long val)        \
>> +{                                    \
>> +    asm volatile (                            \
>> +    "amo" #amo_insn #sfx " zero, %[val], %[ptr]"            \
>> +    : [ptr] "+A" (*(u##sz *)ptr)                    \
>> +    : [val] "r" ((u##sz)(val))                    \
>> +    : "memory");                            \
>> +}
>> +
>> +#define __PERCPU_AMO_RET_OP_CASE(sfx, name, sz, amo_insn)        \
>> +static inline u##sz                            \
>> +__percpu_##name##_return_amo_case_##sz(void *ptr, unsigned long 
>> val)    \
>> +{                                    \
>> +    register u##sz ret;                        \
>> +                                    \
>> +    asm volatile (                            \
>> +    "amo" #amo_insn #sfx " %[ret], %[val], %[ptr]"            \
>> +    : [ptr] "+A" (*(u##sz *)ptr), [ret] "=r" (ret)            \
>> +    : [val] "r" ((u##sz)(val))                    \
>> +    : "memory");                            \
>> +                                    \
>> +    return ret + val;                        \
>> +}
>> +
>> +#define PERCPU_OP(name, amo_insn)                    \
>> +    __PERCPU_AMO_OP_CASE(.b, name, 8, amo_insn)            \
>> +    __PERCPU_AMO_OP_CASE(.h, name, 16, amo_insn)            \
>> +    __PERCPU_AMO_OP_CASE(.w, name, 32, amo_insn)            \
>> +    __PERCPU_AMO_OP_CASE(.d, name, 64, amo_insn)            \
>> +
>> +#define PERCPU_RET_OP(name, amo_insn)                    \
>> +    __PERCPU_AMO_RET_OP_CASE(.b, name, 8, amo_insn) \
>> +    __PERCPU_AMO_RET_OP_CASE(.h, name, 16, amo_insn)        \
>> +    __PERCPU_AMO_RET_OP_CASE(.w, name, 32, amo_insn)        \
>> +    __PERCPU_AMO_RET_OP_CASE(.d, name, 64, amo_insn)
>> +
>> +PERCPU_RW_OPS(8)
>> +PERCPU_RW_OPS(16)
>> +PERCPU_RW_OPS(32)
>> +PERCPU_RW_OPS(64)
>> +
>> +PERCPU_OP(add, add)
>> +PERCPU_OP(andnot, and)
>> +PERCPU_OP(or, or)
>> +PERCPU_RET_OP(add, add)
>> +
>> +#undef PERCPU_RW_OPS
>> +#undef __PERCPU_AMO_OP_CASE
>> +#undef __PERCPU_AMO_RET_OP_CASE
>> +#undef PERCPU_OP
>> +#undef PERCPU_RET_OP
>> +
>> +#define _pcp_protect(op, pcp, ...)                    \
>> +({                                    \
>> +    preempt_disable_notrace();                    \
>> +    op(raw_cpu_ptr(&(pcp)), __VA_ARGS__);                \
>> +    preempt_enable_notrace();                    \
>> +})
>> +
>> +#define _pcp_protect_return(op, pcp, args...)                \
>> +({                                    \
>> +    typeof(pcp) __retval;                        \
>> +    preempt_disable_notrace();                    \
>> +    __retval = (typeof(pcp))op(raw_cpu_ptr(&(pcp)), ##args);    \
>> +    preempt_enable_notrace();                    \
>> +    __retval;                            \
>> +})
>> +
>> +#define this_cpu_read_1(pcp) _pcp_protect_return(__percpu_read_8, pcp)
>> +#define this_cpu_read_2(pcp) _pcp_protect_return(__percpu_read_16, pcp)
>> +#define this_cpu_read_4(pcp) _pcp_protect_return(__percpu_read_32, pcp)
>> +#define this_cpu_read_8(pcp) _pcp_protect_return(__percpu_read_64, pcp)
>> +
>> +#define this_cpu_write_1(pcp, val) _pcp_protect(__percpu_write_8, 
>> pcp, (unsigned long)val)
>> +#define this_cpu_write_2(pcp, val) _pcp_protect(__percpu_write_16, 
>> pcp, (unsigned long)val)
>> +#define this_cpu_write_4(pcp, val) _pcp_protect(__percpu_write_32, 
>> pcp, (unsigned long)val)
>> +#define this_cpu_write_8(pcp, val) _pcp_protect(__percpu_write_64, 
>> pcp, (unsigned long)val)
>> +
>> +#define this_cpu_add_1(pcp, val) 
>> _pcp_protect(__percpu_add_amo_case_8, pcp, val)
>> +#define this_cpu_add_2(pcp, val) 
>> _pcp_protect(__percpu_add_amo_case_16, pcp, val)
>> +#define this_cpu_add_4(pcp, val) 
>> _pcp_protect(__percpu_add_amo_case_32, pcp, val)
>> +#define this_cpu_add_8(pcp, val) 
>> _pcp_protect(__percpu_add_amo_case_64, pcp, val)
>> +
>> +#define this_cpu_add_return_1(pcp, val)        \
>> +_pcp_protect_return(__percpu_add_return_amo_case_8, pcp, val)
>> +
>> +#define this_cpu_add_return_2(pcp, val)        \
>> +_pcp_protect_return(__percpu_add_return_amo_case_16, pcp, val)
>> +
>> +#define this_cpu_add_return_4(pcp, val)        \
>> +_pcp_protect_return(__percpu_add_return_amo_case_32, pcp, val)
>> +
>> +#define this_cpu_add_return_8(pcp, val)        \
>> +_pcp_protect_return(__percpu_add_return_amo_case_64, pcp, val)
>> +
>> +#define this_cpu_and_1(pcp, val) 
>> _pcp_protect(__percpu_andnot_amo_case_8, pcp, ~val)
>> +#define this_cpu_and_2(pcp, val) 
>> _pcp_protect(__percpu_andnot_amo_case_16, pcp, ~val)
>> +#define this_cpu_and_4(pcp, val) 
>> _pcp_protect(__percpu_andnot_amo_case_32, pcp, ~val)
>> +#define this_cpu_and_8(pcp, val) 
>> _pcp_protect(__percpu_andnot_amo_case_64, pcp, ~val)
>
>
> Why do we define __percpu_andnot based on amoand, and use 
> __percpu_andnot with ~val here? Can't we just define __percpu_and?
>
>
>> +
>> +#define this_cpu_or_1(pcp, val) _pcp_protect(__percpu_or_amo_case_8, 
>> pcp, val)
>> +#define this_cpu_or_2(pcp, val) 
>> _pcp_protect(__percpu_or_amo_case_16, pcp, val)
>> +#define this_cpu_or_4(pcp, val) 
>> _pcp_protect(__percpu_or_amo_case_32, pcp, val)
>> +#define this_cpu_or_8(pcp, val) 
>> _pcp_protect(__percpu_or_amo_case_64, pcp, val)
>> +
>> +#define this_cpu_xchg_1(pcp, val) _pcp_protect_return(xchg_relaxed, 
>> pcp, val)
>> +#define this_cpu_xchg_2(pcp, val) _pcp_protect_return(xchg_relaxed, 
>> pcp, val)
>> +#define this_cpu_xchg_4(pcp, val) _pcp_protect_return(xchg_relaxed, 
>> pcp, val)
>> +#define this_cpu_xchg_8(pcp, val) _pcp_protect_return(xchg_relaxed, 
>> pcp, val)
>> +
>> +#define this_cpu_cmpxchg_1(pcp, o, n) 
>> _pcp_protect_return(cmpxchg_relaxed, pcp, o, n)
>> +#define this_cpu_cmpxchg_2(pcp, o, n) 
>> _pcp_protect_return(cmpxchg_relaxed, pcp, o, n)
>> +#define this_cpu_cmpxchg_4(pcp, o, n) 
>> _pcp_protect_return(cmpxchg_relaxed, pcp, o, n)
>> +#define this_cpu_cmpxchg_8(pcp, o, n) 
>> _pcp_protect_return(cmpxchg_relaxed, pcp, o, n)
>> +
>> +#include <asm-generic/percpu.h>
>> +
>> +#endif /* __ASM_PERCPU_H */
>
>
> It all looks good to me, just one thing, can you also implement 
> this_cpu_cmpxchg64/128()?
>

One last thing sorry, can you add a cover letter too?

Thanks!

Alex


> And since this is almost a copy/paste from arm64, either mention it at 
> the top of the file or (better) merge both implementations somewhere 
> to avoid redefining existing code :) But up to you.
>
> Reviewed-by: Alexandre Ghiti <alexghiti@rivosinc.com>
>
> Thanks,
>
> Alex
>
>
>

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [External] Re: [PATCH RFC 2/2] riscv: introduce percpu.h into include/asm
  2025-07-17 13:05     ` Alexandre Ghiti
@ 2025-07-18  6:40       ` yunhui cui
  2025-07-18 14:22         ` Alexandre Ghiti
  0 siblings, 1 reply; 10+ messages in thread
From: yunhui cui @ 2025-07-18  6:40 UTC (permalink / raw)
  To: Alexandre Ghiti
  Cc: yury.norov, linux, paul.walmsley, palmer, aou, linux-riscv,
	linux-kernel, dennis, tj, cl, linux-mm

Hi Alex,

On Thu, Jul 17, 2025 at 9:06 PM Alexandre Ghiti <alex@ghiti.fr> wrote:
>
> On 7/17/25 15:04, Alexandre Ghiti wrote:
> > Hi Yunhui,
> >
> > On 6/18/25 05:43, Yunhui Cui wrote:
> >> Current percpu operations rely on generic implementations, where
> >> raw_local_irq_save() introduces substantial overhead. Optimization
> >> is achieved through atomic operations and preemption disabling.
> >>
> >> Signed-off-by: Yunhui Cui <cuiyunhui@bytedance.com>
> >> ---
> >>   arch/riscv/include/asm/percpu.h | 138 ++++++++++++++++++++++++++++++++
> >>   1 file changed, 138 insertions(+)
> >>   create mode 100644 arch/riscv/include/asm/percpu.h
> >>
> >> diff --git a/arch/riscv/include/asm/percpu.h
> >> b/arch/riscv/include/asm/percpu.h
> >> new file mode 100644
> >> index 0000000000000..423c0d01f874c
> >> --- /dev/null
> >> +++ b/arch/riscv/include/asm/percpu.h
> >> @@ -0,0 +1,138 @@
> >> +/* SPDX-License-Identifier: GPL-2.0-only */
> >> +
> >> +#ifndef __ASM_PERCPU_H
> >> +#define __ASM_PERCPU_H
> >> +
> >> +#include <linux/preempt.h>
> >> +
> >> +#define PERCPU_RW_OPS(sz)                        \
> >> +static inline unsigned long __percpu_read_##sz(void *ptr)        \
> >> +{                                    \
> >> +    return READ_ONCE(*(u##sz *)ptr);                \
> >> +}                                    \
> >> +                                    \
> >> +static inline void __percpu_write_##sz(void *ptr, unsigned long
> >> val)    \
> >> +{                                    \
> >> +    WRITE_ONCE(*(u##sz *)ptr, (u##sz)val);                \
> >> +}
> >> +
> >> +#define __PERCPU_AMO_OP_CASE(sfx, name, sz, amo_insn)            \
> >> +static inline void                            \
> >> +__percpu_##name##_amo_case_##sz(void *ptr, unsigned long val)        \
> >> +{                                    \
> >> +    asm volatile (                            \
> >> +    "amo" #amo_insn #sfx " zero, %[val], %[ptr]"            \
> >> +    : [ptr] "+A" (*(u##sz *)ptr)                    \
> >> +    : [val] "r" ((u##sz)(val))                    \
> >> +    : "memory");                            \
> >> +}
> >> +
> >> +#define __PERCPU_AMO_RET_OP_CASE(sfx, name, sz, amo_insn)        \
> >> +static inline u##sz                            \
> >> +__percpu_##name##_return_amo_case_##sz(void *ptr, unsigned long
> >> val)    \
> >> +{                                    \
> >> +    register u##sz ret;                        \
> >> +                                    \
> >> +    asm volatile (                            \
> >> +    "amo" #amo_insn #sfx " %[ret], %[val], %[ptr]"            \
> >> +    : [ptr] "+A" (*(u##sz *)ptr), [ret] "=r" (ret)            \
> >> +    : [val] "r" ((u##sz)(val))                    \
> >> +    : "memory");                            \
> >> +                                    \
> >> +    return ret + val;                        \
> >> +}
> >> +
> >> +#define PERCPU_OP(name, amo_insn)                    \
> >> +    __PERCPU_AMO_OP_CASE(.b, name, 8, amo_insn)            \
> >> +    __PERCPU_AMO_OP_CASE(.h, name, 16, amo_insn)            \
> >> +    __PERCPU_AMO_OP_CASE(.w, name, 32, amo_insn)            \
> >> +    __PERCPU_AMO_OP_CASE(.d, name, 64, amo_insn)            \
> >> +
> >> +#define PERCPU_RET_OP(name, amo_insn)                    \
> >> +    __PERCPU_AMO_RET_OP_CASE(.b, name, 8, amo_insn) \
> >> +    __PERCPU_AMO_RET_OP_CASE(.h, name, 16, amo_insn)        \
> >> +    __PERCPU_AMO_RET_OP_CASE(.w, name, 32, amo_insn)        \
> >> +    __PERCPU_AMO_RET_OP_CASE(.d, name, 64, amo_insn)
> >> +
> >> +PERCPU_RW_OPS(8)
> >> +PERCPU_RW_OPS(16)
> >> +PERCPU_RW_OPS(32)
> >> +PERCPU_RW_OPS(64)
> >> +
> >> +PERCPU_OP(add, add)
> >> +PERCPU_OP(andnot, and)
> >> +PERCPU_OP(or, or)
> >> +PERCPU_RET_OP(add, add)
> >> +
> >> +#undef PERCPU_RW_OPS
> >> +#undef __PERCPU_AMO_OP_CASE
> >> +#undef __PERCPU_AMO_RET_OP_CASE
> >> +#undef PERCPU_OP
> >> +#undef PERCPU_RET_OP
> >> +
> >> +#define _pcp_protect(op, pcp, ...)                    \
> >> +({                                    \
> >> +    preempt_disable_notrace();                    \
> >> +    op(raw_cpu_ptr(&(pcp)), __VA_ARGS__);                \
> >> +    preempt_enable_notrace();                    \
> >> +})
> >> +
> >> +#define _pcp_protect_return(op, pcp, args...)                \
> >> +({                                    \
> >> +    typeof(pcp) __retval;                        \
> >> +    preempt_disable_notrace();                    \
> >> +    __retval = (typeof(pcp))op(raw_cpu_ptr(&(pcp)), ##args);    \
> >> +    preempt_enable_notrace();                    \
> >> +    __retval;                            \
> >> +})
> >> +
> >> +#define this_cpu_read_1(pcp) _pcp_protect_return(__percpu_read_8, pcp)
> >> +#define this_cpu_read_2(pcp) _pcp_protect_return(__percpu_read_16, pcp)
> >> +#define this_cpu_read_4(pcp) _pcp_protect_return(__percpu_read_32, pcp)
> >> +#define this_cpu_read_8(pcp) _pcp_protect_return(__percpu_read_64, pcp)
> >> +
> >> +#define this_cpu_write_1(pcp, val) _pcp_protect(__percpu_write_8,
> >> pcp, (unsigned long)val)
> >> +#define this_cpu_write_2(pcp, val) _pcp_protect(__percpu_write_16,
> >> pcp, (unsigned long)val)
> >> +#define this_cpu_write_4(pcp, val) _pcp_protect(__percpu_write_32,
> >> pcp, (unsigned long)val)
> >> +#define this_cpu_write_8(pcp, val) _pcp_protect(__percpu_write_64,
> >> pcp, (unsigned long)val)
> >> +
> >> +#define this_cpu_add_1(pcp, val)
> >> _pcp_protect(__percpu_add_amo_case_8, pcp, val)
> >> +#define this_cpu_add_2(pcp, val)
> >> _pcp_protect(__percpu_add_amo_case_16, pcp, val)
> >> +#define this_cpu_add_4(pcp, val)
> >> _pcp_protect(__percpu_add_amo_case_32, pcp, val)
> >> +#define this_cpu_add_8(pcp, val)
> >> _pcp_protect(__percpu_add_amo_case_64, pcp, val)
> >> +
> >> +#define this_cpu_add_return_1(pcp, val)        \
> >> +_pcp_protect_return(__percpu_add_return_amo_case_8, pcp, val)
> >> +
> >> +#define this_cpu_add_return_2(pcp, val)        \
> >> +_pcp_protect_return(__percpu_add_return_amo_case_16, pcp, val)
> >> +
> >> +#define this_cpu_add_return_4(pcp, val)        \
> >> +_pcp_protect_return(__percpu_add_return_amo_case_32, pcp, val)
> >> +
> >> +#define this_cpu_add_return_8(pcp, val)        \
> >> +_pcp_protect_return(__percpu_add_return_amo_case_64, pcp, val)
> >> +
> >> +#define this_cpu_and_1(pcp, val)
> >> _pcp_protect(__percpu_andnot_amo_case_8, pcp, ~val)
> >> +#define this_cpu_and_2(pcp, val)
> >> _pcp_protect(__percpu_andnot_amo_case_16, pcp, ~val)
> >> +#define this_cpu_and_4(pcp, val)
> >> _pcp_protect(__percpu_andnot_amo_case_32, pcp, ~val)
> >> +#define this_cpu_and_8(pcp, val)
> >> _pcp_protect(__percpu_andnot_amo_case_64, pcp, ~val)
> >
> >
> > Why do we define __percpu_andnot based on amoand, and use
> > __percpu_andnot with ~val here? Can't we just define __percpu_and?
> >
> >
> >> +
> >> +#define this_cpu_or_1(pcp, val) _pcp_protect(__percpu_or_amo_case_8,
> >> pcp, val)
> >> +#define this_cpu_or_2(pcp, val)
> >> _pcp_protect(__percpu_or_amo_case_16, pcp, val)
> >> +#define this_cpu_or_4(pcp, val)
> >> _pcp_protect(__percpu_or_amo_case_32, pcp, val)
> >> +#define this_cpu_or_8(pcp, val)
> >> _pcp_protect(__percpu_or_amo_case_64, pcp, val)
> >> +
> >> +#define this_cpu_xchg_1(pcp, val) _pcp_protect_return(xchg_relaxed,
> >> pcp, val)
> >> +#define this_cpu_xchg_2(pcp, val) _pcp_protect_return(xchg_relaxed,
> >> pcp, val)
> >> +#define this_cpu_xchg_4(pcp, val) _pcp_protect_return(xchg_relaxed,
> >> pcp, val)
> >> +#define this_cpu_xchg_8(pcp, val) _pcp_protect_return(xchg_relaxed,
> >> pcp, val)
> >> +
> >> +#define this_cpu_cmpxchg_1(pcp, o, n)
> >> _pcp_protect_return(cmpxchg_relaxed, pcp, o, n)
> >> +#define this_cpu_cmpxchg_2(pcp, o, n)
> >> _pcp_protect_return(cmpxchg_relaxed, pcp, o, n)
> >> +#define this_cpu_cmpxchg_4(pcp, o, n)
> >> _pcp_protect_return(cmpxchg_relaxed, pcp, o, n)
> >> +#define this_cpu_cmpxchg_8(pcp, o, n)
> >> _pcp_protect_return(cmpxchg_relaxed, pcp, o, n)
> >> +
> >> +#include <asm-generic/percpu.h>
> >> +
> >> +#endif /* __ASM_PERCPU_H */
> >
> >
> > It all looks good to me, just one thing, can you also implement
> > this_cpu_cmpxchg64/128()?
> >
>
> One last thing sorry, can you add a cover letter too?

Okay.

>
> Thanks!
>
> Alex
>
>
> > And since this is almost a copy/paste from arm64, either mention it at
> > the top of the file or (better) merge both implementations somewhere
> > to avoid redefining existing code :) But up to you.

Actually, there's a concern here: we should account for scenarios
where Zabha isn't supported. Given that the xxx_8() and xxx_16()
variants are rarely used in practice, could we initially support only
xxx_32() and xxx_64()? For xxx_8() and xxx_16(), we could fall back to
the generic implementation.
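
Roughly, that fallback would just mean not defining the _1/_2 variants
at all, e.g. (sketch only):

/* Only wire up the 32/64-bit AMO helpers; the 8/16-bit this_cpu_*() ops
 * then fall back to the generic definitions pulled in by
 * <asm-generic/percpu.h> at the end of the file. */
#define this_cpu_add_4(pcp, val)	_pcp_protect(__percpu_add_amo_case_32, pcp, val)
#define this_cpu_add_8(pcp, val)	_pcp_protect(__percpu_add_amo_case_64, pcp, val)
/* no this_cpu_add_1()/this_cpu_add_2() definitions here */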


> >
> > Reviewed-by: Alexandre Ghiti <alexghiti@rivosinc.com>
> >
> > Thanks,
> >
> > Alex
> >
> >
> >

Thanks,
Yunhui

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [External] Re: [PATCH RFC 2/2] riscv: introduce percpu.h into include/asm
  2025-07-18  6:40       ` [External] " yunhui cui
@ 2025-07-18 14:22         ` Alexandre Ghiti
  2025-07-18 14:33           ` yunhui cui
  0 siblings, 1 reply; 10+ messages in thread
From: Alexandre Ghiti @ 2025-07-18 14:22 UTC (permalink / raw)
  To: yunhui cui
  Cc: yury.norov, linux, paul.walmsley, palmer, aou, linux-riscv,
	linux-kernel, dennis, tj, cl, linux-mm

Hi Yunhui,

On 7/18/25 08:40, yunhui cui wrote:
> Hi Alex,
>
> On Thu, Jul 17, 2025 at 9:06 PM Alexandre Ghiti <alex@ghiti.fr> wrote:
>> On 7/17/25 15:04, Alexandre Ghiti wrote:
>>> Hi Yunhui,
>>>
>>> On 6/18/25 05:43, Yunhui Cui wrote:
>>>> Current percpu operations rely on generic implementations, where
>>>> raw_local_irq_save() introduces substantial overhead. Optimization
>>>> is achieved through atomic operations and preemption disabling.
>>>>
>>>> Signed-off-by: Yunhui Cui <cuiyunhui@bytedance.com>
>>>> ---
>>>>    arch/riscv/include/asm/percpu.h | 138 ++++++++++++++++++++++++++++++++
>>>>    1 file changed, 138 insertions(+)
>>>>    create mode 100644 arch/riscv/include/asm/percpu.h
>>>>
>>>> diff --git a/arch/riscv/include/asm/percpu.h
>>>> b/arch/riscv/include/asm/percpu.h
>>>> new file mode 100644
>>>> index 0000000000000..423c0d01f874c
>>>> --- /dev/null
>>>> +++ b/arch/riscv/include/asm/percpu.h
>>>> @@ -0,0 +1,138 @@
>>>> +/* SPDX-License-Identifier: GPL-2.0-only */
>>>> +
>>>> +#ifndef __ASM_PERCPU_H
>>>> +#define __ASM_PERCPU_H
>>>> +
>>>> +#include <linux/preempt.h>
>>>> +
>>>> +#define PERCPU_RW_OPS(sz)                        \
>>>> +static inline unsigned long __percpu_read_##sz(void *ptr)        \
>>>> +{                                    \
>>>> +    return READ_ONCE(*(u##sz *)ptr);                \
>>>> +}                                    \
>>>> +                                    \
>>>> +static inline void __percpu_write_##sz(void *ptr, unsigned long
>>>> val)    \
>>>> +{                                    \
>>>> +    WRITE_ONCE(*(u##sz *)ptr, (u##sz)val);                \
>>>> +}
>>>> +
>>>> +#define __PERCPU_AMO_OP_CASE(sfx, name, sz, amo_insn)            \
>>>> +static inline void                            \
>>>> +__percpu_##name##_amo_case_##sz(void *ptr, unsigned long val)        \
>>>> +{                                    \
>>>> +    asm volatile (                            \
>>>> +    "amo" #amo_insn #sfx " zero, %[val], %[ptr]"            \
>>>> +    : [ptr] "+A" (*(u##sz *)ptr)                    \
>>>> +    : [val] "r" ((u##sz)(val))                    \
>>>> +    : "memory");                            \
>>>> +}
>>>> +
>>>> +#define __PERCPU_AMO_RET_OP_CASE(sfx, name, sz, amo_insn)        \
>>>> +static inline u##sz                            \
>>>> +__percpu_##name##_return_amo_case_##sz(void *ptr, unsigned long
>>>> val)    \
>>>> +{                                    \
>>>> +    register u##sz ret;                        \
>>>> +                                    \
>>>> +    asm volatile (                            \
>>>> +    "amo" #amo_insn #sfx " %[ret], %[val], %[ptr]"            \
>>>> +    : [ptr] "+A" (*(u##sz *)ptr), [ret] "=r" (ret)            \
>>>> +    : [val] "r" ((u##sz)(val))                    \
>>>> +    : "memory");                            \
>>>> +                                    \
>>>> +    return ret + val;                        \
>>>> +}
>>>> +
>>>> +#define PERCPU_OP(name, amo_insn)                    \
>>>> +    __PERCPU_AMO_OP_CASE(.b, name, 8, amo_insn)            \
>>>> +    __PERCPU_AMO_OP_CASE(.h, name, 16, amo_insn)            \
>>>> +    __PERCPU_AMO_OP_CASE(.w, name, 32, amo_insn)            \
>>>> +    __PERCPU_AMO_OP_CASE(.d, name, 64, amo_insn)            \
>>>> +
>>>> +#define PERCPU_RET_OP(name, amo_insn)                    \
>>>> +    __PERCPU_AMO_RET_OP_CASE(.b, name, 8, amo_insn) \
>>>> +    __PERCPU_AMO_RET_OP_CASE(.h, name, 16, amo_insn)        \
>>>> +    __PERCPU_AMO_RET_OP_CASE(.w, name, 32, amo_insn)        \
>>>> +    __PERCPU_AMO_RET_OP_CASE(.d, name, 64, amo_insn)
>>>> +
>>>> +PERCPU_RW_OPS(8)
>>>> +PERCPU_RW_OPS(16)
>>>> +PERCPU_RW_OPS(32)
>>>> +PERCPU_RW_OPS(64)
>>>> +
>>>> +PERCPU_OP(add, add)
>>>> +PERCPU_OP(andnot, and)
>>>> +PERCPU_OP(or, or)
>>>> +PERCPU_RET_OP(add, add)
>>>> +
>>>> +#undef PERCPU_RW_OPS
>>>> +#undef __PERCPU_AMO_OP_CASE
>>>> +#undef __PERCPU_AMO_RET_OP_CASE
>>>> +#undef PERCPU_OP
>>>> +#undef PERCPU_RET_OP
>>>> +
>>>> +#define _pcp_protect(op, pcp, ...)                    \
>>>> +({                                    \
>>>> +    preempt_disable_notrace();                    \
>>>> +    op(raw_cpu_ptr(&(pcp)), __VA_ARGS__);                \
>>>> +    preempt_enable_notrace();                    \
>>>> +})
>>>> +
>>>> +#define _pcp_protect_return(op, pcp, args...)                \
>>>> +({                                    \
>>>> +    typeof(pcp) __retval;                        \
>>>> +    preempt_disable_notrace();                    \
>>>> +    __retval = (typeof(pcp))op(raw_cpu_ptr(&(pcp)), ##args);    \
>>>> +    preempt_enable_notrace();                    \
>>>> +    __retval;                            \
>>>> +})
>>>> +
>>>> +#define this_cpu_read_1(pcp) _pcp_protect_return(__percpu_read_8, pcp)
>>>> +#define this_cpu_read_2(pcp) _pcp_protect_return(__percpu_read_16, pcp)
>>>> +#define this_cpu_read_4(pcp) _pcp_protect_return(__percpu_read_32, pcp)
>>>> +#define this_cpu_read_8(pcp) _pcp_protect_return(__percpu_read_64, pcp)
>>>> +
>>>> +#define this_cpu_write_1(pcp, val) _pcp_protect(__percpu_write_8,
>>>> pcp, (unsigned long)val)
>>>> +#define this_cpu_write_2(pcp, val) _pcp_protect(__percpu_write_16,
>>>> pcp, (unsigned long)val)
>>>> +#define this_cpu_write_4(pcp, val) _pcp_protect(__percpu_write_32,
>>>> pcp, (unsigned long)val)
>>>> +#define this_cpu_write_8(pcp, val) _pcp_protect(__percpu_write_64,
>>>> pcp, (unsigned long)val)
>>>> +
>>>> +#define this_cpu_add_1(pcp, val)
>>>> _pcp_protect(__percpu_add_amo_case_8, pcp, val)
>>>> +#define this_cpu_add_2(pcp, val)
>>>> _pcp_protect(__percpu_add_amo_case_16, pcp, val)
>>>> +#define this_cpu_add_4(pcp, val)
>>>> _pcp_protect(__percpu_add_amo_case_32, pcp, val)
>>>> +#define this_cpu_add_8(pcp, val)
>>>> _pcp_protect(__percpu_add_amo_case_64, pcp, val)
>>>> +
>>>> +#define this_cpu_add_return_1(pcp, val)        \
>>>> +_pcp_protect_return(__percpu_add_return_amo_case_8, pcp, val)
>>>> +
>>>> +#define this_cpu_add_return_2(pcp, val)        \
>>>> +_pcp_protect_return(__percpu_add_return_amo_case_16, pcp, val)
>>>> +
>>>> +#define this_cpu_add_return_4(pcp, val)        \
>>>> +_pcp_protect_return(__percpu_add_return_amo_case_32, pcp, val)
>>>> +
>>>> +#define this_cpu_add_return_8(pcp, val)        \
>>>> +_pcp_protect_return(__percpu_add_return_amo_case_64, pcp, val)
>>>> +
>>>> +#define this_cpu_and_1(pcp, val)
>>>> _pcp_protect(__percpu_andnot_amo_case_8, pcp, ~val)
>>>> +#define this_cpu_and_2(pcp, val)
>>>> _pcp_protect(__percpu_andnot_amo_case_16, pcp, ~val)
>>>> +#define this_cpu_and_4(pcp, val)
>>>> _pcp_protect(__percpu_andnot_amo_case_32, pcp, ~val)
>>>> +#define this_cpu_and_8(pcp, val)
>>>> _pcp_protect(__percpu_andnot_amo_case_64, pcp, ~val)
>>>
>>> Why do we define __percpu_andnot based on amoand, and use
>>> __percpu_andnot with ~val here? Can't we just define __percpu_and?


What about that ^?


>>>
>>>> +
>>>> +#define this_cpu_or_1(pcp, val) _pcp_protect(__percpu_or_amo_case_8,
>>>> pcp, val)
>>>> +#define this_cpu_or_2(pcp, val)
>>>> _pcp_protect(__percpu_or_amo_case_16, pcp, val)
>>>> +#define this_cpu_or_4(pcp, val)
>>>> _pcp_protect(__percpu_or_amo_case_32, pcp, val)
>>>> +#define this_cpu_or_8(pcp, val)
>>>> _pcp_protect(__percpu_or_amo_case_64, pcp, val)
>>>> +
>>>> +#define this_cpu_xchg_1(pcp, val) _pcp_protect_return(xchg_relaxed,
>>>> pcp, val)
>>>> +#define this_cpu_xchg_2(pcp, val) _pcp_protect_return(xchg_relaxed,
>>>> pcp, val)
>>>> +#define this_cpu_xchg_4(pcp, val) _pcp_protect_return(xchg_relaxed,
>>>> pcp, val)
>>>> +#define this_cpu_xchg_8(pcp, val) _pcp_protect_return(xchg_relaxed,
>>>> pcp, val)
>>>> +
>>>> +#define this_cpu_cmpxchg_1(pcp, o, n)
>>>> _pcp_protect_return(cmpxchg_relaxed, pcp, o, n)
>>>> +#define this_cpu_cmpxchg_2(pcp, o, n)
>>>> _pcp_protect_return(cmpxchg_relaxed, pcp, o, n)
>>>> +#define this_cpu_cmpxchg_4(pcp, o, n)
>>>> _pcp_protect_return(cmpxchg_relaxed, pcp, o, n)
>>>> +#define this_cpu_cmpxchg_8(pcp, o, n)
>>>> _pcp_protect_return(cmpxchg_relaxed, pcp, o, n)
>>>> +
>>>> +#include <asm-generic/percpu.h>
>>>> +
>>>> +#endif /* __ASM_PERCPU_H */
>>>
>>> It all looks good to me, just one thing, can you also implement
>>> this_cpu_cmpxchg64/128()?
>>>
>> One last thing sorry, can you add a cover letter too?
> Okay.
>
>> Thanks!
>>
>> Alex
>>
>>
>>> And since this is almost a copy/paste from arm64, either mention it at
>>> the top of the file or (better) merge both implementations somewhere
>>> to avoid redefining existing code :) But up to you.
> Actually, there's a concern here. We should account for scenarios
> where ZABHA isn't supported. Given that xxx_8() and xxx_16() are
> rarely used in practice, could we initially support only xxx_32() and
> xxx_64()? For xxx_8() and xxx_16(), we could default to the generic
> implementation.


Why isn't lr/sc enough?


>
>
>>> Reviewed-by: Alexandre Ghiti <alexghiti@rivosinc.com>
>>>
>>> Thanks,
>>>
>>> Alex
>>>
>>>
>>>
> Thanks,
> Yunhui
>
> _______________________________________________
> linux-riscv mailing list
> linux-riscv@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-riscv

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [External] Re: [PATCH RFC 2/2] riscv: introduce percpu.h into include/asm
  2025-07-18 14:22         ` Alexandre Ghiti
@ 2025-07-18 14:33           ` yunhui cui
  2025-08-07 14:54             ` Alexandre Ghiti
  0 siblings, 1 reply; 10+ messages in thread
From: yunhui cui @ 2025-07-18 14:33 UTC (permalink / raw)
  To: Alexandre Ghiti
  Cc: yury.norov, linux, paul.walmsley, palmer, aou, linux-riscv,
	linux-kernel, dennis, tj, cl, linux-mm

Hi Alex,

On Fri, Jul 18, 2025 at 10:23 PM Alexandre Ghiti <alex@ghiti.fr> wrote:
>
> Hi Yunhui,
>
> On 7/18/25 08:40, yunhui cui wrote:
> > Hi Alex,
> >
> > On Thu, Jul 17, 2025 at 9:06 PM Alexandre Ghiti <alex@ghiti.fr> wrote:
> >> On 7/17/25 15:04, Alexandre Ghiti wrote:
> >>> Hi Yunhui,
> >>>
> >>> On 6/18/25 05:43, Yunhui Cui wrote:
> >>>> Current percpu operations rely on generic implementations, where
> >>>> raw_local_irq_save() introduces substantial overhead. Optimization
> >>>> is achieved through atomic operations and preemption disabling.
> >>>>
> >>>> Signed-off-by: Yunhui Cui <cuiyunhui@bytedance.com>
> >>>> ---
> >>>>    arch/riscv/include/asm/percpu.h | 138 ++++++++++++++++++++++++++++++++
> >>>>    1 file changed, 138 insertions(+)
> >>>>    create mode 100644 arch/riscv/include/asm/percpu.h
> >>>>
> >>>> diff --git a/arch/riscv/include/asm/percpu.h
> >>>> b/arch/riscv/include/asm/percpu.h
> >>>> new file mode 100644
> >>>> index 0000000000000..423c0d01f874c
> >>>> --- /dev/null
> >>>> +++ b/arch/riscv/include/asm/percpu.h
> >>>> @@ -0,0 +1,138 @@
> >>>> +/* SPDX-License-Identifier: GPL-2.0-only */
> >>>> +
> >>>> +#ifndef __ASM_PERCPU_H
> >>>> +#define __ASM_PERCPU_H
> >>>> +
> >>>> +#include <linux/preempt.h>
> >>>> +
> >>>> +#define PERCPU_RW_OPS(sz)                        \
> >>>> +static inline unsigned long __percpu_read_##sz(void *ptr)        \
> >>>> +{                                    \
> >>>> +    return READ_ONCE(*(u##sz *)ptr);                \
> >>>> +}                                    \
> >>>> +                                    \
> >>>> +static inline void __percpu_write_##sz(void *ptr, unsigned long
> >>>> val)    \
> >>>> +{                                    \
> >>>> +    WRITE_ONCE(*(u##sz *)ptr, (u##sz)val);                \
> >>>> +}
> >>>> +
> >>>> +#define __PERCPU_AMO_OP_CASE(sfx, name, sz, amo_insn)            \
> >>>> +static inline void                            \
> >>>> +__percpu_##name##_amo_case_##sz(void *ptr, unsigned long val)        \
> >>>> +{                                    \
> >>>> +    asm volatile (                            \
> >>>> +    "amo" #amo_insn #sfx " zero, %[val], %[ptr]"            \
> >>>> +    : [ptr] "+A" (*(u##sz *)ptr)                    \
> >>>> +    : [val] "r" ((u##sz)(val))                    \
> >>>> +    : "memory");                            \
> >>>> +}
> >>>> +
> >>>> +#define __PERCPU_AMO_RET_OP_CASE(sfx, name, sz, amo_insn)        \
> >>>> +static inline u##sz                            \
> >>>> +__percpu_##name##_return_amo_case_##sz(void *ptr, unsigned long
> >>>> val)    \
> >>>> +{                                    \
> >>>> +    register u##sz ret;                        \
> >>>> +                                    \
> >>>> +    asm volatile (                            \
> >>>> +    "amo" #amo_insn #sfx " %[ret], %[val], %[ptr]"            \
> >>>> +    : [ptr] "+A" (*(u##sz *)ptr), [ret] "=r" (ret)            \
> >>>> +    : [val] "r" ((u##sz)(val))                    \
> >>>> +    : "memory");                            \
> >>>> +                                    \
> >>>> +    return ret + val;                        \
> >>>> +}
> >>>> +
> >>>> +#define PERCPU_OP(name, amo_insn)                    \
> >>>> +    __PERCPU_AMO_OP_CASE(.b, name, 8, amo_insn)            \
> >>>> +    __PERCPU_AMO_OP_CASE(.h, name, 16, amo_insn)            \
> >>>> +    __PERCPU_AMO_OP_CASE(.w, name, 32, amo_insn)            \
> >>>> +    __PERCPU_AMO_OP_CASE(.d, name, 64, amo_insn)            \
> >>>> +
> >>>> +#define PERCPU_RET_OP(name, amo_insn)                    \
> >>>> +    __PERCPU_AMO_RET_OP_CASE(.b, name, 8, amo_insn) \
> >>>> +    __PERCPU_AMO_RET_OP_CASE(.h, name, 16, amo_insn)        \
> >>>> +    __PERCPU_AMO_RET_OP_CASE(.w, name, 32, amo_insn)        \
> >>>> +    __PERCPU_AMO_RET_OP_CASE(.d, name, 64, amo_insn)
> >>>> +
> >>>> +PERCPU_RW_OPS(8)
> >>>> +PERCPU_RW_OPS(16)
> >>>> +PERCPU_RW_OPS(32)
> >>>> +PERCPU_RW_OPS(64)
> >>>> +
> >>>> +PERCPU_OP(add, add)
> >>>> +PERCPU_OP(andnot, and)
> >>>> +PERCPU_OP(or, or)
> >>>> +PERCPU_RET_OP(add, add)
> >>>> +
> >>>> +#undef PERCPU_RW_OPS
> >>>> +#undef __PERCPU_AMO_OP_CASE
> >>>> +#undef __PERCPU_AMO_RET_OP_CASE
> >>>> +#undef PERCPU_OP
> >>>> +#undef PERCPU_RET_OP
> >>>> +
> >>>> +#define _pcp_protect(op, pcp, ...)                    \
> >>>> +({                                    \
> >>>> +    preempt_disable_notrace();                    \
> >>>> +    op(raw_cpu_ptr(&(pcp)), __VA_ARGS__);                \
> >>>> +    preempt_enable_notrace();                    \
> >>>> +})
> >>>> +
> >>>> +#define _pcp_protect_return(op, pcp, args...)                \
> >>>> +({                                    \
> >>>> +    typeof(pcp) __retval;                        \
> >>>> +    preempt_disable_notrace();                    \
> >>>> +    __retval = (typeof(pcp))op(raw_cpu_ptr(&(pcp)), ##args);    \
> >>>> +    preempt_enable_notrace();                    \
> >>>> +    __retval;                            \
> >>>> +})
> >>>> +
> >>>> +#define this_cpu_read_1(pcp) _pcp_protect_return(__percpu_read_8, pcp)
> >>>> +#define this_cpu_read_2(pcp) _pcp_protect_return(__percpu_read_16, pcp)
> >>>> +#define this_cpu_read_4(pcp) _pcp_protect_return(__percpu_read_32, pcp)
> >>>> +#define this_cpu_read_8(pcp) _pcp_protect_return(__percpu_read_64, pcp)
> >>>> +
> >>>> +#define this_cpu_write_1(pcp, val) _pcp_protect(__percpu_write_8,
> >>>> pcp, (unsigned long)val)
> >>>> +#define this_cpu_write_2(pcp, val) _pcp_protect(__percpu_write_16,
> >>>> pcp, (unsigned long)val)
> >>>> +#define this_cpu_write_4(pcp, val) _pcp_protect(__percpu_write_32,
> >>>> pcp, (unsigned long)val)
> >>>> +#define this_cpu_write_8(pcp, val) _pcp_protect(__percpu_write_64,
> >>>> pcp, (unsigned long)val)
> >>>> +
> >>>> +#define this_cpu_add_1(pcp, val)
> >>>> _pcp_protect(__percpu_add_amo_case_8, pcp, val)
> >>>> +#define this_cpu_add_2(pcp, val)
> >>>> _pcp_protect(__percpu_add_amo_case_16, pcp, val)
> >>>> +#define this_cpu_add_4(pcp, val)
> >>>> _pcp_protect(__percpu_add_amo_case_32, pcp, val)
> >>>> +#define this_cpu_add_8(pcp, val)
> >>>> _pcp_protect(__percpu_add_amo_case_64, pcp, val)
> >>>> +
> >>>> +#define this_cpu_add_return_1(pcp, val)        \
> >>>> +_pcp_protect_return(__percpu_add_return_amo_case_8, pcp, val)
> >>>> +
> >>>> +#define this_cpu_add_return_2(pcp, val)        \
> >>>> +_pcp_protect_return(__percpu_add_return_amo_case_16, pcp, val)
> >>>> +
> >>>> +#define this_cpu_add_return_4(pcp, val)        \
> >>>> +_pcp_protect_return(__percpu_add_return_amo_case_32, pcp, val)
> >>>> +
> >>>> +#define this_cpu_add_return_8(pcp, val)        \
> >>>> +_pcp_protect_return(__percpu_add_return_amo_case_64, pcp, val)
> >>>> +
> >>>> +#define this_cpu_and_1(pcp, val)
> >>>> _pcp_protect(__percpu_andnot_amo_case_8, pcp, ~val)
> >>>> +#define this_cpu_and_2(pcp, val)
> >>>> _pcp_protect(__percpu_andnot_amo_case_16, pcp, ~val)
> >>>> +#define this_cpu_and_4(pcp, val)
> >>>> _pcp_protect(__percpu_andnot_amo_case_32, pcp, ~val)
> >>>> +#define this_cpu_and_8(pcp, val)
> >>>> _pcp_protect(__percpu_andnot_amo_case_64, pcp, ~val)
> >>>
> >>> Why do we define __percpu_andnot based on amoand, and use
> >>> __percpu_andnot with ~val here? Can't we just define __percpu_and?
>
>
> What about that ^?
>
>
> >>>
> >>>> +
> >>>> +#define this_cpu_or_1(pcp, val) _pcp_protect(__percpu_or_amo_case_8,
> >>>> pcp, val)
> >>>> +#define this_cpu_or_2(pcp, val)
> >>>> _pcp_protect(__percpu_or_amo_case_16, pcp, val)
> >>>> +#define this_cpu_or_4(pcp, val)
> >>>> _pcp_protect(__percpu_or_amo_case_32, pcp, val)
> >>>> +#define this_cpu_or_8(pcp, val)
> >>>> _pcp_protect(__percpu_or_amo_case_64, pcp, val)
> >>>> +
> >>>> +#define this_cpu_xchg_1(pcp, val) _pcp_protect_return(xchg_relaxed,
> >>>> pcp, val)
> >>>> +#define this_cpu_xchg_2(pcp, val) _pcp_protect_return(xchg_relaxed,
> >>>> pcp, val)
> >>>> +#define this_cpu_xchg_4(pcp, val) _pcp_protect_return(xchg_relaxed,
> >>>> pcp, val)
> >>>> +#define this_cpu_xchg_8(pcp, val) _pcp_protect_return(xchg_relaxed,
> >>>> pcp, val)
> >>>> +
> >>>> +#define this_cpu_cmpxchg_1(pcp, o, n)
> >>>> _pcp_protect_return(cmpxchg_relaxed, pcp, o, n)
> >>>> +#define this_cpu_cmpxchg_2(pcp, o, n)
> >>>> _pcp_protect_return(cmpxchg_relaxed, pcp, o, n)
> >>>> +#define this_cpu_cmpxchg_4(pcp, o, n)
> >>>> _pcp_protect_return(cmpxchg_relaxed, pcp, o, n)
> >>>> +#define this_cpu_cmpxchg_8(pcp, o, n)
> >>>> _pcp_protect_return(cmpxchg_relaxed, pcp, o, n)
> >>>> +
> >>>> +#include <asm-generic/percpu.h>
> >>>> +
> >>>> +#endif /* __ASM_PERCPU_H */
> >>>
> >>> It all looks good to me, just one thing, can you also implement
> >>> this_cpu_cmpxchg64/128()?
> >>>
> >> One last thing sorry, can you add a cover letter too?
> > Okay.
> >
> >> Thanks!
> >>
> >> Alex
> >>
> >>
> >>> And since this is almost a copy/paste from arm64, either mention it at
> >>> the top of the file or (better) merge both implementations somewhere
> >>> to avoid redefining existing code :) But up to you.
> > Actually, there's a concern here. We should account for scenarios
> > where ZABHA isn't supported. Given that xxx_8() and xxx_16() are
> > rarely used in practice, could we initially support only xxx_32() and
> > xxx_64()? For xxx_8() and xxx_16(), we could default to the generic
> > implementation.
>
>
> Why isn't lr/sc enough?

If I'm not mistaken, current RISC-V does not provide lr.b/lr.h or
sc.b/sc.h (byte/halfword LR/SC), is that right?
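
Without them, a byte-wide op would have to be emulated with a 32-bit CAS
on the aligned word containing it, roughly like this (a userspace-style
sketch using GCC builtins, little-endian assumed, not the kernel's actual
helper; atomic_and_u8() is a made-up name):

#include <stdint.h>

/* Atomically AND 'val' into the byte at 'p' by retrying a 32-bit
 * compare-and-swap on the aligned word that contains it. */
static inline void atomic_and_u8(uint8_t *p, uint8_t val)
{
	uint32_t *word = (uint32_t *)((uintptr_t)p & ~(uintptr_t)3);
	unsigned int shift = ((uintptr_t)p & 3) * 8;
	uint32_t keep = ~((uint32_t)0xff << shift);	/* bits of the other bytes */
	uint32_t old = __atomic_load_n(word, __ATOMIC_RELAXED);
	uint32_t newval;

	do {
		newval = old & (keep | ((uint32_t)val << shift));
	} while (!__atomic_compare_exchange_n(word, &old, newval, 1,
					      __ATOMIC_RELAXED, __ATOMIC_RELAXED));
}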

>
>
> >
> >
> >>> Reviewed-by: Alexandre Ghiti <alexghiti@rivosinc.com>
> >>>
> >>> Thanks,
> >>>
> >>> Alex
> >>>
> >>>
> >>>
> > Thanks,
> > Yunhui
> >
> > _______________________________________________
> > linux-riscv mailing list
> > linux-riscv@lists.infradead.org
> > http://lists.infradead.org/mailman/listinfo/linux-riscv

Thanks,
Yunhui

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [External] Re: [PATCH RFC 2/2] riscv: introduce percpu.h into include/asm
  2025-07-18 14:33           ` yunhui cui
@ 2025-08-07 14:54             ` Alexandre Ghiti
  2025-08-19 12:45               ` yunhui cui
  0 siblings, 1 reply; 10+ messages in thread
From: Alexandre Ghiti @ 2025-08-07 14:54 UTC (permalink / raw)
  To: yunhui cui
  Cc: yury.norov, linux, paul.walmsley, palmer, aou, linux-riscv,
	linux-kernel, dennis, tj, cl, linux-mm

Hi Yunhui,

On 7/18/25 16:33, yunhui cui wrote:
> Hi Alex,
>
> On Fri, Jul 18, 2025 at 10:23 PM Alexandre Ghiti <alex@ghiti.fr> wrote:
>> Hi Yunhui,
>>
>> On 7/18/25 08:40, yunhui cui wrote:
>>> Hi Alex,
>>>
>>> On Thu, Jul 17, 2025 at 9:06 PM Alexandre Ghiti <alex@ghiti.fr> wrote:
>>>> On 7/17/25 15:04, Alexandre Ghiti wrote:
>>>>> Hi Yunhui,
>>>>>
>>>>> On 6/18/25 05:43, Yunhui Cui wrote:
>>>>>> Current percpu operations rely on generic implementations, where
>>>>>> raw_local_irq_save() introduces substantial overhead. Optimization
>>>>>> is achieved through atomic operations and preemption disabling.
>>>>>>
>>>>>> Signed-off-by: Yunhui Cui <cuiyunhui@bytedance.com>
>>>>>> ---
>>>>>>     arch/riscv/include/asm/percpu.h | 138 ++++++++++++++++++++++++++++++++
>>>>>>     1 file changed, 138 insertions(+)
>>>>>>     create mode 100644 arch/riscv/include/asm/percpu.h
>>>>>>
>>>>>> diff --git a/arch/riscv/include/asm/percpu.h
>>>>>> b/arch/riscv/include/asm/percpu.h
>>>>>> new file mode 100644
>>>>>> index 0000000000000..423c0d01f874c
>>>>>> --- /dev/null
>>>>>> +++ b/arch/riscv/include/asm/percpu.h
>>>>>> @@ -0,0 +1,138 @@
>>>>>> +/* SPDX-License-Identifier: GPL-2.0-only */
>>>>>> +
>>>>>> +#ifndef __ASM_PERCPU_H
>>>>>> +#define __ASM_PERCPU_H
>>>>>> +
>>>>>> +#include <linux/preempt.h>
>>>>>> +
>>>>>> +#define PERCPU_RW_OPS(sz)                        \
>>>>>> +static inline unsigned long __percpu_read_##sz(void *ptr)        \
>>>>>> +{                                    \
>>>>>> +    return READ_ONCE(*(u##sz *)ptr);                \
>>>>>> +}                                    \
>>>>>> +                                    \
>>>>>> +static inline void __percpu_write_##sz(void *ptr, unsigned long
>>>>>> val)    \
>>>>>> +{                                    \
>>>>>> +    WRITE_ONCE(*(u##sz *)ptr, (u##sz)val);                \
>>>>>> +}
>>>>>> +
>>>>>> +#define __PERCPU_AMO_OP_CASE(sfx, name, sz, amo_insn)            \
>>>>>> +static inline void                            \
>>>>>> +__percpu_##name##_amo_case_##sz(void *ptr, unsigned long val)        \
>>>>>> +{                                    \
>>>>>> +    asm volatile (                            \
>>>>>> +    "amo" #amo_insn #sfx " zero, %[val], %[ptr]"            \
>>>>>> +    : [ptr] "+A" (*(u##sz *)ptr)                    \
>>>>>> +    : [val] "r" ((u##sz)(val))                    \
>>>>>> +    : "memory");                            \
>>>>>> +}
>>>>>> +
>>>>>> +#define __PERCPU_AMO_RET_OP_CASE(sfx, name, sz, amo_insn)        \
>>>>>> +static inline u##sz                            \
>>>>>> +__percpu_##name##_return_amo_case_##sz(void *ptr, unsigned long
>>>>>> val)    \
>>>>>> +{                                    \
>>>>>> +    register u##sz ret;                        \
>>>>>> +                                    \
>>>>>> +    asm volatile (                            \
>>>>>> +    "amo" #amo_insn #sfx " %[ret], %[val], %[ptr]"            \
>>>>>> +    : [ptr] "+A" (*(u##sz *)ptr), [ret] "=r" (ret)            \
>>>>>> +    : [val] "r" ((u##sz)(val))                    \
>>>>>> +    : "memory");                            \
>>>>>> +                                    \
>>>>>> +    return ret + val;                        \
>>>>>> +}
>>>>>> +
>>>>>> +#define PERCPU_OP(name, amo_insn)                    \
>>>>>> +    __PERCPU_AMO_OP_CASE(.b, name, 8, amo_insn)            \
>>>>>> +    __PERCPU_AMO_OP_CASE(.h, name, 16, amo_insn)            \
>>>>>> +    __PERCPU_AMO_OP_CASE(.w, name, 32, amo_insn)            \
>>>>>> +    __PERCPU_AMO_OP_CASE(.d, name, 64, amo_insn)            \
>>>>>> +
>>>>>> +#define PERCPU_RET_OP(name, amo_insn)                    \
>>>>>> +    __PERCPU_AMO_RET_OP_CASE(.b, name, 8, amo_insn) \
>>>>>> +    __PERCPU_AMO_RET_OP_CASE(.h, name, 16, amo_insn)        \
>>>>>> +    __PERCPU_AMO_RET_OP_CASE(.w, name, 32, amo_insn)        \
>>>>>> +    __PERCPU_AMO_RET_OP_CASE(.d, name, 64, amo_insn)
>>>>>> +
>>>>>> +PERCPU_RW_OPS(8)
>>>>>> +PERCPU_RW_OPS(16)
>>>>>> +PERCPU_RW_OPS(32)
>>>>>> +PERCPU_RW_OPS(64)
>>>>>> +
>>>>>> +PERCPU_OP(add, add)
>>>>>> +PERCPU_OP(andnot, and)
>>>>>> +PERCPU_OP(or, or)
>>>>>> +PERCPU_RET_OP(add, add)
>>>>>> +
>>>>>> +#undef PERCPU_RW_OPS
>>>>>> +#undef __PERCPU_AMO_OP_CASE
>>>>>> +#undef __PERCPU_AMO_RET_OP_CASE
>>>>>> +#undef PERCPU_OP
>>>>>> +#undef PERCPU_RET_OP
>>>>>> +
>>>>>> +#define _pcp_protect(op, pcp, ...)                    \
>>>>>> +({                                    \
>>>>>> +    preempt_disable_notrace();                    \
>>>>>> +    op(raw_cpu_ptr(&(pcp)), __VA_ARGS__);                \
>>>>>> +    preempt_enable_notrace();                    \
>>>>>> +})
>>>>>> +
>>>>>> +#define _pcp_protect_return(op, pcp, args...)                \
>>>>>> +({                                    \
>>>>>> +    typeof(pcp) __retval;                        \
>>>>>> +    preempt_disable_notrace();                    \
>>>>>> +    __retval = (typeof(pcp))op(raw_cpu_ptr(&(pcp)), ##args);    \
>>>>>> +    preempt_enable_notrace();                    \
>>>>>> +    __retval;                            \
>>>>>> +})
>>>>>> +
>>>>>> +#define this_cpu_read_1(pcp) _pcp_protect_return(__percpu_read_8, pcp)
>>>>>> +#define this_cpu_read_2(pcp) _pcp_protect_return(__percpu_read_16, pcp)
>>>>>> +#define this_cpu_read_4(pcp) _pcp_protect_return(__percpu_read_32, pcp)
>>>>>> +#define this_cpu_read_8(pcp) _pcp_protect_return(__percpu_read_64, pcp)
>>>>>> +
>>>>>> +#define this_cpu_write_1(pcp, val) _pcp_protect(__percpu_write_8,
>>>>>> pcp, (unsigned long)val)
>>>>>> +#define this_cpu_write_2(pcp, val) _pcp_protect(__percpu_write_16,
>>>>>> pcp, (unsigned long)val)
>>>>>> +#define this_cpu_write_4(pcp, val) _pcp_protect(__percpu_write_32,
>>>>>> pcp, (unsigned long)val)
>>>>>> +#define this_cpu_write_8(pcp, val) _pcp_protect(__percpu_write_64,
>>>>>> pcp, (unsigned long)val)
>>>>>> +
>>>>>> +#define this_cpu_add_1(pcp, val)
>>>>>> _pcp_protect(__percpu_add_amo_case_8, pcp, val)
>>>>>> +#define this_cpu_add_2(pcp, val)
>>>>>> _pcp_protect(__percpu_add_amo_case_16, pcp, val)
>>>>>> +#define this_cpu_add_4(pcp, val)
>>>>>> _pcp_protect(__percpu_add_amo_case_32, pcp, val)
>>>>>> +#define this_cpu_add_8(pcp, val)
>>>>>> _pcp_protect(__percpu_add_amo_case_64, pcp, val)
>>>>>> +
>>>>>> +#define this_cpu_add_return_1(pcp, val)        \
>>>>>> +_pcp_protect_return(__percpu_add_return_amo_case_8, pcp, val)
>>>>>> +
>>>>>> +#define this_cpu_add_return_2(pcp, val)        \
>>>>>> +_pcp_protect_return(__percpu_add_return_amo_case_16, pcp, val)
>>>>>> +
>>>>>> +#define this_cpu_add_return_4(pcp, val)        \
>>>>>> +_pcp_protect_return(__percpu_add_return_amo_case_32, pcp, val)
>>>>>> +
>>>>>> +#define this_cpu_add_return_8(pcp, val)        \
>>>>>> +_pcp_protect_return(__percpu_add_return_amo_case_64, pcp, val)
>>>>>> +
>>>>>> +#define this_cpu_and_1(pcp, val)
>>>>>> _pcp_protect(__percpu_andnot_amo_case_8, pcp, ~val)
>>>>>> +#define this_cpu_and_2(pcp, val)
>>>>>> _pcp_protect(__percpu_andnot_amo_case_16, pcp, ~val)
>>>>>> +#define this_cpu_and_4(pcp, val)
>>>>>> _pcp_protect(__percpu_andnot_amo_case_32, pcp, ~val)
>>>>>> +#define this_cpu_and_8(pcp, val)
>>>>>> _pcp_protect(__percpu_andnot_amo_case_64, pcp, ~val)
>>>>> Why do we define __percpu_andnot based on amoand, and use
>>>>> __percpu_andnot with ~val here? Can't we just define __percpu_and?
>>
>> What about that ^?
>>
>>
>>>>>> +
>>>>>> +#define this_cpu_or_1(pcp, val) _pcp_protect(__percpu_or_amo_case_8,
>>>>>> pcp, val)
>>>>>> +#define this_cpu_or_2(pcp, val)
>>>>>> _pcp_protect(__percpu_or_amo_case_16, pcp, val)
>>>>>> +#define this_cpu_or_4(pcp, val)
>>>>>> _pcp_protect(__percpu_or_amo_case_32, pcp, val)
>>>>>> +#define this_cpu_or_8(pcp, val)
>>>>>> _pcp_protect(__percpu_or_amo_case_64, pcp, val)
>>>>>> +
>>>>>> +#define this_cpu_xchg_1(pcp, val) _pcp_protect_return(xchg_relaxed,
>>>>>> pcp, val)
>>>>>> +#define this_cpu_xchg_2(pcp, val) _pcp_protect_return(xchg_relaxed,
>>>>>> pcp, val)
>>>>>> +#define this_cpu_xchg_4(pcp, val) _pcp_protect_return(xchg_relaxed,
>>>>>> pcp, val)
>>>>>> +#define this_cpu_xchg_8(pcp, val) _pcp_protect_return(xchg_relaxed,
>>>>>> pcp, val)
>>>>>> +
>>>>>> +#define this_cpu_cmpxchg_1(pcp, o, n)
>>>>>> _pcp_protect_return(cmpxchg_relaxed, pcp, o, n)
>>>>>> +#define this_cpu_cmpxchg_2(pcp, o, n)
>>>>>> _pcp_protect_return(cmpxchg_relaxed, pcp, o, n)
>>>>>> +#define this_cpu_cmpxchg_4(pcp, o, n)
>>>>>> _pcp_protect_return(cmpxchg_relaxed, pcp, o, n)
>>>>>> +#define this_cpu_cmpxchg_8(pcp, o, n)
>>>>>> _pcp_protect_return(cmpxchg_relaxed, pcp, o, n)
>>>>>> +
>>>>>> +#include <asm-generic/percpu.h>
>>>>>> +
>>>>>> +#endif /* __ASM_PERCPU_H */
>>>>> It all looks good to me, just one thing, can you also implement
>>>>> this_cpu_cmpxchg64/128()?
>>>>>
>>>> One last thing sorry, can you add a cover letter too?
>>> Okay.
>>>
>>>> Thanks!
>>>>
>>>> Alex
>>>>
>>>>
>>>>> And since this is almost a copy/paste from arm64, either mention it at
>>>>> the top of the file or (better) merge both implementations somewhere
>>>>> to avoid redefining existing code :) But up to you.
>>> Actually, there's a concern here. We should account for scenarios
>>> where ZABHA isn't supported. Given that xxx_8() and xxx_16() are
>>> rarely used in practice, could we initially support only xxx_32() and
>>> xxx_64()? For xxx_8() and xxx_16(), we could default to the generic
>>> implementation.
>>
>> Why isn't lr/sc enough?
> If I'm not mistaken, the current RISC-V does not support lr.bh/sc.bh,
> is that right?


Yes, that's right, but we have an implementation of cmpxchg[8|16]() that 
uses lr.w/sc.w which works (unless I missed something, I have just 
checked again), so I think that's alright no?
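
Roughly, the idea behind that fallback is to emulate the narrow access with a
word-sized compare-and-swap plus masking, something along the lines of the
sketch below (an illustration of the idea only, not the kernel's actual
cmpxchg[8|16]() code; it assumes the usual u8/u32, READ_ONCE() and
cmpxchg_relaxed() kernel helpers and RISC-V's little-endian byte order):

/* Emulate a 1-byte cmpxchg using the 32-bit word that contains the byte. */
static inline u8 cmpxchg8_via_word(u8 *ptr, u8 old, u8 new)
{
	u32 *word = (u32 *)((unsigned long)ptr & ~0x3UL);	/* containing aligned word */
	unsigned int shift = ((unsigned long)ptr & 0x3UL) * 8;	/* byte position in word */
	u32 mask = 0xffU << shift;
	u32 w, updated;

	do {
		w = READ_ONCE(*word);
		if (((w >> shift) & 0xffU) != old)
			return (w >> shift) & 0xffU;	/* compare failed, report current value */
		updated = (w & ~mask) | ((u32)new << shift);
	} while (cmpxchg_relaxed(word, w, updated) != w);	/* retry if the word changed */

	return old;
}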

Thanks,

Alex


>
>>
>>>
>>>>> Reviewed-by: Alexandre Ghiti <alexghiti@rivosinc.com>
>>>>>
>>>>> Thanks,
>>>>>
>>>>> Alex
>>>>>
>>>>>
>>>>>
>>> Thanks,
>>> Yunhui
>>>
>>> _______________________________________________
>>> linux-riscv mailing list
>>> linux-riscv@lists.infradead.org
>>> http://lists.infradead.org/mailman/listinfo/linux-riscv
> Thanks,
> Yunhui
>
> _______________________________________________
> linux-riscv mailing list
> linux-riscv@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-riscv

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [External] Re: [PATCH RFC 2/2] riscv: introduce percpu.h into include/asm
  2025-08-07 14:54             ` Alexandre Ghiti
@ 2025-08-19 12:45               ` yunhui cui
  0 siblings, 0 replies; 10+ messages in thread
From: yunhui cui @ 2025-08-19 12:45 UTC (permalink / raw)
  To: Alexandre Ghiti
  Cc: yury.norov, linux, paul.walmsley, palmer, aou, linux-riscv,
	linux-kernel, dennis, tj, cl, linux-mm

Hi Alex,

On Thu, Aug 7, 2025 at 10:55 PM Alexandre Ghiti <alex@ghiti.fr> wrote:
>
> Hi Yunhui,
>
> On 7/18/25 16:33, yunhui cui wrote:
> > Hi Alex,
> >
> > On Fri, Jul 18, 2025 at 10:23 PM Alexandre Ghiti <alex@ghiti.fr> wrote:
> >> Hi Yunhui,
> >>
> >> On 7/18/25 08:40, yunhui cui wrote:
> >>> Hi Alex,
> >>>
> >>> On Thu, Jul 17, 2025 at 9:06 PM Alexandre Ghiti <alex@ghiti.fr> wrote:
> >>>> On 7/17/25 15:04, Alexandre Ghiti wrote:
> >>>>> Hi Yunhui,
> >>>>>
> >>>>> On 6/18/25 05:43, Yunhui Cui wrote:
> >>>>>> Current percpu operations rely on generic implementations, where
> >>>>>> raw_local_irq_save() introduces substantial overhead. Optimization
> >>>>>> is achieved through atomic operations and preemption disabling.
> >>>>>>
> >>>>>> Signed-off-by: Yunhui Cui <cuiyunhui@bytedance.com>
> >>>>>> ---
> >>>>>>     arch/riscv/include/asm/percpu.h | 138 ++++++++++++++++++++++++++++++++
> >>>>>>     1 file changed, 138 insertions(+)
> >>>>>>     create mode 100644 arch/riscv/include/asm/percpu.h
> >>>>>>
> >>>>>> diff --git a/arch/riscv/include/asm/percpu.h
> >>>>>> b/arch/riscv/include/asm/percpu.h
> >>>>>> new file mode 100644
> >>>>>> index 0000000000000..423c0d01f874c
> >>>>>> --- /dev/null
> >>>>>> +++ b/arch/riscv/include/asm/percpu.h
> >>>>>> @@ -0,0 +1,138 @@
> >>>>>> [...]
> >>>>>> +#define this_cpu_and_1(pcp, val)
> >>>>>> _pcp_protect(__percpu_andnot_amo_case_8, pcp, ~val)
> >>>>>> +#define this_cpu_and_2(pcp, val)
> >>>>>> _pcp_protect(__percpu_andnot_amo_case_16, pcp, ~val)
> >>>>>> +#define this_cpu_and_4(pcp, val)
> >>>>>> _pcp_protect(__percpu_andnot_amo_case_32, pcp, ~val)
> >>>>>> +#define this_cpu_and_8(pcp, val)
> >>>>>> _pcp_protect(__percpu_andnot_amo_case_64, pcp, ~val)
> >>>>> Why do we define __percpu_andnot based on amoand, and use
> >>>>> __percpu_andnot with ~val here? Can't we just define __percpu_and?
> >>
> >> What about that ^?
> >>
> >>
> >>>>>> +
> >>>>>> +#define this_cpu_or_1(pcp, val) _pcp_protect(__percpu_or_amo_case_8,
> >>>>>> pcp, val)
> >>>>>> +#define this_cpu_or_2(pcp, val)
> >>>>>> _pcp_protect(__percpu_or_amo_case_16, pcp, val)
> >>>>>> +#define this_cpu_or_4(pcp, val)
> >>>>>> _pcp_protect(__percpu_or_amo_case_32, pcp, val)
> >>>>>> +#define this_cpu_or_8(pcp, val)
> >>>>>> _pcp_protect(__percpu_or_amo_case_64, pcp, val)
> >>>>>> +
> >>>>>> +#define this_cpu_xchg_1(pcp, val) _pcp_protect_return(xchg_relaxed,
> >>>>>> pcp, val)
> >>>>>> +#define this_cpu_xchg_2(pcp, val) _pcp_protect_return(xchg_relaxed,
> >>>>>> pcp, val)
> >>>>>> +#define this_cpu_xchg_4(pcp, val) _pcp_protect_return(xchg_relaxed,
> >>>>>> pcp, val)
> >>>>>> +#define this_cpu_xchg_8(pcp, val) _pcp_protect_return(xchg_relaxed,
> >>>>>> pcp, val)
> >>>>>> +
> >>>>>> +#define this_cpu_cmpxchg_1(pcp, o, n)
> >>>>>> _pcp_protect_return(cmpxchg_relaxed, pcp, o, n)
> >>>>>> +#define this_cpu_cmpxchg_2(pcp, o, n)
> >>>>>> _pcp_protect_return(cmpxchg_relaxed, pcp, o, n)
> >>>>>> +#define this_cpu_cmpxchg_4(pcp, o, n)
> >>>>>> _pcp_protect_return(cmpxchg_relaxed, pcp, o, n)
> >>>>>> +#define this_cpu_cmpxchg_8(pcp, o, n)
> >>>>>> _pcp_protect_return(cmpxchg_relaxed, pcp, o, n)
> >>>>>> +
> >>>>>> +#include <asm-generic/percpu.h>
> >>>>>> +
> >>>>>> +#endif /* __ASM_PERCPU_H */
> >>>>> It all looks good to me, just one thing, can you also implement
> >>>>> this_cpu_cmpxchg64/128()?
> >>>>>
> >>>> One last thing sorry, can you add a cover letter too?
> >>> Okay.
> >>>
> >>>> Thanks!
> >>>>
> >>>> Alex
> >>>>
> >>>>
> >>>>> And since this is almost a copy/paste from arm64, either mention it at
> >>>>> the top of the file or (better) merge both implementations somewhere
> >>>>> to avoid redefining existing code :) But up to you.
> >>> Actually, there's a concern here. We should account for scenarios
> >>> where ZABHA isn't supported. Given that xxx_8() and xxx_16() are
> >>> rarely used in practice, could we initially support only xxx_32() and
> >>> xxx_64()? For xxx_8() and xxx_16(), we could default to the generic
> >>> implementation.
> >>
> >> Why isn't lr/sc enough?
> > If I'm not mistaken, the current RISC-V does not support lr.bh/sc.bh,
> > is that right?
>
>
> Yes, that's right, but we have an implementation of cmpxchg[8|16]() that
> uses lr.w/sc.w which works (unless I missed something, I have just
> checked again), so I think that's alright no?

Since RISC-V has no lr.b/sc.b or lr.h/sc.h, we would have to fall back to
lr.w/sc.w plus some extra masking whenever ZABHA is not supported. In
practice, 8/16-bit percpu operations are very rare; the counts observed
during system startup are as follows:

8-bit Adds: 3, 16-bit Adds: 3, 32-bit Adds: 31858, 64-bit Adds: 7656
8-bit Add-Returns: 0, 16-bit Add-Returns: 0, 32-bit Add-Returns: 0, 64-bit Add-Returns: 2
8-bit ANDs: 0, 16-bit ANDs: 0, 32-bit ANDs: 0, 64-bit ANDs: 0
8-bit ANDNOTs: 0, 16-bit ANDNOTs: 0, 32-bit ANDNOTs: 0, 64-bit ANDNOTs: 0
8-bit ORs: 0, 16-bit ORs: 0, 32-bit ORs: 70, 64-bit ORs: 0

So, when ZABHA is not supported, can the 8-bit/16-bit operations simply
fall back to the generic implementation?
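
To make the shape of that proposal concrete, a minimal sketch could look like
the following (illustrative only; the macro names reuse the ones from this
patch, the Zabha guard is a hypothetical compile-time stand-in, and a real
series would more likely use runtime alternatives):

#ifdef CONFIG_RISCV_ISA_ZABHA	/* hypothetical guard, for illustration */
#define this_cpu_add_1(pcp, val) _pcp_protect(__percpu_add_amo_case_8, pcp, val)
#define this_cpu_add_2(pcp, val) _pcp_protect(__percpu_add_amo_case_16, pcp, val)
#endif
#define this_cpu_add_4(pcp, val) _pcp_protect(__percpu_add_amo_case_32, pcp, val)
#define this_cpu_add_8(pcp, val) _pcp_protect(__percpu_add_amo_case_64, pcp, val)

/*
 * When the 8/16-bit variants are left undefined, <asm-generic/percpu.h>
 * supplies its generic this_cpu_*() fallbacks for them.
 */
#include <asm-generic/percpu.h>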


>
> Thanks,
>
> Alex
>
>
> >
> >>
> >>>
> >>>>> Reviewed-by: Alexandre Ghiti <alexghiti@rivosinc.com>
> >>>>>
> >>>>> Thanks,
> >>>>>
> >>>>> Alex
> >>>>>
> >>>>>
> >>>>>
> >>> Thanks,
> >>> Yunhui
> >>>
> >>> _______________________________________________
> >>> linux-riscv mailing list
> >>> linux-riscv@lists.infradead.org
> >>> http://lists.infradead.org/mailman/listinfo/linux-riscv
> > Thanks,
> > Yunhui
> >
> > _______________________________________________
> > linux-riscv mailing list
> > linux-riscv@lists.infradead.org
> > http://lists.infradead.org/mailman/listinfo/linux-riscv

Thanks,
Yunhui

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2025-08-19 12:45 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-06-18  3:43 [PATCH RFC 1/2] riscv: remove irqflags.h inclusion in asm/bitops.h Yunhui Cui
2025-06-18  3:43 ` [PATCH RFC 2/2] riscv: introduce percpu.h into include/asm Yunhui Cui
2025-07-17 13:04   ` Alexandre Ghiti
2025-07-17 13:05     ` Alexandre Ghiti
2025-07-18  6:40       ` [External] " yunhui cui
2025-07-18 14:22         ` Alexandre Ghiti
2025-07-18 14:33           ` yunhui cui
2025-08-07 14:54             ` Alexandre Ghiti
2025-08-19 12:45               ` yunhui cui
2025-06-21 14:33 ` [PATCH RFC 1/2] riscv: remove irqflags.h inclusion in asm/bitops.h Yury Norov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).