public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH v3] LoongArch/percpu: Simplify _percpu_read() and _percpu_write()
@ 2024-09-05  6:54 Uros Bizjak
  2024-09-05 11:47 ` Huacai Chen
  0 siblings, 1 reply; 11+ messages in thread
From: Uros Bizjak @ 2024-09-05  6:54 UTC (permalink / raw)
  To: loongarch, linux-kernel
  Cc: Uros Bizjak, Huacai Chen, WANG Xuerui, Xi Ruoyao, Thomas Gleixner

_percpu_read() and _percpu_write() macros call __percpu_read()
and __percpu_write() static inline functions that result in a single
assembly instruction. Percpu infrastructure expects its leaf
definitions to encode the size of their percpu variable, so the patch
merges asm clauses from the static inline function into the
corresponding leaf macros.

The secondary effect of this change is to avoid explicit __percpu
function arguments. Currently, __percpu macro is defined in
include/linux/compiler_types.h, but with proposed patch [1],
__percpu definition will need macros from include/asm-generic/percpu.h,
creating forward dependency loop.

The proposed solution is the same as x86 architecture uses.

Patch is compile tested only.

[1] https://lore.kernel.org/lkml/20240812115945.484051-4-ubizjak@gmail.com/

Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: WANG Xuerui <kernel@xen0n.name>
Cc: Xi Ruoyao <xry111@xry111.site>
Cc: Thomas Gleixner <tglx@linutronix.de>
---
v2: Add a missing cast and a missing comma in the asm template,
    reported by kernel test robot. Some formatting changes.
v3: Check the type of _val in _percpu_write().
---
 arch/loongarch/include/asm/percpu.h | 135 ++++++++++------------------
 1 file changed, 46 insertions(+), 89 deletions(-)

diff --git a/arch/loongarch/include/asm/percpu.h b/arch/loongarch/include/asm/percpu.h
index 8f290e5546cf..90ece62ec24c 100644
--- a/arch/loongarch/include/asm/percpu.h
+++ b/arch/loongarch/include/asm/percpu.h
@@ -68,75 +68,6 @@ PERCPU_OP(and, and, &)
 PERCPU_OP(or, or, |)
 #undef PERCPU_OP
 
-static __always_inline unsigned long __percpu_read(void __percpu *ptr, int size)
-{
-	unsigned long ret;
-
-	switch (size) {
-	case 1:
-		__asm__ __volatile__ ("ldx.b %[ret], $r21, %[ptr]	\n"
-		: [ret] "=&r"(ret)
-		: [ptr] "r"(ptr)
-		: "memory");
-		break;
-	case 2:
-		__asm__ __volatile__ ("ldx.h %[ret], $r21, %[ptr]	\n"
-		: [ret] "=&r"(ret)
-		: [ptr] "r"(ptr)
-		: "memory");
-		break;
-	case 4:
-		__asm__ __volatile__ ("ldx.w %[ret], $r21, %[ptr]	\n"
-		: [ret] "=&r"(ret)
-		: [ptr] "r"(ptr)
-		: "memory");
-		break;
-	case 8:
-		__asm__ __volatile__ ("ldx.d %[ret], $r21, %[ptr]	\n"
-		: [ret] "=&r"(ret)
-		: [ptr] "r"(ptr)
-		: "memory");
-		break;
-	default:
-		ret = 0;
-		BUILD_BUG();
-	}
-
-	return ret;
-}
-
-static __always_inline void __percpu_write(void __percpu *ptr, unsigned long val, int size)
-{
-	switch (size) {
-	case 1:
-		__asm__ __volatile__("stx.b %[val], $r21, %[ptr]	\n"
-		:
-		: [val] "r" (val), [ptr] "r" (ptr)
-		: "memory");
-		break;
-	case 2:
-		__asm__ __volatile__("stx.h %[val], $r21, %[ptr]	\n"
-		:
-		: [val] "r" (val), [ptr] "r" (ptr)
-		: "memory");
-		break;
-	case 4:
-		__asm__ __volatile__("stx.w %[val], $r21, %[ptr]	\n"
-		:
-		: [val] "r" (val), [ptr] "r" (ptr)
-		: "memory");
-		break;
-	case 8:
-		__asm__ __volatile__("stx.d %[val], $r21, %[ptr]	\n"
-		:
-		: [val] "r" (val), [ptr] "r" (ptr)
-		: "memory");
-		break;
-	default:
-		BUILD_BUG();
-	}
-}
-
 static __always_inline unsigned long __percpu_xchg(void *ptr, unsigned long val, int size)
 {
 	switch (size) {
@@ -157,6 +88,44 @@ static __always_inline unsigned long __percpu_xchg(void *ptr, unsigned long val,
 	return 0;
 }
 
+#define __pcpu_op_1(op)		op ".b "
+#define __pcpu_op_2(op)		op ".h "
+#define __pcpu_op_4(op)		op ".w "
+#define __pcpu_op_8(op)		op ".d "
+
+#define _percpu_read(size, _pcp)					\
+({									\
+	unsigned long __pcp_ret;					\
+									\
+	__asm__ __volatile__(						\
+		__pcpu_op_##size("ldx") "%[ret], $r21, %[ptr]	\n"	\
+		: [ret] "=&r"(__pcp_ret)				\
+		: [ptr] "r"(&(_pcp))					\
+		: "memory");						\
+	(typeof(_pcp))__pcp_ret;					\
+})
+
+#define __pcpu_cast_1(val)	(((unsigned long) val) & 0xff)
+#define __pcpu_cast_2(val)	(((unsigned long) val) & 0xffff)
+#define __pcpu_cast_4(val)	(((unsigned long) val) & 0xffffffff)
+#define __pcpu_cast_8(val)	((unsigned long) val)
+
+#define _percpu_write(size, _pcp, _val)					\
+do {									\
+	unsigned long __pcp_val = __pcpu_cast_##size(_val);		\
+									\
+	if (0) {		                                        \
+		typeof(_pcp) pto_tmp__;					\
+		pto_tmp__ = (_val);					\
+		(void)pto_tmp__;					\
+	}								\
+	__asm__ __volatile__(						\
+		__pcpu_op_##size("stx") "%[val], $r21, %[ptr]	\n"	\
+		:							\
+		: [val] "r"(__pcp_val), [ptr] "r"(&(_pcp))		\
+		: "memory");						\
+} while (0)
+
 /* this_cpu_cmpxchg */
 #define _protect_cmpxchg_local(pcp, o, n)			\
 ({								\
@@ -167,18 +136,6 @@ static __always_inline unsigned long __percpu_xchg(void *ptr, unsigned long val,
 	__ret;							\
 })
 
-#define _percpu_read(pcp)						\
-({									\
-	typeof(pcp) __retval;						\
-	__retval = (typeof(pcp))__percpu_read(&(pcp), sizeof(pcp));	\
-	__retval;							\
-})
-
-#define _percpu_write(pcp, val)						\
-do {									\
-	__percpu_write(&(pcp), (unsigned long)(val), sizeof(pcp));	\
-} while (0)								\
-
 #define _pcp_protect(operation, pcp, val)			\
 ({								\
 	typeof(pcp) __retval;					\
@@ -215,15 +172,15 @@ do {									\
 #define this_cpu_or_4(pcp, val) _percpu_or(pcp, val)
 #define this_cpu_or_8(pcp, val) _percpu_or(pcp, val)
 
-#define this_cpu_read_1(pcp) _percpu_read(pcp)
-#define this_cpu_read_2(pcp) _percpu_read(pcp)
-#define this_cpu_read_4(pcp) _percpu_read(pcp)
-#define this_cpu_read_8(pcp) _percpu_read(pcp)
+#define this_cpu_read_1(pcp) _percpu_read(1, pcp)
+#define this_cpu_read_2(pcp) _percpu_read(2, pcp)
+#define this_cpu_read_4(pcp) _percpu_read(4, pcp)
+#define this_cpu_read_8(pcp) _percpu_read(8, pcp)
 
-#define this_cpu_write_1(pcp, val) _percpu_write(pcp, val)
-#define this_cpu_write_2(pcp, val) _percpu_write(pcp, val)
-#define this_cpu_write_4(pcp, val) _percpu_write(pcp, val)
-#define this_cpu_write_8(pcp, val) _percpu_write(pcp, val)
+#define this_cpu_write_1(pcp, val) _percpu_write(1, pcp, val)
+#define this_cpu_write_2(pcp, val) _percpu_write(2, pcp, val)
+#define this_cpu_write_4(pcp, val) _percpu_write(4, pcp, val)
+#define this_cpu_write_8(pcp, val) _percpu_write(8, pcp, val)
 
 #define this_cpu_xchg_1(pcp, val) _percpu_xchg(pcp, val)
 #define this_cpu_xchg_2(pcp, val) _percpu_xchg(pcp, val)
-- 
2.46.0


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [PATCH v3] LoongArch/percpu: Simplify _percpu_read() and _percpu_write()
  2024-09-05  6:54 [PATCH v3] LoongArch/percpu: Simplify _percpu_read() and _percpu_write() Uros Bizjak
@ 2024-09-05 11:47 ` Huacai Chen
  2024-09-05 11:58   ` Xi Ruoyao
  2024-09-05 12:02   ` Uros Bizjak
  0 siblings, 2 replies; 11+ messages in thread
From: Huacai Chen @ 2024-09-05 11:47 UTC (permalink / raw)
  To: Uros Bizjak
  Cc: loongarch, linux-kernel, WANG Xuerui, Xi Ruoyao, Thomas Gleixner

Hi, Uros,

On Thu, Sep 5, 2024 at 2:54 PM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> _percpu_read() and _percpu_write() macros call __percpu_read()
> and __percpu_write() static inline functions that result in a single
> assembly instruction. Percpu infrastructure expects its leaf
> definitions to encode the size of their percpu variable, so the patch
> merges asm clauses from the static inline function into the
> corresponding leaf macros.
>
> The secondary effect of this change is to avoid explicit __percpu
> function arguments. Currently, __percpu macro is defined in
> include/linux/compiler_types.h, but with proposed patch [1],
> __percpu definition will need macros from include/asm-generic/percpu.h,
> creating forward dependency loop.
>
> The proposed solution is the same as x86 architecture uses.
>
> Patch is compile tested only.
>
> [1] https://lore.kernel.org/lkml/20240812115945.484051-4-ubizjak@gmail.com/
>
> Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
> Cc: Huacai Chen <chenhuacai@kernel.org>
> Cc: WANG Xuerui <kernel@xen0n.name>
> Cc: Xi Ruoyao <xry111@xry111.site>
> Cc: Thomas Gleixner <tglx@linutronix.de>
> ---
> v2: Add a missing cast and a missing comma in the asm template,
>     reported by kernel test robot. Some formatting changes.
> v3: Check the type of _val in _percpu_write().
> ---
>  arch/loongarch/include/asm/percpu.h | 135 ++++++++++------------------
>  1 file changed, 46 insertions(+), 89 deletions(-)
>
> diff --git a/arch/loongarch/include/asm/percpu.h b/arch/loongarch/include/asm/percpu.h
> index 8f290e5546cf..90ece62ec24c 100644
> --- a/arch/loongarch/include/asm/percpu.h
> +++ b/arch/loongarch/include/asm/percpu.h
> @@ -68,75 +68,6 @@ PERCPU_OP(and, and, &)
>  PERCPU_OP(or, or, |)
>  #undef PERCPU_OP
>
> -static __always_inline unsigned long __percpu_read(void __percpu *ptr, int size)
> -{
> -       unsigned long ret;
> -
> -       switch (size) {
> -       case 1:
> -               __asm__ __volatile__ ("ldx.b %[ret], $r21, %[ptr]       \n"
> -               : [ret] "=&r"(ret)
> -               : [ptr] "r"(ptr)
> -               : "memory");
> -               break;
> -       case 2:
> -               __asm__ __volatile__ ("ldx.h %[ret], $r21, %[ptr]       \n"
> -               : [ret] "=&r"(ret)
> -               : [ptr] "r"(ptr)
> -               : "memory");
> -               break;
> -       case 4:
> -               __asm__ __volatile__ ("ldx.w %[ret], $r21, %[ptr]       \n"
> -               : [ret] "=&r"(ret)
> -               : [ptr] "r"(ptr)
> -               : "memory");
> -               break;
> -       case 8:
> -               __asm__ __volatile__ ("ldx.d %[ret], $r21, %[ptr]       \n"
> -               : [ret] "=&r"(ret)
> -               : [ptr] "r"(ptr)
> -               : "memory");
> -               break;
> -       default:
> -               ret = 0;
> -               BUILD_BUG();
> -       }
> -
> -       return ret;
> -}
> -
> -static __always_inline void __percpu_write(void __percpu *ptr, unsigned long val, int size)
> -{
> -       switch (size) {
> -       case 1:
> -               __asm__ __volatile__("stx.b %[val], $r21, %[ptr]        \n"
> -               :
> -               : [val] "r" (val), [ptr] "r" (ptr)
> -               : "memory");
> -               break;
> -       case 2:
> -               __asm__ __volatile__("stx.h %[val], $r21, %[ptr]        \n"
> -               :
> -               : [val] "r" (val), [ptr] "r" (ptr)
> -               : "memory");
> -               break;
> -       case 4:
> -               __asm__ __volatile__("stx.w %[val], $r21, %[ptr]        \n"
> -               :
> -               : [val] "r" (val), [ptr] "r" (ptr)
> -               : "memory");
> -               break;
> -       case 8:
> -               __asm__ __volatile__("stx.d %[val], $r21, %[ptr]        \n"
> -               :
> -               : [val] "r" (val), [ptr] "r" (ptr)
> -               : "memory");
> -               break;
> -       default:
> -               BUILD_BUG();
> -       }
> -}
> -
>  static __always_inline unsigned long __percpu_xchg(void *ptr, unsigned long val, int size)
>  {
>         switch (size) {
> @@ -157,6 +88,44 @@ static __always_inline unsigned long __percpu_xchg(void *ptr, unsigned long val,
>         return 0;
>  }
>
> +#define __pcpu_op_1(op)                op ".b "
> +#define __pcpu_op_2(op)                op ".h "
> +#define __pcpu_op_4(op)                op ".w "
> +#define __pcpu_op_8(op)                op ".d "
> +
> +#define _percpu_read(size, _pcp)                                       \
> +({                                                                     \
> +       unsigned long __pcp_ret;                                        \
> +                                                                       \
> +       __asm__ __volatile__(                                           \
> +               __pcpu_op_##size("ldx") "%[ret], $r21, %[ptr]   \n"     \
> +               : [ret] "=&r"(__pcp_ret)                                \
> +               : [ptr] "r"(&(_pcp))                                    \
> +               : "memory");                                            \
> +       (typeof(_pcp))__pcp_ret;                                        \
> +})
> +
> +#define __pcpu_cast_1(val)     (((unsigned long) val) & 0xff)
If the input value is less than 0xff, then "& 0xff" is meaningless, if
the input value is more than 0xff, this conversion still cannot give a
correct result for the caller. So I think for all sizes it is enough
to just use "((unsigned long) val)".

> +#define __pcpu_cast_2(val)     (((unsigned long) val) & 0xffff)
> +#define __pcpu_cast_4(val)     (((unsigned long) val) & 0xffffffff)
> +#define __pcpu_cast_8(val)     ((unsigned long) val)
> +
> +#define _percpu_write(size, _pcp, _val)                                        \
> +do {                                                                   \
> +       unsigned long __pcp_val = __pcpu_cast_##size(_val);             \
> +                                                                       \
> +       if (0) {                                                        \
> +               typeof(_pcp) pto_tmp__;                                 \
> +               pto_tmp__ = (_val);                                     \
> +               (void)pto_tmp__;                                        \
> +       }                                                               \
Emmm, in V2 I just confirm that whether it is worth to use macro
instead of inline functions, I think we don't really need such a
checking here. :)


Huacai

> +       __asm__ __volatile__(                                           \
> +               __pcpu_op_##size("stx") "%[val], $r21, %[ptr]   \n"     \
> +               :                                                       \
> +               : [val] "r"(__pcp_val), [ptr] "r"(&(_pcp))              \
> +               : "memory");                                            \
> +} while (0)
> +
>  /* this_cpu_cmpxchg */
>  #define _protect_cmpxchg_local(pcp, o, n)                      \
>  ({                                                             \
> @@ -167,18 +136,6 @@ static __always_inline unsigned long __percpu_xchg(void *ptr, unsigned long val,
>         __ret;                                                  \
>  })
>
> -#define _percpu_read(pcp)                                              \
> -({                                                                     \
> -       typeof(pcp) __retval;                                           \
> -       __retval = (typeof(pcp))__percpu_read(&(pcp), sizeof(pcp));     \
> -       __retval;                                                       \
> -})
> -
> -#define _percpu_write(pcp, val)                                                \
> -do {                                                                   \
> -       __percpu_write(&(pcp), (unsigned long)(val), sizeof(pcp));      \
> -} while (0)                                                            \
> -
>  #define _pcp_protect(operation, pcp, val)                      \
>  ({                                                             \
>         typeof(pcp) __retval;                                   \
> @@ -215,15 +172,15 @@ do {                                                                      \
>  #define this_cpu_or_4(pcp, val) _percpu_or(pcp, val)
>  #define this_cpu_or_8(pcp, val) _percpu_or(pcp, val)
>
> -#define this_cpu_read_1(pcp) _percpu_read(pcp)
> -#define this_cpu_read_2(pcp) _percpu_read(pcp)
> -#define this_cpu_read_4(pcp) _percpu_read(pcp)
> -#define this_cpu_read_8(pcp) _percpu_read(pcp)
> +#define this_cpu_read_1(pcp) _percpu_read(1, pcp)
> +#define this_cpu_read_2(pcp) _percpu_read(2, pcp)
> +#define this_cpu_read_4(pcp) _percpu_read(4, pcp)
> +#define this_cpu_read_8(pcp) _percpu_read(8, pcp)
>
> -#define this_cpu_write_1(pcp, val) _percpu_write(pcp, val)
> -#define this_cpu_write_2(pcp, val) _percpu_write(pcp, val)
> -#define this_cpu_write_4(pcp, val) _percpu_write(pcp, val)
> -#define this_cpu_write_8(pcp, val) _percpu_write(pcp, val)
> +#define this_cpu_write_1(pcp, val) _percpu_write(1, pcp, val)
> +#define this_cpu_write_2(pcp, val) _percpu_write(2, pcp, val)
> +#define this_cpu_write_4(pcp, val) _percpu_write(4, pcp, val)
> +#define this_cpu_write_8(pcp, val) _percpu_write(8, pcp, val)
>
>  #define this_cpu_xchg_1(pcp, val) _percpu_xchg(pcp, val)
>  #define this_cpu_xchg_2(pcp, val) _percpu_xchg(pcp, val)
> --
> 2.46.0
>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v3] LoongArch/percpu: Simplify _percpu_read() and _percpu_write()
  2024-09-05 11:47 ` Huacai Chen
@ 2024-09-05 11:58   ` Xi Ruoyao
  2024-09-05 12:13     ` Uros Bizjak
  2024-09-05 12:02   ` Uros Bizjak
  1 sibling, 1 reply; 11+ messages in thread
From: Xi Ruoyao @ 2024-09-05 11:58 UTC (permalink / raw)
  To: Uros Bizjak
  Cc: Huacai Chen, loongarch, linux-kernel, WANG Xuerui,
	Thomas Gleixner

On Thu, 2024-09-05 at 19:47 +0800, Huacai Chen wrote:
> > +#define _percpu_write(size, _pcp, _val)                                        \
> > +do {                                                                   \
> > +       unsigned long __pcp_val = __pcpu_cast_##size(_val);             \
> > +                                                                       \
> > +       if (0) {                                                        \
> > +               typeof(_pcp) pto_tmp__;                                 \
> > +               pto_tmp__ = (_val);                                     \
> > +               (void)pto_tmp__;                                        \
> > +       }                                                               \
> Emmm, in V2 I just confirm that whether it is worth to use macro
> instead of inline functions, I think we don't really need such a
> checking here. :)

It's still (slightly) better (just from aesthetics view) to use inline
function instead of macro.  Is it possible to clear the header
dependency and break the circle so we can still use __percpu in arch
percpu.h instead?

(Simply removing __percpu from the function parameter list causes many
warnings with make C=1.)

-- 
Xi Ruoyao <xry111@xry111.site>
School of Aerospace Science and Technology, Xidian University

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v3] LoongArch/percpu: Simplify _percpu_read() and _percpu_write()
  2024-09-05 11:47 ` Huacai Chen
  2024-09-05 11:58   ` Xi Ruoyao
@ 2024-09-05 12:02   ` Uros Bizjak
  2024-09-05 12:09     ` Xi Ruoyao
  1 sibling, 1 reply; 11+ messages in thread
From: Uros Bizjak @ 2024-09-05 12:02 UTC (permalink / raw)
  To: Huacai Chen
  Cc: loongarch, linux-kernel, WANG Xuerui, Xi Ruoyao, Thomas Gleixner

On Thu, Sep 5, 2024 at 1:47 PM Huacai Chen <chenhuacai@kernel.org> wrote:
>
> Hi, Uros,
>
> On Thu, Sep 5, 2024 at 2:54 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> >
> > _percpu_read() and _percpu_write() macros call __percpu_read()
> > and __percpu_write() static inline functions that result in a single
> > assembly instruction. Percpu infrastructure expects its leaf
> > definitions to encode the size of their percpu variable, so the patch
> > merges asm clauses from the static inline function into the
> > corresponding leaf macros.
> >
> > The secondary effect of this change is to avoid explicit __percpu
> > function arguments. Currently, __percpu macro is defined in
> > include/linux/compiler_types.h, but with proposed patch [1],
> > __percpu definition will need macros from include/asm-generic/percpu.h,
> > creating forward dependency loop.
> >
> > The proposed solution is the same as x86 architecture uses.

> > +#define _percpu_read(size, _pcp)                                       \
> > +({                                                                     \
> > +       unsigned long __pcp_ret;                                        \
> > +                                                                       \
> > +       __asm__ __volatile__(                                           \
> > +               __pcpu_op_##size("ldx") "%[ret], $r21, %[ptr]   \n"     \
> > +               : [ret] "=&r"(__pcp_ret)                                \
> > +               : [ptr] "r"(&(_pcp))                                    \
> > +               : "memory");                                            \
> > +       (typeof(_pcp))__pcp_ret;                                        \
> > +})
> > +
> > +#define __pcpu_cast_1(val)     (((unsigned long) val) & 0xff)
> If the input value is less than 0xff, then "& 0xff" is meaningless, if
> the input value is more than 0xff, this conversion still cannot give a
> correct result for the caller. So I think for all sizes it is enough
> to just use "((unsigned long) val)".

This part is used to force unsigned extension, otherwise the compiler
will use sign-extension of the possibly signed variable.

Uros.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v3] LoongArch/percpu: Simplify _percpu_read() and _percpu_write()
  2024-09-05 12:02   ` Uros Bizjak
@ 2024-09-05 12:09     ` Xi Ruoyao
  2024-09-05 12:16       ` Uros Bizjak
  0 siblings, 1 reply; 11+ messages in thread
From: Xi Ruoyao @ 2024-09-05 12:09 UTC (permalink / raw)
  To: Uros Bizjak, Huacai Chen
  Cc: loongarch, linux-kernel, WANG Xuerui, Thomas Gleixner

On Thu, 2024-09-05 at 14:02 +0200, Uros Bizjak wrote:
> > If the input value is less than 0xff, then "& 0xff" is meaningless, if
> > the input value is more than 0xff, this conversion still cannot give a
> > correct result for the caller. So I think for all sizes it is enough
> > to just use "((unsigned long) val)".
> 
> This part is used to force unsigned extension, otherwise the compiler
> will use sign-extension of the possibly signed variable.

It's not relevant.  For example when size is 2 __pcpu_op_##size("stx")
is expanded to stx.h, and stx.h only stores the lower 16 bits of a
register into MEM[r21 + ptr], the high bits are ignored anyway.

Thus we can just have

+#define _percpu_write(size, _pcp, _val)					\
+do {									\
+	if (0) {		                                        \
+		typeof(_pcp) pto_tmp__;					\
+		pto_tmp__ = (_val);					\
+		(void)pto_tmp__;					\
+	}								\
+	__asm__ __volatile__(						\
+		__pcpu_op_##size("stx") "%[val], $r21, %[ptr]	\n"	\
+		:							\
+		: [val] "r"(_val), [ptr] "r"(&(_pcp))		\
+		: "memory");						\
+} while (0)

-- 
Xi Ruoyao <xry111@xry111.site>
School of Aerospace Science and Technology, Xidian University

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v3] LoongArch/percpu: Simplify _percpu_read() and _percpu_write()
  2024-09-05 11:58   ` Xi Ruoyao
@ 2024-09-05 12:13     ` Uros Bizjak
  0 siblings, 0 replies; 11+ messages in thread
From: Uros Bizjak @ 2024-09-05 12:13 UTC (permalink / raw)
  To: Xi Ruoyao
  Cc: Huacai Chen, loongarch, linux-kernel, WANG Xuerui,
	Thomas Gleixner

On Thu, Sep 5, 2024 at 1:58 PM Xi Ruoyao <xry111@xry111.site> wrote:
>
> On Thu, 2024-09-05 at 19:47 +0800, Huacai Chen wrote:
> > > +#define _percpu_write(size, _pcp, _val)                                        \
> > > +do {                                                                   \
> > > +       unsigned long __pcp_val = __pcpu_cast_##size(_val);             \
> > > +                                                                       \
> > > +       if (0) {                                                        \
> > > +               typeof(_pcp) pto_tmp__;                                 \
> > > +               pto_tmp__ = (_val);                                     \
> > > +               (void)pto_tmp__;                                        \
> > > +       }                                                               \
> > Emmm, in V2 I just confirm that whether it is worth to use macro
> > instead of inline functions, I think we don't really need such a
> > checking here. :)
>
> It's still (slighly) better (just from aesthetics view) to use inline
> function instead of macro.  Is it possible to clear the header
> dependency and break the circle so we can still use __percpu in arch
> percpu.h instead?

Unfortunately, it is not possible. __percpu will use macros, defined
in <include/percpu.h>, and <include/percpu.h> needs
<loongarch/include/asm/percpu.h>. Up to now, it was enough to include
<include/linux/compiler_types.h>, so it was possible to avoid
unresolved include dependency.

> (Simply removing __percpu from the function parameter list causes many
> warnings with make C=1.)

This, and if/when loongson will use percpu named address checks, it
will break compilation.

FYI, it is possible to implement the same percpu address space checks
for loongson (as for x86) with clang and its address space attribute.
The compiler has to provide __typeof_unqual__ and the arch has to
define

# define __per_cpu_qual __attribute__((address_space(1)))

in its percpu.h file.

But we are not there yet ;)

Uros.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v3] LoongArch/percpu: Simplify _percpu_read() and _percpu_write()
  2024-09-05 12:09     ` Xi Ruoyao
@ 2024-09-05 12:16       ` Uros Bizjak
  2024-09-05 12:46         ` Uros Bizjak
  0 siblings, 1 reply; 11+ messages in thread
From: Uros Bizjak @ 2024-09-05 12:16 UTC (permalink / raw)
  To: Xi Ruoyao
  Cc: Huacai Chen, loongarch, linux-kernel, WANG Xuerui,
	Thomas Gleixner

On Thu, Sep 5, 2024 at 2:09 PM Xi Ruoyao <xry111@xry111.site> wrote:
>
> On Thu, 2024-09-05 at 14:02 +0200, Uros Bizjak wrote:
> > > If the input value is less than 0xff, then "& 0xff" is meaningless, if
> > > the input value is more than 0xff, this conversion still cannot give a
> > > correct result for the caller. So I think for all sizes it is enough
> > > to just use "((unsigned long) val)".
> >
> > This part is used to force unsigned extension, otherwise the compiler
> > will use sign-extension of the possibly signed variable.
>
> It's not relevant.  For example when size is 2 __pcpu_op_##size("stx")
> is expanded to stx.h, and stx.h only stores the lower 16 bits of a
> register into MEM[r21 + ptr], the high bits are ignored anyway.
>
> Thus we can just have
>
> +#define _percpu_write(size, _pcp, _val)                                        \
> +do {                                                                   \
> +       if (0) {                                                        \
> +               typeof(_pcp) pto_tmp__;                                 \
> +               pto_tmp__ = (_val);                                     \
> +               (void)pto_tmp__;                                        \
> +       }                                                               \
> +       __asm__ __volatile__(                                           \
> +               __pcpu_op_##size("stx") "%[val], $r21, %[ptr]   \n"     \
> +               :                                                       \
> +               : [val] "r"(_val), [ptr] "r"(&(_pcp))           \
> +               : "memory");                                            \
> +} while (0)

Nice, the less code, the better. If it works for loongson target, then
we don't need this paranoia.

I just played safe and took the approach that x86 took.

Uros.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v3] LoongArch/percpu: Simplify _percpu_read() and _percpu_write()
  2024-09-05 12:16       ` Uros Bizjak
@ 2024-09-05 12:46         ` Uros Bizjak
  2024-09-05 15:04           ` Huacai Chen
  0 siblings, 1 reply; 11+ messages in thread
From: Uros Bizjak @ 2024-09-05 12:46 UTC (permalink / raw)
  To: Xi Ruoyao
  Cc: Huacai Chen, loongarch, linux-kernel, WANG Xuerui,
	Thomas Gleixner

On Thu, Sep 5, 2024 at 2:16 PM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> On Thu, Sep 5, 2024 at 2:09 PM Xi Ruoyao <xry111@xry111.site> wrote:
> >
> > On Thu, 2024-09-05 at 14:02 +0200, Uros Bizjak wrote:
> > > > If the input value is less than 0xff, then "& 0xff" is meaningless, if
> > > > the input value is more than 0xff, this conversion still cannot give a
> > > > correct result for the caller. So I think for all sizes it is enough
> > > > to just use "((unsigned long) val)".
> > >
> > > This part is used to force unsigned extension, otherwise the compiler
> > > will use sign-extension of the possibly signed variable.
> >
> > It's not relevant.  For example when size is 2 __pcpu_op_##size("stx")
> > is expanded to stx.h, and stx.h only stores the lower 16 bits of a
> > register into MEM[r21 + ptr], the high bits are ignored anyway.
> >
> > Thus we can just have
> >
> > +#define _percpu_write(size, _pcp, _val)                                        \
> > +do {                                                                   \
> > +       if (0) {                                                        \
> > +               typeof(_pcp) pto_tmp__;                                 \
> > +               pto_tmp__ = (_val);                                     \
> > +               (void)pto_tmp__;                                        \
> > +       }                                                               \
> > +       __asm__ __volatile__(                                           \
> > +               __pcpu_op_##size("stx") "%[val], $r21, %[ptr]   \n"     \
> > +               :                                                       \
> > +               : [val] "r"(_val), [ptr] "r"(&(_pcp))           \
> > +               : "memory");                                            \
> > +} while (0)
>
> Nice, the less code, the better. If it works for loongson target, then
> we don't need this paranoia.
>
> I just played safe and took the approach that x86 took.

Please note that the original code extended the value to a long type.
If the simplified macro works, then the usage of macros will result in
a better assembly code, where zero-extends will be omitted.

Uros.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v3] LoongArch/percpu: Simplify _percpu_read() and _percpu_write()
  2024-09-05 12:46         ` Uros Bizjak
@ 2024-09-05 15:04           ` Huacai Chen
  2024-09-05 15:09             ` Uros Bizjak
  0 siblings, 1 reply; 11+ messages in thread
From: Huacai Chen @ 2024-09-05 15:04 UTC (permalink / raw)
  To: Uros Bizjak
  Cc: Xi Ruoyao, loongarch, linux-kernel, WANG Xuerui, Thomas Gleixner

On Thu, Sep 5, 2024 at 8:46 PM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> On Thu, Sep 5, 2024 at 2:16 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> >
> > On Thu, Sep 5, 2024 at 2:09 PM Xi Ruoyao <xry111@xry111.site> wrote:
> > >
> > > On Thu, 2024-09-05 at 14:02 +0200, Uros Bizjak wrote:
> > > > > If the input value is less than 0xff, then "& 0xff" is meaningless, if
> > > > > the input value is more than 0xff, this conversion still cannot give a
> > > > > correct result for the caller. So I think for all sizes it is enough
> > > > > to just use "((unsigned long) val)".
> > > >
> > > > This part is used to force unsigned extension, otherwise the compiler
> > > > will use sign-extension of the possibly signed variable.
> > >
> > > It's not relevant.  For example when size is 2 __pcpu_op_##size("stx")
> > > is expanded to stx.h, and stx.h only stores the lower 16 bits of a
> > > register into MEM[r21 + ptr], the high bits are ignored anyway.
> > >
> > > Thus we can just have
> > >
> > > +#define _percpu_write(size, _pcp, _val)                                        \
> > > +do {                                                                   \
> > > +       if (0) {                                                        \
> > > +               typeof(_pcp) pto_tmp__;                                 \
> > > +               pto_tmp__ = (_val);                                     \
> > > +               (void)pto_tmp__;                                        \
> > > +       }                                                               \
> > > +       __asm__ __volatile__(                                           \
> > > +               __pcpu_op_##size("stx") "%[val], $r21, %[ptr]   \n"     \
> > > +               :                                                       \
> > > +               : [val] "r"(_val), [ptr] "r"(&(_pcp))           \
> > > +               : "memory");                                            \
> > > +} while (0)
> >
> > Nice, the less code, the better. If it works for loongson target, then
> > we don't need this paranoia.
> >
> > I just played safe and took the approach that x86 took.
>
> Please note that the original code extended the value to a long type.
> If the simplified macro works, then the usage of macros will result in
> a better assembly code, where zero-extends will be omitted.
OK, please send a simplified V4, remember to remove the if(0)
checking, which is the same as V2, thanks.

Huacai

>
> Uros.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v3] LoongArch/percpu: Simplify _percpu_read() and _percpu_write()
  2024-09-05 15:04           ` Huacai Chen
@ 2024-09-05 15:09             ` Uros Bizjak
  2024-09-06  6:25               ` Huacai Chen
  0 siblings, 1 reply; 11+ messages in thread
From: Uros Bizjak @ 2024-09-05 15:09 UTC (permalink / raw)
  To: Huacai Chen
  Cc: Xi Ruoyao, loongarch, linux-kernel, WANG Xuerui, Thomas Gleixner

On Thu, Sep 5, 2024 at 5:04 PM Huacai Chen <chenhuacai@kernel.org> wrote:
>
> On Thu, Sep 5, 2024 at 8:46 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> >
> > On Thu, Sep 5, 2024 at 2:16 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > >
> > > On Thu, Sep 5, 2024 at 2:09 PM Xi Ruoyao <xry111@xry111.site> wrote:
> > > >
> > > > On Thu, 2024-09-05 at 14:02 +0200, Uros Bizjak wrote:
> > > > > > If the input value is less than 0xff, then "& 0xff" is meaningless, if
> > > > > > the input value is more than 0xff, this conversion still cannot give a
> > > > > > correct result for the caller. So I think for all sizes it is enough
> > > > > > to just use "((unsigned long) val)".
> > > > >
> > > > > This part is used to force unsigned extension, otherwise the compiler
> > > > > will use sign-extension of the possibly signed variable.
> > > >
> > > > It's not relevant.  For example when size is 2 __pcpu_op_##size("stx")
> > > > is expanded to stx.h, and stx.h only stores the lower 16 bits of a
> > > > register into MEM[r21 + ptr], the high bits are ignored anyway.
> > > >
> > > > Thus we can just have
> > > >
> > > > +#define _percpu_write(size, _pcp, _val)                                        \
> > > > +do {                                                                   \
> > > > +       if (0) {                                                        \
> > > > +               typeof(_pcp) pto_tmp__;                                 \
> > > > +               pto_tmp__ = (_val);                                     \
> > > > +               (void)pto_tmp__;                                        \
> > > > +       }                                                               \
> > > > +       __asm__ __volatile__(                                           \
> > > > +               __pcpu_op_##size("stx") "%[val], $r21, %[ptr]   \n"     \
> > > > +               :                                                       \
> > > > +               : [val] "r"(_val), [ptr] "r"(&(_pcp))           \
> > > > +               : "memory");                                            \
> > > > +} while (0)
> > >
> > > Nice, the less code, the better. If it works for loongson target, then
> > > we don't need this paranoia.
> > >
> > > I just played safe and took the approach that x86 took.
> >
> > Please note that the original code extended the value to a long type.
> > If the simplified macro works, then the usage of macros will result in
> > a better assembly code, where zero-extends will be omitted.
> OK, please send a simplified V4, remember to remove the if(0)
> checking, which is the same as V2, thanks.

Are you sure we want to remove the type checking on _val? I'd rather
leave the if(0) part, but remove the forced zero-extension.

Uros.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v3] LoongArch/percpu: Simplify _percpu_read() and _percpu_write()
  2024-09-05 15:09             ` Uros Bizjak
@ 2024-09-06  6:25               ` Huacai Chen
  0 siblings, 0 replies; 11+ messages in thread
From: Huacai Chen @ 2024-09-06  6:25 UTC (permalink / raw)
  To: Uros Bizjak
  Cc: Xi Ruoyao, loongarch, linux-kernel, WANG Xuerui, Thomas Gleixner

On Thu, Sep 5, 2024 at 11:09 PM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> On Thu, Sep 5, 2024 at 5:04 PM Huacai Chen <chenhuacai@kernel.org> wrote:
> >
> > On Thu, Sep 5, 2024 at 8:46 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > >
> > > On Thu, Sep 5, 2024 at 2:16 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > > >
> > > > On Thu, Sep 5, 2024 at 2:09 PM Xi Ruoyao <xry111@xry111.site> wrote:
> > > > >
> > > > > On Thu, 2024-09-05 at 14:02 +0200, Uros Bizjak wrote:
> > > > > > > If the input value is less than 0xff, then "& 0xff" is meaningless, if
> > > > > > > the input value is more than 0xff, this conversion still cannot give a
> > > > > > > correct result for the caller. So I think for all sizes it is enough
> > > > > > > to just use "((unsigned long) val)".
> > > > > >
> > > > > > This part is used to force unsigned extension, otherwise the compiler
> > > > > > will use sign-extension of the possibly signed variable.
> > > > >
> > > > > It's not relevant.  For example when size is 2 __pcpu_op_##size("stx")
> > > > > is expanded to stx.h, and stx.h only stores the lower 16 bits of a
> > > > > register into MEM[r21 + ptr], the high bits are ignored anyway.
> > > > >
> > > > > Thus we can just have
> > > > >
> > > > > +#define _percpu_write(size, _pcp, _val)                                        \
> > > > > +do {                                                                   \
> > > > > +       if (0) {                                                        \
> > > > > +               typeof(_pcp) pto_tmp__;                                 \
> > > > > +               pto_tmp__ = (_val);                                     \
> > > > > +               (void)pto_tmp__;                                        \
> > > > > +       }                                                               \
> > > > > +       __asm__ __volatile__(                                           \
> > > > > +               __pcpu_op_##size("stx") "%[val], $r21, %[ptr]   \n"     \
> > > > > +               :                                                       \
> > > > > +               : [val] "r"(_val), [ptr] "r"(&(_pcp))           \
> > > > > +               : "memory");                                            \
> > > > > +} while (0)
> > > >
> > > > Nice, the less code, the better. If it works for loongson target, then
> > > > we don't need this paranoia.
> > > >
> > > > I just played safe and took the approach that x86 took.
> > >
> > > Please note that the original code extended the value to a long type.
> > > If the simplified macro works, then the usage of macros will result in
> > > a better assembly code, where zero-extends will be omitted.
> > OK, please send a simplified V4, remember to remove the if(0)
> > checking, which is the same as V2, thanks.
>
> Are you sure we want to remove type checking on _val ? I'd rather
> leave the if(0) part, but remove forcing zero-extension.
Yes, please, the checking is not very useful.

Huacai

>
> Uros.

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2024-09-06  6:26 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-09-05  6:54 [PATCH v3] LoongArch/percpu: Simplify _percpu_read() and _percpu_write() Uros Bizjak
2024-09-05 11:47 ` Huacai Chen
2024-09-05 11:58   ` Xi Ruoyao
2024-09-05 12:13     ` Uros Bizjak
2024-09-05 12:02   ` Uros Bizjak
2024-09-05 12:09     ` Xi Ruoyao
2024-09-05 12:16       ` Uros Bizjak
2024-09-05 12:46         ` Uros Bizjak
2024-09-05 15:04           ` Huacai Chen
2024-09-05 15:09             ` Uros Bizjak
2024-09-06  6:25               ` Huacai Chen

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox