public inbox for linux-kernel@vger.kernel.org
* [PATCH 1/2] x86/percpu: Define raw_cpu_try_cmpxchg and this_cpu_try_cmpxchg
@ 2023-08-30 15:13 Uros Bizjak
  2023-08-30 15:13 ` [PATCH 2/2] x86/percpu: Use raw_cpu_try_cmpxchg in preempt_count_set Uros Bizjak
  2023-09-15 11:25 ` [tip: x86/asm] x86/percpu: Define raw_cpu_try_cmpxchg and this_cpu_try_cmpxchg() tip-bot2 for Uros Bizjak
  0 siblings, 2 replies; 8+ messages in thread
From: Uros Bizjak @ 2023-08-30 15:13 UTC
  To: x86, linux-kernel
  Cc: Uros Bizjak, Peter Zijlstra, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, H. Peter Anvin

Define target-specific raw_cpu_try_cmpxchg_N() and
this_cpu_try_cmpxchg_N() macros. These definitions override
the generic fallback definitions and enable optimized,
target-specific implementations.
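
As context for the change below, a rough sketch of the generic fallback
that these macros override (paraphrased from the asm-generic per-CPU
code, approximate rather than verbatim): the fallback has to perform an
explicit compare and, on failure, write the current value back through
the old-value pointer, whereas the x86 definitions below take the
success/failure result directly from the ZF flag of CMPXCHG:

    /* Approximate shape of the generic fallback (illustrative only): */
    #define raw_cpu_generic_try_cmpxchg(pcp, ovalp, nval) \
    ({ \
            typeof(pcp) *__p = raw_cpu_ptr(&(pcp)); \
            typeof(pcp) __val = *__p, __old = *(ovalp); \
            bool __ret = (__val == __old); \
            if (__ret) \
                    *__p = nval; \
            else \
                    *(ovalp) = __val; \
            __ret; \
    })

A typical caller then loops without an explicit re-read or compare, e.g.
(hypothetical per-CPU counter, for illustration only):

    int old, new;

    old = raw_cpu_read(some_counter);
    do {
            new = old + 1;
    } while (!raw_cpu_try_cmpxchg(some_counter, &old, new));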

Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
---
 arch/x86/include/asm/percpu.h | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 34734d730463..c8309f260d98 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -210,6 +210,25 @@ do {									\
 	(typeof(_var))(unsigned long) pco_old__;			\
 })
 
+#define percpu_try_cmpxchg_op(size, qual, _var, _ovalp, _nval)		\
+({									\
+	bool success;							\
+	__pcpu_type_##size *pco_oval__ = (__pcpu_type_##size *)(_ovalp); \
+	__pcpu_type_##size pco_old__ = *pco_oval__;			\
+	__pcpu_type_##size pco_new__ = __pcpu_cast_##size(_nval);	\
+	asm qual (__pcpu_op2_##size("cmpxchg", "%[nval]",		\
+				    __percpu_arg([var]))		\
+		  CC_SET(z)						\
+		  : CC_OUT(z) (success),				\
+		    [oval] "+a" (pco_old__),				\
+		    [var] "+m" (_var)					\
+		  : [nval] __pcpu_reg_##size(, pco_new__)		\
+		  : "memory");						\
+	if (unlikely(!success))						\
+		*pco_oval__ = pco_old__;				\
+	likely(success);						\
+})
+
 #if defined(CONFIG_X86_32) && !defined(CONFIG_UML)
 #define percpu_cmpxchg64_op(size, qual, _var, _oval, _nval)		\
 ({									\
@@ -343,6 +362,9 @@ do {									\
 #define raw_cpu_cmpxchg_1(pcp, oval, nval)	percpu_cmpxchg_op(1, , pcp, oval, nval)
 #define raw_cpu_cmpxchg_2(pcp, oval, nval)	percpu_cmpxchg_op(2, , pcp, oval, nval)
 #define raw_cpu_cmpxchg_4(pcp, oval, nval)	percpu_cmpxchg_op(4, , pcp, oval, nval)
+#define raw_cpu_try_cmpxchg_1(pcp, ovalp, nval)	percpu_try_cmpxchg_op(1, , pcp, ovalp, nval)
+#define raw_cpu_try_cmpxchg_2(pcp, ovalp, nval)	percpu_try_cmpxchg_op(2, , pcp, ovalp, nval)
+#define raw_cpu_try_cmpxchg_4(pcp, ovalp, nval)	percpu_try_cmpxchg_op(4, , pcp, ovalp, nval)
 
 #define this_cpu_add_return_1(pcp, val)		percpu_add_return_op(1, volatile, pcp, val)
 #define this_cpu_add_return_2(pcp, val)		percpu_add_return_op(2, volatile, pcp, val)
@@ -350,6 +372,9 @@ do {									\
 #define this_cpu_cmpxchg_1(pcp, oval, nval)	percpu_cmpxchg_op(1, volatile, pcp, oval, nval)
 #define this_cpu_cmpxchg_2(pcp, oval, nval)	percpu_cmpxchg_op(2, volatile, pcp, oval, nval)
 #define this_cpu_cmpxchg_4(pcp, oval, nval)	percpu_cmpxchg_op(4, volatile, pcp, oval, nval)
+#define this_cpu_try_cmpxchg_1(pcp, ovalp, nval)	percpu_try_cmpxchg_op(1, volatile, pcp, ovalp, nval)
+#define this_cpu_try_cmpxchg_2(pcp, ovalp, nval)	percpu_try_cmpxchg_op(2, volatile, pcp, ovalp, nval)
+#define this_cpu_try_cmpxchg_4(pcp, ovalp, nval)	percpu_try_cmpxchg_op(4, volatile, pcp, ovalp, nval)
 
 /*
  * Per cpu atomic 64 bit operations are only available under 64 bit.
@@ -364,6 +389,7 @@ do {									\
 #define raw_cpu_add_return_8(pcp, val)		percpu_add_return_op(8, , pcp, val)
 #define raw_cpu_xchg_8(pcp, nval)		raw_percpu_xchg_op(pcp, nval)
 #define raw_cpu_cmpxchg_8(pcp, oval, nval)	percpu_cmpxchg_op(8, , pcp, oval, nval)
+#define raw_cpu_try_cmpxchg_8(pcp, ovalp, nval)	percpu_try_cmpxchg_op(8, , pcp, ovalp, nval)
 
 #define this_cpu_read_8(pcp)			percpu_from_op(8, volatile, "mov", pcp)
 #define this_cpu_write_8(pcp, val)		percpu_to_op(8, volatile, "mov", (pcp), val)
@@ -373,6 +399,7 @@ do {									\
 #define this_cpu_add_return_8(pcp, val)		percpu_add_return_op(8, volatile, pcp, val)
 #define this_cpu_xchg_8(pcp, nval)		percpu_xchg_op(8, volatile, pcp, nval)
 #define this_cpu_cmpxchg_8(pcp, oval, nval)	percpu_cmpxchg_op(8, volatile, pcp, oval, nval)
+#define this_cpu_try_cmpxchg_8(pcp, ovalp, nval)	percpu_try_cmpxchg_op(8, volatile, pcp, ovalp, nval)
 #endif
 
 static __always_inline bool x86_this_cpu_constant_test_bit(unsigned int nr,
-- 
2.41.0



* [PATCH 2/2] x86/percpu: Use raw_cpu_try_cmpxchg in preempt_count_set
  2023-08-30 15:13 [PATCH 1/2] x86/percpu: Define raw_cpu_try_cmpxchg and this_cpu_try_cmpxchg Uros Bizjak
@ 2023-08-30 15:13 ` Uros Bizjak
  2023-09-15  9:47   ` Ingo Molnar
  2023-09-15 11:25   ` [tip: x86/asm] x86/percpu: Use raw_cpu_try_cmpxchg() in preempt_count_set() tip-bot2 for Uros Bizjak
  2023-09-15 11:25 ` [tip: x86/asm] x86/percpu: Define raw_cpu_try_cmpxchg and this_cpu_try_cmpxchg() tip-bot2 for Uros Bizjak
  1 sibling, 2 replies; 8+ messages in thread
From: Uros Bizjak @ 2023-08-30 15:13 UTC
  To: x86, linux-kernel
  Cc: Uros Bizjak, Peter Zijlstra, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, H. Peter Anvin

Use raw_cpu_try_cmpxchg() instead of raw_cpu_cmpxchg(*ptr, old, new) == old.
The x86 CMPXCHG instruction returns success in the ZF flag, so this change
saves a compare after CMPXCHG (and a related MOV instruction in front of
CMPXCHG).

Also, raw_cpu_try_cmpxchg() implicitly assigns the old *ptr value to "old"
when cmpxchg fails, so there is no need to re-read the value in the loop.

No functional change intended.
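
To make the implicit assignment concrete, the try variant behaves like
the following C sketch (illustrative pseudocode for the semantics only;
the real operation is a single CMPXCHG on the per-CPU variable):

    static bool try_cmpxchg_sketch(int *ptr, int *oldp, int new)
    {
            int cur = *ptr;         /* value observed by CMPXCHG */

            if (cur == *oldp) {
                    *ptr = new;     /* success: install the new value */
                    return true;
            }
            *oldp = cur;            /* failure: update caller's "old" */
            return false;
    }

A failed attempt therefore leaves the freshly observed value in "old",
ready for the next loop iteration.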

Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
---
 arch/x86/include/asm/preempt.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
index 2d13f25b1bd8..4527e1430c6d 100644
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -31,11 +31,11 @@ static __always_inline void preempt_count_set(int pc)
 {
 	int old, new;
 
+	old = raw_cpu_read_4(pcpu_hot.preempt_count);
 	do {
-		old = raw_cpu_read_4(pcpu_hot.preempt_count);
 		new = (old & PREEMPT_NEED_RESCHED) |
 			(pc & ~PREEMPT_NEED_RESCHED);
-	} while (raw_cpu_cmpxchg_4(pcpu_hot.preempt_count, old, new) != old);
+	} while (!raw_cpu_try_cmpxchg_4(pcpu_hot.preempt_count, &old, new));
 }
 
 /*
-- 
2.41.0



* Re: [PATCH 2/2] x86/percpu: Use raw_cpu_try_cmpxchg in preempt_count_set
  2023-08-30 15:13 ` [PATCH 2/2] x86/percpu: Use raw_cpu_try_cmpxchg in preempt_count_set Uros Bizjak
@ 2023-09-15  9:47   ` Ingo Molnar
  2023-09-15 11:15     ` Ingo Molnar
  2023-09-15 12:01     ` Uros Bizjak
  2023-09-15 11:25   ` [tip: x86/asm] x86/percpu: Use raw_cpu_try_cmpxchg() in preempt_count_set() tip-bot2 for Uros Bizjak
  1 sibling, 2 replies; 8+ messages in thread
From: Ingo Molnar @ 2023-09-15  9:47 UTC
  To: Uros Bizjak
  Cc: x86, linux-kernel, Peter Zijlstra, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, H. Peter Anvin


* Uros Bizjak <ubizjak@gmail.com> wrote:

> Use raw_cpu_try_cmpxchg() instead of raw_cpu_cmpxchg(*ptr, old, new) == old.
> The x86 CMPXCHG instruction returns success in the ZF flag, so this change
> saves a compare after CMPXCHG (and a related MOV instruction in front of
> CMPXCHG).
> 
> Also, raw_cpu_try_cmpxchg() implicitly assigns the old *ptr value to "old"
> when cmpxchg fails, so there is no need to re-read the value in the loop.
> 
> No functional change intended.
> 
> Cc: Peter Zijlstra <peterz@infradead.org>
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: Ingo Molnar <mingo@redhat.com>
> Cc: Borislav Petkov <bp@alien8.de>
> Cc: Dave Hansen <dave.hansen@linux.intel.com>
> Cc: "H. Peter Anvin" <hpa@zytor.com>
> Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
> ---
>  arch/x86/include/asm/preempt.h | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
> index 2d13f25b1bd8..4527e1430c6d 100644
> --- a/arch/x86/include/asm/preempt.h
> +++ b/arch/x86/include/asm/preempt.h
> @@ -31,11 +31,11 @@ static __always_inline void preempt_count_set(int pc)
>  {
>  	int old, new;
>  
> +	old = raw_cpu_read_4(pcpu_hot.preempt_count);
>  	do {
> -		old = raw_cpu_read_4(pcpu_hot.preempt_count);
>  		new = (old & PREEMPT_NEED_RESCHED) |
>  			(pc & ~PREEMPT_NEED_RESCHED);
> -	} while (raw_cpu_cmpxchg_4(pcpu_hot.preempt_count, old, new) != old);
> +	} while (!raw_cpu_try_cmpxchg_4(pcpu_hot.preempt_count, &old, new));

It would be really nice to have a before/after comparison of generated 
assembly code in the changelog, to demonstrate the effectiveness of this 
optimization.

Thanks,

	Ingo


* Re: [PATCH 2/2] x86/percpu: Use raw_cpu_try_cmpxchg in preempt_count_set
  2023-09-15  9:47   ` Ingo Molnar
@ 2023-09-15 11:15     ` Ingo Molnar
  2023-09-15 11:22       ` Ingo Molnar
  2023-09-15 12:01     ` Uros Bizjak
  1 sibling, 1 reply; 8+ messages in thread
From: Ingo Molnar @ 2023-09-15 11:15 UTC
  To: Uros Bizjak
  Cc: x86, linux-kernel, Peter Zijlstra, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, H. Peter Anvin


* Ingo Molnar <mingo@kernel.org> wrote:

> 
> * Uros Bizjak <ubizjak@gmail.com> wrote:
> 
> > Use raw_cpu_try_cmpxchg() instead of raw_cpu_cmpxchg(*ptr, old, new) == old.
> > The x86 CMPXCHG instruction returns success in the ZF flag, so this change
> > saves a compare after CMPXCHG (and a related MOV instruction in front of
> > CMPXCHG).
> > 
> > Also, raw_cpu_try_cmpxchg() implicitly assigns the old *ptr value to "old"
> > when cmpxchg fails, so there is no need to re-read the value in the loop.
> > 
> > No functional change intended.
> > 
> > Cc: Peter Zijlstra <peterz@infradead.org>
> > Cc: Thomas Gleixner <tglx@linutronix.de>
> > Cc: Ingo Molnar <mingo@redhat.com>
> > Cc: Borislav Petkov <bp@alien8.de>
> > Cc: Dave Hansen <dave.hansen@linux.intel.com>
> > Cc: "H. Peter Anvin" <hpa@zytor.com>
> > Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
> > ---
> >  arch/x86/include/asm/preempt.h | 4 ++--
> >  1 file changed, 2 insertions(+), 2 deletions(-)
> > 
> > diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
> > index 2d13f25b1bd8..4527e1430c6d 100644
> > --- a/arch/x86/include/asm/preempt.h
> > +++ b/arch/x86/include/asm/preempt.h
> > @@ -31,11 +31,11 @@ static __always_inline void preempt_count_set(int pc)
> >  {
> >  	int old, new;
> >  
> > +	old = raw_cpu_read_4(pcpu_hot.preempt_count);
> >  	do {
> > -		old = raw_cpu_read_4(pcpu_hot.preempt_count);
> >  		new = (old & PREEMPT_NEED_RESCHED) |
> >  			(pc & ~PREEMPT_NEED_RESCHED);
> > -	} while (raw_cpu_cmpxchg_4(pcpu_hot.preempt_count, old, new) != old);
> > +	} while (!raw_cpu_try_cmpxchg_4(pcpu_hot.preempt_count, &old, new));
> 
> It would be really nice to have a before/after comparison of generated 
> assembly code in the changelog, to demonstrate the effectiveness of this 
> optimization.

Never mind, you did exactly that in the September 6 variation of these 
changes. I'll apply those.

Thanks,

	Ingo


* Re: [PATCH 2/2] x86/percpu: Use raw_cpu_try_cmpxchg in preempt_count_set
  2023-09-15 11:15     ` Ingo Molnar
@ 2023-09-15 11:22       ` Ingo Molnar
  0 siblings, 0 replies; 8+ messages in thread
From: Ingo Molnar @ 2023-09-15 11:22 UTC
  To: Uros Bizjak
  Cc: x86, linux-kernel, Peter Zijlstra, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, H. Peter Anvin


* Ingo Molnar <mingo@kernel.org> wrote:

> 
> * Ingo Molnar <mingo@kernel.org> wrote:
> 
> > 
> > * Uros Bizjak <ubizjak@gmail.com> wrote:
> > 
> > > Use raw_cpu_try_cmpxchg() instead of raw_cpu_cmpxchg(*ptr, old, new) == old.
> > > The x86 CMPXCHG instruction returns success in the ZF flag, so this change
> > > saves a compare after CMPXCHG (and a related MOV instruction in front of
> > > CMPXCHG).
> > > 
> > > Also, raw_cpu_try_cmpxchg() implicitly assigns the old *ptr value to "old"
> > > when cmpxchg fails, so there is no need to re-read the value in the loop.
> > > 
> > > No functional change intended.
> > > 
> > > Cc: Peter Zijlstra <peterz@infradead.org>
> > > Cc: Thomas Gleixner <tglx@linutronix.de>
> > > Cc: Ingo Molnar <mingo@redhat.com>
> > > Cc: Borislav Petkov <bp@alien8.de>
> > > Cc: Dave Hansen <dave.hansen@linux.intel.com>
> > > Cc: "H. Peter Anvin" <hpa@zytor.com>
> > > Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
> > > ---
> > >  arch/x86/include/asm/preempt.h | 4 ++--
> > >  1 file changed, 2 insertions(+), 2 deletions(-)
> > > 
> > > diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
> > > index 2d13f25b1bd8..4527e1430c6d 100644
> > > --- a/arch/x86/include/asm/preempt.h
> > > +++ b/arch/x86/include/asm/preempt.h
> > > @@ -31,11 +31,11 @@ static __always_inline void preempt_count_set(int pc)
> > >  {
> > >  	int old, new;
> > >  
> > > +	old = raw_cpu_read_4(pcpu_hot.preempt_count);
> > >  	do {
> > > -		old = raw_cpu_read_4(pcpu_hot.preempt_count);
> > >  		new = (old & PREEMPT_NEED_RESCHED) |
> > >  			(pc & ~PREEMPT_NEED_RESCHED);
> > > -	} while (raw_cpu_cmpxchg_4(pcpu_hot.preempt_count, old, new) != old);
> > > +	} while (!raw_cpu_try_cmpxchg_4(pcpu_hot.preempt_count, &old, new));
> > 
> > It would be really nice to have a before/after comparison of generated 
> > assembly code in the changelog, to demonstrate the effectiveness of this 
> > optimization.
> 
> Never mind, you did exactly that in the September 6 variation of these 
> changes. I'll apply those.

I mean, this third patch of yours:

   [PATCH] x86/percpu: Define {raw,this}_cpu_try_cmpxchg{64,128}

Had a proper disassembly comparison - so I've applied all 3 optimization 
patches to tip:x86/asm as:

  b8e3dfa16ec5 ("x86/percpu: Use raw_cpu_try_cmpxchg() in preempt_count_set()")
  5f863897d964 ("x86/percpu: Define raw_cpu_try_cmpxchg and this_cpu_try_cmpxchg()")
  54cd971c6f44 ("x86/percpu: Define {raw,this}_cpu_try_cmpxchg{64,128}")

Thanks,

	Ingo


* [tip: x86/asm] x86/percpu: Use raw_cpu_try_cmpxchg() in preempt_count_set()
  2023-08-30 15:13 ` [PATCH 2/2] x86/percpu: Use raw_cpu_try_cmpxchg in preempt_count_set Uros Bizjak
  2023-09-15  9:47   ` Ingo Molnar
@ 2023-09-15 11:25   ` tip-bot2 for Uros Bizjak
  1 sibling, 0 replies; 8+ messages in thread
From: tip-bot2 for Uros Bizjak @ 2023-09-15 11:25 UTC
  To: linux-tip-commits
  Cc: Uros Bizjak, Ingo Molnar, Peter Zijlstra, x86, linux-kernel

The following commit has been merged into the x86/asm branch of tip:

Commit-ID:     b8e3dfa16ec55f310dd95831614af3d24abf5ed5
Gitweb:        https://git.kernel.org/tip/b8e3dfa16ec55f310dd95831614af3d24abf5ed5
Author:        Uros Bizjak <ubizjak@gmail.com>
AuthorDate:    Wed, 30 Aug 2023 17:13:57 +02:00
Committer:     Ingo Molnar <mingo@kernel.org>
CommitterDate: Fri, 15 Sep 2023 13:19:22 +02:00

x86/percpu: Use raw_cpu_try_cmpxchg() in preempt_count_set()

Use raw_cpu_try_cmpxchg() instead of raw_cpu_cmpxchg(*ptr, old, new) == old.
The x86 CMPXCHG instruction returns success in the ZF flag, so this change
saves a compare after CMPXCHG (and a related MOV instruction in front of
CMPXCHG).

Also, raw_cpu_try_cmpxchg() implicitly assigns the old *ptr value to "old"
when cmpxchg fails, so there is no need to re-read the value in the loop.

No functional change intended.

Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: https://lore.kernel.org/r/20230830151623.3900-2-ubizjak@gmail.com
---
 arch/x86/include/asm/preempt.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
index 2d13f25..4527e14 100644
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -31,11 +31,11 @@ static __always_inline void preempt_count_set(int pc)
 {
 	int old, new;
 
+	old = raw_cpu_read_4(pcpu_hot.preempt_count);
 	do {
-		old = raw_cpu_read_4(pcpu_hot.preempt_count);
 		new = (old & PREEMPT_NEED_RESCHED) |
 			(pc & ~PREEMPT_NEED_RESCHED);
-	} while (raw_cpu_cmpxchg_4(pcpu_hot.preempt_count, old, new) != old);
+	} while (!raw_cpu_try_cmpxchg_4(pcpu_hot.preempt_count, &old, new));
 }
 
 /*


* [tip: x86/asm] x86/percpu: Define raw_cpu_try_cmpxchg and this_cpu_try_cmpxchg()
  2023-08-30 15:13 [PATCH 1/2] x86/percpu: Define raw_cpu_try_cmpxchg and this_cpu_try_cmpxchg Uros Bizjak
  2023-08-30 15:13 ` [PATCH 2/2] x86/percpu: Use raw_cpu_try_cmpxchg in preempt_count_set Uros Bizjak
@ 2023-09-15 11:25 ` tip-bot2 for Uros Bizjak
  1 sibling, 0 replies; 8+ messages in thread
From: tip-bot2 for Uros Bizjak @ 2023-09-15 11:25 UTC
  To: linux-tip-commits; +Cc: Uros Bizjak, Ingo Molnar, x86, linux-kernel

The following commit has been merged into the x86/asm branch of tip:

Commit-ID:     5f863897d964e834a0da35b1e483b5bb8faca522
Gitweb:        https://git.kernel.org/tip/5f863897d964e834a0da35b1e483b5bb8faca522
Author:        Uros Bizjak <ubizjak@gmail.com>
AuthorDate:    Wed, 30 Aug 2023 17:13:56 +02:00
Committer:     Ingo Molnar <mingo@kernel.org>
CommitterDate: Fri, 15 Sep 2023 13:18:23 +02:00

x86/percpu: Define raw_cpu_try_cmpxchg and this_cpu_try_cmpxchg()

Define target-specific raw_cpu_try_cmpxchg_N() and
this_cpu_try_cmpxchg_N() macros. These definitions override
the generic fallback definitions and enable optimized,
target-specific implementations.

Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20230830151623.3900-1-ubizjak@gmail.com
---
 arch/x86/include/asm/percpu.h | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 4c36419..a87db61 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -210,6 +210,25 @@ do {									\
 	(typeof(_var))(unsigned long) pco_old__;			\
 })
 
+#define percpu_try_cmpxchg_op(size, qual, _var, _ovalp, _nval)		\
+({									\
+	bool success;							\
+	__pcpu_type_##size *pco_oval__ = (__pcpu_type_##size *)(_ovalp); \
+	__pcpu_type_##size pco_old__ = *pco_oval__;			\
+	__pcpu_type_##size pco_new__ = __pcpu_cast_##size(_nval);	\
+	asm qual (__pcpu_op2_##size("cmpxchg", "%[nval]",		\
+				    __percpu_arg([var]))		\
+		  CC_SET(z)						\
+		  : CC_OUT(z) (success),				\
+		    [oval] "+a" (pco_old__),				\
+		    [var] "+m" (_var)					\
+		  : [nval] __pcpu_reg_##size(, pco_new__)		\
+		  : "memory");						\
+	if (unlikely(!success))						\
+		*pco_oval__ = pco_old__;				\
+	likely(success);						\
+})
+
 #if defined(CONFIG_X86_32) && !defined(CONFIG_UML)
 #define percpu_cmpxchg64_op(size, qual, _var, _oval, _nval)		\
 ({									\
@@ -410,6 +429,9 @@ do {									\
 #define raw_cpu_cmpxchg_1(pcp, oval, nval)	percpu_cmpxchg_op(1, , pcp, oval, nval)
 #define raw_cpu_cmpxchg_2(pcp, oval, nval)	percpu_cmpxchg_op(2, , pcp, oval, nval)
 #define raw_cpu_cmpxchg_4(pcp, oval, nval)	percpu_cmpxchg_op(4, , pcp, oval, nval)
+#define raw_cpu_try_cmpxchg_1(pcp, ovalp, nval)	percpu_try_cmpxchg_op(1, , pcp, ovalp, nval)
+#define raw_cpu_try_cmpxchg_2(pcp, ovalp, nval)	percpu_try_cmpxchg_op(2, , pcp, ovalp, nval)
+#define raw_cpu_try_cmpxchg_4(pcp, ovalp, nval)	percpu_try_cmpxchg_op(4, , pcp, ovalp, nval)
 
 #define this_cpu_add_return_1(pcp, val)		percpu_add_return_op(1, volatile, pcp, val)
 #define this_cpu_add_return_2(pcp, val)		percpu_add_return_op(2, volatile, pcp, val)
@@ -417,6 +439,9 @@ do {									\
 #define this_cpu_cmpxchg_1(pcp, oval, nval)	percpu_cmpxchg_op(1, volatile, pcp, oval, nval)
 #define this_cpu_cmpxchg_2(pcp, oval, nval)	percpu_cmpxchg_op(2, volatile, pcp, oval, nval)
 #define this_cpu_cmpxchg_4(pcp, oval, nval)	percpu_cmpxchg_op(4, volatile, pcp, oval, nval)
+#define this_cpu_try_cmpxchg_1(pcp, ovalp, nval)	percpu_try_cmpxchg_op(1, volatile, pcp, ovalp, nval)
+#define this_cpu_try_cmpxchg_2(pcp, ovalp, nval)	percpu_try_cmpxchg_op(2, volatile, pcp, ovalp, nval)
+#define this_cpu_try_cmpxchg_4(pcp, ovalp, nval)	percpu_try_cmpxchg_op(4, volatile, pcp, ovalp, nval)
 
 /*
  * Per cpu atomic 64 bit operations are only available under 64 bit.
@@ -431,6 +456,7 @@ do {									\
 #define raw_cpu_add_return_8(pcp, val)		percpu_add_return_op(8, , pcp, val)
 #define raw_cpu_xchg_8(pcp, nval)		raw_percpu_xchg_op(pcp, nval)
 #define raw_cpu_cmpxchg_8(pcp, oval, nval)	percpu_cmpxchg_op(8, , pcp, oval, nval)
+#define raw_cpu_try_cmpxchg_8(pcp, ovalp, nval)	percpu_try_cmpxchg_op(8, , pcp, ovalp, nval)
 
 #define this_cpu_read_8(pcp)			percpu_from_op(8, volatile, "mov", pcp)
 #define this_cpu_write_8(pcp, val)		percpu_to_op(8, volatile, "mov", (pcp), val)
@@ -440,6 +466,7 @@ do {									\
 #define this_cpu_add_return_8(pcp, val)		percpu_add_return_op(8, volatile, pcp, val)
 #define this_cpu_xchg_8(pcp, nval)		percpu_xchg_op(8, volatile, pcp, nval)
 #define this_cpu_cmpxchg_8(pcp, oval, nval)	percpu_cmpxchg_op(8, volatile, pcp, oval, nval)
+#define this_cpu_try_cmpxchg_8(pcp, ovalp, nval)	percpu_try_cmpxchg_op(8, volatile, pcp, ovalp, nval)
 #endif
 
 static __always_inline bool x86_this_cpu_constant_test_bit(unsigned int nr,


* Re: [PATCH 2/2] x86/percpu: Use raw_cpu_try_cmpxchg in preempt_count_set
  2023-09-15  9:47   ` Ingo Molnar
  2023-09-15 11:15     ` Ingo Molnar
@ 2023-09-15 12:01     ` Uros Bizjak
  1 sibling, 0 replies; 8+ messages in thread
From: Uros Bizjak @ 2023-09-15 12:01 UTC
  To: Ingo Molnar
  Cc: x86, linux-kernel, Peter Zijlstra, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, H. Peter Anvin

On Fri, Sep 15, 2023 at 11:47 AM Ingo Molnar <mingo@kernel.org> wrote:
>
>
> * Uros Bizjak <ubizjak@gmail.com> wrote:
>
> > Use raw_cpu_try_cmpxchg() instead of raw_cpu_cmpxchg(*ptr, old, new) == old.
> > The x86 CMPXCHG instruction returns success in the ZF flag, so this change
> > saves a compare after CMPXCHG (and a related MOV instruction in front of
> > CMPXCHG).
> >
> > Also, raw_cpu_try_cmpxchg() implicitly assigns the old *ptr value to "old"
> > when cmpxchg fails, so there is no need to re-read the value in the loop.
> >
> > No functional change intended.
> >
> > Cc: Peter Zijlstra <peterz@infradead.org>
> > Cc: Thomas Gleixner <tglx@linutronix.de>
> > Cc: Ingo Molnar <mingo@redhat.com>
> > Cc: Borislav Petkov <bp@alien8.de>
> > Cc: Dave Hansen <dave.hansen@linux.intel.com>
> > Cc: "H. Peter Anvin" <hpa@zytor.com>
> > Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
> > ---
> >  arch/x86/include/asm/preempt.h | 4 ++--
> >  1 file changed, 2 insertions(+), 2 deletions(-)
> >
> > diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
> > index 2d13f25b1bd8..4527e1430c6d 100644
> > --- a/arch/x86/include/asm/preempt.h
> > +++ b/arch/x86/include/asm/preempt.h
> > @@ -31,11 +31,11 @@ static __always_inline void preempt_count_set(int pc)
> >  {
> >       int old, new;
> >
> > +     old = raw_cpu_read_4(pcpu_hot.preempt_count);
> >       do {
> > -             old = raw_cpu_read_4(pcpu_hot.preempt_count);
> >               new = (old & PREEMPT_NEED_RESCHED) |
> >                       (pc & ~PREEMPT_NEED_RESCHED);
> > -     } while (raw_cpu_cmpxchg_4(pcpu_hot.preempt_count, old, new) != old);
> > +     } while (!raw_cpu_try_cmpxchg_4(pcpu_hot.preempt_count, &old, new));
>
> It would be really nice to have a before/after comparison of generated
> assembly code in the changelog, to demonstrate the effectiveness of this
> optimization.

The assembly code improvements are in line with other try_cmpxchg
conversions, but for reference, finish_task_switch() from
kernel/sched/core.c, which inlines preempt_count_set(), improves from:

    5bad:    65 8b 0d 00 00 00 00     mov    %gs:0x0(%rip),%ecx
    5bb4:    89 ca                    mov    %ecx,%edx
    5bb6:    89 c8                    mov    %ecx,%eax
    5bb8:    81 e2 00 00 00 80        and    $0x80000000,%edx
    5bbe:    83 ca 02                 or     $0x2,%edx
    5bc1:    65 0f b1 15 00 00 00     cmpxchg %edx,%gs:0x0(%rip)
    5bc8:    00
    5bc9:    39 c1                    cmp    %eax,%ecx
    5bcb:    75 e0                    jne    5bad <...>
    5bcd:    e9 5a fe ff ff           jmpq   5a2c <...>
    5bd2:

to:

    5bad:    65 8b 05 00 00 00 00     mov    %gs:0x0(%rip),%eax
    5bb4:    89 c2                    mov    %eax,%edx
    5bb6:    81 e2 00 00 00 80        and    $0x80000000,%edx
    5bbc:    83 ca 02                 or     $0x2,%edx
    5bbf:    65 0f b1 15 00 00 00     cmpxchg %edx,%gs:0x0(%rip)
    5bc6:    00
    5bc7:    0f 84 5f fe ff ff        je     5a2c <...>
    5bcd:    eb e5                    jmp    5bb4 <...>
    5bcf:

Please note the missing CMP (and MOV), the loop without the extra memory
load from %gs:0x0(%rip), and the better-predicted jump in the latter case.
The improvements with {raw,this}_cpu_try_cmpxchg_128 in the third patch
are even more noticeable, because a __int128 value lives in a register
pair, so the comparison needs three separate machine instructions, in
addition to a move of the register pair.
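
For illustration, one plausible shape of such a 128-bit equality test
(a hand-written sketch, not actual compiler output; register choices
are arbitrary) would be:

    xor    %rax,%rsi        # low halves: result is zero iff equal
    xor    %rdx,%rdi        # high halves: result is zero iff equal
    or     %rsi,%rdi        # combine: ZF set iff both halves equal
    jne    <retry>

all of which disappears when the ZF result of CMPXCHG16B is used
directly.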

Thanks,
Uros.


Thread overview: 8 messages
2023-08-30 15:13 [PATCH 1/2] x86/percpu: Define raw_cpu_try_cmpxchg and this_cpu_try_cmpxchg Uros Bizjak
2023-08-30 15:13 ` [PATCH 2/2] x86/percpu: Use raw_cpu_try_cmpxchg in preempt_count_set Uros Bizjak
2023-09-15  9:47   ` Ingo Molnar
2023-09-15 11:15     ` Ingo Molnar
2023-09-15 11:22       ` Ingo Molnar
2023-09-15 12:01     ` Uros Bizjak
2023-09-15 11:25   ` [tip: x86/asm] x86/percpu: Use raw_cpu_try_cmpxchg() in preempt_count_set() tip-bot2 for Uros Bizjak
2023-09-15 11:25 ` [tip: x86/asm] x86/percpu: Define raw_cpu_try_cmpxchg and this_cpu_try_cmpxchg() tip-bot2 for Uros Bizjak
