* [PATCH 1/2] x86/percpu: Define raw_cpu_try_cmpxchg and this_cpu_try_cmpxchg
@ 2023-08-30 15:13 Uros Bizjak
From: Uros Bizjak @ 2023-08-30 15:13 UTC (permalink / raw)
To: x86, linux-kernel
Cc: Uros Bizjak, Peter Zijlstra, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, H. Peter Anvin
Define target-specific raw_cpu_try_cmpxchg_N() and
this_cpu_try_cmpxchg_N() macros. These override the generic
fallback definitions with optimized, target-specific
implementations.
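For context, the generic fallback being overridden is a plain
read/compare/store sequence. A simplified sketch of its shape (the real
macro lives in include/asm-generic/percpu.h and carries additional
per-CPU pointer plumbing):

#define raw_cpu_generic_try_cmpxchg(pcp, ovalp, nval)		\
({								\
	typeof(pcp) *__p = raw_cpu_ptr(&(pcp));			\
	typeof(pcp) __val = *__p, __old = *(ovalp);		\
	bool __ret = (__val == __old);				\
	if (__ret)						\
		*__p = (nval);					\
	else							\
		*(ovalp) = __val;	/* publish observed value */	\
	__ret;							\
})

The x86 versions below collapse the compare and the conditional store
into a single CMPXCHG instruction and read its result from ZF.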
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
---
arch/x86/include/asm/percpu.h | 27 +++++++++++++++++++++++++++
1 file changed, 27 insertions(+)
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 34734d730463..c8309f260d98 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -210,6 +210,25 @@ do { \
(typeof(_var))(unsigned long) pco_old__; \
})
+#define percpu_try_cmpxchg_op(size, qual, _var, _ovalp, _nval) \
+({ \
+ bool success; \
+ __pcpu_type_##size *pco_oval__ = (__pcpu_type_##size *)(_ovalp); \
+ __pcpu_type_##size pco_old__ = *pco_oval__; \
+ __pcpu_type_##size pco_new__ = __pcpu_cast_##size(_nval); \
+ asm qual (__pcpu_op2_##size("cmpxchg", "%[nval]", \
+ __percpu_arg([var])) \
+ CC_SET(z) \
+ : CC_OUT(z) (success), \
+ [oval] "+a" (pco_old__), \
+ [var] "+m" (_var) \
+ : [nval] __pcpu_reg_##size(, pco_new__) \
+ : "memory"); \
+ if (unlikely(!success)) \
+ *pco_oval__ = pco_old__; \
+ likely(success); \
+})
+
#if defined(CONFIG_X86_32) && !defined(CONFIG_UML)
#define percpu_cmpxchg64_op(size, qual, _var, _oval, _nval) \
({ \
@@ -343,6 +362,9 @@ do { \
#define raw_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(1, , pcp, oval, nval)
#define raw_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(2, , pcp, oval, nval)
#define raw_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(4, , pcp, oval, nval)
+#define raw_cpu_try_cmpxchg_1(pcp, ovalp, nval) percpu_try_cmpxchg_op(1, , pcp, ovalp, nval)
+#define raw_cpu_try_cmpxchg_2(pcp, ovalp, nval) percpu_try_cmpxchg_op(2, , pcp, ovalp, nval)
+#define raw_cpu_try_cmpxchg_4(pcp, ovalp, nval) percpu_try_cmpxchg_op(4, , pcp, ovalp, nval)
#define this_cpu_add_return_1(pcp, val) percpu_add_return_op(1, volatile, pcp, val)
#define this_cpu_add_return_2(pcp, val) percpu_add_return_op(2, volatile, pcp, val)
@@ -350,6 +372,9 @@ do { \
#define this_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(1, volatile, pcp, oval, nval)
#define this_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(2, volatile, pcp, oval, nval)
#define this_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(4, volatile, pcp, oval, nval)
+#define this_cpu_try_cmpxchg_1(pcp, ovalp, nval) percpu_try_cmpxchg_op(1, volatile, pcp, ovalp, nval)
+#define this_cpu_try_cmpxchg_2(pcp, ovalp, nval) percpu_try_cmpxchg_op(2, volatile, pcp, ovalp, nval)
+#define this_cpu_try_cmpxchg_4(pcp, ovalp, nval) percpu_try_cmpxchg_op(4, volatile, pcp, ovalp, nval)
/*
* Per cpu atomic 64 bit operations are only available under 64 bit.
@@ -364,6 +389,7 @@ do { \
#define raw_cpu_add_return_8(pcp, val) percpu_add_return_op(8, , pcp, val)
#define raw_cpu_xchg_8(pcp, nval) raw_percpu_xchg_op(pcp, nval)
#define raw_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(8, , pcp, oval, nval)
+#define raw_cpu_try_cmpxchg_8(pcp, ovalp, nval) percpu_try_cmpxchg_op(8, , pcp, ovalp, nval)
#define this_cpu_read_8(pcp) percpu_from_op(8, volatile, "mov", pcp)
#define this_cpu_write_8(pcp, val) percpu_to_op(8, volatile, "mov", (pcp), val)
@@ -373,6 +399,7 @@ do { \
#define this_cpu_add_return_8(pcp, val) percpu_add_return_op(8, volatile, pcp, val)
#define this_cpu_xchg_8(pcp, nval) percpu_xchg_op(8, volatile, pcp, nval)
#define this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(8, volatile, pcp, oval, nval)
+#define this_cpu_try_cmpxchg_8(pcp, ovalp, nval) percpu_try_cmpxchg_op(8, volatile, pcp, ovalp, nval)
#endif
static __always_inline bool x86_this_cpu_constant_test_bit(unsigned int nr,
--
2.41.0
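For readers who want to see the flag-output technique in isolation, a
minimal user-space sketch follows (illustrative only: the function name
is made up, and it assumes GCC or Clang on x86-64 with asm flag-output
support, which is roughly what CC_SET()/CC_OUT() select in the kernel
when the compiler provides it):

#include <stdbool.h>
#include <stdio.h>

/* CMPXCHG reports success in ZF; the "=@ccz" output exposes ZF
 * directly, so no separate CMP is needed after the instruction.
 * No LOCK prefix, mirroring the per-CPU variants, which only need
 * to be atomic with respect to the local CPU. */
static bool try_cmpxchg_u32(unsigned int *ptr, unsigned int *oldp,
			    unsigned int new)
{
	unsigned int old = *oldp;
	bool success;

	asm volatile("cmpxchgl %[new], %[var]"
		     : "=@ccz" (success), [var] "+m" (*ptr), "+a" (old)
		     : [new] "r" (new)
		     : "memory");
	if (!success)
		*oldp = old;	/* publish the value actually observed */
	return success;
}

int main(void)
{
	unsigned int v = 5, expected = 5;

	printf("%d %u\n", try_cmpxchg_u32(&v, &expected, 7), v); /* "1 7" */
	printf("%d %u\n", try_cmpxchg_u32(&v, &expected, 9), v); /* "0 7", expected is now 7 */
	return 0;
}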
* [PATCH 2/2] x86/percpu: Use raw_cpu_try_cmpxchg in preempt_count_set
@ 2023-08-30 15:13 ` Uros Bizjak
From: Uros Bizjak @ 2023-08-30 15:13 UTC (permalink / raw)
To: x86, linux-kernel
Cc: Uros Bizjak, Peter Zijlstra, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, H. Peter Anvin
Use raw_cpu_try_cmpxchg() instead of raw_cpu_cmpxchg(*ptr, old, new) == old.
The x86 CMPXCHG instruction returns success in the ZF flag, so this change
saves a compare after CMPXCHG (and the related MOV instruction in front of
CMPXCHG).
Also, raw_cpu_try_cmpxchg() implicitly assigns the old *ptr value to "old"
when CMPXCHG fails, so there is no need to re-read the value in the loop.
No functional change intended.
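For illustration, the shape of the conversion, with compute() standing
in for the real update (a sketch, not code from this patch):

/* Before: re-read and re-compare on every iteration: */
do {
	old = raw_cpu_read_4(pcpu_hot.preempt_count);
	new = compute(old);
} while (raw_cpu_cmpxchg_4(pcpu_hot.preempt_count, old, new) != old);

/* After: one read up front; on failure the macro writes the value it
 * observed back into "old", so the retry needs no extra load and the
 * loop condition tests ZF instead of doing a separate compare: */
old = raw_cpu_read_4(pcpu_hot.preempt_count);
do {
	new = compute(old);
} while (!raw_cpu_try_cmpxchg_4(pcpu_hot.preempt_count, &old, new));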
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
---
arch/x86/include/asm/preempt.h | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
index 2d13f25b1bd8..4527e1430c6d 100644
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -31,11 +31,11 @@ static __always_inline void preempt_count_set(int pc)
{
int old, new;
+ old = raw_cpu_read_4(pcpu_hot.preempt_count);
do {
- old = raw_cpu_read_4(pcpu_hot.preempt_count);
new = (old & PREEMPT_NEED_RESCHED) |
(pc & ~PREEMPT_NEED_RESCHED);
- } while (raw_cpu_cmpxchg_4(pcpu_hot.preempt_count, old, new) != old);
+ } while (!raw_cpu_try_cmpxchg_4(pcpu_hot.preempt_count, &old, new));
}
/*
--
2.41.0
* Re: [PATCH 2/2] x86/percpu: Use raw_cpu_try_cmpxchg in preempt_count_set
@ 2023-09-15 9:47 ` Ingo Molnar
From: Ingo Molnar @ 2023-09-15 9:47 UTC (permalink / raw)
To: Uros Bizjak
Cc: x86, linux-kernel, Peter Zijlstra, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, H. Peter Anvin
* Uros Bizjak <ubizjak@gmail.com> wrote:
> Use raw_cpu_try_cmpxchg() instead of raw_cpu_cmpxchg(*ptr, old, new) == old.
> The x86 CMPXCHG instruction returns success in the ZF flag, so this change
> saves a compare after CMPXCHG (and the related MOV instruction in front of
> CMPXCHG).
>
> Also, raw_cpu_try_cmpxchg() implicitly assigns the old *ptr value to "old"
> when CMPXCHG fails, so there is no need to re-read the value in the loop.
>
> No functional change intended.
>
> Cc: Peter Zijlstra <peterz@infradead.org>
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: Ingo Molnar <mingo@redhat.com>
> Cc: Borislav Petkov <bp@alien8.de>
> Cc: Dave Hansen <dave.hansen@linux.intel.com>
> Cc: "H. Peter Anvin" <hpa@zytor.com>
> Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
> ---
> arch/x86/include/asm/preempt.h | 4 ++--
> 1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
> index 2d13f25b1bd8..4527e1430c6d 100644
> --- a/arch/x86/include/asm/preempt.h
> +++ b/arch/x86/include/asm/preempt.h
> @@ -31,11 +31,11 @@ static __always_inline void preempt_count_set(int pc)
> {
> int old, new;
>
> + old = raw_cpu_read_4(pcpu_hot.preempt_count);
> do {
> - old = raw_cpu_read_4(pcpu_hot.preempt_count);
> new = (old & PREEMPT_NEED_RESCHED) |
> (pc & ~PREEMPT_NEED_RESCHED);
> - } while (raw_cpu_cmpxchg_4(pcpu_hot.preempt_count, old, new) != old);
> + } while (!raw_cpu_try_cmpxchg_4(pcpu_hot.preempt_count, &old, new));
It would be really nice to have a before/after comparison of generated
assembly code in the changelog, to demonstrate the effectiveness of this
optimization.
Thanks,
Ingo
* Re: [PATCH 2/2] x86/percpu: Use raw_cpu_try_cmpxchg in preempt_count_set
@ 2023-09-15 11:15 ` Ingo Molnar
From: Ingo Molnar @ 2023-09-15 11:15 UTC (permalink / raw)
To: Uros Bizjak
Cc: x86, linux-kernel, Peter Zijlstra, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, H. Peter Anvin
* Ingo Molnar <mingo@kernel.org> wrote:
>
> * Uros Bizjak <ubizjak@gmail.com> wrote:
>
> > [ ... PATCH 2/2 quoted in full, trimmed ... ]
>
> It would be really nice to have a before/after comparison of generated
> assembly code in the changelog, to demonstrate the effectiveness of this
> optimization.
Never mind, you did exactly that in the September 6 variation of these
changes. I'll apply those.
Thanks,
Ingo
* Re: [PATCH 2/2] x86/percpu: Use raw_cpu_try_cmpxchg in preempt_count_set
@ 2023-09-15 11:22 ` Ingo Molnar
From: Ingo Molnar @ 2023-09-15 11:22 UTC (permalink / raw)
To: Uros Bizjak
Cc: x86, linux-kernel, Peter Zijlstra, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, H. Peter Anvin
* Ingo Molnar <mingo@kernel.org> wrote:
>
> * Ingo Molnar <mingo@kernel.org> wrote:
>
> >
> > * Uros Bizjak <ubizjak@gmail.com> wrote:
> >
> > > [ ... PATCH 2/2 quoted in full, trimmed ... ]
> >
> > It would be really nice to have a before/after comparison of generated
> > assembly code in the changelog, to demonstrate the effectiveness of this
> > optimization.
>
> Never mind, you did exactly that in the September 6 variation of these
> changes. I'll apply those.
I mean, this third patch of yours:
[PATCH] x86/percpu: Define {raw,this}_cpu_try_cmpxchg{64,128}
Had a proper disassembly comparison - so I've applied all 3 optimization
patches to tip:x86/asm as:
b8e3dfa16ec5 ("x86/percpu: Use raw_cpu_try_cmpxchg() in preempt_count_set()")
5f863897d964 ("x86/percpu: Define raw_cpu_try_cmpxchg and this_cpu_try_cmpxchg()")
54cd971c6f44 ("x86/percpu: Define {raw,this}_cpu_try_cmpxchg{64,128}")
Thanks,
Ingo
* [tip: x86/asm] x86/percpu: Use raw_cpu_try_cmpxchg() in preempt_count_set()
@ 2023-09-15 11:25 ` tip-bot2 for Uros Bizjak
From: tip-bot2 for Uros Bizjak @ 2023-09-15 11:25 UTC (permalink / raw)
To: linux-tip-commits
Cc: Uros Bizjak, Ingo Molnar, Peter Zijlstra, x86, linux-kernel
The following commit has been merged into the x86/asm branch of tip:
Commit-ID: b8e3dfa16ec55f310dd95831614af3d24abf5ed5
Gitweb: https://git.kernel.org/tip/b8e3dfa16ec55f310dd95831614af3d24abf5ed5
Author: Uros Bizjak <ubizjak@gmail.com>
AuthorDate: Wed, 30 Aug 2023 17:13:57 +02:00
Committer: Ingo Molnar <mingo@kernel.org>
CommitterDate: Fri, 15 Sep 2023 13:19:22 +02:00
x86/percpu: Use raw_cpu_try_cmpxchg() in preempt_count_set()
Use raw_cpu_try_cmpxchg() instead of raw_cpu_cmpxchg(*ptr, old, new) == old.
The x86 CMPXCHG instruction returns success in the ZF flag, so this change
saves a compare after CMPXCHG (and the related MOV instruction in front of
CMPXCHG).
Also, raw_cpu_try_cmpxchg() implicitly assigns the old *ptr value to "old"
when CMPXCHG fails, so there is no need to re-read the value in the loop.
No functional change intended.
Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: https://lore.kernel.org/r/20230830151623.3900-2-ubizjak@gmail.com
---
arch/x86/include/asm/preempt.h | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
index 2d13f25..4527e14 100644
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -31,11 +31,11 @@ static __always_inline void preempt_count_set(int pc)
{
int old, new;
+ old = raw_cpu_read_4(pcpu_hot.preempt_count);
do {
- old = raw_cpu_read_4(pcpu_hot.preempt_count);
new = (old & PREEMPT_NEED_RESCHED) |
(pc & ~PREEMPT_NEED_RESCHED);
- } while (raw_cpu_cmpxchg_4(pcpu_hot.preempt_count, old, new) != old);
+ } while (!raw_cpu_try_cmpxchg_4(pcpu_hot.preempt_count, &old, new));
}
/*
* [tip: x86/asm] x86/percpu: Define raw_cpu_try_cmpxchg and this_cpu_try_cmpxchg()
@ 2023-09-15 11:25 ` tip-bot2 for Uros Bizjak
From: tip-bot2 for Uros Bizjak @ 2023-09-15 11:25 UTC (permalink / raw)
To: linux-tip-commits; +Cc: Uros Bizjak, Ingo Molnar, x86, linux-kernel
The following commit has been merged into the x86/asm branch of tip:
Commit-ID: 5f863897d964e834a0da35b1e483b5bb8faca522
Gitweb: https://git.kernel.org/tip/5f863897d964e834a0da35b1e483b5bb8faca522
Author: Uros Bizjak <ubizjak@gmail.com>
AuthorDate: Wed, 30 Aug 2023 17:13:56 +02:00
Committer: Ingo Molnar <mingo@kernel.org>
CommitterDate: Fri, 15 Sep 2023 13:18:23 +02:00
x86/percpu: Define raw_cpu_try_cmpxchg and this_cpu_try_cmpxchg()
Define target-specific raw_cpu_try_cmpxchg_N() and
this_cpu_try_cmpxchg_N() macros. These override the generic
fallback definitions with optimized, target-specific
implementations.
Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20230830151623.3900-1-ubizjak@gmail.com
---
arch/x86/include/asm/percpu.h | 27 +++++++++++++++++++++++++++
1 file changed, 27 insertions(+)
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 4c36419..a87db61 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -210,6 +210,25 @@ do { \
(typeof(_var))(unsigned long) pco_old__; \
})
+#define percpu_try_cmpxchg_op(size, qual, _var, _ovalp, _nval) \
+({ \
+ bool success; \
+ __pcpu_type_##size *pco_oval__ = (__pcpu_type_##size *)(_ovalp); \
+ __pcpu_type_##size pco_old__ = *pco_oval__; \
+ __pcpu_type_##size pco_new__ = __pcpu_cast_##size(_nval); \
+ asm qual (__pcpu_op2_##size("cmpxchg", "%[nval]", \
+ __percpu_arg([var])) \
+ CC_SET(z) \
+ : CC_OUT(z) (success), \
+ [oval] "+a" (pco_old__), \
+ [var] "+m" (_var) \
+ : [nval] __pcpu_reg_##size(, pco_new__) \
+ : "memory"); \
+ if (unlikely(!success)) \
+ *pco_oval__ = pco_old__; \
+ likely(success); \
+})
+
#if defined(CONFIG_X86_32) && !defined(CONFIG_UML)
#define percpu_cmpxchg64_op(size, qual, _var, _oval, _nval) \
({ \
@@ -410,6 +429,9 @@ do { \
#define raw_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(1, , pcp, oval, nval)
#define raw_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(2, , pcp, oval, nval)
#define raw_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(4, , pcp, oval, nval)
+#define raw_cpu_try_cmpxchg_1(pcp, ovalp, nval) percpu_try_cmpxchg_op(1, , pcp, ovalp, nval)
+#define raw_cpu_try_cmpxchg_2(pcp, ovalp, nval) percpu_try_cmpxchg_op(2, , pcp, ovalp, nval)
+#define raw_cpu_try_cmpxchg_4(pcp, ovalp, nval) percpu_try_cmpxchg_op(4, , pcp, ovalp, nval)
#define this_cpu_add_return_1(pcp, val) percpu_add_return_op(1, volatile, pcp, val)
#define this_cpu_add_return_2(pcp, val) percpu_add_return_op(2, volatile, pcp, val)
@@ -417,6 +439,9 @@ do { \
#define this_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(1, volatile, pcp, oval, nval)
#define this_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(2, volatile, pcp, oval, nval)
#define this_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(4, volatile, pcp, oval, nval)
+#define this_cpu_try_cmpxchg_1(pcp, ovalp, nval) percpu_try_cmpxchg_op(1, volatile, pcp, ovalp, nval)
+#define this_cpu_try_cmpxchg_2(pcp, ovalp, nval) percpu_try_cmpxchg_op(2, volatile, pcp, ovalp, nval)
+#define this_cpu_try_cmpxchg_4(pcp, ovalp, nval) percpu_try_cmpxchg_op(4, volatile, pcp, ovalp, nval)
/*
* Per cpu atomic 64 bit operations are only available under 64 bit.
@@ -431,6 +456,7 @@ do { \
#define raw_cpu_add_return_8(pcp, val) percpu_add_return_op(8, , pcp, val)
#define raw_cpu_xchg_8(pcp, nval) raw_percpu_xchg_op(pcp, nval)
#define raw_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(8, , pcp, oval, nval)
+#define raw_cpu_try_cmpxchg_8(pcp, ovalp, nval) percpu_try_cmpxchg_op(8, , pcp, ovalp, nval)
#define this_cpu_read_8(pcp) percpu_from_op(8, volatile, "mov", pcp)
#define this_cpu_write_8(pcp, val) percpu_to_op(8, volatile, "mov", (pcp), val)
@@ -440,6 +466,7 @@ do { \
#define this_cpu_add_return_8(pcp, val) percpu_add_return_op(8, volatile, pcp, val)
#define this_cpu_xchg_8(pcp, nval) percpu_xchg_op(8, volatile, pcp, nval)
#define this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(8, volatile, pcp, oval, nval)
+#define this_cpu_try_cmpxchg_8(pcp, ovalp, nval) percpu_try_cmpxchg_op(8, volatile, pcp, ovalp, nval)
#endif
static __always_inline bool x86_this_cpu_constant_test_bit(unsigned int nr,
* Re: [PATCH 2/2] x86/percpu: Use raw_cpu_try_cmpxchg in preempt_count_set
@ 2023-09-15 12:01 ` Uros Bizjak
From: Uros Bizjak @ 2023-09-15 12:01 UTC (permalink / raw)
To: Ingo Molnar
Cc: x86, linux-kernel, Peter Zijlstra, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, H. Peter Anvin
On Fri, Sep 15, 2023 at 11:47 AM Ingo Molnar <mingo@kernel.org> wrote:
>
>
> * Uros Bizjak <ubizjak@gmail.com> wrote:
>
> > [ ... PATCH 2/2 quoted in full, trimmed ... ]
>
> It would be really nice to have a before/after comparison of generated
> assembly code in the changelog, to demonstrate the effectiveness of this
> optimization.
The assembly code improvements are in line with other try_cmpxchg
conversions, but for reference, finish_task_switch() from
kernel/sched/core.c, which inlines preempt_count_set(), improves from:
5bad: 65 8b 0d 00 00 00 00 mov %gs:0x0(%rip),%ecx
5bb4: 89 ca mov %ecx,%edx
5bb6: 89 c8 mov %ecx,%eax
5bb8: 81 e2 00 00 00 80 and $0x80000000,%edx
5bbe: 83 ca 02 or $0x2,%edx
5bc1: 65 0f b1 15 00 00 00 cmpxchg %edx,%gs:0x0(%rip)
5bc8: 00
5bc9: 39 c1 cmp %eax,%ecx
5bcb: 75 e0 jne 5bad <...>
5bcd: e9 5a fe ff ff jmpq 5a2c <...>
5bd2:
to:
5bad: 65 8b 05 00 00 00 00 mov %gs:0x0(%rip),%eax
5bb4: 89 c2 mov %eax,%edx
5bb6: 81 e2 00 00 00 80 and $0x80000000,%edx
5bbc: 83 ca 02 or $0x2,%edx
5bbf: 65 0f b1 15 00 00 00 cmpxchg %edx,%gs:0x0(%rip)
5bc6: 00
5bc7: 0f 84 5f fe ff ff je 5a2c <...>
5bcd: eb e5 jmp 5bb4 <...>
5bcf:
Please note the missing CMP (and MOV), the loop without the extra memory
load from %gs:0x0(%rip), and the better-predicted jump in the latter case.
The improvements with {raw,this}_cpu_try_cmpxchg_128 in the third patch are
even more noticeable, because an __int128 value lives in a register pair, so
the comparison needs three separate machine instructions, in addition to a
move of the register pair.
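To make the 128-bit remark concrete, a sketch (the helper below is a
stand-in built on a compiler builtin, not the patch's macro; the asm in
the comment is indicative, not actual compiler output; build with
-mcx16 on x86-64):

typedef unsigned __int128 u128;

/* Open-coding "cmpxchg16b result != old" forces the compiler to save
 * the RDX:RAX pair and then compare both halves, roughly:
 *
 *	xor  %rsi, %rax		# low halves
 *	xor  %rdi, %rdx		# high halves
 *	or   %rdx, %rax
 *	jne  retry
 *
 * whereas the try_cmpxchg form reuses ZF straight from CMPXCHG16B and
 * needs only the "jne retry".  Like try_cmpxchg, the builtin writes
 * the observed value back through "old" on failure. */
static inline bool u128_try_cmpxchg(u128 *ptr, u128 *old, u128 new)
{
	return __atomic_compare_exchange_n(ptr, old, new, false,
					   __ATOMIC_RELAXED,
					   __ATOMIC_RELAXED);
}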
Thanks,
Uros.