public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH v3 1/4] locking/atomic/x86: Introduce arch_atomic64_try_cmpxchg to x86_32
@ 2024-04-10  6:29 Uros Bizjak
  2024-04-10  6:29 ` [PATCH v3 2/4] locking/atomic/x86: Introduce arch_atomic64_read_nonatomic " Uros Bizjak
                   ` (3 more replies)
  0 siblings, 4 replies; 8+ messages in thread
From: Uros Bizjak @ 2024-04-10  6:29 UTC (permalink / raw)
  To: x86, linux-kernel
  Cc: Uros Bizjak, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, H. Peter Anvin, Peter Zijlstra

Introduce arch_atomic64_try_cmpxchg() for 32-bit targets to use an
optimized target-specific implementation instead of the generic one.
This implementation eliminates the dual-word compare after the
cmpxchg8b instruction and improves the generated asm code from:

    2273:	f0 0f c7 0f          	lock cmpxchg8b (%edi)
    2277:	8b 74 24 2c          	mov    0x2c(%esp),%esi
    227b:	89 d3                	mov    %edx,%ebx
    227d:	89 c2                	mov    %eax,%edx
    227f:	89 5c 24 10          	mov    %ebx,0x10(%esp)
    2283:	8b 7c 24 30          	mov    0x30(%esp),%edi
    2287:	89 44 24 1c          	mov    %eax,0x1c(%esp)
    228b:	31 f2                	xor    %esi,%edx
    228d:	89 d0                	mov    %edx,%eax
    228f:	89 da                	mov    %ebx,%edx
    2291:	31 fa                	xor    %edi,%edx
    2293:	09 d0                	or     %edx,%eax
    2295:	0f 85 a5 00 00 00    	jne    2340 <...>

to:

    2270:	f0 0f c7 0f          	lock cmpxchg8b (%edi)
    2274:	0f 85 a6 00 00 00    	jne    2320 <...>

Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Peter Zijlstra <peterz@infradead.org>
---
 arch/x86/include/asm/atomic64_32.h | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h
index d510405e4e1d..11e817dab44a 100644
--- a/arch/x86/include/asm/atomic64_32.h
+++ b/arch/x86/include/asm/atomic64_32.h
@@ -61,12 +61,18 @@ ATOMIC64_DECL(add_unless);
 #undef __ATOMIC64_DECL
 #undef ATOMIC64_EXPORT
 
-static __always_inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 o, s64 n)
+static __always_inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new)
 {
-	return arch_cmpxchg64(&v->counter, o, n);
+	return arch_cmpxchg64(&v->counter, old, new);
 }
 #define arch_atomic64_cmpxchg arch_atomic64_cmpxchg
 
+static __always_inline bool arch_atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new)
+{
+	return arch_try_cmpxchg64(&v->counter, old, new);
+}
+#define arch_atomic64_try_cmpxchg arch_atomic64_try_cmpxchg
+
 static __always_inline s64 arch_atomic64_xchg(atomic64_t *v, s64 n)
 {
 	s64 o;
-- 
2.44.0


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH v3 2/4] locking/atomic/x86: Introduce arch_atomic64_read_nonatomic to x86_32
  2024-04-10  6:29 [PATCH v3 1/4] locking/atomic/x86: Introduce arch_atomic64_try_cmpxchg to x86_32 Uros Bizjak
@ 2024-04-10  6:29 ` Uros Bizjak
  2024-04-10 13:40   ` [tip: locking/core] locking/atomic/x86: Introduce arch_atomic64_read_nonatomic() " tip-bot2 for Uros Bizjak
  2024-04-10  6:29 ` [PATCH v3 3/4] locking/atomic/x86: Rewrite x86_32 arch_atomic64_{,fetch}_{and,or,xor}() functions Uros Bizjak
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 8+ messages in thread
From: Uros Bizjak @ 2024-04-10  6:29 UTC (permalink / raw)
  To: x86, linux-kernel
  Cc: Uros Bizjak, Mark Rutland, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, H. Peter Anvin, Peter Zijlstra

Introduce arch_atomic64_read_nonatomic for 32-bit targets to load
the value from atomic64_t location in a non-atomic way. This
function is intended to be used in cases where a subsequent atomic
operation will handle the torn value, and can be used to prime the
first iteration of unconditional try_cmpxchg() loops.

Suggested-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Peter Zijlstra <peterz@infradead.org>
---
v2: New patch.
v3: Expand function comment.
---
 arch/x86/include/asm/atomic64_32.h | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h
index 11e817dab44a..bc76a88ae481 100644
--- a/arch/x86/include/asm/atomic64_32.h
+++ b/arch/x86/include/asm/atomic64_32.h
@@ -14,6 +14,32 @@ typedef struct {
 
 #define ATOMIC64_INIT(val)	{ (val) }
 
+/*
+ * Read an atomic64_t non-atomically.
+ *
+ * This is intended to be used in cases where a subsequent atomic operation
+ * will handle the torn value, and can be used to prime the first iteration
+ * of unconditional try_cmpxchg() loops, e.g.:
+ *
+ * 	s64 val = arch_atomic64_read_nonatomic(v);
+ * 	do { } while (!arch_atomic64_try_cmpxchg(v, &val, val OP i));
+ *
+ * This is NOT safe to use where the value is not always checked by a
+ * subsequent atomic operation, such as in conditional try_cmpxchg() loops
+ * that can break before the atomic operation, e.g.:
+ *
+ * 	s64 val = arch_atomic64_read_nonatomic(v);
+ * 	do {
+ * 		if (condition(val))
+ * 			break;
+ * 	} while (!arch_atomic64_try_cmpxchg(v, &val, val OP i));
+ */
+static __always_inline s64 arch_atomic64_read_nonatomic(const atomic64_t *v)
+{
+	/* See comment in arch_atomic_read(). */
+	return __READ_ONCE(v->counter);
+}
+
 #define __ATOMIC64_DECL(sym) void atomic64_##sym(atomic64_t *, ...)
 #ifndef ATOMIC64_EXPORT
 #define ATOMIC64_DECL_ONE __ATOMIC64_DECL
-- 
2.44.0


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH v3 3/4] locking/atomic/x86: Rewrite x86_32 arch_atomic64_{,fetch}_{and,or,xor}() functions
  2024-04-10  6:29 [PATCH v3 1/4] locking/atomic/x86: Introduce arch_atomic64_try_cmpxchg to x86_32 Uros Bizjak
  2024-04-10  6:29 ` [PATCH v3 2/4] locking/atomic/x86: Introduce arch_atomic64_read_nonatomic " Uros Bizjak
@ 2024-04-10  6:29 ` Uros Bizjak
  2024-04-10 13:40   ` [tip: locking/core] " tip-bot2 for Uros Bizjak
  2024-04-10  6:29 ` [PATCH v3 4/4] locking/atomic/x86: Define arch_atomic_sub() family using arch_atomic_add() functions Uros Bizjak
  2024-04-10 13:40 ` [tip: locking/core] locking/atomic/x86: Introduce arch_atomic64_try_cmpxchg() to x86_32 tip-bot2 for Uros Bizjak
  3 siblings, 1 reply; 8+ messages in thread
From: Uros Bizjak @ 2024-04-10  6:29 UTC (permalink / raw)
  To: x86, linux-kernel
  Cc: Uros Bizjak, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, H. Peter Anvin, Peter Zijlstra

Rewrite x86_32 arch_atomic64_{,fetch}_{and,or,xor}() functions, as well
as arch_atomic64_fetch_add(), to use arch_atomic64_try_cmpxchg().  This
implementation avoids one extra trip through the cmpxchg loop.

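The value preload before the cmpxchg loop does not need to be atomic:
at worst, a torn value makes the first arch_atomic64_try_cmpxchg() fail,
which updates the value for the retry. Use
arch_atomic64_read_nonatomic(v) to load the value from the atomic64_t
location in a non-atomic way.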

The generated code improves from:

  1917d5:	31 c9                	xor    %ecx,%ecx
  1917d7:	31 db                	xor    %ebx,%ebx
  1917d9:	89 4c 24 3c          	mov    %ecx,0x3c(%esp)
  1917dd:	8b 74 24 24          	mov    0x24(%esp),%esi
  1917e1:	89 c8                	mov    %ecx,%eax
  1917e3:	89 5c 24 34          	mov    %ebx,0x34(%esp)
  1917e7:	8b 7c 24 28          	mov    0x28(%esp),%edi
  1917eb:	21 ce                	and    %ecx,%esi
  1917ed:	89 74 24 4c          	mov    %esi,0x4c(%esp)
  1917f1:	21 df                	and    %ebx,%edi
  1917f3:	89 de                	mov    %ebx,%esi
  1917f5:	89 7c 24 50          	mov    %edi,0x50(%esp)
  1917f9:	8b 54 24 4c          	mov    0x4c(%esp),%edx
  1917fd:	8b 7c 24 2c          	mov    0x2c(%esp),%edi
  191801:	8b 4c 24 50          	mov    0x50(%esp),%ecx
  191805:	89 d3                	mov    %edx,%ebx
  191807:	89 f2                	mov    %esi,%edx
  191809:	f0 0f c7 0f          	lock cmpxchg8b (%edi)
  19180d:	89 c1                	mov    %eax,%ecx
  19180f:	8b 74 24 34          	mov    0x34(%esp),%esi
  191813:	89 d3                	mov    %edx,%ebx
  191815:	89 44 24 4c          	mov    %eax,0x4c(%esp)
  191819:	8b 44 24 3c          	mov    0x3c(%esp),%eax
  19181d:	89 df                	mov    %ebx,%edi
  19181f:	89 54 24 44          	mov    %edx,0x44(%esp)
  191823:	89 ca                	mov    %ecx,%edx
  191825:	31 de                	xor    %ebx,%esi
  191827:	31 c8                	xor    %ecx,%eax
  191829:	09 f0                	or     %esi,%eax
  19182b:	75 ac                	jne    1917d9 <...>

to:

  1912ba:	8b 06                	mov    (%esi),%eax
  1912bc:	8b 56 04             	mov    0x4(%esi),%edx
  1912bf:	89 44 24 3c          	mov    %eax,0x3c(%esp)
  1912c3:	89 c1                	mov    %eax,%ecx
  1912c5:	23 4c 24 34          	and    0x34(%esp),%ecx
  1912c9:	89 d3                	mov    %edx,%ebx
  1912cb:	23 5c 24 38          	and    0x38(%esp),%ebx
  1912cf:	89 54 24 40          	mov    %edx,0x40(%esp)
  1912d3:	89 4c 24 2c          	mov    %ecx,0x2c(%esp)
  1912d7:	89 5c 24 30          	mov    %ebx,0x30(%esp)
  1912db:	8b 5c 24 2c          	mov    0x2c(%esp),%ebx
  1912df:	8b 4c 24 30          	mov    0x30(%esp),%ecx
  1912e3:	f0 0f c7 0e          	lock cmpxchg8b (%esi)
  1912e7:	0f 85 f3 02 00 00    	jne    1915e0 <...>

Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Peter Zijlstra <peterz@infradead.org>
---
v2: Use arch_atomic64_read_nonatomic().
---
 arch/x86/include/asm/atomic64_32.h | 43 +++++++++++++-----------------
 1 file changed, 18 insertions(+), 25 deletions(-)

diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h
index bc76a88ae481..8db2ec4d6cda 100644
--- a/arch/x86/include/asm/atomic64_32.h
+++ b/arch/x86/include/asm/atomic64_32.h
@@ -227,69 +227,62 @@ static __always_inline s64 arch_atomic64_dec_if_positive(atomic64_t *v)
 
 static __always_inline void arch_atomic64_and(s64 i, atomic64_t *v)
 {
-	s64 old, c = 0;
+	s64 val = arch_atomic64_read_nonatomic(v);
 
-	while ((old = arch_atomic64_cmpxchg(v, c, c & i)) != c)
-		c = old;
+	do { } while (!arch_atomic64_try_cmpxchg(v, &val, val & i));
 }
 
 static __always_inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v)
 {
-	s64 old, c = 0;
+	s64 val = arch_atomic64_read_nonatomic(v);
 
-	while ((old = arch_atomic64_cmpxchg(v, c, c & i)) != c)
-		c = old;
+	do { } while (!arch_atomic64_try_cmpxchg(v, &val, val & i));
 
-	return old;
+	return val;
 }
 #define arch_atomic64_fetch_and arch_atomic64_fetch_and
 
 static __always_inline void arch_atomic64_or(s64 i, atomic64_t *v)
 {
-	s64 old, c = 0;
+	s64 val = arch_atomic64_read_nonatomic(v);
 
-	while ((old = arch_atomic64_cmpxchg(v, c, c | i)) != c)
-		c = old;
+	do { } while (!arch_atomic64_try_cmpxchg(v, &val, val | i));
 }
 
 static __always_inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v)
 {
-	s64 old, c = 0;
+	s64 val = arch_atomic64_read_nonatomic(v);
 
-	while ((old = arch_atomic64_cmpxchg(v, c, c | i)) != c)
-		c = old;
+	do { } while (!arch_atomic64_try_cmpxchg(v, &val, val | i));
 
-	return old;
+	return val;
 }
 #define arch_atomic64_fetch_or arch_atomic64_fetch_or
 
 static __always_inline void arch_atomic64_xor(s64 i, atomic64_t *v)
 {
-	s64 old, c = 0;
+	s64 val = arch_atomic64_read_nonatomic(v);
 
-	while ((old = arch_atomic64_cmpxchg(v, c, c ^ i)) != c)
-		c = old;
+	do { } while (!arch_atomic64_try_cmpxchg(v, &val, val ^ i));
 }
 
 static __always_inline s64 arch_atomic64_fetch_xor(s64 i, atomic64_t *v)
 {
-	s64 old, c = 0;
+	s64 val = arch_atomic64_read_nonatomic(v);
 
-	while ((old = arch_atomic64_cmpxchg(v, c, c ^ i)) != c)
-		c = old;
+	do { } while (!arch_atomic64_try_cmpxchg(v, &val, val ^ i));
 
-	return old;
+	return val;
 }
 #define arch_atomic64_fetch_xor arch_atomic64_fetch_xor
 
 static __always_inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v)
 {
-	s64 old, c = 0;
+	s64 val = arch_atomic64_read_nonatomic(v);
 
-	while ((old = arch_atomic64_cmpxchg(v, c, c + i)) != c)
-		c = old;
+	do { } while (!arch_atomic64_try_cmpxchg(v, &val, val + i));
 
-	return old;
+	return val;
 }
 #define arch_atomic64_fetch_add arch_atomic64_fetch_add
 
-- 
2.44.0


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH v3 4/4] locking/atomic/x86: Define arch_atomic_sub() family using arch_atomic_add() functions
  2024-04-10  6:29 [PATCH v3 1/4] locking/atomic/x86: Introduce arch_atomic64_try_cmpxchg to x86_32 Uros Bizjak
  2024-04-10  6:29 ` [PATCH v3 2/4] locking/atomic/x86: Introduce arch_atomic64_read_nonatomic " Uros Bizjak
  2024-04-10  6:29 ` [PATCH v3 3/4] locking/atomic/x86: Rewrite x86_32 arch_atomic64_{,fetch}_{and,or,xor}() functions Uros Bizjak
@ 2024-04-10  6:29 ` Uros Bizjak
  2024-04-10 13:40   ` [tip: locking/core] " tip-bot2 for Uros Bizjak
  2024-04-10 13:40 ` [tip: locking/core] locking/atomic/x86: Introduce arch_atomic64_try_cmpxchg() to x86_32 tip-bot2 for Uros Bizjak
  3 siblings, 1 reply; 8+ messages in thread
From: Uros Bizjak @ 2024-04-10  6:29 UTC (permalink / raw)
  To: x86, linux-kernel
  Cc: Uros Bizjak, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, H. Peter Anvin, Peter Zijlstra

There is no need to implement the arch_atomic_sub() family of inline
functions; the corresponding macros can be implemented directly using
the arch_atomic_add() inlines with a negated argument.

No functional changes intended.

Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Peter Zijlstra <peterz@infradead.org>
---
 arch/x86/include/asm/atomic.h      | 12 ++----------
 arch/x86/include/asm/atomic64_64.h | 12 ++----------
 2 files changed, 4 insertions(+), 20 deletions(-)

diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index 55a55ec04350..55b4d24356ea 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -86,11 +86,7 @@ static __always_inline int arch_atomic_add_return(int i, atomic_t *v)
 }
 #define arch_atomic_add_return arch_atomic_add_return
 
-static __always_inline int arch_atomic_sub_return(int i, atomic_t *v)
-{
-	return arch_atomic_add_return(-i, v);
-}
-#define arch_atomic_sub_return arch_atomic_sub_return
+#define arch_atomic_sub_return(i, v) arch_atomic_add_return(-(i), v)
 
 static __always_inline int arch_atomic_fetch_add(int i, atomic_t *v)
 {
@@ -98,11 +94,7 @@ static __always_inline int arch_atomic_fetch_add(int i, atomic_t *v)
 }
 #define arch_atomic_fetch_add arch_atomic_fetch_add
 
-static __always_inline int arch_atomic_fetch_sub(int i, atomic_t *v)
-{
-	return xadd(&v->counter, -i);
-}
-#define arch_atomic_fetch_sub arch_atomic_fetch_sub
+#define arch_atomic_fetch_sub(i, v) arch_atomic_fetch_add(-(i), v)
 
 static __always_inline int arch_atomic_cmpxchg(atomic_t *v, int old, int new)
 {
diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h
index 3165c0feedf7..ae12acae5b06 100644
--- a/arch/x86/include/asm/atomic64_64.h
+++ b/arch/x86/include/asm/atomic64_64.h
@@ -80,11 +80,7 @@ static __always_inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v)
 }
 #define arch_atomic64_add_return arch_atomic64_add_return
 
-static __always_inline s64 arch_atomic64_sub_return(s64 i, atomic64_t *v)
-{
-	return arch_atomic64_add_return(-i, v);
-}
-#define arch_atomic64_sub_return arch_atomic64_sub_return
+#define arch_atomic64_sub_return(i, v) arch_atomic64_add_return(-(i), v)
 
 static __always_inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v)
 {
@@ -92,11 +88,7 @@ static __always_inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v)
 }
 #define arch_atomic64_fetch_add arch_atomic64_fetch_add
 
-static __always_inline s64 arch_atomic64_fetch_sub(s64 i, atomic64_t *v)
-{
-	return xadd(&v->counter, -i);
-}
-#define arch_atomic64_fetch_sub arch_atomic64_fetch_sub
+#define arch_atomic64_fetch_sub(i, v) arch_atomic64_fetch_add(-(i), v)
 
 static __always_inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new)
 {
-- 
2.44.0


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [tip: locking/core] locking/atomic/x86: Define arch_atomic_sub() family using arch_atomic_add() functions
  2024-04-10  6:29 ` [PATCH v3 4/4] locking/atomic/x86: Define arch_atomic_sub() family using arch_atomic_add() functions Uros Bizjak
@ 2024-04-10 13:40   ` tip-bot2 for Uros Bizjak
  0 siblings, 0 replies; 8+ messages in thread
From: tip-bot2 for Uros Bizjak @ 2024-04-10 13:40 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Uros Bizjak, Ingo Molnar, Linus Torvalds, x86, linux-kernel

The following commit has been merged into the locking/core branch of tip:

Commit-ID:     21689e4bfb9ae8f8b45279c53faecaa5a056ffa5
Gitweb:        https://git.kernel.org/tip/21689e4bfb9ae8f8b45279c53faecaa5a056ffa5
Author:        Uros Bizjak <ubizjak@gmail.com>
AuthorDate:    Wed, 10 Apr 2024 08:29:36 +02:00
Committer:     Ingo Molnar <mingo@kernel.org>
CommitterDate: Wed, 10 Apr 2024 15:04:55 +02:00

locking/atomic/x86: Define arch_atomic_sub() family using arch_atomic_add() functions

There is no need to implement the arch_atomic_sub() family of inline
functions; the corresponding macros can be implemented directly using
the arch_atomic_add() inlines with a negated argument.

No functional changes intended.

Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://lore.kernel.org/r/20240410062957.322614-4-ubizjak@gmail.com
---
 arch/x86/include/asm/atomic.h      | 12 ++----------
 arch/x86/include/asm/atomic64_64.h | 12 ++----------
 2 files changed, 4 insertions(+), 20 deletions(-)

diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index 55a55ec..55b4d24 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -86,11 +86,7 @@ static __always_inline int arch_atomic_add_return(int i, atomic_t *v)
 }
 #define arch_atomic_add_return arch_atomic_add_return
 
-static __always_inline int arch_atomic_sub_return(int i, atomic_t *v)
-{
-	return arch_atomic_add_return(-i, v);
-}
-#define arch_atomic_sub_return arch_atomic_sub_return
+#define arch_atomic_sub_return(i, v) arch_atomic_add_return(-(i), v)
 
 static __always_inline int arch_atomic_fetch_add(int i, atomic_t *v)
 {
@@ -98,11 +94,7 @@ static __always_inline int arch_atomic_fetch_add(int i, atomic_t *v)
 }
 #define arch_atomic_fetch_add arch_atomic_fetch_add
 
-static __always_inline int arch_atomic_fetch_sub(int i, atomic_t *v)
-{
-	return xadd(&v->counter, -i);
-}
-#define arch_atomic_fetch_sub arch_atomic_fetch_sub
+#define arch_atomic_fetch_sub(i, v) arch_atomic_fetch_add(-(i), v)
 
 static __always_inline int arch_atomic_cmpxchg(atomic_t *v, int old, int new)
 {
diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h
index 3165c0f..ae12aca 100644
--- a/arch/x86/include/asm/atomic64_64.h
+++ b/arch/x86/include/asm/atomic64_64.h
@@ -80,11 +80,7 @@ static __always_inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v)
 }
 #define arch_atomic64_add_return arch_atomic64_add_return
 
-static __always_inline s64 arch_atomic64_sub_return(s64 i, atomic64_t *v)
-{
-	return arch_atomic64_add_return(-i, v);
-}
-#define arch_atomic64_sub_return arch_atomic64_sub_return
+#define arch_atomic64_sub_return(i, v) arch_atomic64_add_return(-(i), v)
 
 static __always_inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v)
 {
@@ -92,11 +88,7 @@ static __always_inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v)
 }
 #define arch_atomic64_fetch_add arch_atomic64_fetch_add
 
-static __always_inline s64 arch_atomic64_fetch_sub(s64 i, atomic64_t *v)
-{
-	return xadd(&v->counter, -i);
-}
-#define arch_atomic64_fetch_sub arch_atomic64_fetch_sub
+#define arch_atomic64_fetch_sub(i, v) arch_atomic64_fetch_add(-(i), v)
 
 static __always_inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new)
 {

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [tip: locking/core] locking/atomic/x86: Introduce arch_atomic64_read_nonatomic() to x86_32
  2024-04-10  6:29 ` [PATCH v3 2/4] locking/atomic/x86: Introduce arch_atomic64_read_nonatomic " Uros Bizjak
@ 2024-04-10 13:40   ` tip-bot2 for Uros Bizjak
  0 siblings, 0 replies; 8+ messages in thread
From: tip-bot2 for Uros Bizjak @ 2024-04-10 13:40 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Mark Rutland, Uros Bizjak, Ingo Molnar, Linus Torvalds, x86,
	linux-kernel

The following commit has been merged into the locking/core branch of tip:

Commit-ID:     e73c4e34a0e9e3dfcb4e5ee4ccd3039a7b603218
Gitweb:        https://git.kernel.org/tip/e73c4e34a0e9e3dfcb4e5ee4ccd3039a7b603218
Author:        Uros Bizjak <ubizjak@gmail.com>
AuthorDate:    Wed, 10 Apr 2024 08:29:34 +02:00
Committer:     Ingo Molnar <mingo@kernel.org>
CommitterDate: Wed, 10 Apr 2024 15:04:54 +02:00

locking/atomic/x86: Introduce arch_atomic64_read_nonatomic() to x86_32

Introduce arch_atomic64_read_nonatomic() for 32-bit targets to load
the value from atomic64_t location in a non-atomic way. This
function is intended to be used in cases where a subsequent atomic
operation will handle the torn value, and can be used to prime the
first iteration of unconditional try_cmpxchg() loops.

Suggested-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://lore.kernel.org/r/20240410062957.322614-2-ubizjak@gmail.com
---
 arch/x86/include/asm/atomic64_32.h | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h
index ec217aa..bbdf174 100644
--- a/arch/x86/include/asm/atomic64_32.h
+++ b/arch/x86/include/asm/atomic64_32.h
@@ -14,6 +14,32 @@ typedef struct {
 
 #define ATOMIC64_INIT(val)	{ (val) }
 
+/*
+ * Read an atomic64_t non-atomically.
+ *
+ * This is intended to be used in cases where a subsequent atomic operation
+ * will handle the torn value, and can be used to prime the first iteration
+ * of unconditional try_cmpxchg() loops, e.g.:
+ *
+ * 	s64 val = arch_atomic64_read_nonatomic(v);
+ * 	do { } while (!arch_atomic64_try_cmpxchg(v, &val, val OP i));
+ *
+ * This is NOT safe to use where the value is not always checked by a
+ * subsequent atomic operation, such as in conditional try_cmpxchg() loops
+ * that can break before the atomic operation, e.g.:
+ *
+ * 	s64 val = arch_atomic64_read_nonatomic(v);
+ * 	do {
+ * 		if (condition(val))
+ * 			break;
+ * 	} while (!arch_atomic64_try_cmpxchg(v, &val, val OP i));
+ */
+static __always_inline s64 arch_atomic64_read_nonatomic(const atomic64_t *v)
+{
+	/* See comment in arch_atomic_read(). */
+	return __READ_ONCE(v->counter);
+}
+
 #define __ATOMIC64_DECL(sym) void atomic64_##sym(atomic64_t *, ...)
 #ifndef ATOMIC64_EXPORT
 #define ATOMIC64_DECL_ONE __ATOMIC64_DECL

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [tip: locking/core] locking/atomic/x86: Rewrite x86_32 arch_atomic64_{,fetch}_{and,or,xor}() functions
  2024-04-10  6:29 ` [PATCH v3 3/4] locking/atomic/x86: Rewrite x86_32 arch_atomic64_{,fetch}_{and,or,xor}() functions Uros Bizjak
@ 2024-04-10 13:40   ` tip-bot2 for Uros Bizjak
  0 siblings, 0 replies; 8+ messages in thread
From: tip-bot2 for Uros Bizjak @ 2024-04-10 13:40 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Uros Bizjak, Ingo Molnar, Linus Torvalds, x86, linux-kernel

The following commit has been merged into the locking/core branch of tip:

Commit-ID:     95ece48165c136b96fae0f6144f55cbf8b24aeb9
Gitweb:        https://git.kernel.org/tip/95ece48165c136b96fae0f6144f55cbf8b24aeb9
Author:        Uros Bizjak <ubizjak@gmail.com>
AuthorDate:    Wed, 10 Apr 2024 08:29:35 +02:00
Committer:     Ingo Molnar <mingo@kernel.org>
CommitterDate: Wed, 10 Apr 2024 15:04:55 +02:00

locking/atomic/x86: Rewrite x86_32 arch_atomic64_{,fetch}_{and,or,xor}() functions

Rewrite x86_32 arch_atomic64_{,fetch}_{and,or,xor}() functions, as well
as arch_atomic64_fetch_add(), to use arch_atomic64_try_cmpxchg().  This
implementation avoids one extra trip through the CMPXCHG loop.

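The value preload before the cmpxchg loop does not need to be atomic:
at worst, a torn value makes the first arch_atomic64_try_cmpxchg() fail,
which updates the value for the retry. Use
arch_atomic64_read_nonatomic(v) to load the value from the atomic64_t
location in a non-atomic way.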

The generated code improves from:

  1917d5:	31 c9                	xor    %ecx,%ecx
  1917d7:	31 db                	xor    %ebx,%ebx
  1917d9:	89 4c 24 3c          	mov    %ecx,0x3c(%esp)
  1917dd:	8b 74 24 24          	mov    0x24(%esp),%esi
  1917e1:	89 c8                	mov    %ecx,%eax
  1917e3:	89 5c 24 34          	mov    %ebx,0x34(%esp)
  1917e7:	8b 7c 24 28          	mov    0x28(%esp),%edi
  1917eb:	21 ce                	and    %ecx,%esi
  1917ed:	89 74 24 4c          	mov    %esi,0x4c(%esp)
  1917f1:	21 df                	and    %ebx,%edi
  1917f3:	89 de                	mov    %ebx,%esi
  1917f5:	89 7c 24 50          	mov    %edi,0x50(%esp)
  1917f9:	8b 54 24 4c          	mov    0x4c(%esp),%edx
  1917fd:	8b 7c 24 2c          	mov    0x2c(%esp),%edi
  191801:	8b 4c 24 50          	mov    0x50(%esp),%ecx
  191805:	89 d3                	mov    %edx,%ebx
  191807:	89 f2                	mov    %esi,%edx
  191809:	f0 0f c7 0f          	lock cmpxchg8b (%edi)
  19180d:	89 c1                	mov    %eax,%ecx
  19180f:	8b 74 24 34          	mov    0x34(%esp),%esi
  191813:	89 d3                	mov    %edx,%ebx
  191815:	89 44 24 4c          	mov    %eax,0x4c(%esp)
  191819:	8b 44 24 3c          	mov    0x3c(%esp),%eax
  19181d:	89 df                	mov    %ebx,%edi
  19181f:	89 54 24 44          	mov    %edx,0x44(%esp)
  191823:	89 ca                	mov    %ecx,%edx
  191825:	31 de                	xor    %ebx,%esi
  191827:	31 c8                	xor    %ecx,%eax
  191829:	09 f0                	or     %esi,%eax
  19182b:	75 ac                	jne    1917d9 <...>

to:

  1912ba:	8b 06                	mov    (%esi),%eax
  1912bc:	8b 56 04             	mov    0x4(%esi),%edx
  1912bf:	89 44 24 3c          	mov    %eax,0x3c(%esp)
  1912c3:	89 c1                	mov    %eax,%ecx
  1912c5:	23 4c 24 34          	and    0x34(%esp),%ecx
  1912c9:	89 d3                	mov    %edx,%ebx
  1912cb:	23 5c 24 38          	and    0x38(%esp),%ebx
  1912cf:	89 54 24 40          	mov    %edx,0x40(%esp)
  1912d3:	89 4c 24 2c          	mov    %ecx,0x2c(%esp)
  1912d7:	89 5c 24 30          	mov    %ebx,0x30(%esp)
  1912db:	8b 5c 24 2c          	mov    0x2c(%esp),%ebx
  1912df:	8b 4c 24 30          	mov    0x30(%esp),%ecx
  1912e3:	f0 0f c7 0e          	lock cmpxchg8b (%esi)
  1912e7:	0f 85 f3 02 00 00    	jne    1915e0 <...>

Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://lore.kernel.org/r/20240410062957.322614-3-ubizjak@gmail.com
---
 arch/x86/include/asm/atomic64_32.h | 43 ++++++++++++-----------------
 1 file changed, 18 insertions(+), 25 deletions(-)

diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h
index bbdf174..40ff73b 100644
--- a/arch/x86/include/asm/atomic64_32.h
+++ b/arch/x86/include/asm/atomic64_32.h
@@ -227,69 +227,62 @@ static __always_inline s64 arch_atomic64_dec_if_positive(atomic64_t *v)
 
 static __always_inline void arch_atomic64_and(s64 i, atomic64_t *v)
 {
-	s64 old, c = 0;
+	s64 val = arch_atomic64_read_nonatomic(v);
 
-	while ((old = arch_atomic64_cmpxchg(v, c, c & i)) != c)
-		c = old;
+	do { } while (!arch_atomic64_try_cmpxchg(v, &val, val & i));
 }
 
 static __always_inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v)
 {
-	s64 old, c = 0;
+	s64 val = arch_atomic64_read_nonatomic(v);
 
-	while ((old = arch_atomic64_cmpxchg(v, c, c & i)) != c)
-		c = old;
+	do { } while (!arch_atomic64_try_cmpxchg(v, &val, val & i));
 
-	return old;
+	return val;
 }
 #define arch_atomic64_fetch_and arch_atomic64_fetch_and
 
 static __always_inline void arch_atomic64_or(s64 i, atomic64_t *v)
 {
-	s64 old, c = 0;
+	s64 val = arch_atomic64_read_nonatomic(v);
 
-	while ((old = arch_atomic64_cmpxchg(v, c, c | i)) != c)
-		c = old;
+	do { } while (!arch_atomic64_try_cmpxchg(v, &val, val | i));
 }
 
 static __always_inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v)
 {
-	s64 old, c = 0;
+	s64 val = arch_atomic64_read_nonatomic(v);
 
-	while ((old = arch_atomic64_cmpxchg(v, c, c | i)) != c)
-		c = old;
+	do { } while (!arch_atomic64_try_cmpxchg(v, &val, val | i));
 
-	return old;
+	return val;
 }
 #define arch_atomic64_fetch_or arch_atomic64_fetch_or
 
 static __always_inline void arch_atomic64_xor(s64 i, atomic64_t *v)
 {
-	s64 old, c = 0;
+	s64 val = arch_atomic64_read_nonatomic(v);
 
-	while ((old = arch_atomic64_cmpxchg(v, c, c ^ i)) != c)
-		c = old;
+	do { } while (!arch_atomic64_try_cmpxchg(v, &val, val ^ i));
 }
 
 static __always_inline s64 arch_atomic64_fetch_xor(s64 i, atomic64_t *v)
 {
-	s64 old, c = 0;
+	s64 val = arch_atomic64_read_nonatomic(v);
 
-	while ((old = arch_atomic64_cmpxchg(v, c, c ^ i)) != c)
-		c = old;
+	do { } while (!arch_atomic64_try_cmpxchg(v, &val, val ^ i));
 
-	return old;
+	return val;
 }
 #define arch_atomic64_fetch_xor arch_atomic64_fetch_xor
 
 static __always_inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v)
 {
-	s64 old, c = 0;
+	s64 val = arch_atomic64_read_nonatomic(v);
 
-	while ((old = arch_atomic64_cmpxchg(v, c, c + i)) != c)
-		c = old;
+	do { } while (!arch_atomic64_try_cmpxchg(v, &val, val + i));
 
-	return old;
+	return val;
 }
 #define arch_atomic64_fetch_add arch_atomic64_fetch_add
 

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [tip: locking/core] locking/atomic/x86: Introduce arch_atomic64_try_cmpxchg() to x86_32
  2024-04-10  6:29 [PATCH v3 1/4] locking/atomic/x86: Introduce arch_atomic64_try_cmpxchg to x86_32 Uros Bizjak
                   ` (2 preceding siblings ...)
  2024-04-10  6:29 ` [PATCH v3 4/4] locking/atomic/x86: Define arch_atomic_sub() family using arch_atomic_add() functions Uros Bizjak
@ 2024-04-10 13:40 ` tip-bot2 for Uros Bizjak
  3 siblings, 0 replies; 8+ messages in thread
From: tip-bot2 for Uros Bizjak @ 2024-04-10 13:40 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Uros Bizjak, Ingo Molnar, Linus Torvalds, x86, linux-kernel

The following commit has been merged into the locking/core branch of tip:

Commit-ID:     276b893049e4cdc2f33c009706a75ec18a114485
Gitweb:        https://git.kernel.org/tip/276b893049e4cdc2f33c009706a75ec18a114485
Author:        Uros Bizjak <ubizjak@gmail.com>
AuthorDate:    Wed, 10 Apr 2024 08:29:33 +02:00
Committer:     Ingo Molnar <mingo@kernel.org>
CommitterDate: Wed, 10 Apr 2024 15:04:54 +02:00

locking/atomic/x86: Introduce arch_atomic64_try_cmpxchg() to x86_32

Introduce arch_atomic64_try_cmpxchg() for 32-bit targets to use an
optimized target-specific implementation instead of the generic one.
This implementation eliminates the dual-word compare after the
cmpxchg8b instruction and improves the generated asm code from:

    2273:	f0 0f c7 0f          	lock cmpxchg8b (%edi)
    2277:	8b 74 24 2c          	mov    0x2c(%esp),%esi
    227b:	89 d3                	mov    %edx,%ebx
    227d:	89 c2                	mov    %eax,%edx
    227f:	89 5c 24 10          	mov    %ebx,0x10(%esp)
    2283:	8b 7c 24 30          	mov    0x30(%esp),%edi
    2287:	89 44 24 1c          	mov    %eax,0x1c(%esp)
    228b:	31 f2                	xor    %esi,%edx
    228d:	89 d0                	mov    %edx,%eax
    228f:	89 da                	mov    %ebx,%edx
    2291:	31 fa                	xor    %edi,%edx
    2293:	09 d0                	or     %edx,%eax
    2295:	0f 85 a5 00 00 00    	jne    2340 <...>

to:

    2270:	f0 0f c7 0f          	lock cmpxchg8b (%edi)
    2274:	0f 85 a6 00 00 00    	jne    2320 <...>

Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://lore.kernel.org/r/20240410062957.322614-1-ubizjak@gmail.com
---
 arch/x86/include/asm/atomic64_32.h | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h
index 3486d91..ec217aa 100644
--- a/arch/x86/include/asm/atomic64_32.h
+++ b/arch/x86/include/asm/atomic64_32.h
@@ -61,12 +61,18 @@ ATOMIC64_DECL(add_unless);
 #undef __ATOMIC64_DECL
 #undef ATOMIC64_EXPORT
 
-static __always_inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 o, s64 n)
+static __always_inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new)
 {
-	return arch_cmpxchg64(&v->counter, o, n);
+	return arch_cmpxchg64(&v->counter, old, new);
 }
 #define arch_atomic64_cmpxchg arch_atomic64_cmpxchg
 
+static __always_inline bool arch_atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new)
+{
+	return arch_try_cmpxchg64(&v->counter, old, new);
+}
+#define arch_atomic64_try_cmpxchg arch_atomic64_try_cmpxchg
+
 static __always_inline s64 arch_atomic64_xchg(atomic64_t *v, s64 n)
 {
 	s64 o;

^ permalink raw reply related	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2024-04-10 13:40 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz, follow: Atom feed)
-- links below jump to the message on this page --
2024-04-10  6:29 [PATCH v3 1/4] locking/atomic/x86: Introduce arch_atomic64_try_cmpxchg to x86_32 Uros Bizjak
2024-04-10  6:29 ` [PATCH v3 2/4] locking/atomic/x86: Introduce arch_atomic64_read_nonatomic " Uros Bizjak
2024-04-10 13:40   ` [tip: locking/core] locking/atomic/x86: Introduce arch_atomic64_read_nonatomic() " tip-bot2 for Uros Bizjak
2024-04-10  6:29 ` [PATCH v3 3/4] locking/atomic/x86: Rewrite x86_32 arch_atomic64_{,fetch}_{and,or,xor}() functions Uros Bizjak
2024-04-10 13:40   ` [tip: locking/core] " tip-bot2 for Uros Bizjak
2024-04-10  6:29 ` [PATCH v3 4/4] locking/atomic/x86: Define arch_atomic_sub() family using arch_atomic_add() functions Uros Bizjak
2024-04-10 13:40   ` [tip: locking/core] " tip-bot2 for Uros Bizjak
2024-04-10 13:40 ` [tip: locking/core] locking/atomic/x86: Introduce arch_atomic64_try_cmpxchg() to x86_32 tip-bot2 for Uros Bizjak

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox