* [RFC][PATCH] ia64: Fix atomic ops vs memory barriers
From: Peter Zijlstra @ 2014-02-04 12:22 UTC (permalink / raw)
To: Tony Luck, Fenghua Yu
Cc: linux-kernel, linux-arch, Linus Torvalds, Paul McKenney,
Will Deacon
The IA64 SDM V1-4.4.7 describes the memory ordering rules of IA64.
It states that the cmpxchg.{rel,acq} really have release and acquire
semantics and that xchg has acquire semantics.
Despite this the ia64 atomic_t code says that all atomic ops are fully
serialized, and its smp_mb__{before,after}_atomic_{inc,dec} are
barrier().
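If the SDM is right, that claim is broken. A litmus sketch (hypothetical
X, Y and v, assuming the SDM's acquire-only cmpxchg.acq):

	X = 1;				/* plain store */
	r1 = atomic_add_return(1, &v);	/* supposed to be a full barrier */
	r2 = Y;
	/*
	 * The .acq only keeps the load of Y below the cmpxchg; nothing
	 * keeps the store to X above it, so another CPU can observe the
	 * atomic op before the store to X -- not a full barrier.
	 */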
OTOH the atomic bitops, which are identically implemented (using
cmpxchg.acq), state that they have acquire semantics and set
smp_mb__before_clear_bit() to smp_mb().
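That smp_mb() is what the usual unlock-style pattern relies on; a sketch
(hypothetical owner / IN_USE_BIT names):

	lock->owner = NULL;		/* must be visible before the bit clears */
	smp_mb__before_clear_bit();
	clear_bit(IN_USE_BIT, &lock->flags);	/* acquire-only per the above */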
Now either the SDM and the bitops are right, which means the atomic_t,
cmpxchg() and xchg() implementations are wrong, or the SDM is wrong and
cmpxchg.acq is actually fully serialized after all and the bitops
implementation is wrong, not to mention there's a big fat comment
missing someplace.
The below patch assumes the SDM is right (TM), and fixes the atomic_t,
cmpxchg() and xchg() implementations by inserting a mf before the
cmpxchg.acq (or xchg).
This way loads/stores before the mf stay before it, loads/stores after
the cmpxchg.acq stay after it, and the entire thing acts as the fully
serialized op it is supposed to be.
For things like ia64_atomic_sub(), which is a cmpxchg.acq loop with an
unordered load inside, this still works because there is a control
dependency upon the cmpxchg op, so the load cannot escape past it.
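In C terms the fixed ia64_atomic_sub() shape is roughly (a sketch of
the resulting sequence, not the literal source):

	int old, new;

	ia64_mf();			/* earlier loads/stores stay above */
	do {
		old = atomic_read(v);	/* unordered load ... */
		new = old - i;
	} while (cmpxchg_acq(&v->counter, old, new) != old);
	/*
	 * ... but the cmpxchg consumes the loaded value, so the load
	 * cannot sink below it, and later accesses cannot rise above the
	 * .acq; mf + cmpxchg.acq together act fully serialized.
	 */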
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
---
arch/ia64/include/asm/atomic.h | 31 +++++++++++++++++++++++--------
arch/ia64/include/asm/bitops.h | 25 +++++++++++++++++++++++--
arch/ia64/include/uapi/asm/cmpxchg.h | 13 ++++++++-----
3 files changed, 54 insertions(+), 15 deletions(-)
diff --git a/arch/ia64/include/asm/atomic.h b/arch/ia64/include/asm/atomic.h
index 6e6fe1839f5d..5052fe7ce34b 100644
--- a/arch/ia64/include/asm/atomic.h
+++ b/arch/ia64/include/asm/atomic.h
@@ -15,6 +15,7 @@
#include <linux/types.h>
#include <asm/intrinsics.h>
+#include <asm/barrier.h>
#define ATOMIC_INIT(i) { (i) }
@@ -125,6 +126,7 @@ static __inline__ long atomic64_add_unless(atomic64_t *v, long a, long u)
#define atomic_add_return(i,v) \
({ \
int __ia64_aar_i = (i); \
+ ia64_mf(); \
(__builtin_constant_p(i) \
&& ( (__ia64_aar_i == 1) || (__ia64_aar_i == 4) \
|| (__ia64_aar_i == 8) || (__ia64_aar_i == 16) \
@@ -137,6 +139,7 @@ static __inline__ long atomic64_add_unless(atomic64_t *v, long a, long u)
#define atomic64_add_return(i,v) \
({ \
long __ia64_aar_i = (i); \
+ ia64_mf(); \
(__builtin_constant_p(i) \
&& ( (__ia64_aar_i == 1) || (__ia64_aar_i == 4) \
|| (__ia64_aar_i == 8) || (__ia64_aar_i == 16) \
@@ -162,7 +165,7 @@ atomic64_add_negative (__s64 i, atomic64_t *v)
return atomic64_add_return(i, v) < 0;
}
-#define atomic_sub_return(i,v) \
+#define __atomic_sub_return(i,v) \
({ \
int __ia64_asr_i = (i); \
(__builtin_constant_p(i) \
@@ -174,7 +177,13 @@ atomic64_add_negative (__s64 i, atomic64_t *v)
: ia64_atomic_sub(__ia64_asr_i, v); \
})
-#define atomic64_sub_return(i,v) \
+#define atomic_sub_return(i,v) \
+({ \
+ ia64_mf(); \
+ __atomic_sub_return((i), (v)); \
+})
+
+#define __atomic64_sub_return(i,v) \
({ \
long __ia64_asr_i = (i); \
(__builtin_constant_p(i) \
@@ -186,6 +195,12 @@ atomic64_add_negative (__s64 i, atomic64_t *v)
: ia64_atomic64_sub(__ia64_asr_i, v); \
})
+#define atomic64_sub_return(i,v) \
+({ \
+ ia64_mf(); \
+ __atomic64_sub_return((i), (v)); \
+})
+
#define atomic_dec_return(v) atomic_sub_return(1, (v))
#define atomic_inc_return(v) atomic_add_return(1, (v))
#define atomic64_dec_return(v) atomic64_sub_return(1, (v))
@@ -198,20 +213,20 @@ atomic64_add_negative (__s64 i, atomic64_t *v)
#define atomic64_dec_and_test(v) (atomic64_sub_return(1, (v)) == 0)
#define atomic64_inc_and_test(v) (atomic64_add_return(1, (v)) == 0)
-#define atomic_add(i,v) atomic_add_return((i), (v))
-#define atomic_sub(i,v) atomic_sub_return((i), (v))
+#define atomic_add(i,v) (void)atomic_add_return((i), (v))
+#define atomic_sub(i,v) (void)__atomic_sub_return((i), (v))
#define atomic_inc(v) atomic_add(1, (v))
#define atomic_dec(v) atomic_sub(1, (v))
-#define atomic64_add(i,v) atomic64_add_return((i), (v))
-#define atomic64_sub(i,v) atomic64_sub_return((i), (v))
+#define atomic64_add(i,v) (void)atomic64_add_return((i), (v))
+#define atomic64_sub(i,v) (void)__atomic64_sub_return((i), (v))
#define atomic64_inc(v) atomic64_add(1, (v))
#define atomic64_dec(v) atomic64_sub(1, (v))
/* Atomic operations are already serializing */
-#define smp_mb__before_atomic_dec() barrier()
+#define smp_mb__before_atomic_dec() smp_mb()
#define smp_mb__after_atomic_dec() barrier()
-#define smp_mb__before_atomic_inc() barrier()
+#define smp_mb__before_atomic_inc() smp_mb()
#define smp_mb__after_atomic_inc() barrier()
#endif /* _ASM_IA64_ATOMIC_H */
diff --git a/arch/ia64/include/asm/bitops.h b/arch/ia64/include/asm/bitops.h
index c27eccd33349..4655b905f142 100644
--- a/arch/ia64/include/asm/bitops.h
+++ b/arch/ia64/include/asm/bitops.h
@@ -69,7 +69,7 @@ __set_bit (int nr, volatile void *addr)
* clear_bit() has "acquire" semantics.
*/
#define smp_mb__before_clear_bit() smp_mb()
-#define smp_mb__after_clear_bit() do { /* skip */; } while (0)
+#define smp_mb__after_clear_bit() barrier()
/**
* clear_bit - Clears a bit in memory
@@ -208,6 +208,8 @@ test_and_set_bit (int nr, volatile void *addr)
volatile __u32 *m;
CMPXCHG_BUGCHECK_DECL
+ smp_mb();
+
m = (volatile __u32 *) addr + (nr >> 5);
bit = 1 << (nr & 31);
do {
@@ -225,7 +227,22 @@ test_and_set_bit (int nr, volatile void *addr)
*
* This is the same as test_and_set_bit on ia64
*/
-#define test_and_set_bit_lock test_and_set_bit
+static __inline__ int
+test_and_set_bit_lock (int nr, volatile void *addr)
+{
+ __u32 bit, old, new;
+ volatile __u32 *m;
+ CMPXCHG_BUGCHECK_DECL
+
+ m = (volatile __u32 *) addr + (nr >> 5);
+ bit = 1 << (nr & 31);
+ do {
+ CMPXCHG_BUGCHECK(m);
+ old = *m;
+ new = old | bit;
+ } while (cmpxchg_acq(m, old, new) != old);
+ return (old & bit) != 0;
+}
/**
* __test_and_set_bit - Set a bit and return its old value
@@ -262,6 +279,8 @@ test_and_clear_bit (int nr, volatile void *addr)
volatile __u32 *m;
CMPXCHG_BUGCHECK_DECL
+ smp_mb();
+
m = (volatile __u32 *) addr + (nr >> 5);
mask = ~(1 << (nr & 31));
do {
@@ -307,6 +326,8 @@ test_and_change_bit (int nr, volatile void *addr)
volatile __u32 *m;
CMPXCHG_BUGCHECK_DECL
+ smp_mb();
+
m = (volatile __u32 *) addr + (nr >> 5);
bit = (1 << (nr & 31));
do {
diff --git a/arch/ia64/include/uapi/asm/cmpxchg.h b/arch/ia64/include/uapi/asm/cmpxchg.h
index 4f37dbbb8640..6fe5d6b7a739 100644
--- a/arch/ia64/include/uapi/asm/cmpxchg.h
+++ b/arch/ia64/include/uapi/asm/cmpxchg.h
@@ -53,7 +53,10 @@ extern void ia64_xchg_called_with_bad_pointer(void);
})
#define xchg(ptr, x) \
-((__typeof__(*(ptr))) __xchg((unsigned long) (x), (ptr), sizeof(*(ptr))))
+({ \
+ ia64_mf(); \
+ ((__typeof__(*(ptr))) __xchg((unsigned long) (x), (ptr), sizeof(*(ptr)))); \
+})
/*
* Atomic compare and exchange. Compare OLD with MEM, if identical,
@@ -119,11 +122,11 @@ extern long ia64_cmpxchg_called_with_bad_pointer(void);
ia64_cmpxchg(rel, (ptr), (o), (n), sizeof(*(ptr)))
/* for compatibility with other platforms: */
-#define cmpxchg(ptr, o, n) cmpxchg_acq((ptr), (o), (n))
-#define cmpxchg64(ptr, o, n) cmpxchg_acq((ptr), (o), (n))
+#define cmpxchg(ptr, o, n) ({ ia64_mf(); cmpxchg_acq((ptr), (o), (n)); })
+#define cmpxchg64(ptr, o, n) ({ ia64_mf(); cmpxchg_acq((ptr), (o), (n)); })
-#define cmpxchg_local cmpxchg
-#define cmpxchg64_local cmpxchg64
+#define cmpxchg_local cmpxchg_acq
+#define cmpxchg64_local cmpxchg_acq
#ifdef CONFIG_IA64_DEBUG_CMPXCHG
# define CMPXCHG_BUGCHECK_DECL int _cmpxchg_bugcheck_count = 128;
* Re: [RFC][PATCH] ia64: Fix atomic ops vs memory barriers
From: Peter Zijlstra @ 2014-02-04 13:37 UTC (permalink / raw)
To: Tony Luck, Fenghua Yu
Cc: linux-kernel, linux-arch, Linus Torvalds, Paul McKenney,
Will Deacon
On Tue, Feb 04, 2014 at 01:22:12PM +0100, Peter Zijlstra wrote:
>
> The IA64 SDM V1-4.4.7 describes the memory ordering rules of IA64.
>
> It states that the cmpxchg.{rel,acq} really have release and acquire
> semantics and that xchg has acquire semantics.
>
> Despite this the ia64 atomic_t code says that all atomic ops are fully
> serialized, and its smp_mb__{before,after}_atomic_{inc,dec} are
> barrier().
>
> OTOH the atomic bitops, which are identically implemented (using
> cmpxchg.acq), state that they have acquire semantics and set
> smp_mb__before_clear_bit() to smp_mb().
>
> Now either the SDM and the bitops are right, which means the atomic_t,
> cmpxchg() and xchg() implementations are wrong, or the SDM is wrong and
> cmpxchg.acq is actually fully serialized after all and the bitops
> implementation is wrong, not to mention there's a big fat comment
> missing someplace.
>
>
> The below patch assumes the SDM is right (TM), and fixes the atomic_t,
> cmpxchg() and xchg() implementations by inserting a mf before the
> cmpxchg.acq (or xchg).
>
> This way loads/stores before the mf stay before it, loads/stores after
> the cmpxchg.acq stay after it, and the entire thing acts as the fully
> serialized op it is supposed to be.
>
> For things like ia64_atomic_sub(), which is a cmpxchg.acq loop with an
> unordered load inside, this still works because there is a control
> dependency upon the cmpxchg op, so the load cannot escape past it.
>
And assuming all that is true, we also need the below on top. Because
when one writes:

	atomic_dec();
	smp_mb__after_atomic_dec();

one actually expects a full barrier, not an acquire.
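E.g. a sketch of the kind of code that relies on this (hypothetical
refs and wq fields):

	atomic_dec(&obj->refs);		/* cmpxchg.acq loop: acquire-only */
	smp_mb__after_atomic_dec();	/* must order the dec ... */
	if (waitqueue_active(&obj->wq))	/* ... before this load */
		wake_up(&obj->wq);

With barrier() there, nothing stops the waitqueue_active() load from
being satisfied before the decrement is visible to other CPUs.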
---
--- a/arch/ia64/include/asm/atomic.h
+++ b/arch/ia64/include/asm/atomic.h
@@ -225,8 +225,8 @@ atomic64_add_negative (__s64 i, atomic64_t *v)
/* Atomic operations are already serializing */
#define smp_mb__before_atomic_dec() smp_mb()
-#define smp_mb__after_atomic_dec() barrier()
+#define smp_mb__after_atomic_dec() smp_mb()
#define smp_mb__before_atomic_inc() smp_mb()
-#define smp_mb__after_atomic_inc() barrier()
+#define smp_mb__after_atomic_inc() smp_mb()
#endif /* _ASM_IA64_ATOMIC_H */
--- a/arch/ia64/include/asm/bitops.h
+++ b/arch/ia64/include/asm/bitops.h
@@ -69,7 +69,7 @@ __set_bit (int nr, volatile void *addr)
* clear_bit() has "acquire" semantics.
*/
#define smp_mb__before_clear_bit() smp_mb()
-#define smp_mb__after_clear_bit() barrier()
+#define smp_mb__after_clear_bit() smp_mb()
/**
* clear_bit - Clears a bit in memory
* Re: [RFC][PATCH] ia64: Fix atomic ops vs memory barriers
From: Linus Torvalds @ 2014-02-04 16:29 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Tony Luck, Fenghua Yu, Linux Kernel Mailing List,
linux-arch@vger.kernel.org, Paul McKenney, Will Deacon
On Tue, Feb 4, 2014 at 4:22 AM, Peter Zijlstra <peterz@infradead.org> wrote:
>
> The below patch assumes the SDM is right (TM), and fixes the atomic_t,
> cmpxchg() and xchg() implementations by inserting a mf before the
> cmpxchg.acq (or xchg).
You picked the wrong thing to be right. The SDM is wrong.
Last time this came up, Tony explained it thus:
>> Worse still - early processor implementations actually just ignored
>> the acquire/release and did a full fence all the time. Unfortunately
>> this meant a lot of badly written code that used .acq when they really
>> wanted .rel became legacy out in the wild - so when we made a cpu
>> that strictly did the .acq or .rel ... all that code started breaking - so
>> we had to back-pedal and keep the "legacy" behavior of a full fence :-(
and since ia64 is basically on life support as an architecture, we can
pretty much agree that the SDM is dead, and the only thing that
matters is implementation.
The above quote was strictly in the context of just cmpxchg, though,
so it's possible that the "fetchadd" instruction acts differently. I
would personally expect it to have the same issues, but let's see what
Tony says... Tony?
Linus
* Re: [RFC][PATCH] ia64: Fix atomic ops vs memory barriers
From: Peter Zijlstra @ 2014-02-04 16:40 UTC (permalink / raw)
To: Linus Torvalds
Cc: Tony Luck, Fenghua Yu, Linux Kernel Mailing List,
linux-arch@vger.kernel.org, Paul McKenney, Will Deacon
On Tue, Feb 04, 2014 at 08:29:36AM -0800, Linus Torvalds wrote:
> On Tue, Feb 4, 2014 at 4:22 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > The below patch assumes the SDM is right (TM), and fixes the atomic_t,
> > cmpxchg() and xchg() implementations by inserting a mf before the
> > cmpxchg.acq (or xchg).
>
> You picked the wrong thing to be right. The SDM is wrong.
Figured, just my luck :-)
> Last time this came up, Tony explained it thus:
>
> >> Worse still - early processor implementations actually just ignored
> >> the acquire/release and did a full fence all the time. Unfortunately
> >> this meant a lot of badly written code that used .acq when they really
> >> wanted .rel became legacy out in the wild - so when we made a cpu
> >> that strictly did the .acq or .rel ... all that code started breaking - so
> >> we had to back-pedal and keep the "legacy" behavior of a full fence :-(
That would make a lovely comment near ia64_cmpxchg().
> and since ia64 is basically on life support as an architecture, we can
> pretty much agree that the SDM is dead, and the only thing that
> matters is implementation.
>
> The above quote was strictly in the context of just cmpxchg, though,
> so it's possible that the "fetchadd" instruction acts differently. I
> would personally expect it to have the same issues, but let's see what
> Tony says... Tony?
I would suspect it to be a full fence too, let me do the reverse patch.
---
--- a/arch/ia64/include/asm/bitops.h
+++ b/arch/ia64/include/asm/bitops.h
@@ -65,11 +65,8 @@ __set_bit (int nr, volatile void *addr)
*((__u32 *) addr + (nr >> 5)) |= (1 << (nr & 31));
}
-/*
- * clear_bit() has "acquire" semantics.
- */
-#define smp_mb__before_clear_bit() smp_mb()
-#define smp_mb__after_clear_bit() do { /* skip */; } while (0)
+#define smp_mb__before_clear_bit() barrier()
+#define smp_mb__after_clear_bit() barrier()
/**
* clear_bit - Clears a bit in memory
--- a/arch/ia64/include/uapi/asm/cmpxchg.h
+++ b/arch/ia64/include/uapi/asm/cmpxchg.h
@@ -118,6 +118,15 @@ extern long ia64_cmpxchg_called_with_bad_pointer(void);
#define cmpxchg_rel(ptr, o, n) \
ia64_cmpxchg(rel, (ptr), (o), (n), sizeof(*(ptr)))
+/*
+ * Worse still - early processor implementations actually just ignored
+ * the acquire/release and did a full fence all the time. Unfortunately
+ * this meant a lot of badly written code that used .acq when they really
+ * wanted .rel became legacy out in the wild - so when we made a cpu
+ * that strictly did the .acq or .rel ... all that code started breaking - so
+ * we had to back-pedal and keep the "legacy" behavior of a full fence :-(
+ */
+
/* for compatibility with other platforms: */
#define cmpxchg(ptr, o, n) cmpxchg_acq((ptr), (o), (n))
#define cmpxchg64(ptr, o, n) cmpxchg_acq((ptr), (o), (n))