* [PATCH 1/5] asm-generic: Add _{relaxed|acquire|release}() variants for inc/dec atomics
2015-09-21 20:17 [PATCH 0/5] locking: Use acquire/release semantics Davidlohr Bueso
@ 2015-09-21 20:17 ` Davidlohr Bueso
2015-09-21 20:17 ` [PATCH 2/5] locking/mutex: Use acquire/release semantics Davidlohr Bueso
` (3 subsequent siblings)
4 siblings, 0 replies; 15+ messages in thread
From: Davidlohr Bueso @ 2015-09-21 20:17 UTC (permalink / raw)
To: Peter Zijlstra, Ingo Molnar, Thomas Gleixner
Cc: Andrew Morton, Linus Torvalds, Will Deacon, Paul E. McKenney,
Davidlohr Bueso, linux-kernel, Davidlohr Bueso
Similar to what we already have for the regular add/sub calls. For now, no
actual arch implements them, so everyone falls back to the default, fully
ordered atomics; in other words, nothing changes. These will be used in
future primitives.
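As a purely illustrative sketch (the helpers below are hypothetical, not part of this series), a future primitive could pair a relaxed increment with a release decrement instead of paying for two full barriers:

/*
 * Hypothetical example only -- not introduced by this patch.
 */
static inline void example_ref_get(atomic_long_t *ref)
{
	/* Taking a reference needs no ordering by itself. */
	atomic_long_inc_return_relaxed(ref);
}

static inline bool example_ref_put(atomic_long_t *ref)
{
	/*
	 * RELEASE orders all prior accesses to the object before the
	 * count drop; a real refcount would additionally need ACQUIRE
	 * (or a full barrier) on the zero path before freeing.
	 */
	return atomic_long_dec_return_release(ref) == 0;
}

On archs without the relaxed variants both calls still expand to the fully ordered atomic_long_{inc,dec}_return(), matching the "nothing changes" note above.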
Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
---
include/asm-generic/atomic-long.h | 29 +++++++-----
include/linux/atomic.h | 97 +++++++++++++++++++++++++++++++++++++++
2 files changed, 114 insertions(+), 12 deletions(-)
diff --git a/include/asm-generic/atomic-long.h b/include/asm-generic/atomic-long.h
index a94cbeb..c3c7183 100644
--- a/include/asm-generic/atomic-long.h
+++ b/include/asm-generic/atomic-long.h
@@ -154,19 +154,24 @@ static inline int atomic_long_add_negative(long i, atomic_long_t *l)
return ATOMIC_LONG_PFX(_add_negative)(i, v);
}
-static inline long atomic_long_inc_return(atomic_long_t *l)
-{
- ATOMIC_LONG_PFX(_t) *v = (ATOMIC_LONG_PFX(_t) *)l;
-
- return (long)ATOMIC_LONG_PFX(_inc_return)(v);
-}
-
-static inline long atomic_long_dec_return(atomic_long_t *l)
-{
- ATOMIC_LONG_PFX(_t) *v = (ATOMIC_LONG_PFX(_t) *)l;
-
- return (long)ATOMIC_LONG_PFX(_dec_return)(v);
+#define ATOMIC_LONG_INC_DEC_OP(op, mo) \
+static inline long \
+atomic_long_##op##_return##mo(atomic_long_t *l) \
+{ \
+ ATOMIC_LONG_PFX(_t) *v = (ATOMIC_LONG_PFX(_t) *)l; \
+ \
+ return (long)ATOMIC_LONG_PFX(_##op##_return##mo)(v); \
}
+ATOMIC_LONG_INC_DEC_OP(inc,)
+ATOMIC_LONG_INC_DEC_OP(inc, _relaxed)
+ATOMIC_LONG_INC_DEC_OP(inc, _acquire)
+ATOMIC_LONG_INC_DEC_OP(inc, _release)
+ATOMIC_LONG_INC_DEC_OP(dec,)
+ATOMIC_LONG_INC_DEC_OP(dec, _relaxed)
+ATOMIC_LONG_INC_DEC_OP(dec, _acquire)
+ATOMIC_LONG_INC_DEC_OP(dec, _release)
+
+#undef ATOMIC_LONG_INC_DEC_OP
static inline long atomic_long_add_unless(atomic_long_t *l, long a, long u)
{
diff --git a/include/linux/atomic.h b/include/linux/atomic.h
index 00a5763..fb44d3b 100644
--- a/include/linux/atomic.h
+++ b/include/linux/atomic.h
@@ -81,6 +81,30 @@
#endif
#endif /* atomic_add_return_relaxed */
+/* atomic_inc_return_relaxed */
+#ifndef atomic_inc_return_relaxed
+#define atomic_inc_return_relaxed atomic_inc_return
+#define atomic_inc_return_acquire atomic_inc_return
+#define atomic_inc_return_release atomic_inc_return
+
+#else /* atomic_inc_return_relaxed */
+
+#ifndef atomic_inc_return_acquire
+#define atomic_inc_return_acquire(...) \
+ __atomic_op_acquire(atomic_inc_return, __VA_ARGS__)
+#endif
+
+#ifndef atomic_inc_return_release
+#define atomic_inc_return_release(...) \
+ __atomic_op_release(atomic_inc_return, __VA_ARGS__)
+#endif
+
+#ifndef atomic_inc_return
+#define atomic_inc_return(...) \
+ __atomic_op_fence(atomic_inc_return, __VA_ARGS__)
+#endif
+#endif /* atomic_inc_return_relaxed */
+
/* atomic_sub_return_relaxed */
#ifndef atomic_sub_return_relaxed
#define atomic_sub_return_relaxed atomic_sub_return
@@ -105,6 +129,30 @@
#endif
#endif /* atomic_sub_return_relaxed */
+/* atomic_dec_return_relaxed */
+#ifndef atomic_dec_return_relaxed
+#define atomic_dec_return_relaxed atomic_dec_return
+#define atomic_dec_return_acquire atomic_dec_return
+#define atomic_dec_return_release atomic_dec_return
+
+#else /* atomic_dec_return_relaxed */
+
+#ifndef atomic_dec_return_acquire
+#define atomic_dec_return_acquire(...) \
+ __atomic_op_acquire(atomic_dec_return, __VA_ARGS__)
+#endif
+
+#ifndef atomic_dec_return_release
+#define atomic_dec_return_release(...) \
+ __atomic_op_release(atomic_dec_return, __VA_ARGS__)
+#endif
+
+#ifndef atomic_dec_return
+#define atomic_dec_return(...) \
+ __atomic_op_fence(atomic_dec_return, __VA_ARGS__)
+#endif
+#endif /* atomic_dec_return_relaxed */
+
/* atomic_xchg_relaxed */
#ifndef atomic_xchg_relaxed
#define atomic_xchg_relaxed atomic_xchg
@@ -185,6 +233,31 @@
#endif
#endif /* atomic64_add_return_relaxed */
+/* atomic64_inc_return_relaxed */
+#ifndef atomic64_inc_return_relaxed
+#define atomic64_inc_return_relaxed atomic64_inc_return
+#define atomic64_inc_return_acquire atomic64_inc_return
+#define atomic64_inc_return_release atomic64_inc_return
+
+#else /* atomic64_inc_return_relaxed */
+
+#ifndef atomic64_inc_return_acquire
+#define atomic64_inc_return_acquire(...) \
+ __atomic_op_acquire(atomic64_inc_return, __VA_ARGS__)
+#endif
+
+#ifndef atomic64_inc_return_release
+#define atomic64_inc_return_release(...) \
+ __atomic_op_release(atomic64_inc_return, __VA_ARGS__)
+#endif
+
+#ifndef atomic64_inc_return
+#define atomic64_inc_return(...) \
+ __atomic_op_fence(atomic64_inc_return, __VA_ARGS__)
+#endif
+#endif /* atomic64_inc_return_relaxed */
+
+
/* atomic64_sub_return_relaxed */
#ifndef atomic64_sub_return_relaxed
#define atomic64_sub_return_relaxed atomic64_sub_return
@@ -209,6 +282,30 @@
#endif
#endif /* atomic64_sub_return_relaxed */
+/* atomic64_dec_return_relaxed */
+#ifndef atomic64_dec_return_relaxed
+#define atomic64_dec_return_relaxed atomic64_dec_return
+#define atomic64_dec_return_acquire atomic64_dec_return
+#define atomic64_dec_return_release atomic64_dec_return
+
+#else /* atomic64_dec_return_relaxed */
+
+#ifndef atomic64_dec_return_acquire
+#define atomic64_dec_return_acquire(...) \
+ __atomic_op_acquire(atomic64_dec_return, __VA_ARGS__)
+#endif
+
+#ifndef atomic64_dec_return_release
+#define atomic64_dec_return_release(...) \
+ __atomic_op_release(atomic64_dec_return, __VA_ARGS__)
+#endif
+
+#ifndef atomic64_dec_return
+#define atomic64_dec_return(...) \
+ __atomic_op_fence(atomic64_dec_return, __VA_ARGS__)
+#endif
+#endif /* atomic64_dec_return_relaxed */
+
/* atomic64_xchg_relaxed */
#ifndef atomic64_xchg_relaxed
#define atomic64_xchg_relaxed atomic64_xchg
--
2.1.4
* [PATCH 2/5] locking/mutex: Use acquire/release semantics
2015-09-21 20:17 [PATCH 0/5] locking: Use acquire/release semantics Davidlohr Bueso
2015-09-21 20:17 ` [PATCH 1/5] asm-generic: Add _{relaxed|acquire|release}() variants for inc/dec atomics Davidlohr Bueso
@ 2015-09-21 20:17 ` Davidlohr Bueso
2015-09-21 20:17 ` [PATCH 3/5] locking/rtmutex: " Davidlohr Bueso
` (2 subsequent siblings)
4 siblings, 0 replies; 15+ messages in thread
From: Davidlohr Bueso @ 2015-09-21 20:17 UTC (permalink / raw)
To: Peter Zijlstra, Ingo Molnar, Thomas Gleixner
Cc: Andrew Morton, Linus Torvalds, Will Deacon, Paul E. McKenney,
Davidlohr Bueso, linux-kernel, Davidlohr Bueso
As such, weakly ordered archs can benefit from more relaxed use
of barriers when issuing atomics.
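For context, an illustrative aside (not part of the patch): on an arch that does not provide the relaxed variants, the fallbacks added in patch 1 alias the new calls straight back to the fully ordered ones, so this conversion is a no-op there:

/*
 * Condensed from the patch 1 hunk in <linux/atomic.h>; the #else
 * branch for relaxed-capable archs is omitted here.
 */
#ifndef atomic_dec_return_relaxed
#define atomic_dec_return_relaxed	atomic_dec_return
#define atomic_dec_return_acquire	atomic_dec_return
#define atomic_dec_return_release	atomic_dec_return
#endif

Only once an arch defines atomic_dec_return_relaxed() do the _acquire/_release calls in the fastpaths below become anything weaker than a full barrier.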
Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
---
include/asm-generic/mutex-dec.h | 8 ++++----
include/asm-generic/mutex-xchg.h | 10 +++++-----
kernel/locking/mutex.c | 9 +++++----
3 files changed, 14 insertions(+), 13 deletions(-)
diff --git a/include/asm-generic/mutex-dec.h b/include/asm-generic/mutex-dec.h
index d4f9fb4..fd694cf 100644
--- a/include/asm-generic/mutex-dec.h
+++ b/include/asm-generic/mutex-dec.h
@@ -20,7 +20,7 @@
static inline void
__mutex_fastpath_lock(atomic_t *count, void (*fail_fn)(atomic_t *))
{
- if (unlikely(atomic_dec_return(count) < 0))
+ if (unlikely(atomic_dec_return_acquire(count) < 0))
fail_fn(count);
}
@@ -35,7 +35,7 @@ __mutex_fastpath_lock(atomic_t *count, void (*fail_fn)(atomic_t *))
static inline int
__mutex_fastpath_lock_retval(atomic_t *count)
{
- if (unlikely(atomic_dec_return(count) < 0))
+ if (unlikely(atomic_dec_return_acquire(count) < 0))
return -1;
return 0;
}
@@ -56,7 +56,7 @@ __mutex_fastpath_lock_retval(atomic_t *count)
static inline void
__mutex_fastpath_unlock(atomic_t *count, void (*fail_fn)(atomic_t *))
{
- if (unlikely(atomic_inc_return(count) <= 0))
+ if (unlikely(atomic_inc_return_release(count) <= 0))
fail_fn(count);
}
@@ -80,7 +80,7 @@ __mutex_fastpath_unlock(atomic_t *count, void (*fail_fn)(atomic_t *))
static inline int
__mutex_fastpath_trylock(atomic_t *count, int (*fail_fn)(atomic_t *))
{
- if (likely(atomic_cmpxchg(count, 1, 0) == 1))
+ if (likely(atomic_cmpxchg_acquire(count, 1, 0) == 1))
return 1;
return 0;
}
diff --git a/include/asm-generic/mutex-xchg.h b/include/asm-generic/mutex-xchg.h
index f169ec0..a6b4a7b 100644
--- a/include/asm-generic/mutex-xchg.h
+++ b/include/asm-generic/mutex-xchg.h
@@ -31,7 +31,7 @@ __mutex_fastpath_lock(atomic_t *count, void (*fail_fn)(atomic_t *))
* to ensure that any waiting tasks are woken up by the
* unlock slow path.
*/
- if (likely(atomic_xchg(count, -1) != 1))
+ if (likely(atomic_xchg_acquire(count, -1) != 1))
fail_fn(count);
}
@@ -46,7 +46,7 @@ __mutex_fastpath_lock(atomic_t *count, void (*fail_fn)(atomic_t *))
static inline int
__mutex_fastpath_lock_retval(atomic_t *count)
{
- if (unlikely(atomic_xchg(count, 0) != 1))
+ if (unlikely(atomic_xchg_acquire(count, 0) != 1))
if (likely(atomic_xchg(count, -1) != 1))
return -1;
return 0;
@@ -67,7 +67,7 @@ __mutex_fastpath_lock_retval(atomic_t *count)
static inline void
__mutex_fastpath_unlock(atomic_t *count, void (*fail_fn)(atomic_t *))
{
- if (unlikely(atomic_xchg(count, 1) != 0))
+ if (unlikely(atomic_xchg_release(count, 1) != 0))
fail_fn(count);
}
@@ -91,7 +91,7 @@ __mutex_fastpath_unlock(atomic_t *count, void (*fail_fn)(atomic_t *))
static inline int
__mutex_fastpath_trylock(atomic_t *count, int (*fail_fn)(atomic_t *))
{
- int prev = atomic_xchg(count, 0);
+ int prev = atomic_xchg_acquire(count, 0);
if (unlikely(prev < 0)) {
/*
@@ -105,7 +105,7 @@ __mutex_fastpath_trylock(atomic_t *count, int (*fail_fn)(atomic_t *))
* owner's unlock path needlessly, but that's not a problem
* in practice. ]
*/
- prev = atomic_xchg(count, prev);
+ prev = atomic_xchg_acquire(count, prev);
if (prev < 0)
prev = 0;
}
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 4cccea6..0551c21 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -277,7 +277,7 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock)
static inline bool mutex_try_to_acquire(struct mutex *lock)
{
return !mutex_is_locked(lock) &&
- (atomic_cmpxchg(&lock->count, 1, 0) == 1);
+ (atomic_cmpxchg_acquire(&lock->count, 1, 0) == 1);
}
/*
@@ -529,7 +529,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
* Once more, try to acquire the lock. Only try-lock the mutex if
* it is unlocked to reduce unnecessary xchg() operations.
*/
- if (!mutex_is_locked(lock) && (atomic_xchg(&lock->count, 0) == 1))
+ if (!mutex_is_locked(lock) &&
+ (atomic_xchg_acquire(&lock->count, 0) == 1))
goto skip_wait;
debug_mutex_lock_common(lock, &waiter);
@@ -553,7 +554,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
* non-negative in order to avoid unnecessary xchg operations:
*/
if (atomic_read(&lock->count) >= 0 &&
- (atomic_xchg(&lock->count, -1) == 1))
+ (atomic_xchg_acquire(&lock->count, -1) == 1))
break;
/*
@@ -867,7 +868,7 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
spin_lock_mutex(&lock->wait_lock, flags);
- prev = atomic_xchg(&lock->count, -1);
+ prev = atomic_xchg_acquire(&lock->count, -1);
if (likely(prev == 1)) {
mutex_set_owner(lock);
mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
--
2.1.4
* [PATCH 3/5] locking/rtmutex: Use acquire/release semantics
2015-09-21 20:17 [PATCH 0/5] locking: Use acquire/release semantics Davidlohr Bueso
2015-09-21 20:17 ` [PATCH 1/5] asm-generic: Add _{relaxed|acquire|release}() variants for inc/dec atomics Davidlohr Bueso
2015-09-21 20:17 ` [PATCH 2/5] locking/mutex: Use acquire/release semantics Davidlohr Bueso
@ 2015-09-21 20:17 ` Davidlohr Bueso
2015-09-22 15:33 ` Thomas Gleixner
2015-09-24 1:06 ` [PATCH v2] " Davidlohr Bueso
2015-09-21 20:17 ` [PATCH 4/5] locking/rwsem: " Davidlohr Bueso
2015-09-21 20:17 ` [PATCH 5/5] locking/mcs: " Davidlohr Bueso
4 siblings, 2 replies; 15+ messages in thread
From: Davidlohr Bueso @ 2015-09-21 20:17 UTC (permalink / raw)
To: Peter Zijlstra, Ingo Molnar, Thomas Gleixner
Cc: Andrew Morton, Linus Torvalds, Will Deacon, Paul E. McKenney,
Davidlohr Bueso, linux-kernel, Davidlohr Bueso
As such, weakly ordered archs can benefit from more relaxed use
of barriers when locking/unlocking.
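A hedged reminder of why the weakening is safe (a generic sketch with hypothetical helpers, mirroring the fastpaths below): ACQUIRE on the lock side keeps critical-section accesses from floating up before the lock is taken, and RELEASE on the unlock side keeps them from leaking past the handover, which is all mutual exclusion requires.

/*
 * Ordering sketch only; the example_*_slowpath() helpers are hypothetical.
 * Accesses inside the critical section may not move above the ACQUIRE
 * nor below the RELEASE; movement from outside *into* the critical
 * section is harmless.
 */
static inline void example_lock(struct rt_mutex *lock)
{
	if (!rt_mutex_cmpxchg_acquire(lock, NULL, current))
		example_lock_slowpath(lock);
}

static inline void example_unlock(struct rt_mutex *lock)
{
	if (!rt_mutex_cmpxchg_release(lock, current, NULL))
		example_unlock_slowpath(lock);
}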
Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
---
kernel/locking/rtmutex.c | 30 +++++++++++++++++++++---------
1 file changed, 21 insertions(+), 9 deletions(-)
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 7781d80..226a629 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -74,14 +74,23 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
* set up.
*/
#ifndef CONFIG_DEBUG_RT_MUTEXES
-# define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c)
+# define rt_mutex_cmpxchg_relaxed(l,c,n) (cmpxchg_relaxed(&l->owner, c, n) == c)
+# define rt_mutex_cmpxchg_acquire(l,c,n) (cmpxchg_acquire(&l->owner, c, n) == c)
+# define rt_mutex_cmpxchg_release(l,c,n) (cmpxchg_release(&l->owner, c, n) == c)
+
+/*
+ * Callers must hold the ->wait_lock -- which is the whole purpose as we force
+ * all future threads that attempt to [Rmw] the lock to the slowpath. As such
+ * relaxed semantics suffice.
+ */
static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
{
unsigned long owner, *p = (unsigned long *) &lock->owner;
do {
owner = *p;
- } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner);
+ } while (cmpxchg_relaxed(p, owner,
+ owner | RT_MUTEX_HAS_WAITERS) != owner);
}
/*
@@ -121,11 +130,14 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock)
* lock(wait_lock);
* acquire(lock);
*/
- return rt_mutex_cmpxchg(lock, owner, NULL);
+ return rt_mutex_cmpxchg_acquire(lock, owner, NULL);
}
#else
-# define rt_mutex_cmpxchg(l,c,n) (0)
+# define rt_mutex_cmpxchg_relaxed(l,c,n) (0)
+# define rt_mutex_cmpxchg_acquire(l,c,n) (0)
+# define rt_mutex_cmpxchg_release(l,c,n) (0)
+
static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
{
lock->owner = (struct task_struct *)
@@ -1321,7 +1333,7 @@ rt_mutex_fastlock(struct rt_mutex *lock, int state,
struct hrtimer_sleeper *timeout,
enum rtmutex_chainwalk chwalk))
{
- if (likely(rt_mutex_cmpxchg(lock, NULL, current))) {
+ if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
rt_mutex_deadlock_account_lock(lock, current);
return 0;
} else
@@ -1337,7 +1349,7 @@ rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
enum rtmutex_chainwalk chwalk))
{
if (chwalk == RT_MUTEX_MIN_CHAINWALK &&
- likely(rt_mutex_cmpxchg(lock, NULL, current))) {
+ likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
rt_mutex_deadlock_account_lock(lock, current);
return 0;
} else
@@ -1348,7 +1360,7 @@ static inline int
rt_mutex_fasttrylock(struct rt_mutex *lock,
int (*slowfn)(struct rt_mutex *lock))
{
- if (likely(rt_mutex_cmpxchg(lock, NULL, current))) {
+ if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
rt_mutex_deadlock_account_lock(lock, current);
return 1;
}
@@ -1362,7 +1374,7 @@ rt_mutex_fastunlock(struct rt_mutex *lock,
{
WAKE_Q(wake_q);
- if (likely(rt_mutex_cmpxchg(lock, current, NULL))) {
+ if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
rt_mutex_deadlock_account_unlock(current);
} else {
@@ -1484,7 +1496,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_unlock);
bool __sched rt_mutex_futex_unlock(struct rt_mutex *lock,
struct wake_q_head *wqh)
{
- if (likely(rt_mutex_cmpxchg(lock, current, NULL))) {
+ if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
rt_mutex_deadlock_account_unlock(current);
return false;
}
--
2.1.4
* Re: [PATCH 3/5] locking/rtmutex: Use acquire/release semantics
2015-09-21 20:17 ` [PATCH 3/5] locking/rtmutex: " Davidlohr Bueso
@ 2015-09-22 15:33 ` Thomas Gleixner
2015-09-22 15:55 ` Davidlohr Bueso
2015-09-24 1:06 ` [PATCH v2] " Davidlohr Bueso
1 sibling, 1 reply; 15+ messages in thread
From: Thomas Gleixner @ 2015-09-22 15:33 UTC (permalink / raw)
To: Davidlohr Bueso
Cc: Peter Zijlstra, Ingo Molnar, Andrew Morton, Linus Torvalds,
Will Deacon, Paul E. McKenney, linux-kernel, Davidlohr Bueso
On Mon, 21 Sep 2015, Davidlohr Bueso wrote:
> As such, weakly ordered archs can benefit from more relaxed use
> of barriers when locking/unlocking.
>
> Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
> ---
> kernel/locking/rtmutex.c | 30 +++++++++++++++++++++---------
> 1 file changed, 21 insertions(+), 9 deletions(-)
>
> diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
> index 7781d80..226a629 100644
> --- a/kernel/locking/rtmutex.c
> +++ b/kernel/locking/rtmutex.c
> @@ -74,14 +74,23 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
> * set up.
> */
> #ifndef CONFIG_DEBUG_RT_MUTEXES
> -# define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c)
> +# define rt_mutex_cmpxchg_relaxed(l,c,n) (cmpxchg_relaxed(&l->owner, c, n) == c)
> +# define rt_mutex_cmpxchg_acquire(l,c,n) (cmpxchg_acquire(&l->owner, c, n) == c)
> +# define rt_mutex_cmpxchg_release(l,c,n) (cmpxchg_release(&l->owner, c, n) == c)
> +
> +/*
> + * Callers must hold the ->wait_lock -- which is the whole purpose as we force
> + * all future threads that attempt to [Rmw] the lock to the slowpath. As such
> + * relaxed semantics suffice.
> + */
> static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
> {
> unsigned long owner, *p = (unsigned long *) &lock->owner;
>
> do {
> owner = *p;
> - } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner);
> + } while (cmpxchg_relaxed(p, owner,
> + owner | RT_MUTEX_HAS_WAITERS) != owner);
> }
>
> /*
> @@ -121,11 +130,14 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock)
> * lock(wait_lock);
> * acquire(lock);
> */
> - return rt_mutex_cmpxchg(lock, owner, NULL);
> + return rt_mutex_cmpxchg_acquire(lock, owner, NULL);
Why is this acquire?
Thanks,
tglx
* Re: [PATCH 3/5] locking/rtmutex: Use acquire/release semantics
2015-09-22 15:33 ` Thomas Gleixner
@ 2015-09-22 15:55 ` Davidlohr Bueso
0 siblings, 0 replies; 15+ messages in thread
From: Davidlohr Bueso @ 2015-09-22 15:55 UTC (permalink / raw)
To: Thomas Gleixner
Cc: Peter Zijlstra, Ingo Molnar, Andrew Morton, Linus Torvalds,
Will Deacon, Paul E. McKenney, linux-kernel, Davidlohr Bueso
On Tue, 22 Sep 2015, Thomas Gleixner wrote:
>> @@ -121,11 +130,14 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock)
>> * lock(wait_lock);
>> * acquire(lock);
>> */
>> - return rt_mutex_cmpxchg(lock, owner, NULL);
>> + return rt_mutex_cmpxchg_acquire(lock, owner, NULL);
>
>Why is this acquire?
Because I am an idiot, that should be a release...
Thanks,
Davidlohr
* [PATCH v2] locking/rtmutex: Use acquire/release semantics
2015-09-21 20:17 ` [PATCH 3/5] locking/rtmutex: " Davidlohr Bueso
2015-09-22 15:33 ` Thomas Gleixner
@ 2015-09-24 1:06 ` Davidlohr Bueso
2015-09-27 10:37 ` Thomas Gleixner
1 sibling, 1 reply; 15+ messages in thread
From: Davidlohr Bueso @ 2015-09-24 1:06 UTC (permalink / raw)
To: Peter Zijlstra, Ingo Molnar, Thomas Gleixner
Cc: Andrew Morton, Linus Torvalds, Will Deacon, Paul E. McKenney,
linux-kernel, Davidlohr Bueso
From: Davidlohr Bueso <dave@stgolabs.net>
As such, weakly ordered archs can benefit from more relaxed use
of barriers when locking/unlocking.
Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
---
Changes from v1:
- fix bogus acquire in unlock_rt_mutex_safe() (tglx)
kernel/locking/rtmutex.c | 30 +++++++++++++++++++++---------
1 file changed, 21 insertions(+), 9 deletions(-)
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 35e9bfc..8251e75 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -74,14 +74,23 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
* set up.
*/
#ifndef CONFIG_DEBUG_RT_MUTEXES
-# define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c)
+# define rt_mutex_cmpxchg_relaxed(l,c,n) (cmpxchg_relaxed(&l->owner, c, n) == c)
+# define rt_mutex_cmpxchg_acquire(l,c,n) (cmpxchg_acquire(&l->owner, c, n) == c)
+# define rt_mutex_cmpxchg_release(l,c,n) (cmpxchg_release(&l->owner, c, n) == c)
+
+/*
+ * Callers must hold the ->wait_lock -- which is the whole purpose as we force
+ * all future threads that attempt to [Rmw] the lock to the slowpath. As such
+ * relaxed semantics suffice.
+ */
static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
{
unsigned long owner, *p = (unsigned long *) &lock->owner;
do {
owner = *p;
- } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner);
+ } while (cmpxchg_relaxed(p, owner,
+ owner | RT_MUTEX_HAS_WAITERS) != owner);
}
/*
@@ -121,11 +130,14 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock)
* lock(wait_lock);
* acquire(lock);
*/
- return rt_mutex_cmpxchg(lock, owner, NULL);
+ return rt_mutex_cmpxchg_release(lock, owner, NULL);
}
#else
-# define rt_mutex_cmpxchg(l,c,n) (0)
+# define rt_mutex_cmpxchg_relaxed(l,c,n) (0)
+# define rt_mutex_cmpxchg_acquire(l,c,n) (0)
+# define rt_mutex_cmpxchg_release(l,c,n) (0)
+
static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
{
lock->owner = (struct task_struct *)
@@ -1322,7 +1334,7 @@ rt_mutex_fastlock(struct rt_mutex *lock, int state,
struct hrtimer_sleeper *timeout,
enum rtmutex_chainwalk chwalk))
{
- if (likely(rt_mutex_cmpxchg(lock, NULL, current))) {
+ if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
rt_mutex_deadlock_account_lock(lock, current);
return 0;
} else
@@ -1338,7 +1350,7 @@ rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
enum rtmutex_chainwalk chwalk))
{
if (chwalk == RT_MUTEX_MIN_CHAINWALK &&
- likely(rt_mutex_cmpxchg(lock, NULL, current))) {
+ likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
rt_mutex_deadlock_account_lock(lock, current);
return 0;
} else
@@ -1349,7 +1361,7 @@ static inline int
rt_mutex_fasttrylock(struct rt_mutex *lock,
int (*slowfn)(struct rt_mutex *lock))
{
- if (likely(rt_mutex_cmpxchg(lock, NULL, current))) {
+ if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
rt_mutex_deadlock_account_lock(lock, current);
return 1;
}
@@ -1363,7 +1375,7 @@ rt_mutex_fastunlock(struct rt_mutex *lock,
{
WAKE_Q(wake_q);
- if (likely(rt_mutex_cmpxchg(lock, current, NULL))) {
+ if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
rt_mutex_deadlock_account_unlock(current);
} else {
@@ -1485,7 +1497,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_unlock);
bool __sched rt_mutex_futex_unlock(struct rt_mutex *lock,
struct wake_q_head *wqh)
{
- if (likely(rt_mutex_cmpxchg(lock, current, NULL))) {
+ if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
rt_mutex_deadlock_account_unlock(current);
return false;
}
--
2.1.4
* Re: [PATCH v2] locking/rtmutex: Use acquire/release semantics
2015-09-24 1:06 ` [PATCH v2] " Davidlohr Bueso
@ 2015-09-27 10:37 ` Thomas Gleixner
2015-09-28 19:49 ` Davidlohr Bueso
0 siblings, 1 reply; 15+ messages in thread
From: Thomas Gleixner @ 2015-09-27 10:37 UTC (permalink / raw)
To: Davidlohr Bueso
Cc: Peter Zijlstra, Ingo Molnar, Andrew Morton, Linus Torvalds,
Will Deacon, Paul E. McKenney, linux-kernel, Davidlohr Bueso
On Wed, 23 Sep 2015, Davidlohr Bueso wrote:
> From: Davidlohr Bueso <dave@stgolabs.net>
>
> As such, weakly ordered archs can benefit from more relaxed use
> of barriers when locking/unlocking.
That changelog is not really helpful if someone is going to look at it
half a year from now who doesn't have the background of the discussion
leading to these changes.
Thanks,
tglx
* Re: [PATCH v2] locking/rtmutex: Use acquire/release semantics
2015-09-27 10:37 ` Thomas Gleixner
@ 2015-09-28 19:49 ` Davidlohr Bueso
2015-09-29 20:57 ` Thomas Gleixner
0 siblings, 1 reply; 15+ messages in thread
From: Davidlohr Bueso @ 2015-09-28 19:49 UTC (permalink / raw)
To: Thomas Gleixner
Cc: Peter Zijlstra, Ingo Molnar, Andrew Morton, Linus Torvalds,
Will Deacon, Paul E. McKenney, linux-kernel, Davidlohr Bueso
On Sun, 27 Sep 2015, Thomas Gleixner wrote:
>On Wed, 23 Sep 2015, Davidlohr Bueso wrote:
>
>> From: Davidlohr Bueso <dave@stgolabs.net>
>>
>> As such, weakly ordered archs can benefit from more relaxed use
>> of barriers when locking/unlocking.
>
>That changelog is not really helpful if someone is going to look at it
>half a year from now who doesn't have the background of the discussion
>leading to these changes.
Ok, how does the following sound?
""
As of 654672d4ba1 (locking/atomics: Add _{acquire|release|relaxed}() variants
of some atomic operations) and 6d79ef2d30e (locking, asm-generic: Add
_{relaxed|acquire|release}() variants for 'atomic_long_t'), weakly ordered
archs can benefit from more relaxed use of barriers when locking and unlocking,
instead of regular full barrier semantics. While currently only arm64 supports
such optimizations, updating corresponding locking primitives serves for other
archs to immediately benefit as well, once the necessary machinery is implemented
of course.
""
It's not _that_ different from the original changelog, but it should allow
future readers to at least be able to easily see the context of where the
changes come from. I don't think it's of much use going into the actual code
changes as they are pretty obvious -- and the ones that aren't (ie relaxed)
have the justification in the comments.
Thanks,
Davidlohr
* Re: [PATCH v2] locking/rtmutex: Use acquire/release semantics
2015-09-28 19:49 ` Davidlohr Bueso
@ 2015-09-29 20:57 ` Thomas Gleixner
0 siblings, 0 replies; 15+ messages in thread
From: Thomas Gleixner @ 2015-09-29 20:57 UTC (permalink / raw)
To: Davidlohr Bueso
Cc: Peter Zijlstra, Ingo Molnar, Andrew Morton, Linus Torvalds,
Will Deacon, Paul E. McKenney, linux-kernel, Davidlohr Bueso
On Mon, 28 Sep 2015, Davidlohr Bueso wrote:
> On Sun, 27 Sep 2015, Thomas Gleixner wrote:
> > That changelog is not really helpful if someone is going to look at it
> > half a year from now who doesn't have the background of the discussion
> > leading to these changes.
>
> Ok, how does the following sound?
>
> ""
> As of 654672d4ba1 (locking/atomics: Add _{acquire|release|relaxed}() variants
> of some atomic operations) and 6d79ef2d30e (locking, asm-generic: Add
> _{relaxed|acquire|release}() variants for 'atomic_long_t'), weakly ordered
> archs can benefit from more relaxed use of barriers when locking and unlocking,
> instead of regular full barrier semantics. While currently only arm64 supports
> such optimizations, updating corresponding locking primitives serves for other
> archs to immediately benefit as well, once the necessary machinery is implemented
> of course.
> ""
>
> It's not _that_ different from the original changelog, but it should allow
> future readers to at least be able to easily see the context of where the
> changes come from. I don't think it's of much use going into the actual code
> changes as they are pretty obvious -- and the ones that aren't (ie relaxed)
> have the justification in the comments.
Ok, care to resend the whole thing?
Thanks,
tglx
* [PATCH 4/5] locking/rwsem: Use acquire/release semantics
2015-09-21 20:17 [PATCH 0/5] locking: Use acquire/release semantics Davidlohr Bueso
` (2 preceding siblings ...)
2015-09-21 20:17 ` [PATCH 3/5] locking/rtmutex: " Davidlohr Bueso
@ 2015-09-21 20:17 ` Davidlohr Bueso
2015-09-21 20:39 ` Linus Torvalds
2015-09-24 1:09 ` [PATCH v2] " Davidlohr Bueso
2015-09-21 20:17 ` [PATCH 5/5] locking/mcs: " Davidlohr Bueso
4 siblings, 2 replies; 15+ messages in thread
From: Davidlohr Bueso @ 2015-09-21 20:17 UTC (permalink / raw)
To: Peter Zijlstra, Ingo Molnar, Thomas Gleixner
Cc: Andrew Morton, Linus Torvalds, Will Deacon, Paul E. McKenney,
Davidlohr Bueso, linux-kernel, Davidlohr Bueso
As such, weakly ordered archs can benefit from more relaxed use
of barriers when locking/unlocking.
Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
---
include/asm-generic/rwsem.h | 14 +++++++-------
kernel/locking/rwsem-xadd.c | 5 +++--
2 files changed, 10 insertions(+), 9 deletions(-)
diff --git a/include/asm-generic/rwsem.h b/include/asm-generic/rwsem.h
index d48bf5a..1a6bb87 100644
--- a/include/asm-generic/rwsem.h
+++ b/include/asm-generic/rwsem.h
@@ -33,7 +33,7 @@
*/
static inline void __down_read(struct rw_semaphore *sem)
{
- if (unlikely(atomic_long_inc_return((atomic_long_t *)&sem->count) <= 0))
+ if (unlikely(atomic_long_inc_return_acquire((atomic_long_t *)&sem->count) <= 0))
rwsem_down_read_failed(sem);
}
@@ -42,7 +42,7 @@ static inline int __down_read_trylock(struct rw_semaphore *sem)
long tmp;
while ((tmp = sem->count) >= 0) {
- if (tmp == cmpxchg(&sem->count, tmp,
+ if (tmp == cmpxchg_acquire(&sem->count, tmp,
tmp + RWSEM_ACTIVE_READ_BIAS)) {
return 1;
}
@@ -57,7 +57,7 @@ static inline void __down_write_nested(struct rw_semaphore *sem, int subclass)
{
long tmp;
- tmp = atomic_long_add_return(RWSEM_ACTIVE_WRITE_BIAS,
+ tmp = atomic_long_add_return_acquire(RWSEM_ACTIVE_WRITE_BIAS,
(atomic_long_t *)&sem->count);
if (unlikely(tmp != RWSEM_ACTIVE_WRITE_BIAS))
rwsem_down_write_failed(sem);
@@ -72,7 +72,7 @@ static inline int __down_write_trylock(struct rw_semaphore *sem)
{
long tmp;
- tmp = cmpxchg(&sem->count, RWSEM_UNLOCKED_VALUE,
+ tmp = cmpxchg_acquire(&sem->count, RWSEM_UNLOCKED_VALUE,
RWSEM_ACTIVE_WRITE_BIAS);
return tmp == RWSEM_UNLOCKED_VALUE;
}
@@ -84,7 +84,7 @@ static inline void __up_read(struct rw_semaphore *sem)
{
long tmp;
- tmp = atomic_long_dec_return((atomic_long_t *)&sem->count);
+ tmp = atomic_long_dec_return_release((atomic_long_t *)&sem->count);
if (unlikely(tmp < -1 && (tmp & RWSEM_ACTIVE_MASK) == 0))
rwsem_wake(sem);
}
@@ -94,7 +94,7 @@ static inline void __up_read(struct rw_semaphore *sem)
*/
static inline void __up_write(struct rw_semaphore *sem)
{
- if (unlikely(atomic_long_sub_return(RWSEM_ACTIVE_WRITE_BIAS,
+ if (unlikely(atomic_long_sub_return_release(RWSEM_ACTIVE_WRITE_BIAS,
(atomic_long_t *)&sem->count) < 0))
rwsem_wake(sem);
}
@@ -114,7 +114,7 @@ static inline void __downgrade_write(struct rw_semaphore *sem)
{
long tmp;
- tmp = atomic_long_add_return(-RWSEM_WAITING_BIAS,
+ tmp = atomic_long_add_return_acquire(-RWSEM_WAITING_BIAS,
(atomic_long_t *)&sem->count);
if (tmp < 0)
rwsem_downgrade_wake(sem);
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 0f18971..a4d4de0 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -262,7 +262,7 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
* to reduce unnecessary expensive cmpxchg() operations.
*/
if (count == RWSEM_WAITING_BIAS &&
- cmpxchg(&sem->count, RWSEM_WAITING_BIAS,
+ cmpxchg_acquire(&sem->count, RWSEM_WAITING_BIAS,
RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) {
if (!list_is_singular(&sem->wait_list))
rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);
@@ -285,7 +285,8 @@ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
if (!(count == 0 || count == RWSEM_WAITING_BIAS))
return false;
- old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS);
+ old = cmpxchg_acquire(&sem->count, count,
+ count + RWSEM_ACTIVE_WRITE_BIAS);
if (old == count) {
rwsem_set_owner(sem);
return true;
--
2.1.4
* Re: [PATCH 4/5] locking/rwsem: Use acquire/release semantics
2015-09-21 20:17 ` [PATCH 4/5] locking/rwsem: " Davidlohr Bueso
@ 2015-09-21 20:39 ` Linus Torvalds
2015-09-24 1:09 ` [PATCH v2] " Davidlohr Bueso
1 sibling, 0 replies; 15+ messages in thread
From: Linus Torvalds @ 2015-09-21 20:39 UTC (permalink / raw)
To: Davidlohr Bueso
Cc: Peter Zijlstra, Ingo Molnar, Thomas Gleixner, Andrew Morton,
Will Deacon, Paul E. McKenney, Linux Kernel Mailing List,
Davidlohr Bueso
On Mon, Sep 21, 2015 at 1:17 PM, Davidlohr Bueso <dave@stgolabs.net> wrote:
> @@ -114,7 +114,7 @@ static inline void __downgrade_write(struct rw_semaphore *sem)
> {
> long tmp;
>
> - tmp = atomic_long_add_return(-RWSEM_WAITING_BIAS,
> + tmp = atomic_long_add_return_acquire(-RWSEM_WAITING_BIAS,
> (atomic_long_t *)&sem->count);
> if (tmp < 0)
> rwsem_downgrade_wake(sem);
Careful. I'm pretty sure this is wrong.
When we downgrade exclusive ownership to non-exclusive, that should be
a *release* operation. Anything we did inside the write-locked region
had damn better _stay_ inside the write-locked region, we can not
allow it to escape down into the read-locked side. So it needs to be
at least a release.
In contrast, anything that we do in the read-locked part is fine to be
re-ordered into the write-locked exclusive part, so it does *not* need
acquire ordering (the original write locking obviously did use
acquire, and acts as a barrier for everything that comes in the locked
region).
I tried to look through everything, and I think this is the only thing
you got wrong, but I'd like somebody to double-check. Getting the
acquire/release semantics wrong will cause some really really subtle
and hard-as-hell-to-find bugs. So let's be careful out there, ok?
Linus
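A minimal sketch of the downgrade hazard described above ('data' and the helper functions are hypothetical, not from the mail). With RELEASE semantics on the downgrade, a reader that gets the lock after the downgrade is guaranteed to observe everything written under the exclusive lock; with only ACQUIRE, the store below could be reordered past the count update on a weakly ordered arch and be missed.

#include <linux/rwsem.h>

static int data;	/* stands for anything written while write-locked */

static void writer_downgrades(struct rw_semaphore *sem)
{
	down_write(sem);
	data = 42;		/* write-side critical section */
	downgrade_write(sem);	/* needs RELEASE: the store above must not move below this */
	/* ... continue under the read lock ... */
	up_read(sem);
}

static int concurrent_reader(struct rw_semaphore *sem)
{
	int val;

	down_read(sem);		/* pairs with the downgrade's RELEASE */
	val = data;		/* must observe 42 once the downgrade is visible */
	up_read(sem);
	return val;
}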
* [PATCH v2] locking/rwsem: Use acquire/release semantics
2015-09-21 20:17 ` [PATCH 4/5] locking/rwsem: " Davidlohr Bueso
2015-09-21 20:39 ` Linus Torvalds
@ 2015-09-24 1:09 ` Davidlohr Bueso
1 sibling, 0 replies; 15+ messages in thread
From: Davidlohr Bueso @ 2015-09-24 1:09 UTC (permalink / raw)
To: Peter Zijlstra, Ingo Molnar, Thomas Gleixner
Cc: Andrew Morton, Linus Torvalds, Will Deacon, Paul E. McKenney,
linux-kernel, Davidlohr Bueso
From: Davidlohr Bueso <dave@stgolabs.net>
As such, weakly ordered archs can benefit from more relaxed use
of barriers when locking/unlocking.
Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
---
Changes from v1:
Fix bogus acquire in generic lock downgrade (Linus)
include/asm-generic/rwsem.h | 21 ++++++++++++++-------
kernel/locking/rwsem-xadd.c | 5 +++--
2 files changed, 17 insertions(+), 9 deletions(-)
diff --git a/include/asm-generic/rwsem.h b/include/asm-generic/rwsem.h
index d48bf5a..d01e313 100644
--- a/include/asm-generic/rwsem.h
+++ b/include/asm-generic/rwsem.h
@@ -33,7 +33,7 @@
*/
static inline void __down_read(struct rw_semaphore *sem)
{
- if (unlikely(atomic_long_inc_return((atomic_long_t *)&sem->count) <= 0))
+ if (unlikely(atomic_long_inc_return_acquire((atomic_long_t *)&sem->count) <= 0))
rwsem_down_read_failed(sem);
}
@@ -42,7 +42,7 @@ static inline int __down_read_trylock(struct rw_semaphore *sem)
long tmp;
while ((tmp = sem->count) >= 0) {
- if (tmp == cmpxchg(&sem->count, tmp,
+ if (tmp == cmpxchg_acquire(&sem->count, tmp,
tmp + RWSEM_ACTIVE_READ_BIAS)) {
return 1;
}
@@ -57,7 +57,7 @@ static inline void __down_write_nested(struct rw_semaphore *sem, int subclass)
{
long tmp;
- tmp = atomic_long_add_return(RWSEM_ACTIVE_WRITE_BIAS,
+ tmp = atomic_long_add_return_acquire(RWSEM_ACTIVE_WRITE_BIAS,
(atomic_long_t *)&sem->count);
if (unlikely(tmp != RWSEM_ACTIVE_WRITE_BIAS))
rwsem_down_write_failed(sem);
@@ -72,7 +72,7 @@ static inline int __down_write_trylock(struct rw_semaphore *sem)
{
long tmp;
- tmp = cmpxchg(&sem->count, RWSEM_UNLOCKED_VALUE,
+ tmp = cmpxchg_acquire(&sem->count, RWSEM_UNLOCKED_VALUE,
RWSEM_ACTIVE_WRITE_BIAS);
return tmp == RWSEM_UNLOCKED_VALUE;
}
@@ -84,7 +84,7 @@ static inline void __up_read(struct rw_semaphore *sem)
{
long tmp;
- tmp = atomic_long_dec_return((atomic_long_t *)&sem->count);
+ tmp = atomic_long_dec_return_release((atomic_long_t *)&sem->count);
if (unlikely(tmp < -1 && (tmp & RWSEM_ACTIVE_MASK) == 0))
rwsem_wake(sem);
}
@@ -94,7 +94,7 @@ static inline void __up_read(struct rw_semaphore *sem)
*/
static inline void __up_write(struct rw_semaphore *sem)
{
- if (unlikely(atomic_long_sub_return(RWSEM_ACTIVE_WRITE_BIAS,
+ if (unlikely(atomic_long_sub_return_release(RWSEM_ACTIVE_WRITE_BIAS,
(atomic_long_t *)&sem->count) < 0))
rwsem_wake(sem);
}
@@ -114,7 +114,14 @@ static inline void __downgrade_write(struct rw_semaphore *sem)
{
long tmp;
- tmp = atomic_long_add_return(-RWSEM_WAITING_BIAS,
+ /*
+ * When downgrading from exclusive to shared ownership,
+ * anything inside the write-locked region cannot leak
+ * into the read side. In contrast, anything in the
+ * read-locked region is ok to be re-ordered into the
+ * write side. As such, use RELEASE semantics.
+ */
+ tmp = atomic_long_add_return_release(-RWSEM_WAITING_BIAS,
(atomic_long_t *)&sem->count);
if (tmp < 0)
rwsem_downgrade_wake(sem);
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 0f18971..a4d4de0 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -262,7 +262,7 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
* to reduce unnecessary expensive cmpxchg() operations.
*/
if (count == RWSEM_WAITING_BIAS &&
- cmpxchg(&sem->count, RWSEM_WAITING_BIAS,
+ cmpxchg_acquire(&sem->count, RWSEM_WAITING_BIAS,
RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) {
if (!list_is_singular(&sem->wait_list))
rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);
@@ -285,7 +285,8 @@ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
if (!(count == 0 || count == RWSEM_WAITING_BIAS))
return false;
- old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS);
+ old = cmpxchg_acquire(&sem->count, count,
+ count + RWSEM_ACTIVE_WRITE_BIAS);
if (old == count) {
rwsem_set_owner(sem);
return true;
--
2.1.4
* [PATCH 5/5] locking/mcs: Use acquire/release semantics
2015-09-21 20:17 [PATCH 0/5] locking: Use acquire/release semantics Davidlohr Bueso
` (3 preceding siblings ...)
2015-09-21 20:17 ` [PATCH 4/5] locking/rwsem: " Davidlohr Bueso
@ 2015-09-21 20:17 ` Davidlohr Bueso
4 siblings, 0 replies; 15+ messages in thread
From: Davidlohr Bueso @ 2015-09-21 20:17 UTC (permalink / raw)
To: Peter Zijlstra, Ingo Molnar, Thomas Gleixner
Cc: Andrew Morton, Linus Torvalds, Will Deacon, Paul E. McKenney,
Davidlohr Bueso, linux-kernel, Davidlohr Bueso
As such, weakly ordered archs can benefit from more relaxed use
of barriers when locking/unlocking.
Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
---
kernel/locking/mcs_spinlock.h | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index fd91aaa..5b9102a 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -67,7 +67,7 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
node->locked = 0;
node->next = NULL;
- prev = xchg(lock, node);
+ prev = xchg_acquire(lock, node);
if (likely(prev == NULL)) {
/*
* Lock acquired, don't need to set node->locked to 1. Threads
@@ -98,7 +98,7 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
/*
* Release the lock by setting it to NULL
*/
- if (likely(cmpxchg(lock, node, NULL) == node))
+ if (likely(cmpxchg_release(lock, node, NULL) == node))
return;
/* Wait until the next pointer is set */
while (!(next = READ_ONCE(node->next)))
--
2.1.4
* [PATCH 2/5] locking/mutex: Use acquire/release semantics
2015-09-30 20:03 [PATCH -tip v3 0/5] locking: " Davidlohr Bueso
@ 2015-09-30 20:03 ` Davidlohr Bueso
0 siblings, 0 replies; 15+ messages in thread
From: Davidlohr Bueso @ 2015-09-30 20:03 UTC (permalink / raw)
To: Peter Zijlstra, Ingo Molnar, Thomas Gleixner
Cc: Andrew Morton, Linus Torvalds, Will Deacon, Paul E. McKenney,
linux-kernel, Davidlohr Bueso, Davidlohr Bueso
As of 654672d4ba1 (locking/atomics: Add _{acquire|release|relaxed}()
variants of some atomic operations) and 6d79ef2d30e (locking, asm-generic:
Add _{relaxed|acquire|release}() variants for 'atomic_long_t'), weakly
ordered archs can benefit from more relaxed use of barriers when locking
and unlocking, instead of regular full barrier semantics. While currently
only arm64 supports such optimizations, updating corresponding locking
primitives serves for other archs to immediately benefit as well, once the
necessary machinery is implemented of course.
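As a rough sketch of that machinery (paraphrased from the fallback helpers in <linux/atomic.h>; treat the exact definitions as approximate and see 654672d4ba1 for the real ones), an arch that supplies only a relaxed op gets the stronger orderings built from it plus explicit barriers:

/*
 * Paraphrased for illustration -- not a verbatim copy.
 */
#define __atomic_op_acquire(op, args...)				\
({									\
	typeof(op##_relaxed(args)) __ret = op##_relaxed(args);		\
	smp_mb__after_atomic();	/* order the op before later accesses */ \
	__ret;								\
})

#define __atomic_op_release(op, args...)				\
({									\
	smp_mb__before_atomic();	/* order earlier accesses before the op */ \
	op##_relaxed(args);						\
})

Fully ordered archs never see these: without a _relaxed definition the new names simply alias the existing full-barrier calls, as in the atomic.h hunks earlier in this thread. arm64 provides the relaxed and acquire/release forms natively via load-acquire/store-release instructions, which is where the savings come from.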
Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
---
include/asm-generic/mutex-dec.h | 8 ++++----
include/asm-generic/mutex-xchg.h | 10 +++++-----
kernel/locking/mutex.c | 9 +++++----
3 files changed, 14 insertions(+), 13 deletions(-)
diff --git a/include/asm-generic/mutex-dec.h b/include/asm-generic/mutex-dec.h
index d4f9fb4..fd694cf 100644
--- a/include/asm-generic/mutex-dec.h
+++ b/include/asm-generic/mutex-dec.h
@@ -20,7 +20,7 @@
static inline void
__mutex_fastpath_lock(atomic_t *count, void (*fail_fn)(atomic_t *))
{
- if (unlikely(atomic_dec_return(count) < 0))
+ if (unlikely(atomic_dec_return_acquire(count) < 0))
fail_fn(count);
}
@@ -35,7 +35,7 @@ __mutex_fastpath_lock(atomic_t *count, void (*fail_fn)(atomic_t *))
static inline int
__mutex_fastpath_lock_retval(atomic_t *count)
{
- if (unlikely(atomic_dec_return(count) < 0))
+ if (unlikely(atomic_dec_return_acquire(count) < 0))
return -1;
return 0;
}
@@ -56,7 +56,7 @@ __mutex_fastpath_lock_retval(atomic_t *count)
static inline void
__mutex_fastpath_unlock(atomic_t *count, void (*fail_fn)(atomic_t *))
{
- if (unlikely(atomic_inc_return(count) <= 0))
+ if (unlikely(atomic_inc_return_release(count) <= 0))
fail_fn(count);
}
@@ -80,7 +80,7 @@ __mutex_fastpath_unlock(atomic_t *count, void (*fail_fn)(atomic_t *))
static inline int
__mutex_fastpath_trylock(atomic_t *count, int (*fail_fn)(atomic_t *))
{
- if (likely(atomic_cmpxchg(count, 1, 0) == 1))
+ if (likely(atomic_cmpxchg_acquire(count, 1, 0) == 1))
return 1;
return 0;
}
diff --git a/include/asm-generic/mutex-xchg.h b/include/asm-generic/mutex-xchg.h
index f169ec0..a6b4a7b 100644
--- a/include/asm-generic/mutex-xchg.h
+++ b/include/asm-generic/mutex-xchg.h
@@ -31,7 +31,7 @@ __mutex_fastpath_lock(atomic_t *count, void (*fail_fn)(atomic_t *))
* to ensure that any waiting tasks are woken up by the
* unlock slow path.
*/
- if (likely(atomic_xchg(count, -1) != 1))
+ if (likely(atomic_xchg_acquire(count, -1) != 1))
fail_fn(count);
}
@@ -46,7 +46,7 @@ __mutex_fastpath_lock(atomic_t *count, void (*fail_fn)(atomic_t *))
static inline int
__mutex_fastpath_lock_retval(atomic_t *count)
{
- if (unlikely(atomic_xchg(count, 0) != 1))
+ if (unlikely(atomic_xchg_acquire(count, 0) != 1))
if (likely(atomic_xchg(count, -1) != 1))
return -1;
return 0;
@@ -67,7 +67,7 @@ __mutex_fastpath_lock_retval(atomic_t *count)
static inline void
__mutex_fastpath_unlock(atomic_t *count, void (*fail_fn)(atomic_t *))
{
- if (unlikely(atomic_xchg(count, 1) != 0))
+ if (unlikely(atomic_xchg_release(count, 1) != 0))
fail_fn(count);
}
@@ -91,7 +91,7 @@ __mutex_fastpath_unlock(atomic_t *count, void (*fail_fn)(atomic_t *))
static inline int
__mutex_fastpath_trylock(atomic_t *count, int (*fail_fn)(atomic_t *))
{
- int prev = atomic_xchg(count, 0);
+ int prev = atomic_xchg_acquire(count, 0);
if (unlikely(prev < 0)) {
/*
@@ -105,7 +105,7 @@ __mutex_fastpath_trylock(atomic_t *count, int (*fail_fn)(atomic_t *))
* owner's unlock path needlessly, but that's not a problem
* in practice. ]
*/
- prev = atomic_xchg(count, prev);
+ prev = atomic_xchg_acquire(count, prev);
if (prev < 0)
prev = 0;
}
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 4cccea6..0551c21 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -277,7 +277,7 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock)
static inline bool mutex_try_to_acquire(struct mutex *lock)
{
return !mutex_is_locked(lock) &&
- (atomic_cmpxchg(&lock->count, 1, 0) == 1);
+ (atomic_cmpxchg_acquire(&lock->count, 1, 0) == 1);
}
/*
@@ -529,7 +529,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
* Once more, try to acquire the lock. Only try-lock the mutex if
* it is unlocked to reduce unnecessary xchg() operations.
*/
- if (!mutex_is_locked(lock) && (atomic_xchg(&lock->count, 0) == 1))
+ if (!mutex_is_locked(lock) &&
+ (atomic_xchg_acquire(&lock->count, 0) == 1))
goto skip_wait;
debug_mutex_lock_common(lock, &waiter);
@@ -553,7 +554,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
* non-negative in order to avoid unnecessary xchg operations:
*/
if (atomic_read(&lock->count) >= 0 &&
- (atomic_xchg(&lock->count, -1) == 1))
+ (atomic_xchg_acquire(&lock->count, -1) == 1))
break;
/*
@@ -867,7 +868,7 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
spin_lock_mutex(&lock->wait_lock, flags);
- prev = atomic_xchg(&lock->count, -1);
+ prev = atomic_xchg_acquire(&lock->count, -1);
if (likely(prev == 1)) {
mutex_set_owner(lock);
mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
--
2.1.4