public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH-tip] locking/pvqspinlock: Relax cmpxchg's to improve performance  on some archs
@ 2016-12-06 19:14 Waiman Long
  2016-12-08  0:38 ` Pan Xinhui
  2016-12-09 16:05 ` Peter Zijlstra
  0 siblings, 2 replies; 4+ messages in thread
From: Waiman Long @ 2016-12-06 19:14 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar
  Cc: linux-kernel, Pan Xinhui, Boqun Feng, Waiman Long

A number of cmpxchg calls in qspinlock_paravirt.h were replaced by more
relaxed versions to improve performance on architectures that use LL/SC.

Signed-off-by: Waiman Long <longman@redhat.com>
---
 kernel/locking/qspinlock_paravirt.h | 36 +++++++++++++++++++-----------------
 1 file changed, 19 insertions(+), 17 deletions(-)

diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index e3b5520..9d2205f 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -72,7 +72,7 @@ static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock)
 	struct __qspinlock *l = (void *)lock;
 
 	if (!(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) &&
-	    (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0)) {
+	    (cmpxchg_acquire(&l->locked, 0, _Q_LOCKED_VAL) == 0)) {
 		qstat_inc(qstat_pv_lock_stealing, true);
 		return true;
 	}
@@ -101,16 +101,16 @@ static __always_inline void clear_pending(struct qspinlock *lock)
 
 /*
  * The pending bit check in pv_queued_spin_steal_lock() isn't a memory
- * barrier. Therefore, an atomic cmpxchg() is used to acquire the lock
- * just to be sure that it will get it.
+ * barrier. Therefore, an atomic cmpxchg_acquire() is used to acquire the
+ * lock to provide the proper memory barrier.
  */
 static __always_inline int trylock_clear_pending(struct qspinlock *lock)
 {
 	struct __qspinlock *l = (void *)lock;
 
 	return !READ_ONCE(l->locked) &&
-	       (cmpxchg(&l->locked_pending, _Q_PENDING_VAL, _Q_LOCKED_VAL)
-			== _Q_PENDING_VAL);
+	       (cmpxchg_acquire(&l->locked_pending, _Q_PENDING_VAL,
+				_Q_LOCKED_VAL) == _Q_PENDING_VAL);
 }
 #else /* _Q_PENDING_BITS == 8 */
 static __always_inline void set_pending(struct qspinlock *lock)
@@ -138,7 +138,7 @@ static __always_inline int trylock_clear_pending(struct qspinlock *lock)
 		 */
 		old = val;
 		new = (val & ~_Q_PENDING_MASK) | _Q_LOCKED_VAL;
-		val = atomic_cmpxchg(&lock->val, old, new);
+		val = atomic_cmpxchg_acquire(&lock->val, old, new);
 
 		if (val == old)
 			return 1;
@@ -211,7 +211,7 @@ static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node)
 
 	for_each_hash_entry(he, offset, hash) {
 		hopcnt++;
-		if (!cmpxchg(&he->lock, NULL, lock)) {
+		if (!cmpxchg_relaxed(&he->lock, NULL, lock)) {
 			WRITE_ONCE(he->node, node);
 			qstat_hop(hopcnt);
 			return &he->lock;
@@ -309,7 +309,7 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
 		 *     MB			      MB
 		 * [L] pn->locked		[RmW] pn->state = vcpu_hashed
 		 *
-		 * Matches the cmpxchg() from pv_kick_node().
+		 * Matches the cmpxchg_release() from pv_kick_node().
 		 */
 		smp_store_mb(pn->state, vcpu_halted);
 
@@ -324,7 +324,7 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
 		 * value so that pv_wait_head_or_lock() knows to not also try
 		 * to hash this lock.
 		 */
-		cmpxchg(&pn->state, vcpu_halted, vcpu_running);
+		cmpxchg_relaxed(&pn->state, vcpu_halted, vcpu_running);
 
 		/*
 		 * If the locked flag is still not set after wakeup, it is a
@@ -360,9 +360,10 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
 	 * pv_wait_node(). If OTOH this fails, the vCPU was running and will
 	 * observe its next->locked value and advance itself.
 	 *
-	 * Matches with smp_store_mb() and cmpxchg() in pv_wait_node()
+	 * Matches with smp_store_mb() and cmpxchg_relaxed() in pv_wait_node().
 	 */
-	if (cmpxchg(&pn->state, vcpu_halted, vcpu_hashed) != vcpu_halted)
+	if (cmpxchg_release(&pn->state, vcpu_halted, vcpu_hashed)
+			!= vcpu_halted)
 		return;
 
 	/*
@@ -461,8 +462,8 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
 	}
 
 	/*
-	 * The cmpxchg() or xchg() call before coming here provides the
-	 * acquire semantics for locking. The dummy ORing of _Q_LOCKED_VAL
+	 * The cmpxchg_acquire() or xchg() call before coming here provides
+	 * the acquire semantics for locking. The dummy ORing of _Q_LOCKED_VAL
 	 * here is to indicate to the compiler that the value will always
 	 * be nozero to enable better code optimization.
 	 */
@@ -488,11 +489,12 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
 	}
 
 	/*
-	 * A failed cmpxchg doesn't provide any memory-ordering guarantees,
-	 * so we need a barrier to order the read of the node data in
-	 * pv_unhash *after* we've read the lock being _Q_SLOW_VAL.
+	 * A failed cmpxchg_release doesn't provide any memory-ordering
+	 * guarantees, so we need a barrier to order the read of the node
+	 * data in pv_unhash *after* we've read the lock being _Q_SLOW_VAL.
 	 *
-	 * Matches the cmpxchg() in pv_wait_head_or_lock() setting _Q_SLOW_VAL.
+	 * Matches the cmpxchg_acquire() in pv_wait_head_or_lock() setting
+	 * _Q_SLOW_VAL.
 	 */
 	smp_rmb();
 
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [PATCH-tip] locking/pvqspinlock: Relax cmpxchg's to improve performance on some archs
  2016-12-06 19:14 [PATCH-tip] locking/pvqspinlock: Relax cmpxchg's to improve performance on some archs Waiman Long
@ 2016-12-08  0:38 ` Pan Xinhui
  2016-12-09 16:05 ` Peter Zijlstra
  1 sibling, 0 replies; 4+ messages in thread
From: Pan Xinhui @ 2016-12-08  0:38 UTC (permalink / raw)
  To: Waiman Long, Peter Zijlstra, Ingo Molnar; +Cc: linux-kernel, Boqun Feng



在 2016/12/7 03:14, Waiman Long 写道:
> A number of cmpxchg calls in qspinlock_paravirt.h were replaced by more
> relaxed versions to improve performance on architectures that use LL/SC.
>
> Signed-off-by: Waiman Long <longman@redhat.com>
> ---
thanks!
I applied it to my tree, and the tests are okay.

>  kernel/locking/qspinlock_paravirt.h | 36 +++++++++++++++++++-----------------
>  1 file changed, 19 insertions(+), 17 deletions(-)
>
> diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
> index e3b5520..9d2205f 100644
> --- a/kernel/locking/qspinlock_paravirt.h
> +++ b/kernel/locking/qspinlock_paravirt.h
> @@ -72,7 +72,7 @@ static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock)
>  	struct __qspinlock *l = (void *)lock;
>
>  	if (!(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) &&
> -	    (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0)) {
> +	    (cmpxchg_acquire(&l->locked, 0, _Q_LOCKED_VAL) == 0)) {
>  		qstat_inc(qstat_pv_lock_stealing, true);
>  		return true;
>  	}
> @@ -101,16 +101,16 @@ static __always_inline void clear_pending(struct qspinlock *lock)
>
>  /*
>   * The pending bit check in pv_queued_spin_steal_lock() isn't a memory
> - * barrier. Therefore, an atomic cmpxchg() is used to acquire the lock
> - * just to be sure that it will get it.
> + * barrier. Therefore, an atomic cmpxchg_acquire() is used to acquire the
> + * lock to provide the proper memory barrier.
>   */
>  static __always_inline int trylock_clear_pending(struct qspinlock *lock)
>  {
>  	struct __qspinlock *l = (void *)lock;
>
>  	return !READ_ONCE(l->locked) &&
> -	       (cmpxchg(&l->locked_pending, _Q_PENDING_VAL, _Q_LOCKED_VAL)
> -			== _Q_PENDING_VAL);
> +	       (cmpxchg_acquire(&l->locked_pending, _Q_PENDING_VAL,
> +				_Q_LOCKED_VAL) == _Q_PENDING_VAL);
>  }
>  #else /* _Q_PENDING_BITS == 8 */
>  static __always_inline void set_pending(struct qspinlock *lock)
> @@ -138,7 +138,7 @@ static __always_inline int trylock_clear_pending(struct qspinlock *lock)
>  		 */
>  		old = val;
>  		new = (val & ~_Q_PENDING_MASK) | _Q_LOCKED_VAL;
> -		val = atomic_cmpxchg(&lock->val, old, new);
> +		val = atomic_cmpxchg_acquire(&lock->val, old, new);
>
>  		if (val == old)
>  			return 1;
> @@ -211,7 +211,7 @@ static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node)
>
>  	for_each_hash_entry(he, offset, hash) {
>  		hopcnt++;
> -		if (!cmpxchg(&he->lock, NULL, lock)) {
> +		if (!cmpxchg_relaxed(&he->lock, NULL, lock)) {
>  			WRITE_ONCE(he->node, node);
>  			qstat_hop(hopcnt);
>  			return &he->lock;
> @@ -309,7 +309,7 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
>  		 *     MB			      MB
>  		 * [L] pn->locked		[RmW] pn->state = vcpu_hashed
>  		 *
> -		 * Matches the cmpxchg() from pv_kick_node().
> +		 * Matches the cmpxchg_release() from pv_kick_node().
>  		 */
>  		smp_store_mb(pn->state, vcpu_halted);
>
> @@ -324,7 +324,7 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
>  		 * value so that pv_wait_head_or_lock() knows to not also try
>  		 * to hash this lock.
>  		 */
> -		cmpxchg(&pn->state, vcpu_halted, vcpu_running);
> +		cmpxchg_relaxed(&pn->state, vcpu_halted, vcpu_running);
>
>  		/*
>  		 * If the locked flag is still not set after wakeup, it is a
> @@ -360,9 +360,10 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
>  	 * pv_wait_node(). If OTOH this fails, the vCPU was running and will
>  	 * observe its next->locked value and advance itself.
>  	 *
> -	 * Matches with smp_store_mb() and cmpxchg() in pv_wait_node()
> +	 * Matches with smp_store_mb() and cmpxchg_relaxed() in pv_wait_node().
>  	 */
> -	if (cmpxchg(&pn->state, vcpu_halted, vcpu_hashed) != vcpu_halted)
> +	if (cmpxchg_release(&pn->state, vcpu_halted, vcpu_hashed)
> +			!= vcpu_halted)
>  		return;
>
>  	/*
> @@ -461,8 +462,8 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
>  	}
>
>  	/*
> -	 * The cmpxchg() or xchg() call before coming here provides the
> -	 * acquire semantics for locking. The dummy ORing of _Q_LOCKED_VAL
> +	 * The cmpxchg_acquire() or xchg() call before coming here provides
> +	 * the acquire semantics for locking. The dummy ORing of _Q_LOCKED_VAL
>  	 * here is to indicate to the compiler that the value will always
>  	 * be nozero to enable better code optimization.
>  	 */
> @@ -488,11 +489,12 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
>  	}
>
>  	/*
> -	 * A failed cmpxchg doesn't provide any memory-ordering guarantees,
> -	 * so we need a barrier to order the read of the node data in
> -	 * pv_unhash *after* we've read the lock being _Q_SLOW_VAL.
> +	 * A failed cmpxchg_release doesn't provide any memory-ordering
> +	 * guarantees, so we need a barrier to order the read of the node
> +	 * data in pv_unhash *after* we've read the lock being _Q_SLOW_VAL.
>  	 *
> -	 * Matches the cmpxchg() in pv_wait_head_or_lock() setting _Q_SLOW_VAL.
> +	 * Matches the cmpxchg_acquire() in pv_wait_head_or_lock() setting
> +	 * _Q_SLOW_VAL.
>  	 */
>  	smp_rmb();
>

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH-tip] locking/pvqspinlock: Relax cmpxchg's to improve performance  on some archs
  2016-12-06 19:14 [PATCH-tip] locking/pvqspinlock: Relax cmpxchg's to improve performance on some archs Waiman Long
  2016-12-08  0:38 ` Pan Xinhui
@ 2016-12-09 16:05 ` Peter Zijlstra
  2016-12-09 16:10   ` Waiman Long
  1 sibling, 1 reply; 4+ messages in thread
From: Peter Zijlstra @ 2016-12-09 16:05 UTC (permalink / raw)
  To: Waiman Long
  Cc: Ingo Molnar, linux-kernel, Pan Xinhui, Boqun Feng, Will Deacon

On Tue, Dec 06, 2016 at 02:14:42PM -0500, Waiman Long wrote:
> A number of cmpxchg calls in qspinlock_paravirt.h were replaced by more
> relaxed versions to improve performance on architectures that use LL/SC.

I would feel so much better if each change were to have some rationale
included. Either enumerate them in the changelog, or split them up into
smaller patches.

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH-tip] locking/pvqspinlock: Relax cmpxchg's to improve performance on some archs
  2016-12-09 16:05 ` Peter Zijlstra
@ 2016-12-09 16:10   ` Waiman Long
  0 siblings, 0 replies; 4+ messages in thread
From: Waiman Long @ 2016-12-09 16:10 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, linux-kernel, Pan Xinhui, Boqun Feng, Will Deacon

On 12/09/2016 11:05 AM, Peter Zijlstra wrote:
> On Tue, Dec 06, 2016 at 02:14:42PM -0500, Waiman Long wrote:
>> A number of cmpxchg calls in qspinlock_paravirt.h were replaced by more
>> relaxed versions to improve performance on architectures that use LL/SC.
> I would feel so much better if each change were to have some rationale
> included. Either enumerate them in the changelog, or split them up into
> smaller patches.
>
>
OK, I will update the patch to either document them with comments or
enumerate them in the change log.

Cheers,
Longman

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2016-12-09 16:10 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-12-06 19:14 [PATCH-tip] locking/pvqspinlock: Relax cmpxchg's to improve performance on some archs Waiman Long
2016-12-08  0:38 ` Pan Xinhui
2016-12-09 16:05 ` Peter Zijlstra
2016-12-09 16:10   ` Waiman Long

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox