linuxppc-dev.lists.ozlabs.org archive mirror
 help / color / mirror / Atom feed
* [patch] mutex: optimise generic mutex implementations
@ 2008-10-12  5:46 Nick Piggin
  2008-10-12  5:47 ` [patch] powerpc: implement optimised mutex fastpaths Nick Piggin
                   ` (3 more replies)
  0 siblings, 4 replies; 13+ messages in thread
From: Nick Piggin @ 2008-10-12  5:46 UTC (permalink / raw)
  To: Ingo Molnar, linux-arch, linuxppc-dev, paulus, benh

Speed up generic mutex implementations.

- atomic operations which both modify the variable and return something imply
  full smp memory barriers before and after the memory operations involved
  (failing atomic_cmpxchg, atomic_add_unless, etc don't imply a barrier because
  they don't modify the target). See Documentation/atomic_ops.txt.
  So remove extra barriers and branches.
  
- All architectures support atomic_cmpxchg. This has no relation to
  __HAVE_ARCH_CMPXCHG. We can just take the atomic_cmpxchg path unconditionally

This reduces a simple single threaded fastpath lock+unlock test from 590 cycles
to 203 cycles on a ppc970 system.

Signed-off-by: Nick Piggin <npiggin@suse.de>
---
Index: linux-2.6/include/asm-generic/mutex-dec.h
===================================================================
--- linux-2.6.orig/include/asm-generic/mutex-dec.h
+++ linux-2.6/include/asm-generic/mutex-dec.h
@@ -22,8 +22,6 @@ __mutex_fastpath_lock(atomic_t *count, v
 {
 	if (unlikely(atomic_dec_return(count) < 0))
 		fail_fn(count);
-	else
-		smp_mb();
 }
 
 /**
@@ -41,10 +39,7 @@ __mutex_fastpath_lock_retval(atomic_t *c
 {
 	if (unlikely(atomic_dec_return(count) < 0))
 		return fail_fn(count);
-	else {
-		smp_mb();
-		return 0;
-	}
+	return 0;
 }
 
 /**
@@ -63,7 +58,6 @@ __mutex_fastpath_lock_retval(atomic_t *c
 static inline void
 __mutex_fastpath_unlock(atomic_t *count, void (*fail_fn)(atomic_t *))
 {
-	smp_mb();
 	if (unlikely(atomic_inc_return(count) <= 0))
 		fail_fn(count);
 }
@@ -98,15 +92,9 @@ __mutex_fastpath_trylock(atomic_t *count
 	 * just as efficient (and simpler) as a 'destructive' probing of
 	 * the mutex state would be.
 	 */
-#ifdef __HAVE_ARCH_CMPXCHG
-	if (likely(atomic_cmpxchg(count, 1, 0) == 1)) {
-		smp_mb();
+	if (likely(atomic_cmpxchg(count, 1, 0) == 1))
 		return 1;
-	}
 	return 0;
-#else
-	return fail_fn(count);
-#endif
 }
 
 #endif
Index: linux-2.6/include/asm-generic/mutex-xchg.h
===================================================================
--- linux-2.6.orig/include/asm-generic/mutex-xchg.h
+++ linux-2.6/include/asm-generic/mutex-xchg.h
@@ -27,8 +27,6 @@ __mutex_fastpath_lock(atomic_t *count, v
 {
 	if (unlikely(atomic_xchg(count, 0) != 1))
 		fail_fn(count);
-	else
-		smp_mb();
 }
 
 /**
@@ -46,10 +44,7 @@ __mutex_fastpath_lock_retval(atomic_t *c
 {
 	if (unlikely(atomic_xchg(count, 0) != 1))
 		return fail_fn(count);
-	else {
-		smp_mb();
-		return 0;
-	}
+	return 0;
 }
 
 /**
@@ -67,7 +62,6 @@ __mutex_fastpath_lock_retval(atomic_t *c
 static inline void
 __mutex_fastpath_unlock(atomic_t *count, void (*fail_fn)(atomic_t *))
 {
-	smp_mb();
 	if (unlikely(atomic_xchg(count, 1) != 0))
 		fail_fn(count);
 }
@@ -110,7 +104,6 @@ __mutex_fastpath_trylock(atomic_t *count
 		if (prev < 0)
 			prev = 0;
 	}
-	smp_mb();
 
 	return prev;
 }

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [patch] powerpc: implement optimised mutex fastpaths
  2008-10-12  5:46 [patch] mutex: optimise generic mutex implementations Nick Piggin
@ 2008-10-12  5:47 ` Nick Piggin
  2008-10-13  1:18   ` Nick Piggin
  2008-10-13 16:15   ` Scott Wood
  2008-10-14  8:35 ` [patch] mutex: optimise generic mutex implementations Benjamin Herrenschmidt
                   ` (2 subsequent siblings)
  3 siblings, 2 replies; 13+ messages in thread
From: Nick Piggin @ 2008-10-12  5:47 UTC (permalink / raw)
  To: Ingo Molnar, linux-arch, linuxppc-dev, paulus, benh


Implement a more optimal mutex fastpath for powerpc, making use of acquire
and release barrier semantics. This takes the mutex lock+unlock benchmark
from 203 to 173 cycles on a G5.

Signed-off-by: Nick Piggin <npiggin@suse.de>
---
Index: linux-2.6/arch/powerpc/include/asm/mutex.h
===================================================================
--- linux-2.6.orig/arch/powerpc/include/asm/mutex.h
+++ linux-2.6/arch/powerpc/include/asm/mutex.h
@@ -1,9 +1,136 @@
 /*
- * Pull in the generic implementation for the mutex fastpath.
+ * Optimised mutex implementation of include/asm-generic/mutex-dec.h algorithm
+ */
+#ifndef _ASM_POWERPC_MUTEX_H
+#define _ASM_POWERPC_MUTEX_H
+
+static inline int __mutex_cmpxchg_lock(atomic_t *v, int old, int new)
+{
+	int t;
+
+	__asm__ __volatile__ (
+"1:	lwarx	%0,0,%1		# mutex trylock\n\
+	cmpw	0,%0,%2\n\
+	bne-	2f\n"
+	PPC405_ERR77(0,%1)
+"	stwcx.	%3,0,%1\n\
+	bne-	1b"
+	ISYNC_ON_SMP
+	"\n\
+2:"
+	: "=&r" (t)
+	: "r" (&v->counter), "r" (old), "r" (new)
+	: "cc", "memory");
+
+	return t;
+}
+
+static inline int __mutex_dec_return_lock(atomic_t *v)
+{
+	int t;
+
+	__asm__ __volatile__(
+"1:	lwarx	%0,0,%1		# mutex lock\n\
+	addic	%0,%0,-1\n"
+	PPC405_ERR77(0,%1)
+"	stwcx.	%0,0,%1\n\
+	bne-	1b"
+	ISYNC_ON_SMP
+	: "=&r" (t)
+	: "r" (&v->counter)
+	: "cc", "memory");
+
+	return t;
+}
+
+static inline int __mutex_inc_return_unlock(atomic_t *v)
+{
+	int t;
+
+	__asm__ __volatile__(
+	LWSYNC_ON_SMP
+"1:	lwarx	%0,0,%1		# mutex unlock\n\
+	addic	%0,%0,1\n"
+	PPC405_ERR77(0,%1)
+"	stwcx.	%0,0,%1 \n\
+	bne-	1b"
+	: "=&r" (t)
+	: "r" (&v->counter)
+	: "cc", "memory");
+
+	return t;
+}
+
+/**
+ *  __mutex_fastpath_lock - try to take the lock by moving the count
+ *                          from 1 to a 0 value
+ *  @count: pointer of type atomic_t
+ *  @fail_fn: function to call if the original value was not 1
+ *
+ * Change the count from 1 to a value lower than 1, and call <fail_fn> if
+ * it wasn't 1 originally. This function MUST leave the value lower than
+ * 1 even when the "1" assertion wasn't true.
+ */
+static inline void
+__mutex_fastpath_lock(atomic_t *count, void (*fail_fn)(atomic_t *))
+{
+	if (unlikely(__mutex_dec_return_lock(count) < 0))
+		fail_fn(count);
+}
+
+/**
+ *  __mutex_fastpath_lock_retval - try to take the lock by moving the count
+ *                                 from 1 to a 0 value
+ *  @count: pointer of type atomic_t
+ *  @fail_fn: function to call if the original value was not 1
+ *
+ * Change the count from 1 to a value lower than 1, and call <fail_fn> if
+ * it wasn't 1 originally. This function returns 0 if the fastpath succeeds,
+ * or anything the slow path function returns.
+ */
+static inline int
+__mutex_fastpath_lock_retval(atomic_t *count, int (*fail_fn)(atomic_t *))
+{
+	if (unlikely(__mutex_dec_return_lock(count) < 0))
+		return fail_fn(count);
+	return 0;
+}
+
+/**
+ *  __mutex_fastpath_unlock - try to promote the count from 0 to 1
+ *  @count: pointer of type atomic_t
+ *  @fail_fn: function to call if the original value was not 0
+ *
+ * Try to promote the count from 0 to 1. If it wasn't 0, call <fail_fn>.
+ * In the failure case, this function is allowed to either set the value to
+ * 1, or to set it to a value lower than 1.
+ */
+static inline void
+__mutex_fastpath_unlock(atomic_t *count, void (*fail_fn)(atomic_t *))
+{
+	if (unlikely(__mutex_inc_return_unlock(count) <= 0))
+		fail_fn(count);
+}
+
+#define __mutex_slowpath_needs_to_unlock()		1
+
+/**
+ * __mutex_fastpath_trylock - try to acquire the mutex, without waiting
+ *
+ *  @count: pointer of type atomic_t
+ *  @fail_fn: fallback function
  *
- * TODO: implement optimized primitives instead, or leave the generic
- * implementation in place, or pick the atomic_xchg() based generic
- * implementation. (see asm-generic/mutex-xchg.h for details)
+ * Change the count from 1 to a value lower than 1, and return 0 (failure)
+ * if it wasn't 1 originally, or return 1 (success) otherwise. This function
+ * MUST leave the value lower than 1 even when the "1" assertion wasn't true.
+ * Additionally, if the value was < 0 originally, this function must not leave
+ * it to 0 on failure.
  */
+static inline int
+__mutex_fastpath_trylock(atomic_t *count, int (*fail_fn)(atomic_t *))
+{
+	if (likely(__mutex_cmpxchg_lock(count, 1, 0) == 1))
+		return 1;
+}
 
-#include <asm-generic/mutex-dec.h>
+#endif

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [patch] powerpc: implement optimised mutex fastpaths
  2008-10-12  5:47 ` [patch] powerpc: implement optimised mutex fastpaths Nick Piggin
@ 2008-10-13  1:18   ` Nick Piggin
  2008-11-06  4:09     ` Paul Mackerras
  2008-10-13 16:15   ` Scott Wood
  1 sibling, 1 reply; 13+ messages in thread
From: Nick Piggin @ 2008-10-13  1:18 UTC (permalink / raw)
  To: Ingo Molnar, linux-arch, linuxppc-dev, paulus, benh

On Sun, Oct 12, 2008 at 07:47:32AM +0200, Nick Piggin wrote:
> 
> Implement a more optimal mutex fastpath for powerpc, making use of acquire
> and release barrier semantics. This takes the mutex lock+unlock benchmark
> from 203 to 173 cycles on a G5.
> 
> +static inline int
> +__mutex_fastpath_trylock(atomic_t *count, int (*fail_fn)(atomic_t *))
> +{
> +	if (likely(__mutex_cmpxchg_lock(count, 1, 0) == 1))
> +		return 1;

Oops, I must have sent the wrong version. This needs a return 0 here.

> +}

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [patch] powerpc: implement optimised mutex fastpaths
  2008-10-12  5:47 ` [patch] powerpc: implement optimised mutex fastpaths Nick Piggin
  2008-10-13  1:18   ` Nick Piggin
@ 2008-10-13 16:15   ` Scott Wood
  2008-10-13 16:20     ` Scott Wood
  1 sibling, 1 reply; 13+ messages in thread
From: Scott Wood @ 2008-10-13 16:15 UTC (permalink / raw)
  To: Nick Piggin; +Cc: linux-arch, linuxppc-dev, Ingo Molnar, paulus

On Sun, Oct 12, 2008 at 07:47:32AM +0200, Nick Piggin wrote:
> +static inline int __mutex_cmpxchg_lock(atomic_t *v, int old, int new)
> +{
> +	int t;
> +
> +	__asm__ __volatile__ (
> +"1:	lwarx	%0,0,%1		# mutex trylock\n\
> +	cmpw	0,%0,%2\n\
> +	bne-	2f\n"
> +	PPC405_ERR77(0,%1)
> +"	stwcx.	%3,0,%1\n\
> +	bne-	1b"
> +	ISYNC_ON_SMP
> +	"\n\
> +2:"
> +	: "=&r" (t)
> +	: "r" (&v->counter), "r" (old), "r" (new)
> +	: "cc", "memory");

This will break if the compiler picks r0 for &v->counter.  Use "b" as the
constraint, or better yet do "lwarx %0, %y1", with a "Z" (v->counter)
constraint.

-Scott

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [patch] powerpc: implement optimised mutex fastpaths
  2008-10-13 16:15   ` Scott Wood
@ 2008-10-13 16:20     ` Scott Wood
  2008-10-14  7:06       ` Nick Piggin
  0 siblings, 1 reply; 13+ messages in thread
From: Scott Wood @ 2008-10-13 16:20 UTC (permalink / raw)
  To: Nick Piggin; +Cc: linux-arch, linuxppc-dev, Ingo Molnar, paulus

On Mon, Oct 13, 2008 at 11:15:47AM -0500, Scott Wood wrote:
> On Sun, Oct 12, 2008 at 07:47:32AM +0200, Nick Piggin wrote:
> > +static inline int __mutex_cmpxchg_lock(atomic_t *v, int old, int new)
> > +{
> > +	int t;
> > +
> > +	__asm__ __volatile__ (
> > +"1:	lwarx	%0,0,%1		# mutex trylock\n\
> > +	cmpw	0,%0,%2\n\
> > +	bne-	2f\n"
> > +	PPC405_ERR77(0,%1)
> > +"	stwcx.	%3,0,%1\n\
> > +	bne-	1b"
> > +	ISYNC_ON_SMP
> > +	"\n\
> > +2:"
> > +	: "=&r" (t)
> > +	: "r" (&v->counter), "r" (old), "r" (new)
> > +	: "cc", "memory");
> 
> This will break if the compiler picks r0 for &v->counter.  Use "b" as the
> constraint, or better yet do "lwarx %0, %y1", with a "Z" (v->counter)
> constraint.

Sorry, had a morning brain-fart there -- you're not using the base
register.

-Scott

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [patch] powerpc: implement optimised mutex fastpaths
  2008-10-13 16:20     ` Scott Wood
@ 2008-10-14  7:06       ` Nick Piggin
  0 siblings, 0 replies; 13+ messages in thread
From: Nick Piggin @ 2008-10-14  7:06 UTC (permalink / raw)
  To: Scott Wood; +Cc: linux-arch, linuxppc-dev, Ingo Molnar, paulus

On Mon, Oct 13, 2008 at 11:20:20AM -0500, Scott Wood wrote:
> On Mon, Oct 13, 2008 at 11:15:47AM -0500, Scott Wood wrote:
> > On Sun, Oct 12, 2008 at 07:47:32AM +0200, Nick Piggin wrote:
> > > +static inline int __mutex_cmpxchg_lock(atomic_t *v, int old, int new)
> > > +{
> > > +	int t;
> > > +
> > > +	__asm__ __volatile__ (
> > > +"1:	lwarx	%0,0,%1		# mutex trylock\n\
> > > +	cmpw	0,%0,%2\n\
> > > +	bne-	2f\n"
> > > +	PPC405_ERR77(0,%1)
> > > +"	stwcx.	%3,0,%1\n\
> > > +	bne-	1b"
> > > +	ISYNC_ON_SMP
> > > +	"\n\
> > > +2:"
> > > +	: "=&r" (t)
> > > +	: "r" (&v->counter), "r" (old), "r" (new)
> > > +	: "cc", "memory");
> > 
> > This will break if the compiler picks r0 for &v->counter.  Use "b" as the
> > constraint, or better yet do "lwarx %0, %y1", with a "Z" (v->counter)
> > constraint.
> 
> Sorry, had a morning brain-fart there -- you're not using the base
> register.

OK no problem. Do you think it looks OK? Thanks for reviewing...

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [patch] mutex: optimise generic mutex implementations
  2008-10-12  5:46 [patch] mutex: optimise generic mutex implementations Nick Piggin
  2008-10-12  5:47 ` [patch] powerpc: implement optimised mutex fastpaths Nick Piggin
@ 2008-10-14  8:35 ` Benjamin Herrenschmidt
  2008-10-22 15:59 ` Ingo Molnar
  2008-10-22 16:24 ` David Howells
  3 siblings, 0 replies; 13+ messages in thread
From: Benjamin Herrenschmidt @ 2008-10-14  8:35 UTC (permalink / raw)
  To: Nick Piggin; +Cc: linux-arch, linuxppc-dev, Ingo Molnar, paulus

On Sun, 2008-10-12 at 07:46 +0200, Nick Piggin wrote:
> Speed up generic mutex implementations.
> 
> - atomic operations which both modify the variable and return something imply
>   full smp memory barriers before and after the memory operations involved
>   (failing atomic_cmpxchg, atomic_add_unless, etc don't imply a barrier because
>   they don't modify the target). See Documentation/atomic_ops.txt.
>   So remove extra barriers and branches.
>   
> - All architectures support atomic_cmpxchg. This has no relation to
>   __HAVE_ARCH_CMPXCHG. We can just take the atomic_cmpxchg path unconditionally
> 
> This reduces a simple single threaded fastpath lock+unlock test from 590 cycles
> to 203 cycles on a ppc970 system.
> 
> Signed-off-by: Nick Piggin <npiggin@suse.de>

Looks ok.

Cheers,
Ben.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [patch] mutex: optimise generic mutex implementations
  2008-10-12  5:46 [patch] mutex: optimise generic mutex implementations Nick Piggin
  2008-10-12  5:47 ` [patch] powerpc: implement optimised mutex fastpaths Nick Piggin
  2008-10-14  8:35 ` [patch] mutex: optimise generic mutex implementations Benjamin Herrenschmidt
@ 2008-10-22 15:59 ` Ingo Molnar
  2008-10-23  4:43   ` Benjamin Herrenschmidt
  2008-10-22 16:24 ` David Howells
  3 siblings, 1 reply; 13+ messages in thread
From: Ingo Molnar @ 2008-10-22 15:59 UTC (permalink / raw)
  To: Nick Piggin; +Cc: linux-arch, linuxppc-dev, paulus, Peter Zijlstra


* Nick Piggin <npiggin@suse.de> wrote:

> Speed up generic mutex implementations.
> 
> - atomic operations which both modify the variable and return something imply
>   full smp memory barriers before and after the memory operations involved
>   (failing atomic_cmpxchg, atomic_add_unless, etc don't imply a barrier because
>   they don't modify the target). See Documentation/atomic_ops.txt.
>   So remove extra barriers and branches.
>   
> - All architectures support atomic_cmpxchg. This has no relation to
>   __HAVE_ARCH_CMPXCHG. We can just take the atomic_cmpxchg path unconditionally
> 
> This reduces a simple single threaded fastpath lock+unlock test from 590 cycles
> to 203 cycles on a ppc970 system.
> 
> Signed-off-by: Nick Piggin <npiggin@suse.de>

no objections here. Lets merge these two patches via the ppc tree, so 
that it gets testing on real hardware as well?

Acked-by: Ingo Molnar <mingo@elte.hu>

	Ingo

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [patch] mutex: optimise generic mutex implementations
  2008-10-12  5:46 [patch] mutex: optimise generic mutex implementations Nick Piggin
                   ` (2 preceding siblings ...)
  2008-10-22 15:59 ` Ingo Molnar
@ 2008-10-22 16:24 ` David Howells
  3 siblings, 0 replies; 13+ messages in thread
From: David Howells @ 2008-10-22 16:24 UTC (permalink / raw)
  To: Nick Piggin; +Cc: linux-arch, linuxppc-dev, paulus, Ingo Molnar

Nick Piggin <npiggin@suse.de> wrote:

> Speed up generic mutex implementations.
> 
> - atomic operations which both modify the variable and return something imply
>   full smp memory barriers before and after the memory operations involved
>   (failing atomic_cmpxchg, atomic_add_unless, etc don't imply a barrier because
>   they don't modify the target). See Documentation/atomic_ops.txt.
>   So remove extra barriers and branches.
>   
> - All architectures support atomic_cmpxchg. This has no relation to
>   __HAVE_ARCH_CMPXCHG. We can just take the atomic_cmpxchg path unconditionally
> 
> This reduces a simple single threaded fastpath lock+unlock test from 590 cycles
> to 203 cycles on a ppc970 system.
> 
> Signed-off-by: Nick Piggin <npiggin@suse.de>

This seems to work on FRV which uses the mutex-dec generic algorithm, though
you have to take that with a pinch of salt as I don't have SMP hardware for
it.

Acked-by: David Howells <dhowells@redhat.com>

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [patch] mutex: optimise generic mutex implementations
  2008-10-22 15:59 ` Ingo Molnar
@ 2008-10-23  4:43   ` Benjamin Herrenschmidt
  2008-10-23  7:02     ` Nick Piggin
  0 siblings, 1 reply; 13+ messages in thread
From: Benjamin Herrenschmidt @ 2008-10-23  4:43 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Nick Piggin, linux-arch, paulus, Peter Zijlstra, linuxppc-dev

On Wed, 2008-10-22 at 17:59 +0200, Ingo Molnar wrote:
> * Nick Piggin <npiggin@suse.de> wrote:
> 
> > Speed up generic mutex implementations.
> > 
> > - atomic operations which both modify the variable and return something imply
> >   full smp memory barriers before and after the memory operations involved
> >   (failing atomic_cmpxchg, atomic_add_unless, etc don't imply a barrier because
> >   they don't modify the target). See Documentation/atomic_ops.txt.
> >   So remove extra barriers and branches.
> >   
> > - All architectures support atomic_cmpxchg. This has no relation to
> >   __HAVE_ARCH_CMPXCHG. We can just take the atomic_cmpxchg path unconditionally
> > 
> > This reduces a simple single threaded fastpath lock+unlock test from 590 cycles
> > to 203 cycles on a ppc970 system.
> > 
> > Signed-off-by: Nick Piggin <npiggin@suse.de>
> 
> no objections here. Lets merge these two patches via the ppc tree, so 
> that it gets testing on real hardware as well?
> 
> Acked-by: Ingo Molnar <mingo@elte.hu>

Allright but in that case it will be after -rc1 unless I manage to sneak
something in tomorrow before Linus closes the merge window.

I can't get an update today.

Cheers,
Ben.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [patch] mutex: optimise generic mutex implementations
  2008-10-23  4:43   ` Benjamin Herrenschmidt
@ 2008-10-23  7:02     ` Nick Piggin
  0 siblings, 0 replies; 13+ messages in thread
From: Nick Piggin @ 2008-10-23  7:02 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: linux-arch, linuxppc-dev, Ingo Molnar, paulus, Peter Zijlstra

On Thu, Oct 23, 2008 at 03:43:58PM +1100, Benjamin Herrenschmidt wrote:
> On Wed, 2008-10-22 at 17:59 +0200, Ingo Molnar wrote:
> > * Nick Piggin <npiggin@suse.de> wrote:
> > 
> > > Speed up generic mutex implementations.
> > > 
> > > - atomic operations which both modify the variable and return something imply
> > >   full smp memory barriers before and after the memory operations involved
> > >   (failing atomic_cmpxchg, atomic_add_unless, etc don't imply a barrier because
> > >   they don't modify the target). See Documentation/atomic_ops.txt.
> > >   So remove extra barriers and branches.
> > >   
> > > - All architectures support atomic_cmpxchg. This has no relation to
> > >   __HAVE_ARCH_CMPXCHG. We can just take the atomic_cmpxchg path unconditionally
> > > 
> > > This reduces a simple single threaded fastpath lock+unlock test from 590 cycles
> > > to 203 cycles on a ppc970 system.
> > > 
> > > Signed-off-by: Nick Piggin <npiggin@suse.de>
> > 
> > no objections here. Lets merge these two patches via the ppc tree, so 
> > that it gets testing on real hardware as well?
> > 
> > Acked-by: Ingo Molnar <mingo@elte.hu>
> 
> Allright but in that case it will be after -rc1 unless I manage to sneak
> something in tomorrow before linux closes the merge window.
> 
> I can't get an update today.

Fine with me.

Thanks,
Nick

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [patch] powerpc: implement optimised mutex fastpaths
  2008-10-13  1:18   ` Nick Piggin
@ 2008-11-06  4:09     ` Paul Mackerras
  2008-11-06  5:06       ` Nick Piggin
  0 siblings, 1 reply; 13+ messages in thread
From: Paul Mackerras @ 2008-11-06  4:09 UTC (permalink / raw)
  To: Nick Piggin; +Cc: linux-arch, linuxppc-dev, Ingo Molnar

Nick Piggin writes:

> On Sun, Oct 12, 2008 at 07:47:32AM +0200, Nick Piggin wrote:
> > 
> > Implement a more optimal mutex fastpath for powerpc, making use of acquire
> > and release barrier semantics. This takes the mutex lock+unlock benchmark
> > from 203 to 173 cycles on a G5.
> > 
> > +static inline int
> > +__mutex_fastpath_trylock(atomic_t *count, int (*fail_fn)(atomic_t *))
> > +{
> > +	if (likely(__mutex_cmpxchg_lock(count, 1, 0) == 1))
> > +		return 1;
> 
> Oops, I must have sent the wrong version. This needs a return 0 here.

Are you going to send the right version?  (or did you send it and I
missed it?)

Also I note that the comment you added just before that routine says:

+ * Change the count from 1 to a value lower than 1, and return 0 (failure)
+ * if it wasn't 1 originally, or return 1 (success) otherwise. This function
+ * MUST leave the value lower than 1 even when the "1" assertion wasn't true.

yet just doing a __mutex_cmpxchg_lock isn't going to do that.  So
please fix either the comment or the code (or both :).

Paul.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [patch] powerpc: implement optimised mutex fastpaths
  2008-11-06  4:09     ` Paul Mackerras
@ 2008-11-06  5:06       ` Nick Piggin
  0 siblings, 0 replies; 13+ messages in thread
From: Nick Piggin @ 2008-11-06  5:06 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: linux-arch, linuxppc-dev, Ingo Molnar

On Thu, Nov 06, 2008 at 03:09:08PM +1100, Paul Mackerras wrote:
> Nick Piggin writes:
> 
> > On Sun, Oct 12, 2008 at 07:47:32AM +0200, Nick Piggin wrote:
> > > 
> > > Implement a more optimal mutex fastpath for powerpc, making use of acquire
> > > and release barrier semantics. This takes the mutex lock+unlock benchmark
> > > from 203 to 173 cycles on a G5.
> > > 
> > > +static inline int
> > > +__mutex_fastpath_trylock(atomic_t *count, int (*fail_fn)(atomic_t *))
> > > +{
> > > +	if (likely(__mutex_cmpxchg_lock(count, 1, 0) == 1))
> > > +		return 1;
> > 
> > Oops, I must have sent the wrong version. This needs a return 0 here.
> 
> Are you going to send the right version?  (or did you send it and I
> missed it?)

Hmm, I thought I did but perhaps not. Will do...


> Also I note that the comment you added just before that routine says:
> 
> + * Change the count from 1 to a value lower than 1, and return 0 (failure)
> + * if it wasn't 1 originally, or return 1 (success) otherwise. This function
> + * MUST leave the value lower than 1 even when the "1" assertion wasn't true.
> 
> yet just doing a __mutex_cmpxchg_lock isn't going to do that.  So
> please fix either the comment or the code (or both :).

OK.

^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2008-11-06  5:06 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-10-12  5:46 [patch] mutex: optimise generic mutex implementations Nick Piggin
2008-10-12  5:47 ` [patch] powerpc: implement optimised mutex fastpaths Nick Piggin
2008-10-13  1:18   ` Nick Piggin
2008-11-06  4:09     ` Paul Mackerras
2008-11-06  5:06       ` Nick Piggin
2008-10-13 16:15   ` Scott Wood
2008-10-13 16:20     ` Scott Wood
2008-10-14  7:06       ` Nick Piggin
2008-10-14  8:35 ` [patch] mutex: optimise generic mutex implementations Benjamin Herrenschmidt
2008-10-22 15:59 ` Ingo Molnar
2008-10-23  4:43   ` Benjamin Herrenschmidt
2008-10-23  7:02     ` Nick Piggin
2008-10-22 16:24 ` David Howells

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).