* [patch] mutex: optimise generic mutex implementations
From: Nick Piggin @ 2008-10-12 5:46 UTC
To: Ingo Molnar, linux-arch, linuxppc-dev, paulus, benh
Speed up generic mutex implementations.

- Atomic operations which both modify the variable and return a value imply
full smp memory barriers before and after the memory operations involved
(a failing atomic_cmpxchg, atomic_add_unless, etc. doesn't imply a barrier,
because it doesn't modify the target). See Documentation/atomic_ops.txt.
So remove the extra barriers and branches (see the sketch below).

- All architectures support atomic_cmpxchg. This has no relation to
__HAVE_ARCH_CMPXCHG. We can just take the atomic_cmpxchg path unconditionally.
This reduces a simple single-threaded fastpath lock+unlock test from 590 cycles
to 203 cycles on a ppc970 system.
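For illustration, the barrier rule the first point relies on, as a sketch in
kernel C (not part of the patch; the authoritative statement is in
Documentation/atomic_ops.txt):

#include <linux/atomic.h>	/* asm/atomic.h on 2008-era trees */

/*
 * Sketch only: atomic_dec_return() both modifies the target and
 * returns a value, so it acts as if bracketed by smp_mb() -- the
 * explicit barriers being deleted were redundant on success.
 */
static void lock_sketch(atomic_t *count)
{
	if (atomic_dec_return(count) < 0)
		return;		/* contended: would call fail_fn() */
	/* no smp_mb() needed here; the dec already implied one */
}

/*
 * A *failing* atomic_cmpxchg() performs no store and so implies no
 * barrier; only the successful swap orders memory. That is why only
 * the success path of a cmpxchg-based trylock counts as acquire.
 */
static int trylock_sketch(atomic_t *count)
{
	return atomic_cmpxchg(count, 1, 0) == 1;
}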
Signed-off-by: Nick Piggin <npiggin@suse.de>
---
Index: linux-2.6/include/asm-generic/mutex-dec.h
===================================================================
--- linux-2.6.orig/include/asm-generic/mutex-dec.h
+++ linux-2.6/include/asm-generic/mutex-dec.h
@@ -22,8 +22,6 @@ __mutex_fastpath_lock(atomic_t *count, v
{
if (unlikely(atomic_dec_return(count) < 0))
fail_fn(count);
- else
- smp_mb();
}
/**
@@ -41,10 +39,7 @@ __mutex_fastpath_lock_retval(atomic_t *c
{
if (unlikely(atomic_dec_return(count) < 0))
return fail_fn(count);
- else {
- smp_mb();
- return 0;
- }
+ return 0;
}
/**
@@ -63,7 +58,6 @@ __mutex_fastpath_lock_retval(atomic_t *c
static inline void
__mutex_fastpath_unlock(atomic_t *count, void (*fail_fn)(atomic_t *))
{
- smp_mb();
if (unlikely(atomic_inc_return(count) <= 0))
fail_fn(count);
}
@@ -98,15 +92,9 @@ __mutex_fastpath_trylock(atomic_t *count
* just as efficient (and simpler) as a 'destructive' probing of
* the mutex state would be.
*/
-#ifdef __HAVE_ARCH_CMPXCHG
- if (likely(atomic_cmpxchg(count, 1, 0) == 1)) {
- smp_mb();
+ if (likely(atomic_cmpxchg(count, 1, 0) == 1))
return 1;
- }
return 0;
-#else
- return fail_fn(count);
-#endif
}
#endif
Index: linux-2.6/include/asm-generic/mutex-xchg.h
===================================================================
--- linux-2.6.orig/include/asm-generic/mutex-xchg.h
+++ linux-2.6/include/asm-generic/mutex-xchg.h
@@ -27,8 +27,6 @@ __mutex_fastpath_lock(atomic_t *count, v
{
if (unlikely(atomic_xchg(count, 0) != 1))
fail_fn(count);
- else
- smp_mb();
}
/**
@@ -46,10 +44,7 @@ __mutex_fastpath_lock_retval(atomic_t *c
{
if (unlikely(atomic_xchg(count, 0) != 1))
return fail_fn(count);
- else {
- smp_mb();
- return 0;
- }
+ return 0;
}
/**
@@ -67,7 +62,6 @@ __mutex_fastpath_lock_retval(atomic_t *c
static inline void
__mutex_fastpath_unlock(atomic_t *count, void (*fail_fn)(atomic_t *))
{
- smp_mb();
if (unlikely(atomic_xchg(count, 1) != 0))
fail_fn(count);
}
@@ -110,7 +104,6 @@ __mutex_fastpath_trylock(atomic_t *count
if (prev < 0)
prev = 0;
}
- smp_mb();
return prev;
}
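For readability, this is roughly how the generic fastpaths read with the hunks
above applied (reconstructed from the diff context, so a sketch rather than the
exact file contents):

/* asm-generic/mutex-dec.h after the patch: */
static inline void
__mutex_fastpath_lock(atomic_t *count, void (*fail_fn)(atomic_t *))
{
	if (unlikely(atomic_dec_return(count) < 0))
		fail_fn(count);
}

static inline void
__mutex_fastpath_unlock(atomic_t *count, void (*fail_fn)(atomic_t *))
{
	if (unlikely(atomic_inc_return(count) <= 0))
		fail_fn(count);
}

static inline int
__mutex_fastpath_trylock(atomic_t *count, int (*fail_fn)(atomic_t *))
{
	/* the __HAVE_ARCH_CMPXCHG conditional is gone entirely */
	if (likely(atomic_cmpxchg(count, 1, 0) == 1))
		return 1;
	return 0;
}

/* asm-generic/mutex-xchg.h lock/unlock after the patch: */
static inline void
__mutex_fastpath_lock(atomic_t *count, void (*fail_fn)(atomic_t *))
{
	if (unlikely(atomic_xchg(count, 0) != 1))
		fail_fn(count);
}

static inline void
__mutex_fastpath_unlock(atomic_t *count, void (*fail_fn)(atomic_t *))
{
	if (unlikely(atomic_xchg(count, 1) != 0))
		fail_fn(count);
}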
* [patch] powerpc: implement optimised mutex fastpaths
From: Nick Piggin @ 2008-10-12 5:47 UTC
To: Ingo Molnar, linux-arch, linuxppc-dev, paulus, benh
Implement a more optimal mutex fastpath for powerpc, making use of acquire
and release barrier semantics. This takes the mutex lock+unlock benchmark
from 203 to 173 cycles on a G5.
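The benchmark itself isn't shown in the thread; a purely hypothetical sketch of
the kind of loop being timed (mftb() reads the timebase register, which ticks
slower than the core clock on a G5, so a real harness would convert ticks to
cycles or use the PMU):

#include <linux/mutex.h>
#include <asm/reg.h>		/* mftb() */

/* Hypothetical harness, not from the patch: average timebase ticks
 * for one uncontended lock+unlock pair. */
static unsigned long bench_lock_unlock(struct mutex *m, unsigned long iters)
{
	unsigned long start, i;

	start = mftb();
	for (i = 0; i < iters; i++) {
		mutex_lock(m);
		mutex_unlock(m);
	}
	return (mftb() - start) / iters;
}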
Signed-off-by: Nick Piggin <npiggin@suse.de>
---
Index: linux-2.6/arch/powerpc/include/asm/mutex.h
===================================================================
--- linux-2.6.orig/arch/powerpc/include/asm/mutex.h
+++ linux-2.6/arch/powerpc/include/asm/mutex.h
@@ -1,9 +1,136 @@
/*
- * Pull in the generic implementation for the mutex fastpath.
+ * Optimised mutex implementation of include/asm-generic/mutex-dec.h algorithm
+ */
+#ifndef _ASM_POWERPC_MUTEX_H
+#define _ASM_POWERPC_MUTEX_H
+
+static inline int __mutex_cmpxchg_lock(atomic_t *v, int old, int new)
+{
+ int t;
+
+ __asm__ __volatile__ (
+"1: lwarx %0,0,%1 # mutex trylock\n\
+ cmpw 0,%0,%2\n\
+ bne- 2f\n"
+ PPC405_ERR77(0,%1)
+" stwcx. %3,0,%1\n\
+ bne- 1b"
+ ISYNC_ON_SMP
+ "\n\
+2:"
+ : "=&r" (t)
+ : "r" (&v->counter), "r" (old), "r" (new)
+ : "cc", "memory");
+
+ return t;
+}
+
+static inline int __mutex_dec_return_lock(atomic_t *v)
+{
+ int t;
+
+ __asm__ __volatile__(
+"1: lwarx %0,0,%1 # mutex lock\n\
+ addic %0,%0,-1\n"
+ PPC405_ERR77(0,%1)
+" stwcx. %0,0,%1\n\
+ bne- 1b"
+ ISYNC_ON_SMP
+ : "=&r" (t)
+ : "r" (&v->counter)
+ : "cc", "memory");
+
+ return t;
+}
+
+static inline int __mutex_inc_return_unlock(atomic_t *v)
+{
+ int t;
+
+ __asm__ __volatile__(
+ LWSYNC_ON_SMP
+"1: lwarx %0,0,%1 # mutex unlock\n\
+ addic %0,%0,1\n"
+ PPC405_ERR77(0,%1)
+" stwcx. %0,0,%1 \n\
+ bne- 1b"
+ : "=&r" (t)
+ : "r" (&v->counter)
+ : "cc", "memory");
+
+ return t;
+}
+
+/**
+ * __mutex_fastpath_lock - try to take the lock by moving the count
+ * from 1 to a 0 value
+ * @count: pointer of type atomic_t
+ * @fail_fn: function to call if the original value was not 1
+ *
+ * Change the count from 1 to a value lower than 1, and call <fail_fn> if
+ * it wasn't 1 originally. This function MUST leave the value lower than
+ * 1 even when the "1" assertion wasn't true.
+ */
+static inline void
+__mutex_fastpath_lock(atomic_t *count, void (*fail_fn)(atomic_t *))
+{
+ if (unlikely(__mutex_dec_return_lock(count) < 0))
+ fail_fn(count);
+}
+
+/**
+ * __mutex_fastpath_lock_retval - try to take the lock by moving the count
+ * from 1 to a 0 value
+ * @count: pointer of type atomic_t
+ * @fail_fn: function to call if the original value was not 1
+ *
+ * Change the count from 1 to a value lower than 1, and call <fail_fn> if
+ * it wasn't 1 originally. This function returns 0 if the fastpath succeeds,
+ * or anything the slow path function returns.
+ */
+static inline int
+__mutex_fastpath_lock_retval(atomic_t *count, int (*fail_fn)(atomic_t *))
+{
+ if (unlikely(__mutex_dec_return_lock(count) < 0))
+ return fail_fn(count);
+ return 0;
+}
+
+/**
+ * __mutex_fastpath_unlock - try to promote the count from 0 to 1
+ * @count: pointer of type atomic_t
+ * @fail_fn: function to call if the original value was not 0
+ *
+ * Try to promote the count from 0 to 1. If it wasn't 0, call <fail_fn>.
+ * In the failure case, this function is allowed to either set the value to
+ * 1, or to set it to a value lower than 1.
+ */
+static inline void
+__mutex_fastpath_unlock(atomic_t *count, void (*fail_fn)(atomic_t *))
+{
+ if (unlikely(__mutex_inc_return_unlock(count) <= 0))
+ fail_fn(count);
+}
+
+#define __mutex_slowpath_needs_to_unlock() 1
+
+/**
+ * __mutex_fastpath_trylock - try to acquire the mutex, without waiting
+ *
+ * @count: pointer of type atomic_t
+ * @fail_fn: fallback function
*
- * TODO: implement optimized primitives instead, or leave the generic
- * implementation in place, or pick the atomic_xchg() based generic
- * implementation. (see asm-generic/mutex-xchg.h for details)
+ * Change the count from 1 to a value lower than 1, and return 0 (failure)
+ * if it wasn't 1 originally, or return 1 (success) otherwise. This function
+ * MUST leave the value lower than 1 even when the "1" assertion wasn't true.
+ * Additionally, if the value was < 0 originally, this function must not leave
+ * it to 0 on failure.
*/
+static inline int
+__mutex_fastpath_trylock(atomic_t *count, int (*fail_fn)(atomic_t *))
+{
+ if (likely(__mutex_cmpxchg_lock(count, 1, 0) == 1))
+ return 1;
+}
-#include <asm-generic/mutex-dec.h>
+#endif
* Re: [patch] powerpc: implement optimised mutex fastpaths
From: Nick Piggin @ 2008-10-13 1:18 UTC
To: Ingo Molnar, linux-arch, linuxppc-dev, paulus, benh
On Sun, Oct 12, 2008 at 07:47:32AM +0200, Nick Piggin wrote:
>
> Implement a more optimal mutex fastpath for powerpc, making use of acquire
> and release barrier semantics. This takes the mutex lock+unlock benchmark
> from 203 to 173 cycles on a G5.
>
> +static inline int
> +__mutex_fastpath_trylock(atomic_t *count, int (*fail_fn)(atomic_t *))
> +{
> + if (likely(__mutex_cmpxchg_lock(count, 1, 0) == 1))
> + return 1;
Oops, I must have sent the wrong version. This needs a return 0 here.
> +}
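With that fix folded in, the function reads:

static inline int
__mutex_fastpath_trylock(atomic_t *count, int (*fail_fn)(atomic_t *))
{
	if (likely(__mutex_cmpxchg_lock(count, 1, 0) == 1))
		return 1;
	return 0;	/* the missing failure return */
}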
* Re: [patch] powerpc: implement optimised mutex fastpaths
From: Scott Wood @ 2008-10-13 16:15 UTC
To: Nick Piggin; +Cc: linux-arch, linuxppc-dev, Ingo Molnar, paulus
On Sun, Oct 12, 2008 at 07:47:32AM +0200, Nick Piggin wrote:
> +static inline int __mutex_cmpxchg_lock(atomic_t *v, int old, int new)
> +{
> + int t;
> +
> + __asm__ __volatile__ (
> +"1: lwarx %0,0,%1 # mutex trylock\n\
> + cmpw 0,%0,%2\n\
> + bne- 2f\n"
> + PPC405_ERR77(0,%1)
> +" stwcx. %3,0,%1\n\
> + bne- 1b"
> + ISYNC_ON_SMP
> + "\n\
> +2:"
> + : "=&r" (t)
> + : "r" (&v->counter), "r" (old), "r" (new)
> + : "cc", "memory");
This will break if the compiler picks r0 for &v->counter. Use "b" as the
constraint, or better yet do "lwarx %0, %y1", with a "Z" (v->counter)
constraint.
-Scott
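A hypothetical sketch of the rework Scott is suggesting: the "Z" constraint
makes the counter a memory operand and %y1 prints it as the "base,index" pair
that lwarx/stwcx. take, so register allocation can never produce an unusable
base. The PPC405_ERR77 workaround is omitted for brevity, and as the follow-up
notes, the concern turns out not to apply to the original code anyway:

static inline int __mutex_cmpxchg_lock(atomic_t *v, int old, int new)
{
	int t;

	__asm__ __volatile__ (
"1:	lwarx	%0,%y1		# mutex trylock\n\
	cmpw	0,%0,%2\n\
	bne-	2f\n\
	stwcx.	%3,%y1\n\
	bne-	1b"
	ISYNC_ON_SMP
	"\n\
2:"
	: "=&r" (t), "+Z" (v->counter)
	: "r" (old), "r" (new)
	: "cc", "memory");

	return t;
}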
* Re: [patch] powerpc: implement optimised mutex fastpaths
From: Scott Wood @ 2008-10-13 16:20 UTC
To: Nick Piggin; +Cc: linux-arch, linuxppc-dev, Ingo Molnar, paulus
On Mon, Oct 13, 2008 at 11:15:47AM -0500, Scott Wood wrote:
> On Sun, Oct 12, 2008 at 07:47:32AM +0200, Nick Piggin wrote:
> > +static inline int __mutex_cmpxchg_lock(atomic_t *v, int old, int new)
> > +{
> > + int t;
> > +
> > + __asm__ __volatile__ (
> > +"1: lwarx %0,0,%1 # mutex trylock\n\
> > + cmpw 0,%0,%2\n\
> > + bne- 2f\n"
> > + PPC405_ERR77(0,%1)
> > +" stwcx. %3,0,%1\n\
> > + bne- 1b"
> > + ISYNC_ON_SMP
> > + "\n\
> > +2:"
> > + : "=&r" (t)
> > + : "r" (&v->counter), "r" (old), "r" (new)
> > + : "cc", "memory");
>
> This will break if the compiler picks r0 for &v->counter. Use "b" as the
> constraint, or better yet do "lwarx %0, %y1", with a "Z" (v->counter)
> constraint.
Sorry, had a morning brain-fart there -- you're not using the base
register.
-Scott
* Re: [patch] powerpc: implement optimised mutex fastpaths
From: Nick Piggin @ 2008-10-14 7:06 UTC
To: Scott Wood; +Cc: linux-arch, linuxppc-dev, Ingo Molnar, paulus
On Mon, Oct 13, 2008 at 11:20:20AM -0500, Scott Wood wrote:
> On Mon, Oct 13, 2008 at 11:15:47AM -0500, Scott Wood wrote:
> > On Sun, Oct 12, 2008 at 07:47:32AM +0200, Nick Piggin wrote:
> > > +static inline int __mutex_cmpxchg_lock(atomic_t *v, int old, int new)
> > > +{
> > > + int t;
> > > +
> > > + __asm__ __volatile__ (
> > > +"1: lwarx %0,0,%1 # mutex trylock\n\
> > > + cmpw 0,%0,%2\n\
> > > + bne- 2f\n"
> > > + PPC405_ERR77(0,%1)
> > > +" stwcx. %3,0,%1\n\
> > > + bne- 1b"
> > > + ISYNC_ON_SMP
> > > + "\n\
> > > +2:"
> > > + : "=&r" (t)
> > > + : "r" (&v->counter), "r" (old), "r" (new)
> > > + : "cc", "memory");
> >
> > This will break if the compiler picks r0 for &v->counter. Use "b" as the
> > constraint, or better yet do "lwarx %0, %y1", with a "Z" (v->counter)
> > constraint.
>
> Sorry, had a morning brain-fart there -- you're not using the base
> register.
OK no problem. Do you think it looks OK? Thanks for reviewing...
* Re: [patch] mutex: optimise generic mutex implementations
From: Benjamin Herrenschmidt @ 2008-10-14 8:35 UTC
To: Nick Piggin; +Cc: linux-arch, linuxppc-dev, Ingo Molnar, paulus
On Sun, 2008-10-12 at 07:46 +0200, Nick Piggin wrote:
> Speed up generic mutex implementations.
>
> - atomic operations which both modify the variable and return something imply
> full smp memory barriers before and after the memory operations involved
> (failing atomic_cmpxchg, atomic_add_unless, etc don't imply a barrier because
> they don't modify the target). See Documentation/atomic_ops.txt.
> So remove extra barriers and branches.
>
> - All architectures support atomic_cmpxchg. This has no relation to
> __HAVE_ARCH_CMPXCHG. We can just take the atomic_cmpxchg path unconditionally
>
> This reduces a simple single threaded fastpath lock+unlock test from 590 cycles
> to 203 cycles on a ppc970 system.
>
> Signed-off-by: Nick Piggin <npiggin@suse.de>
Looks ok.
Cheers,
Ben.
* Re: [patch] mutex: optimise generic mutex implementations
From: Ingo Molnar @ 2008-10-22 15:59 UTC
To: Nick Piggin; +Cc: linux-arch, linuxppc-dev, paulus, Peter Zijlstra
* Nick Piggin <npiggin@suse.de> wrote:
> Speed up generic mutex implementations.
>
> - atomic operations which both modify the variable and return something imply
> full smp memory barriers before and after the memory operations involved
> (failing atomic_cmpxchg, atomic_add_unless, etc don't imply a barrier because
> they don't modify the target). See Documentation/atomic_ops.txt.
> So remove extra barriers and branches.
>
> - All architectures support atomic_cmpxchg. This has no relation to
> __HAVE_ARCH_CMPXCHG. We can just take the atomic_cmpxchg path unconditionally
>
> This reduces a simple single threaded fastpath lock+unlock test from 590 cycles
> to 203 cycles on a ppc970 system.
>
> Signed-off-by: Nick Piggin <npiggin@suse.de>
no objections here. Let's merge these two patches via the ppc tree, so
that they get testing on real hardware as well?
Acked-by: Ingo Molnar <mingo@elte.hu>
Ingo
* Re: [patch] mutex: optimise generic mutex implementations
From: David Howells @ 2008-10-22 16:24 UTC
To: Nick Piggin; +Cc: linux-arch, linuxppc-dev, paulus, Ingo Molnar
Nick Piggin <npiggin@suse.de> wrote:
> Speed up generic mutex implementations.
>
> - atomic operations which both modify the variable and return something imply
> full smp memory barriers before and after the memory operations involved
> (failing atomic_cmpxchg, atomic_add_unless, etc don't imply a barrier because
> they don't modify the target). See Documentation/atomic_ops.txt.
> So remove extra barriers and branches.
>
> - All architectures support atomic_cmpxchg. This has no relation to
> __HAVE_ARCH_CMPXCHG. We can just take the atomic_cmpxchg path unconditionally
>
> This reduces a simple single threaded fastpath lock+unlock test from 590 cycles
> to 203 cycles on a ppc970 system.
>
> Signed-off-by: Nick Piggin <npiggin@suse.de>
This seems to work on FRV which uses the mutex-dec generic algorithm, though
you have to take that with a pinch of salt as I don't have SMP hardware for
it.
Acked-by: David Howells <dhowells@redhat.com>
* Re: [patch] mutex: optimise generic mutex implementations
From: Benjamin Herrenschmidt @ 2008-10-23 4:43 UTC
To: Ingo Molnar; +Cc: Nick Piggin, linux-arch, paulus, Peter Zijlstra, linuxppc-dev
On Wed, 2008-10-22 at 17:59 +0200, Ingo Molnar wrote:
> * Nick Piggin <npiggin@suse.de> wrote:
>
> > Speed up generic mutex implementations.
> >
> > - atomic operations which both modify the variable and return something imply
> > full smp memory barriers before and after the memory operations involved
> > (failing atomic_cmpxchg, atomic_add_unless, etc don't imply a barrier because
> > they don't modify the target). See Documentation/atomic_ops.txt.
> > So remove extra barriers and branches.
> >
> > - All architectures support atomic_cmpxchg. This has no relation to
> > __HAVE_ARCH_CMPXCHG. We can just take the atomic_cmpxchg path unconditionally
> >
> > This reduces a simple single threaded fastpath lock+unlock test from 590 cycles
> > to 203 cycles on a ppc970 system.
> >
> > Signed-off-by: Nick Piggin <npiggin@suse.de>
>
> no objections here. Lets merge these two patches via the ppc tree, so
> that it gets testing on real hardware as well?
>
> Acked-by: Ingo Molnar <mingo@elte.hu>
Alright, but in that case it will be after -rc1 unless I manage to sneak
something in tomorrow before Linus closes the merge window.
I can't get an update out today.
Cheers,
Ben.
* Re: [patch] mutex: optimise generic mutex implementations
2008-10-23 4:43 ` Benjamin Herrenschmidt
@ 2008-10-23 7:02 ` Nick Piggin
0 siblings, 0 replies; 13+ messages in thread
From: Nick Piggin @ 2008-10-23 7:02 UTC (permalink / raw)
To: Benjamin Herrenschmidt
Cc: linux-arch, linuxppc-dev, Ingo Molnar, paulus, Peter Zijlstra
On Thu, Oct 23, 2008 at 03:43:58PM +1100, Benjamin Herrenschmidt wrote:
> On Wed, 2008-10-22 at 17:59 +0200, Ingo Molnar wrote:
> > * Nick Piggin <npiggin@suse.de> wrote:
> >
> > > Speed up generic mutex implementations.
> > >
> > > - atomic operations which both modify the variable and return something imply
> > > full smp memory barriers before and after the memory operations involved
> > > (failing atomic_cmpxchg, atomic_add_unless, etc don't imply a barrier because
> > > they don't modify the target). See Documentation/atomic_ops.txt.
> > > So remove extra barriers and branches.
> > >
> > > - All architectures support atomic_cmpxchg. This has no relation to
> > > __HAVE_ARCH_CMPXCHG. We can just take the atomic_cmpxchg path unconditionally
> > >
> > > This reduces a simple single threaded fastpath lock+unlock test from 590 cycles
> > > to 203 cycles on a ppc970 system.
> > >
> > > Signed-off-by: Nick Piggin <npiggin@suse.de>
> >
> > no objections here. Lets merge these two patches via the ppc tree, so
> > that it gets testing on real hardware as well?
> >
> > Acked-by: Ingo Molnar <mingo@elte.hu>
>
> Allright but in that case it will be after -rc1 unless I manage to sneak
> something in tomorrow before linux closes the merge window.
>
> I can't get an update today.
Fine with me.
Thanks,
Nick
* Re: [patch] powerpc: implement optimised mutex fastpaths
From: Paul Mackerras @ 2008-11-06 4:09 UTC
To: Nick Piggin; +Cc: linux-arch, linuxppc-dev, Ingo Molnar
Nick Piggin writes:
> On Sun, Oct 12, 2008 at 07:47:32AM +0200, Nick Piggin wrote:
> >
> > Implement a more optimal mutex fastpath for powerpc, making use of acquire
> > and release barrier semantics. This takes the mutex lock+unlock benchmark
> > from 203 to 173 cycles on a G5.
> >
> > +static inline int
> > +__mutex_fastpath_trylock(atomic_t *count, int (*fail_fn)(atomic_t *))
> > +{
> > + if (likely(__mutex_cmpxchg_lock(count, 1, 0) == 1))
> > + return 1;
>
> Oops, I must have sent the wrong version. This needs a return 0 here.
Are you going to send the right version? (or did you send it and I
missed it?)
Also I note that the comment you added just before that routine says:
+ * Change the count from 1 to a value lower than 1, and return 0 (failure)
+ * if it wasn't 1 originally, or return 1 (success) otherwise. This function
+ * MUST leave the value lower than 1 even when the "1" assertion wasn't true.
yet just doing a __mutex_cmpxchg_lock isn't going to do that. So
please fix either the comment or the code (or both :).
Paul.
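One way to reconcile the two -- a sketch only, since the thread doesn't show
the final resolution -- is to fix the comment to match what cmpxchg actually
does on failure, namely leave the count untouched:

/**
 * __mutex_fastpath_trylock - try to acquire the mutex, without waiting
 * @count: pointer of type atomic_t
 * @fail_fn: fallback function (unused)
 *
 * Change the count from 1 to 0, and return 1 (success), or return 0
 * (failure) if it wasn't 1 originally, leaving the count untouched.
 */
static inline int
__mutex_fastpath_trylock(atomic_t *count, int (*fail_fn)(atomic_t *))
{
	if (likely(__mutex_cmpxchg_lock(count, 1, 0) == 1))
		return 1;
	return 0;
}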
* Re: [patch] powerpc: implement optimised mutex fastpaths
From: Nick Piggin @ 2008-11-06 5:06 UTC
To: Paul Mackerras; +Cc: linux-arch, linuxppc-dev, Ingo Molnar
On Thu, Nov 06, 2008 at 03:09:08PM +1100, Paul Mackerras wrote:
> Nick Piggin writes:
>
> > On Sun, Oct 12, 2008 at 07:47:32AM +0200, Nick Piggin wrote:
> > >
> > > Implement a more optimal mutex fastpath for powerpc, making use of acquire
> > > and release barrier semantics. This takes the mutex lock+unlock benchmark
> > > from 203 to 173 cycles on a G5.
> > >
> > > +static inline int
> > > +__mutex_fastpath_trylock(atomic_t *count, int (*fail_fn)(atomic_t *))
> > > +{
> > > + if (likely(__mutex_cmpxchg_lock(count, 1, 0) == 1))
> > > + return 1;
> >
> > Oops, I must have sent the wrong version. This needs a return 0 here.
>
> Are you going to send the right version? (or did you send it and I
> missed it?)
Hmm, I thought I did but perhaps not. Will do...
> Also I note that the comment you added just before that routine says:
>
> + * Change the count from 1 to a value lower than 1, and return 0 (failure)
> + * if it wasn't 1 originally, or return 1 (success) otherwise. This function
> + * MUST leave the value lower than 1 even when the "1" assertion wasn't true.
>
> yet just doing a __mutex_cmpxchg_lock isn't going to do that. So
> please fix either the comment or the code (or both :).
OK.