[PATCH v2] x86: Use asm-goto to implement mutex fast path on x86-64

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH v2] x86: Use asm-goto to implement mutex fast path on x86-64
@ 2013-06-28 11:50 Wedson Almeida Filho
  2013-06-28 14:14 ` H. Peter Anvin
  2013-07-05 14:24 ` [tip:x86/asm] " tip-bot for Wedson Almeida Filho
  0 siblings, 2 replies; 4+ messages in thread
From: Wedson Almeida Filho @ 2013-06-28 11:50 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Thomas Gleixner, H. Peter Anvin, x86, linux-kernel,
	Wedson Almeida Filho

The new implementation allows the compiler to better optimize the code; the
original implementation is still used when the kernel is compiled with older
versions of gcc that don't support asm-goto.

Compiling with gcc 4.7.3, the original mutex_lock() is 60 bytes with the fast
path taking 16 instructions; the new mutex_lock() is 42 bytes, with the fast
path taking 12 instructions.

The original mutex_unlock() is 24 bytes with the fast path taking 7
instructions; the new mutex_unlock() is 25 bytes (because the compiler used
a 2-byte ret) with the fast path taking 4 instructions.

The two versions of the functions are included below for reference.

Old:
ffffffff817742a0 <mutex_lock>:
ffffffff817742a0:       55                      push   %rbp
ffffffff817742a1:       48 89 e5                mov    %rsp,%rbp
ffffffff817742a4:       48 83 ec 10             sub    $0x10,%rsp
ffffffff817742a8:       48 89 5d f0             mov    %rbx,-0x10(%rbp)
ffffffff817742ac:       48 89 fb                mov    %rdi,%rbx
ffffffff817742af:       4c 89 65 f8             mov    %r12,-0x8(%rbp)
ffffffff817742b3:       e8 28 15 00 00          callq  ffffffff817757e0 <_cond_resched>
ffffffff817742b8:       48 89 df                mov    %rbx,%rdi
ffffffff817742bb:       f0 ff 0f                lock decl (%rdi)
ffffffff817742be:       79 05                   jns    ffffffff817742c5 <mutex_lock+0x25>
ffffffff817742c0:       e8 cb 04 00 00          callq  ffffffff81774790 <__mutex_lock_slowpath>
ffffffff817742c5:       65 48 8b 04 25 c0 b7    mov    %gs:0xb7c0,%rax
ffffffff817742cc:       00 00
ffffffff817742ce:       4c 8b 65 f8             mov    -0x8(%rbp),%r12
ffffffff817742d2:       48 89 43 18             mov    %rax,0x18(%rbx)
ffffffff817742d6:       48 8b 5d f0             mov    -0x10(%rbp),%rbx
ffffffff817742da:       c9                      leaveq
ffffffff817742db:       c3                      retq

ffffffff81774250 <mutex_unlock>:
ffffffff81774250:       55                      push   %rbp
ffffffff81774251:       48 c7 47 18 00 00 00    movq   $0x0,0x18(%rdi)
ffffffff81774258:       00
ffffffff81774259:       48 89 e5                mov    %rsp,%rbp
ffffffff8177425c:       f0 ff 07                lock incl (%rdi)
ffffffff8177425f:       7f 05                   jg     ffffffff81774266 <mutex_unlock+0x16>
ffffffff81774261:       e8 ea 04 00 00          callq  ffffffff81774750 <__mutex_unlock_slowpath>
ffffffff81774266:       5d                      pop    %rbp
ffffffff81774267:       c3                      retq

New:
ffffffff81774920 <mutex_lock>:
ffffffff81774920:       55                      push   %rbp
ffffffff81774921:       48 89 e5                mov    %rsp,%rbp
ffffffff81774924:       53                      push   %rbx
ffffffff81774925:       48 89 fb                mov    %rdi,%rbx
ffffffff81774928:       e8 a3 0e 00 00          callq  ffffffff817757d0 <_cond_resched>
ffffffff8177492d:       f0 ff 0b                lock decl (%rbx)
ffffffff81774930:       79 08                   jns    ffffffff8177493a <mutex_lock+0x1a>
ffffffff81774932:       48 89 df                mov    %rbx,%rdi
ffffffff81774935:       e8 16 fe ff ff          callq  ffffffff81774750 <__mutex_lock_slowpath>
ffffffff8177493a:       65 48 8b 04 25 c0 b7    mov    %gs:0xb7c0,%rax
ffffffff81774941:       00 00
ffffffff81774943:       48 89 43 18             mov    %rax,0x18(%rbx)
ffffffff81774947:       5b                      pop    %rbx
ffffffff81774948:       5d                      pop    %rbp
ffffffff81774949:       c3                      retq

ffffffff81774730 <mutex_unlock>:
ffffffff81774730:       48 c7 47 18 00 00 00    movq   $0x0,0x18(%rdi)
ffffffff81774737:       00
ffffffff81774738:       f0 ff 07                lock incl (%rdi)
ffffffff8177473b:       7f 0a                   jg     ffffffff81774747 <mutex_unlock+0x17>
ffffffff8177473d:       55                      push   %rbp
ffffffff8177473e:       48 89 e5                mov    %rsp,%rbp
ffffffff81774741:       e8 aa ff ff ff          callq  ffffffff817746f0 <__mutex_unlock_slowpath>
ffffffff81774746:       5d                      pop    %rbp
ffffffff81774747:       f3 c3                   repz retq

Signed-off-by: Wedson Almeida Filho <wedsonaf@gmail.com>
---
 arch/x86/include/asm/mutex_64.h |   30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/arch/x86/include/asm/mutex_64.h b/arch/x86/include/asm/mutex_64.h
index 68a87b0..c030bee 100644
--- a/arch/x86/include/asm/mutex_64.h
+++ b/arch/x86/include/asm/mutex_64.h
@@ -16,6 +16,20 @@
  *
  * Atomically decrements @v and calls <fail_fn> if the result is negative.
  */
+#ifdef CC_HAVE_ASM_GOTO
+static inline void __mutex_fastpath_lock(atomic_t *v,
+					 void (*fail_fn)(atomic_t *))
+{
+	asm volatile goto(LOCK_PREFIX "   decl %0\n"
+			  "   jns %l[exit]\n"
+			  : : "m" (v->counter)
+			  : "memory", "cc"
+			  : exit);
+	fail_fn(v);
+exit:
+	return;
+}
+#else
 #define __mutex_fastpath_lock(v, fail_fn)			\
 do {								\
 	unsigned long dummy;					\
@@ -32,6 +46,7 @@ do {								\
 		     : "rax", "rsi", "rdx", "rcx",		\
 		       "r8", "r9", "r10", "r11", "memory");	\
 } while (0)
+#endif
 
 /**
  *  __mutex_fastpath_lock_retval - try to take the lock by moving the count
@@ -59,6 +74,20 @@ static inline int __mutex_fastpath_lock_retval(atomic_t *count,
  *
  * Atomically increments @v and calls <fail_fn> if the result is nonpositive.
  */
+#ifdef CC_HAVE_ASM_GOTO
+static inline void __mutex_fastpath_unlock(atomic_t *v,
+					   void (*fail_fn)(atomic_t *))
+{
+	asm volatile goto(LOCK_PREFIX "   incl %0\n"
+			  "   jg %l[exit]\n"
+			  : : "m" (v->counter)
+			  : "memory", "cc"
+			  : exit);
+	fail_fn(v);
+exit:
+	return;
+}
+#else
 #define __mutex_fastpath_unlock(v, fail_fn)			\
 do {								\
 	unsigned long dummy;					\
@@ -75,6 +104,7 @@ do {								\
 		     : "rax", "rsi", "rdx", "rcx",		\
 		       "r8", "r9", "r10", "r11", "memory");	\
 } while (0)
+#endif
 
 #define __mutex_slowpath_needs_to_unlock()	1
 
-- 
1.7.9.5


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [PATCH v2] x86: Use asm-goto to implement mutex fast path on x86-64
  2013-06-28 11:50 [PATCH v2] x86: Use asm-goto to implement mutex fast path on x86-64 Wedson Almeida Filho
@ 2013-06-28 14:14 ` H. Peter Anvin
  2013-06-29  6:35   ` Wedson Almeida Filho
  2013-07-05 14:24 ` [tip:x86/asm] " tip-bot for Wedson Almeida Filho
  1 sibling, 1 reply; 4+ messages in thread
From: H. Peter Anvin @ 2013-06-28 14:14 UTC (permalink / raw)
  To: Wedson Almeida Filho; +Cc: Ingo Molnar, Thomas Gleixner, x86, linux-kernel

On 06/28/2013 04:50 AM, Wedson Almeida Filho wrote:
> The new implementation allows the compiler to better optimize the code; the
> original implementation is still used when the kernel is compiled with older
> versions of gcc that don't support asm-goto.
> 
> Compiling with gcc 4.7.3, the original mutex_lock() is 60 bytes with the fast
> path taking 16 instructions; the new mutex_lock() is 42 bytes, with the fast
> path taking 12 instructions.
> 
> The original mutex_unlock() is 24 bytes with the fast path taking 7
> instructions; the new mutex_unlock() is 25 bytes (because the compiler used
> a 2-byte ret) with the fast path taking 4 instructions.
> 
> The two versions of the functions are included below for reference.
> 

As Ingo said, looks very nice.  However, it is really too late for Linux
3.11, so I'm going to put it on a queue I already have for 3.12.

	-hpa



^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH v2] x86: Use asm-goto to implement mutex fast path on x86-64
  2013-06-28 14:14 ` H. Peter Anvin
@ 2013-06-29  6:35   ` Wedson Almeida Filho
  0 siblings, 0 replies; 4+ messages in thread
From: Wedson Almeida Filho @ 2013-06-29  6:35 UTC (permalink / raw)
  To: H. Peter Anvin; +Cc: Ingo Molnar, Thomas Gleixner, x86, linux-kernel

On Fri, Jun 28, 2013 at 7:14 AM, H. Peter Anvin <hpa@zytor.com> wrote:
>
> As Ingo said, looks very nice.  However, it is really too late for Linux
> 3.11, so I'm going to put it on a queue I already have for 3.12.

That's fine, I'm not in a hurry. Thanks.

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [tip:x86/asm] x86: Use asm-goto to implement mutex fast path on x86-64
  2013-06-28 11:50 [PATCH v2] x86: Use asm-goto to implement mutex fast path on x86-64 Wedson Almeida Filho
  2013-06-28 14:14 ` H. Peter Anvin
@ 2013-07-05 14:24 ` tip-bot for Wedson Almeida Filho
  1 sibling, 0 replies; 4+ messages in thread
From: tip-bot for Wedson Almeida Filho @ 2013-07-05 14:24 UTC (permalink / raw)
  To: linux-tip-commits; +Cc: linux-kernel, hpa, mingo, wedsonaf, tglx, hpa

Commit-ID:  00e55a790706223c903ce6a450c18596a7bc9be0
Gitweb:     http://git.kernel.org/tip/00e55a790706223c903ce6a450c18596a7bc9be0
Author:     Wedson Almeida Filho <wedsonaf@gmail.com>
AuthorDate: Fri, 28 Jun 2013 04:50:45 -0700
Committer:  H. Peter Anvin <hpa@linux.intel.com>
CommitDate: Fri, 28 Jun 2013 15:22:18 -0700

x86: Use asm-goto to implement mutex fast path on x86-64

The new implementation allows the compiler to better optimize the code; the
original implementation is still used when the kernel is compiled with older
versions of gcc that don't support asm-goto.

Compiling with gcc 4.7.3, the original mutex_lock() is 60 bytes with the fast
path taking 16 instructions; the new mutex_lock() is 42 bytes, with the fast
path taking 12 instructions.

The original mutex_unlock() is 24 bytes with the fast path taking 7
instructions; the new mutex_unlock() is 25 bytes (because the compiler used
a 2-byte ret) with the fast path taking 4 instructions.

The two versions of the functions are included below for reference.

Old:
ffffffff817742a0 <mutex_lock>:
ffffffff817742a0:       55                      push   %rbp
ffffffff817742a1:       48 89 e5                mov    %rsp,%rbp
ffffffff817742a4:       48 83 ec 10             sub    $0x10,%rsp
ffffffff817742a8:       48 89 5d f0             mov    %rbx,-0x10(%rbp)
ffffffff817742ac:       48 89 fb                mov    %rdi,%rbx
ffffffff817742af:       4c 89 65 f8             mov    %r12,-0x8(%rbp)
ffffffff817742b3:       e8 28 15 00 00          callq  ffffffff817757e0 <_cond_resched>
ffffffff817742b8:       48 89 df                mov    %rbx,%rdi
ffffffff817742bb:       f0 ff 0f                lock decl (%rdi)
ffffffff817742be:       79 05                   jns    ffffffff817742c5 <mutex_lock+0x25>
ffffffff817742c0:       e8 cb 04 00 00          callq  ffffffff81774790 <__mutex_lock_slowpath>
ffffffff817742c5:       65 48 8b 04 25 c0 b7    mov    %gs:0xb7c0,%rax
ffffffff817742cc:       00 00
ffffffff817742ce:       4c 8b 65 f8             mov    -0x8(%rbp),%r12
ffffffff817742d2:       48 89 43 18             mov    %rax,0x18(%rbx)
ffffffff817742d6:       48 8b 5d f0             mov    -0x10(%rbp),%rbx
ffffffff817742da:       c9                      leaveq
ffffffff817742db:       c3                      retq

ffffffff81774250 <mutex_unlock>:
ffffffff81774250:       55                      push   %rbp
ffffffff81774251:       48 c7 47 18 00 00 00    movq   $0x0,0x18(%rdi)
ffffffff81774258:       00
ffffffff81774259:       48 89 e5                mov    %rsp,%rbp
ffffffff8177425c:       f0 ff 07                lock incl (%rdi)
ffffffff8177425f:       7f 05                   jg     ffffffff81774266 <mutex_unlock+0x16>
ffffffff81774261:       e8 ea 04 00 00          callq  ffffffff81774750 <__mutex_unlock_slowpath>
ffffffff81774266:       5d                      pop    %rbp
ffffffff81774267:       c3                      retq

New:
ffffffff81774920 <mutex_lock>:
ffffffff81774920:       55                      push   %rbp
ffffffff81774921:       48 89 e5                mov    %rsp,%rbp
ffffffff81774924:       53                      push   %rbx
ffffffff81774925:       48 89 fb                mov    %rdi,%rbx
ffffffff81774928:       e8 a3 0e 00 00          callq  ffffffff817757d0 <_cond_resched>
ffffffff8177492d:       f0 ff 0b                lock decl (%rbx)
ffffffff81774930:       79 08                   jns    ffffffff8177493a <mutex_lock+0x1a>
ffffffff81774932:       48 89 df                mov    %rbx,%rdi
ffffffff81774935:       e8 16 fe ff ff          callq  ffffffff81774750 <__mutex_lock_slowpath>
ffffffff8177493a:       65 48 8b 04 25 c0 b7    mov    %gs:0xb7c0,%rax
ffffffff81774941:       00 00
ffffffff81774943:       48 89 43 18             mov    %rax,0x18(%rbx)
ffffffff81774947:       5b                      pop    %rbx
ffffffff81774948:       5d                      pop    %rbp
ffffffff81774949:       c3                      retq

ffffffff81774730 <mutex_unlock>:
ffffffff81774730:       48 c7 47 18 00 00 00    movq   $0x0,0x18(%rdi)
ffffffff81774737:       00
ffffffff81774738:       f0 ff 07                lock incl (%rdi)
ffffffff8177473b:       7f 0a                   jg     ffffffff81774747 <mutex_unlock+0x17>
ffffffff8177473d:       55                      push   %rbp
ffffffff8177473e:       48 89 e5                mov    %rsp,%rbp
ffffffff81774741:       e8 aa ff ff ff          callq  ffffffff817746f0 <__mutex_unlock_slowpath>
ffffffff81774746:       5d                      pop    %rbp
ffffffff81774747:       f3 c3                   repz retq

Signed-off-by: Wedson Almeida Filho <wedsonaf@gmail.com>
Link: http://lkml.kernel.org/r/1372420245-60021-1-git-send-email-wedsonaf@gmail.com
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/include/asm/mutex_64.h | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/arch/x86/include/asm/mutex_64.h b/arch/x86/include/asm/mutex_64.h
index 68a87b0..c030bee 100644
--- a/arch/x86/include/asm/mutex_64.h
+++ b/arch/x86/include/asm/mutex_64.h
@@ -16,6 +16,20 @@
  *
  * Atomically decrements @v and calls <fail_fn> if the result is negative.
  */
+#ifdef CC_HAVE_ASM_GOTO
+static inline void __mutex_fastpath_lock(atomic_t *v,
+					 void (*fail_fn)(atomic_t *))
+{
+	asm volatile goto(LOCK_PREFIX "   decl %0\n"
+			  "   jns %l[exit]\n"
+			  : : "m" (v->counter)
+			  : "memory", "cc"
+			  : exit);
+	fail_fn(v);
+exit:
+	return;
+}
+#else
 #define __mutex_fastpath_lock(v, fail_fn)			\
 do {								\
 	unsigned long dummy;					\
@@ -32,6 +46,7 @@ do {								\
 		     : "rax", "rsi", "rdx", "rcx",		\
 		       "r8", "r9", "r10", "r11", "memory");	\
 } while (0)
+#endif
 
 /**
  *  __mutex_fastpath_lock_retval - try to take the lock by moving the count
@@ -59,6 +74,20 @@ static inline int __mutex_fastpath_lock_retval(atomic_t *count,
  *
  * Atomically increments @v and calls <fail_fn> if the result is nonpositive.
  */
+#ifdef CC_HAVE_ASM_GOTO
+static inline void __mutex_fastpath_unlock(atomic_t *v,
+					   void (*fail_fn)(atomic_t *))
+{
+	asm volatile goto(LOCK_PREFIX "   incl %0\n"
+			  "   jg %l[exit]\n"
+			  : : "m" (v->counter)
+			  : "memory", "cc"
+			  : exit);
+	fail_fn(v);
+exit:
+	return;
+}
+#else
 #define __mutex_fastpath_unlock(v, fail_fn)			\
 do {								\
 	unsigned long dummy;					\
@@ -75,6 +104,7 @@ do {								\
 		     : "rax", "rsi", "rdx", "rcx",		\
 		       "r8", "r9", "r10", "r11", "memory");	\
 } while (0)
+#endif
 
 #define __mutex_slowpath_needs_to_unlock()	1
 

^ permalink raw reply related	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2013-07-05 14:25 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-06-28 11:50 [PATCH v2] x86: Use asm-goto to implement mutex fast path on x86-64 Wedson Almeida Filho
2013-06-28 14:14 ` H. Peter Anvin
2013-06-29  6:35   ` Wedson Almeida Filho
2013-07-05 14:24 ` [tip:x86/asm] " tip-bot for Wedson Almeida Filho

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.