public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH][2.6] Completely out of line spinlocks / x86_64
@ 2004-08-08  4:50 Zwane Mwaikambo
  2004-08-08  6:08 ` Zwane Mwaikambo
  0 siblings, 1 reply; 7+ messages in thread
From: Zwane Mwaikambo @ 2004-08-08  4:50 UTC (permalink / raw)
  To: Linux Kernel; +Cc: Andrew Morton, Andi Kleen, Matt Mackall

Pulled from the -tiny tree, the focus of this patch is for reduced kernel
image size, the shared spinlock text may be more of a benefit later on
when multicore packages with shared caches arrive but should benefit
em64t systems with HT enabled. I've set the config option to y so
that it gets a bit of runtime, but it can be left as a config option.

Tested on 4x logical em64t.

   text    data     bss     dec     hex filename
3870057 1350301  508048 5728406  576896 vmlinux-after
3885801 1352134  508048 5745983  57ad3f vmlinux-before

 arch/x86_64/Kconfig              |   10 ++++++
 arch/x86_64/kernel/x8664_ksyms.c |   11 +++++++
 arch/x86_64/lib/Makefile         |    1
 arch/x86_64/lib/spinlock.c       |   57 +++++++++++++++++++++++++++++++++++++++
 include/asm-x86_64/spinlock.h    |   22 +++++++++++++--
 5 files changed, 99 insertions(+), 2 deletions(-)

Signed-off-by: Zwane Mwaikambo <zwane@linuxpower.ca>

Index: linux-2.6.8-rc3-mm1-amd64/arch/x86_64/Kconfig
===================================================================
RCS file: /home/cvsroot/linux-2.6.8-rc3-mm1/arch/x86_64/Kconfig,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 Kconfig
--- linux-2.6.8-rc3-mm1-amd64/arch/x86_64/Kconfig	5 Aug 2004 16:37:48 -0000	1.1.1.1
+++ linux-2.6.8-rc3-mm1-amd64/arch/x86_64/Kconfig	7 Aug 2004 22:47:30 -0000
@@ -438,6 +438,16 @@ config DEBUG_SPINLOCK
 	  best used in conjunction with the NMI watchdog so that spinlock
 	  deadlocks are also debuggable.

+config COOL_SPINLOCK
+	bool "Completely out of line spinlocks"
+	depends on SMP
+	default y
+	help
+	  Say Y here to build spinlocks which have common text for contended
+	  and uncontended paths. This reduces kernel text size by at least
+	  50k on most configurations, plus there is the additional benefit
+	  of better cache utilisation.
+
 # !SMP for now because the context switch early causes GPF in segment reloading
 # and the GS base checking does the wrong thing then, causing a hang.
 config CHECKING
Index: linux-2.6.8-rc3-mm1-amd64/arch/x86_64/kernel/x8664_ksyms.c
===================================================================
RCS file: /home/cvsroot/linux-2.6.8-rc3-mm1/arch/x86_64/kernel/x8664_ksyms.c,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 x8664_ksyms.c
--- linux-2.6.8-rc3-mm1-amd64/arch/x86_64/kernel/x8664_ksyms.c	5 Aug 2004 16:37:48 -0000	1.1.1.1
+++ linux-2.6.8-rc3-mm1-amd64/arch/x86_64/kernel/x8664_ksyms.c	7 Aug 2004 22:41:34 -0000
@@ -220,3 +220,14 @@ EXPORT_SYMBOL(flush_tlb_page);
 EXPORT_SYMBOL_GPL(flush_tlb_all);
 #endif

+#ifdef CONFIG_COOL_SPINLOCK
+extern void asmlinkage __spin_lock_failed(spinlock_t *);
+extern void asmlinkage __spin_lock_failed_flags(spinlock_t *, unsigned long);
+extern void asmlinkage __spin_lock_loop(spinlock_t *);
+extern void asmlinkage __spin_lock_loop_flags(spinlock_t *, unsigned long);
+EXPORT_SYMBOL(__spin_lock_failed);
+EXPORT_SYMBOL(__spin_lock_failed_flags);
+EXPORT_SYMBOL(__spin_lock_loop);
+EXPORT_SYMBOL(__spin_lock_loop_flags);
+#endif
+
Index: linux-2.6.8-rc3-mm1-amd64/arch/x86_64/lib/Makefile
===================================================================
RCS file: /home/cvsroot/linux-2.6.8-rc3-mm1/arch/x86_64/lib/Makefile,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 Makefile
--- linux-2.6.8-rc3-mm1-amd64/arch/x86_64/lib/Makefile	5 Aug 2004 16:37:48 -0000	1.1.1.1
+++ linux-2.6.8-rc3-mm1-amd64/arch/x86_64/lib/Makefile	7 Aug 2004 22:32:44 -0000
@@ -13,3 +13,4 @@ lib-y += memcpy.o memmove.o memset.o cop

 lib-$(CONFIG_HAVE_DEC_LOCK) += dec_and_lock.o
 lib-$(CONFIG_KGDB) += kgdb_serial.o
+lib-$(CONFIG_COOL_SPINLOCK) += spinlock.o
Index: linux-2.6.8-rc3-mm1-amd64/arch/x86_64/lib/spinlock.c
===================================================================
RCS file: linux-2.6.8-rc3-mm1-amd64/arch/x86_64/lib/spinlock.c
diff -N linux-2.6.8-rc3-mm1-amd64/arch/x86_64/lib/spinlock.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ linux-2.6.8-rc3-mm1-amd64/arch/x86_64/lib/spinlock.c	7 Aug 2004 23:00:34 -0000
@@ -0,0 +1,57 @@
+#define PROC(name)	\
+	".align 4\n"	\
+	".globl " #name"\n" \
+	#name":\n"
+
+asm (PROC(__spin_lock_failed_flags)
+	"test $0x200, %rbx\n"
+	"jz 1f\n"
+	"sti\n"
+	"1:\n\t"
+	"rep; nop\n"
+	"cmpb $0, (%rax)\n"
+	"jle 1b\n"
+	"cli\n"
+	"lock; decb (%rax)\n\t"
+	"js __spin_lock_failed_flags\n\t"
+	"nop\n"
+	"ret\n"
+);
+
+asm (PROC(__spin_lock_loop_flags)
+	"lock; decb (%rax)\n\t"
+	"js 1f\n\t"
+	"nop\n\t"
+	"ret\n\t"
+	"1:\n\t"
+	"test $0x200, %rbx\n\t"
+	"jz 1f\n\t"
+	"sti\n\t"
+	"2: rep; nop\n\t"
+	"cmpb $0, (%rax)\n\t"
+	"jle 2b\n\t"
+	"cli\n\t"
+	"jmp __spin_lock_loop_flags\n\t"
+);
+
+asm (PROC(__spin_lock_failed)
+	"rep; nop\n\t"
+	"cmpb $0, (%rax)\n\t"
+	"jle __spin_lock_failed\n\t"
+	"lock; decb (%rax)\n\t"
+	"js __spin_lock_failed\n\t"
+	"nop\n\t"
+	"ret\n\t"
+);
+
+asm (PROC(__spin_lock_loop)
+	"lock; decb (%rax)\n\t"
+	"js 1f\n\t"
+	"nop\n\t"
+	"ret\n\t"
+	"1: rep; nop\n\t"
+	"cmpb $0, (%rax)\n\t"
+	"jle 1b\n\t"
+	"jmp __spin_lock_loop\n\t"
+);
+
Index: linux-2.6.8-rc3-mm1-amd64/include/asm-x86_64/spinlock.h
===================================================================
RCS file: /home/cvsroot/linux-2.6.8-rc3-mm1/include/asm-x86_64/spinlock.h,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 spinlock.h
--- linux-2.6.8-rc3-mm1-amd64/include/asm-x86_64/spinlock.h	5 Aug 2004 16:37:48 -0000	1.1.1.1
+++ linux-2.6.8-rc3-mm1-amd64/include/asm-x86_64/spinlock.h	7 Aug 2004 22:48:13 -0000
@@ -42,6 +42,13 @@ typedef struct {
 #define spin_is_locked(x)	(*(volatile signed char *)(&(x)->lock) <= 0)
 #define spin_unlock_wait(x)	do { barrier(); } while(spin_is_locked(x))

+#ifdef CONFIG_COOL_SPINLOCK
+#define spin_lock_string \
+	"call __spin_lock_loop\n\t"
+
+#define spin_lock_string_flags \
+	"call __spin_lock_loop_flags\n\t"
+#else
 #define spin_lock_string \
 	"\n1:\t" \
 	"lock ; decb %0\n\t" \
@@ -70,6 +77,7 @@ typedef struct {
 	"cli\n\t" \
 	"jmp 1b\n" \
 	LOCK_SECTION_END
+#endif

 /*
  * This works. Despite all the confusion.
@@ -138,7 +146,12 @@ printk("eip: %p\n", &&here);
 #endif
 	__asm__ __volatile__(
 		spin_lock_string
-		:"=m" (lock->lock) : : "memory");
+#ifdef CONFIG_COOL_SPINLOCK
+		: : "a" (&lock->lock) : "memory"
+#else
+		:"=m" (lock->lock) : : "memory"
+#endif
+	);
 }

 static inline void _raw_spin_lock_flags (spinlock_t *lock, unsigned long flags)
@@ -152,7 +165,12 @@ here:
 	}
 #endif
 	__asm__ __volatile__(spin_lock_string_flags
-		:"=m" (lock->lock) : "r" (flags) : "memory");
+#ifdef CONFIG_COOL_SPINLOCK
+		: : "a" (&lock->lock), "b" (flags) : "memory"
+#else
+		:"=m" (lock->lock) : "r" (flags) : "memory"
+#endif
+	);
 }



^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH][2.6] Completely out of line spinlocks / x86_64
  2004-08-08  4:50 [PATCH][2.6] Completely out of line spinlocks / x86_64 Zwane Mwaikambo
@ 2004-08-08  6:08 ` Zwane Mwaikambo
  2004-08-09 11:23   ` Andi Kleen
  0 siblings, 1 reply; 7+ messages in thread
From: Zwane Mwaikambo @ 2004-08-08  6:08 UTC (permalink / raw)
  To: Linux Kernel; +Cc: Andrew Morton, Andi Kleen, Matt Mackall

On Sun, 8 Aug 2004, Zwane Mwaikambo wrote:

> Pulled from the -tiny tree, the focus of this patch is for reduced kernel
> image size, the shared spinlock text may be more of a benefit later on
> when multicore packages with shared caches arrive but should benefit
> em64t systems with HT enabled. I've set the config option to y so
> that it gets a bit of runtime, but it can be left as a config option.
>
> Tested on 4x logical em64t.
>
>    text    data     bss     dec     hex filename
> 3870057 1350301  508048 5728406  576896 vmlinux-after
> 3885801 1352134  508048 5745983  57ad3f vmlinux-before

A few changes from the i386 thread.

 arch/x86_64/Kconfig           |   10 ++++++++++
 arch/x86_64/lib/Makefile      |    1 +
 arch/x86_64/lib/spinlock.c    |   38 ++++++++++++++++++++++++++++++++++++++
 include/asm-x86_64/spinlock.h |   22 ++++++++++++++++++++--
 4 files changed, 69 insertions(+), 2 deletions(-)

Index: linux-2.6.8-rc3-mm1-amd64/arch/x86_64/Kconfig
===================================================================
RCS file: /home/cvsroot/linux-2.6.8-rc3-mm1/arch/x86_64/Kconfig,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 Kconfig
--- linux-2.6.8-rc3-mm1-amd64/arch/x86_64/Kconfig	5 Aug 2004 16:37:48 -0000	1.1.1.1
+++ linux-2.6.8-rc3-mm1-amd64/arch/x86_64/Kconfig	7 Aug 2004 22:47:30 -0000
@@ -438,6 +438,16 @@ config DEBUG_SPINLOCK
 	  best used in conjunction with the NMI watchdog so that spinlock
 	  deadlocks are also debuggable.

+config COOL_SPINLOCK
+	bool "Completely out of line spinlocks"
+	depends on SMP
+	default y
+	help
+	  Say Y here to build spinlocks which have common text for contended
+	  and uncontended paths. This reduces kernel text size by at least
+	  50k on most configurations, plus there is the additional benefit
+	  of better cache utilisation.
+
 # !SMP for now because the context switch early causes GPF in segment reloading
 # and the GS base checking does the wrong thing then, causing a hang.
 config CHECKING
Index: linux-2.6.8-rc3-mm1-amd64/arch/x86_64/lib/Makefile
===================================================================
RCS file: /home/cvsroot/linux-2.6.8-rc3-mm1/arch/x86_64/lib/Makefile,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 Makefile
--- linux-2.6.8-rc3-mm1-amd64/arch/x86_64/lib/Makefile	5 Aug 2004 16:37:48 -0000	1.1.1.1
+++ linux-2.6.8-rc3-mm1-amd64/arch/x86_64/lib/Makefile	7 Aug 2004 22:32:44 -0000
@@ -13,3 +13,4 @@ lib-y += memcpy.o memmove.o memset.o cop

 lib-$(CONFIG_HAVE_DEC_LOCK) += dec_and_lock.o
 lib-$(CONFIG_KGDB) += kgdb_serial.o
+lib-$(CONFIG_COOL_SPINLOCK) += spinlock.o
Index: linux-2.6.8-rc3-mm1-amd64/arch/x86_64/lib/spinlock.c
===================================================================
RCS file: linux-2.6.8-rc3-mm1-amd64/arch/x86_64/lib/spinlock.c
diff -N linux-2.6.8-rc3-mm1-amd64/arch/x86_64/lib/spinlock.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ linux-2.6.8-rc3-mm1-amd64/arch/x86_64/lib/spinlock.c	8 Aug 2004 05:39:04 -0000
@@ -0,0 +1,38 @@
+#include <linux/module.h>
+
+#define PROC(name)	\
+	".align 4\n"	\
+	".globl " #name"\n" \
+	#name":\n"
+
+asm (PROC(__spin_lock_loop_flags)
+	"lock; decb (%rax)\n\t"
+	"js 1f\n\t"
+	"nop\n\t"
+	"ret\n\t"
+	"1:\n\t"
+	"test $0x200, %rbx\n\t"
+	"jz 1f\n\t"
+	"sti\n\t"
+	"2: rep; nop\n\t"
+	"cmpb $0, (%rax)\n\t"
+	"jle 2b\n\t"
+	"cli\n\t"
+	"jmp __spin_lock_loop_flags\n\t"
+);
+
+asm (PROC(__spin_lock_loop)
+	"lock; decb (%rax)\n\t"
+	"js 1f\n\t"
+	"nop\n\t"
+	"ret\n\t"
+	"1: rep; nop\n\t"
+	"cmpb $0, (%rax)\n\t"
+	"jle 1b\n\t"
+	"jmp __spin_lock_loop\n\t"
+);
+
+void __spin_lock_loop_flags(void);
+void __spin_lock_loop(void);
+EXPORT_SYMBOL(__spin_lock_loop_flags);
+EXPORT_SYMBOL(__spin_lock_loop);
Index: linux-2.6.8-rc3-mm1-amd64/include/asm-x86_64/spinlock.h
===================================================================
RCS file: /home/cvsroot/linux-2.6.8-rc3-mm1/include/asm-x86_64/spinlock.h,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 spinlock.h
--- linux-2.6.8-rc3-mm1-amd64/include/asm-x86_64/spinlock.h	5 Aug 2004 16:37:48 -0000	1.1.1.1
+++ linux-2.6.8-rc3-mm1-amd64/include/asm-x86_64/spinlock.h	8 Aug 2004 05:19:41 -0000
@@ -42,6 +42,13 @@ typedef struct {
 #define spin_is_locked(x)	(*(volatile signed char *)(&(x)->lock) <= 0)
 #define spin_unlock_wait(x)	do { barrier(); } while(spin_is_locked(x))

+#ifdef CONFIG_COOL_SPINLOCK
+#define spin_lock_string \
+	"call __spin_lock_loop\n\t"
+
+#define spin_lock_string_flags \
+	"call __spin_lock_loop_flags\n\t"
+#else
 #define spin_lock_string \
 	"\n1:\t" \
 	"lock ; decb %0\n\t" \
@@ -70,6 +77,7 @@ typedef struct {
 	"cli\n\t" \
 	"jmp 1b\n" \
 	LOCK_SECTION_END
+#endif

 /*
  * This works. Despite all the confusion.
@@ -138,7 +146,12 @@ printk("eip: %p\n", &&here);
 #endif
 	__asm__ __volatile__(
 		spin_lock_string
-		:"=m" (lock->lock) : : "memory");
+#ifdef CONFIG_COOL_SPINLOCK
+		: : "a" (&lock->lock) : "memory"
+#else
+		:"=m" (lock->lock) : : "memory"
+#endif
+	);
 }

 static inline void _raw_spin_lock_flags (spinlock_t *lock, unsigned long flags)
@@ -152,7 +165,12 @@ here:
 	}
 #endif
 	__asm__ __volatile__(spin_lock_string_flags
-		:"=m" (lock->lock) : "r" (flags) : "memory");
+#ifdef CONFIG_COOL_SPINLOCK
+		: : "a" (&lock->lock), "d" (flags) : "memory"
+#else
+		:"=m" (lock->lock) : "r" (flags) : "memory"
+#endif
+	);
 }



^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH][2.6] Completely out of line spinlocks / x86_64
  2004-08-08  6:08 ` Zwane Mwaikambo
@ 2004-08-09 11:23   ` Andi Kleen
  2004-08-09 11:41     ` Marcelo Tosatti
  2004-08-09 21:31     ` Zwane Mwaikambo
  0 siblings, 2 replies; 7+ messages in thread
From: Andi Kleen @ 2004-08-09 11:23 UTC (permalink / raw)
  To: Zwane Mwaikambo; +Cc: linux-kernel, akpm, mpm

On Sun, 8 Aug 2004 02:08:30 -0400 (EDT)
Zwane Mwaikambo <zwane@linuxpower.ca> wrote:

>  arch/x86_64/Kconfig           |   10 ++++++++++
>  arch/x86_64/lib/Makefile      |    1 +
>  arch/x86_64/lib/spinlock.c    |   38 ++++++++++++++++++++++++++++++++++++++
>  include/asm-x86_64/spinlock.h |   22 ++++++++++++++++++++--
>  4 files changed, 69 insertions(+), 2 deletions(-)
> 
> Index: linux-2.6.8-rc3-mm1-amd64/arch/x86_64/Kconfig
> ===================================================================
> RCS file: /home/cvsroot/linux-2.6.8-rc3-mm1/arch/x86_64/Kconfig,v
> retrieving revision 1.1.1.1
> diff -u -p -B -r1.1.1.1 Kconfig
> --- linux-2.6.8-rc3-mm1-amd64/arch/x86_64/Kconfig	5 Aug 2004 16:37:48 -0000	1.1.1.1
> +++ linux-2.6.8-rc3-mm1-amd64/arch/x86_64/Kconfig	7 Aug 2004 22:47:30 -0000
> @@ -438,6 +438,16 @@ config DEBUG_SPINLOCK
>  	  best used in conjunction with the NMI watchdog so that spinlock
>  	  deadlocks are also debuggable.
> 
> +config COOL_SPINLOCK
> +	bool "Completely out of line spinlocks"
> +	depends on SMP
> +	default y
> +	help
> +	  Say Y here to build spinlocks which have common text for contended
> +	  and uncontended paths. This reduces kernel text size by at least
> +	  50k on most configurations, plus there is the additional benefit
> +	  of better cache utilisation.

I think the 50k number is wrong. I took a look at it and the big 
difference is only seen when you enable interrupts during spinning, which
we didn't do before.  If you compare it to the old implementation the
difference is much less.

I don't really like the config option. Either it's a good idea
then it should be done by default without option or it should not be done at all.

Did you do any lock intensive benchmarks that could show a slowdown?

> Index: linux-2.6.8-rc3-mm1-amd64/arch/x86_64/lib/spinlock.c
> ===================================================================
> RCS file: linux-2.6.8-rc3-mm1-amd64/arch/x86_64/lib/spinlock.c
> diff -N linux-2.6.8-rc3-mm1-amd64/arch/x86_64/lib/spinlock.c
> --- /dev/null	1 Jan 1970 00:00:00 -0000
> +++ linux-2.6.8-rc3-mm1-amd64/arch/x86_64/lib/spinlock.c	8 Aug 2004 05:39:04 -0000
> @@ -0,0 +1,38 @@
> +#include <linux/module.h>

You should make this file assembly only. 


-Andi

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH][2.6] Completely out of line spinlocks / x86_64
  2004-08-09 11:23   ` Andi Kleen
@ 2004-08-09 11:41     ` Marcelo Tosatti
  2004-08-09 11:49       ` Marcelo Tosatti
  2004-08-09 21:31     ` Zwane Mwaikambo
  1 sibling, 1 reply; 7+ messages in thread
From: Marcelo Tosatti @ 2004-08-09 11:41 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Zwane Mwaikambo, linux-kernel, akpm, mpm

On Mon, Aug 09, 2004 at 01:23:08PM +0200, Andi Kleen wrote:
> On Sun, 8 Aug 2004 02:08:30 -0400 (EDT)
> Zwane Mwaikambo <zwane@linuxpower.ca> wrote:
> 
> >  arch/x86_64/Kconfig           |   10 ++++++++++
> >  arch/x86_64/lib/Makefile      |    1 +
> >  arch/x86_64/lib/spinlock.c    |   38 ++++++++++++++++++++++++++++++++++++++
> >  include/asm-x86_64/spinlock.h |   22 ++++++++++++++++++++--
> >  4 files changed, 69 insertions(+), 2 deletions(-)
> > 
> > Index: linux-2.6.8-rc3-mm1-amd64/arch/x86_64/Kconfig
> > ===================================================================
> > RCS file: /home/cvsroot/linux-2.6.8-rc3-mm1/arch/x86_64/Kconfig,v
> > retrieving revision 1.1.1.1
> > diff -u -p -B -r1.1.1.1 Kconfig
> > --- linux-2.6.8-rc3-mm1-amd64/arch/x86_64/Kconfig	5 Aug 2004 16:37:48 -0000	1.1.1.1
> > +++ linux-2.6.8-rc3-mm1-amd64/arch/x86_64/Kconfig	7 Aug 2004 22:47:30 -0000
> > @@ -438,6 +438,16 @@ config DEBUG_SPINLOCK
> >  	  best used in conjunction with the NMI watchdog so that spinlock
> >  	  deadlocks are also debuggable.
> > 
> > +config COOL_SPINLOCK
> > +	bool "Completely out of line spinlocks"
> > +	depends on SMP
> > +	default y
> > +	help
> > +	  Say Y here to build spinlocks which have common text for contended
> > +	  and uncontended paths. This reduces kernel text size by at least
> > +	  50k on most configurations, plus there is the additional benefit
> > +	  of better cache utilisation.
> 
> I think the 50k number is wrong. I took a look at it and the big 
> difference is only seen when you enable interrupts during spinning, which
> we didn't do before.  If you compare it to the old implementation the
> difference is much less.
> 
> I don't really like the config option. Either it's a good idea
> then it should be done by default without option or it should not be done at all.
> 
> Did you do any lock intensive benchmarks that could show a slowdown?

Out of curiosity, also, have you ran any lock intensive benchmarks to get some
numbers out of the increased cacheline hits due to uninlining? 

I think you can measure the hits/misses precisely with Mikael's perfcounters.

> 
> > Index: linux-2.6.8-rc3-mm1-amd64/arch/x86_64/lib/spinlock.c
> > ===================================================================
> > RCS file: linux-2.6.8-rc3-mm1-amd64/arch/x86_64/lib/spinlock.c
> > diff -N linux-2.6.8-rc3-mm1-amd64/arch/x86_64/lib/spinlock.c
> > --- /dev/null	1 Jan 1970 00:00:00 -0000
> > +++ linux-2.6.8-rc3-mm1-amd64/arch/x86_64/lib/spinlock.c	8 Aug 2004 05:39:04 -0000
> > @@ -0,0 +1,38 @@
> > +#include <linux/module.h>
> 
> You should make this file assembly only. 

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH][2.6] Completely out of line spinlocks / x86_64
  2004-08-09 11:41     ` Marcelo Tosatti
@ 2004-08-09 11:49       ` Marcelo Tosatti
  2004-08-09 14:38         ` Matt Mackall
  0 siblings, 1 reply; 7+ messages in thread
From: Marcelo Tosatti @ 2004-08-09 11:49 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Zwane Mwaikambo, linux-kernel, akpm, mpm

On Mon, Aug 09, 2004 at 08:41:38AM -0300, Marcelo Tosatti wrote:
> On Mon, Aug 09, 2004 at 01:23:08PM +0200, Andi Kleen wrote:
> > On Sun, 8 Aug 2004 02:08:30 -0400 (EDT)
> > Zwane Mwaikambo <zwane@linuxpower.ca> wrote:
> > 
> > >  arch/x86_64/Kconfig           |   10 ++++++++++
> > >  arch/x86_64/lib/Makefile      |    1 +
> > >  arch/x86_64/lib/spinlock.c    |   38 ++++++++++++++++++++++++++++++++++++++
> > >  include/asm-x86_64/spinlock.h |   22 ++++++++++++++++++++--
> > >  4 files changed, 69 insertions(+), 2 deletions(-)
> > > 
> > > Index: linux-2.6.8-rc3-mm1-amd64/arch/x86_64/Kconfig
> > > ===================================================================
> > > RCS file: /home/cvsroot/linux-2.6.8-rc3-mm1/arch/x86_64/Kconfig,v
> > > retrieving revision 1.1.1.1
> > > diff -u -p -B -r1.1.1.1 Kconfig
> > > --- linux-2.6.8-rc3-mm1-amd64/arch/x86_64/Kconfig	5 Aug 2004 16:37:48 -0000	1.1.1.1
> > > +++ linux-2.6.8-rc3-mm1-amd64/arch/x86_64/Kconfig	7 Aug 2004 22:47:30 -0000
> > > @@ -438,6 +438,16 @@ config DEBUG_SPINLOCK
> > >  	  best used in conjunction with the NMI watchdog so that spinlock
> > >  	  deadlocks are also debuggable.
> > > 
> > > +config COOL_SPINLOCK
> > > +	bool "Completely out of line spinlocks"
> > > +	depends on SMP
> > > +	default y
> > > +	help
> > > +	  Say Y here to build spinlocks which have common text for contended
> > > +	  and uncontended paths. This reduces kernel text size by at least
> > > +	  50k on most configurations, plus there is the additional benefit
> > > +	  of better cache utilisation.
> > 
> > I think the 50k number is wrong. I took a look at it and the big 
> > difference is only seen when you enable interrupts during spinning, which
> > we didn't do before.  If you compare it to the old implementation the
> > difference is much less.
> > 
> > I don't really like the config option. Either it's a good idea
> > then it should be done by default without option or it should not be done at all.
> > 
> > Did you do any lock intensive benchmarks that could show a slowdown?
> 
> Out of curiosity, also, have you ran any lock intensive benchmarks to get some
> numbers out of the increased cacheline hits due to uninlining? 
> 
> I think you can measure the hits/misses precisely with Mikael's perfcounters.

Hi Zwane, 

Just seen your bonnie++ results (should have the whole thread before replying), 
looks great, except a slight reduction in sequential output:

out-of-line spinlocks:
Version  @version@      ------Sequential Output------ --Sequential Input- --Random-
                    -Per Chr- --Block-- -Rewrite- -Per Chr- --Block-- --Seeks--
Machine        Size K/sec %CP K/sec %CP K/sec %CP K/sec %CP K/sec %CP  /sec %CP
stp2-000         2G  7018  99 64560  36 21694  16  6789  97 43729  14 340.6   1
stp2-000         2G  7055  99 64836  39 21899  16  6752  97 44827  17 330.8   2
stp2-000         2G  7023  99 64525  38 22987  17  6704  96 44777  14 337.3   1

mainline:
Version  @version@      ------Sequential Output------ --Sequential Input- --Random-
                    -Per Chr- --Block-- -Rewrite- -Per Chr- --Block-- --Seeks--
Machine        Size K/sec %CP K/sec %CP K/sec %CP K/sec %CP K/sec %CP  /sec %CP
stp2-000         2G  7048  99 64912  38 22510  17  6732  96 43900  14 332.0   1
stp2-000         2G  7018  99 63821  39 21732  16  6787  97 44889  17 326.7   2
stp2-000         2G  7063  99 63834  38 22361  17  6738  97 43310  14 338.3   1
                    ------Sequential Create------ --------Random Create--------

Probably just noise, still I think its worth mentioning.


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH][2.6] Completely out of line spinlocks / x86_64
  2004-08-09 11:49       ` Marcelo Tosatti
@ 2004-08-09 14:38         ` Matt Mackall
  0 siblings, 0 replies; 7+ messages in thread
From: Matt Mackall @ 2004-08-09 14:38 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: Andi Kleen, Zwane Mwaikambo, linux-kernel, akpm

On Mon, Aug 09, 2004 at 08:49:12AM -0300, Marcelo Tosatti wrote:
> 
> Hi Zwane, 
> 
> Just seen your bonnie++ results (should have the whole thread before replying), 
> looks great, except a slight reduction in sequential output:
> 
> out-of-line spinlocks:
> Version  @version@      ------Sequential Output------ --Sequential Input- --Random-
>                     -Per Chr- --Block-- -Rewrite- -Per Chr- --Block-- --Seeks--
> Machine        Size K/sec %CP K/sec %CP K/sec %CP K/sec %CP K/sec %CP  /sec %CP
> stp2-000         2G  7018  99 64560  36 21694  16  6789  97 43729  14 340.6   1
> stp2-000         2G  7055  99 64836  39 21899  16  6752  97 44827  17 330.8   2
> stp2-000         2G  7023  99 64525  38 22987  17  6704  96 44777  14 337.3   1
> 
> mainline:
> Version  @version@      ------Sequential Output------ --Sequential Input- --Random-
>                     -Per Chr- --Block-- -Rewrite- -Per Chr- --Block-- --Seeks--
> Machine        Size K/sec %CP K/sec %CP K/sec %CP K/sec %CP K/sec %CP  /sec %CP
> stp2-000         2G  7048  99 64912  38 22510  17  6732  96 43900  14 332.0   1
> stp2-000         2G  7018  99 63821  39 21732  16  6787  97 44889  17 326.7   2
> stp2-000         2G  7063  99 63834  38 22361  17  6738  97 43310  14 338.3   1
>                     ------Sequential Create------ --------Random Create--------
> 
> Probably just noise, still I think its worth mentioning.

That really does look like noise here, though this is probably not the
ideal benchmark test. My hope is that we can stick this in -mm for a
bit and get some wider benchmarking of it (hence the config option).

-- 
Mathematics is the supreme nostalgia of our time.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH][2.6] Completely out of line spinlocks / x86_64
  2004-08-09 11:23   ` Andi Kleen
  2004-08-09 11:41     ` Marcelo Tosatti
@ 2004-08-09 21:31     ` Zwane Mwaikambo
  1 sibling, 0 replies; 7+ messages in thread
From: Zwane Mwaikambo @ 2004-08-09 21:31 UTC (permalink / raw)
  To: Andi Kleen; +Cc: linux-kernel, akpm, mpm

On Mon, 9 Aug 2004, Andi Kleen wrote:

> I think the 50k number is wrong. I took a look at it and the big
> difference is only seen when you enable interrupts during spinning, which
> we didn't do before.  If you compare it to the old implementation the
> difference is much less.

Yes agreed the increase wouldn't be of as high a magnitude if compared to
the original code, but it's still a decent saving.

> I don't really like the config option. Either it's a good idea
> then it should be done by default without option or it should not be done at all.
>
> Did you do any lock intensive benchmarks that could show a slowdown?

I went for a file IO type benchmark, the differences looked like
statistical noise, possibly the best bet would be to check for cache
hits/misses.

> You should make this file assembly only.

Ok you're the second person to mention that, i don't have a problem with
switching to assembly only and dumping the exports in x8664_ksyms.c

Thanks,
	Zwane


^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2004-08-09 21:29 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2004-08-08  4:50 [PATCH][2.6] Completely out of line spinlocks / x86_64 Zwane Mwaikambo
2004-08-08  6:08 ` Zwane Mwaikambo
2004-08-09 11:23   ` Andi Kleen
2004-08-09 11:41     ` Marcelo Tosatti
2004-08-09 11:49       ` Marcelo Tosatti
2004-08-09 14:38         ` Matt Mackall
2004-08-09 21:31     ` Zwane Mwaikambo

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox