public inbox for linux-kernel@vger.kernel.org
* [RFC] x86-64: Use SSE for copy_page and clear_page
@ 2005-05-30 18:16 Benjamin LaHaise
  2005-05-30 18:45 ` Jeff Garzik
                   ` (2 more replies)
  0 siblings, 3 replies; 21+ messages in thread
From: Benjamin LaHaise @ 2005-05-30 18:16 UTC (permalink / raw)
  To: ak; +Cc: linux-kernel

Hello Andi,

Below is a patch that uses 128 bit SSE instructions for copy_page and 
clear_page.  This is an improvement on P4 systems as can be seen by 
running the test program at http://www.kvack.org/~bcrl/xmm64.c to get 
results like:

SSE test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $ buffer = 0x2aaaaaad6000
clear_page() tests 
clear_page function 'warm up run'        took 25444 cycles per page
clear_page function 'kernel clear'       took 6595 cycles per page
clear_page function '2.4 non MMX'        took 7827 cycles per page
clear_page function '2.4 MMX fallback'   took 7741 cycles per page
clear_page function '2.4 MMX version'    took 6454 cycles per page
clear_page function 'faster_clear_page'  took 4344 cycles per page
clear_page function 'even_faster_clear'  took 4151 cycles per page
clear_page function 'xmm_clear '         took 3204 cycles per page
clear_page function 'xmma_clear '        took 6080 cycles per page
clear_page function 'xmm2_clear '        took 3370 cycles per page
clear_page function 'xmma2_clear '       took 6115 cycles per page
clear_page function 'kernel clear'       took 6583 cycles per page

copy_page() tests 
copy_page function 'warm up run'         took 9770 cycles per page
copy_page function '2.4 non MMX'         took 9758 cycles per page
copy_page function '2.4 MMX fallback'    took 9572 cycles per page
copy_page function '2.4 MMX version'     took 9405 cycles per page
copy_page function 'faster_copy'         took 7407 cycles per page
copy_page function 'even_faster'         took 7158 cycles per page
copy_page function 'xmm_copy_page_no'    took 6110 cycles per page
copy_page function 'xmm_copy_page'       took 5914 cycles per page
copy_page function 'xmma_copy_page'      took 5913 cycles per page
copy_page function 'v26_copy_page'       took 9168 cycles per page

The SSE clear page function is almost twice as fast as the kernel's 
current clear_page, while the copy_page implementation is roughly a 
third faster.  This is likely due to the fact that SSE instructions 
can keep the 256 bit wide L2 cache bus at a higher utilisation than 
64 bit movs are able to.  Comments?

		-ben

Signed-off-by: Benjamin LaHaise <benjamin.c.lahaise@intel.com>
:r public_html/patches/v2.6.12-rc4-xmm-2.diff
diff -purN v2.6.12-rc4/arch/x86_64/lib/c_clear_page.c xmm-rc4/arch/x86_64/lib/c_clear_page.c
--- v2.6.12-rc4/arch/x86_64/lib/c_clear_page.c	1969-12-31 19:00:00.000000000 -0500
+++ xmm-rc4/arch/x86_64/lib/c_clear_page.c	2005-05-26 11:16:09.000000000 -0400
@@ -0,0 +1,45 @@
+#include <linux/config.h>
+#include <linux/preempt.h>
+#include <asm/page.h>
+#include <linux/kernel.h>
+#include <asm/string.h>
+
+typedef struct { unsigned long a,b; } __attribute__((aligned(16))) xmm_store_t;
+
+void c_clear_page_xmm(void *page)
+{
+	/* Note! gcc doesn't seem to align stack variables properly, so we 
+	 * need to make use of unaligned loads and stores.
+	 */
+	xmm_store_t xmm_save[1];
+	unsigned long cr0;
+	int i;
+
+	preempt_disable();
+	__asm__ __volatile__ (
+		" mov %%cr0,%0\n"
+		" clts\n"
+		" movdqu %%xmm0,(%1)\n"
+		" pxor %%xmm0, %%xmm0\n"
+		: "=&r" (cr0): "r" (xmm_save) : "memory"
+	);
+
+	for(i=0;i<PAGE_SIZE/64;i++)
+	{
+		__asm__ __volatile__ (
+		" movntdq %%xmm0, (%0)\n"
+		" movntdq %%xmm0, 16(%0)\n"
+		" movntdq %%xmm0, 32(%0)\n"
+		" movntdq %%xmm0, 48(%0)\n"
+		: : "r" (page) : "memory");
+		page+=64;
+	}
+
+	__asm__ __volatile__ (
+		" sfence \n "
+		" movdqu (%0),%%xmm0\n"
+		" mov %1,%%cr0\n"
+		:: "r" (xmm_save), "r" (cr0)
+	);
+	preempt_enable();
+}
diff -purN v2.6.12-rc4/arch/x86_64/lib/c_copy_page.c xmm-rc4/arch/x86_64/lib/c_copy_page.c
--- v2.6.12-rc4/arch/x86_64/lib/c_copy_page.c	1969-12-31 19:00:00.000000000 -0500
+++ xmm-rc4/arch/x86_64/lib/c_copy_page.c	2005-05-30 14:07:28.000000000 -0400
@@ -0,0 +1,52 @@
+#include <linux/config.h>
+#include <linux/preempt.h>
+#include <asm/page.h>
+#include <linux/kernel.h>
+#include <asm/string.h>
+
+typedef struct { unsigned long a,b; } __attribute__((aligned(16))) xmm_store_t;
+
+void c_copy_page_xmm(void *to, void *from)
+{
+	/* Note! gcc doesn't seem to align stack variables properly, so we 
+	 * need to make use of unaligned loads and stores.
+	 */
+	xmm_store_t xmm_save[2];
+	unsigned long cr0;
+	int i;
+
+	preempt_disable();
+	__asm__ __volatile__ (
+                " prefetchnta    (%1)\n"
+                " prefetchnta  64(%1)\n"
+                " prefetchnta 128(%1)\n"
+                " prefetchnta 192(%1)\n"
+                " prefetchnta 256(%1)\n"
+		" mov %%cr0,%0\n"
+		" clts\n"
+		" movdqu %%xmm0,  (%1)\n"
+		" movdqu %%xmm1,16(%1)\n"
+		: "=&r" (cr0): "r" (xmm_save) : "memory"
+	);
+
+	for(i=0;i<PAGE_SIZE/32;i++) {
+		__asm__ __volatile__ (
+		" prefetchnta 320(%0)\n"
+		" movdqa   (%0),%%xmm0\n"
+		" movdqa 16(%0),%%xmm1\n"
+		" movntdq %%xmm0,   (%1)\n"
+		" movntdq %%xmm1, 16(%1)\n"
+		: : "r" (from), "r" (to) : "memory");
+		to += 32;
+		from += 32;
+	}
+
+	__asm__ __volatile__ (
+		" sfence \n "
+		" movdqu   (%0),%%xmm0\n"
+		" movdqu 16(%0),%%xmm1\n"
+		" mov %1,%%cr0\n"
+		:: "r" (xmm_save), "r" (cr0)
+	);
+	preempt_enable();
+}
diff -purN v2.6.12-rc4/arch/x86_64/lib/clear_page.S xmm-rc4/arch/x86_64/lib/clear_page.S
--- v2.6.12-rc4/arch/x86_64/lib/clear_page.S	2004-12-24 16:34:33.000000000 -0500
+++ xmm-rc4/arch/x86_64/lib/clear_page.S	2005-05-26 11:27:26.000000000 -0400
@@ -1,3 +1,5 @@
+#include <asm/cpufeature.h>
+	    	
 /*
  * Zero a page. 	
  * rdi	page
@@ -24,12 +26,25 @@ clear_page:
 	nop
 	ret
 clear_page_end:	
-	
+
+	.section .altinstructions,"a"
+	.align 8
+	.quad  clear_page
+	.quad  clear_page_xmm
+	.byte  X86_FEATURE_XMM2
+	.byte  clear_page_end-clear_page	
+	.byte  clear_page_xmm_end-clear_page_xmm
+	.previous
+
+	.globl	c_clear_page_xmm
+	.p2align 4
+clear_page_xmm:
+	jmp	c_clear_page_xmm+(clear_page_xmm-clear_page)
+clear_page_xmm_end:
+
 	/* C stepping K8 run faster using the string instructions.
 	   It is also a lot simpler. Use this when possible */
 	
-#include <asm/cpufeature.h>
-	    	
 	.section .altinstructions,"a"
 	.align 8
 	.quad  clear_page
diff -purN v2.6.12-rc4/arch/x86_64/lib/copy_page.S xmm-rc4/arch/x86_64/lib/copy_page.S
--- v2.6.12-rc4/arch/x86_64/lib/copy_page.S	2004-12-24 16:34:32.000000000 -0500
+++ xmm-rc4/arch/x86_64/lib/copy_page.S	2005-05-26 11:29:55.000000000 -0400
@@ -76,18 +76,34 @@ copy_page:
 	movq	2*8(%rsp),%r13
 	addq	$3*8,%rsp
 	ret
+copy_page_end = .
 	
+#include <asm/cpufeature.h>		
+		
+	.section .altinstructions,"a"
+	.align 8
+	.quad  copy_page
+	.quad  copy_page_xmm
+	.byte  X86_FEATURE_XMM2
+	.byte  copy_page_end-copy_page	
+	.byte  copy_page_xmm_end-copy_page_xmm
+	.previous
+
+	.globl	c_copy_page_xmm
+	.p2align 4
+copy_page_xmm:
+	jmp	c_copy_page_xmm+(copy_page_xmm-copy_page)
+copy_page_xmm_end = .
+
 	/* C stepping K8 run faster using the string copy instructions.
 	   It is also a lot simpler. Use this when possible */
 
-#include <asm/cpufeature.h>		
-		
 	.section .altinstructions,"a"
 	.align 8
 	.quad  copy_page
 	.quad  copy_page_c
 	.byte  X86_FEATURE_K8_C
-	.byte  copy_page_c_end-copy_page_c
+	.byte  copy_page_end-copy_page
 	.byte  copy_page_c_end-copy_page_c
 	.previous
 
diff -purN v2.6.12-rc4/arch/x86_64/lib/Makefile xmm-rc4/arch/x86_64/lib/Makefile
--- v2.6.12-rc4/arch/x86_64/lib/Makefile	2004-12-24 16:34:01.000000000 -0500
+++ xmm-rc4/arch/x86_64/lib/Makefile	2005-05-26 11:26:50.000000000 -0400
@@ -10,5 +10,7 @@ lib-y := csum-partial.o csum-copy.o csum
 	usercopy.o getuser.o putuser.o  \
 	thunk.o clear_page.o copy_page.o bitstr.o bitops.o
 lib-y += memcpy.o memmove.o memset.o copy_user.o
+lib-y += c_clear_page.o
+lib-y += c_copy_page.o
 
 lib-$(CONFIG_HAVE_DEC_LOCK) += dec_and_lock.o


* Re: [RFC] x86-64: Use SSE for copy_page and clear_page
  2005-05-30 18:16 [RFC] x86-64: Use SSE for copy_page and clear_page Benjamin LaHaise
@ 2005-05-30 18:45 ` Jeff Garzik
  2005-05-30 19:06 ` dean gaudet
  2005-05-30 19:38 ` Andi Kleen
  2 siblings, 0 replies; 21+ messages in thread
From: Jeff Garzik @ 2005-05-30 18:45 UTC (permalink / raw)
  To: Benjamin LaHaise; +Cc: ak, linux-kernel

Benjamin LaHaise wrote:
> Hello Andi,
> 
> Below is a patch that uses 128 bit SSE instructions for copy_page and 
> clear_page.  This is an improvement on P4 systems as can be seen by 
> running the test program at http://www.kvack.org/~bcrl/xmm64.c to get 
> results like:
> 
> SSE test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $ buffer = 0x2aaaaaad6000
> clear_page() tests 
> clear_page function 'warm up run'        took 25444 cycles per page
> clear_page function 'kernel clear'       took 6595 cycles per page
> clear_page function '2.4 non MMX'        took 7827 cycles per page
> clear_page function '2.4 MMX fallback'   took 7741 cycles per page
> clear_page function '2.4 MMX version'    took 6454 cycles per page
> clear_page function 'faster_clear_page'  took 4344 cycles per page
> clear_page function 'even_faster_clear'  took 4151 cycles per page
> clear_page function 'xmm_clear '         took 3204 cycles per page
> clear_page function 'xmma_clear '        took 6080 cycles per page
> clear_page function 'xmm2_clear '        took 3370 cycles per page
> clear_page function 'xmma2_clear '       took 6115 cycles per page
> clear_page function 'kernel clear'       took 6583 cycles per page
> 
> copy_page() tests 
> copy_page function 'warm up run'         took 9770 cycles per page
> copy_page function '2.4 non MMX'         took 9758 cycles per page
> copy_page function '2.4 MMX fallback'    took 9572 cycles per page
> copy_page function '2.4 MMX version'     took 9405 cycles per page
> copy_page function 'faster_copy'         took 7407 cycles per page
> copy_page function 'even_faster'         took 7158 cycles per page
> copy_page function 'xmm_copy_page_no'    took 6110 cycles per page
> copy_page function 'xmm_copy_page'       took 5914 cycles per page
> copy_page function 'xmma_copy_page'      took 5913 cycles per page
> copy_page function 'v26_copy_page'       took 9168 cycles per page
> 
> The SSE clear page function is almost twice as fast as the kernel's 
> current clear_page, while the copy_page implementation is roughly a 
> third faster.  This is likely due to the fact that SSE instructions 
> can keep the 256 bit wide L2 cache bus at a higher utilisation than 
> 64 bit movs are able to.  Comments?

Sounds pretty darn cool to me.  I can give it a test on athlon64 and 
em64t here.

I have some codingstyle whining to do though...


> :r public_html/patches/v2.6.12-rc4-xmm-2.diff
> diff -purN v2.6.12-rc4/arch/x86_64/lib/c_clear_page.c xmm-rc4/arch/x86_64/lib/c_clear_page.c
> --- v2.6.12-rc4/arch/x86_64/lib/c_clear_page.c	1969-12-31 19:00:00.000000000 -0500
> +++ xmm-rc4/arch/x86_64/lib/c_clear_page.c	2005-05-26 11:16:09.000000000 -0400
> @@ -0,0 +1,45 @@
> +#include <linux/config.h>
> +#include <linux/preempt.h>
> +#include <asm/page.h>
> +#include <linux/kernel.h>
> +#include <asm/string.h>

preferred ordering:

linux/config
linux/kernel
linux/preempt
asm/*


> +typedef struct { unsigned long a,b; } __attribute__((aligned(16))) xmm_store_t;

space between "a,b"


> +void c_clear_page_xmm(void *page)
> +{
> +	/* Note! gcc doesn't seem to align stack variables properly, so we 
> +	 * need to make use of unaligned loads and stores.
> +	 */
> +	xmm_store_t xmm_save[1];
> +	unsigned long cr0;
> +	int i;
> +
> +	preempt_disable();
> +	__asm__ __volatile__ (
> +		" mov %%cr0,%0\n"
> +		" clts\n"
> +		" movdqu %%xmm0,(%1)\n"
> +		" pxor %%xmm0, %%xmm0\n"
> +		: "=&r" (cr0): "r" (xmm_save) : "memory"
> +	);
> +
> +	for(i=0;i<PAGE_SIZE/64;i++)

exercise that spacebar :)


> +	{
> +		__asm__ __volatile__ (
> +		" movntdq %%xmm0, (%0)\n"
> +		" movntdq %%xmm0, 16(%0)\n"
> +		" movntdq %%xmm0, 32(%0)\n"
> +		" movntdq %%xmm0, 48(%0)\n"
> +		: : "r" (page) : "memory");
> +		page+=64;
> +	}
> +
> +	__asm__ __volatile__ (
> +		" sfence \n "
> +		" movdqu (%0),%%xmm0\n"
> +		" mov %1,%%cr0\n"
> +		:: "r" (xmm_save), "r" (cr0)
> +	);
> +	preempt_enable();
> +}
> diff -purN v2.6.12-rc4/arch/x86_64/lib/c_copy_page.c xmm-rc4/arch/x86_64/lib/c_copy_page.c
> --- v2.6.12-rc4/arch/x86_64/lib/c_copy_page.c	1969-12-31 19:00:00.000000000 -0500
> +++ xmm-rc4/arch/x86_64/lib/c_copy_page.c	2005-05-30 14:07:28.000000000 -0400
> @@ -0,0 +1,52 @@
> +#include <linux/config.h>
> +#include <linux/preempt.h>
> +#include <asm/page.h>
> +#include <linux/kernel.h>
> +#include <asm/string.h>
> +
> +typedef struct { unsigned long a,b; } __attribute__((aligned(16))) xmm_store_t;

ditto

> +void c_copy_page_xmm(void *to, void *from)
> +{
> +	/* Note! gcc doesn't seem to align stack variables properly, so we 
> +	 * need to make use of unaligned loads and stores.
> +	 */
> +	xmm_store_t xmm_save[2];
> +	unsigned long cr0;
> +	int i;
> +
> +	preempt_disable();
> +	__asm__ __volatile__ (
> +                " prefetchnta    (%1)\n"
> +                " prefetchnta  64(%1)\n"
> +                " prefetchnta 128(%1)\n"
> +                " prefetchnta 192(%1)\n"
> +                " prefetchnta 256(%1)\n"
> +		" mov %%cr0,%0\n"
> +		" clts\n"
> +		" movdqu %%xmm0,  (%1)\n"
> +		" movdqu %%xmm1,16(%1)\n"
> +		: "=&r" (cr0): "r" (xmm_save) : "memory"
> +	);
> +
> +	for(i=0;i<PAGE_SIZE/32;i++) {

ditto



* Re: [RFC] x86-64: Use SSE for copy_page and clear_page
  2005-05-30 18:16 [RFC] x86-64: Use SSE for copy_page and clear_page Benjamin LaHaise
  2005-05-30 18:45 ` Jeff Garzik
@ 2005-05-30 19:06 ` dean gaudet
  2005-05-30 19:11   ` dean gaudet
  2005-05-30 19:38 ` Andi Kleen
  2 siblings, 1 reply; 21+ messages in thread
From: dean gaudet @ 2005-05-30 19:06 UTC (permalink / raw)
  To: Benjamin LaHaise; +Cc: ak, linux-kernel

On Mon, 30 May 2005, Benjamin LaHaise wrote:

> Below is a patch that uses 128 bit SSE instructions for copy_page and 
> clear_page.  This is an improvement on P4 systems as can be seen by 
> running the test program at http://www.kvack.org/~bcrl/xmm64.c to get 
> results like:

it looks like the patch uses SSE2 instructions (pxor, movdqa, movntdq)... 
if you use xorps, movaps, movntps then it works on SSE processors as well.
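
For illustration, the patch's inner clear loop rewritten with those SSE1
mnemonics might look like the following sketch (an assumption of what the
suggestion would look like, not code posted in this thread; xorps/movntps
only require X86_FEATURE_XMM rather than X86_FEATURE_XMM2):

	/* zero xmm0 with the SSE1 xorps instead of the SSE2 pxor */
	__asm__ __volatile__ (" xorps %%xmm0,%%xmm0\n" : : : "memory");

	for (i = 0; i < PAGE_SIZE/64; i++) {
		__asm__ __volatile__ (
		" movntps %%xmm0,   (%0)\n"	/* 16-byte NT stores, SSE1 */
		" movntps %%xmm0, 16(%0)\n"
		" movntps %%xmm0, 32(%0)\n"
		" movntps %%xmm0, 48(%0)\n"
		: : "r" (page) : "memory");
		page += 64;
	}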

-dean


* Re: [RFC] x86-64: Use SSE for copy_page and clear_page
  2005-05-30 19:06 ` dean gaudet
@ 2005-05-30 19:11   ` dean gaudet
  2005-05-30 19:32     ` Andi Kleen
  0 siblings, 1 reply; 21+ messages in thread
From: dean gaudet @ 2005-05-30 19:11 UTC (permalink / raw)
  To: Benjamin LaHaise; +Cc: ak, linux-kernel

On Mon, 30 May 2005, dean gaudet wrote:

> On Mon, 30 May 2005, Benjamin LaHaise wrote:
> 
> > Below is a patch that uses 128 bit SSE instructions for copy_page and 
> > clear_page.  This is an improvement on P4 systems as can be seen by 
> > running the test program at http://www.kvack.org/~bcrl/xmm64.c to get 
> > results like:
> 
> it looks like the patch uses SSE2 instructions (pxor, movdqa, movntdq)... 
> if you use xorps, movaps, movntps then it works on SSE processors as well.

oh and btw... on x86-64 you might want to look at using movnti with 64-bit 
registers... the memory datapath on these processors is actually 64-bits 
wide, and the 128-bit stores are broken into two 64-bit pieces internally 
anyhow.  the advantage of using movnti over movntdq/movntps is that you 
don't have to save/restore the xmm register set.
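
A sketch of what a movnti-based clear loop could look like (a hypothetical
coding of this suggestion, not tested code from the thread); since only
integer registers are used, no clts/CR0 dance and no xmm save/restore is
needed:

	void c_clear_page_nti(void *page)
	{
		int i;

		for (i = 0; i < PAGE_SIZE/64; i++) {
			__asm__ __volatile__ (
			" movnti %1,   (%0)\n"	/* 64-bit NT store from a GPR */
			" movnti %1,  8(%0)\n"
			" movnti %1, 16(%0)\n"
			" movnti %1, 24(%0)\n"
			" movnti %1, 32(%0)\n"
			" movnti %1, 40(%0)\n"
			" movnti %1, 48(%0)\n"
			" movnti %1, 56(%0)\n"
			: : "r" (page), "r" (0UL) : "memory");
			page += 64;
		}
		/* order the NT stores before the page is handed back */
		__asm__ __volatile__ (" sfence\n" : : : "memory");
	}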

-dean


* Re: [RFC] x86-64: Use SSE for copy_page and clear_page
  2005-05-30 19:11   ` dean gaudet
@ 2005-05-30 19:32     ` Andi Kleen
  2005-05-31  8:37       ` Denis Vlasenko
  0 siblings, 1 reply; 21+ messages in thread
From: Andi Kleen @ 2005-05-30 19:32 UTC (permalink / raw)
  To: dean gaudet; +Cc: Benjamin LaHaise, linux-kernel

On Mon, May 30, 2005 at 12:11:23PM -0700, dean gaudet wrote:
> On Mon, 30 May 2005, dean gaudet wrote:
> 
> > On Mon, 30 May 2005, Benjamin LaHaise wrote:
> > 
> > > Below is a patch that uses 128 bit SSE instructions for copy_page and 
> > > clear_page.  This is an improvement on P4 systems as can be seen by 
> > > running the test program at http://www.kvack.org/~bcrl/xmm64.c to get 
> > > results like:
> > 
> > it looks like the patch uses SSE2 instructions (pxor, movdqa, movntdq)... 
> > if you use xorps, movaps, movntps then it works on SSE processors as well.
> 
> oh and btw... on x86-64 you might want to look at using movnti with 64-bit 
> registers... the memory datapath on these processors is actually 64-bits 
> wide, and the 128-bit stores are broken into two 64-bit pieces internally 
> anyhow.  the advantage of using movnti over movntdq/movntps is that you 
> don't have to save/restore the xmm register set.

Any use of write combining for copy_page/clear_page is a bad idea.
The problem is that write combining always forces the destination
out of cache.  While it gives you better microbenchmarks, your real
workloads suffer because they eat a lot more cache misses when
accessing the fresh pages.

Don't go down that path please.

At least on Opteron I did quite a few tests, and the existing setup
with just rep ; movsq for C stepping or later, or the unrolled loop
for earlier CPUs, worked best overall. On P4 I haven't done any
benchmarks; however, it might be a good idea to check whether rep ;
movsq would be a win there too (if yes, it could be enabled there).
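
A minimal sketch of those loops as C wrappers (the in-kernel versions are
the assembly alternatives in clear_page.S/copy_page.S; this is only an
illustration):

	static inline void clear_page_rep(void *page)
	{
		long d0, d1;
		/* rep ; stosq: store rax (zero) PAGE_SIZE/8 times at rdi */
		__asm__ __volatile__ ("rep ; stosq"
			: "=&c" (d0), "=&D" (d1)
			: "a" (0), "0" (PAGE_SIZE/8), "1" (page)
			: "memory");
	}

	static inline void copy_page_rep(void *to, void *from)
	{
		long d0, d1, d2;
		/* rep ; movsq: copy PAGE_SIZE/8 quadwords from rsi to rdi */
		__asm__ __volatile__ ("rep ; movsq"
			: "=&c" (d0), "=&D" (d1), "=&S" (d2)
			: "0" (PAGE_SIZE/8), "1" (to), "2" (from)
			: "memory");
	}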

-Andi


* Re: [RFC] x86-64: Use SSE for copy_page and clear_page
  2005-05-30 18:16 [RFC] x86-64: Use SSE for copy_page and clear_page Benjamin LaHaise
  2005-05-30 18:45 ` Jeff Garzik
  2005-05-30 19:06 ` dean gaudet
@ 2005-05-30 19:38 ` Andi Kleen
  2005-05-30 20:05   ` Michael Thonke
  2 siblings, 1 reply; 21+ messages in thread
From: Andi Kleen @ 2005-05-30 19:38 UTC (permalink / raw)
  To: Benjamin LaHaise; +Cc: linux-kernel

> The SSE clear page function is almost twice as fast as the kernel's 
> current clear_page, while the copy_page implementation is roughly a 
> third faster.  This is likely due to the fact that SSE instructions 
> can keep the 256 bit wide L2 cache bus at a higher utilisation than 
> 64 bit movs are able to.  Comments?

Any use of write combining is wrong here because it forces
the destination out of cache, which causes performance issues later on. 
Believe me, we went through this years ago.

If you can code up a better function for P4 that does not use
write combining, I would be happy to add it. I never tuned the
functions for P4.

One simple experiment would be to just test if P4 likes the
simple rep ; movsq / rep ; stosq loops and enable them.

-Andi


* Re: [RFC] x86-64: Use SSE for copy_page and clear_page
  2005-05-30 19:38 ` Andi Kleen
@ 2005-05-30 20:05   ` Michael Thonke
  2005-05-30 20:14     ` Benjamin LaHaise
  2005-05-31  7:11     ` Andi Kleen
  0 siblings, 2 replies; 21+ messages in thread
From: Michael Thonke @ 2005-05-30 20:05 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Benjamin LaHaise, linux-kernel

Andi Kleen wrote:

>>The SSE clear page function is almost twice as fast as the kernel's 
>>current clear_page, while the copy_page implementation is roughly a 
>>third faster.  This is likely due to the fact that SSE instructions 
>>can keep the 256 bit wide L2 cache bus at a higher utilisation than 
>>64 bit movs are able to.  Comments?
>>    
>>
>
>Any use of write combining is wrong here because it forces
>the destination out of cache, which causes performance issues later on. 
>Believe me we went through this years ago.
>
>If you can code up a better function for P4 that does not use
>write combining I would be happy to add. I never tuned the functions
>for P4. 
>
>One simple experiment would be to just test if P4 likes the
>simple rep ; movsq / rep ; stosq loops and enable them.
>  
>
No, it doesn't like this sample here at all; I get a segmentation fault on
that run.
RUN 1:

    SSE test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $
    buffer = 0x2aaaaade7000
    clear_page() tests
    clear_page function 'warm up run'        took 13516 cycles per page
    clear_page function 'kernel clear'       took 6539 cycles per page
    clear_page function '2.4 non MMX'        took 6354 cycles per page
    clear_page function '2.4 MMX fallback'   took 6205 cycles per page
    clear_page function '2.4 MMX version'    took 6830 cycles per page
    clear_page function 'faster_clear_page'  took 6240 cycles per page
    clear_page function 'even_faster_clear'  took 5746 cycles per page
    clear_page function 'xmm_clear '         took 4580 cycles per page
    Segmentation fault

    xmm64.o[9485] general protection rip:400814 rsp:7fffffc74118 error:0
    xmm64.o[9486] general protection rip:400814 rsp:7fffff8b1498 error:0
    xmm64.o[9487] general protection rip:400814 rsp:7fffffc31848 error:0

RUN 2:
Telling gcc to use processor-specific flags:
    gcc -pipe -march=nocona -O2 -o xmm64.o xmm64.c

    SSE test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $
    buffer = 0x2aaaaade7000
    clear_page() tests
    clear_page function 'warm up run'        took 13419 cycles per page
    clear_page function 'kernel clear'       took 6403 cycles per page
    clear_page function '2.4 non MMX'        took 6290 cycles per page
    clear_page function '2.4 MMX fallback'   took 6156 cycles per page
    clear_page function '2.4 MMX version'    took 6605 cycles per page
    clear_page function 'faster_clear_page'  took 5607 cycles per page
    clear_page function 'even_faster_clear'  took 5173 cycles per page
    clear_page function 'xmm_clear '         took 4307 cycles per page
    clear_page function 'xmma_clear '        took 6230 cycles per page
    clear_page function 'xmm2_clear '        took 4908 cycles per page
    clear_page function 'xmma2_clear '       took 6256 cycles per page
    clear_page function 'kernel clear'       took 6506 cycles per page

    copy_page() tests
    copy_page function 'warm up run'         took 10352 cycles per page
    copy_page function '2.4 non MMX'         took 9440 cycles per page
    copy_page function '2.4 MMX fallback'    took 9300 cycles per page
    copy_page function '2.4 MMX version'     took 10238 cycles per page
    copy_page function 'faster_copy'         took 9497 cycles per page
    copy_page function 'even_faster'         took 9229 cycles per page
    copy_page function 'xmm_copy_page_no'    took 7810 cycles per page
    copy_page function 'xmm_copy_page'       took 7397 cycles per page
    copy_page function 'xmma_copy_page'      took 9430 cycles per page
    copy_page function 'v26_copy_page'       took 9234 cycles per page

CPU flags on Intel Pentium 4 640 x86_64 Gentoo GNU/Linux

    flags           : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr
    pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm
    syscall nx lm constant_tsc pni monitor ds_cpl est cid cx16 xtpr

Greets
    Michael


* Re: [RFC] x86-64: Use SSE for copy_page and clear_page
  2005-05-30 20:05   ` Michael Thonke
@ 2005-05-30 20:14     ` Benjamin LaHaise
  2005-05-30 20:42       ` Michael Thonke
  2005-05-31  7:11     ` Andi Kleen
  1 sibling, 1 reply; 21+ messages in thread
From: Benjamin LaHaise @ 2005-05-30 20:14 UTC (permalink / raw)
  To: Michael Thonke; +Cc: linux-kernel

On Mon, May 30, 2005 at 10:05:28PM +0200, Michael Thonke wrote:
> No, it doesn't like this sample here at all; I get a segmentation fault on
> that run.

Grab a new copy -- one of the routines had an unaligned store instead of 
aligned for the register save.
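
For context on the fault: movdqa and movntdq require a 16-byte-aligned
address and raise a general protection fault otherwise (consistent with
the reports above), while movdqu tolerates any alignment.  A sketch of the
difference, not the actual buggy routine:

	xmm_store_t save[1];	/* aligned(16), though gcc of that era did
				 * not always honour this on the stack */

	/* safe at any alignment */
	__asm__ __volatile__ (" movdqu %%xmm0,(%0)\n" : : "r" (save) : "memory");

	/* #GP if 'save' is not actually 16-byte aligned */
	__asm__ __volatile__ (" movdqa %%xmm0,(%0)\n" : : "r" (save) : "memory");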

		-ben


* Re: [RFC] x86-64: Use SSE for copy_page and clear_page
  2005-05-30 20:14     ` Benjamin LaHaise
@ 2005-05-30 20:42       ` Michael Thonke
  0 siblings, 0 replies; 21+ messages in thread
From: Michael Thonke @ 2005-05-30 20:42 UTC (permalink / raw)
  To: Benjamin LaHaise; +Cc: linux-kernel

Benjamin LaHaise wrote:

>On Mon, May 30, 2005 at 10:05:28PM +0200, Michael Thonke wrote:
>  
>
>>No, it doesn't like this sample here at all; I get a segmentation fault on
>>that run.
>>    
>>
>
>Grab a new copy -- one of the routines had an unaligned store instead of 
>aligned for the register save.
>
>		-ben
>
>  
>
Hi Benjamin,

Here are the results with the new copy.

    RUN 1: cc -o xmm64.o xmm64.c

    ioGL64NX_EMT64 ~ # ./xmm64.o
    SSE test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $
    buffer = 0x2aaaaade7000
    clear_page() tests
    clear_page function 'warm up run'        took 13632 cycles per page
    clear_page function 'kernel clear'       took 6599 cycles per page
    clear_page function '2.4 non MMX'        took 6482 cycles per page
    clear_page function '2.4 MMX fallback'   took 6367 cycles per page
    clear_page function '2.4 MMX version'    took 6644 cycles per page
    clear_page function 'faster_clear_page'  took 6088 cycles per page
    clear_page function 'even_faster_clear'  took 5692 cycles per page
    clear_page function 'xmm_clear'  took 4270 cycles per page
    clear_page function 'xmma_clear'         took 6351 cycles per page
    clear_page function 'xmm2_clear'         took 4710 cycles per page
    clear_page function 'xmma2_clear'        took 6198 cycles per page
    clear_page function 'xmm3_clear'         took 6583 cycles per page
    clear_page function 'nt clear  '         took 4746 cycles per page
    clear_page function 'kernel clear'       took 6158 cycles per page

    copy_page() tests
    copy_page function 'warm up run'         took 9210 cycles per page
    copy_page function '2.4 non MMX'         took 6740 cycles per page
    copy_page function '2.4 MMX fallback'    took 6697 cycles per page
    copy_page function '2.4 MMX version'     took 9178 cycles per page
    copy_page function 'faster_copy'         took 11360 cycles per page
    copy_page function 'even_faster'         took 10133 cycles per page
    copy_page function 'xmm_copy_page_no'    took 8885 cycles per page
    copy_page function 'xmm_copy_page'       took 8725 cycles per page
    copy_page function 'xmma_copy_page'      took 9964 cycles per page
    copy_page function 'xmm3_copy_page'      took 7176 cycles per page
    copy_page function 'v26_copy_page'       took 6879 cycles per page
    copy_page function 'nt_copy_page'        took 10858 cycles per page


    RUN 2: gcc -o xmm64.o xmm64.c

    ioGL64NX_EMT64 ~ # ./xmm64.o
    SSE test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $
    buffer = 0x2aaaaade7000
    clear_page() tests
    clear_page function 'warm up run'        took 13981 cycles per page
    clear_page function 'kernel clear'       took 6708 cycles per page
    clear_page function '2.4 non MMX'        took 6505 cycles per page
    clear_page function '2.4 MMX fallback'   took 6235 cycles per page
    clear_page function '2.4 MMX version'    took 7251 cycles per page
    clear_page function 'faster_clear_page'  took 6390 cycles per page
    clear_page function 'even_faster_clear'  took 5932 cycles per page
    clear_page function 'xmm_clear'  took 4876 cycles per page
    clear_page function 'xmma_clear'         took 6379 cycles per page
    clear_page function 'xmm2_clear'         took 5264 cycles per page
    clear_page function 'xmma2_clear'        took 6373 cycles per page
    clear_page function 'xmm3_clear'         took 6651 cycles per page
    clear_page function 'nt clear  '         took 5186 cycles per page
    clear_page function 'kernel clear'       took 6326 cycles per page

    copy_page() tests
    copy_page function 'warm up run'         took 9537 cycles per page
    copy_page function '2.4 non MMX'         took 6776 cycles per page
    copy_page function '2.4 MMX fallback'    took 7407 cycles per page
    copy_page function '2.4 MMX version'     took 8812 cycles per page
    copy_page function 'faster_copy'         took 10992 cycles per page
    copy_page function 'even_faster'         took 10232 cycles per page
    copy_page function 'xmm_copy_page_no'    took 8918 cycles per page
    copy_page function 'xmm_copy_page'       took 9579 cycles per page
    copy_page function 'xmma_copy_page'      took 9854 cycles per page
    copy_page function 'xmm3_copy_page'      took 7602 cycles per page
    copy_page function 'v26_copy_page'       took 6811 cycles per page
    copy_page function 'nt_copy_page'        took 10958 cycles per page

    RUN 3: gcc -pipe -march=nocona -O2 -o xmm64.o xmm64.c
    SSE test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $
    buffer = 0x2aaaaade7000
    clear_page() tests
    clear_page function 'warm up run'        took 13626 cycles per page
    clear_page function 'kernel clear'       took 6780 cycles per page
    clear_page function '2.4 non MMX'        took 6755 cycles per page
    clear_page function '2.4 MMX fallback'   took 6283 cycles per page
    clear_page function '2.4 MMX version'    took 6764 cycles per page
    clear_page function 'faster_clear_page'  took 5764 cycles per page
    clear_page function 'even_faster_clear'  took 5240 cycles per page
    clear_page function 'xmm_clear'  took 4532 cycles per page
    clear_page function 'xmma_clear'         took 6352 cycles per page
    clear_page function 'xmm2_clear'         took 4983 cycles per page
    clear_page function 'xmma2_clear'        took 6211 cycles per page
    clear_page function 'xmm3_clear'         took 6748 cycles per page
    clear_page function 'nt clear  '         took 5166 cycles per page
    clear_page function 'kernel clear'       took 6201 cycles per page

    copy_page() tests
    copy_page function 'warm up run'         took 9651 cycles per page
    copy_page function '2.4 non MMX'         took 6724 cycles per page
    copy_page function '2.4 MMX fallback'    took 6905 cycles per page
    copy_page function '2.4 MMX version'     took 9722 cycles per page
    copy_page function 'faster_copy'         took 9738 cycles per page
    copy_page function 'even_faster'         took 9609 cycles per page
    copy_page function 'xmm_copy_page_no'    took 8846 cycles per page
    copy_page function 'xmm_copy_page'       took 8591 cycles per page
    copy_page function 'xmma_copy_page'      took 8250 cycles per page
    copy_page function 'xmm3_copy_page'      took 7879 cycles per page
    copy_page function 'v26_copy_page'       took 7512 cycles per page
    copy_page function 'nt_copy_page'        took 10424 cycles per page

    RUN 4: gcc -pipe -march=nocona -O2 -fPIC -o xmm64.o xmm64.c

    SSE test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $
    buffer = 0x2aaaaade7000
    clear_page() tests
    clear_page function 'warm up run'        took 13713 cycles per page
    clear_page function 'kernel clear'       took 6655 cycles per page
    clear_page function '2.4 non MMX'        took 6448 cycles per page
    clear_page function '2.4 MMX fallback'   took 6270 cycles per page
    clear_page function '2.4 MMX version'    took 7001 cycles per page
    clear_page function 'faster_clear_page'  took 5671 cycles per page
    clear_page function 'even_faster_clear'  took 5366 cycles per page
    clear_page function 'xmm_clear'  took 4737 cycles per page
    clear_page function 'xmma_clear'         took 6464 cycles per page
    clear_page function 'xmm2_clear'         took 5214 cycles per page
    clear_page function 'xmma2_clear'        took 6371 cycles per page
    clear_page function 'xmm3_clear'         took 6660 cycles per page
    clear_page function 'nt clear  '         took 5066 cycles per page
    clear_page function 'kernel clear'       took 6314 cycles per page

    copy_page() tests
    copy_page function 'warm up run'         took 9464 cycles per page
    copy_page function '2.4 non MMX'         took 7179 cycles per page
    copy_page function '2.4 MMX fallback'    took 6928 cycles per page
    copy_page function '2.4 MMX version'     took 9091 cycles per page
    copy_page function 'faster_copy'         took 9996 cycles per page
    copy_page function 'even_faster'         took 9824 cycles per page
    copy_page function 'xmm_copy_page_no'    took 8724 cycles per page
    copy_page function 'xmm_copy_page'       took 8920 cycles per page
    copy_page function 'xmma_copy_page'      took 8859 cycles per page
    copy_page function 'xmm3_copy_page'      took 7794 cycles per page
    copy_page function 'v26_copy_page'       took 7808 cycles per page
    copy_page function 'nt_copy_page'        took 9264 cycles per page

    Do you need more results or tests, Benjamin?

    Greets and best regards
        Michael



* Re: [RFC] x86-64: Use SSE for copy_page and clear_page
  2005-05-30 20:05   ` Michael Thonke
  2005-05-30 20:14     ` Benjamin LaHaise
@ 2005-05-31  7:11     ` Andi Kleen
  1 sibling, 0 replies; 21+ messages in thread
From: Andi Kleen @ 2005-05-31  7:11 UTC (permalink / raw)
  To: Michael Thonke; +Cc: Benjamin LaHaise, linux-kernel

> >One simple experiment would be to just test if P4 likes the
> >simple rep ; movsq / rep ; stosq loops and enable them.
> >  
> >
> No, it doesn't like this sample here at all; I get a segmentation fault on
> that run.

Sorry, what did you test exactly?

-Andi


* Re: [RFC] x86-64: Use SSE for copy_page and clear_page
  2005-05-30 19:32     ` Andi Kleen
@ 2005-05-31  8:37       ` Denis Vlasenko
  2005-05-31  9:15         ` Denis Vlasenko
  0 siblings, 1 reply; 21+ messages in thread
From: Denis Vlasenko @ 2005-05-31  8:37 UTC (permalink / raw)
  To: Andi Kleen, dean gaudet, Jeff Garzik; +Cc: Benjamin LaHaise, linux-kernel

On Monday 30 May 2005 22:32, Andi Kleen wrote:
> On Mon, May 30, 2005 at 12:11:23PM -0700, dean gaudet wrote:
> > On Mon, 30 May 2005, dean gaudet wrote:
> > 
> > > On Mon, 30 May 2005, Benjamin LaHaise wrote:
> > > 
> > > > Below is a patch that uses 128 bit SSE instructions for copy_page and 
> > > > clear_page.  This is an improvement on P4 systems as can be seen by 
> > > > running the test program at http://www.kvack.org/~bcrl/xmm64.c to get 
> > > > results like:
> > > 
> > > it looks like the patch uses SSE2 instructions (pxor, movdqa, movntdq)... 
> > > if you use xorps, movaps, movntps then it works on SSE processors as well.
> > 
> > oh and btw... on x86-64 you might want to look at using movnti with 64-bit 
> > registers... the memory datapath on these processors is actually 64-bits 
> > wide, and the 128-bit stores are broken into two 64-bit pieces internally 
> > anyhow.  the advantage of using movnti over movntdq/movntps is that you 
> > don't have to save/restore the xmm register set.

And what if (more like 'when', actually) the next AMD CPU has a 2x128-bit
bus instead of 2x64-bit? Revert back to XMM?
 
> Any use of write combining for copy_page/clear_page is a bad idea.
> The problem is that write combining always forces the destination
> out of cache.  While it gives you better microbenchmarks, your real
> workloads suffer because they eat a lot more cache misses when
> accessing the fresh pages.
> 
> Don't go down that path please.

I doubt it unless real-world data backs your claim up.

I did microbenchmarking. You said it looks good in microbenchmarks but
hurts real-world workloads.

Sometime after that I made a patch which allows for switching
clear/copy routines on the fly, and played a bit with real-world tests.

See http://www.thisishull.net/showthread.php?t=36562
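
The switching idea can be pictured as a function pointer that a debug knob
repoints at runtime (a hypothetical sketch with made-up names; the real
patch is at the URL above):

	static void clear_page_slow(void *page);	/* rep ; stosq    */
	static void clear_page_mmx_nt(void *page);	/* MMX, NT stores */

	static void (*active_clear_page)(void *page) = clear_page_slow;

	void clear_page(void *page)
	{
		active_clear_page(page);	/* switched on the fly */
	}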

In short, I ran forking test programs which exercise the clearing and copying
routines in the kernel. I wasn't able to find a usage pattern where page copying
using SSE non-temporal stores is a loss. Page clear was demonstrably worse,
no argument about that.

If you know such usage pattern, I'd like to test it.
--
vda



* Re: [RFC] x86-64: Use SSE for copy_page and clear_page
  2005-05-31  8:37       ` Denis Vlasenko
@ 2005-05-31  9:15         ` Denis Vlasenko
  2005-05-31  9:23           ` Andi Kleen
  0 siblings, 1 reply; 21+ messages in thread
From: Denis Vlasenko @ 2005-05-31  9:15 UTC (permalink / raw)
  To: Andi Kleen, dean gaudet, Jeff Garzik; +Cc: Benjamin LaHaise, linux-kernel

On Tuesday 31 May 2005 11:37, Denis Vlasenko wrote:
> On Monday 30 May 2005 22:32, Andi Kleen wrote:
> > Any use of write combining for copy_page/clear_page is a bad idea.
> > The problem is that write combining always forces the destination
> > out of cache.  While it gives you better microbenchmarks, your real
> > workloads suffer because they eat a lot more cache misses when
> > accessing the fresh pages.
> > 
> > Don't go down that path please.
> 
> I doubt it unless real-world data backs your claim up.
> 
> I did microbenchmarking. You said it looks good in microbenchmarks but
> hurts real-world workloads.
> 
> Sometime after that I made a patch which allows for switching
> clear/copy routines on the fly, and played a bit with real-world tests.
> 
> See http://www.thisishull.net/showthread.php?t=36562
> 
> In short, I ran forking test programs which exercise the clearing and copying
> routines in the kernel. I wasn't able to find a usage pattern where page copying
> using SSE non-temporal stores is a loss. Page clear was demonstrably worse,
> no argument about that.

Let me explain what I tested, and how.

[snip explanation why nt store is a loss on small buffer, see
 http://www.thisishull.net/showthread.php?t=36562 if you want to know]

Core of copy test program:

#define N (256/4)
#define SIZE 4096
...
    for(k = 0; k < 5000; k++) {
        int pid;
        pid = fork();
        if(pid == 0) {
            /* child */
            for(i = 0; i < N; i++) mem[i*SIZE+1] = 'b';          /* force copy */
            strchr(mem, 'c') == mem+N*SIZE-1 || printf("BUG\n");        /* read all */
            exit(0);
        } else if(pid == -1) {
	    perror("fork");
        } else {
            /* parent */
            waitpid(pid, NULL, 0);
        }
    }

Each copy test does one fork per one loop.
With each fork, kernel zeroes out 3 pages and copies 8 pages.
This amounts to 12k+32k bytes.

256k copying, 5x5000 loops:
slow: 0m8.036 0m8.063 0m8.192 0m8.233 0m8.252 75600/1800468
mmx_APn: 0m7.461 0m7.496 0m7.543 0m7.687 0m7.725 75586/1800446
mmx_APN: 0m6.351 0m6.366 0m6.378 0m6.382 0m6.525 75586/1800436
mmx_APn/APN: 0m6.412 0m6.448 0m6.501 0m6.663 0m6.669 75584/1800439
^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^^^^^^time (5 runs)  ^^^^^^^^^^^^^ pages cleared/copied (as reported by patched kernel)
A/a - align/do not align loop
P/p - prefetch/do not prefetch
N/n - nt stores/normal stores
"mmx_APn/APN" means "normal stores for clear and nt stores for copy"
(because nt clear is already known to be bad)

nt stores win as expected on working sets larger than the cache size

Smaller working set, 44k touched by fork and 20k by copying.
This is still larger than 64k L1 size:

20k copying, 5x20000 loops:
slow: 0m6.610 0m6.665 0m6.694 0m6.750 0m6.774 300315/1300468
mmx_APn: 0m6.208 0m6.218 0m6.263 0m6.335 0m6.452 300352/1300448
mmx_APN: 0m4.887 0m4.984 0m5.021 0m5.052 0m5.057 300295/1300443
mmx_APn/APN: 0m5.115 0m5.160 0m5.167 0m5.172 0m5.183 300292/1300443

Smallest working set possible for this test program.
44k touched by fork and 4k by copying:

4k copying, 5x40000 loops:
slow: 0m8.303 0m8.334 0m8.354 0m8.510 0m8.572 600313/1800473
mmx_APn: 0m8.233 0m8.350 0m8.406 0m8.407 0m8.642 600323/1800467
mmx_APN: 0m6.475 0m6.501 0m6.510 0m6.534 0m6.783 600302/1800436
mmx_APn/APN: 0m6.540 0m6.551 0m6.603 0m6.640 0m6.708 600271/1800442

Unexpectedly, these small ones still run quite noticeably faster
with nt stores!

Why? Simply because the small-workspace test did not need to read
back all 32k of data copied by fork. This is also likely to be
the case for the most frequent use of fork: fork+exec.

Thus with "normal" page clear and "nt" page copy routines
both clear and copy benchmarks run faster than with
stock kernel, both with small and large working set.

Am I wrong?
--
vda



* Re: [RFC] x86-64: Use SSE for copy_page and clear_page
  2005-05-31  9:15         ` Denis Vlasenko
@ 2005-05-31  9:23           ` Andi Kleen
  2005-05-31 13:59             ` Benjamin LaHaise
  2005-06-01  7:22             ` michael
  0 siblings, 2 replies; 21+ messages in thread
From: Andi Kleen @ 2005-05-31  9:23 UTC (permalink / raw)
  To: Denis Vlasenko; +Cc: dean gaudet, Jeff Garzik, Benjamin LaHaise, linux-kernel

> Thus with "normal" page clear and "nt" page copy routines
> both clear and copy benchmarks run faster than with
> stock kernel, both with small and large working set.
> 
> Am I wrong?

fork is only a corner case. The main case is a process allocating
memory using brk/mmap and then using it.

-Andi


* Re: [RFC] x86-64: Use SSE for copy_page and clear_page
  2005-05-31  9:23           ` Andi Kleen
@ 2005-05-31 13:59             ` Benjamin LaHaise
  2005-06-01  6:22               ` Denis Vlasenko
  2005-06-01  7:22             ` michael
  1 sibling, 1 reply; 21+ messages in thread
From: Benjamin LaHaise @ 2005-05-31 13:59 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Denis Vlasenko, dean gaudet, Jeff Garzik, linux-kernel

On Tue, May 31, 2005 at 11:23:58AM +0200, Andi Kleen wrote:
> fork is only a corner case. The main case is a process allocating
> memory using brk/mmap and then using it.

At least for kernel compiles, using non-temporal stores is a slight 
win (a 2-5s improvement on 4m30s).  Granted, there seems to be a 
lot of variation in kernel compile times.

A bit more experimentation shows that non-temporal stores plus a 
prefetch of the resulting data is still better than the existing 
routines and only slightly slower than the pure non-temporal version.  
That said, it seems to result in kernel compiles that are on the high 
side of the variations I normally see (4m40s, 4m38s) compared to the 
~4m30s for an unpatched kernel and ~4m25s-4m30s for the non-temporal 
store version.

		-ben
-- 
"Time is what keeps everything from happening all at once." -- John Wheeler


* Re: [RFC] x86-64: Use SSE for copy_page and clear_page
  2005-05-31 13:59             ` Benjamin LaHaise
@ 2005-06-01  6:22               ` Denis Vlasenko
  2005-06-01  6:47                 ` Denis Vlasenko
  0 siblings, 1 reply; 21+ messages in thread
From: Denis Vlasenko @ 2005-06-01  6:22 UTC (permalink / raw)
  To: Benjamin LaHaise, Andi Kleen; +Cc: dean gaudet, Jeff Garzik, linux-kernel

On Tuesday 31 May 2005 16:59, Benjamin LaHaise wrote:
> On Tue, May 31, 2005 at 11:23:58AM +0200, Andi Kleen wrote:
> > fork is only a corner case. The main case is a process allocating
> > memory using brk/mmap and then using it.

I did the tests. I confirm Andi's conclusion that
if you are going to use the cleared/copied page immediately,
nt stores are a loss.

However...
 
> At least for kernel compiles, using non-temporal stores is a slight 
> win (a 2-5s improvement on 4m30s).  Granted, there seems to be a 
> lot of variation in kernel compile times.
> 
> A bit more experimentation shows that non-temporal stores plus a 
> prefetch of the resulting data is still better than the existing 
> routines and only slightly slower than the pure non-temporal version.  
> That said, it seems to result in kernel compiles that are on the high 
> side of the variations I normally see (4m40s, 4m38s) compared to the 
> ~4m30s for an unpatched kernel and ~4m25s-4m30s for the non-temporal 
> store version.

My kernel compiles involved ~5000000 page clears and ~300000 page copies.

slow (rep stosd/rep movsd), three runs:
real    12m47.530s
user    11m24.523s
sys     1m17.868s

real    12m45.362s
user    11m24.708s
sys     1m18.286s

real    12m45.152s
user    11m25.030s
sys     1m17.985s

mmx_APn/APN (mmx page clear, mmx page copy with nt stores):
real    12m41.737s
user    11m26.104s
sys     1m12.126s

real    12m40.753s
user    11m26.512s
sys     1m11.185s

mmx_APN  (mmx page clear with nt stores, mmx page copy with nt stores):
real    12m37.913s
user    11m30.376s
sys     1m4.622s

My kernel compiles on a 2000 MHz Athlon were faster too.
--
vda



* Re: [RFC] x86-64: Use SSE for copy_page and clear_page
  2005-06-01  6:22               ` Denis Vlasenko
@ 2005-06-01  6:47                 ` Denis Vlasenko
  0 siblings, 0 replies; 21+ messages in thread
From: Denis Vlasenko @ 2005-06-01  6:47 UTC (permalink / raw)
  To: Benjamin LaHaise, Andi Kleen; +Cc: dean gaudet, Jeff Garzik, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 2733 bytes --]

On Wednesday 01 June 2005 09:22, Denis Vlasenko wrote:
> On Tuesday 31 May 2005 16:59, Benjamin LaHaise wrote:
> > On Tue, May 31, 2005 at 11:23:58AM +0200, Andi Kleen wrote:
> > > fork is only a corner case. The main case is a process allocating
> > > memory using brk/mmap and then using it.
> 
> I did the tests. I confirm Andi's conclusion that
> if you are going to use the cleared/copied page immediately,
> nt stores are a loss.

For anyone interested, here are the results and a tarball with the patch
which allows switching routines on the fly, plus test programs/scripts.

You need to apply the patch, compile the programs (see testing/mk)
and run them (see testing/m).

Zero 1 page (4k), x300000 times:
./zero1 4 300000
method............. times.................. clears/copies
slow:               0m2.094 0m2.143 0m2.143 1800225/332
mmx_APn:            0m1.931 0m2.036 0m2.053 1800207/316
mmx_APN:            0m3.458 0m3.476 0m3.487 1800212/335
mmx_APn/APN:        0m2.023 0m2.057 0m2.225 1800205/297
n - normal stores, N - nt stores, n/N - nt stores for copy only

Zero 100 pages, x3000 times (zero all pages, read back all pages):
./zero1 400 3000
slow:               0m2.861 0m2.865 0m2.958 909218/323
mmx_APn:            0m2.626 0m2.715 0m2.783 909208/307
mmx_APN:            0m2.240 0m2.249 0m2.251 909212/294
mmx_APn/APN:        0m2.715 0m2.752 0m2.761 909220/324

Zero 100 pages, x3000 times (zero page, read back page, repeat for all pages):
./zero2 400 3000
slow:               0m1.711 0m1.734 0m1.739 909211/306
mmx_APn:            0m1.548 0m1.585 0m1.616 909208/303
mmx_APN:            0m2.088 0m2.102 0m2.104 909203/287
mmx_APn/APN:        0m1.589 0m1.603 0m1.617 909202/288

Andi is right. If we read pages back immediately, we lose.
If we defer that, we win.

Fork and copy 1 page, x150000 times
./copy1 4 150000
slow:               0m4.464 0m4.487 0m4.495 1350240/900353
mmx_APn:            0m3.979 0m4.090 0m4.191 1350229/900327
mmx_APN:            0m5.685 0m5.722 0m5.744 1350229/900348
mmx_APn/APN:        0m4.730 0m4.826 0m4.975 1350221/900340

Fork and copy 100 pages, x1500 times (copy all pages, read back all copied pages):
./copy1 400 1500
slow:               0m2.561 0m2.568 0m2.576 14019/454824
mmx_APn:            0m2.230 0m2.237 0m2.242 14011/454804
mmx_APN:            0m1.841 0m1.865 0m1.876 14008/454791
mmx_APn/APN:        0m1.906 0m1.909 0m1.922 14010/454788

Fork and copy 100 pages, x1500 times (copy page, read back copied page, repeat for each page):
./copy2 400 1500
slow:               0m2.097 0m2.107 0m2.121 14020/454821
mmx_APn:            0m1.769 0m1.776 0m1.788 14008/454787
mmx_APN:            0m1.975 0m1.978 0m2.015 14035/454791
mmx_APn/APN:        0m1.979 0m2.009 0m2.017 14013/454810
--
vda

[-- Attachment #2: t.tar.bz2 --]
[-- Type: application/x-tbz, Size: 8705 bytes --]


* Re: [RFC] x86-64: Use SSE for copy_page and clear_page
  2005-05-31  9:23           ` Andi Kleen
  2005-05-31 13:59             ` Benjamin LaHaise
@ 2005-06-01  7:22             ` michael
  2005-06-01  7:48               ` Andi Kleen
                                 ` (2 more replies)
  1 sibling, 3 replies; 21+ messages in thread
From: michael @ 2005-06-01  7:22 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Denis Vlasenko, dean gaudet, Jeff Garzik, Benjamin LaHaise,
	linux-kernel

Andi Kleen <ak@muc.de> writes:

> > Thus with "normal" page clear and "nt" page copy routines
> > both clear and copy benchmarks run faster than with
> > stock kernel, both with small and large working set.
> > 
> > Am I wrong?
> 
> fork is only a corner case. The main case is a process allocating
> memory using brk/mmap and then using it.

Key point: "using it". This normally involves writes to memory. Most
applications don't commonly read memory that they haven't previously
written to. (valgrind et al call that behaviour a "bug" :).

Given that, I'd say you really don't want the page zero routines
touching the cache.

Michael.


* Re: [RFC] x86-64: Use SSE for copy_page and clear_page
  2005-06-01  7:22             ` michael
@ 2005-06-01  7:48               ` Andi Kleen
  2005-06-01  7:48               ` Denis Vlasenko
  2005-06-01  8:01               ` Nick Piggin
  2 siblings, 0 replies; 21+ messages in thread
From: Andi Kleen @ 2005-06-01  7:48 UTC (permalink / raw)
  To: michael
  Cc: Denis Vlasenko, dean gaudet, Jeff Garzik, Benjamin LaHaise,
	linux-kernel

michael@optusnet.com.au writes:
>
> Key point: "using it". This normally involves writes to memory. Most
> applications don't commonly read memory that they haven't previously
> written to. (valgrind et al call that behaviour a "bug" :).
>
> Given that, I'd say you really don't want the page zero routines
> touching the cache.

Writing on a modern CPU requires reading first, too, to get the rest
of the cache line (provided you don't use write combining or uncached
accesses).

-Andi


* Re: [RFC] x86-64: Use SSE for copy_page and clear_page
  2005-06-01  7:22             ` michael
  2005-06-01  7:48               ` Andi Kleen
@ 2005-06-01  7:48               ` Denis Vlasenko
  2005-06-01 21:46                 ` dean gaudet
  2005-06-01  8:01               ` Nick Piggin
  2 siblings, 1 reply; 21+ messages in thread
From: Denis Vlasenko @ 2005-06-01  7:48 UTC (permalink / raw)
  To: michael, Andi Kleen
  Cc: dean gaudet, Jeff Garzik, Benjamin LaHaise, linux-kernel

On Wednesday 01 June 2005 10:22, michael@optusnet.com.au wrote:
> Andi Kleen <ak@muc.de> writes:
> 
> > > Thus with "normal" page clear and "nt" page copy routines
> > > both clear and copy benchmarks run faster than with
> > > stock kernel, both with small and large working set.
> > > 
> > > Am I wrong?
> > 
> > fork is only a corner case. The main case is a process allocating
> > memory using brk/mmap and then using it.
> 
> Key point: "using it". This normally involves writes to memory. Most
> applications don't commonly read memory that they haven't previously
> written to. (valgrind et al call that behaviour a "bug" :).
> 
> Given that, I'd say you really don't want the page zero routines
> touching the cache.

Heh, good point.

However, it is valid only if the program writes every byte in a cacheline.
Then a sufficiently smart CPU may avoid reading from main RAM.
(I am not sure that today's CPUs are smart enough. K6s were not.)

If you have even one uninitialized byte (struct padding, etc.)
between the bytes you write, the CPU will have to read from main memory
in order to have cachelines with fully valid data.
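
An illustrative example of such a hole: writing every member of this struct
still never touches the padding bytes, so a plain store stream over an
array of them cannot retire whole cachelines without reads:

	struct record {
		char tag;	/* 1 byte written                 */
				/* 3 padding bytes, never written */
		int  value;	/* 4 bytes written                */
	};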

The kernel compile did finish faster with nt stores, though...
--
vda



* Re: [RFC] x86-64: Use SSE for copy_page and clear_page
  2005-06-01  7:22             ` michael
  2005-06-01  7:48               ` Andi Kleen
  2005-06-01  7:48               ` Denis Vlasenko
@ 2005-06-01  8:01               ` Nick Piggin
  2 siblings, 0 replies; 21+ messages in thread
From: Nick Piggin @ 2005-06-01  8:01 UTC (permalink / raw)
  To: michael
  Cc: Andi Kleen, Denis Vlasenko, dean gaudet, Jeff Garzik,
	Benjamin LaHaise, lkml

On Wed, 2005-06-01 at 17:22 +1000, michael@optusnet.com.au wrote:
> Andi Kleen <ak@muc.de> writes:
> 

> > fork is only a corner case. The main case is a process allocating
> > memory using brk/mmap and then using it.
> 
> Key point: "using it". This normally involves writes to memory. Most
> applications don't commonly read memory that they haven't previously
> written to. (valgrind et al call that behaviour a "bug" :).

In that case you have doubled your memory bandwidth
requirement for those cachelines.

> 
> Given that, I'd say you really don't want the page zero routines
> touching the cache.
> 

The principle of locality-of-data (ie. the reason why caches
even work) says that you do ;)

Clearly some things benefit from not going through the cache.
But I don't think we should fundamentally change behaviour of
this *just* because it is worth a percent on kernel compiles.

Also, I think that trends in CPU design (more cache, further
from memory, multiple CPUs & cores) should favour stores
going to cache rather than straight to memory... But I'm
just speculating.

-- 
SUSE Labs, Novell Inc.





* Re: [RFC] x86-64: Use SSE for copy_page and clear_page
  2005-06-01  7:48               ` Denis Vlasenko
@ 2005-06-01 21:46                 ` dean gaudet
  0 siblings, 0 replies; 21+ messages in thread
From: dean gaudet @ 2005-06-01 21:46 UTC (permalink / raw)
  To: Denis Vlasenko
  Cc: michael, Andi Kleen, Jeff Garzik, Benjamin LaHaise, linux-kernel



On Wed, 1 Jun 2005, Denis Vlasenko wrote:

> However, it is valid only if the program writes every byte in a cacheline.
> Then a sufficiently smart CPU may avoid reading from main RAM.
> (I am not sure that today's CPUs are smart enough. K6s were not.)

nobody does this yet on regular stores...

so-called "non-temporal" stores actually go through the write-combiners 
(which is why Andi is referring to them as write-combining stores)... the 
write-combiners have byte-enables so they can detect if a full line is 
dirty or not.

in the event a write-combiner is flushed before it's full, the behaviour 
i've measured on all k8/p-m/p4 is to do a read-modify-write *at the memory 
interface*.  this occurs typically at a much slower cycle rate than it 
would in the cache itself... in theory DDR supports a byte-enabled write 
to memory, and there should be no need to do a read-modify-write sequence. 
however all of these processors (and/or their northbridges as appropriate) 
save pins on their package -- they don't have any pins for the DDR byte 
enables (they're hardwired to 'enabled' on the mobo).

(you can see this behaviour with any of the movnt instructions or with
maskmov ... just leave holes in the lines and watch the store cost go
through the roof.)
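
A user-space sketch of that experiment (assumptions: x86-64, gcc, 64-byte
cachelines; the timing harness and buffer allocation are left out):

	#include <stdint.h>

	/* store either all 64 bytes of each line, or only the first 56,
	 * leaving an 8-byte hole that forces a partial WC-buffer flush */
	void nt_fill(char *buf, long bytes, int leave_hole)
	{
		int stop = leave_hole ? 56 : 64;
		long line;
		int off;

		for (line = 0; line < bytes / 64; line++, buf += 64)
			for (off = 0; off < stop; off += 8)
				__asm__ __volatile__ (" movnti %1,%0\n"
					: "=m" (*(uint64_t *)(buf + off))
					: "r" (0UL));
		__asm__ __volatile__ (" sfence\n" : : : "memory");
	}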

-dean

