[RFC] x86-64: Use SSE for copy_page and clear_page

public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed

From: Benjamin LaHaise <bcrl@kvack.org>
To: ak@muc.de
Cc: linux-kernel@vger.kernel.org
Subject: [RFC] x86-64: Use SSE for copy_page and clear_page
Date: Mon, 30 May 2005 14:16:26 -0400	[thread overview]
Message-ID: <20050530181626.GA10212@kvack.org> (raw)

Hello Andi,

Below is a patch that uses 128 bit SSE instructions for copy_page and 
clear_page.  This is an improvement on P4 systems as can be seen by 
running the test program at http://www.kvack.org/~bcrl/xmm64.c to get 
results like:

SSE test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $ buffer = 0x2aaaaaad6000
clear_page() tests 
clear_page function 'warm up run'        took 25444 cycles per page
clear_page function 'kernel clear'       took 6595 cycles per page
clear_page function '2.4 non MMX'        took 7827 cycles per page
clear_page function '2.4 MMX fallback'   took 7741 cycles per page
clear_page function '2.4 MMX version'    took 6454 cycles per page
clear_page function 'faster_clear_page'  took 4344 cycles per page
clear_page function 'even_faster_clear'  took 4151 cycles per page
clear_page function 'xmm_clear '         took 3204 cycles per page
clear_page function 'xmma_clear '        took 6080 cycles per page
clear_page function 'xmm2_clear '        took 3370 cycles per page
clear_page function 'xmma2_clear '       took 6115 cycles per page
clear_page function 'kernel clear'       took 6583 cycles per page

copy_page() tests 
copy_page function 'warm up run'         took 9770 cycles per page
copy_page function '2.4 non MMX'         took 9758 cycles per page
copy_page function '2.4 MMX fallback'    took 9572 cycles per page
copy_page function '2.4 MMX version'     took 9405 cycles per page
copy_page function 'faster_copy'         took 7407 cycles per page
copy_page function 'even_faster'         took 7158 cycles per page
copy_page function 'xmm_copy_page_no'    took 6110 cycles per page
copy_page function 'xmm_copy_page'       took 5914 cycles per page
copy_page function 'xmma_copy_page'      took 5913 cycles per page
copy_page function 'v26_copy_page'       took 9168 cycles per page

The SSE clear page function is almost twice as fast as the kernel's 
current clear_page, while the copy_page implementation is roughly a 
third faster.  This is likely due to the fact that SSE instructions 
can keep the 256 bit wide L2 cache bus at a higher utilisation than 
64 bit movs are able to.  Comments?

		-ben

Signed-off-by: Benjamin LaHaise <benjamin.c.lahaise@intel.com>
:r public_html/patches/v2.6.12-rc4-xmm-2.diff
diff -purN v2.6.12-rc4/arch/x86_64/lib/c_clear_page.c xmm-rc4/arch/x86_64/lib/c_clear_page.c
--- v2.6.12-rc4/arch/x86_64/lib/c_clear_page.c	1969-12-31 19:00:00.000000000 -0500
+++ xmm-rc4/arch/x86_64/lib/c_clear_page.c	2005-05-26 11:16:09.000000000 -0400
@@ -0,0 +1,45 @@
+#include <linux/config.h>
+#include <linux/preempt.h>
+#include <asm/page.h>
+#include <linux/kernel.h>
+#include <asm/string.h>
+
+typedef struct { unsigned long a,b; } __attribute__((aligned(16))) xmm_store_t;
+
+void c_clear_page_xmm(void *page)
+{
+	/* Zero one page with non-temporal 16-byte stores of a cleared xmm0; the
+	 * old xmm0 and CR0 are saved first and restored after.  gcc doesn't seem
+	 * to align stack variables properly, so xmm_save is accessed via movdqu. */
+	xmm_store_t xmm_save[1];	/* holds the previous contents of xmm0 */
+	unsigned long cr0;		/* CR0 as read on entry, written back at the end */
+	int i;
+
+	preempt_disable();
+	__asm__ __volatile__ (
+		" mov %%cr0,%0\n"		/* remember CR0 before clts changes it */
+		" clts\n"
+		" movdqu %%xmm0,(%1)\n"		/* save xmm0 (unaligned store) */
+		" pxor %%xmm0, %%xmm0\n"	/* xmm0 = 0, the fill value */
+		: "=&r" (cr0): "r" (xmm_save) : "memory"
+	);
+
+	for(i=0;i<PAGE_SIZE/64;i++)	/* 64 bytes are written per iteration */
+	{
+		__asm__ __volatile__ (
+		" movntdq %%xmm0, (%0)\n"
+		" movntdq %%xmm0, 16(%0)\n"
+		" movntdq %%xmm0, 32(%0)\n"
+		" movntdq %%xmm0, 48(%0)\n"
+		: : "r" (page) : "memory");
+		page+=64;	/* arithmetic on void * (gcc extension): advances 64 bytes */
+	}
+
+	__asm__ __volatile__ (
+		" sfence \n "			/* fence after the movntdq stores */
+		" movdqu (%0),%%xmm0\n"		/* put the saved xmm0 back */
+		" mov %1,%%cr0\n"		/* write the saved CR0 value back */
+		:: "r" (xmm_save), "r" (cr0)
+	);
+	preempt_enable();
+}
diff -purN v2.6.12-rc4/arch/x86_64/lib/c_copy_page.c xmm-rc4/arch/x86_64/lib/c_copy_page.c
--- v2.6.12-rc4/arch/x86_64/lib/c_copy_page.c	1969-12-31 19:00:00.000000000 -0500
+++ xmm-rc4/arch/x86_64/lib/c_copy_page.c	2005-05-30 14:07:28.000000000 -0400
@@ -0,0 +1,52 @@
+#include <linux/config.h>
+#include <linux/preempt.h>
+#include <asm/page.h>
+#include <linux/kernel.h>
+#include <asm/string.h>
+
+typedef struct { unsigned long a,b; } __attribute__((aligned(16))) xmm_store_t;
+
+void c_copy_page_xmm(void *to, void *from)
+{
+	/* Copy one page from 'from' to 'to' using aligned 16-byte loads and
+	 * non-temporal stores; xmm0/xmm1 and CR0 are saved and restored.  gcc
+	 * doesn't seem to align stack variables, so xmm_save uses movdqu. */
+	xmm_store_t xmm_save[2];	/* holds the previous xmm0 and xmm1 */
+	unsigned long cr0;		/* CR0 as read on entry, written back at the end */
+	int i;
+
+	preempt_disable();
+	__asm__ __volatile__ (
+                " prefetchnta    (%2)\n"	/* prime the first 320 bytes of the */
+                " prefetchnta  64(%2)\n"	/* SOURCE page (%2 == from); these  */
+                " prefetchnta 128(%2)\n"	/* previously used %1, which is the */
+                " prefetchnta 192(%2)\n"	/* xmm_save area on the stack       */
+                " prefetchnta 256(%2)\n"
+		" mov %%cr0,%0\n"		/* remember CR0 before clts changes it */
+		" clts\n"
+		" movdqu %%xmm0,  (%1)\n"	/* save xmm0/xmm1 (unaligned stores) */
+		" movdqu %%xmm1,16(%1)\n"
+		: "=&r" (cr0): "r" (xmm_save), "r" (from) : "memory"
+	);
+
+	for(i=0;i<PAGE_SIZE/32;i++) {	/* 32 bytes are copied per iteration */
+		__asm__ __volatile__ (
+		" prefetchnta 320(%0)\n"	/* read 320 bytes ahead of the loads */
+		" movdqa   (%0),%%xmm0\n"
+		" movdqa 16(%0),%%xmm1\n"
+		" movntdq %%xmm0,   (%1)\n"
+		" movntdq %%xmm1, 16(%1)\n"
+		: : "r" (from), "r" (to) : "memory");
+		to += 32;	/* arithmetic on void * (gcc extension): 32 bytes */
+		from += 32;
+	}
+
+	__asm__ __volatile__ (
+		" sfence \n "			/* fence after the movntdq stores */
+		" movdqu   (%0),%%xmm0\n"	/* put the saved xmm0/xmm1 back */
+		" movdqu 16(%0),%%xmm1\n"
+		" mov %1,%%cr0\n"		/* write the saved CR0 value back */
+		:: "r" (xmm_save), "r" (cr0)
+	);
+	preempt_enable();
+}
diff -purN v2.6.12-rc4/arch/x86_64/lib/clear_page.S xmm-rc4/arch/x86_64/lib/clear_page.S
--- v2.6.12-rc4/arch/x86_64/lib/clear_page.S	2004-12-24 16:34:33.000000000 -0500
+++ xmm-rc4/arch/x86_64/lib/clear_page.S	2005-05-26 11:27:26.000000000 -0400
@@ -1,3 +1,5 @@
+#include <asm/cpufeature.h>
+	    	
 /*
  * Zero a page. 	
  * rdi	page
@@ -24,12 +26,25 @@ clear_page:
 	nop
 	ret
 clear_page_end:	
-	
+
+	.section .altinstructions,"a"
+	.align 8
+	.quad  clear_page
+	.quad  clear_page_xmm
+	.byte  X86_FEATURE_XMM2
+	.byte  clear_page_end-clear_page	
+	.byte  clear_page_xmm_end-clear_page_xmm
+	.previous
+
+	.globl	c_clear_page_xmm
+	.p2align 4
+clear_page_xmm:
+	jmp	c_clear_page_xmm+(clear_page_xmm-clear_page)
+clear_page_xmm_end:
+
 	/* C stepping K8 run faster using the string instructions.
 	   It is also a lot simpler. Use this when possible */
 	
-#include <asm/cpufeature.h>
-	    	
 	.section .altinstructions,"a"
 	.align 8
 	.quad  clear_page
diff -purN v2.6.12-rc4/arch/x86_64/lib/copy_page.S xmm-rc4/arch/x86_64/lib/copy_page.S
--- v2.6.12-rc4/arch/x86_64/lib/copy_page.S	2004-12-24 16:34:32.000000000 -0500
+++ xmm-rc4/arch/x86_64/lib/copy_page.S	2005-05-26 11:29:55.000000000 -0400
@@ -76,18 +76,34 @@ copy_page:
 	movq	2*8(%rsp),%r13
 	addq	$3*8,%rsp
 	ret
+copy_page_end = .
 	
+#include <asm/cpufeature.h>		
+		
+	.section .altinstructions,"a"
+	.align 8
+	.quad  copy_page
+	.quad  copy_page_xmm
+	.byte  X86_FEATURE_XMM2
+	.byte  copy_page_end-copy_page	
+	.byte  copy_page_xmm_end-copy_page_xmm
+	.previous
+
+	.globl	c_copy_page_xmm
+	.p2align 4
+copy_page_xmm:
+	jmp	c_copy_page_xmm+(copy_page_xmm-copy_page)
+copy_page_xmm_end = .
+
 	/* C stepping K8 run faster using the string copy instructions.
 	   It is also a lot simpler. Use this when possible */
 
-#include <asm/cpufeature.h>		
-		
 	.section .altinstructions,"a"
 	.align 8
 	.quad  copy_page
 	.quad  copy_page_c
 	.byte  X86_FEATURE_K8_C
-	.byte  copy_page_c_end-copy_page_c
+	.byte  copy_page_end-copy_page
 	.byte  copy_page_c_end-copy_page_c
 	.previous
 
diff -purN v2.6.12-rc4/arch/x86_64/lib/Makefile xmm-rc4/arch/x86_64/lib/Makefile
--- v2.6.12-rc4/arch/x86_64/lib/Makefile	2004-12-24 16:34:01.000000000 -0500
+++ xmm-rc4/arch/x86_64/lib/Makefile	2005-05-26 11:26:50.000000000 -0400
@@ -10,5 +10,7 @@ lib-y := csum-partial.o csum-copy.o csum
 	usercopy.o getuser.o putuser.o  \
 	thunk.o clear_page.o copy_page.o bitstr.o bitops.o
 lib-y += memcpy.o memmove.o memset.o copy_user.o
+lib-y += c_clear_page.o
+lib-y += c_copy_page.o
 
 lib-$(CONFIG_HAVE_DEC_LOCK) += dec_and_lock.o

next             reply	other threads:[~2005-05-30 18:17 UTC|newest]

Thread overview: 21+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2005-05-30 18:16 Benjamin LaHaise [this message]
2005-05-30 18:45 ` [RFC] x86-64: Use SSE for copy_page and clear_page Jeff Garzik
2005-05-30 19:06 ` dean gaudet
2005-05-30 19:11   ` dean gaudet
2005-05-30 19:32     ` Andi Kleen
2005-05-31  8:37       ` Denis Vlasenko
2005-05-31  9:15         ` Denis Vlasenko
2005-05-31  9:23           ` Andi Kleen
2005-05-31 13:59             ` Benjamin LaHaise
2005-06-01  6:22               ` Denis Vlasenko
2005-06-01  6:47                 ` Denis Vlasenko
2005-06-01  7:22             ` michael
2005-06-01  7:48               ` Andi Kleen
2005-06-01  7:48               ` Denis Vlasenko
2005-06-01 21:46                 ` dean gaudet
2005-06-01  8:01               ` Nick Piggin
2005-05-30 19:38 ` Andi Kleen
2005-05-30 20:05   ` Michael Thonke
2005-05-30 20:14     ` Benjamin LaHaise
2005-05-30 20:42       ` Michael Thonke
2005-05-31  7:11     ` Andi Kleen

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20050530181626.GA10212@kvack.org \
    --to=bcrl@kvack.org \
    --cc=ak@muc.de \
    --cc=linux-kernel@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox