From: Benjamin LaHaise <bcrl@kvack.org>
To: ak@muc.de
Cc: linux-kernel@vger.kernel.org
Subject: [RFC] x86-64: Use SSE for copy_page and clear_page
Date: Mon, 30 May 2005 14:16:26 -0400 [thread overview]
Message-ID: <20050530181626.GA10212@kvack.org> (raw)
Hello Andi,
Below is a patch that uses 128 bit SSE instructions for copy_page and
clear_page. This is an improvement on P4 systems as can be seen by
running the test program at http://www.kvack.org/~bcrl/xmm64.c to get
results like:
SSE test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $ buffer = 0x2aaaaaad6000
clear_page() tests
clear_page function 'warm up run' took 25444 cycles per page
clear_page function 'kernel clear' took 6595 cycles per page
clear_page function '2.4 non MMX' took 7827 cycles per page
clear_page function '2.4 MMX fallback' took 7741 cycles per page
clear_page function '2.4 MMX version' took 6454 cycles per page
clear_page function 'faster_clear_page' took 4344 cycles per page
clear_page function 'even_faster_clear' took 4151 cycles per page
clear_page function 'xmm_clear ' took 3204 cycles per page
clear_page function 'xmma_clear ' took 6080 cycles per page
clear_page function 'xmm2_clear ' took 3370 cycles per page
clear_page function 'xmma2_clear ' took 6115 cycles per page
clear_page function 'kernel clear' took 6583 cycles per page
copy_page() tests
copy_page function 'warm up run' took 9770 cycles per page
copy_page function '2.4 non MMX' took 9758 cycles per page
copy_page function '2.4 MMX fallback' took 9572 cycles per page
copy_page function '2.4 MMX version' took 9405 cycles per page
copy_page function 'faster_copy' took 7407 cycles per page
copy_page function 'even_faster' took 7158 cycles per page
copy_page function 'xmm_copy_page_no' took 6110 cycles per page
copy_page function 'xmm_copy_page' took 5914 cycles per page
copy_page function 'xmma_copy_page' took 5913 cycles per page
copy_page function 'v26_copy_page' took 9168 cycles per page
The SSE clear page function is almost twice as fast as the kernel's
current clear_page, while the copy_page implementation is roughly a
third faster. This is likely due to the fact that SSE instructions
can keep the 256 bit wide L2 cache bus at a higher utilisation than
64 bit movs are able to. Comments?
-ben
Signed-off-by: Benjamin LaHaise <benjamin.c.lahaise@intel.com>
:r public_html/patches/v2.6.12-rc4-xmm-2.diff
diff -purN v2.6.12-rc4/arch/x86_64/lib/c_clear_page.c xmm-rc4/arch/x86_64/lib/c_clear_page.c
--- v2.6.12-rc4/arch/x86_64/lib/c_clear_page.c 1969-12-31 19:00:00.000000000 -0500
+++ xmm-rc4/arch/x86_64/lib/c_clear_page.c 2005-05-26 11:16:09.000000000 -0400
@@ -0,0 +1,45 @@
+#include <linux/config.h>
+#include <linux/preempt.h>
+#include <asm/page.h>
+#include <linux/kernel.h>
+#include <asm/string.h>
+
+typedef struct { unsigned long a,b; } __attribute__((aligned(16))) xmm_store_t; /* save slot for one XMM register (see xmm_save) */
+
+void c_clear_page_xmm(void *page)
+{
+ /* Zero one page with non-temporal 16-byte SSE stores (movntdq).
+ * xmm0 is saved/restored with movdqu since gcc may not align xmm_save;
+ * CR0 is read before clts and written back after the stores. */
+ xmm_store_t xmm_save[1];
+ unsigned long cr0;
+ int i;
+
+ preempt_disable();
+ __asm__ __volatile__ (
+ " mov %%cr0,%0\n" /* remember CR0 so it can be restored below */
+ " clts\n"
+ " movdqu %%xmm0,(%1)\n" /* stash the current xmm0 in xmm_save */
+ " pxor %%xmm0, %%xmm0\n" /* xmm0 = 0, the value stored to the page */
+ : "=&r" (cr0): "r" (xmm_save) : "memory"
+ );
+
+ for(i=0;i<PAGE_SIZE/64;i++) /* 64 bytes (four 16-byte stores) per pass */
+ {
+ __asm__ __volatile__ (
+ " movntdq %%xmm0, (%0)\n"
+ " movntdq %%xmm0, 16(%0)\n"
+ " movntdq %%xmm0, 32(%0)\n"
+ " movntdq %%xmm0, 48(%0)\n"
+ : : "r" (page) : "memory");
+ page+=64;
+ }
+
+ __asm__ __volatile__ (
+ " sfence \n " /* fence the movntdq stores before restoring state */
+ " movdqu (%0),%%xmm0\n" /* put back the saved xmm0 */
+ " mov %1,%%cr0\n" /* put back the CR0 value read on entry */
+ :: "r" (xmm_save), "r" (cr0)
+ );
+ preempt_enable();
+}
diff -purN v2.6.12-rc4/arch/x86_64/lib/c_copy_page.c xmm-rc4/arch/x86_64/lib/c_copy_page.c
--- v2.6.12-rc4/arch/x86_64/lib/c_copy_page.c 1969-12-31 19:00:00.000000000 -0500
+++ xmm-rc4/arch/x86_64/lib/c_copy_page.c 2005-05-30 14:07:28.000000000 -0400
@@ -0,0 +1,52 @@
+#include <linux/config.h>
+#include <linux/preempt.h>
+#include <asm/page.h>
+#include <linux/kernel.h>
+#include <asm/string.h>
+
+typedef struct { unsigned long a,b; } __attribute__((aligned(16))) xmm_store_t; /* save slot for one XMM register (see xmm_save) */
+
+void c_copy_page_xmm(void *to, void *from)
+{
+ /* Copy one page in 32-byte steps (movdqa loads, movntdq stores).
+ * xmm0/xmm1 are saved via movdqu as gcc may not align xmm_save.
+ * NOTE(review): the first five prefetchnta use %1 (xmm_save), not from. */
+ xmm_store_t xmm_save[2];
+ unsigned long cr0;
+ int i;
+
+ preempt_disable();
+ __asm__ __volatile__ (
+ " prefetchnta (%1)\n"
+ " prefetchnta 64(%1)\n"
+ " prefetchnta 128(%1)\n"
+ " prefetchnta 192(%1)\n"
+ " prefetchnta 256(%1)\n"
+ " mov %%cr0,%0\n" /* remember CR0 so it can be restored below */
+ " clts\n"
+ " movdqu %%xmm0, (%1)\n" /* stash xmm0 in xmm_save[0] */
+ " movdqu %%xmm1,16(%1)\n" /* stash xmm1 in xmm_save[1] */
+ : "=&r" (cr0): "r" (xmm_save) : "memory"
+ );
+
+ for(i=0;i<PAGE_SIZE/32;i++) { /* 32 bytes (two 16-byte moves) per pass */
+ __asm__ __volatile__ (
+ " prefetchnta 320(%0)\n" /* prefetch 320 bytes ahead in the source */
+ " movdqa (%0),%%xmm0\n"
+ " movdqa 16(%0),%%xmm1\n"
+ " movntdq %%xmm0, (%1)\n"
+ " movntdq %%xmm1, 16(%1)\n"
+ : : "r" (from), "r" (to) : "memory");
+ to += 32;
+ from += 32;
+ }
+
+ __asm__ __volatile__ (
+ " sfence \n " /* fence the movntdq stores before restoring state */
+ " movdqu (%0),%%xmm0\n" /* put back the saved xmm0 */
+ " movdqu 16(%0),%%xmm1\n" /* put back the saved xmm1 */
+ " mov %1,%%cr0\n" /* put back the CR0 value read on entry */
+ :: "r" (xmm_save), "r" (cr0)
+ );
+ preempt_enable();
+}
diff -purN v2.6.12-rc4/arch/x86_64/lib/clear_page.S xmm-rc4/arch/x86_64/lib/clear_page.S
--- v2.6.12-rc4/arch/x86_64/lib/clear_page.S 2004-12-24 16:34:33.000000000 -0500
+++ xmm-rc4/arch/x86_64/lib/clear_page.S 2005-05-26 11:27:26.000000000 -0400
@@ -1,3 +1,5 @@
+#include <asm/cpufeature.h>
+
/*
* Zero a page.
* rdi page
@@ -24,12 +26,25 @@ clear_page:
nop
ret
clear_page_end:
-
+
+ .section .altinstructions,"a"
+ .align 8
+ .quad clear_page
+ .quad clear_page_xmm
+ .byte X86_FEATURE_XMM2
+ .byte clear_page_end-clear_page
+ .byte clear_page_xmm_end-clear_page_xmm
+ .previous
+
+ .globl c_clear_page_xmm
+ .p2align 4
+clear_page_xmm:
+ jmp c_clear_page_xmm+(clear_page_xmm-clear_page)
+clear_page_xmm_end:
+
/* C stepping K8 run faster using the string instructions.
It is also a lot simpler. Use this when possible */
-#include <asm/cpufeature.h>
-
.section .altinstructions,"a"
.align 8
.quad clear_page
diff -purN v2.6.12-rc4/arch/x86_64/lib/copy_page.S xmm-rc4/arch/x86_64/lib/copy_page.S
--- v2.6.12-rc4/arch/x86_64/lib/copy_page.S 2004-12-24 16:34:32.000000000 -0500
+++ xmm-rc4/arch/x86_64/lib/copy_page.S 2005-05-26 11:29:55.000000000 -0400
@@ -76,18 +76,34 @@ copy_page:
movq 2*8(%rsp),%r13
addq $3*8,%rsp
ret
+copy_page_end = .
+#include <asm/cpufeature.h>
+
+ .section .altinstructions,"a"
+ .align 8
+ .quad copy_page
+ .quad copy_page_xmm
+ .byte X86_FEATURE_XMM2
+ .byte copy_page_end-copy_page
+ .byte copy_page_xmm_end-copy_page_xmm
+ .previous
+
+ .globl c_copy_page_xmm
+ .p2align 4
+copy_page_xmm:
+ jmp c_copy_page_xmm+(copy_page_xmm-copy_page)
+copy_page_xmm_end = .
+
/* C stepping K8 run faster using the string copy instructions.
It is also a lot simpler. Use this when possible */
-#include <asm/cpufeature.h>
-
.section .altinstructions,"a"
.align 8
.quad copy_page
.quad copy_page_c
.byte X86_FEATURE_K8_C
- .byte copy_page_c_end-copy_page_c
+ .byte copy_page_end-copy_page
.byte copy_page_c_end-copy_page_c
.previous
diff -purN v2.6.12-rc4/arch/x86_64/lib/Makefile xmm-rc4/arch/x86_64/lib/Makefile
--- v2.6.12-rc4/arch/x86_64/lib/Makefile 2004-12-24 16:34:01.000000000 -0500
+++ xmm-rc4/arch/x86_64/lib/Makefile 2005-05-26 11:26:50.000000000 -0400
@@ -10,5 +10,7 @@ lib-y := csum-partial.o csum-copy.o csum
usercopy.o getuser.o putuser.o \
thunk.o clear_page.o copy_page.o bitstr.o bitops.o
lib-y += memcpy.o memmove.o memset.o copy_user.o
+lib-y += c_clear_page.o
+lib-y += c_copy_page.o
lib-$(CONFIG_HAVE_DEC_LOCK) += dec_and_lock.o
next reply other threads:[~2005-05-30 18:17 UTC|newest]
Thread overview: 21+ messages / expand[flat|nested] mbox.gz Atom feed top
2005-05-30 18:16 Benjamin LaHaise [this message]
2005-05-30 18:45 ` [RFC] x86-64: Use SSE for copy_page and clear_page Jeff Garzik
2005-05-30 19:06 ` dean gaudet
2005-05-30 19:11 ` dean gaudet
2005-05-30 19:32 ` Andi Kleen
2005-05-31 8:37 ` Denis Vlasenko
2005-05-31 9:15 ` Denis Vlasenko
2005-05-31 9:23 ` Andi Kleen
2005-05-31 13:59 ` Benjamin LaHaise
2005-06-01 6:22 ` Denis Vlasenko
2005-06-01 6:47 ` Denis Vlasenko
2005-06-01 7:22 ` michael
2005-06-01 7:48 ` Andi Kleen
2005-06-01 7:48 ` Denis Vlasenko
2005-06-01 21:46 ` dean gaudet
2005-06-01 8:01 ` Nick Piggin
2005-05-30 19:38 ` Andi Kleen
2005-05-30 20:05 ` Michael Thonke
2005-05-30 20:14 ` Benjamin LaHaise
2005-05-30 20:42 ` Michael Thonke
2005-05-31 7:11 ` Andi Kleen
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20050530181626.GA10212@kvack.org \
--to=bcrl@kvack.org \
--cc=ak@muc.de \
--cc=linux-kernel@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.