public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [beta patch] SSE copy_page() / clear_page()
@ 2001-02-09 22:17 Manfred Spraul
  2001-02-09 22:40 ` Linus Torvalds
       [not found] ` <200102092240.OAA15902@penguin.transmeta.com>
  0 siblings, 2 replies; 14+ messages in thread
From: Manfred Spraul @ 2001-02-09 22:17 UTC (permalink / raw)
  To: linux-kernel

[-- Attachment #1: Type: text/plain, Size: 843 bytes --]

I wrote a kernel patch that replaces the standard
copy_page()/clear_page() functions on Pentium III and Pentium IV with
SSE instructions.

If you have access to a Pentium 4 it would be great if you could
download the user space test apps from

http://colorfullife.com/~manfred/sse/

and run them.

The patch itself is still beta:
* find optimal number of %%xmm registers for copy_page(). The current
code uses 4, but 2 registers is faster on my Pentium III.

* use sse for normal memcopy. The main advantage of sse over mmx is
that only the clobbered registers must be saved, not the full fpu state.

* verify that the code doesn't break SSE enabled apps.
I checked a sse enabled mp3 encoder and Mesa.

The implementation is derived from Intel sample code available at

http://developer.intel.com/software/idap/media/pdf/copy.pdf

--
	Manfred

[-- Attachment #2: patch-sse --]
[-- Type: text/plain, Size: 4899 bytes --]

// $Header$
// Kernel Version:
//  VERSION = 2
//  PATCHLEVEL = 4
//  SUBLEVEL = 1
//  EXTRAVERSION =
--- 2.4/arch/i386/config.in	Sat Feb  3 14:02:24 2001
+++ build-2.4/arch/i386/config.in	Fri Feb  9 15:52:19 2001
@@ -91,6 +91,7 @@
    define_bool CONFIG_X86_GOOD_APIC y
    define_bool CONFIG_X86_PGE y
    define_bool CONFIG_X86_USE_PPRO_CHECKSUM y
+   define_bool CONFIG_X86_USE_SSE y
 fi
 if [ "$CONFIG_MPENTIUM4" = "y" ]; then
    define_int  CONFIG_X86_L1_CACHE_SHIFT 7
@@ -98,6 +99,7 @@
    define_bool CONFIG_X86_GOOD_APIC y
    define_bool CONFIG_X86_PGE y
    define_bool CONFIG_X86_USE_PPRO_CHECKSUM y
+   define_bool CONFIG_X86_USE_SSE y
 fi
 if [ "$CONFIG_MK6" = "y" ]; then
    define_int  CONFIG_X86_L1_CACHE_SHIFT 5
--- 2.4/arch/i386/kernel/i386_ksyms.c	Sat Feb  3 14:02:24 2001
+++ build-2.4/arch/i386/kernel/i386_ksyms.c	Fri Feb  9 15:52:19 2001
@@ -117,6 +117,11 @@
 EXPORT_SYMBOL(mmx_copy_page);
 #endif
 
+#ifdef CONFIG_X86_USE_SSE
+EXPORT_SYMBOL(sse_clear_page);
+EXPORT_SYMBOL(sse_copy_page);
+#endif
+
 #ifdef CONFIG_SMP
 EXPORT_SYMBOL(cpu_data);
 EXPORT_SYMBOL(kernel_flag);
diff -urN --exclude .depend 2.4/arch/i386/lib/Makefile build-2.4/arch/i386/lib/Makefile
--- 2.4/arch/i386/lib/Makefile	Sat Feb  3 14:02:24 2001
+++ build-2.4/arch/i386/lib/Makefile	Fri Feb  9 15:52:19 2001
@@ -12,6 +12,7 @@
 	memcpy.o
 
 obj-$(CONFIG_X86_USE_3DNOW) += mmx.o
+obj-$(CONFIG_X86_USE_SSE) += sse.o
 obj-$(CONFIG_HAVE_DEC_LOCK) += dec_and_lock.o
 
 include $(TOPDIR)/Rules.make
diff -urN --exclude .depend 2.4/arch/i386/lib/sse.c build-2.4/arch/i386/lib/sse.c
--- 2.4/arch/i386/lib/sse.c	Thu Jan  1 01:00:00 1970
+++ build-2.4/arch/i386/lib/sse.c	Fri Feb  9 15:52:19 2001
@@ -0,0 +1,89 @@
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/sched.h>
+
+#include <asm/i387.h>
+
+/*
+ *	SSE library helper functions
+ *
+ *	Copyright (C) 2001 Manfred Spraul
+ *
+ *	Based on Intel sample code from
+ *	 Block Copy Using Pentium(R) III Streaming SIMD Extensions
+ *		Revision 1.9
+ *		January 12, 1999	
+ *
+ */
+ 
+/* Zero the 4096 bytes at page using SSE non-temporal stores (32 loops x 128 bytes). */
+void sse_clear_page(void * page)
+{
+	int storage[4];		/* scratch area where %xmm0 is saved */
+	int d0, d1, d2;		/* d0: loop counter, d1: store pointer, d2: saved %cr0 */
+	__asm__ __volatile__(
+		"mov %%cr0, %2\n\t"		/* save %cr0 so it can be restored at the end */
+		"clts\n\t"			/* enable the FPU (clear the TS flag) */
+		"movups %%xmm0, (%3)\n\t"	/* save the current %xmm0 into storage */
+		"xorps %%xmm0, %%xmm0\n\t"	/* %xmm0 = 0, the fill value */
+		"xor %0, %0\n\t"		/* loop counter = 0 */
+		"1: movntps %%xmm0, (%1)\n\t"	/* 8 non-temporal 16-byte stores = 128 bytes */
+		"movntps %%xmm0, 16(%1)\n\t"
+		"movntps %%xmm0, 32(%1)\n\t"
+		"movntps %%xmm0, 48(%1)\n\t"
+		"movntps %%xmm0, 64(%1)\n\t"
+		"movntps %%xmm0, 80(%1)\n\t"
+		"movntps %%xmm0, 96(%1)\n\t"
+		"movntps %%xmm0, 112(%1)\n\t"
+		"add $128, %1\n\t"		/* advance the store pointer */
+		"inc %0\n\t"
+		"cmp $32, %0\n\t"		/* 32 iterations x 128 bytes = 4096 bytes */
+		"jne 1b\n\t"
+		"movups (%3), %%xmm0\n\t"	/* restore the saved %xmm0 */
+		"sfence\n\t"			/* store fence after the non-temporal stores */
+		"mov %2, %%cr0\n\t"		/* restore the original %cr0 */
+		: "=&r" (d0), "=&r" (d1), "=&r" (d2)
+		: "r" (&storage), "1" (page)	/* &storage is %3; "1" ties page to d1 (%1) */
+		: "cc", "memory");
+}
+/* Copy 4096 bytes from from to to: prefetch the source, then movaps loads and movntps stores. */
+void sse_copy_page(void *to, void *from)
+{
+	int storage[16];	/* scratch area where %xmm0-%xmm3 are saved */
+	int d0, d1, d2, d3;	/* d0: byte counter, d1: dest pointer, d2: source pointer, d3: saved %cr0 */
+	__asm__ __volatile__(
+		"mov %%cr0, %3\n\t"		/* step 1: enable the FPU */
+		"clts\n\t"			/* clear the TS flag */
+		"movups %%xmm0, (%4)\n\t"	/* step 2: save the clobbered regs */
+		"movups %%xmm1, 16(%4)\n\t"
+		"movups %%xmm2, 32(%4)\n\t"
+		"movups %%xmm3, 48(%4)\n\t"
+		"mov (%2), %0\n\t"		/* step 3: load the TLB */
+		"xor %0, %0\n\t"		/* step 4: prefetch the page */
+		"1:prefetchnta (%2, %0)\n\t"	/* two prefetches, 32 bytes apart, per 64-byte step */
+		"prefetchnta 32(%2, %0)\n\t"
+		"add $64,%0\n\t"
+		"cmp $4096, %0\n\t"		/* stop after 4096 bytes; %0 is left at 4096 */
+		"jne 1b\n\t"
+		"2: movaps (%2), %%xmm0\n\t"	/* step 5: copy the page */
+		"movaps 16(%2), %%xmm1\n\t"
+		"movaps 32(%2), %%xmm2\n\t"
+		"movaps 48(%2), %%xmm3\n\t"
+		"add $64, %2\n\t"		/* advance the source pointer */
+		"movntps %%xmm0, (%1)\n\t"	/* non-temporal stores of the 64 bytes just loaded */
+		"movntps %%xmm1, 16(%1)\n\t"
+		"movntps %%xmm2, 32(%1)\n\t"
+		"movntps %%xmm3, 48(%1)\n\t"
+		"add $64, %1\n\t"		/* advance the destination pointer */
+		"sub $64, %0\n\t"		/* count down from the 4096 left by the prefetch loop */
+		"jnz 2b\n\t"
+		"movups (%4), %%xmm0\n\t"	/* step 6: restore the clobbered regs */
+		"movups 16(%4), %%xmm1\n\t"
+		"movups 32(%4), %%xmm2\n\t"
+		"movups 48(%4), %%xmm3\n\t"
+		"sfence\n\t"			/* store fence after the non-temporal stores */
+		"mov %3, %%cr0\n\t"		/* step 7: restore cr0 */
+		: "=&r" (d0), "=&r" (d1), "=&r" (d2), "=&r" (d3)
+		: "r" (&storage), "1" (to), "2" (from)	/* &storage is %4; "1"/"2" tie to/from to d1/d2 */
+		: "cc", "memory");
+}
diff -urN 2.4/include/asm-i386/page.h build-2.4/include/asm-i386/page.h
--- 2.4/include/asm-i386/page.h	Thu Jan  4 23:50:46 2001
+++ build-2.4/include/asm-i386/page.h	Fri Feb  9 15:52:19 2001
@@ -11,7 +11,14 @@
 
 #include <linux/config.h>
 
-#ifdef CONFIG_X86_USE_3DNOW
+#ifdef CONFIG_X86_USE_SSE
+
+#include <asm/sse.h>
+
+#define clear_page(page)	sse_clear_page(page)
+#define copy_page(to,from)	sse_copy_page(to,from)
+
+#elif defined(CONFIG_X86_USE_3DNOW)
 
 #include <asm/mmx.h>
 
diff -urN 2.4/include/asm-i386/sse.h build-2.4/include/asm-i386/sse.h
--- 2.4/include/asm-i386/sse.h	Thu Jan  1 01:00:00 1970
+++ build-2.4/include/asm-i386/sse.h	Fri Feb  9 17:26:34 2001
@@ -0,0 +1,11 @@
+#ifndef _ASM_SSE_H
+#define _ASM_SSE_H
+
+/*
+ *	SSE helper operations
+ */
+ 
+extern void sse_clear_page(void *page);
+extern void sse_copy_page(void *to, void *from);
+
+#endif

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2001-02-20 21:17 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2001-02-09 22:17 [beta patch] SSE copy_page() / clear_page() Manfred Spraul
2001-02-09 22:40 ` Linus Torvalds
2001-02-09 23:03   ` Doug Ledford
2001-02-10  9:09     ` Manfred Spraul
2001-02-10 17:18       ` Doug Ledford
2001-02-10 18:00         ` Manfred Spraul
2001-02-10 18:18           ` Manfred Spraul
     [not found] ` <200102092240.OAA15902@penguin.transmeta.com>
2001-02-14 22:37   ` Manfred Spraul
2001-02-16 15:27     ` Andrew Morton
2001-02-20 17:35     ` Pavel Machek
2001-02-20 20:49       ` Alan Cox
2001-02-20 20:52         ` Pavel Machek
2001-02-20 21:08           ` Alan Cox
2001-02-20 21:16           ` Manfred Spraul

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox