All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC] framework for fpu usage in kernel
@ 2001-02-11 19:03 Manfred Spraul
  2001-02-11 20:26 ` Alan Cox
  0 siblings, 1 reply; 4+ messages in thread
From: Manfred Spraul @ 2001-02-11 19:03 UTC (permalink / raw)
  To: linux-kernel, Arjan van de Ven, Doug Ledford, Linus Torvalds

[-- Attachment #1: Type: text/plain, Size: 1728 bytes --]

Currently there are 2 fpu users in the kernel:
raid5 checksumming and 3dnow memcpy/memset.

raid5 checksumming is not problematic, but _mmx_memcpy() has unexpected
side effects if someone else is also using the fpu:
memcpy is a very generic function, and calling it saves the current
fpu state into thread.i387.f{,x}save. IMHO that's wrong: memcpy must
save into a local buffer, as raid5 checksumming does.

I've attached a proposal that supports arbitrary combinations of fpu
users in kernel space.

* kernel threads can use the fpu freely. Nothing new.
* 2 sets of functions for fpu usage in "normal" threads:

- kfpu_acquire(), kfpu_try_acquire() + release functions.

- kfpu_full_begin(), kfpu_mmx_begin(), kfpu_nosave_begin(),
kfpu_try_begin() + _end() functions.

The first set is only possible from normal process context. The caller
can sleep between _acquire() and _release().

The second set can be called from arbitrary context, but the caller must
not sleep between _begin() and _end().

The _try() functions check if the fpu is unused, and fail if the fpu is
currently in use. That way the memcpy()/memset() functions can avoid fpu
context saves/restores and it saves stack space.

Nesting is partially possible: _begin() within _acquire() is allowed,
but _acquire() within _begin() will BUG().

[_acquire()/_release() have similar restrictions as down() and up(),
_begin()/_end() have similar restrictions as spinlocks]

The patch itself is alpha quality: only the sse functions are tested, it
boots when compiled for Pentium III, the raid5 checksum _benchmark_
still works and distributed.net still cracks rc5.

I haven't yet checked that the exception handlers are still called
properly.

What do you think?
--
	Manfred

[-- Attachment #2: patch-kfpu --]
[-- Type: text/plain, Size: 20435 bytes --]

// $Header$
// Kernel Version:
//  VERSION = 2
//  PATCHLEVEL = 4
//  SUBLEVEL = 1
//  EXTRAVERSION =
diff -urN 2.4/include/asm-i386/i387.h build-2.4/include/asm-i386/i387.h
--- 2.4/include/asm-i386/i387.h	Fri Feb  2 15:20:36 2001
+++ build-2.4/include/asm-i386/i387.h	Sun Feb 11 19:40:45 2001
@@ -16,6 +16,7 @@
 #include <asm/sigcontext.h>
 #include <asm/user.h>
 
+#include <asm/kfpu.h>
 extern void init_fpu(void);
 /*
  * FPU lazy state save handling...
@@ -23,9 +24,7 @@
 extern void save_init_fpu( struct task_struct *tsk );
 extern void restore_fpu( struct task_struct *tsk );
 
-extern void kernel_fpu_begin(void);
-#define kernel_fpu_end() stts()
-
+#include <asm/kfpu.h>
 
 #define unlazy_fpu( tsk ) do { \
 	if ( tsk->flags & PF_USEDFPU ) \
@@ -36,7 +35,7 @@
 	if ( tsk->flags & PF_USEDFPU ) { \
 		asm volatile("fwait"); \
 		tsk->flags &= ~PF_USEDFPU; \
-		stts(); \
+		kfpu_leave(); \
 	} \
 } while (0)
 
diff -urN 2.4/include/asm-i386/kfpu.h build-2.4/include/asm-i386/kfpu.h
--- 2.4/include/asm-i386/kfpu.h	Thu Jan  1 01:00:00 1970
+++ build-2.4/include/asm-i386/kfpu.h	Sun Feb 11 19:59:51 2001
@@ -0,0 +1,83 @@
+#ifndef _ASM_KFPU_H
+#define _ASM_KFPU_H
+
+/*
+ * FPU support for kernel threads
+ *
+ * currently limited to MMX, SSE and SSE2.
+ * x87 and fpu emulation are not supported.
+ */
+
+/**********************************/
+/*
+ * Enable full fpu access.
+ * Only for kernel threads.
+ */
+void kfpu_start(void);
+
+
+/**********************************/
+/*
+ * Get full fpu access.
+ *
+ * Only permitted from process context.
+ * Caller must check that the FPU is present before calling.
+ */
+struct kfpubuf_acquire {
+	unsigned long saved;		/* set to 1 by kfpu_acquire() when buffer[] holds saved state, else 0 */
+	unsigned char buffer[512+16];	/* copy of current->thread.i387.f{,x}save */
+};	/* type only: a header must not define an object of this type */
+
+void kfpu_acquire(struct kfpubuf_acquire *buf);
+void kfpu_release(struct kfpubuf_acquire *buf);
+
+/* returns 1 if it got fpu access, 0 otherwise */
+int kfpu_try_acquire(void);
+void kfpu_try_release(void);
+
+
+/**********************************/
+/*
+ * Get short term fpu access.
+ *
+ * The functions can be called from any context (process,
+ * softirq, interrupt)
+ * Caller must check that the FPU is present before calling.
+ * The caller must not sleep between _begin() and _end()
+ */
+struct kfpubuf_full {
+	unsigned char buffer[512+16];	/* 512-byte fxsave image + slack for rounding the address up to 16 bytes */
+};
+
+struct kfpubuf_mmx {
+	unsigned char buffer[108];
+};
+
+void kfpu_full_begin(struct kfpubuf_full *buf);
+void kfpu_mmx_begin(struct kfpubuf_mmx *buf);
+/*
+ * ret val 0: caller doesn't need to save clobbered regs
+ * ret val !0: the caller must save & restore any clobbered
+ *		fpu registers.
+ * This function DOES NOT reinitialize the fpu!
+ */
+int kfpu_nosave_begin(void);
+
+void kfpu_full_end(struct kfpubuf_full *buf);
+void kfpu_mmx_end(struct kfpubuf_mmx *buf);
+void kfpu_nosave_end(void);
+
+/* returns 1 if it got fpu access */
+int kfpu_try_begin(void);
+void kfpu_try_end(void);
+
+/**********************************/
+
+/* internal function, called by math_state_restore() */
+void kfpu_enter(void);
+/* internal function, called by save_init_fpu() */
+void kfpu_leave(void);
+/* internal function, called by cpu_init() */
+void kfpu_initialize(void);
+
+#endif
diff -urN 2.4/include/asm-i386/page.h build-2.4/include/asm-i386/page.h
--- 2.4/include/asm-i386/page.h	Thu Jan  4 23:50:46 2001
+++ build-2.4/include/asm-i386/page.h	Sun Feb 11 13:10:17 2001
@@ -11,7 +11,14 @@
 
 #include <linux/config.h>
 
-#ifdef CONFIG_X86_USE_3DNOW
+#ifdef CONFIG_X86_USE_SSE
+
+#include <asm/sse.h>
+
+#define clear_page(page)	sse_clear_page(page)
+#define copy_page(to,from)	sse_copy_page(to,from)
+
+#elif defined(CONFIG_X86_USE_3DNOW)
 
 #include <asm/mmx.h>
 
diff -urN 2.4/include/asm-i386/sse.h build-2.4/include/asm-i386/sse.h
--- 2.4/include/asm-i386/sse.h	Thu Jan  1 01:00:00 1970
+++ build-2.4/include/asm-i386/sse.h	Sun Feb 11 12:59:43 2001
@@ -0,0 +1,11 @@
+#ifndef _ASM_SSE_H
+#define _ASM_SSE_H
+
+/*
+ *	SSE helper operations
+ */
+ 
+extern void sse_clear_page(void *page);
+extern void sse_copy_page(void *to, void *from);
+
+#endif
diff -urN 2.4/include/asm-i386/system.h build-2.4/include/asm-i386/system.h
--- 2.4/include/asm-i386/system.h	Sun Feb 11 00:39:07 2001
+++ build-2.4/include/asm-i386/system.h	Sun Feb 11 12:41:01 2001
@@ -100,7 +100,6 @@
 /*
  * Clear and set 'TS' bit respectively
  */
-#define clts() __asm__ __volatile__ ("clts")
 #define read_cr0() ({ \
 	unsigned int __dummy; \
 	__asm__( \
@@ -110,7 +109,6 @@
 })
 #define write_cr0(x) \
 	__asm__("movl %0,%%cr0": :"r" (x));
-#define stts() write_cr0(8 | read_cr0())
 
 #endif	/* __KERNEL__ */
 
diff -urN 2.4/include/asm-i386/xor.h build-2.4/include/asm-i386/xor.h
--- 2.4/include/asm-i386/xor.h	Mon Nov 13 04:39:51 2000
+++ build-2.4/include/asm-i386/xor.h	Sun Feb 11 19:37:24 2001
@@ -18,18 +18,16 @@
  * Copyright (C) 1998 Ingo Molnar.
  */
 
+#include <asm/kfpu.h>
+
 #define FPU_SAVE							\
   do {									\
-	if (!(current->flags & PF_USEDFPU))				\
-		__asm__ __volatile__ (" clts;\n");			\
-	__asm__ __volatile__ ("fsave %0; fwait": "=m"(fpu_save[0]));	\
+	kfpu_mmx_begin(&fpu_save);					\
   } while (0)
 
 #define FPU_RESTORE							\
   do {									\
-	__asm__ __volatile__ ("frstor %0": : "m"(fpu_save[0]));		\
-	if (!(current->flags & PF_USEDFPU))				\
-		stts();							\
+	kfpu_mmx_end(&fpu_save);					\
   } while (0)
 
 #define LD(x,y)		"       movq   8*("#x")(%1), %%mm"#y"   ;\n"
@@ -44,7 +42,7 @@
 xor_pII_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
 {
 	unsigned long lines = bytes >> 7;
-	char fpu_save[108];
+	struct kfpubuf_mmx fpu_save;
 
 	FPU_SAVE;
 
@@ -89,7 +87,7 @@
 	      unsigned long *p3)
 {
 	unsigned long lines = bytes >> 7;
-	char fpu_save[108];
+	struct kfpubuf_mmx fpu_save;
 
 	FPU_SAVE;
 
@@ -139,7 +137,7 @@
 	      unsigned long *p3, unsigned long *p4)
 {
 	unsigned long lines = bytes >> 7;
-	char fpu_save[108];
+	struct kfpubuf_mmx fpu_save;
 
 	FPU_SAVE;
 
@@ -194,7 +192,7 @@
 	      unsigned long *p3, unsigned long *p4, unsigned long *p5)
 {
 	unsigned long lines = bytes >> 7;
-	char fpu_save[108];
+	struct kfpubuf_mmx fpu_save;
 
 	FPU_SAVE;
 
@@ -261,7 +259,7 @@
 xor_p5_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
 {
 	unsigned long lines = bytes >> 6;
-	char fpu_save[108];
+	struct kfpubuf_mmx fpu_save;
 
 	FPU_SAVE;
 
@@ -310,7 +308,7 @@
 	     unsigned long *p3)
 {
 	unsigned long lines = bytes >> 6;
-	char fpu_save[108];
+	struct kfpubuf_mmx fpu_save;
 
 	FPU_SAVE;
 
@@ -368,7 +366,7 @@
 	     unsigned long *p3, unsigned long *p4)
 {
 	unsigned long lines = bytes >> 6;
-	char fpu_save[108];
+	struct kfpubuf_mmx fpu_save;
 
 	FPU_SAVE;
 
@@ -435,7 +433,7 @@
 	     unsigned long *p3, unsigned long *p4, unsigned long *p5)
 {
 	unsigned long lines = bytes >> 6;
-	char fpu_save[108];
+	struct kfpubuf_mmx fpu_save;
 
 	FPU_SAVE;
 
@@ -531,28 +529,31 @@
  */
 
 #define XMMS_SAVE				\
-	__asm__ __volatile__ ( 			\
-		"movl %%cr0,%0		;\n\t"	\
-		"clts			;\n\t"	\
-		"movups %%xmm0,(%1)	;\n\t"	\
-		"movups %%xmm1,0x10(%1)	;\n\t"	\
-		"movups %%xmm2,0x20(%1)	;\n\t"	\
-		"movups %%xmm3,0x30(%1)	;\n\t"	\
-		: "=r" (cr0)			\
+	restore = kfpu_nosave_begin();		\
+	if (restore)				\
+		__asm__ __volatile__ ( 		\
+		"movups %%xmm0,(%0)	;\n\t"	\
+		"movups %%xmm1,0x10(%0)	;\n\t"	\
+		"movups %%xmm2,0x20(%0)	;\n\t"	\
+		"movups %%xmm3,0x30(%0)	;\n\t"	\
+		: /* no output */		\
 		: "r" (xmm_save) 		\
 		: "memory")
 
 #define XMMS_RESTORE				\
-	__asm__ __volatile__ ( 			\
-		"sfence			;\n\t"	\
-		"movups (%1),%%xmm0	;\n\t"	\
-		"movups 0x10(%1),%%xmm1	;\n\t"	\
-		"movups 0x20(%1),%%xmm2	;\n\t"	\
-		"movups 0x30(%1),%%xmm3	;\n\t"	\
-		"movl 	%0,%%cr0	;\n\t"	\
-		:				\
-		: "r" (cr0), "r" (xmm_save)	\
-		: "memory")
+	__asm__ __volatile__ (			\
+		"sfence\n\t"			\
+		: : : "memory");		\
+	if (restore)				\
+		__asm__ __volatile__ ( 		\
+		"movups (%0),%%xmm0	;\n\t"	\
+		"movups 0x10(%0),%%xmm1	;\n\t"	\
+		"movups 0x20(%0),%%xmm2	;\n\t"	\
+		"movups 0x30(%0),%%xmm3	;\n\t"	\
+		: /* no output */		\
+		: "r" (xmm_save)		\
+		: "memory");			\
+	kfpu_nosave_end()
 
 #define OFFS(x)		"16*("#x")"
 #define	PF0(x)		"	prefetcht0  "OFFS(x)"(%1)   ;\n"
@@ -575,7 +576,7 @@
 {
         unsigned long lines = bytes >> 8;
 	char xmm_save[16*4];
-	int cr0;
+	int restore;
 
 	XMMS_SAVE;
 
@@ -629,7 +630,7 @@
 {
         unsigned long lines = bytes >> 8;
 	char xmm_save[16*4];
-	int cr0;
+	int restore;
 
 	XMMS_SAVE;
 
@@ -690,7 +691,7 @@
 {
         unsigned long lines = bytes >> 8;
 	char xmm_save[16*4];
-	int cr0;
+	int restore;
 
 	XMMS_SAVE;
 
@@ -758,7 +759,7 @@
 {
         unsigned long lines = bytes >> 8;
 	char xmm_save[16*4];
-	int cr0;
+	int restore;
 
 	XMMS_SAVE;
 
diff -urN --exclude .depend 2.4/arch/i386/lib/Makefile build-2.4/arch/i386/lib/Makefile
--- 2.4/arch/i386/lib/Makefile	Sun Feb 11 00:37:51 2001
+++ build-2.4/arch/i386/lib/Makefile	Sun Feb 11 12:59:43 2001
@@ -12,6 +12,7 @@
 	memcpy.o
 
 obj-$(CONFIG_X86_USE_3DNOW) += mmx.o
+obj-$(CONFIG_X86_USE_SSE) += sse.o
 obj-$(CONFIG_HAVE_DEC_LOCK) += dec_and_lock.o
 
 include $(TOPDIR)/Rules.make
diff -urN --exclude .depend 2.4/arch/i386/lib/mmx.c build-2.4/arch/i386/lib/mmx.c
--- 2.4/arch/i386/lib/mmx.c	Sat Feb  3 14:02:24 2001
+++ build-2.4/arch/i386/lib/mmx.c	Sun Feb 11 19:28:12 2001
@@ -28,7 +28,11 @@
 	void *p=to;
 	int i= len >> 6;	/* len/64 */
 
-	kernel_fpu_begin();
+	if(!kfpu_try_begin()) {	/* fpu busy: fall back to the plain copy */
+		/* FIXME: this belongs into string.h */
+		__memcpy(to, from, len);
+		return p;	/* p == to; every exit of this function returns the destination */
+	}
 
 	__asm__ __volatile__ (
 		"1: prefetch (%0)\n"		/* This set is 28 bytes */
@@ -84,7 +88,7 @@
 	 *	Now do the tail of the block
 	 */
 	__memcpy(to, from, len&63);
-	kernel_fpu_end();
+	kfpu_try_end();
 	return p;
 }
 
@@ -92,8 +96,6 @@
 {
 	int i;
 
-	kernel_fpu_begin();
-	
 	__asm__ __volatile__ (
 		"  pxor %%mm0, %%mm0\n" : :
 	);
@@ -118,18 +120,12 @@
 	__asm__ __volatile__ (
 		"  sfence \n" : :
 	);
-	kernel_fpu_end();
 }
 
 static void fast_copy_page(void *to, void *from)
 {
 	int i;
 
-	kernel_fpu_begin();
-
-	/* maybe the prefetch stuff can go before the expensive fnsave...
-	 * but that is for later. -AV
-	 */
 	__asm__ __volatile__ (
 		"1: prefetch (%0)\n"
 		"   prefetch 64(%0)\n"
@@ -185,7 +181,6 @@
 	__asm__ __volatile__ (
 		"  sfence \n" : :
 	);
-	kernel_fpu_end();
 }
 
 /*
@@ -205,10 +200,12 @@
  
 void mmx_clear_page(void * page)
 {
-	if(in_interrupt())
-		slow_zero_page(page);
-	else
+	if (kfpu_try_begin()) {
 		fast_clear_page(page);
+		kfpu_try_end();
+	} else {
+		slow_zero_page(page);
+	}
 }
 
 static void slow_copy_page(void *to, void *from)
@@ -225,8 +222,10 @@
 
 void mmx_copy_page(void *to, void *from)
 {
-	if(in_interrupt())
-		slow_copy_page(to, from);
-	else
+	if (kfpu_try_begin()) {
 		fast_copy_page(to, from);
+		kfpu_try_end();
+	} else {
+		slow_copy_page(to, from);
+	}
 }
diff -urN --exclude .depend 2.4/arch/i386/lib/sse.c build-2.4/arch/i386/lib/sse.c
--- 2.4/arch/i386/lib/sse.c	Thu Jan  1 01:00:00 1970
+++ build-2.4/arch/i386/lib/sse.c	Sun Feb 11 18:02:30 2001
@@ -0,0 +1,107 @@
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/sched.h>
+
+#include <asm/i387.h>
+
+/*
+ *	SSE library helper functions
+ *
+ *	Copyright (C) 2001 Manfred Spraul
+ *
+ *	Based on Intel sample code from
+ *	 Block Copy Using Pentium(R) III Streaming SIMD Extensions
+ *		Revision 1.9
+ *		January 12, 1999	
+ *
+ */
+ 
+
+void sse_clear_page(void * page)	/* zero 4096 bytes at page using xmm0 stores */
+{
+	int storage[4];		/* 16 bytes: room for one xmm register */
+	int restore;		/* !0: we must preserve the xmm register we clobber */
+	int d0;			/* loop counter used by the asm below */
+	restore = kfpu_nosave_begin();
+	if (restore) {
+		__asm__ __volatile__(
+		"movups %%xmm0, (%0)\n\t"	/* stash the current xmm0 contents */
+		: /* no output */
+		: "r" (&storage[0])
+		: "memory" );
+	}
+	__asm__ __volatile__(
+		"xorps %%xmm0, %%xmm0\n\t"	/* xmm0 = 0 */
+		"xor %0, %0\n\t"		/* counter = 0 */
+		"1: movntps %%xmm0, (%1)\n\t"	/* store 2 x 16 zero bytes per iteration */
+		"movntps %%xmm0, 16(%1)\n\t"
+		"add $32, %1\n\t"
+		"inc %0\n\t"
+		"cmp $128, %0\n\t"		/* 128 iterations * 32 bytes = 4096 bytes */
+		"jne 1b\n\t"
+		"sfence\n\t"
+		: "=&r" (d0), "=&r" (page)
+		: "1" (page)
+		: "cc", "memory");
+	if (restore) {
+		__asm__ __volatile__(
+		"movups (%0), %%xmm0\n\t"	/* put the stashed xmm0 back */
+		: /* no output */
+		: "r" (&storage[0])
+		: "memory" );
+	}
+	kfpu_nosave_end();
+}
+
+void sse_copy_page(void *to, void *from)
+{
+	int storage[16];
+	int restore;
+	int d0;
+	restore = kfpu_nosave_begin();
+	if (restore) {
+		__asm__ __volatile__(
+		"movups %%xmm0, (%0)\n\t"
+		"movups %%xmm1, 16(%0)\n\t"
+		"movups %%xmm2, 32(%0)\n\t"
+		"movups %%xmm3, 48(%0)\n\t"
+		: /* no output */
+		: "r" (&storage[0])
+		: "memory" );
+	}
+	__asm__ __volatile__(
+		"mov (%2), %0\n\t"		/* step 1: load the TLB */
+		"xor %0, %0\n\t"		/* step 2: prefetch the page */
+		"1:prefetchnta (%2, %0)\n\t"
+		"prefetchnta 32(%2, %0)\n\t"
+		"add $64,%0\n\t"
+		"cmp $4096, %0\n\t"
+		"jne 1b\n\t"
+		"2: movaps (%2), %%xmm0\n\t"	/* step 3: copy the page */
+		"movaps 16(%2), %%xmm1\n\t"
+		"movaps 32(%2), %%xmm2\n\t"
+		"movaps 48(%2), %%xmm3\n\t"
+		"add $64, %2\n\t"
+		"movntps %%xmm0, (%1)\n\t"
+		"movntps %%xmm1, 16(%1)\n\t"
+		"movntps %%xmm2, 32(%1)\n\t"
+		"movntps %%xmm3, 48(%1)\n\t"
+		"add $64, %1\n\t"
+		"sub $64, %0\n\t"
+		"jnz 2b\n\t"
+		"sfence\n\t"
+		: "=&r" (d0), "=&r" (to), "=&r" (from)
+		: "1" (to), "2" (from)
+		: "cc", "memory");
+	if (restore) {
+		__asm__ __volatile__(
+			"movups (%0), %%xmm0\n\t"
+			"movups 16(%0), %%xmm1\n\t"
+			"movups 32(%0), %%xmm2\n\t"
+			"movups 48(%0), %%xmm3\n\t"
+			: /* no output */
+			: "r" (&storage[0])
+			: "memory" );
+	}
+	kfpu_nosave_end();
+}
--- 2.4/arch/i386/kernel/i387.c	Sun Feb 11 00:37:51 2001
+++ build-2.4/arch/i386/kernel/i387.c	Sun Feb 11 19:35:49 2001
@@ -24,6 +24,8 @@
 #define HAVE_HWFP 1
 #endif
 
+static volatile int cpu_fpuactive[NR_CPUS];
+
 /*
  * The _current_ task is using the FPU for the first time
  * so initialize it and set the mxcsr to its default
@@ -39,10 +41,19 @@
 	current->used_math = 1;
 }
 
+inline void stts(void)
+{
+	write_cr0(8 | read_cr0());	/* set the TS bit (8) in %cr0; do not rely on a ';' inside the macro */
+}
+
+char *sse_aligned(char *buffer)
+{
+	return (char *)(((unsigned long)buffer + 15UL) & ~15UL);	/* round up to the next 16-byte boundary */
+}
+
 /*
  * FPU lazy state save handling.
  */
-
 static inline void __save_init_fpu( struct task_struct *tsk )
 {
 	if ( cpu_has_fxsr ) {
@@ -58,18 +69,191 @@
 void save_init_fpu( struct task_struct *tsk )
 {
 	__save_init_fpu(tsk);
-	stts();
+	kfpu_leave();
+}
+
+void kfpu_start(void)
+{
+	if ( cpu_has_fpu ) {
+		cpu_fpuactive[smp_processor_id()]++;
+		__asm__ __volatile__("clts");	/* Allow maths ops (or we recurse) */
+		init_fpu();
+		current->flags |= PF_USEDFPU;	/* So we fnsave on switch_to() */
+		current->used_math = 1;
+	}
 }
 
-void kernel_fpu_begin(void)
+void kfpu_acquire(struct kfpubuf_acquire *buf)
 {
-	struct task_struct *tsk = current;
+	if (!current->used_math) {
+		buf->saved = 0;
+		kfpu_try_acquire();
+		return;
+	}
+	buf->saved = 1;
+	unlazy_fpu(current);
+	if ( cpu_has_fxsr ) {
+		memcpy(buf->buffer, &current->thread.i387.fxsave,
+			sizeof(current->thread.i387.fxsave));
+	} else {
+		memcpy(buf->buffer, &current->thread.i387.fsave,
+			sizeof(current->thread.i387.fsave));
+	}
+	
+	if (cpu_fpuactive[smp_processor_id()])
+		BUG();
+	cpu_fpuactive[smp_processor_id()]++;
+	__asm__ __volatile__("clts");
+	init_fpu();
+	current->flags |= PF_USEDFPU;
+}
 
-	if (tsk->flags & PF_USEDFPU) {
-		__save_init_fpu(tsk);
+void kfpu_release(struct kfpubuf_acquire *buf)
+{
+	if (!buf->saved) {
+		kfpu_try_release();
 		return;
 	}
-	clts();
+	clear_fpu(current);
+	if ( cpu_has_fxsr ) {
+		memcpy(&current->thread.i387.fxsave, buf->buffer,
+			sizeof(current->thread.i387.fxsave));
+	} else {
+		memcpy(&current->thread.i387.fsave, buf->buffer,
+			sizeof(current->thread.i387.fsave));
+	}
+	if (cpu_fpuactive[smp_processor_id()])
+		BUG();
+	if (current->flags & PF_USEDFPU)
+		BUG();
+}
+
+/* returns 1 if it got fpu access, 0 otherwise */
+int kfpu_try_acquire(void)
+{
+	if (current->used_math)
+		return 0;
+	current->used_math = 1;
+	cpu_fpuactive[smp_processor_id()]++;
+	__asm__ __volatile__("clts");
+	init_fpu();
+	current->flags |= PF_USEDFPU;
+	return 1;
+}
+
+void kfpu_try_release(void)
+{
+	clear_fpu(current);
+	current->used_math = 0;
+	if (cpu_fpuactive[smp_processor_id()])
+		BUG();
+	if (current->flags & PF_USEDFPU)
+		BUG();
+}
+
+void kfpu_full_begin(struct kfpubuf_full *buf)
+{
+	cpu_fpuactive[smp_processor_id()]++;	/* one more fpu user on this cpu */
+	__asm__ __volatile__("clts");	/* allow fpu instructions */
+	if(cpu_fpuactive[smp_processor_id()] > 1) {	/* fpu was already in use: preserve that state */
+		char *buffer = sse_aligned((char *)buf->buffer);	/* round up to 16 bytes; may consume up to 15 bytes of slack in buf->buffer */
+		asm volatile( "fxsave %0 ; fnclex"
+			      : "=m" (*buffer) : : "memory" );	/* operand is the save area, not the pointer variable */
+	}
+	__asm__("fninit");
+	load_mxcsr(0x1f80);
+}
+
+void kfpu_mmx_begin(struct kfpubuf_mmx *buf)
+{
+	cpu_fpuactive[smp_processor_id()]++;	/* one more fpu user on this cpu */
+	__asm__ __volatile__("clts");	/* allow fpu instructions */
+	if(cpu_fpuactive[smp_processor_id()] > 1) {	/* fpu was already in use: save that state into buf */
+		asm volatile( "fnsave %0 ; fwait"
+			      : "=m" (buf->buffer) );
+	}
+	__asm__("fninit");	/* executed on both paths */
+}
+
+/* ret val 0: caller doesn't need to save clobbered regs */
+int kfpu_nosave_begin(void)
+{
+	cpu_fpuactive[smp_processor_id()]++;
+	__asm__ __volatile__("clts");
+	return cpu_fpuactive[smp_processor_id()]-1;
+}
+
+void kfpu_full_end(struct kfpubuf_full *buf)
+{
+	if(cpu_fpuactive[smp_processor_id()] > 1) {	/* nested use: reload the state saved in buf */
+		char *buffer = sse_aligned((char *)buf->buffer);	/* 16-byte aligned start of the save area */
+		asm volatile( "fxrstor %0"
+			      : : "m" (*buffer) : "memory" );	/* fxrstor reads memory: input operand */
+		cpu_fpuactive[smp_processor_id()]--;
+	} else {
+		cpu_fpuactive[smp_processor_id()]--;	/* last user: nothing to reload */
+		stts();
+	}
+}
+
+void kfpu_mmx_end(struct kfpubuf_mmx *buf)
+{
+	if(cpu_fpuactive[smp_processor_id()] > 1) {	/* nested use: reload the state saved in buf */
+		asm volatile( "frstor %0"
+			      : : "m" (buf->buffer) );	/* frstor reads the buffer: input operand, not output */
+		cpu_fpuactive[smp_processor_id()]--;
+	} else {
+		cpu_fpuactive[smp_processor_id()]--;	/* last user: nothing to reload */
+		stts();
+	}
+}
+
+void kfpu_nosave_end(void)
+{
+	cpu_fpuactive[smp_processor_id()]--;
+	if(cpu_fpuactive[smp_processor_id()] == 0)
+		stts();
+}
+
+/* returns 1 if it got fpu access, 0 if the fpu is already in use on this cpu */
+int kfpu_try_begin(void)
+{
+	if (cpu_fpuactive[smp_processor_id()] > 0)	/* busy: refuse instead of saving state */
+		return 0;
+	cpu_fpuactive[smp_processor_id()]++;
+	__asm__ __volatile__("clts");	/* allow fpu instructions */
+	__asm__("fninit");
+	if ( cpu_has_xmm )	/* mxcsr is only loaded when the cpu has xmm support */
+		load_mxcsr(0x1f80);
+	return 1;
+}
+
+void kfpu_try_end(void)
+{
+	cpu_fpuactive[smp_processor_id()]--;
+	stts();
+}
+
+void kfpu_initialize(void)
+{
+	cpu_fpuactive[smp_processor_id()] = 0;
+	stts();
+}
+
+void kfpu_enter(void)
+{
+	if(cpu_fpuactive[smp_processor_id()] != 0)
+		BUG();
+	cpu_fpuactive[smp_processor_id()]++;
+	__asm__ __volatile__("clts");
+}
+
+void kfpu_leave(void)
+{
+	cpu_fpuactive[smp_processor_id()]--;
+	stts();
+	if(cpu_fpuactive[smp_processor_id()] != 0)
+		BUG();
 }
 
 void restore_fpu( struct task_struct *tsk )
--- 2.4/arch/i386/kernel/traps.c	Sun Feb 11 00:37:51 2001
+++ build-2.4/arch/i386/kernel/traps.c	Sun Feb 11 19:18:59 2001
@@ -731,7 +731,7 @@
  */
 asmlinkage void math_state_restore(struct pt_regs regs)
 {
-	__asm__ __volatile__("clts");		/* Allow maths ops (or we recurse) */
+	kfpu_enter();		/* Allow maths ops (or we recurse) */
 
 	if (current->used_math) {
 		restore_fpu(current);
--- 2.4/arch/i386/kernel/setup.c	Sun Feb 11 00:37:51 2001
+++ build-2.4/arch/i386/kernel/setup.c	Sun Feb 11 14:33:58 2001
@@ -2274,7 +2274,8 @@
 	 */
 	current->flags &= ~PF_USEDFPU;
 	current->used_math = 0;
-	stts();
+	kfpu_initialize();
+
 }
 
 /*
--- 2.4/arch/i386/kernel/i386_ksyms.c	Sun Feb 11 00:37:51 2001
+++ build-2.4/arch/i386/kernel/i386_ksyms.c	Sun Feb 11 19:25:05 2001
@@ -117,6 +117,25 @@
 EXPORT_SYMBOL(mmx_copy_page);
 #endif
 
+#ifdef CONFIG_X86_USE_SSE
+EXPORT_SYMBOL(sse_clear_page);
+EXPORT_SYMBOL(sse_copy_page);
+#endif
+
+EXPORT_SYMBOL(kfpu_start);
+EXPORT_SYMBOL(kfpu_acquire);
+EXPORT_SYMBOL(kfpu_release);
+EXPORT_SYMBOL(kfpu_try_acquire);
+EXPORT_SYMBOL(kfpu_try_release);
+EXPORT_SYMBOL(kfpu_full_begin);
+EXPORT_SYMBOL(kfpu_mmx_begin);
+EXPORT_SYMBOL(kfpu_nosave_begin);
+EXPORT_SYMBOL(kfpu_full_end);
+EXPORT_SYMBOL(kfpu_mmx_end);
+EXPORT_SYMBOL(kfpu_nosave_end);
+EXPORT_SYMBOL(kfpu_try_begin);
+EXPORT_SYMBOL(kfpu_try_end);
+
 #ifdef CONFIG_SMP
 EXPORT_SYMBOL(cpu_data);
 EXPORT_SYMBOL(kernel_flag);
--- 2.4/arch/i386/config.in	Sun Feb 11 00:37:50 2001
+++ build-2.4/arch/i386/config.in	Sun Feb 11 19:18:59 2001
@@ -91,6 +91,7 @@
    define_bool CONFIG_X86_GOOD_APIC y
    define_bool CONFIG_X86_PGE y
    define_bool CONFIG_X86_USE_PPRO_CHECKSUM y
+   define_bool CONFIG_X86_USE_SSE y
 fi
 if [ "$CONFIG_MPENTIUM4" = "y" ]; then
    define_int  CONFIG_X86_L1_CACHE_SHIFT 7
@@ -98,6 +99,7 @@
    define_bool CONFIG_X86_GOOD_APIC y
    define_bool CONFIG_X86_PGE y
    define_bool CONFIG_X86_USE_PPRO_CHECKSUM y
+   define_bool CONFIG_X86_USE_SSE y
 fi
 if [ "$CONFIG_MK6" = "y" ]; then
    define_int  CONFIG_X86_L1_CACHE_SHIFT 5


^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2001-02-11 20:38 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2001-02-11 19:03 [RFC] framework for fpu usage in kernel Manfred Spraul
2001-02-11 20:26 ` Alan Cox
2001-02-11 20:30   ` Manfred Spraul
2001-02-11 20:37     ` Alan Cox

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.