public inbox for linux-kernel@vger.kernel.org
* [PATCH] 2/2 2.5.45 cleanup & add original copy_to/from_user
@ 2002-11-02  8:06 Akira Tsukamoto
  2002-11-02 10:32 ` Andrew Morton
  0 siblings, 1 reply; 17+ messages in thread
From: Akira Tsukamoto @ 2002-11-02  8:06 UTC (permalink / raw)
  To: linux-kernel; +Cc: Hirokazu Takahashi, Andrew Morton


This consists mainly of the optimized copy routines for PIII/P4.

They are basically identical to what was introduced in 2.5.45;
however, I tweaked them a bit: I removed the access to 'movsl_mask'
on every call, since the mask is unlikely to change at run time.
If anybody finds a better copy routine, there is no need to touch
uaccess.h; just edit this file.
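
For reference, a minimal sketch of the difference (the movsl_mask form
only approximates the stock 2.5.45 movsl_is_ok() in usercopy.c, it is
not quoted verbatim; the constant form is what the patch below uses):

	/* Sketch, assuming the 2.5.45 movsl_mask global. */
	extern struct movsl_mask { int mask; } movsl_mask;

	static inline int movsl_is_ok(const void *a1, const void *a2,
				      unsigned long n)
	{
		/* 2.5.45 reads movsl_mask.mask here on every call;
		 * the patch below tests the constant MOVSL_MASK instead. */
		if (n >= 64 && (((long)a1 ^ (long)a2) & movsl_mask.mask))
			return 0;
		return 1;
	}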

Akira Tsukamoto

diff -Nur -X dontdiff linux-2.5.45/include/asm-i386/uaccess-intel.h linux-2.5.45-aki/include/asm-i386/uaccess-intel.h
--- linux-2.5.45/include/asm-i386/uaccess-intel.h	Wed Dec 31 19:00:00 1969
+++ linux-2.5.45-aki/include/asm-i386/uaccess-intel.h	Sat Nov  2 03:00:45 2002
@@ -0,0 +1,294 @@
+#ifndef __i386_UACCESS_INTEL_H
+#define __i386_UACCESS_INTEL_H
+/*
+ * PentiumIII/Pentium4  Copy To/From Userspace, taka version.
+ *
+ * Split into CPU specific files by Akira Tsukamoto to keep #ifdef noise down.
+ */
+
+#define MOVSL_MASK 7
+static inline int 
+is_rep_movsl_faster(const void *a1, const void *a2, unsigned long n)
+{
+	if (n >= 64 && (((long)a1 ^ (long)a2) & MOVSL_MASK))
+		return 0;
+	return 1;
+}
+
+/* Using rep; movsl. */
+#define __copy_user_rep_movsl(to,from,size)				\
+do {									\
+	int __d0, __d1, __d2;						\
+	__asm__ __volatile__(						\
+		"	cmp  $7,%0\n"					\
+		"	jbe  1f\n"					\
+		"	movl %1,%0\n"					\
+		"	negl %0\n"					\
+		"	andl $7,%0\n"					\
+		"	subl %0,%3\n"					\
+		"4:	rep; movsb\n"					\
+		"	movl %3,%0\n"					\
+		"	shrl $2,%0\n"					\
+		"	andl $3,%3\n"					\
+		"	.align 2,0x90\n"				\
+		"0:	rep; movsl\n"					\
+		"	movl %3,%0\n"					\
+		"1:	rep; movsb\n"					\
+		"2:\n"							\
+		".section .fixup,\"ax\"\n"				\
+		"5:	addl %3,%0\n"					\
+		"	jmp 2b\n"					\
+		"3:	lea 0(%3,%0,4),%0\n"				\
+		"	jmp 2b\n"					\
+		".previous\n"						\
+		".section __ex_table,\"a\"\n"				\
+		"	.align 4\n"					\
+		"	.long 4b,5b\n"					\
+		"	.long 0b,3b\n"					\
+		"	.long 1b,2b\n"					\
+		".previous"						\
+		: "=&c"(size), "=&D" (__d0), "=&S" (__d1), "=r"(__d2)	\
+		: "3"(size), "0"(size), "1"(to), "2"(from)		\
+		: "memory");						\
+} while (0)
+
+#define __copy_user_zeroing_rep_movsl(to,from,size)			\
+do {									\
+	int __d0, __d1, __d2;						\
+	__asm__ __volatile__(						\
+		"	cmp  $7,%0\n"					\
+		"	jbe  1f\n"					\
+		"	movl %1,%0\n"					\
+		"	negl %0\n"					\
+		"	andl $7,%0\n"					\
+		"	subl %0,%3\n"					\
+		"4:	rep; movsb\n"					\
+		"	movl %3,%0\n"					\
+		"	shrl $2,%0\n"					\
+		"	andl $3,%3\n"					\
+		"	.align 2,0x90\n"				\
+		"0:	rep; movsl\n"					\
+		"	movl %3,%0\n"					\
+		"1:	rep; movsb\n"					\
+		"2:\n"							\
+		".section .fixup,\"ax\"\n"				\
+		"5:	addl %3,%0\n"					\
+		"	jmp 6f\n"					\
+		"3:	lea 0(%3,%0,4),%0\n"				\
+		"6:	pushl %0\n"					\
+		"	pushl %%eax\n"					\
+		"	xorl %%eax,%%eax\n"				\
+		"	rep; stosb\n"					\
+		"	popl %%eax\n"					\
+		"	popl %0\n"					\
+		"	jmp 2b\n"					\
+		".previous\n"						\
+		".section __ex_table,\"a\"\n"				\
+		"	.align 4\n"					\
+		"	.long 4b,5b\n"					\
+		"	.long 0b,3b\n"					\
+		"	.long 1b,6b\n"					\
+		".previous"						\
+		: "=&c"(size), "=&D" (__d0), "=&S" (__d1), "=r"(__d2)	\
+		: "3"(size), "0"(size), "1"(to), "2"(from)		\
+		: "memory");						\
+} while (0)
+
+
+/* Using bulk movl. */
+#define __copy_user_movl(to,from,size)					\
+do {									\
+	int d0, d1;							\
+	__asm__ __volatile__(						\
+		       "       .align 2,0x90\n" 			\
+		       "0:     movl 32(%4), %%eax\n"			\
+		       "       cmpl $67, %0\n"     			\
+		       "       jbe 1f\n"            			\
+		       "       movl 64(%4), %%eax\n"			\
+		       "       .align 2,0x90\n"     			\
+		       "1:     movl 0(%4), %%eax\n" 			\
+		       "       movl 4(%4), %%edx\n" 			\
+		       "2:     movl %%eax, 0(%3)\n" 			\
+		       "21:    movl %%edx, 4(%3)\n" 			\
+		       "       movl 8(%4), %%eax\n" 			\
+		       "       movl 12(%4),%%edx\n" 			\
+		       "3:     movl %%eax, 8(%3)\n" 			\
+		       "31:    movl %%edx, 12(%3)\n"			\
+		       "       movl 16(%4), %%eax\n"			\
+		       "       movl 20(%4), %%edx\n"			\
+		       "4:     movl %%eax, 16(%3)\n"			\
+		       "41:    movl %%edx, 20(%3)\n"			\
+		       "       movl 24(%4), %%eax\n"			\
+		       "       movl 28(%4), %%edx\n"			\
+		       "10:    movl %%eax, 24(%3)\n"			\
+		       "51:    movl %%edx, 28(%3)\n"			\
+		       "       movl 32(%4), %%eax\n"			\
+		       "       movl 36(%4), %%edx\n"			\
+		       "11:    movl %%eax, 32(%3)\n"			\
+		       "61:    movl %%edx, 36(%3)\n"			\
+		       "       movl 40(%4), %%eax\n"			\
+		       "       movl 44(%4), %%edx\n"			\
+		       "12:    movl %%eax, 40(%3)\n"			\
+		       "71:    movl %%edx, 44(%3)\n"			\
+		       "       movl 48(%4), %%eax\n"			\
+		       "       movl 52(%4), %%edx\n"			\
+		       "13:    movl %%eax, 48(%3)\n"			\
+		       "81:    movl %%edx, 52(%3)\n"			\
+		       "       movl 56(%4), %%eax\n"			\
+		       "       movl 60(%4), %%edx\n"			\
+		       "14:    movl %%eax, 56(%3)\n"			\
+		       "91:    movl %%edx, 60(%3)\n"			\
+		       "       addl $-64, %0\n"     			\
+		       "       addl $64, %4\n"      			\
+		       "       addl $64, %3\n"      			\
+		       "       cmpl $63, %0\n"      			\
+		       "       ja  0b\n"            			\
+		       "5:     movl  %0, %%eax\n"   			\
+		       "       shrl  $2, %0\n"      			\
+		       "       andl  $3, %%eax\n"   			\
+		       "       cld\n"               			\
+		       "6:     rep; movsl\n"        			\
+		       "       movl %%eax, %0\n"    			\
+		       "7:     rep; movsb\n"				\
+		       "8:\n"						\
+		       ".section .fixup,\"ax\"\n"			\
+		       "9:     lea 0(%%eax,%0,4),%0\n"			\
+		       "       jmp 8b\n"				\
+		       ".previous\n"					\
+		       ".section __ex_table,\"a\"\n"			\
+		       "       .align 4\n"				\
+		       "       .long 2b,8b\n"				\
+		       "       .long 21b,8b\n"				\
+		       "       .long 3b,8b\n"				\
+		       "       .long 31b,8b\n"				\
+		       "       .long 4b,8b\n"				\
+		       "       .long 41b,8b\n"				\
+		       "       .long 10b,8b\n"				\
+		       "       .long 51b,8b\n"				\
+		       "       .long 11b,8b\n"				\
+		       "       .long 61b,8b\n"				\
+		       "       .long 12b,8b\n"				\
+		       "       .long 71b,8b\n"				\
+		       "       .long 13b,8b\n"				\
+		       "       .long 81b,8b\n"				\
+		       "       .long 14b,8b\n"				\
+		       "       .long 91b,8b\n"				\
+		       "       .long 6b,9b\n"				\
+		       "       .long 7b,8b\n"				\
+		       ".previous"					\
+		       : "=&c"(size), "=&D" (d0), "=&S" (d1)		\
+		       :  "1"(to), "2"(from), "0"(size)			\
+		       : "eax", "edx", "memory");			\
+} while (0)
+
+#define __copy_user_zeroing_movl(to,from,size)				\
+do {									\
+	int d0, d1;							\
+	__asm__ __volatile__(						\
+		       "        .align 2,0x90\n"			\
+		       "0:      movl 32(%4), %%eax\n"			\
+		       "        cmpl $67, %0\n"      			\
+		       "        jbe 2f\n"            			\
+		       "1:      movl 64(%4), %%eax\n"			\
+		       "        .align 2,0x90\n"     			\
+		       "2:      movl 0(%4), %%eax\n" 			\
+		       "21:     movl 4(%4), %%edx\n" 			\
+		       "        movl %%eax, 0(%3)\n" 			\
+		       "        movl %%edx, 4(%3)\n" 			\
+		       "3:      movl 8(%4), %%eax\n" 			\
+		       "31:     movl 12(%4),%%edx\n" 			\
+		       "        movl %%eax, 8(%3)\n" 			\
+		       "        movl %%edx, 12(%3)\n"			\
+		       "4:      movl 16(%4), %%eax\n"			\
+		       "41:     movl 20(%4), %%edx\n"			\
+		       "        movl %%eax, 16(%3)\n"			\
+		       "        movl %%edx, 20(%3)\n"			\
+		       "10:     movl 24(%4), %%eax\n"			\
+		       "51:     movl 28(%4), %%edx\n"			\
+		       "        movl %%eax, 24(%3)\n"			\
+		       "        movl %%edx, 28(%3)\n"			\
+		       "11:     movl 32(%4), %%eax\n"			\
+		       "61:     movl 36(%4), %%edx\n"			\
+		       "        movl %%eax, 32(%3)\n"			\
+		       "        movl %%edx, 36(%3)\n"			\
+		       "12:     movl 40(%4), %%eax\n"			\
+		       "71:     movl 44(%4), %%edx\n"			\
+		       "        movl %%eax, 40(%3)\n"			\
+		       "        movl %%edx, 44(%3)\n"			\
+		       "13:     movl 48(%4), %%eax\n"			\
+		       "81:     movl 52(%4), %%edx\n"			\
+		       "        movl %%eax, 48(%3)\n"			\
+		       "        movl %%edx, 52(%3)\n"			\
+		       "14:     movl 56(%4), %%eax\n"			\
+		       "91:     movl 60(%4), %%edx\n"			\
+		       "        movl %%eax, 56(%3)\n"			\
+		       "        movl %%edx, 60(%3)\n"			\
+		       "        addl $-64, %0\n"     			\
+		       "        addl $64, %4\n"      			\
+		       "        addl $64, %3\n"      			\
+		       "        cmpl $63, %0\n"      			\
+		       "        ja  0b\n"            			\
+		       "5:      movl  %0, %%eax\n"   			\
+		       "        shrl  $2, %0\n"      			\
+		       "        andl $3, %%eax\n"    			\
+		       "        cld\n"               			\
+		       "6:      rep; movsl\n"   			\
+		       "        movl %%eax,%0\n"			\
+		       "7:      rep; movsb\n"				\
+		       "8:\n"						\
+		       ".section .fixup,\"ax\"\n"			\
+		       "9:      lea 0(%%eax,%0,4),%0\n"			\
+		       "16:     pushl %0\n"				\
+		       "        pushl %%eax\n"				\
+		       "        xorl %%eax,%%eax\n"			\
+		       "        rep; stosb\n"				\
+		       "        popl %%eax\n"				\
+		       "        popl %0\n"				\
+		       "        jmp 8b\n"				\
+		       ".previous\n"					\
+		       ".section __ex_table,\"a\"\n"			\
+		       "	.align 4\n"	   			\
+		       "	.long 0b,16b\n"	 			\
+		       "	.long 1b,16b\n"				\
+		       "	.long 2b,16b\n"				\
+		       "	.long 21b,16b\n"			\
+		       "	.long 3b,16b\n"				\
+		       "	.long 31b,16b\n"			\
+		       "	.long 4b,16b\n"				\
+		       "	.long 41b,16b\n"			\
+		       "	.long 10b,16b\n"			\
+		       "	.long 51b,16b\n"			\
+		       "	.long 11b,16b\n"			\
+		       "	.long 61b,16b\n"			\
+		       "	.long 12b,16b\n"			\
+		       "	.long 71b,16b\n"			\
+		       "	.long 13b,16b\n"			\
+		       "	.long 81b,16b\n"			\
+		       "	.long 14b,16b\n"			\
+		       "	.long 91b,16b\n"			\
+		       "	.long 6b,9b\n"				\
+		       "        .long 7b,16b\n" 			\
+		       ".previous"					\
+		       : "=&c"(size), "=&D" (d0), "=&S" (d1)		\
+		       :  "1"(to), "2"(from), "0"(size)			\
+		       : "eax", "edx", "memory");			\
+} while (0)
+
+/* These two will go inside copy_to/from_user() in usercopy.c */
+#define __do_copy_user(to,from,n)					\
+do {									\
+	if (is_rep_movsl_faster(to, from, n))				\
+		__copy_user_rep_movsl(to, from, n);			\
+	else								\
+		__copy_user_movl(to, from, n);				\
+} while (0)
+
+#define __do_copy_user_zeroing(to,from,n)				\
+do {									\
+	if (is_rep_movsl_faster(to, from, n))				\
+		__copy_user_zeroing_rep_movsl(to,from,n);		\
+	else								\
+		__copy_user_zeroing_movl(to, from, n);			\
+} while (0)
+
+#endif /* __i386_UACCESS_INTEL_H */
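
As a usage sketch (not part of the patch): the two macros above are
meant to be called from copy_to_user()/copy_from_user() in
arch/i386/lib/usercopy.c. Assuming the 2.5-era interface, roughly:

	unsigned long copy_to_user(void *to, const void *from, unsigned long n)
	{
		if (access_ok(VERIFY_WRITE, to, n))
			__do_copy_user(to, from, n);	/* updates n in place */
		return n;	/* bytes that could not be copied */
	}

	unsigned long copy_from_user(void *to, const void *from, unsigned long n)
	{
		if (access_ok(VERIFY_READ, from, n))
			__do_copy_user_zeroing(to, from, n);
		else
			memset(to, 0, n);	/* keep the destination defined */
		return n;
	}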




* Re: [PATCH] 2/2 2.5.45 cleanup & add original copy_to/from_user
@ 2002-11-04  3:36 Akira Tsukamoto
  0 siblings, 0 replies; 17+ messages in thread
From: Akira Tsukamoto @ 2002-11-04  3:36 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, Hirokazu Takahashi

On Sat, 02 Nov 2002 20:04:32 -0800
Andrew Morton <akpm@digeo.com> mentioned:
> > From my patch, about the speed:
> > for PIII/4 CPU -> no change. using the same 2.5.45 copy.
> > for old i386 -> more optimal.
> > for Athlon -> 2.5.45 does not use unrolled copy for it either.
> 
> OK.  Please integrate your patch into the current kernel's usercopy.c.

I will make a revised patch, removing the inlines and putting them inside usercopy.c.

> The thing which requires some thought is "should the decision
> be made at compile time or runtime".  For Athlon vs Intel
> and i386 vs others, it should be performed at compile time.

I ran faster_intel_copy on my Athlon and it works OK and is much, much faster,
so how about grouping by CPU type (a compile-time sketch follows the list):

generic i386/i486
  use the original REP MOVSL copy
generic i586
  keep as in the current 2.5.45:
  use the revised REP MOVSL copy
generic i686
  use the revised REP MOVSL copy and the unrolled MOVL
if SSE or 3DNOW copies come out, select them by MPENTIUMIII/4/K7
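
A compile-time version of that grouping could look roughly like this
(sketch only; the config symbols are the existing i386 ones, but the
uaccess-*.h file names other than uaccess-intel.h are hypothetical):

	#if defined(CONFIG_M386) || defined(CONFIG_M486)
	#include <asm/uaccess-386.h>	/* original REP MOVSL copy */
	#elif defined(CONFIG_M586) || defined(CONFIG_M586TSC)
	#include <asm/uaccess-586.h>	/* revised REP MOVSL copy */
	#else
	#include <asm/uaccess-intel.h>	/* revised REP MOVSL + unrolled MOVL */
	#endif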






Thread overview: 17+ messages
2002-11-02  8:06 [PATCH] 2/2 2.5.45 cleanup & add original copy_to/from_user Akira Tsukamoto
2002-11-02 10:32 ` Andrew Morton
2002-11-02 11:07   ` Akira Tsukamoto
2002-11-02 18:13     ` Andrew Morton
2002-11-03  2:43       ` Akira Tsukamoto
2002-11-03  4:04         ` Andrew Morton
2002-11-03  2:43       ` Akira Tsukamoto
2002-11-03  2:57   ` Akira Tsukamoto
2002-11-03 21:24     ` Dave Jones
2002-11-03 22:22       ` Andries Brouwer
     [not found] <20021102025838.220E.AT541@columbia.edu.suse.lists.linux.kernel>
     [not found] ` <3DC3A9C0.7979C276@digeo.com.suse.lists.linux.kernel>
2002-11-02 10:58   ` Andi Kleen
2002-11-02 11:03     ` Andrew Morton
2002-11-02 16:55     ` Denis Vlasenko
2002-11-02 12:09       ` Andi Kleen
2002-11-02 17:08         ` Denis Vlasenko
2002-11-02 12:23           ` Andi Kleen
2002-11-04  3:36 Akira Tsukamoto
