* memcpy(a,b,CONST) is not inlined by gcc 3.4.1 in Linux kernel
@ 2005-03-29 14:37 Denis Vlasenko
2005-03-29 15:06 ` Richard Guenther
` (3 more replies)
0 siblings, 4 replies; 24+ messages in thread
From: Denis Vlasenko @ 2005-03-29 14:37 UTC (permalink / raw)
To: linux-kernel, gcc
Try testcase below the sig.
This causes nearly one thousand calls to memcpy in my kernel
(not an allyesconfig one):
# objdump -d vmlinux | grep -F '<memcpy>' | wc -l
959
# gcc -O2 -c t.c
# objdump -r -d t.o
t.o: file format elf32-i386
Disassembly of section .text:
00000000 <f3>:
0: 55 push %ebp
1: 89 e5 mov %esp,%ebp
3: 83 ec 0c sub $0xc,%esp
6: 6a 03 push $0x3
8: ff 75 0c pushl 0xc(%ebp)
b: ff 75 08 pushl 0x8(%ebp)
e: e8 fc ff ff ff call f <f3+0xf>
f: R_386_PC32 memcpy
13: 83 c4 10 add $0x10,%esp
16: c9 leave
17: c3 ret
00000018 <f3b>:
18: 55 push %ebp
19: 89 e5 mov %esp,%ebp
1b: 8b 55 0c mov 0xc(%ebp),%edx
1e: 66 8b 02 mov (%edx),%ax
21: 8b 4d 08 mov 0x8(%ebp),%ecx
24: 66 89 01 mov %ax,(%ecx)
27: 8a 42 02 mov 0x2(%edx),%al
2a: 88 41 02 mov %al,0x2(%ecx)
2d: c9 leave
2e: c3 ret
2f: 90 nop
00000030 <f3k>:
30: 55 push %ebp
31: 89 e5 mov %esp,%ebp
33: 57 push %edi
34: 56 push %esi
35: 8b 7d 08 mov 0x8(%ebp),%edi
38: 8b 75 0c mov 0xc(%ebp),%esi
3b: b9 ee 02 00 00 mov $0x2ee,%ecx
40: f3 a5 repz movsl %ds:(%esi),%es:(%edi)
42: 5e pop %esi
43: 5f pop %edi
44: c9 leave
45: c3 ret
--
vda
typedef unsigned int size_t;
/*
 * Variable-count i386 memcpy: "rep movsl" copies n/4 dwords, then the
 * testb/movsw/movsb sequence copies the 0-3 leftover bytes.
 * %b4 is the low byte of operand 4 (n), which is why n uses the "q"
 * constraint (must sit in a byte-addressable register).
 * d0/d1/d2 only absorb the clobbered ecx/edi/esi; their values are unused.
 */
static inline void * __memcpy(void * to, const void * from, size_t n)
{
int d0, d1, d2;
__asm__ __volatile__(
"rep ; movsl\n\t"
"testb $2,%b4\n\t"
"je 1f\n\t"
"movsw\n"
"1:\ttestb $1,%b4\n\t"
"je 2f\n\t"
"movsb\n"
"2:"
: "=&c" (d0), "=&D" (d1), "=&S" (d2)
:"0" (n/4), "q" (n),"1" ((long) to),"2" ((long) from)
: "memory");
return (to);
}
/*
* This looks horribly ugly, but the compiler can optimize it totally,
* as the count is constant.
*/
static inline void * __constant_memcpy(void * to, const void * from, size_t n)
{
if (n <= 128)
return __builtin_memcpy(to, from, n);
/* COMMON(x): "rep movsl" for n/4 dwords followed by the tail insns in x.
 * It names d0/d1/d2, so it must expand inside the block below that
 * declares them -- hence the odd macro-before-block layout. */
#define COMMON(x) \
__asm__ __volatile__( \
"rep ; movsl" \
x \
: "=&c" (d0), "=&D" (d1), "=&S" (d2) \
: "0" (n/4),"1" ((long) to),"2" ((long) from) \
: "memory");
{
int d0, d1, d2;
/* n is a compile-time constant, so exactly one case survives optimization */
switch (n % 4) {
case 0: COMMON(""); return to;
case 1: COMMON("\n\tmovsb"); return to;
case 2: COMMON("\n\tmovsw"); return to;
default: COMMON("\n\tmovsw\n\tmovsb"); return to;
}
}
#undef COMMON
}
/* Dispatch: constant n goes to the compile-time-expandable variant,
 * variable n to the generic asm version.
 * NOTE(review): f3/f3b/f3k are declared int but return nothing -- they
 * exist only so the generated code can be inspected with objdump. */
#define memcpy(t, f, n) \
(__builtin_constant_p(n) ? \
__constant_memcpy((t),(f),(n)) : \
__memcpy((t),(f),(n)))
int f3(char *a, char *b) { memcpy(a,b,3); }
int f3b(char *a, char *b) { __builtin_memcpy(a,b,3); }
int f3k(char *a, char *b) { memcpy(a,b,3000); }
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: memcpy(a,b,CONST) is not inlined by gcc 3.4.1 in Linux kernel
2005-03-29 14:37 memcpy(a,b,CONST) is not inlined by gcc 3.4.1 in Linux kernel Denis Vlasenko
@ 2005-03-29 15:06 ` Richard Guenther
2005-03-29 15:08 ` Nathan Sidwell
` (2 subsequent siblings)
3 siblings, 0 replies; 24+ messages in thread
From: Richard Guenther @ 2005-03-29 15:06 UTC (permalink / raw)
To: Denis Vlasenko; +Cc: linux-kernel, gcc
On Tue, 29 Mar 2005 17:37:06 +0300, Denis Vlasenko <vda@ilport.com.ua> wrote:
> Try testcase below the sig.
>
> This causes nearly one thousand calls to memcpy in my kernel
> (not an allyesconfig one):
> static inline void * __memcpy(void * to, const void * from, size_t n)
> {
> int d0, d1, d2;
> __asm__ __volatile__(
> "rep ; movsl\n\t"
> "testb $2,%b4\n\t"
> "je 1f\n\t"
> "movsw\n"
> "1:\ttestb $1,%b4\n\t"
> "je 2f\n\t"
> "movsb\n"
> "2:"
> : "=&c" (d0), "=&D" (d1), "=&S" (d2)
> :"0" (n/4), "q" (n),"1" ((long) to),"2" ((long) from)
> : "memory");
> return (to);
> }
The question is, what reason does -Winline give for this inlining
decision? And then
of course, how is the size estimate counted for the above. What kind
of tree node do
we get for the ASM expression?
Richard.
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: memcpy(a,b,CONST) is not inlined by gcc 3.4.1 in Linux kernel
2005-03-29 14:37 memcpy(a,b,CONST) is not inlined by gcc 3.4.1 in Linux kernel Denis Vlasenko
2005-03-29 15:06 ` Richard Guenther
@ 2005-03-29 15:08 ` Nathan Sidwell
2005-03-29 15:13 ` Jakub Jelinek
2005-03-29 20:22 ` [PATCH] fix i386 memcpy Denis Vlasenko
3 siblings, 0 replies; 24+ messages in thread
From: Nathan Sidwell @ 2005-03-29 15:08 UTC (permalink / raw)
To: Denis Vlasenko; +Cc: linux-kernel, gcc
Denis Vlasenko wrote:
> Disassembly of section .text:
>
> e: e8 fc ff ff ff call f <f3+0xf>
> f: R_386_PC32 memcpy
> #define memcpy(t, f, n) \
> (__builtin_constant_p(n) ? \
> __constant_memcpy((t),(f),(n)) : \
> __memcpy((t),(f),(n)))
given this #define, how can 'memcpy' appear in the object file? It appears
that something odd is happening with preprocessing. Check the .i files are
as you expect. -dD and -E options will be helpful to you.
nathan
--
Nathan Sidwell :: http://www.codesourcery.com :: CodeSourcery LLC
nathan@codesourcery.com :: http://www.planetfall.pwp.blueyonder.co.uk
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: memcpy(a,b,CONST) is not inlined by gcc 3.4.1 in Linux kernel
2005-03-29 14:37 memcpy(a,b,CONST) is not inlined by gcc 3.4.1 in Linux kernel Denis Vlasenko
2005-03-29 15:06 ` Richard Guenther
2005-03-29 15:08 ` Nathan Sidwell
@ 2005-03-29 15:13 ` Jakub Jelinek
2005-03-29 15:42 ` Andrew Pinski
2005-03-29 20:22 ` [PATCH] fix i386 memcpy Denis Vlasenko
3 siblings, 1 reply; 24+ messages in thread
From: Jakub Jelinek @ 2005-03-29 15:13 UTC (permalink / raw)
To: Denis Vlasenko; +Cc: linux-kernel, gcc
On Tue, Mar 29, 2005 at 05:37:06PM +0300, Denis Vlasenko wrote:
> typedef unsigned int size_t;
>
> static inline void * __memcpy(void * to, const void * from, size_t n)
> {
> int d0, d1, d2;
> __asm__ __volatile__(
> "rep ; movsl\n\t"
> "testb $2,%b4\n\t"
> "je 1f\n\t"
> "movsw\n"
> "1:\ttestb $1,%b4\n\t"
> "je 2f\n\t"
> "movsb\n"
> "2:"
> : "=&c" (d0), "=&D" (d1), "=&S" (d2)
> :"0" (n/4), "q" (n),"1" ((long) to),"2" ((long) from)
> : "memory");
> return (to);
> }
>
> /*
> * This looks horribly ugly, but the compiler can optimize it totally,
> * as the count is constant.
> */
> static inline void * __constant_memcpy(void * to, const void * from, size_t n)
> {
> if (n <= 128)
> return __builtin_memcpy(to, from, n);
>
> #define COMMON(x) \
> __asm__ __volatile__( \
> "rep ; movsl" \
> x \
> : "=&c" (d0), "=&D" (d1), "=&S" (d2) \
> : "0" (n/4),"1" ((long) to),"2" ((long) from) \
> : "memory");
> {
> int d0, d1, d2;
> switch (n % 4) {
> case 0: COMMON(""); return to;
> case 1: COMMON("\n\tmovsb"); return to;
> case 2: COMMON("\n\tmovsw"); return to;
> default: COMMON("\n\tmovsw\n\tmovsb"); return to;
> }
> }
>
> #undef COMMON
> }
>
> #define memcpy(t, f, n) \
> (__builtin_constant_p(n) ? \
> __constant_memcpy((t),(f),(n)) : \
> __memcpy((t),(f),(n)))
>
> int f3(char *a, char *b) { memcpy(a,b,3); }
The problem is that in GCC < 4.0 there is no constant propagation
pass before expanding builtin functions, so the __builtin_memcpy
call above sees a variable rather than a constant.
Either use GCC 4.0+, where this works just fine, or move the
n <= 128 case into the macro:
#define memcpy(t, f, n) \
(__builtin_constant_p(n) ? \
((n) <= 128 ? __builtin_memcpy(t,f,n) : __constant_memcpy(t,f,n)) : \
__memcpy(t,f,n))
Jakub
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: memcpy(a,b,CONST) is not inlined by gcc 3.4.1 in Linux kernel
2005-03-29 15:13 ` Jakub Jelinek
@ 2005-03-29 15:42 ` Andrew Pinski
2005-03-30 2:27 ` Gerold Jury
0 siblings, 1 reply; 24+ messages in thread
From: Andrew Pinski @ 2005-03-29 15:42 UTC (permalink / raw)
To: jakub; +Cc: Denis Vlasenko, linux-kernel, gcc
>
> On Tue, Mar 29, 2005 at 05:37:06PM +0300, Denis Vlasenko wrote:
> > /*
> > * This looks horribly ugly, but the compiler can optimize it totally,
> > * as the count is constant.
> > */
> > static inline void * __constant_memcpy(void * to, const void * from, size_t n)
> > {
> > if (n <= 128)
> > return __builtin_memcpy(to, from, n);
> The problem is that in GCC < 4.0 there is no constant propagation
> pass before expanding builtin functions, so the __builtin_memcpy
> call above sees a variable rather than a constant.
or changing "size_t n" to "const size_t n" will also fix the issue.
As we do some (well, very little, and only with inlining and const values)
constant propagation before 4.0.0 on the trees before expanding the builtin.
-- Pinski
^ permalink raw reply [flat|nested] 24+ messages in thread
* [PATCH] fix i386 memcpy
2005-03-29 14:37 memcpy(a,b,CONST) is not inlined by gcc 3.4.1 in Linux kernel Denis Vlasenko
` (2 preceding siblings ...)
2005-03-29 15:13 ` Jakub Jelinek
@ 2005-03-29 20:22 ` Denis Vlasenko
2005-03-29 20:24 ` Denis Vlasenko
3 siblings, 1 reply; 24+ messages in thread
From: Denis Vlasenko @ 2005-03-29 20:22 UTC (permalink / raw)
To: Andrew Morton, Denis Vlasenko; +Cc: linux-kernel, gcc
[-- Attachment #1: Type: text/plain, Size: 2961 bytes --]
This patch shortens non-constant memcpy() by two bytes
and fixes spurious out-of-line constant memcpy().
Patch is run-tested (I run on patched kernel right now).
Benchmark and code generation test program will be mailed as reply.
# size vmlinux.org vmlinux
text data bss dec hex filename
3954591 1553426 236544 5744561 57a7b1 vmlinux.org
3952615 1553426 236544 5742585 579ff9 vmlinux
Example of changes (part of dump_fpu() body):
old.............................................. new......................
8d 83 40 02 00 00 lea 0x240(%ebx),%eax 8d b3 40 02 00 00 lea 0x240(%ebx),%esi
74 31 je c0108b27 <dump_fpu+0x9c> 74 2e je c0108b1d <dump_fpu+0x92>
6a 1c push $0x1c 8b 7d 0c mov 0xc(%ebp),%edi
50 push %eax b9 07 00 00 00 mov $0x7,%ecx
56 push %esi f3 a5 repz movsl %ds:(%esi),%es:(%edi)
e8 49 21 10 00 call c020ac48 <memcpy> 8b 55 0c mov 0xc(%ebp),%edx
83 c4 0c add $0xc,%esp 83 c2 1c add $0x1c,%edx
83 c6 1c add $0x1c,%esi 8d 83 60 02 00 00 lea 0x260(%ebx),%eax
81 c3 60 02 00 00 add $0x260,%ebx b9 07 00 00 00 mov $0x7,%ecx
bf 07 00 00 00 mov $0x7,%edi 89 d7 mov %edx,%edi
6a 0a push $0xa 89 c6 mov %eax,%esi
53 push %ebx a5 movsl %ds:(%esi),%es:(%edi)
56 push %esi a5 movsl %ds:(%esi),%es:(%edi)
e8 2f 21 10 00 call c020ac48 <memcpy> 66 a5 movsw %ds:(%esi),%es:(%edi)
83 c4 0c add $0xc,%esp 83 c2 0a add $0xa,%edx
83 c6 0a add $0xa,%esi 83 c0 10 add $0x10,%eax
83 c3 10 add $0x10,%ebx 49 dec %ecx
4f dec %edi 79 ef jns c0108b0a <dump_fpu+0x7f>
79 eb jns c0108b10 <dump_fpu+0x85> eb 0a jmp c0108b27 <dump_fpu+0x9c>
eb 0c jmp c0108b33 <dump_fpu+0xa8> 8b 7d 0c mov 0xc(%ebp),%edi
6a 6c push $0x6c b9 1b 00 00 00 mov $0x1b,%ecx
50 push %eax f3 a5 repz movsl %ds:(%esi),%es:(%edi)
56 push %esi 8b 45 f0 mov 0xfffffff0(%ebp),%eax
e8 18 21 10 00 call c020ac48 <memcpy> 5a pop %edx
83 c4 0c add $0xc,%esp 5b pop %ebx
8b 45 f0 mov 0xfffffff0(%ebp),%eax
8d 65 f4 lea 0xfffffff4(%ebp),%esp
5b pop %ebx
5e pop %esi
--
vda
[-- Attachment #2: string.memcpy.diff --]
[-- Type: text/x-diff, Size: 2953 bytes --]
--- linux-2.6.11.src/include/asm-i386/string.h.orig Thu Mar 3 09:31:08 2005
+++ linux-2.6.11.src/include/asm-i386/string.h Tue Mar 29 22:05:00 2005
@@ -198,46 +198,75 @@ static inline void * __memcpy(void * to,
int d0, d1, d2;
__asm__ __volatile__(
"rep ; movsl\n\t"
- "testb $2,%b4\n\t"
- "je 1f\n\t"
- "movsw\n"
- "1:\ttestb $1,%b4\n\t"
- "je 2f\n\t"
- "movsb\n"
- "2:"
+ "movl %4,%%ecx\n\t"
+ "andl $3,%%ecx\n\t"
+ "jz 1f\n\t" /* pay 2 byte penalty for a chance to skip microcoded rep */
+ "rep ; movsb\n\t"
+ "1:"
: "=&c" (d0), "=&D" (d1), "=&S" (d2)
- :"0" (n/4), "q" (n),"1" ((long) to),"2" ((long) from)
+ : "0" (n/4), "g" (n), "1" ((long) to), "2" ((long) from)
: "memory");
return (to);
}
/*
- * This looks horribly ugly, but the compiler can optimize it totally,
+ * This looks ugly, but the compiler can optimize it totally,
* as the count is constant.
*/
static inline void * __constant_memcpy(void * to, const void * from, size_t n)
{
- if (n <= 128)
- return __builtin_memcpy(to, from, n);
-
-#define COMMON(x) \
-__asm__ __volatile__( \
- "rep ; movsl" \
- x \
- : "=&c" (d0), "=&D" (d1), "=&S" (d2) \
- : "0" (n/4),"1" ((long) to),"2" ((long) from) \
- : "memory");
-{
- int d0, d1, d2;
+#if 1 /* want to do small copies with non-string ops? */
+ switch (n) {
+ case 0: return to;
+ case 1: *(char*)to = *(char*)from; return to;
+ case 2: *(short*)to = *(short*)from; return to;
+ case 4: *(int*)to = *(int*)from; return to;
+#if 1 /* including those doable with two moves? */
+ case 3: *(short*)to = *(short*)from;
+ *((char*)to+2) = *((char*)from+2); return to;
+ case 5: *(int*)to = *(int*)from;
+ *((char*)to+4) = *((char*)from+4); return to;
+ case 6: *(int*)to = *(int*)from;
+ *((short*)to+2) = *((short*)from+2); return to;
+ case 8: *(int*)to = *(int*)from;
+ *((int*)to+1) = *((int*)from+1); return to;
+#endif
+ }
+#else
+ if (!n) return to;
+#endif
+ {
+ /* load esi/edi */
+ int esi, edi;
+ __asm__ __volatile__(
+ ""
+ : "=&D" (edi), "=&S" (esi)
+ : "0" ((long) to),"1" ((long) from)
+ : "memory"
+ );
+ }
+ if (n >= 5*4) {
+ /* large block: use rep prefix */
+ int ecx;
+ __asm__ __volatile__(
+ "rep ; movsl"
+ : "=&c" (ecx)
+ : "0" (n/4)
+ );
+ } else {
+ /* small block: don't clobber ecx + smaller code */
+ if (n >= 4*4) __asm__ __volatile__("movsl");
+ if (n >= 3*4) __asm__ __volatile__("movsl");
+ if (n >= 2*4) __asm__ __volatile__("movsl");
+ if (n >= 1*4) __asm__ __volatile__("movsl");
+ }
switch (n % 4) {
- case 0: COMMON(""); return to;
- case 1: COMMON("\n\tmovsb"); return to;
- case 2: COMMON("\n\tmovsw"); return to;
- default: COMMON("\n\tmovsw\n\tmovsb"); return to;
+ /* tail */
+ case 0: return to;
+ case 1: __asm__ __volatile__("movsb"); return to;
+ case 2: __asm__ __volatile__("movsw"); return to;
+ default: __asm__ __volatile__("movsw\n\tmovsb"); return to;
}
-}
-
-#undef COMMON
}
#define __HAVE_ARCH_MEMCPY
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH] fix i386 memcpy
2005-03-29 20:22 ` [PATCH] fix i386 memcpy Denis Vlasenko
@ 2005-03-29 20:24 ` Denis Vlasenko
0 siblings, 0 replies; 24+ messages in thread
From: Denis Vlasenko @ 2005-03-29 20:24 UTC (permalink / raw)
To: Denis Vlasenko, Andrew Morton; +Cc: linux-kernel, gcc
[-- Attachment #1: Type: text/plain, Size: 298 bytes --]
On Tuesday 29 March 2005 23:22, Denis Vlasenko wrote:
> This patch shortens non-constant memcpy() by two bytes
> and fixes spurious out-of-line constant memcpy().
>
> Patch is run-tested (I run on patched kernel right now).
>
> Benchmark and code generation test program will be mailed as reply.
[-- Attachment #2: bench.c --]
[-- Type: text/x-csrc, Size: 3755 bytes --]
/* Compile with: gcc -Os -fomit-frame-pointer -falign-functions=32 */
/* results:
processor : 0
vendor_id : GenuineIntel
cpu family : 6
model : 11
model name : Intel(R) Celeron
stepping : 1
cpu MHz : 1196.236
cache size : 256 KB
movsl_X wins : N<=3
rep_movsl wins : N>=6
('assign' wins always at the cost of much larger code)
*/
#include <time.h>
#include <stdio.h>
/* N: dword count used by every benchmark variant below. */
#define N 5
/* MOVSLk expands to k back-to-back "movsl" instructions;
 * MOVSL(n) selects the right ladder rung after macro-expanding n. */
#define MOVSL1 __asm__ __volatile__("movsl")
#define MOVSL2 MOVSL1;MOVSL1
#define MOVSL3 MOVSL2;MOVSL1
#define MOVSL4 MOVSL3;MOVSL1
#define MOVSL5 MOVSL4;MOVSL1
#define MOVSL6 MOVSL5;MOVSL1
#define MOVSL7 MOVSL6;MOVSL1
#define MOVSL8 MOVSL7;MOVSL1
#define MOVSL9 MOVSL8;MOVSL1
#define MOVSL10 MOVSL9;MOVSL1
#define MOVSL11 MOVSL10;MOVSL1
#define MOVSL12 MOVSL11;MOVSL1
#define MOVSL13 MOVSL12;MOVSL1
#define MOVSL14 MOVSL13;MOVSL1
#define MOVSL15 MOVSL14;MOVSL1
#define MOVSL16 MOVSL15;MOVSL1
#define MOVSL17 MOVSL16;MOVSL1
#define MOVSL18 MOVSL17;MOVSL1
#define MOVSL19 MOVSL18;MOVSL1
/* extra indirection so MOVSL(N) expands N before token-pasting */
#define MOVSL_(n) MOVSL##n
#define MOVSL(n) MOVSL_(n)
/* Benchmark primitive: copy n/4 dwords with "rep movsl".
 * The first (empty) asm only pins 'to'/'from' into edi/esi; the second
 * asm then issues the copy reading those registers implicitly.
 * NOTE(review): this depends on the compiler leaving edi/esi untouched
 * between the two asm statements -- fragile, but that split is exactly
 * the construct being benchmarked.
 * Declared void* yet returns no value (benchmark-only; result unused). */
static inline void * rep_movsl(void * to, const void * from, size_t n)
{
{
int esi, edi;
__asm__ __volatile__(
""
: "=&D" (edi), "=&S" (esi)
: "0" ((long) to),"1" ((long) from)
: "memory"
);
}
{
int ecx;
__asm__ __volatile__(
"rep ; movsl"
: "=&c" (ecx)
: "0" (n/4)
);
}
}
/* Benchmark primitive: copy with N discrete "movsl" instructions
 * (the MOVSL(N) ladder) instead of a rep prefix.
 * The empty asm pins 'to'/'from' into edi/esi, same trick as rep_movsl.
 * NOTE(review): parameter n is ignored -- the count is fixed by the
 * compile-time macro N.  Declared void* but returns nothing. */
static inline void * movsl_X(void * to, const void * from, size_t n)
{
{
int esi, edi;
__asm__ __volatile__(
""
: "=&D" (edi), "=&S" (esi)
: "0" ((long) to),"1" ((long) from)
: "memory"
);
}
MOVSL(N);
}
/* Benchmark primitive: copy n bytes via open-coded unsigned-long
 * assignments for n in {4,8,12,16,20}; any other n falls back to
 * rep_movsl.  With a constant n the switch resolves at compile time,
 * so this measures plain mov-based copying ("assign" in the results). */
static inline void * assign(void * to, const void * from, size_t n)
{
switch (n) {
case 4:
*(unsigned long *)to = *(const unsigned long *)from;
return to;
case 8:
*(unsigned long *)to = *(const unsigned long *)from;
*(1+(unsigned long *)to) = *(1+(const unsigned long *)from);
return to;
case 12:
*(unsigned long *)to = *(const unsigned long *)from;
*(1+(unsigned long *)to) = *(1+(const unsigned long *)from);
*(2+(unsigned long *)to) = *(2+(const unsigned long *)from);
return to;
case 16:
*(unsigned long *)to = *(const unsigned long *)from;
*(1+(unsigned long *)to) = *(1+(const unsigned long *)from);
*(2+(unsigned long *)to) = *(2+(const unsigned long *)from);
*(3+(unsigned long *)to) = *(3+(const unsigned long *)from);
return to;
case 20:
*(unsigned long *)to = *(const unsigned long *)from;
*(1+(unsigned long *)to) = *(1+(const unsigned long *)from);
*(2+(unsigned long *)to) = *(2+(const unsigned long *)from);
*(3+(unsigned long *)to) = *(3+(const unsigned long *)from);
*(4+(unsigned long *)to) = *(4+(const unsigned long *)from);
return to;
default:
return rep_movsl(to, from, n);
}
}
/* Source/destination buffers, plus pointer aliases for the "indirect"
 * variants (copying through fp/tp keeps the addresses out of the
 * compiler's sight, unlike the direct f/t array references). */
char f[256],t[256];
char *fp = f;
char *tp = t;
/* one zero-argument wrapper per strategy, direct then indirect,
 * so measure() can take them as plain function pointers */
void r() { rep_movsl(f,t,N*4); }
void m() { movsl_X(f,t,N*4); }
void a() { assign(f,t,N*4); }
void rp() { rep_movsl(fp,tp,N*4); }
void mp() { movsl_X(fp,tp,N*4); }
void ap() { assign(fp,tp,N*4); }
/* Count how many calls to fn() fit into one wall-clock second.
 * First spins until time() ticks over so caches are hot, then counts
 * calls (16 per batch, manually unrolled to keep loop overhead out of
 * the measurement) until the next tick. */
int measure(void (*fn)()) {
	int calls = 0;
	time_t stamp = time(0);
	while (stamp == time(0)) fn(); /* cache hot */
	stamp = time(0);
	while (stamp == time(0)) {
		fn(); fn(); fn(); fn(); fn(); fn(); fn(); fn();
		fn(); fn(); fn(); fn(); fn(); fn(); fn(); fn();
		calls += 16;
	}
	return calls;
}
/* Driver: run each copy strategy against the global arrays, then
 * through the pointer aliases, printing calls-per-second for each. */
int main() {
	static const char *label[3] = {
		"rep movsl(%d) per sec: %d\n",
		" movsl_X(%d) per sec: %d\n",
		" assign(%d) per sec: %d\n",
	};
	void (*direct[3])() = { r, m, a };
	void (*indirect[3])() = { rp, mp, ap };
	int i;
	printf("On global array:\n");
	for (i = 0; i < 3; i++)
		printf(label[i], N, measure(direct[i]));
	printf("Indirect:\n");
	for (i = 0; i < 3; i++)
		printf(label[i], N, measure(indirect[i]));
	return 0;
}
[-- Attachment #3: codecheck.c --]
[-- Type: text/x-csrc, Size: 5993 bytes --]
/* Compile with: gcc -Os -fomit-frame-pointer */
/* Check for correctness/size: objdump -r -d <file.o> | $PAGER */
typedef unsigned int size_t;
/*
 * Constant-count memcpy, asm-only version (no __builtin_memcpy):
 *  - n in {0,1,2,3,4,5,6,8}: open-coded one/two-move assignments;
 *  - n >= 20: "rep movsl" for the dword part;
 *  - otherwise: up to four discrete "movsl" instructions;
 *  - then a movsb/movsw tail for n % 4 leftover bytes.
 * The empty asm pins to/from into edi/esi; the later asm statements use
 * those registers implicitly.  NOTE(review): this relies on the compiler
 * not disturbing edi/esi (or reordering across the statements) between
 * the separate asm blocks -- deliberate but fragile.
 */
static inline void * __constant_memcpy(void * to, const void * from, size_t n)
{
#if 1 /* want to do small copies with non-string ops? */
switch (n) {
case 0: return to;
case 1: *(char*)to = *(char*)from; return to;
case 2: *(short*)to = *(short*)from; return to;
case 4: *(int*)to = *(int*)from; return to;
#if 1 /* including those doable with two moves? */
case 3: *(short*)to = *(short*)from;
*((char*)to+2) = *((char*)from+2); return to;
case 5: *(int*)to = *(int*)from;
*((char*)to+4) = *((char*)from+4); return to;
case 6: *(int*)to = *(int*)from;
*((short*)to+2) = *((short*)from+2); return to;
case 8: *(int*)to = *(int*)from;
*((int*)to+1) = *((int*)from+1); return to;
#endif
}
#else
if (!n) return to;
#endif
{
/* load esi/edi */
int esi, edi;
__asm__ __volatile__(
""
: "=&D" (edi), "=&S" (esi)
: "0" ((long) to),"1" ((long) from)
: "memory"
);
}
if (n >= 5*4) {
/* large block: use rep prefix */
int ecx;
__asm__ __volatile__(
"rep ; movsl"
: "=&c" (ecx)
: "0" (n/4)
);
} else {
/* small block: don't clobber ecx + smaller code */
if (n >= 4*4) __asm__ __volatile__("movsl");
if (n >= 3*4) __asm__ __volatile__("movsl");
if (n >= 2*4) __asm__ __volatile__("movsl");
if (n >= 1*4) __asm__ __volatile__("movsl");
}
switch (n % 4) {
/* tail */
case 0: return to;
case 1: __asm__ __volatile__("movsb"); return to;
case 2: __asm__ __volatile__("movsw"); return to;
default: __asm__ __volatile__("movsw\n\tmovsb"); return to;
}
}
/* Variable-count memcpy, patched form: "rep movsl" copies n/4 dwords,
 * then n is reloaded into ecx, masked to n & 3, and the remaining bytes
 * (if any) are copied with "rep movsb".  Compared with the old
 * testb/movsw/movsb tail, n can now live in memory ("g" constraint)
 * instead of needing a byte register ("q"). */
static inline void * __memcpy(void * to, const void * from, size_t n)
{
int d0, d1, d2;
__asm__ __volatile__(
"rep ; movsl\n\t"
"movl %4,%%ecx\n\t"
"andl $3,%%ecx\n\t"
"jz 1f\n\t" /* pay 2 byte penalty for a chance to skip microcoded rep */
"rep ; movsb\n\t"
"1:"
: "=&c" (d0), "=&D" (d1), "=&S" (d2)
: "0" (n/4), "g" (n), "1" ((long) to), "2" ((long) from)
: "memory");
return (to);
}
/* Dispatch: constant n goes to the compile-time-expandable variant,
 * variable n to the generic asm version. */
#define memcpy(t, f, n) \
(__builtin_constant_p(n) ? \
__constant_memcpy((t),(f),(n)) : \
__memcpy((t),(f),(n)))
/* One probe function per copy size, each placed in its own section so
 * "objdump -r -d" shows the code generated for that exact size in
 * isolation.  NOTE(review): declared int but return nothing -- these
 * exist purely to inspect codegen, never to be called. */
int f00(char *a, char *b) __attribute__ ((section ("ff00"))); int f00(char *a, char *b) { memcpy(a,b,0); }
int f01(char *a, char *b) __attribute__ ((section ("ff01"))); int f01(char *a, char *b) { memcpy(a,b,1); }
int f02(char *a, char *b) __attribute__ ((section ("ff02"))); int f02(char *a, char *b) { memcpy(a,b,2); }
int f03(char *a, char *b) __attribute__ ((section ("ff03"))); int f03(char *a, char *b) { memcpy(a,b,3); }
int f04(char *a, char *b) __attribute__ ((section ("ff04"))); int f04(char *a, char *b) { memcpy(a,b,4); }
int f05(char *a, char *b) __attribute__ ((section ("ff05"))); int f05(char *a, char *b) { memcpy(a,b,5); }
int f06(char *a, char *b) __attribute__ ((section ("ff06"))); int f06(char *a, char *b) { memcpy(a,b,6); }
int f07(char *a, char *b) __attribute__ ((section ("ff07"))); int f07(char *a, char *b) { memcpy(a,b,7); }
int f08(char *a, char *b) __attribute__ ((section ("ff08"))); int f08(char *a, char *b) { memcpy(a,b,8); }
int f09(char *a, char *b) __attribute__ ((section ("ff09"))); int f09(char *a, char *b) { memcpy(a,b,9); }
int f10(char *a, char *b) __attribute__ ((section ("ff10"))); int f10(char *a, char *b) { memcpy(a,b,10); }
int f11(char *a, char *b) __attribute__ ((section ("ff11"))); int f11(char *a, char *b) { memcpy(a,b,11); }
int f12(char *a, char *b) __attribute__ ((section ("ff12"))); int f12(char *a, char *b) { memcpy(a,b,12); }
int f13(char *a, char *b) __attribute__ ((section ("ff13"))); int f13(char *a, char *b) { memcpy(a,b,13); }
int f14(char *a, char *b) __attribute__ ((section ("ff14"))); int f14(char *a, char *b) { memcpy(a,b,14); }
int f15(char *a, char *b) __attribute__ ((section ("ff15"))); int f15(char *a, char *b) { memcpy(a,b,15); }
int f16(char *a, char *b) __attribute__ ((section ("ff16"))); int f16(char *a, char *b) { memcpy(a,b,16); }
int f17(char *a, char *b) __attribute__ ((section ("ff17"))); int f17(char *a, char *b) { memcpy(a,b,17); }
int f18(char *a, char *b) __attribute__ ((section ("ff18"))); int f18(char *a, char *b) { memcpy(a,b,18); }
int f19(char *a, char *b) __attribute__ ((section ("ff19"))); int f19(char *a, char *b) { memcpy(a,b,19); }
int f20(char *a, char *b) __attribute__ ((section ("ff20"))); int f20(char *a, char *b) { memcpy(a,b,20); }
int f21(char *a, char *b) __attribute__ ((section ("ff21"))); int f21(char *a, char *b) { memcpy(a,b,21); }
int f22(char *a, char *b) __attribute__ ((section ("ff22"))); int f22(char *a, char *b) { memcpy(a,b,22); }
int f23(char *a, char *b) __attribute__ ((section ("ff23"))); int f23(char *a, char *b) { memcpy(a,b,23); }
int f24(char *a, char *b) __attribute__ ((section ("ff24"))); int f24(char *a, char *b) { memcpy(a,b,24); }
int f25(char *a, char *b) __attribute__ ((section ("ff25"))); int f25(char *a, char *b) { memcpy(a,b,25); }
int f26(char *a, char *b) __attribute__ ((section ("ff26"))); int f26(char *a, char *b) { memcpy(a,b,26); }
int f27(char *a, char *b) __attribute__ ((section ("ff27"))); int f27(char *a, char *b) { memcpy(a,b,27); }
int f28(char *a, char *b) __attribute__ ((section ("ff28"))); int f28(char *a, char *b) { memcpy(a,b,28); }
int f29(char *a, char *b) __attribute__ ((section ("ff29"))); int f29(char *a, char *b) { memcpy(a,b,29); }
int f3k(char *a, char *b) __attribute__ ((section ("ff3k"))); int f3k(char *a, char *b) { memcpy(a,b,3000); }
/* Expand many constant-size copies back-to-back in one function, so the
 * code emitted for each size can be compared in a single disassembly.
 * NOTE(review): declared int but returns nothing -- codegen test only. */
int f(char *a, char *b) {
memcpy(a,b,0);
memcpy(a,b,1);
memcpy(a,b,2);
memcpy(a,b,3);
memcpy(a,b,4);
memcpy(a,b,5);
memcpy(a,b,6);
memcpy(a,b,7);
memcpy(a,b,8);
memcpy(a,b,9);
memcpy(a,b,10);
memcpy(a,b,11);
memcpy(a,b,12);
memcpy(a,b,13);
memcpy(a,b,14);
memcpy(a,b,15);
memcpy(a,b,16);
memcpy(a,b,17);
memcpy(a,b,18);
memcpy(a,b,19);
memcpy(a,b,20);
memcpy(a,b,21);
memcpy(a,b,22);
memcpy(a,b,23);
memcpy(a,b,24);
memcpy(a,b,25);
memcpy(a,b,3000);
}
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: memcpy(a,b,CONST) is not inlined by gcc 3.4.1 in Linux kernel
2005-03-29 15:42 ` Andrew Pinski
@ 2005-03-30 2:27 ` Gerold Jury
2005-03-30 6:15 ` Denis Vlasenko
0 siblings, 1 reply; 24+ messages in thread
From: Gerold Jury @ 2005-03-30 2:27 UTC (permalink / raw)
To: Linux Kernel Mailing List
>> On Tue, Mar 29, 2005 at 05:37:06PM +0300, Denis Vlasenko wrote:
>> > /*
>> > * This looks horribly ugly, but the compiler can optimize it totally,
>> > * as the count is constant.
>> > */
>> > static inline void * __constant_memcpy(void * to, const void * from,
>> > size_t n) {
>> > if (n <= 128)
>> > return __builtin_memcpy(to, from, n);
>>
>> The problem is that in GCC < 4.0 there is no constant propagation
>> pass before expanding builtin functions, so the __builtin_memcpy
>> call above sees a variable rather than a constant.
>
>or change "size_t n" to "const size_t n" will also fix the issue.
>As we do some (well very little and with inlining and const values)
>const progation before 4.0.0 on the trees before expanding the builtin.
>
>-- Pinski
>-
I used the following "const size_t n" change on x86_64
and it reduced the memcpy count from 1088 to 609 with my setup and gcc 3.4.3.
(kernel 2.6.12-rc1, running now)
--- include/asm-x86_64/string.h.~1~ 2005-03-02 08:38:33.000000000 +0100
+++ include/asm-x86_64/string.h 2005-03-30 03:24:35.000000000 +0200
@@ -28,9 +28,9 @@
function. */
#define __HAVE_ARCH_MEMCPY 1
-extern void *__memcpy(void *to, const void *from, size_t len);
+extern void *__memcpy(void *to, const void *from, const size_t len);
#define memcpy(dst,src,len) \
- ({ size_t __len = (len); \
+ ({ const size_t __len = (len); \
void *__ret; \
if (__builtin_constant_p(len) && __len >= 64) \
__ret = __memcpy((dst),(src),__len); \
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: memcpy(a,b,CONST) is not inlined by gcc 3.4.1 in Linux kernel
2005-03-30 2:27 ` Gerold Jury
@ 2005-03-30 6:15 ` Denis Vlasenko
2005-04-01 21:43 ` Jan Hubicka
0 siblings, 1 reply; 24+ messages in thread
From: Denis Vlasenko @ 2005-03-30 6:15 UTC (permalink / raw)
To: Gerold Jury, jakub; +Cc: Linux Kernel Mailing List, gcc
On Wednesday 30 March 2005 05:27, Gerold Jury wrote:
>
> >> On Tue, Mar 29, 2005 at 05:37:06PM +0300, Denis Vlasenko wrote:
> >> > /*
> >> > * This looks horribly ugly, but the compiler can optimize it totally,
> >> > * as the count is constant.
> >> > */
> >> > static inline void * __constant_memcpy(void * to, const void * from,
> >> > size_t n) {
> >> > if (n <= 128)
> >> > return __builtin_memcpy(to, from, n);
> >>
> >> The problem is that in GCC < 4.0 there is no constant propagation
> >> pass before expanding builtin functions, so the __builtin_memcpy
> >> call above sees a variable rather than a constant.
> >
> >or change "size_t n" to "const size_t n" will also fix the issue.
> >As we do some (well very little and with inlining and const values)
> >const progation before 4.0.0 on the trees before expanding the builtin.
> >
> >-- Pinski
> >-
> I used the following "const size_t n" change on x86_64
> and it reduced the memcpy count from 1088 to 609 with my setup and gcc 3.4.3.
> (kernel 2.6.12-rc1, running now)
What do you mean, 'reduced'?
(/me is checking....)
Oh shit... It still emits half of memcpys, to be exact - for
struct copies:
arch/i386/kernel/process.c:
int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
unsigned long unused,
struct task_struct * p, struct pt_regs * regs)
{
struct pt_regs * childregs;
struct task_struct *tsk;
int err;
childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p->thread_info)) - 1;
*childregs = *regs;
^^^^^^^^^^^^^^^^^^^
childregs->eax = 0;
childregs->esp = esp;
# make arch/i386/kernel/process.s
copy_thread:
pushl %ebp
movl %esp, %ebp
pushl %edi
pushl %esi
pushl %ebx
subl $20, %esp
movl 24(%ebp), %eax
movl 4(%eax), %esi
pushl $60
leal 8132(%esi), %ebx
pushl 28(%ebp)
pushl %ebx
call memcpy <=================
movl $0, 24(%ebx)
movl 16(%ebp), %eax
movl %eax, 52(%ebx)
movl 24(%ebp), %edx
addl $8192, %esi
movl %ebx, 516(%edx)
movl %esi, -32(%ebp)
movl %esi, 504(%edx)
movl $ret_from_fork, 512(%edx)
Jakub, is there a way to instruct gcc to inine this copy, or better yet,
to use user-supplied inline version of memcpy?
--
vda
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: memcpy(a,b,CONST) is not inlined by gcc 3.4.1 in Linux kernel
2005-03-30 6:15 ` Denis Vlasenko
@ 2005-04-01 21:43 ` Jan Hubicka
2005-04-02 12:18 ` Denis Vlasenko
0 siblings, 1 reply; 24+ messages in thread
From: Jan Hubicka @ 2005-04-01 21:43 UTC (permalink / raw)
To: Denis Vlasenko; +Cc: Gerold Jury, jakub, Linux Kernel Mailing List, gcc
> On Wednesday 30 March 2005 05:27, Gerold Jury wrote:
> >
> > >> On Tue, Mar 29, 2005 at 05:37:06PM +0300, Denis Vlasenko wrote:
> > >> > /*
> > >> > * This looks horribly ugly, but the compiler can optimize it totally,
> > >> > * as the count is constant.
> > >> > */
> > >> > static inline void * __constant_memcpy(void * to, const void * from,
> > >> > size_t n) {
> > >> > if (n <= 128)
> > >> > return __builtin_memcpy(to, from, n);
> > >>
> > >> The problem is that in GCC < 4.0 there is no constant propagation
> > >> pass before expanding builtin functions, so the __builtin_memcpy
> > >> call above sees a variable rather than a constant.
> > >
> > >or change "size_t n" to "const size_t n" will also fix the issue.
> > >As we do some (well very little and with inlining and const values)
> > >const progation before 4.0.0 on the trees before expanding the builtin.
> > >
> > >-- Pinski
> > >-
> > I used the following "const size_t n" change on x86_64
> > and it reduced the memcpy count from 1088 to 609 with my setup and gcc 3.4.3.
> > (kernel 2.6.12-rc1, running now)
>
> What do you mean, 'reduced'?
>
> (/me is checking....)
>
> Oh shit... It still emits half of memcpys, to be exact - for
> struct copies:
>
> arch/i386/kernel/process.c:
>
> int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
> unsigned long unused,
> struct task_struct * p, struct pt_regs * regs)
> {
> struct pt_regs * childregs;
> struct task_struct *tsk;
> int err;
>
> childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p->thread_info)) - 1;
> *childregs = *regs;
> ^^^^^^^^^^^^^^^^^^^
> childregs->eax = 0;
> childregs->esp = esp;
>
> # make arch/i386/kernel/process.s
>
> copy_thread:
> pushl %ebp
> movl %esp, %ebp
> pushl %edi
> pushl %esi
> pushl %ebx
> subl $20, %esp
> movl 24(%ebp), %eax
> movl 4(%eax), %esi
> pushl $60
> leal 8132(%esi), %ebx
> pushl 28(%ebp)
> pushl %ebx
> call memcpy <=================
> movl $0, 24(%ebx)
> movl 16(%ebp), %eax
> movl %eax, 52(%ebx)
> movl 24(%ebp), %edx
> addl $8192, %esi
> movl %ebx, 516(%edx)
> movl %esi, -32(%ebp)
> movl %esi, 504(%edx)
> movl $ret_from_fork, 512(%edx)
>
> Jakub, is there a way to instruct gcc to inine this copy, or better yet,
> to use user-supplied inline version of memcpy?
You can't inline a struct copy, as it is not a function call in the first place.
You can experiment with -minline-all-stringops, where GCC will use its
own memcpy implementation for this.
Honza
> --
> vda
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: memcpy(a,b,CONST) is not inlined by gcc 3.4.1 in Linux kernel
2005-04-01 21:43 ` Jan Hubicka
@ 2005-04-02 12:18 ` Denis Vlasenko
2005-04-02 12:26 ` Denis Vlasenko
0 siblings, 1 reply; 24+ messages in thread
From: Denis Vlasenko @ 2005-04-02 12:18 UTC (permalink / raw)
To: Jan Hubicka; +Cc: Gerold Jury, jakub, Linux Kernel Mailing List, gcc
> > childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p->thread_info)) - 1;
> > *childregs = *regs;
> > ^^^^^^^^^^^^^^^^^^^
> > childregs->eax = 0;
> > childregs->esp = esp;
> >
> > # make arch/i386/kernel/process.s
> >
> > copy_thread:
> > pushl %ebp
> > movl %esp, %ebp
> > pushl %edi
> > pushl %esi
> > pushl %ebx
> > subl $20, %esp
> > movl 24(%ebp), %eax
> > movl 4(%eax), %esi
> > pushl $60
> > leal 8132(%esi), %ebx
> > pushl 28(%ebp)
> > pushl %ebx
> > call memcpy <=================
> > movl $0, 24(%ebx)
> >
> > Jakub, is there a way to instruct gcc to inline this copy, or better yet,
> > to use a user-supplied inline version of memcpy?
>
> You can't inline a struct copy, as it is not a function call in the first place.
> You can experiment with -minline-all-stringops, where GCC will use its
> own memcpy implementation for this.
No luck. Actually, memcpy calls are produced with -Os. Adding
-minline-all-stringops changes nothing.
-O2 compile does inline copying, however, suboptimally.
Pushing/popping esi/edi on the stack is not needed.
Also "mov $1,ecx; rep; movsl" is rather silly.
Here is what I tested:
t.c:
#define STRUCT1(n) struct s##n { char c[n]; } v##n, w##n; void f##n(void) { v##n = w##n; }
#define STRUCT(n) STRUCT1(n)
STRUCT(1)
STRUCT(2)
STRUCT(3)
STRUCT(4)
STRUCT(5)
STRUCT(6)
STRUCT(7)
STRUCT(8)
STRUCT(9)
STRUCT(10)
STRUCT(11)
STRUCT(12)
STRUCT(13)
STRUCT(14)
STRUCT(15)
STRUCT(16)
STRUCT(17)
STRUCT(18)
STRUCT(19)
STRUCT(20)
mk.sh:
#!/bin/sh
# yeah yeah. push/pop + 1 repetition 'rep movsl'
# 88: 55 push %ebp
# 89: 89 e5 mov %esp,%ebp
# 8b: 57 push %edi
# 8c: 56 push %esi
# 8d: fc cld
# 8e: bf 00 00 00 00 mov $0x0,%edi
# 8f: R_386_32 v7
# 93: be 00 00 00 00 mov $0x0,%esi
# 94: R_386_32 w7
# 98: b9 01 00 00 00 mov $0x1,%ecx
# 9d: f3 a5 repz movsl %ds:(%esi),%es:(%edi)
# 9f: 66 a5 movsw %ds:(%esi),%es:(%edi)
# a1: a4 movsb %ds:(%esi),%es:(%edi)
# a2: 5e pop %esi
# a3: 5f pop %edi
# a4: c9 leave
# a5: c3 ret
# a6: 89 f6 mov %esi,%esi
if false; then
gcc -O2 \
falign-functions=1 -falign-labels=1 -falign-loops=1 -falign-jumps=1 \
-c t.c
echo Done; read junk
objdump -d -r t.o | $PAGER
exit
fi
# -Os: emits memcpy
if false; then
gcc -nostdinc -isystem /.share/usr/app/gcc-3.4.1/bin/../lib/gcc/i386-pc-linux-gnu/3.4.1/include \
-Wall -Wstrict-prototypes -Wno-trigraphs -fno-strict-aliasing \
-fno-common -ffreestanding -Os -falign-functions=1 -falign-labels=1 \
-falign-loops=1 -falign-jumps=1 -fno-omit-frame-pointer -pipe -msoft-float \
-mpreferred-stack-boundary=2 -fno-unit-at-a-time -march=i486 \
-Wdeclaration-after-statement -c t.c
echo Done; read junk
objdump -d -r t.o | $PAGER
exit
fi
# -march=486: emits horrible tail:
# 271: f3 a5 repz movsl %ds:(%esi),%es:(%edi)
# 273: 5e pop %esi
# 274: 66 a1 10 00 00 00 mov 0x10,%ax
# 276: R_386_32 w19
# 27a: 5f pop %edi
# 27b: 66 a3 10 00 00 00 mov %ax,0x10
# 27d: R_386_32 v19
# 281: 5d pop %ebp
# 282: a0 12 00 00 00 mov 0x12,%al
# 283: R_386_32 w19
# 287: a2 12 00 00 00 mov %al,0x12
# 288: R_386_32 v19
# 28c: c3 ret
if false; then
gcc \
-fno-common -ffreestanding -O2 -falign-functions=1 -falign-labels=1 \
-falign-loops=1 -falign-jumps=1 -fno-omit-frame-pointer -pipe -msoft-float \
-mpreferred-stack-boundary=2 -fno-unit-at-a-time -march=i486 \
-Wdeclaration-after-statement \
-c t.c
echo Done; read junk
objdump -d -r t.o | $PAGER
exit
fi
# -Os -minline-all-stringops: still emits memcpy
if true; then
gcc -nostdinc -isystem /.share/usr/app/gcc-3.4.1/bin/../lib/gcc/i386-pc-linux-gnu/3.4.1/include \
-Wall -Wstrict-prototypes -Wno-trigraphs -fno-strict-aliasing \
-fno-common -ffreestanding -Os -minline-all-stringops -falign-functions=1 -falign-labels=1 \
-falign-loops=1 -falign-jumps=1 -fno-omit-frame-pointer -pipe -msoft-float \
-mpreferred-stack-boundary=2 -fno-unit-at-a-time -march=i486 \
-Wdeclaration-after-statement -c t.c
echo Done; read junk
objdump -d -r t.o | $PAGER
exit
fi
--
vda
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: memcpy(a,b,CONST) is not inlined by gcc 3.4.1 in Linux kernel
2005-04-02 12:18 ` Denis Vlasenko
@ 2005-04-02 12:26 ` Denis Vlasenko
2005-04-05 16:34 ` [BUG mm] "fixed" i386 memcpy inlining buggy Christophe Saout
0 siblings, 1 reply; 24+ messages in thread
From: Denis Vlasenko @ 2005-04-02 12:26 UTC (permalink / raw)
To: Jan Hubicka; +Cc: Gerold Jury, jakub, Linux Kernel Mailing List, gcc
[-- Attachment #1: Type: text/plain, Size: 360 bytes --]
On Saturday 02 April 2005 15:18, Denis Vlasenko wrote:
> -O2 compile does inline copying, however, suboptimally.
> Pushing/popping esi/edi on the stack is not needed.
> Also "mov $1,ecx; rep; movsl" is rather silly.
I think I am wrong about push/pop. Sorry.
However, other observation is still valid. You
may wish to compile this updated t.c and see.
--
vda
[-- Attachment #2: t.c --]
[-- Type: text/x-csrc, Size: 2513 bytes --]
static inline void * __memcpy(void * to, const void * from, int n)
{
int d0, d1, d2;
__asm__ __volatile__(
"rep ; movsl\n\t"
"movl %4,%%ecx\n\t"
"andl $3,%%ecx\n\t"
"jz 1f\n\t" /* pay 2 byte penalty for a chance to skip microcoded rep */
"rep ; movsb\n\t"
"1:"
: "=&c" (d0), "=&D" (d1), "=&S" (d2)
: "0" (n/4), "g" (n), "1" ((long) to), "2" ((long) from)
: "memory");
return (to);
}
/*
* This looks ugly, but the compiler can optimize it totally,
* as the count is constant.
*/
static inline void * __constant_memcpy(void * to, const void * from, int n)
{
#if 1 /* want to do small copies with non-string ops? */
switch (n) {
case 0: return to;
case 1: *(char*)to = *(char*)from; return to;
case 2: *(short*)to = *(short*)from; return to;
case 4: *(int*)to = *(int*)from; return to;
#if 1 /* including those doable with two moves? */
case 3: *(short*)to = *(short*)from;
*((char*)to+2) = *((char*)from+2); return to;
case 5: *(int*)to = *(int*)from;
*((char*)to+4) = *((char*)from+4); return to;
case 6: *(int*)to = *(int*)from;
*((short*)to+2) = *((short*)from+2); return to;
case 8: *(int*)to = *(int*)from;
*((int*)to+1) = *((int*)from+1); return to;
#endif
}
#else
if (!n) return to;
#endif
{
/* load esi/edi */
int esi, edi;
__asm__ __volatile__(
""
: "=&D" (edi), "=&S" (esi)
: "0" ((long) to),"1" ((long) from)
: "memory"
);
}
if (n >= 5*4) {
/* large block: use rep prefix */
int ecx;
__asm__ __volatile__(
"rep ; movsl"
: "=&c" (ecx)
: "0" (n/4)
);
} else {
/* small block: don't clobber ecx + smaller code */
if (n >= 4*4) __asm__ __volatile__("movsl");
if (n >= 3*4) __asm__ __volatile__("movsl");
if (n >= 2*4) __asm__ __volatile__("movsl");
if (n >= 1*4) __asm__ __volatile__("movsl");
}
switch (n % 4) {
/* tail */
case 0: return to;
case 1: __asm__ __volatile__("movsb"); return to;
case 2: __asm__ __volatile__("movsw"); return to;
default: __asm__ __volatile__("movsw\n\tmovsb"); return to;
}
}
#define memcpy(t, f, n) \
(__builtin_constant_p(n) ? \
__constant_memcpy((t),(f),(n)) : \
__memcpy((t),(f),(n)))
#define STRUCT1(n) struct s##n { char c[n]; } v##n, w##n; void f##n(void) { v##n = w##n; } void g##n(void) { memcpy(&v##n,&w##n,n); }
#define STRUCT(n) STRUCT1(n)
STRUCT(1)
STRUCT(2)
STRUCT(3)
STRUCT(4)
STRUCT(5)
STRUCT(6)
STRUCT(7)
STRUCT(8)
STRUCT(9)
STRUCT(10)
STRUCT(11)
STRUCT(12)
STRUCT(13)
STRUCT(14)
STRUCT(15)
STRUCT(16)
STRUCT(17)
STRUCT(18)
STRUCT(19)
STRUCT(20)
^ permalink raw reply [flat|nested] 24+ messages in thread
* [BUG mm] "fixed" i386 memcpy inlining buggy
2005-04-02 12:26 ` Denis Vlasenko
@ 2005-04-05 16:34 ` Christophe Saout
2005-04-06 10:14 ` Denis Vlasenko
2005-04-06 16:11 ` Denis Vlasenko
0 siblings, 2 replies; 24+ messages in thread
From: Christophe Saout @ 2005-04-05 16:34 UTC (permalink / raw)
To: Denis Vlasenko
Cc: Andrew Morton, Jan Hubicka, Gerold Jury, jakub,
Linux Kernel Mailing List, gcc
[-- Attachment #1: Type: text/plain, Size: 1798 bytes --]
Hi Denis,
the new i386 memcpy macro is a ticking timebomb.
I've been debugging a new mISDN crash, just to find out that a memcpy
was not inlined correctly.
Andrew, you should drop the fix-i386-memcpy.patch (or have it fixed).
This source code:
mISDN_pid_t pid;
[...]
memcpy(&st->mgr->pid, &pid, sizeof(mISDN_pid_t));
was compiled as:
lea 0xffffffa4(%ebp),%esi <---- %esi is loaded
( add $0x10,%ebx )
( mov %ebx,%eax ) something else
( call 1613 <test_stack_protocol+0x83> ) %esi preserved
mov 0xffffffa0(%ebp),%edx
mov 0x74(%edx),%edi <---- %edi is loaded
add $0x20,%edi offset in structure added
! mov $0x14,%esi !!!!!! <---- %esi overwritten!
mov %esi,%ecx <---- %ecx loaded
repz movsl %ds:(%esi),%es:(%edi)
Apparently the compiler decided that the value 0x14 could be reused
afterwards (which it does for an inlined memset of the same size some
instructions below) and clobbers %esi.
Looking at the macro:
__asm__ __volatile__(
""
: "=&D" (edi), "=&S" (esi)
: "0" ((long) to),"1" ((long) from)
: "memory"
);
}
if (n >= 5*4) {
/* large block: use rep prefix */
int ecx;
__asm__ __volatile__(
"rep ; movsl"
: "=&c" (ecx)
it seems obvious that the compiler assumes it can reuse %esi and %edi
for something else between the two __asm__ sections. These should
probably be merged.
[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 189 bytes --]
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [BUG mm] "fixed" i386 memcpy inlining buggy
2005-04-05 16:34 ` [BUG mm] "fixed" i386 memcpy inlining buggy Christophe Saout
@ 2005-04-06 10:14 ` Denis Vlasenko
2005-04-06 11:05 ` Dave Korn
2005-04-06 12:05 ` Christophe Saout
2005-04-06 16:11 ` Denis Vlasenko
1 sibling, 2 replies; 24+ messages in thread
From: Denis Vlasenko @ 2005-04-06 10:14 UTC (permalink / raw)
To: Christophe Saout
Cc: Andrew Morton, Jan Hubicka, Gerold Jury, jakub,
Linux Kernel Mailing List, gcc
On Tuesday 05 April 2005 19:34, Christophe Saout wrote:
> Hi Denis,
>
> the new i386 memcpy macro is a ticking timebomb.
>
> I've been debugging a new mISDN crash, just to find out that a memcpy
> was not inlined correctly.
>
> Andrew, you should drop the fix-i386-memcpy.patch (or have it fixed).
>
> This source code:
>
> mISDN_pid_t pid;
> [...]
> memcpy(&st->mgr->pid, &pid, sizeof(mISDN_pid_t));
>
> was compiled as:
>
> lea 0xffffffa4(%ebp),%esi <---- %esi is loaded
> ( add $0x10,%ebx )
> ( mov %ebx,%eax ) something else
> ( call 1613 <test_stack_protocol+0x83> ) %esi preserved
> mov 0xffffffa0(%ebp),%edx
> mov 0x74(%edx),%edi <---- %edi is loaded
> add $0x20,%edi offset in structure added
> ! mov $0x14,%esi !!!!!! <---- %esi overwritten!
> mov %esi,%ecx <---- %ecx loaded
> repz movsl %ds:(%esi),%es:(%edi)
>
> Apparently the compiler decided that the value 0x14 could be reused
> afterwards (which it does for an inlined memset of the same size some
> instructions below) and clobbers %esi.
>
> Looking at the macro:
>
> __asm__ __volatile__(
> ""
> : "=&D" (edi), "=&S" (esi)
> : "0" ((long) to),"1" ((long) from)
> : "memory"
> );
> }
> if (n >= 5*4) {
> /* large block: use rep prefix */
> int ecx;
> __asm__ __volatile__(
> "rep ; movsl"
> : "=&c" (ecx)
>
> it seems obvious that the compiler assumes it can reuse %esi and %edi
> for something else between the two __asm__ sections. These should
> probably be merged.
Oh shit. I was trying to be too clever. I still run with this patch,
so it must be happening very rarely.
Does this one compile ok?
static inline void * __constant_memcpy(void * to, const void * from, size_t n)
{
long esi, edi;
#if 1 /* want to do small copies with non-string ops? */
switch (n) {
case 0: return to;
case 1: *(char*)to = *(char*)from; return to;
case 2: *(short*)to = *(short*)from; return to;
case 4: *(int*)to = *(int*)from; return to;
#if 1 /* including those doable with two moves? */
case 3: *(short*)to = *(short*)from;
*((char*)to+2) = *((char*)from+2); return to;
case 5: *(int*)to = *(int*)from;
*((char*)to+4) = *((char*)from+4); return to;
case 6: *(int*)to = *(int*)from;
*((short*)to+2) = *((short*)from+2); return to;
case 8: *(int*)to = *(int*)from;
*((int*)to+1) = *((int*)from+1); return to;
#endif
}
#else
if (!n) return to;
#endif
{
/* load esi/edi */
__asm__ __volatile__(
""
: "=&D" (edi), "=&S" (esi)
: "0" ((long) to),"1" ((long) from)
: "memory"
);
}
if (n >= 5*4) {
/* large block: use rep prefix */
int ecx;
__asm__ __volatile__(
"rep ; movsl"
: "=&c" (ecx), "=&D" (edi), "=&S" (esi)
: "0" (n/4), "1" (edi),"2" (esi)
: "memory"
);
} else {
/* small block: don't clobber ecx + smaller code */
if (n >= 4*4) __asm__ __volatile__("movsl":"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory");
if (n >= 3*4) __asm__ __volatile__("movsl":"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory");
if (n >= 2*4) __asm__ __volatile__("movsl":"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory");
if (n >= 1*4) __asm__ __volatile__("movsl":"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory");
}
switch (n % 4) {
/* tail */
case 0: return to;
case 1: __asm__ __volatile__("movsb":"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory"); return to;
case 2: __asm__ __volatile__("movsw":"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory"); return to;
default: __asm__ __volatile__("movsw\n\tmovsb":"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory"); return to;
}
}
--
vda
^ permalink raw reply [flat|nested] 24+ messages in thread
* RE: [BUG mm] "fixed" i386 memcpy inlining buggy
2005-04-06 10:14 ` Denis Vlasenko
@ 2005-04-06 11:05 ` Dave Korn
2005-04-06 11:13 ` Dave Korn
2005-04-06 12:05 ` Christophe Saout
1 sibling, 1 reply; 24+ messages in thread
From: Dave Korn @ 2005-04-06 11:05 UTC (permalink / raw)
To: 'Denis Vlasenko', 'Christophe Saout'
Cc: 'Andrew Morton', 'Jan Hubicka',
'Gerold Jury', jakub, 'Linux Kernel Mailing List',
gcc
----Original Message----
>From: Denis Vlasenko
>Sent: 06 April 2005 11:14
Is this someone's idea of an April Fool's joke? Because if it is, I've
suffered a serious sense-of-humour failure.
> Oh shit. I was trying to be too clever. I still run with this patch,
> so it must be happening very rarely.
The kernel is way too important for cross-your-fingers-and-hope
engineering techniques to be applied. This patch should never have been
permitted. How on earth could anything like this hope to make it through a
strict review?
> Does this one compile ok?
> {
> /* load esi/edi */
> __asm__ __volatile__(
> ""
> : "=&D" (edi), "=&S" (esi)
> : "0" ((long) to),"1" ((long) from)
> : "memory"
> );
> }
> if (n >= 5*4) {
> /* large block: use rep prefix */
> int ecx;
> __asm__ __volatile__(
> "rep ; movsl"
> : "=&c" (ecx), "=&D" (edi), "=&S" (esi)
> : "0" (n/4), "1" (edi),"2" (esi)
> : "memory"
> );
It doesn't matter if it compiles or not, it's still *utterly* invalid.
You can NOT make assumptions about registers keeping their values between
one asm block and another. Immediately after the closing quote of the first
asm, the compiler can do ANYTHING IT WANTS and to just _hope_ that it won't
use the registers you want is voodoo programming. Even if it works when you
try it once, there are zero guarantees that another version or revision of
the compiler or even just a tiny change to the source that affects the
behaviour of the scheduler when compiling the function won't produce
something completely different, meaning that this code is appallingly
fragile. This code should be completely discarded and rewritten properly.
cheers,
DaveK
--
Can't think of a witty .sigline today....
^ permalink raw reply [flat|nested] 24+ messages in thread
* RE: [BUG mm] "fixed" i386 memcpy inlining buggy
2005-04-06 11:05 ` Dave Korn
@ 2005-04-06 11:13 ` Dave Korn
2005-04-06 11:53 ` Dave Korn
0 siblings, 1 reply; 24+ messages in thread
From: Dave Korn @ 2005-04-06 11:13 UTC (permalink / raw)
To: 'Dave Korn', 'Denis Vlasenko',
'Christophe Saout'
Cc: 'Andrew Morton', 'Jan Hubicka',
'Gerold Jury', jakub, 'Linux Kernel Mailing List',
gcc
----Original Message----
>From: Dave Korn
>Sent: 06 April 2005 12:06
Me and my big mouth.
OK, that one does work.
Sorry for the outburst.
cheers,
DaveK
--
Can't think of a witty .sigline today....
^ permalink raw reply [flat|nested] 24+ messages in thread
* RE: [BUG mm] "fixed" i386 memcpy inlining buggy
2005-04-06 11:13 ` Dave Korn
@ 2005-04-06 11:53 ` Dave Korn
2005-04-06 11:56 ` Dave Korn
2005-04-06 13:18 ` Richard B. Johnson
0 siblings, 2 replies; 24+ messages in thread
From: Dave Korn @ 2005-04-06 11:53 UTC (permalink / raw)
To: 'Dave Korn', 'Denis Vlasenko',
'Christophe Saout'
Cc: 'Andrew Morton', 'Jan Hubicka',
'Gerold Jury', jakub, 'Linux Kernel Mailing List',
gcc
----Original Message----
>From: Dave Korn
>Sent: 06 April 2005 12:13
> ----Original Message----
>> From: Dave Korn
>> Sent: 06 April 2005 12:06
>
>
> Me and my big mouth.
>
> OK, that one does work.
>
> Sorry for the outburst.
>
.... well, actually, maybe it doesn't after all.
What's that uninitialised variable ecx doing there eh?
cheers,
DaveK
--
Can't think of a witty .sigline today....
^ permalink raw reply [flat|nested] 24+ messages in thread
* RE: [BUG mm] "fixed" i386 memcpy inlining buggy
2005-04-06 11:53 ` Dave Korn
@ 2005-04-06 11:56 ` Dave Korn
2005-04-06 13:18 ` Richard B. Johnson
1 sibling, 0 replies; 24+ messages in thread
From: Dave Korn @ 2005-04-06 11:56 UTC (permalink / raw)
To: 'Dave Korn', 'Denis Vlasenko',
'Christophe Saout'
Cc: 'Andrew Morton', 'Jan Hubicka',
'Gerold Jury', jakub, 'Linux Kernel Mailing List',
gcc
----Original Message----
>From: Dave Korn
>Sent: 06 April 2005 12:53
> ----Original Message----
>> From: Dave Korn
>> Sent: 06 April 2005 12:13
>
>> ----Original Message----
>>> From: Dave Korn
>>> Sent: 06 April 2005 12:06
>>
>>
>> Me and my big mouth.
>>
>> OK, that one does work.
>>
>> Sorry for the outburst.
>>
>
>
> .... well, actually, maybe it doesn't after all.
>
>
> What's that uninitialised variable ecx doing there eh?
Oh, I see, it's there as an output so it can be matched as an input by the
"0" constraint.
Ok, guess it does.
cheers,
DaveK
--
Can't think of a witty .sigline today....
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [BUG mm] "fixed" i386 memcpy inlining buggy
2005-04-06 10:14 ` Denis Vlasenko
2005-04-06 11:05 ` Dave Korn
@ 2005-04-06 12:05 ` Christophe Saout
2005-04-06 12:36 ` Andrew Haley
2005-04-06 15:18 ` Paolo Bonzini
1 sibling, 2 replies; 24+ messages in thread
From: Christophe Saout @ 2005-04-06 12:05 UTC (permalink / raw)
To: Denis Vlasenko
Cc: gcc, Linux Kernel Mailing List, jakub, Gerold Jury, Jan Hubicka,
Andrew Morton
[-- Attachment #1: Type: text/plain, Size: 1628 bytes --]
Am Mittwoch, den 06.04.2005, 13:14 +0300 schrieb Denis Vlasenko:
> Oh shit. I was trying to be too clever. I still run with this patch,
> so it must be happening very rarely.
Yes, that's right, it happened with code that's not in the mainline tree
but could have happened anywhere.
> Does this one compile ok?
Yes, the case that failed is now okay. I changed it slightly to assign
esi and edi directly at the top of the functions; no asm section is needed here.
The compiler will make sure that they have the correct values when
needed.
In the case above the compiler now uses %ebx to save the loop counter
instead of %esi.
In drivers/cdrom/cdrom.c I'm observing one very strange thing though.
It appears that the compiler decided to put the local variable edi on
the stack for some inexplicable reason (or possibly there is one?). Since
the asm sections are memory barriers the compiler then saves the value
of %edi on the stack before entering the next assembler section.
1f1c: a5 movsl %ds:(%esi),%es:(%edi)
1f1d: 89 7d 84 mov %edi,0xffffff84(%ebp)
1f20: a5 movsl %ds:(%esi),%es:(%edi)
1f21: 89 7d 84 mov %edi,0xffffff84(%ebp)
1f24: 66 a5 movsw %ds:(%esi),%es:(%edi)
(this is a constant 10 byte memcpy)
The only thing that would avoid this is to either tell the compiler to
never put esi/edi in memory (which I think is not possible across
different versions of gcc) or to always generate a single asm section
for all the different cases.
[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 189 bytes --]
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [BUG mm] "fixed" i386 memcpy inlining buggy
2005-04-06 12:05 ` Christophe Saout
@ 2005-04-06 12:36 ` Andrew Haley
2005-04-06 15:18 ` Paolo Bonzini
1 sibling, 0 replies; 24+ messages in thread
From: Andrew Haley @ 2005-04-06 12:36 UTC (permalink / raw)
To: Christophe Saout
Cc: Denis Vlasenko, gcc, Linux Kernel Mailing List, jakub,
Gerold Jury, Jan Hubicka, Andrew Morton
I'm having a little difficulty understanding what this is for. Is it
that gcc's builtin memcpy expander generates bad code, or that older
versions of gcc generate bad code, or what? gcc generates too much
code?
Andrew.
^ permalink raw reply [flat|nested] 24+ messages in thread
* RE: [BUG mm] "fixed" i386 memcpy inlining buggy
2005-04-06 11:53 ` Dave Korn
2005-04-06 11:56 ` Dave Korn
@ 2005-04-06 13:18 ` Richard B. Johnson
2005-04-06 14:16 ` Denis Vlasenko
1 sibling, 1 reply; 24+ messages in thread
From: Richard B. Johnson @ 2005-04-06 13:18 UTC (permalink / raw)
To: Dave Korn
Cc: Denis Vlasenko, Christophe Saout, Andrew Morton, Jan Hubicka,
Gerold Jury, jakub, Linux Kernel Mailing List, gcc
[-- Attachment #1: Type: TEXT/PLAIN, Size: 1265 bytes --]
Attached is inline ix86 memcpy() plus test code that tests its
corner-cases. The in-line code makes no jumps, but uses longword
copies, word copies and any spare byte copy. It works at all
offsets, doesn't require alignment but would work fastest if
both source and destination were longword aligned.
On Wed, 6 Apr 2005, Dave Korn wrote:
> ----Original Message----
>> From: Dave Korn
>> Sent: 06 April 2005 12:13
>
>> ----Original Message----
>>> From: Dave Korn
>>> Sent: 06 April 2005 12:06
>>
>>
>> Me and my big mouth.
>>
>> OK, that one does work.
>>
>> Sorry for the outburst.
>>
>
>
> .... well, actually, maybe it doesn't after all.
>
>
> What's that uninitialised variable ecx doing there eh?
>
>
> cheers,
> DaveK
> --
> Can't think of a witty .sigline today....
>
> -
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
>
Cheers,
Dick Johnson
Penguin : Linux version 2.6.11 on an i686 machine (5537.79 BogoMips).
Notice : All mail here is now cached for review by Dictator Bush.
98.36% of all statistics are fiction.
[-- Attachment #2: Type: TEXT/PLAIN, Size: 1421 bytes --]
#include <stdio.h>
#include <string.h>
//
// Inline ix86 memcpy() that contains no jumps. Not copied from
// anybody. Contributed by rjohnson@analogic.com
//
static __inline__ void *memcpy(void *dst, void *src, size_t len) {
void *ret = dst;
__asm__ __volatile__( \
"cld\n" \
"shr $1, %%ecx\n" \
"pushf\n" \
"shr $1, %%ecx\n" \
"pushf\n" \
"rep\n" \
"movsl\n" \
"popf\n" \
"adcl %%ecx, %%ecx\n" \
"rep\n" \
"movsw\n" \
"popf\n" \
"adcl %%ecx, %%ecx\n" \
"rep\n" \
"movsb\n" \
: "=D" (dst), "=S" (src), "=c"(len)
: "0" (dst), "1" (src), "2" (len)
: "memory" );
return ret;
}
const char tester[]= "0123456789"
"0123456789"
"0123456789"
"0123456789"
"0123456789"
"0123456789"
"0123456789"
"0123456789";
char allocated[0x1000];
int main()
{
size_t i;
char buf[0x1000];
memset(buf, 0x00, sizeof(buf));
for(i=0; i< sizeof(buf); i++)
puts(memcpy(buf, (char *)tester, i));
memset(buf, 0x00, sizeof(buf));
for(i=0; i< sizeof(buf)-1; i++)
puts(memcpy(&buf[1], (char *)tester, i));
memset(buf, 0x00, sizeof(buf));
for(i=0; i< sizeof(buf)-2; i++)
puts(memcpy(&buf[2], (char *)tester, i));
memset(buf, 0x00, sizeof(buf));
for(i=0; i< sizeof(buf)-3; i++)
puts(memcpy(&buf[3], (char *)tester, i));
return 0;
}
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [BUG mm] "fixed" i386 memcpy inlining buggy
2005-04-06 13:18 ` Richard B. Johnson
@ 2005-04-06 14:16 ` Denis Vlasenko
0 siblings, 0 replies; 24+ messages in thread
From: Denis Vlasenko @ 2005-04-06 14:16 UTC (permalink / raw)
To: linux-os, Dave Korn
Cc: Denis Vlasenko, Christophe Saout, Andrew Morton, Jan Hubicka,
Gerold Jury, jakub, Linux Kernel Mailing List, gcc
On Wednesday 06 April 2005 16:18, Richard B. Johnson wrote:
>
> Attached is inline ix86 memcpy() plus test code that tests its
> corner-cases. The in-line code makes no jumps, but uses longword
> copies, word copies and any spare byte copy. It works at all
> offsets, doesn't require alignment but would work fastest if
> both source and destination were longword aligned.
Yours is:
"shr $1, %%ecx\n" \
"pushf\n" \
"shr $1, %%ecx\n" \
"pushf\n" \ <=== not needed
"rep\n" \
"movsl\n" \
"popf\n" \ <=== not needed
"adcl %%ecx, %%ecx\n" \
"rep\n" \
"movsw\n" \
"popf\n" \
"adcl %%ecx, %%ecx\n" \
"rep\n" \
"movsb\n" \
You struggle too much for that movsw.
-mm one (which happen to be mine) is:
"movl %ecx,%4"
"shr $2,%ecx"
"rep ; movsl"
"movl %4,%%ecx"
"andl $3,%%ecx"
"jz 1ft" /* pay 2 byte penalty for a chance to skip microcoded rep */
"rep ; movsb"
"1:"
and I can still drop that jz. It is there just to have
a chance to skip rep movsb, it was measured to be slow
enough to matter. rep movs are a bit slow to start, on small
blocks it is measurable.
However, maybe it is even better without jz,
need to benchmark 'cold path' (i.e. where branch predictor
have no data to predict it) somehow.
--
vda
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [BUG mm] "fixed" i386 memcpy inlining buggy
2005-04-06 12:05 ` Christophe Saout
2005-04-06 12:36 ` Andrew Haley
@ 2005-04-06 15:18 ` Paolo Bonzini
1 sibling, 0 replies; 24+ messages in thread
From: Paolo Bonzini @ 2005-04-06 15:18 UTC (permalink / raw)
To: linux-kernel; +Cc: gcc
> The only thing that would avoid this is to either tell the compiler to
> never put esi/edi in memory (which I think is not possible across
> different versions of gcc) or to always generate a single asm section
> for all the different cases.
Use __asm__ ("%esi") and __asm__ ("%edi"). It is not guaranteed that
they access the registers always (you can still have copy propagation
etcetera); but, if your __asm__ statement constraints match the register
you specify, then you can be reasonably sure that good code is produced.
Paolo
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [BUG mm] "fixed" i386 memcpy inlining buggy
2005-04-05 16:34 ` [BUG mm] "fixed" i386 memcpy inlining buggy Christophe Saout
2005-04-06 10:14 ` Denis Vlasenko
@ 2005-04-06 16:11 ` Denis Vlasenko
1 sibling, 0 replies; 24+ messages in thread
From: Denis Vlasenko @ 2005-04-06 16:11 UTC (permalink / raw)
To: Christophe Saout
Cc: Andrew Morton, Jan Hubicka, Gerold Jury, jakub,
Linux Kernel Mailing List, gcc
[-- Attachment #1: Type: text/plain, Size: 620 bytes --]
On Tuesday 05 April 2005 19:34, Christophe Saout wrote:
> the new i386 memcpy macro is a ticking timebomb.
>
> I've been debugging a new mISDN crash, just to find out that a memcpy
> was not inlined correctly.
>
> Andrew, you should drop the fix-i386-memcpy.patch (or have it fixed).
Updated patch against 2.6.11 follows. This one, like the original
patch, is run tested too.
This time I took no chances, esi/edi contents are
explicitly propagated from one asm() block to another.
I didn't do it before, not expecting that gcc can be
soooo incredibly clever. Sorry.
Christophe does this one look/compile ok?
--
vda
[-- Attachment #2: string2.h.diff --]
[-- Type: text/x-diff, Size: 3288 bytes --]
--- linux-2.6.11.src/include/asm-i386/string.h.orig Thu Mar 3 09:31:08 2005
+++ linux-2.6.11.src/include/asm-i386/string.h Wed Apr 6 19:08:39 2005
@@ -198,47 +198,80 @@ static inline void * __memcpy(void * to,
int d0, d1, d2;
__asm__ __volatile__(
"rep ; movsl\n\t"
- "testb $2,%b4\n\t"
- "je 1f\n\t"
- "movsw\n"
- "1:\ttestb $1,%b4\n\t"
- "je 2f\n\t"
- "movsb\n"
- "2:"
+ "movl %4,%%ecx\n\t"
+ "andl $3,%%ecx\n\t"
+#if 1 /* want to pay 2 byte penalty for a chance to skip microcoded rep? */
+ "jz 1f\n\t"
+#endif
+ "rep ; movsb\n\t"
+ "1:"
: "=&c" (d0), "=&D" (d1), "=&S" (d2)
- :"0" (n/4), "q" (n),"1" ((long) to),"2" ((long) from)
+ : "0" (n/4), "g" (n), "1" ((long) to), "2" ((long) from)
: "memory");
return (to);
}
/*
- * This looks horribly ugly, but the compiler can optimize it totally,
+ * This looks ugly, but the compiler can optimize it totally,
* as the count is constant.
*/
static inline void * __constant_memcpy(void * to, const void * from, size_t n)
{
- if (n <= 128)
- return __builtin_memcpy(to, from, n);
-
-#define COMMON(x) \
-__asm__ __volatile__( \
- "rep ; movsl" \
- x \
- : "=&c" (d0), "=&D" (d1), "=&S" (d2) \
- : "0" (n/4),"1" ((long) to),"2" ((long) from) \
- : "memory");
-{
- int d0, d1, d2;
+ long esi, edi;
+ if (!n) return to;
+#if 1 /* want to do small copies with non-string ops? */
+ switch (n) {
+ case 1: *(char*)to = *(char*)from; return to;
+ case 2: *(short*)to = *(short*)from; return to;
+ case 4: *(int*)to = *(int*)from; return to;
+#if 1 /* including those doable with two moves? */
+ case 3: *(short*)to = *(short*)from;
+ *((char*)to+2) = *((char*)from+2); return to;
+ case 5: *(int*)to = *(int*)from;
+ *((char*)to+4) = *((char*)from+4); return to;
+ case 6: *(int*)to = *(int*)from;
+ *((short*)to+2) = *((short*)from+2); return to;
+ case 8: *(int*)to = *(int*)from;
+ *((int*)to+1) = *((int*)from+1); return to;
+#endif
+ }
+#endif
+ esi = (long) from;
+ edi = (long) to;
+ if (n >= 5*4) {
+ /* large block: use rep prefix */
+ int ecx;
+ __asm__ __volatile__(
+ "rep ; movsl"
+ : "=&c" (ecx), "=&D" (edi), "=&S" (esi)
+ : "0" (n/4), "1" (edi),"2" (esi)
+ : "memory"
+ );
+ } else {
+ /* small block: don't clobber ecx + smaller code */
+ if (n >= 4*4) __asm__ __volatile__("movsl"
+ :"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory");
+ if (n >= 3*4) __asm__ __volatile__("movsl"
+ :"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory");
+ if (n >= 2*4) __asm__ __volatile__("movsl"
+ :"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory");
+ if (n >= 1*4) __asm__ __volatile__("movsl"
+ :"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory");
+ }
switch (n % 4) {
- case 0: COMMON(""); return to;
- case 1: COMMON("\n\tmovsb"); return to;
- case 2: COMMON("\n\tmovsw"); return to;
- default: COMMON("\n\tmovsw\n\tmovsb"); return to;
+ /* tail */
+ case 0: return to;
+ case 1: __asm__ __volatile__("movsb"
+ :"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory");
+ return to;
+ case 2: __asm__ __volatile__("movsw"
+ :"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory");
+ return to;
+ default: __asm__ __volatile__("movsw\n\tmovsb"
+ :"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory");
+ return to;
}
}
-
-#undef COMMON
-}
#define __HAVE_ARCH_MEMCPY
^ permalink raw reply [flat|nested] 24+ messages in thread
end of thread, other threads:[~2005-04-07 7:32 UTC | newest]
Thread overview: 24+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2005-03-29 14:37 memcpy(a,b,CONST) is not inlined by gcc 3.4.1 in Linux kernel Denis Vlasenko
2005-03-29 15:06 ` Richard Guenther
2005-03-29 15:08 ` Nathan Sidwell
2005-03-29 15:13 ` Jakub Jelinek
2005-03-29 15:42 ` Andrew Pinski
2005-03-30 2:27 ` Gerold Jury
2005-03-30 6:15 ` Denis Vlasenko
2005-04-01 21:43 ` Jan Hubicka
2005-04-02 12:18 ` Denis Vlasenko
2005-04-02 12:26 ` Denis Vlasenko
2005-04-05 16:34 ` [BUG mm] "fixed" i386 memcpy inlining buggy Christophe Saout
2005-04-06 10:14 ` Denis Vlasenko
2005-04-06 11:05 ` Dave Korn
2005-04-06 11:13 ` Dave Korn
2005-04-06 11:53 ` Dave Korn
2005-04-06 11:56 ` Dave Korn
2005-04-06 13:18 ` Richard B. Johnson
2005-04-06 14:16 ` Denis Vlasenko
2005-04-06 12:05 ` Christophe Saout
2005-04-06 12:36 ` Andrew Haley
2005-04-06 15:18 ` Paolo Bonzini
2005-04-06 16:11 ` Denis Vlasenko
2005-03-29 20:22 ` [PATCH] fix i386 memcpy Denis Vlasenko
2005-03-29 20:24 ` Denis Vlasenko
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox