* memcpy(a,b,CONST) is not inlined by gcc 3.4.1 in Linux kernel
@ 2005-03-29 14:37 Denis Vlasenko
2005-03-29 15:06 ` Richard Guenther
` (3 more replies)
0 siblings, 4 replies; 24+ messages in thread
From: Denis Vlasenko @ 2005-03-29 14:37 UTC (permalink / raw)
To: linux-kernel, gcc
Try testcase below the sig.
This causes nearly one thousand calls to memcpy in my kernel
(not an allyesconfig one):
# objdump -d vmlinux | grep -F '<memcpy>' | wc -l
959
# gcc -O2 -c t.c
# objdump -r -d t.o
t.o: file format elf32-i386
Disassembly of section .text:
00000000 <f3>:
0: 55 push %ebp
1: 89 e5 mov %esp,%ebp
3: 83 ec 0c sub $0xc,%esp
6: 6a 03 push $0x3
8: ff 75 0c pushl 0xc(%ebp)
b: ff 75 08 pushl 0x8(%ebp)
e: e8 fc ff ff ff call f <f3+0xf>
f: R_386_PC32 memcpy
13: 83 c4 10 add $0x10,%esp
16: c9 leave
17: c3 ret
00000018 <f3b>:
18: 55 push %ebp
19: 89 e5 mov %esp,%ebp
1b: 8b 55 0c mov 0xc(%ebp),%edx
1e: 66 8b 02 mov (%edx),%ax
21: 8b 4d 08 mov 0x8(%ebp),%ecx
24: 66 89 01 mov %ax,(%ecx)
27: 8a 42 02 mov 0x2(%edx),%al
2a: 88 41 02 mov %al,0x2(%ecx)
2d: c9 leave
2e: c3 ret
2f: 90 nop
00000030 <f3k>:
30: 55 push %ebp
31: 89 e5 mov %esp,%ebp
33: 57 push %edi
34: 56 push %esi
35: 8b 7d 08 mov 0x8(%ebp),%edi
38: 8b 75 0c mov 0xc(%ebp),%esi
3b: b9 ee 02 00 00 mov $0x2ee,%ecx
40: f3 a5 repz movsl %ds:(%esi),%es:(%edi)
42: 5e pop %esi
43: 5f pop %edi
44: c9 leave
45: c3 ret
--
vda
typedef unsigned int size_t;
/*
 * Variable-count i386 memcpy: "rep movsl" copies n/4 dwords, then the
 * testb/movsw/movsb sequence copies the 0-3 leftover bytes.
 * %b4 is the low byte of operand 4 (n), which is why n uses the "q"
 * constraint (must sit in a byte-addressable register).
 * d0/d1/d2 only absorb the clobbered ecx/edi/esi; their values are unused.
 */
static inline void * __memcpy(void * to, const void * from, size_t n)
{
int d0, d1, d2;
__asm__ __volatile__(
"rep ; movsl\n\t"
"testb $2,%b4\n\t"
"je 1f\n\t"
"movsw\n"
"1:\ttestb $1,%b4\n\t"
"je 2f\n\t"
"movsb\n"
"2:"
: "=&c" (d0), "=&D" (d1), "=&S" (d2)
:"0" (n/4), "q" (n),"1" ((long) to),"2" ((long) from)
: "memory");
return (to);
}
/*
* This looks horribly ugly, but the compiler can optimize it totally,
* as the count is constant.
*/
static inline void * __constant_memcpy(void * to, const void * from, size_t n)
{
if (n <= 128)
return __builtin_memcpy(to, from, n);
/* COMMON(x): "rep movsl" for n/4 dwords followed by the tail insns in x.
 * It names d0/d1/d2, so it must expand inside the block below that
 * declares them -- hence the odd macro-before-block layout. */
#define COMMON(x) \
__asm__ __volatile__( \
"rep ; movsl" \
x \
: "=&c" (d0), "=&D" (d1), "=&S" (d2) \
: "0" (n/4),"1" ((long) to),"2" ((long) from) \
: "memory");
{
int d0, d1, d2;
/* n is a compile-time constant, so exactly one case survives optimization */
switch (n % 4) {
case 0: COMMON(""); return to;
case 1: COMMON("\n\tmovsb"); return to;
case 2: COMMON("\n\tmovsw"); return to;
default: COMMON("\n\tmovsw\n\tmovsb"); return to;
}
}
#undef COMMON
}
/* Dispatch: constant n goes to the compile-time-expandable variant,
 * variable n to the generic asm version.
 * NOTE(review): f3/f3b/f3k are declared int but return nothing -- they
 * exist only so the generated code can be inspected with objdump. */
#define memcpy(t, f, n) \
(__builtin_constant_p(n) ? \
__constant_memcpy((t),(f),(n)) : \
__memcpy((t),(f),(n)))
int f3(char *a, char *b) { memcpy(a,b,3); }
int f3b(char *a, char *b) { __builtin_memcpy(a,b,3); }
int f3k(char *a, char *b) { memcpy(a,b,3000); }
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: memcpy(a,b,CONST) is not inlined by gcc 3.4.1 in Linux kernel
2005-03-29 14:37 memcpy(a,b,CONST) is not inlined by gcc 3.4.1 in Linux kernel Denis Vlasenko
@ 2005-03-29 15:06 ` Richard Guenther
2005-03-29 15:08 ` Nathan Sidwell
` (2 subsequent siblings)
3 siblings, 0 replies; 24+ messages in thread
From: Richard Guenther @ 2005-03-29 15:06 UTC (permalink / raw)
To: Denis Vlasenko; +Cc: linux-kernel, gcc
On Tue, 29 Mar 2005 17:37:06 +0300, Denis Vlasenko <vda@ilport.com.ua> wrote:
> Try testcase below the sig.
>
> This causes nearly one thousand calls to memcpy in my kernel
> (not an allyesconfig one):
> static inline void * __memcpy(void * to, const void * from, size_t n)
> {
> int d0, d1, d2;
> __asm__ __volatile__(
> "rep ; movsl\n\t"
> "testb $2,%b4\n\t"
> "je 1f\n\t"
> "movsw\n"
> "1:\ttestb $1,%b4\n\t"
> "je 2f\n\t"
> "movsb\n"
> "2:"
> : "=&c" (d0), "=&D" (d1), "=&S" (d2)
> :"0" (n/4), "q" (n),"1" ((long) to),"2" ((long) from)
> : "memory");
> return (to);
> }
The question is, what reason does -Winline give for this inlining
decision? And then
of course, how is the size estimate counted for the above. What kind
of tree node do
we get for the ASM expression?
Richard.
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: memcpy(a,b,CONST) is not inlined by gcc 3.4.1 in Linux kernel
2005-03-29 14:37 memcpy(a,b,CONST) is not inlined by gcc 3.4.1 in Linux kernel Denis Vlasenko
2005-03-29 15:06 ` Richard Guenther
@ 2005-03-29 15:08 ` Nathan Sidwell
2005-03-29 15:13 ` Jakub Jelinek
2005-03-29 20:22 ` [PATCH] fix i386 memcpy Denis Vlasenko
3 siblings, 0 replies; 24+ messages in thread
From: Nathan Sidwell @ 2005-03-29 15:08 UTC (permalink / raw)
To: Denis Vlasenko; +Cc: linux-kernel, gcc
Denis Vlasenko wrote:
> Disassembly of section .text:
>
> e: e8 fc ff ff ff call f <f3+0xf>
> f: R_386_PC32 memcpy
> #define memcpy(t, f, n) \
> (__builtin_constant_p(n) ? \
> __constant_memcpy((t),(f),(n)) : \
> __memcpy((t),(f),(n)))
given this #define, how can 'memcpy' appear in the object file? It appears
that something odd is happening with preprocessing. Check the .i files are
as you expect. -dD and -E options will be helpful to you.
nathan
--
Nathan Sidwell :: http://www.codesourcery.com :: CodeSourcery LLC
nathan@codesourcery.com :: http://www.planetfall.pwp.blueyonder.co.uk
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: memcpy(a,b,CONST) is not inlined by gcc 3.4.1 in Linux kernel
2005-03-29 14:37 memcpy(a,b,CONST) is not inlined by gcc 3.4.1 in Linux kernel Denis Vlasenko
2005-03-29 15:06 ` Richard Guenther
2005-03-29 15:08 ` Nathan Sidwell
@ 2005-03-29 15:13 ` Jakub Jelinek
2005-03-29 15:42 ` Andrew Pinski
2005-03-29 20:22 ` [PATCH] fix i386 memcpy Denis Vlasenko
3 siblings, 1 reply; 24+ messages in thread
From: Jakub Jelinek @ 2005-03-29 15:13 UTC (permalink / raw)
To: Denis Vlasenko; +Cc: linux-kernel, gcc
On Tue, Mar 29, 2005 at 05:37:06PM +0300, Denis Vlasenko wrote:
> typedef unsigned int size_t;
>
> static inline void * __memcpy(void * to, const void * from, size_t n)
> {
> int d0, d1, d2;
> __asm__ __volatile__(
> "rep ; movsl\n\t"
> "testb $2,%b4\n\t"
> "je 1f\n\t"
> "movsw\n"
> "1:\ttestb $1,%b4\n\t"
> "je 2f\n\t"
> "movsb\n"
> "2:"
> : "=&c" (d0), "=&D" (d1), "=&S" (d2)
> :"0" (n/4), "q" (n),"1" ((long) to),"2" ((long) from)
> : "memory");
> return (to);
> }
>
> /*
> * This looks horribly ugly, but the compiler can optimize it totally,
> * as the count is constant.
> */
> static inline void * __constant_memcpy(void * to, const void * from, size_t n)
> {
> if (n <= 128)
> return __builtin_memcpy(to, from, n);
>
> #define COMMON(x) \
> __asm__ __volatile__( \
> "rep ; movsl" \
> x \
> : "=&c" (d0), "=&D" (d1), "=&S" (d2) \
> : "0" (n/4),"1" ((long) to),"2" ((long) from) \
> : "memory");
> {
> int d0, d1, d2;
> switch (n % 4) {
> case 0: COMMON(""); return to;
> case 1: COMMON("\n\tmovsb"); return to;
> case 2: COMMON("\n\tmovsw"); return to;
> default: COMMON("\n\tmovsw\n\tmovsb"); return to;
> }
> }
>
> #undef COMMON
> }
>
> #define memcpy(t, f, n) \
> (__builtin_constant_p(n) ? \
> __constant_memcpy((t),(f),(n)) : \
> __memcpy((t),(f),(n)))
>
> int f3(char *a, char *b) { memcpy(a,b,3); }
The problem is that in GCC < 4.0 there is no constant propagation
pass before expanding builtin functions, so the __builtin_memcpy
call above sees a variable rather than a constant.
Either use GCC 4.0+, where this works just fine, or move the
n <= 128 case into the macro:
#define memcpy(t, f, n) \
(__builtin_constant_p(n) ? \
((n) <= 128 ? __builtin_memcpy(t,f,n) : __constant_memcpy(t,f,n)) : \
__memcpy(t,f,n))
Jakub
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: memcpy(a,b,CONST) is not inlined by gcc 3.4.1 in Linux kernel
2005-03-29 15:13 ` Jakub Jelinek
@ 2005-03-29 15:42 ` Andrew Pinski
2005-03-30 2:27 ` Gerold Jury
0 siblings, 1 reply; 24+ messages in thread
From: Andrew Pinski @ 2005-03-29 15:42 UTC (permalink / raw)
To: jakub; +Cc: Denis Vlasenko, linux-kernel, gcc
>
> On Tue, Mar 29, 2005 at 05:37:06PM +0300, Denis Vlasenko wrote:
> > /*
> > * This looks horribly ugly, but the compiler can optimize it totally,
> > * as the count is constant.
> > */
> > static inline void * __constant_memcpy(void * to, const void * from, size_t n)
> > {
> > if (n <= 128)
> > return __builtin_memcpy(to, from, n);
> The problem is that in GCC < 4.0 there is no constant propagation
> pass before expanding builtin functions, so the __builtin_memcpy
> call above sees a variable rather than a constant.
or changing "size_t n" to "const size_t n" will also fix the issue.
As we do some (well, very little, and only with inlining and const values)
constant propagation before 4.0.0 on the trees before expanding the builtin.
-- Pinski
^ permalink raw reply [flat|nested] 24+ messages in thread
* [PATCH] fix i386 memcpy
2005-03-29 14:37 memcpy(a,b,CONST) is not inlined by gcc 3.4.1 in Linux kernel Denis Vlasenko
` (2 preceding siblings ...)
2005-03-29 15:13 ` Jakub Jelinek
@ 2005-03-29 20:22 ` Denis Vlasenko
2005-03-29 20:24 ` Denis Vlasenko
3 siblings, 1 reply; 24+ messages in thread
From: Denis Vlasenko @ 2005-03-29 20:22 UTC (permalink / raw)
To: Andrew Morton, Denis Vlasenko; +Cc: linux-kernel, gcc
[-- Attachment #1: Type: text/plain, Size: 2961 bytes --]
This patch shortens non-constant memcpy() by two bytes
and fixes spurious out-of-line constant memcpy().
Patch is run-tested (I run on patched kernel right now).
Benchmark and code generation test program will be mailed as reply.
# size vmlinux.org vmlinux
text data bss dec hex filename
3954591 1553426 236544 5744561 57a7b1 vmlinux.org
3952615 1553426 236544 5742585 579ff9 vmlinux
Example of changes (part of dump_fpu() body):
old.............................................. new......................
8d 83 40 02 00 00 lea 0x240(%ebx),%eax 8d b3 40 02 00 00 lea 0x240(%ebx),%esi
74 31 je c0108b27 <dump_fpu+0x9c> 74 2e je c0108b1d <dump_fpu+0x92>
6a 1c push $0x1c 8b 7d 0c mov 0xc(%ebp),%edi
50 push %eax b9 07 00 00 00 mov $0x7,%ecx
56 push %esi f3 a5 repz movsl %ds:(%esi),%es:(%edi)
e8 49 21 10 00 call c020ac48 <memcpy> 8b 55 0c mov 0xc(%ebp),%edx
83 c4 0c add $0xc,%esp 83 c2 1c add $0x1c,%edx
83 c6 1c add $0x1c,%esi 8d 83 60 02 00 00 lea 0x260(%ebx),%eax
81 c3 60 02 00 00 add $0x260,%ebx b9 07 00 00 00 mov $0x7,%ecx
bf 07 00 00 00 mov $0x7,%edi 89 d7 mov %edx,%edi
6a 0a push $0xa 89 c6 mov %eax,%esi
53 push %ebx a5 movsl %ds:(%esi),%es:(%edi)
56 push %esi a5 movsl %ds:(%esi),%es:(%edi)
e8 2f 21 10 00 call c020ac48 <memcpy> 66 a5 movsw %ds:(%esi),%es:(%edi)
83 c4 0c add $0xc,%esp 83 c2 0a add $0xa,%edx
83 c6 0a add $0xa,%esi 83 c0 10 add $0x10,%eax
83 c3 10 add $0x10,%ebx 49 dec %ecx
4f dec %edi 79 ef jns c0108b0a <dump_fpu+0x7f>
79 eb jns c0108b10 <dump_fpu+0x85> eb 0a jmp c0108b27 <dump_fpu+0x9c>
eb 0c jmp c0108b33 <dump_fpu+0xa8> 8b 7d 0c mov 0xc(%ebp),%edi
6a 6c push $0x6c b9 1b 00 00 00 mov $0x1b,%ecx
50 push %eax f3 a5 repz movsl %ds:(%esi),%es:(%edi)
56 push %esi 8b 45 f0 mov 0xfffffff0(%ebp),%eax
e8 18 21 10 00 call c020ac48 <memcpy> 5a pop %edx
83 c4 0c add $0xc,%esp 5b pop %ebx
8b 45 f0 mov 0xfffffff0(%ebp),%eax
8d 65 f4 lea 0xfffffff4(%ebp),%esp
5b pop %ebx
5e pop %esi
--
vda
[-- Attachment #2: string.memcpy.diff --]
[-- Type: text/x-diff, Size: 2953 bytes --]
--- linux-2.6.11.src/include/asm-i386/string.h.orig Thu Mar 3 09:31:08 2005
+++ linux-2.6.11.src/include/asm-i386/string.h Tue Mar 29 22:05:00 2005
@@ -198,46 +198,75 @@ static inline void * __memcpy(void * to,
int d0, d1, d2;
__asm__ __volatile__(
"rep ; movsl\n\t"
- "testb $2,%b4\n\t"
- "je 1f\n\t"
- "movsw\n"
- "1:\ttestb $1,%b4\n\t"
- "je 2f\n\t"
- "movsb\n"
- "2:"
+ "movl %4,%%ecx\n\t"
+ "andl $3,%%ecx\n\t"
+ "jz 1f\n\t" /* pay 2 byte penalty for a chance to skip microcoded rep */
+ "rep ; movsb\n\t"
+ "1:"
: "=&c" (d0), "=&D" (d1), "=&S" (d2)
- :"0" (n/4), "q" (n),"1" ((long) to),"2" ((long) from)
+ : "0" (n/4), "g" (n), "1" ((long) to), "2" ((long) from)
: "memory");
return (to);
}
/*
- * This looks horribly ugly, but the compiler can optimize it totally,
+ * This looks ugly, but the compiler can optimize it totally,
* as the count is constant.
*/
static inline void * __constant_memcpy(void * to, const void * from, size_t n)
{
- if (n <= 128)
- return __builtin_memcpy(to, from, n);
-
-#define COMMON(x) \
-__asm__ __volatile__( \
- "rep ; movsl" \
- x \
- : "=&c" (d0), "=&D" (d1), "=&S" (d2) \
- : "0" (n/4),"1" ((long) to),"2" ((long) from) \
- : "memory");
-{
- int d0, d1, d2;
+#if 1 /* want to do small copies with non-string ops? */
+ switch (n) {
+ case 0: return to;
+ case 1: *(char*)to = *(char*)from; return to;
+ case 2: *(short*)to = *(short*)from; return to;
+ case 4: *(int*)to = *(int*)from; return to;
+#if 1 /* including those doable with two moves? */
+ case 3: *(short*)to = *(short*)from;
+ *((char*)to+2) = *((char*)from+2); return to;
+ case 5: *(int*)to = *(int*)from;
+ *((char*)to+4) = *((char*)from+4); return to;
+ case 6: *(int*)to = *(int*)from;
+ *((short*)to+2) = *((short*)from+2); return to;
+ case 8: *(int*)to = *(int*)from;
+ *((int*)to+1) = *((int*)from+1); return to;
+#endif
+ }
+#else
+ if (!n) return to;
+#endif
+ {
+ /* load esi/edi */
+ int esi, edi;
+ __asm__ __volatile__(
+ ""
+ : "=&D" (edi), "=&S" (esi)
+ : "0" ((long) to),"1" ((long) from)
+ : "memory"
+ );
+ }
+ if (n >= 5*4) {
+ /* large block: use rep prefix */
+ int ecx;
+ __asm__ __volatile__(
+ "rep ; movsl"
+ : "=&c" (ecx)
+ : "0" (n/4)
+ );
+ } else {
+ /* small block: don't clobber ecx + smaller code */
+ if (n >= 4*4) __asm__ __volatile__("movsl");
+ if (n >= 3*4) __asm__ __volatile__("movsl");
+ if (n >= 2*4) __asm__ __volatile__("movsl");
+ if (n >= 1*4) __asm__ __volatile__("movsl");
+ }
switch (n % 4) {
- case 0: COMMON(""); return to;
- case 1: COMMON("\n\tmovsb"); return to;
- case 2: COMMON("\n\tmovsw"); return to;
- default: COMMON("\n\tmovsw\n\tmovsb"); return to;
+ /* tail */
+ case 0: return to;
+ case 1: __asm__ __volatile__("movsb"); return to;
+ case 2: __asm__ __volatile__("movsw"); return to;
+ default: __asm__ __volatile__("movsw\n\tmovsb"); return to;
}
-}
-
-#undef COMMON
}
#define __HAVE_ARCH_MEMCPY
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH] fix i386 memcpy
2005-03-29 20:22 ` [PATCH] fix i386 memcpy Denis Vlasenko
@ 2005-03-29 20:24 ` Denis Vlasenko
0 siblings, 0 replies; 24+ messages in thread
From: Denis Vlasenko @ 2005-03-29 20:24 UTC (permalink / raw)
To: Denis Vlasenko, Andrew Morton; +Cc: linux-kernel, gcc
[-- Attachment #1: Type: text/plain, Size: 298 bytes --]
On Tuesday 29 March 2005 23:22, Denis Vlasenko wrote:
> This patch shortens non-constant memcpy() by two bytes
> and fixes spurious out-of-line constant memcpy().
>
> Patch is run-tested (I run on patched kernel right now).
>
> Benchmark and code generation test program will be mailed as reply.
[-- Attachment #2: bench.c --]
[-- Type: text/x-csrc, Size: 3755 bytes --]
/* Compile with: gcc -Os -fomit-frame-pointer -falign-functions=32 */
/* results:
processor : 0
vendor_id : GenuineIntel
cpu family : 6
model : 11
model name : Intel(R) Celeron
stepping : 1
cpu MHz : 1196.236
cache size : 256 KB
movsl_X wins : N<=3
rep_movsl wins : N>=6
('assign' wins always at the cost of much larger code)
*/
#include <time.h>
#include <stdio.h>
/* N: dword count used by every benchmark variant below. */
#define N 5
/* MOVSLk expands to k back-to-back "movsl" instructions;
 * MOVSL(n) selects the right ladder rung after macro-expanding n. */
#define MOVSL1 __asm__ __volatile__("movsl")
#define MOVSL2 MOVSL1;MOVSL1
#define MOVSL3 MOVSL2;MOVSL1
#define MOVSL4 MOVSL3;MOVSL1
#define MOVSL5 MOVSL4;MOVSL1
#define MOVSL6 MOVSL5;MOVSL1
#define MOVSL7 MOVSL6;MOVSL1
#define MOVSL8 MOVSL7;MOVSL1
#define MOVSL9 MOVSL8;MOVSL1
#define MOVSL10 MOVSL9;MOVSL1
#define MOVSL11 MOVSL10;MOVSL1
#define MOVSL12 MOVSL11;MOVSL1
#define MOVSL13 MOVSL12;MOVSL1
#define MOVSL14 MOVSL13;MOVSL1
#define MOVSL15 MOVSL14;MOVSL1
#define MOVSL16 MOVSL15;MOVSL1
#define MOVSL17 MOVSL16;MOVSL1
#define MOVSL18 MOVSL17;MOVSL1
#define MOVSL19 MOVSL18;MOVSL1
/* extra indirection so MOVSL(N) expands N before token-pasting */
#define MOVSL_(n) MOVSL##n
#define MOVSL(n) MOVSL_(n)
/* Benchmark primitive: copy n/4 dwords with "rep movsl".
 * The first (empty) asm only pins 'to'/'from' into edi/esi; the second
 * asm then issues the copy reading those registers implicitly.
 * NOTE(review): this depends on the compiler leaving edi/esi untouched
 * between the two asm statements -- fragile, but that split is exactly
 * the construct being benchmarked.
 * Declared void* yet returns no value (benchmark-only; result unused). */
static inline void * rep_movsl(void * to, const void * from, size_t n)
{
{
int esi, edi;
__asm__ __volatile__(
""
: "=&D" (edi), "=&S" (esi)
: "0" ((long) to),"1" ((long) from)
: "memory"
);
}
{
int ecx;
__asm__ __volatile__(
"rep ; movsl"
: "=&c" (ecx)
: "0" (n/4)
);
}
}
/* Benchmark primitive: copy with N discrete "movsl" instructions
 * (the MOVSL(N) ladder) instead of a rep prefix.
 * The empty asm pins 'to'/'from' into edi/esi, same trick as rep_movsl.
 * NOTE(review): parameter n is ignored -- the count is fixed by the
 * compile-time macro N.  Declared void* but returns nothing. */
static inline void * movsl_X(void * to, const void * from, size_t n)
{
{
int esi, edi;
__asm__ __volatile__(
""
: "=&D" (edi), "=&S" (esi)
: "0" ((long) to),"1" ((long) from)
: "memory"
);
}
MOVSL(N);
}
/* Benchmark primitive: copy n bytes via open-coded unsigned-long
 * assignments for n in {4,8,12,16,20}; any other n falls back to
 * rep_movsl.  With a constant n the switch resolves at compile time,
 * so this measures plain mov-based copying ("assign" in the results). */
static inline void * assign(void * to, const void * from, size_t n)
{
switch (n) {
case 4:
*(unsigned long *)to = *(const unsigned long *)from;
return to;
case 8:
*(unsigned long *)to = *(const unsigned long *)from;
*(1+(unsigned long *)to) = *(1+(const unsigned long *)from);
return to;
case 12:
*(unsigned long *)to = *(const unsigned long *)from;
*(1+(unsigned long *)to) = *(1+(const unsigned long *)from);
*(2+(unsigned long *)to) = *(2+(const unsigned long *)from);
return to;
case 16:
*(unsigned long *)to = *(const unsigned long *)from;
*(1+(unsigned long *)to) = *(1+(const unsigned long *)from);
*(2+(unsigned long *)to) = *(2+(const unsigned long *)from);
*(3+(unsigned long *)to) = *(3+(const unsigned long *)from);
return to;
case 20:
*(unsigned long *)to = *(const unsigned long *)from;
*(1+(unsigned long *)to) = *(1+(const unsigned long *)from);
*(2+(unsigned long *)to) = *(2+(const unsigned long *)from);
*(3+(unsigned long *)to) = *(3+(const unsigned long *)from);
*(4+(unsigned long *)to) = *(4+(const unsigned long *)from);
return to;
default:
return rep_movsl(to, from, n);
}
}
/* Source/destination buffers, plus pointer aliases for the "indirect"
 * variants (copying through fp/tp keeps the addresses out of the
 * compiler's sight, unlike the direct f/t array references). */
char f[256],t[256];
char *fp = f;
char *tp = t;
/* one zero-argument wrapper per strategy, direct then indirect,
 * so measure() can take them as plain function pointers */
void r() { rep_movsl(f,t,N*4); }
void m() { movsl_X(f,t,N*4); }
void a() { assign(f,t,N*4); }
void rp() { rep_movsl(fp,tp,N*4); }
void mp() { movsl_X(fp,tp,N*4); }
void ap() { assign(fp,tp,N*4); }
/* Count how many calls to fn() fit into one wall-clock second.
 * First spins until time() ticks over so caches are hot, then counts
 * calls (16 per batch, manually unrolled to keep loop overhead out of
 * the measurement) until the next tick. */
int measure(void (*fn)()) {
	int calls = 0;
	time_t stamp = time(0);
	while (stamp == time(0)) fn(); /* cache hot */
	stamp = time(0);
	while (stamp == time(0)) {
		fn(); fn(); fn(); fn(); fn(); fn(); fn(); fn();
		fn(); fn(); fn(); fn(); fn(); fn(); fn(); fn();
		calls += 16;
	}
	return calls;
}
/* Driver: run each copy strategy against the global arrays, then
 * through the pointer aliases, printing calls-per-second for each. */
int main() {
	static const char *label[3] = {
		"rep movsl(%d) per sec: %d\n",
		" movsl_X(%d) per sec: %d\n",
		" assign(%d) per sec: %d\n",
	};
	void (*direct[3])() = { r, m, a };
	void (*indirect[3])() = { rp, mp, ap };
	int i;
	printf("On global array:\n");
	for (i = 0; i < 3; i++)
		printf(label[i], N, measure(direct[i]));
	printf("Indirect:\n");
	for (i = 0; i < 3; i++)
		printf(label[i], N, measure(indirect[i]));
	return 0;
}
[-- Attachment #3: codecheck.c --]
[-- Type: text/x-csrc, Size: 5993 bytes --]
/* Compile with: gcc -Os -fomit-frame-pointer */
/* Check for correctness/size: objdump -r -d <file.o> | $PAGER */
typedef unsigned int size_t;
/*
 * Constant-count memcpy, asm-only version (no __builtin_memcpy):
 *  - n in {0,1,2,3,4,5,6,8}: open-coded one/two-move assignments;
 *  - n >= 20: "rep movsl" for the dword part;
 *  - otherwise: up to four discrete "movsl" instructions;
 *  - then a movsb/movsw tail for n % 4 leftover bytes.
 * The empty asm pins to/from into edi/esi; the later asm statements use
 * those registers implicitly.  NOTE(review): this relies on the compiler
 * not disturbing edi/esi (or reordering across the statements) between
 * the separate asm blocks -- deliberate but fragile.
 */
static inline void * __constant_memcpy(void * to, const void * from, size_t n)
{
#if 1 /* want to do small copies with non-string ops? */
switch (n) {
case 0: return to;
case 1: *(char*)to = *(char*)from; return to;
case 2: *(short*)to = *(short*)from; return to;
case 4: *(int*)to = *(int*)from; return to;
#if 1 /* including those doable with two moves? */
case 3: *(short*)to = *(short*)from;
*((char*)to+2) = *((char*)from+2); return to;
case 5: *(int*)to = *(int*)from;
*((char*)to+4) = *((char*)from+4); return to;
case 6: *(int*)to = *(int*)from;
*((short*)to+2) = *((short*)from+2); return to;
case 8: *(int*)to = *(int*)from;
*((int*)to+1) = *((int*)from+1); return to;
#endif
}
#else
if (!n) return to;
#endif
{
/* load esi/edi */
int esi, edi;
__asm__ __volatile__(
""
: "=&D" (edi), "=&S" (esi)
: "0" ((long) to),"1" ((long) from)
: "memory"
);
}
if (n >= 5*4) {
/* large block: use rep prefix */
int ecx;
__asm__ __volatile__(
"rep ; movsl"
: "=&c" (ecx)
: "0" (n/4)
);
} else {
/* small block: don't clobber ecx + smaller code */
if (n >= 4*4) __asm__ __volatile__("movsl");
if (n >= 3*4) __asm__ __volatile__("movsl");
if (n >= 2*4) __asm__ __volatile__("movsl");
if (n >= 1*4) __asm__ __volatile__("movsl");
}
switch (n % 4) {
/* tail */
case 0: return to;
case 1: __asm__ __volatile__("movsb"); return to;
case 2: __asm__ __volatile__("movsw"); return to;
default: __asm__ __volatile__("movsw\n\tmovsb"); return to;
}
}
/* Variable-count memcpy, patched form: "rep movsl" copies n/4 dwords,
 * then n is reloaded into ecx, masked to n & 3, and the remaining bytes
 * (if any) are copied with "rep movsb".  Compared with the old
 * testb/movsw/movsb tail, n can now live in memory ("g" constraint)
 * instead of needing a byte register ("q"). */
static inline void * __memcpy(void * to, const void * from, size_t n)
{
int d0, d1, d2;
__asm__ __volatile__(
"rep ; movsl\n\t"
"movl %4,%%ecx\n\t"
"andl $3,%%ecx\n\t"
"jz 1f\n\t" /* pay 2 byte penalty for a chance to skip microcoded rep */
"rep ; movsb\n\t"
"1:"
: "=&c" (d0), "=&D" (d1), "=&S" (d2)
: "0" (n/4), "g" (n), "1" ((long) to), "2" ((long) from)
: "memory");
return (to);
}
/* Dispatch: constant n goes to the compile-time-expandable variant,
 * variable n to the generic asm version. */
#define memcpy(t, f, n) \
(__builtin_constant_p(n) ? \
__constant_memcpy((t),(f),(n)) : \
__memcpy((t),(f),(n)))
/* One probe function per copy size, each placed in its own section so
 * "objdump -r -d" shows the code generated for that exact size in
 * isolation.  NOTE(review): declared int but return nothing -- these
 * exist purely to inspect codegen, never to be called. */
int f00(char *a, char *b) __attribute__ ((section ("ff00"))); int f00(char *a, char *b) { memcpy(a,b,0); }
int f01(char *a, char *b) __attribute__ ((section ("ff01"))); int f01(char *a, char *b) { memcpy(a,b,1); }
int f02(char *a, char *b) __attribute__ ((section ("ff02"))); int f02(char *a, char *b) { memcpy(a,b,2); }
int f03(char *a, char *b) __attribute__ ((section ("ff03"))); int f03(char *a, char *b) { memcpy(a,b,3); }
int f04(char *a, char *b) __attribute__ ((section ("ff04"))); int f04(char *a, char *b) { memcpy(a,b,4); }
int f05(char *a, char *b) __attribute__ ((section ("ff05"))); int f05(char *a, char *b) { memcpy(a,b,5); }
int f06(char *a, char *b) __attribute__ ((section ("ff06"))); int f06(char *a, char *b) { memcpy(a,b,6); }
int f07(char *a, char *b) __attribute__ ((section ("ff07"))); int f07(char *a, char *b) { memcpy(a,b,7); }
int f08(char *a, char *b) __attribute__ ((section ("ff08"))); int f08(char *a, char *b) { memcpy(a,b,8); }
int f09(char *a, char *b) __attribute__ ((section ("ff09"))); int f09(char *a, char *b) { memcpy(a,b,9); }
int f10(char *a, char *b) __attribute__ ((section ("ff10"))); int f10(char *a, char *b) { memcpy(a,b,10); }
int f11(char *a, char *b) __attribute__ ((section ("ff11"))); int f11(char *a, char *b) { memcpy(a,b,11); }
int f12(char *a, char *b) __attribute__ ((section ("ff12"))); int f12(char *a, char *b) { memcpy(a,b,12); }
int f13(char *a, char *b) __attribute__ ((section ("ff13"))); int f13(char *a, char *b) { memcpy(a,b,13); }
int f14(char *a, char *b) __attribute__ ((section ("ff14"))); int f14(char *a, char *b) { memcpy(a,b,14); }
int f15(char *a, char *b) __attribute__ ((section ("ff15"))); int f15(char *a, char *b) { memcpy(a,b,15); }
int f16(char *a, char *b) __attribute__ ((section ("ff16"))); int f16(char *a, char *b) { memcpy(a,b,16); }
int f17(char *a, char *b) __attribute__ ((section ("ff17"))); int f17(char *a, char *b) { memcpy(a,b,17); }
int f18(char *a, char *b) __attribute__ ((section ("ff18"))); int f18(char *a, char *b) { memcpy(a,b,18); }
int f19(char *a, char *b) __attribute__ ((section ("ff19"))); int f19(char *a, char *b) { memcpy(a,b,19); }
int f20(char *a, char *b) __attribute__ ((section ("ff20"))); int f20(char *a, char *b) { memcpy(a,b,20); }
int f21(char *a, char *b) __attribute__ ((section ("ff21"))); int f21(char *a, char *b) { memcpy(a,b,21); }
int f22(char *a, char *b) __attribute__ ((section ("ff22"))); int f22(char *a, char *b) { memcpy(a,b,22); }
int f23(char *a, char *b) __attribute__ ((section ("ff23"))); int f23(char *a, char *b) { memcpy(a,b,23); }
int f24(char *a, char *b) __attribute__ ((section ("ff24"))); int f24(char *a, char *b) { memcpy(a,b,24); }
int f25(char *a, char *b) __attribute__ ((section ("ff25"))); int f25(char *a, char *b) { memcpy(a,b,25); }
int f26(char *a, char *b) __attribute__ ((section ("ff26"))); int f26(char *a, char *b) { memcpy(a,b,26); }
int f27(char *a, char *b) __attribute__ ((section ("ff27"))); int f27(char *a, char *b) { memcpy(a,b,27); }
int f28(char *a, char *b) __attribute__ ((section ("ff28"))); int f28(char *a, char *b) { memcpy(a,b,28); }
int f29(char *a, char *b) __attribute__ ((section ("ff29"))); int f29(char *a, char *b) { memcpy(a,b,29); }
int f3k(char *a, char *b) __attribute__ ((section ("ff3k"))); int f3k(char *a, char *b) { memcpy(a,b,3000); }
/* Expand many constant-size copies back-to-back in one function, so the
 * code emitted for each size can be compared in a single disassembly.
 * NOTE(review): declared int but returns nothing -- codegen test only. */
int f(char *a, char *b) {
memcpy(a,b,0);
memcpy(a,b,1);
memcpy(a,b,2);
memcpy(a,b,3);
memcpy(a,b,4);
memcpy(a,b,5);
memcpy(a,b,6);
memcpy(a,b,7);
memcpy(a,b,8);
memcpy(a,b,9);
memcpy(a,b,10);
memcpy(a,b,11);
memcpy(a,b,12);
memcpy(a,b,13);
memcpy(a,b,14);
memcpy(a,b,15);
memcpy(a,b,16);
memcpy(a,b,17);
memcpy(a,b,18);
memcpy(a,b,19);
memcpy(a,b,20);
memcpy(a,b,21);
memcpy(a,b,22);
memcpy(a,b,23);
memcpy(a,b,24);
memcpy(a,b,25);
memcpy(a,b,3000);
}
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: memcpy(a,b,CONST) is not inlined by gcc 3.4.1 in Linux kernel
2005-03-29 15:42 ` Andrew Pinski
@ 2005-03-30 2:27 ` Gerold Jury
2005-03-30 6:15 ` Denis Vlasenko
0 siblings, 1 reply; 24+ messages in thread
From: Gerold Jury @ 2005-03-30 2:27 UTC (permalink / raw)
To: Linux Kernel Mailing List
>> On Tue, Mar 29, 2005 at 05:37:06PM +0300, Denis Vlasenko wrote:
>> > /*
>> > * This looks horribly ugly, but the compiler can optimize it totally,
>> > * as the count is constant.
>> > */
>> > static inline void * __constant_memcpy(void * to, const void * from,
>> > size_t n) {
>> > if (n <= 128)
>> > return __builtin_memcpy(to, from, n);
>>
>> The problem is that in GCC < 4.0 there is no constant propagation
>> pass before expanding builtin functions, so the __builtin_memcpy
>> call above sees a variable rather than a constant.
>
>or change "size_t n" to "const size_t n" will also fix the issue.
>As we do some (well very little and with inlining and const values)
>const progation before 4.0.0 on the trees before expanding the builtin.
>
>-- Pinski
>-
I used the following "const size_t n" change on x86_64
and it reduced the memcpy count from 1088 to 609 with my setup and gcc 3.4.3.
(kernel 2.6.12-rc1, running now)
--- include/asm-x86_64/string.h.~1~ 2005-03-02 08:38:33.000000000 +0100
+++ include/asm-x86_64/string.h 2005-03-30 03:24:35.000000000 +0200
@@ -28,9 +28,9 @@
function. */
#define __HAVE_ARCH_MEMCPY 1
-extern void *__memcpy(void *to, const void *from, size_t len);
+extern void *__memcpy(void *to, const void *from, const size_t len);
#define memcpy(dst,src,len) \
- ({ size_t __len = (len); \
+ ({ const size_t __len = (len); \
void *__ret; \
if (__builtin_constant_p(len) && __len >= 64) \
__ret = __memcpy((dst),(src),__len); \
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: memcpy(a,b,CONST) is not inlined by gcc 3.4.1 in Linux kernel
2005-03-30 2:27 ` Gerold Jury
@ 2005-03-30 6:15 ` Denis Vlasenko
2005-04-01 21:43 ` Jan Hubicka
0 siblings, 1 reply; 24+ messages in thread
From: Denis Vlasenko @ 2005-03-30 6:15 UTC (permalink / raw)
To: Gerold Jury, jakub; +Cc: Linux Kernel Mailing List, gcc
On Wednesday 30 March 2005 05:27, Gerold Jury wrote:
>
> >> On Tue, Mar 29, 2005 at 05:37:06PM +0300, Denis Vlasenko wrote:
> >> > /*
> >> > * This looks horribly ugly, but the compiler can optimize it totally,
> >> > * as the count is constant.
> >> > */
> >> > static inline void * __constant_memcpy(void * to, const void * from,
> >> > size_t n) {
> >> > if (n <= 128)
> >> > return __builtin_memcpy(to, from, n);
> >>
> >> The problem is that in GCC < 4.0 there is no constant propagation
> >> pass before expanding builtin functions, so the __builtin_memcpy
> >> call above sees a variable rather than a constant.
> >
> >or change "size_t n" to "const size_t n" will also fix the issue.
> >As we do some (well very little and with inlining and const values)
> >const progation before 4.0.0 on the trees before expanding the builtin.
> >
> >-- Pinski
> >-
> I used the following "const size_t n" change on x86_64
> and it reduced the memcpy count from 1088 to 609 with my setup and gcc 3.4.3.
> (kernel 2.6.12-rc1, running now)
What do you mean, 'reduced'?
(/me is checking....)
Oh shit... It still emits half of memcpys, to be exact - for
struct copies:
arch/i386/kernel/process.c:
int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
unsigned long unused,
struct task_struct * p, struct pt_regs * regs)
{
struct pt_regs * childregs;
struct task_struct *tsk;
int err;
childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p->thread_info)) - 1;
*childregs = *regs;
^^^^^^^^^^^^^^^^^^^
childregs->eax = 0;
childregs->esp = esp;
# make arch/i386/kernel/process.s
copy_thread:
pushl %ebp
movl %esp, %ebp
pushl %edi
pushl %esi
pushl %ebx
subl $20, %esp
movl 24(%ebp), %eax
movl 4(%eax), %esi
pushl $60
leal 8132(%esi), %ebx
pushl 28(%ebp)
pushl %ebx
call memcpy <=================
movl $0, 24(%ebx)
movl 16(%ebp), %eax
movl %eax, 52(%ebx)
movl 24(%ebp), %edx
addl $8192, %esi
movl %ebx, 516(%edx)
movl %esi, -32(%ebp)
movl %esi, 504(%edx)
movl $ret_from_fork, 512(%edx)
Jakub, is there a way to instruct gcc to inine this copy, or better yet,
to use user-supplied inline version of memcpy?
--
vda
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: memcpy(a,b,CONST) is not inlined by gcc 3.4.1 in Linux kernel
2005-03-30 6:15 ` Denis Vlasenko
@ 2005-04-01 21:43 ` Jan Hubicka
2005-04-02 12:18 ` Denis Vlasenko
0 siblings, 1 reply; 24+ messages in thread
From: Jan Hubicka @ 2005-04-01 21:43 UTC (permalink / raw)
To: Denis Vlasenko; +Cc: Gerold Jury, jakub, Linux Kernel Mailing List, gcc
> On Wednesday 30 March 2005 05:27, Gerold Jury wrote:
> >
> > >> On Tue, Mar 29, 2005 at 05:37:06PM +0300, Denis Vlasenko wrote:
> > >> > /*
> > >> > * This looks horribly ugly, but the compiler can optimize it totally,
> > >> > * as the count is constant.
> > >> > */
> > >> > static inline void * __constant_memcpy(void * to, const void * from,
> > >> > size_t n) {
> > >> > if (n <= 128)
> > >> > return __builtin_memcpy(to, from, n);
> > >>
> > >> The problem is that in GCC < 4.0 there is no constant propagation
> > >> pass before expanding builtin functions, so the __builtin_memcpy
> > >> call above sees a variable rather than a constant.
> > >
> > >or change "size_t n" to "const size_t n" will also fix the issue.
> > >As we do some (well very little and with inlining and const values)
> > >const progation before 4.0.0 on the trees before expanding the builtin.
> > >
> > >-- Pinski
> > >-
> > I used the following "const size_t n" change on x86_64
> > and it reduced the memcpy count from 1088 to 609 with my setup and gcc 3.4.3.
> > (kernel 2.6.12-rc1, running now)
>
> What do you mean, 'reduced'?
>
> (/me is checking....)
>
> Oh shit... It still emits half of memcpys, to be exact - for
> struct copies:
>
> arch/i386/kernel/process.c:
>
> int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
> unsigned long unused,
> struct task_struct * p, struct pt_regs * regs)
> {
> struct pt_regs * childregs;
> struct task_struct *tsk;
> int err;
>
> childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p->thread_info)) - 1;
> *childregs = *regs;
> ^^^^^^^^^^^^^^^^^^^
> childregs->eax = 0;
> childregs->esp = esp;
>
> # make arch/i386/kernel/process.s
>
> copy_thread:
> pushl %ebp
> movl %esp, %ebp
> pushl %edi
> pushl %esi
> pushl %ebx
> subl $20, %esp
> movl 24(%ebp), %eax
> movl 4(%eax), %esi
> pushl $60
> leal 8132(%esi), %ebx
> pushl 28(%ebp)
> pushl %ebx
> call memcpy <=================
> movl $0, 24(%ebx)
> movl 16(%ebp), %eax
> movl %eax, 52(%ebx)
> movl 24(%ebp), %edx
> addl $8192, %esi
> movl %ebx, 516(%edx)
> movl %esi, -32(%ebp)
> movl %esi, 504(%edx)
> movl $ret_from_fork, 512(%edx)
>
> Jakub, is there a way to instruct gcc to inine this copy, or better yet,
> to use user-supplied inline version of memcpy?
You can't inline a struct copy, as it is not a function call in the first place.
You can experiment with -minline-all-stringops, where GCC will use its
own memcpy implementation for this.
Honza
> --
> vda
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: memcpy(a,b,CONST) is not inlined by gcc 3.4.1 in Linux kernel
2005-04-01 21:43 ` Jan Hubicka
@ 2005-04-02 12:18 ` Denis Vlasenko
2005-04-02 12:26 ` Denis Vlasenko
0 siblings, 1 reply; 24+ messages in thread
From: Denis Vlasenko @ 2005-04-02 12:18 UTC (permalink / raw)
To: Jan Hubicka; +Cc: Gerold Jury, jakub, Linux Kernel Mailing List, gcc
> > childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p->thread_info)) - 1;
> > *childregs = *regs;
> > ^^^^^^^^^^^^^^^^^^^
> > childregs->eax = 0;
> > childregs->esp = esp;
> >
> > # make arch/i386/kernel/process.s
> >
> > copy_thread:
> > pushl %ebp
> > movl %esp, %ebp
> > pushl %edi
> > pushl %esi
> > pushl %ebx
> > subl $20, %esp
> > movl 24(%ebp), %eax
> > movl 4(%eax), %esi
> > pushl $60
> > leal 8132(%esi), %ebx
> > pushl 28(%ebp)
> > pushl %ebx
> > call memcpy <=================
> > movl $0, 24(%ebx)
> >
> > Jakub, is there a way to instruct gcc to inline this copy, or better yet,
> > to use a user-supplied inline version of memcpy?
>
> You can't inline a struct copy, as it is not a function call in the first place.
> You can experiment with -minline-all-stringops, where GCC will use its
> own memcpy implementation for this.
No luck. Actually, memcpy calls are produced with -Os. Adding
-minline-all-stringops changes nothing.
-O2 compile does inline copying, however, suboptimally.
Pushing/popping esi/edi on the stack is not needed.
Also "mov $1,ecx; rep; movsl" is rather silly.
Here is what I tested:
t.c:
#define STRUCT1(n) struct s##n { char c[n]; } v##n, w##n; void f##n(void) { v##n = w##n; }
#define STRUCT(n) STRUCT1(n)
STRUCT(1)
STRUCT(2)
STRUCT(3)
STRUCT(4)
STRUCT(5)
STRUCT(6)
STRUCT(7)
STRUCT(8)
STRUCT(9)
STRUCT(10)
STRUCT(11)
STRUCT(12)
STRUCT(13)
STRUCT(14)
STRUCT(15)
STRUCT(16)
STRUCT(17)
STRUCT(18)
STRUCT(19)
STRUCT(20)
mk.sh:
#!/bin/sh
# yeah yeah. push/pop + 1 repetition 'rep movsl'
# 88: 55 push %ebp
# 89: 89 e5 mov %esp,%ebp
# 8b: 57 push %edi
# 8c: 56 push %esi
# 8d: fc cld
# 8e: bf 00 00 00 00 mov $0x0,%edi
# 8f: R_386_32 v7
# 93: be 00 00 00 00 mov $0x0,%esi
# 94: R_386_32 w7
# 98: b9 01 00 00 00 mov $0x1,%ecx
# 9d: f3 a5 repz movsl %ds:(%esi),%es:(%edi)
# 9f: 66 a5 movsw %ds:(%esi),%es:(%edi)
# a1: a4 movsb %ds:(%esi),%es:(%edi)
# a2: 5e pop %esi
# a3: 5f pop %edi
# a4: c9 leave
# a5: c3 ret
# a6: 89 f6 mov %esi,%esi
if false; then
gcc -O2 \
falign-functions=1 -falign-labels=1 -falign-loops=1 -falign-jumps=1 \
-c t.c
echo Done; read junk
objdump -d -r t.o | $PAGER
exit
fi
# -Os: emits memcpy
if false; then
gcc -nostdinc -isystem /.share/usr/app/gcc-3.4.1/bin/../lib/gcc/i386-pc-linux-gnu/3.4.1/include \
-Wall -Wstrict-prototypes -Wno-trigraphs -fno-strict-aliasing \
-fno-common -ffreestanding -Os -falign-functions=1 -falign-labels=1 \
-falign-loops=1 -falign-jumps=1 -fno-omit-frame-pointer -pipe -msoft-float \
-mpreferred-stack-boundary=2 -fno-unit-at-a-time -march=i486 \
-Wdeclaration-after-statement -c t.c
echo Done; read junk
objdump -d -r t.o | $PAGER
exit
fi
# -march=486: emits horrible tail:
# 271: f3 a5 repz movsl %ds:(%esi),%es:(%edi)
# 273: 5e pop %esi
# 274: 66 a1 10 00 00 00 mov 0x10,%ax
# 276: R_386_32 w19
# 27a: 5f pop %edi
# 27b: 66 a3 10 00 00 00 mov %ax,0x10
# 27d: R_386_32 v19
# 281: 5d pop %ebp
# 282: a0 12 00 00 00 mov 0x12,%al
# 283: R_386_32 w19
# 287: a2 12 00 00 00 mov %al,0x12
# 288: R_386_32 v19
# 28c: c3 ret
if false; then
gcc \
-fno-common -ffreestanding -O2 -falign-functions=1 -falign-labels=1 \
-falign-loops=1 -falign-jumps=1 -fno-omit-frame-pointer -pipe -msoft-float \
-mpreferred-stack-boundary=2 -fno-unit-at-a-time -march=i486 \
-Wdeclaration-after-statement \
-c t.c
echo Done; read junk
objdump -d -r t.o | $PAGER
exit
fi
# -Os -minline-all-stringops: still emits memcpy
if true; then
gcc -nostdinc -isystem /.share/usr/app/gcc-3.4.1/bin/../lib/gcc/i386-pc-linux-gnu/3.4.1/include \
-Wall -Wstrict-prototypes -Wno-trigraphs -fno-strict-aliasing \
-fno-common -ffreestanding -Os -minline-all-stringops -falign-functions=1 -falign-labels=1 \
-falign-loops=1 -falign-jumps=1 -fno-omit-frame-pointer -pipe -msoft-float \
-mpreferred-stack-boundary=2 -fno-unit-at-a-time -march=i486 \
-Wdeclaration-after-statement -c t.c
echo Done; read junk
objdump -d -r t.o | $PAGER
exit
fi
--
vda
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: memcpy(a,b,CONST) is not inlined by gcc 3.4.1 in Linux kernel
2005-04-02 12:18 ` Denis Vlasenko
@ 2005-04-02 12:26 ` Denis Vlasenko
2005-04-05 16:34 ` [BUG mm] "fixed" i386 memcpy inlining buggy Christophe Saout
0 siblings, 1 reply; 24+ messages in thread
From: Denis Vlasenko @ 2005-04-02 12:26 UTC (permalink / raw)
To: Jan Hubicka; +Cc: Gerold Jury, jakub, Linux Kernel Mailing List, gcc
[-- Attachment #1: Type: text/plain, Size: 360 bytes --]
On Saturday 02 April 2005 15:18, Denis Vlasenko wrote:
> -O2 compile does inline copying, however, suboptimally.
> Pushing/popping esi/edi on the stack is not needed.
> Also "mov $1,ecx; rep; movsl" is rather silly.
I think I am wrong about push/pop. Sorry.
However, other observation is still valid. You
may wish to compile this updated t.c and see.
--
vda
[-- Attachment #2: t.c --]
[-- Type: text/x-csrc, Size: 2513 bytes --]
static inline void * __memcpy(void * to, const void * from, int n)
{
int d0, d1, d2;
__asm__ __volatile__(
"rep ; movsl\n\t"
"movl %4,%%ecx\n\t"
"andl $3,%%ecx\n\t"
"jz 1f\n\t" /* pay 2 byte penalty for a chance to skip microcoded rep */
"rep ; movsb\n\t"
"1:"
: "=&c" (d0), "=&D" (d1), "=&S" (d2)
: "0" (n/4), "g" (n), "1" ((long) to), "2" ((long) from)
: "memory");
return (to);
}
/*
* This looks ugly, but the compiler can optimize it totally,
* as the count is constant.
*/
static inline void * __constant_memcpy(void * to, const void * from, int n)
{
#if 1 /* want to do small copies with non-string ops? */
switch (n) {
case 0: return to;
case 1: *(char*)to = *(char*)from; return to;
case 2: *(short*)to = *(short*)from; return to;
case 4: *(int*)to = *(int*)from; return to;
#if 1 /* including those doable with two moves? */
case 3: *(short*)to = *(short*)from;
*((char*)to+2) = *((char*)from+2); return to;
case 5: *(int*)to = *(int*)from;
*((char*)to+4) = *((char*)from+4); return to;
case 6: *(int*)to = *(int*)from;
*((short*)to+2) = *((short*)from+2); return to;
case 8: *(int*)to = *(int*)from;
*((int*)to+1) = *((int*)from+1); return to;
#endif
}
#else
if (!n) return to;
#endif
{
/* load esi/edi */
int esi, edi;
__asm__ __volatile__(
""
: "=&D" (edi), "=&S" (esi)
: "0" ((long) to),"1" ((long) from)
: "memory"
);
}
if (n >= 5*4) {
/* large block: use rep prefix */
int ecx;
__asm__ __volatile__(
"rep ; movsl"
: "=&c" (ecx)
: "0" (n/4)
);
} else {
/* small block: don't clobber ecx + smaller code */
if (n >= 4*4) __asm__ __volatile__("movsl");
if (n >= 3*4) __asm__ __volatile__("movsl");
if (n >= 2*4) __asm__ __volatile__("movsl");
if (n >= 1*4) __asm__ __volatile__("movsl");
}
switch (n % 4) {
/* tail */
case 0: return to;
case 1: __asm__ __volatile__("movsb"); return to;
case 2: __asm__ __volatile__("movsw"); return to;
default: __asm__ __volatile__("movsw\n\tmovsb"); return to;
}
}
#define memcpy(t, f, n) \
(__builtin_constant_p(n) ? \
__constant_memcpy((t),(f),(n)) : \
__memcpy((t),(f),(n)))
#define STRUCT1(n) struct s##n { char c[n]; } v##n, w##n; void f##n(void) { v##n = w##n; } void g##n(void) { memcpy(&v##n,&w##n,n); }
#define STRUCT(n) STRUCT1(n)
STRUCT(1)
STRUCT(2)
STRUCT(3)
STRUCT(4)
STRUCT(5)
STRUCT(6)
STRUCT(7)
STRUCT(8)
STRUCT(9)
STRUCT(10)
STRUCT(11)
STRUCT(12)
STRUCT(13)
STRUCT(14)
STRUCT(15)
STRUCT(16)
STRUCT(17)
STRUCT(18)
STRUCT(19)
STRUCT(20)
^ permalink raw reply [flat|nested] 24+ messages in thread
* [BUG mm] "fixed" i386 memcpy inlining buggy
2005-04-02 12:26 ` Denis Vlasenko
@ 2005-04-05 16:34 ` Christophe Saout
2005-04-06 10:14 ` Denis Vlasenko
2005-04-06 16:11 ` Denis Vlasenko
0 siblings, 2 replies; 24+ messages in thread
From: Christophe Saout @ 2005-04-05 16:34 UTC (permalink / raw)
To: Denis Vlasenko
Cc: Andrew Morton, Jan Hubicka, Gerold Jury, jakub,
Linux Kernel Mailing List, gcc
[-- Attachment #1: Type: text/plain, Size: 1798 bytes --]
Hi Denis,
the new i386 memcpy macro is a ticking timebomb.
I've been debugging a new mISDN crash, just to find out that a memcpy
was not inlined correctly.
Andrew, you should drop the fix-i386-memcpy.patch (or have it fixed).
This source code:
mISDN_pid_t pid;
[...]
memcpy(&st->mgr->pid, &pid, sizeof(mISDN_pid_t));
was compiled as:
lea 0xffffffa4(%ebp),%esi <---- %esi is loaded
( add $0x10,%ebx )
( mov %ebx,%eax ) something else
( call 1613 <test_stack_protocol+0x83> ) %esi preserved
mov 0xffffffa0(%ebp),%edx
mov 0x74(%edx),%edi <---- %edi is loaded
add $0x20,%edi offset in structure added
! mov $0x14,%esi !!!!!! <---- %esi overwritten!
mov %esi,%ecx <---- %ecx loaded
repz movsl %ds:(%esi),%es:(%edi)
Apparently the compiler decided that the value 0x14 could be reused
afterwards (which it does for an inlined memset of the same size some
instructions below) and clobbers %esi.
Looking at the macro:
__asm__ __volatile__(
""
: "=&D" (edi), "=&S" (esi)
: "0" ((long) to),"1" ((long) from)
: "memory"
);
}
if (n >= 5*4) {
/* large block: use rep prefix */
int ecx;
__asm__ __volatile__(
"rep ; movsl"
: "=&c" (ecx)
it seems obvious that the compiler assumes it can reuse %esi and %edi
for something else between the two __asm__ sections. These should
probably be merged.
[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 189 bytes --]
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [BUG mm] "fixed" i386 memcpy inlining buggy
2005-04-05 16:34 ` [BUG mm] "fixed" i386 memcpy inlining buggy Christophe Saout
@ 2005-04-06 10:14 ` Denis Vlasenko
2005-04-06 11:05 ` Dave Korn
2005-04-06 12:05 ` Christophe Saout
2005-04-06 16:11 ` Denis Vlasenko
1 sibling, 2 replies; 24+ messages in thread
From: Denis Vlasenko @ 2005-04-06 10:14 UTC (permalink / raw)
To: Christophe Saout
Cc: Andrew Morton, Jan Hubicka, Gerold Jury, jakub,
Linux Kernel Mailing List, gcc
On Tuesday 05 April 2005 19:34, Christophe Saout wrote:
> Hi Denis,
>
> the new i386 memcpy macro is a ticking timebomb.
>
> I've been debugging a new mISDN crash, just to find out that a memcpy
> was not inlined correctly.
>
> Andrew, you should drop the fix-i386-memcpy.patch (or have it fixed).
>
> This source code:
>
> mISDN_pid_t pid;
> [...]
> memcpy(&st->mgr->pid, &pid, sizeof(mISDN_pid_t));
>
> was compiled as:
>
> lea 0xffffffa4(%ebp),%esi <---- %esi is loaded
> ( add $0x10,%ebx )
> ( mov %ebx,%eax ) something else
> ( call 1613 <test_stack_protocol+0x83> ) %esi preserved
> mov 0xffffffa0(%ebp),%edx
> mov 0x74(%edx),%edi <---- %edi is loaded
> add $0x20,%edi offset in structure added
> ! mov $0x14,%esi !!!!!! <---- %esi overwritten!
> mov %esi,%ecx <---- %ecx loaded
> repz movsl %ds:(%esi),%es:(%edi)
>
> Apparently the compiler decided that the value 0x14 could be reused
> afterwards (which it does for an inlined memset of the same size some
> instructions below) and clobbers %esi.
>
> Looking at the macro:
>
> __asm__ __volatile__(
> ""
> : "=&D" (edi), "=&S" (esi)
> : "0" ((long) to),"1" ((long) from)
> : "memory"
> );
> }
> if (n >= 5*4) {
> /* large block: use rep prefix */
> int ecx;
> __asm__ __volatile__(
> "rep ; movsl"
> : "=&c" (ecx)
>
> it seems obvious that the compiler assumes it can reuse %esi and %edi
> for something else between the two __asm__ sections. These should
> probably be merged.
Oh shit. I was trying to be too clever. I still run with this patch,
so it must be happening very rarely.
Does this one compile ok?
static inline void * __constant_memcpy(void * to, const void * from, size_t n)
{
long esi, edi;
#if 1 /* want to do small copies with non-string ops? */
switch (n) {
case 0: return to;
case 1: *(char*)to = *(char*)from; return to;
case 2: *(short*)to = *(short*)from; return to;
case 4: *(int*)to = *(int*)from; return to;
#if 1 /* including those doable with two moves? */
case 3: *(short*)to = *(short*)from;
*((char*)to+2) = *((char*)from+2); return to;
case 5: *(int*)to = *(int*)from;
*((char*)to+4) = *((char*)from+4); return to;
case 6: *(int*)to = *(int*)from;
*((short*)to+2) = *((short*)from+2); return to;
case 8: *(int*)to = *(int*)from;
*((int*)to+1) = *((int*)from+1); return to;
#endif
}
#else
if (!n) return to;
#endif
{
/* load esi/edi */
__asm__ __volatile__(
""
: "=&D" (edi), "=&S" (esi)
: "0" ((long) to),"1" ((long) from)
: "memory"
);
}
if (n >= 5*4) {
/* large block: use rep prefix */
int ecx;
__asm__ __volatile__(
"rep ; movsl"
: "=&c" (ecx), "=&D" (edi), "=&S" (esi)
: "0" (n/4), "1" (edi),"2" (esi)
: "memory"
);
} else {
/* small block: don't clobber ecx + smaller code */
if (n >= 4*4) __asm__ __volatile__("movsl":"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory");
if (n >= 3*4) __asm__ __volatile__("movsl":"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory");
if (n >= 2*4) __asm__ __volatile__("movsl":"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory");
if (n >= 1*4) __asm__ __volatile__("movsl":"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory");
}
switch (n % 4) {
/* tail */
case 0: return to;
case 1: __asm__ __volatile__("movsb":"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory"); return to;
case 2: __asm__ __volatile__("movsw":"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory"); return to;
default: __asm__ __volatile__("movsw\n\tmovsb":"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory"); return to;
}
}
--
vda
^ permalink raw reply [flat|nested] 24+ messages in thread
* RE: [BUG mm] "fixed" i386 memcpy inlining buggy
2005-04-06 10:14 ` Denis Vlasenko
@ 2005-04-06 11:05 ` Dave Korn
2005-04-06 11:13 ` Dave Korn
2005-04-06 12:05 ` Christophe Saout
1 sibling, 1 reply; 24+ messages in thread
From: Dave Korn @ 2005-04-06 11:05 UTC (permalink / raw)
To: 'Denis Vlasenko', 'Christophe Saout'
Cc: 'Andrew Morton', 'Jan Hubicka',
'Gerold Jury', jakub, 'Linux Kernel Mailing List',
gcc
----Original Message----
>From: Denis Vlasenko
>Sent: 06 April 2005 11:14
Is this someone's idea of an April Fool's joke? Because if it is, I've
suffered a serious sense-of-humour failure.
> Oh shit. I was trying to be too clever. I still run with this patch,
> so it must be happening very rarely.
The kernel is way too important for cross-your-fingers-and-hope
engineering techniques to be applied. This patch should never have been
permitted. How on earth could anything like this hope to make it through a
strict review?
> Does this one compile ok?
> {
> /* load esi/edi */
> __asm__ __volatile__(
> ""
> : "=&D" (edi), "=&S" (esi)
> : "0" ((long) to),"1" ((long) from)
> : "memory"
> );
> }
> if (n >= 5*4) {
> /* large block: use rep prefix */
> int ecx;
> __asm__ __volatile__(
> "rep ; movsl"
> : "=&c" (ecx), "=&D" (edi), "=&S" (esi)
> : "0" (n/4), "1" (edi),"2" (esi)
> : "memory"
> );
It doesn't matter if it compiles or not, it's still *utterly* invalid.
You can NOT make assumptions about registers keeping their values between
one asm block and another. Immediately after the closing quote of the first
asm, the compiler can do ANYTHING IT WANTS and to just _hope_ that it won't
use the registers you want is voodoo programming. Even if it works when you
try it once, there are zero guarantees that another version or revision of
the compiler or even just a tiny change to the source that affects the
behaviour of the scheduler when compiling the function won't produce
something completely different, meaning that this code is appallingly
fragile. This code should be completely discarded and rewritten properly.
cheers,
DaveK
--
Can't think of a witty .sigline today....
^ permalink raw reply [flat|nested] 24+ messages in thread
* RE: [BUG mm] "fixed" i386 memcpy inlining buggy
2005-04-06 11:05 ` Dave Korn
@ 2005-04-06 11:13 ` Dave Korn
2005-04-06 11:53 ` Dave Korn
0 siblings, 1 reply; 24+ messages in thread
From: Dave Korn @ 2005-04-06 11:13 UTC (permalink / raw)
To: 'Dave Korn', 'Denis Vlasenko',
'Christophe Saout'
Cc: 'Andrew Morton', 'Jan Hubicka',
'Gerold Jury', jakub, 'Linux Kernel Mailing List',
gcc
----Original Message----
>From: Dave Korn
>Sent: 06 April 2005 12:06
Me and my big mouth.
OK, that one does work.
Sorry for the outburst.
cheers,
DaveK
--
Can't think of a witty .sigline today....
^ permalink raw reply [flat|nested] 24+ messages in thread
* RE: [BUG mm] "fixed" i386 memcpy inlining buggy
2005-04-06 11:13 ` Dave Korn
@ 2005-04-06 11:53 ` Dave Korn
2005-04-06 11:56 ` Dave Korn
2005-04-06 13:18 ` Richard B. Johnson
0 siblings, 2 replies; 24+ messages in thread
From: Dave Korn @ 2005-04-06 11:53 UTC (permalink / raw)
To: 'Dave Korn', 'Denis Vlasenko',
'Christophe Saout'
Cc: 'Andrew Morton', 'Jan Hubicka',
'Gerold Jury', jakub, 'Linux Kernel Mailing List',
gcc
----Original Message----
>From: Dave Korn
>Sent: 06 April 2005 12:13
> ----Original Message----
>> From: Dave Korn
>> Sent: 06 April 2005 12:06
>
>
> Me and my big mouth.
>
> OK, that one does work.
>
> Sorry for the outburst.
>
.... well, actually, maybe it doesn't after all.
What's that uninitialised variable ecx doing there eh?
cheers,
DaveK
--
Can't think of a witty .sigline today....
^ permalink raw reply [flat|nested] 24+ messages in thread
* RE: [BUG mm] "fixed" i386 memcpy inlining buggy
2005-04-06 11:53 ` Dave Korn
@ 2005-04-06 11:56 ` Dave Korn
2005-04-06 13:18 ` Richard B. Johnson
1 sibling, 0 replies; 24+ messages in thread
From: Dave Korn @ 2005-04-06 11:56 UTC (permalink / raw)
To: 'Dave Korn', 'Denis Vlasenko',
'Christophe Saout'
Cc: 'Andrew Morton', 'Jan Hubicka',
'Gerold Jury', jakub, 'Linux Kernel Mailing List',
gcc
----Original Message----
>From: Dave Korn
>Sent: 06 April 2005 12:53
> ----Original Message----
>> From: Dave Korn
>> Sent: 06 April 2005 12:13
>
>> ----Original Message----
>>> From: Dave Korn
>>> Sent: 06 April 2005 12:06
>>
>>
>> Me and my big mouth.
>>
>> OK, that one does work.
>>
>> Sorry for the outburst.
>>
>
>
> .... well, actually, maybe it doesn't after all.
>
>
> What's that uninitialised variable ecx doing there eh?
Oh, I see, it's there as an output so it can be matched as an input by the
"0" constraint.
Ok, guess it does.
cheers,
DaveK
--
Can't think of a witty .sigline today....
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [BUG mm] "fixed" i386 memcpy inlining buggy
2005-04-06 10:14 ` Denis Vlasenko
2005-04-06 11:05 ` Dave Korn
@ 2005-04-06 12:05 ` Christophe Saout
2005-04-06 12:36 ` Andrew Haley
2005-04-06 15:18 ` Paolo Bonzini
1 sibling, 2 replies; 24+ messages in thread
From: Christophe Saout @ 2005-04-06 12:05 UTC (permalink / raw)
To: Denis Vlasenko
Cc: gcc, Linux Kernel Mailing List, jakub, Gerold Jury, Jan Hubicka,
Andrew Morton
[-- Attachment #1: Type: text/plain, Size: 1628 bytes --]
Am Mittwoch, den 06.04.2005, 13:14 +0300 schrieb Denis Vlasenko:
> Oh shit. I was trying to be too clever. I still run with this patch,
> so it must be happening very rarely.
Yes, that's right, it happened with code that's not in the mainline tree
but could have happened anywhere.
> Does this one compile ok?
Yes, the case that failed is now okay. I changed it slightly to assign
esi and edi directly at the top of the functions; no asm section is needed here.
The compiler will make sure that they have the correct values when
needed.
In the case above the compiler now uses %ebx to save the loop counter
instead of %esi.
In drivers/cdrom/cdrom.c I'm observing one very strange thing though.
It appears that the compiler decided to put the local variable edi on
the stack for some inexplicable reason (or possibly there is one?). Since
the asm sections are memory barriers the compiler then saves the value
of %edi on the stack before entering the next assembler section.
1f1c: a5 movsl %ds:(%esi),%es:(%edi)
1f1d: 89 7d 84 mov %edi,0xffffff84(%ebp)
1f20: a5 movsl %ds:(%esi),%es:(%edi)
1f21: 89 7d 84 mov %edi,0xffffff84(%ebp)
1f24: 66 a5 movsw %ds:(%esi),%es:(%edi)
(this is a constant 10 byte memcpy)
The only thing that would avoid this is to either tell the compiler to
never put esi/edi in memory (which I think is not possible across
different versions of gcc) or to always generate a single asm section
for all the different cases.
[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 189 bytes --]
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [BUG mm] "fixed" i386 memcpy inlining buggy
2005-04-06 12:05 ` Christophe Saout
@ 2005-04-06 12:36 ` Andrew Haley
2005-04-06 15:18 ` Paolo Bonzini
1 sibling, 0 replies; 24+ messages in thread
From: Andrew Haley @ 2005-04-06 12:36 UTC (permalink / raw)
To: Christophe Saout
Cc: Denis Vlasenko, gcc, Linux Kernel Mailing List, jakub,
Gerold Jury, Jan Hubicka, Andrew Morton
I'm having a little difficulty understanding what this is for. Is it
that gcc's builtin memcpy expander generates bad code, or that older
versions of gcc generate bad code, or what? gcc generates too much
code?
Andrew.
^ permalink raw reply [flat|nested] 24+ messages in thread
* RE: [BUG mm] "fixed" i386 memcpy inlining buggy
2005-04-06 11:53 ` Dave Korn
2005-04-06 11:56 ` Dave Korn
@ 2005-04-06 13:18 ` Richard B. Johnson
2005-04-06 14:16 ` Denis Vlasenko
1 sibling, 1 reply; 24+ messages in thread
From: Richard B. Johnson @ 2005-04-06 13:18 UTC (permalink / raw)
To: Dave Korn
Cc: Denis Vlasenko, Christophe Saout, Andrew Morton, Jan Hubicka,
Gerold Jury, jakub, Linux Kernel Mailing List, gcc
[-- Attachment #1: Type: TEXT/PLAIN, Size: 1265 bytes --]
Attached is inline ix86 memcpy() plus test code that tests its
corner-cases. The in-line code makes no jumps, but uses longword
copies, word copies and any spare byte copy. It works at all
offsets, doesn't require alignment but would work fastest if
both source and destination were longword aligned.
On Wed, 6 Apr 2005, Dave Korn wrote:
> ----Original Message----
>> From: Dave Korn
>> Sent: 06 April 2005 12:13
>
>> ----Original Message----
>>> From: Dave Korn
>>> Sent: 06 April 2005 12:06
>>
>>
>> Me and my big mouth.
>>
>> OK, that one does work.
>>
>> Sorry for the outburst.
>>
>
>
> .... well, actually, maybe it doesn't after all.
>
>
> What's that uninitialised variable ecx doing there eh?
>
>
> cheers,
> DaveK
> --
> Can't think of a witty .sigline today....
>
> -
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
>
Cheers,
Dick Johnson
Penguin : Linux version 2.6.11 on an i686 machine (5537.79 BogoMips).
Notice : All mail here is now cached for review by Dictator Bush.
98.36% of all statistics are fiction.
[-- Attachment #2: Type: TEXT/PLAIN, Size: 1421 bytes --]
#include <stdio.h>
#include <string.h>
//
// Inline ix86 memcpy() that contains no jumps. Not copied from
// anybody. Contributed by rjohnson@analogic.com
//
static __inline__ void *memcpy(void *dst, void *src, size_t len) {
void *ret = dst;
__asm__ __volatile__( \
"cld\n" \
"shr $1, %%ecx\n" \
"pushf\n" \
"shr $1, %%ecx\n" \
"pushf\n" \
"rep\n" \
"movsl\n" \
"popf\n" \
"adcl %%ecx, %%ecx\n" \
"rep\n" \
"movsw\n" \
"popf\n" \
"adcl %%ecx, %%ecx\n" \
"rep\n" \
"movsb\n" \
: "=D" (dst), "=S" (src), "=c"(len)
: "0" (dst), "1" (src), "2" (len)
: "memory" );
return ret;
}
const char tester[]= "0123456789"
"0123456789"
"0123456789"
"0123456789"
"0123456789"
"0123456789"
"0123456789"
"0123456789";
char allocated[0x1000];
int main()
{
size_t i;
char buf[0x1000];
memset(buf, 0x00, sizeof(buf));
for(i=0; i< sizeof(buf); i++)
puts(memcpy(buf, (char *)tester, i));
memset(buf, 0x00, sizeof(buf));
for(i=0; i< sizeof(buf)-1; i++)
puts(memcpy(&buf[1], (char *)tester, i));
memset(buf, 0x00, sizeof(buf));
for(i=0; i< sizeof(buf)-2; i++)
puts(memcpy(&buf[2], (char *)tester, i));
memset(buf, 0x00, sizeof(buf));
for(i=0; i< sizeof(buf)-3; i++)
puts(memcpy(&buf[3], (char *)tester, i));
return 0;
}
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [BUG mm] "fixed" i386 memcpy inlining buggy
2005-04-06 13:18 ` Richard B. Johnson
@ 2005-04-06 14:16 ` Denis Vlasenko
0 siblings, 0 replies; 24+ messages in thread
From: Denis Vlasenko @ 2005-04-06 14:16 UTC (permalink / raw)
To: linux-os, Dave Korn
Cc: Denis Vlasenko, Christophe Saout, Andrew Morton, Jan Hubicka,
Gerold Jury, jakub, Linux Kernel Mailing List, gcc
On Wednesday 06 April 2005 16:18, Richard B. Johnson wrote:
>
> Attached is inline ix86 memcpy() plus test code that tests its
> corner-cases. The in-line code makes no jumps, but uses longword
> copies, word copies and any spare byte copy. It works at all
> offsets, doesn't require alignment but would work fastest if
> both source and destination were longword aligned.
Yours is:
"shr $1, %%ecx\n" \
"pushf\n" \
"shr $1, %%ecx\n" \
"pushf\n" \ <=== not needed
"rep\n" \
"movsl\n" \
"popf\n" \ <=== not needed
"adcl %%ecx, %%ecx\n" \
"rep\n" \
"movsw\n" \
"popf\n" \
"adcl %%ecx, %%ecx\n" \
"rep\n" \
"movsb\n" \
You struggle too much for that movsw.
-mm one (which happen to be mine) is:
"movl %ecx,%4"
"shr $2,%ecx"
"rep ; movsl"
"movl %4,%%ecx"
"andl $3,%%ecx"
"jz 1ft" /* pay 2 byte penalty for a chance to skip microcoded rep */
"rep ; movsb"
"1:"
and I can still drop that jz. It is there just to have
a chance to skip rep movsb, it was measured to be slow
enough to matter. rep movs are a bit slow to start, on small
blocks it is measurable.
However, maybe it is even better without jz,
need to benchmark 'cold path' (i.e. where branch predictor
have no data to predict it) somehow.
--
vda
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [BUG mm] "fixed" i386 memcpy inlining buggy
2005-04-06 12:05 ` Christophe Saout
2005-04-06 12:36 ` Andrew Haley
@ 2005-04-06 15:18 ` Paolo Bonzini
1 sibling, 0 replies; 24+ messages in thread
From: Paolo Bonzini @ 2005-04-06 15:18 UTC (permalink / raw)
To: linux-kernel; +Cc: gcc
> The only thing that would avoid this is to either tell the compiler to
> never put esi/edi in memory (which I think is not possible across
> different versions of gcc) or to always generate a single asm section
> for all the different cases.
Use __asm__ ("%esi") and __asm__ ("%edi"). It is not guaranteed that
they access the registers always (you can still have copy propagation
etcetera); but, if your __asm__ statement constraints match the register
you specify, then you can be reasonably sure that good code is produced.
Paolo
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [BUG mm] "fixed" i386 memcpy inlining buggy
2005-04-05 16:34 ` [BUG mm] "fixed" i386 memcpy inlining buggy Christophe Saout
2005-04-06 10:14 ` Denis Vlasenko
@ 2005-04-06 16:11 ` Denis Vlasenko
1 sibling, 0 replies; 24+ messages in thread
From: Denis Vlasenko @ 2005-04-06 16:11 UTC (permalink / raw)
To: Christophe Saout
Cc: Andrew Morton, Jan Hubicka, Gerold Jury, jakub,
Linux Kernel Mailing List, gcc
[-- Attachment #1: Type: text/plain, Size: 620 bytes --]
On Tuesday 05 April 2005 19:34, Christophe Saout wrote:
> the new i386 memcpy macro is a ticking timebomb.
>
> I've been debugging a new mISDN crash, just to find out that a memcpy
> was not inlined correctly.
>
> Andrew, you should drop the fix-i386-memcpy.patch (or have it fixed).
Updated patch against 2.6.11 follows. This one, like the original
patch, is run tested too.
This time I took no chances, esi/edi contents are
explicitly propagated from one asm() block to another.
I didn't do it before, not expecting that gcc can be
soooo incredibly clever. Sorry.
Christophe does this one look/compile ok?
--
vda
[-- Attachment #2: string2.h.diff --]
[-- Type: text/x-diff, Size: 3288 bytes --]
--- linux-2.6.11.src/include/asm-i386/string.h.orig Thu Mar 3 09:31:08 2005
+++ linux-2.6.11.src/include/asm-i386/string.h Wed Apr 6 19:08:39 2005
@@ -198,47 +198,80 @@ static inline void * __memcpy(void * to,
int d0, d1, d2;
__asm__ __volatile__(
"rep ; movsl\n\t"
- "testb $2,%b4\n\t"
- "je 1f\n\t"
- "movsw\n"
- "1:\ttestb $1,%b4\n\t"
- "je 2f\n\t"
- "movsb\n"
- "2:"
+ "movl %4,%%ecx\n\t"
+ "andl $3,%%ecx\n\t"
+#if 1 /* want to pay 2 byte penalty for a chance to skip microcoded rep? */
+ "jz 1f\n\t"
+#endif
+ "rep ; movsb\n\t"
+ "1:"
: "=&c" (d0), "=&D" (d1), "=&S" (d2)
- :"0" (n/4), "q" (n),"1" ((long) to),"2" ((long) from)
+ : "0" (n/4), "g" (n), "1" ((long) to), "2" ((long) from)
: "memory");
return (to);
}
/*
- * This looks horribly ugly, but the compiler can optimize it totally,
+ * This looks ugly, but the compiler can optimize it totally,
* as the count is constant.
*/
static inline void * __constant_memcpy(void * to, const void * from, size_t n)
{
- if (n <= 128)
- return __builtin_memcpy(to, from, n);
-
-#define COMMON(x) \
-__asm__ __volatile__( \
- "rep ; movsl" \
- x \
- : "=&c" (d0), "=&D" (d1), "=&S" (d2) \
- : "0" (n/4),"1" ((long) to),"2" ((long) from) \
- : "memory");
-{
- int d0, d1, d2;
+ long esi, edi;
+ if (!n) return to;
+#if 1 /* want to do small copies with non-string ops? */
+ switch (n) {
+ case 1: *(char*)to = *(char*)from; return to;
+ case 2: *(short*)to = *(short*)from; return to;
+ case 4: *(int*)to = *(int*)from; return to;
+#if 1 /* including those doable with two moves? */
+ case 3: *(short*)to = *(short*)from;
+ *((char*)to+2) = *((char*)from+2); return to;
+ case 5: *(int*)to = *(int*)from;
+ *((char*)to+4) = *((char*)from+4); return to;
+ case 6: *(int*)to = *(int*)from;
+ *((short*)to+2) = *((short*)from+2); return to;
+ case 8: *(int*)to = *(int*)from;
+ *((int*)to+1) = *((int*)from+1); return to;
+#endif
+ }
+#endif
+ esi = (long) from;
+ edi = (long) to;
+ if (n >= 5*4) {
+ /* large block: use rep prefix */
+ int ecx;
+ __asm__ __volatile__(
+ "rep ; movsl"
+ : "=&c" (ecx), "=&D" (edi), "=&S" (esi)
+ : "0" (n/4), "1" (edi),"2" (esi)
+ : "memory"
+ );
+ } else {
+ /* small block: don't clobber ecx + smaller code */
+ if (n >= 4*4) __asm__ __volatile__("movsl"
+ :"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory");
+ if (n >= 3*4) __asm__ __volatile__("movsl"
+ :"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory");
+ if (n >= 2*4) __asm__ __volatile__("movsl"
+ :"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory");
+ if (n >= 1*4) __asm__ __volatile__("movsl"
+ :"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory");
+ }
switch (n % 4) {
- case 0: COMMON(""); return to;
- case 1: COMMON("\n\tmovsb"); return to;
- case 2: COMMON("\n\tmovsw"); return to;
- default: COMMON("\n\tmovsw\n\tmovsb"); return to;
+ /* tail */
+ case 0: return to;
+ case 1: __asm__ __volatile__("movsb"
+ :"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory");
+ return to;
+ case 2: __asm__ __volatile__("movsw"
+ :"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory");
+ return to;
+ default: __asm__ __volatile__("movsw\n\tmovsb"
+ :"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory");
+ return to;
}
}
-
-#undef COMMON
-}
#define __HAVE_ARCH_MEMCPY
^ permalink raw reply [flat|nested] 24+ messages in thread
end of thread, other threads:[~2005-04-07 7:32 UTC | newest]
Thread overview: 24+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2005-03-29 14:37 memcpy(a,b,CONST) is not inlined by gcc 3.4.1 in Linux kernel Denis Vlasenko
2005-03-29 15:06 ` Richard Guenther
2005-03-29 15:08 ` Nathan Sidwell
2005-03-29 15:13 ` Jakub Jelinek
2005-03-29 15:42 ` Andrew Pinski
2005-03-30 2:27 ` Gerold Jury
2005-03-30 6:15 ` Denis Vlasenko
2005-04-01 21:43 ` Jan Hubicka
2005-04-02 12:18 ` Denis Vlasenko
2005-04-02 12:26 ` Denis Vlasenko
2005-04-05 16:34 ` [BUG mm] "fixed" i386 memcpy inlining buggy Christophe Saout
2005-04-06 10:14 ` Denis Vlasenko
2005-04-06 11:05 ` Dave Korn
2005-04-06 11:13 ` Dave Korn
2005-04-06 11:53 ` Dave Korn
2005-04-06 11:56 ` Dave Korn
2005-04-06 13:18 ` Richard B. Johnson
2005-04-06 14:16 ` Denis Vlasenko
2005-04-06 12:05 ` Christophe Saout
2005-04-06 12:36 ` Andrew Haley
2005-04-06 15:18 ` Paolo Bonzini
2005-04-06 16:11 ` Denis Vlasenko
2005-03-29 20:22 ` [PATCH] fix i386 memcpy Denis Vlasenko
2005-03-29 20:24 ` Denis Vlasenko
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox