* memcpy(a,b,CONST) is not inlined by gcc 3.4.1 in Linux kernel
@ 2005-03-29 14:37 Denis Vlasenko
2005-03-29 15:06 ` Richard Guenther
` (3 more replies)
0 siblings, 4 replies; 24+ messages in thread
From: Denis Vlasenko @ 2005-03-29 14:37 UTC (permalink / raw)
To: linux-kernel, gcc
Try testcase below the sig.
This causes nearly one thousand calls to memcpy in my kernel
(not an allyesconfig one):
# objdump -d vmlinux | grep -F '<memcpy>' | wc -l
959
# gcc -O2 -c t.c
# objdump -r -d t.o
t.o: file format elf32-i386
Disassembly of section .text:
00000000 <f3>:
0: 55 push %ebp
1: 89 e5 mov %esp,%ebp
3: 83 ec 0c sub $0xc,%esp
6: 6a 03 push $0x3
8: ff 75 0c pushl 0xc(%ebp)
b: ff 75 08 pushl 0x8(%ebp)
e: e8 fc ff ff ff call f <f3+0xf>
f: R_386_PC32 memcpy
13: 83 c4 10 add $0x10,%esp
16: c9 leave
17: c3 ret
00000018 <f3b>:
18: 55 push %ebp
19: 89 e5 mov %esp,%ebp
1b: 8b 55 0c mov 0xc(%ebp),%edx
1e: 66 8b 02 mov (%edx),%ax
21: 8b 4d 08 mov 0x8(%ebp),%ecx
24: 66 89 01 mov %ax,(%ecx)
27: 8a 42 02 mov 0x2(%edx),%al
2a: 88 41 02 mov %al,0x2(%ecx)
2d: c9 leave
2e: c3 ret
2f: 90 nop
00000030 <f3k>:
30: 55 push %ebp
31: 89 e5 mov %esp,%ebp
33: 57 push %edi
34: 56 push %esi
35: 8b 7d 08 mov 0x8(%ebp),%edi
38: 8b 75 0c mov 0xc(%ebp),%esi
3b: b9 ee 02 00 00 mov $0x2ee,%ecx
40: f3 a5 repz movsl %ds:(%esi),%es:(%edi)
42: 5e pop %esi
43: 5f pop %edi
44: c9 leave
45: c3 ret
--
vda
/*
 * Kernel-style freestanding build: no libc headers, so size_t is
 * declared locally. NOTE(review): this shadows the implementation's
 * size_t in a hosted build -- intended for -ffreestanding use only.
 */
typedef unsigned int size_t;
/*
 * Variable-length i386 memcpy: copy n bytes from 'from' to 'to' and
 * return 'to' (the standard memcpy contract).
 *
 * "rep ; movsl" copies n/4 dwords; the remaining n%4 bytes are handled
 * without a loop by testing bit 1 and bit 0 of n (%b4 is the byte part
 * of the "q" operand holding n) and issuing at most one movsw and one
 * movsb.
 *
 * Constraint notes:
 *  - d0/d1/d2 are dummy outputs pinning ecx/edi/esi ("=&c"/"=&D"/"=&S",
 *    early-clobbered); the matching "0"/"1"/"2" inputs preload n/4,
 *    'to' and 'from' into those same registers.
 *  - "q" (n) forces n into a byte-addressable register so testb works.
 *  - __volatile__ plus the "memory" clobber stop the compiler from
 *    reordering or caching memory accesses across the copy.
 */
static inline void * __memcpy(void * to, const void * from, size_t n)
{
int d0, d1, d2;
__asm__ __volatile__(
"rep ; movsl\n\t"
"testb $2,%b4\n\t"
"je 1f\n\t"
"movsw\n"
"1:\ttestb $1,%b4\n\t"
"je 2f\n\t"
"movsb\n"
"2:"
: "=&c" (d0), "=&D" (d1), "=&S" (d2)
:"0" (n/4), "q" (n),"1" ((long) to),"2" ((long) from)
: "memory");
return (to);
}
/*
 * Constant-length memcpy: 'n' is expected to be a compile-time
 * constant, so the switch on n%4 collapses to a single case and the
 * whole function optimizes into straight-line code.
 *
 * NOTE(review): gcc < 4.0 runs no constant propagation before builtins
 * are expanded, so the __builtin_memcpy below sees a variable 'n' and
 * emits an out-of-line call -- the exact problem discussed in this
 * thread. Hoisting the n <= 128 test into the memcpy() macro (or
 * building with gcc >= 4.0) avoids it.
 */
static inline void * __constant_memcpy(void * to, const void * from, size_t n)
{
/* Small copies: let the compiler's builtin choose the expansion. */
if (n <= 128)
return __builtin_memcpy(to, from, n);
/*
 * COMMON(x): a single asm block doing "rep ; movsl" for n/4 dwords
 * followed by the tail instructions 'x' (movsw/movsb as needed).
 * Constraints mirror __memcpy() above: ecx/edi/esi are early-clobbered
 * dummy outputs with matching inputs preloading n/4, 'to' and 'from';
 * "memory" keeps the compiler honest about the bytes being written.
 */
#define COMMON(x) \
__asm__ __volatile__( \
"rep ; movsl" \
x \
: "=&c" (d0), "=&D" (d1), "=&S" (d2) \
: "0" (n/4),"1" ((long) to),"2" ((long) from) \
: "memory");
{
int d0, d1, d2;
/* Tail of n%4 bytes: 0, 1 (movsb), 2 (movsw) or 3 (movsw+movsb). */
switch (n % 4) {
case 0: COMMON(""); return to;
case 1: COMMON("\n\tmovsb"); return to;
case 2: COMMON("\n\tmovsw"); return to;
default: COMMON("\n\tmovsw\n\tmovsb"); return to;
}
}
#undef COMMON
}
/*
 * memcpy() dispatcher: with a compile-time-constant length use the
 * constant-size expansion, otherwise fall back to the asm-based
 * variable-length __memcpy().
 *
 * The (n) <= 128 __builtin_memcpy case is hoisted into the macro
 * itself: gcc < 4.0 performs no constant propagation before expanding
 * builtins, so a __builtin_memcpy() reached through the inline
 * __constant_memcpy() sees a *variable* length and emits an
 * out-of-line call to memcpy (the ~1000 calls observed in vmlinux).
 * Testing the literal directly in the macro keeps the builtin
 * expansion inline on gcc 3.x as well; on gcc >= 4.0 both forms
 * behave identically.
 */
#define memcpy(t, f, n) \
	(__builtin_constant_p(n) ? \
		((n) <= 128 ? \
			__builtin_memcpy((t),(f),(n)) : \
			__constant_memcpy((t),(f),(n))) : \
		__memcpy((t),(f),(n)))
/*
 * Codegen probe: copy 3 bytes via the memcpy() macro (constant size).
 * Returns 0. The original testcase was declared int but fell off the
 * end of the function, which is undefined behavior if the caller ever
 * reads the result (C11 6.9.1p12) -- an explicit return fixes that
 * without changing the copy being measured.
 */
int f3(char *a, char *b) { memcpy(a, b, 3); return 0; }
/*
 * Codegen probe: copy 3 bytes via __builtin_memcpy directly, bypassing
 * the memcpy() macro, to compare the compiler's builtin expansion.
 * Returns 0; the original fell off the end of a non-void function
 * (undefined behavior if the result is read, C11 6.9.1p12).
 */
int f3b(char *a, char *b) { __builtin_memcpy(a, b, 3); return 0; }
/*
 * Codegen probe: copy 3000 bytes (constant, > 128) via the memcpy()
 * macro, exercising the large-block "rep movsl" path. Returns 0; the
 * original fell off the end of a non-void function (undefined behavior
 * if the result is read, C11 6.9.1p12).
 */
int f3k(char *a, char *b) { memcpy(a, b, 3000); return 0; }
^ permalink raw reply [flat|nested] 24+ messages in thread* Re: memcpy(a,b,CONST) is not inlined by gcc 3.4.1 in Linux kernel 2005-03-29 14:37 memcpy(a,b,CONST) is not inlined by gcc 3.4.1 in Linux kernel Denis Vlasenko @ 2005-03-29 15:06 ` Richard Guenther 2005-03-29 15:08 ` Nathan Sidwell ` (2 subsequent siblings) 3 siblings, 0 replies; 24+ messages in thread From: Richard Guenther @ 2005-03-29 15:06 UTC (permalink / raw) To: Denis Vlasenko; +Cc: linux-kernel, gcc On Tue, 29 Mar 2005 17:37:06 +0300, Denis Vlasenko <vda@ilport.com.ua> wrote: > Try testcase below the sig. > > This causes nearly one thousand calls to memcpy in my kernel > (not an allyesconfig one): > static inline void * __memcpy(void * to, const void * from, size_t n) > { > int d0, d1, d2; > __asm__ __volatile__( > "rep ; movsl\n\t" > "testb $2,%b4\n\t" > "je 1f\n\t" > "movsw\n" > "1:\ttestb $1,%b4\n\t" > "je 2f\n\t" > "movsb\n" > "2:" > : "=&c" (d0), "=&D" (d1), "=&S" (d2) > :"0" (n/4), "q" (n),"1" ((long) to),"2" ((long) from) > : "memory"); > return (to); > } The question is, what reason does -Winline give for this inlining decision? And then of course, how is the size estimate counted for the above. What kind of tree node do we get for the ASM expression? Richard. ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: memcpy(a,b,CONST) is not inlined by gcc 3.4.1 in Linux kernel 2005-03-29 14:37 memcpy(a,b,CONST) is not inlined by gcc 3.4.1 in Linux kernel Denis Vlasenko 2005-03-29 15:06 ` Richard Guenther @ 2005-03-29 15:08 ` Nathan Sidwell 2005-03-29 15:13 ` Jakub Jelinek 2005-03-29 20:22 ` [PATCH] fix i386 memcpy Denis Vlasenko 3 siblings, 0 replies; 24+ messages in thread From: Nathan Sidwell @ 2005-03-29 15:08 UTC (permalink / raw) To: Denis Vlasenko; +Cc: linux-kernel, gcc Denis Vlasenko wrote: > Disassembly of section .text: > > e: e8 fc ff ff ff call f <f3+0xf> > f: R_386_PC32 memcpy > #define memcpy(t, f, n) \ > (__builtin_constant_p(n) ? \ > __constant_memcpy((t),(f),(n)) : \ > __memcpy((t),(f),(n))) given this #define, how can 'memcpy' appear in the object file? It appears that something odd is happening with preprocessing. Check the .i files are as you expect. -dD and -E options will be helpful to you. nathan -- Nathan Sidwell :: http://www.codesourcery.com :: CodeSourcery LLC nathan@codesourcery.com :: http://www.planetfall.pwp.blueyonder.co.uk ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: memcpy(a,b,CONST) is not inlined by gcc 3.4.1 in Linux kernel 2005-03-29 14:37 memcpy(a,b,CONST) is not inlined by gcc 3.4.1 in Linux kernel Denis Vlasenko 2005-03-29 15:06 ` Richard Guenther 2005-03-29 15:08 ` Nathan Sidwell @ 2005-03-29 15:13 ` Jakub Jelinek 2005-03-29 15:42 ` Andrew Pinski 2005-03-29 20:22 ` [PATCH] fix i386 memcpy Denis Vlasenko 3 siblings, 1 reply; 24+ messages in thread From: Jakub Jelinek @ 2005-03-29 15:13 UTC (permalink / raw) To: Denis Vlasenko; +Cc: linux-kernel, gcc On Tue, Mar 29, 2005 at 05:37:06PM +0300, Denis Vlasenko wrote: > typedef unsigned int size_t; > > static inline void * __memcpy(void * to, const void * from, size_t n) > { > int d0, d1, d2; > __asm__ __volatile__( > "rep ; movsl\n\t" > "testb $2,%b4\n\t" > "je 1f\n\t" > "movsw\n" > "1:\ttestb $1,%b4\n\t" > "je 2f\n\t" > "movsb\n" > "2:" > : "=&c" (d0), "=&D" (d1), "=&S" (d2) > :"0" (n/4), "q" (n),"1" ((long) to),"2" ((long) from) > : "memory"); > return (to); > } > > /* > * This looks horribly ugly, but the compiler can optimize it totally, > * as the count is constant. > */ > static inline void * __constant_memcpy(void * to, const void * from, size_t n) > { > if (n <= 128) > return __builtin_memcpy(to, from, n); > > #define COMMON(x) \ > __asm__ __volatile__( \ > "rep ; movsl" \ > x \ > : "=&c" (d0), "=&D" (d1), "=&S" (d2) \ > : "0" (n/4),"1" ((long) to),"2" ((long) from) \ > : "memory"); > { > int d0, d1, d2; > switch (n % 4) { > case 0: COMMON(""); return to; > case 1: COMMON("\n\tmovsb"); return to; > case 2: COMMON("\n\tmovsw"); return to; > default: COMMON("\n\tmovsw\n\tmovsb"); return to; > } > } > > #undef COMMON > } > > #define memcpy(t, f, n) \ > (__builtin_constant_p(n) ? 
\ > __constant_memcpy((t),(f),(n)) : \ > __memcpy((t),(f),(n))) > > int f3(char *a, char *b) { memcpy(a,b,3); } The problem is that in GCC < 4.0 there is no constant propagation pass before expanding builtin functions, so the __builtin_memcpy call above sees a variable rather than a constant. Either use GCC 4.0+, where this works just fine, or move the n <= 128 case into the macro: #define memcpy(t, f, n) \ (__builtin_constant_p(n) ? \ ((n) <= 128 ? __builtin_memcpy(t,f,n) : __constant_memcpy(t,f,n) : \ __memcpy(t,f,n)) Jakub ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: memcpy(a,b,CONST) is not inlined by gcc 3.4.1 in Linux kernel 2005-03-29 15:13 ` Jakub Jelinek @ 2005-03-29 15:42 ` Andrew Pinski 2005-03-30 2:27 ` Gerold Jury 0 siblings, 1 reply; 24+ messages in thread From: Andrew Pinski @ 2005-03-29 15:42 UTC (permalink / raw) To: jakub; +Cc: Denis Vlasenko, linux-kernel, gcc > > On Tue, Mar 29, 2005 at 05:37:06PM +0300, Denis Vlasenko wrote: > > /* > > * This looks horribly ugly, but the compiler can optimize it totally, > > * as the count is constant. > > */ > > static inline void * __constant_memcpy(void * to, const void * from, size_t n) > > { > > if (n <= 128) > > return __builtin_memcpy(to, from, n); > The problem is that in GCC < 4.0 there is no constant propagation > pass before expanding builtin functions, so the __builtin_memcpy > call above sees a variable rather than a constant. or change "size_t n" to "const size_t n" will also fix the issue. As we do some (well very little and with inlining and const values) const progation before 4.0.0 on the trees before expanding the builtin. -- Pinski ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: memcpy(a,b,CONST) is not inlined by gcc 3.4.1 in Linux kernel 2005-03-29 15:42 ` Andrew Pinski @ 2005-03-30 2:27 ` Gerold Jury 2005-03-30 6:15 ` Denis Vlasenko 0 siblings, 1 reply; 24+ messages in thread From: Gerold Jury @ 2005-03-30 2:27 UTC (permalink / raw) To: Linux Kernel Mailing List >> On Tue, Mar 29, 2005 at 05:37:06PM +0300, Denis Vlasenko wrote: >> > /* >> > * This looks horribly ugly, but the compiler can optimize it totally, >> > * as the count is constant. >> > */ >> > static inline void * __constant_memcpy(void * to, const void * from, >> > size_t n) { >> > if (n <= 128) >> > return __builtin_memcpy(to, from, n); >> >> The problem is that in GCC < 4.0 there is no constant propagation >> pass before expanding builtin functions, so the __builtin_memcpy >> call above sees a variable rather than a constant. > >or change "size_t n" to "const size_t n" will also fix the issue. >As we do some (well very little and with inlining and const values) >const progation before 4.0.0 on the trees before expanding the builtin. > >-- Pinski >- I used the following "const size_t n" change on x86_64 and it reduced the memcpy count from 1088 to 609 with my setup and gcc 3.4.3. (kernel 2.6.12-rc1, running now) --- include/asm-x86_64/string.h.~1~ 2005-03-02 08:38:33.000000000 +0100 +++ include/asm-x86_64/string.h 2005-03-30 03:24:35.000000000 +0200 @@ -28,9 +28,9 @@ function. */ #define __HAVE_ARCH_MEMCPY 1 -extern void *__memcpy(void *to, const void *from, size_t len); +extern void *__memcpy(void *to, const void *from, const size_t len); #define memcpy(dst,src,len) \ - ({ size_t __len = (len); \ + ({ const size_t __len = (len); \ void *__ret; \ if (__builtin_constant_p(len) && __len >= 64) \ __ret = __memcpy((dst),(src),__len); \ ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: memcpy(a,b,CONST) is not inlined by gcc 3.4.1 in Linux kernel 2005-03-30 2:27 ` Gerold Jury @ 2005-03-30 6:15 ` Denis Vlasenko 2005-04-01 21:43 ` Jan Hubicka 0 siblings, 1 reply; 24+ messages in thread From: Denis Vlasenko @ 2005-03-30 6:15 UTC (permalink / raw) To: Gerold Jury, jakub; +Cc: Linux Kernel Mailing List, gcc On Wednesday 30 March 2005 05:27, Gerold Jury wrote: > > >> On Tue, Mar 29, 2005 at 05:37:06PM +0300, Denis Vlasenko wrote: > >> > /* > >> > * This looks horribly ugly, but the compiler can optimize it totally, > >> > * as the count is constant. > >> > */ > >> > static inline void * __constant_memcpy(void * to, const void * from, > >> > size_t n) { > >> > if (n <= 128) > >> > return __builtin_memcpy(to, from, n); > >> > >> The problem is that in GCC < 4.0 there is no constant propagation > >> pass before expanding builtin functions, so the __builtin_memcpy > >> call above sees a variable rather than a constant. > > > >or change "size_t n" to "const size_t n" will also fix the issue. > >As we do some (well very little and with inlining and const values) > >const progation before 4.0.0 on the trees before expanding the builtin. > > > >-- Pinski > >- > I used the following "const size_t n" change on x86_64 > and it reduced the memcpy count from 1088 to 609 with my setup and gcc 3.4.3. > (kernel 2.6.12-rc1, running now) What do you mean, 'reduced'? (/me is checking....) Oh shit... 
It still emits half of memcpys, to be exact - for struct copies: arch/i386/kernel/process.c: int copy_thread(int nr, unsigned long clone_flags, unsigned long esp, unsigned long unused, struct task_struct * p, struct pt_regs * regs) { struct pt_regs * childregs; struct task_struct *tsk; int err; childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p->thread_info)) - 1; *childregs = *regs; ^^^^^^^^^^^^^^^^^^^ childregs->eax = 0; childregs->esp = esp; # make arch/i386/kernel/process.s copy_thread: pushl %ebp movl %esp, %ebp pushl %edi pushl %esi pushl %ebx subl $20, %esp movl 24(%ebp), %eax movl 4(%eax), %esi pushl $60 leal 8132(%esi), %ebx pushl 28(%ebp) pushl %ebx call memcpy <================= movl $0, 24(%ebx) movl 16(%ebp), %eax movl %eax, 52(%ebx) movl 24(%ebp), %edx addl $8192, %esi movl %ebx, 516(%edx) movl %esi, -32(%ebp) movl %esi, 504(%edx) movl $ret_from_fork, 512(%edx) Jakub, is there a way to instruct gcc to inine this copy, or better yet, to use user-supplied inline version of memcpy? -- vda ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: memcpy(a,b,CONST) is not inlined by gcc 3.4.1 in Linux kernel 2005-03-30 6:15 ` Denis Vlasenko @ 2005-04-01 21:43 ` Jan Hubicka 2005-04-02 12:18 ` Denis Vlasenko 0 siblings, 1 reply; 24+ messages in thread From: Jan Hubicka @ 2005-04-01 21:43 UTC (permalink / raw) To: Denis Vlasenko; +Cc: Gerold Jury, jakub, Linux Kernel Mailing List, gcc > On Wednesday 30 March 2005 05:27, Gerold Jury wrote: > > > > >> On Tue, Mar 29, 2005 at 05:37:06PM +0300, Denis Vlasenko wrote: > > >> > /* > > >> > * This looks horribly ugly, but the compiler can optimize it totally, > > >> > * as the count is constant. > > >> > */ > > >> > static inline void * __constant_memcpy(void * to, const void * from, > > >> > size_t n) { > > >> > if (n <= 128) > > >> > return __builtin_memcpy(to, from, n); > > >> > > >> The problem is that in GCC < 4.0 there is no constant propagation > > >> pass before expanding builtin functions, so the __builtin_memcpy > > >> call above sees a variable rather than a constant. > > > > > >or change "size_t n" to "const size_t n" will also fix the issue. > > >As we do some (well very little and with inlining and const values) > > >const progation before 4.0.0 on the trees before expanding the builtin. > > > > > >-- Pinski > > >- > > I used the following "const size_t n" change on x86_64 > > and it reduced the memcpy count from 1088 to 609 with my setup and gcc 3.4.3. > > (kernel 2.6.12-rc1, running now) > > What do you mean, 'reduced'? > > (/me is checking....) > > Oh shit... 
It still emits half of memcpys, to be exact - for > struct copies: > > arch/i386/kernel/process.c: > > int copy_thread(int nr, unsigned long clone_flags, unsigned long esp, > unsigned long unused, > struct task_struct * p, struct pt_regs * regs) > { > struct pt_regs * childregs; > struct task_struct *tsk; > int err; > > childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p->thread_info)) - 1; > *childregs = *regs; > ^^^^^^^^^^^^^^^^^^^ > childregs->eax = 0; > childregs->esp = esp; > > # make arch/i386/kernel/process.s > > copy_thread: > pushl %ebp > movl %esp, %ebp > pushl %edi > pushl %esi > pushl %ebx > subl $20, %esp > movl 24(%ebp), %eax > movl 4(%eax), %esi > pushl $60 > leal 8132(%esi), %ebx > pushl 28(%ebp) > pushl %ebx > call memcpy <================= > movl $0, 24(%ebx) > movl 16(%ebp), %eax > movl %eax, 52(%ebx) > movl 24(%ebp), %edx > addl $8192, %esi > movl %ebx, 516(%edx) > movl %esi, -32(%ebp) > movl %esi, 504(%edx) > movl $ret_from_fork, 512(%edx) > > Jakub, is there a way to instruct gcc to inine this copy, or better yet, > to use user-supplied inline version of memcpy? You can't inline struct copy as it is not function call at first place. You can experiment with -minline-all-stringops where GCC will use it's own memcpy implementation for this. Honza > -- > vda ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: memcpy(a,b,CONST) is not inlined by gcc 3.4.1 in Linux kernel 2005-04-01 21:43 ` Jan Hubicka @ 2005-04-02 12:18 ` Denis Vlasenko 2005-04-02 12:26 ` Denis Vlasenko 0 siblings, 1 reply; 24+ messages in thread From: Denis Vlasenko @ 2005-04-02 12:18 UTC (permalink / raw) To: Jan Hubicka; +Cc: Gerold Jury, jakub, Linux Kernel Mailing List, gcc > > childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p->thread_info)) - 1; > > *childregs = *regs; > > ^^^^^^^^^^^^^^^^^^^ > > childregs->eax = 0; > > childregs->esp = esp; > > > > # make arch/i386/kernel/process.s > > > > copy_thread: > > pushl %ebp > > movl %esp, %ebp > > pushl %edi > > pushl %esi > > pushl %ebx > > subl $20, %esp > > movl 24(%ebp), %eax > > movl 4(%eax), %esi > > pushl $60 > > leal 8132(%esi), %ebx > > pushl 28(%ebp) > > pushl %ebx > > call memcpy <================= > > movl $0, 24(%ebx) > > > > Jakub, is there a way to instruct gcc to inine this copy, or better yet, > > to use user-supplied inline version of memcpy? > > You can't inline struct copy as it is not function call at first place. > You can experiment with -minline-all-stringops where GCC will use it's > own memcpy implementation for this. No luck. Actually, memcpy calls are produced with -Os. Adding -minline-all-stringops changes nothing. -O2 compile does inline copying, however, suboptimally. Pushing/popping esi/edi on the stack is not needed. Also "mov $1,ecx; rep; movsl" is rather silly. Here what did I test: t.c: #define STRUCT1(n) struct s##n { char c[n]; } v##n, w##n; void f##n(void) { v##n = w##n; } #define STRUCT(n) STRUCT1(n) STRUCT(1) STRUCT(2) STRUCT(3) STRUCT(4) STRUCT(5) STRUCT(6) STRUCT(7) STRUCT(8) STRUCT(9) STRUCT(10) STRUCT(11) STRUCT(12) STRUCT(13) STRUCT(14) STRUCT(15) STRUCT(16) STRUCT(17) STRUCT(18) STRUCT(19) STRUCT(20) mk.sh: #!/bin/sh # yeah yeah. 
push/pop + 1 repetition 'rep movsl' # 88: 55 push %ebp # 89: 89 e5 mov %esp,%ebp # 8b: 57 push %edi # 8c: 56 push %esi # 8d: fc cld # 8e: bf 00 00 00 00 mov $0x0,%edi # 8f: R_386_32 v7 # 93: be 00 00 00 00 mov $0x0,%esi # 94: R_386_32 w7 # 98: b9 01 00 00 00 mov $0x1,%ecx # 9d: f3 a5 repz movsl %ds:(%esi),%es:(%edi) # 9f: 66 a5 movsw %ds:(%esi),%es:(%edi) # a1: a4 movsb %ds:(%esi),%es:(%edi) # a2: 5e pop %esi # a3: 5f pop %edi # a4: c9 leave # a5: c3 ret # a6: 89 f6 mov %esi,%esi if false; then gcc -O2 \ falign-functions=1 -falign-labels=1 -falign-loops=1 -falign-jumps=1 \ -c t.c echo Done; read junk objdump -d -r t.o | $PAGER exit fi # -Os: emits memcpy if false; then gcc -nostdinc -isystem /.share/usr/app/gcc-3.4.1/bin/../lib/gcc/i386-pc-linux-gnu/3.4.1/include \ -Wall -Wstrict-prototypes -Wno-trigraphs -fno-strict-aliasing \ -fno-common -ffreestanding -Os -falign-functions=1 -falign-labels=1 \ -falign-loops=1 -falign-jumps=1 -fno-omit-frame-pointer -pipe -msoft-float \ -mpreferred-stack-boundary=2 -fno-unit-at-a-time -march=i486 \ -Wdeclaration-after-statement -c t.c echo Done; read junk objdump -d -r t.o | $PAGER exit fi # -march=486: emits horrible tail: # 271: f3 a5 repz movsl %ds:(%esi),%es:(%edi) # 273: 5e pop %esi # 274: 66 a1 10 00 00 00 mov 0x10,%ax # 276: R_386_32 w19 # 27a: 5f pop %edi # 27b: 66 a3 10 00 00 00 mov %ax,0x10 # 27d: R_386_32 v19 # 281: 5d pop %ebp # 282: a0 12 00 00 00 mov 0x12,%al # 283: R_386_32 w19 # 287: a2 12 00 00 00 mov %al,0x12 # 288: R_386_32 v19 # 28c: c3 ret if false; then gcc \ -fno-common -ffreestanding -O2 -falign-functions=1 -falign-labels=1 \ -falign-loops=1 -falign-jumps=1 -fno-omit-frame-pointer -pipe -msoft-float \ -mpreferred-stack-boundary=2 -fno-unit-at-a-time -march=i486 \ -Wdeclaration-after-statement \ -c t.c echo Done; read junk objdump -d -r t.o | $PAGER exit fi # -Os -minline-all-stringops: still emits memcpy if true; then gcc -nostdinc -isystem 
/.share/usr/app/gcc-3.4.1/bin/../lib/gcc/i386-pc-linux-gnu/3.4.1/include \ -Wall -Wstrict-prototypes -Wno-trigraphs -fno-strict-aliasing \ -fno-common -ffreestanding -Os -minline-all-stringops -falign-functions=1 -falign-labels=1 \ -falign-loops=1 -falign-jumps=1 -fno-omit-frame-pointer -pipe -msoft-float \ -mpreferred-stack-boundary=2 -fno-unit-at-a-time -march=i486 \ -Wdeclaration-after-statement -c t.c echo Done; read junk objdump -d -r t.o | $PAGER exit fi -- vda ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: memcpy(a,b,CONST) is not inlined by gcc 3.4.1 in Linux kernel 2005-04-02 12:18 ` Denis Vlasenko @ 2005-04-02 12:26 ` Denis Vlasenko 2005-04-05 16:34 ` [BUG mm] "fixed" i386 memcpy inlining buggy Christophe Saout 0 siblings, 1 reply; 24+ messages in thread From: Denis Vlasenko @ 2005-04-02 12:26 UTC (permalink / raw) To: Jan Hubicka; +Cc: Gerold Jury, jakub, Linux Kernel Mailing List, gcc [-- Attachment #1: Type: text/plain, Size: 360 bytes --] On Saturday 02 April 2005 15:18, Denis Vlasenko wrote: > -O2 compile does inline copying, however, suboptimally. > Pushing/popping esi/edi on the stack is not needed. > Also "mov $1,ecx; rep; movsl" is rather silly. I think I am wrong about push/pop. Sorry. However, other observation is still valid. You may wish to compile this updated t.c and see. -- vda [-- Attachment #2: t.c --] [-- Type: text/x-csrc, Size: 2513 bytes --] static inline void * __memcpy(void * to, const void * from, int n) { int d0, d1, d2; __asm__ __volatile__( "rep ; movsl\n\t" "movl %4,%%ecx\n\t" "andl $3,%%ecx\n\t" "jz 1f\n\t" /* pay 2 byte penalty for a chance to skip microcoded rep */ "rep ; movsb\n\t" "1:" : "=&c" (d0), "=&D" (d1), "=&S" (d2) : "0" (n/4), "g" (n), "1" ((long) to), "2" ((long) from) : "memory"); return (to); } /* * This looks ugly, but the compiler can optimize it totally, * as the count is constant. */ static inline void * __constant_memcpy(void * to, const void * from, int n) { #if 1 /* want to do small copies with non-string ops? */ switch (n) { case 0: return to; case 1: *(char*)to = *(char*)from; return to; case 2: *(short*)to = *(short*)from; return to; case 4: *(int*)to = *(int*)from; return to; #if 1 /* including those doable with two moves? 
*/ case 3: *(short*)to = *(short*)from; *((char*)to+2) = *((char*)from+2); return to; case 5: *(int*)to = *(int*)from; *((char*)to+4) = *((char*)from+4); return to; case 6: *(int*)to = *(int*)from; *((short*)to+2) = *((short*)from+2); return to; case 8: *(int*)to = *(int*)from; *((int*)to+1) = *((int*)from+1); return to; #endif } #else if (!n) return to; #endif { /* load esi/edi */ int esi, edi; __asm__ __volatile__( "" : "=&D" (edi), "=&S" (esi) : "0" ((long) to),"1" ((long) from) : "memory" ); } if (n >= 5*4) { /* large block: use rep prefix */ int ecx; __asm__ __volatile__( "rep ; movsl" : "=&c" (ecx) : "0" (n/4) ); } else { /* small block: don't clobber ecx + smaller code */ if (n >= 4*4) __asm__ __volatile__("movsl"); if (n >= 3*4) __asm__ __volatile__("movsl"); if (n >= 2*4) __asm__ __volatile__("movsl"); if (n >= 1*4) __asm__ __volatile__("movsl"); } switch (n % 4) { /* tail */ case 0: return to; case 1: __asm__ __volatile__("movsb"); return to; case 2: __asm__ __volatile__("movsw"); return to; default: __asm__ __volatile__("movsw\n\tmovsb"); return to; } } #define memcpy(t, f, n) \ (__builtin_constant_p(n) ? \ __constant_memcpy((t),(f),(n)) : \ __memcpy((t),(f),(n))) #define STRUCT1(n) struct s##n { char c[n]; } v##n, w##n; void f##n(void) { v##n = w##n; } void g##n(void) { memcpy(&v##n,&w##n,n); } #define STRUCT(n) STRUCT1(n) STRUCT(1) STRUCT(2) STRUCT(3) STRUCT(4) STRUCT(5) STRUCT(6) STRUCT(7) STRUCT(8) STRUCT(9) STRUCT(10) STRUCT(11) STRUCT(12) STRUCT(13) STRUCT(14) STRUCT(15) STRUCT(16) STRUCT(17) STRUCT(18) STRUCT(19) STRUCT(20) ^ permalink raw reply [flat|nested] 24+ messages in thread
* [BUG mm] "fixed" i386 memcpy inlining buggy 2005-04-02 12:26 ` Denis Vlasenko @ 2005-04-05 16:34 ` Christophe Saout 2005-04-06 10:14 ` Denis Vlasenko 2005-04-06 16:11 ` Denis Vlasenko 0 siblings, 2 replies; 24+ messages in thread From: Christophe Saout @ 2005-04-05 16:34 UTC (permalink / raw) To: Denis Vlasenko Cc: Andrew Morton, Jan Hubicka, Gerold Jury, jakub, Linux Kernel Mailing List, gcc [-- Attachment #1: Type: text/plain, Size: 1798 bytes --] Hi Denis, the new i386 memcpy macro is a ticking timebomb. I've been debugging a new mISDN crash, just to find out that a memcpy was not inlined correctly. Andrew, you should drop the fix-i386-memcpy.patch (or have it fixed). This source code: mISDN_pid_t pid; [...] memcpy(&st->mgr->pid, &pid, sizeof(mISDN_pid_t)); was compiled as: lea 0xffffffa4(%ebp),%esi <---- %esi is loaded ( add $0x10,%ebx ) ( mov %ebx,%eax ) something else ( call 1613 <test_stack_protocol+0x83> ) %esi preserved mov 0xffffffa0(%ebp),%edx mov 0x74(%edx),%edi <---- %edi is loaded add $0x20,%edi offset in structure added ! mov $0x14,%esi !!!!!! <---- %esi overwritten! mov %esi,%ecx <---- %ecx loaded repz movsl %ds:(%esi),%es:(%edi) Apparently the compiled decided that the value 0x14 could be reused afterwards (which it does for an inlined memset of the same size some instructions below) and clobbers %esi. Looking at the macro: __asm__ __volatile__( "" : "=&D" (edi), "=&S" (esi) : "0" ((long) to),"1" ((long) from) : "memory" ); } if (n >= 5*4) { /* large block: use rep prefix */ int ecx; __asm__ __volatile__( "rep ; movsl" : "=&c" (ecx) it seems obvious that the compiled assumes it can reuse %esi and %edi for something else between the two __asm__ sections. These should probably be merged. [-- Attachment #2: This is a digitally signed message part --] [-- Type: application/pgp-signature, Size: 189 bytes --] ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [BUG mm] "fixed" i386 memcpy inlining buggy 2005-04-05 16:34 ` [BUG mm] "fixed" i386 memcpy inlining buggy Christophe Saout @ 2005-04-06 10:14 ` Denis Vlasenko 2005-04-06 11:05 ` Dave Korn 2005-04-06 12:05 ` Christophe Saout 2005-04-06 16:11 ` Denis Vlasenko 1 sibling, 2 replies; 24+ messages in thread From: Denis Vlasenko @ 2005-04-06 10:14 UTC (permalink / raw) To: Christophe Saout Cc: Andrew Morton, Jan Hubicka, Gerold Jury, jakub, Linux Kernel Mailing List, gcc On Tuesday 05 April 2005 19:34, Christophe Saout wrote: > Hi Denis, > > the new i386 memcpy macro is a ticking timebomb. > > I've been debugging a new mISDN crash, just to find out that a memcpy > was not inlined correctly. > > Andrew, you should drop the fix-i386-memcpy.patch (or have it fixed). > > This source code: > > mISDN_pid_t pid; > [...] > memcpy(&st->mgr->pid, &pid, sizeof(mISDN_pid_t)); > > was compiled as: > > lea 0xffffffa4(%ebp),%esi <---- %esi is loaded > ( add $0x10,%ebx ) > ( mov %ebx,%eax ) something else > ( call 1613 <test_stack_protocol+0x83> ) %esi preserved > mov 0xffffffa0(%ebp),%edx > mov 0x74(%edx),%edi <---- %edi is loaded > add $0x20,%edi offset in structure added > ! mov $0x14,%esi !!!!!! <---- %esi overwritten! > mov %esi,%ecx <---- %ecx loaded > repz movsl %ds:(%esi),%es:(%edi) > > Apparently the compiled decided that the value 0x14 could be reused > afterwards (which it does for an inlined memset of the same size some > instructions below) and clobbers %esi. > > Looking at the macro: > > __asm__ __volatile__( > "" > : "=&D" (edi), "=&S" (esi) > : "0" ((long) to),"1" ((long) from) > : "memory" > ); > } > if (n >= 5*4) { > /* large block: use rep prefix */ > int ecx; > __asm__ __volatile__( > "rep ; movsl" > : "=&c" (ecx) > > it seems obvious that the compiled assumes it can reuse %esi and %edi > for something else between the two __asm__ sections. These should > probably be merged. Oh shit. I was trying to be too clever. 
I still run with this patch, so it must be happening very rarely. Does this one compile ok? static inline void * __constant_memcpy(void * to, const void * from, size_t n) { long esi, edi; #if 1 /* want to do small copies with non-string ops? */ switch (n) { case 0: return to; case 1: *(char*)to = *(char*)from; return to; case 2: *(short*)to = *(short*)from; return to; case 4: *(int*)to = *(int*)from; return to; #if 1 /* including those doable with two moves? */ case 3: *(short*)to = *(short*)from; *((char*)to+2) = *((char*)from+2); return to; case 5: *(int*)to = *(int*)from; *((char*)to+4) = *((char*)from+4); return to; case 6: *(int*)to = *(int*)from; *((short*)to+2) = *((short*)from+2); return to; case 8: *(int*)to = *(int*)from; *((int*)to+1) = *((int*)from+1); return to; #endif } #else if (!n) return to; #endif { /* load esi/edi */ __asm__ __volatile__( "" : "=&D" (edi), "=&S" (esi) : "0" ((long) to),"1" ((long) from) : "memory" ); } if (n >= 5*4) { /* large block: use rep prefix */ int ecx; __asm__ __volatile__( "rep ; movsl" : "=&c" (ecx), "=&D" (edi), "=&S" (esi) : "0" (n/4), "1" (edi),"2" (esi) : "memory" ); } else { /* small block: don't clobber ecx + smaller code */ if (n >= 4*4) __asm__ __volatile__("movsl":"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory"); if (n >= 3*4) __asm__ __volatile__("movsl":"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory"); if (n >= 2*4) __asm__ __volatile__("movsl":"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory"); if (n >= 1*4) __asm__ __volatile__("movsl":"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory"); } switch (n % 4) { /* tail */ case 0: return to; case 1: __asm__ __volatile__("movsb":"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory"); return to; case 2: __asm__ __volatile__("movsw":"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory"); return to; default: __asm__ __volatile__("movsw\n\tmovsb":"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory"); return to; } } -- vda ^ permalink raw reply [flat|nested] 24+ messages in thread
* RE: [BUG mm] "fixed" i386 memcpy inlining buggy 2005-04-06 10:14 ` Denis Vlasenko @ 2005-04-06 11:05 ` Dave Korn 2005-04-06 11:13 ` Dave Korn 2005-04-06 12:05 ` Christophe Saout 1 sibling, 1 reply; 24+ messages in thread From: Dave Korn @ 2005-04-06 11:05 UTC (permalink / raw) To: 'Denis Vlasenko', 'Christophe Saout' Cc: 'Andrew Morton', 'Jan Hubicka', 'Gerold Jury', jakub, 'Linux Kernel Mailing List', gcc ----Original Message---- >From: Denis Vlasenko >Sent: 06 April 2005 11:14 Is this someone's idea of an April Fool's joke? Because if it is, I've suffered a serious sense-of-humour failure. > Oh shit. I was trying to be too clever. I still run with this patch, > so it must be happening very rarely. The kernel is way too important for cross-your-fingers-and-hope engineering techniques to be applied. This patch should never have been permitted. How on earth could anything like this hope to make it through a strict review? > Does this one compile ok? > { > /* load esi/edi */ > __asm__ __volatile__( > "" > : "=&D" (edi), "=&S" (esi) > : "0" ((long) to),"1" ((long) from) > : "memory" > ); > } > if (n >= 5*4) { > /* large block: use rep prefix */ > int ecx; > __asm__ __volatile__( > "rep ; movsl" > : "=&c" (ecx), "=&D" (edi), "=&S" (esi) > : "0" (n/4), "1" (edi),"2" (esi) > : "memory" > ); It doesn't matter if it compiles or not, it's still *utterly* invalid. You can NOT make assumptions about registers keeping their values between one asm block and another. Immediately after the closing quote of the first asm, the compiler can do ANYTHING IT WANTS and to just _hope_ that it won't use the registers you want is voodoo programming. Even if it works when you try it once, there are zero guarantees that another version or revision of the compiler or even just a tiny change to the source that affects the behaviour of the scheduler when compiling the function won't produce something completely different, meaning that this code is appallingly fragile. 
This code should be completely discarded and rewritten properly. cheers, DaveK -- Can't think of a witty .sigline today.... ^ permalink raw reply [flat|nested] 24+ messages in thread
* RE: [BUG mm] "fixed" i386 memcpy inlining buggy 2005-04-06 11:05 ` Dave Korn @ 2005-04-06 11:13 ` Dave Korn 2005-04-06 11:53 ` Dave Korn 0 siblings, 1 reply; 24+ messages in thread From: Dave Korn @ 2005-04-06 11:13 UTC (permalink / raw) To: 'Dave Korn', 'Denis Vlasenko', 'Christophe Saout' Cc: 'Andrew Morton', 'Jan Hubicka', 'Gerold Jury', jakub, 'Linux Kernel Mailing List', gcc ----Original Message---- >From: Dave Korn >Sent: 06 April 2005 12:06 Me and my big mouth. OK, that one does work. Sorry for the outburst. cheers, DaveK -- Can't think of a witty .sigline today.... ^ permalink raw reply [flat|nested] 24+ messages in thread
* RE: [BUG mm] "fixed" i386 memcpy inlining buggy 2005-04-06 11:13 ` Dave Korn @ 2005-04-06 11:53 ` Dave Korn 2005-04-06 11:56 ` Dave Korn 2005-04-06 13:18 ` Richard B. Johnson 0 siblings, 2 replies; 24+ messages in thread From: Dave Korn @ 2005-04-06 11:53 UTC (permalink / raw) To: 'Dave Korn', 'Denis Vlasenko', 'Christophe Saout' Cc: 'Andrew Morton', 'Jan Hubicka', 'Gerold Jury', jakub, 'Linux Kernel Mailing List', gcc ----Original Message---- >From: Dave Korn >Sent: 06 April 2005 12:13 > ----Original Message---- >> From: Dave Korn >> Sent: 06 April 2005 12:06 > > > Me and my big mouth. > > OK, that one does work. > > Sorry for the outburst. > .... well, actually, maybe it doesn't after all. What's that uninitialised variable ecx doing there eh? cheers, DaveK -- Can't think of a witty .sigline today.... ^ permalink raw reply [flat|nested] 24+ messages in thread
* RE: [BUG mm] "fixed" i386 memcpy inlining buggy 2005-04-06 11:53 ` Dave Korn @ 2005-04-06 11:56 ` Dave Korn 2005-04-06 13:18 ` Richard B. Johnson 1 sibling, 0 replies; 24+ messages in thread From: Dave Korn @ 2005-04-06 11:56 UTC (permalink / raw) To: 'Dave Korn', 'Denis Vlasenko', 'Christophe Saout' Cc: 'Andrew Morton', 'Jan Hubicka', 'Gerold Jury', jakub, 'Linux Kernel Mailing List', gcc ----Original Message---- >From: Dave Korn >Sent: 06 April 2005 12:53 > ----Original Message---- >> From: Dave Korn >> Sent: 06 April 2005 12:13 > >> ----Original Message---- >>> From: Dave Korn >>> Sent: 06 April 2005 12:06 >> >> >> Me and my big mouth. >> >> OK, that one does work. >> >> Sorry for the outburst. >> > > > .... well, actually, maybe it doesn't after all. > > > What's that uninitialised variable ecx doing there eh? Oh, I see, it's there as an output so it can be matched as an input by the "0" constraint. Ok, guess it does. cheers, DaveK -- Can't think of a witty .sigline today.... ^ permalink raw reply [flat|nested] 24+ messages in thread
* RE: [BUG mm] "fixed" i386 memcpy inlining buggy 2005-04-06 11:53 ` Dave Korn 2005-04-06 11:56 ` Dave Korn @ 2005-04-06 13:18 ` Richard B. Johnson 2005-04-06 14:16 ` Denis Vlasenko 1 sibling, 1 reply; 24+ messages in thread From: Richard B. Johnson @ 2005-04-06 13:18 UTC (permalink / raw) To: Dave Korn Cc: Denis Vlasenko, Christophe Saout, Andrew Morton, Jan Hubicka, Gerold Jury, jakub, Linux Kernel Mailing List, gcc [-- Attachment #1: Type: TEXT/PLAIN, Size: 1265 bytes --] Attached is inline ix86 memcpy() plus test code that tests its corner-cases. The in-line code makes no jumps, but uses longword copies, word copies and any spare byte copy. It works at all offsets, doesn't require alignment but would work fastest if both source and destination were longword aligned. On Wed, 6 Apr 2005, Dave Korn wrote: > ----Original Message---- >> From: Dave Korn >> Sent: 06 April 2005 12:13 > >> ----Original Message---- >>> From: Dave Korn >>> Sent: 06 April 2005 12:06 >> >> >> Me and my big mouth. >> >> OK, that one does work. >> >> Sorry for the outburst. >> > > > .... well, actually, maybe it doesn't after all. > > > What's that uninitialised variable ecx doing there eh? > > > cheers, > DaveK > -- > Can't think of a witty .sigline today.... > > - > To unsubscribe from this list: send the line "unsubscribe linux-kernel" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > Please read the FAQ at http://www.tux.org/lkml/ > Cheers, Dick Johnson Penguin : Linux version 2.6.11 on an i686 machine (5537.79 BogoMips). Notice : All mail here is now cached for review by Dictator Bush. 98.36% of all statistics are fiction. [-- Attachment #2: Type: TEXT/PLAIN, Size: 1421 bytes --] #include <stdio.h> #include <string.h> // // Inline ix86 memcpy() that contains no jumps. Not copied from // anybody. 
Contributed by rjohnson@analogic.com // static __inline__ void *memcpy(void *dst, void *src, size_t len) { void *ret = dst; __asm__ __volatile__( \ "cld\n" \ "shr $1, %%ecx\n" \ "pushf\n" \ "shr $1, %%ecx\n" \ "pushf\n" \ "rep\n" \ "movsl\n" \ "popf\n" \ "adcl %%ecx, %%ecx\n" \ "rep\n" \ "movsw\n" \ "popf\n" \ "adcl %%ecx, %%ecx\n" \ "rep\n" \ "movsb\n" \ : "=D" (dst), "=S" (src), "=c"(len) : "0" (dst), "1" (src), "2" (len) : "memory" ); return ret; } const char tester[]= "0123456789" "0123456789" "0123456789" "0123456789" "0123456789" "0123456789" "0123456789" "0123456789"; char allocated[0x1000]; int main() { size_t i; char buf[0x1000]; memset(buf, 0x00, sizeof(buf)); for(i=0; i< sizeof(buf); i++) puts(memcpy(buf, (char *)tester, i)); memset(buf, 0x00, sizeof(buf)); for(i=0; i< sizeof(buf)-1; i++) puts(memcpy(&buf[1], (char *)tester, i)); memset(buf, 0x00, sizeof(buf)); for(i=0; i< sizeof(buf)-2; i++) puts(memcpy(&buf[2], (char *)tester, i)); memset(buf, 0x00, sizeof(buf)); for(i=0; i< sizeof(buf)-3; i++) puts(memcpy(&buf[3], (char *)tester, i)); return 0; } ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [BUG mm] "fixed" i386 memcpy inlining buggy 2005-04-06 13:18 ` Richard B. Johnson @ 2005-04-06 14:16 ` Denis Vlasenko 0 siblings, 0 replies; 24+ messages in thread From: Denis Vlasenko @ 2005-04-06 14:16 UTC (permalink / raw) To: linux-os, Dave Korn Cc: Denis Vlasenko, Christophe Saout, Andrew Morton, Jan Hubicka, Gerold Jury, jakub, Linux Kernel Mailing List, gcc On Wednesday 06 April 2005 16:18, Richard B. Johnson wrote: > > Attached is inline ix86 memcpy() plus test code that tests its > corner-cases. The in-line code makes no jumps, but uses longword > copies, word copies and any spare byte copy. It works at all > offsets, doesn't require alignment but would work fastest if > both source and destination were longword aligned. Yours is: "shr $1, %%ecx\n" \ "pushf\n" \ "shr $1, %%ecx\n" \ "pushf\n" \ <=== not needed "rep\n" \ "movsl\n" \ "popf\n" \ <=== not needed "adcl %%ecx, %%ecx\n" \ "rep\n" \ "movsw\n" \ "popf\n" \ "adcl %%ecx, %%ecx\n" \ "rep\n" \ "movsb\n" \ You struggle too much for that movsw. -mm one (which happen to be mine) is: "movl %ecx,%4" "shr $2,%ecx" "rep ; movsl" "movl %4,%%ecx" "andl $3,%%ecx" "jz 1ft" /* pay 2 byte penalty for a chance to skip microcoded rep */ "rep ; movsb" "1:" and I can still drop that jz. It is there just to have a chance to skip rep movsb, it was measured to be slow enough to matter. rep movs are a bit slow to start, on small blocks it is measurable. However, maybe it is even better without jz, need to benchmark 'cold path' (i.e. where branch predictor have no data to predict it) somehow. -- vda ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [BUG mm] "fixed" i386 memcpy inlining buggy 2005-04-06 10:14 ` Denis Vlasenko 2005-04-06 11:05 ` Dave Korn @ 2005-04-06 12:05 ` Christophe Saout 2005-04-06 12:36 ` Andrew Haley 2005-04-06 15:18 ` Paolo Bonzini 1 sibling, 2 replies; 24+ messages in thread From: Christophe Saout @ 2005-04-06 12:05 UTC (permalink / raw) To: Denis Vlasenko Cc: gcc, Linux Kernel Mailing List, jakub, Gerold Jury, Jan Hubicka, Andrew Morton [-- Attachment #1: Type: text/plain, Size: 1628 bytes --] Am Mittwoch, den 06.04.2005, 13:14 +0300 schrieb Denis Vlasenko: > Oh shit. I was trying to be too clever. I still run with this patch, > so it must be happening very rarely. Yes, that's right, it happened with code that's not in the mainline tree but could have happened anywhere. > Does this one compile ok? Yes, the case that failed is now okay. I changed it slightly to assign esi and edi directy on top of the functions, no asm section needed here. The compiler will make sure that they have the correct values when needed. In the case above the compiler now uses %ebx to save the loop counter instead of %esi. In drivers/cdrom/cdrom.c I'm observing one very strange thing though. It appears that the compiler decided to put the local variable edi on the stack for some unexplicable reason (or possibly there is?). Since the asm sections are memory barriers the compiler then saves the value of %edi on the stack before entering the next assembler section. 1f1c: a5 movsl %ds:(%esi),%es:(%edi) 1f1d: 89 7d 84 mov %edi,0xffffff84(%ebp) 1f20: a5 movsl %ds:(%esi),%es:(%edi) 1f21: 89 7d 84 mov %edi,0xffffff84(%ebp) 1f24: 66 a5 movsw %ds:(%esi),%es:(%edi) (this is a constant 10 byte memcpy) The only thing that would avoid this is to either tell the compiler to never put esi/edi in memory (which I think is not possibly across different versions of gcc) or to always generate a single asm section for all the different cases. 
[-- Attachment #2: This is a digitally signed message part --] [-- Type: application/pgp-signature, Size: 189 bytes --] ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [BUG mm] "fixed" i386 memcpy inlining buggy 2005-04-06 12:05 ` Christophe Saout @ 2005-04-06 12:36 ` Andrew Haley 2005-04-06 15:18 ` Paolo Bonzini 1 sibling, 0 replies; 24+ messages in thread From: Andrew Haley @ 2005-04-06 12:36 UTC (permalink / raw) To: Christophe Saout Cc: Denis Vlasenko, gcc, Linux Kernel Mailing List, jakub, Gerold Jury, Jan Hubicka, Andrew Morton I'm having a little difficulty understanding what this is for. Is it that gcc's builtin memcpy expander generates bad code, or that older versions of gcc generate bad code, or what? gcc generates too much code? Andrew. ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [BUG mm] "fixed" i386 memcpy inlining buggy 2005-04-06 12:05 ` Christophe Saout 2005-04-06 12:36 ` Andrew Haley @ 2005-04-06 15:18 ` Paolo Bonzini 1 sibling, 0 replies; 24+ messages in thread From: Paolo Bonzini @ 2005-04-06 15:18 UTC (permalink / raw) To: linux-kernel; +Cc: gcc > The only thing that would avoid this is to either tell the compiler to > never put esi/edi in memory (which I think is not possibly across > different versions of gcc) or to always generate a single asm section > for all the different cases. Use __asm__ ("%esi") and __asm__ ("%edi"). It is not guaranteed that they access the registers always (you can still have copy propagation etcetera); but, if your __asm__ statement constraints match the register you specify, then you can be reasonably sure that good code is produced. Paolo ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [BUG mm] "fixed" i386 memcpy inlining buggy 2005-04-05 16:34 ` [BUG mm] "fixed" i386 memcpy inlining buggy Christophe Saout 2005-04-06 10:14 ` Denis Vlasenko @ 2005-04-06 16:11 ` Denis Vlasenko 1 sibling, 0 replies; 24+ messages in thread From: Denis Vlasenko @ 2005-04-06 16:11 UTC (permalink / raw) To: Christophe Saout Cc: Andrew Morton, Jan Hubicka, Gerold Jury, jakub, Linux Kernel Mailing List, gcc [-- Attachment #1: Type: text/plain, Size: 620 bytes --] On Tuesday 05 April 2005 19:34, Christophe Saout wrote: > the new i386 memcpy macro is a ticking timebomb. > > I've been debugging a new mISDN crash, just to find out that a memcpy > was not inlined correctly. > > Andrew, you should drop the fix-i386-memcpy.patch (or have it fixed). Updated patch against 2.6.11 follows. This one, like the original patch, is run tested too. This time I took no chances, esi/edi contents are explicitly propagated from one asm() block to another. I didn't do it before, not expecting that gcc can be soooo incredibly clever. Sorry. Christophe does this one look/compile ok? -- vda [-- Attachment #2: string2.h.diff --] [-- Type: text/x-diff, Size: 3288 bytes --] --- linux-2.6.11.src/include/asm-i386/string.h.orig Thu Mar 3 09:31:08 2005 +++ linux-2.6.11.src/include/asm-i386/string.h Wed Apr 6 19:08:39 2005 @@ -198,47 +198,80 @@ static inline void * __memcpy(void * to, int d0, d1, d2; __asm__ __volatile__( "rep ; movsl\n\t" - "testb $2,%b4\n\t" - "je 1f\n\t" - "movsw\n" - "1:\ttestb $1,%b4\n\t" - "je 2f\n\t" - "movsb\n" - "2:" + "movl %4,%%ecx\n\t" + "andl $3,%%ecx\n\t" +#if 1 /* want to pay 2 byte penalty for a chance to skip microcoded rep? 
*/ + "jz 1f\n\t" +#endif + "rep ; movsb\n\t" + "1:" : "=&c" (d0), "=&D" (d1), "=&S" (d2) - :"0" (n/4), "q" (n),"1" ((long) to),"2" ((long) from) + : "0" (n/4), "g" (n), "1" ((long) to), "2" ((long) from) : "memory"); return (to); } /* - * This looks horribly ugly, but the compiler can optimize it totally, + * This looks ugly, but the compiler can optimize it totally, * as the count is constant. */ static inline void * __constant_memcpy(void * to, const void * from, size_t n) { - if (n <= 128) - return __builtin_memcpy(to, from, n); - -#define COMMON(x) \ -__asm__ __volatile__( \ - "rep ; movsl" \ - x \ - : "=&c" (d0), "=&D" (d1), "=&S" (d2) \ - : "0" (n/4),"1" ((long) to),"2" ((long) from) \ - : "memory"); -{ - int d0, d1, d2; + long esi, edi; + if (!n) return to; +#if 1 /* want to do small copies with non-string ops? */ + switch (n) { + case 1: *(char*)to = *(char*)from; return to; + case 2: *(short*)to = *(short*)from; return to; + case 4: *(int*)to = *(int*)from; return to; +#if 1 /* including those doable with two moves? 
*/ + case 3: *(short*)to = *(short*)from; + *((char*)to+2) = *((char*)from+2); return to; + case 5: *(int*)to = *(int*)from; + *((char*)to+4) = *((char*)from+4); return to; + case 6: *(int*)to = *(int*)from; + *((short*)to+2) = *((short*)from+2); return to; + case 8: *(int*)to = *(int*)from; + *((int*)to+1) = *((int*)from+1); return to; +#endif + } +#endif + esi = (long) from; + edi = (long) to; + if (n >= 5*4) { + /* large block: use rep prefix */ + int ecx; + __asm__ __volatile__( + "rep ; movsl" + : "=&c" (ecx), "=&D" (edi), "=&S" (esi) + : "0" (n/4), "1" (edi),"2" (esi) + : "memory" + ); + } else { + /* small block: don't clobber ecx + smaller code */ + if (n >= 4*4) __asm__ __volatile__("movsl" + :"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory"); + if (n >= 3*4) __asm__ __volatile__("movsl" + :"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory"); + if (n >= 2*4) __asm__ __volatile__("movsl" + :"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory"); + if (n >= 1*4) __asm__ __volatile__("movsl" + :"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory"); + } switch (n % 4) { - case 0: COMMON(""); return to; - case 1: COMMON("\n\tmovsb"); return to; - case 2: COMMON("\n\tmovsw"); return to; - default: COMMON("\n\tmovsw\n\tmovsb"); return to; + /* tail */ + case 0: return to; + case 1: __asm__ __volatile__("movsb" + :"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory"); + return to; + case 2: __asm__ __volatile__("movsw" + :"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory"); + return to; + default: __asm__ __volatile__("movsw\n\tmovsb" + :"=&D"(edi),"=&S"(esi):"0"(edi),"1"(esi):"memory"); + return to; } } - -#undef COMMON -} #define __HAVE_ARCH_MEMCPY ^ permalink raw reply [flat|nested] 24+ messages in thread
* [PATCH] fix i386 memcpy 2005-03-29 14:37 memcpy(a,b,CONST) is not inlined by gcc 3.4.1 in Linux kernel Denis Vlasenko ` (2 preceding siblings ...) 2005-03-29 15:13 ` Jakub Jelinek @ 2005-03-29 20:22 ` Denis Vlasenko 2005-03-29 20:24 ` Denis Vlasenko 3 siblings, 1 reply; 24+ messages in thread From: Denis Vlasenko @ 2005-03-29 20:22 UTC (permalink / raw) To: Andrew Morton, Denis Vlasenko; +Cc: linux-kernel, gcc [-- Attachment #1: Type: text/plain, Size: 2961 bytes --] This patch shortens non-constant memcpy() by two bytes and fixes spurious out-of-line constant memcpy(). Patch is run-tested (I run on patched kernel right now). Benchmark and code generation test program will be mailed as reply. # size vmlinux.org vmlinux text data bss dec hex filename 3954591 1553426 236544 5744561 57a7b1 vmlinux.org 3952615 1553426 236544 5742585 579ff9 vmlinux Example of changes (part of dump_fpu() body): old.............................................. new...................... 8d 83 40 02 00 00 lea 0x240(%ebx),%eax 8d b3 40 02 00 00 lea 0x240(%ebx),%esi 74 31 je c0108b27 <dump_fpu+0x9c> 74 2e je c0108b1d <dump_fpu+0x92> 6a 1c push $0x1c 8b 7d 0c mov 0xc(%ebp),%edi 50 push %eax b9 07 00 00 00 mov $0x7,%ecx 56 push %esi f3 a5 repz movsl %ds:(%esi),%es:(%edi) e8 49 21 10 00 call c020ac48 <memcpy> 8b 55 0c mov 0xc(%ebp),%edx 83 c4 0c add $0xc,%esp 83 c2 1c add $0x1c,%edx 83 c6 1c add $0x1c,%esi 8d 83 60 02 00 00 lea 0x260(%ebx),%eax 81 c3 60 02 00 00 add $0x260,%ebx b9 07 00 00 00 mov $0x7,%ecx bf 07 00 00 00 mov $0x7,%edi 89 d7 mov %edx,%edi 6a 0a push $0xa 89 c6 mov %eax,%esi 53 push %ebx a5 movsl %ds:(%esi),%es:(%edi) 56 push %esi a5 movsl %ds:(%esi),%es:(%edi) e8 2f 21 10 00 call c020ac48 <memcpy> 66 a5 movsw %ds:(%esi),%es:(%edi) 83 c4 0c add $0xc,%esp 83 c2 0a add $0xa,%edx 83 c6 0a add $0xa,%esi 83 c0 10 add $0x10,%eax 83 c3 10 add $0x10,%ebx 49 dec %ecx 4f dec %edi 79 ef jns c0108b0a <dump_fpu+0x7f> 79 eb jns c0108b10 <dump_fpu+0x85> eb 0a jmp c0108b27 <dump_fpu+0x9c> eb 
0c jmp c0108b33 <dump_fpu+0xa8> 8b 7d 0c mov 0xc(%ebp),%edi 6a 6c push $0x6c b9 1b 00 00 00 mov $0x1b,%ecx 50 push %eax f3 a5 repz movsl %ds:(%esi),%es:(%edi) 56 push %esi 8b 45 f0 mov 0xfffffff0(%ebp),%eax e8 18 21 10 00 call c020ac48 <memcpy> 5a pop %edx 83 c4 0c add $0xc,%esp 5b pop %ebx 8b 45 f0 mov 0xfffffff0(%ebp),%eax 8d 65 f4 lea 0xfffffff4(%ebp),%esp 5b pop %ebx 5e pop %esi -- vda [-- Attachment #2: string.memcpy.diff --] [-- Type: text/x-diff, Size: 2953 bytes --] --- linux-2.6.11.src/include/asm-i386/string.h.orig Thu Mar 3 09:31:08 2005 +++ linux-2.6.11.src/include/asm-i386/string.h Tue Mar 29 22:05:00 2005 @@ -198,46 +198,75 @@ static inline void * __memcpy(void * to, int d0, d1, d2; __asm__ __volatile__( "rep ; movsl\n\t" - "testb $2,%b4\n\t" - "je 1f\n\t" - "movsw\n" - "1:\ttestb $1,%b4\n\t" - "je 2f\n\t" - "movsb\n" - "2:" + "movl %4,%%ecx\n\t" + "andl $3,%%ecx\n\t" + "jz 1f\n\t" /* pay 2 byte penalty for a chance to skip microcoded rep */ + "rep ; movsb\n\t" + "1:" : "=&c" (d0), "=&D" (d1), "=&S" (d2) - :"0" (n/4), "q" (n),"1" ((long) to),"2" ((long) from) + : "0" (n/4), "g" (n), "1" ((long) to), "2" ((long) from) : "memory"); return (to); } /* - * This looks horribly ugly, but the compiler can optimize it totally, + * This looks ugly, but the compiler can optimize it totally, * as the count is constant. */ static inline void * __constant_memcpy(void * to, const void * from, size_t n) { - if (n <= 128) - return __builtin_memcpy(to, from, n); - -#define COMMON(x) \ -__asm__ __volatile__( \ - "rep ; movsl" \ - x \ - : "=&c" (d0), "=&D" (d1), "=&S" (d2) \ - : "0" (n/4),"1" ((long) to),"2" ((long) from) \ - : "memory"); -{ - int d0, d1, d2; +#if 1 /* want to do small copies with non-string ops? */ + switch (n) { + case 0: return to; + case 1: *(char*)to = *(char*)from; return to; + case 2: *(short*)to = *(short*)from; return to; + case 4: *(int*)to = *(int*)from; return to; +#if 1 /* including those doable with two moves? 
*/ + case 3: *(short*)to = *(short*)from; + *((char*)to+2) = *((char*)from+2); return to; + case 5: *(int*)to = *(int*)from; + *((char*)to+4) = *((char*)from+4); return to; + case 6: *(int*)to = *(int*)from; + *((short*)to+2) = *((short*)from+2); return to; + case 8: *(int*)to = *(int*)from; + *((int*)to+1) = *((int*)from+1); return to; +#endif + } +#else + if (!n) return to; +#endif + { + /* load esi/edi */ + int esi, edi; + __asm__ __volatile__( + "" + : "=&D" (edi), "=&S" (esi) + : "0" ((long) to),"1" ((long) from) + : "memory" + ); + } + if (n >= 5*4) { + /* large block: use rep prefix */ + int ecx; + __asm__ __volatile__( + "rep ; movsl" + : "=&c" (ecx) + : "0" (n/4) + ); + } else { + /* small block: don't clobber ecx + smaller code */ + if (n >= 4*4) __asm__ __volatile__("movsl"); + if (n >= 3*4) __asm__ __volatile__("movsl"); + if (n >= 2*4) __asm__ __volatile__("movsl"); + if (n >= 1*4) __asm__ __volatile__("movsl"); + } switch (n % 4) { - case 0: COMMON(""); return to; - case 1: COMMON("\n\tmovsb"); return to; - case 2: COMMON("\n\tmovsw"); return to; - default: COMMON("\n\tmovsw\n\tmovsb"); return to; + /* tail */ + case 0: return to; + case 1: __asm__ __volatile__("movsb"); return to; + case 2: __asm__ __volatile__("movsw"); return to; + default: __asm__ __volatile__("movsw\n\tmovsb"); return to; } -} - -#undef COMMON } #define __HAVE_ARCH_MEMCPY ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH] fix i386 memcpy 2005-03-29 20:22 ` [PATCH] fix i386 memcpy Denis Vlasenko @ 2005-03-29 20:24 ` Denis Vlasenko 0 siblings, 0 replies; 24+ messages in thread From: Denis Vlasenko @ 2005-03-29 20:24 UTC (permalink / raw) To: Denis Vlasenko, Andrew Morton; +Cc: linux-kernel, gcc [-- Attachment #1: Type: text/plain, Size: 298 bytes --] On Tuesday 29 March 2005 23:22, Denis Vlasenko wrote: > This patch shortens non-constant memcpy() by two bytes > and fixes spurious out-of-line constant memcpy(). > > Patch is run-tested (I run on patched kernel right now). > > Benchmark and code generation test program will be mailed as reply. [-- Attachment #2: bench.c --] [-- Type: text/x-csrc, Size: 3755 bytes --] /* Compile with: gcc -Os -fomit-frame-pointer -falign-functions=32 */ /* results: processor : 0 vendor_id : GenuineIntel cpu family : 6 model : 11 model name : Intel(R) Celeron stepping : 1 cpu MHz : 1196.236 cache size : 256 KB movsl_X wins : N<=3 rep_movsl wins : N>=6 ('assign' wins always at the cost of much larger code) */ #include <time.h> #include <stdio.h> #define N 5 #define MOVSL1 __asm__ __volatile__("movsl") #define MOVSL2 MOVSL1;MOVSL1 #define MOVSL3 MOVSL2;MOVSL1 #define MOVSL4 MOVSL3;MOVSL1 #define MOVSL5 MOVSL4;MOVSL1 #define MOVSL6 MOVSL5;MOVSL1 #define MOVSL7 MOVSL6;MOVSL1 #define MOVSL8 MOVSL7;MOVSL1 #define MOVSL9 MOVSL8;MOVSL1 #define MOVSL10 MOVSL9;MOVSL1 #define MOVSL11 MOVSL10;MOVSL1 #define MOVSL12 MOVSL11;MOVSL1 #define MOVSL13 MOVSL12;MOVSL1 #define MOVSL14 MOVSL13;MOVSL1 #define MOVSL15 MOVSL14;MOVSL1 #define MOVSL16 MOVSL15;MOVSL1 #define MOVSL17 MOVSL16;MOVSL1 #define MOVSL18 MOVSL17;MOVSL1 #define MOVSL19 MOVSL18;MOVSL1 #define MOVSL_(n) MOVSL##n #define MOVSL(n) MOVSL_(n) static inline void * rep_movsl(void * to, const void * from, size_t n) { { int esi, edi; __asm__ __volatile__( "" : "=&D" (edi), "=&S" (esi) : "0" ((long) to),"1" ((long) from) : "memory" ); } { int ecx; __asm__ __volatile__( "rep ; movsl" : "=&c" (ecx) : "0" 
(n/4) ); } } static inline void * movsl_X(void * to, const void * from, size_t n) { { int esi, edi; __asm__ __volatile__( "" : "=&D" (edi), "=&S" (esi) : "0" ((long) to),"1" ((long) from) : "memory" ); } MOVSL(N); } static inline void * assign(void * to, const void * from, size_t n) { switch (n) { case 4: *(unsigned long *)to = *(const unsigned long *)from; return to; case 8: *(unsigned long *)to = *(const unsigned long *)from; *(1+(unsigned long *)to) = *(1+(const unsigned long *)from); return to; case 12: *(unsigned long *)to = *(const unsigned long *)from; *(1+(unsigned long *)to) = *(1+(const unsigned long *)from); *(2+(unsigned long *)to) = *(2+(const unsigned long *)from); return to; case 16: *(unsigned long *)to = *(const unsigned long *)from; *(1+(unsigned long *)to) = *(1+(const unsigned long *)from); *(2+(unsigned long *)to) = *(2+(const unsigned long *)from); *(3+(unsigned long *)to) = *(3+(const unsigned long *)from); return to; case 20: *(unsigned long *)to = *(const unsigned long *)from; *(1+(unsigned long *)to) = *(1+(const unsigned long *)from); *(2+(unsigned long *)to) = *(2+(const unsigned long *)from); *(3+(unsigned long *)to) = *(3+(const unsigned long *)from); *(4+(unsigned long *)to) = *(4+(const unsigned long *)from); return to; default: return rep_movsl(to, from, n); } } char f[256],t[256]; char *fp = f; char *tp = t; void r() { rep_movsl(f,t,N*4); } void m() { movsl_X(f,t,N*4); } void a() { assign(f,t,N*4); } void rp() { rep_movsl(fp,tp,N*4); } void mp() { movsl_X(fp,tp,N*4); } void ap() { assign(fp,tp,N*4); } int measure(void (*f)()) { int cnt = 0; time_t t = time(0); while(t==time(0)) f(); /* cache hot */ t = time(0); while(t==time(0)) { f(); f(); f(); f(); f(); f(); f(); f(); f(); f(); f(); f(); f(); f(); f(); f(); cnt += 16; } return cnt; } int main() { printf("On global array:\n"); printf("rep movsl(%d) per sec: %d\n", N, measure(r)); printf(" movsl_X(%d) per sec: %d\n", N, measure(m)); printf(" assign(%d) per sec: %d\n", N, 
measure(a)); printf("Indirect:\n"); printf("rep movsl(%d) per sec: %d\n", N, measure(rp)); printf(" movsl_X(%d) per sec: %d\n", N, measure(mp)); printf(" assign(%d) per sec: %d\n", N, measure(ap)); return 0; } [-- Attachment #3: codecheck.c --] [-- Type: text/x-csrc, Size: 5993 bytes --] /* Compile with: gcc -Os -fomit-frame-pointer */ /* Check for correctness/size: objdump -r -d <file.o> | $PAGER */ typedef unsigned int size_t; static inline void * __constant_memcpy(void * to, const void * from, size_t n) { #if 1 /* want to do small copies with non-string ops? */ switch (n) { case 0: return to; case 1: *(char*)to = *(char*)from; return to; case 2: *(short*)to = *(short*)from; return to; case 4: *(int*)to = *(int*)from; return to; #if 1 /* including those doable with two moves? */ case 3: *(short*)to = *(short*)from; *((char*)to+2) = *((char*)from+2); return to; case 5: *(int*)to = *(int*)from; *((char*)to+4) = *((char*)from+4); return to; case 6: *(int*)to = *(int*)from; *((short*)to+2) = *((short*)from+2); return to; case 8: *(int*)to = *(int*)from; *((int*)to+1) = *((int*)from+1); return to; #endif } #else if (!n) return to; #endif { /* load esi/edi */ int esi, edi; __asm__ __volatile__( "" : "=&D" (edi), "=&S" (esi) : "0" ((long) to),"1" ((long) from) : "memory" ); } if (n >= 5*4) { /* large block: use rep prefix */ int ecx; __asm__ __volatile__( "rep ; movsl" : "=&c" (ecx) : "0" (n/4) ); } else { /* small block: don't clobber ecx + smaller code */ if (n >= 4*4) __asm__ __volatile__("movsl"); if (n >= 3*4) __asm__ __volatile__("movsl"); if (n >= 2*4) __asm__ __volatile__("movsl"); if (n >= 1*4) __asm__ __volatile__("movsl"); } switch (n % 4) { /* tail */ case 0: return to; case 1: __asm__ __volatile__("movsb"); return to; case 2: __asm__ __volatile__("movsw"); return to; default: __asm__ __volatile__("movsw\n\tmovsb"); return to; } } static inline void * __memcpy(void * to, const void * from, size_t n) { int d0, d1, d2; __asm__ __volatile__( "rep ; movsl\n\t" 
"movl %4,%%ecx\n\t" "andl $3,%%ecx\n\t" "jz 1f\n\t" /* pay 2 byte penalty for a chance to skip microcoded rep */ "rep ; movsb\n\t" "1:" : "=&c" (d0), "=&D" (d1), "=&S" (d2) : "0" (n/4), "g" (n), "1" ((long) to), "2" ((long) from) : "memory"); return (to); } #define memcpy(t, f, n) \ (__builtin_constant_p(n) ? \ __constant_memcpy((t),(f),(n)) : \ __memcpy((t),(f),(n))) int f00(char *a, char *b) __attribute__ ((section ("ff00"))); int f00(char *a, char *b) { memcpy(a,b,0); } int f01(char *a, char *b) __attribute__ ((section ("ff01"))); int f01(char *a, char *b) { memcpy(a,b,1); } int f02(char *a, char *b) __attribute__ ((section ("ff02"))); int f02(char *a, char *b) { memcpy(a,b,2); } int f03(char *a, char *b) __attribute__ ((section ("ff03"))); int f03(char *a, char *b) { memcpy(a,b,3); } int f04(char *a, char *b) __attribute__ ((section ("ff04"))); int f04(char *a, char *b) { memcpy(a,b,4); } int f05(char *a, char *b) __attribute__ ((section ("ff05"))); int f05(char *a, char *b) { memcpy(a,b,5); } int f06(char *a, char *b) __attribute__ ((section ("ff06"))); int f06(char *a, char *b) { memcpy(a,b,6); } int f07(char *a, char *b) __attribute__ ((section ("ff07"))); int f07(char *a, char *b) { memcpy(a,b,7); } int f08(char *a, char *b) __attribute__ ((section ("ff08"))); int f08(char *a, char *b) { memcpy(a,b,8); } int f09(char *a, char *b) __attribute__ ((section ("ff09"))); int f09(char *a, char *b) { memcpy(a,b,9); } int f10(char *a, char *b) __attribute__ ((section ("ff10"))); int f10(char *a, char *b) { memcpy(a,b,10); } int f11(char *a, char *b) __attribute__ ((section ("ff11"))); int f11(char *a, char *b) { memcpy(a,b,11); } int f12(char *a, char *b) __attribute__ ((section ("ff12"))); int f12(char *a, char *b) { memcpy(a,b,12); } int f13(char *a, char *b) __attribute__ ((section ("ff13"))); int f13(char *a, char *b) { memcpy(a,b,13); } int f14(char *a, char *b) __attribute__ ((section ("ff14"))); int f14(char *a, char *b) { memcpy(a,b,14); } int f15(char *a, 
char *b) __attribute__ ((section ("ff15"))); int f15(char *a, char *b) { memcpy(a,b,15); } int f16(char *a, char *b) __attribute__ ((section ("ff16"))); int f16(char *a, char *b) { memcpy(a,b,16); } int f17(char *a, char *b) __attribute__ ((section ("ff17"))); int f17(char *a, char *b) { memcpy(a,b,17); } int f18(char *a, char *b) __attribute__ ((section ("ff18"))); int f18(char *a, char *b) { memcpy(a,b,18); } int f19(char *a, char *b) __attribute__ ((section ("ff19"))); int f19(char *a, char *b) { memcpy(a,b,19); } int f20(char *a, char *b) __attribute__ ((section ("ff20"))); int f20(char *a, char *b) { memcpy(a,b,20); } int f21(char *a, char *b) __attribute__ ((section ("ff21"))); int f21(char *a, char *b) { memcpy(a,b,21); } int f22(char *a, char *b) __attribute__ ((section ("ff22"))); int f22(char *a, char *b) { memcpy(a,b,22); } int f23(char *a, char *b) __attribute__ ((section ("ff23"))); int f23(char *a, char *b) { memcpy(a,b,23); } int f24(char *a, char *b) __attribute__ ((section ("ff24"))); int f24(char *a, char *b) { memcpy(a,b,24); } int f25(char *a, char *b) __attribute__ ((section ("ff25"))); int f25(char *a, char *b) { memcpy(a,b,25); } int f26(char *a, char *b) __attribute__ ((section ("ff26"))); int f26(char *a, char *b) { memcpy(a,b,26); } int f27(char *a, char *b) __attribute__ ((section ("ff27"))); int f27(char *a, char *b) { memcpy(a,b,27); } int f28(char *a, char *b) __attribute__ ((section ("ff28"))); int f28(char *a, char *b) { memcpy(a,b,28); } int f29(char *a, char *b) __attribute__ ((section ("ff29"))); int f29(char *a, char *b) { memcpy(a,b,29); } int f3k(char *a, char *b) __attribute__ ((section ("ff3k"))); int f3k(char *a, char *b) { memcpy(a,b,3000); } int f(char *a, char *b) { memcpy(a,b,0); memcpy(a,b,1); memcpy(a,b,2); memcpy(a,b,3); memcpy(a,b,4); memcpy(a,b,5); memcpy(a,b,6); memcpy(a,b,7); memcpy(a,b,8); memcpy(a,b,9); memcpy(a,b,10); memcpy(a,b,11); memcpy(a,b,12); memcpy(a,b,13); memcpy(a,b,14); memcpy(a,b,15); 
memcpy(a,b,16); memcpy(a,b,17); memcpy(a,b,18); memcpy(a,b,19); memcpy(a,b,20); memcpy(a,b,21); memcpy(a,b,22); memcpy(a,b,23); memcpy(a,b,24); memcpy(a,b,25); memcpy(a,b,3000); } ^ permalink raw reply [flat|nested] 24+ messages in thread
end of thread, other threads:[~2005-04-07 7:32 UTC | newest] Thread overview: 24+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2005-03-29 14:37 memcpy(a,b,CONST) is not inlined by gcc 3.4.1 in Linux kernel Denis Vlasenko 2005-03-29 15:06 ` Richard Guenther 2005-03-29 15:08 ` Nathan Sidwell 2005-03-29 15:13 ` Jakub Jelinek 2005-03-29 15:42 ` Andrew Pinski 2005-03-30 2:27 ` Gerold Jury 2005-03-30 6:15 ` Denis Vlasenko 2005-04-01 21:43 ` Jan Hubicka 2005-04-02 12:18 ` Denis Vlasenko 2005-04-02 12:26 ` Denis Vlasenko 2005-04-05 16:34 ` [BUG mm] "fixed" i386 memcpy inlining buggy Christophe Saout 2005-04-06 10:14 ` Denis Vlasenko 2005-04-06 11:05 ` Dave Korn 2005-04-06 11:13 ` Dave Korn 2005-04-06 11:53 ` Dave Korn 2005-04-06 11:56 ` Dave Korn 2005-04-06 13:18 ` Richard B. Johnson 2005-04-06 14:16 ` Denis Vlasenko 2005-04-06 12:05 ` Christophe Saout 2005-04-06 12:36 ` Andrew Haley 2005-04-06 15:18 ` Paolo Bonzini 2005-04-06 16:11 ` Denis Vlasenko 2005-03-29 20:22 ` [PATCH] fix i386 memcpy Denis Vlasenko 2005-03-29 20:24 ` Denis Vlasenko
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox