* [PATCH] x86-32: use __builtin_{memcpy,memset}
@ 2010-05-11 9:05 Jan Beulich
2010-05-11 10:39 ` Keir Fraser
0 siblings, 1 reply; 4+ messages in thread
From: Jan Beulich @ 2010-05-11 9:05 UTC (permalink / raw)
To: xen-devel; +Cc: Charles Arnold
[-- Attachment #1: Type: text/plain, Size: 3907 bytes --]
Following a change in Linux 2.6.33, make x86-32 always use
__builtin_mem{cpy,set}() on gcc 4.0+. This particularly works around
certain intermediate gcc revisions generating out-of-range-array-index
warnings with the current inline implementation.
It may be worth considering making this the case for x86-64 too.
At the same time eliminate the redundant inline assembly in the C
file, and instead use the inline functions coming from the header.
Signed-off-by: Jan Beulich <jbeulich@novell.com>
--- 2010-05-04.orig/xen/arch/x86/string.c 2007-11-26 16:57:03.000000000 +0100
+++ 2010-05-04/xen/arch/x86/string.c 2010-05-11 09:45:27.000000000 +0200
@@ -11,44 +11,13 @@
#undef memcpy
void *memcpy(void *dest, const void *src, size_t n)
{
- long d0, d1, d2;
-
- asm volatile (
-#ifdef __i386__
- " rep movsl ; "
-#else
- " rep movsq ; "
- " testb $4,%b4 ; "
- " je 0f ; "
- " movsl ; "
- "0: ; "
-#endif
- " testb $2,%b4 ; "
- " je 1f ; "
- " movsw ; "
- "1: testb $1,%b4 ; "
- " je 2f ; "
- " movsb ; "
- "2: "
- : "=&c" (d0), "=&D" (d1), "=&S" (d2)
- : "0" (n/sizeof(long)), "q" (n), "1" (dest), "2" (src)
- : "memory");
-
- return dest;
+ return __variable_memcpy(dest, src, n);
}
#undef memset
void *memset(void *s, int c, size_t n)
{
- long d0, d1;
-
- asm volatile (
- "rep stosb"
- : "=&c" (d0), "=&D" (d1)
- : "a" (c), "1" (s), "0" (n)
- : "memory");
-
- return s;
+ return __memset_generic(s, c, n);
}
#undef memmove
--- 2010-05-04.orig/xen/include/asm-x86/string.h 2009-10-07 13:31:36.000000000 +0200
+++ 2010-05-04/xen/include/asm-x86/string.h 2010-05-11 10:21:04.000000000 +0200
@@ -16,6 +16,11 @@ static inline void *__variable_memcpy(vo
return to;
}
+#define __HAVE_ARCH_MEMCPY
+#if defined(__i386__) && __GNUC__ >= 4
+#define memcpy(t, f, n) __builtin_memcpy(t, f, n)
+#else
+
/*
* This looks horribly ugly, but the compiler can optimize it totally,
* as the count is constant.
@@ -95,7 +100,6 @@ static always_inline void * __constant_m
return to;
}
-#define __HAVE_ARCH_MEMCPY
/* align source to a 64-bit boundary */
static always_inline
void *__var_memcpy(void *t, const void *f, size_t n)
@@ -121,11 +125,13 @@ void *__memcpy(void *t, const void *f, s
__var_memcpy((t),(f),(n)));
}
+#endif /* !__i386__ || __GNUC__ < 4 */
+
/* Some version of gcc don't have this builtin. It's non-critical anyway. */
#define __HAVE_ARCH_MEMMOVE
extern void *memmove(void *dest, const void *src, size_t n);
-static inline void *__memset_generic(void *s, char c, size_t count)
+static inline void *__memset_generic(void *s, int c, size_t count)
{
long d0, d1;
__asm__ __volatile__ (
@@ -134,6 +140,11 @@ static inline void *__memset_generic(voi
return s;
}
+#define __HAVE_ARCH_MEMSET
+#if defined(__i386__) && __GNUC__ >= 4
+#define memset(s, c, n) __builtin_memset(s, c, n)
+#else
+
/* we might want to write optimized versions of these later */
#define __constant_count_memset(s,c,count) __memset_generic((s),(c),(count))
@@ -238,11 +249,12 @@ static always_inline void *__constant_c_
#define MEMSET_PATTERN_MUL 0x01010101UL
#endif
-#define __HAVE_ARCH_MEMSET
#define memset(s, c, count) (__memset((s),(c),(count)))
#define __memset(s, c, count) \
(__builtin_constant_p(c) ? \
__constant_c_x_memset((s),(MEMSET_PATTERN_MUL*(unsigned char)(c)),(count)) : \
__var_x_memset((s),(c),(count)))
+#endif /* !__i386__ || __GNUC__ < 4 */
+
#endif /* __X86_STRING_H__ */
[-- Attachment #2: x86-memcpy.patch --]
[-- Type: text/plain, Size: 3901 bytes --]
Following a change in Linux 2.6.33, make x86-32 always use
__builtin_mem{cpy,set}() on gcc 4.0+. This particularly works around
certain intermediate gcc revisions generating out-of-range-array-index
warnings with the current inline implementation.
It may be worthwhile considering to make this the case for x86-64 too.
At the same time eliminate the redundant inline assembly in the C
file, and instead use the inline functions coming from the header.
Signed-off-by: Jan Beulich <jbeulich@novell.com>
--- 2010-05-04.orig/xen/arch/x86/string.c 2007-11-26 16:57:03.000000000 +0100
+++ 2010-05-04/xen/arch/x86/string.c 2010-05-11 09:45:27.000000000 +0200
@@ -11,44 +11,13 @@
#undef memcpy
void *memcpy(void *dest, const void *src, size_t n)
{
- long d0, d1, d2;
-
- asm volatile (
-#ifdef __i386__
- " rep movsl ; "
-#else
- " rep movsq ; "
- " testb $4,%b4 ; "
- " je 0f ; "
- " movsl ; "
- "0: ; "
-#endif
- " testb $2,%b4 ; "
- " je 1f ; "
- " movsw ; "
- "1: testb $1,%b4 ; "
- " je 2f ; "
- " movsb ; "
- "2: "
- : "=&c" (d0), "=&D" (d1), "=&S" (d2)
- : "0" (n/sizeof(long)), "q" (n), "1" (dest), "2" (src)
- : "memory");
-
- return dest;
+ return __variable_memcpy(dest, src, n);
}
#undef memset
void *memset(void *s, int c, size_t n)
{
- long d0, d1;
-
- asm volatile (
- "rep stosb"
- : "=&c" (d0), "=&D" (d1)
- : "a" (c), "1" (s), "0" (n)
- : "memory");
-
- return s;
+ return __memset_generic(s, c, n);
}
#undef memmove
--- 2010-05-04.orig/xen/include/asm-x86/string.h 2009-10-07 13:31:36.000000000 +0200
+++ 2010-05-04/xen/include/asm-x86/string.h 2010-05-11 10:21:04.000000000 +0200
@@ -16,6 +16,11 @@ static inline void *__variable_memcpy(vo
return to;
}
+#define __HAVE_ARCH_MEMCPY
+#if defined(__i386__) && __GNUC__ >= 4
+#define memcpy(t, f, n) __builtin_memcpy(t, f, n)
+#else
+
/*
* This looks horribly ugly, but the compiler can optimize it totally,
* as the count is constant.
@@ -95,7 +100,6 @@ static always_inline void * __constant_m
return to;
}
-#define __HAVE_ARCH_MEMCPY
/* align source to a 64-bit boundary */
static always_inline
void *__var_memcpy(void *t, const void *f, size_t n)
@@ -121,11 +125,13 @@ void *__memcpy(void *t, const void *f, s
__var_memcpy((t),(f),(n)));
}
+#endif /* !__i386__ || __GNUC__ < 4 */
+
/* Some version of gcc don't have this builtin. It's non-critical anyway. */
#define __HAVE_ARCH_MEMMOVE
extern void *memmove(void *dest, const void *src, size_t n);
-static inline void *__memset_generic(void *s, char c, size_t count)
+static inline void *__memset_generic(void *s, int c, size_t count)
{
long d0, d1;
__asm__ __volatile__ (
@@ -134,6 +140,11 @@ static inline void *__memset_generic(voi
return s;
}
+#define __HAVE_ARCH_MEMSET
+#if defined(__i386__) && __GNUC__ >= 4
+#define memset(s, c, n) __builtin_memset(s, c, n)
+#else
+
/* we might want to write optimized versions of these later */
#define __constant_count_memset(s,c,count) __memset_generic((s),(c),(count))
@@ -238,11 +249,12 @@ static always_inline void *__constant_c_
#define MEMSET_PATTERN_MUL 0x01010101UL
#endif
-#define __HAVE_ARCH_MEMSET
#define memset(s, c, count) (__memset((s),(c),(count)))
#define __memset(s, c, count) \
(__builtin_constant_p(c) ? \
__constant_c_x_memset((s),(MEMSET_PATTERN_MUL*(unsigned char)(c)),(count)) : \
__var_x_memset((s),(c),(count)))
+#endif /* !__i386__ || __GNUC__ < 4 */
+
#endif /* __X86_STRING_H__ */
[-- Attachment #3: Type: text/plain, Size: 138 bytes --]
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH] x86-32: use __builtin_{memcpy,memset}
2010-05-11 9:05 [PATCH] x86-32: use __builtin_{memcpy,memset} Jan Beulich
@ 2010-05-11 10:39 ` Keir Fraser
2010-05-11 12:04 ` Keir Fraser
0 siblings, 1 reply; 4+ messages in thread
From: Keir Fraser @ 2010-05-11 10:39 UTC (permalink / raw)
To: Jan Beulich, xen-devel@lists.xensource.com; +Cc: Charles Arnold
[-- Attachment #1: Type: text/plain, Size: 4649 bytes --]
On 11/05/2010 10:05, "Jan Beulich" <JBeulich@novell.com> wrote:
> Following a change in Linux 2.6.33, make x86-32 always use
> __builtin_mem{cpy,set}() on gcc 4.0+. This particularly works around
> certain intermediate gcc revisions generating out-of-range-array-index
> warnings with the current inline implementation.
>
> It may be worthwhile considering to make this the case for x86-64 too.
>
> At the same time eliminate the redundant inline assembly in the C
> file, and instead use the inline functions coming from the header.
Hm, well, I hate having to change this stuff as we always seem to end up
broken on some gcc or other. But otoh this will eliminate a bunch of code if
we do this unconditionally, and I'm particularly not keen on doing this only
for x86-32 and particular versions of gcc. I suggest the attached patch: it
should work fine so long as all our supported versions of gcc have
__builtin_memcpy and __builtin_memset. Given we nowadays only support GCC
3.4+, I imagine we are okay in this regard.
What do you think of this alternative patch?
-- Keir
> Signed-off-by: Jan Beulich <jbeulich@novell.com>
>
> --- 2010-05-04.orig/xen/arch/x86/string.c 2007-11-26 16:57:03.000000000 +0100
> +++ 2010-05-04/xen/arch/x86/string.c 2010-05-11 09:45:27.000000000 +0200
> @@ -11,44 +11,13 @@
> #undef memcpy
> void *memcpy(void *dest, const void *src, size_t n)
> {
> - long d0, d1, d2;
> -
> - asm volatile (
> -#ifdef __i386__
> - " rep movsl ; "
> -#else
> - " rep movsq ; "
> - " testb $4,%b4 ; "
> - " je 0f ; "
> - " movsl ; "
> - "0: ; "
> -#endif
> - " testb $2,%b4 ; "
> - " je 1f ; "
> - " movsw ; "
> - "1: testb $1,%b4 ; "
> - " je 2f ; "
> - " movsb ; "
> - "2: "
> - : "=&c" (d0), "=&D" (d1), "=&S" (d2)
> - : "0" (n/sizeof(long)), "q" (n), "1" (dest), "2" (src)
> - : "memory");
> -
> - return dest;
> + return __variable_memcpy(dest, src, n);
> }
>
> #undef memset
> void *memset(void *s, int c, size_t n)
> {
> - long d0, d1;
> -
> - asm volatile (
> - "rep stosb"
> - : "=&c" (d0), "=&D" (d1)
> - : "a" (c), "1" (s), "0" (n)
> - : "memory");
> -
> - return s;
> + return __memset_generic(s, c, n);
> }
>
> #undef memmove
> --- 2010-05-04.orig/xen/include/asm-x86/string.h 2009-10-07 13:31:36.000000000
> +0200
> +++ 2010-05-04/xen/include/asm-x86/string.h 2010-05-11 10:21:04.000000000
> +0200
> @@ -16,6 +16,11 @@ static inline void *__variable_memcpy(vo
> return to;
> }
>
> +#define __HAVE_ARCH_MEMCPY
> +#if defined(__i386__) && __GNUC__ >= 4
> +#define memcpy(t, f, n) __builtin_memcpy(t, f, n)
> +#else
> +
> /*
> * This looks horribly ugly, but the compiler can optimize it totally,
> * as the count is constant.
> @@ -95,7 +100,6 @@ static always_inline void * __constant_m
> return to;
> }
>
> -#define __HAVE_ARCH_MEMCPY
> /* align source to a 64-bit boundary */
> static always_inline
> void *__var_memcpy(void *t, const void *f, size_t n)
> @@ -121,11 +125,13 @@ void *__memcpy(void *t, const void *f, s
> __var_memcpy((t),(f),(n)));
> }
>
> +#endif /* !__i386__ || __GNUC__ < 4 */
> +
> /* Some version of gcc don't have this builtin. It's non-critical anyway. */
> #define __HAVE_ARCH_MEMMOVE
> extern void *memmove(void *dest, const void *src, size_t n);
>
> -static inline void *__memset_generic(void *s, char c, size_t count)
> +static inline void *__memset_generic(void *s, int c, size_t count)
> {
> long d0, d1;
> __asm__ __volatile__ (
> @@ -134,6 +140,11 @@ static inline void *__memset_generic(voi
> return s;
> }
>
> +#define __HAVE_ARCH_MEMSET
> +#if defined(__i386__) && __GNUC__ >= 4
> +#define memset(s, c, n) __builtin_memset(s, c, n)
> +#else
> +
> /* we might want to write optimized versions of these later */
> #define __constant_count_memset(s,c,count) __memset_generic((s),(c),(count))
>
> @@ -238,11 +249,12 @@ static always_inline void *__constant_c_
> #define MEMSET_PATTERN_MUL 0x01010101UL
> #endif
>
> -#define __HAVE_ARCH_MEMSET
> #define memset(s, c, count) (__memset((s),(c),(count)))
> #define __memset(s, c, count) \
> (__builtin_constant_p(c) ? \
> __constant_c_x_memset((s),(MEMSET_PATTERN_MUL*(unsigned char)(c)),(count)) :
> \
> __var_x_memset((s),(c),(count)))
>
> +#endif /* !__i386__ || __GNUC__ < 4 */
> +
> #endif /* __X86_STRING_H__ */
>
>
>
[-- Attachment #2: 00-string --]
[-- Type: application/octet-stream, Size: 9162 bytes --]
diff -r 60b3417b1499 xen/arch/x86/string.c
--- a/xen/arch/x86/string.c Tue May 11 11:23:54 2010 +0100
+++ b/xen/arch/x86/string.c Tue May 11 11:37:09 2010 +0100
@@ -14,25 +14,12 @@
long d0, d1, d2;
asm volatile (
-#ifdef __i386__
- " rep movsl ; "
-#else
- " rep movsq ; "
- " testb $4,%b4 ; "
- " je 0f ; "
- " movsl ; "
- "0: ; "
-#endif
- " testb $2,%b4 ; "
- " je 1f ; "
- " movsw ; "
- "1: testb $1,%b4 ; "
- " je 2f ; "
- " movsb ; "
- "2: "
+ " rep ; movs"__OS" ; "
+ " mov %4,%3 ; "
+ " rep ; movsb "
: "=&c" (d0), "=&D" (d1), "=&S" (d2)
- : "0" (n/sizeof(long)), "q" (n), "1" (dest), "2" (src)
- : "memory");
+ : "0" (n/BYTES_PER_LONG), "r" (n%BYTES_PER_LONG), "1" (dest), "2" (src)
+ : "memory" );
return dest;
}
@@ -55,7 +42,7 @@
void *memmove(void *dest, const void *src, size_t n)
{
long d0, d1, d2;
-
+
if ( dest < src )
return memcpy(dest, src, n);
diff -r 60b3417b1499 xen/include/asm-x86/string.h
--- a/xen/include/asm-x86/string.h Tue May 11 11:23:54 2010 +0100
+++ b/xen/include/asm-x86/string.h Tue May 11 11:37:09 2010 +0100
@@ -3,246 +3,14 @@
#include <xen/config.h>
-static inline void *__variable_memcpy(void *to, const void *from, size_t n)
-{
- long d0, d1, d2;
- __asm__ __volatile__ (
- " rep ; movs"__OS"\n"
- " mov %4,%3 \n"
- " rep ; movsb \n"
- : "=&c" (d0), "=&D" (d1), "=&S" (d2)
- : "0" (n/BYTES_PER_LONG), "r" (n%BYTES_PER_LONG), "1" (to), "2" (from)
- : "memory" );
- return to;
-}
+#define __HAVE_ARCH_MEMCPY
+#define memcpy(t,f,n) (__builtin_memcpy((t),(f),(n)))
-/*
- * This looks horribly ugly, but the compiler can optimize it totally,
- * as the count is constant.
- */
-static always_inline void * __constant_memcpy(
- void * to, const void * from, size_t n)
-{
- switch ( n )
- {
- case 0:
- return to;
- case 1:
- *(u8 *)to = *(const u8 *)from;
- return to;
- case 2:
- *(u16 *)to = *(const u16 *)from;
- return to;
- case 3:
- *(u16 *)to = *(const u16 *)from;
- *(2+(u8 *)to) = *(2+(const u8 *)from);
- return to;
- case 4:
- *(u32 *)to = *(const u32 *)from;
- return to;
- case 5:
- *(u32 *)to = *(const u32 *)from;
- *(4+(u8 *)to) = *(4+(const u8 *)from);
- return to;
- case 6:
- *(u32 *)to = *(const u32 *)from;
- *(2+(u16 *)to) = *(2+(const u16 *)from);
- return to;
- case 7:
- *(u32 *)to = *(const u32 *)from;
- *(2+(u16 *)to) = *(2+(const u16 *)from);
- *(6+(u8 *)to) = *(6+(const u8 *)from);
- return to;
- case 8:
- *(u64 *)to = *(const u64 *)from;
- return to;
- case 12:
- *(u64 *)to = *(const u64 *)from;
- *(2+(u32 *)to) = *(2+(const u32 *)from);
- return to;
- case 16:
- *(u64 *)to = *(const u64 *)from;
- *(1+(u64 *)to) = *(1+(const u64 *)from);
- return to;
- case 20:
- *(u64 *)to = *(const u64 *)from;
- *(1+(u64 *)to) = *(1+(const u64 *)from);
- *(4+(u32 *)to) = *(4+(const u32 *)from);
- return to;
- }
-#define COMMON(x) \
- __asm__ __volatile__ ( \
- "rep ; movs"__OS \
- x \
- : "=&c" (d0), "=&D" (d1), "=&S" (d2) \
- : "0" (n/BYTES_PER_LONG), "1" (to), "2" (from) \
- : "memory" );
- {
- long d0, d1, d2;
- switch ( n % BYTES_PER_LONG )
- {
- case 0: COMMON(""); return to;
- case 1: COMMON("\n\tmovsb"); return to;
- case 2: COMMON("\n\tmovsw"); return to;
- case 3: COMMON("\n\tmovsw\n\tmovsb"); return to;
- case 4: COMMON("\n\tmovsl"); return to;
- case 5: COMMON("\n\tmovsl\n\tmovsb"); return to;
- case 6: COMMON("\n\tmovsl\n\tmovsw"); return to;
- case 7: COMMON("\n\tmovsl\n\tmovsw\n\tmovsb"); return to;
- }
- }
-#undef COMMON
- return to;
-}
-
-#define __HAVE_ARCH_MEMCPY
-/* align source to a 64-bit boundary */
-static always_inline
-void *__var_memcpy(void *t, const void *f, size_t n)
-{
- int off = (unsigned long)f & 0x7;
- /* just do alignment if needed and if size is worth */
- if ( (n > 32) && off ) {
- size_t n1 = 8 - off;
- __variable_memcpy(t, f, n1);
- __variable_memcpy(t + n1, f + n1, n - n1);
- return t;
- } else {
- return (__variable_memcpy(t, f, n));
- }
-}
-
-#define memcpy(t,f,n) (__memcpy((t),(f),(n)))
-static always_inline
-void *__memcpy(void *t, const void *f, size_t n)
-{
- return (__builtin_constant_p(n) ?
- __constant_memcpy((t),(f),(n)) :
- __var_memcpy((t),(f),(n)));
-}
-
-/* Some version of gcc don't have this builtin. It's non-critical anyway. */
+/* Some versions of gcc don't have this builtin. It's non-critical anyway. */
#define __HAVE_ARCH_MEMMOVE
extern void *memmove(void *dest, const void *src, size_t n);
-static inline void *__memset_generic(void *s, char c, size_t count)
-{
- long d0, d1;
- __asm__ __volatile__ (
- "rep ; stosb"
- : "=&c" (d0), "=&D" (d1) : "a" (c), "1" (s), "0" (count) : "memory" );
- return s;
-}
-
-/* we might want to write optimized versions of these later */
-#define __constant_count_memset(s,c,count) __memset_generic((s),(c),(count))
-
-/*
- * memset(x,0,y) is a reasonably common thing to do, so we want to fill
- * things 32 bits at a time even when we don't know the size of the
- * area at compile-time..
- */
-static inline void *__constant_c_memset(void *s, unsigned long c, size_t count)
-{
- long d0, d1;
- __asm__ __volatile__(
- " rep ; stos"__OS"\n"
- " mov %3,%4 \n"
- " rep ; stosb \n"
- : "=&c" (d0), "=&D" (d1)
- : "a" (c), "r" (count%BYTES_PER_LONG),
- "0" (count/BYTES_PER_LONG), "1" (s)
- : "memory" );
- return s;
-}
-
-/*
- * This looks horribly ugly, but the compiler can optimize it totally,
- * as we by now know that both pattern and count is constant..
- */
-static always_inline void *__constant_c_and_count_memset(
- void *s, unsigned long pattern, size_t count)
-{
- switch ( count )
- {
- case 0:
- return s;
- case 1:
- *(u8 *)s = pattern;
- return s;
- case 2:
- *(u16 *)s = pattern;
- return s;
- case 3:
- *(u16 *)s = pattern;
- *(2+(u8 *)s) = pattern;
- return s;
- case 4:
- *(u32 *)s = pattern;
- return s;
- case 5:
- *(u32 *)s = pattern;
- *(4+(u8 *)s) = pattern;
- return s;
- case 6:
- *(u32 *)s = pattern;
- *(2+(u16 *)s) = pattern;
- return s;
- case 7:
- *(u32 *)s = pattern;
- *(2+(u16 *)s) = pattern;
- *(6+(u8 *)s) = pattern;
- return s;
- case 8:
- *(u64 *)s = pattern;
- return s;
- }
-#define COMMON(x) \
- __asm__ __volatile__ ( \
- "rep ; stos"__OS \
- x \
- : "=&c" (d0), "=&D" (d1) \
- : "a" (pattern), "0" (count/BYTES_PER_LONG), "1" (s) \
- : "memory" )
- {
- long d0, d1;
- switch ( count % BYTES_PER_LONG )
- {
- case 0: COMMON(""); return s;
- case 1: COMMON("\n\tstosb"); return s;
- case 2: COMMON("\n\tstosw"); return s;
- case 3: COMMON("\n\tstosw\n\tstosb"); return s;
- case 4: COMMON("\n\tstosl"); return s;
- case 5: COMMON("\n\tstosl\n\tstosb"); return s;
- case 6: COMMON("\n\tstosl\n\tstosw"); return s;
- case 7: COMMON("\n\tstosl\n\tstosw\n\tstosb"); return s;
- }
- }
-#undef COMMON
- return s;
-}
-
-#define __constant_c_x_memset(s, c, count) \
-(__builtin_constant_p(count) ? \
- __constant_c_and_count_memset((s),(c),(count)) : \
- __constant_c_memset((s),(c),(count)))
-
-#define __var_x_memset(s, c, count) \
-(__builtin_constant_p(count) ? \
- __constant_count_memset((s),(c),(count)) : \
- __memset_generic((s),(c),(count)))
-
-#ifdef CONFIG_X86_64
-#define MEMSET_PATTERN_MUL 0x0101010101010101UL
-#else
-#define MEMSET_PATTERN_MUL 0x01010101UL
-#endif
-
#define __HAVE_ARCH_MEMSET
-#define memset(s, c, count) (__memset((s),(c),(count)))
-#define __memset(s, c, count) \
-(__builtin_constant_p(c) ? \
- __constant_c_x_memset((s),(MEMSET_PATTERN_MUL*(unsigned char)(c)),(count)) : \
- __var_x_memset((s),(c),(count)))
+#define memset(s,c,n) (__builtin_memset((s),(c),(n)))
#endif /* __X86_STRING_H__ */
[-- Attachment #3: Type: text/plain, Size: 138 bytes --]
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH] x86-32: use __builtin_{memcpy,memset}
2010-05-11 10:39 ` Keir Fraser
@ 2010-05-11 12:04 ` Keir Fraser
2010-05-11 12:33 ` Jan Beulich
0 siblings, 1 reply; 4+ messages in thread
From: Keir Fraser @ 2010-05-11 12:04 UTC (permalink / raw)
To: Jan Beulich, xen-devel@lists.xensource.com; +Cc: Charles Arnold
[-- Attachment #1: Type: text/plain, Size: 840 bytes --]
On 11/05/2010 11:39, "Keir Fraser" <keir.fraser@eu.citrix.com> wrote:
> Hm, well, I hate having to change this stuff as we always seem to end up
> broken on some gcc or other. But otoh this will eliminate a bunch of code if
> we do this unconditionally, and I'm particularly not keen on doing this only
> for x86-32 and particular versions of gcc. I suggest the attached patch: it
> should work fine so long as all our supported versions of gcc have
> __builtin_memcpy and __builtin_memset. Given we nowadays only support GCC
> 3.4+, I imagine we are okay in this regard.
>
> What do you think to this alternative patch?
Jan,
What about this alternative version which distinguishes between GCC 4.3+
(which should auto-inline quite sensibly) and earlier versions (which need
help via macro mapping explicitly to __builtin_*)?
-- Keir
[-- Attachment #2: 00-string --]
[-- Type: application/octet-stream, Size: 9628 bytes --]
diff -r 2b5e14e4c5e5 xen/arch/x86/string.c
--- a/xen/arch/x86/string.c Tue May 11 12:37:26 2010 +0100
+++ b/xen/arch/x86/string.c Tue May 11 13:02:40 2010 +0100
@@ -14,25 +14,12 @@
long d0, d1, d2;
asm volatile (
-#ifdef __i386__
- " rep movsl ; "
-#else
- " rep movsq ; "
- " testb $4,%b4 ; "
- " je 0f ; "
- " movsl ; "
- "0: ; "
-#endif
- " testb $2,%b4 ; "
- " je 1f ; "
- " movsw ; "
- "1: testb $1,%b4 ; "
- " je 2f ; "
- " movsb ; "
- "2: "
+ " rep ; movs"__OS" ; "
+ " mov %4,%3 ; "
+ " rep ; movsb "
: "=&c" (d0), "=&D" (d1), "=&S" (d2)
- : "0" (n/sizeof(long)), "q" (n), "1" (dest), "2" (src)
- : "memory");
+ : "0" (n/BYTES_PER_LONG), "r" (n%BYTES_PER_LONG), "1" (dest), "2" (src)
+ : "memory" );
return dest;
}
@@ -55,7 +42,7 @@
void *memmove(void *dest, const void *src, size_t n)
{
long d0, d1, d2;
-
+
if ( dest < src )
return memcpy(dest, src, n);
diff -r 2b5e14e4c5e5 xen/include/asm-x86/string.h
--- a/xen/include/asm-x86/string.h Tue May 11 12:37:26 2010 +0100
+++ b/xen/include/asm-x86/string.h Tue May 11 13:02:40 2010 +0100
@@ -3,246 +3,24 @@
#include <xen/config.h>
-static inline void *__variable_memcpy(void *to, const void *from, size_t n)
-{
- long d0, d1, d2;
- __asm__ __volatile__ (
- " rep ; movs"__OS"\n"
- " mov %4,%3 \n"
- " rep ; movsb \n"
- : "=&c" (d0), "=&D" (d1), "=&S" (d2)
- : "0" (n/BYTES_PER_LONG), "r" (n%BYTES_PER_LONG), "1" (to), "2" (from)
- : "memory" );
- return to;
-}
+#define __HAVE_ARCH_MEMCPY
+#define __HAVE_ARCH_MEMSET
+#define __HAVE_ARCH_MEMMOVE
/*
- * This looks horribly ugly, but the compiler can optimize it totally,
- * as the count is constant.
+ * According to Andi Kleen, GCC earlier than 4.3 does not reliably replace
+ * calls to string functions with good specialised built-in versions. Thus we
+ * explicitly map certain functions to __builtin_*() using macros.
*/
-static always_inline void * __constant_memcpy(
- void * to, const void * from, size_t n)
-{
- switch ( n )
- {
- case 0:
- return to;
- case 1:
- *(u8 *)to = *(const u8 *)from;
- return to;
- case 2:
- *(u16 *)to = *(const u16 *)from;
- return to;
- case 3:
- *(u16 *)to = *(const u16 *)from;
- *(2+(u8 *)to) = *(2+(const u8 *)from);
- return to;
- case 4:
- *(u32 *)to = *(const u32 *)from;
- return to;
- case 5:
- *(u32 *)to = *(const u32 *)from;
- *(4+(u8 *)to) = *(4+(const u8 *)from);
- return to;
- case 6:
- *(u32 *)to = *(const u32 *)from;
- *(2+(u16 *)to) = *(2+(const u16 *)from);
- return to;
- case 7:
- *(u32 *)to = *(const u32 *)from;
- *(2+(u16 *)to) = *(2+(const u16 *)from);
- *(6+(u8 *)to) = *(6+(const u8 *)from);
- return to;
- case 8:
- *(u64 *)to = *(const u64 *)from;
- return to;
- case 12:
- *(u64 *)to = *(const u64 *)from;
- *(2+(u32 *)to) = *(2+(const u32 *)from);
- return to;
- case 16:
- *(u64 *)to = *(const u64 *)from;
- *(1+(u64 *)to) = *(1+(const u64 *)from);
- return to;
- case 20:
- *(u64 *)to = *(const u64 *)from;
- *(1+(u64 *)to) = *(1+(const u64 *)from);
- *(4+(u32 *)to) = *(4+(const u32 *)from);
- return to;
- }
-#define COMMON(x) \
- __asm__ __volatile__ ( \
- "rep ; movs"__OS \
- x \
- : "=&c" (d0), "=&D" (d1), "=&S" (d2) \
- : "0" (n/BYTES_PER_LONG), "1" (to), "2" (from) \
- : "memory" );
- {
- long d0, d1, d2;
- switch ( n % BYTES_PER_LONG )
- {
- case 0: COMMON(""); return to;
- case 1: COMMON("\n\tmovsb"); return to;
- case 2: COMMON("\n\tmovsw"); return to;
- case 3: COMMON("\n\tmovsw\n\tmovsb"); return to;
- case 4: COMMON("\n\tmovsl"); return to;
- case 5: COMMON("\n\tmovsl\n\tmovsb"); return to;
- case 6: COMMON("\n\tmovsl\n\tmovsw"); return to;
- case 7: COMMON("\n\tmovsl\n\tmovsw\n\tmovsb"); return to;
- }
- }
-#undef COMMON
- return to;
-}
+#if (__GNUC__ < 4) || (__GNUC__ == 4 && __GNUC_MINOR__ < 3)
+#define memcpy(t,f,n) (__builtin_memcpy((t),(f),(n)))
+#define memset(s,c,n) (__builtin_memset((s),(c),(n)))
+#else
+extern void *memcpy(void *dest, const void *src, size_t n);
+extern void *memset(void *s, int c, size_t n);
+#endif
-#define __HAVE_ARCH_MEMCPY
-/* align source to a 64-bit boundary */
-static always_inline
-void *__var_memcpy(void *t, const void *f, size_t n)
-{
- int off = (unsigned long)f & 0x7;
- /* just do alignment if needed and if size is worth */
- if ( (n > 32) && off ) {
- size_t n1 = 8 - off;
- __variable_memcpy(t, f, n1);
- __variable_memcpy(t + n1, f + n1, n - n1);
- return t;
- } else {
- return (__variable_memcpy(t, f, n));
- }
-}
-
-#define memcpy(t,f,n) (__memcpy((t),(f),(n)))
-static always_inline
-void *__memcpy(void *t, const void *f, size_t n)
-{
- return (__builtin_constant_p(n) ?
- __constant_memcpy((t),(f),(n)) :
- __var_memcpy((t),(f),(n)));
-}
-
-/* Some version of gcc don't have this builtin. It's non-critical anyway. */
-#define __HAVE_ARCH_MEMMOVE
+/* Some versions of gcc don't have this built in. It's non-critical anyway. */
extern void *memmove(void *dest, const void *src, size_t n);
-static inline void *__memset_generic(void *s, char c, size_t count)
-{
- long d0, d1;
- __asm__ __volatile__ (
- "rep ; stosb"
- : "=&c" (d0), "=&D" (d1) : "a" (c), "1" (s), "0" (count) : "memory" );
- return s;
-}
-
-/* we might want to write optimized versions of these later */
-#define __constant_count_memset(s,c,count) __memset_generic((s),(c),(count))
-
-/*
- * memset(x,0,y) is a reasonably common thing to do, so we want to fill
- * things 32 bits at a time even when we don't know the size of the
- * area at compile-time..
- */
-static inline void *__constant_c_memset(void *s, unsigned long c, size_t count)
-{
- long d0, d1;
- __asm__ __volatile__(
- " rep ; stos"__OS"\n"
- " mov %3,%4 \n"
- " rep ; stosb \n"
- : "=&c" (d0), "=&D" (d1)
- : "a" (c), "r" (count%BYTES_PER_LONG),
- "0" (count/BYTES_PER_LONG), "1" (s)
- : "memory" );
- return s;
-}
-
-/*
- * This looks horribly ugly, but the compiler can optimize it totally,
- * as we by now know that both pattern and count is constant..
- */
-static always_inline void *__constant_c_and_count_memset(
- void *s, unsigned long pattern, size_t count)
-{
- switch ( count )
- {
- case 0:
- return s;
- case 1:
- *(u8 *)s = pattern;
- return s;
- case 2:
- *(u16 *)s = pattern;
- return s;
- case 3:
- *(u16 *)s = pattern;
- *(2+(u8 *)s) = pattern;
- return s;
- case 4:
- *(u32 *)s = pattern;
- return s;
- case 5:
- *(u32 *)s = pattern;
- *(4+(u8 *)s) = pattern;
- return s;
- case 6:
- *(u32 *)s = pattern;
- *(2+(u16 *)s) = pattern;
- return s;
- case 7:
- *(u32 *)s = pattern;
- *(2+(u16 *)s) = pattern;
- *(6+(u8 *)s) = pattern;
- return s;
- case 8:
- *(u64 *)s = pattern;
- return s;
- }
-#define COMMON(x) \
- __asm__ __volatile__ ( \
- "rep ; stos"__OS \
- x \
- : "=&c" (d0), "=&D" (d1) \
- : "a" (pattern), "0" (count/BYTES_PER_LONG), "1" (s) \
- : "memory" )
- {
- long d0, d1;
- switch ( count % BYTES_PER_LONG )
- {
- case 0: COMMON(""); return s;
- case 1: COMMON("\n\tstosb"); return s;
- case 2: COMMON("\n\tstosw"); return s;
- case 3: COMMON("\n\tstosw\n\tstosb"); return s;
- case 4: COMMON("\n\tstosl"); return s;
- case 5: COMMON("\n\tstosl\n\tstosb"); return s;
- case 6: COMMON("\n\tstosl\n\tstosw"); return s;
- case 7: COMMON("\n\tstosl\n\tstosw\n\tstosb"); return s;
- }
- }
-#undef COMMON
- return s;
-}
-
-#define __constant_c_x_memset(s, c, count) \
-(__builtin_constant_p(count) ? \
- __constant_c_and_count_memset((s),(c),(count)) : \
- __constant_c_memset((s),(c),(count)))
-
-#define __var_x_memset(s, c, count) \
-(__builtin_constant_p(count) ? \
- __constant_count_memset((s),(c),(count)) : \
- __memset_generic((s),(c),(count)))
-
-#ifdef CONFIG_X86_64
-#define MEMSET_PATTERN_MUL 0x0101010101010101UL
-#else
-#define MEMSET_PATTERN_MUL 0x01010101UL
-#endif
-
-#define __HAVE_ARCH_MEMSET
-#define memset(s, c, count) (__memset((s),(c),(count)))
-#define __memset(s, c, count) \
-(__builtin_constant_p(c) ? \
- __constant_c_x_memset((s),(MEMSET_PATTERN_MUL*(unsigned char)(c)),(count)) : \
- __var_x_memset((s),(c),(count)))
-
#endif /* __X86_STRING_H__ */
[-- Attachment #3: Type: text/plain, Size: 138 bytes --]
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH] x86-32: use __builtin_{memcpy,memset}
2010-05-11 12:04 ` Keir Fraser
@ 2010-05-11 12:33 ` Jan Beulich
0 siblings, 0 replies; 4+ messages in thread
From: Jan Beulich @ 2010-05-11 12:33 UTC (permalink / raw)
To: Keir Fraser, xen-devel@lists.xensource.com; +Cc: Charles Arnold
>>> Keir Fraser <keir.fraser@eu.citrix.com> 11.05.10 14:04 >>>
>What about this alternative version which distinguishes between GCC 4.3+
>(which should auto-inline quite sensibly) and earlier versions (which need
>help via macro mapping explicitly to __builtin_*)?
I don't think this will do what we expect in the presence of -fno-builtin
(see also my other response).
Jan
^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2010-05-11 12:33 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2010-05-11 9:05 [PATCH] x86-32: use __builtin_{memcpy,memset} Jan Beulich
2010-05-11 10:39 ` Keir Fraser
2010-05-11 12:04 ` Keir Fraser
2010-05-11 12:33 ` Jan Beulich
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).