* [PATCH] x86-32: use __builtin_{memcpy,memset}
@ 2010-05-11  9:05 Jan Beulich
  2010-05-11 10:39 ` Keir Fraser
  0 siblings, 1 reply; 4+ messages in thread
From: Jan Beulich @ 2010-05-11  9:05 UTC (permalink / raw)
  To: xen-devel; +Cc: Charles Arnold

Following a change in Linux 2.6.33, make x86-32 always use
__builtin_mem{cpy,set}() on gcc 4.0+. In particular, this works around
certain intermediate gcc revisions that generate out-of-range-array-index
warnings with the current inline implementation.

It may be worth considering making this the case for x86-64 too.

At the same time eliminate the redundant inline assembly in the C
file, and instead use the inline functions coming from the header.
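
For illustration only (this is not part of the patch), the effect of the
new mapping on callers, assuming typical gcc 4.x code generation, is
roughly:

    #include <stddef.h>

    /* Sketch of the header change on x86-32 with gcc 4.0+. */
    #define memcpy(t, f, n) __builtin_memcpy(t, f, n)

    void example(void *dst, const void *src, size_t n)
    {
        memcpy(dst, src, 16); /* constant size: gcc may open-code this as moves */
        memcpy(dst, src, n);  /* variable size: gcc may emit rep movs or a call */
    }

The compiler decides how each call gets expanded, which is what makes the
hand-rolled __constant_memcpy()/__memcpy() paths unnecessary in this
configuration.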

Signed-off-by: Jan Beulich <jbeulich@novell.com>

--- 2010-05-04.orig/xen/arch/x86/string.c	2007-11-26 16:57:03.000000000 +0100
+++ 2010-05-04/xen/arch/x86/string.c	2010-05-11 09:45:27.000000000 +0200
@@ -11,44 +11,13 @@
 #undef memcpy
 void *memcpy(void *dest, const void *src, size_t n)
 {
-    long d0, d1, d2;
-
-    asm volatile (
-#ifdef __i386__
-        "   rep movsl        ; "
-#else
-        "   rep movsq        ; "
-        "   testb $4,%b4     ; "
-        "   je 0f            ; "
-        "   movsl            ; "
-        "0:                  ; "
-#endif
-        "   testb $2,%b4     ; "
-        "   je 1f            ; "
-        "   movsw            ; "
-        "1: testb $1,%b4     ; "
-        "   je 2f            ; "
-        "   movsb            ; "
-        "2:                    "
-        : "=&c" (d0), "=&D" (d1), "=&S" (d2)
-        : "0" (n/sizeof(long)), "q" (n), "1" (dest), "2" (src)
-        : "memory");
-
-    return dest;
+    return __variable_memcpy(dest, src, n);
 }
 
 #undef memset
 void *memset(void *s, int c, size_t n)
 {
-    long d0, d1;
-
-    asm volatile (
-        "rep stosb"
-        : "=&c" (d0), "=&D" (d1)
-        : "a" (c), "1" (s), "0" (n)
-        : "memory");
-
-    return s;
+    return __memset_generic(s, c, n);
 }
 
 #undef memmove
--- 2010-05-04.orig/xen/include/asm-x86/string.h	2009-10-07 13:31:36.000000000 +0200
+++ 2010-05-04/xen/include/asm-x86/string.h	2010-05-11 10:21:04.000000000 +0200
@@ -16,6 +16,11 @@ static inline void *__variable_memcpy(vo
     return to;
 }
 
+#define __HAVE_ARCH_MEMCPY
+#if defined(__i386__) && __GNUC__ >= 4
+#define memcpy(t, f, n) __builtin_memcpy(t, f, n)
+#else
+
 /*
  * This looks horribly ugly, but the compiler can optimize it totally,
  * as the count is constant.
@@ -95,7 +100,6 @@ static always_inline void * __constant_m
     return to;
 }
 
-#define __HAVE_ARCH_MEMCPY
 /* align source to a 64-bit boundary */
 static always_inline
 void *__var_memcpy(void *t, const void *f, size_t n)
@@ -121,11 +125,13 @@ void *__memcpy(void *t, const void *f, s
             __var_memcpy((t),(f),(n)));
 }
 
+#endif /* !__i386__ || __GNUC__ < 4 */
+
 /* Some version of gcc don't have this builtin. It's non-critical anyway. */
 #define __HAVE_ARCH_MEMMOVE
 extern void *memmove(void *dest, const void *src, size_t n);
 
-static inline void *__memset_generic(void *s, char c, size_t count)
+static inline void *__memset_generic(void *s, int c, size_t count)
 {
     long d0, d1;
     __asm__ __volatile__ (
@@ -134,6 +140,11 @@ static inline void *__memset_generic(voi
     return s;
 }
 
+#define __HAVE_ARCH_MEMSET
+#if defined(__i386__) && __GNUC__ >= 4
+#define memset(s, c, n) __builtin_memset(s, c, n)
+#else
+
 /* we might want to write optimized versions of these later */
 #define __constant_count_memset(s,c,count) __memset_generic((s),(c),(count))
 
@@ -238,11 +249,12 @@ static always_inline void *__constant_c_
 #define MEMSET_PATTERN_MUL 0x01010101UL
 #endif
 
-#define __HAVE_ARCH_MEMSET
 #define memset(s, c, count) (__memset((s),(c),(count)))
 #define __memset(s, c, count) \
 (__builtin_constant_p(c) ? \
  __constant_c_x_memset((s),(MEMSET_PATTERN_MUL*(unsigned char)(c)),(count)) : \
  __var_x_memset((s),(c),(count)))
 
+#endif /* !__i386__ || __GNUC__ < 4 */
+
 #endif /* __X86_STRING_H__ */

* Re: [PATCH] x86-32: use __builtin_{memcpy,memset}
  2010-05-11  9:05 [PATCH] x86-32: use __builtin_{memcpy,memset} Jan Beulich
@ 2010-05-11 10:39 ` Keir Fraser
  2010-05-11 12:04   ` Keir Fraser
  0 siblings, 1 reply; 4+ messages in thread
From: Keir Fraser @ 2010-05-11 10:39 UTC (permalink / raw)
  To: Jan Beulich, xen-devel@lists.xensource.com; +Cc: Charles Arnold

On 11/05/2010 10:05, "Jan Beulich" <JBeulich@novell.com> wrote:

> Following a change in Linux 2.6.33, make x86-32 always use
> __builtin_mem{cpy,set}() on gcc 4.0+. In particular, this works around
> certain intermediate gcc revisions that generate out-of-range-array-index
> warnings with the current inline implementation.
> 
> It may be worth considering making this the case for x86-64 too.
> 
> At the same time eliminate the redundant inline assembly in the C
> file, and instead use the inline functions coming from the header.

Hm, well, I hate having to change this stuff as we always seem to end up
broken on some gcc or other. But otoh this will eliminate a bunch of code if
we do this unconditionally, and I'm especially not keen on doing this only
for x86-32 and particular versions of gcc. I suggest the attached patch: it
should work fine so long as all our supported versions of gcc have
__builtin_memcpy and __builtin_memset. Given we nowadays only support GCC
3.4+, I imagine we are okay in this regard.
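
One way to make that assumption explicit would be a compile-time guard
along these lines (hypothetical, not part of the attached patch):

    /* Refuse to build with a gcc too old to have usable __builtin_mem*(). */
    #if defined(__GNUC__) && \
        (__GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 4))
    # error "Xen requires gcc 3.4 or newer"
    #endif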

What do you think of this alternative patch?

 -- Keir

> Signed-off-by: Jan Beulich <jbeulich@novell.com>
> 
> --- 2010-05-04.orig/xen/arch/x86/string.c 2007-11-26 16:57:03.000000000 +0100
> +++ 2010-05-04/xen/arch/x86/string.c 2010-05-11 09:45:27.000000000 +0200
> @@ -11,44 +11,13 @@
>  #undef memcpy
>  void *memcpy(void *dest, const void *src, size_t n)
>  {
> -    long d0, d1, d2;
> -
> -    asm volatile (
> -#ifdef __i386__
> -        "   rep movsl        ; "
> -#else
> -        "   rep movsq        ; "
> -        "   testb $4,%b4     ; "
> -        "   je 0f            ; "
> -        "   movsl            ; "
> -        "0:                  ; "
> -#endif
> -        "   testb $2,%b4     ; "
> -        "   je 1f            ; "
> -        "   movsw            ; "
> -        "1: testb $1,%b4     ; "
> -        "   je 2f            ; "
> -        "   movsb            ; "
> -        "2:                    "
> -        : "=&c" (d0), "=&D" (d1), "=&S" (d2)
> -        : "0" (n/sizeof(long)), "q" (n), "1" (dest), "2" (src)
> -        : "memory");
> -
> -    return dest;
> +    return __variable_memcpy(dest, src, n);
>  }
>  
>  #undef memset
>  void *memset(void *s, int c, size_t n)
>  {
> -    long d0, d1;
> -
> -    asm volatile (
> -        "rep stosb"
> -        : "=&c" (d0), "=&D" (d1)
> -        : "a" (c), "1" (s), "0" (n)
> -        : "memory");
> -
> -    return s;
> +    return __memset_generic(s, c, n);
>  }
>  
>  #undef memmove
> --- 2010-05-04.orig/xen/include/asm-x86/string.h 2009-10-07 13:31:36.000000000 +0200
> +++ 2010-05-04/xen/include/asm-x86/string.h 2010-05-11 10:21:04.000000000 +0200
> @@ -16,6 +16,11 @@ static inline void *__variable_memcpy(vo
>      return to;
>  }
>  
> +#define __HAVE_ARCH_MEMCPY
> +#if defined(__i386__) && __GNUC__ >= 4
> +#define memcpy(t, f, n) __builtin_memcpy(t, f, n)
> +#else
> +
>  /*
>   * This looks horribly ugly, but the compiler can optimize it totally,
>   * as the count is constant.
> @@ -95,7 +100,6 @@ static always_inline void * __constant_m
>      return to;
>  }
>  
> -#define __HAVE_ARCH_MEMCPY
>  /* align source to a 64-bit boundary */
>  static always_inline
>  void *__var_memcpy(void *t, const void *f, size_t n)
> @@ -121,11 +125,13 @@ void *__memcpy(void *t, const void *f, s
>              __var_memcpy((t),(f),(n)));
>  }
>  
> +#endif /* !__i386__ || __GNUC__ < 4 */
> +
>  /* Some version of gcc don't have this builtin. It's non-critical anyway. */
>  #define __HAVE_ARCH_MEMMOVE
>  extern void *memmove(void *dest, const void *src, size_t n);
>  
> -static inline void *__memset_generic(void *s, char c, size_t count)
> +static inline void *__memset_generic(void *s, int c, size_t count)
>  {
>      long d0, d1;
>      __asm__ __volatile__ (
> @@ -134,6 +140,11 @@ static inline void *__memset_generic(voi
>      return s;
>  }
>  
> +#define __HAVE_ARCH_MEMSET
> +#if defined(__i386__) && __GNUC__ >= 4
> +#define memset(s, c, n) __builtin_memset(s, c, n)
> +#else
> +
>  /* we might want to write optimized versions of these later */
>  #define __constant_count_memset(s,c,count) __memset_generic((s),(c),(count))
>  
> @@ -238,11 +249,12 @@ static always_inline void *__constant_c_
>  #define MEMSET_PATTERN_MUL 0x01010101UL
>  #endif
>  
> -#define __HAVE_ARCH_MEMSET
>  #define memset(s, c, count) (__memset((s),(c),(count)))
>  #define __memset(s, c, count) \
>  (__builtin_constant_p(c) ? \
>  __constant_c_x_memset((s),(MEMSET_PATTERN_MUL*(unsigned char)(c)),(count)) : \
>   __var_x_memset((s),(c),(count)))
>  
> +#endif /* !__i386__ || __GNUC__ < 4 */
> +
>  #endif /* __X86_STRING_H__ */
> 
> 
> 


[-- Attachment #2: 00-string --]
[-- Type: application/octet-stream, Size: 9162 bytes --]

diff -r 60b3417b1499 xen/arch/x86/string.c
--- a/xen/arch/x86/string.c	Tue May 11 11:23:54 2010 +0100
+++ b/xen/arch/x86/string.c	Tue May 11 11:37:09 2010 +0100
@@ -14,25 +14,12 @@
     long d0, d1, d2;
 
     asm volatile (
-#ifdef __i386__
-        "   rep movsl        ; "
-#else
-        "   rep movsq        ; "
-        "   testb $4,%b4     ; "
-        "   je 0f            ; "
-        "   movsl            ; "
-        "0:                  ; "
-#endif
-        "   testb $2,%b4     ; "
-        "   je 1f            ; "
-        "   movsw            ; "
-        "1: testb $1,%b4     ; "
-        "   je 2f            ; "
-        "   movsb            ; "
-        "2:                    "
+        "   rep ; movs"__OS" ; "
+        "   mov %4,%3        ; "
+        "   rep ; movsb        "
         : "=&c" (d0), "=&D" (d1), "=&S" (d2)
-        : "0" (n/sizeof(long)), "q" (n), "1" (dest), "2" (src)
-        : "memory");
+        : "0" (n/BYTES_PER_LONG), "r" (n%BYTES_PER_LONG), "1" (dest), "2" (src)
+        : "memory" );
 
     return dest;
 }
@@ -55,7 +42,7 @@
 void *memmove(void *dest, const void *src, size_t n)
 {
     long d0, d1, d2;
- 
+
     if ( dest < src )
         return memcpy(dest, src, n);
 
diff -r 60b3417b1499 xen/include/asm-x86/string.h
--- a/xen/include/asm-x86/string.h	Tue May 11 11:23:54 2010 +0100
+++ b/xen/include/asm-x86/string.h	Tue May 11 11:37:09 2010 +0100
@@ -3,246 +3,14 @@
 
 #include <xen/config.h>
 
-static inline void *__variable_memcpy(void *to, const void *from, size_t n)
-{
-    long d0, d1, d2;
-    __asm__ __volatile__ (
-        "   rep ; movs"__OS"\n"
-        "   mov %4,%3       \n"
-        "   rep ; movsb     \n"
-        : "=&c" (d0), "=&D" (d1), "=&S" (d2)
-        : "0" (n/BYTES_PER_LONG), "r" (n%BYTES_PER_LONG), "1" (to), "2" (from)
-        : "memory" );
-    return to;
-}
+#define __HAVE_ARCH_MEMCPY
+#define memcpy(t,f,n) (__builtin_memcpy((t),(f),(n)))
 
-/*
- * This looks horribly ugly, but the compiler can optimize it totally,
- * as the count is constant.
- */
-static always_inline void * __constant_memcpy(
-    void * to, const void * from, size_t n)
-{
-    switch ( n )
-    {
-    case 0:
-        return to;
-    case 1:
-        *(u8 *)to = *(const u8 *)from;
-        return to;
-    case 2:
-        *(u16 *)to = *(const u16 *)from;
-        return to;
-    case 3:
-        *(u16 *)to = *(const u16 *)from;
-        *(2+(u8 *)to) = *(2+(const u8 *)from);
-        return to;
-    case 4:
-        *(u32 *)to = *(const u32 *)from;
-        return to;
-    case 5:
-        *(u32 *)to = *(const u32 *)from;
-        *(4+(u8 *)to) = *(4+(const u8 *)from);
-        return to;
-    case 6:
-        *(u32 *)to = *(const u32 *)from;
-        *(2+(u16 *)to) = *(2+(const u16 *)from);
-        return to;
-    case 7:
-        *(u32 *)to = *(const u32 *)from;
-        *(2+(u16 *)to) = *(2+(const u16 *)from);
-        *(6+(u8 *)to) = *(6+(const u8 *)from);
-        return to;
-    case 8:
-        *(u64 *)to = *(const u64 *)from;
-        return to;
-    case 12:
-        *(u64 *)to = *(const u64 *)from;
-        *(2+(u32 *)to) = *(2+(const u32 *)from);
-        return to;
-    case 16:
-        *(u64 *)to = *(const u64 *)from;
-        *(1+(u64 *)to) = *(1+(const u64 *)from);
-        return to;
-    case 20:
-        *(u64 *)to = *(const u64 *)from;
-        *(1+(u64 *)to) = *(1+(const u64 *)from);
-        *(4+(u32 *)to) = *(4+(const u32 *)from);
-        return to;
-    }
-#define COMMON(x)                                       \
-    __asm__ __volatile__ (                              \
-        "rep ; movs"__OS                                \
-        x                                               \
-        : "=&c" (d0), "=&D" (d1), "=&S" (d2)            \
-        : "0" (n/BYTES_PER_LONG), "1" (to), "2" (from)  \
-        : "memory" );
-    {
-        long d0, d1, d2;
-        switch ( n % BYTES_PER_LONG )
-        {
-        case 0: COMMON(""); return to;
-        case 1: COMMON("\n\tmovsb"); return to;
-        case 2: COMMON("\n\tmovsw"); return to;
-        case 3: COMMON("\n\tmovsw\n\tmovsb"); return to;
-        case 4: COMMON("\n\tmovsl"); return to;
-        case 5: COMMON("\n\tmovsl\n\tmovsb"); return to;
-        case 6: COMMON("\n\tmovsl\n\tmovsw"); return to;
-        case 7: COMMON("\n\tmovsl\n\tmovsw\n\tmovsb"); return to;
-        }
-    }
-#undef COMMON
-    return to;
-}
-
-#define __HAVE_ARCH_MEMCPY
-/* align source to a 64-bit boundary */
-static always_inline
-void *__var_memcpy(void *t, const void *f, size_t n)
-{
-    int off = (unsigned long)f & 0x7;
-    /* just do alignment if needed and if size is worth */
-    if ( (n > 32) && off ) {
-        size_t n1 = 8 - off;
-        __variable_memcpy(t, f, n1);
-        __variable_memcpy(t + n1, f + n1, n - n1);
-        return t;
-    } else {
-            return (__variable_memcpy(t, f, n));
-    }
-}
-
-#define memcpy(t,f,n) (__memcpy((t),(f),(n)))
-static always_inline
-void *__memcpy(void *t, const void *f, size_t n)
-{
-    return (__builtin_constant_p(n) ?
-            __constant_memcpy((t),(f),(n)) :
-            __var_memcpy((t),(f),(n)));
-}
-
-/* Some version of gcc don't have this builtin. It's non-critical anyway. */
+/* Some versions of gcc don't have this builtin. It's non-critical anyway. */
 #define __HAVE_ARCH_MEMMOVE
 extern void *memmove(void *dest, const void *src, size_t n);
 
-static inline void *__memset_generic(void *s, char c, size_t count)
-{
-    long d0, d1;
-    __asm__ __volatile__ (
-        "rep ; stosb"
-        : "=&c" (d0), "=&D" (d1) : "a" (c), "1" (s), "0" (count) : "memory" );
-    return s;
-}
-
-/* we might want to write optimized versions of these later */
-#define __constant_count_memset(s,c,count) __memset_generic((s),(c),(count))
-
-/*
- * memset(x,0,y) is a reasonably common thing to do, so we want to fill
- * things 32 bits at a time even when we don't know the size of the
- * area at compile-time..
- */
-static inline void *__constant_c_memset(void *s, unsigned long c, size_t count)
-{
-    long d0, d1;
-    __asm__ __volatile__(
-        "   rep ; stos"__OS"\n"
-        "   mov  %3,%4      \n"
-        "   rep ; stosb     \n"
-        : "=&c" (d0), "=&D" (d1)
-        : "a" (c), "r" (count%BYTES_PER_LONG),
-          "0" (count/BYTES_PER_LONG), "1" (s)
-        : "memory" );
-    return s;
-}
-
-/*
- * This looks horribly ugly, but the compiler can optimize it totally,
- * as we by now know that both pattern and count is constant..
- */
-static always_inline void *__constant_c_and_count_memset(
-    void *s, unsigned long pattern, size_t count)
-{
-    switch ( count )
-    {
-    case 0:
-        return s;
-    case 1:
-        *(u8 *)s = pattern;
-        return s;
-    case 2:
-        *(u16 *)s = pattern;
-        return s;
-    case 3:
-        *(u16 *)s = pattern;
-        *(2+(u8 *)s) = pattern;
-        return s;
-    case 4:
-        *(u32 *)s = pattern;
-        return s;
-    case 5:
-        *(u32 *)s = pattern;
-        *(4+(u8 *)s) = pattern;
-        return s;
-    case 6:
-        *(u32 *)s = pattern;
-        *(2+(u16 *)s) = pattern;
-        return s;
-    case 7:
-        *(u32 *)s = pattern;
-        *(2+(u16 *)s) = pattern;
-        *(6+(u8 *)s) = pattern;
-        return s;
-    case 8:
-        *(u64 *)s = pattern;
-        return s;
-    }
-#define COMMON(x)                                               \
-    __asm__  __volatile__ (                                     \
-        "rep ; stos"__OS                                        \
-        x                                                       \
-        : "=&c" (d0), "=&D" (d1)                                \
-        : "a" (pattern), "0" (count/BYTES_PER_LONG), "1" (s)    \
-        : "memory" )
-    {
-        long d0, d1;
-        switch ( count % BYTES_PER_LONG )
-        {
-        case 0: COMMON(""); return s;
-        case 1: COMMON("\n\tstosb"); return s;
-        case 2: COMMON("\n\tstosw"); return s;
-        case 3: COMMON("\n\tstosw\n\tstosb"); return s;
-        case 4: COMMON("\n\tstosl"); return s;
-        case 5: COMMON("\n\tstosl\n\tstosb"); return s;
-        case 6: COMMON("\n\tstosl\n\tstosw"); return s;
-        case 7: COMMON("\n\tstosl\n\tstosw\n\tstosb"); return s;
-        }
-    }
-#undef COMMON
-    return s;
-}
-
-#define __constant_c_x_memset(s, c, count) \
-(__builtin_constant_p(count) ? \
- __constant_c_and_count_memset((s),(c),(count)) : \
- __constant_c_memset((s),(c),(count)))
-
-#define __var_x_memset(s, c, count) \
-(__builtin_constant_p(count) ? \
- __constant_count_memset((s),(c),(count)) : \
- __memset_generic((s),(c),(count)))
-
-#ifdef CONFIG_X86_64
-#define MEMSET_PATTERN_MUL 0x0101010101010101UL
-#else
-#define MEMSET_PATTERN_MUL 0x01010101UL
-#endif
-
 #define __HAVE_ARCH_MEMSET
-#define memset(s, c, count) (__memset((s),(c),(count)))
-#define __memset(s, c, count) \
-(__builtin_constant_p(c) ? \
- __constant_c_x_memset((s),(MEMSET_PATTERN_MUL*(unsigned char)(c)),(count)) : \
- __var_x_memset((s),(c),(count)))
+#define memset(s,c,n) (__builtin_memset((s),(c),(n)))
 
 #endif /* __X86_STRING_H__ */


* Re: [PATCH] x86-32: use __builtin_{memcpy,memset}
  2010-05-11 10:39 ` Keir Fraser
@ 2010-05-11 12:04   ` Keir Fraser
  2010-05-11 12:33     ` Jan Beulich
  0 siblings, 1 reply; 4+ messages in thread
From: Keir Fraser @ 2010-05-11 12:04 UTC (permalink / raw)
  To: Jan Beulich, xen-devel@lists.xensource.com; +Cc: Charles Arnold

On 11/05/2010 11:39, "Keir Fraser" <keir.fraser@eu.citrix.com> wrote:

> Hm, well, I hate having to change this stuff as we always seem to end up
> broken on some gcc or other. But otoh this will eliminate a bunch of code if
> we do this unconditionally, and I'm especially not keen on doing this only
> for x86-32 and particular versions of gcc. I suggest the attached patch: it
> should work fine so long as all our supported versions of gcc have
> __builtin_memcpy and __builtin_memset. Given we nowadays only support GCC
> 3.4+, I imagine we are okay in this regard.
> 
> What do you think of this alternative patch?

Jan,

What about this alternative version, which distinguishes between GCC 4.3+
(which should auto-inline quite sensibly) and earlier versions (which need
explicit help via macros mapping to __builtin_*)?

 -- Keir


[-- Attachment #2: 00-string --]
[-- Type: application/octet-stream, Size: 9628 bytes --]

diff -r 2b5e14e4c5e5 xen/arch/x86/string.c
--- a/xen/arch/x86/string.c	Tue May 11 12:37:26 2010 +0100
+++ b/xen/arch/x86/string.c	Tue May 11 13:02:40 2010 +0100
@@ -14,25 +14,12 @@
     long d0, d1, d2;
 
     asm volatile (
-#ifdef __i386__
-        "   rep movsl        ; "
-#else
-        "   rep movsq        ; "
-        "   testb $4,%b4     ; "
-        "   je 0f            ; "
-        "   movsl            ; "
-        "0:                  ; "
-#endif
-        "   testb $2,%b4     ; "
-        "   je 1f            ; "
-        "   movsw            ; "
-        "1: testb $1,%b4     ; "
-        "   je 2f            ; "
-        "   movsb            ; "
-        "2:                    "
+        "   rep ; movs"__OS" ; "
+        "   mov %4,%3        ; "
+        "   rep ; movsb        "
         : "=&c" (d0), "=&D" (d1), "=&S" (d2)
-        : "0" (n/sizeof(long)), "q" (n), "1" (dest), "2" (src)
-        : "memory");
+        : "0" (n/BYTES_PER_LONG), "r" (n%BYTES_PER_LONG), "1" (dest), "2" (src)
+        : "memory" );
 
     return dest;
 }
@@ -55,7 +42,7 @@
 void *memmove(void *dest, const void *src, size_t n)
 {
     long d0, d1, d2;
- 
+
     if ( dest < src )
         return memcpy(dest, src, n);
 
diff -r 2b5e14e4c5e5 xen/include/asm-x86/string.h
--- a/xen/include/asm-x86/string.h	Tue May 11 12:37:26 2010 +0100
+++ b/xen/include/asm-x86/string.h	Tue May 11 13:02:40 2010 +0100
@@ -3,246 +3,24 @@
 
 #include <xen/config.h>
 
-static inline void *__variable_memcpy(void *to, const void *from, size_t n)
-{
-    long d0, d1, d2;
-    __asm__ __volatile__ (
-        "   rep ; movs"__OS"\n"
-        "   mov %4,%3       \n"
-        "   rep ; movsb     \n"
-        : "=&c" (d0), "=&D" (d1), "=&S" (d2)
-        : "0" (n/BYTES_PER_LONG), "r" (n%BYTES_PER_LONG), "1" (to), "2" (from)
-        : "memory" );
-    return to;
-}
+#define __HAVE_ARCH_MEMCPY
+#define __HAVE_ARCH_MEMSET
+#define __HAVE_ARCH_MEMMOVE
 
 /*
- * This looks horribly ugly, but the compiler can optimize it totally,
- * as the count is constant.
+ * According to Andi Kleen, GCC earlier than 4.3 does not reliably replace
+ * calls to string functions with good specialised built-in versions. Thus we
+ * explicitly map certain functions to __builtin_*() using macros.
  */
-static always_inline void * __constant_memcpy(
-    void * to, const void * from, size_t n)
-{
-    switch ( n )
-    {
-    case 0:
-        return to;
-    case 1:
-        *(u8 *)to = *(const u8 *)from;
-        return to;
-    case 2:
-        *(u16 *)to = *(const u16 *)from;
-        return to;
-    case 3:
-        *(u16 *)to = *(const u16 *)from;
-        *(2+(u8 *)to) = *(2+(const u8 *)from);
-        return to;
-    case 4:
-        *(u32 *)to = *(const u32 *)from;
-        return to;
-    case 5:
-        *(u32 *)to = *(const u32 *)from;
-        *(4+(u8 *)to) = *(4+(const u8 *)from);
-        return to;
-    case 6:
-        *(u32 *)to = *(const u32 *)from;
-        *(2+(u16 *)to) = *(2+(const u16 *)from);
-        return to;
-    case 7:
-        *(u32 *)to = *(const u32 *)from;
-        *(2+(u16 *)to) = *(2+(const u16 *)from);
-        *(6+(u8 *)to) = *(6+(const u8 *)from);
-        return to;
-    case 8:
-        *(u64 *)to = *(const u64 *)from;
-        return to;
-    case 12:
-        *(u64 *)to = *(const u64 *)from;
-        *(2+(u32 *)to) = *(2+(const u32 *)from);
-        return to;
-    case 16:
-        *(u64 *)to = *(const u64 *)from;
-        *(1+(u64 *)to) = *(1+(const u64 *)from);
-        return to;
-    case 20:
-        *(u64 *)to = *(const u64 *)from;
-        *(1+(u64 *)to) = *(1+(const u64 *)from);
-        *(4+(u32 *)to) = *(4+(const u32 *)from);
-        return to;
-    }
-#define COMMON(x)                                       \
-    __asm__ __volatile__ (                              \
-        "rep ; movs"__OS                                \
-        x                                               \
-        : "=&c" (d0), "=&D" (d1), "=&S" (d2)            \
-        : "0" (n/BYTES_PER_LONG), "1" (to), "2" (from)  \
-        : "memory" );
-    {
-        long d0, d1, d2;
-        switch ( n % BYTES_PER_LONG )
-        {
-        case 0: COMMON(""); return to;
-        case 1: COMMON("\n\tmovsb"); return to;
-        case 2: COMMON("\n\tmovsw"); return to;
-        case 3: COMMON("\n\tmovsw\n\tmovsb"); return to;
-        case 4: COMMON("\n\tmovsl"); return to;
-        case 5: COMMON("\n\tmovsl\n\tmovsb"); return to;
-        case 6: COMMON("\n\tmovsl\n\tmovsw"); return to;
-        case 7: COMMON("\n\tmovsl\n\tmovsw\n\tmovsb"); return to;
-        }
-    }
-#undef COMMON
-    return to;
-}
+#if (__GNUC__ < 4) || (__GNUC__ == 4 && __GNUC_MINOR__ < 3)
+#define memcpy(t,f,n) (__builtin_memcpy((t),(f),(n)))
+#define memset(s,c,n) (__builtin_memset((s),(c),(n)))
+#else
+extern void *memcpy(void *dest, const void *src, size_t n);
+extern void *memset(void *s, int c, size_t n);
+#endif
 
-#define __HAVE_ARCH_MEMCPY
-/* align source to a 64-bit boundary */
-static always_inline
-void *__var_memcpy(void *t, const void *f, size_t n)
-{
-    int off = (unsigned long)f & 0x7;
-    /* just do alignment if needed and if size is worth */
-    if ( (n > 32) && off ) {
-        size_t n1 = 8 - off;
-        __variable_memcpy(t, f, n1);
-        __variable_memcpy(t + n1, f + n1, n - n1);
-        return t;
-    } else {
-            return (__variable_memcpy(t, f, n));
-    }
-}
-
-#define memcpy(t,f,n) (__memcpy((t),(f),(n)))
-static always_inline
-void *__memcpy(void *t, const void *f, size_t n)
-{
-    return (__builtin_constant_p(n) ?
-            __constant_memcpy((t),(f),(n)) :
-            __var_memcpy((t),(f),(n)));
-}
-
-/* Some version of gcc don't have this builtin. It's non-critical anyway. */
-#define __HAVE_ARCH_MEMMOVE
+/* Some versions of gcc don't have this built in. It's non-critical anyway. */
 extern void *memmove(void *dest, const void *src, size_t n);
 
-static inline void *__memset_generic(void *s, char c, size_t count)
-{
-    long d0, d1;
-    __asm__ __volatile__ (
-        "rep ; stosb"
-        : "=&c" (d0), "=&D" (d1) : "a" (c), "1" (s), "0" (count) : "memory" );
-    return s;
-}
-
-/* we might want to write optimized versions of these later */
-#define __constant_count_memset(s,c,count) __memset_generic((s),(c),(count))
-
-/*
- * memset(x,0,y) is a reasonably common thing to do, so we want to fill
- * things 32 bits at a time even when we don't know the size of the
- * area at compile-time..
- */
-static inline void *__constant_c_memset(void *s, unsigned long c, size_t count)
-{
-    long d0, d1;
-    __asm__ __volatile__(
-        "   rep ; stos"__OS"\n"
-        "   mov  %3,%4      \n"
-        "   rep ; stosb     \n"
-        : "=&c" (d0), "=&D" (d1)
-        : "a" (c), "r" (count%BYTES_PER_LONG),
-          "0" (count/BYTES_PER_LONG), "1" (s)
-        : "memory" );
-    return s;
-}
-
-/*
- * This looks horribly ugly, but the compiler can optimize it totally,
- * as we by now know that both pattern and count is constant..
- */
-static always_inline void *__constant_c_and_count_memset(
-    void *s, unsigned long pattern, size_t count)
-{
-    switch ( count )
-    {
-    case 0:
-        return s;
-    case 1:
-        *(u8 *)s = pattern;
-        return s;
-    case 2:
-        *(u16 *)s = pattern;
-        return s;
-    case 3:
-        *(u16 *)s = pattern;
-        *(2+(u8 *)s) = pattern;
-        return s;
-    case 4:
-        *(u32 *)s = pattern;
-        return s;
-    case 5:
-        *(u32 *)s = pattern;
-        *(4+(u8 *)s) = pattern;
-        return s;
-    case 6:
-        *(u32 *)s = pattern;
-        *(2+(u16 *)s) = pattern;
-        return s;
-    case 7:
-        *(u32 *)s = pattern;
-        *(2+(u16 *)s) = pattern;
-        *(6+(u8 *)s) = pattern;
-        return s;
-    case 8:
-        *(u64 *)s = pattern;
-        return s;
-    }
-#define COMMON(x)                                               \
-    __asm__  __volatile__ (                                     \
-        "rep ; stos"__OS                                        \
-        x                                                       \
-        : "=&c" (d0), "=&D" (d1)                                \
-        : "a" (pattern), "0" (count/BYTES_PER_LONG), "1" (s)    \
-        : "memory" )
-    {
-        long d0, d1;
-        switch ( count % BYTES_PER_LONG )
-        {
-        case 0: COMMON(""); return s;
-        case 1: COMMON("\n\tstosb"); return s;
-        case 2: COMMON("\n\tstosw"); return s;
-        case 3: COMMON("\n\tstosw\n\tstosb"); return s;
-        case 4: COMMON("\n\tstosl"); return s;
-        case 5: COMMON("\n\tstosl\n\tstosb"); return s;
-        case 6: COMMON("\n\tstosl\n\tstosw"); return s;
-        case 7: COMMON("\n\tstosl\n\tstosw\n\tstosb"); return s;
-        }
-    }
-#undef COMMON
-    return s;
-}
-
-#define __constant_c_x_memset(s, c, count) \
-(__builtin_constant_p(count) ? \
- __constant_c_and_count_memset((s),(c),(count)) : \
- __constant_c_memset((s),(c),(count)))
-
-#define __var_x_memset(s, c, count) \
-(__builtin_constant_p(count) ? \
- __constant_count_memset((s),(c),(count)) : \
- __memset_generic((s),(c),(count)))
-
-#ifdef CONFIG_X86_64
-#define MEMSET_PATTERN_MUL 0x0101010101010101UL
-#else
-#define MEMSET_PATTERN_MUL 0x01010101UL
-#endif
-
-#define __HAVE_ARCH_MEMSET
-#define memset(s, c, count) (__memset((s),(c),(count)))
-#define __memset(s, c, count) \
-(__builtin_constant_p(c) ? \
- __constant_c_x_memset((s),(MEMSET_PATTERN_MUL*(unsigned char)(c)),(count)) : \
- __var_x_memset((s),(c),(count)))
-
 #endif /* __X86_STRING_H__ */


* Re: [PATCH] x86-32: use __builtin_{memcpy,memset}
  2010-05-11 12:04   ` Keir Fraser
@ 2010-05-11 12:33     ` Jan Beulich
  0 siblings, 0 replies; 4+ messages in thread
From: Jan Beulich @ 2010-05-11 12:33 UTC (permalink / raw)
  To: Keir Fraser, xen-devel@lists.xensource.com; +Cc: Charles Arnold

>>> Keir Fraser <keir.fraser@eu.citrix.com> 11.05.10 14:04 >>>
>What about this alternative version, which distinguishes between GCC 4.3+
>(which should auto-inline quite sensibly) and earlier versions (which need
>explicit help via macros mapping to __builtin_*)?

I don't think this will do what we expect in the presence of -fno-builtin
(see also my other response).
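
Concretely, taking the memcpy hunk from the proposed header (a sketch only,
with the concern spelled out in comments; it assumes the usual -fno-builtin
hypervisor build flags):

    #include <stddef.h>

    #if (__GNUC__ < 4) || (__GNUC__ == 4 && __GNUC_MINOR__ < 3)
    /* Explicit __builtin_*: gcc still expands this even under -fno-builtin,
     * though it may fall back to an out-of-line call for unknown sizes. */
    #define memcpy(t,f,n) (__builtin_memcpy((t),(f),(n)))
    #else
    /* Plain declaration: with -fno-builtin gcc no longer treats the name as
     * a builtin at all, so calls stay out of line rather than being
     * auto-inlined as the 4.3+ branch expects. */
    extern void *memcpy(void *dest, const void *src, size_t n);
    #endif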

Jan
