qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed
* [Qemu-devel] [v2 RESEND 0/2] add avx2 instruction optimization
@ 2015-11-10  3:31 Liang Li
  2015-11-10  3:31 ` [Qemu-devel] [v2 RESEND 1/2] cutils: " Liang Li
  2015-11-10  3:31 ` [Qemu-devel] [v2 RESEND 2/2] configure: add options to config avx2 Liang Li
  0 siblings, 2 replies; 4+ messages in thread
From: Liang Li @ 2015-11-10  3:31 UTC (permalink / raw)
  To: qemu-devel; +Cc: quintela, Liang Li, mst, amit.shah, pbonzini

buffer_find_nonzero_offset() is a hot function during live migration.
It currently uses SSE2 instructions for optimization. On platforms that
support AVX2, using AVX2 instructions instead improves performance by
about 30% compared to SSE2.
The zero page check becomes faster with this optimization; test results
show that for an idle guest with 8GB of RAM, this patch shortens
the total live migration time by about 6%.

This patch uses the ifunc mechanism to select the proper function at
run time: on platforms that support AVX2, the AVX2 implementation is
executed; otherwise, the original code is executed.

With this patch, if the QEMU binary is built with AVX2 enabled, the
binary can run on platforms both with and without AVX2 support.

If the QEMU binary is built with AVX2 disabled, or if the compiler does
not support AVX2, the binary will not contain any AVX2 instructions, and
it can run on platforms both with and without AVX2 support.

Liang Li (2):
  cutils: add avx2 instruction optimization
  configure: add options to config avx2

 configure             | 29 ++++++++++++++++++++++
 include/qemu-common.h | 28 +++++++++++++++------
 util/Makefile.objs    |  2 ++
 util/avx2.c           | 68 +++++++++++++++++++++++++++++++++++++++++++++++++++
 util/cutils.c         | 47 +++++++++++++++++++++++++++++++++--
 5 files changed, 165 insertions(+), 9 deletions(-)
 create mode 100644 util/avx2.c

-- 
1.9.1

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [Qemu-devel] [v2 RESEND 1/2] cutils: add avx2 instruction optimization
  2015-11-10  3:31 [Qemu-devel] [v2 RESEND 0/2] add avx2 instruction optimization Liang Li
@ 2015-11-10  3:31 ` Liang Li
  2015-11-10  3:31 ` [Qemu-devel] [v2 RESEND 2/2] configure: add options to config avx2 Liang Li
  1 sibling, 0 replies; 4+ messages in thread
From: Liang Li @ 2015-11-10  3:31 UTC (permalink / raw)
  To: qemu-devel; +Cc: quintela, Liang Li, mst, amit.shah, pbonzini

buffer_find_nonzero_offset() is a hot function during live migration.
It currently uses SSE2 instructions for optimization. On platforms that
support AVX2, using AVX2 instructions instead improves performance by
about 30% compared to SSE2.
The zero page check becomes faster with this optimization; test results
show that for an idle guest with 8GB of RAM, this patch shortens
the total live migration time by about 6%.

This patch uses the ifunc mechanism to select the proper function at
run time: on platforms that support AVX2, the AVX2 implementation is
executed; otherwise, the original code is executed.

Signed-off-by: Liang Li <liang.z.li@intel.com>
---
 include/qemu-common.h | 28 +++++++++++++++------
 util/Makefile.objs    |  2 ++
 util/avx2.c           | 68 +++++++++++++++++++++++++++++++++++++++++++++++++++
 util/cutils.c         | 47 +++++++++++++++++++++++++++++++++--
 4 files changed, 136 insertions(+), 9 deletions(-)
 create mode 100644 util/avx2.c

diff --git a/include/qemu-common.h b/include/qemu-common.h
index 2f74540..9fa7501 100644
--- a/include/qemu-common.h
+++ b/include/qemu-common.h
@@ -484,15 +484,29 @@ void qemu_hexdump(const char *buf, FILE *fp, const char *prefix, size_t size);
 #endif
 
 #define BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR 8
-static inline bool
-can_use_buffer_find_nonzero_offset(const void *buf, size_t len)
-{
-    return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
-                   * sizeof(VECTYPE)) == 0
-            && ((uintptr_t) buf) % sizeof(VECTYPE) == 0);
-}
+bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len);
+
 size_t buffer_find_nonzero_offset(const void *buf, size_t len);
 
+extern bool
+can_use_buffer_find_nonzero_offset_avx2(const void *buf, size_t len);
+
+extern size_t buffer_find_nonzero_offset_avx2(const void *buf, size_t len);
+
+extern bool
+can_use_buffer_find_nonzero_offset_inner(const void *buf, size_t len);
+
+extern size_t buffer_find_nonzero_offset_inner(const void *buf, size_t len);
+
+__asm__(".type can_use_buffer_find_nonzero_offset, \%gnu_indirect_function");
+__asm__(".type buffer_find_nonzero_offset, \%gnu_indirect_function");
+
+
+void *can_use_buffer_find_nonzero_offset_ifunc(void) \
+                     __asm__("can_use_buffer_find_nonzero_offset");
+
+void *buffer_find_nonzero_offset_ifunc(void) \
+                     __asm__("buffer_find_nonzero_offset");
 /*
  * helper to parse debug environment variables
  */
diff --git a/util/Makefile.objs b/util/Makefile.objs
index d7cc399..6aacad7 100644
--- a/util/Makefile.objs
+++ b/util/Makefile.objs
@@ -1,4 +1,5 @@
 util-obj-y = osdep.o cutils.o unicode.o qemu-timer-common.o
+util-obj-y += avx2.o
 util-obj-$(CONFIG_POSIX) += compatfd.o
 util-obj-$(CONFIG_POSIX) += event_notifier-posix.o
 util-obj-$(CONFIG_POSIX) += mmap-alloc.o
@@ -29,3 +30,4 @@ util-obj-y += qemu-coroutine.o qemu-coroutine-lock.o qemu-coroutine-io.o
 util-obj-y += qemu-coroutine-sleep.o
 util-obj-y += coroutine-$(CONFIG_COROUTINE_BACKEND).o
 util-obj-y += buffer.o
+avx2.o-cflags      := $(AVX2_CFLAGS)
diff --git a/util/avx2.c b/util/avx2.c
new file mode 100644
index 0000000..d90289b
--- /dev/null
+++ b/util/avx2.c
@@ -0,0 +1,68 @@
+#include "qemu-common.h"
+
+#ifdef __AVX2__
+#include <immintrin.h>
+#define AVX2_VECTYPE        __m256i
+#define AVX2_SPLAT(p)       _mm256_set1_epi8(*(p))
+#define AVX2_ALL_EQ(v1, v2) \
+    (_mm256_movemask_epi8(_mm256_cmpeq_epi8(v1, v2)) == 0xFFFFFFFF)
+#define AVX2_VEC_OR(v1, v2) (_mm256_or_si256(v1, v2))
+
+inline bool
+can_use_buffer_find_nonzero_offset_avx2(const void *buf, size_t len)
+{
+    return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
+                   * sizeof(AVX2_VECTYPE)) == 0
+            && ((uintptr_t) buf) % sizeof(AVX2_VECTYPE) == 0);
+}
+
+size_t buffer_find_nonzero_offset_avx2(const void *buf, size_t len)
+{
+    const AVX2_VECTYPE *p = buf;
+    const AVX2_VECTYPE zero = (AVX2_VECTYPE){0};
+    size_t i;
+
+    assert(can_use_buffer_find_nonzero_offset_avx2(buf, len));
+
+    if (!len) {
+        return 0;
+    }
+
+    for (i = 0; i < BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR; i++) {
+        if (!AVX2_ALL_EQ(p[i], zero)) {
+            return i * sizeof(AVX2_VECTYPE);
+        }
+    }
+
+    for (i = BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR;
+         i < len / sizeof(AVX2_VECTYPE);
+         i += BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR) {
+        AVX2_VECTYPE tmp0 = AVX2_VEC_OR(p[i + 0], p[i + 1]);
+        AVX2_VECTYPE tmp1 = AVX2_VEC_OR(p[i + 2], p[i + 3]);
+        AVX2_VECTYPE tmp2 = AVX2_VEC_OR(p[i + 4], p[i + 5]);
+        AVX2_VECTYPE tmp3 = AVX2_VEC_OR(p[i + 6], p[i + 7]);
+        AVX2_VECTYPE tmp01 = AVX2_VEC_OR(tmp0, tmp1);
+        AVX2_VECTYPE tmp23 = AVX2_VEC_OR(tmp2, tmp3);
+        if (!AVX2_ALL_EQ(AVX2_VEC_OR(tmp01, tmp23), zero)) {
+            break;
+        }
+    }
+
+    return i * sizeof(AVX2_VECTYPE);
+}
+
+#else
+/* use the original functions if avx2 is not enabled when building */
+
+inline bool
+can_use_buffer_find_nonzero_offset_avx2(const void *buf, size_t len)
+{
+    return can_use_buffer_find_nonzero_offset_inner(buf, len);
+}
+
+inline size_t buffer_find_nonzero_offset_avx2(const void *buf, size_t len)
+{
+    return buffer_find_nonzero_offset_inner(buf, len);
+}
+
+#endif
diff --git a/util/cutils.c b/util/cutils.c
index cfeb848..5a9763a 100644
--- a/util/cutils.c
+++ b/util/cutils.c
@@ -26,6 +26,7 @@
 #include <math.h>
 #include <limits.h>
 #include <errno.h>
+#include <cpuid.h>
 
 #include "qemu/sockets.h"
 #include "qemu/iov.h"
@@ -161,6 +162,48 @@ int qemu_fdatasync(int fd)
 #endif
 }
 
+/* old compilers may not define bit_AVX2 */
+#ifndef bit_AVX2
+#define bit_AVX2 (1 << 5)
+#endif
+
+static inline bool avx2_support(void)
+{
+    int a, b, c, d;
+
+    if (__get_cpuid_max(0, NULL) < 7) {
+        return false;
+    }
+
+    __cpuid_count(7, 0, a, b, c, d);
+    return b & bit_AVX2;
+}
+
+void *buffer_find_nonzero_offset_ifunc(void)
+{
+    typeof(buffer_find_nonzero_offset) *func = (avx2_support()) ?
+        buffer_find_nonzero_offset_avx2 : buffer_find_nonzero_offset_inner;
+
+    return func;
+}
+
+void *can_use_buffer_find_nonzero_offset_ifunc(void)
+{
+    typeof(can_use_buffer_find_nonzero_offset) *func = (avx2_support()) ?
+        can_use_buffer_find_nonzero_offset_avx2 :
+        can_use_buffer_find_nonzero_offset_inner;
+
+    return func;
+}
+
+inline bool
+can_use_buffer_find_nonzero_offset_inner(const void *buf, size_t len)
+{
+    return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
+                   * sizeof(VECTYPE)) == 0
+            && ((uintptr_t) buf) % sizeof(VECTYPE) == 0);
+}
+
 /*
  * Searches for an area with non-zero content in a buffer
  *
@@ -181,13 +224,13 @@ int qemu_fdatasync(int fd)
  * If the buffer is all zero the return value is equal to len.
  */
 
-size_t buffer_find_nonzero_offset(const void *buf, size_t len)
+size_t buffer_find_nonzero_offset_inner(const void *buf, size_t len)
 {
     const VECTYPE *p = buf;
     const VECTYPE zero = (VECTYPE){0};
     size_t i;
 
-    assert(can_use_buffer_find_nonzero_offset(buf, len));
+    assert(can_use_buffer_find_nonzero_offset_inner(buf, len));
 
     if (!len) {
         return 0;
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 4+ messages in thread

* [Qemu-devel] [v2 RESEND 2/2] configure: add options to config avx2
  2015-11-10  3:31 [Qemu-devel] [v2 RESEND 0/2] add avx2 instruction optimization Liang Li
  2015-11-10  3:31 ` [Qemu-devel] [v2 RESEND 1/2] cutils: " Liang Li
@ 2015-11-10  3:31 ` Liang Li
  2015-11-10  9:01   ` Juan Quintela
  1 sibling, 1 reply; 4+ messages in thread
From: Liang Li @ 2015-11-10  3:31 UTC (permalink / raw)
  To: qemu-devel; +Cc: quintela, Liang Li, mst, amit.shah, pbonzini

Add the '--enable-avx2' and '--disable-avx2' options to configure
the AVX2 instruction optimization.

By default, the AVX2 optimization is enabled. If '--disable-avx2' is
not set, configure detects whether the compiler supports the AVX2
option; if it does, the AVX2 optimization is enabled, otherwise it is
disabled.

Signed-off-by: Liang Li <liang.z.li@intel.com>
---
 configure | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/configure b/configure
index 42e57c0..4d81be2 100755
--- a/configure
+++ b/configure
@@ -310,6 +310,7 @@ smartcard=""
 libusb=""
 usb_redir=""
 opengl=""
+avx2="yes"
 zlib="yes"
 lzo=""
 snappy=""
@@ -1057,6 +1058,10 @@ for opt do
   ;;
   --enable-usb-redir) usb_redir="yes"
   ;;
+  --disable-avx2) avx2="no"
+  ;;
+  --enable-avx2) avx2="yes"
+  ;;
   --disable-zlib-test) zlib="no"
   ;;
   --disable-lzo) lzo="no"
@@ -1373,6 +1378,7 @@ disabled with --disable-FEATURE, default is enabled if available:
   smartcard       smartcard support (libcacard)
   libusb          libusb (for usb passthrough)
   usb-redir       usb network redirection support
+  avx2            support of avx2 instruction
   lzo             support of lzo compression library
   snappy          support of snappy compression library
   bzip2           support of bzip2 compression library
@@ -1809,6 +1815,24 @@ EOF
   fi
 fi
 
+########################################
+# avx2 check
+
+if test "$avx2" != "no" ; then
+    cat > $TMPC << EOF
+int main(void) { return 0; }
+EOF
+    if compile_prog "" "-mavx2" ; then
+        avx2="yes"
+    else
+        avx2="no"
+    fi
+fi
+
+if test "$avx2" = "yes" ; then
+    avx2_cflags=" -mavx2"
+fi
+
 ##########################################
 # zlib check
 
@@ -4782,6 +4806,7 @@ echo "libssh2 support   $libssh2"
 echo "TPM passthrough   $tpm_passthrough"
 echo "QOM debugging     $qom_cast_debug"
 echo "vhdx              $vhdx"
+echo "avx2 support      $avx2"
 echo "lzo support       $lzo"
 echo "snappy support    $snappy"
 echo "bzip2 support     $bzip2"
@@ -5166,6 +5191,10 @@ if test "$opengl" = "yes" ; then
   echo "OPENGL_LIBS=$opengl_libs" >> $config_host_mak
 fi
 
+if test "$avx2" = "yes" ; then
+  echo "AVX2_CFLAGS=$avx2_cflags" >> $config_host_mak
+fi
+
 if test "$lzo" = "yes" ; then
   echo "CONFIG_LZO=y" >> $config_host_mak
 fi
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [Qemu-devel] [v2 RESEND 2/2] configure: add options to config avx2
  2015-11-10  3:31 ` [Qemu-devel] [v2 RESEND 2/2] configure: add options to config avx2 Liang Li
@ 2015-11-10  9:01   ` Juan Quintela
  0 siblings, 0 replies; 4+ messages in thread
From: Juan Quintela @ 2015-11-10  9:01 UTC (permalink / raw)
  To: Liang Li; +Cc: amit.shah, pbonzini, qemu-devel, mst

Liang Li <liang.z.li@intel.com> wrote:
> Add the '--enable-avx2' and '--disable-avx2' options to configure
> the AVX2 instruction optimization.
>
> By default, the AVX2 optimization is enabled. If '--disable-avx2' is
> not set, configure detects whether the compiler supports the AVX2
> option; if it does, the AVX2 optimization is enabled, otherwise it is
> disabled.
>
> Signed-off-by: Liang Li <liang.z.li@intel.com>
> ---
>  configure | 29 +++++++++++++++++++++++++++++
>  1 file changed, 29 insertions(+)
>
> diff --git a/configure b/configure
> index 42e57c0..4d81be2 100755
> --- a/configure
> +++ b/configure
> @@ -310,6 +310,7 @@ smartcard=""
>  libusb=""
>  usb_redir=""
>  opengl=""
> +avx2="yes"
>  zlib="yes"
>  lzo=""
>  snappy=""
> @@ -1057,6 +1058,10 @@ for opt do
>    ;;
>    --enable-usb-redir) usb_redir="yes"
>    ;;
> +  --disable-avx2) avx2="no"
> +  ;;
> +  --enable-avx2) avx2="yes"
> +  ;;
>    --disable-zlib-test) zlib="no"
>    ;;
>    --disable-lzo) lzo="no"
> @@ -1373,6 +1378,7 @@ disabled with --disable-FEATURE, default is enabled if available:
>    smartcard       smartcard support (libcacard)
>    libusb          libusb (for usb passthrough)
>    usb-redir       usb network redirection support
> +  avx2            support of avx2 instruction
>    lzo             support of lzo compression library
>    snappy          support of snappy compression library
>    bzip2           support of bzip2 compression library
> @@ -1809,6 +1815,24 @@ EOF
>    fi
>  fi
>  
> +########################################
> +# avx2 check
> +
> +if test "$avx2" != "no" ; then
> +    cat > $TMPC << EOF
> +int main(void) { return 0; }
> +EOF
> +    if compile_prog "" "-mavx2" ; then
> +        avx2="yes"
> +    else
> +        avx2="no"

Shouldn't the else branch be:

          if test "$avx2" = "yes"; then
              feature_not_found "avx2" "Your compiler doesn't support avx2"
          fi
          avx2="no"

??
> +    fi
> +fi
> +
> +if test "$avx2" = "yes" ; then
> +    avx2_cflags=" -mavx2"
> +fi
> +
>  ##########################################
>  # zlib check
>  
> @@ -4782,6 +4806,7 @@ echo "libssh2 support   $libssh2"
>  echo "TPM passthrough   $tpm_passthrough"
>  echo "QOM debugging     $qom_cast_debug"
>  echo "vhdx              $vhdx"
> +echo "avx2 support      $avx2"
>  echo "lzo support       $lzo"
>  echo "snappy support    $snappy"
>  echo "bzip2 support     $bzip2"
> @@ -5166,6 +5191,10 @@ if test "$opengl" = "yes" ; then
>    echo "OPENGL_LIBS=$opengl_libs" >> $config_host_mak
>  fi
>  
> +if test "$avx2" = "yes" ; then
> +  echo "AVX2_CFLAGS=$avx2_cflags" >> $config_host_mak
> +fi
> +
>  if test "$lzo" = "yes" ; then
>    echo "CONFIG_LZO=y" >> $config_host_mak
>  fi

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2015-11-10  9:01 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2015-11-10  3:31 [Qemu-devel] [v2 RESEND 0/2] add avx2 instruction optimization Liang Li
2015-11-10  3:31 ` [Qemu-devel] [v2 RESEND 1/2] cutils: " Liang Li
2015-11-10  3:31 ` [Qemu-devel] [v2 RESEND 2/2] configure: add options to config avx2 Liang Li
2015-11-10  9:01   ` Juan Quintela

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).