Re: [Qemu-devel] [PATCH v4 2/2] cutils: add avx2 instruction optimization

qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed

From: Paolo Bonzini <pbonzini@redhat.com>
To: Liang Li <liang.z.li@intel.com>, qemu-devel@nongnu.org
Cc: peter.maydell@linaro.org, mst@redhat.com, rth7680@gmail.com,
	dgilbert@redhat.com, quintela@redhat.com, stefanha@redhat.com,
	amit.shah@redhat.com, rth@twiddle.net
Subject: Re: [Qemu-devel] [PATCH v4 2/2] cutils: add avx2 instruction optimization
Date: Wed, 20 Jan 2016 10:46:15 +0100	[thread overview]
Message-ID: <569F5767.3090303@redhat.com> (raw)
In-Reply-To: <1453280742-20718-3-git-send-email-liang.z.li@intel.com>



On 20/01/2016 10:05, Liang Li wrote:
> buffer_find_nonzero_offset() is a hot function during live migration.
> It currently uses SSE2 instructions for optimization. On platforms that
> support AVX2, using AVX2 instructions instead can improve performance
> by about 30% compared to SSE2.
> 
> The zero page check is faster with this optimization. Test results
> show that for an idle guest with 8GiB of RAM that has just booted, this
> patch shortens the total live migration time by about 6%.
> 
> This patch uses the ifunc mechanism to select the proper function at
> run time: on platforms that support AVX2, the AVX2 implementation is
> executed; otherwise, the original implementation is executed.
> 
> Signed-off-by: Liang Li <liang.z.li@intel.com>

Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>

> ---
>  include/qemu-common.h |   8 +---
>  util/cutils.c         | 118 ++++++++++++++++++++++++++++++++++++++++++++++++--
>  2 files changed, 115 insertions(+), 11 deletions(-)
> 
> diff --git a/include/qemu-common.h b/include/qemu-common.h
> index 22b010c..f4c8c24 100644
> --- a/include/qemu-common.h
> +++ b/include/qemu-common.h
> @@ -483,13 +483,7 @@ void qemu_hexdump(const char *buf, FILE *fp, const char *prefix, size_t size);
>  #endif
>  
>  #define BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR 8
> -static inline bool
> -can_use_buffer_find_nonzero_offset(const void *buf, size_t len)
> -{
> -    return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
> -                   * sizeof(VECTYPE)) == 0
> -            && ((uintptr_t) buf) % sizeof(VECTYPE) == 0);
> -}
> +bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len);
>  size_t buffer_find_nonzero_offset(const void *buf, size_t len);
>  
>  /*
> diff --git a/util/cutils.c b/util/cutils.c
> index cfeb848..5c8ee5c 100644
> --- a/util/cutils.c
> +++ b/util/cutils.c
> @@ -161,6 +161,14 @@ int qemu_fdatasync(int fd)
>  #endif
>  }
>  
> +static bool
> +can_use_buffer_find_nonzero_offset_inner(const void *buf, size_t len)
> +{
> +    return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
> +                   * sizeof(VECTYPE)) == 0
> +            && ((uintptr_t) buf) % sizeof(VECTYPE) == 0);
> +}
> +
>  /*
>   * Searches for an area with non-zero content in a buffer
>   *
> @@ -169,8 +177,8 @@ int qemu_fdatasync(int fd)
>   * and addr must be a multiple of sizeof(VECTYPE) due to
>   * restriction of optimizations in this function.
>   *
> - * can_use_buffer_find_nonzero_offset() can be used to check
> - * these requirements.
> + * can_use_buffer_find_nonzero_offset_inner() can be used to
> + * check these requirements.
>   *
>   * The return value is the offset of the non-zero area rounded
>   * down to a multiple of sizeof(VECTYPE) for the first
> @@ -181,13 +189,13 @@ int qemu_fdatasync(int fd)
>   * If the buffer is all zero the return value is equal to len.
>   */
>  
> -size_t buffer_find_nonzero_offset(const void *buf, size_t len)
> +static size_t buffer_find_nonzero_offset_inner(const void *buf, size_t len)
>  {
>      const VECTYPE *p = buf;
>      const VECTYPE zero = (VECTYPE){0};
>      size_t i;
>  
> -    assert(can_use_buffer_find_nonzero_offset(buf, len));
> +    assert(can_use_buffer_find_nonzero_offset_inner(buf, len));
>  
>      if (!len) {
>          return 0;
> @@ -216,6 +224,108 @@ size_t buffer_find_nonzero_offset(const void *buf, size_t len)
>      return i * sizeof(VECTYPE);
>  }
>  
> +#ifdef CONFIG_AVX2_OPT
> +#pragma GCC push_options
> +#pragma GCC target("avx2")
> +#include <cpuid.h>
> +#include <immintrin.h>
> +
> +#define AVX2_VECTYPE        __m256i
> +#define AVX2_SPLAT(p)       _mm256_set1_epi8(*(p))
> +#define AVX2_ALL_EQ(v1, v2) \
> +    (_mm256_movemask_epi8(_mm256_cmpeq_epi8(v1, v2)) == 0xFFFFFFFF)
> +#define AVX2_VEC_OR(v1, v2) (_mm256_or_si256(v1, v2))
> +
> +static bool
> +can_use_buffer_find_nonzero_offset_avx2(const void *buf, size_t len)
> +{
> +    return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
> +                   * sizeof(AVX2_VECTYPE)) == 0
> +            && ((uintptr_t) buf) % sizeof(AVX2_VECTYPE) == 0);
> +}
> +
> +static size_t buffer_find_nonzero_offset_avx2(const void *buf, size_t len)
> +{
> +    const AVX2_VECTYPE *p = buf;
> +    const AVX2_VECTYPE zero = (AVX2_VECTYPE){0};
> +    size_t i;
> +
> +    assert(can_use_buffer_find_nonzero_offset_avx2(buf, len));
> +
> +    if (!len) {
> +        return 0;
> +    }
> +
> +    for (i = 0; i < BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR; i++) {
> +        if (!AVX2_ALL_EQ(p[i], zero)) {
> +            return i * sizeof(AVX2_VECTYPE);
> +        }
> +    }
> +
> +    for (i = BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR;
> +         i < len / sizeof(AVX2_VECTYPE);
> +         i += BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR) {
> +        AVX2_VECTYPE tmp0 = AVX2_VEC_OR(p[i + 0], p[i + 1]);
> +        AVX2_VECTYPE tmp1 = AVX2_VEC_OR(p[i + 2], p[i + 3]);
> +        AVX2_VECTYPE tmp2 = AVX2_VEC_OR(p[i + 4], p[i + 5]);
> +        AVX2_VECTYPE tmp3 = AVX2_VEC_OR(p[i + 6], p[i + 7]);
> +        AVX2_VECTYPE tmp01 = AVX2_VEC_OR(tmp0, tmp1);
> +        AVX2_VECTYPE tmp23 = AVX2_VEC_OR(tmp2, tmp3);
> +        if (!AVX2_ALL_EQ(AVX2_VEC_OR(tmp01, tmp23), zero)) {
> +            break;
> +        }
> +    }
> +
> +    return i * sizeof(AVX2_VECTYPE);
> +}
> +
> +static bool avx2_support(void)
> +{
> +    int a, b, c, d;
> +
> +    if (__get_cpuid_max(0, NULL) < 7) {
> +        return false;
> +    }
> +
> +    __cpuid_count(7, 0, a, b, c, d);
> +
> +    return b & bit_AVX2;
> +}
> +
> +bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len) \
> +         __attribute__ ((ifunc("can_use_buffer_find_nonzero_offset_ifunc")));
> +size_t buffer_find_nonzero_offset(const void *buf, size_t len) \
> +         __attribute__ ((ifunc("buffer_find_nonzero_offset_ifunc")));
> +
> +static void *buffer_find_nonzero_offset_ifunc(void)
> +{
> +    typeof(buffer_find_nonzero_offset) *func = (avx2_support()) ?
> +        buffer_find_nonzero_offset_avx2 : buffer_find_nonzero_offset_inner;
> +
> +    return func;
> +}
> +
> +static void *can_use_buffer_find_nonzero_offset_ifunc(void)
> +{
> +    typeof(can_use_buffer_find_nonzero_offset) *func = (avx2_support()) ?
> +        can_use_buffer_find_nonzero_offset_avx2 :
> +        can_use_buffer_find_nonzero_offset_inner;
> +
> +    return func;
> +}
> +#pragma GCC pop_options
> +#else
> +bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len)
> +{
> +    return can_use_buffer_find_nonzero_offset_inner(buf, len);
> +}
> +
> +size_t buffer_find_nonzero_offset(const void *buf, size_t len)
> +{
> +    return buffer_find_nonzero_offset_inner(buf, len);
> +}
> +#endif
> +
>  /*
>   * Checks if a buffer is all zeroes
>   *
>

next prev parent reply	other threads:[~2016-01-20  9:46 UTC|newest]

Thread overview: 8+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-01-20  9:05 [Qemu-devel] [PATCH v4 0/3] add avx2 instruction optimization Liang Li
2016-01-20  9:05 ` [Qemu-devel] [PATCH v4 1/2] configure: detect ifunc and avx2 attribute Liang Li
2016-01-20  9:50   ` Paolo Bonzini
2016-01-20 10:43     ` Li, Liang Z
2016-01-20  9:05 ` [Qemu-devel] [PATCH v4 2/2] cutils: add avx2 instruction optimization Liang Li
2016-01-20  9:46   ` Paolo Bonzini [this message]
2016-01-20 10:22 ` [Qemu-devel] [PATCH v4 0/3] " 陈博
2016-01-20 15:25   ` Eric Blake

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=569F5767.3090303@redhat.com \
    --to=pbonzini@redhat.com \
    --cc=amit.shah@redhat.com \
    --cc=dgilbert@redhat.com \
    --cc=liang.z.li@intel.com \
    --cc=mst@redhat.com \
    --cc=peter.maydell@linaro.org \
    --cc=qemu-devel@nongnu.org \
    --cc=quintela@redhat.com \
    --cc=rth7680@gmail.com \
    --cc=rth@twiddle.net \
    --cc=stefanha@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).