All of lore.kernel.org
 help / color / mirror / Atom feed
From: Juan Quintela <quintela@redhat.com>
To: ling xu <ling1.xu@intel.com>
Cc: qemu-devel@nongnu.org,  dgilbert@redhat.com,
	 Zhou Zhao <zhou.zhao@intel.com>,  Jun Jin <jun.i.jin@intel.com>
Subject: Re: [PATCH v3 1/2] Update AVX512 support for xbzrle_encode_buffer function
Date: Mon, 08 Aug 2022 15:12:27 +0200	[thread overview]
Message-ID: <87r11qnakk.fsf@secure.mitica> (raw)
In-Reply-To: <20220808074837.1484760-2-ling1.xu@intel.com> (ling xu's message of "Mon, 8 Aug 2022 15:48:36 +0800")

ling xu <ling1.xu@intel.com> wrote:
> This commit update runtime check of AVX512, and implements avx512 of
> xbzrle_encode_buffer function to accelerate xbzrle encoding speed.
> Compared with C version of xbzrle_encode_buffer function, avx512 version
> can achieve almost 60%-70% performance improvement on unit test provided
> by Qemu. In addition, we provide one more unit test called
> "test_encode_decode_random", in which dirty data are randomly located in
> 4K page, and this case can achieve almost 140% performance gain.
>
> Signed-off-by: ling xu <ling1.xu@intel.com>
> Co-authored-by: Zhou Zhao <zhou.zhao@intel.com>
> Co-authored-by: Jun Jin <jun.i.jin@intel.com>
> ---
>  meson.build        |  16 ++++
>  meson_options.txt  |   2 +
>  migration/ram.c    |  41 ++++++++++
>  migration/xbzrle.c | 181 +++++++++++++++++++++++++++++++++++++++++++++
>  migration/xbzrle.h |   4 +
>  5 files changed, 244 insertions(+)
>
> diff --git a/meson.build b/meson.build
> index 294e9a8f32..4222b77e9f 100644
> --- a/meson.build
> +++ b/meson.build
> @@ -2262,6 +2262,22 @@ config_host_data.set('CONFIG_AVX512F_OPT', get_option('avx512f') \
>      int main(int argc, char *argv[]) { return bar(argv[0]); }
>    '''), error_message: 'AVX512F not available').allowed())
>  
> +config_host_data.set('CONFIG_AVX512BW_OPT', get_option('avx512bw') \
> +  .require(have_cpuid_h, error_message: 'cpuid.h not available, cannot enable AVX512BW') \
> +  .require(cc.links('''
> +    #pragma GCC push_options
> +    #pragma GCC target("avx512bw")
> +    #include <cpuid.h>
> +    #include <immintrin.h>
> +    static int bar(void *a) {
> +
> +      __m512i x = *(__m512i *)a;
> +      __m512i res= _mm512_abs_epi8(x);
> +      return res[1];
> +    }
> +    int main(int argc, char *argv[]) { return bar(argv[0]); }
> +  '''), error_message: 'AVX512BW not available').allowed())
> +
>  have_pvrdma = get_option('pvrdma') \
>    .require(rdma.found(), error_message: 'PVRDMA requires OpenFabrics libraries') \
>    .require(cc.compiles(gnu_source_prefix + '''
> diff --git a/meson_options.txt b/meson_options.txt
> index e58e158396..07194bf680 100644
> --- a/meson_options.txt
> +++ b/meson_options.txt
> @@ -104,6 +104,8 @@ option('avx2', type: 'feature', value: 'auto',
>         description: 'AVX2 optimizations')
>  option('avx512f', type: 'feature', value: 'disabled',
>         description: 'AVX512F optimizations')
> +option('avx512bw', type: 'feature', value: 'auto',
> +       description: 'AVX512BW optimizations')
>  option('keyring', type: 'feature', value: 'auto',
>         description: 'Linux keyring support')
>  

[no clue about meson, it looks ok]

> diff --git a/migration/ram.c b/migration/ram.c
> index dc1de9ddbc..d9c1ac2f7a 100644
> --- a/migration/ram.c
> +++ b/migration/ram.c
> @@ -83,6 +83,35 @@
>  /* 0x80 is reserved in migration.h start with 0x100 next */
>  #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
>  
> +#if defined(CONFIG_AVX512BW_OPT)
> +static bool IS_CPU_SUPPORT_AVX512BW;

An all caps global variable?

> +#include "qemu/cpuid.h"
> +static void __attribute__((constructor)) init_cpu_flag(void)
> +{
> +    unsigned max = __get_cpuid_max(0, NULL);
> +    int a, b, c, d;
> +    IS_CPU_SUPPORT_AVX512BW = false;
> +    if (max >= 1) {
> +        __cpuid(1, a, b, c, d);
> +         /* We must check that AVX is not just available, but usable.  */
> +        if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
> +            int bv;
> +            __asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
> +            __cpuid_count(7, 0, a, b, c, d);
> +           /* 0xe6:
> +            *  XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15
> +            *                    and ZMM16-ZMM31 state are enabled by OS)
> +            *  XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS)
> +            */
> +            if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) {
> +                IS_CPU_SUPPORT_AVX512BW = true;
> +            }
> +        }
> +    }
> +    return ;
> +}
> +#endif
> +
>  XBZRLECacheStats xbzrle_counters;
>  
>  /* struct contains XBZRLE cache and a static page
> @@ -802,9 +831,21 @@ static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
>      memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
>  
>      /* XBZRLE encoding (if there is no overflow) */
> +    #if defined(CONFIG_AVX512BW_OPT)
> +    if (likely(IS_CPU_SUPPORT_AVX512BW)) {

All distributions are go to have compile time support for AVX, but I am
not sure the percentage of machines that support avx

> +        encoded_len = xbzrle_encode_buffer_512(prev_cached_page, XBZRLE.current_buf,
> +                                               TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
> +                                               TARGET_PAGE_SIZE);
> +    } else {
> +        encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
> +                                           TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
> +                                           TARGET_PAGE_SIZE);
> +    }

the else part is the same than the #else part
> +    #else
>      encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
>                                         TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
>                                         TARGET_PAGE_SIZE);
> +    #endif

So, why don't just create a new function pointer:

int (*xbzrle_encode_buffer_func)(uint8_t *old_buf, uint8_t *new_buf, int slen,
                                 uint8_t *dst, int dlen) = xbzrle_encode_buffer;


And aad into init_cpu_flag() something in the line of:

	xbzrle_encode_buffer_func = xbrrle_encode_buffer_512;

?


>      /*
>       * Update the cache contents, so that it corresponds to the data
> diff --git a/migration/xbzrle.c b/migration/xbzrle.c
> index 1ba482ded9..4db09fdbdb 100644
> --- a/migration/xbzrle.c
> +++ b/migration/xbzrle.c
> @@ -174,3 +174,184 @@ int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t *dst, int dlen)
>  
>      return d;
>  }
> +
> +#if defined(CONFIG_AVX512BW_OPT)
> +#pragma GCC push_options
> +#pragma GCC target("avx512bw")
> +
> +#include <immintrin.h>
> +#include <math.h>
> +#define SET_ZERO512(r) r = _mm512_set1_epi32(0)
> +int xbzrle_encode_buffer_512(uint8_t *old_buf, uint8_t *new_buf, int slen,
> +                             uint8_t *dst, int dlen)
> +{

This is just personal taste, but I would rename this to:

xbzrle_encode_buffer_avx512?

> +    uint32_t zrun_len = 0, nzrun_len = 0;
> +    int d = 0, i = 0, num = 0;
> +    uint8_t *nzrun_start = NULL;
> +    int count512s = (slen >> 6);
> +    int res = slen % 64;

res variable here means residual, normally we use "res" with meaning of
"result" in qemu.

> +    bool never_same = true;
> +    while (count512s--) {
> +        if (d + 2 > dlen) {
> +            return -1;
> +        }
> +        __m512i old_data = _mm512_mask_loadu_epi8(old_data,
> +                               0xffffffffffffffff, old_buf + i);
> +        __m512i new_data = _mm512_mask_loadu_epi8(new_data,
> +                                                 0xffffffffffffffff, new_buf + i);
> +        /* in mask bit 1 for same, 0 for diff */
> +        __mmask64  comp = _mm512_cmpeq_epi8_mask(old_data, new_data);
> +
> +        int bytesToCheck = 64;
> +        bool is_same = (comp & 0x1);
> +        while (bytesToCheck) {
> +            if (is_same) {
> +                if (nzrun_len) {
> +                    d += uleb128_encode_small(dst + d, nzrun_len);
> +                    if (d + nzrun_len > dlen) {
> +                        return -1;
> +                    }
> +                    nzrun_start = new_buf + i - nzrun_len;
> +                    memcpy(dst + d, nzrun_start, nzrun_len);
> +                    d += nzrun_len;
> +                    nzrun_len = 0;
> +                }
> +                if (comp == 0xffffffffffffffff) {
> +                    i += 64;
> +                    zrun_len += 64;
> +                    break;
> +                }
> +                never_same = false;
> +                num = __builtin_ctzl(~comp);
> +                num = (num < bytesToCheck) ? num : bytesToCheck;
> +                zrun_len += num;
> +                bytesToCheck -= num;
> +                comp >>= num;
> +                i += num;
> +                if (bytesToCheck) {
> +                    /* still has different data after same data */
> +                    d += uleb128_encode_small(dst + d, zrun_len);
> +                    zrun_len = 0;
> +                } else {
> +                    break;
> +                }
> +            }
> +            if (never_same || zrun_len) {
> +                /*
> +                 * never_same only acts if
> +                 * data begins with diff in first count512s
> +                 */
> +                d += uleb128_encode_small(dst + d, zrun_len);
> +                zrun_len = 0;
> +                never_same = false;
> +            }
> +            /* has diff */
> +            if ((bytesToCheck == 64) && (comp == 0x0)) {
> +                i += 64;
> +                nzrun_len += 64;
> +                break;
> +            }
> +            num = __builtin_ctzl(comp);
> +            num = (num < bytesToCheck) ? num : bytesToCheck;
> +            nzrun_len += num;
> +            bytesToCheck -= num;
> +            comp >>= num;
> +            i += num;
> +            if (bytesToCheck) {
> +                /* mask like 111000 */
> +                d += uleb128_encode_small(dst + d, nzrun_len);
> +                /* overflow */
> +                if (d + nzrun_len > dlen) {
> +                    return -1;
> +                }
> +                nzrun_start = new_buf + i - nzrun_len;
> +                memcpy(dst + d, nzrun_start, nzrun_len);
> +                d += nzrun_len;
> +                nzrun_len = 0;
> +                is_same = true;
> +            }
> +        }
> +    }
> +    if (res) {
> +        /* the number of data is less than 64 */
> +        unsigned long long mask = pow(2, res);

Not your fault.

21st century.  Someone still use long long in a new API, sniff.

> +        mask -= 1;
> +        __m512i r = SET_ZERO512(r);
> +        __m512i old_data = _mm512_mask_loadu_epi8(r, mask, old_buf + i);
> +        __m512i new_data = _mm512_mask_loadu_epi8(r, mask, new_buf + i);
> +        __mmask64 comp = _mm512_cmpeq_epi8_mask(old_data, new_data);
> +
> +        int bytesToCheck = res;
> +        bool is_same = (comp & 0x1);
> +        while (bytesToCheck) {
> +            if (is_same) {
> +                if (nzrun_len) {
> +                    d += uleb128_encode_small(dst + d, nzrun_len);
> +                    if (d + nzrun_len > dlen) {
> +                        return -1;
> +                    }
> +                    nzrun_start = new_buf + i - nzrun_len;
> +                    memcpy(dst + d, nzrun_start, nzrun_len);
> +                    d += nzrun_len;
> +                    nzrun_len = 0;
> +                }
> +                never_same = false;
> +                num = __builtin_ctzl(~comp);
> +                num = (num < bytesToCheck) ? num : bytesToCheck;
> +                zrun_len += num;
> +                bytesToCheck -= num;
> +                comp >>= num;
> +                i += num;
> +                if (bytesToCheck) {
> +                    /* diff after same */
> +                    d += uleb128_encode_small(dst + d, zrun_len);
> +                    zrun_len = 0;
> +                } else {
> +                    break;
> +                }
> +            }
> +
> +            if (never_same || zrun_len) {
> +                d += uleb128_encode_small(dst + d, zrun_len);
> +                zrun_len = 0;
> +                never_same = false;
> +            }
> +            /* has diff */
> +            num = __builtin_ctzl(comp);
> +            num = (num < bytesToCheck) ? num : bytesToCheck;
> +            nzrun_len += num;
> +            bytesToCheck -= num;
> +            comp >>= num;
> +            i += num;
> +            if (bytesToCheck) {
> +                d += uleb128_encode_small(dst + d, nzrun_len);
> +                /* overflow */
> +                if (d + nzrun_len > dlen) {
> +                    return -1;
> +                }
> +                nzrun_start = new_buf + i - nzrun_len;
> +                memcpy(dst + d, nzrun_start, nzrun_len);
> +                d += nzrun_len;
> +                nzrun_len = 0;
> +                is_same = true;
> +            }
> +        }
> +    }
> +
> +    if (zrun_len) {
> +        return (zrun_len == slen) ? 0 : d;
> +    }
> +    if (nzrun_len != 0) {
> +        d += uleb128_encode_small(dst + d, nzrun_len);
> +        /* overflow */
> +        if (d + nzrun_len > dlen) {
> +            return -1;
> +        }
> +        nzrun_start = new_buf + i - nzrun_len;
> +        memcpy(dst + d, nzrun_start, nzrun_len);
> +        d += nzrun_len;
> +    }
> +    return d;
> +}
> +#pragma GCC pop_options
> +#endif
> \ No newline at end of file
> diff --git a/migration/xbzrle.h b/migration/xbzrle.h
> index a0db507b9c..6247de5f00 100644
> --- a/migration/xbzrle.h
> +++ b/migration/xbzrle.h
> @@ -18,4 +18,8 @@ int xbzrle_encode_buffer(uint8_t *old_buf, uint8_t *new_buf, int slen,
>                           uint8_t *dst, int dlen);
>  
>  int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t *dst, int dlen);
> +#if defined(CONFIG_AVX512BW_OPT)
> +int xbzrle_encode_buffer_512(uint8_t *old_buf, uint8_t *new_buf, int slen,
> +                             uint8_t *dst, int dlen);
> +#endif
>  #endif



  reply	other threads:[~2022-08-08 13:14 UTC|newest]

Thread overview: 14+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-08-08  7:48 [PATCH v3 0/2] This patch updates runtime check of AVX512 ling xu
2022-08-08  7:48 ` [PATCH v3 1/2] Update AVX512 support for xbzrle_encode_buffer function ling xu
2022-08-08 13:12   ` Juan Quintela [this message]
2022-08-09  7:51     ` Xu, Ling1
2022-08-09 18:25       ` Richard Henderson
2022-08-11  7:23         ` Xu, Ling1
2022-08-09 18:41   ` Richard Henderson
2022-08-08  7:48 ` [PATCH v3 2/2] Test code for AVX512 support for xbzrle_encode_buffer ling xu
2022-08-08  8:08   ` Thomas Huth
2022-08-08  8:30     ` Xu, Ling1
2022-08-09 18:30   ` Richard Henderson
2022-08-08 11:54 ` [PATCH v3 0/2] This patch updates runtime check of AVX512 Juan Quintela
2022-08-09  1:19   ` Xu, Ling1
  -- strict thread matches above, loose matches on Subject: below --
2022-08-08  7:34 ling xu
2022-08-08  7:34 ` [PATCH v3 1/2] Update AVX512 support for xbzrle_encode_buffer function ling xu

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=87r11qnakk.fsf@secure.mitica \
    --to=quintela@redhat.com \
    --cc=dgilbert@redhat.com \
    --cc=jun.i.jin@intel.com \
    --cc=ling1.xu@intel.com \
    --cc=qemu-devel@nongnu.org \
    --cc=zhou.zhao@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.