qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v5 0/2] This patch updates AVX512 support for xbzrle
@ 2022-08-18  9:35 ling xu
  2022-08-18  9:35 ` [PATCH v5 1/2] Update AVX512 support for xbzrle_encode_buffer ling xu
  2022-08-18  9:35 ` [PATCH v5 2/2] Test code and benchmark code ling xu
  0 siblings, 2 replies; 5+ messages in thread
From: ling xu @ 2022-08-18  9:35 UTC (permalink / raw)
  To: qemu-devel; +Cc: quintela, dgilbert, ling xu, Zhou Zhao, Jun Jin

This patch updates code of avx512 support for xbzrle_encode_buffer
function. We modified runtime check of avx512 and simplified algorithm.
Besides, we provide benchmark in xbzrle-bench.c for performance
comparison.

Signed-off-by: ling xu <ling1.xu@intel.com>
Co-authored-by: Zhou Zhao <zhou.zhao@intel.com>
Co-authored-by: Jun Jin <jun.i.jin@intel.com>

ling xu (2):
  Update AVX512 support for xbzrle_encode_buffer
  Test code and benchmark code

 meson.build                |  16 ++
 meson_options.txt          |   2 +
 migration/ram.c            |  35 ++-
 migration/xbzrle.c         | 130 +++++++++++
 migration/xbzrle.h         |   4 +
 tests/bench/meson.build    |   4 +
 tests/bench/xbzrle-bench.c | 468 +++++++++++++++++++++++++++++++++++++
 tests/unit/test-xbzrle.c   |  39 +++-
 8 files changed, 690 insertions(+), 8 deletions(-)
 create mode 100644 tests/bench/xbzrle-bench.c

-- 
2.25.1



^ permalink raw reply	[flat|nested] 5+ messages in thread

* [PATCH v5 1/2] Update AVX512 support for xbzrle_encode_buffer
  2022-08-18  9:35 [PATCH v5 0/2] This patch updates AVX512 support for xbzrle ling xu
@ 2022-08-18  9:35 ` ling xu
  2022-08-24  8:42   ` Juan Quintela
  2022-08-18  9:35 ` [PATCH v5 2/2] Test code and benchmark code ling xu
  1 sibling, 1 reply; 5+ messages in thread
From: ling xu @ 2022-08-18  9:35 UTC (permalink / raw)
  To: qemu-devel; +Cc: quintela, dgilbert, ling xu, Zhou Zhao, Jun Jin

This commit updates code of avx512 support for xbzrle_encode_buffer function to
accelerate xbzrle encoding speed. We add runtime check of avx512 and add
benchmark for this feature. Compared with C version of
xbzrle_encode_buffer function, avx512 version can achieve 50%-70%
performance improvement on benchmarking. In addition, if dirty data is
randomly located in 4K page, the avx512 version can achieve almost 140%
performance gain.

Signed-off-by: ling xu <ling1.xu@intel.com>
Co-authored-by: Zhou Zhao <zhou.zhao@intel.com>
Co-authored-by: Jun Jin <jun.i.jin@intel.com>
---
 meson.build        |  16 ++++++
 meson_options.txt  |   2 +
 migration/ram.c    |  35 ++++++++++--
 migration/xbzrle.c | 130 +++++++++++++++++++++++++++++++++++++++++++++
 migration/xbzrle.h |   4 ++
 5 files changed, 184 insertions(+), 3 deletions(-)

diff --git a/meson.build b/meson.build
index 30a380752c..c9d90a5bff 100644
--- a/meson.build
+++ b/meson.build
@@ -2264,6 +2264,22 @@ config_host_data.set('CONFIG_AVX512F_OPT', get_option('avx512f') \
     int main(int argc, char *argv[]) { return bar(argv[0]); }
   '''), error_message: 'AVX512F not available').allowed())
 
+config_host_data.set('CONFIG_AVX512BW_OPT', get_option('avx512bw') \
+  .require(have_cpuid_h, error_message: 'cpuid.h not available, cannot enable AVX512BW') \
+  .require(cc.links('''
+    #pragma GCC push_options
+    #pragma GCC target("avx512bw")
+    #include <cpuid.h>
+    #include <immintrin.h>
+    static int bar(void *a) {
+
+      __m512i x = *(__m512i *)a;
+      __m512i res= _mm512_abs_epi8(x);
+      return res[1];
+    }
+    int main(int argc, char *argv[]) { return bar(argv[0]); }
+  '''), error_message: 'AVX512BW not available').allowed())
+
 have_pvrdma = get_option('pvrdma') \
   .require(rdma.found(), error_message: 'PVRDMA requires OpenFabrics libraries') \
   .require(cc.compiles(gnu_source_prefix + '''
diff --git a/meson_options.txt b/meson_options.txt
index e58e158396..07194bf680 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -104,6 +104,8 @@ option('avx2', type: 'feature', value: 'auto',
        description: 'AVX2 optimizations')
 option('avx512f', type: 'feature', value: 'disabled',
        description: 'AVX512F optimizations')
+option('avx512bw', type: 'feature', value: 'auto',
+       description: 'AVX512BW optimizations')
 option('keyring', type: 'feature', value: 'auto',
        description: 'Linux keyring support')
 
diff --git a/migration/ram.c b/migration/ram.c
index dc1de9ddbc..387bef2675 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -83,6 +83,35 @@
 /* 0x80 is reserved in migration.h start with 0x100 next */
 #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
 
+int (*xbzrle_encode_buffer_func)(uint8_t *, uint8_t *, int,
+     uint8_t *, int) = xbzrle_encode_buffer;
+#if defined(CONFIG_AVX512BW_OPT)
+#include "qemu/cpuid.h"
+static void __attribute__((constructor)) init_cpu_flag(void)
+{
+    unsigned max = __get_cpuid_max(0, NULL);
+    int a, b, c, d;
+    if (max >= 1) {
+        __cpuid(1, a, b, c, d);
+         /* We must check that AVX is not just available, but usable.  */
+        if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
+            int bv;
+            __asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
+            __cpuid_count(7, 0, a, b, c, d);
+           /* 0xe6:
+            *  XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15
+            *                    and ZMM16-ZMM31 state are enabled by OS)
+            *  XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS)
+            */
+            if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) {
+                xbzrle_encode_buffer_func = xbzrle_encode_buffer_avx512;
+            }
+        }
+    }
+    return ;
+}
+#endif
+
 XBZRLECacheStats xbzrle_counters;
 
 /* struct contains XBZRLE cache and a static page
@@ -802,9 +831,9 @@ static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
 
     /* XBZRLE encoding (if there is no overflow) */
-    encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
-                                       TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
-                                       TARGET_PAGE_SIZE);
+    encoded_len = xbzrle_encode_buffer_func(prev_cached_page, XBZRLE.current_buf,
+                                            TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
+                                            TARGET_PAGE_SIZE);
 
     /*
      * Update the cache contents, so that it corresponds to the data
diff --git a/migration/xbzrle.c b/migration/xbzrle.c
index 1ba482ded9..6da7f79625 100644
--- a/migration/xbzrle.c
+++ b/migration/xbzrle.c
@@ -174,3 +174,133 @@ int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t *dst, int dlen)
 
     return d;
 }
+
+#if defined(CONFIG_AVX512BW_OPT)
+#pragma GCC push_options
+#pragma GCC target("avx512bw")
+#include <immintrin.h>
+int xbzrle_encode_buffer_avx512(uint8_t *old_buf, uint8_t *new_buf, int slen,
+                             uint8_t *dst, int dlen)
+{
+    uint32_t zrun_len = 0, nzrun_len = 0;
+    int d = 0, i = 0, num = 0;
+    uint8_t *nzrun_start = NULL;
+    /* add 1 to include residual part in main loop */
+    uint32_t count512s = (slen >> 6) + 1;
+    /* countResidual is tail of data, i.e., countResidual = slen % 64 */
+    uint32_t countResidual = slen & 0b111111;
+    bool never_same = true;
+    uint64_t maskResidual = 1;
+    maskResidual <<= countResidual;
+    maskResidual -=1;
+    uint64_t comp = 0;
+    int bytesToCheck = 0;
+
+    while (count512s) {
+        if (d + 2 > dlen) {
+            return -1;
+        }
+
+        if(count512s != 1){
+            __m512i old_data = _mm512_mask_loadu_epi8(old_data,
+                                                      0xffffffffffffffff, old_buf + i);
+            __m512i new_data = _mm512_mask_loadu_epi8(new_data,
+                                                      0xffffffffffffffff, new_buf + i);
+            comp = _mm512_cmpeq_epi8_mask(old_data, new_data);
+            bytesToCheck = 64;
+            count512s--;
+        } else {
+            __m512i old_data = _mm512_mask_loadu_epi8(old_data,
+                                                      maskResidual, old_buf + i);
+            __m512i new_data = _mm512_mask_loadu_epi8(new_data,
+                                                      maskResidual, new_buf + i);
+            comp = _mm512_cmpeq_epi8_mask(old_data, new_data);
+            bytesToCheck = countResidual;
+            count512s--;
+        }
+
+        bool is_same = (comp & 0x1);
+        while (bytesToCheck) {
+            if (is_same) {
+                if (nzrun_len) {
+                    d += uleb128_encode_small(dst + d, nzrun_len);
+                    if (d + nzrun_len > dlen) {
+                        return -1;
+                    }
+                    nzrun_start = new_buf + i - nzrun_len;
+                    memcpy(dst + d, nzrun_start, nzrun_len);
+                    d += nzrun_len;
+                    nzrun_len = 0;
+                }
+                /* 64 data at a time for speed */
+                if (count512s && (comp == 0xffffffffffffffff)) {
+                    i += 64;
+                    zrun_len += 64;
+                    break;
+                }
+                never_same = false;
+                num = __builtin_ctzll(~comp);
+                num = (num < bytesToCheck) ? num : bytesToCheck;
+                zrun_len += num;
+                bytesToCheck -= num;
+                comp >>= num;
+                i += num;
+                if (bytesToCheck) {
+                    /* still has different data after same data */
+                    d += uleb128_encode_small(dst + d, zrun_len);
+                    zrun_len = 0;
+                } else {
+                    break;
+                }
+            }
+            if (never_same || zrun_len) {
+                /*
+                 * never_same only acts if
+                 * data begins with diff in first count512s
+                 */
+                d += uleb128_encode_small(dst + d, zrun_len);
+                zrun_len = 0;
+                never_same = false;
+            }
+            /* has diff, 64 data at a time for speed */
+            if ((bytesToCheck == 64) && (comp == 0x0)) {
+                i += 64;
+                nzrun_len += 64;
+                break;
+            }
+            num = __builtin_ctzll(comp);
+            num = (num < bytesToCheck) ? num : bytesToCheck;
+            nzrun_len += num;
+            bytesToCheck -= num;
+            comp >>= num;
+            i += num;
+            if (bytesToCheck) {
+                /* mask like 111000 */
+                d += uleb128_encode_small(dst + d, nzrun_len);
+                /* overflow */
+                if (d + nzrun_len > dlen) {
+                    return -1;
+                }
+                nzrun_start = new_buf + i - nzrun_len;
+                memcpy(dst + d, nzrun_start, nzrun_len);
+                d += nzrun_len;
+                nzrun_len = 0;
+                is_same = true;
+            }
+        }
+    }
+
+    if (nzrun_len != 0) {
+        d += uleb128_encode_small(dst + d, nzrun_len);
+        /* overflow */
+        if (d + nzrun_len > dlen) {
+            return -1;
+        }
+        nzrun_start = new_buf + i - nzrun_len;
+        memcpy(dst + d, nzrun_start, nzrun_len);
+        d += nzrun_len;
+    }
+    return d;
+}
+#pragma GCC pop_options
+#endif
diff --git a/migration/xbzrle.h b/migration/xbzrle.h
index a0db507b9c..6feb49160a 100644
--- a/migration/xbzrle.h
+++ b/migration/xbzrle.h
@@ -18,4 +18,8 @@ int xbzrle_encode_buffer(uint8_t *old_buf, uint8_t *new_buf, int slen,
                          uint8_t *dst, int dlen);
 
 int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t *dst, int dlen);
+#if defined(CONFIG_AVX512BW_OPT)
+int xbzrle_encode_buffer_avx512(uint8_t *old_buf, uint8_t *new_buf, int slen,
+                                uint8_t *dst, int dlen);
+#endif
 #endif
-- 
2.25.1



^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH v5 2/2] Test code and benchmark code
  2022-08-18  9:35 [PATCH v5 0/2] This patch updates AVX512 support for xbzrle ling xu
  2022-08-18  9:35 ` [PATCH v5 1/2] Update AVX512 support for xbzrle_encode_buffer ling xu
@ 2022-08-18  9:35 ` ling xu
  1 sibling, 0 replies; 5+ messages in thread
From: ling xu @ 2022-08-18  9:35 UTC (permalink / raw)
  To: qemu-devel; +Cc: quintela, dgilbert, ling xu, Zhou Zhao, Jun Jin

Test code in test-xbzrle.c, benchmark code in xbzrle-bench.c for
performance benchmarking.

Signed-off-by: ling xu <ling1.xu@intel.com>
Co-authored-by: Zhou Zhao <zhou.zhao@intel.com>
Co-authored-by: Jun Jin <jun.i.jin@intel.com>
---
 tests/bench/meson.build    |   4 +
 tests/bench/xbzrle-bench.c | 468 +++++++++++++++++++++++++++++++++++++
 tests/unit/test-xbzrle.c   |  39 +++-
 3 files changed, 506 insertions(+), 5 deletions(-)
 create mode 100644 tests/bench/xbzrle-bench.c

diff --git a/tests/bench/meson.build b/tests/bench/meson.build
index 279a8fcc33..daefead58d 100644
--- a/tests/bench/meson.build
+++ b/tests/bench/meson.build
@@ -3,6 +3,10 @@ qht_bench = executable('qht-bench',
                        sources: 'qht-bench.c',
                        dependencies: [qemuutil])
 
+xbzrle_bench = executable('xbzrle-bench',
+                       sources: 'xbzrle-bench.c',
+                       dependencies: [qemuutil,migration])
+
 executable('atomic_add-bench',
            sources: files('atomic_add-bench.c'),
            dependencies: [qemuutil],
diff --git a/tests/bench/xbzrle-bench.c b/tests/bench/xbzrle-bench.c
new file mode 100644
index 0000000000..6ffac62e15
--- /dev/null
+++ b/tests/bench/xbzrle-bench.c
@@ -0,0 +1,468 @@
+/*
+ * Xor Based Zero Run Length Encoding unit tests.
+ *
+ * Copyright 2013 Red Hat, Inc. and/or its affiliates
+ *
+ * Authors:
+ *  Orit Wasserman  <owasserm@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+#include "qemu/osdep.h"
+#include "qemu/cutils.h"
+#include "../migration/xbzrle.h"
+
+#define XBZRLE_PAGE_SIZE 4096
+
+//int (*xbzrle_encode_buffer_func)(uint8_t *, uint8_t *, int,
+//     uint8_t *, int) = xbzrle_encode_buffer;
+#if defined(CONFIG_AVX512BW_OPT)
+static bool is_cpu_support_avx512bw;
+#include "qemu/cpuid.h"
+static void __attribute__((constructor)) init_cpu_flag(void)
+{
+    unsigned max = __get_cpuid_max(0, NULL);
+    int a, b, c, d;
+    is_cpu_support_avx512bw = false;
+    if (max >= 1) {
+        __cpuid(1, a, b, c, d);
+         /* We must check that AVX is not just available, but usable.  */
+        if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
+            int bv;
+            __asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
+            __cpuid_count(7, 0, a, b, c, d);
+           /* 0xe6:
+            *  XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15
+            *                    and ZMM16-ZMM31 state are enabled by OS)
+            *  XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS)
+            */
+            if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) {
+                //xbzrle_encode_buffer_func = xbzrle_encode_buffer_avx512;
+                is_cpu_support_avx512bw = true;
+            }
+        }
+    }
+    return ;
+}
+#endif
+
+struct ResTime{
+    float t_raw;
+    float t_512;
+};
+
+static void encode_decode_zero(struct ResTime *res)
+{
+    uint8_t *buffer = g_malloc0(XBZRLE_PAGE_SIZE);
+    uint8_t *compressed = g_malloc0(XBZRLE_PAGE_SIZE);
+    uint8_t *buffer512 = g_malloc0(XBZRLE_PAGE_SIZE);
+    uint8_t *compressed512 = g_malloc0(XBZRLE_PAGE_SIZE);
+    int i = 0;
+    int dlen = 0, dlen512 = 0;
+    int diff_len = g_test_rand_int_range(0, XBZRLE_PAGE_SIZE - 1006);
+
+    for (i = diff_len; i > 0; i--) {
+        buffer[1000 + i] = i;
+        buffer512[1000 + i] = i;
+    }
+
+    buffer[1000 + diff_len + 3] = 103;
+    buffer[1000 + diff_len + 5] = 105;
+
+    buffer512[1000 + diff_len + 3] = 103;
+    buffer512[1000 + diff_len + 5] = 105;
+
+    /* encode zero page */
+    time_t t_start, t_end, t_start512, t_end512;
+    t_start = clock();
+    dlen = xbzrle_encode_buffer(buffer, buffer, XBZRLE_PAGE_SIZE, compressed,
+                       XBZRLE_PAGE_SIZE);
+    t_end = clock();
+    float time_val = difftime(t_end, t_start);
+    g_assert(dlen == 0);
+
+    t_start512 = clock();
+    dlen512 = xbzrle_encode_buffer_avx512(buffer512, buffer512, XBZRLE_PAGE_SIZE,
+                                       compressed512, XBZRLE_PAGE_SIZE);
+    t_end512 = clock();
+    float time_val512 = difftime(t_end512, t_start512);
+    g_assert(dlen512 == 0);
+
+    res->t_raw = time_val;
+    res->t_512 = time_val512;
+
+    g_free(buffer);
+    g_free(compressed);
+    g_free(buffer512);
+    g_free(compressed512);
+
+}
+
+static void test_encode_decode_zero_avx512(void)
+{
+    int i;
+    float time_raw = 0.0, time_512 = 0.0;
+    struct ResTime res;
+    for (i = 0; i < 10000; i++) {
+        encode_decode_zero(&res);
+        time_raw += res.t_raw;
+        time_512 += res.t_512;
+    }
+    printf("Zero test:\n");
+    printf("Raw xbzrle_encode time is %f ms\n", time_raw);
+    printf("512 xbzrle_encode time is %f ms\n", time_512);
+}
+
+static void encode_decode_unchanged(struct ResTime *res)
+{
+    uint8_t *compressed = g_malloc0(XBZRLE_PAGE_SIZE);
+    uint8_t *test = g_malloc0(XBZRLE_PAGE_SIZE);
+    uint8_t *compressed512 = g_malloc0(XBZRLE_PAGE_SIZE);
+    uint8_t *test512 = g_malloc0(XBZRLE_PAGE_SIZE);
+    int i = 0;
+    int dlen = 0, dlen512 = 0;
+    int diff_len = g_test_rand_int_range(0, XBZRLE_PAGE_SIZE - 1006);
+
+    for (i = diff_len; i > 0; i--) {
+        test[1000 + i] = i + 4;
+        test512[1000 + i] = i + 4;
+    }
+
+    test[1000 + diff_len + 3] = 107;
+    test[1000 + diff_len + 5] = 109;
+
+    test512[1000 + diff_len + 3] = 107;
+    test512[1000 + diff_len + 5] = 109;
+
+    /* test unchanged buffer */
+    time_t t_start, t_end, t_start512, t_end512;
+    t_start = clock();
+    dlen = xbzrle_encode_buffer(test, test, XBZRLE_PAGE_SIZE, compressed,
+                                XBZRLE_PAGE_SIZE);
+    t_end = clock();
+    float time_val = difftime(t_end, t_start);
+    g_assert(dlen == 0);
+
+    t_start512 = clock();
+    dlen512 = xbzrle_encode_buffer_avx512(test512, test512, XBZRLE_PAGE_SIZE,
+                                       compressed512, XBZRLE_PAGE_SIZE);
+    t_end512 = clock();
+    float time_val512 = difftime(t_end512, t_start512);
+    g_assert(dlen512 == 0);
+
+    res->t_raw = time_val;
+    res->t_512 = time_val512;
+
+    g_free(test);
+    g_free(compressed);
+    g_free(test512);
+    g_free(compressed512);
+
+}
+
+static void test_encode_decode_unchanged_avx512(void)
+{
+    int i;
+    float time_raw = 0.0, time_512 = 0.0;
+    struct ResTime res;
+    for (i = 0; i < 10000; i++) {
+        encode_decode_unchanged(&res);
+        time_raw += res.t_raw;
+        time_512 += res.t_512;
+    }
+    printf("Unchanged test:\n");
+    printf("Raw xbzrle_encode time is %f ms\n", time_raw);
+    printf("512 xbzrle_encode time is %f ms\n", time_512);
+}
+
+static void encode_decode_1_byte(struct ResTime *res)
+{
+    uint8_t *buffer = g_malloc0(XBZRLE_PAGE_SIZE);
+    uint8_t *test = g_malloc0(XBZRLE_PAGE_SIZE);
+    uint8_t *compressed = g_malloc(XBZRLE_PAGE_SIZE);
+    uint8_t *buffer512 = g_malloc0(XBZRLE_PAGE_SIZE);
+    uint8_t *test512 = g_malloc0(XBZRLE_PAGE_SIZE);
+    uint8_t *compressed512 = g_malloc(XBZRLE_PAGE_SIZE);
+    int dlen = 0, rc = 0, dlen512 = 0, rc512 = 0;
+    uint8_t buf[2];
+    uint8_t buf512[2];
+
+    test[XBZRLE_PAGE_SIZE - 1] = 1;
+    test512[XBZRLE_PAGE_SIZE - 1] = 1;
+
+    time_t t_start, t_end, t_start512, t_end512;
+    t_start = clock();
+    dlen = xbzrle_encode_buffer(buffer, test, XBZRLE_PAGE_SIZE, compressed,
+                       XBZRLE_PAGE_SIZE);
+    t_end = clock();
+    float time_val = difftime(t_end, t_start);
+    g_assert(dlen == (uleb128_encode_small(&buf[0], 4095) + 2));
+
+    rc = xbzrle_decode_buffer(compressed, dlen, buffer, XBZRLE_PAGE_SIZE);
+    g_assert(rc == XBZRLE_PAGE_SIZE);
+    g_assert(memcmp(test, buffer, XBZRLE_PAGE_SIZE) == 0);
+
+    t_start512 = clock();
+    dlen512 = xbzrle_encode_buffer_avx512(buffer512, test512, XBZRLE_PAGE_SIZE,
+                                       compressed512, XBZRLE_PAGE_SIZE);
+    t_end512 = clock();
+    float time_val512 = difftime(t_end512, t_start512);
+    g_assert(dlen512 == (uleb128_encode_small(&buf512[0], 4095) + 2));
+
+    rc512 = xbzrle_decode_buffer(compressed512, dlen512, buffer512,
+                                 XBZRLE_PAGE_SIZE);
+    g_assert(rc512 == XBZRLE_PAGE_SIZE);
+    g_assert(memcmp(test512, buffer512, XBZRLE_PAGE_SIZE) == 0);
+
+    res->t_raw = time_val;
+    res->t_512 = time_val512;
+
+    g_free(buffer);
+    g_free(compressed);
+    g_free(test);
+    g_free(buffer512);
+    g_free(compressed512);
+    g_free(test512);
+
+}
+
+static void test_encode_decode_1_byte_avx512(void)
+{
+    int i;
+    float time_raw = 0.0, time_512 = 0.0;
+    struct ResTime res;
+    for (i = 0; i < 10000; i++) {
+        encode_decode_1_byte(&res);
+        time_raw += res.t_raw;
+        time_512 += res.t_512;
+    }
+    printf("1 byte test:\n");
+    printf("Raw xbzrle_encode time is %f ms\n", time_raw);
+    printf("512 xbzrle_encode time is %f ms\n", time_512);
+}
+
+static void encode_decode_overflow(struct ResTime *res)
+{
+    uint8_t *compressed = g_malloc0(XBZRLE_PAGE_SIZE);
+    uint8_t *test = g_malloc0(XBZRLE_PAGE_SIZE);
+    uint8_t *buffer = g_malloc0(XBZRLE_PAGE_SIZE);
+    uint8_t *compressed512 = g_malloc0(XBZRLE_PAGE_SIZE);
+    uint8_t *test512 = g_malloc0(XBZRLE_PAGE_SIZE);
+    uint8_t *buffer512 = g_malloc0(XBZRLE_PAGE_SIZE);
+    int i = 0, rc = 0, rc512 = 0;
+
+    for (i = 0; i < XBZRLE_PAGE_SIZE / 2 - 1; i++) {
+        test[i * 2] = 1;
+        test512[i * 2] = 1;
+    }
+
+    /* encode overflow */
+    time_t t_start, t_end, t_start512, t_end512;
+    t_start = clock();
+    rc = xbzrle_encode_buffer(buffer, test, XBZRLE_PAGE_SIZE, compressed,
+                              XBZRLE_PAGE_SIZE);
+    t_end = clock();
+    float time_val = difftime(t_end, t_start);
+    g_assert(rc == -1);
+
+    t_start512 = clock();
+    rc512 = xbzrle_encode_buffer_avx512(buffer512, test512, XBZRLE_PAGE_SIZE,
+                                     compressed512, XBZRLE_PAGE_SIZE);
+    t_end512 = clock();
+    float time_val512 = difftime(t_end512, t_start512);
+    g_assert(rc512 == -1);
+
+    res->t_raw = time_val;
+    res->t_512 = time_val512;
+
+    g_free(buffer);
+    g_free(compressed);
+    g_free(test);
+    g_free(buffer512);
+    g_free(compressed512);
+    g_free(test512);
+
+}
+
+static void test_encode_decode_overflow_avx512(void)
+{
+    int i;
+    float time_raw = 0.0, time_512 = 0.0;
+    struct ResTime res;
+    for (i = 0; i < 10000; i++) {
+        encode_decode_overflow(&res);
+        time_raw += res.t_raw;
+        time_512 += res.t_512;
+    }
+    printf("Overflow test:\n");
+    printf("Raw xbzrle_encode time is %f ms\n", time_raw);
+    printf("512 xbzrle_encode time is %f ms\n", time_512);
+}
+
+static void encode_decode_range_avx512(struct ResTime *res)
+{
+    uint8_t *buffer = g_malloc0(XBZRLE_PAGE_SIZE);
+    uint8_t *compressed = g_malloc(XBZRLE_PAGE_SIZE);
+    uint8_t *test = g_malloc0(XBZRLE_PAGE_SIZE);
+    uint8_t *buffer512 = g_malloc0(XBZRLE_PAGE_SIZE);
+    uint8_t *compressed512 = g_malloc(XBZRLE_PAGE_SIZE);
+    uint8_t *test512 = g_malloc0(XBZRLE_PAGE_SIZE);
+    int i = 0, rc = 0, rc512 = 0;
+    int dlen = 0, dlen512 = 0;
+
+    int diff_len = g_test_rand_int_range(0, XBZRLE_PAGE_SIZE - 1006);
+
+    for (i = diff_len; i > 0; i--) {
+        buffer[1000 + i] = i;
+        test[1000 + i] = i + 4;
+        buffer512[1000 + i] = i;
+        test512[1000 + i] = i + 4;
+    }
+
+    buffer[1000 + diff_len + 3] = 103;
+    test[1000 + diff_len + 3] = 107;
+
+    buffer[1000 + diff_len + 5] = 105;
+    test[1000 + diff_len + 5] = 109;
+
+    buffer512[1000 + diff_len + 3] = 103;
+    test512[1000 + diff_len + 3] = 107;
+
+    buffer512[1000 + diff_len + 5] = 105;
+    test512[1000 + diff_len + 5] = 109;
+
+    /* test encode/decode */
+    time_t t_start, t_end, t_start512, t_end512;
+    t_start = clock();
+    dlen = xbzrle_encode_buffer(test, buffer, XBZRLE_PAGE_SIZE, compressed,
+                                XBZRLE_PAGE_SIZE);
+    t_end = clock();
+    float time_val = difftime(t_end, t_start);
+    rc = xbzrle_decode_buffer(compressed, dlen, test, XBZRLE_PAGE_SIZE);
+    g_assert(rc < XBZRLE_PAGE_SIZE);
+    g_assert(memcmp(test, buffer, XBZRLE_PAGE_SIZE) == 0);
+
+    t_start512 = clock();
+    dlen512 = xbzrle_encode_buffer_avx512(test512, buffer512, XBZRLE_PAGE_SIZE,
+                                       compressed512, XBZRLE_PAGE_SIZE);
+    t_end512 = clock();
+    float time_val512 = difftime(t_end512, t_start512);
+    rc512 = xbzrle_decode_buffer(compressed512, dlen512, test512, XBZRLE_PAGE_SIZE);
+    g_assert(rc512 < XBZRLE_PAGE_SIZE);
+    g_assert(memcmp(test512, buffer512, XBZRLE_PAGE_SIZE) == 0);
+
+    res->t_raw = time_val;
+    res->t_512 = time_val512;
+
+    g_free(buffer);
+    g_free(compressed);
+    g_free(test);
+    g_free(buffer512);
+    g_free(compressed512);
+    g_free(test512);
+
+}
+
+static void test_encode_decode_avx512(void)
+{
+    int i;
+    float time_raw = 0.0, time_512 = 0.0;
+    struct ResTime res;
+    for (i = 0; i < 10000; i++) {
+        encode_decode_range_avx512(&res);
+        time_raw += res.t_raw;
+        time_512 += res.t_512;
+    }
+    printf("Encode decode test:\n");
+    printf("Raw xbzrle_encode time is %f ms\n", time_raw);
+    printf("512 xbzrle_encode time is %f ms\n", time_512);
+}
+
+static void encode_decode_random(struct ResTime *res)
+{
+    uint8_t *buffer = g_malloc0(XBZRLE_PAGE_SIZE);
+    uint8_t *compressed = g_malloc(XBZRLE_PAGE_SIZE);
+    uint8_t *test = g_malloc0(XBZRLE_PAGE_SIZE);
+    uint8_t *buffer512 = g_malloc0(XBZRLE_PAGE_SIZE);
+    uint8_t *compressed512 = g_malloc(XBZRLE_PAGE_SIZE);
+    uint8_t *test512 = g_malloc0(XBZRLE_PAGE_SIZE);
+    int i = 0, rc = 0, rc512 = 0;
+    int dlen = 0, dlen512 = 0;
+
+    int diff_len = g_test_rand_int_range(0, XBZRLE_PAGE_SIZE - 1);
+    /* store the index of diff */
+    int dirty_index[diff_len];
+    for (int j = 0; j < diff_len; j++) {
+        dirty_index[j] = g_test_rand_int_range(0, XBZRLE_PAGE_SIZE - 1);
+    }
+    for (i = diff_len - 1; i >= 0; i--) {
+        buffer[dirty_index[i]] = i;
+        test[dirty_index[i]] = i + 4;
+        buffer512[dirty_index[i]] = i;
+        test512[dirty_index[i]] = i + 4;
+    }
+
+    time_t t_start, t_end, t_start512, t_end512;
+    t_start = clock();
+    dlen = xbzrle_encode_buffer(test, buffer, XBZRLE_PAGE_SIZE, compressed,
+                                XBZRLE_PAGE_SIZE);
+    t_end = clock();
+    float time_val = difftime(t_end, t_start);
+    rc = xbzrle_decode_buffer(compressed, dlen, test, XBZRLE_PAGE_SIZE);
+    g_assert(rc < XBZRLE_PAGE_SIZE);
+
+    t_start512 = clock();
+    dlen512 = xbzrle_encode_buffer_avx512(test512, buffer512, XBZRLE_PAGE_SIZE,
+                                       compressed512, XBZRLE_PAGE_SIZE);
+    t_end512 = clock();
+    float time_val512 = difftime(t_end512, t_start512);
+    rc512 = xbzrle_decode_buffer(compressed512, dlen512, test512, XBZRLE_PAGE_SIZE);
+    g_assert(rc512 < XBZRLE_PAGE_SIZE);
+
+    res->t_raw = time_val;
+    res->t_512 = time_val512;
+
+    g_free(buffer);
+    g_free(compressed);
+    g_free(test);
+    g_free(buffer512);
+    g_free(compressed512);
+    g_free(test512);
+
+}
+
+static void test_encode_decode_random_avx512(void)
+{
+    int i;
+    float time_raw = 0.0, time_512 = 0.0;
+    struct ResTime res;
+    for (i = 0; i < 10000; i++) {
+        encode_decode_random(&res);
+        time_raw += res.t_raw;
+        time_512 += res.t_512;
+    }
+    printf("Random test:\n");
+    printf("Raw xbzrle_encode time is %f ms\n", time_raw);
+    printf("512 xbzrle_encode time is %f ms\n", time_512);
+}
+
+int main(int argc, char **argv)
+{
+    g_test_init(&argc, &argv, NULL);
+    g_test_rand_int();
+    #if defined(CONFIG_AVX512BW_OPT)
+    if (likely(is_cpu_support_avx512bw)) {
+        g_test_add_func("/xbzrle/encode_decode_zero", test_encode_decode_zero_avx512);
+        g_test_add_func("/xbzrle/encode_decode_unchanged",
+                        test_encode_decode_unchanged_avx512);
+        g_test_add_func("/xbzrle/encode_decode_1_byte", test_encode_decode_1_byte_avx512);
+        g_test_add_func("/xbzrle/encode_decode_overflow",
+                        test_encode_decode_overflow_avx512);
+        g_test_add_func("/xbzrle/encode_decode", test_encode_decode_avx512);
+        g_test_add_func("/xbzrle/encode_decode_random", test_encode_decode_random_avx512);
+    }
+    #endif
+    return g_test_run();
+}
diff --git a/tests/unit/test-xbzrle.c b/tests/unit/test-xbzrle.c
index ef951b6e54..547046d093 100644
--- a/tests/unit/test-xbzrle.c
+++ b/tests/unit/test-xbzrle.c
@@ -16,6 +16,35 @@
 
 #define XBZRLE_PAGE_SIZE 4096
 
+int (*xbzrle_encode_buffer_func)(uint8_t *, uint8_t *, int,
+     uint8_t *, int) = xbzrle_encode_buffer;
+#if defined(CONFIG_AVX512BW_OPT)
+#include "qemu/cpuid.h"
+static void __attribute__((constructor)) init_cpu_flag(void)
+{
+    unsigned max = __get_cpuid_max(0, NULL);
+    int a, b, c, d;
+    if (max >= 1) {
+        __cpuid(1, a, b, c, d);
+         /* We must check that AVX is not just available, but usable.  */
+        if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
+            int bv;
+            __asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
+            __cpuid_count(7, 0, a, b, c, d);
+           /* 0xe6:
+            *  XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15
+            *                    and ZMM16-ZMM31 state are enabled by OS)
+            *  XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS)
+            */
+            if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) {
+                xbzrle_encode_buffer_func = xbzrle_encode_buffer_avx512;
+            }
+        }
+    }
+    return ;
+}
+#endif
+
 static void test_uleb(void)
 {
     uint32_t i, val;
@@ -54,7 +83,7 @@ static void test_encode_decode_zero(void)
     buffer[1000 + diff_len + 5] = 105;
 
     /* encode zero page */
-    dlen = xbzrle_encode_buffer(buffer, buffer, XBZRLE_PAGE_SIZE, compressed,
+    dlen = xbzrle_encode_buffer_func(buffer, buffer, XBZRLE_PAGE_SIZE, compressed,
                        XBZRLE_PAGE_SIZE);
     g_assert(dlen == 0);
 
@@ -78,7 +107,7 @@ static void test_encode_decode_unchanged(void)
     test[1000 + diff_len + 5] = 109;
 
     /* test unchanged buffer */
-    dlen = xbzrle_encode_buffer(test, test, XBZRLE_PAGE_SIZE, compressed,
+    dlen = xbzrle_encode_buffer_func(test, test, XBZRLE_PAGE_SIZE, compressed,
                                 XBZRLE_PAGE_SIZE);
     g_assert(dlen == 0);
 
@@ -96,7 +125,7 @@ static void test_encode_decode_1_byte(void)
 
     test[XBZRLE_PAGE_SIZE - 1] = 1;
 
-    dlen = xbzrle_encode_buffer(buffer, test, XBZRLE_PAGE_SIZE, compressed,
+    dlen = xbzrle_encode_buffer_func(buffer, test, XBZRLE_PAGE_SIZE, compressed,
                        XBZRLE_PAGE_SIZE);
     g_assert(dlen == (uleb128_encode_small(&buf[0], 4095) + 2));
 
@@ -121,7 +150,7 @@ static void test_encode_decode_overflow(void)
     }
 
     /* encode overflow */
-    rc = xbzrle_encode_buffer(buffer, test, XBZRLE_PAGE_SIZE, compressed,
+    rc = xbzrle_encode_buffer_func(buffer, test, XBZRLE_PAGE_SIZE, compressed,
                               XBZRLE_PAGE_SIZE);
     g_assert(rc == -1);
 
@@ -152,7 +181,7 @@ static void encode_decode_range(void)
     test[1000 + diff_len + 5] = 109;
 
     /* test encode/decode */
-    dlen = xbzrle_encode_buffer(test, buffer, XBZRLE_PAGE_SIZE, compressed,
+    dlen = xbzrle_encode_buffer_func(test, buffer, XBZRLE_PAGE_SIZE, compressed,
                                 XBZRLE_PAGE_SIZE);
 
     rc = xbzrle_decode_buffer(compressed, dlen, test, XBZRLE_PAGE_SIZE);
-- 
2.25.1



^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH v5 1/2] Update AVX512 support for xbzrle_encode_buffer
  2022-08-18  9:35 ` [PATCH v5 1/2] Update AVX512 support for xbzrle_encode_buffer ling xu
@ 2022-08-24  8:42   ` Juan Quintela
  2022-08-26  9:17     ` Xu, Ling1
  0 siblings, 1 reply; 5+ messages in thread
From: Juan Quintela @ 2022-08-24  8:42 UTC (permalink / raw)
  To: ling xu; +Cc: qemu-devel, dgilbert, Zhou Zhao, Jun Jin

ling xu <ling1.xu@intel.com> wrote:
> This commit updates code of avx512 support for xbzrle_encode_buffer function to
> accelerate xbzrle encoding speed. We add runtime check of avx512 and add
> benchmark for this feature. Compared with C version of
> xbzrle_encode_buffer function, avx512 version can achieve 50%-70%
> performance improvement on benchmarking. In addition, if dirty data is
> randomly located in 4K page, the avx512 version can achieve almost 140%
> performance gain.
>
> Signed-off-by: ling xu <ling1.xu@intel.com>
> Co-authored-by: Zhou Zhao <zhou.zhao@intel.com>
> Co-authored-by: Jun Jin <jun.i.jin@intel.com>
> ---
>  meson.build        |  16 ++++++
>  meson_options.txt  |   2 +
>  migration/ram.c    |  35 ++++++++++--
>  migration/xbzrle.c | 130 +++++++++++++++++++++++++++++++++++++++++++++
>  migration/xbzrle.h |   4 ++
>  5 files changed, 184 insertions(+), 3 deletions(-)
>
> diff --git a/meson.build b/meson.build
> index 30a380752c..c9d90a5bff 100644
> --- a/meson.build
> +++ b/meson.build
> @@ -2264,6 +2264,22 @@ config_host_data.set('CONFIG_AVX512F_OPT', get_option('avx512f') \
>      int main(int argc, char *argv[]) { return bar(argv[0]); }
>    '''), error_message: 'AVX512F not available').allowed())
>  
> +config_host_data.set('CONFIG_AVX512BW_OPT', get_option('avx512bw') \
> +  .require(have_cpuid_h, error_message: 'cpuid.h not available, cannot enable AVX512BW') \
> +  .require(cc.links('''
> +    #pragma GCC push_options
> +    #pragma GCC target("avx512bw")
> +    #include <cpuid.h>
> +    #include <immintrin.h>
> +    static int bar(void *a) {


> +      __m512i x = *(__m512i *)a;
> +      __m512i res= _mm512_abs_epi8(x);

Cast is as ugly as hell, what about:

      __m512i *x = a;
      __m512i res = _mm512_abs_epi8(*x);

??

> +static void __attribute__((constructor)) init_cpu_flag(void)
> +{
> +    unsigned max = __get_cpuid_max(0, NULL);
> +    int a, b, c, d;
> +    if (max >= 1) {
> +        __cpuid(1, a, b, c, d);
> +         /* We must check that AVX is not just available, but usable.  */
> +        if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
> +            int bv;
> +            __asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
> +            __cpuid_count(7, 0, a, b, c, d);
> +           /* 0xe6:
> +            *  XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15
> +            *                    and ZMM16-ZMM31 state are enabled by OS)
> +            *  XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS)
> +            */
> +            if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) {
> +                xbzrle_encode_buffer_func = xbzrle_encode_buffer_avx512;
> +            }
> +        }
> +    }
> +    return ;

This return line is not needed.

> +}
> +#endif
> +
>  XBZRLECacheStats xbzrle_counters;
>  
>  /* struct contains XBZRLE cache and a static page
> @@ -802,9 +831,9 @@ static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
>      memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
>  
>      /* XBZRLE encoding (if there is no overflow) */
> -    encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
> -                                       TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
> -                                       TARGET_PAGE_SIZE);
> +    encoded_len = xbzrle_encode_buffer_func(prev_cached_page, XBZRLE.current_buf,
> +                                            TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
> +                                            TARGET_PAGE_SIZE);
>  
>      /*
>       * Update the cache contents, so that it corresponds to the data
> diff --git a/migration/xbzrle.c b/migration/xbzrle.c
> index 1ba482ded9..6da7f79625 100644
> --- a/migration/xbzrle.c
> +++ b/migration/xbzrle.c
> @@ -174,3 +174,133 @@ int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t *dst, int dlen)
>  
>      return d;
>  }
> +
> +#if defined(CONFIG_AVX512BW_OPT)
> +#pragma GCC push_options
> +#pragma GCC target("avx512bw")
> +#include <immintrin.h>
> +int xbzrle_encode_buffer_avx512(uint8_t *old_buf, uint8_t *new_buf, int slen,
> +                             uint8_t *dst, int dlen)
> +{
> +    uint32_t zrun_len = 0, nzrun_len = 0;
> +    int d = 0, i = 0, num = 0;
> +    uint8_t *nzrun_start = NULL;
> +    /* add 1 to include residual part in main loop */
> +    uint32_t count512s = (slen >> 6) + 1;
> +    /* countResidual is tail of data, i.e., countResidual = slen % 64 */
> +    uint32_t countResidual = slen & 0b111111;
> +    bool never_same = true;
> +    uint64_t maskResidual = 1;
> +    maskResidual <<= countResidual;
> +    maskResidual -=1;
> +    uint64_t comp = 0;
> +    int bytesToCheck = 0;
> +
> +    while (count512s) {
> +        if (d + 2 > dlen) {
> +            return -1;
> +        }
> +
> +        if(count512s != 1){
> +            __m512i old_data = _mm512_mask_loadu_epi8(old_data,
> +                                                      0xffffffffffffffff, old_buf + i);
> +            __m512i new_data = _mm512_mask_loadu_epi8(new_data,
> +                                                      0xffffffffffffffff, new_buf + i);
> +            comp = _mm512_cmpeq_epi8_mask(old_data, new_data);
> +            bytesToCheck = 64;
> +            count512s--;
> +        } else {
> +            __m512i old_data = _mm512_mask_loadu_epi8(old_data,
> +                                                      maskResidual, old_buf + i);
> +            __m512i new_data = _mm512_mask_loadu_epi8(new_data,
> +                                                      maskResidual, new_buf + i);
> +            comp = _mm512_cmpeq_epi8_mask(old_data, new_data);
> +            bytesToCheck = countResidual;
> +            count512s--;
> +        }

It is basically the same in both branches of the if, what about:

        int bytesToCheck = 64;
        uint86_t mask = 0xffffffffffffffff;

        /* I am assuming this is the opposit of the if condition */
        if(count512s == 1){
            mask = maskResidual;
            bytesToCheck = countResidual;
        }
        __m512i old_data = _mm512_mask_loadu_epi8(old_data, mask, old_buf + i);
        __m512i new_data = _mm512_mask_loadu_epi8(new_data, mask, new_buf + i);
        uint64_t comp = _mm512_cmpeq_epi8_mask(old_data, new_data);
        count512s--;

BTW, once that we are here, why not to be consistent:

bool is_same;
uint64_t maskResidual;

just use always Cammel case or underscores, but half and half ....

Later, Juan.



^ permalink raw reply	[flat|nested] 5+ messages in thread

* RE: [PATCH v5 1/2] Update AVX512 support for xbzrle_encode_buffer
  2022-08-24  8:42   ` Juan Quintela
@ 2022-08-26  9:17     ` Xu, Ling1
  0 siblings, 0 replies; 5+ messages in thread
From: Xu, Ling1 @ 2022-08-26  9:17 UTC (permalink / raw)
  To: quintela@redhat.com
  Cc: qemu-devel@nongnu.org, dgilbert@redhat.com, Zhao, Zhou,
	Jin, Jun I

Hi, juan, 
      Thanks for your time and suggestions on this patch. We have revised our code according to your nice comments. We will submit patch v6 to update these modifications.

Best Regards
Ling

-----Original Message-----
From: Juan Quintela <quintela@redhat.com> 
Sent: Wednesday, August 24, 2022 4:42 PM
To: Xu, Ling1 <ling1.xu@intel.com>
Cc: qemu-devel@nongnu.org; dgilbert@redhat.com; Zhao, Zhou <zhou.zhao@intel.com>; Jin, Jun I <jun.i.jin@intel.com>
Subject: Re: [PATCH v5 1/2] Update AVX512 support for xbzrle_encode_buffer

ling xu <ling1.xu@intel.com> wrote:
> This commit updates code of avx512 support for xbzrle_encode_buffer 
> function to accelerate xbzrle encoding speed. We add runtime check of 
> avx512 and add benchmark for this feature. Compared with C version of 
> xbzrle_encode_buffer function, avx512 version can achieve 50%-70% 
> performance improvement on benchmarking. In addition, if dirty data is 
> randomly located in 4K page, the avx512 version can achieve almost 
> 140% performance gain.
>
> Signed-off-by: ling xu <ling1.xu@intel.com>
> Co-authored-by: Zhou Zhao <zhou.zhao@intel.com>
> Co-authored-by: Jun Jin <jun.i.jin@intel.com>
> ---
>  meson.build        |  16 ++++++
>  meson_options.txt  |   2 +
>  migration/ram.c    |  35 ++++++++++--
>  migration/xbzrle.c | 130 +++++++++++++++++++++++++++++++++++++++++++++
>  migration/xbzrle.h |   4 ++
>  5 files changed, 184 insertions(+), 3 deletions(-)
>
> diff --git a/meson.build b/meson.build index 30a380752c..c9d90a5bff 
> 100644
> --- a/meson.build
> +++ b/meson.build
> @@ -2264,6 +2264,22 @@ config_host_data.set('CONFIG_AVX512F_OPT', get_option('avx512f') \
>      int main(int argc, char *argv[]) { return bar(argv[0]); }
>    '''), error_message: 'AVX512F not available').allowed())
>  
> +config_host_data.set('CONFIG_AVX512BW_OPT', get_option('avx512bw') \
> +  .require(have_cpuid_h, error_message: 'cpuid.h not available, 
> +cannot enable AVX512BW') \
> +  .require(cc.links('''
> +    #pragma GCC push_options
> +    #pragma GCC target("avx512bw")
> +    #include <cpuid.h>
> +    #include <immintrin.h>
> +    static int bar(void *a) {


> +      __m512i x = *(__m512i *)a;
> +      __m512i res= _mm512_abs_epi8(x);

Cast is as ugly as hell, what about:

      __m512i *x = a;
      __m512i res = _mm512_abs_epi8(*x);

??

> +static void __attribute__((constructor)) init_cpu_flag(void) {
> +    unsigned max = __get_cpuid_max(0, NULL);
> +    int a, b, c, d;
> +    if (max >= 1) {
> +        __cpuid(1, a, b, c, d);
> +         /* We must check that AVX is not just available, but usable.  */
> +        if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
> +            int bv;
> +            __asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
> +            __cpuid_count(7, 0, a, b, c, d);
> +           /* 0xe6:
> +            *  XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15
> +            *                    and ZMM16-ZMM31 state are enabled by OS)
> +            *  XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS)
> +            */
> +            if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) {
> +                xbzrle_encode_buffer_func = xbzrle_encode_buffer_avx512;
> +            }
> +        }
> +    }
> +    return ;

This return line is not needed.

> +}
> +#endif
> +
>  XBZRLECacheStats xbzrle_counters;
>  
>  /* struct contains XBZRLE cache and a static page @@ -802,9 +831,9 @@ 
> static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
>      memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
>  
>      /* XBZRLE encoding (if there is no overflow) */
> -    encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
> -                                       TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
> -                                       TARGET_PAGE_SIZE);
> +    encoded_len = xbzrle_encode_buffer_func(prev_cached_page, XBZRLE.current_buf,
> +                                            TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
> +                                            TARGET_PAGE_SIZE);
>  
>      /*
>       * Update the cache contents, so that it corresponds to the data 
> diff --git a/migration/xbzrle.c b/migration/xbzrle.c index 
> 1ba482ded9..6da7f79625 100644
> --- a/migration/xbzrle.c
> +++ b/migration/xbzrle.c
> @@ -174,3 +174,133 @@ int xbzrle_decode_buffer(uint8_t *src, int slen, 
> uint8_t *dst, int dlen)
>  
>      return d;
>  }
> +
> +#if defined(CONFIG_AVX512BW_OPT)
> +#pragma GCC push_options
> +#pragma GCC target("avx512bw")
> +#include <immintrin.h>
> +int xbzrle_encode_buffer_avx512(uint8_t *old_buf, uint8_t *new_buf, int slen,
> +                             uint8_t *dst, int dlen) {
> +    uint32_t zrun_len = 0, nzrun_len = 0;
> +    int d = 0, i = 0, num = 0;
> +    uint8_t *nzrun_start = NULL;
> +    /* add 1 to include residual part in main loop */
> +    uint32_t count512s = (slen >> 6) + 1;
> +    /* countResidual is tail of data, i.e., countResidual = slen % 64 */
> +    uint32_t countResidual = slen & 0b111111;
> +    bool never_same = true;
> +    uint64_t maskResidual = 1;
> +    maskResidual <<= countResidual;
> +    maskResidual -=1;
> +    uint64_t comp = 0;
> +    int bytesToCheck = 0;
> +
> +    while (count512s) {
> +        if (d + 2 > dlen) {
> +            return -1;
> +        }
> +
> +        if(count512s != 1){
> +            __m512i old_data = _mm512_mask_loadu_epi8(old_data,
> +                                                      0xffffffffffffffff, old_buf + i);
> +            __m512i new_data = _mm512_mask_loadu_epi8(new_data,
> +                                                      0xffffffffffffffff, new_buf + i);
> +            comp = _mm512_cmpeq_epi8_mask(old_data, new_data);
> +            bytesToCheck = 64;
> +            count512s--;
> +        } else {
> +            __m512i old_data = _mm512_mask_loadu_epi8(old_data,
> +                                                      maskResidual, old_buf + i);
> +            __m512i new_data = _mm512_mask_loadu_epi8(new_data,
> +                                                      maskResidual, new_buf + i);
> +            comp = _mm512_cmpeq_epi8_mask(old_data, new_data);
> +            bytesToCheck = countResidual;
> +            count512s--;
> +        }

It is basically the same in both branches of the if, what about:

        int bytesToCheck = 64;
        uint86_t mask = 0xffffffffffffffff;

        /* I am assuming this is the opposit of the if condition */
        if(count512s == 1){
            mask = maskResidual;
            bytesToCheck = countResidual;
        }
        __m512i old_data = _mm512_mask_loadu_epi8(old_data, mask, old_buf + i);
        __m512i new_data = _mm512_mask_loadu_epi8(new_data, mask, new_buf + i);
        uint64_t comp = _mm512_cmpeq_epi8_mask(old_data, new_data);
        count512s--;

BTW, once that we are here, why not to be consistent:

bool is_same;
uint64_t maskResidual;

just use always Cammel case or underscores, but half and half ....

Later, Juan.



^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2022-08-26  9:21 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2022-08-18  9:35 [PATCH v5 0/2] This patch updates AVX512 support for xbzrle ling xu
2022-08-18  9:35 ` [PATCH v5 1/2] Update AVX512 support for xbzrle_encode_buffer ling xu
2022-08-24  8:42   ` Juan Quintela
2022-08-26  9:17     ` Xu, Ling1
2022-08-18  9:35 ` [PATCH v5 2/2] Test code and benchmark code ling xu

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).