From: Richard Henderson <richard.henderson@linaro.org>
To: qemu-devel@nongnu.org
Cc: qemu-arm@nongnu.org, Juan Quintela <quintela@redhat.com>
Subject: [PATCH 06/27] migration/xbzrle: Shuffle function order
Date: Sat, 20 May 2023 09:26:13 -0700 [thread overview]
Message-ID: <20230520162634.3991009-7-richard.henderson@linaro.org> (raw)
In-Reply-To: <20230520162634.3991009-1-richard.henderson@linaro.org>
Place the CONFIG_AVX512BW_OPT block at the top,
which will aid function selection in the next patch.
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
migration/xbzrle.c | 244 ++++++++++++++++++++++-----------------------
1 file changed, 122 insertions(+), 122 deletions(-)
diff --git a/migration/xbzrle.c b/migration/xbzrle.c
index 258e4959c9..751b5428f7 100644
--- a/migration/xbzrle.c
+++ b/migration/xbzrle.c
@@ -15,6 +15,128 @@
#include "qemu/host-utils.h"
#include "xbzrle.h"
+#if defined(CONFIG_AVX512BW_OPT)
+#include <immintrin.h>
+
+int __attribute__((target("avx512bw")))
+xbzrle_encode_buffer_avx512(uint8_t *old_buf, uint8_t *new_buf, int slen,
+ uint8_t *dst, int dlen)
+{
+ uint32_t zrun_len = 0, nzrun_len = 0;
+ int d = 0, i = 0, num = 0;
+ uint8_t *nzrun_start = NULL;
+ /* add 1 to include residual part in main loop */
+ uint32_t count512s = (slen >> 6) + 1;
+ /* countResidual is tail of data, i.e., countResidual = slen % 64 */
+ uint32_t count_residual = slen & 0b111111;
+ bool never_same = true;
+ uint64_t mask_residual = 1;
+ mask_residual <<= count_residual;
+ mask_residual -= 1;
+ __m512i r = _mm512_set1_epi32(0);
+
+ while (count512s) {
+ int bytes_to_check = 64;
+ uint64_t mask = 0xffffffffffffffff;
+ if (count512s == 1) {
+ bytes_to_check = count_residual;
+ mask = mask_residual;
+ }
+ __m512i old_data = _mm512_mask_loadu_epi8(r,
+ mask, old_buf + i);
+ __m512i new_data = _mm512_mask_loadu_epi8(r,
+ mask, new_buf + i);
+ uint64_t comp = _mm512_cmpeq_epi8_mask(old_data, new_data);
+ count512s--;
+
+ bool is_same = (comp & 0x1);
+ while (bytes_to_check) {
+ if (d + 2 > dlen) {
+ return -1;
+ }
+ if (is_same) {
+ if (nzrun_len) {
+ d += uleb128_encode_small(dst + d, nzrun_len);
+ if (d + nzrun_len > dlen) {
+ return -1;
+ }
+ nzrun_start = new_buf + i - nzrun_len;
+ memcpy(dst + d, nzrun_start, nzrun_len);
+ d += nzrun_len;
+ nzrun_len = 0;
+ }
+ /* 64 data at a time for speed */
+ if (count512s && (comp == 0xffffffffffffffff)) {
+ i += 64;
+ zrun_len += 64;
+ break;
+ }
+ never_same = false;
+ num = ctz64(~comp);
+ num = (num < bytes_to_check) ? num : bytes_to_check;
+ zrun_len += num;
+ bytes_to_check -= num;
+ comp >>= num;
+ i += num;
+ if (bytes_to_check) {
+ /* still has different data after same data */
+ d += uleb128_encode_small(dst + d, zrun_len);
+ zrun_len = 0;
+ } else {
+ break;
+ }
+ }
+ if (never_same || zrun_len) {
+ /*
+ * never_same only acts if
+ * data begins with diff in first count512s
+ */
+ d += uleb128_encode_small(dst + d, zrun_len);
+ zrun_len = 0;
+ never_same = false;
+ }
+ /* has diff, 64 data at a time for speed */
+ if ((bytes_to_check == 64) && (comp == 0x0)) {
+ i += 64;
+ nzrun_len += 64;
+ break;
+ }
+ num = ctz64(comp);
+ num = (num < bytes_to_check) ? num : bytes_to_check;
+ nzrun_len += num;
+ bytes_to_check -= num;
+ comp >>= num;
+ i += num;
+ if (bytes_to_check) {
+ /* mask like 111000 */
+ d += uleb128_encode_small(dst + d, nzrun_len);
+ /* overflow */
+ if (d + nzrun_len > dlen) {
+ return -1;
+ }
+ nzrun_start = new_buf + i - nzrun_len;
+ memcpy(dst + d, nzrun_start, nzrun_len);
+ d += nzrun_len;
+ nzrun_len = 0;
+ is_same = true;
+ }
+ }
+ }
+
+ if (nzrun_len != 0) {
+ d += uleb128_encode_small(dst + d, nzrun_len);
+ /* overflow */
+ if (d + nzrun_len > dlen) {
+ return -1;
+ }
+ nzrun_start = new_buf + i - nzrun_len;
+ memcpy(dst + d, nzrun_start, nzrun_len);
+ d += nzrun_len;
+ }
+ return d;
+}
+#endif
+
/*
page = zrun nzrun
| zrun nzrun page
@@ -175,125 +297,3 @@ int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t *dst, int dlen)
return d;
}
-
-#if defined(CONFIG_AVX512BW_OPT)
-#include <immintrin.h>
-
-int __attribute__((target("avx512bw")))
-xbzrle_encode_buffer_avx512(uint8_t *old_buf, uint8_t *new_buf, int slen,
- uint8_t *dst, int dlen)
-{
- uint32_t zrun_len = 0, nzrun_len = 0;
- int d = 0, i = 0, num = 0;
- uint8_t *nzrun_start = NULL;
- /* add 1 to include residual part in main loop */
- uint32_t count512s = (slen >> 6) + 1;
- /* countResidual is tail of data, i.e., countResidual = slen % 64 */
- uint32_t count_residual = slen & 0b111111;
- bool never_same = true;
- uint64_t mask_residual = 1;
- mask_residual <<= count_residual;
- mask_residual -= 1;
- __m512i r = _mm512_set1_epi32(0);
-
- while (count512s) {
- int bytes_to_check = 64;
- uint64_t mask = 0xffffffffffffffff;
- if (count512s == 1) {
- bytes_to_check = count_residual;
- mask = mask_residual;
- }
- __m512i old_data = _mm512_mask_loadu_epi8(r,
- mask, old_buf + i);
- __m512i new_data = _mm512_mask_loadu_epi8(r,
- mask, new_buf + i);
- uint64_t comp = _mm512_cmpeq_epi8_mask(old_data, new_data);
- count512s--;
-
- bool is_same = (comp & 0x1);
- while (bytes_to_check) {
- if (d + 2 > dlen) {
- return -1;
- }
- if (is_same) {
- if (nzrun_len) {
- d += uleb128_encode_small(dst + d, nzrun_len);
- if (d + nzrun_len > dlen) {
- return -1;
- }
- nzrun_start = new_buf + i - nzrun_len;
- memcpy(dst + d, nzrun_start, nzrun_len);
- d += nzrun_len;
- nzrun_len = 0;
- }
- /* 64 data at a time for speed */
- if (count512s && (comp == 0xffffffffffffffff)) {
- i += 64;
- zrun_len += 64;
- break;
- }
- never_same = false;
- num = ctz64(~comp);
- num = (num < bytes_to_check) ? num : bytes_to_check;
- zrun_len += num;
- bytes_to_check -= num;
- comp >>= num;
- i += num;
- if (bytes_to_check) {
- /* still has different data after same data */
- d += uleb128_encode_small(dst + d, zrun_len);
- zrun_len = 0;
- } else {
- break;
- }
- }
- if (never_same || zrun_len) {
- /*
- * never_same only acts if
- * data begins with diff in first count512s
- */
- d += uleb128_encode_small(dst + d, zrun_len);
- zrun_len = 0;
- never_same = false;
- }
- /* has diff, 64 data at a time for speed */
- if ((bytes_to_check == 64) && (comp == 0x0)) {
- i += 64;
- nzrun_len += 64;
- break;
- }
- num = ctz64(comp);
- num = (num < bytes_to_check) ? num : bytes_to_check;
- nzrun_len += num;
- bytes_to_check -= num;
- comp >>= num;
- i += num;
- if (bytes_to_check) {
- /* mask like 111000 */
- d += uleb128_encode_small(dst + d, nzrun_len);
- /* overflow */
- if (d + nzrun_len > dlen) {
- return -1;
- }
- nzrun_start = new_buf + i - nzrun_len;
- memcpy(dst + d, nzrun_start, nzrun_len);
- d += nzrun_len;
- nzrun_len = 0;
- is_same = true;
- }
- }
- }
-
- if (nzrun_len != 0) {
- d += uleb128_encode_small(dst + d, nzrun_len);
- /* overflow */
- if (d + nzrun_len > dlen) {
- return -1;
- }
- nzrun_start = new_buf + i - nzrun_len;
- memcpy(dst + d, nzrun_start, nzrun_len);
- d += nzrun_len;
- }
- return d;
-}
-#endif
--
2.34.1
next prev parent reply other threads:[~2023-05-20 16:30 UTC|newest]
Thread overview: 46+ messages / expand[flat|nested] mbox.gz Atom feed top
2023-05-20 16:26 [PATCH 00/27] accel/tcg: Improvements to atomic128.h Richard Henderson
2023-05-20 16:26 ` [PATCH 01/27] util: Introduce host-specific cpuinfo.h Richard Henderson
2023-05-21 10:47 ` Philippe Mathieu-Daudé
2023-05-23 15:56 ` Alex Bennée
2023-05-20 16:26 ` [PATCH 02/27] util: Add cpuinfo-i386.c Richard Henderson
2023-05-21 11:28 ` Philippe Mathieu-Daudé
2023-05-21 15:05 ` Richard Henderson
2023-05-23 16:01 ` Alex Bennée
2023-05-20 16:26 ` [PATCH 03/27] util: Add i386 CPUINFO_ATOMIC_VMOVDQU Richard Henderson
2023-05-20 16:26 ` [PATCH 04/27] tcg/i386: Use host/cpuinfo.h Richard Henderson
2023-05-20 16:26 ` [PATCH 05/27] util/bufferiszero: Use i386 host/cpuinfo.h Richard Henderson
2023-05-20 16:26 ` Richard Henderson [this message]
2023-05-20 16:26 ` [PATCH 07/27] migration/xbzrle: " Richard Henderson
2023-05-20 16:26 ` [PATCH 08/27] migration: Build migration_files once Richard Henderson
2023-05-20 16:26 ` [PATCH 09/27] util: Add cpuinfo-aarch64.c Richard Henderson
2023-05-20 16:26 ` [PATCH 10/27] include/host: Split out atomic128-cas.h Richard Henderson
2023-05-21 10:44 ` Philippe Mathieu-Daudé
2023-05-20 16:26 ` [PATCH 11/27] include/host: Split out atomic128-ldst.h Richard Henderson
2023-05-20 16:26 ` [PATCH 12/27] meson: Fix detect atomic128 support with optimization Richard Henderson
2023-05-21 10:54 ` Philippe Mathieu-Daudé
2023-05-20 16:26 ` [PATCH 13/27] include/qemu: Move CONFIG_ATOMIC128_OPT handling to atomic128.h Richard Henderson
2023-05-20 16:26 ` [PATCH 14/27] target/ppc: Use tcg_gen_qemu_{ld, st}_i128 for LQARX, LQ, STQ Richard Henderson
2023-05-20 16:26 ` [PATCH 15/27] target/s390x: Use tcg_gen_qemu_{ld, st}_i128 for LPQ, STPQ Richard Henderson
2023-05-22 8:35 ` [PATCH 15/27] target/s390x: Use tcg_gen_qemu_{ld,st}_i128 " David Hildenbrand
2023-05-22 14:15 ` Richard Henderson
2023-05-20 16:26 ` [PATCH 16/27] accel/tcg: Unify cpu_{ld,st}*_{be,le}_mmu Richard Henderson
2023-05-21 11:15 ` Philippe Mathieu-Daudé
2023-05-21 15:00 ` Richard Henderson
2023-05-22 6:39 ` Philippe Mathieu-Daudé
2023-05-22 16:24 ` Richard Henderson
2023-05-20 16:26 ` [PATCH 17/27] target/s390x: Use cpu_{ld,st}*_mmu in do_csst Richard Henderson
2023-05-21 11:21 ` Philippe Mathieu-Daudé
2023-05-21 15:01 ` Richard Henderson
2023-05-22 8:43 ` David Hildenbrand
2023-05-20 16:26 ` [PATCH 18/27] target/s390x: Always use cpu_atomic_cmpxchgl_be_mmu " Richard Henderson
2023-05-22 8:44 ` David Hildenbrand
2023-05-20 16:26 ` [PATCH 19/27] accel/tcg: Remove cpu_atomic_{ld,st}o_*_mmu Richard Henderson
2023-05-20 16:26 ` [PATCH 20/27] accel/tcg: Remove prot argument to atomic_mmu_lookup Richard Henderson
2023-05-20 16:26 ` [PATCH 21/27] accel/tcg: Eliminate #if on HAVE_ATOMIC128 and HAVE_CMPXCHG128 Richard Henderson
2023-05-20 16:26 ` [PATCH 22/27] qemu/atomic128: Split atomic16_read Richard Henderson
2023-05-20 16:26 ` [PATCH 23/27] accel/tcg: Correctly use atomic128.h in ldst_atomicity.c.inc Richard Henderson
2023-05-20 16:26 ` [PATCH 24/27] tcg: Split out tcg/debug-assert.h Richard Henderson
2023-05-21 11:25 ` Philippe Mathieu-Daudé
2023-05-20 16:26 ` [PATCH 25/27] qemu/atomic128: Improve cmpxchg fallback for atomic16_set Richard Henderson
2023-05-20 16:26 ` [PATCH 26/27] qemu/atomic128: Add runtime test for FEAT_LSE2 Richard Henderson
2023-05-20 16:26 ` [PATCH 27/27] qemu/atomic128: Add x86_64 atomic128-ldst.h Richard Henderson
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20230520162634.3991009-7-richard.henderson@linaro.org \
--to=richard.henderson@linaro.org \
--cc=qemu-arm@nongnu.org \
--cc=qemu-devel@nongnu.org \
--cc=quintela@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).