From: Richard Henderson <richard.henderson@linaro.org>
To: qemu-devel@nongnu.org
Cc: Alexander Monakov <amonakov@ispras.ru>,
Mikhail Romanov <mmromanov@ispras.ru>
Subject: [PULL 32/35] util/bufferiszero: Optimize SSE2 and AVX2 variants
Date: Mon, 8 Apr 2024 07:49:26 -1000
Message-ID: <20240408174929.862917-33-richard.henderson@linaro.org>
In-Reply-To: <20240408174929.862917-1-richard.henderson@linaro.org>
From: Alexander Monakov <amonakov@ispras.ru>
Increase the unroll factor in the SIMD loops from 4x to 8x in order
to move their bottleneck from ALU port contention to the load issue
rate (two loads per cycle on popular x86 implementations).
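To that end, the 8x-unrolled loop body splits the OR chain across two
accumulators and uses an empty asm statement as a reassociation
barrier, so the compiler cannot collapse the two chains back into a
single dependency tree; a minimal sketch of the idiom (assuming
GCC/Clang extended asm; p is the aligned block pointer from the
functions below):

    /* Keep vec0 and vec1 as two independent OR chains. */
    #define SSE_REASSOC_BARRIER(vec0, vec1) asm("" : "+x"(vec0), "+x"(vec1))

    __m128i v = p[0], w = p[1];   /* two chains feed two load ports */
    SSE_REASSOC_BARRIER(v, w);
    v |= p[2]; w |= p[3];
    SSE_REASSOC_BARRIER(v, w);
    v |= p[4]; w |= p[5];
    SSE_REASSOC_BARRIER(v, w);
    v |= p[6]; w |= p[7];
    v |= w;                       /* merge the chains once per block */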
Avoid using out-of-bounds pointers in loop boundary conditions.
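Concretely, the old starting pointer could land beyond the end of the
buffer before the first boundary comparison; the rewrite derives both
bounds from addresses inside the buffer and aligns them down (both
new expressions are taken from the diff below):

    /* Before: with the minimum len == 64 and buf 16-aligned, p is
       buf + 80, i.e. 16 bytes past the end of the buffer. */
    __m128i *p = (__m128i *)(((uintptr_t)buf + 5 * 16) & -16);

    /* After: p and e always point at aligned blocks in bounds. */
    const __m128i *p = QEMU_ALIGN_PTR_DOWN(buf + 16, 16);
    const __m128i *e = QEMU_ALIGN_PTR_DOWN(buf + len - 1, 16);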
Follow the SSE2 implementation strategy in the AVX2 variant. Avoid
the use of PTEST, which is not profitable there (just as in the
removed SSE4 variant).
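Both test idioms appear in the hunks below; the dropped check and its
replacement, side by side (the uop-count rationale is an assumption
offered here, not spelled out in the patch):

    /* Dropped idiom: VPTEST sets ZF from (t & t) in one step.
       Assumption (not from the patch): the extra uop it adds to the
       compare-and-branch path is what makes it unprofitable. */
    if (unlikely(!_mm256_testz_si256(t, t))) {
        return false;
    }

    /* Replacement idiom, as used in the new loops below. */
    v = _mm256_cmpeq_epi8(v, zero);
    if (unlikely(_mm256_movemask_epi8(v) != 0xFFFFFFFF)) {
        return false;
    }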
Signed-off-by: Alexander Monakov <amonakov@ispras.ru>
Signed-off-by: Mikhail Romanov <mmromanov@ispras.ru>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-Id: <20240206204809.9859-6-amonakov@ispras.ru>
---
util/bufferiszero.c | 111 +++++++++++++++++++++++++++++---------------
1 file changed, 73 insertions(+), 38 deletions(-)
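The new SSE2 loop relies on the invariant stated in its comment,
"e - p >= 14"; a worked check of that bound, assuming len >= 256 (as
the new header comment permits) and 16-byte __m128i units:

    p <= buf + 16             (QEMU_ALIGN_PTR_DOWN(buf + 16, 16))
    e >= buf + len - 16       (QEMU_ALIGN_PTR_DOWN(buf + len - 1, 16))
    16 * (e - p) >= len - 32 >= 256 - 32 = 224, hence e - p >= 14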
diff --git a/util/bufferiszero.c b/util/bufferiszero.c
index 00118d649e..02df82b4ff 100644
--- a/util/bufferiszero.c
+++ b/util/bufferiszero.c
@@ -67,62 +67,97 @@ static bool buffer_is_zero_integer(const void *buf, size_t len)
#if defined(CONFIG_AVX2_OPT) || defined(__SSE2__)
#include <immintrin.h>
-/* Note that each of these vectorized functions require len >= 64. */
+/* Helper for preventing the compiler from reassociating
+ chains of binary vector operations. */
+#define SSE_REASSOC_BARRIER(vec0, vec1) asm("" : "+x"(vec0), "+x"(vec1))
+
+/* Note that these vectorized functions may assume len >= 256. */
static bool __attribute__((target("sse2")))
buffer_zero_sse2(const void *buf, size_t len)
{
- __m128i t = _mm_loadu_si128(buf);
- __m128i *p = (__m128i *)(((uintptr_t)buf + 5 * 16) & -16);
- __m128i *e = (__m128i *)(((uintptr_t)buf + len) & -16);
- __m128i zero = _mm_setzero_si128();
+ /* Unaligned loads at head/tail. */
+ __m128i v = *(__m128i_u *)(buf);
+ __m128i w = *(__m128i_u *)(buf + len - 16);
+ /* Align head/tail to 16-byte boundaries. */
+ const __m128i *p = QEMU_ALIGN_PTR_DOWN(buf + 16, 16);
+ const __m128i *e = QEMU_ALIGN_PTR_DOWN(buf + len - 1, 16);
+ __m128i zero = { 0 };
- /* Loop over 16-byte aligned blocks of 64. */
- while (likely(p <= e)) {
- t = _mm_cmpeq_epi8(t, zero);
- if (unlikely(_mm_movemask_epi8(t) != 0xFFFF)) {
+ /* Collect a partial block at tail end. */
+ v |= e[-1]; w |= e[-2];
+ SSE_REASSOC_BARRIER(v, w);
+ v |= e[-3]; w |= e[-4];
+ SSE_REASSOC_BARRIER(v, w);
+ v |= e[-5]; w |= e[-6];
+ SSE_REASSOC_BARRIER(v, w);
+ v |= e[-7]; v |= w;
+
+ /*
+ * Loop over complete 128-byte blocks.
+ * With the head and tail removed, e - p >= 14, so the loop
+ * must iterate at least once.
+ */
+ do {
+ v = _mm_cmpeq_epi8(v, zero);
+ if (unlikely(_mm_movemask_epi8(v) != 0xFFFF)) {
return false;
}
- t = p[-4] | p[-3] | p[-2] | p[-1];
- p += 4;
- }
+ v = p[0]; w = p[1];
+ SSE_REASSOC_BARRIER(v, w);
+ v |= p[2]; w |= p[3];
+ SSE_REASSOC_BARRIER(v, w);
+ v |= p[4]; w |= p[5];
+ SSE_REASSOC_BARRIER(v, w);
+ v |= p[6]; w |= p[7];
+ SSE_REASSOC_BARRIER(v, w);
+ v |= w;
+ p += 8;
+ } while (p < e - 7);
- /* Finish the aligned tail. */
- t |= e[-3];
- t |= e[-2];
- t |= e[-1];
-
- /* Finish the unaligned tail. */
- t |= _mm_loadu_si128(buf + len - 16);
-
- return _mm_movemask_epi8(_mm_cmpeq_epi8(t, zero)) == 0xFFFF;
+ return _mm_movemask_epi8(_mm_cmpeq_epi8(v, zero)) == 0xFFFF;
}
#ifdef CONFIG_AVX2_OPT
static bool __attribute__((target("avx2")))
buffer_zero_avx2(const void *buf, size_t len)
{
- /* Begin with an unaligned head of 32 bytes. */
- __m256i t = _mm256_loadu_si256(buf);
- __m256i *p = (__m256i *)(((uintptr_t)buf + 5 * 32) & -32);
- __m256i *e = (__m256i *)(((uintptr_t)buf + len) & -32);
+ /* Unaligned loads at head/tail. */
+ __m256i v = *(__m256i_u *)(buf);
+ __m256i w = *(__m256i_u *)(buf + len - 32);
+ /* Align head/tail to 32-byte boundaries. */
+ const __m256i *p = QEMU_ALIGN_PTR_DOWN(buf + 32, 32);
+ const __m256i *e = QEMU_ALIGN_PTR_DOWN(buf + len - 1, 32);
+ __m256i zero = { 0 };
- /* Loop over 32-byte aligned blocks of 128. */
- while (p <= e) {
- if (unlikely(!_mm256_testz_si256(t, t))) {
+ /* Collect a partial block at tail end. */
+ v |= e[-1]; w |= e[-2];
+ SSE_REASSOC_BARRIER(v, w);
+ v |= e[-3]; w |= e[-4];
+ SSE_REASSOC_BARRIER(v, w);
+ v |= e[-5]; w |= e[-6];
+ SSE_REASSOC_BARRIER(v, w);
+ v |= e[-7]; v |= w;
+
+ /* Loop over complete 256-byte blocks. */
+ for (; p < e - 7; p += 8) {
+ /* PTEST is not profitable here. */
+ v = _mm256_cmpeq_epi8(v, zero);
+ if (unlikely(_mm256_movemask_epi8(v) != 0xFFFFFFFF)) {
return false;
}
- t = p[-4] | p[-3] | p[-2] | p[-1];
- p += 4;
- } ;
+ v = p[0]; w = p[1];
+ SSE_REASSOC_BARRIER(v, w);
+ v |= p[2]; w |= p[3];
+ SSE_REASSOC_BARRIER(v, w);
+ v |= p[4]; w |= p[5];
+ SSE_REASSOC_BARRIER(v, w);
+ v |= p[6]; w |= p[7];
+ SSE_REASSOC_BARRIER(v, w);
+ v |= w;
+ }
- /* Finish the last block of 128 unaligned. */
- t |= _mm256_loadu_si256(buf + len - 4 * 32);
- t |= _mm256_loadu_si256(buf + len - 3 * 32);
- t |= _mm256_loadu_si256(buf + len - 2 * 32);
- t |= _mm256_loadu_si256(buf + len - 1 * 32);
-
- return _mm256_testz_si256(t, t);
+ return _mm256_movemask_epi8(_mm256_cmpeq_epi8(v, zero)) == 0xFFFFFFFF;
}
#endif /* CONFIG_AVX2_OPT */
--
2.34.1