[PATCH v4 05/10] util/bufferiszero: Optimize SSE2 and AVX2 variants

qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed

From: Richard Henderson <richard.henderson@linaro.org>
To: qemu-devel@nongnu.org
Cc: amonakov@ispras.ru, mmromanov@ispras.ru
Subject: [PATCH v4 05/10] util/bufferiszero: Optimize SSE2 and AVX2 variants
Date: Wed, 14 Feb 2024 22:14:44 -1000	[thread overview]
Message-ID: <20240215081449.848220-6-richard.henderson@linaro.org> (raw)
In-Reply-To: <20240215081449.848220-1-richard.henderson@linaro.org>

From: Alexander Monakov <amonakov@ispras.ru>

Increase the unroll factor in the SIMD loops from 4x to 8x in order to
move their bottleneck from ALU port contention to the load issue rate
(two loads per cycle on popular x86 implementations).

Avoid using out-of-bounds pointers in loop boundary conditions.

Follow the SSE2 implementation strategy in the AVX2 variant. Avoid use of
PTEST, which is not profitable there (just as it was not in the removed
SSE4 variant).

Signed-off-by: Alexander Monakov <amonakov@ispras.ru>
Signed-off-by: Mikhail Romanov <mmromanov@ispras.ru>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-Id: <20240206204809.9859-6-amonakov@ispras.ru>
---
 util/bufferiszero.c | 111 +++++++++++++++++++++++++++++---------------
 1 file changed, 73 insertions(+), 38 deletions(-)

diff --git a/util/bufferiszero.c b/util/bufferiszero.c
index 6ef5f8ec79..2822155c27 100644
--- a/util/bufferiszero.c
+++ b/util/bufferiszero.c
@@ -67,62 +67,97 @@ static bool buffer_is_zero_integer(const void *buf, size_t len)
 #if defined(CONFIG_AVX2_OPT) || defined(__SSE2__)
 #include <immintrin.h>
 
-/* Note that each of these vectorized functions require len >= 64.  */
+/* Helper for preventing the compiler from reassociating
+   chains of binary vector operations.  */
+#define SSE_REASSOC_BARRIER(vec0, vec1) asm("" : "+x"(vec0), "+x"(vec1))
+
+/* Note that these vectorized functions may assume len >= 256.  */
 
 static bool __attribute__((target("sse2")))
 buffer_zero_sse2(const void *buf, size_t len)
 {
-    __m128i t = _mm_loadu_si128(buf);
-    __m128i *p = (__m128i *)(((uintptr_t)buf + 5 * 16) & -16);
-    __m128i *e = (__m128i *)(((uintptr_t)buf + len) & -16);
-    __m128i zero = _mm_setzero_si128();
+    /* Unaligned loads at head/tail.  */
+    __m128i v = *(__m128i_u *)(buf);
+    __m128i w = *(__m128i_u *)(buf + len - 16);
+    /* Align head/tail to 16-byte boundaries.  */
+    const __m128i *p = QEMU_ALIGN_PTR_DOWN(buf + 16, 16);
+    const __m128i *e = QEMU_ALIGN_PTR_DOWN(buf + len - 1, 16);
+    __m128i zero = { 0 };
 
-    /* Loop over 16-byte aligned blocks of 64.  */
-    while (likely(p <= e)) {
-        t = _mm_cmpeq_epi8(t, zero);
-        if (unlikely(_mm_movemask_epi8(t) != 0xFFFF)) {
+    /* Collect a partial block at tail end.  */
+    v |= e[-1]; w |= e[-2];
+    SSE_REASSOC_BARRIER(v, w);
+    v |= e[-3]; w |= e[-4];
+    SSE_REASSOC_BARRIER(v, w);
+    v |= e[-5]; w |= e[-6];
+    SSE_REASSOC_BARRIER(v, w);
+    v |= e[-7]; v |= w;
+
+    /*
+     * Loop over complete 128-byte blocks.
+     * With the head and tail removed, e - p >= 14, so the loop
+     * must iterate at least once.
+     */
+    do {
+        v = _mm_cmpeq_epi8(v, zero);
+        if (unlikely(_mm_movemask_epi8(v) != 0xFFFF)) {
             return false;
         }
-        t = p[-4] | p[-3] | p[-2] | p[-1];
-        p += 4;
-    }
+        v = p[0]; w = p[1];
+        SSE_REASSOC_BARRIER(v, w);
+        v |= p[2]; w |= p[3];
+        SSE_REASSOC_BARRIER(v, w);
+        v |= p[4]; w |= p[5];
+        SSE_REASSOC_BARRIER(v, w);
+        v |= p[6]; w |= p[7];
+        SSE_REASSOC_BARRIER(v, w);
+        v |= w;
+        p += 8;
+    } while (p < e - 7);
 
-    /* Finish the aligned tail.  */
-    t |= e[-3];
-    t |= e[-2];
-    t |= e[-1];
-
-    /* Finish the unaligned tail.  */
-    t |= _mm_loadu_si128(buf + len - 16);
-
-    return _mm_movemask_epi8(_mm_cmpeq_epi8(t, zero)) == 0xFFFF;
+    return _mm_movemask_epi8(_mm_cmpeq_epi8(v, zero)) == 0xFFFF;
 }
 
 #ifdef CONFIG_AVX2_OPT
 static bool __attribute__((target("avx2")))
 buffer_zero_avx2(const void *buf, size_t len)
 {
-    /* Begin with an unaligned head of 32 bytes.  */
-    __m256i t = _mm256_loadu_si256(buf);
-    __m256i *p = (__m256i *)(((uintptr_t)buf + 5 * 32) & -32);
-    __m256i *e = (__m256i *)(((uintptr_t)buf + len) & -32);
+    /* Unaligned loads at head/tail.  */
+    __m256i v = *(__m256i_u *)(buf);
+    __m256i w = *(__m256i_u *)(buf + len - 32);
+    /* Align head/tail to 32-byte boundaries.  */
+    const __m256i *p = QEMU_ALIGN_PTR_DOWN(buf + 32, 32);
+    const __m256i *e = QEMU_ALIGN_PTR_DOWN(buf + len - 1, 32);
+    __m256i zero = { 0 };
 
-    /* Loop over 32-byte aligned blocks of 128.  */
-    while (p <= e) {
-        if (unlikely(!_mm256_testz_si256(t, t))) {
+    /* Collect a partial block at tail end.  */
+    v |= e[-1]; w |= e[-2];
+    SSE_REASSOC_BARRIER(v, w);
+    v |= e[-3]; w |= e[-4];
+    SSE_REASSOC_BARRIER(v, w);
+    v |= e[-5]; w |= e[-6];
+    SSE_REASSOC_BARRIER(v, w);
+    v |= e[-7]; v |= w;
+
+    /* Loop over complete 256-byte blocks.  */
+    for (; p < e - 7; p += 8) {
+        /* PTEST is not profitable here.  */
+        v = _mm256_cmpeq_epi8(v, zero);
+        if (unlikely(_mm256_movemask_epi8(v) != 0xFFFFFFFF)) {
             return false;
         }
-        t = p[-4] | p[-3] | p[-2] | p[-1];
-        p += 4;
-    } ;
+        v = p[0]; w = p[1];
+        SSE_REASSOC_BARRIER(v, w);
+        v |= p[2]; w |= p[3];
+        SSE_REASSOC_BARRIER(v, w);
+        v |= p[4]; w |= p[5];
+        SSE_REASSOC_BARRIER(v, w);
+        v |= p[6]; w |= p[7];
+        SSE_REASSOC_BARRIER(v, w);
+        v |= w;
+    }
 
-    /* Finish the last block of 128 unaligned.  */
-    t |= _mm256_loadu_si256(buf + len - 4 * 32);
-    t |= _mm256_loadu_si256(buf + len - 3 * 32);
-    t |= _mm256_loadu_si256(buf + len - 2 * 32);
-    t |= _mm256_loadu_si256(buf + len - 1 * 32);
-
-    return _mm256_testz_si256(t, t);
+    return _mm256_movemask_epi8(_mm256_cmpeq_epi8(v, zero)) == 0xFFFFFFFF;
 }
 #endif /* CONFIG_AVX2_OPT */
 
-- 
2.34.1

next prev parent reply	other threads:[~2024-02-15  8:17 UTC|newest]

Thread overview: 27+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-02-15  8:14 [PATCH v4 00/10] Optimize buffer_is_zero Richard Henderson
2024-02-15  8:14 ` [PATCH v4 01/10] util/bufferiszero: Remove SSE4.1 variant Richard Henderson
2024-02-15  8:14 ` [PATCH v4 02/10] util/bufferiszero: Remove AVX512 variant Richard Henderson
2024-02-15  8:14 ` [PATCH v4 03/10] util/bufferiszero: Reorganize for early test for acceleration Richard Henderson
2024-02-15  8:14 ` [PATCH v4 04/10] util/bufferiszero: Remove useless prefetches Richard Henderson
2024-02-15  8:14 ` Richard Henderson [this message]
2024-02-15  8:14 ` [PATCH v4 06/10] util/bufferiszero: Improve scalar variant Richard Henderson
2024-02-15  8:14 ` [PATCH v4 07/10] util/bufferiszero: Introduce biz_accel_fn typedef Richard Henderson
2024-02-15  8:34   ` Philippe Mathieu-Daudé
2024-02-15  8:14 ` [PATCH v4 08/10] util/bufferiszero: Simplify test_buffer_is_zero_next_accel Richard Henderson
2024-02-15  8:40   ` Philippe Mathieu-Daudé
2024-02-15  8:14 ` [PATCH v4 09/10] util/bufferiszero: Add simd acceleration for aarch64 Richard Henderson
2024-02-15  8:47   ` Alexander Monakov
2024-02-15 17:47     ` Richard Henderson
2024-02-15 18:46       ` Alexander Monakov
2024-02-15 21:10         ` Richard Henderson
2024-02-15  8:14 ` [RFC PATCH v4 10/10] util/bufferiszero: Add sve " Richard Henderson
2024-02-16  9:33   ` Alex Bennée
2024-02-16 11:05   ` Alex Bennée
2024-02-15  8:57 ` [PATCH v4 00/10] Optimize buffer_is_zero Alexander Monakov
2024-02-15 21:16   ` Richard Henderson
2024-02-15 21:36     ` Alexander Monakov
2024-02-15 22:27       ` Richard Henderson
2024-02-15 23:37         ` Alexander Monakov
2024-02-16  8:11           ` Richard Henderson
2024-02-16 20:20             ` Alexander Monakov
2024-02-16 22:28               ` Richard Henderson

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:6ef5f8ec7 dfblob:2822155c2 )
 OR (
bs:"[PATCH v4 05/10] util/bufferiszero: Optimize SSE2 and AVX2 variants" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20240215081449.848220-6-richard.henderson@linaro.org \
    --to=richard.henderson@linaro.org \
    --cc=amonakov@ispras.ru \
    --cc=mmromanov@ispras.ru \
    --cc=qemu-devel@nongnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).