From: Alexander Monakov <amonakov@ispras.ru>
To: qemu-devel@nongnu.org
Cc: Mikhail Romanov <mmromanov@ispras.ru>,
Richard Henderson <richard.henderson@linaro.org>,
Paolo Bonzini <pbonzini@redhat.com>,
Alexander Monakov <amonakov@ispras.ru>
Subject: [PATCH v3 5/6] util/bufferiszero: optimize SSE2 and AVX2 variants
Date: Tue, 6 Feb 2024 23:48:08 +0300
Message-ID: <20240206204809.9859-6-amonakov@ispras.ru>
In-Reply-To: <20240206204809.9859-1-amonakov@ispras.ru>
Increase the unroll factor in the SIMD loops from 4x to 8x in order to
move their bottleneck from ALU port contention to the load issue rate
(two loads per cycle on popular x86 implementations).
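
For illustration only (a sketch, not the patch itself; the helper name is
made up), the 8x unroll keeps two independent OR chains, with an empty asm
acting as a reassociation barrier so the compiler does not collapse them
back into one serial dependency chain:

#include <immintrin.h>
#include <stdbool.h>

/* Hypothetical helper, sketch only: check one 16-byte-aligned 128-byte
   block.  Uses GCC/Clang vector extensions ('|=' on __m128i) and extended
   asm, as the patch itself does. */
static bool block128_is_zero_sketch(const __m128i *p)
{
    __m128i v = p[0], w = p[1];
    asm("" : "+x"(v), "+x"(w));   /* keep v and w as independent OR chains */
    v |= p[2]; w |= p[3];
    asm("" : "+x"(v), "+x"(w));
    v |= p[4]; w |= p[5];
    asm("" : "+x"(v), "+x"(w));
    v |= p[6]; w |= p[7];
    asm("" : "+x"(v), "+x"(w));
    v |= w;                       /* merge the two chains once at the end */
    return _mm_movemask_epi8(_mm_cmpeq_epi8(v, _mm_setzero_si128())) == 0xFFFF;
}
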
Avoid using out-of-bounds pointers in loop boundary conditions.
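
Again purely as a sketch with a made-up helper name, the aligned bounds
can be derived so that every access stays inside [buf, buf + len) once the
caller guarantees len >= 256:

#include <stdint.h>
#include <stddef.h>

/* Sketch only: the aligned bounds, computed as in the patch.
   'first' rounds buf + 16 down to 16 bytes, so it lies in (buf, buf + 16];
   'last' rounds buf + len - 1 down, so it lies in [buf + len - 16, buf + len).
   With len >= 256, the tail reads last[-7..-1] and the loop reads
   first[0..7] only while first < last - 7, so no access leaves the buffer. */
static void zero_check_bounds_sketch(const void *buf, size_t len,
                                     const void **first, const void **last)
{
    *first = (const void *)(((uintptr_t)buf + 16) & -16);
    *last  = (const void *)(((uintptr_t)buf + len - 1) & -16);
}
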
Follow the SSE2 implementation strategy in the AVX2 variant. Avoid PTEST,
which is not profitable there (just as it was not in the removed SSE4 variant).
Signed-off-by: Alexander Monakov <amonakov@ispras.ru>
Signed-off-by: Mikhail Romanov <mmromanov@ispras.ru>
---
util/bufferiszero.c | 108 ++++++++++++++++++++++++++++----------------
1 file changed, 69 insertions(+), 39 deletions(-)
diff --git a/util/bufferiszero.c b/util/bufferiszero.c
index cb3eb2543f..d752edd8cc 100644
--- a/util/bufferiszero.c
+++ b/util/bufferiszero.c
@@ -66,62 +66,92 @@ buffer_is_zero_len_4_plus(const void *buf, size_t len)
#if defined(CONFIG_AVX2_OPT) || defined(__SSE2__)
#include <immintrin.h>
-/* Note that each of these vectorized functions require len >= 64. */
+/* Helper for preventing the compiler from reassociating
+ chains of binary vector operations. */
+#define SSE_REASSOC_BARRIER(vec0, vec1) asm("" : "+x"(vec0), "+x"(vec1))
+
+/* Note that these vectorized functions may assume len >= 256. */
static bool __attribute__((target("sse2")))
buffer_zero_sse2(const void *buf, size_t len)
{
- __m128i t = _mm_loadu_si128(buf);
- __m128i *p = (__m128i *)(((uintptr_t)buf + 5 * 16) & -16);
- __m128i *e = (__m128i *)(((uintptr_t)buf + len) & -16);
- __m128i zero = _mm_setzero_si128();
-
- /* Loop over 16-byte aligned blocks of 64. */
- while (likely(p <= e)) {
- t = _mm_cmpeq_epi8(t, zero);
- if (unlikely(_mm_movemask_epi8(t) != 0xFFFF)) {
+ /* Unaligned loads at head/tail. */
+ __m128i v = *(__m128i_u *)(buf);
+ __m128i w = *(__m128i_u *)(buf + len - 16);
+ /* Align head/tail to 16-byte boundaries. */
+ __m128i *p = (void *)(((uintptr_t)buf + 16) & -16);
+ __m128i *e = (void *)(((uintptr_t)buf + len - 1) & -16);
+ __m128i zero = { 0 };
+
+ /* Collect a partial block at tail end. */
+ v |= e[-1]; w |= e[-2];
+ SSE_REASSOC_BARRIER(v, w);
+ v |= e[-3]; w |= e[-4];
+ SSE_REASSOC_BARRIER(v, w);
+ v |= e[-5]; w |= e[-6];
+ SSE_REASSOC_BARRIER(v, w);
+ v |= e[-7]; v |= w;
+
+ /* Loop over complete 128-byte blocks. */
+ for (; p < e - 7; p += 8) {
+ v = _mm_cmpeq_epi8(v, zero);
+ if (unlikely(_mm_movemask_epi8(v) != 0xFFFF)) {
return false;
}
- t = p[-4] | p[-3] | p[-2] | p[-1];
- p += 4;
+ v = p[0]; w = p[1];
+ SSE_REASSOC_BARRIER(v, w);
+ v |= p[2]; w |= p[3];
+ SSE_REASSOC_BARRIER(v, w);
+ v |= p[4]; w |= p[5];
+ SSE_REASSOC_BARRIER(v, w);
+ v |= p[6]; w |= p[7];
+ SSE_REASSOC_BARRIER(v, w);
+ v |= w;
}
- /* Finish the aligned tail. */
- t |= e[-3];
- t |= e[-2];
- t |= e[-1];
-
- /* Finish the unaligned tail. */
- t |= _mm_loadu_si128(buf + len - 16);
-
- return _mm_movemask_epi8(_mm_cmpeq_epi8(t, zero)) == 0xFFFF;
+ return _mm_movemask_epi8(_mm_cmpeq_epi8(v, zero)) == 0xFFFF;
}
#ifdef CONFIG_AVX2_OPT
static bool __attribute__((target("avx2")))
buffer_zero_avx2(const void *buf, size_t len)
{
- /* Begin with an unaligned head of 32 bytes. */
- __m256i t = _mm256_loadu_si256(buf);
- __m256i *p = (__m256i *)(((uintptr_t)buf + 5 * 32) & -32);
- __m256i *e = (__m256i *)(((uintptr_t)buf + len) & -32);
-
- /* Loop over 32-byte aligned blocks of 128. */
- while (p <= e) {
- if (unlikely(!_mm256_testz_si256(t, t))) {
+ /* Unaligned loads at head/tail. */
+ __m256i v = *(__m256i_u *)(buf);
+ __m256i w = *(__m256i_u *)(buf + len - 32);
+ /* Align head/tail to 32-byte boundaries. */
+ __m256i *p = (void *)(((uintptr_t)buf + 32) & -32);
+ __m256i *e = (void *)(((uintptr_t)buf + len - 1) & -32);
+ __m256i zero = { 0 };
+
+ /* Collect a partial block at tail end. */
+ v |= e[-1]; w |= e[-2];
+ SSE_REASSOC_BARRIER(v, w);
+ v |= e[-3]; w |= e[-4];
+ SSE_REASSOC_BARRIER(v, w);
+ v |= e[-5]; w |= e[-6];
+ SSE_REASSOC_BARRIER(v, w);
+ v |= e[-7]; v |= w;
+
+ /* Loop over complete 256-byte blocks. */
+ for (; p < e - 7; p += 8) {
+ /* PTEST is not profitable here. */
+ v = _mm256_cmpeq_epi8(v, zero);
+ if (unlikely(_mm256_movemask_epi8(v) != 0xFFFFFFFF)) {
return false;
}
- t = p[-4] | p[-3] | p[-2] | p[-1];
- p += 4;
- } ;
-
- /* Finish the last block of 128 unaligned. */
- t |= _mm256_loadu_si256(buf + len - 4 * 32);
- t |= _mm256_loadu_si256(buf + len - 3 * 32);
- t |= _mm256_loadu_si256(buf + len - 2 * 32);
- t |= _mm256_loadu_si256(buf + len - 1 * 32);
+ v = p[0]; w = p[1];
+ SSE_REASSOC_BARRIER(v, w);
+ v |= p[2]; w |= p[3];
+ SSE_REASSOC_BARRIER(v, w);
+ v |= p[4]; w |= p[5];
+ SSE_REASSOC_BARRIER(v, w);
+ v |= p[6]; w |= p[7];
+ SSE_REASSOC_BARRIER(v, w);
+ v |= w;
+ }
- return _mm256_testz_si256(t, t);
+ return _mm256_movemask_epi8(_mm256_cmpeq_epi8(v, zero)) == 0xFFFFFFFF;
}
#endif /* CONFIG_AVX2_OPT */
--
2.32.0