From: "Alex Bennée" <alex.bennee@linaro.org>
To: Richard Henderson <richard.henderson@linaro.org>
Cc: qemu-devel@nongnu.org, amonakov@ispras.ru, mmromanov@ispras.ru
Subject: Re: [RFC PATCH v4 10/10] util/bufferiszero: Add sve acceleration for aarch64
Date: Fri, 16 Feb 2024 09:33:26 +0000 [thread overview]
Message-ID: <87frxs69eh.fsf@draig.linaro.org> (raw)
In-Reply-To: <20240215081449.848220-11-richard.henderson@linaro.org> (Richard Henderson's message of "Wed, 14 Feb 2024 22:14:49 -1000")
Richard Henderson <richard.henderson@linaro.org> writes:
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>
> RFC because I've not benchmarked this on real hw, only run it
> through qemu for validation.
I think we have an a64fx in the TCWG lab that you could probably run the
tests on if you want. Otherwise I might be able to spin up a Graviton
on AWS to run a measurement. Do we have a benchmark test to run?
>
> ---
> host/include/aarch64/host/cpuinfo.h | 1 +
> util/bufferiszero.c | 49 +++++++++++++++++++++++++++++
> util/cpuinfo-aarch64.c | 1 +
> meson.build | 13 ++++++++
> 4 files changed, 64 insertions(+)
>
> diff --git a/host/include/aarch64/host/cpuinfo.h b/host/include/aarch64/host/cpuinfo.h
> index fe671534e4..b4b816cd07 100644
> --- a/host/include/aarch64/host/cpuinfo.h
> +++ b/host/include/aarch64/host/cpuinfo.h
> @@ -12,6 +12,7 @@
> #define CPUINFO_AES (1u << 3)
> #define CPUINFO_PMULL (1u << 4)
> #define CPUINFO_BTI (1u << 5)
> +#define CPUINFO_SVE (1u << 6)
>
> /* Initialized with a constructor. */
> extern unsigned cpuinfo;
> diff --git a/util/bufferiszero.c b/util/bufferiszero.c
> index 2809b09225..af64c9c224 100644
> --- a/util/bufferiszero.c
> +++ b/util/bufferiszero.c
> @@ -270,13 +270,62 @@ static bool buffer_is_zero_simd(const void *buf, size_t len)
> return vaddvq_u32(vceqzq_u32(t0)) == -4;
> }
>
> +#ifdef CONFIG_SVE_OPT
> +#include <arm_sve.h>
> +
> +#ifndef __ARM_FEATURE_SVE
> +__attribute__((target("+sve")))
> +#endif
> +static bool buffer_is_zero_sve(const void *buf, size_t len)
> +{
> + svbool_t p, t = svptrue_b8();
> + size_t i, n;
> +
> + /*
> + * For the first vector, align to 16 -- reading 1 to 256 bytes.
> + * Note this routine is only called with len >= 256, which is the
> + * architectural maximum vector length: the first vector always fits.
> + */
> + i = 0;
> + n = QEMU_ALIGN_PTR_DOWN(buf + svcntb(), 16) - buf;
> + p = svwhilelt_b8(i, n);
> +
> + do {
> + svuint8_t d = svld1_u8(p, buf + i);
> +
> + p = svcmpne_n_u8(t, d, 0);
> + if (unlikely(svptest_any(t, p))) {
> + return false;
> + }
> + i += n;
> + n = svcntb();
> + p = svwhilelt_b8(i, len);
> + } while (svptest_any(t, p));
> +
> + return true;
> +}
> +#endif /* CONFIG_SVE_OPT */
> +
> static biz_accel_fn const accel_table[] = {
> buffer_is_zero_int_ge256,
> buffer_is_zero_simd,
> +#ifdef CONFIG_SVE_OPT
> + buffer_is_zero_sve,
> +#endif
> };
>
> +#ifdef CONFIG_SVE_OPT
> +static unsigned accel_index;
> +static void __attribute__((constructor)) init_accel(void)
> +{
> + accel_index = (cpuinfo & CPUINFO_SVE ? 2 : 1);
> + buffer_is_zero_accel = accel_table[accel_index];
> +}
> +#define INIT_ACCEL NULL
> +#else
> static unsigned accel_index = 1;
> #define INIT_ACCEL buffer_is_zero_simd
> +#endif /* CONFIG_SVE_OPT */
>
> bool test_buffer_is_zero_next_accel(void)
> {
> diff --git a/util/cpuinfo-aarch64.c b/util/cpuinfo-aarch64.c
> index 4c8a005715..a1e22ea66e 100644
> --- a/util/cpuinfo-aarch64.c
> +++ b/util/cpuinfo-aarch64.c
> @@ -61,6 +61,7 @@ unsigned __attribute__((constructor)) cpuinfo_init(void)
> info |= (hwcap & HWCAP_USCAT ? CPUINFO_LSE2 : 0);
> info |= (hwcap & HWCAP_AES ? CPUINFO_AES : 0);
> info |= (hwcap & HWCAP_PMULL ? CPUINFO_PMULL : 0);
> + info |= (hwcap & HWCAP_SVE ? CPUINFO_SVE : 0);
>
> unsigned long hwcap2 = qemu_getauxval(AT_HWCAP2);
> info |= (hwcap2 & HWCAP2_BTI ? CPUINFO_BTI : 0);
> diff --git a/meson.build b/meson.build
> index c1dc83e4c0..89a8241bc0 100644
> --- a/meson.build
> +++ b/meson.build
> @@ -2822,6 +2822,18 @@ config_host_data.set('CONFIG_ARM_AES_BUILTIN', cc.compiles('''
> void foo(uint8x16_t *p) { *p = vaesmcq_u8(*p); }
> '''))
>
> +config_host_data.set('CONFIG_SVE_OPT', cc.compiles('''
> + #include <arm_sve.h>
> + #ifndef __ARM_FEATURE_SVE
> + __attribute__((target("+sve")))
> + #endif
> + void foo(void *p) {
> + svbool_t t = svptrue_b8();
> + svuint8_t d = svld1_u8(t, p);
> + svptest_any(t, svcmpne_n_u8(t, d, 0));
> + }
> + '''))
> +
> have_pvrdma = get_option('pvrdma') \
> .require(rdma.found(), error_message: 'PVRDMA requires OpenFabrics libraries') \
> .require(cc.compiles(gnu_source_prefix + '''
> @@ -4232,6 +4244,7 @@ summary_info += {'memory allocator': get_option('malloc')}
> summary_info += {'avx2 optimization': config_host_data.get('CONFIG_AVX2_OPT')}
> summary_info += {'avx512bw optimization': config_host_data.get('CONFIG_AVX512BW_OPT')}
> summary_info += {'avx512f optimization': config_host_data.get('CONFIG_AVX512F_OPT')}
> +summary_info += {'sve optimization': config_host_data.get('CONFIG_SVE_OPT')}
> summary_info += {'gcov': get_option('b_coverage')}
> summary_info += {'thread sanitizer': get_option('tsan')}
> summary_info += {'CFI support': get_option('cfi')}
--
Alex Bennée
Virtualisation Tech Lead @ Linaro
next prev parent reply other threads:[~2024-02-16 9:34 UTC|newest]
Thread overview: 27+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-02-15 8:14 [PATCH v4 00/10] Optimize buffer_is_zero Richard Henderson
2024-02-15 8:14 ` [PATCH v4 01/10] util/bufferiszero: Remove SSE4.1 variant Richard Henderson
2024-02-15 8:14 ` [PATCH v4 02/10] util/bufferiszero: Remove AVX512 variant Richard Henderson
2024-02-15 8:14 ` [PATCH v4 03/10] util/bufferiszero: Reorganize for early test for acceleration Richard Henderson
2024-02-15 8:14 ` [PATCH v4 04/10] util/bufferiszero: Remove useless prefetches Richard Henderson
2024-02-15 8:14 ` [PATCH v4 05/10] util/bufferiszero: Optimize SSE2 and AVX2 variants Richard Henderson
2024-02-15 8:14 ` [PATCH v4 06/10] util/bufferiszero: Improve scalar variant Richard Henderson
2024-02-15 8:14 ` [PATCH v4 07/10] util/bufferiszero: Introduce biz_accel_fn typedef Richard Henderson
2024-02-15 8:34 ` Philippe Mathieu-Daudé
2024-02-15 8:14 ` [PATCH v4 08/10] util/bufferiszero: Simplify test_buffer_is_zero_next_accel Richard Henderson
2024-02-15 8:40 ` Philippe Mathieu-Daudé
2024-02-15 8:14 ` [PATCH v4 09/10] util/bufferiszero: Add simd acceleration for aarch64 Richard Henderson
2024-02-15 8:47 ` Alexander Monakov
2024-02-15 17:47 ` Richard Henderson
2024-02-15 18:46 ` Alexander Monakov
2024-02-15 21:10 ` Richard Henderson
2024-02-15 8:14 ` [RFC PATCH v4 10/10] util/bufferiszero: Add sve " Richard Henderson
2024-02-16 9:33 ` Alex Bennée [this message]
2024-02-16 11:05 ` Alex Bennée
2024-02-15 8:57 ` [PATCH v4 00/10] Optimize buffer_is_zero Alexander Monakov
2024-02-15 21:16 ` Richard Henderson
2024-02-15 21:36 ` Alexander Monakov
2024-02-15 22:27 ` Richard Henderson
2024-02-15 23:37 ` Alexander Monakov
2024-02-16 8:11 ` Richard Henderson
2024-02-16 20:20 ` Alexander Monakov
2024-02-16 22:28 ` Richard Henderson
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=87frxs69eh.fsf@draig.linaro.org \
--to=alex.bennee@linaro.org \
--cc=amonakov@ispras.ru \
--cc=mmromanov@ispras.ru \
--cc=qemu-devel@nongnu.org \
--cc=richard.henderson@linaro.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).