From: "Alex Bennée" <alex.bennee@linaro.org>
To: Richard Henderson <richard.henderson@linaro.org>
Cc: qemu-devel@nongnu.org, amonakov@ispras.ru, mmromanov@ispras.ru
Subject: Re: [RFC PATCH v4 10/10] util/bufferiszero: Add sve acceleration for aarch64
Date: Fri, 16 Feb 2024 09:33:26 +0000 [thread overview]
Message-ID: <87frxs69eh.fsf@draig.linaro.org> (raw)
In-Reply-To: <20240215081449.848220-11-richard.henderson@linaro.org> (Richard Henderson's message of "Wed, 14 Feb 2024 22:14:49 -1000")
Richard Henderson <richard.henderson@linaro.org> writes:
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>
> RFC because I've not benchmarked this on real hw, only run it
> through qemu for validation.
I think we have an a64fx in the TCWG lab you could probably run the
tests on if you want. Otherwise I might be able to spin up a Graviton
on AWS to run a measurement. Do we have a benchmark test to run?
>
> ---
> host/include/aarch64/host/cpuinfo.h | 1 +
> util/bufferiszero.c | 49 +++++++++++++++++++++++++++++
> util/cpuinfo-aarch64.c | 1 +
> meson.build | 13 ++++++++
> 4 files changed, 64 insertions(+)
>
> diff --git a/host/include/aarch64/host/cpuinfo.h b/host/include/aarch64/host/cpuinfo.h
> index fe671534e4..b4b816cd07 100644
> --- a/host/include/aarch64/host/cpuinfo.h
> +++ b/host/include/aarch64/host/cpuinfo.h
> @@ -12,6 +12,7 @@
> #define CPUINFO_AES (1u << 3)
> #define CPUINFO_PMULL (1u << 4)
> #define CPUINFO_BTI (1u << 5)
> +#define CPUINFO_SVE (1u << 6)
>
> /* Initialized with a constructor. */
> extern unsigned cpuinfo;
> diff --git a/util/bufferiszero.c b/util/bufferiszero.c
> index 2809b09225..af64c9c224 100644
> --- a/util/bufferiszero.c
> +++ b/util/bufferiszero.c
> @@ -270,13 +270,62 @@ static bool buffer_is_zero_simd(const void *buf, size_t len)
> return vaddvq_u32(vceqzq_u32(t0)) == -4;
> }
>
> +#ifdef CONFIG_SVE_OPT
> +#include <arm_sve.h>
> +
> +#ifndef __ARM_FEATURE_SVE
> +__attribute__((target("+sve")))
> +#endif
> +static bool buffer_is_zero_sve(const void *buf, size_t len)
> +{
> + svbool_t p, t = svptrue_b8();
> + size_t i, n;
> +
> + /*
> + * For the first vector, align to 16 -- reading 1 to 256 bytes.
> + * Note this routine is only called with len >= 256, which is the
> + * architectural maximum vector length: the first vector always fits.
> + */
> + i = 0;
> + n = QEMU_ALIGN_PTR_DOWN(buf + svcntb(), 16) - buf;
> + p = svwhilelt_b8(i, n);
> +
> + do {
> + svuint8_t d = svld1_u8(p, buf + i);
> +
> + p = svcmpne_n_u8(t, d, 0);
> + if (unlikely(svptest_any(t, p))) {
> + return false;
> + }
> + i += n;
> + n = svcntb();
> + p = svwhilelt_b8(i, len);
> + } while (svptest_any(t, p));
> +
> + return true;
> +}
> +#endif /* CONFIG_SVE_OPT */
> +
> static biz_accel_fn const accel_table[] = {
> buffer_is_zero_int_ge256,
> buffer_is_zero_simd,
> +#ifdef CONFIG_SVE_OPT
> + buffer_is_zero_sve,
> +#endif
> };
>
> +#ifdef CONFIG_SVE_OPT
> +static unsigned accel_index;
> +static void __attribute__((constructor)) init_accel(void)
> +{
> + accel_index = (cpuinfo & CPUINFO_SVE ? 2 : 1);
> + buffer_is_zero_accel = accel_table[accel_index];
> +}
> +#define INIT_ACCEL NULL
> +#else
> static unsigned accel_index = 1;
> #define INIT_ACCEL buffer_is_zero_simd
> +#endif /* CONFIG_SVE_OPT */
>
> bool test_buffer_is_zero_next_accel(void)
> {
> diff --git a/util/cpuinfo-aarch64.c b/util/cpuinfo-aarch64.c
> index 4c8a005715..a1e22ea66e 100644
> --- a/util/cpuinfo-aarch64.c
> +++ b/util/cpuinfo-aarch64.c
> @@ -61,6 +61,7 @@ unsigned __attribute__((constructor)) cpuinfo_init(void)
> info |= (hwcap & HWCAP_USCAT ? CPUINFO_LSE2 : 0);
> info |= (hwcap & HWCAP_AES ? CPUINFO_AES : 0);
> info |= (hwcap & HWCAP_PMULL ? CPUINFO_PMULL : 0);
> + info |= (hwcap & HWCAP_SVE ? CPUINFO_SVE : 0);
>
> unsigned long hwcap2 = qemu_getauxval(AT_HWCAP2);
> info |= (hwcap2 & HWCAP2_BTI ? CPUINFO_BTI : 0);
> diff --git a/meson.build b/meson.build
> index c1dc83e4c0..89a8241bc0 100644
> --- a/meson.build
> +++ b/meson.build
> @@ -2822,6 +2822,18 @@ config_host_data.set('CONFIG_ARM_AES_BUILTIN', cc.compiles('''
> void foo(uint8x16_t *p) { *p = vaesmcq_u8(*p); }
> '''))
>
> +config_host_data.set('CONFIG_SVE_OPT', cc.compiles('''
> + #include <arm_sve.h>
> + #ifndef __ARM_FEATURE_SVE
> + __attribute__((target("+sve")))
> + #endif
> + void foo(void *p) {
> + svbool_t t = svptrue_b8();
> + svuint8_t d = svld1_u8(t, p);
> + svptest_any(t, svcmpne_n_u8(t, d, 0));
> + }
> + '''))
> +
> have_pvrdma = get_option('pvrdma') \
> .require(rdma.found(), error_message: 'PVRDMA requires OpenFabrics libraries') \
> .require(cc.compiles(gnu_source_prefix + '''
> @@ -4232,6 +4244,7 @@ summary_info += {'memory allocator': get_option('malloc')}
> summary_info += {'avx2 optimization': config_host_data.get('CONFIG_AVX2_OPT')}
> summary_info += {'avx512bw optimization': config_host_data.get('CONFIG_AVX512BW_OPT')}
> summary_info += {'avx512f optimization': config_host_data.get('CONFIG_AVX512F_OPT')}
> +summary_info += {'sve optimization': config_host_data.get('CONFIG_SVE_OPT')}
> summary_info += {'gcov': get_option('b_coverage')}
> summary_info += {'thread sanitizer': get_option('tsan')}
> summary_info += {'CFI support': get_option('cfi')}
--
Alex Bennée
Virtualisation Tech Lead @ Linaro
next prev parent reply other threads:[~2024-02-16 9:34 UTC|newest]
Thread overview: 27+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-02-15 8:14 [PATCH v4 00/10] Optimize buffer_is_zero Richard Henderson
2024-02-15 8:14 ` [PATCH v4 01/10] util/bufferiszero: Remove SSE4.1 variant Richard Henderson
2024-02-15 8:14 ` [PATCH v4 02/10] util/bufferiszero: Remove AVX512 variant Richard Henderson
2024-02-15 8:14 ` [PATCH v4 03/10] util/bufferiszero: Reorganize for early test for acceleration Richard Henderson
2024-02-15 8:14 ` [PATCH v4 04/10] util/bufferiszero: Remove useless prefetches Richard Henderson
2024-02-15 8:14 ` [PATCH v4 05/10] util/bufferiszero: Optimize SSE2 and AVX2 variants Richard Henderson
2024-02-15 8:14 ` [PATCH v4 06/10] util/bufferiszero: Improve scalar variant Richard Henderson
2024-02-15 8:14 ` [PATCH v4 07/10] util/bufferiszero: Introduce biz_accel_fn typedef Richard Henderson
2024-02-15 8:34 ` Philippe Mathieu-Daudé
2024-02-15 8:14 ` [PATCH v4 08/10] util/bufferiszero: Simplify test_buffer_is_zero_next_accel Richard Henderson
2024-02-15 8:40 ` Philippe Mathieu-Daudé
2024-02-15 8:14 ` [PATCH v4 09/10] util/bufferiszero: Add simd acceleration for aarch64 Richard Henderson
2024-02-15 8:47 ` Alexander Monakov
2024-02-15 17:47 ` Richard Henderson
2024-02-15 18:46 ` Alexander Monakov
2024-02-15 21:10 ` Richard Henderson
2024-02-15 8:14 ` [RFC PATCH v4 10/10] util/bufferiszero: Add sve " Richard Henderson
2024-02-16 9:33 ` Alex Bennée [this message]
2024-02-16 11:05 ` Alex Bennée
2024-02-15 8:57 ` [PATCH v4 00/10] Optimize buffer_is_zero Alexander Monakov
2024-02-15 21:16 ` Richard Henderson
2024-02-15 21:36 ` Alexander Monakov
2024-02-15 22:27 ` Richard Henderson
2024-02-15 23:37 ` Alexander Monakov
2024-02-16 8:11 ` Richard Henderson
2024-02-16 20:20 ` Alexander Monakov
2024-02-16 22:28 ` Richard Henderson
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=87frxs69eh.fsf@draig.linaro.org \
--to=alex.bennee@linaro.org \
--cc=amonakov@ispras.ru \
--cc=mmromanov@ispras.ru \
--cc=qemu-devel@nongnu.org \
--cc=richard.henderson@linaro.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.