* [PATCH v2 0/2] Use a more portable way to enable target specific functions @ 2022-12-04 1:51 Richard Henderson 2022-12-04 1:51 ` [PATCH 1/2] util/bufferiszero: Use __attribute__((target)) for avx2/avx512 Richard Henderson 2022-12-04 1:51 ` [PATCH 2/2] meson: Set avx512f option to auto Richard Henderson 0 siblings, 2 replies; 9+ messages in thread From: Richard Henderson @ 2022-12-04 1:51 UTC (permalink / raw) To: qemu-devel; +Cc: pbonzini, tstellar, berrange This is a revision of Tom Stellard's patch from last month which also removes the use of the #pragma. Also, tweak --enable/disable-avx512f. r~ Richard Henderson (2): util/bufferiszero: Use __attribute__((target)) for avx2/avx512 meson: Set avx512f option to auto meson.build | 8 ++------ util/bufferiszero.c | 41 ++++++----------------------------------- meson_options.txt | 2 +- 3 files changed, 9 insertions(+), 42 deletions(-) -- 2.34.1 ^ permalink raw reply [flat|nested] 9+ messages in thread
* [PATCH 1/2] util/bufferiszero: Use __attribute__((target)) for avx2/avx512 2022-12-04 1:51 [PATCH v2 0/2] Use a more portable way to enable target specific functions Richard Henderson @ 2022-12-04 1:51 ` Richard Henderson 2022-12-05 11:17 ` Daniel P. Berrangé 2022-12-04 1:51 ` [PATCH 2/2] meson: Set avx512f option to auto Richard Henderson 1 sibling, 1 reply; 9+ messages in thread From: Richard Henderson @ 2022-12-04 1:51 UTC (permalink / raw) To: qemu-devel; +Cc: pbonzini, tstellar, berrange Use the attribute, which is supported by clang, instead of the #pragma, which is not supported and, for some reason, also not detected by the meson probe, so we fail by -Werror. Signed-off-by: Richard Henderson <richard.henderson@linaro.org> --- meson.build | 8 ++------ util/bufferiszero.c | 41 ++++++----------------------------------- 2 files changed, 8 insertions(+), 41 deletions(-) diff --git a/meson.build b/meson.build index 5c6b5a1c75..11b873f911 100644 --- a/meson.build +++ b/meson.build @@ -2324,11 +2324,9 @@ config_host_data.set('CONFIG_CPUID_H', have_cpuid_h) config_host_data.set('CONFIG_AVX2_OPT', get_option('avx2') \ .require(have_cpuid_h, error_message: 'cpuid.h not available, cannot enable AVX2') \ .require(cc.links(''' - #pragma GCC push_options - #pragma GCC target("avx2") #include <cpuid.h> #include <immintrin.h> - static int bar(void *a) { + static int __attribute__((target("avx2"))) bar(void *a) { __m256i x = *(__m256i *)a; return _mm256_testz_si256(x, x); } @@ -2338,11 +2336,9 @@ config_host_data.set('CONFIG_AVX2_OPT', get_option('avx2') \ config_host_data.set('CONFIG_AVX512F_OPT', get_option('avx512f') \ .require(have_cpuid_h, error_message: 'cpuid.h not available, cannot enable AVX512F') \ .require(cc.links(''' - #pragma GCC push_options - #pragma GCC target("avx512f") #include <cpuid.h> #include <immintrin.h> - static int bar(void *a) { + static int __attribute__((target("avx512f"))) bar(void *a) { __m512i x = *(__m512i *)a; return _mm512_test_epi64_mask(x, x); } diff --git a/util/bufferiszero.c b/util/bufferiszero.c index ec3cd4ca15..1790ded7d4 100644 --- a/util/bufferiszero.c +++ b/util/bufferiszero.c @@ -64,18 +64,11 @@ buffer_zero_int(const void *buf, size_t len) } #if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT) || defined(__SSE2__) -/* Do not use push_options pragmas unnecessarily, because clang - * does not support them. - */ -#if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT) -#pragma GCC push_options -#pragma GCC target("sse2") -#endif -#include <emmintrin.h> +#include <immintrin.h> /* Note that each of these vectorized functions require len >= 64. */ -static bool +static bool __attribute__((target("sse2"))) buffer_zero_sse2(const void *buf, size_t len) { __m128i t = _mm_loadu_si128(buf); @@ -104,20 +97,9 @@ buffer_zero_sse2(const void *buf, size_t len) return _mm_movemask_epi8(_mm_cmpeq_epi8(t, zero)) == 0xFFFF; } -#if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT) -#pragma GCC pop_options -#endif #ifdef CONFIG_AVX2_OPT -/* Note that due to restrictions/bugs wrt __builtin functions in gcc <= 4.8, - * the includes have to be within the corresponding push_options region, and - * therefore the regions themselves have to be ordered with increasing ISA. - */ -#pragma GCC push_options -#pragma GCC target("sse4") -#include <smmintrin.h> - -static bool +static bool __attribute__((target("sse4"))) buffer_zero_sse4(const void *buf, size_t len) { __m128i t = _mm_loadu_si128(buf); @@ -145,12 +127,7 @@ buffer_zero_sse4(const void *buf, size_t len) return _mm_testz_si128(t, t); } -#pragma GCC pop_options -#pragma GCC push_options -#pragma GCC target("avx2") -#include <immintrin.h> - -static bool +static bool __attribute__((target("avx2"))) buffer_zero_avx2(const void *buf, size_t len) { /* Begin with an unaligned head of 32 bytes. */ @@ -176,15 +153,10 @@ buffer_zero_avx2(const void *buf, size_t len) return _mm256_testz_si256(t, t); } -#pragma GCC pop_options #endif /* CONFIG_AVX2_OPT */ #ifdef CONFIG_AVX512F_OPT -#pragma GCC push_options -#pragma GCC target("avx512f") -#include <immintrin.h> - -static bool +static bool __attribute__((target("avx512f"))) buffer_zero_avx512(const void *buf, size_t len) { /* Begin with an unaligned head of 64 bytes. */ @@ -210,8 +182,7 @@ buffer_zero_avx512(const void *buf, size_t len) return !_mm512_test_epi64_mask(t, t); } -#pragma GCC pop_options -#endif +#endif /* CONFIG_AVX512F_OPT */ /* Note that for test_buffer_is_zero_next_accel, the most preferred -- 2.34.1 ^ permalink raw reply related [flat|nested] 9+ messages in thread
* Re: [PATCH 1/2] util/bufferiszero: Use __attribute__((target)) for avx2/avx512 2022-12-04 1:51 ` [PATCH 1/2] util/bufferiszero: Use __attribute__((target)) for avx2/avx512 Richard Henderson @ 2022-12-05 11:17 ` Daniel P. Berrangé 2022-12-05 15:16 ` Richard Henderson 0 siblings, 1 reply; 9+ messages in thread From: Daniel P. Berrangé @ 2022-12-05 11:17 UTC (permalink / raw) To: Richard Henderson; +Cc: qemu-devel, pbonzini, tstellar On Sat, Dec 03, 2022 at 07:51:22PM -0600, Richard Henderson wrote: > Use the attribute, which is supported by clang, instead of > the #pragma, which is not supported and, for some reason, > also not detected by the meson probe, so we fail by -Werror. > > Signed-off-by: Richard Henderson <richard.henderson@linaro.org> > --- > meson.build | 8 ++------ > util/bufferiszero.c | 41 ++++++----------------------------------- > 2 files changed, 8 insertions(+), 41 deletions(-) > > diff --git a/util/bufferiszero.c b/util/bufferiszero.c > index ec3cd4ca15..1790ded7d4 100644 > --- a/util/bufferiszero.c > +++ b/util/bufferiszero.c > @@ -64,18 +64,11 @@ buffer_zero_int(const void *buf, size_t len) > } > > #if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT) || defined(__SSE2__) > -/* Do not use push_options pragmas unnecessarily, because clang > - * does not support them. > - */ > -#if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT) > -#pragma GCC push_options > -#pragma GCC target("sse2") > -#endif > -#include <emmintrin.h> So the old code included emmintrin.h, and possibly either immintrin.h / simmintrin.h, but the new code only includes immintrin.h. I'm not saying that's wrong, I'm just wondering why it is changing, as it feels possibly tangential to the pragma -> attribute conversion. Could you mention this in the commit message, or split it to a separate cleanup patch if its a functionally unrelated change. > +#include <immintrin.h> > > /* Note that each of these vectorized functions require len >= 64. */ > > -static bool > +static bool __attribute__((target("sse2"))) > buffer_zero_sse2(const void *buf, size_t len) > { > __m128i t = _mm_loadu_si128(buf); > @@ -104,20 +97,9 @@ buffer_zero_sse2(const void *buf, size_t len) > > return _mm_movemask_epi8(_mm_cmpeq_epi8(t, zero)) == 0xFFFF; > } > -#if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT) > -#pragma GCC pop_options > -#endif > > #ifdef CONFIG_AVX2_OPT > -/* Note that due to restrictions/bugs wrt __builtin functions in gcc <= 4.8, > - * the includes have to be within the corresponding push_options region, and > - * therefore the regions themselves have to be ordered with increasing ISA. > - */ > -#pragma GCC push_options > -#pragma GCC target("sse4") > -#include <smmintrin.h> > - > -static bool > +static bool __attribute__((target("sse4"))) > buffer_zero_sse4(const void *buf, size_t len) > { > __m128i t = _mm_loadu_si128(buf); > @@ -145,12 +127,7 @@ buffer_zero_sse4(const void *buf, size_t len) > return _mm_testz_si128(t, t); > } > > -#pragma GCC pop_options > -#pragma GCC push_options > -#pragma GCC target("avx2") > -#include <immintrin.h> > - > -static bool > +static bool __attribute__((target("avx2"))) > buffer_zero_avx2(const void *buf, size_t len) > { > /* Begin with an unaligned head of 32 bytes. */ > @@ -176,15 +153,10 @@ buffer_zero_avx2(const void *buf, size_t len) > > return _mm256_testz_si256(t, t); > } > -#pragma GCC pop_options > #endif /* CONFIG_AVX2_OPT */ > > #ifdef CONFIG_AVX512F_OPT > -#pragma GCC push_options > -#pragma GCC target("avx512f") > -#include <immintrin.h> > - > -static bool > +static bool __attribute__((target("avx512f"))) > buffer_zero_avx512(const void *buf, size_t len) > { > /* Begin with an unaligned head of 64 bytes. */ > @@ -210,8 +182,7 @@ buffer_zero_avx512(const void *buf, size_t len) > return !_mm512_test_epi64_mask(t, t); > > } > -#pragma GCC pop_options > -#endif > +#endif /* CONFIG_AVX512F_OPT */ > > > /* Note that for test_buffer_is_zero_next_accel, the most preferred > -- > 2.34.1 > With regards, Daniel -- |: https://berrange.com -o- https://www.flickr.com/photos/dberrange :| |: https://libvirt.org -o- https://fstop138.berrange.com :| |: https://entangle-photo.org -o- https://www.instagram.com/dberrange :| ^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH 1/2] util/bufferiszero: Use __attribute__((target)) for avx2/avx512 2022-12-05 11:17 ` Daniel P. Berrangé @ 2022-12-05 15:16 ` Richard Henderson 0 siblings, 0 replies; 9+ messages in thread From: Richard Henderson @ 2022-12-05 15:16 UTC (permalink / raw) To: Daniel P. Berrangé; +Cc: qemu-devel, pbonzini, tstellar On 12/5/22 05:17, Daniel P. Berrangé wrote: > On Sat, Dec 03, 2022 at 07:51:22PM -0600, Richard Henderson wrote: >> Use the attribute, which is supported by clang, instead of >> the #pragma, which is not supported and, for some reason, >> also not detected by the meson probe, so we fail by -Werror. >> >> Signed-off-by: Richard Henderson <richard.henderson@linaro.org> >> --- >> meson.build | 8 ++------ >> util/bufferiszero.c | 41 ++++++----------------------------------- >> 2 files changed, 8 insertions(+), 41 deletions(-) >> > > >> diff --git a/util/bufferiszero.c b/util/bufferiszero.c >> index ec3cd4ca15..1790ded7d4 100644 >> --- a/util/bufferiszero.c >> +++ b/util/bufferiszero.c >> @@ -64,18 +64,11 @@ buffer_zero_int(const void *buf, size_t len) >> } >> >> #if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT) || defined(__SSE2__) >> -/* Do not use push_options pragmas unnecessarily, because clang >> - * does not support them. >> - */ >> -#if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT) >> -#pragma GCC push_options >> -#pragma GCC target("sse2") >> -#endif >> -#include <emmintrin.h> > > So the old code included emmintrin.h, and possibly either > immintrin.h / simmintrin.h, but the new code only > includes immintrin.h. > > I'm not saying that's wrong, I'm just wondering why it is > changing, as it feels possibly tangential to the pragma > -> attribute conversion. Could you mention this in the > commit message, or split it to a separate cleanup patch > if its a functionally unrelated change. Adding Include only <immintrin.h> as that is the outermost "official" header for these intrinsics -- emmintrin.h and smmintrin.> are older SSE2 and SSE4 specific headers, while the immintrin.h includes all of the Intel intrinsics. r~ ^ permalink raw reply [flat|nested] 9+ messages in thread
* [PATCH 2/2] meson: Set avx512f option to auto 2022-12-04 1:51 [PATCH v2 0/2] Use a more portable way to enable target specific functions Richard Henderson 2022-12-04 1:51 ` [PATCH 1/2] util/bufferiszero: Use __attribute__((target)) for avx2/avx512 Richard Henderson @ 2022-12-04 1:51 ` Richard Henderson 2022-12-16 20:47 ` Richard Henderson 2022-12-16 23:08 ` Paolo Bonzini 1 sibling, 2 replies; 9+ messages in thread From: Richard Henderson @ 2022-12-04 1:51 UTC (permalink / raw) To: qemu-devel; +Cc: pbonzini, tstellar, berrange I'm not sure why this option wasn't set the same as avx2. Signed-off-by: Richard Henderson <richard.henderson@linaro.org> --- meson_options.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/meson_options.txt b/meson_options.txt index 4b749ca549..f98ee101e2 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -102,7 +102,7 @@ option('membarrier', type: 'feature', value: 'disabled', option('avx2', type: 'feature', value: 'auto', description: 'AVX2 optimizations') -option('avx512f', type: 'feature', value: 'disabled', +option('avx512f', type: 'feature', value: 'auto', description: 'AVX512F optimizations') option('keyring', type: 'feature', value: 'auto', description: 'Linux keyring support') -- 2.34.1 ^ permalink raw reply related [flat|nested] 9+ messages in thread
* Re: [PATCH 2/2] meson: Set avx512f option to auto 2022-12-04 1:51 ` [PATCH 2/2] meson: Set avx512f option to auto Richard Henderson @ 2022-12-16 20:47 ` Richard Henderson 2022-12-16 23:08 ` Paolo Bonzini 1 sibling, 0 replies; 9+ messages in thread From: Richard Henderson @ 2022-12-16 20:47 UTC (permalink / raw) To: qemu-devel; +Cc: pbonzini, tstellar, berrange Ping. On 12/3/22 17:51, Richard Henderson wrote: > I'm not sure why this option wasn't set the same as avx2. > > Signed-off-by: Richard Henderson <richard.henderson@linaro.org> > --- > meson_options.txt | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/meson_options.txt b/meson_options.txt > index 4b749ca549..f98ee101e2 100644 > --- a/meson_options.txt > +++ b/meson_options.txt > @@ -102,7 +102,7 @@ option('membarrier', type: 'feature', value: 'disabled', > > option('avx2', type: 'feature', value: 'auto', > description: 'AVX2 optimizations') > -option('avx512f', type: 'feature', value: 'disabled', > +option('avx512f', type: 'feature', value: 'auto', > description: 'AVX512F optimizations') > option('keyring', type: 'feature', value: 'auto', > description: 'Linux keyring support') ^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH 2/2] meson: Set avx512f option to auto 2022-12-04 1:51 ` [PATCH 2/2] meson: Set avx512f option to auto Richard Henderson 2022-12-16 20:47 ` Richard Henderson @ 2022-12-16 23:08 ` Paolo Bonzini 2022-12-16 23:50 ` Richard Henderson 2022-12-19 10:21 ` Daniel P. Berrangé 1 sibling, 2 replies; 9+ messages in thread From: Paolo Bonzini @ 2022-12-16 23:08 UTC (permalink / raw) To: Richard Henderson; +Cc: qemu-devel, Tom Stellard, P. Berrange, Daniel [-- Attachment #1: Type: text/plain, Size: 1644 bytes --] Because that's what configure used to do ( https://lists.nongnu.org/archive/html/qemu-devel/2022-02/msg00650.html)... It can surely be changed but AVX512 is known to limit processor frequency. I am not sure if the limitation is per core or extends to multiple cores, and it would be a pity if guests were slowed down even further during migration. Especially after the bulk phase buffer_is_zero performance matters a lot less so you'd pay the price of AVX512 for little gain. After the bulk phase it may even make sense to just use SSE, since even AVX requires a voltage transition[1] from what I saw at https://travisdowns.github.io/blog/2020/01/17/avxfreq1.html. Paolo [1] voltage transitions slow down the processor during the transition Il dom 4 dic 2022, 02:51 Richard Henderson <richard.henderson@linaro.org> ha scritto: > I'm not sure why this option wasn't set the same as avx2. > > Signed-off-by: Richard Henderson <richard.henderson@linaro.org> > --- > meson_options.txt | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/meson_options.txt b/meson_options.txt > index 4b749ca549..f98ee101e2 100644 > --- a/meson_options.txt > +++ b/meson_options.txt > @@ -102,7 +102,7 @@ option('membarrier', type: 'feature', value: > 'disabled', > > option('avx2', type: 'feature', value: 'auto', > description: 'AVX2 optimizations') > -option('avx512f', type: 'feature', value: 'disabled', > +option('avx512f', type: 'feature', value: 'auto', > description: 'AVX512F optimizations') > option('keyring', type: 'feature', value: 'auto', > description: 'Linux keyring support') > -- > 2.34.1 > > [-- Attachment #2: Type: text/html, Size: 2594 bytes --] ^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH 2/2] meson: Set avx512f option to auto 2022-12-16 23:08 ` Paolo Bonzini @ 2022-12-16 23:50 ` Richard Henderson 2022-12-19 10:21 ` Daniel P. Berrangé 1 sibling, 0 replies; 9+ messages in thread From: Richard Henderson @ 2022-12-16 23:50 UTC (permalink / raw) To: Paolo Bonzini; +Cc: qemu-devel, Tom Stellard, P. Berrange, Daniel On 12/16/22 15:08, Paolo Bonzini wrote: > Because that's what configure used to do > (https://lists.nongnu.org/archive/html/qemu-devel/2022-02/msg00650.html). > <https://lists.nongnu.org/archive/html/qemu-devel/2022-02/msg00650.html).>.. Yeah, but I wondered if that was just a bug. > It can surely be changed but AVX512 is known to limit processor frequency. I am not sure > if the limitation is per core or extends to multiple cores, and it would be a pity if > guests were slowed down even further during migration. Hmm. Should we simply remove it? > Especially after the bulk phase buffer_is_zero performance matters a lot less so you'd pay > the price of AVX512 for little gain. After the bulk phase it may even make sense to just > use SSE, since even AVX requires a voltage transition[1] from what I saw at > https://travisdowns.github.io/blog/2020/01/17/avxfreq1.html > <https://travisdowns.github.io/blog/2020/01/17/avxfreq1.html>. Ouch, never heard of that. I'm not going to worry about it, because glibc str* routines make the same choice to use AVX2, as does TCG, so I can only imagine that for the most part we're continually in and out of 256-bit avx mode. Anyway, I'll drop this patch. r~ ^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH 2/2] meson: Set avx512f option to auto 2022-12-16 23:08 ` Paolo Bonzini 2022-12-16 23:50 ` Richard Henderson @ 2022-12-19 10:21 ` Daniel P. Berrangé 1 sibling, 0 replies; 9+ messages in thread From: Daniel P. Berrangé @ 2022-12-19 10:21 UTC (permalink / raw) To: Paolo Bonzini; +Cc: Richard Henderson, qemu-devel, Tom Stellard On Sat, Dec 17, 2022 at 12:08:08AM +0100, Paolo Bonzini wrote: > Because that's what configure used to do ( > https://lists.nongnu.org/archive/html/qemu-devel/2022-02/msg00650.html)... > > It can surely be changed but AVX512 is known to limit processor frequency. > I am not sure if the limitation is per core or extends to multiple cores, > and it would be a pity if guests were slowed down even further during > migration. > > Especially after the bulk phase buffer_is_zero performance matters a lot > less so you'd pay the price of AVX512 for little gain. After the bulk phase > it may even make sense to just use SSE, since even AVX requires a voltage > transition[1] from what I saw at > https://travisdowns.github.io/blog/2020/01/17/avxfreq1.html. Note: s/AVX512/Intel's AVX512 impl/ AMD's Zen4 AVX512 is said to behave quite differently from Intel's. This posting goes into a massive amount of detail: https://www.mersenneforum.org/showthread.php?p=614191 [quote] Since 512-bit instructions are reusing the same 256-bit hardware, 512-bit does not come with additional thermal issues. There is no artificial throttling like on Intel chips. [/quote] [quote] Overall, AMD's AVX512 implementation beat my expectations. I was expecting something similar to Zen1's "double-pumping" of AVX with half the register file and cross-lane instructions being super slow. But this is not the case on Zen4. The lack of power or thermal issues combined with stellar shuffle support makes it completely worthwhile to use from a developer standpoint. If your code can vectorize without excessive wasted computation, then go all the way to 512-bit. AMD not only made this worthwhile, but *incentivizes* it with the power savings. And if in the future AMD decides to widen things up, you may get a 2x speedup for free. [/quote] With regards, Daniel -- |: https://berrange.com -o- https://www.flickr.com/photos/dberrange :| |: https://libvirt.org -o- https://fstop138.berrange.com :| |: https://entangle-photo.org -o- https://www.instagram.com/dberrange :| ^ permalink raw reply [flat|nested] 9+ messages in thread
end of thread, other threads:[~2022-12-19 10:33 UTC | newest] Thread overview: 9+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2022-12-04 1:51 [PATCH v2 0/2] Use a more portable way to enable target specific functions Richard Henderson 2022-12-04 1:51 ` [PATCH 1/2] util/bufferiszero: Use __attribute__((target)) for avx2/avx512 Richard Henderson 2022-12-05 11:17 ` Daniel P. Berrangé 2022-12-05 15:16 ` Richard Henderson 2022-12-04 1:51 ` [PATCH 2/2] meson: Set avx512f option to auto Richard Henderson 2022-12-16 20:47 ` Richard Henderson 2022-12-16 23:08 ` Paolo Bonzini 2022-12-16 23:50 ` Richard Henderson 2022-12-19 10:21 ` Daniel P. Berrangé
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for NNTP newsgroup(s).