[PATCH v2] lib/raid/xor: x86: Add AVX-512 optimized xor_gen()

Linux RAID subsystem development
 help / color / mirror / Atom feed

* [PATCH v2] lib/raid/xor: x86: Add AVX-512 optimized xor_gen()
@ 2026-06-14  1:03 Eric Biggers
  2026-06-14  1:23 ` sashiko-bot
  2026-06-14 10:16 ` David Laight
  0 siblings, 2 replies; 6+ messages in thread
From: Eric Biggers @ 2026-06-14  1:03 UTC (permalink / raw)
  To: Andrew Morton, linux-kernel
  Cc: Christoph Hellwig, linux-crypto, x86, Eric Biggers, David Laight,
	linux-raid

Add an implementation of xor_gen() using AVX-512.

It uses 512-bit vectors, i.e. ZMM registers.  It also uses the
vpternlogq instruction to do three-input XORs when applicable.

It's enabled on x86_64 CPUs that have AVX512F && !PREFER_YMM.  In
practice that means:

    - AMD Zen 4 and later (client and server)
    - Intel Sapphire Rapids and later (server)
    - Intel Rocket Lake (client)
    - Intel Nova Lake and later (client)

The !PREFER_YMM condition excludes the older AVX-512 implementations in
Intel Skylake Server and Intel Ice Lake.  They could run this code, but
they're known to have overly-eager downclocking when ZMM registers are
used.  This is the same policy that the crypto and CRC code uses.

Benchmark on AMD Ryzen 9 9950X (Zen 5):

    src_cnt    avx          avx512       Improvement
    =======    ==========   ==========   ===========
    1          56353 MB/s   75388 MB/s   33%
    2          54274 MB/s   68409 MB/s   26%
    3          44649 MB/s   64042 MB/s   43%
    4          41315 MB/s   55002 MB/s   33%

Note: for now I omitted the cpu_has_xfeatures() check that the AVX-512
optimized crypto and CRC code does, since it's not implemented on
User-Mode Linux and it's never been present in the RAID6 code either.

Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---

Changed in v2:
    - Fixed build on UML
    - Reworked the implementation

 lib/raid/xor/Makefile         |   2 +-
 lib/raid/xor/x86/xor-avx512.c | 121 ++++++++++++++++++++++++++++++++++
 lib/raid/xor/x86/xor_arch.h   |  26 ++++----
 3 files changed, 137 insertions(+), 12 deletions(-)
 create mode 100644 lib/raid/xor/x86/xor-avx512.c

diff --git a/lib/raid/xor/Makefile b/lib/raid/xor/Makefile
index 4d633dfd5b90..4af945861a51 100644
--- a/lib/raid/xor/Makefile
+++ b/lib/raid/xor/Makefile
@@ -26,11 +26,11 @@ xor-$(CONFIG_ALTIVEC)		+= powerpc/xor_vmx.o powerpc/xor_vmx_glue.o
 xor-$(CONFIG_RISCV_ISA_V)	+= riscv/xor.o riscv/xor-glue.o
 xor-$(CONFIG_SPARC32)		+= sparc/xor-sparc32.o
 xor-$(CONFIG_SPARC64)		+= sparc/xor-sparc64.o sparc/xor-sparc64-glue.o
 xor-$(CONFIG_S390)		+= s390/xor.o
 xor-$(CONFIG_X86_32)		+= x86/xor-avx.o x86/xor-sse.o x86/xor-mmx.o
-xor-$(CONFIG_X86_64)		+= x86/xor-avx.o x86/xor-sse.o
+xor-$(CONFIG_X86_64)		+= x86/xor-avx.o x86/xor-sse.o x86/xor-avx512.o
 obj-y				+= tests/
 
 CFLAGS_arm/xor-neon.o		+= $(CC_FLAGS_FPU)
 CFLAGS_REMOVE_arm/xor-neon.o	+= $(CC_FLAGS_NO_FPU)
 
diff --git a/lib/raid/xor/x86/xor-avx512.c b/lib/raid/xor/x86/xor-avx512.c
new file mode 100644
index 000000000000..87b981d74c90
--- /dev/null
+++ b/lib/raid/xor/x86/xor-avx512.c
@@ -0,0 +1,121 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * AVX-512 optimized implementation of xor_gen()
+ *
+ * Copyright 2026 Google LLC
+ */
+
+#include <linux/types.h>
+#include <asm/fpu/api.h>
+#include "xor_impl.h"
+#include "xor_arch.h"
+
+/*
+ * Implementation notes:
+ *
+ * Unrolling by the number of buffers (2-5) is very important.
+ *
+ * Unrolling by length is less important, especially when using register-indexed
+ * addressing with negative indices from the end of the buffers.  That approach
+ * results in just two loop control instructions being needed per iteration,
+ * regardless of the number of buffers.
+ *
+ * In fact, benchmarks showed that the 2 and 3 buffer cases require only 2x
+ * unrolling by length, while the 4 and 5 buffer cases don't require any
+ * unrolling by length.  Benchmarks also showed that the register-indexed
+ * addressing isn't a bottleneck either; i.e., we can't do any better by
+ * incrementing the pointers as we go along, even with more unrolling.
+ */
+
+static void xor_avx512_2(long bytes, u8 *p0, const u8 *p1)
+{
+	long i = -bytes;
+
+	asm volatile("1: vmovdqa64 (%0,%1), %%zmm0\n"
+		     "vmovdqa64 64(%0,%1), %%zmm1\n"
+		     "vpxorq (%0,%2), %%zmm0, %%zmm0\n"
+		     "vpxorq 64(%0,%2), %%zmm1, %%zmm1\n"
+		     "vmovdqa64 %%zmm0, (%0,%1)\n"
+		     "vmovdqa64 %%zmm1, 64(%0,%1)\n"
+		     "add $128, %0\n"
+		     "jnz 1b\n"
+		     : "+&r"(i)
+		     : "r"(p0 + bytes), "r"(p1 + bytes)
+		     : "memory", "cc");
+}
+
+static void xor_avx512_3(long bytes, u8 *p0, const u8 *p1, const u8 *p2)
+{
+	long i = -bytes;
+
+	asm volatile("1: vmovdqa64 (%0,%1), %%zmm0\n"
+		     "vmovdqa64 64(%0,%1), %%zmm1\n"
+		     "vmovdqa64 (%0,%2), %%zmm2\n"
+		     "vmovdqa64 64(%0,%2), %%zmm3\n"
+		     "vpternlogq $0x96, (%0,%3), %%zmm2, %%zmm0\n"
+		     "vpternlogq $0x96, 64(%0,%3), %%zmm3, %%zmm1\n"
+		     "vmovdqa64 %%zmm0, (%0,%1)\n"
+		     "vmovdqa64 %%zmm1, 64(%0,%1)\n"
+		     "add $128, %0\n"
+		     "jnz 1b\n"
+		     : "+&r"(i)
+		     : "r"(p0 + bytes), "r"(p1 + bytes), "r"(p2 + bytes)
+		     : "memory", "cc");
+}
+
+static void xor_avx512_4(long bytes, u8 *p0, const u8 *p1, const u8 *p2,
+			 const u8 *p3)
+{
+	long i = -bytes;
+
+	asm volatile("1: vmovdqa64 (%0,%1), %%zmm0\n"
+		     "vmovdqa64 (%0,%2), %%zmm1\n"
+		     "vpxorq (%0,%3), %%zmm0, %%zmm0\n"
+		     "vpternlogq $0x96, (%0,%4), %%zmm1, %%zmm0\n"
+		     "vmovdqa64 %%zmm0, (%0,%1)\n"
+		     "add $64, %0\n"
+		     "jnz 1b\n"
+		     : "+&r"(i)
+		     : "r"(p0 + bytes), "r"(p1 + bytes), "r"(p2 + bytes),
+		       "r"(p3 + bytes)
+		     : "memory", "cc");
+}
+
+static void xor_avx512_5(long bytes, u8 *p0, const u8 *p1, const u8 *p2,
+			 const u8 *p3, const u8 *p4)
+{
+	long i = -bytes;
+
+	asm volatile("1: vmovdqa64 (%0,%1), %%zmm0\n"
+		     "vmovdqa64 (%0,%2), %%zmm1\n"
+		     "vpternlogq $0x96, (%0,%3), %%zmm1, %%zmm0\n"
+		     "vmovdqa64 (%0,%4), %%zmm1\n"
+		     "vpternlogq $0x96, (%0,%5), %%zmm1, %%zmm0\n"
+		     "vmovdqa64 %%zmm0, (%0,%1)\n"
+		     "add $64, %0\n"
+		     "jnz 1b\n"
+		     : "+&r"(i)
+		     : "r"(p0 + bytes), "r"(p1 + bytes), "r"(p2 + bytes),
+		       "r"(p3 + bytes), "r"(p4 + bytes)
+		     : "memory", "cc");
+}
+
+DO_XOR_BLOCKS(avx512_inner, xor_avx512_2, xor_avx512_3, xor_avx512_4,
+	      xor_avx512_5);
+
+/*
+ * Preconditions: bytes is a nonzero multiple of 512, and all buffers are
+ * 64-byte aligned.
+ */
+static void xor_gen_avx512(void *dest, void **srcs, unsigned int src_cnt,
+			   unsigned int bytes)
+{
+	kernel_fpu_begin();
+	xor_gen_avx512_inner(dest, srcs, src_cnt, bytes);
+	kernel_fpu_end();
+}
+
+struct xor_block_template xor_block_avx512 = {
+	.name = "avx512",
+	.xor_gen = xor_gen_avx512,
+};
diff --git a/lib/raid/xor/x86/xor_arch.h b/lib/raid/xor/x86/xor_arch.h
index 99fe85a213c6..b5d49376fc97 100644
--- a/lib/raid/xor/x86/xor_arch.h
+++ b/lib/raid/xor/x86/xor_arch.h
@@ -4,26 +4,30 @@
 extern struct xor_block_template xor_block_pII_mmx;
 extern struct xor_block_template xor_block_p5_mmx;
 extern struct xor_block_template xor_block_sse;
 extern struct xor_block_template xor_block_sse_pf64;
 extern struct xor_block_template xor_block_avx;
+extern struct xor_block_template xor_block_avx512;
 
-/*
- * When SSE is available, use it as it can write around L2.  We may also be able
- * to load into the L1 only depending on how the cpu deals with a load to a line
- * that is being prefetched.
- *
- * When AVX2 is available, force using it as it is better by all measures.
- *
- * 32-bit without MMX can fall back to the generic routines.
- */
 static __always_inline void __init arch_xor_init(void)
 {
-	if (boot_cpu_has(X86_FEATURE_AVX) &&
-	    boot_cpu_has(X86_FEATURE_OSXSAVE)) {
+	if (IS_ENABLED(CONFIG_X86_64) && boot_cpu_has(X86_FEATURE_AVX512F) &&
+	    boot_cpu_has(X86_FEATURE_OSXSAVE) &&
+	    !boot_cpu_has(X86_FEATURE_PREFER_YMM)) {
+		/* AVX-512 will be the best; no need to try others. */
+		/* !PREFER_YMM excludes CPUs with overly-eager downclocking. */
+		xor_force(&xor_block_avx512);
+	} else if (boot_cpu_has(X86_FEATURE_AVX) &&
+		   boot_cpu_has(X86_FEATURE_OSXSAVE)) {
+		/* AVX will be the best; no need to try others. */
 		xor_force(&xor_block_avx);
 	} else if (IS_ENABLED(CONFIG_X86_64) || boot_cpu_has(X86_FEATURE_XMM)) {
+		/*
+		 * When SSE is available, use it as it can write around L2.  We
+		 * may also be able to load into the L1 only depending on how
+		 * the cpu deals with a load to a line that is being prefetched.
+		 */
 		xor_register(&xor_block_sse);
 		xor_register(&xor_block_sse_pf64);
 	} else if (boot_cpu_has(X86_FEATURE_MMX)) {
 		xor_register(&xor_block_pII_mmx);
 		xor_register(&xor_block_p5_mmx);

base-commit: 2b07ea76fd28989bde5993532d7a943a6f90e246
-- 
2.54.0


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] lib/raid/xor: x86: Add AVX-512 optimized xor_gen()
  2026-06-14  1:03 [PATCH v2] lib/raid/xor: x86: Add AVX-512 optimized xor_gen() Eric Biggers
@ 2026-06-14  1:23 ` sashiko-bot
  2026-06-14 10:16 ` David Laight
  1 sibling, 0 replies; 6+ messages in thread
From: sashiko-bot @ 2026-06-14  1:23 UTC (permalink / raw)
  To: Eric Biggers; +Cc: linux-raid, yukuai

Thank you for your contribution! Sashiko AI review found 1 potential issue(s) to consider:
- [High] The AVX-512 XOR implementation relies solely on CPUID hardware checks (`boot_cpu_has(X86_FEATURE_AVX512F)`) and skips verifying if the OS actually enabled AVX-512 state management in the XCR0 register, risking a #UD exception.
--

commit d9094ea6909535dc1fa7dcdb6976401cdf1f90e8
Author: Eric Biggers <ebiggers@kernel.org>

lib/raid/xor: x86: Add AVX-512 optimized xor_gen()

This commit adds an AVX-512 optimized implementation of xor_gen() using
512-bit vectors and the vpternlogq instruction. The optimization is enabled
on x86_64 CPUs with AVX512F support that do not have overly-eager
downclocking (!PREFER_YMM). The author intentionally omitted the
cpu_has_xfeatures() check to avoid build issues on User-Mode Linux.

> diff --git a/lib/raid/xor/x86/xor_arch.h b/lib/raid/xor/x86/xor_arch.h
> index 99fe85a213c66..b5d49376fc974 100644
> --- a/lib/raid/xor/x86/xor_arch.h
> +++ b/lib/raid/xor/x86/xor_arch.h
[ ... ]
>  static __always_inline void __init arch_xor_init(void)
>  {
> -	if (boot_cpu_has(X86_FEATURE_AVX) &&
> -	    boot_cpu_has(X86_FEATURE_OSXSAVE)) {
> +	if (IS_ENABLED(CONFIG_X86_64) && boot_cpu_has(X86_FEATURE_AVX512F) &&
> +	    boot_cpu_has(X86_FEATURE_OSXSAVE) &&
> +	    !boot_cpu_has(X86_FEATURE_PREFER_YMM)) {

[Severity: High]
Could omitting the cpu_has_xfeatures() check in arch_xor_init() result in a
#UD exception?

While CR4.OSXSAVE indicates that XCR0 is enabled, it does not guarantee that
the OS or hypervisor actually enabled the AVX-512 state components (OPMASK,
ZMM_Hi256, Hi16_ZMM) in XCR0. 

If a hypervisor advertises AVX-512 CPUID bits but masks out the corresponding
XCR0 bits, executing EVEX-encoded AVX-512 instructions could trigger an
Invalid Opcode exception and cause a kernel panic.

> +		/* AVX-512 will be the best; no need to try others. */
> +		/* !PREFER_YMM excludes CPUs with overly-eager downclocking. */
> +		xor_force(&xor_block_avx512);
> +	} else if (boot_cpu_has(X86_FEATURE_AVX) &&
> +		   boot_cpu_has(X86_FEATURE_OSXSAVE)) {
> +		/* AVX will be the best; no need to try others. */
>  		xor_force(&xor_block_avx);

-- 
Sashiko AI review · https://sashiko.dev/#/patchset/20260614010357.69416-1-ebiggers@kernel.org?part=1

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] lib/raid/xor: x86: Add AVX-512 optimized xor_gen()
  2026-06-14  1:03 [PATCH v2] lib/raid/xor: x86: Add AVX-512 optimized xor_gen() Eric Biggers
  2026-06-14  1:23 ` sashiko-bot
@ 2026-06-14 10:16 ` David Laight
  2026-06-15 18:44   ` Eric Biggers
  1 sibling, 1 reply; 6+ messages in thread
From: David Laight @ 2026-06-14 10:16 UTC (permalink / raw)
  To: Eric Biggers
  Cc: Andrew Morton, linux-kernel, Christoph Hellwig, linux-crypto, x86,
	linux-raid

On Sat, 13 Jun 2026 18:03:57 -0700
Eric Biggers <ebiggers@kernel.org> wrote:

> Add an implementation of xor_gen() using AVX-512.
> 
> It uses 512-bit vectors, i.e. ZMM registers.  It also uses the
> vpternlogq instruction to do three-input XORs when applicable.
> 
> It's enabled on x86_64 CPUs that have AVX512F && !PREFER_YMM.  In
> practice that means:
> 
>     - AMD Zen 4 and later (client and server)

Doesn't zen4 only have a 256bit bus between the cpu and cache?
So avx512 reads take two clocks.
Since this is memory limited it is unlikely to run faster than the
avx256 version.
OTOH if it doesn't cause down-clocking as well then it won't be slower.

>     - Intel Sapphire Rapids and later (server)
>     - Intel Rocket Lake (client)
>     - Intel Nova Lake and later (client)
> 
> The !PREFER_YMM condition excludes the older AVX-512 implementations in
> Intel Skylake Server and Intel Ice Lake.  They could run this code, but
> they're known to have overly-eager downclocking when ZMM registers are
> used.  This is the same policy that the crypto and CRC code uses.
> 
> Benchmark on AMD Ryzen 9 9950X (Zen 5):
> 
>     src_cnt    avx          avx512       Improvement
>     =======    ==========   ==========   ===========
>     1          56353 MB/s   75388 MB/s   33%
>     2          54274 MB/s   68409 MB/s   26%
>     3          44649 MB/s   64042 MB/s   43%
>     4          41315 MB/s   55002 MB/s   33%
> 
> Note: for now I omitted the cpu_has_xfeatures() check that the AVX-512
> optimized crypto and CRC code does, since it's not implemented on
> User-Mode Linux and it's never been present in the RAID6 code either.
> 
> Signed-off-by: Eric Biggers <ebiggers@kernel.org>

Since I suggested it :-)

Reviewed-By: David Laight <david.laight.linux@gmail.com>

Some 'not very important' comments:

I did wonder whether moving the loop into the asm() would help.
gcc has a nasty habit of pessimising loops when you try to be clever.
It is certainly safer for tight loops like these.
That does have the side effect of making p0 be %1 which doesn't improve
readability. Either use named parameters or possibly just change p0 to p1 (etc)
so they match.

The code should be limited by the memory reads, so the 3-argument xor and
the interleave of the unroll may make no difference.

Some cpu do have constraints on the cache alignment in order to do two
reads per clock, but I've forgotten them and they got better before AVX-512.
If that were affecting this code (on the tested cpu) then I'd expect the
interleaved unroll would improve the _4 and _5 functions.
So it probably doesn't affect this code.

Using the same loop for the avx-256 and sse (and even smaller) functions could
well generate code that runs 'pretty much as fast as possible' on older cpu.
Intel cpu (going back to Sandy bridge) are likely to execute the loop in the
same number of clocks - but clearly copying half or a quarter of the data.
But I've no experience of zen1.

Might be worth doing for avx-256, does anyone care about anything older :-)

	David


> ---
> 
> Changed in v2:
>     - Fixed build on UML
>     - Reworked the implementation
> 
>  lib/raid/xor/Makefile         |   2 +-
>  lib/raid/xor/x86/xor-avx512.c | 121 ++++++++++++++++++++++++++++++++++
>  lib/raid/xor/x86/xor_arch.h   |  26 ++++----
>  3 files changed, 137 insertions(+), 12 deletions(-)
>  create mode 100644 lib/raid/xor/x86/xor-avx512.c
> 
> diff --git a/lib/raid/xor/Makefile b/lib/raid/xor/Makefile
> index 4d633dfd5b90..4af945861a51 100644
> --- a/lib/raid/xor/Makefile
> +++ b/lib/raid/xor/Makefile
> @@ -26,11 +26,11 @@ xor-$(CONFIG_ALTIVEC)		+= powerpc/xor_vmx.o powerpc/xor_vmx_glue.o
>  xor-$(CONFIG_RISCV_ISA_V)	+= riscv/xor.o riscv/xor-glue.o
>  xor-$(CONFIG_SPARC32)		+= sparc/xor-sparc32.o
>  xor-$(CONFIG_SPARC64)		+= sparc/xor-sparc64.o sparc/xor-sparc64-glue.o
>  xor-$(CONFIG_S390)		+= s390/xor.o
>  xor-$(CONFIG_X86_32)		+= x86/xor-avx.o x86/xor-sse.o x86/xor-mmx.o
> -xor-$(CONFIG_X86_64)		+= x86/xor-avx.o x86/xor-sse.o
> +xor-$(CONFIG_X86_64)		+= x86/xor-avx.o x86/xor-sse.o x86/xor-avx512.o
>  obj-y				+= tests/
>  
>  CFLAGS_arm/xor-neon.o		+= $(CC_FLAGS_FPU)
>  CFLAGS_REMOVE_arm/xor-neon.o	+= $(CC_FLAGS_NO_FPU)
>  
> diff --git a/lib/raid/xor/x86/xor-avx512.c b/lib/raid/xor/x86/xor-avx512.c
> new file mode 100644
> index 000000000000..87b981d74c90
> --- /dev/null
> +++ b/lib/raid/xor/x86/xor-avx512.c
> @@ -0,0 +1,121 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +/*
> + * AVX-512 optimized implementation of xor_gen()
> + *
> + * Copyright 2026 Google LLC
> + */
> +
> +#include <linux/types.h>
> +#include <asm/fpu/api.h>
> +#include "xor_impl.h"
> +#include "xor_arch.h"
> +
> +/*
> + * Implementation notes:
> + *
> + * Unrolling by the number of buffers (2-5) is very important.
> + *
> + * Unrolling by length is less important, especially when using register-indexed
> + * addressing with negative indices from the end of the buffers.  That approach
> + * results in just two loop control instructions being needed per iteration,
> + * regardless of the number of buffers.
> + *
> + * In fact, benchmarks showed that the 2 and 3 buffer cases require only 2x
> + * unrolling by length, while the 4 and 5 buffer cases don't require any
> + * unrolling by length.  Benchmarks also showed that the register-indexed
> + * addressing isn't a bottleneck either; i.e., we can't do any better by
> + * incrementing the pointers as we go along, even with more unrolling.
> + */
> +
> +static void xor_avx512_2(long bytes, u8 *p0, const u8 *p1)
> +{
> +	long i = -bytes;
> +
> +	asm volatile("1: vmovdqa64 (%0,%1), %%zmm0\n"
> +		     "vmovdqa64 64(%0,%1), %%zmm1\n"
> +		     "vpxorq (%0,%2), %%zmm0, %%zmm0\n"
> +		     "vpxorq 64(%0,%2), %%zmm1, %%zmm1\n"
> +		     "vmovdqa64 %%zmm0, (%0,%1)\n"
> +		     "vmovdqa64 %%zmm1, 64(%0,%1)\n"
> +		     "add $128, %0\n"
> +		     "jnz 1b\n"
> +		     : "+&r"(i)
> +		     : "r"(p0 + bytes), "r"(p1 + bytes)
> +		     : "memory", "cc");
> +}
> +
> +static void xor_avx512_3(long bytes, u8 *p0, const u8 *p1, const u8 *p2)
> +{
> +	long i = -bytes;
> +
> +	asm volatile("1: vmovdqa64 (%0,%1), %%zmm0\n"
> +		     "vmovdqa64 64(%0,%1), %%zmm1\n"
> +		     "vmovdqa64 (%0,%2), %%zmm2\n"
> +		     "vmovdqa64 64(%0,%2), %%zmm3\n"
> +		     "vpternlogq $0x96, (%0,%3), %%zmm2, %%zmm0\n"
> +		     "vpternlogq $0x96, 64(%0,%3), %%zmm3, %%zmm1\n"
> +		     "vmovdqa64 %%zmm0, (%0,%1)\n"
> +		     "vmovdqa64 %%zmm1, 64(%0,%1)\n"
> +		     "add $128, %0\n"
> +		     "jnz 1b\n"
> +		     : "+&r"(i)
> +		     : "r"(p0 + bytes), "r"(p1 + bytes), "r"(p2 + bytes)
> +		     : "memory", "cc");
> +}
> +
> +static void xor_avx512_4(long bytes, u8 *p0, const u8 *p1, const u8 *p2,
> +			 const u8 *p3)
> +{
> +	long i = -bytes;
> +
> +	asm volatile("1: vmovdqa64 (%0,%1), %%zmm0\n"
> +		     "vmovdqa64 (%0,%2), %%zmm1\n"
> +		     "vpxorq (%0,%3), %%zmm0, %%zmm0\n"
> +		     "vpternlogq $0x96, (%0,%4), %%zmm1, %%zmm0\n"
> +		     "vmovdqa64 %%zmm0, (%0,%1)\n"
> +		     "add $64, %0\n"
> +		     "jnz 1b\n"
> +		     : "+&r"(i)
> +		     : "r"(p0 + bytes), "r"(p1 + bytes), "r"(p2 + bytes),
> +		       "r"(p3 + bytes)
> +		     : "memory", "cc");
> +}
> +
> +static void xor_avx512_5(long bytes, u8 *p0, const u8 *p1, const u8 *p2,
> +			 const u8 *p3, const u8 *p4)
> +{
> +	long i = -bytes;
> +
> +	asm volatile("1: vmovdqa64 (%0,%1), %%zmm0\n"
> +		     "vmovdqa64 (%0,%2), %%zmm1\n"
> +		     "vpternlogq $0x96, (%0,%3), %%zmm1, %%zmm0\n"
> +		     "vmovdqa64 (%0,%4), %%zmm1\n"
> +		     "vpternlogq $0x96, (%0,%5), %%zmm1, %%zmm0\n"
> +		     "vmovdqa64 %%zmm0, (%0,%1)\n"
> +		     "add $64, %0\n"
> +		     "jnz 1b\n"
> +		     : "+&r"(i)
> +		     : "r"(p0 + bytes), "r"(p1 + bytes), "r"(p2 + bytes),
> +		       "r"(p3 + bytes), "r"(p4 + bytes)
> +		     : "memory", "cc");
> +}
> +
> +DO_XOR_BLOCKS(avx512_inner, xor_avx512_2, xor_avx512_3, xor_avx512_4,
> +	      xor_avx512_5);
> +
> +/*
> + * Preconditions: bytes is a nonzero multiple of 512, and all buffers are
> + * 64-byte aligned.
> + */
> +static void xor_gen_avx512(void *dest, void **srcs, unsigned int src_cnt,
> +			   unsigned int bytes)
> +{
> +	kernel_fpu_begin();
> +	xor_gen_avx512_inner(dest, srcs, src_cnt, bytes);
> +	kernel_fpu_end();
> +}
> +
> +struct xor_block_template xor_block_avx512 = {
> +	.name = "avx512",
> +	.xor_gen = xor_gen_avx512,
> +};
> diff --git a/lib/raid/xor/x86/xor_arch.h b/lib/raid/xor/x86/xor_arch.h
> index 99fe85a213c6..b5d49376fc97 100644
> --- a/lib/raid/xor/x86/xor_arch.h
> +++ b/lib/raid/xor/x86/xor_arch.h
> @@ -4,26 +4,30 @@
>  extern struct xor_block_template xor_block_pII_mmx;
>  extern struct xor_block_template xor_block_p5_mmx;
>  extern struct xor_block_template xor_block_sse;
>  extern struct xor_block_template xor_block_sse_pf64;
>  extern struct xor_block_template xor_block_avx;
> +extern struct xor_block_template xor_block_avx512;
>  
> -/*
> - * When SSE is available, use it as it can write around L2.  We may also be able
> - * to load into the L1 only depending on how the cpu deals with a load to a line
> - * that is being prefetched.
> - *
> - * When AVX2 is available, force using it as it is better by all measures.
> - *
> - * 32-bit without MMX can fall back to the generic routines.
> - */
>  static __always_inline void __init arch_xor_init(void)
>  {
> -	if (boot_cpu_has(X86_FEATURE_AVX) &&
> -	    boot_cpu_has(X86_FEATURE_OSXSAVE)) {
> +	if (IS_ENABLED(CONFIG_X86_64) && boot_cpu_has(X86_FEATURE_AVX512F) &&
> +	    boot_cpu_has(X86_FEATURE_OSXSAVE) &&
> +	    !boot_cpu_has(X86_FEATURE_PREFER_YMM)) {
> +		/* AVX-512 will be the best; no need to try others. */
> +		/* !PREFER_YMM excludes CPUs with overly-eager downclocking. */
> +		xor_force(&xor_block_avx512);
> +	} else if (boot_cpu_has(X86_FEATURE_AVX) &&
> +		   boot_cpu_has(X86_FEATURE_OSXSAVE)) {
> +		/* AVX will be the best; no need to try others. */
>  		xor_force(&xor_block_avx);
>  	} else if (IS_ENABLED(CONFIG_X86_64) || boot_cpu_has(X86_FEATURE_XMM)) {
> +		/*
> +		 * When SSE is available, use it as it can write around L2.  We
> +		 * may also be able to load into the L1 only depending on how
> +		 * the cpu deals with a load to a line that is being prefetched.
> +		 */
>  		xor_register(&xor_block_sse);
>  		xor_register(&xor_block_sse_pf64);
>  	} else if (boot_cpu_has(X86_FEATURE_MMX)) {
>  		xor_register(&xor_block_pII_mmx);
>  		xor_register(&xor_block_p5_mmx);
> 
> base-commit: 2b07ea76fd28989bde5993532d7a943a6f90e246


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] lib/raid/xor: x86: Add AVX-512 optimized xor_gen()
  2026-06-14 10:16 ` David Laight
@ 2026-06-15 18:44   ` Eric Biggers
  2026-06-15 22:57     ` David Laight
  2026-06-17  5:52     ` Christoph Hellwig
  0 siblings, 2 replies; 6+ messages in thread
From: Eric Biggers @ 2026-06-15 18:44 UTC (permalink / raw)
  To: David Laight
  Cc: Andrew Morton, linux-kernel, Christoph Hellwig, linux-crypto, x86,
	linux-raid

On Sun, Jun 14, 2026 at 11:16:28AM +0100, David Laight wrote:
> On Sat, 13 Jun 2026 18:03:57 -0700
> Eric Biggers <ebiggers@kernel.org> wrote:
> 
> > Add an implementation of xor_gen() using AVX-512.
> > 
> > It uses 512-bit vectors, i.e. ZMM registers.  It also uses the
> > vpternlogq instruction to do three-input XORs when applicable.
> > 
> > It's enabled on x86_64 CPUs that have AVX512F && !PREFER_YMM.  In
> > practice that means:
> > 
> >     - AMD Zen 4 and later (client and server)
> 
> Doesn't zen4 only have a 256bit bus between the cpu and cache?
> So avx512 reads take two clocks.
> Since this is memory limited it is unlikely to run faster than the
> avx256 version.

On AMD Genoa (Zen 4 server processor), the AVX-512 code added by this
patch is indeed about the same speed as the existing AVX-2 code.

> OTOH if it doesn't cause down-clocking as well then it won't be slower.

Yes, as far as I know that's not an issue on AMD processors, even Zen 4.
The "avoid AVX-512 due to downclocking" rule is historical guidance for
Intel processors that had a bad implementation of AVX-512.  There's no
reason to exclude Zen 4 from executing AVX-512 optimized code.  At worst
it will just be the same, as we're seeing here.

> Since I suggested it :-)
> 
> Reviewed-By: David Laight <david.laight.linux@gmail.com>
> 
> Some 'not very important' comments:
> 
> I did wonder whether moving the loop into the asm() would help.
> gcc has a nasty habit of pessimising loops when you try to be clever.
> It is certainly safer for tight loops like these.

I originally tried leaving the loops to the compiler, but gcc unrolled
the 1x ones by 2x, despite it having no visibility into the asm block.
That broke the intent with the indexed addressing, since to achieve the
unrolling it generated code that incremented the pointers.

So I just ended up moving the loop to the asm, which reliably gives us
the code we want.

> That does have the side effect of making p0 be %1 which doesn't improve
> readability. Either use named parameters or possibly just change p0 to p1 (etc)
> so they match.
> 
> The code should be limited by the memory reads, so the 3-argument xor and
> the interleave of the unroll may make no difference.

The unroll by 2x in the 2 and 3-buffer cases helped a little bit on
Sapphire Rapids.  I don't know exactly why, but it makes sense that
those cases are where the loop overhead is most likely to matter.

> Some cpu do have constraints on the cache alignment in order to do two
> reads per clock, but I've forgotten them and they got better before AVX-512.
> If that were affecting this code (on the tested cpu) then I'd expect the
> interleaved unroll would improve the _4 and _5 functions.
> So it probably doesn't affect this code.

The buffers are always 64-byte aligned here, as documented.

> Using the same loop for the avx-256 and sse (and even smaller) functions could
> well generate code that runs 'pretty much as fast as possible' on older cpu.
> Intel cpu (going back to Sandy bridge) are likely to execute the loop in the
> same number of clocks - but clearly copying half or a quarter of the data.
> But I've no experience of zen1.
> 
> Might be worth doing for avx-256, does anyone care about anything older :-)

Yes, the existing AVX code is probably excessively unrolled.  It
generates almost 4 KiB of code.

- Eric

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] lib/raid/xor: x86: Add AVX-512 optimized xor_gen()
  2026-06-15 18:44   ` Eric Biggers
@ 2026-06-15 22:57     ` David Laight
  2026-06-17  5:52     ` Christoph Hellwig
  1 sibling, 0 replies; 6+ messages in thread
From: David Laight @ 2026-06-15 22:57 UTC (permalink / raw)
  To: Eric Biggers
  Cc: Andrew Morton, linux-kernel, Christoph Hellwig, linux-crypto, x86,
	linux-raid

On Mon, 15 Jun 2026 11:44:35 -0700
Eric Biggers <ebiggers@kernel.org> wrote:

> On Sun, Jun 14, 2026 at 11:16:28AM +0100, David Laight wrote:
> > On Sat, 13 Jun 2026 18:03:57 -0700
> > Eric Biggers <ebiggers@kernel.org> wrote:
...
> > Some 'not very important' comments:
> > 
> > I did wonder whether moving the loop into the asm() would help.
> > gcc has a nasty habit of pessimising loops when you try to be clever.
> > It is certainly safer for tight loops like these.  
> 
> I originally tried leaving the loops to the compiler, but gcc unrolled
> the 1x ones by 2x, despite it having no visibility into the asm block.
> That broke the intent with the indexed addressing, since to achieve the
> unrolling it generated code that incremented the pointers.

I did suspect that might happen.

> So I just ended up moving the loop to the asm, which reliably gives us
> the code we want.

Yep...

...
> > The code should be limited by the memory reads, so the 3-argument xor and
> > the interleave of the unroll may make no difference.  
> 
> The unroll by 2x in the 2 and 3-buffer cases helped a little bit on
> Sapphire Rapids.  I don't know exactly why, but it makes sense that
> those cases are where the loop overhead is most likely to matter.

Each iteration does 2 (or 3) reads and a write.
The cpu can do two reads and a write every clock.
However Intel cpu can only execute a branch every other clock,
so the shortest loop is two clocks.
That means you need to unroll once to keep the memory logic busy.

The zen5 seems to be able to execute 1-clock loops, so wouldn't need
the unroll.

> > Some cpu do have constraints on the cache alignment in order to do two
> > reads per clock, but I've forgotten them and they got better before AVX-512.
> > If that were affecting this code (on the tested cpu) then I'd expect the
> > interleaved unroll would improve the _4 and _5 functions.
> > So it probably doesn't affect this code.  
> 
> The buffers are always 64-byte aligned here, as documented.

It is all more complex than that.
Whether you can do two reads/clock depends on whether the reads manage to
avoid needing the same buffers (etc) in the cache logic.
For instance it might not work if the addresses differ by the size of the
cache (one of Agner's books might have the answer).
(It was pretty hard to get two reads/clock on Sandy Bridge.)

Then there are some really strange effects.
On zen5 (at least on the one I've got) 'rep movsb' is very slow (setup and copy)
if (IIRC) (%di - %si) mod 4k is between 1 and 127.
The only other alignment that makes much difference is 64byte aligning %di (which
doubles throughput).

-- David

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] lib/raid/xor: x86: Add AVX-512 optimized xor_gen()
  2026-06-15 18:44   ` Eric Biggers
  2026-06-15 22:57     ` David Laight
@ 2026-06-17  5:52     ` Christoph Hellwig
  1 sibling, 0 replies; 6+ messages in thread
From: Christoph Hellwig @ 2026-06-17  5:52 UTC (permalink / raw)
  To: Eric Biggers
  Cc: David Laight, Andrew Morton, linux-kernel, Christoph Hellwig,
	linux-crypto, x86, linux-raid

On Mon, Jun 15, 2026 at 11:44:35AM -0700, Eric Biggers wrote:
> > Doesn't zen4 only have a 256bit bus between the cpu and cache?
> > So avx512 reads take two clocks.
> > Since this is memory limited it is unlikely to run faster than the
> > avx256 version.
> 
> On AMD Genoa (Zen 4 server processor), the AVX-512 code added by this
> patch is indeed about the same speed as the existing AVX-2 code.

The same is true for Zen 5 mobile which has the same AVX-512 limitations.
I don't think it's the bus width, but I'll leave the details to the
experts.

> 
> > OTOH if it doesn't cause down-clocking as well then it won't be slower.
> 
> Yes, as far as I know that's not an issue on AMD processors, even Zen 4.
> The "avoid AVX-512 due to downclocking" rule is historical guidance for
> Intel processors that had a bad implementation of AVX-512.  There's no
> reason to exclude Zen 4 from executing AVX-512 optimized code.  At worst
> it will just be the same, as we're seeing here.

It does not cause down clocking.  But for some of the more complicated
code I've seen AVX512 being significantly slower than AVX2 on these.
So we need to watch out and not automatically assume AVX512 is faster.


^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2026-06-17  5:52 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz follow: Atom feed)
-- links below jump to the message on this page --
2026-06-14  1:03 [PATCH v2] lib/raid/xor: x86: Add AVX-512 optimized xor_gen() Eric Biggers
2026-06-14  1:23 ` sashiko-bot
2026-06-14 10:16 ` David Laight
2026-06-15 18:44   ` Eric Biggers
2026-06-15 22:57     ` David Laight
2026-06-17  5:52     ` Christoph Hellwig

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox