[PATCH v3] lib/raid/xor: x86: Add AVX-512 optimized xor_gen()

Linux RAID subsystem development
 help / color / mirror / Atom feed

* [PATCH v3] lib/raid/xor: x86: Add AVX-512 optimized xor_gen()
@ 2026-06-15 19:03 Eric Biggers
  2026-06-15 19:25 ` sashiko-bot
  2026-06-15 20:10 ` Eric Biggers
  0 siblings, 2 replies; 5+ messages in thread
From: Eric Biggers @ 2026-06-15 19:03 UTC (permalink / raw)
  To: Andrew Morton, linux-kernel
  Cc: Christoph Hellwig, linux-crypto, x86, Eric Biggers, David Laight,
	linux-raid

Add an implementation of xor_gen() using AVX-512.

It uses 512-bit vectors, i.e. ZMM registers.  It also uses the
vpternlogq instruction to do three-input XORs when applicable.

It's enabled on x86_64 CPUs that have AVX512F && !PREFER_YMM.  In
practice that means:

    - AMD Zen 4 and later (client and server)
    - Intel Sapphire Rapids and later (server)
    - Intel Rocket Lake (client)
    - Intel Nova Lake and later (client)

The !PREFER_YMM condition excludes the older AVX-512 implementations in
Intel Skylake Server and Intel Ice Lake.  They could run this code, but
they're known to have overly-eager downclocking when ZMM registers are
used.  This is the same policy that the crypto and CRC code uses.

Benchmark on AMD Ryzen 9 9950X (Zen 5):

    src_cnt    avx          avx512       Improvement
    =======    ==========   ==========   ===========
    1          56353 MB/s   75388 MB/s   33%
    2          54274 MB/s   68409 MB/s   26%
    3          44649 MB/s   64042 MB/s   43%
    4          41315 MB/s   55002 MB/s   33%

Note: for now I omitted the cpu_has_xfeatures() check that the AVX-512
optimized crypto and CRC code does, since it's not implemented on
User-Mode Linux and it's never been present in the RAID6 code either.

Reviewed-by: David Laight <david.laight.linux@gmail.com>
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---

Changed in v3:
    - Renamed p0-p4 to p1-p5 to match the inline asm indices
    - Swapped the base and index registers to the logical order
    - Added David's Reviewed-by

Changed in v2:
    - Fixed build on UML
    - Reworked the implementation

 lib/raid/xor/Makefile         |   2 +-
 lib/raid/xor/x86/xor-avx512.c | 121 ++++++++++++++++++++++++++++++++++
 lib/raid/xor/x86/xor_arch.h   |  26 ++++----
 3 files changed, 137 insertions(+), 12 deletions(-)
 create mode 100644 lib/raid/xor/x86/xor-avx512.c

diff --git a/lib/raid/xor/Makefile b/lib/raid/xor/Makefile
index 4d633dfd5b90..4af945861a51 100644
--- a/lib/raid/xor/Makefile
+++ b/lib/raid/xor/Makefile
@@ -26,11 +26,11 @@ xor-$(CONFIG_ALTIVEC)		+= powerpc/xor_vmx.o powerpc/xor_vmx_glue.o
 xor-$(CONFIG_RISCV_ISA_V)	+= riscv/xor.o riscv/xor-glue.o
 xor-$(CONFIG_SPARC32)		+= sparc/xor-sparc32.o
 xor-$(CONFIG_SPARC64)		+= sparc/xor-sparc64.o sparc/xor-sparc64-glue.o
 xor-$(CONFIG_S390)		+= s390/xor.o
 xor-$(CONFIG_X86_32)		+= x86/xor-avx.o x86/xor-sse.o x86/xor-mmx.o
-xor-$(CONFIG_X86_64)		+= x86/xor-avx.o x86/xor-sse.o
+xor-$(CONFIG_X86_64)		+= x86/xor-avx.o x86/xor-sse.o x86/xor-avx512.o
 obj-y				+= tests/
 
 CFLAGS_arm/xor-neon.o		+= $(CC_FLAGS_FPU)
 CFLAGS_REMOVE_arm/xor-neon.o	+= $(CC_FLAGS_NO_FPU)
 
diff --git a/lib/raid/xor/x86/xor-avx512.c b/lib/raid/xor/x86/xor-avx512.c
new file mode 100644
index 000000000000..17f57900d827
--- /dev/null
+++ b/lib/raid/xor/x86/xor-avx512.c
@@ -0,0 +1,121 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * AVX-512 optimized implementation of xor_gen()
+ *
+ * Copyright 2026 Google LLC
+ */
+
+#include <linux/types.h>
+#include <asm/fpu/api.h>
+#include "xor_impl.h"
+#include "xor_arch.h"
+
+/*
+ * Implementation notes:
+ *
+ * Unrolling by the number of buffers (2-5) is very important.
+ *
+ * Unrolling by length is less important, especially when using register-indexed
+ * addressing with negative indices from the end of the buffers.  That approach
+ * results in just two loop control instructions being needed per iteration,
+ * regardless of the number of buffers.
+ *
+ * In fact, benchmarks showed that the 2 and 3 buffer cases require only 2x
+ * unrolling by length, while the 4 and 5 buffer cases don't require any
+ * unrolling by length.  Benchmarks also showed that the register-indexed
+ * addressing isn't a bottleneck either; i.e., we can't do any better by
+ * incrementing the pointers as we go along, even with more unrolling.
+ */
+
+static void xor_avx512_2(long bytes, u8 *p1, const u8 *p2)
+{
+	long i = -bytes;
+
+	asm volatile("1: vmovdqa64 (%1,%0), %%zmm0\n"
+		     "vmovdqa64 64(%1,%0), %%zmm1\n"
+		     "vpxorq (%2,%0), %%zmm0, %%zmm0\n"
+		     "vpxorq 64(%2,%0), %%zmm1, %%zmm1\n"
+		     "vmovdqa64 %%zmm0, (%1,%0)\n"
+		     "vmovdqa64 %%zmm1, 64(%1,%0)\n"
+		     "add $128, %0\n"
+		     "jnz 1b\n"
+		     : "+&r"(i)
+		     : "r"(p1 + bytes), "r"(p2 + bytes)
+		     : "memory", "cc");
+}
+
+static void xor_avx512_3(long bytes, u8 *p1, const u8 *p2, const u8 *p3)
+{
+	long i = -bytes;
+
+	asm volatile("1: vmovdqa64 (%1,%0), %%zmm0\n"
+		     "vmovdqa64 64(%1,%0), %%zmm1\n"
+		     "vmovdqa64 (%2,%0), %%zmm2\n"
+		     "vmovdqa64 64(%2,%0), %%zmm3\n"
+		     "vpternlogq $0x96, (%3,%0), %%zmm2, %%zmm0\n"
+		     "vpternlogq $0x96, 64(%3,%0), %%zmm3, %%zmm1\n"
+		     "vmovdqa64 %%zmm0, (%1,%0)\n"
+		     "vmovdqa64 %%zmm1, 64(%1,%0)\n"
+		     "add $128, %0\n"
+		     "jnz 1b\n"
+		     : "+&r"(i)
+		     : "r"(p1 + bytes), "r"(p2 + bytes), "r"(p3 + bytes)
+		     : "memory", "cc");
+}
+
+static void xor_avx512_4(long bytes, u8 *p1, const u8 *p2, const u8 *p3,
+			 const u8 *p4)
+{
+	long i = -bytes;
+
+	asm volatile("1: vmovdqa64 (%1,%0), %%zmm0\n"
+		     "vmovdqa64 (%2,%0), %%zmm1\n"
+		     "vpxorq (%3,%0), %%zmm0, %%zmm0\n"
+		     "vpternlogq $0x96, (%4,%0), %%zmm1, %%zmm0\n"
+		     "vmovdqa64 %%zmm0, (%1,%0)\n"
+		     "add $64, %0\n"
+		     "jnz 1b\n"
+		     : "+&r"(i)
+		     : "r"(p1 + bytes), "r"(p2 + bytes), "r"(p3 + bytes),
+		       "r"(p4 + bytes)
+		     : "memory", "cc");
+}
+
+static void xor_avx512_5(long bytes, u8 *p1, const u8 *p2, const u8 *p3,
+			 const u8 *p4, const u8 *p5)
+{
+	long i = -bytes;
+
+	asm volatile("1: vmovdqa64 (%1,%0), %%zmm0\n"
+		     "vmovdqa64 (%2,%0), %%zmm1\n"
+		     "vpternlogq $0x96, (%3,%0), %%zmm1, %%zmm0\n"
+		     "vmovdqa64 (%4,%0), %%zmm1\n"
+		     "vpternlogq $0x96, (%5,%0), %%zmm1, %%zmm0\n"
+		     "vmovdqa64 %%zmm0, (%1,%0)\n"
+		     "add $64, %0\n"
+		     "jnz 1b\n"
+		     : "+&r"(i)
+		     : "r"(p1 + bytes), "r"(p2 + bytes), "r"(p3 + bytes),
+		       "r"(p4 + bytes), "r"(p5 + bytes)
+		     : "memory", "cc");
+}
+
+DO_XOR_BLOCKS(avx512_inner, xor_avx512_2, xor_avx512_3, xor_avx512_4,
+	      xor_avx512_5);
+
+/*
+ * Preconditions: bytes is a nonzero multiple of 512, and all buffers are
+ * 64-byte aligned.
+ */
+static void xor_gen_avx512(void *dest, void **srcs, unsigned int src_cnt,
+			   unsigned int bytes)
+{
+	kernel_fpu_begin();
+	xor_gen_avx512_inner(dest, srcs, src_cnt, bytes);
+	kernel_fpu_end();
+}
+
+struct xor_block_template xor_block_avx512 = {
+	.name = "avx512",
+	.xor_gen = xor_gen_avx512,
+};
diff --git a/lib/raid/xor/x86/xor_arch.h b/lib/raid/xor/x86/xor_arch.h
index 99fe85a213c6..b5d49376fc97 100644
--- a/lib/raid/xor/x86/xor_arch.h
+++ b/lib/raid/xor/x86/xor_arch.h
@@ -4,26 +4,30 @@
 extern struct xor_block_template xor_block_pII_mmx;
 extern struct xor_block_template xor_block_p5_mmx;
 extern struct xor_block_template xor_block_sse;
 extern struct xor_block_template xor_block_sse_pf64;
 extern struct xor_block_template xor_block_avx;
+extern struct xor_block_template xor_block_avx512;
 
-/*
- * When SSE is available, use it as it can write around L2.  We may also be able
- * to load into the L1 only depending on how the cpu deals with a load to a line
- * that is being prefetched.
- *
- * When AVX2 is available, force using it as it is better by all measures.
- *
- * 32-bit without MMX can fall back to the generic routines.
- */
 static __always_inline void __init arch_xor_init(void)
 {
-	if (boot_cpu_has(X86_FEATURE_AVX) &&
-	    boot_cpu_has(X86_FEATURE_OSXSAVE)) {
+	if (IS_ENABLED(CONFIG_X86_64) && boot_cpu_has(X86_FEATURE_AVX512F) &&
+	    boot_cpu_has(X86_FEATURE_OSXSAVE) &&
+	    !boot_cpu_has(X86_FEATURE_PREFER_YMM)) {
+		/* AVX-512 will be the best; no need to try others. */
+		/* !PREFER_YMM excludes CPUs with overly-eager downclocking. */
+		xor_force(&xor_block_avx512);
+	} else if (boot_cpu_has(X86_FEATURE_AVX) &&
+		   boot_cpu_has(X86_FEATURE_OSXSAVE)) {
+		/* AVX will be the best; no need to try others. */
 		xor_force(&xor_block_avx);
 	} else if (IS_ENABLED(CONFIG_X86_64) || boot_cpu_has(X86_FEATURE_XMM)) {
+		/*
+		 * When SSE is available, use it as it can write around L2.  We
+		 * may also be able to load into the L1 only depending on how
+		 * the cpu deals with a load to a line that is being prefetched.
+		 */
 		xor_register(&xor_block_sse);
 		xor_register(&xor_block_sse_pf64);
 	} else if (boot_cpu_has(X86_FEATURE_MMX)) {
 		xor_register(&xor_block_pII_mmx);
 		xor_register(&xor_block_p5_mmx);

base-commit: 8cd9520d35a6c38db6567e97dd93b1f11f185dc6
-- 
2.54.0


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH v3] lib/raid/xor: x86: Add AVX-512 optimized xor_gen()
  2026-06-15 19:03 [PATCH v3] lib/raid/xor: x86: Add AVX-512 optimized xor_gen() Eric Biggers
@ 2026-06-15 19:25 ` sashiko-bot
  2026-06-15 20:10 ` Eric Biggers
  1 sibling, 0 replies; 5+ messages in thread
From: sashiko-bot @ 2026-06-15 19:25 UTC (permalink / raw)
  To: Eric Biggers; +Cc: linux-raid, yukuai

Thank you for your contribution! Sashiko AI review found 1 potential issue(s) to consider:
- [High] The AVX-512 XOR implementation is forced based solely on `boot_cpu_has(X86_FEATURE_AVX512F)` without checking if the OS actually manages ZMM registers via `cpu_has_xfeatures()`. This introduces an Undefined Opcode (#UD) kernel panic vulnerability on systems where AVX-512 is supported by the CPU but restricted in XCR0.
--

commit b63aaca8e8b386ef1820d1caac3bce2cb328e8c3
Author: Eric Biggers <ebiggers@kernel.org>

lib/raid/xor: x86: Add AVX-512 optimized xor_gen()

This commit adds an AVX-512 implementation of xor_gen() using 512-bit ZMM
registers and the vpternlogq instruction. It enables this implementation on
x86_64 CPUs that support AVX512F but do not prefer YMM registers, avoiding
known downclocking issues on older architectures. The implementation
explicitly omits the cpu_has_xfeatures() check.

> Note: for now I omitted the cpu_has_xfeatures() check that the AVX-512
> optimized crypto and CRC code does, since it's not implemented on
> User-Mode Linux and it's never been present in the RAID6 code either.

[Severity: High]
Does omitting the cpu_has_xfeatures() check introduce a potential Invalid
Opcode exception in virtualized environments?

If a hypervisor advertises AVX512F in CPUID but omits the AVX-512 FPU
extended state from XSTATE, boot_cpu_has(X86_FEATURE_AVX512F) will evaluate
to true, but the OS will not enable AVX-512 in XCR0. Executing 512-bit
EVEX instructions without XCR0[7:5] enabled causes an Invalid Opcode
exception and kernel panic.

Would it be safer to preserve this check, perhaps providing a stub or
using ifdef for User-Mode Linux compatibility instead?

> diff --git a/lib/raid/xor/x86/xor_arch.h b/lib/raid/xor/x86/xor_arch.h
> index 99fe85a213c66..b5d49376fc974 100644
> --- a/lib/raid/xor/x86/xor_arch.h
> +++ b/lib/raid/xor/x86/xor_arch.h
[ ... ]
> @@ -6,22 +6,26 @@ extern struct xor_block_template xor_block_p5_mmx;
>  extern struct xor_block_template xor_block_sse;
>  extern struct xor_block_template xor_block_sse_pf64;
>  extern struct xor_block_template xor_block_avx;
> +extern struct xor_block_template xor_block_avx512;
>  
>  static __always_inline void __init arch_xor_init(void)
>  {
> -	if (boot_cpu_has(X86_FEATURE_AVX) &&
> -	    boot_cpu_has(X86_FEATURE_OSXSAVE)) {
> +	if (IS_ENABLED(CONFIG_X86_64) && boot_cpu_has(X86_FEATURE_AVX512F) &&
> +	    boot_cpu_has(X86_FEATURE_OSXSAVE) &&
> +	    !boot_cpu_has(X86_FEATURE_PREFER_YMM)) {
> +		/* AVX-512 will be the best; no need to try others. */
> +		/* !PREFER_YMM excludes CPUs with overly-eager downclocking. */
> +		xor_force(&xor_block_avx512);

[Severity: High]
Can this force the AVX-512 implementation on systems that do not actually
support ZMM registers in the OS?

During boot, fpu__init_system_xstate() limits fpu_kernel_cfg.max_features
based on the XSTATE leaf, but does not clear X86_FEATURE_AVX512F from
boot_cpu_data.x86_capability. As a result, this check passes and forces
xor_block_avx512, which then attempts to execute AVX-512 instructions on
an unsupported FPU context.

Is it necessary to include cpu_has_xfeatures(XFEATURE_MASK_AVX512, NULL)
here to verify the FPU context supports these instructions before forcing
the block?

-- 
Sashiko AI review · https://sashiko.dev/#/patchset/20260615190338.26581-1-ebiggers@kernel.org?part=1

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH v3] lib/raid/xor: x86: Add AVX-512 optimized xor_gen()
  2026-06-15 19:03 [PATCH v3] lib/raid/xor: x86: Add AVX-512 optimized xor_gen() Eric Biggers
  2026-06-15 19:25 ` sashiko-bot
@ 2026-06-15 20:10 ` Eric Biggers
  2026-06-15 21:16   ` Borislav Petkov
  1 sibling, 1 reply; 5+ messages in thread
From: Eric Biggers @ 2026-06-15 20:10 UTC (permalink / raw)
  To: x86
  Cc: Christoph Hellwig, linux-crypto, David Laight, linux-raid,
	Andrew Morton, linux-kernel

On Mon, Jun 15, 2026 at 12:03:38PM -0700, Eric Biggers wrote:
> Note: for now I omitted the cpu_has_xfeatures() check that the AVX-512
> optimized crypto and CRC code does, since it's not implemented on
> User-Mode Linux and it's never been present in the RAID6 code either.

By the way, Sashiko keeps complaining about this decision.

Maybe the x86 maintainers have some advice here?

For context: on x86 processors, executing AVX or AVX512 instructions
requires not just that the CPU supports the feature, but also that the
operating system has set certain bits in XCR0.  For example all EVEX
coded instructions (i.e. AVX-512) require XCR0=111xx111b.  (See Intel
manual "2.6.11.1 State Dependent #UD".)

Therefore most of the kernel's AVX and AVX512 optimized code checks not
just X86_FEATURE_AVX* but also calls cpu_has_xfeatures() to check XCR0.

But "most" isn't all.  The RAID6 code for example doesn't check
cpu_has_xfeatures().  So if you e.g. boot a kernel in QEMU using
"-cpu max,xsave=off", it already crashes when the RAID6 code does its
boot-time benchmark.

Part of the reason for that omission probably is that UML doesn't
provide an implementation of cpu_has_xfeatures().  And the x86 RAID (XOR
and RAID6) code is enabled on UML.

It could be implemented for UML by using the xgetbv instruction, like
what userspace programs do.  (We'd also need to copy the XFEATURE_MASK_*
constants, as UML can't include arch/x86/include/asm/fpu/types.h)

But I wanted to ask: do we really care about the case where features are
"supported" but their XCR0 bits aren't set?  Perhaps the kernel just
doesn't/shouldn't support weird cases like "-cpu max,xsave=off"?

If this case indeed needs to be handled, could we make things easier for
the kernel's AVX and AVX-512 optimized code?  Currently AVX-512 needs:

        if (boot_cpu_has(X86_FEATURE_AVX512F) &&
            cpu_has_xfeatures(XFEATURE_MASK_FP | XFEATURE_MASK_SSE |
                              XFEATURE_MASK_YMM | XFEATURE_MASK_AVX512, NULL))

How about we make X86_FEATURE_AVX512F depend on XCR0=111xx111, and
X86_FEATURE_AVX depend on XCR0=xxxxx111?  Then the cpu_has_xfeatures()
check wouldn't be needed.  Is there any reason not to do that?

- Eric

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH v3] lib/raid/xor: x86: Add AVX-512 optimized xor_gen()
  2026-06-15 20:10 ` Eric Biggers
@ 2026-06-15 21:16   ` Borislav Petkov
  2026-06-15 21:29     ` Eric Biggers
  0 siblings, 1 reply; 5+ messages in thread
From: Borislav Petkov @ 2026-06-15 21:16 UTC (permalink / raw)
  To: Eric Biggers, x86
  Cc: Christoph Hellwig, linux-crypto, David Laight, linux-raid,
	Andrew Morton, linux-kernel

On June 15, 2026 8:10:50 PM UTC, Eric Biggers <ebiggers@kernel.org> wrote:
>
>But I wanted to ask: do we really care about the case where features are
>"supported" but their XCR0 bits aren't set?  Perhaps the kernel just
>doesn't/shouldn't support weird cases like "-cpu max,xsave=off"?
>

Yes, our aim is to support only configurations which are actually present in real hardware and not an "oh, it would be good if it did that, just because..."

>If this case indeed needs to be handled, could we make things easier for
>the kernel's AVX and AVX-512 optimized code?  Currently AVX-512 needs:
>
>        if (boot_cpu_has(X86_FEATURE_AVX512F) &&
>            cpu_has_xfeatures(XFEATURE_MASK_FP | XFEATURE_MASK_SSE |
>                              XFEATURE_MASK_YMM | XFEATURE_MASK_AVX512, NULL))
>
>How about we make X86_FEATURE_AVX512F depend on XCR0=111xx111, and
>X86_FEATURE_AVX depend on XCR0=xxxxx111?  Then the cpu_has_xfeatures()
>check wouldn't be needed.  Is there any reason not to do that?

 How do you want to accomplish that? Very early during boot on the BSP you sanity-check XCR0 and clear feature flags if components are not set? 

Thx.

-- 
Small device. Typos and formatting crap

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH v3] lib/raid/xor: x86: Add AVX-512 optimized xor_gen()
  2026-06-15 21:16   ` Borislav Petkov
@ 2026-06-15 21:29     ` Eric Biggers
  0 siblings, 0 replies; 5+ messages in thread
From: Eric Biggers @ 2026-06-15 21:29 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: x86, Christoph Hellwig, linux-crypto, David Laight, linux-raid,
	Andrew Morton, linux-kernel

On Mon, Jun 15, 2026 at 09:16:55PM +0000, Borislav Petkov wrote:
> On June 15, 2026 8:10:50 PM UTC, Eric Biggers <ebiggers@kernel.org> wrote:
> >
> >But I wanted to ask: do we really care about the case where features are
> >"supported" but their XCR0 bits aren't set?  Perhaps the kernel just
> >doesn't/shouldn't support weird cases like "-cpu max,xsave=off"?
> >
> 
> Yes, our aim is to support only configurations which are actually
> present in real hardware and not an "oh, it would be good if it did
> that, just because..."

Seems reasonable to me.  Would the same apply to UML here?

> >If this case indeed needs to be handled, could we make things easier for
> >the kernel's AVX and AVX-512 optimized code?  Currently AVX-512 needs:
> >
> >        if (boot_cpu_has(X86_FEATURE_AVX512F) &&
> >            cpu_has_xfeatures(XFEATURE_MASK_FP | XFEATURE_MASK_SSE |
> >                              XFEATURE_MASK_YMM | XFEATURE_MASK_AVX512, NULL))
> >
> >How about we make X86_FEATURE_AVX512F depend on XCR0=111xx111, and
> >X86_FEATURE_AVX depend on XCR0=xxxxx111?  Then the cpu_has_xfeatures()
> >check wouldn't be needed.  Is there any reason not to do that?
> 
>  How do you want to accomplish that? Very early during boot on the BSP
>  you sanity-check XCR0 and clear feature flags if components are not
>  set? 

That would be the idea.  Something similar to what
arch/x86/kernel/cpu/cpuid-deps.c does.  Except that seems to only
enforce the dependencies when the kernel itself is disabling things; if
the hypervisor is broken then it just warns.

In any case, I'd like these to go away:

    $ git grep cpu_has_xfeatures | wc -l
    31

- Eric

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2026-06-15 21:29 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-06-15 19:03 [PATCH v3] lib/raid/xor: x86: Add AVX-512 optimized xor_gen() Eric Biggers
2026-06-15 19:25 ` sashiko-bot
2026-06-15 20:10 ` Eric Biggers
2026-06-15 21:16   ` Borislav Petkov
2026-06-15 21:29     ` Eric Biggers

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox