[PATCH] lib/raid/xor: x86: Add AVX-512 optimized xor_gen()

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Eric Biggers <ebiggers@kernel.org>
To: Andrew Morton <akpm@linux-foundation.org>, linux-kernel@vger.kernel.org
Cc: Christoph Hellwig <hch@lst.de>,
	linux-crypto@vger.kernel.org, x86@kernel.org,
	Eric Biggers <ebiggers@kernel.org>
Subject: [PATCH] lib/raid/xor: x86: Add AVX-512 optimized xor_gen()
Date: Thu, 11 Jun 2026 21:40:34 -0700	[thread overview]
Message-ID: <20260612044034.117442-1-ebiggers@kernel.org> (raw)

Add an implementation of xor_gen() using AVX-512.

It uses 512-bit vectors, i.e. ZMM registers.  It also uses the
vpternlogq instruction to do three-input XORs when applicable.

It's enabled on x86_64 CPUs that have AVX512F && !PREFER_YMM, provided
cpu_has_xfeatures() reports the AVX-512 xfeatures as enabled.  In
practice that means:

    - AMD Zen 4 and later (client and server)
    - Intel Sapphire Rapids and later (server)
    - Intel Rocket Lake (client)
    - Intel Nova Lake and later (client)

The !PREFER_YMM condition excludes the older AVX-512 implementations in
Intel Skylake Server and Intel Ice Lake.  They could run this code, but
they're known to have overly-eager downclocking when ZMM registers are
used.  This is the same policy that the crypto and CRC code uses.

Benchmark on AMD Ryzen 9 9950X (Zen 5):

    src_cnt    avx2         avx512       Improvement
    =======    ==========   ==========   ===========
    1          68423 MB/s   81940 MB/s   19%
    2          56035 MB/s   74112 MB/s   32%
    3          49396 MB/s   67011 MB/s   35%
    4          43056 MB/s   60823 MB/s   41%

Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
 lib/raid/xor/Makefile         |   2 +-
 lib/raid/xor/x86/xor-avx512.c | 155 ++++++++++++++++++++++++++++++++++
 lib/raid/xor/x86/xor_arch.h   |  27 +++---
 3 files changed, 172 insertions(+), 12 deletions(-)
 create mode 100644 lib/raid/xor/x86/xor-avx512.c

diff --git a/lib/raid/xor/Makefile b/lib/raid/xor/Makefile
index 4d633dfd5b90..4af945861a51 100644
--- a/lib/raid/xor/Makefile
+++ b/lib/raid/xor/Makefile
@@ -26,11 +26,11 @@ xor-$(CONFIG_ALTIVEC)		+= powerpc/xor_vmx.o powerpc/xor_vmx_glue.o
 xor-$(CONFIG_RISCV_ISA_V)	+= riscv/xor.o riscv/xor-glue.o
 xor-$(CONFIG_SPARC32)		+= sparc/xor-sparc32.o
 xor-$(CONFIG_SPARC64)		+= sparc/xor-sparc64.o sparc/xor-sparc64-glue.o
 xor-$(CONFIG_S390)		+= s390/xor.o
 xor-$(CONFIG_X86_32)		+= x86/xor-avx.o x86/xor-sse.o x86/xor-mmx.o
-xor-$(CONFIG_X86_64)		+= x86/xor-avx.o x86/xor-sse.o
+xor-$(CONFIG_X86_64)		+= x86/xor-avx.o x86/xor-sse.o x86/xor-avx512.o
 obj-y				+= tests/
 
 CFLAGS_arm/xor-neon.o		+= $(CC_FLAGS_FPU)
 CFLAGS_REMOVE_arm/xor-neon.o	+= $(CC_FLAGS_NO_FPU)
 
diff --git a/lib/raid/xor/x86/xor-avx512.c b/lib/raid/xor/x86/xor-avx512.c
new file mode 100644
index 000000000000..d2b54aa2be98
--- /dev/null
+++ b/lib/raid/xor/x86/xor-avx512.c
@@ -0,0 +1,155 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * AVX-512 optimized implementation of xor_gen()
+ *
+ * Copyright 2026 Google LLC
+ */
+
+#include <linux/compiler.h>
+#include <linux/types.h>
+#include <asm/fpu/api.h>
+#include "xor_impl.h"
+#include "xor_arch.h"
+
+struct block64 {
+	u8 x[64];
+} __aligned(64);
+
+/*
+ * Use different registers for each unrolled iteration just in case it helps,
+ * though the hardware register renamer should make it unnecessary.
+ */
+
+#define DO_XOR2(i, reg0)                                   \
+	asm volatile("vmovdqa64 %0, %%" reg0 "\n"          \
+		     "vpxorq %1, %%" reg0 ", %%" reg0 "\n" \
+		     "vmovdqa64 %%" reg0 ", %0\n"          \
+		     : "+m"(p0[i])                         \
+		     : "m"(p1[i]))
+
+#define DO_XOR3(i, reg0, reg1)                                        \
+	asm volatile("vmovdqa64 %0, %%" reg0 "\n"                     \
+		     "vmovdqa64 %1, %%" reg1 "\n"                     \
+		     "vpternlogq $0x96, %2, %%" reg1 ", %%" reg0 "\n" \
+		     "vmovdqa64 %%" reg0 ", %0\n"                     \
+		     : "+m"(p0[i])                                    \
+		     : "m"(p1[i]), "m"(p2[i]))
+
+#define DO_XOR4(i, reg0, reg1)                                        \
+	asm volatile("vmovdqa64 %0, %%" reg0 "\n"                     \
+		     "vmovdqa64 %1, %%" reg1 "\n"                     \
+		     "vpxorq %2, %%" reg0 ", %%" reg0 "\n"            \
+		     "vpternlogq $0x96, %3, %%" reg1 ", %%" reg0 "\n" \
+		     "vmovdqa64 %%" reg0 ", %0\n"                     \
+		     : "+m"(p0[i])                                    \
+		     : "m"(p1[i]), "m"(p2[i]), "m"(p3[i]))
+
+#define DO_XOR5(i, reg0, reg1)                                        \
+	asm volatile("vmovdqa64 %0, %%" reg0 "\n"                     \
+		     "vmovdqa64 %1, %%" reg1 "\n"                     \
+		     "vpternlogq $0x96, %2, %%" reg1 ", %%" reg0 "\n" \
+		     "vmovdqa64 %3, %%" reg1 "\n"                     \
+		     "vpternlogq $0x96, %4, %%" reg1 ", %%" reg0 "\n" \
+		     "vmovdqa64 %%" reg0 ", %0\n"                     \
+		     : "+m"(p0[i])                                    \
+		     : "m"(p1[i]), "m"(p2[i]), "m"(p3[i]), "m"(p4[i]))
+
+static void xor_avx512_2(size_t bytes, struct block64 *p0,
+			 const struct block64 *p1)
+{
+	do {
+		DO_XOR2(0, "zmm0");
+		DO_XOR2(1, "zmm1");
+		DO_XOR2(2, "zmm2");
+		DO_XOR2(3, "zmm3");
+		DO_XOR2(4, "zmm4");
+		DO_XOR2(5, "zmm5");
+		DO_XOR2(6, "zmm6");
+		DO_XOR2(7, "zmm7");
+		p0 += 512 / sizeof(*p0);
+		p1 += 512 / sizeof(*p1);
+		bytes -= 512;
+	} while (bytes);
+}
+
+static void xor_avx512_3(size_t bytes, struct block64 *p0,
+			 const struct block64 *p1, const struct block64 *p2)
+{
+	do {
+		DO_XOR3(0, "zmm0", "zmm1");
+		DO_XOR3(1, "zmm2", "zmm3");
+		DO_XOR3(2, "zmm4", "zmm5");
+		DO_XOR3(3, "zmm6", "zmm7");
+		DO_XOR3(4, "zmm8", "zmm9");
+		DO_XOR3(5, "zmm10", "zmm11");
+		DO_XOR3(6, "zmm12", "zmm13");
+		DO_XOR3(7, "zmm14", "zmm15");
+		p0 += 512 / sizeof(*p0);
+		p1 += 512 / sizeof(*p1);
+		p2 += 512 / sizeof(*p2);
+		bytes -= 512;
+	} while (bytes);
+}
+
+static void xor_avx512_4(size_t bytes, struct block64 *p0,
+			 const struct block64 *p1, const struct block64 *p2,
+			 const struct block64 *p3)
+{
+	do {
+		DO_XOR4(0, "zmm0", "zmm1");
+		DO_XOR4(1, "zmm2", "zmm3");
+		DO_XOR4(2, "zmm4", "zmm5");
+		DO_XOR4(3, "zmm6", "zmm7");
+		DO_XOR4(4, "zmm8", "zmm9");
+		DO_XOR4(5, "zmm10", "zmm11");
+		DO_XOR4(6, "zmm12", "zmm13");
+		DO_XOR4(7, "zmm14", "zmm15");
+		p0 += 512 / sizeof(*p0);
+		p1 += 512 / sizeof(*p1);
+		p2 += 512 / sizeof(*p2);
+		p3 += 512 / sizeof(*p3);
+		bytes -= 512;
+	} while (bytes);
+}
+
+static void xor_avx512_5(size_t bytes, struct block64 *p0,
+			 const struct block64 *p1, const struct block64 *p2,
+			 const struct block64 *p3, const struct block64 *p4)
+{
+	do {
+		DO_XOR5(0, "zmm0", "zmm1");
+		DO_XOR5(1, "zmm2", "zmm3");
+		DO_XOR5(2, "zmm4", "zmm5");
+		DO_XOR5(3, "zmm6", "zmm7");
+		DO_XOR5(4, "zmm8", "zmm9");
+		DO_XOR5(5, "zmm10", "zmm11");
+		DO_XOR5(6, "zmm12", "zmm13");
+		DO_XOR5(7, "zmm14", "zmm15");
+		p0 += 512 / sizeof(*p0);
+		p1 += 512 / sizeof(*p1);
+		p2 += 512 / sizeof(*p2);
+		p3 += 512 / sizeof(*p3);
+		p4 += 512 / sizeof(*p4);
+		bytes -= 512;
+	} while (bytes);
+}
+
+DO_XOR_BLOCKS(avx512_inner, xor_avx512_2, xor_avx512_3, xor_avx512_4,
+	      xor_avx512_5);
+
+/*
+ * Preconditions: bytes is a nonzero multiple of 512, and all buffers are
+ * 64-byte aligned.
+ */
+static void xor_gen_avx512(void *dest, void **srcs, unsigned int src_cnt,
+			   unsigned int bytes)
+{
+	kernel_fpu_begin();
+	xor_gen_avx512_inner(dest, srcs, src_cnt, bytes);
+	kernel_fpu_end();
+}
+
+struct xor_block_template xor_block_avx512 = {
+	.name = "avx512",
+	.xor_gen = xor_gen_avx512,
+};
diff --git a/lib/raid/xor/x86/xor_arch.h b/lib/raid/xor/x86/xor_arch.h
index 99fe85a213c6..199124e32c27 100644
--- a/lib/raid/xor/x86/xor_arch.h
+++ b/lib/raid/xor/x86/xor_arch.h
@@ -1,29 +1,34 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
 #include <asm/cpufeature.h>
+#include <asm/fpu/api.h>
 
 extern struct xor_block_template xor_block_pII_mmx;
 extern struct xor_block_template xor_block_p5_mmx;
 extern struct xor_block_template xor_block_sse;
 extern struct xor_block_template xor_block_sse_pf64;
 extern struct xor_block_template xor_block_avx;
+extern struct xor_block_template xor_block_avx512;
 
-/*
- * When SSE is available, use it as it can write around L2.  We may also be able
- * to load into the L1 only depending on how the cpu deals with a load to a line
- * that is being prefetched.
- *
- * When AVX2 is available, force using it as it is better by all measures.
- *
- * 32-bit without MMX can fall back to the generic routines.
- */
 static __always_inline void __init arch_xor_init(void)
 {
-	if (boot_cpu_has(X86_FEATURE_AVX) &&
-	    boot_cpu_has(X86_FEATURE_OSXSAVE)) {
+	if (IS_ENABLED(CONFIG_X86_64) && boot_cpu_has(X86_FEATURE_AVX512F) &&
+	    !boot_cpu_has(X86_FEATURE_PREFER_YMM) &&
+	    cpu_has_xfeatures(XFEATURE_MASK_AVX512, NULL)) {
+		/* AVX-512 will be the best; no need to try others. */
+		/* !PREFER_YMM excludes CPUs with overly-eager downclocking. */
+		xor_force(&xor_block_avx512);
+	} else if (boot_cpu_has(X86_FEATURE_AVX) &&
+		   boot_cpu_has(X86_FEATURE_OSXSAVE)) {
+		/* AVX will be the best; no need to try others. */
 		xor_force(&xor_block_avx);
 	} else if (IS_ENABLED(CONFIG_X86_64) || boot_cpu_has(X86_FEATURE_XMM)) {
+		/*
+		 * When SSE is available, use it as it can write around L2.  We
+		 * may also be able to load into the L1 only depending on how
+		 * the cpu deals with a load to a line that is being prefetched.
+		 */
 		xor_register(&xor_block_sse);
 		xor_register(&xor_block_sse_pf64);
 	} else if (boot_cpu_has(X86_FEATURE_MMX)) {
 		xor_register(&xor_block_pII_mmx);
 		xor_register(&xor_block_p5_mmx);

base-commit: 9716c086c8e8b141d35aa61f2e96a2e83de212a7
-- 
2.54.0

next             reply	other threads:[~2026-06-12  4:42 UTC|newest]

Thread overview: 4+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-06-12  4:40 Eric Biggers [this message]
2026-06-12  5:22 ` [PATCH] lib/raid/xor: x86: Add AVX-512 optimized xor_gen() Christoph Hellwig
2026-06-12  5:59   ` Eric Biggers
2026-06-12  9:04   ` David Laight

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:4d633dfd5b9 dfblob:4af945861a5 dfblob:d2b54aa2be9
dfblob:99fe85a213c dfblob:199124e32c2 )
 OR (
bs:"[PATCH] lib/raid/xor: x86: Add AVX-512 optimized xor_gen()" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260612044034.117442-1-ebiggers@kernel.org \
    --to=ebiggers@kernel.org \
    --cc=akpm@linux-foundation.org \
    --cc=hch@lst.de \
    --cc=linux-crypto@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=x86@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.