[PATCH] lib/raid/xor: x86: Add AVX-512 optimized xor

Linux cryptographic layer development
 help / color / mirror / Atom feed

* [PATCH] lib/raid/xor: x86: Add AVX-512 optimized xor_gen()
@ 2026-06-12  4:40 Eric Biggers
  2026-06-12  5:22 ` Christoph Hellwig
  0 siblings, 1 reply; 6+ messages in thread
From: Eric Biggers @ 2026-06-12  4:40 UTC (permalink / raw)
  To: Andrew Morton, linux-kernel
  Cc: Christoph Hellwig, linux-crypto, x86, Eric Biggers

Add an implementation of xor_gen() using AVX-512.

It uses 512-bit vectors, i.e. ZMM registers.  It also uses the
vpternlogq instruction to do three-input XORs when applicable.

It's enabled on x86_64 CPUs that have AVX512F && !PREFER_YMM.  In
practice that means:

    - AMD Zen 4 and later (client and server)
    - Intel Sapphire Rapids and later (server)
    - Intel Rocket Lake (client)
    - Intel Nova Lake and later (client)

The !PREFER_YMM condition excludes the older AVX-512 implementations in
Intel Skylake Server and Intel Ice Lake.  They could run this code, but
they're known to have overly-eager downclocking when ZMM registers are
used.  This is the same policy that the crypto and CRC code uses.

Benchmark on AMD Ryzen 9 9950X (Zen 5):

    src_cnt    avx2         avx512       Improvement
    =======    ==========   ==========   ===========
    1          68423 MB/s   81940 MB/s   19%
    2          56035 MB/s   74112 MB/s   32%
    3          49396 MB/s   67011 MB/s   35%
    4          43056 MB/s   60823 MB/s   41%

Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
 lib/raid/xor/Makefile         |   2 +-
 lib/raid/xor/x86/xor-avx512.c | 155 ++++++++++++++++++++++++++++++++++
 lib/raid/xor/x86/xor_arch.h   |  27 +++---
 3 files changed, 172 insertions(+), 12 deletions(-)
 create mode 100644 lib/raid/xor/x86/xor-avx512.c

diff --git a/lib/raid/xor/Makefile b/lib/raid/xor/Makefile
index 4d633dfd5b90..4af945861a51 100644
--- a/lib/raid/xor/Makefile
+++ b/lib/raid/xor/Makefile
@@ -26,11 +26,11 @@ xor-$(CONFIG_ALTIVEC)		+= powerpc/xor_vmx.o powerpc/xor_vmx_glue.o
 xor-$(CONFIG_RISCV_ISA_V)	+= riscv/xor.o riscv/xor-glue.o
 xor-$(CONFIG_SPARC32)		+= sparc/xor-sparc32.o
 xor-$(CONFIG_SPARC64)		+= sparc/xor-sparc64.o sparc/xor-sparc64-glue.o
 xor-$(CONFIG_S390)		+= s390/xor.o
 xor-$(CONFIG_X86_32)		+= x86/xor-avx.o x86/xor-sse.o x86/xor-mmx.o
-xor-$(CONFIG_X86_64)		+= x86/xor-avx.o x86/xor-sse.o
+xor-$(CONFIG_X86_64)		+= x86/xor-avx.o x86/xor-sse.o x86/xor-avx512.o
 obj-y				+= tests/
 
 CFLAGS_arm/xor-neon.o		+= $(CC_FLAGS_FPU)
 CFLAGS_REMOVE_arm/xor-neon.o	+= $(CC_FLAGS_NO_FPU)
 
diff --git a/lib/raid/xor/x86/xor-avx512.c b/lib/raid/xor/x86/xor-avx512.c
new file mode 100644
index 000000000000..d2b54aa2be98
--- /dev/null
+++ b/lib/raid/xor/x86/xor-avx512.c
@@ -0,0 +1,155 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * AVX-512 optimized implementation of xor_gen()
+ *
+ * Copyright 2026 Google LLC
+ */
+
+#include <linux/compiler.h>
+#include <linux/types.h>
+#include <asm/fpu/api.h>
+#include "xor_impl.h"
+#include "xor_arch.h"
+
+struct block64 {
+	u8 x[64];
+} __aligned(64);
+
+/*
+ * Use different registers for each unrolled iteration just in case it helps,
+ * though the hardware register renamer should make it unnecessary.
+ */
+
+#define DO_XOR2(i, reg0)                                   \
+	asm volatile("vmovdqa64 %0, %%" reg0 "\n"          \
+		     "vpxorq %1, %%" reg0 ", %%" reg0 "\n" \
+		     "vmovdqa64 %%" reg0 ", %0\n"          \
+		     : "+m"(p0[i])                         \
+		     : "m"(p1[i]))
+
+#define DO_XOR3(i, reg0, reg1)                                        \
+	asm volatile("vmovdqa64 %0, %%" reg0 "\n"                     \
+		     "vmovdqa64 %1, %%" reg1 "\n"                     \
+		     "vpternlogq $0x96, %2, %%" reg1 ", %%" reg0 "\n" \
+		     "vmovdqa64 %%" reg0 ", %0\n"                     \
+		     : "+m"(p0[i])                                    \
+		     : "m"(p1[i]), "m"(p2[i]))
+
+#define DO_XOR4(i, reg0, reg1)                                        \
+	asm volatile("vmovdqa64 %0, %%" reg0 "\n"                     \
+		     "vmovdqa64 %1, %%" reg1 "\n"                     \
+		     "vpxorq %2, %%" reg0 ", %%" reg0 "\n"            \
+		     "vpternlogq $0x96, %3, %%" reg1 ", %%" reg0 "\n" \
+		     "vmovdqa64 %%" reg0 ", %0\n"                     \
+		     : "+m"(p0[i])                                    \
+		     : "m"(p1[i]), "m"(p2[i]), "m"(p3[i]))
+
+#define DO_XOR5(i, reg0, reg1)                                        \
+	asm volatile("vmovdqa64 %0, %%" reg0 "\n"                     \
+		     "vmovdqa64 %1, %%" reg1 "\n"                     \
+		     "vpternlogq $0x96, %2, %%" reg1 ", %%" reg0 "\n" \
+		     "vmovdqa64 %3, %%" reg1 "\n"                     \
+		     "vpternlogq $0x96, %4, %%" reg1 ", %%" reg0 "\n" \
+		     "vmovdqa64 %%" reg0 ", %0\n"                     \
+		     : "+m"(p0[i])                                    \
+		     : "m"(p1[i]), "m"(p2[i]), "m"(p3[i]), "m"(p4[i]))
+
+static void xor_avx512_2(size_t bytes, struct block64 *p0,
+			 const struct block64 *p1)
+{
+	do {
+		DO_XOR2(0, "zmm0");
+		DO_XOR2(1, "zmm1");
+		DO_XOR2(2, "zmm2");
+		DO_XOR2(3, "zmm3");
+		DO_XOR2(4, "zmm4");
+		DO_XOR2(5, "zmm5");
+		DO_XOR2(6, "zmm6");
+		DO_XOR2(7, "zmm7");
+		p0 += 512 / sizeof(*p0);
+		p1 += 512 / sizeof(*p1);
+		bytes -= 512;
+	} while (bytes);
+}
+
+static void xor_avx512_3(size_t bytes, struct block64 *p0,
+			 const struct block64 *p1, const struct block64 *p2)
+{
+	do {
+		DO_XOR3(0, "zmm0", "zmm1");
+		DO_XOR3(1, "zmm2", "zmm3");
+		DO_XOR3(2, "zmm4", "zmm5");
+		DO_XOR3(3, "zmm6", "zmm7");
+		DO_XOR3(4, "zmm8", "zmm9");
+		DO_XOR3(5, "zmm10", "zmm11");
+		DO_XOR3(6, "zmm12", "zmm13");
+		DO_XOR3(7, "zmm14", "zmm15");
+		p0 += 512 / sizeof(*p0);
+		p1 += 512 / sizeof(*p1);
+		p2 += 512 / sizeof(*p2);
+		bytes -= 512;
+	} while (bytes);
+}
+
+static void xor_avx512_4(size_t bytes, struct block64 *p0,
+			 const struct block64 *p1, const struct block64 *p2,
+			 const struct block64 *p3)
+{
+	do {
+		DO_XOR4(0, "zmm0", "zmm1");
+		DO_XOR4(1, "zmm2", "zmm3");
+		DO_XOR4(2, "zmm4", "zmm5");
+		DO_XOR4(3, "zmm6", "zmm7");
+		DO_XOR4(4, "zmm8", "zmm9");
+		DO_XOR4(5, "zmm10", "zmm11");
+		DO_XOR4(6, "zmm12", "zmm13");
+		DO_XOR4(7, "zmm14", "zmm15");
+		p0 += 512 / sizeof(*p0);
+		p1 += 512 / sizeof(*p1);
+		p2 += 512 / sizeof(*p2);
+		p3 += 512 / sizeof(*p3);
+		bytes -= 512;
+	} while (bytes);
+}
+
+static void xor_avx512_5(size_t bytes, struct block64 *p0,
+			 const struct block64 *p1, const struct block64 *p2,
+			 const struct block64 *p3, const struct block64 *p4)
+{
+	do {
+		DO_XOR5(0, "zmm0", "zmm1");
+		DO_XOR5(1, "zmm2", "zmm3");
+		DO_XOR5(2, "zmm4", "zmm5");
+		DO_XOR5(3, "zmm6", "zmm7");
+		DO_XOR5(4, "zmm8", "zmm9");
+		DO_XOR5(5, "zmm10", "zmm11");
+		DO_XOR5(6, "zmm12", "zmm13");
+		DO_XOR5(7, "zmm14", "zmm15");
+		p0 += 512 / sizeof(*p0);
+		p1 += 512 / sizeof(*p1);
+		p2 += 512 / sizeof(*p2);
+		p3 += 512 / sizeof(*p3);
+		p4 += 512 / sizeof(*p4);
+		bytes -= 512;
+	} while (bytes);
+}
+
+DO_XOR_BLOCKS(avx512_inner, xor_avx512_2, xor_avx512_3, xor_avx512_4,
+	      xor_avx512_5);
+
+/*
+ * Preconditions: bytes is a nonzero multiple of 512, and all buffers are
+ * 64-byte aligned.
+ */
+static void xor_gen_avx512(void *dest, void **srcs, unsigned int src_cnt,
+			   unsigned int bytes)
+{
+	kernel_fpu_begin();
+	xor_gen_avx512_inner(dest, srcs, src_cnt, bytes);
+	kernel_fpu_end();
+}
+
+struct xor_block_template xor_block_avx512 = {
+	.name = "avx512",
+	.xor_gen = xor_gen_avx512,
+};
diff --git a/lib/raid/xor/x86/xor_arch.h b/lib/raid/xor/x86/xor_arch.h
index 99fe85a213c6..199124e32c27 100644
--- a/lib/raid/xor/x86/xor_arch.h
+++ b/lib/raid/xor/x86/xor_arch.h
@@ -1,29 +1,34 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
 #include <asm/cpufeature.h>
+#include <asm/fpu/api.h>
 
 extern struct xor_block_template xor_block_pII_mmx;
 extern struct xor_block_template xor_block_p5_mmx;
 extern struct xor_block_template xor_block_sse;
 extern struct xor_block_template xor_block_sse_pf64;
 extern struct xor_block_template xor_block_avx;
+extern struct xor_block_template xor_block_avx512;
 
-/*
- * When SSE is available, use it as it can write around L2.  We may also be able
- * to load into the L1 only depending on how the cpu deals with a load to a line
- * that is being prefetched.
- *
- * When AVX2 is available, force using it as it is better by all measures.
- *
- * 32-bit without MMX can fall back to the generic routines.
- */
 static __always_inline void __init arch_xor_init(void)
 {
-	if (boot_cpu_has(X86_FEATURE_AVX) &&
-	    boot_cpu_has(X86_FEATURE_OSXSAVE)) {
+	if (IS_ENABLED(CONFIG_X86_64) && boot_cpu_has(X86_FEATURE_AVX512F) &&
+	    !boot_cpu_has(X86_FEATURE_PREFER_YMM) &&
+	    cpu_has_xfeatures(XFEATURE_MASK_AVX512, NULL)) {
+		/* AVX-512 will be the best; no need to try others. */
+		/* !PREFER_YMM excludes CPUs with overly-eager downclocking. */
+		xor_force(&xor_block_avx512);
+	} else if (boot_cpu_has(X86_FEATURE_AVX) &&
+		   boot_cpu_has(X86_FEATURE_OSXSAVE)) {
+		/* AVX will be the best; no need to try others. */
 		xor_force(&xor_block_avx);
 	} else if (IS_ENABLED(CONFIG_X86_64) || boot_cpu_has(X86_FEATURE_XMM)) {
+		/*
+		 * When SSE is available, use it as it can write around L2.  We
+		 * may also be able to load into the L1 only depending on how
+		 * the cpu deals with a load to a line that is being prefetched.
+		 */
 		xor_register(&xor_block_sse);
 		xor_register(&xor_block_sse_pf64);
 	} else if (boot_cpu_has(X86_FEATURE_MMX)) {
 		xor_register(&xor_block_pII_mmx);
 		xor_register(&xor_block_p5_mmx);

base-commit: 9716c086c8e8b141d35aa61f2e96a2e83de212a7
-- 
2.54.0


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH] lib/raid/xor: x86: Add AVX-512 optimized xor_gen()
  2026-06-12  4:40 [PATCH] lib/raid/xor: x86: Add AVX-512 optimized xor_gen() Eric Biggers
@ 2026-06-12  5:22 ` Christoph Hellwig
  2026-06-12  5:59   ` Eric Biggers
  2026-06-12  9:04   ` David Laight
  0 siblings, 2 replies; 6+ messages in thread
From: Christoph Hellwig @ 2026-06-12  5:22 UTC (permalink / raw)
  To: Eric Biggers
  Cc: Andrew Morton, linux-kernel, Christoph Hellwig, linux-crypto, x86,
	Andrea Mazzoleni

On Thu, Jun 11, 2026 at 09:40:34PM -0700, Eric Biggers wrote:
> Add an implementation of xor_gen() using AVX-512.

> Benchmark on AMD Ryzen 9 9950X (Zen 5):

Can you share the benchmark?

In my local tree I have ports of the AVX2 and AVX512 implementations
from snapraid (https://github.com/amadvance/snapraid), which in userspace
give really good performance.  On my laptop with an AMD Ryzen AI 7 PRO 350
(which is a Zen5 with the slower double pumped AVX512 unit), both of
them get over 1GB/s throughput on the snapraid benchmarks.  I've been
holding them back as I don't have a good kernel benchmarking harness,
and it's missing the quirks for old AVX512 or the newer AMD special
cases.

Attached for reference.

Note that either way I'd prefer if we could get away from the strange
old code organization with the DO{1-4} helpers which don't really
help.

diff --git a/lib/raid/xor/Makefile b/lib/raid/xor/Makefile
index 4d633dfd5b90..3d5ebeda241e 100644
--- a/lib/raid/xor/Makefile
+++ b/lib/raid/xor/Makefile
@@ -28,7 +28,7 @@ xor-$(CONFIG_SPARC32)		+= sparc/xor-sparc32.o
 xor-$(CONFIG_SPARC64)		+= sparc/xor-sparc64.o sparc/xor-sparc64-glue.o
 xor-$(CONFIG_S390)		+= s390/xor.o
 xor-$(CONFIG_X86_32)		+= x86/xor-avx.o x86/xor-sse.o x86/xor-mmx.o
-xor-$(CONFIG_X86_64)		+= x86/xor-avx.o x86/xor-sse.o
+xor-$(CONFIG_X86_64)		+= x86/xor-avx512.o x86/xor-avx.o x86/xor-sse.o
 obj-y				+= tests/
 
 CFLAGS_arm/xor-neon.o		+= $(CC_FLAGS_FPU)
diff --git a/lib/raid/xor/x86/xor-avx.c b/lib/raid/xor/x86/xor-avx.c
index f7777d7aa269..cd376a7c52d3 100644
--- a/lib/raid/xor/x86/xor-avx.c
+++ b/lib/raid/xor/x86/xor-avx.c
@@ -1,152 +1,31 @@
-// SPDX-License-Identifier: GPL-2.0-only
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
- * Optimized XOR parity functions for AVX
- *
- * Copyright (C) 2012 Intel Corporation
- * Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
- *
- * Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines
+ * Copyright (C) 2026 Andrea Mazzoleni
  */
-#include <linux/compiler.h>
 #include <asm/fpu/api.h>
 #include "xor_impl.h"
 #include "xor_arch.h"
 
-#define BLOCK4(i) \
-		BLOCK(32 * i, 0) \
-		BLOCK(32 * (i + 1), 1) \
-		BLOCK(32 * (i + 2), 2) \
-		BLOCK(32 * (i + 3), 3)
-
-#define BLOCK16() \
-		BLOCK4(0) \
-		BLOCK4(4) \
-		BLOCK4(8) \
-		BLOCK4(12)
-
-static void xor_avx_2(unsigned long bytes, unsigned long * __restrict p0,
-		      const unsigned long * __restrict p1)
-{
-	unsigned long lines = bytes >> 9;
-
-	while (lines--) {
-#undef BLOCK
-#define BLOCK(i, reg) \
-do { \
-	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p1[i / sizeof(*p1)])); \
-	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm"  #reg : : \
-		"m" (p0[i / sizeof(*p0)])); \
-	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
-		"=m" (p0[i / sizeof(*p0)])); \
-} while (0);
-
-		BLOCK16()
-
-		p0 = (unsigned long *)((uintptr_t)p0 + 512);
-		p1 = (unsigned long *)((uintptr_t)p1 + 512);
-	}
-}
-
-static void xor_avx_3(unsigned long bytes, unsigned long * __restrict p0,
-		      const unsigned long * __restrict p1,
-		      const unsigned long * __restrict p2)
-{
-	unsigned long lines = bytes >> 9;
-
-	while (lines--) {
-#undef BLOCK
-#define BLOCK(i, reg) \
-do { \
-	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p2[i / sizeof(*p2)])); \
-	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
-		"m" (p1[i / sizeof(*p1)])); \
-	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
-		"m" (p0[i / sizeof(*p0)])); \
-	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
-		"=m" (p0[i / sizeof(*p0)])); \
-} while (0);
-
-		BLOCK16()
-
-		p0 = (unsigned long *)((uintptr_t)p0 + 512);
-		p1 = (unsigned long *)((uintptr_t)p1 + 512);
-		p2 = (unsigned long *)((uintptr_t)p2 + 512);
-	}
-}
-
-static void xor_avx_4(unsigned long bytes, unsigned long * __restrict p0,
-		      const unsigned long * __restrict p1,
-		      const unsigned long * __restrict p2,
-		      const unsigned long * __restrict p3)
-{
-	unsigned long lines = bytes >> 9;
-
-	while (lines--) {
-#undef BLOCK
-#define BLOCK(i, reg) \
-do { \
-	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p3[i / sizeof(*p3)])); \
-	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
-		"m" (p2[i / sizeof(*p2)])); \
-	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
-		"m" (p1[i / sizeof(*p1)])); \
-	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
-		"m" (p0[i / sizeof(*p0)])); \
-	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
-		"=m" (p0[i / sizeof(*p0)])); \
-} while (0);
-
-		BLOCK16();
-
-		p0 = (unsigned long *)((uintptr_t)p0 + 512);
-		p1 = (unsigned long *)((uintptr_t)p1 + 512);
-		p2 = (unsigned long *)((uintptr_t)p2 + 512);
-		p3 = (unsigned long *)((uintptr_t)p3 + 512);
-	}
-}
-
-static void xor_avx_5(unsigned long bytes, unsigned long * __restrict p0,
-	     const unsigned long * __restrict p1,
-	     const unsigned long * __restrict p2,
-	     const unsigned long * __restrict p3,
-	     const unsigned long * __restrict p4)
-{
-	unsigned long lines = bytes >> 9;
-
-	while (lines--) {
-#undef BLOCK
-#define BLOCK(i, reg) \
-do { \
-	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p4[i / sizeof(*p4)])); \
-	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
-		"m" (p3[i / sizeof(*p3)])); \
-	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
-		"m" (p2[i / sizeof(*p2)])); \
-	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
-		"m" (p1[i / sizeof(*p1)])); \
-	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
-		"m" (p0[i / sizeof(*p0)])); \
-	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
-		"=m" (p0[i / sizeof(*p0)])); \
-} while (0);
-
-		BLOCK16()
-
-		p0 = (unsigned long *)((uintptr_t)p0 + 512);
-		p1 = (unsigned long *)((uintptr_t)p1 + 512);
-		p2 = (unsigned long *)((uintptr_t)p2 + 512);
-		p3 = (unsigned long *)((uintptr_t)p3 + 512);
-		p4 = (unsigned long *)((uintptr_t)p4 + 512);
-	}
-}
-
-DO_XOR_BLOCKS(avx_inner, xor_avx_2, xor_avx_3, xor_avx_4, xor_avx_5);
-
 static void xor_gen_avx(void *dest, void **srcs, unsigned int src_cnt,
 			unsigned int bytes)
 {
+	u8 **v = (u8 **)srcs;
+	u8 *p = dest;
+	unsigned int i, d;
+
 	kernel_fpu_begin();
-	xor_gen_avx_inner(dest, srcs, src_cnt, bytes);
+	for (i = 0; i < bytes; i += 64) {
+		asm volatile ("vmovdqa %0,%%ymm0" : : "m" (p[i]));
+		asm volatile ("vmovdqa %0,%%ymm1" : : "m" (p[i + 32]));
+		for (d = 0; d < src_cnt; ++d) {
+			asm volatile ("vpxor %0,%%ymm0,%%ymm0"
+				: : "m" (v[d][i]));
+			asm volatile ("vpxor %0,%%ymm1,%%ymm1"
+				: : "m" (v[d][i + 32]));
+		}
+		asm volatile ("vmovntdq %%ymm0,%0" : "=m" (p[i]));
+		asm volatile ("vmovntdq %%ymm1,%0" : "=m" (p[i + 32]));
+	}
 	kernel_fpu_end();
 }
 
diff --git a/lib/raid/xor/x86/xor-avx512.c b/lib/raid/xor/x86/xor-avx512.c
new file mode 100644
index 000000000000..9b323a0e1821
--- /dev/null
+++ b/lib/raid/xor/x86/xor-avx512.c
@@ -0,0 +1,34 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2026 Andrea Mazzoleni
+ */
+#include <asm/fpu/api.h>
+#include "xor_impl.h"
+#include "xor_arch.h"
+
+static void xor_gen_avx512bw(void *dest, void **srcs, unsigned int src_cnt,
+		unsigned int bytes)
+{
+	unsigned int last = src_cnt - 1, i, d;
+	u8 **v = (u8 **)srcs;
+	u8 *p = dest;
+
+	kernel_fpu_begin();
+	for (i = 0; i < bytes; i += 64) {
+		asm volatile("vmovdqa64 %0,%%zmm0" : : "m" (p[i]));
+		for (d = 0; d < last; d += 2)
+			asm volatile("vmovdqa64 %0,%%zmm1\n\t"
+				     "vpternlogq $0x96,%1,%%zmm1,%%zmm0"
+				     : : "m" (v[d][i]), "m" (v[d + 1][i]));
+		if (d == last)
+			asm volatile("vpxorq %0,%%zmm0,%%zmm0"
+				     : : "m" (v[last][i]));
+		asm volatile("vmovntdq %%zmm0,%0" : "=m" (p[i]));
+	}
+	kernel_fpu_end();
+}
+
+struct xor_block_template xor_block_avx512bw = {
+	.name		= "avx512bw",
+	.xor_gen	= xor_gen_avx512bw,
+};
diff --git a/lib/raid/xor/x86/xor_arch.h b/lib/raid/xor/x86/xor_arch.h
index 99fe85a213c6..73c81221fc01 100644
--- a/lib/raid/xor/x86/xor_arch.h
+++ b/lib/raid/xor/x86/xor_arch.h
@@ -6,6 +6,7 @@ extern struct xor_block_template xor_block_p5_mmx;
 extern struct xor_block_template xor_block_sse;
 extern struct xor_block_template xor_block_sse_pf64;
 extern struct xor_block_template xor_block_avx;
+extern struct xor_block_template xor_block_avx512bw;
 
 /*
  * When SSE is available, use it as it can write around L2.  We may also be able
@@ -20,7 +21,12 @@ static __always_inline void __init arch_xor_init(void)
 {
 	if (boot_cpu_has(X86_FEATURE_AVX) &&
 	    boot_cpu_has(X86_FEATURE_OSXSAVE)) {
-		xor_force(&xor_block_avx);
+		if (boot_cpu_has(X86_FEATURE_AVX2) &&
+		    boot_cpu_has(X86_FEATURE_AVX512F) &&
+		    boot_cpu_has(X86_FEATURE_AVX512BW))
+			xor_force(&xor_block_avx512bw);
+		else
+			xor_force(&xor_block_avx);
 	} else if (IS_ENABLED(CONFIG_X86_64) || boot_cpu_has(X86_FEATURE_XMM)) {
 		xor_register(&xor_block_sse);
 		xor_register(&xor_block_sse_pf64);

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH] lib/raid/xor: x86: Add AVX-512 optimized xor_gen()
  2026-06-12  5:22 ` Christoph Hellwig
@ 2026-06-12  5:59   ` Eric Biggers
  2026-06-12  9:04   ` David Laight
  1 sibling, 0 replies; 6+ messages in thread
From: Eric Biggers @ 2026-06-12  5:59 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Andrew Morton, linux-kernel, linux-crypto, x86, Andrea Mazzoleni

On Fri, Jun 12, 2026 at 07:22:47AM +0200, Christoph Hellwig wrote:
> On Thu, Jun 11, 2026 at 09:40:34PM -0700, Eric Biggers wrote:
> > Add an implementation of xor_gen() using AVX-512.
> 
> > Benchmark on AMD Ryzen 9 9950X (Zen 5):
> 
> Can you share the benchmark?

For now I had just hacked up do_xor_speed() as follows and changed
xor_force() to xor_register().  There should be a benchmark added to the
KUnit test similar to the one in the crypto and CRC tests, though.

diff --git a/lib/raid/xor/xor-core.c b/lib/raid/xor/xor-core.c
index bd4e6e434418..8c5814af03d5 100644
--- a/lib/raid/xor/xor-core.c
+++ b/lib/raid/xor/xor-core.c
@@ -76,15 +76,24 @@ void __init xor_force(struct xor_block_template *tmpl)
 #define REPS		800U
 
 static void __init
-do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2)
+do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2,
+	     void *b3, void *b4, void *b5)
 {
+	for (int src_cnt = 1; src_cnt <= 4; src_cnt++) {
 	int speed;
 	unsigned long reps;
 	ktime_t min, start, t0;
-	void *srcs[1] = { b2 };
+	void *srcs[4] = { b2, b3, b4, b5 };
 
 	preempt_disable();
 
+	/* warm-up */
+	for (int i = 0; i < 8000; i++) {
+		mb(); /* prevent loop optimization */
+		tmpl->xor_gen(b1, srcs, src_cnt, BENCH_SIZE);
+		mb();
+	}
+
 	reps = 0;
 	t0 = ktime_get();
 	/* delay start until time has advanced */
@@ -92,7 +101,7 @@ do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2)
 		cpu_relax();
 	do {
 		mb(); /* prevent loop optimization */
-		tmpl->xor_gen(b1, srcs, 1, BENCH_SIZE);
+		tmpl->xor_gen(b1, srcs, src_cnt, BENCH_SIZE);
 		mb();
 	} while (reps++ < REPS || (t0 = ktime_get()) == start);
 	min = ktime_sub(t0, start);
@@ -105,26 +114,30 @@ do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2)
 
 	pr_info("   %-16s: %5d MB/sec\n", tmpl->name, speed);
 }
+}
 
 static int __init calibrate_xor_blocks(void)
 {
-	void *b1, *b2;
+	void *b1, *b2, *b3, *b4, *b5;
 	struct xor_block_template *f, *fastest;
 
 	if (forced_template)
 		return 0;
 
-	b1 = (void *) __get_free_pages(GFP_KERNEL, 2);
+	b1 = (void *) __get_free_pages(GFP_KERNEL, 4);
 	if (!b1) {
 		pr_warn("xor: Yikes!  No memory available.\n");
 		return -ENOMEM;
 	}
 	b2 = b1 + 2*PAGE_SIZE + BENCH_SIZE;
+	b3 = b2 + 2*PAGE_SIZE + BENCH_SIZE;
+	b4 = b3 + 2*PAGE_SIZE + BENCH_SIZE;
+	b5 = b4 + 2*PAGE_SIZE + BENCH_SIZE;
 
 	pr_info("xor: measuring software checksum speed\n");
 	fastest = template_list;
 	for (f = template_list; f; f = f->next) {
-		do_xor_speed(f, b1, b2);
+		do_xor_speed(f, b1, b2, b3, b4, b5);
 		if (f->speed > fastest->speed)
 			fastest = f;
 	}

> In my local tree I have ports of the AVX2 and AVX512 implementations
> from snapraid (https://github.com/amadvance/snapraid), which in userspace
> give really good performance.  On my laptop with an AMD Ryzen AI 7 PRO 350
> (which is a Zen5 with the slower double pumped AVX512 unit), both of
> them get over 1GB/s throughput on the snapraid benchmarks.  I've been
> holding them back as I don't have a good kernel benchmarking harness,
> and it's missing the quirks for old AVX512 or the newer AMD special
> cases.
> 
> Attached for reference.
> 
> Note that either way I'd prefer if we could get away from the strange
> old code organization with the DO{1-4} helpers which don't really
> help.

Well, doing the same on your avx512bw version and adding a column to my
table for it (by the way, I think it really just needs avx512f), I get:

        src_cnt    avx          avx512       avx512bw
        =======    ==========   ==========   ==========
        1          68423 MB/s   81940 MB/s   12067 MB/s
        2          56035 MB/s   74112 MB/s   10958 MB/s
        3          49396 MB/s   67011 MB/s   8608 MB/s
        4          43056 MB/s   60823 MB/s   8069 MB/s

So, your version isn't great, I'm afraid.  Making the inner loop be over
src_cnt does simplify the code a lot, but it destroys performance since
it turns into 9 instructions for each 64 bytes in each 3 buffers:

      5b:   89 c1                   mov    %eax,%ecx
      5d:   8d 70 01                lea    0x1(%rax),%esi
      60:   48 8b 0c cb             mov    (%rbx,%rcx,8),%rcx
      64:   48 8b 34 f3             mov    (%rbx,%rsi,8),%rsi
      68:   62 f1 fd 48 6f 0c 11    vmovdqa64 (%rcx,%rdx,1),%zmm1
      6f:   62 f3 f5 48 25 04 16    vpternlogq $0x96,(%rsi,%rdx,1),%zmm1,%zmm0
      76:   96 
      77:   83 c0 02                add    $0x2,%eax
      7a:   39 f8                   cmp    %edi,%eax
      7c:   72 dd                   jb     5b <xor_gen_avx512bw+0x4b>

You could try unrolling by 512 bytes, which should help.

- Eric

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH] lib/raid/xor: x86: Add AVX-512 optimized xor_gen()
  2026-06-12  5:22 ` Christoph Hellwig
  2026-06-12  5:59   ` Eric Biggers
@ 2026-06-12  9:04   ` David Laight
  2026-06-13  0:27     ` Eric Biggers
  1 sibling, 1 reply; 6+ messages in thread
From: David Laight @ 2026-06-12  9:04 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Eric Biggers, Andrew Morton, linux-kernel, linux-crypto, x86,
	Andrea Mazzoleni

On Fri, 12 Jun 2026 07:22:47 +0200
Christoph Hellwig <hch@lst.de> wrote:

> On Thu, Jun 11, 2026 at 09:40:34PM -0700, Eric Biggers wrote:
> > Add an implementation of xor_gen() using AVX-512.  
> 
> > Benchmark on AMD Ryzen 9 9950X (Zen 5):  
> 
> Can you share the benchmark?
> 
> In my local tree I have ports of the AVX2 and AVX512 implementations
> from snapraid (https://github.com/amadvance/snapraid), which in userspace
> give really good performance.  On my laptop with an AMD Ryzen AI 7 PRO 350
> (which is a Zen5 with the slower double pumped AVX512 unit), both of
> them get over 1GB/s throughput on the snapraid benchmarks.  I've been
> holding them back as I don't have a good kernel benchmarking harness,
> and it's missing the quirks for old AVX512 or the newer AMD special
> cases.

From my experiments on Intel cpu (and I don't remember the zen-5 being
that different - but I've done less testing on it) you don't need to
unroll loops very much at all.

A reasonable model seems to be that the uops generated by the instruction
decoder get executed when all the prerequisite registers and the required
execution unit are available.
So for a memory copy (and the xor is basically a copy) the control loop
can run way ahead of the read/write instructions.
This means you can get the control loop 'for free' and unrolling further
makes no/little difference.

Each xor is two memory reads and one memory write.
The cpu I was using could only do one write/clock - so you can only do one
xor each clock. I think some of the newer ones can do two writes/clock but
I'm not sure how many reads/clock they can do - might still be 2, don't
think it's 4.
So you should be able to get one xor per clock, but I doubt you'll get two
(and possibly not even 1.3 - which would require 4 memory accesses per clock).

The best loop construct is the one that uses negative offsets from the
end of the buffers, basically:
	buf += len;
	offset = -len;
	do
		f(buf[offset]);
	while (offset += size);
that reduces the loop control to just an 'add' and 'jnz' (which can
get merged into a single u-op).

The cpu have enough execution units to execute two memory reads,
a memory write, an xor, the add and the jnz every clock.
So even the 'rolled up' loop might run at one xor per clock.
While I think I got a 'one clock loop' on my zen-5 (testing
word-at-a-time strlen) I only managed a two clock loop on the newest
Intel cpu I've got (which isn't that new).
So put two xor in the loop and it shouldn't be limited by the loop
control, but will be limited by the memory accesses instead.

Further unrolling shouldn't help and may make things worse.
The Intel cpu have logic to directly forward the result of an
ALU instruction into the next few instructions, but after that you can
get a stall because of the 'round trip' via the register file.
So part way down an unrolled nn(%reg) sequence you can get a stall.
An extra 'add $0,%reg' in the middle of the unrolled loop will
'refresh' the register and speed things up.
(I hit that with a loop that needed a rather more complicated control
structure.)

You definitely need to use the pmc clock counter and data dependencies
against the rdpmc instruction to get sensible performance figures.
They can reasonably reliably measure down to less than 20 clocks.

	David

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] lib/raid/xor: x86: Add AVX-512 optimized xor_gen()
  2026-06-12  9:04   ` David Laight
@ 2026-06-13  0:27     ` Eric Biggers
  2026-06-13  8:48       ` David Laight
  0 siblings, 1 reply; 6+ messages in thread
From: Eric Biggers @ 2026-06-13  0:27 UTC (permalink / raw)
  To: David Laight
  Cc: Christoph Hellwig, Andrew Morton, linux-kernel, linux-crypto, x86,
	Andrea Mazzoleni

On Fri, Jun 12, 2026 at 10:04:32AM +0100, David Laight wrote:
> On Fri, 12 Jun 2026 07:22:47 +0200
> Christoph Hellwig <hch@lst.de> wrote:
> 
> > On Thu, Jun 11, 2026 at 09:40:34PM -0700, Eric Biggers wrote:
> > > Add an implementation of xor_gen() using AVX-512.  
> > 
> > > Benchmark on AMD Ryzen 9 9950X (Zen 5):  
> > 
> > Can you share the benchmark?
> > 
> > In my local tree I have ports of the AVX2 and AVX512 implementations
> > from snapraid (https://github.com/amadvance/snapraid), which in userspace
> > give really good performance.  On my laptop with an AMD Ryzen AI 7 PRO 350
> > (which is a Zen5 with the slower double pumped AVX512 unit), both of
> > them get over 1GB/s throughput on the snapraid benchmarks.  I've been
> > holding them back as I don't have a good kernel benchmarking harness,
> > and it's missing the quirks for old AVX512 or the newer AMD special
> > cases.
> 
> From my experiments on Intel cpu (and I don't remember the zen-5 being
> that different - but I've done less testing on it) you don't need to
> unroll loops very much at all.
> 
> A reasonable model seems to be that the uops generated by the instruction
> decoder get executed when all the prerequisite registers and the required
> execution unit are available.
> So for a memory copy (and the xor is basically a copy) the control loop
> can run way ahead of the read/write instructions.
> This means you can get the control loop 'for free' and unrolling further
> makes no/little difference.
> 
> Each xor is two memory reads and one memory write.
> The cpu I was using could only do one write/clock - so you can only do one
> xor each clock. I think some of the newer ones can do two writes/clock but
> I'm not sure how many reads/clock they can do - might still be 2, don't
> think it's 4.
> So you should be able to get one xor per clock, but I doubt you'll get two
> (and possibly not even 1.3 - which would require 4 memory accesses per clock).
> 
> The best loop construct is the one that uses negative offsets from the
> end of the buffers, basically:
> 	buf += len;
> 	offset = -len;
> 	do
> 		f(buf[offset]);
> 	while (offset += size);
> that reduces the loop control to just an 'add' and 'jnz' (which can
> get merged into a single u-op).
> 
> The cpu have enough execution units to execute two memory reads,
> a memory write, an xor, the add and the jnz every clock.
> So even the 'rolled up' loop might run at one xor per clock.
> While I think I got a 'one clock loop' on my zen-5 (testing
> word-at-a-time strlen) I only managed a two clock loop on the newest
> Intel cpu I've got (which isn't that new).
> So put two xor in the loop and it shouldn't be limited by the loop
> control, but will be limited by the memory accesses instead.
> 
> Further unrolling shouldn't help and may make things worse.
> The Intel cpu have logic to directly forward the result of an
> ALU instruction into the next few instructions, but after that you can
> get a stall because of the 'round trip' via the register file.
> So part way down an unrolled nn(%reg) sequence you can get a stall.
> An extra 'add $0,%reg' in the middle of the unrolled loop will
> 'refresh' the register and speed things up.
> (I hit that with a loop that needed a rather more complicated control
> structure.)
> 
> You definitely need to use the pmc clock counter and data dependencies
> against the rdpmc instruction to get sensible performance figures.
> They can reasonably reliably measure down to less than 20 clocks.

The version at the end of this email is what you're suggesting, I think.
On Sapphire Rapids and Ryzen 9 9950X it's about the same speed as mine,
just a few percent slower on Sapphire with src_cnt == 1.

So we could use it.  It's just a bit fragile since it assumes the loop
overhead and indexed addressing will never be a bottleneck on any
current or future CPU.  Unrolling by more gives something more robust
that "just works", without having to analyze whether the loops are okay
on each CPU model individually based on microarchitectural details.

// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * AVX-512 optimized implementation of xor_gen()
 *
 * Copyright 2026 Google LLC
 */

#include <linux/compiler.h>
#include <linux/types.h>
#include <linux/unroll.h>
#include <asm/fpu/api.h>
#include "xor_impl.h"
#include "xor_arch.h"

/*
 * XOR one source buffer into the destination: p0[0..bytes) ^= p1[0..bytes).
 *
 * @bytes: number of bytes to process.  The loop advances 128 bytes per
 *	   iteration and only stops when the index reaches exactly zero, so
 *	   this must be a nonzero multiple of 128.
 * @p0:    destination buffer; read, then overwritten with the result.
 * @p1:    source buffer; only read.
 *
 * vmovdqa64 is the aligned move, so the p0 addresses it touches must be
 * 64-byte aligned.
 *
 * NOTE(review): zmm0/zmm1 are modified but not listed as asm clobbers.
 * Presumably this is fine because the code only runs between
 * kernel_fpu_begin() and kernel_fpu_end() and the compiler does not allocate
 * vector registers in kernel code -- confirm.
 */
static void xor_avx512_2(long bytes, u8 *p0, const u8 *p1)
{
	long i = -bytes; /* Use negative indexing to minimize loop overhead. */

	/*
	 * Point both pointers one past the end of their buffers; (pointer + i)
	 * then walks forward through the data as i counts up from -bytes to 0.
	 */
	p0 += bytes;
	p1 += bytes;
	/* NOTE(review): presumably asks the compiler not to unroll further. */
	unrolled_none
	do {
		/* unroll by 2x to reduce loop overhead */
		/*
		 * Operands: %0 = end of p0, %1 = end of p1, %2 = i.
		 * Load 2 x 64 bytes of p0, XOR in the matching bytes of p1,
		 * and store the result back to p0.  The "memory" clobber tells
		 * the compiler that memory is accessed behind its back.
		 */
		asm volatile("vmovdqa64 (%2,%0), %%zmm0\n"
			     "vmovdqa64 64(%2,%0), %%zmm1\n"
			     "vpxorq (%2,%1), %%zmm0, %%zmm0\n"
			     "vpxorq 64(%2,%1), %%zmm1, %%zmm1\n"
			     "vmovdqa64 %%zmm0, (%2,%0)\n"
			     "vmovdqa64 %%zmm1, 64(%2,%0)\n"
			     :
			     : "r"(p0), "r"(p1), "r"(i)
			     : "memory");
	} while ((i += 128) != 0); /* exits once i has climbed to exactly 0 */
}

/*
 * XOR two source buffers into the destination:
 * p0[0..bytes) ^= p1[0..bytes) ^ p2[0..bytes).
 *
 * @bytes: number of bytes to process.  The loop advances 128 bytes per
 *	   iteration and only stops when the index reaches exactly zero, so
 *	   this must be a nonzero multiple of 128.
 * @p0:    destination buffer; read, then overwritten with the result.
 * @p1:    first source buffer; only read.
 * @p2:    second source buffer; only read.
 *
 * p0 and p1 are accessed with vmovdqa64 (the aligned move), so those
 * addresses must be 64-byte aligned.
 *
 * NOTE(review): zmm0-zmm3 are modified but not listed as asm clobbers;
 * presumably acceptable because this only runs inside a
 * kernel_fpu_begin()/kernel_fpu_end() section -- confirm.
 */
static void xor_avx512_3(long bytes, u8 *p0, const u8 *p1, const u8 *p2)
{
	long i = -bytes; /* Use negative indexing to minimize loop overhead. */

	/* Point every pointer one past its buffer end; (pointer + i) indexes. */
	p0 += bytes;
	p1 += bytes;
	p2 += bytes;
	/* NOTE(review): presumably asks the compiler not to unroll further. */
	unrolled_none
	do {
		/* unroll by 2x to reduce loop overhead */
		/*
		 * Operands: %0 = end of p0, %1 = end of p1, %2 = end of p2,
		 * %3 = i.
		 * zmm0/zmm1 hold 2 x 64 bytes of p0, zmm2/zmm3 the matching
		 * bytes of p1.  vpternlogq with truth table 0x96 computes the
		 * three-input XOR dest ^ src1 ^ mem, folding p1 and p2 into
		 * the p0 data with one instruction per 64 bytes.
		 */
		asm volatile("vmovdqa64 (%3,%0), %%zmm0\n"
			     "vmovdqa64 64(%3,%0), %%zmm1\n"
			     "vmovdqa64 (%3,%1), %%zmm2\n"
			     "vmovdqa64 64(%3,%1), %%zmm3\n"
			     "vpternlogq $0x96, (%3,%2), %%zmm2, %%zmm0\n"
			     "vpternlogq $0x96, 64(%3,%2), %%zmm3, %%zmm1\n"
			     "vmovdqa64 %%zmm0, (%3,%0)\n"
			     "vmovdqa64 %%zmm1, 64(%3,%0)\n"
			     :
			     : "r"(p0), "r"(p1), "r"(p2), "r"(i)
			     : "memory");
	} while ((i += 128) != 0); /* exits once i has climbed to exactly 0 */
}

/*
 * XOR three source buffers into the destination:
 * p0[0..bytes) ^= p1[0..bytes) ^ p2[0..bytes) ^ p3[0..bytes).
 *
 * @bytes: number of bytes to process.  Unlike the 2- and 3-buffer variants
 *	   this loop is not unrolled: it advances 64 bytes per iteration and
 *	   only stops when the index reaches exactly zero, so this must be a
 *	   nonzero multiple of 64.
 * @p0:    destination buffer; read, then overwritten with the result.
 * @p1:    first source buffer; only read.
 * @p2:    second source buffer; only read.
 * @p3:    third source buffer; only read.
 *
 * p0 and p1 are accessed with vmovdqa64 (the aligned move), so those
 * addresses must be 64-byte aligned.
 *
 * NOTE(review): zmm0/zmm1 are modified but not listed as asm clobbers;
 * presumably acceptable because this only runs inside a
 * kernel_fpu_begin()/kernel_fpu_end() section -- confirm.
 */
static void xor_avx512_4(long bytes, u8 *p0, const u8 *p1, const u8 *p2,
			 const u8 *p3)
{
	long i = -bytes; /* Use negative indexing to minimize loop overhead. */

	/* Point every pointer one past its buffer end; (pointer + i) indexes. */
	p0 += bytes;
	p1 += bytes;
	p2 += bytes;
	p3 += bytes;
	/* NOTE(review): presumably asks the compiler not to unroll the loop. */
	unrolled_none
	do {
		/*
		 * Operands: %0..%3 = ends of p0..p3, %4 = i.
		 * zmm0 = p0; zmm1 = p1; zmm0 ^= p2 (two-input vpxorq); then
		 * vpternlogq 0x96 (three-input XOR) folds in zmm1 and p3:
		 * zmm0 = zmm0 ^ zmm1 ^ p3.  The result goes back to p0.
		 */
		asm volatile("vmovdqa64 (%4,%0), %%zmm0\n"
			     "vmovdqa64 (%4,%1), %%zmm1\n"
			     "vpxorq (%4,%2), %%zmm0, %%zmm0\n"
			     "vpternlogq $0x96, (%4,%3), %%zmm1, %%zmm0\n"
			     "vmovdqa64 %%zmm0, (%4,%0)\n"
			     :
			     : "r"(p0), "r"(p1), "r"(p2), "r"(p3), "r"(i)
			     : "memory");
	} while ((i += 64) != 0); /* exits once i has climbed to exactly 0 */
}

/*
 * XOR four source buffers into the destination:
 * p0[0..bytes) ^= p1[0..bytes) ^ p2[0..bytes) ^ p3[0..bytes) ^ p4[0..bytes).
 *
 * @bytes: number of bytes to process.  The loop advances 64 bytes per
 *	   iteration and only stops when the index reaches exactly zero, so
 *	   this must be a nonzero multiple of 64.
 * @p0:    destination buffer; read, then overwritten with the result.
 * @p1:    first source buffer; only read.
 * @p2:    second source buffer; only read.
 * @p3:    third source buffer; only read.
 * @p4:    fourth source buffer; only read.
 *
 * p0, p1 and p3 are accessed with vmovdqa64 (the aligned move), so those
 * addresses must be 64-byte aligned.
 *
 * NOTE(review): zmm0/zmm1 are modified but not listed as asm clobbers;
 * presumably acceptable because this only runs inside a
 * kernel_fpu_begin()/kernel_fpu_end() section -- confirm.
 */
static void xor_avx512_5(long bytes, u8 *p0, const u8 *p1, const u8 *p2,
			 const u8 *p3, const u8 *p4)
{
	long i = -bytes; /* Use negative indexing to minimize loop overhead. */

	/* Point every pointer one past its buffer end; (pointer + i) indexes. */
	p0 += bytes;
	p1 += bytes;
	p2 += bytes;
	p3 += bytes;
	p4 += bytes;
	/* NOTE(review): presumably asks the compiler not to unroll the loop. */
	unrolled_none
	do {
		/*
		 * Operands: %0..%4 = ends of p0..p4, %5 = i.
		 * Two three-input XORs (vpternlogq with truth table 0x96)
		 * cover all five buffers:
		 *   zmm0 = p0 ^ p1 ^ p2, then zmm0 = zmm0 ^ p3 ^ p4.
		 * zmm1 is reused as the scratch register for p1 and then p3.
		 */
		asm volatile("vmovdqa64 (%5,%0), %%zmm0\n"
			     "vmovdqa64 (%5,%1), %%zmm1\n"
			     "vpternlogq $0x96, (%5,%2), %%zmm1, %%zmm0\n"
			     "vmovdqa64 (%5,%3), %%zmm1\n"
			     "vpternlogq $0x96, (%5,%4), %%zmm1, %%zmm0\n"
			     "vmovdqa64 %%zmm0, (%5,%0)\n"
			     :
			     : "r"(p0), "r"(p1), "r"(p2), "r"(p3), "r"(p4),
			       "r"(i)
			     : "memory");
	} while ((i += 64) != 0); /* exits once i has climbed to exactly 0 */
}

/*
 * Hand the 2-, 3-, 4- and 5-buffer kernels to DO_XOR_BLOCKS (defined in a
 * header not shown here).  xor_gen_avx512() calls xor_gen_avx512_inner(), so
 * the macro presumably generates that function and has it dispatch to the
 * kernels listed here -- confirm against the macro definition.
 */
DO_XOR_BLOCKS(avx512_inner, xor_avx512_2, xor_avx512_3, xor_avx512_4,
	      xor_avx512_5);

/*
 * xor_gen callback for the AVX-512 implementation.
 *
 * Forwards all four arguments unchanged to xor_gen_avx512_inner(), wrapped in
 * kernel_fpu_begin()/kernel_fpu_end() so that the ZMM registers used by the
 * assembly kernels may be touched from kernel context.
 *
 * NOTE(review): dest/srcs/src_cnt presumably mean "destination buffer, array
 * of src_cnt source buffers"; their exact contract is defined by the xor_gen()
 * interface, which is not visible here.
 *
 * Preconditions: bytes is a nonzero multiple of 512, and all buffers are
 * 64-byte aligned.
 */
static void xor_gen_avx512(void *dest, void **srcs, unsigned int src_cnt,
			   unsigned int bytes)
{
	kernel_fpu_begin();
	xor_gen_avx512_inner(dest, srcs, src_cnt, bytes);
	kernel_fpu_end();
}

/*
 * Descriptor for this implementation: its name ("avx512") and its xor_gen
 * entry point.  It has external linkage, presumably so the x86 selection
 * code in xor_arch.h can reference it -- confirm.
 */
struct xor_block_template xor_block_avx512 = {
	.name = "avx512",
	.xor_gen = xor_gen_avx512,
};

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] lib/raid/xor: x86: Add AVX-512 optimized xor_gen()
  2026-06-13  0:27     ` Eric Biggers
@ 2026-06-13  8:48       ` David Laight
  0 siblings, 0 replies; 6+ messages in thread
From: David Laight @ 2026-06-13  8:48 UTC (permalink / raw)
  To: Eric Biggers
  Cc: Christoph Hellwig, Andrew Morton, linux-kernel, linux-crypto, x86,
	Andrea Mazzoleni

On Fri, 12 Jun 2026 17:27:04 -0700
Eric Biggers <ebiggers@kernel.org> wrote:

> On Fri, Jun 12, 2026 at 10:04:32AM +0100, David Laight wrote:
> > On Fri, 12 Jun 2026 07:22:47 +0200
> > Christoph Hellwig <hch@lst.de> wrote:
> >   
> > > On Thu, Jun 11, 2026 at 09:40:34PM -0700, Eric Biggers wrote:  
> > > > Add an implementation of xor_gen() using AVX-512.    
> > >   
> > > > Benchmark on AMD Ryzen 9 9950X (Zen 5):    
> > > 
> > > Can you share the benchmark?
> > > 
> > > In my local tree I have ports of the AVX2 and AVX512 implementations
> > > from snapraid (https://github.com/amadvance/snapraid), which in userspace
> > > give really good performance.  On my Laptop with a AMD Ryzen AI 7 PRO 350
> > > (which is a Zen5 with the slower double pumped AVX512 unit), both of
> > > them get over 1GB/s throughput on the snapraid benchmarks.  I've been
> > > holding them back as I don't have a good kernel benchmarking harness,
> > > and it's missing the quirks for old AVX512 or the newer AMD special
> > > cases.  
> > 
> > From my experiments on Intel cpu (and I don't remember the zen-5 being
> > that different - but I've done less testing on it) you don't need to
> > unroll loops very much at all.
> > 
> > A reasonable model seems to be that the uops generated by the instruction
> > decoder get executed when all the prerequisite registers and the required
> > execution unit are available.
> > So for a memory copy (and the xor is basically a copy) the control loop
> > can run way ahead of the read/write instructions.
> > This means you can get the control loop 'for free' and unrolling further
> > makes no/little difference.
> > 
> > Each xor is two memory reads and one memory write.
> > The cpu I was using could only do one write/clock - so you can only do one
> > xor each clock. I think some of the newer ones can do two writes/clock but
> > I'm not sure how many reads/clock they can do - might still be 2, don't
> > think it's 4.
> > So you should be able to get one xor per clock, but I doubt you'll get two
> > (and possibly not even 1.3 - which would require 4 memory accesses per clock).
> > 
> > The best loop construct is the one that uses negative offsets from the
> > end of the buffers, basically:
> > 	buf += len;
> > 	offset = -len;
> > 	do
> > 		f(buf[offset]);
> > 	while (offset += size);
> > that reduces the loop control to just an 'add' and 'jnz' (which can
> > get merged into a single u-op).
> > 
> > The cpu have enough execution units to execute two memory reads,
> > a memory write, an xor the add and jnz every clock.
> > So even the 'rolled up' loop might run at one xor per clock.
> > While I think I got a 'one clock loop' on my zen-5 (testing
> > word-at-a-time strlen) I only managed a two clock loop on the newest
> > Intel cpu I've got (which isn't that new).
> > So put two xor in the loop and it shouldn't be limited by the loop
> > control, but will be limited by the memory accesses instead.
> > 
> > Further unrolling shouldn't help and may make things worse.
> > The Intel cpu have logic to directly forward the result of an
> > ALU instruction into the next few instructions, but after that you can
> > get a stall because of the 'round trip' via the register file.
> > So part way down an unrolled nn(%reg) sequence you can get a stall.
> > An extra 'add $0,%reg' in the middle of the unrolled loop will
> > 'refresh' the register and speed things up.
> > (I hit that with a loop that needed a rather more complicated control
> > structure.)
> > 
> > You definitely need to use the pmc clock counter and data dependencies
> > against the rdpmc instruction to get sensible performance figures.
> > They can reasonably reliably measure down to less than 20 clocks.  
> 
> The version at the end of this email is what you're suggesting, I think.

Looks about right (I wouldn't have found 'vpternlogq $0x96').
Should be read limited on both cpu.
I think zen5 can do two avx-512 reads and (maybe or) two avx-512 writes per clock.
(zen4 reads take two clocks so you might as well use avx-256.)
Sapphire Rapids might be the same, but I recall some cpu supports 3 reads/clock.

> On Sapphire Rapids and Ryzen 9 9950X it's about the same speed as mine,
> just a few percent slower on Sapphire with src_cnt == 1.

Is that 512 bytes?
The minimum block size for these is 128 bytes.
As you know a smaller block size is generally better if you need to support
arbitrary lengths.

> 
> So we could use it.  It's just a bit fragile since it assumes the loop
> overhead and indexed addressing will never be a bottleneck on any
> current or future CPU.  Unrolling by more gives something more robust
> that "just works", without having to analyze whether the loops are okay
> on each CPU model individually based on microarchitectural details.

Unrolling further is likely to be slower in 'real life' because of the
effects on the I-cache.
Not to mention D-cache effects - the exact cache alignment can matter.
It is also unlikely that future cpu will be significantly slower.

The more usual problem is that some older cpu (usually zen1) ends up
running the code significantly slower than other algorithms.
That might matter for the avx-128 version of this code.

I might try putting these functions through some user-space clock count
measuring code I've written.
You've done the hard bit of getting the asm syntax right.

	David


> 
> // SPDX-License-Identifier: GPL-2.0-or-later
> /*
>  * AVX-512 optimized implementation of xor_gen()
>  *
>  * Copyright 2026 Google LLC
>  */
> 
> #include <linux/compiler.h>
> #include <linux/types.h>
> #include <linux/unroll.h>
> #include <asm/fpu/api.h>
> #include "xor_impl.h"
> #include "xor_arch.h"
> 
> static void xor_avx512_2(long bytes, u8 *p0, const u8 *p1)
> {
> 	long i = -bytes; /* Use negative indexing to minimize loop overhead. */
> 
> 	p0 += bytes;
> 	p1 += bytes;
> 	unrolled_none
> 	do {
> 		/* unroll by 2x to reduce loop overhead */
> 		asm volatile("vmovdqa64 (%2,%0), %%zmm0\n"
> 			     "vmovdqa64 64(%2,%0), %%zmm1\n"
> 			     "vpxorq (%2,%1), %%zmm0, %%zmm0\n"
> 			     "vpxorq 64(%2,%1), %%zmm1, %%zmm1\n"
> 			     "vmovdqa64 %%zmm0, (%2,%0)\n"
> 			     "vmovdqa64 %%zmm1, 64(%2,%0)\n"
> 			     :
> 			     : "r"(p0), "r"(p1), "r"(i)
> 			     : "memory");
> 	} while ((i += 128) != 0);
> }
> 
> static void xor_avx512_3(long bytes, u8 *p0, const u8 *p1, const u8 *p2)
> {
> 	long i = -bytes; /* Use negative indexing to minimize loop overhead. */
> 
> 	p0 += bytes;
> 	p1 += bytes;
> 	p2 += bytes;
> 	unrolled_none
> 	do {
> 		/* unroll by 2x to reduce loop overhead */
> 		asm volatile("vmovdqa64 (%3,%0), %%zmm0\n"
> 			     "vmovdqa64 64(%3,%0), %%zmm1\n"
> 			     "vmovdqa64 (%3,%1), %%zmm2\n"
> 			     "vmovdqa64 64(%3,%1), %%zmm3\n"
> 			     "vpternlogq $0x96, (%3,%2), %%zmm2, %%zmm0\n"
> 			     "vpternlogq $0x96, 64(%3,%2), %%zmm3, %%zmm1\n"
> 			     "vmovdqa64 %%zmm0, (%3,%0)\n"
> 			     "vmovdqa64 %%zmm1, 64(%3,%0)\n"
> 			     :
> 			     : "r"(p0), "r"(p1), "r"(p2), "r"(i)
> 			     : "memory");
> 	} while ((i += 128) != 0);
> }
> 
> static void xor_avx512_4(long bytes, u8 *p0, const u8 *p1, const u8 *p2,
> 			 const u8 *p3)
> {
> 	long i = -bytes; /* Use negative indexing to minimize loop overhead. */
> 
> 	p0 += bytes;
> 	p1 += bytes;
> 	p2 += bytes;
> 	p3 += bytes;
> 	unrolled_none
> 	do {
> 		asm volatile("vmovdqa64 (%4,%0), %%zmm0\n"
> 			     "vmovdqa64 (%4,%1), %%zmm1\n"
> 			     "vpxorq (%4,%2), %%zmm0, %%zmm0\n"
> 			     "vpternlogq $0x96, (%4,%3), %%zmm1, %%zmm0\n"
> 			     "vmovdqa64 %%zmm0, (%4,%0)\n"
> 			     :
> 			     : "r"(p0), "r"(p1), "r"(p2), "r"(p3), "r"(i)
> 			     : "memory");
> 	} while ((i += 64) != 0);
> }
> 
> static void xor_avx512_5(long bytes, u8 *p0, const u8 *p1, const u8 *p2,
> 			 const u8 *p3, const u8 *p4)
> {
> 	long i = -bytes; /* Use negative indexing to minimize loop overhead. */
> 
> 	p0 += bytes;
> 	p1 += bytes;
> 	p2 += bytes;
> 	p3 += bytes;
> 	p4 += bytes;
> 	unrolled_none
> 	do {
> 		asm volatile("vmovdqa64 (%5,%0), %%zmm0\n"
> 			     "vmovdqa64 (%5,%1), %%zmm1\n"
> 			     "vpternlogq $0x96, (%5,%2), %%zmm1, %%zmm0\n"
> 			     "vmovdqa64 (%5,%3), %%zmm1\n"
> 			     "vpternlogq $0x96, (%5,%4), %%zmm1, %%zmm0\n"
> 			     "vmovdqa64 %%zmm0, (%5,%0)\n"
> 			     :
> 			     : "r"(p0), "r"(p1), "r"(p2), "r"(p3), "r"(p4),
> 			       "r"(i)
> 			     : "memory");
> 	} while ((i += 64) != 0);
> }
> 
> DO_XOR_BLOCKS(avx512_inner, xor_avx512_2, xor_avx512_3, xor_avx512_4,
> 	      xor_avx512_5);
> 
> /*
>  * Preconditions: bytes is a nonzero multiple of 512, and all buffers are
>  * 64-byte aligned.
>  */
> static void xor_gen_avx512(void *dest, void **srcs, unsigned int src_cnt,
> 			   unsigned int bytes)
> {
> 	kernel_fpu_begin();
> 	xor_gen_avx512_inner(dest, srcs, src_cnt, bytes);
> 	kernel_fpu_end();
> }
> 
> struct xor_block_template xor_block_avx512 = {
> 	.name = "avx512",
> 	.xor_gen = xor_gen_avx512,
> };


^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2026-06-13  8:48 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-06-12  4:40 [PATCH] lib/raid/xor: x86: Add AVX-512 optimized xor_gen() Eric Biggers
2026-06-12  5:22 ` Christoph Hellwig
2026-06-12  5:59   ` Eric Biggers
2026-06-12  9:04   ` David Laight
2026-06-13  0:27     ` Eric Biggers
2026-06-13  8:48       ` David Laight

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox