* [PATCH] lib/raid/xor: x86: Add AVX-512 optimized xor_gen()
@ 2026-06-12 4:40 Eric Biggers
2026-06-12 5:22 ` Christoph Hellwig
0 siblings, 1 reply; 4+ messages in thread
From: Eric Biggers @ 2026-06-12 4:40 UTC (permalink / raw)
To: Andrew Morton, linux-kernel
Cc: Christoph Hellwig, linux-crypto, x86, Eric Biggers
Add an implementation of xor_gen() using AVX-512.
It uses 512-bit vectors, i.e. ZMM registers. It also uses the
vpternlogq instruction to do three-input XORs when applicable.
It's enabled on x86_64 CPUs that have AVX512F && !PREFER_YMM. In
practice that means:
- AMD Zen 4 and later (client and server)
- Intel Sapphire Rapids and later (server)
- Intel Rocket Lake (client)
- Intel Nova Lake and later (client)
The !PREFER_YMM condition excludes the older AVX-512 implementations in
Intel Skylake Server and Intel Ice Lake. They could run this code, but
they're known to have overly-eager downclocking when ZMM registers are
used. This is the same policy that the crypto and CRC code uses.
Benchmark on AMD Ryzen 9 9950X (Zen 5):
src_cnt avx2 avx512 Improvement
======= ========== ========== ===========
1 68423 MB/s 81940 MB/s 19%
2 56035 MB/s 74112 MB/s 32%
3 49396 MB/s 67011 MB/s 35%
4 43056 MB/s 60823 MB/s 41%
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
lib/raid/xor/Makefile | 2 +-
lib/raid/xor/x86/xor-avx512.c | 155 ++++++++++++++++++++++++++++++++++
lib/raid/xor/x86/xor_arch.h | 27 +++---
3 files changed, 172 insertions(+), 12 deletions(-)
create mode 100644 lib/raid/xor/x86/xor-avx512.c
diff --git a/lib/raid/xor/Makefile b/lib/raid/xor/Makefile
index 4d633dfd5b90..4af945861a51 100644
--- a/lib/raid/xor/Makefile
+++ b/lib/raid/xor/Makefile
@@ -26,11 +26,11 @@ xor-$(CONFIG_ALTIVEC) += powerpc/xor_vmx.o powerpc/xor_vmx_glue.o
xor-$(CONFIG_RISCV_ISA_V) += riscv/xor.o riscv/xor-glue.o
xor-$(CONFIG_SPARC32) += sparc/xor-sparc32.o
xor-$(CONFIG_SPARC64) += sparc/xor-sparc64.o sparc/xor-sparc64-glue.o
xor-$(CONFIG_S390) += s390/xor.o
xor-$(CONFIG_X86_32) += x86/xor-avx.o x86/xor-sse.o x86/xor-mmx.o
-xor-$(CONFIG_X86_64) += x86/xor-avx.o x86/xor-sse.o
+xor-$(CONFIG_X86_64) += x86/xor-avx.o x86/xor-sse.o x86/xor-avx512.o
obj-y += tests/
CFLAGS_arm/xor-neon.o += $(CC_FLAGS_FPU)
CFLAGS_REMOVE_arm/xor-neon.o += $(CC_FLAGS_NO_FPU)
diff --git a/lib/raid/xor/x86/xor-avx512.c b/lib/raid/xor/x86/xor-avx512.c
new file mode 100644
index 000000000000..d2b54aa2be98
--- /dev/null
+++ b/lib/raid/xor/x86/xor-avx512.c
@@ -0,0 +1,155 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * AVX-512 optimized implementation of xor_gen()
+ *
+ * Copyright 2026 Google LLC
+ */
+
+#include <linux/compiler.h>
+#include <linux/types.h>
+#include <asm/fpu/api.h>
+#include "xor_impl.h"
+#include "xor_arch.h"
+
+struct block64 {
+ u8 x[64];
+} __aligned(64);
+
+/*
+ * Use different registers for each unrolled iteration just in case it helps,
+ * though the hardware register renamer should make it unnecessary.
+ */
+
+#define DO_XOR2(i, reg0) \
+ asm volatile("vmovdqa64 %0, %%" reg0 "\n" \
+ "vpxorq %1, %%" reg0 ", %%" reg0 "\n" \
+ "vmovdqa64 %%" reg0 ", %0\n" \
+ : "+m"(p0[i]) \
+ : "m"(p1[i]))
+
+#define DO_XOR3(i, reg0, reg1) \
+ asm volatile("vmovdqa64 %0, %%" reg0 "\n" \
+ "vmovdqa64 %1, %%" reg1 "\n" \
+ "vpternlogq $0x96, %2, %%" reg1 ", %%" reg0 "\n" \
+ "vmovdqa64 %%" reg0 ", %0\n" \
+ : "+m"(p0[i]) \
+ : "m"(p1[i]), "m"(p2[i]))
+
+#define DO_XOR4(i, reg0, reg1) \
+ asm volatile("vmovdqa64 %0, %%" reg0 "\n" \
+ "vmovdqa64 %1, %%" reg1 "\n" \
+ "vpxorq %2, %%" reg0 ", %%" reg0 "\n" \
+ "vpternlogq $0x96, %3, %%" reg1 ", %%" reg0 "\n" \
+ "vmovdqa64 %%" reg0 ", %0\n" \
+ : "+m"(p0[i]) \
+ : "m"(p1[i]), "m"(p2[i]), "m"(p3[i]))
+
+#define DO_XOR5(i, reg0, reg1) \
+ asm volatile("vmovdqa64 %0, %%" reg0 "\n" \
+ "vmovdqa64 %1, %%" reg1 "\n" \
+ "vpternlogq $0x96, %2, %%" reg1 ", %%" reg0 "\n" \
+ "vmovdqa64 %3, %%" reg1 "\n" \
+ "vpternlogq $0x96, %4, %%" reg1 ", %%" reg0 "\n" \
+ "vmovdqa64 %%" reg0 ", %0\n" \
+ : "+m"(p0[i]) \
+ : "m"(p1[i]), "m"(p2[i]), "m"(p3[i]), "m"(p4[i]))
+
+static void xor_avx512_2(size_t bytes, struct block64 *p0,
+ const struct block64 *p1)
+{
+ do {
+ DO_XOR2(0, "zmm0");
+ DO_XOR2(1, "zmm1");
+ DO_XOR2(2, "zmm2");
+ DO_XOR2(3, "zmm3");
+ DO_XOR2(4, "zmm4");
+ DO_XOR2(5, "zmm5");
+ DO_XOR2(6, "zmm6");
+ DO_XOR2(7, "zmm7");
+ p0 += 512 / sizeof(*p0);
+ p1 += 512 / sizeof(*p1);
+ bytes -= 512;
+ } while (bytes);
+}
+
+static void xor_avx512_3(size_t bytes, struct block64 *p0,
+ const struct block64 *p1, const struct block64 *p2)
+{
+ do {
+ DO_XOR3(0, "zmm0", "zmm1");
+ DO_XOR3(1, "zmm2", "zmm3");
+ DO_XOR3(2, "zmm4", "zmm5");
+ DO_XOR3(3, "zmm6", "zmm7");
+ DO_XOR3(4, "zmm8", "zmm9");
+ DO_XOR3(5, "zmm10", "zmm11");
+ DO_XOR3(6, "zmm12", "zmm13");
+ DO_XOR3(7, "zmm14", "zmm15");
+ p0 += 512 / sizeof(*p0);
+ p1 += 512 / sizeof(*p1);
+ p2 += 512 / sizeof(*p2);
+ bytes -= 512;
+ } while (bytes);
+}
+
+static void xor_avx512_4(size_t bytes, struct block64 *p0,
+ const struct block64 *p1, const struct block64 *p2,
+ const struct block64 *p3)
+{
+ do {
+ DO_XOR4(0, "zmm0", "zmm1");
+ DO_XOR4(1, "zmm2", "zmm3");
+ DO_XOR4(2, "zmm4", "zmm5");
+ DO_XOR4(3, "zmm6", "zmm7");
+ DO_XOR4(4, "zmm8", "zmm9");
+ DO_XOR4(5, "zmm10", "zmm11");
+ DO_XOR4(6, "zmm12", "zmm13");
+ DO_XOR4(7, "zmm14", "zmm15");
+ p0 += 512 / sizeof(*p0);
+ p1 += 512 / sizeof(*p1);
+ p2 += 512 / sizeof(*p2);
+ p3 += 512 / sizeof(*p3);
+ bytes -= 512;
+ } while (bytes);
+}
+
+static void xor_avx512_5(size_t bytes, struct block64 *p0,
+ const struct block64 *p1, const struct block64 *p2,
+ const struct block64 *p3, const struct block64 *p4)
+{
+ do {
+ DO_XOR5(0, "zmm0", "zmm1");
+ DO_XOR5(1, "zmm2", "zmm3");
+ DO_XOR5(2, "zmm4", "zmm5");
+ DO_XOR5(3, "zmm6", "zmm7");
+ DO_XOR5(4, "zmm8", "zmm9");
+ DO_XOR5(5, "zmm10", "zmm11");
+ DO_XOR5(6, "zmm12", "zmm13");
+ DO_XOR5(7, "zmm14", "zmm15");
+ p0 += 512 / sizeof(*p0);
+ p1 += 512 / sizeof(*p1);
+ p2 += 512 / sizeof(*p2);
+ p3 += 512 / sizeof(*p3);
+ p4 += 512 / sizeof(*p4);
+ bytes -= 512;
+ } while (bytes);
+}
+
+DO_XOR_BLOCKS(avx512_inner, xor_avx512_2, xor_avx512_3, xor_avx512_4,
+ xor_avx512_5);
+
+/*
+ * Preconditions: bytes is a nonzero multiple of 512, and all buffers are
+ * 64-byte aligned.
+ */
+static void xor_gen_avx512(void *dest, void **srcs, unsigned int src_cnt,
+ unsigned int bytes)
+{
+ kernel_fpu_begin();
+ xor_gen_avx512_inner(dest, srcs, src_cnt, bytes);
+ kernel_fpu_end();
+}
+
+struct xor_block_template xor_block_avx512 = {
+ .name = "avx512",
+ .xor_gen = xor_gen_avx512,
+};
diff --git a/lib/raid/xor/x86/xor_arch.h b/lib/raid/xor/x86/xor_arch.h
index 99fe85a213c6..199124e32c27 100644
--- a/lib/raid/xor/x86/xor_arch.h
+++ b/lib/raid/xor/x86/xor_arch.h
@@ -1,29 +1,34 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
#include <asm/cpufeature.h>
+#include <asm/fpu/api.h>
extern struct xor_block_template xor_block_pII_mmx;
extern struct xor_block_template xor_block_p5_mmx;
extern struct xor_block_template xor_block_sse;
extern struct xor_block_template xor_block_sse_pf64;
extern struct xor_block_template xor_block_avx;
+extern struct xor_block_template xor_block_avx512;
-/*
- * When SSE is available, use it as it can write around L2. We may also be able
- * to load into the L1 only depending on how the cpu deals with a load to a line
- * that is being prefetched.
- *
- * When AVX2 is available, force using it as it is better by all measures.
- *
- * 32-bit without MMX can fall back to the generic routines.
- */
static __always_inline void __init arch_xor_init(void)
{
- if (boot_cpu_has(X86_FEATURE_AVX) &&
- boot_cpu_has(X86_FEATURE_OSXSAVE)) {
+ if (IS_ENABLED(CONFIG_X86_64) && boot_cpu_has(X86_FEATURE_AVX512F) &&
+ !boot_cpu_has(X86_FEATURE_PREFER_YMM) &&
+ cpu_has_xfeatures(XFEATURE_MASK_AVX512, NULL)) {
+ /* AVX-512 will be the best; no need to try others. */
+ /* !PREFER_YMM excludes CPUs with overly-eager downclocking. */
+ xor_force(&xor_block_avx512);
+ } else if (boot_cpu_has(X86_FEATURE_AVX) &&
+ boot_cpu_has(X86_FEATURE_OSXSAVE)) {
+ /* AVX will be the best; no need to try others. */
xor_force(&xor_block_avx);
} else if (IS_ENABLED(CONFIG_X86_64) || boot_cpu_has(X86_FEATURE_XMM)) {
+ /*
+ * When SSE is available, use it as it can write around L2. We
+ * may also be able to load into the L1 only depending on how
+ * the cpu deals with a load to a line that is being prefetched.
+ */
xor_register(&xor_block_sse);
xor_register(&xor_block_sse_pf64);
} else if (boot_cpu_has(X86_FEATURE_MMX)) {
xor_register(&xor_block_pII_mmx);
xor_register(&xor_block_p5_mmx);
base-commit: 9716c086c8e8b141d35aa61f2e96a2e83de212a7
--
2.54.0
^ permalink raw reply related [flat|nested] 4+ messages in thread
* Re: [PATCH] lib/raid/xor: x86: Add AVX-512 optimized xor_gen()
2026-06-12 4:40 [PATCH] lib/raid/xor: x86: Add AVX-512 optimized xor_gen() Eric Biggers
@ 2026-06-12 5:22 ` Christoph Hellwig
2026-06-12 5:59 ` Eric Biggers
2026-06-12 9:04 ` David Laight
0 siblings, 2 replies; 4+ messages in thread
From: Christoph Hellwig @ 2026-06-12 5:22 UTC (permalink / raw)
To: Eric Biggers
Cc: Andrew Morton, linux-kernel, Christoph Hellwig, linux-crypto, x86,
Andrea Mazzoleni
On Thu, Jun 11, 2026 at 09:40:34PM -0700, Eric Biggers wrote:
> Add an implementation of xor_gen() using AVX-512.
> Benchmark on AMD Ryzen 9 9950X (Zen 5):
Can you share the benchmark?
In my local tree I have ports of the AVX2 and AVX512 implementations
from snapraid (https://github.com/amadvance/snapraid), which in userspace
give really good performance. On my laptop with an AMD Ryzen AI 7 PRO 350
(which is a Zen5 with the slower double pumped AVX512 unit), both of
them get over 1GB/s throughput on the snapraid benchmarks. I've been
holding them back as I don't have a good kernel benchmarking harness,
and it's missing the quirks for old AVX512 or the newer AMD special
cases.
Attached for reference.
Note that either way I'd prefer if we could get away from the strange
old code organization with the DO{1-4} helpers which don't really
help.
diff --git a/lib/raid/xor/Makefile b/lib/raid/xor/Makefile
index 4d633dfd5b90..3d5ebeda241e 100644
--- a/lib/raid/xor/Makefile
+++ b/lib/raid/xor/Makefile
@@ -28,7 +28,7 @@ xor-$(CONFIG_SPARC32) += sparc/xor-sparc32.o
xor-$(CONFIG_SPARC64) += sparc/xor-sparc64.o sparc/xor-sparc64-glue.o
xor-$(CONFIG_S390) += s390/xor.o
xor-$(CONFIG_X86_32) += x86/xor-avx.o x86/xor-sse.o x86/xor-mmx.o
-xor-$(CONFIG_X86_64) += x86/xor-avx.o x86/xor-sse.o
+xor-$(CONFIG_X86_64) += x86/xor-avx512.o x86/xor-avx.o x86/xor-sse.o
obj-y += tests/
CFLAGS_arm/xor-neon.o += $(CC_FLAGS_FPU)
diff --git a/lib/raid/xor/x86/xor-avx.c b/lib/raid/xor/x86/xor-avx.c
index f7777d7aa269..cd376a7c52d3 100644
--- a/lib/raid/xor/x86/xor-avx.c
+++ b/lib/raid/xor/x86/xor-avx.c
@@ -1,152 +1,31 @@
-// SPDX-License-Identifier: GPL-2.0-only
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * Optimized XOR parity functions for AVX
- *
- * Copyright (C) 2012 Intel Corporation
- * Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
- *
- * Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines
+ * Copyright (C) 2026 Andrea Mazzoleni
*/
-#include <linux/compiler.h>
#include <asm/fpu/api.h>
#include "xor_impl.h"
#include "xor_arch.h"
-#define BLOCK4(i) \
- BLOCK(32 * i, 0) \
- BLOCK(32 * (i + 1), 1) \
- BLOCK(32 * (i + 2), 2) \
- BLOCK(32 * (i + 3), 3)
-
-#define BLOCK16() \
- BLOCK4(0) \
- BLOCK4(4) \
- BLOCK4(8) \
- BLOCK4(12)
-
-static void xor_avx_2(unsigned long bytes, unsigned long * __restrict p0,
- const unsigned long * __restrict p1)
-{
- unsigned long lines = bytes >> 9;
-
- while (lines--) {
-#undef BLOCK
-#define BLOCK(i, reg) \
-do { \
- asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p1[i / sizeof(*p1)])); \
- asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
- "m" (p0[i / sizeof(*p0)])); \
- asm volatile("vmovdqa %%ymm" #reg ", %0" : \
- "=m" (p0[i / sizeof(*p0)])); \
-} while (0);
-
- BLOCK16()
-
- p0 = (unsigned long *)((uintptr_t)p0 + 512);
- p1 = (unsigned long *)((uintptr_t)p1 + 512);
- }
-}
-
-static void xor_avx_3(unsigned long bytes, unsigned long * __restrict p0,
- const unsigned long * __restrict p1,
- const unsigned long * __restrict p2)
-{
- unsigned long lines = bytes >> 9;
-
- while (lines--) {
-#undef BLOCK
-#define BLOCK(i, reg) \
-do { \
- asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p2[i / sizeof(*p2)])); \
- asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
- "m" (p1[i / sizeof(*p1)])); \
- asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
- "m" (p0[i / sizeof(*p0)])); \
- asm volatile("vmovdqa %%ymm" #reg ", %0" : \
- "=m" (p0[i / sizeof(*p0)])); \
-} while (0);
-
- BLOCK16()
-
- p0 = (unsigned long *)((uintptr_t)p0 + 512);
- p1 = (unsigned long *)((uintptr_t)p1 + 512);
- p2 = (unsigned long *)((uintptr_t)p2 + 512);
- }
-}
-
-static void xor_avx_4(unsigned long bytes, unsigned long * __restrict p0,
- const unsigned long * __restrict p1,
- const unsigned long * __restrict p2,
- const unsigned long * __restrict p3)
-{
- unsigned long lines = bytes >> 9;
-
- while (lines--) {
-#undef BLOCK
-#define BLOCK(i, reg) \
-do { \
- asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p3[i / sizeof(*p3)])); \
- asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
- "m" (p2[i / sizeof(*p2)])); \
- asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
- "m" (p1[i / sizeof(*p1)])); \
- asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
- "m" (p0[i / sizeof(*p0)])); \
- asm volatile("vmovdqa %%ymm" #reg ", %0" : \
- "=m" (p0[i / sizeof(*p0)])); \
-} while (0);
-
- BLOCK16();
-
- p0 = (unsigned long *)((uintptr_t)p0 + 512);
- p1 = (unsigned long *)((uintptr_t)p1 + 512);
- p2 = (unsigned long *)((uintptr_t)p2 + 512);
- p3 = (unsigned long *)((uintptr_t)p3 + 512);
- }
-}
-
-static void xor_avx_5(unsigned long bytes, unsigned long * __restrict p0,
- const unsigned long * __restrict p1,
- const unsigned long * __restrict p2,
- const unsigned long * __restrict p3,
- const unsigned long * __restrict p4)
-{
- unsigned long lines = bytes >> 9;
-
- while (lines--) {
-#undef BLOCK
-#define BLOCK(i, reg) \
-do { \
- asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p4[i / sizeof(*p4)])); \
- asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
- "m" (p3[i / sizeof(*p3)])); \
- asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
- "m" (p2[i / sizeof(*p2)])); \
- asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
- "m" (p1[i / sizeof(*p1)])); \
- asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
- "m" (p0[i / sizeof(*p0)])); \
- asm volatile("vmovdqa %%ymm" #reg ", %0" : \
- "=m" (p0[i / sizeof(*p0)])); \
-} while (0);
-
- BLOCK16()
-
- p0 = (unsigned long *)((uintptr_t)p0 + 512);
- p1 = (unsigned long *)((uintptr_t)p1 + 512);
- p2 = (unsigned long *)((uintptr_t)p2 + 512);
- p3 = (unsigned long *)((uintptr_t)p3 + 512);
- p4 = (unsigned long *)((uintptr_t)p4 + 512);
- }
-}
-
-DO_XOR_BLOCKS(avx_inner, xor_avx_2, xor_avx_3, xor_avx_4, xor_avx_5);
-
static void xor_gen_avx(void *dest, void **srcs, unsigned int src_cnt,
unsigned int bytes)
{
+ u8 **v = (u8 **)srcs;
+ u8 *p = dest;
+ unsigned int i, d;
+
kernel_fpu_begin();
- xor_gen_avx_inner(dest, srcs, src_cnt, bytes);
+ for (i = 0; i < bytes; i += 64) {
+ asm volatile ("vmovdqa %0,%%ymm0" : : "m" (p[i]));
+ asm volatile ("vmovdqa %0,%%ymm1" : : "m" (p[i + 32]));
+ for (d = 0; d < src_cnt; ++d) {
+ asm volatile ("vpxor %0,%%ymm0,%%ymm0"
+ : : "m" (v[d][i]));
+ asm volatile ("vpxor %0,%%ymm1,%%ymm1"
+ : : "m" (v[d][i + 32]));
+ }
+ asm volatile ("vmovntdq %%ymm0,%0" : "=m" (p[i]));
+ asm volatile ("vmovntdq %%ymm1,%0" : "=m" (p[i + 32]));
+ }
kernel_fpu_end();
}
diff --git a/lib/raid/xor/x86/xor-avx512.c b/lib/raid/xor/x86/xor-avx512.c
new file mode 100644
index 000000000000..9b323a0e1821
--- /dev/null
+++ b/lib/raid/xor/x86/xor-avx512.c
@@ -0,0 +1,34 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2026 Andrea Mazzoleni
+ */
+#include <asm/fpu/api.h>
+#include "xor_impl.h"
+#include "xor_arch.h"
+
+static void xor_gen_avx512bw(void *dest, void **srcs, unsigned int src_cnt,
+ unsigned int bytes)
+{
+ unsigned int last = src_cnt - 1, i, d;
+ u8 **v = (u8 **)srcs;
+ u8 *p = dest;
+
+ kernel_fpu_begin();
+ for (i = 0; i < bytes; i += 64) {
+ asm volatile("vmovdqa64 %0,%%zmm0" : : "m" (p[i]));
+ for (d = 0; d < last; d += 2)
+ asm volatile("vmovdqa64 %0,%%zmm1\n\t"
+ "vpternlogq $0x96,%1,%%zmm1,%%zmm0"
+ : : "m" (v[d][i]), "m" (v[d + 1][i]));
+ if (d == last)
+ asm volatile("vpxorq %0,%%zmm0,%%zmm0"
+ : : "m" (v[last][i]));
+ asm volatile("vmovntdq %%zmm0,%0" : "=m" (p[i]));
+ }
+ kernel_fpu_end();
+}
+
+struct xor_block_template xor_block_avx512bw = {
+ .name = "avx512bw",
+ .xor_gen = xor_gen_avx512bw,
+};
diff --git a/lib/raid/xor/x86/xor_arch.h b/lib/raid/xor/x86/xor_arch.h
index 99fe85a213c6..73c81221fc01 100644
--- a/lib/raid/xor/x86/xor_arch.h
+++ b/lib/raid/xor/x86/xor_arch.h
@@ -6,6 +6,7 @@ extern struct xor_block_template xor_block_p5_mmx;
extern struct xor_block_template xor_block_sse;
extern struct xor_block_template xor_block_sse_pf64;
extern struct xor_block_template xor_block_avx;
+extern struct xor_block_template xor_block_avx512bw;
/*
* When SSE is available, use it as it can write around L2. We may also be able
@@ -20,7 +21,12 @@ static __always_inline void __init arch_xor_init(void)
{
if (boot_cpu_has(X86_FEATURE_AVX) &&
boot_cpu_has(X86_FEATURE_OSXSAVE)) {
- xor_force(&xor_block_avx);
+ if (boot_cpu_has(X86_FEATURE_AVX2) &&
+ boot_cpu_has(X86_FEATURE_AVX512F) &&
+ boot_cpu_has(X86_FEATURE_AVX512BW))
+ xor_force(&xor_block_avx512bw);
+ else
+ xor_force(&xor_block_avx);
} else if (IS_ENABLED(CONFIG_X86_64) || boot_cpu_has(X86_FEATURE_XMM)) {
xor_register(&xor_block_sse);
xor_register(&xor_block_sse_pf64);
^ permalink raw reply related [flat|nested] 4+ messages in thread
* Re: [PATCH] lib/raid/xor: x86: Add AVX-512 optimized xor_gen()
2026-06-12 5:22 ` Christoph Hellwig
@ 2026-06-12 5:59 ` Eric Biggers
2026-06-12 9:04 ` David Laight
1 sibling, 0 replies; 4+ messages in thread
From: Eric Biggers @ 2026-06-12 5:59 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Andrew Morton, linux-kernel, linux-crypto, x86, Andrea Mazzoleni
On Fri, Jun 12, 2026 at 07:22:47AM +0200, Christoph Hellwig wrote:
> On Thu, Jun 11, 2026 at 09:40:34PM -0700, Eric Biggers wrote:
> > Add an implementation of xor_gen() using AVX-512.
>
> > Benchmark on AMD Ryzen 9 9950X (Zen 5):
>
> Can you share the benchmark?
For now I had just hacked up do_xor_speed() as follows and changed
xor_force() to xor_register(). There should be a benchmark added to the
KUnit test similar to the one in the crypto and CRC tests, though.
diff --git a/lib/raid/xor/xor-core.c b/lib/raid/xor/xor-core.c
index bd4e6e434418..8c5814af03d5 100644
--- a/lib/raid/xor/xor-core.c
+++ b/lib/raid/xor/xor-core.c
@@ -76,15 +76,24 @@ void __init xor_force(struct xor_block_template *tmpl)
#define REPS 800U
static void __init
-do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2)
+do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2,
+ void *b3, void *b4, void *b5)
{
+ for (int src_cnt = 1; src_cnt <= 4; src_cnt++) {
int speed;
unsigned long reps;
ktime_t min, start, t0;
- void *srcs[1] = { b2 };
+ void *srcs[4] = { b2, b3, b4, b5 };
preempt_disable();
+ /* warm-up */
+ for (int i = 0; i < 8000; i++) {
+ mb(); /* prevent loop optimization */
+ tmpl->xor_gen(b1, srcs, src_cnt, BENCH_SIZE);
+ mb();
+ }
+
reps = 0;
t0 = ktime_get();
/* delay start until time has advanced */
@@ -92,7 +101,7 @@ do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2)
cpu_relax();
do {
mb(); /* prevent loop optimization */
- tmpl->xor_gen(b1, srcs, 1, BENCH_SIZE);
+ tmpl->xor_gen(b1, srcs, src_cnt, BENCH_SIZE);
mb();
} while (reps++ < REPS || (t0 = ktime_get()) == start);
min = ktime_sub(t0, start);
@@ -105,26 +114,30 @@ do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2)
pr_info(" %-16s: %5d MB/sec\n", tmpl->name, speed);
}
+}
static int __init calibrate_xor_blocks(void)
{
- void *b1, *b2;
+ void *b1, *b2, *b3, *b4, *b5;
struct xor_block_template *f, *fastest;
if (forced_template)
return 0;
- b1 = (void *) __get_free_pages(GFP_KERNEL, 2);
+ b1 = (void *) __get_free_pages(GFP_KERNEL, 4);
if (!b1) {
pr_warn("xor: Yikes! No memory available.\n");
return -ENOMEM;
}
b2 = b1 + 2*PAGE_SIZE + BENCH_SIZE;
+ b3 = b2 + 2*PAGE_SIZE + BENCH_SIZE;
+ b4 = b3 + 2*PAGE_SIZE + BENCH_SIZE;
+ b5 = b4 + 2*PAGE_SIZE + BENCH_SIZE;
pr_info("xor: measuring software checksum speed\n");
fastest = template_list;
for (f = template_list; f; f = f->next) {
- do_xor_speed(f, b1, b2);
+ do_xor_speed(f, b1, b2, b3, b4, b5);
if (f->speed > fastest->speed)
fastest = f;
}
> In my local tree I have ports of the AVX2 and AVX512 implementations
> from snapraid (https://github.com/amadvance/snapraid), which in userspace
> give really good performance. On my laptop with an AMD Ryzen AI 7 PRO 350
> (which is a Zen5 with the slower double pumped AVX512 unit), both of
> them get over 1GB/s throughput on the snapraid benchmarks. I've been
> holding them back as I don't have a good kernel benchmarking harness,
> and it's missing the quirks for old AVX512 or the newer AMD special
> cases.
>
> Attached for reference.
>
> Note that either way I'd prefer if we could get away from the strange
> old code organization with the DO{1-4} helpers which don't really
> help.
Well, doing the same on your avx512bw version and adding a column to my
table for it (by the way, I think it really just needs avx512f), I get:
src_cnt avx avx512 avx512bw
======= ========== ========== ==========
1 68423 MB/s 81940 MB/s 12067 MB/s
2 56035 MB/s 74112 MB/s 10958 MB/s
3 49396 MB/s 67011 MB/s 8608 MB/s
4 43056 MB/s 60823 MB/s 8069 MB/s
So, your version isn't great, I'm afraid. Making the inner loop be over
src_cnt does simplify the code a lot, but it destroys performance since
it turns into 9 instructions for each 64 bytes in each 3 buffers:
5b: 89 c1 mov %eax,%ecx
5d: 8d 70 01 lea 0x1(%rax),%esi
60: 48 8b 0c cb mov (%rbx,%rcx,8),%rcx
64: 48 8b 34 f3 mov (%rbx,%rsi,8),%rsi
68: 62 f1 fd 48 6f 0c 11 vmovdqa64 (%rcx,%rdx,1),%zmm1
6f: 62 f3 f5 48 25 04 16 vpternlogq $0x96,(%rsi,%rdx,1),%zmm1,%zmm0
76: 96
77: 83 c0 02 add $0x2,%eax
7a: 39 f8 cmp %edi,%eax
7c: 72 dd jb 5b <xor_gen_avx512bw+0x4b>
You could try unrolling by 512 bytes, which should help.
- Eric
^ permalink raw reply related [flat|nested] 4+ messages in thread
* Re: [PATCH] lib/raid/xor: x86: Add AVX-512 optimized xor_gen()
2026-06-12 5:22 ` Christoph Hellwig
2026-06-12 5:59 ` Eric Biggers
@ 2026-06-12 9:04 ` David Laight
1 sibling, 0 replies; 4+ messages in thread
From: David Laight @ 2026-06-12 9:04 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Eric Biggers, Andrew Morton, linux-kernel, linux-crypto, x86,
Andrea Mazzoleni
On Fri, 12 Jun 2026 07:22:47 +0200
Christoph Hellwig <hch@lst.de> wrote:
> On Thu, Jun 11, 2026 at 09:40:34PM -0700, Eric Biggers wrote:
> > Add an implementation of xor_gen() using AVX-512.
>
> > Benchmark on AMD Ryzen 9 9950X (Zen 5):
>
> Can you share the benchmark?
>
> In my local tree I have ports of the AVX2 and AVX512 implementations
> from snapraid (https://github.com/amadvance/snapraid), which in userspace
> give really good performance. On my laptop with an AMD Ryzen AI 7 PRO 350
> (which is a Zen5 with the slower double pumped AVX512 unit), both of
> them get over 1GB/s throughput on the snapraid benchmarks. I've been
> holding them back as I don't have a good kernel benchmarking harness,
> and it's missing the quirks for old AVX512 or the newer AMD special
> cases.
From my experiments on Intel cpu (and I don't remember the zen-5 being
that different - but I've done less testing on it) you don't need to
unroll loops very much at all.
A reasonable model seems to be that the uops generated by the instruction
decoder get executed when all the prerequisite registers and the required
execution unit are available.
So for a memory copy (and the xor is basically a copy) the control loop
can run way ahead of the read/write instructions.
This means you can get the control loop 'for free' and unrolling further
makes no/little difference.
Each xor is two memory reads and one memory write.
The cpu I was using could only do one write/clock - so you can only do one
xor each clock. I think some of the newer ones can do two writes/clock but
I'm not sure how many reads/clock they can do - might still be 2, don't
think it's 4.
So you should be able to get one xor per clock, but I doubt you'll get two
(and possibly not even 1.3 - which would require 4 memory accesses per clock).
The best loop construct is the one that uses negative offsets from the
end of the buffers, basically:
buf += len;
offset = -len;
do
f(buf[offset]);
while (offset += size);
that reduces the loop control to just an 'add' and 'jnz' (which can
get merged into a single u-op).
The CPU has enough execution units to execute two memory reads,
a memory write, an xor, the add and the jnz every clock.
So even the 'rolled up' loop might run at one xor per clock.
While I think I got a 'one clock loop' on my zen-5 (testing
word-at-a-time strlen) I only managed a two clock loop on the newest
Intel cpu I've got (which isn't that new).
So put two xor in the loop and it shouldn't be limited by the loop
control, but will be limited by the memory accesses instead.
Further unrolling shouldn't help and may make things worse.
Intel CPUs have logic to directly forward the result of an
ALU instruction into the next few instructions, but after that you can
get a stall because of the 'round trip' via the register file.
So part way down an unrolled nn(%reg) sequence you can get a stall.
An extra 'add $0,%reg' in the middle of the unrolled loop will
'refresh' the register and speed things up.
(I hit that with a loop that needed a rather more complicated control
structure.)
You definitely need to use the pmc clock counter and data dependencies
against the rdpmc instruction to get sensible performance figures.
That can reasonably reliably measure down to less than 20 clocks.
David
^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2026-06-12 9:04 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-06-12 4:40 [PATCH] lib/raid/xor: x86: Add AVX-512 optimized xor_gen() Eric Biggers
2026-06-12 5:22 ` Christoph Hellwig
2026-06-12 5:59 ` Eric Biggers
2026-06-12 9:04 ` David Laight
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox