* [PATCH] raid6: arm64: add SVE optimized implementation for syndrome generation
@ 2026-03-17 11:17 Demian Shulhan
2026-03-18 11:24 ` kernel test robot
0 siblings, 1 reply; 2+ messages in thread
From: Demian Shulhan @ 2026-03-17 11:17 UTC (permalink / raw)
To: Song Liu, Yu Kuai; +Cc: Li Nan, linux-raid, linux-kernel, Demian Shulhan
Implement Scalable Vector Extension (SVE) optimized routines for RAID6
syndrome generation and recovery on ARM64.
The SVE instruction set allows for variable vector lengths (from 128 to
2048 bits), scaling automatically with the hardware capabilities. This
implementation handles arbitrary SVE vector lengths using the `cntb`
instruction to determine the runtime vector length.
The implementation introduces `svex1`, `svex2`, and `svex4` algorithms.
The `svex4` algorithm utilizes loop unrolling by 4 blocks per iteration
and manual software pipelining (interleaving memory loads with XORs)
to minimize instruction dependency stalls and maximize CPU pipeline
utilization and memory bandwidth.
Performance was tested on an AWS Graviton3 (Neoverse-V1) instance which
features 256-bit SVE vector length. The `svex4` implementation outperforms
the existing 128-bit `neonx4` baseline for syndrome generation:
raid6: svex4 gen() 19688 MB/s
raid6: svex2 gen() 18610 MB/s
raid6: svex1 gen() 19254 MB/s
raid6: neonx8 gen() 18554 MB/s
raid6: neonx4 gen() 19612 MB/s
raid6: neonx2 gen() 16248 MB/s
raid6: neonx1 gen() 13591 MB/s
raid6: using algorithm svex4 gen() 19688 MB/s
raid6: .... xor() 11212 MB/s, rmw enabled
raid6: using neon recovery algorithm
Note that for the recovery path (`xor_syndrome`), NEON may still be
selected dynamically by the algorithm benchmark, as the recovery
workload is heavily memory-bound.
Signed-off-by: Demian Shulhan <demyansh@gmail.com>
---
include/linux/raid/pq.h | 3 +
lib/raid6/Makefile | 5 +
lib/raid6/algos.c | 5 +
lib/raid6/sve.c | 675 ++++++++++++++++++++++++++++++++++++++++
4 files changed, 688 insertions(+)
create mode 100644 lib/raid6/sve.c
diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h
index 2467b3be15c9..787cc57aea9d 100644
--- a/include/linux/raid/pq.h
+++ b/include/linux/raid/pq.h
@@ -140,6 +140,9 @@ extern const struct raid6_calls raid6_neonx1;
extern const struct raid6_calls raid6_neonx2;
extern const struct raid6_calls raid6_neonx4;
extern const struct raid6_calls raid6_neonx8;
+extern const struct raid6_calls raid6_svex1;
+extern const struct raid6_calls raid6_svex2;
+extern const struct raid6_calls raid6_svex4;
/* Algorithm list */
extern const struct raid6_calls * const raid6_algos[];
diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile
index 5be0a4e60ab1..6cdaa6f206fb 100644
--- a/lib/raid6/Makefile
+++ b/lib/raid6/Makefile
@@ -8,6 +8,7 @@ raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o
raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o \
vpermxor1.o vpermxor2.o vpermxor4.o vpermxor8.o
raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o recov_neon.o recov_neon_inner.o
+raid6_pq-$(CONFIG_ARM64_SVE) += sve.o
raid6_pq-$(CONFIG_S390) += s390vx8.o recov_s390xc.o
raid6_pq-$(CONFIG_LOONGARCH) += loongarch_simd.o recov_loongarch_simd.o
raid6_pq-$(CONFIG_RISCV_ISA_V) += rvv.o recov_rvv.o
@@ -67,6 +68,10 @@ CFLAGS_REMOVE_neon2.o += $(CC_FLAGS_NO_FPU)
CFLAGS_REMOVE_neon4.o += $(CC_FLAGS_NO_FPU)
CFLAGS_REMOVE_neon8.o += $(CC_FLAGS_NO_FPU)
CFLAGS_REMOVE_recov_neon_inner.o += $(CC_FLAGS_NO_FPU)
+
+CFLAGS_sve.o += $(CC_FLAGS_FPU)
+CFLAGS_REMOVE_sve.o += $(CC_FLAGS_NO_FPU)
+
targets += neon1.c neon2.c neon4.c neon8.c
$(obj)/neon%.c: $(src)/neon.uc $(src)/unroll.awk FORCE
$(call if_changed,unroll)
diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c
index 799e0e5eac26..0ae73c3a4be3 100644
--- a/lib/raid6/algos.c
+++ b/lib/raid6/algos.c
@@ -66,6 +66,11 @@ const struct raid6_calls * const raid6_algos[] = {
&raid6_neonx2,
&raid6_neonx1,
#endif
+#ifdef CONFIG_ARM64_SVE
+ &raid6_svex4,
+ &raid6_svex2,
+ &raid6_svex1,
+#endif
#ifdef CONFIG_LOONGARCH
#ifdef CONFIG_CPU_HAS_LASX
&raid6_lasx,
diff --git a/lib/raid6/sve.c b/lib/raid6/sve.c
new file mode 100644
index 000000000000..afcf46b89a3d
--- /dev/null
+++ b/lib/raid6/sve.c
@@ -0,0 +1,675 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * RAID-6 syndrome calculation using ARM SVE instructions
+ */
+
+#include <linux/raid/pq.h>
+
+#ifdef __KERNEL__
+#include <asm/simd.h>
+#include <linux/cpufeature.h>
+#else
+#define scoped_ksimd()
+#define system_supports_sve() (1)
+#endif
+
+static void raid6_sve1_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
+{
+ u8 **dptr = (u8 **)ptrs;
+ u8 *p, *q;
+ int z0 = disks - 3;
+
+ p = dptr[z0 + 1];
+ q = dptr[z0 + 2];
+
+ asm volatile(
+ ".arch armv8.2-a+sve\n"
+ "ptrue p0.b\n"
+ "cntb x3\n"
+ "mov w4, #0x1d\n"
+ "dup z4.b, w4\n"
+ "mov x5, #0\n"
+
+ "0:\n"
+ /* z0 is a 32-bit int: index via its w view with sxtw extend */
+ "ldr x6, [%[dptr], %w[z0], sxtw #3]\n"
+ "ld1b z0.b, p0/z, [x6, x5]\n"
+ "mov z1.d, z0.d\n"
+
+ "mov w7, %w[z0]\n"
+ "sub w7, w7, #1\n"
+
+ "1:\n"
+ "cmp w7, #0\n"
+ "blt 2f\n"
+
+ "mov z3.d, z1.d\n"
+ "asr z3.b, p0/m, z3.b, #7\n"
+ "lsl z1.b, p0/m, z1.b, #1\n"
+
+ "and z3.d, z3.d, z4.d\n"
+ "eor z1.d, z1.d, z3.d\n"
+
+ "sxtw x8, w7\n"
+ "ldr x6, [%[dptr], x8, lsl #3]\n"
+ "ld1b z2.b, p0/z, [x6, x5]\n"
+
+ "eor z1.d, z1.d, z2.d\n"
+ "eor z0.d, z0.d, z2.d\n"
+
+ "sub w7, w7, #1\n"
+ "b 1b\n"
+ "2:\n"
+
+ "st1b z0.b, p0, [%[p], x5]\n"
+ "st1b z1.b, p0, [%[q], x5]\n"
+
+ "add x5, x5, x3\n"
+ "cmp x5, %[bytes]\n"
+ "blt 0b\n"
+ :
+ : [dptr] "r" (dptr), [z0] "r" (z0), [bytes] "r" (bytes),
+ [p] "r" (p), [q] "r" (q)
+ : "memory", "cc", "p0", "x3", "x4", "x5", "x6", "x7", "x8",
+ "z0", "z1", "z2", "z3", "z4"
+ );
+}
+
+static void raid6_sve1_xor_syndrome_real(int disks, int start, int stop,
+ unsigned long bytes, void **ptrs)
+{
+ u8 **dptr = (u8 **)ptrs;
+ u8 *p, *q;
+ int z0 = stop;
+
+ p = dptr[disks - 2];
+ q = dptr[disks - 1];
+
+ asm volatile(
+ ".arch armv8.2-a+sve\n"
+ "ptrue p0.b\n"
+ "cntb x3\n"
+ "mov w4, #0x1d\n"
+ "dup z4.b, w4\n"
+ "mov x5, #0\n"
+
+ "0:\n"
+ /* z0 is a 32-bit int: index via its w view with sxtw extend */
+ "ldr x6, [%[dptr], %w[z0], sxtw #3]\n"
+ "ld1b z1.b, p0/z, [x6, x5]\n"
+ "ld1b z0.b, p0/z, [%[p], x5]\n"
+ "eor z0.d, z0.d, z1.d\n"
+
+ "mov w7, %w[z0]\n"
+ "sub w7, w7, #1\n"
+
+ "1:\n"
+ "cmp w7, %w[start]\n"
+ "blt 2f\n"
+
+ "mov z3.d, z1.d\n"
+ "asr z3.b, p0/m, z3.b, #7\n"
+ "lsl z1.b, p0/m, z1.b, #1\n"
+ "and z3.d, z3.d, z4.d\n"
+ "eor z1.d, z1.d, z3.d\n"
+
+ "sxtw x8, w7\n"
+ "ldr x6, [%[dptr], x8, lsl #3]\n"
+ "ld1b z2.b, p0/z, [x6, x5]\n"
+
+ "eor z1.d, z1.d, z2.d\n"
+ "eor z0.d, z0.d, z2.d\n"
+
+ "sub w7, w7, #1\n"
+ "b 1b\n"
+ "2:\n"
+
+ "mov w7, %w[start]\n"
+ "sub w7, w7, #1\n"
+ "3:\n"
+ "cmp w7, #0\n"
+ "blt 4f\n"
+
+ "mov z3.d, z1.d\n"
+ "asr z3.b, p0/m, z3.b, #7\n"
+ "lsl z1.b, p0/m, z1.b, #1\n"
+ "and z3.d, z3.d, z4.d\n"
+ "eor z1.d, z1.d, z3.d\n"
+
+ "sub w7, w7, #1\n"
+ "b 3b\n"
+ "4:\n"
+
+ "ld1b z2.b, p0/z, [%[q], x5]\n"
+ "eor z1.d, z1.d, z2.d\n"
+
+ "st1b z0.b, p0, [%[p], x5]\n"
+ "st1b z1.b, p0, [%[q], x5]\n"
+
+ "add x5, x5, x3\n"
+ "cmp x5, %[bytes]\n"
+ "blt 0b\n"
+ :
+ : [dptr] "r" (dptr), [z0] "r" (z0), [bytes] "r" (bytes),
+ [p] "r" (p), [q] "r" (q), [start] "r" (start)
+ : "memory", "cc", "p0", "x3", "x4", "x5", "x6", "x7", "x8",
+ "z0", "z1", "z2", "z3", "z4"
+ );
+}
+
+static void raid6_sve2_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
+{
+ u8 **dptr = (u8 **)ptrs;
+ u8 *p, *q;
+ int z0 = disks - 3;
+
+ p = dptr[z0 + 1];
+ q = dptr[z0 + 2];
+
+ asm volatile(
+ ".arch armv8.2-a+sve\n"
+ "ptrue p0.b\n"
+ "cntb x3\n"
+ "mov w4, #0x1d\n"
+ "dup z4.b, w4\n"
+ "mov x5, #0\n"
+
+ "0:\n"
+ /* z0 is a 32-bit int: index via its w view with sxtw extend */
+ "ldr x6, [%[dptr], %w[z0], sxtw #3]\n"
+ "ld1b z0.b, p0/z, [x6, x5]\n"
+ "add x8, x5, x3\n"
+ "ld1b z5.b, p0/z, [x6, x8]\n"
+ "mov z1.d, z0.d\n"
+ "mov z6.d, z5.d\n"
+
+ "mov w7, %w[z0]\n"
+ "sub w7, w7, #1\n"
+
+ "1:\n"
+ "cmp w7, #0\n"
+ "blt 2f\n"
+
+ "mov z3.d, z1.d\n"
+ "asr z3.b, p0/m, z3.b, #7\n"
+ "lsl z1.b, p0/m, z1.b, #1\n"
+ "and z3.d, z3.d, z4.d\n"
+ "eor z1.d, z1.d, z3.d\n"
+
+ "mov z8.d, z6.d\n"
+ "asr z8.b, p0/m, z8.b, #7\n"
+ "lsl z6.b, p0/m, z6.b, #1\n"
+ "and z8.d, z8.d, z4.d\n"
+ "eor z6.d, z6.d, z8.d\n"
+
+ "sxtw x8, w7\n"
+ "ldr x6, [%[dptr], x8, lsl #3]\n"
+ "ld1b z2.b, p0/z, [x6, x5]\n"
+ "add x8, x5, x3\n"
+ "ld1b z7.b, p0/z, [x6, x8]\n"
+
+ "eor z1.d, z1.d, z2.d\n"
+ "eor z0.d, z0.d, z2.d\n"
+
+ "eor z6.d, z6.d, z7.d\n"
+ "eor z5.d, z5.d, z7.d\n"
+
+ "sub w7, w7, #1\n"
+ "b 1b\n"
+ "2:\n"
+
+ "st1b z0.b, p0, [%[p], x5]\n"
+ "st1b z1.b, p0, [%[q], x5]\n"
+ "add x8, x5, x3\n"
+ "st1b z5.b, p0, [%[p], x8]\n"
+ "st1b z6.b, p0, [%[q], x8]\n"
+
+ "add x5, x5, x3\n"
+ "add x5, x5, x3\n"
+ "cmp x5, %[bytes]\n"
+ "blt 0b\n"
+ :
+ : [dptr] "r" (dptr), [z0] "r" (z0), [bytes] "r" (bytes),
+ [p] "r" (p), [q] "r" (q)
+ : "memory", "cc", "p0", "x3", "x4", "x5", "x6", "x7", "x8",
+ "z0", "z1", "z2", "z3", "z4",
+ "z5", "z6", "z7", "z8"
+ );
+}
+
+static void raid6_sve2_xor_syndrome_real(int disks, int start, int stop,
+ unsigned long bytes, void **ptrs)
+{
+ u8 **dptr = (u8 **)ptrs;
+ u8 *p, *q;
+ int z0 = stop;
+
+ p = dptr[disks - 2];
+ q = dptr[disks - 1];
+
+ asm volatile(
+ ".arch armv8.2-a+sve\n"
+ "ptrue p0.b\n"
+ "cntb x3\n"
+ "mov w4, #0x1d\n"
+ "dup z4.b, w4\n"
+ "mov x5, #0\n"
+
+ "0:\n"
+ /* z0 is a 32-bit int: index via its w view with sxtw extend */
+ "ldr x6, [%[dptr], %w[z0], sxtw #3]\n"
+ "ld1b z1.b, p0/z, [x6, x5]\n"
+ "add x8, x5, x3\n"
+ "ld1b z6.b, p0/z, [x6, x8]\n"
+
+ "ld1b z0.b, p0/z, [%[p], x5]\n"
+ "ld1b z5.b, p0/z, [%[p], x8]\n"
+
+ "eor z0.d, z0.d, z1.d\n"
+ "eor z5.d, z5.d, z6.d\n"
+
+ "mov w7, %w[z0]\n"
+ "sub w7, w7, #1\n"
+
+ "1:\n"
+ "cmp w7, %w[start]\n"
+ "blt 2f\n"
+
+ "mov z3.d, z1.d\n"
+ "asr z3.b, p0/m, z3.b, #7\n"
+ "lsl z1.b, p0/m, z1.b, #1\n"
+ "and z3.d, z3.d, z4.d\n"
+ "eor z1.d, z1.d, z3.d\n"
+
+ "mov z8.d, z6.d\n"
+ "asr z8.b, p0/m, z8.b, #7\n"
+ "lsl z6.b, p0/m, z6.b, #1\n"
+ "and z8.d, z8.d, z4.d\n"
+ "eor z6.d, z6.d, z8.d\n"
+
+ "sxtw x8, w7\n"
+ "ldr x6, [%[dptr], x8, lsl #3]\n"
+ "ld1b z2.b, p0/z, [x6, x5]\n"
+ "add x8, x5, x3\n"
+ "ld1b z7.b, p0/z, [x6, x8]\n"
+
+ "eor z1.d, z1.d, z2.d\n"
+ "eor z0.d, z0.d, z2.d\n"
+
+ "eor z6.d, z6.d, z7.d\n"
+ "eor z5.d, z5.d, z7.d\n"
+
+ "sub w7, w7, #1\n"
+ "b 1b\n"
+ "2:\n"
+
+ "mov w7, %w[start]\n"
+ "sub w7, w7, #1\n"
+ "3:\n"
+ "cmp w7, #0\n"
+ "blt 4f\n"
+
+ "mov z3.d, z1.d\n"
+ "asr z3.b, p0/m, z3.b, #7\n"
+ "lsl z1.b, p0/m, z1.b, #1\n"
+ "and z3.d, z3.d, z4.d\n"
+ "eor z1.d, z1.d, z3.d\n"
+
+ "mov z8.d, z6.d\n"
+ "asr z8.b, p0/m, z8.b, #7\n"
+ "lsl z6.b, p0/m, z6.b, #1\n"
+ "and z8.d, z8.d, z4.d\n"
+ "eor z6.d, z6.d, z8.d\n"
+
+ "sub w7, w7, #1\n"
+ "b 3b\n"
+ "4:\n"
+
+ "ld1b z2.b, p0/z, [%[q], x5]\n"
+ "eor z1.d, z1.d, z2.d\n"
+ "st1b z0.b, p0, [%[p], x5]\n"
+ "st1b z1.b, p0, [%[q], x5]\n"
+
+ "add x8, x5, x3\n"
+ "ld1b z7.b, p0/z, [%[q], x8]\n"
+ "eor z6.d, z6.d, z7.d\n"
+ "st1b z5.b, p0, [%[p], x8]\n"
+ "st1b z6.b, p0, [%[q], x8]\n"
+
+ "add x5, x5, x3\n"
+ "add x5, x5, x3\n"
+ "cmp x5, %[bytes]\n"
+ "blt 0b\n"
+ :
+ : [dptr] "r" (dptr), [z0] "r" (z0), [bytes] "r" (bytes),
+ [p] "r" (p), [q] "r" (q), [start] "r" (start)
+ : "memory", "cc", "p0", "x3", "x4", "x5", "x6", "x7", "x8",
+ "z0", "z1", "z2", "z3", "z4",
+ "z5", "z6", "z7", "z8"
+ );
+}
+
+static void raid6_sve4_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
+{
+ u8 **dptr = (u8 **)ptrs;
+ u8 *p, *q;
+ int z0 = disks - 3;
+
+ p = dptr[z0 + 1];
+ q = dptr[z0 + 2];
+
+ asm volatile(
+ ".arch armv8.2-a+sve\n"
+ "ptrue p0.b\n"
+ "cntb x3\n"
+ "mov w4, #0x1d\n"
+ "dup z4.b, w4\n"
+ "mov x5, #0\n"
+
+ "0:\n"
+ /* z0 is a 32-bit int: index via its w view with sxtw extend */
+ "ldr x6, [%[dptr], %w[z0], sxtw #3]\n"
+ "ld1b z0.b, p0/z, [x6, x5]\n"
+ "add x8, x5, x3\n"
+ "ld1b z5.b, p0/z, [x6, x8]\n"
+ "add x8, x8, x3\n"
+ "ld1b z10.b, p0/z, [x6, x8]\n"
+ "add x8, x8, x3\n"
+ "ld1b z15.b, p0/z, [x6, x8]\n"
+
+ "mov z1.d, z0.d\n"
+ "mov z6.d, z5.d\n"
+ "mov z11.d, z10.d\n"
+ "mov z16.d, z15.d\n"
+
+ "mov w7, %w[z0]\n"
+ "sub w7, w7, #1\n"
+
+ "1:\n"
+ "cmp w7, #0\n"
+ "blt 2f\n"
+
+ // software pipelining: load data early
+ "sxtw x8, w7\n"
+ "ldr x6, [%[dptr], x8, lsl #3]\n"
+ "ld1b z2.b, p0/z, [x6, x5]\n"
+ "add x8, x5, x3\n"
+ "ld1b z7.b, p0/z, [x6, x8]\n"
+ "add x8, x8, x3\n"
+ "ld1b z12.b, p0/z, [x6, x8]\n"
+ "add x8, x8, x3\n"
+ "ld1b z17.b, p0/z, [x6, x8]\n"
+
+ // math block 1
+ "mov z3.d, z1.d\n"
+ "asr z3.b, p0/m, z3.b, #7\n"
+ "lsl z1.b, p0/m, z1.b, #1\n"
+ "and z3.d, z3.d, z4.d\n"
+ "eor z1.d, z1.d, z3.d\n"
+ "eor z1.d, z1.d, z2.d\n"
+ "eor z0.d, z0.d, z2.d\n"
+
+ // math block 2
+ "mov z8.d, z6.d\n"
+ "asr z8.b, p0/m, z8.b, #7\n"
+ "lsl z6.b, p0/m, z6.b, #1\n"
+ "and z8.d, z8.d, z4.d\n"
+ "eor z6.d, z6.d, z8.d\n"
+ "eor z6.d, z6.d, z7.d\n"
+ "eor z5.d, z5.d, z7.d\n"
+
+ // math block 3
+ "mov z13.d, z11.d\n"
+ "asr z13.b, p0/m, z13.b, #7\n"
+ "lsl z11.b, p0/m, z11.b, #1\n"
+ "and z13.d, z13.d, z4.d\n"
+ "eor z11.d, z11.d, z13.d\n"
+ "eor z11.d, z11.d, z12.d\n"
+ "eor z10.d, z10.d, z12.d\n"
+
+ // math block 4
+ "mov z18.d, z16.d\n"
+ "asr z18.b, p0/m, z18.b, #7\n"
+ "lsl z16.b, p0/m, z16.b, #1\n"
+ "and z18.d, z18.d, z4.d\n"
+ "eor z16.d, z16.d, z18.d\n"
+ "eor z16.d, z16.d, z17.d\n"
+ "eor z15.d, z15.d, z17.d\n"
+
+ "sub w7, w7, #1\n"
+ "b 1b\n"
+ "2:\n"
+
+ "st1b z0.b, p0, [%[p], x5]\n"
+ "st1b z1.b, p0, [%[q], x5]\n"
+ "add x8, x5, x3\n"
+ "st1b z5.b, p0, [%[p], x8]\n"
+ "st1b z6.b, p0, [%[q], x8]\n"
+ "add x8, x8, x3\n"
+ "st1b z10.b, p0, [%[p], x8]\n"
+ "st1b z11.b, p0, [%[q], x8]\n"
+ "add x8, x8, x3\n"
+ "st1b z15.b, p0, [%[p], x8]\n"
+ "st1b z16.b, p0, [%[q], x8]\n"
+
+ "add x8, x3, x3\n"
+ "add x5, x5, x8, lsl #1\n"
+ "cmp x5, %[bytes]\n"
+ "blt 0b\n"
+ :
+ : [dptr] "r" (dptr), [z0] "r" (z0), [bytes] "r" (bytes),
+ [p] "r" (p), [q] "r" (q)
+ : "memory", "cc", "p0", "x3", "x4", "x5", "x6", "x7", "x8",
+ "z0", "z1", "z2", "z3", "z4",
+ "z5", "z6", "z7", "z8",
+ "z10", "z11", "z12", "z13",
+ "z15", "z16", "z17", "z18"
+ );
+}
+
+static void raid6_sve4_xor_syndrome_real(int disks, int start, int stop,
+ unsigned long bytes, void **ptrs)
+{
+ u8 **dptr = (u8 **)ptrs;
+ u8 *p, *q;
+ int z0 = stop;
+
+ p = dptr[disks - 2];
+ q = dptr[disks - 1];
+
+ asm volatile(
+ ".arch armv8.2-a+sve\n"
+ "ptrue p0.b\n"
+ "cntb x3\n"
+ "mov w4, #0x1d\n"
+ "dup z4.b, w4\n"
+ "mov x5, #0\n"
+
+ "0:\n"
+ /* z0 is a 32-bit int: index via its w view with sxtw extend */
+ "ldr x6, [%[dptr], %w[z0], sxtw #3]\n"
+ "ld1b z1.b, p0/z, [x6, x5]\n"
+ "add x8, x5, x3\n"
+ "ld1b z6.b, p0/z, [x6, x8]\n"
+ "add x8, x8, x3\n"
+ "ld1b z11.b, p0/z, [x6, x8]\n"
+ "add x8, x8, x3\n"
+ "ld1b z16.b, p0/z, [x6, x8]\n"
+
+ "ld1b z0.b, p0/z, [%[p], x5]\n"
+ "add x8, x5, x3\n"
+ "ld1b z5.b, p0/z, [%[p], x8]\n"
+ "add x8, x8, x3\n"
+ "ld1b z10.b, p0/z, [%[p], x8]\n"
+ "add x8, x8, x3\n"
+ "ld1b z15.b, p0/z, [%[p], x8]\n"
+
+ "eor z0.d, z0.d, z1.d\n"
+ "eor z5.d, z5.d, z6.d\n"
+ "eor z10.d, z10.d, z11.d\n"
+ "eor z15.d, z15.d, z16.d\n"
+
+ "mov w7, %w[z0]\n"
+ "sub w7, w7, #1\n"
+
+ "1:\n"
+ "cmp w7, %w[start]\n"
+ "blt 2f\n"
+
+ // software pipelining: load data early
+ "sxtw x8, w7\n"
+ "ldr x6, [%[dptr], x8, lsl #3]\n"
+ "ld1b z2.b, p0/z, [x6, x5]\n"
+ "add x8, x5, x3\n"
+ "ld1b z7.b, p0/z, [x6, x8]\n"
+ "add x8, x8, x3\n"
+ "ld1b z12.b, p0/z, [x6, x8]\n"
+ "add x8, x8, x3\n"
+ "ld1b z17.b, p0/z, [x6, x8]\n"
+
+ // math block 1
+ "mov z3.d, z1.d\n"
+ "asr z3.b, p0/m, z3.b, #7\n"
+ "lsl z1.b, p0/m, z1.b, #1\n"
+ "and z3.d, z3.d, z4.d\n"
+ "eor z1.d, z1.d, z3.d\n"
+ "eor z1.d, z1.d, z2.d\n"
+ "eor z0.d, z0.d, z2.d\n"
+
+ // math block 2
+ "mov z8.d, z6.d\n"
+ "asr z8.b, p0/m, z8.b, #7\n"
+ "lsl z6.b, p0/m, z6.b, #1\n"
+ "and z8.d, z8.d, z4.d\n"
+ "eor z6.d, z6.d, z8.d\n"
+ "eor z6.d, z6.d, z7.d\n"
+ "eor z5.d, z5.d, z7.d\n"
+
+ // math block 3
+ "mov z13.d, z11.d\n"
+ "asr z13.b, p0/m, z13.b, #7\n"
+ "lsl z11.b, p0/m, z11.b, #1\n"
+ "and z13.d, z13.d, z4.d\n"
+ "eor z11.d, z11.d, z13.d\n"
+ "eor z11.d, z11.d, z12.d\n"
+ "eor z10.d, z10.d, z12.d\n"
+
+ // math block 4
+ "mov z18.d, z16.d\n"
+ "asr z18.b, p0/m, z18.b, #7\n"
+ "lsl z16.b, p0/m, z16.b, #1\n"
+ "and z18.d, z18.d, z4.d\n"
+ "eor z16.d, z16.d, z18.d\n"
+ "eor z16.d, z16.d, z17.d\n"
+ "eor z15.d, z15.d, z17.d\n"
+
+ "sub w7, w7, #1\n"
+ "b 1b\n"
+ "2:\n"
+
+ "mov w7, %w[start]\n"
+ "sub w7, w7, #1\n"
+ "3:\n"
+ "cmp w7, #0\n"
+ "blt 4f\n"
+
+ // math block 1
+ "mov z3.d, z1.d\n"
+ "asr z3.b, p0/m, z3.b, #7\n"
+ "lsl z1.b, p0/m, z1.b, #1\n"
+ "and z3.d, z3.d, z4.d\n"
+ "eor z1.d, z1.d, z3.d\n"
+
+ // math block 2
+ "mov z8.d, z6.d\n"
+ "asr z8.b, p0/m, z8.b, #7\n"
+ "lsl z6.b, p0/m, z6.b, #1\n"
+ "and z8.d, z8.d, z4.d\n"
+ "eor z6.d, z6.d, z8.d\n"
+
+ // math block 3
+ "mov z13.d, z11.d\n"
+ "asr z13.b, p0/m, z13.b, #7\n"
+ "lsl z11.b, p0/m, z11.b, #1\n"
+ "and z13.d, z13.d, z4.d\n"
+ "eor z11.d, z11.d, z13.d\n"
+
+ // math block 4
+ "mov z18.d, z16.d\n"
+ "asr z18.b, p0/m, z18.b, #7\n"
+ "lsl z16.b, p0/m, z16.b, #1\n"
+ "and z18.d, z18.d, z4.d\n"
+ "eor z16.d, z16.d, z18.d\n"
+
+ "sub w7, w7, #1\n"
+ "b 3b\n"
+ "4:\n"
+
+ // Load q and XOR
+ "ld1b z2.b, p0/z, [%[q], x5]\n"
+ "add x8, x5, x3\n"
+ "ld1b z7.b, p0/z, [%[q], x8]\n"
+ "add x8, x8, x3\n"
+ "ld1b z12.b, p0/z, [%[q], x8]\n"
+ "add x8, x8, x3\n"
+ "ld1b z17.b, p0/z, [%[q], x8]\n"
+
+ "eor z1.d, z1.d, z2.d\n"
+ "eor z6.d, z6.d, z7.d\n"
+ "eor z11.d, z11.d, z12.d\n"
+ "eor z16.d, z16.d, z17.d\n"
+
+ // Store results
+ "st1b z0.b, p0, [%[p], x5]\n"
+ "st1b z1.b, p0, [%[q], x5]\n"
+ "add x8, x5, x3\n"
+ "st1b z5.b, p0, [%[p], x8]\n"
+ "st1b z6.b, p0, [%[q], x8]\n"
+ "add x8, x8, x3\n"
+ "st1b z10.b, p0, [%[p], x8]\n"
+ "st1b z11.b, p0, [%[q], x8]\n"
+ "add x8, x8, x3\n"
+ "st1b z15.b, p0, [%[p], x8]\n"
+ "st1b z16.b, p0, [%[q], x8]\n"
+
+ "add x8, x3, x3\n"
+ "add x5, x5, x8, lsl #1\n"
+ "cmp x5, %[bytes]\n"
+ "blt 0b\n"
+ :
+ : [dptr] "r" (dptr), [z0] "r" (z0), [bytes] "r" (bytes),
+ [p] "r" (p), [q] "r" (q), [start] "r" (start)
+ : "memory", "cc", "p0", "x3", "x4", "x5", "x6", "x7", "x8",
+ "z0", "z1", "z2", "z3", "z4",
+ "z5", "z6", "z7", "z8",
+ "z10", "z11", "z12", "z13",
+ "z15", "z16", "z17", "z18"
+ );
+}
+
+/*
+ * RAID6_SVE_WRAPPER(_n) - emit the raid6_svex<n> algorithm descriptor.
+ *
+ * The generated gen/xor wrappers bracket the asm-based *_real() routines
+ * above with scoped_ksimd(); in the userspace test build scoped_ksimd()
+ * expands to nothing (see the fallback #define at the top of this file).
+ */
+#define RAID6_SVE_WRAPPER(_n) \
+ static void raid6_sve ## _n ## _gen_syndrome(int disks, \
+ size_t bytes, void **ptrs) \
+ { \
+ scoped_ksimd() \
+ raid6_sve ## _n ## _gen_syndrome_real(disks, \
+ (unsigned long)bytes, ptrs); \
+ } \
+ static void raid6_sve ## _n ## _xor_syndrome(int disks, \
+ int start, int stop, \
+ size_t bytes, void **ptrs) \
+ { \
+ scoped_ksimd() \
+ raid6_sve ## _n ## _xor_syndrome_real(disks, \
+ start, stop, (unsigned long)bytes, ptrs);\
+ } \
+ struct raid6_calls const raid6_svex ## _n = { \
+ raid6_sve ## _n ## _gen_syndrome, \
+ raid6_sve ## _n ## _xor_syndrome, \
+ raid6_have_sve, \
+ "svex" #_n, \
+ 0 \
+ }
+
+/* Availability check used by the algorithm table (always true in userspace). */
+static int raid6_have_sve(void)
+{
+ return system_supports_sve();
+}
+
+RAID6_SVE_WRAPPER(1);
+RAID6_SVE_WRAPPER(2);
+RAID6_SVE_WRAPPER(4);
--
2.43.0
^ permalink raw reply related [flat|nested] 2+ messages in thread* Re: [PATCH] raid6: arm64: add SVE optimized implementation for syndrome generation
2026-03-17 11:17 [PATCH] raid6: arm64: add SVE optimized implementation for syndrome generation Demian Shulhan
@ 2026-03-18 11:24 ` kernel test robot
0 siblings, 0 replies; 2+ messages in thread
From: kernel test robot @ 2026-03-18 11:24 UTC (permalink / raw)
To: Demian Shulhan, Song Liu, Yu Kuai
Cc: llvm, oe-kbuild-all, Li Nan, linux-raid, linux-kernel,
Demian Shulhan
Hi Demian,
kernel test robot noticed the following build warnings:
[auto build test WARNING on akpm-mm/mm-nonmm-unstable]
[also build test WARNING on linus/master v7.0-rc4 next-20260317]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Demian-Shulhan/raid6-arm64-add-SVE-optimized-implementation-for-syndrome-generation/20260317-224300
base: https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-nonmm-unstable
patch link: https://lore.kernel.org/r/20260317111706.2756977-1-demyansh%40gmail.com
patch subject: [PATCH] raid6: arm64: add SVE optimized implementation for syndrome generation
config: arm64-allmodconfig (https://download.01.org/0day-ci/archive/20260318/202603181940.cFwYmYoi-lkp@intel.com/config)
compiler: clang version 19.1.7 (https://github.com/llvm/llvm-project cd708029e0b2869e80abe31ddb175f7c35361f90)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260318/202603181940.cFwYmYoi-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202603181940.cFwYmYoi-lkp@intel.com/
All warnings (new ones prefixed by >>):
>> lib/raid6/sve.c:70:34: warning: value size does not match register size specified by the constraint and modifier [-Wasm-operand-widths]
70 | : [dptr] "r" (dptr), [z0] "r" (z0), [bytes] "r" (bytes),
| ^
lib/raid6/sve.c:34:22: note: use constraint modifier "w"
34 | "ldr x6, [%[dptr], %[z0], lsl #3]\n"
| ^~~~~
| %w[z0]
lib/raid6/sve.c:151:34: warning: value size does not match register size specified by the constraint and modifier [-Wasm-operand-widths]
151 | : [dptr] "r" (dptr), [z0] "r" (z0), [bytes] "r" (bytes),
| ^
lib/raid6/sve.c:96:22: note: use constraint modifier "w"
96 | "ldr x6, [%[dptr], %[z0], lsl #3]\n"
| ^~~~~
| %w[z0]
lib/raid6/sve.c:229:34: warning: value size does not match register size specified by the constraint and modifier [-Wasm-operand-widths]
229 | : [dptr] "r" (dptr), [z0] "r" (z0), [bytes] "r" (bytes),
| ^
lib/raid6/sve.c:176:22: note: use constraint modifier "w"
176 | "ldr x6, [%[dptr], %[z0], lsl #3]\n"
| ^~~~~
| %w[z0]
lib/raid6/sve.c:340:34: warning: value size does not match register size specified by the constraint and modifier [-Wasm-operand-widths]
340 | : [dptr] "r" (dptr), [z0] "r" (z0), [bytes] "r" (bytes),
| ^
lib/raid6/sve.c:256:22: note: use constraint modifier "w"
256 | "ldr x6, [%[dptr], %[z0], lsl #3]\n"
| ^~~~~
| %w[z0]
lib/raid6/sve.c:455:34: warning: value size does not match register size specified by the constraint and modifier [-Wasm-operand-widths]
455 | : [dptr] "r" (dptr), [z0] "r" (z0), [bytes] "r" (bytes),
| ^
lib/raid6/sve.c:366:22: note: use constraint modifier "w"
366 | "ldr x6, [%[dptr], %[z0], lsl #3]\n"
| ^~~~~
| %w[z0]
lib/raid6/sve.c:634:34: warning: value size does not match register size specified by the constraint and modifier [-Wasm-operand-widths]
634 | : [dptr] "r" (dptr), [z0] "r" (z0), [bytes] "r" (bytes),
| ^
lib/raid6/sve.c:484:22: note: use constraint modifier "w"
484 | "ldr x6, [%[dptr], %[z0], lsl #3]\n"
| ^~~~~
| %w[z0]
6 warnings generated.
vim +70 lib/raid6/sve.c
15
16 static void raid6_sve1_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
17 {
18 u8 **dptr = (u8 **)ptrs;
19 u8 *p, *q;
20 int z0 = disks - 3;
21
22 p = dptr[z0 + 1];
23 q = dptr[z0 + 2];
24
25 asm volatile(
26 ".arch armv8.2-a+sve\n"
27 "ptrue p0.b\n"
28 "cntb x3\n"
29 "mov w4, #0x1d\n"
30 "dup z4.b, w4\n"
31 "mov x5, #0\n"
32
33 "0:\n"
34 "ldr x6, [%[dptr], %[z0], lsl #3]\n"
35 "ld1b z0.b, p0/z, [x6, x5]\n"
36 "mov z1.d, z0.d\n"
37
38 "mov w7, %w[z0]\n"
39 "sub w7, w7, #1\n"
40
41 "1:\n"
42 "cmp w7, #0\n"
43 "blt 2f\n"
44
45 "mov z3.d, z1.d\n"
46 "asr z3.b, p0/m, z3.b, #7\n"
47 "lsl z1.b, p0/m, z1.b, #1\n"
48
49 "and z3.d, z3.d, z4.d\n"
50 "eor z1.d, z1.d, z3.d\n"
51
52 "sxtw x8, w7\n"
53 "ldr x6, [%[dptr], x8, lsl #3]\n"
54 "ld1b z2.b, p0/z, [x6, x5]\n"
55
56 "eor z1.d, z1.d, z2.d\n"
57 "eor z0.d, z0.d, z2.d\n"
58
59 "sub w7, w7, #1\n"
60 "b 1b\n"
61 "2:\n"
62
63 "st1b z0.b, p0, [%[p], x5]\n"
64 "st1b z1.b, p0, [%[q], x5]\n"
65
66 "add x5, x5, x3\n"
67 "cmp x5, %[bytes]\n"
68 "blt 0b\n"
69 :
> 70 : [dptr] "r" (dptr), [z0] "r" (z0), [bytes] "r" (bytes),
71 [p] "r" (p), [q] "r" (q)
72 : "memory", "p0", "x3", "x4", "x5", "x6", "x7", "x8",
73 "z0", "z1", "z2", "z3", "z4"
74 );
75 }
76
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2026-03-18 11:24 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-03-17 11:17 [PATCH] raid6: arm64: add SVE optimized implementation for syndrome generation Demian Shulhan
2026-03-18 11:24 ` kernel test robot
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox