* [PATCH] lib/crc: arm64: add NEON accelerated CRC64-NVMe implementation
@ 2026-03-17 6:54 Demian Shulhan
2026-03-19 19:09 ` Eric Biggers
` (3 more replies)
0 siblings, 4 replies; 9+ messages in thread
From: Demian Shulhan @ 2026-03-17 6:54 UTC (permalink / raw)
To: ebiggers, ardb; +Cc: linux-crypto, linux-kernel, Demian Shulhan
Implement an optimized CRC64 (NVMe) algorithm for ARM64 using NEON
Polynomial Multiply Long (PMULL) instructions. The generic shift-and-XOR
software implementation is slow, which creates a bottleneck in NVMe and
other storage subsystems.
The acceleration is implemented using C intrinsics (<arm_neon.h>) rather
than raw assembly for better readability and maintainability.
Key highlights of this implementation:
- Uses 4KB chunking inside scoped_ksimd() to avoid preemption latency
spikes on large buffers.
- Pre-calculates and loads fold constants via vld1q_u64() to minimize
register spilling.
- Benchmarks show the break-even point against the generic implementation
is around 128 bytes. The PMULL path is enabled only for len >= 128.
- Safely falls back to the generic implementation on Big-Endian systems.
Performance results (kunit crc_benchmark on Cortex-A72):
- Generic (len=4096): ~268 MB/s
- PMULL (len=4096): ~1556 MB/s (nearly 6x improvement)
Signed-off-by: Demian Shulhan <demyansh@gmail.com>
---
lib/crc/Kconfig | 1 +
lib/crc/Makefile | 4 ++
lib/crc/arm64/crc64-neon-inner.c | 82 ++++++++++++++++++++++++++++++++
lib/crc/arm64/crc64.h | 35 ++++++++++++++
4 files changed, 122 insertions(+)
create mode 100644 lib/crc/arm64/crc64-neon-inner.c
create mode 100644 lib/crc/arm64/crc64.h
diff --git a/lib/crc/Kconfig b/lib/crc/Kconfig
index 70e7a6016de3..6b6c7d9f5ea5 100644
--- a/lib/crc/Kconfig
+++ b/lib/crc/Kconfig
@@ -82,6 +82,7 @@ config CRC64
config CRC64_ARCH
bool
depends on CRC64 && CRC_OPTIMIZATIONS
+ default y if ARM64 && KERNEL_MODE_NEON
default y if RISCV && RISCV_ISA_ZBC && 64BIT
default y if X86_64
diff --git a/lib/crc/Makefile b/lib/crc/Makefile
index 7543ad295ab6..552760f28003 100644
--- a/lib/crc/Makefile
+++ b/lib/crc/Makefile
@@ -38,6 +38,10 @@ obj-$(CONFIG_CRC64) += crc64.o
crc64-y := crc64-main.o
ifeq ($(CONFIG_CRC64_ARCH),y)
CFLAGS_crc64-main.o += -I$(src)/$(SRCARCH)
+CFLAGS_REMOVE_arm64/crc64-neon-inner.o += -mgeneral-regs-only
+CFLAGS_arm64/crc64-neon-inner.o += -ffreestanding -march=armv8-a+crypto
+CFLAGS_arm64/crc64-neon-inner.o += -isystem $(shell $(CC) -print-file-name=include)
+crc64-$(CONFIG_ARM64) += arm64/crc64-neon-inner.o
crc64-$(CONFIG_RISCV) += riscv/crc64_lsb.o riscv/crc64_msb.o
crc64-$(CONFIG_X86) += x86/crc64-pclmul.o
endif
diff --git a/lib/crc/arm64/crc64-neon-inner.c b/lib/crc/arm64/crc64-neon-inner.c
new file mode 100644
index 000000000000..beefdec5456b
--- /dev/null
+++ b/lib/crc/arm64/crc64-neon-inner.c
@@ -0,0 +1,82 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Accelerated CRC64 (NVMe) using ARM NEON C intrinsics
+ */
+
+#include <linux/types.h>
+#include <linux/crc64.h>
+#ifdef CONFIG_ARM64
+#include <asm/neon-intrinsics.h>
+#else
+#include <arm_neon.h>
+#endif
+
+#define GET_P64_0(v) ((poly64_t)vgetq_lane_u64(vreinterpretq_u64_p64(v), 0))
+#define GET_P64_1(v) ((poly64_t)vgetq_lane_u64(vreinterpretq_u64_p64(v), 1))
+
+static const u64 fold_consts_val[2] = {0xeadc41fd2ba3d420ULL, 0x21e9761e252621acULL};
+static const u64 bconsts_val[2] = {0x27ecfa329aef9f77ULL, 0x34d926535897936aULL};
+
+u64 crc64_nvme_arm64_c(u64 crc, const u8 *p, size_t len)
+{
+ if (len == 0)
+ return crc;
+
+ uint64x2_t v0_u64 = {crc, 0};
+ poly64x2_t v0 = vreinterpretq_p64_u64(v0_u64);
+
+ poly64x2_t fold_consts = vreinterpretq_p64_u64(vld1q_u64(fold_consts_val));
+
+ if (len >= 16) {
+ poly64x2_t v1 = vreinterpretq_p64_u8(vld1q_u8(p));
+
+ v0 = vreinterpretq_p64_u8(veorq_u8(vreinterpretq_u8_p64(v0),
+ vreinterpretq_u8_p64(v1)));
+ p += 16;
+ len -= 16;
+
+ while (len >= 16) {
+ v1 = vreinterpretq_p64_u8(vld1q_u8(p));
+
+ poly128_t v2 = vmull_high_p64(fold_consts, v0);
+ poly128_t v0_128 = vmull_p64(GET_P64_0(fold_consts), GET_P64_0(v0));
+
+ uint8x16_t x0 = veorq_u8(vreinterpretq_u8_p128(v0_128),
+ vreinterpretq_u8_p128(v2));
+
+ x0 = veorq_u8(x0, vreinterpretq_u8_p64(v1));
+ v0 = vreinterpretq_p64_u8(x0);
+
+ p += 16;
+ len -= 16;
+ }
+
+ poly64x2_t v7 = vreinterpretq_p64_u64((uint64x2_t){0, 0});
+
+ poly128_t v1_128 = vmull_p64(GET_P64_1(fold_consts), GET_P64_0(v0));
+
+ uint8x16_t ext_v0 = vextq_u8(vreinterpretq_u8_p64(v0), vreinterpretq_u8_p64(v7), 8);
+ uint8x16_t x0 = veorq_u8(ext_v0, vreinterpretq_u8_p128(v1_128));
+
+ v0 = vreinterpretq_p64_u8(x0);
+
+ poly64x2_t bconsts = vreinterpretq_p64_u64(vld1q_u64(bconsts_val));
+
+ v1_128 = vmull_p64(GET_P64_0(bconsts), GET_P64_0(v0));
+
+ poly64x2_t v1_64 = vreinterpretq_p64_u8(vreinterpretq_u8_p128(v1_128));
+ poly128_t v3_128 = vmull_p64(GET_P64_1(bconsts), GET_P64_0(v1_64));
+
+ x0 = veorq_u8(vreinterpretq_u8_p64(v0), vreinterpretq_u8_p128(v3_128));
+
+ uint8x16_t ext_v2 = vextq_u8(vreinterpretq_u8_p64(v7),
+ vreinterpretq_u8_p128(v1_128), 8);
+
+ x0 = veorq_u8(x0, ext_v2);
+
+ v0 = vreinterpretq_p64_u8(x0);
+ crc = vgetq_lane_u64(vreinterpretq_u64_p64(v0), 1);
+ }
+
+ return crc;
+}
diff --git a/lib/crc/arm64/crc64.h b/lib/crc/arm64/crc64.h
new file mode 100644
index 000000000000..12b1a8bd518a
--- /dev/null
+++ b/lib/crc/arm64/crc64.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * CRC64 using ARM64 PMULL instructions
+ */
+#ifndef _ARM64_CRC64_H
+#define _ARM64_CRC64_H
+
+#include <asm/cpufeature.h>
+#include <asm/simd.h>
+#include <linux/minmax.h>
+#include <linux/sizes.h>
+
+u64 crc64_nvme_arm64_c(u64 crc, const u8 *p, size_t len);
+
+#define crc64_be_arch crc64_be_generic
+
+static inline u64 crc64_nvme_arch(u64 crc, const u8 *p, size_t len)
+{
+ if (!IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) && len >= 128 &&
+ cpu_have_named_feature(PMULL) && likely(may_use_simd())) {
+ while (len >= 128) {
+ size_t chunk = min_t(size_t, len & ~15, SZ_4K);
+
+ scoped_ksimd() {
+ crc = crc64_nvme_arm64_c(crc, p, chunk);
+ }
+ p += chunk;
+ len -= chunk;
+ }
+ }
+ return crc64_nvme_generic(crc, p, len);
+}
+
+#endif /* _ARM64_CRC64_H */
+
--
2.43.0
^ permalink raw reply related [flat|nested] 9+ messages in thread
* Re: [PATCH] lib/crc: arm64: add NEON accelerated CRC64-NVMe implementation
2026-03-17 6:54 [PATCH] lib/crc: arm64: add NEON accelerated CRC64-NVMe implementation Demian Shulhan
@ 2026-03-19 19:09 ` Eric Biggers
2026-03-20 10:36 ` David Laight
2026-03-19 23:31 ` David Laight
` (2 subsequent siblings)
3 siblings, 1 reply; 9+ messages in thread
From: Eric Biggers @ 2026-03-19 19:09 UTC (permalink / raw)
To: Demian Shulhan; +Cc: ardb, linux-crypto, linux-kernel
On Tue, Mar 17, 2026 at 06:54:25AM +0000, Demian Shulhan wrote:
> Implement an optimized CRC64 (NVMe) algorithm for ARM64 using NEON
> Polynomial Multiply Long (PMULL) instructions. The generic shift-and-XOR
> software implementation is slow, which creates a bottleneck in NVMe and
> other storage subsystems.
>
> The acceleration is implemented using C intrinsics (<arm_neon.h>) rather
> than raw assembly for better readability and maintainability.
>
> Key highlights of this implementation:
> - Uses 4KB chunking inside scoped_ksimd() to avoid preemption latency
> spikes on large buffers.
> - Pre-calculates and loads fold constants via vld1q_u64() to minimize
> register spilling.
> - Benchmarks show the break-even point against the generic implementation
> is around 128 bytes. The PMULL path is enabled only for len >= 128.
> - Safely falls back to the generic implementation on Big-Endian systems.
>
> Performance results (kunit crc_benchmark on Cortex-A72):
> - Generic (len=4096): ~268 MB/s
> - PMULL (len=4096): ~1556 MB/s (nearly 6x improvement)
>
> Signed-off-by: Demian Shulhan <demyansh@gmail.com>
Thanks! I'm planning to accept this once the relatively minor comments
later on in this email are addressed.
But just FYI, having separate code for each CRC variant isn't very
sustainable. CRC-T10DIF, CRC64-NVME, and CRC64-BE should all have
similar PMULL based implementations. x86 and riscv solve this using a
"template" that supports all CRC variants. I'd like to eventually bring
a similar solution to arm64 (and arm) as well.
So while this code is fine for now, later I'd like to replace it with
something more general, like x86 and riscv have now. Then we can
optimize CRC-T10DIF, CRC64-NVME, and CRC64-BE together.
E.g., consider that the CRC64-NVME code added by the patch folds across at
most 1 vector. That's much less optimized than the existing CRC-T10DIF
code in lib/crc/arm64/crc-t10dif-core.S, which folds across 8. If we
used a unified approach, we could optimize these CRC variants together.
As for intrinsics vs. assembly: the kernel usually uses assembly.
However, I'm supportive of starting to use intrinsics more, and this is a
good start. But we'll need to keep an eye out for any compiler issues.
Various fairly minor comments below:
> diff --git a/lib/crc/Kconfig b/lib/crc/Kconfig
> index 70e7a6016de3..6b6c7d9f5ea5 100644
> --- a/lib/crc/Kconfig
> +++ b/lib/crc/Kconfig
> @@ -82,6 +82,7 @@ config CRC64
> config CRC64_ARCH
> bool
> depends on CRC64 && CRC_OPTIMIZATIONS
> + default y if ARM64 && KERNEL_MODE_NEON
Just "default y if ARM64". KERNEL_MODE_NEON is always enabled on ARM64.
Changes have already been submitted to remove the existing checks of
KERNEL_MODE_NEON in ARM64-specific code in lib/crc/ and lib/crypto/.
> diff --git a/lib/crc/Makefile b/lib/crc/Makefile
> index 7543ad295ab6..552760f28003 100644
> --- a/lib/crc/Makefile
> +++ b/lib/crc/Makefile
> @@ -38,6 +38,10 @@ obj-$(CONFIG_CRC64) += crc64.o
> crc64-y := crc64-main.o
> ifeq ($(CONFIG_CRC64_ARCH),y)
> CFLAGS_crc64-main.o += -I$(src)/$(SRCARCH)
> +CFLAGS_REMOVE_arm64/crc64-neon-inner.o += -mgeneral-regs-only
> +CFLAGS_arm64/crc64-neon-inner.o += -ffreestanding -march=armv8-a+crypto
> +CFLAGS_arm64/crc64-neon-inner.o += -isystem $(shell $(CC) -print-file-name=include)
> +crc64-$(CONFIG_ARM64) += arm64/crc64-neon-inner.o
> crc64-$(CONFIG_RISCV) += riscv/crc64_lsb.o riscv/crc64_msb.o
> crc64-$(CONFIG_X86) += x86/crc64-pclmul.o
> endif
To make this a bit easier to read, add newlines before and after the
arm64-specific parts, and change 'endif' to 'endif # CONFIG_CRC64_ARCH'
> diff --git a/lib/crc/arm64/crc64-neon-inner.c b/lib/crc/arm64/crc64-neon-inner.c
> new file mode 100644
> index 000000000000..beefdec5456b
> --- /dev/null
> +++ b/lib/crc/arm64/crc64-neon-inner.c
> @@ -0,0 +1,82 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Accelerated CRC64 (NVMe) using ARM NEON C intrinsics
> + */
> +
> +#include <linux/types.h>
> +#include <linux/crc64.h>
No need for <linux/crc64.h> here
> +#ifdef CONFIG_ARM64
> +#include <asm/neon-intrinsics.h>
> +#else
> +#include <arm_neon.h>
> +#endif
This is arm64-specific code, so all that's needed above is the part
under CONFIG_ARM64.
> static const u64 fold_consts_val[2] = {0xeadc41fd2ba3d420ULL, 0x21e9761e252621acULL};
> static const u64 bconsts_val[2] = {0x27ecfa329aef9f77ULL, 0x34d926535897936aULL};
Add comments that document what these constants are. As per
lib/crc/x86/crc-pclmul-consts.h which has the same constants, these are:
x^191 mod G, x^127 mod G, floor(x^127 / G), and (G - x^64) / x.
> +u64 crc64_nvme_arm64_c(u64 crc, const u8 *p, size_t len)
Declare this function first earlier in the file, otherwise a
-Wmissing-prototypes warning is generated.
> +{
> + if (len == 0)
> + return crc;
> +
> + uint64x2_t v0_u64 = {crc, 0};
> + poly64x2_t v0 = vreinterpretq_p64_u64(v0_u64);
> +
> + poly64x2_t fold_consts = vreinterpretq_p64_u64(vld1q_u64(fold_consts_val));
> +
> + if (len >= 16) {
> + poly64x2_t v1 = vreinterpretq_p64_u8(vld1q_u8(p));
> +
> + v0 = vreinterpretq_p64_u8(veorq_u8(vreinterpretq_u8_p64(v0),
> + vreinterpretq_u8_p64(v1)));
> + p += 16;
> + len -= 16;
> +
> + while (len >= 16) {
Since this function is called only when len >= 128, and it exists
specifically for that caller and isn't available for wider use, it
doesn't need to handle other cases. So the 'if (len == 0)' block should
be removed, 'len >= 16' should be made unconditional, 'while (len >=
16)' should be replaced with 'do ... while (len >= 16)'.
> diff --git a/lib/crc/arm64/crc64.h b/lib/crc/arm64/crc64.h
> new file mode 100644
> index 000000000000..12b1a8bd518a
> --- /dev/null
> +++ b/lib/crc/arm64/crc64.h
> @@ -0,0 +1,35 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * CRC64 using ARM64 PMULL instructions
> + */
> +#ifndef _ARM64_CRC64_H
> +#define _ARM64_CRC64_H
We haven't been using include guards in the headers
lib/{crc,crypto}/${ARCH}/${ALGORITHM}.h, as they are intended only for
inclusion in a specific C file -- lib/crc/crc64-main.c in this case.
Probably best to stay with the existing convention of omitting these.
> +static inline u64 crc64_nvme_arch(u64 crc, const u8 *p, size_t len)
> +{
> + if (!IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) && len >= 128 &&
No need to check !IS_ENABLED(CONFIG_CPU_BIG_ENDIAN), since arm64 kernels
are little-endian-only these days.
> while (len >= 128)
Replace with a do-while loop, as this is already conditional on
'len >= 128'.
> + scoped_ksimd() {
> + crc = crc64_nvme_arm64_c(crc, p, chunk);
> + }
Remove the braces above, as the contents of the block are a single
statement.
Finally, this patch also has many overly-long lines. I recommend
running 'git clang-format'. It's not perfect, but it's easier and often
produces better results than manual formatting.
- Eric
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH] lib/crc: arm64: add NEON accelerated CRC64-NVMe implementation
2026-03-17 6:54 [PATCH] lib/crc: arm64: add NEON accelerated CRC64-NVMe implementation Demian Shulhan
2026-03-19 19:09 ` Eric Biggers
@ 2026-03-19 23:31 ` David Laight
2026-03-20 11:22 ` kernel test robot
2026-03-27 6:02 ` [PATCH v2] " Demian Shulhan
3 siblings, 0 replies; 9+ messages in thread
From: David Laight @ 2026-03-19 23:31 UTC (permalink / raw)
To: Demian Shulhan; +Cc: ebiggers, ardb, linux-crypto, linux-kernel
On Tue, 17 Mar 2026 06:54:25 +0000
Demian Shulhan <demyansh@gmail.com> wrote:
> Implement an optimized CRC64 (NVMe) algorithm for ARM64 using NEON
> Polynomial Multiply Long (PMULL) instructions. The generic shift-and-XOR
> software implementation is slow, which creates a bottleneck in NVMe and
> other storage subsystems.
>
> The acceleration is implemented using C intrinsics (<arm_neon.h>) rather
> than raw assembly for better readability and maintainability.
>
> Key highlights of this implementation:
> - Uses 4KB chunking inside scoped_ksimd() to avoid preemption latency
> spikes on large buffers.
> - Pre-calculates and loads fold constants via vld1q_u64() to minimize
> register spilling.
> - Benchmarks show the break-even point against the generic implementation
> is around 128 bytes. The PMULL path is enabled only for len >= 128.
> - Safely falls back to the generic implementation on Big-Endian systems.
>
> Performance results (kunit crc_benchmark on Cortex-A72):
> - Generic (len=4096): ~268 MB/s
> - PMULL (len=4096): ~1556 MB/s (nearly 6x improvement)
>
> Signed-off-by: Demian Shulhan <demyansh@gmail.com>
> ---
> lib/crc/Kconfig | 1 +
> lib/crc/Makefile | 4 ++
> lib/crc/arm64/crc64-neon-inner.c | 82 ++++++++++++++++++++++++++++++++
> lib/crc/arm64/crc64.h | 35 ++++++++++++++
> 4 files changed, 122 insertions(+)
> create mode 100644 lib/crc/arm64/crc64-neon-inner.c
> create mode 100644 lib/crc/arm64/crc64.h
>
> diff --git a/lib/crc/Kconfig b/lib/crc/Kconfig
> index 70e7a6016de3..6b6c7d9f5ea5 100644
> --- a/lib/crc/Kconfig
> +++ b/lib/crc/Kconfig
> @@ -82,6 +82,7 @@ config CRC64
> config CRC64_ARCH
> bool
> depends on CRC64 && CRC_OPTIMIZATIONS
> + default y if ARM64 && KERNEL_MODE_NEON
> default y if RISCV && RISCV_ISA_ZBC && 64BIT
> default y if X86_64
>
> diff --git a/lib/crc/Makefile b/lib/crc/Makefile
> index 7543ad295ab6..552760f28003 100644
> --- a/lib/crc/Makefile
> +++ b/lib/crc/Makefile
> @@ -38,6 +38,10 @@ obj-$(CONFIG_CRC64) += crc64.o
> crc64-y := crc64-main.o
> ifeq ($(CONFIG_CRC64_ARCH),y)
> CFLAGS_crc64-main.o += -I$(src)/$(SRCARCH)
> +CFLAGS_REMOVE_arm64/crc64-neon-inner.o += -mgeneral-regs-only
> +CFLAGS_arm64/crc64-neon-inner.o += -ffreestanding -march=armv8-a+crypto
> +CFLAGS_arm64/crc64-neon-inner.o += -isystem $(shell $(CC) -print-file-name=include)
> +crc64-$(CONFIG_ARM64) += arm64/crc64-neon-inner.o
> crc64-$(CONFIG_RISCV) += riscv/crc64_lsb.o riscv/crc64_msb.o
> crc64-$(CONFIG_X86) += x86/crc64-pclmul.o
> endif
> diff --git a/lib/crc/arm64/crc64-neon-inner.c b/lib/crc/arm64/crc64-neon-inner.c
> new file mode 100644
> index 000000000000..beefdec5456b
> --- /dev/null
> +++ b/lib/crc/arm64/crc64-neon-inner.c
> @@ -0,0 +1,82 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Accelerated CRC64 (NVMe) using ARM NEON C intrinsics
> + */
> +
> +#include <linux/types.h>
> +#include <linux/crc64.h>
> +#ifdef CONFIG_ARM64
> +#include <asm/neon-intrinsics.h>
> +#else
> +#include <arm_neon.h>
> +#endif
> +
> +#define GET_P64_0(v) ((poly64_t)vgetq_lane_u64(vreinterpretq_u64_p64(v), 0))
> +#define GET_P64_1(v) ((poly64_t)vgetq_lane_u64(vreinterpretq_u64_p64(v), 1))
> +
> +static const u64 fold_consts_val[2] = {0xeadc41fd2ba3d420ULL, 0x21e9761e252621acULL};
> +static const u64 bconsts_val[2] = {0x27ecfa329aef9f77ULL, 0x34d926535897936aULL};
> +
> +u64 crc64_nvme_arm64_c(u64 crc, const u8 *p, size_t len)
> +{
> + if (len == 0)
> + return crc;
> +
> + uint64x2_t v0_u64 = {crc, 0};
> + poly64x2_t v0 = vreinterpretq_p64_u64(v0_u64);
> +
> + poly64x2_t fold_consts = vreinterpretq_p64_u64(vld1q_u64(fold_consts_val));
> +
> + if (len >= 16) {
> + poly64x2_t v1 = vreinterpretq_p64_u8(vld1q_u8(p));
> +
> + v0 = vreinterpretq_p64_u8(veorq_u8(vreinterpretq_u8_p64(v0),
> + vreinterpretq_u8_p64(v1)));
> + p += 16;
> + len -= 16;
> +
> + while (len >= 16) {
> + v1 = vreinterpretq_p64_u8(vld1q_u8(p));
> +
> + poly128_t v2 = vmull_high_p64(fold_consts, v0);
> + poly128_t v0_128 = vmull_p64(GET_P64_0(fold_consts), GET_P64_0(v0));
If the cpu can execute two PMULL at the same time it should be possible
to speed things up.
With the correct constant the PMULL output can be xor'ed onto the p[32-63]
instead of p[16-47] (which is where I think it ends up) so you can execute
two in parallel - just needs some very careful register use.
> +
> + uint8x16_t x0 = veorq_u8(vreinterpretq_u8_p128(v0_128),
> + vreinterpretq_u8_p128(v2));
> +
> + x0 = veorq_u8(x0, vreinterpretq_u8_p64(v1));
> + v0 = vreinterpretq_p64_u8(x0);
> +
> + p += 16;
> + len -= 16;
> + }
I can't help feeling the part below really needs a few comments.
I think I know what it has to be doing - reducing 128 bits to 64
(or possibly 256 to 64 depending on the width of the multiply).
Although you only need to do it at the end of the outer loop.
Between the 4k blocks I think you can just save the output in 64bit
registers.
I'm also not entirely certain, but I think the code is equivalent to
calling crc64_nvme_generic() for 8 bytes (which could be xor'ed with
the last 8 bytes of the buffer).
(Or that might need a different constant and the code is actually
running the crc backwards on 8 bytes 'beyond the crc'.)
> +
> + poly64x2_t v7 = vreinterpretq_p64_u64((uint64x2_t){0, 0});
> +
> + poly128_t v1_128 = vmull_p64(GET_P64_1(fold_consts), GET_P64_0(v0));
> +
> + uint8x16_t ext_v0 = vextq_u8(vreinterpretq_u8_p64(v0), vreinterpretq_u8_p64(v7), 8);
> + uint8x16_t x0 = veorq_u8(ext_v0, vreinterpretq_u8_p128(v1_128));
> +
> + v0 = vreinterpretq_p64_u8(x0);
> +
> + poly64x2_t bconsts = vreinterpretq_p64_u64(vld1q_u64(bconsts_val));
> +
> + v1_128 = vmull_p64(GET_P64_0(bconsts), GET_P64_0(v0));
> +
> + poly64x2_t v1_64 = vreinterpretq_p64_u8(vreinterpretq_u8_p128(v1_128));
> + poly128_t v3_128 = vmull_p64(GET_P64_1(bconsts), GET_P64_0(v1_64));
> +
> + x0 = veorq_u8(vreinterpretq_u8_p64(v0), vreinterpretq_u8_p128(v3_128));
> +
> + uint8x16_t ext_v2 = vextq_u8(vreinterpretq_u8_p64(v7),
> + vreinterpretq_u8_p128(v1_128), 8);
> +
> + x0 = veorq_u8(x0, ext_v2);
> +
> + v0 = vreinterpretq_p64_u8(x0);
> + crc = vgetq_lane_u64(vreinterpretq_u64_p64(v0), 1);
> + }
> +
> + return crc;
> +}
> diff --git a/lib/crc/arm64/crc64.h b/lib/crc/arm64/crc64.h
> new file mode 100644
> index 000000000000..12b1a8bd518a
> --- /dev/null
> +++ b/lib/crc/arm64/crc64.h
> @@ -0,0 +1,35 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * CRC64 using ARM64 PMULL instructions
> + */
> +#ifndef _ARM64_CRC64_H
> +#define _ARM64_CRC64_H
> +
> +#include <asm/cpufeature.h>
> +#include <asm/simd.h>
> +#include <linux/minmax.h>
> +#include <linux/sizes.h>
> +
> +u64 crc64_nvme_arm64_c(u64 crc, const u8 *p, size_t len);
> +
> +#define crc64_be_arch crc64_be_generic
> +
> +static inline u64 crc64_nvme_arch(u64 crc, const u8 *p, size_t len)
> +{
> + if (!IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) && len >= 128 &&
> + cpu_have_named_feature(PMULL) && likely(may_use_simd())) {
> + while (len >= 128) {
> + size_t chunk = min_t(size_t, len & ~15, SZ_4K);
That shouldn't need to be min_t().
David
> +
> + scoped_ksimd() {
> + crc = crc64_nvme_arm64_c(crc, p, chunk);
> + }
> + p += chunk;
> + len -= chunk;
> + }
> + }
> + return crc64_nvme_generic(crc, p, len);
> +}
> +
> +#endif /* _ARM64_CRC64_H */
> +
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH] lib/crc: arm64: add NEON accelerated CRC64-NVMe implementation
2026-03-19 19:09 ` Eric Biggers
@ 2026-03-20 10:36 ` David Laight
2026-03-20 20:00 ` Eric Biggers
0 siblings, 1 reply; 9+ messages in thread
From: David Laight @ 2026-03-20 10:36 UTC (permalink / raw)
To: Eric Biggers; +Cc: Demian Shulhan, ardb, linux-crypto, linux-kernel
On Thu, 19 Mar 2026 12:09:08 -0700
Eric Biggers <ebiggers@kernel.org> wrote:
> On Tue, Mar 17, 2026 at 06:54:25AM +0000, Demian Shulhan wrote:
> > Implement an optimized CRC64 (NVMe) algorithm for ARM64 using NEON
> > Polynomial Multiply Long (PMULL) instructions. The generic shift-and-XOR
> > software implementation is slow, which creates a bottleneck in NVMe and
> > other storage subsystems.
> >
> > The acceleration is implemented using C intrinsics (<arm_neon.h>) rather
> > than raw assembly for better readability and maintainability.
> >
> > Key highlights of this implementation:
> > - Uses 4KB chunking inside scoped_ksimd() to avoid preemption latency
> > spikes on large buffers.
> > - Pre-calculates and loads fold constants via vld1q_u64() to minimize
> > register spilling.
> > - Benchmarks show the break-even point against the generic implementation
> > is around 128 bytes. The PMULL path is enabled only for len >= 128.
> > - Safely falls back to the generic implementation on Big-Endian systems.
> >
> > Performance results (kunit crc_benchmark on Cortex-A72):
> > - Generic (len=4096): ~268 MB/s
> > - PMULL (len=4096): ~1556 MB/s (nearly 6x improvement)
> >
> > Signed-off-by: Demian Shulhan <demyansh@gmail.com>
>
> Thanks! I'm planning to accept this once the relatively minor comments
> later on in this email are addressed.
>
> But just FYI, having separate code for each CRC variant isn't very
> sustainable. CRC-T10DIF, CRC64-NVME, and CRC64-BE should all have
> similar PMULL based implementations. x86 and riscv solve this using a
> "template" that supports all CRC variants. I'd like to eventually bring
> a similar solution to arm64 (and arm) as well.
>
> So while this code is fine for now, later I'd like to replace it with
> something more general, like x86 and riscv have now. Then we can
> optimize CRC-T10DIF, CRC64-NVME, and CRC64-BE together.
I'm also pretty sure that the same loop will process 32bit and 16bit CRC
(just needs the high bits of the constant multiplier set to zero).
There are fewer bits to correct for at the end (I think it is always
the size of the CRC) but that may not be worth worrying about.
> E.g., consider that the CRC64-NVME code added by patch folds across at
> most 1 vector. That's much less optimized than the existing CRC-T10DIF
> code in lib/crc/arm64/crc-t10dif-core.S, which folds across 8. If we
> used a unified approach, we could optimize these CRC variants together.
>
> As for intrinsics vs. assembly: the kernel usually uses assembly.
> However, I'm supportive of starting to use intrinsics more, and this is a
> good start. But we'll need to keep an eye out for any compiler issues.
But they do make the code unreadable - probably even more than the
assembler would be.
It might be better to write some C that required the architecture provide
the functions required for doing a CRC with 128bit registers that hold
two 64bit values (etc) and give them sane names.
Then common C code can be used provided the required instructions exist.
I'm pretty sure the loop is effectively:
for (; p < limit; p++)
p[N] ^= low(*p) * const_a ^ high(*p) * const_b;
where N is at least one and you don't actually want to write into the buffer.
Making N > 1 should improve performance - just needs care.
That might be what you've done for x86 - I keep meaning to look at that code.
David
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH] lib/crc: arm64: add NEON accelerated CRC64-NVMe implementation
2026-03-17 6:54 [PATCH] lib/crc: arm64: add NEON accelerated CRC64-NVMe implementation Demian Shulhan
2026-03-19 19:09 ` Eric Biggers
2026-03-19 23:31 ` David Laight
@ 2026-03-20 11:22 ` kernel test robot
2026-03-27 6:02 ` [PATCH v2] " Demian Shulhan
3 siblings, 0 replies; 9+ messages in thread
From: kernel test robot @ 2026-03-20 11:22 UTC (permalink / raw)
To: Demian Shulhan, ebiggers, ardb
Cc: llvm, oe-kbuild-all, linux-crypto, linux-kernel, Demian Shulhan
Hi Demian,
kernel test robot noticed the following build warnings:
[auto build test WARNING on ebiggers/crc-next]
[also build test WARNING on linus/master v7.0-rc4 next-20260319]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Demian-Shulhan/lib-crc-arm64-add-NEON-accelerated-CRC64-NVMe-implementation/20260318-003936
base: https://git.kernel.org/pub/scm/linux/kernel/git/ebiggers/linux.git crc-next
patch link: https://lore.kernel.org/r/20260317065425.2684093-1-demyansh%40gmail.com
patch subject: [PATCH] lib/crc: arm64: add NEON accelerated CRC64-NVMe implementation
config: arm64-allmodconfig (https://download.01.org/0day-ci/archive/20260320/202603201958.gFoHxLV7-lkp@intel.com/config)
compiler: clang version 19.1.7 (https://github.com/llvm/llvm-project cd708029e0b2869e80abe31ddb175f7c35361f90)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260320/202603201958.gFoHxLV7-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202603201958.gFoHxLV7-lkp@intel.com/
All warnings (new ones prefixed by >>):
>> lib/crc/arm64/crc64-neon-inner.c:20:5: warning: no previous prototype for function 'crc64_nvme_arm64_c' [-Wmissing-prototypes]
20 | u64 crc64_nvme_arm64_c(u64 crc, const u8 *p, size_t len)
| ^
lib/crc/arm64/crc64-neon-inner.c:20:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
20 | u64 crc64_nvme_arm64_c(u64 crc, const u8 *p, size_t len)
| ^
| static
1 warning generated.
vim +/crc64_nvme_arm64_c +20 lib/crc/arm64/crc64-neon-inner.c
19
> 20 u64 crc64_nvme_arm64_c(u64 crc, const u8 *p, size_t len)
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH] lib/crc: arm64: add NEON accelerated CRC64-NVMe implementation
2026-03-20 10:36 ` David Laight
@ 2026-03-20 20:00 ` Eric Biggers
2026-03-22 9:29 ` Demian Shulhan
0 siblings, 1 reply; 9+ messages in thread
From: Eric Biggers @ 2026-03-20 20:00 UTC (permalink / raw)
To: David Laight; +Cc: Demian Shulhan, ardb, linux-crypto, linux-kernel
On Fri, Mar 20, 2026 at 10:36:24AM +0000, David Laight wrote:
> I'm also pretty sure that the same loop will process 32bit and 16bit CRC
> (just needs the high bits of the constant multiplier set to zero).
> There are fewer bits to correct for at the end (I think it is always
> the size of the CRC) but that may not be worth worrying about.
Again, see lib/crc/x86/ and lib/crc/riscv/ which do basically this.
> It might be better to write some C that required the architecture provide
> the functions required for doing a CRC with 128bit registers that hold
> two 64bit values (etc) and give them sane names.
>
> Then common C code can be used provided the required instructions exist.
While it would be great to share more CRC code between architectures by
using a C "template" combined with some arch-dependent inline asm
blocks, there's actually a lot of variation in what instructions and
register widths the different architectures have.
lib/crc/riscv/crc-clmul-template.h actually has something very similar
to this already: it's written in C, and there are just three
single-instruction inline asm blocks to access RISC-V's clmul
instructions. Unfortunately, the carryless multiplication instructions
on the other architectures are not compatible with these. So, it's hard
to make it anything more than RISC-V specific code.
There might be enough similarity between arm, arm64, and x86_64 for them
to share code using a similar "template". However, consider that for
x86_64 we need to support different register widths. See
lib/crc/x86/crc-pclmul-template.S.
> I'm pretty sure the loop is effectively:
> for (; p < limit; p++)
> p[N] ^= low(*p) * const_a ^ high(*p) * const_b;
> where N is at least one and you don't actually want to write into the buffer.
> Making N > 1 should improve performance - just needs care.
Well, you're welcome to read the actual code and not just speculate.
But again, maybe best to not get too sidetracked for now, unless you or
Demian are actually planning to work on the more general version.
- Eric
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH] lib/crc: arm64: add NEON accelerated CRC64-NVMe implementation
2026-03-20 20:00 ` Eric Biggers
@ 2026-03-22 9:29 ` Demian Shulhan
2026-03-22 14:13 ` Eric Biggers
0 siblings, 1 reply; 9+ messages in thread
From: Demian Shulhan @ 2026-03-22 9:29 UTC (permalink / raw)
To: Eric Biggers; +Cc: David Laight, ardb, linux-crypto, linux-kernel
Hi Eric, David,
Thanks to both of you for the review and suggestions! I've addressed
all your comments and will send the v2 patch shortly.
The idea of a unified PMULL template for ARM64 is very interesting. I
can own it and work on it, but as it requires careful design (parallel
folding across multiple vectors, handling LSB/MSB differences, and
generalizing Barrett reduction), it will take some time to implement
and test properly.
Do you think it makes sense to merge this current solution (with fixed
comments) for now, and I will follow up with the general template
implementation in a separate patchset later?
Thanks,
Demian
пт, 20 бер. 2026 р. о 22:00 Eric Biggers <ebiggers@kernel.org> пише:
>
> On Fri, Mar 20, 2026 at 10:36:24AM +0000, David Laight wrote:
> > I'm also pretty sure that the same loop will process 32bit and 16bit CRC
> > (just needs the high bits of the constant multiplier set to zero).
> > There are fewer bits to correct for at the end (I think it is always
> > the size of the CRC) but that may not be worth worrying about.
>
> Again, see lib/crc/x86/ and lib/crc/riscv/ which do basically this.
>
> > It might be better to write some C that required the architecture provide
> > the functions required for doing a CRC with 128bit registers that hold
> > two 64bit values (etc) and give them sane names.
> >
> > Then common C code can be used provided the required instructions exist.
>
> While it would be great to share more CRC code between architectures by
> using a C "template" combined with some arch-dependent inline asm
> blocks, there's actually a lot of variation in what instructions and
> register widths the different architectures have.
>
> lib/crc/riscv/crc-clmul-template.h actually has something very similar
> to this already: it's written in C, and there are just three
> single-instruction inline asm blocks to access RISC-V's clmul
> instructions. Unfortunately, the carryless multiplication instructions
> on the other architectures are not compatible with these. So, it's hard
> to make it anything more than RISC-V specific code.
>
> There might be enough similarity between arm, arm64, and x86_64 for them
> to share code using a similar "template". However, consider that for
> x86_64 we need to support different register widths. See
> lib/crc/x86/crc-pclmul-template.S.
>
> > I'm pretty sure the loop is effectively:
> > for (; p < limit; p++)
> > p[N] ^= low(*p) * const_a ^ high(*p) * const_b;
> > where N is at least one and you don't actually want to write into the buffer.
> > Making N > 1 should improve performance - just needs care.
>
> Well, you're welcome to read the actual code and not just speculate.
>
> But again, maybe best to not get too sidetracked for now, unless you or
> Demian are actually planning to work on the more general version.
>
> - Eric
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH] lib/crc: arm64: add NEON accelerated CRC64-NVMe implementation
2026-03-22 9:29 ` Demian Shulhan
@ 2026-03-22 14:13 ` Eric Biggers
0 siblings, 0 replies; 9+ messages in thread
From: Eric Biggers @ 2026-03-22 14:13 UTC (permalink / raw)
To: Demian Shulhan; +Cc: David Laight, ardb, linux-crypto, linux-kernel
On Sun, Mar 22, 2026 at 11:29:36AM +0200, Demian Shulhan wrote:
> Hi Eric, David,
>
> Thanks to both of you for the review and suggestions! I've addressed
> all your comments and will send the v2 patch shortly.
>
> The idea of a unified PMULL template for ARM64 is very interesting. I
> can own it and work on it, but as it requires careful design (parallel
> folding across multiple vectors, handling LSB/MSB differences, and
> generalizing Barrett reduction), it will take some time to implement
> and test properly.
>
> Do you think it makes sense to merge this current solution (with fixed
> comments) for now, and I will follow up with the general template
> implementation in a separate patchset later?
>
> Thanks,
> Demian
That sounds good to me. Thanks!
- Eric
^ permalink raw reply [flat|nested] 9+ messages in thread
* [PATCH v2] lib/crc: arm64: add NEON accelerated CRC64-NVMe implementation
2026-03-17 6:54 [PATCH] lib/crc: arm64: add NEON accelerated CRC64-NVMe implementation Demian Shulhan
` (2 preceding siblings ...)
2026-03-20 11:22 ` kernel test robot
@ 2026-03-27 6:02 ` Demian Shulhan
3 siblings, 0 replies; 9+ messages in thread
From: Demian Shulhan @ 2026-03-27 6:02 UTC (permalink / raw)
To: linux-crypto, linux-kernel; +Cc: ebiggers, ardb, Demian Shulhan
Implement an optimized CRC64 (NVMe) algorithm for ARM64 using NEON
Polynomial Multiply Long (PMULL) instructions. The generic shift-and-XOR
software implementation is slow, which creates a bottleneck in NVMe and
other storage subsystems.
The acceleration is implemented using C intrinsics (<arm_neon.h>) rather
than raw assembly for better readability and maintainability.
Key highlights of this implementation:
- Uses 4KB chunking inside scoped_ksimd() to avoid preemption latency
spikes on large buffers.
- Pre-calculates and loads fold constants via vld1q_u64() to minimize
register spilling.
- Benchmarks show the break-even point against the generic implementation
is around 128 bytes. The PMULL path is enabled only for len >= 128.
- Safely falls back to the generic implementation on Big-Endian systems.
Performance results (kunit crc_benchmark on Cortex-A72):
- Generic (len=4096): ~268 MB/s
- PMULL (len=4096): ~1556 MB/s (nearly 6x improvement)
Signed-off-by: Demian Shulhan <demyansh@gmail.com>
---
v2: - Removed KERNEL_MODE_NEON check from Kconfig as it's redundant on arm64.
- Added missing prototype for crc64_nvme_arm64_c to fix sparse/W=1 warning.
- Improved readability in Makefile with extra newlines and comments.
- Removed redundant include guards in crc64.h.
- Switched to do-while loops for better optimization in hot paths.
- Added comments explaining the magic constants (fold/Barrett).
---
lib/crc/Kconfig | 1 +
lib/crc/Makefile | 8 +++-
lib/crc/arm64/crc64-neon-inner.c | 82 ++++++++++++++++++++++++++++++++
lib/crc/arm64/crc64.h | 29 +++++++++++
4 files changed, 119 insertions(+), 1 deletion(-)
create mode 100644 lib/crc/arm64/crc64-neon-inner.c
create mode 100644 lib/crc/arm64/crc64.h
diff --git a/lib/crc/Kconfig b/lib/crc/Kconfig
index 70e7a6016de3..16cb42d5e306 100644
--- a/lib/crc/Kconfig
+++ b/lib/crc/Kconfig
@@ -82,6 +82,7 @@ config CRC64
config CRC64_ARCH
bool
depends on CRC64 && CRC_OPTIMIZATIONS
+ default y if ARM64
default y if RISCV && RISCV_ISA_ZBC && 64BIT
default y if X86_64
diff --git a/lib/crc/Makefile b/lib/crc/Makefile
index 7543ad295ab6..c9c35419b39c 100644
--- a/lib/crc/Makefile
+++ b/lib/crc/Makefile
@@ -38,9 +38,15 @@ obj-$(CONFIG_CRC64) += crc64.o
crc64-y := crc64-main.o
ifeq ($(CONFIG_CRC64_ARCH),y)
CFLAGS_crc64-main.o += -I$(src)/$(SRCARCH)
+
+CFLAGS_REMOVE_arm64/crc64-neon-inner.o += -mgeneral-regs-only
+CFLAGS_arm64/crc64-neon-inner.o += -ffreestanding -march=armv8-a+crypto
+CFLAGS_arm64/crc64-neon-inner.o += -isystem $(shell $(CC) -print-file-name=include)
+crc64-$(CONFIG_ARM64) += arm64/crc64-neon-inner.o
+
crc64-$(CONFIG_RISCV) += riscv/crc64_lsb.o riscv/crc64_msb.o
crc64-$(CONFIG_X86) += x86/crc64-pclmul.o
-endif
+endif # CONFIG_CRC64_ARCH
obj-y += tests/
diff --git a/lib/crc/arm64/crc64-neon-inner.c b/lib/crc/arm64/crc64-neon-inner.c
new file mode 100644
index 000000000000..ad268ad35ab8
--- /dev/null
+++ b/lib/crc/arm64/crc64-neon-inner.c
@@ -0,0 +1,82 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Accelerated CRC64 (NVMe) using ARM NEON C intrinsics
+ */
+
+#include <linux/types.h>
+#include <asm/neon-intrinsics.h>
+
+u64 crc64_nvme_arm64_c(u64 crc, const u8 *p, size_t len);
+
+/*
+ * Extract lane 0 / lane 1 of a poly64x2_t as a scalar poly64_t, as needed
+ * by the scalar vmull_p64() intrinsic (vmull_high_p64() takes the vector
+ * form directly).
+ */
+#define GET_P64_0(v) ((poly64_t)vgetq_lane_u64(vreinterpretq_u64_p64(v), 0))
+#define GET_P64_1(v) ((poly64_t)vgetq_lane_u64(vreinterpretq_u64_p64(v), 1))
+
+/*
+ * Folding constants: x^191 mod G, x^127 mod G.  Loaded once and used to
+ * fold the running 128-bit remainder over each 16-byte block of input.
+ * NOTE(review): G is presumably the bit-reflected CRC-64/NVMe generator
+ * polynomial -- verify the values against the x86/riscv CRC64 constants.
+ */
+/* x^191 mod G, x^127 mod G */
+static const u64 fold_consts_val[2] = { 0xeadc41fd2ba3d420ULL,
+ 0x21e9761e252621acULL };
+/*
+ * Barrett reduction constants: floor(x^127 / G) and (G - x^64) / x.
+ * Used once at the end to reduce the final 128-bit remainder to 64 bits.
+ */
+/* floor(x^127 / G), (G - x^64) / x */
+static const u64 bconsts_val[2] = { 0x27ecfa329aef9f77ULL,
+ 0x34d926535897936aULL };
+
+/*
+ * Core PMULL loop: XOR the incoming 64-bit CRC into the first 16 bytes,
+ * fold the resulting 128-bit remainder across the rest of the buffer one
+ * 16-byte block at a time, then reduce to 64 bits via Barrett reduction.
+ *
+ * Preconditions (guaranteed by the caller in crc64.h, which passes
+ * chunk = min(len & ~15, SZ_4K) with len >= 128):
+ *  - @len must be a multiple of 16: any non-multiple-of-16 tail would be
+ *    silently ignored by the `while (len >= 16)` exit condition;
+ *  - @len must be >= 32: with len == 16 the do-while body would still run
+ *    once and read 16 bytes past the end of @p.
+ * Must be called with NEON usable (inside scoped_ksimd() -- see crc64.h).
+ */
+u64 crc64_nvme_arm64_c(u64 crc, const u8 *p, size_t len)
+{
+ /* Seed: previous CRC in the low 64 bits, high 64 bits zero. */
+ uint64x2_t v0_u64 = { crc, 0 };
+ poly64x2_t v0 = vreinterpretq_p64_u64(v0_u64);
+ poly64x2_t fold_consts =
+ vreinterpretq_p64_u64(vld1q_u64(fold_consts_val));
+ poly64x2_t v1 = vreinterpretq_p64_u8(vld1q_u8(p));
+
+ /* remainder = (crc || 0) XOR first 16 input bytes */
+ v0 = vreinterpretq_p64_u8(veorq_u8(vreinterpretq_u8_p64(v0),
+ vreinterpretq_u8_p64(v1)));
+ p += 16;
+ len -= 16;
+
+ /*
+ * Fold one 16-byte block per iteration: multiply the two 64-bit
+ * halves of the remainder by the two folding constants and XOR both
+ * products with the next block of input.
+ */
+ do {
+ v1 = vreinterpretq_p64_u8(vld1q_u8(p));
+
+ poly128_t v2 = vmull_high_p64(fold_consts, v0);
+ poly128_t v0_128 =
+ vmull_p64(GET_P64_0(fold_consts), GET_P64_0(v0));
+
+ uint8x16_t x0 = veorq_u8(vreinterpretq_u8_p128(v0_128),
+ vreinterpretq_u8_p128(v2));
+
+ x0 = veorq_u8(x0, vreinterpretq_u8_p64(v1));
+ v0 = vreinterpretq_p64_u8(x0);
+
+ p += 16;
+ len -= 16;
+ } while (len >= 16);
+
+ /*
+ * Reduce the 128-bit value to 64 bits.
+ * By multiplying the high 64 bits by x^127 mod G (fold_consts_val[1])
+ * and XORing the result with the low 64 bits.
+ */
+ /* v7 is an all-zero vector used as shift-in padding for vextq_u8(). */
+ poly64x2_t v7 = vreinterpretq_p64_u64((uint64x2_t){ 0, 0 });
+ poly128_t v1_128 = vmull_p64(GET_P64_1(fold_consts), GET_P64_0(v0));
+
+ /* Shift remainder right by 8 bytes and XOR in the folded product. */
+ uint8x16_t ext_v0 =
+ vextq_u8(vreinterpretq_u8_p64(v0), vreinterpretq_u8_p64(v7), 8);
+ uint8x16_t x0 = veorq_u8(ext_v0, vreinterpretq_u8_p128(v1_128));
+
+ v0 = vreinterpretq_p64_u8(x0);
+
+ /* Final Barrett reduction */
+ poly64x2_t bconsts = vreinterpretq_p64_u64(vld1q_u64(bconsts_val));
+
+ /* q = floor(remainder * floor(x^127 / G) / x^64) (Barrett quotient) */
+ v1_128 = vmull_p64(GET_P64_0(bconsts), GET_P64_0(v0));
+
+ /* remainder XOR q * G, folded down to the final 64 bits. */
+ poly64x2_t v1_64 = vreinterpretq_p64_u8(vreinterpretq_u8_p128(v1_128));
+ poly128_t v3_128 = vmull_p64(GET_P64_1(bconsts), GET_P64_0(v1_64));
+
+ x0 = veorq_u8(vreinterpretq_u8_p64(v0), vreinterpretq_u8_p128(v3_128));
+
+ uint8x16_t ext_v2 = vextq_u8(vreinterpretq_u8_p64(v7),
+ vreinterpretq_u8_p128(v1_128), 8);
+
+ x0 = veorq_u8(x0, ext_v2);
+
+ v0 = vreinterpretq_p64_u8(x0);
+ /* The final CRC ends up in the high 64-bit lane. */
+ return vgetq_lane_u64(vreinterpretq_u64_p64(v0), 1);
+}
diff --git a/lib/crc/arm64/crc64.h b/lib/crc/arm64/crc64.h
new file mode 100644
index 000000000000..2c1449d57486
--- /dev/null
+++ b/lib/crc/arm64/crc64.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * CRC64 using ARM64 PMULL instructions
+ */
+
+#include <linux/cpufeature.h>
+#include <asm/simd.h>
+#include <linux/minmax.h>
+#include <linux/sizes.h>
+
+/* PMULL inner loop, compiled separately with NEON enabled (see Makefile). */
+u64 crc64_nvme_arm64_c(u64 crc, const u8 *p, size_t len);
+
+/* No accelerated CRC64-BE implementation; always use the generic one. */
+#define crc64_be_arch crc64_be_generic
+
+/*
+ * CRC-64/NVMe dispatch: use the PMULL path when the CPU supports it, SIMD
+ * is currently usable, and the buffer is long enough (the changelog's
+ * measured break-even point is ~128 bytes).  Otherwise, and for whatever
+ * tail remains, fall through to the generic implementation.
+ */
+static inline u64 crc64_nvme_arch(u64 crc, const u8 *p, size_t len)
+{
+ if (len >= 128 && cpu_have_named_feature(PMULL) &&
+ likely(may_use_simd())) {
+ do {
+ /*
+ * Process at most 4 KiB per scoped_ksimd() section to
+ * bound preemption latency; round the chunk down to a
+ * multiple of 16 as required by crc64_nvme_arm64_c().
+ */
+ size_t chunk = min_t(size_t, len & ~15, SZ_4K);
+
+ scoped_ksimd() crc = crc64_nvme_arm64_c(crc, p, chunk);
+
+ p += chunk;
+ len -= chunk;
+ } while (len >= 128);
+ }
+ /* Remaining < 128 bytes (plus any sub-16-byte tail): generic path. */
+ return crc64_nvme_generic(crc, p, len);
+}
--
2.43.0
^ permalink raw reply related [flat|nested] 9+ messages in thread
end of thread, other threads:[~2026-03-27 6:02 UTC | newest]
Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-03-17 6:54 [PATCH] lib/crc: arm64: add NEON accelerated CRC64-NVMe implementation Demian Shulhan
2026-03-19 19:09 ` Eric Biggers
2026-03-20 10:36 ` David Laight
2026-03-20 20:00 ` Eric Biggers
2026-03-22 9:29 ` Demian Shulhan
2026-03-22 14:13 ` Eric Biggers
2026-03-19 23:31 ` David Laight
2026-03-20 11:22 ` kernel test robot
2026-03-27 6:02 ` [PATCH v2] " Demian Shulhan
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox