From: Demian Shulhan <demyansh@gmail.com>
To: linux-crypto@vger.kernel.org, linux-kernel@vger.kernel.org
Cc: ebiggers@kernel.org, ardb@kernel.org,
Demian Shulhan <demyansh@gmail.com>
Subject: [PATCH v2] lib/crc: arm64: add NEON accelerated CRC64-NVMe implementation
Date: Fri, 27 Mar 2026 06:02:11 +0000 [thread overview]
Message-ID: <20260327060211.902077-1-demyansh@gmail.com> (raw)
In-Reply-To: <20260317065425.2684093-1-demyansh@gmail.com>
Implement an optimized CRC64 (NVMe) algorithm for ARM64 using NEON
Polynomial Multiply Long (PMULL) instructions. The generic shift-and-XOR
software implementation is slow, which creates a bottleneck in NVMe and
other storage subsystems.
The acceleration is implemented using C intrinsics (<arm_neon.h>) rather
than raw assembly for better readability and maintainability.
Key highlights of this implementation:
- Processes the input in chunks of at most 4KB, each inside its own
scoped_ksimd() section, to avoid preemption latency spikes on large buffers.
- Pre-calculates and loads fold constants via vld1q_u64() to minimize
register spilling.
- Benchmarks show the break-even point against the generic implementation
is around 128 bytes. The PMULL path is enabled only for len >= 128.
- Safely falls back to the generic implementation on Big-Endian systems.
Performance results (kunit crc_benchmark on Cortex-A72):
- Generic (len=4096): ~268 MB/s
- PMULL (len=4096): ~1556 MB/s (nearly 6x improvement)
Signed-off-by: Demian Shulhan <demyansh@gmail.com>
---
v2: - Removed KERNEL_MODE_NEON check from Kconfig as it's redundant on arm64.
- Added missing prototype for crc64_nvme_arm64_c to fix sparse/W=1 warning.
- Improved readability in Makefile with extra newlines and comments.
- Removed redundant include guards in crc64.h.
- Switched to do-while loops for better optimization in hot paths.
- Added comments explaining the magic constants (fold/Barrett).
---
lib/crc/Kconfig | 1 +
lib/crc/Makefile | 8 +++-
lib/crc/arm64/crc64-neon-inner.c | 82 ++++++++++++++++++++++++++++++++
lib/crc/arm64/crc64.h | 29 +++++++++++
4 files changed, 119 insertions(+), 1 deletion(-)
create mode 100644 lib/crc/arm64/crc64-neon-inner.c
create mode 100644 lib/crc/arm64/crc64.h
diff --git a/lib/crc/Kconfig b/lib/crc/Kconfig
index 70e7a6016de3..16cb42d5e306 100644
--- a/lib/crc/Kconfig
+++ b/lib/crc/Kconfig
@@ -82,6 +82,7 @@ config CRC64
config CRC64_ARCH
bool
depends on CRC64 && CRC_OPTIMIZATIONS
+ default y if ARM64
default y if RISCV && RISCV_ISA_ZBC && 64BIT
default y if X86_64
diff --git a/lib/crc/Makefile b/lib/crc/Makefile
index 7543ad295ab6..c9c35419b39c 100644
--- a/lib/crc/Makefile
+++ b/lib/crc/Makefile
@@ -38,9 +38,15 @@ obj-$(CONFIG_CRC64) += crc64.o
crc64-y := crc64-main.o
ifeq ($(CONFIG_CRC64_ARCH),y)
CFLAGS_crc64-main.o += -I$(src)/$(SRCARCH)
+
+CFLAGS_REMOVE_arm64/crc64-neon-inner.o += -mgeneral-regs-only
+CFLAGS_arm64/crc64-neon-inner.o += -ffreestanding -march=armv8-a+crypto
+CFLAGS_arm64/crc64-neon-inner.o += -isystem $(shell $(CC) -print-file-name=include)
+crc64-$(CONFIG_ARM64) += arm64/crc64-neon-inner.o
+
crc64-$(CONFIG_RISCV) += riscv/crc64_lsb.o riscv/crc64_msb.o
crc64-$(CONFIG_X86) += x86/crc64-pclmul.o
-endif
+endif # CONFIG_CRC64_ARCH
obj-y += tests/
diff --git a/lib/crc/arm64/crc64-neon-inner.c b/lib/crc/arm64/crc64-neon-inner.c
new file mode 100644
index 000000000000..ad268ad35ab8
--- /dev/null
+++ b/lib/crc/arm64/crc64-neon-inner.c
@@ -0,0 +1,82 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Accelerated CRC64 (NVMe) using ARM NEON C intrinsics
+ */
+
+#include <linux/types.h>
+#include <asm/neon-intrinsics.h>
+
+u64 crc64_nvme_arm64_c(u64 crc, const u8 *p, size_t len);
+
+#define GET_P64_0(v) ((poly64_t)vgetq_lane_u64(vreinterpretq_u64_p64(v), 0))
+#define GET_P64_1(v) ((poly64_t)vgetq_lane_u64(vreinterpretq_u64_p64(v), 1))
+
+/* Folding constants, element 0 then element 1: x^191 mod G, x^127 mod G */
+static const u64 fold_consts_val[2] = { 0xeadc41fd2ba3d420ULL,
+ 0x21e9761e252621acULL };
+/* Barrett reduction constants: floor(x^127 / G), (G - x^64) / x */
+static const u64 bconsts_val[2] = { 0x27ecfa329aef9f77ULL,
+ 0x34d926535897936aULL };
+
+u64 crc64_nvme_arm64_c(u64 crc, const u8 *p, size_t len)
+{
+ uint64x2_t v0_u64 = { crc, 0 }; /* incoming crc in lane 0, lane 1 zero */
+ poly64x2_t v0 = vreinterpretq_p64_u64(v0_u64);
+ poly64x2_t fold_consts =
+ vreinterpretq_p64_u64(vld1q_u64(fold_consts_val));
+ poly64x2_t v1 = vreinterpretq_p64_u8(vld1q_u8(p)); /* first 16 input bytes */
+
+ v0 = vreinterpretq_p64_u8(veorq_u8(vreinterpretq_u8_p64(v0),
+ vreinterpretq_u8_p64(v1)));
+ p += 16;
+ len -= 16;
+
+ do { /* body always runs once: at least 32 bytes are read from p */
+ v1 = vreinterpretq_p64_u8(vld1q_u8(p));
+
+ poly128_t v2 = vmull_high_p64(fold_consts, v0);
+ poly128_t v0_128 =
+ vmull_p64(GET_P64_0(fold_consts), GET_P64_0(v0));
+
+ uint8x16_t x0 = veorq_u8(vreinterpretq_u8_p128(v0_128),
+ vreinterpretq_u8_p128(v2));
+
+ x0 = veorq_u8(x0, vreinterpretq_u8_p64(v1));
+ v0 = vreinterpretq_p64_u8(x0);
+
+ p += 16;
+ len -= 16;
+ } while (len >= 16); /* a tail shorter than 16 bytes is not consumed */
+
+ /*
+ * Reduce the 128-bit value to 64 bits: multiply lane 0 of v0 by
+ * x^127 mod G (fold_consts_val[1]) and XOR the product with lane 1 of
+ * v0, which vextq_u8() moves down into lane 0 (lane 1 becomes zero).
+ */
+ poly64x2_t v7 = vreinterpretq_p64_u64((uint64x2_t){ 0, 0 });
+ poly128_t v1_128 = vmull_p64(GET_P64_1(fold_consts), GET_P64_0(v0));
+
+ uint8x16_t ext_v0 =
+ vextq_u8(vreinterpretq_u8_p64(v0), vreinterpretq_u8_p64(v7), 8);
+ uint8x16_t x0 = veorq_u8(ext_v0, vreinterpretq_u8_p128(v1_128));
+
+ v0 = vreinterpretq_p64_u8(x0);
+
+ /* Final Barrett reduction; the 64-bit result is left in lane 1 of v0 */
+ poly64x2_t bconsts = vreinterpretq_p64_u64(vld1q_u64(bconsts_val));
+
+ v1_128 = vmull_p64(GET_P64_0(bconsts), GET_P64_0(v0));
+
+ poly64x2_t v1_64 = vreinterpretq_p64_u8(vreinterpretq_u8_p128(v1_128));
+ poly128_t v3_128 = vmull_p64(GET_P64_1(bconsts), GET_P64_0(v1_64));
+
+ x0 = veorq_u8(vreinterpretq_u8_p64(v0), vreinterpretq_u8_p128(v3_128));
+
+ uint8x16_t ext_v2 = vextq_u8(vreinterpretq_u8_p64(v7),
+ vreinterpretq_u8_p128(v1_128), 8);
+
+ x0 = veorq_u8(x0, ext_v2);
+
+ v0 = vreinterpretq_p64_u8(x0);
+ return vgetq_lane_u64(vreinterpretq_u64_p64(v0), 1);
+}
diff --git a/lib/crc/arm64/crc64.h b/lib/crc/arm64/crc64.h
new file mode 100644
index 000000000000..2c1449d57486
--- /dev/null
+++ b/lib/crc/arm64/crc64.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * CRC64 using ARM64 PMULL instructions
+ */
+
+#include <linux/cpufeature.h>
+#include <asm/simd.h>
+#include <linux/minmax.h>
+#include <linux/sizes.h>
+
+u64 crc64_nvme_arm64_c(u64 crc, const u8 *p, size_t len);
+
+#define crc64_be_arch crc64_be_generic
+
+static inline u64 crc64_nvme_arch(u64 crc, const u8 *p, size_t len)
+{
+ if (len >= 128 && cpu_have_named_feature(PMULL) &&
+ likely(may_use_simd())) { /* PMULL path: >= 128 bytes, feature present, SIMD usable */
+ do {
+ size_t chunk = min_t(size_t, len & ~15, SZ_4K); /* multiple of 16, at most 4 KiB */
+
+ scoped_ksimd() crc = crc64_nvme_arm64_c(crc, p, chunk); /* one scoped_ksimd() section per chunk */
+
+ p += chunk;
+ len -= chunk;
+ } while (len >= 128);
+ }
+ return crc64_nvme_generic(crc, p, len); /* remaining bytes, or the whole input if the PMULL path was skipped */
+}
--
2.43.0
next prev parent reply other threads:[~2026-03-27 6:02 UTC|newest]
Thread overview: 15+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-03-17 6:54 [PATCH] lib/crc: arm64: add NEON accelerated CRC64-NVMe implementation Demian Shulhan
2026-03-19 19:09 ` Eric Biggers
2026-03-20 10:36 ` David Laight
2026-03-20 20:00 ` Eric Biggers
2026-03-22 9:29 ` Demian Shulhan
2026-03-22 14:13 ` Eric Biggers
2026-03-19 23:31 ` David Laight
2026-03-20 11:22 ` kernel test robot
2026-03-27 6:02 ` Demian Shulhan [this message]
2026-03-27 19:38 ` [PATCH v2] " Eric Biggers
2026-03-29 7:43 ` [PATCH v3] " Demian Shulhan
2026-03-29 20:38 ` Eric Biggers
2026-03-29 21:57 ` David Laight
2026-03-29 22:18 ` Eric Biggers
2026-03-30 9:31 ` David Laight
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260327060211.902077-1-demyansh@gmail.com \
--to=demyansh@gmail.com \
--cc=ardb@kernel.org \
--cc=ebiggers@kernel.org \
--cc=linux-crypto@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.