From: Ard Biesheuvel <ardb+git@google.com>
To: linux-arm-kernel@lists.infradead.org
Cc: linux-crypto@vger.kernel.org, linux-raid@vger.kernel.org,
Ard Biesheuvel <ardb@kernel.org>, Christoph Hellwig <hch@lst.de>,
Russell King <linux@armlinux.org.uk>,
Arnd Bergmann <arnd@arndb.de>,
Eric Biggers <ebiggers@kernel.org>
Subject: [PATCH 5/8] lib/crc: arm: Enable arm64's NEON intrinsics implementation of crc64
Date: Wed, 22 Apr 2026 19:17:01 +0200 [thread overview]
Message-ID: <20260422171655.3437334-15-ardb+git@google.com> (raw)
In-Reply-To: <20260422171655.3437334-10-ardb+git@google.com>
From: Ard Biesheuvel <ardb@kernel.org>
Tweak the NEON intrinsics crc64 code written for arm64 so it can be
built for 32-bit ARM as well. The only workaround needed is to provide
alternatives for vmull_p64() and vmull_high_p64() on Clang, which only
defines those when building for the AArch64 or arm64ec ISA. Use the same
helpers for GCC too, to avoid doubling the size of the test/validation
matrix.
KUnit benchmark results (Cortex-A53 @ 1 Ghz)
Before:
# crc64_nvme_benchmark: len=1: 35 MB/s
# crc64_nvme_benchmark: len=16: 78 MB/s
# crc64_nvme_benchmark: len=64: 87 MB/s
# crc64_nvme_benchmark: len=127: 88 MB/s
# crc64_nvme_benchmark: len=128: 88 MB/s
# crc64_nvme_benchmark: len=200: 89 MB/s
# crc64_nvme_benchmark: len=256: 89 MB/s
# crc64_nvme_benchmark: len=511: 89 MB/s
# crc64_nvme_benchmark: len=512: 89 MB/s
# crc64_nvme_benchmark: len=1024: 90 MB/s
# crc64_nvme_benchmark: len=3173: 90 MB/s
# crc64_nvme_benchmark: len=4096: 90 MB/s
# crc64_nvme_benchmark: len=16384: 90 MB/s
After:
# crc64_nvme_benchmark: len=1: 32 MB/s
# crc64_nvme_benchmark: len=16: 76 MB/s
# crc64_nvme_benchmark: len=64: 71 MB/s
# crc64_nvme_benchmark: len=127: 88 MB/s
# crc64_nvme_benchmark: len=128: 618 MB/s
# crc64_nvme_benchmark: len=200: 542 MB/s
# crc64_nvme_benchmark: len=256: 920 MB/s
# crc64_nvme_benchmark: len=511: 836 MB/s
# crc64_nvme_benchmark: len=512: 1261 MB/s
# crc64_nvme_benchmark: len=1024: 1531 MB/s
# crc64_nvme_benchmark: len=3173: 1731 MB/s
# crc64_nvme_benchmark: len=4096: 1851 MB/s
# crc64_nvme_benchmark: len=16384: 1858 MB/s
Don't bother with big-endian, as it doesn't work correctly on Clang, and
is barely used these days.
Note that ARM disables preemption and softirq processing when using
kernel mode SIMD, so take care not to hog the CPU for too long.
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
lib/crc/Kconfig | 1 +
lib/crc/Makefile | 5 ++-
lib/crc/arm/crc64-neon.h | 34 ++++++++++++++++++
lib/crc/arm/crc64.h | 36 ++++++++++++++++++++
4 files changed, 75 insertions(+), 1 deletion(-)
diff --git a/lib/crc/Kconfig b/lib/crc/Kconfig
index 31038c8d111a..86a0e4bfec77 100644
--- a/lib/crc/Kconfig
+++ b/lib/crc/Kconfig
@@ -82,6 +82,7 @@ config CRC64
config CRC64_ARCH
bool
depends on CRC64 && CRC_OPTIMIZATIONS
+ default y if ARM && KERNEL_MODE_NEON && !CPU_BIG_ENDIAN
default y if ARM64
default y if RISCV && RISCV_ISA_ZBC && 64BIT
default y if X86_64
diff --git a/lib/crc/Makefile b/lib/crc/Makefile
index 193257ae466f..386e9c175263 100644
--- a/lib/crc/Makefile
+++ b/lib/crc/Makefile
@@ -39,8 +39,11 @@ crc64-y := crc64-main.o
ifeq ($(CONFIG_CRC64_ARCH),y)
CFLAGS_crc64-main.o += -I$(src)/$(SRCARCH)
+crc64-cflags-$(CONFIG_ARM) += -march=armv8-a -mfpu=crypto-neon-fp-armv8
+crc64-cflags-$(CONFIG_ARM64) += -march=armv8-a+crypto
CFLAGS_REMOVE_crc64-neon.o += $(CC_FLAGS_NO_FPU)
-CFLAGS_crc64-neon.o += $(CC_FLAGS_FPU) -I$(src)/$(SRCARCH) -march=armv8-a+crypto
+CFLAGS_crc64-neon.o += $(CC_FLAGS_FPU) -I$(src)/$(SRCARCH) $(crc64-cflags-y)
+crc64-$(CONFIG_ARM) += crc64-neon.o
crc64-$(CONFIG_ARM64) += crc64-neon.o
crc64-$(CONFIG_RISCV) += riscv/crc64_lsb.o riscv/crc64_msb.o
diff --git a/lib/crc/arm/crc64-neon.h b/lib/crc/arm/crc64-neon.h
new file mode 100644
index 000000000000..645f553220ff
--- /dev/null
+++ b/lib/crc/arm/crc64-neon.h
@@ -0,0 +1,34 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+static inline uint64x2_t pmull64(uint64x2_t a, uint64x2_t b)
+{
+ uint64_t l = vgetq_lane_u64(a, 0);
+ uint64_t m = vgetq_lane_u64(b, 0);
+ uint64x2_t result;
+
+ asm("vmull.p64 %q0, %P1, %P2" : "=w"(result) : "w"(l), "w"(m));
+
+ return result;
+}
+
+static inline uint64x2_t pmull64_high(uint64x2_t a, uint64x2_t b)
+{
+ uint64_t l = vgetq_lane_u64(a, 1);
+ uint64_t m = vgetq_lane_u64(b, 1);
+ uint64x2_t result;
+
+ asm("vmull.p64 %q0, %P1, %P2" : "=w"(result) : "w"(l), "w"(m));
+
+ return result;
+}
+
+static inline uint64x2_t pmull64_hi_lo(uint64x2_t a, uint64x2_t b)
+{
+ uint64_t l = vgetq_lane_u64(a, 1);
+ uint64_t m = vgetq_lane_u64(b, 0);
+ uint64x2_t result;
+
+ asm("vmull.p64 %q0, %P1, %P2" : "=w"(result) : "w"(l), "w"(m));
+
+ return result;
+}
diff --git a/lib/crc/arm/crc64.h b/lib/crc/arm/crc64.h
new file mode 100644
index 000000000000..de274288af61
--- /dev/null
+++ b/lib/crc/arm/crc64.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * CRC64 using ARM PMULL instructions
+ */
+
+#include <asm/simd.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pmull);
+
+u64 crc64_nvme_neon(u64 crc, const u8 *p, size_t len);
+
+#define crc64_be_arch crc64_be_generic
+
+static inline u64 crc64_nvme_arch(u64 crc, const u8 *p, size_t len)
+{
+ if (len >= 128 && static_branch_likely(&have_pmull) &&
+ likely(may_use_simd())) {
+ do {
+ size_t chunk = min_t(size_t, len & ~15, SZ_4K);
+
+ scoped_ksimd()
+ crc = crc64_nvme_neon(crc, p, chunk);
+
+ p += chunk;
+ len -= chunk;
+ } while (len >= 128);
+ }
+ return crc64_nvme_generic(crc, p, len);
+}
+
+#define crc64_mod_init_arch crc64_mod_init_arch
+static void crc64_mod_init_arch(void)
+{
+ if (elf_hwcap2 & HWCAP2_PMULL)
+ static_branch_enable(&have_pmull);
+}
--
2.54.0.rc1.555.g9c883467ad-goog
next prev parent reply other threads:[~2026-04-22 17:17 UTC|newest]
Thread overview: 19+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-04-22 17:16 [PATCH 0/8] ARM crc64 and XOR using NEON intrinsics Ard Biesheuvel
2026-04-22 17:16 ` [PATCH 1/8] ARM: Add a neon-intrinsics.h header like on arm64 Ard Biesheuvel
2026-04-22 17:16 ` [PATCH 2/8] xor/arm: Replace vectorized implementation with arm64's intrinsics Ard Biesheuvel
2026-04-22 18:07 ` Josh Law
2026-04-23 7:44 ` Christoph Hellwig
2026-04-22 17:16 ` [PATCH 3/8] xor/arm64: Use shared NEON intrinsics implementation from 32-bit ARM Ard Biesheuvel
2026-04-22 18:11 ` Josh Law
2026-04-23 7:46 ` Christoph Hellwig
2026-04-23 7:48 ` Ard Biesheuvel
2026-04-22 17:17 ` [PATCH 4/8] lib/crc: Turn NEON intrinsics crc64 implementation into common code Ard Biesheuvel
2026-04-22 18:13 ` Josh Law
2026-04-22 17:17 ` Ard Biesheuvel [this message]
2026-04-22 18:16 ` [PATCH 5/8] lib/crc: arm: Enable arm64's NEON intrinsics implementation of crc64 Josh Law
2026-04-22 17:17 ` [PATCH 6/8] crypto: aegis128 - Use neon-intrinsics.h on ARM too Ard Biesheuvel
2026-04-22 18:19 ` Josh Law
2026-04-22 17:17 ` [PATCH 7/8] lib/raid6: Include asm/neon-intrinsics.h rather than arm_neon.h Ard Biesheuvel
2026-04-22 18:20 ` Josh Law
2026-04-23 7:47 ` Christoph Hellwig
2026-04-22 17:17 ` [PATCH 8/8] ARM: Remove hacked-up asm/types.h header Ard Biesheuvel
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260422171655.3437334-15-ardb+git@google.com \
--to=ardb+git@google.com \
--cc=ardb@kernel.org \
--cc=arnd@arndb.de \
--cc=ebiggers@kernel.org \
--cc=hch@lst.de \
--cc=linux-arm-kernel@lists.infradead.org \
--cc=linux-crypto@vger.kernel.org \
--cc=linux-raid@vger.kernel.org \
--cc=linux@armlinux.org.uk \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox