From: Ard Biesheuvel <ardb@kernel.org>
To: linux-crypto@vger.kernel.org
Cc: linux-arm-kernel@lists.infradead.org,
Ard Biesheuvel <ardb@kernel.org>,
Demian Shulhan <demyansh@gmail.com>,
Eric Biggers <ebiggers@kernel.org>
Subject: [PATCH 4/5] lib/crc: arm64: Simplify intrinsics implementation
Date: Mon, 30 Mar 2026 16:46:35 +0200 [thread overview]
Message-ID: <20260330144630.33026-11-ardb@kernel.org> (raw)
In-Reply-To: <20260330144630.33026-7-ardb@kernel.org>
NEON intrinsics are useful because they remove the need for manual
register allocation, and the resulting code can be re-compiled and
optimized for different micro-architectures, and shared between arm64
and 32-bit ARM.
However, the strong typing of the vector variables can lead to
incomprehensible gibberish, as is the case with the new CRC64
implementation. To address this, let's repaint all variables as
uint64x2_t to minimize the number of vreinterpretq_xxx() calls, and to
be able to rely on the ^ operator for exclusive OR operations. This
makes the code much more concise and readable.
While at it, wrap the calls to vmull_p64() et al. in order to have a more
consistent calling convention, and encapsulate any remaining
vreinterpret() calls that are still needed.
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
lib/crc/arm64/crc64-neon-inner.c | 77 ++++++++------------
1 file changed, 32 insertions(+), 45 deletions(-)
diff --git a/lib/crc/arm64/crc64-neon-inner.c b/lib/crc/arm64/crc64-neon-inner.c
index 881cdafadb37..28527e544ff6 100644
--- a/lib/crc/arm64/crc64-neon-inner.c
+++ b/lib/crc/arm64/crc64-neon-inner.c
@@ -8,9 +8,6 @@
u64 crc64_nvme_arm64_c(u64 crc, const u8 *p, size_t len);
-#define GET_P64_0(v) ((poly64_t)vgetq_lane_u64(vreinterpretq_u64_p64(v), 0))
-#define GET_P64_1(v) ((poly64_t)vgetq_lane_u64(vreinterpretq_u64_p64(v), 1))
-
/* x^191 mod G, x^127 mod G */
static const u64 fold_consts_val[2] = { 0xeadc41fd2ba3d420ULL,
0x21e9761e252621acULL };
@@ -18,61 +15,51 @@ static const u64 fold_consts_val[2] = { 0xeadc41fd2ba3d420ULL,
static const u64 bconsts_val[2] = { 0x27ecfa329aef9f77ULL,
0x34d926535897936aULL };
-u64 crc64_nvme_arm64_c(u64 crc, const u8 *p, size_t len)
+static inline uint64x2_t pmull64(uint64x2_t a, uint64x2_t b)
{
- uint64x2_t v0_u64 = { crc, 0 };
- poly64x2_t v0 = vreinterpretq_p64_u64(v0_u64);
- poly64x2_t fold_consts =
- vreinterpretq_p64_u64(vld1q_u64(fold_consts_val));
- poly64x2_t v1 = vreinterpretq_p64_u8(vld1q_u8(p));
+ return vreinterpretq_u64_p128(vmull_p64(vgetq_lane_u64(a, 0),
+ vgetq_lane_u64(b, 0)));
+}
- v0 = vreinterpretq_p64_u8(veorq_u8(vreinterpretq_u8_p64(v0),
- vreinterpretq_u8_p64(v1)));
- p += 16;
- len -= 16;
+static inline uint64x2_t pmull64_high(uint64x2_t a, uint64x2_t b)
+{
+ poly64x2_t l = vreinterpretq_p64_u64(a);
+ poly64x2_t m = vreinterpretq_p64_u64(b);
- do {
- v1 = vreinterpretq_p64_u8(vld1q_u8(p));
+ return vreinterpretq_u64_p128(vmull_high_p64(l, m));
+}
- poly128_t v2 = vmull_high_p64(fold_consts, v0);
- poly128_t v0_128 =
- vmull_p64(GET_P64_0(fold_consts), GET_P64_0(v0));
+static inline uint64x2_t pmull64_hi_lo(uint64x2_t a, uint64x2_t b)
+{
+ return vreinterpretq_u64_p128(vmull_p64(vgetq_lane_u64(a, 1),
+ vgetq_lane_u64(b, 0)));
+}
- uint8x16_t x0 = veorq_u8(vreinterpretq_u8_p128(v0_128),
- vreinterpretq_u8_p128(v2));
+u64 crc64_nvme_arm64_c(u64 crc, const u8 *p, size_t len)
+{
+ uint64x2_t fold_consts = vld1q_u64(fold_consts_val);
+ uint64x2_t v0 = { crc, 0 };
+ uint64x2_t zero = { };
- x0 = veorq_u8(x0, vreinterpretq_u8_p64(v1));
- v0 = vreinterpretq_p64_u8(x0);
+ for (;;) {
+ v0 ^= vreinterpretq_u64_u8(vld1q_u8(p));
p += 16;
len -= 16;
- } while (len >= 16);
-
- /* Multiply the 128-bit value by x^64 and reduce it back to 128 bits. */
- poly64x2_t v7 = vreinterpretq_p64_u64((uint64x2_t){ 0, 0 });
- poly128_t v1_128 = vmull_p64(GET_P64_1(fold_consts), GET_P64_0(v0));
+ if (len < 16)
+ break;
- uint8x16_t ext_v0 =
- vextq_u8(vreinterpretq_u8_p64(v0), vreinterpretq_u8_p64(v7), 8);
- uint8x16_t x0 = veorq_u8(ext_v0, vreinterpretq_u8_p128(v1_128));
+ v0 = pmull64(fold_consts, v0) ^ pmull64_high(fold_consts, v0);
+ }
- v0 = vreinterpretq_p64_u8(x0);
+ /* Multiply the 128-bit value by x^64 and reduce it back to 128 bits. */
+ v0 = vextq_u64(v0, zero, 1) ^ pmull64_hi_lo(fold_consts, v0);
/* Final Barrett reduction */
- poly64x2_t bconsts = vreinterpretq_p64_u64(vld1q_u64(bconsts_val));
-
- v1_128 = vmull_p64(GET_P64_0(bconsts), GET_P64_0(v0));
-
- poly64x2_t v1_64 = vreinterpretq_p64_u8(vreinterpretq_u8_p128(v1_128));
- poly128_t v3_128 = vmull_p64(GET_P64_1(bconsts), GET_P64_0(v1_64));
-
- x0 = veorq_u8(vreinterpretq_u8_p64(v0), vreinterpretq_u8_p128(v3_128));
-
- uint8x16_t ext_v2 = vextq_u8(vreinterpretq_u8_p64(v7),
- vreinterpretq_u8_p128(v1_128), 8);
+ uint64x2_t bconsts = vld1q_u64(bconsts_val);
+ uint64x2_t final = pmull64(bconsts, v0);
- x0 = veorq_u8(x0, ext_v2);
+ v0 ^= vextq_u64(zero, final, 1) ^ pmull64_hi_lo(bconsts, final);
- v0 = vreinterpretq_p64_u8(x0);
- return vgetq_lane_u64(vreinterpretq_u64_p64(v0), 1);
+ return vgetq_lane_u64(v0, 1);
}
--
2.53.0.1018.g2bb0e51243-goog
next prev parent reply other threads:[~2026-03-30 14:46 UTC|newest]
Thread overview: 18+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-03-30 14:46 [PATCH 0/5] crc64: Tweak intrinsics code and enable it for ARM Ard Biesheuvel
2026-03-30 14:46 ` [PATCH 1/5] lib/crc: arm64: Drop unnecessary chunking logic from crc64 Ard Biesheuvel
2026-03-31 22:33 ` Eric Biggers
2026-04-01 0:09 ` Eric Biggers
2026-04-01 6:57 ` Ard Biesheuvel
2026-03-30 14:46 ` [PATCH 2/5] lib/crc: arm64: Use existing macros for kernel-mode FPU cflags Ard Biesheuvel
2026-03-30 14:46 ` [PATCH 3/5] ARM: Add a neon-intrinsics.h header like on arm64 Ard Biesheuvel
2026-03-30 14:46 ` Ard Biesheuvel [this message]
2026-03-30 14:46 ` [PATCH 5/5] lib/crc: arm: Enable arm64's NEON intrinsics implementation of crc64 Ard Biesheuvel
2026-03-31 6:47 ` Christoph Hellwig
2026-03-31 8:20 ` Ard Biesheuvel
2026-03-31 22:41 ` Eric Biggers
2026-04-01 16:48 ` Ard Biesheuvel
2026-04-01 19:59 ` [PATCH 0/5] crc64: Tweak intrinsics code and enable it for ARM Eric Biggers
2026-04-02 8:52 ` Ard Biesheuvel
2026-04-02 23:40 ` Eric Biggers
2026-04-03 6:49 ` Ard Biesheuvel
2026-04-03 19:59 ` Eric Biggers
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260330144630.33026-11-ardb@kernel.org \
--to=ardb@kernel.org \
--cc=demyansh@gmail.com \
--cc=ebiggers@kernel.org \
--cc=linux-arm-kernel@lists.infradead.org \
--cc=linux-crypto@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox