From: Ard Biesheuvel <ardb+git@google.com>
To: linux-crypto@vger.kernel.org
Cc: linux-arm-kernel@lists.infradead.org, ebiggers@kernel.org,
herbert@gondor.apana.org.au, keescook@chromium.org,
Ard Biesheuvel <ardb@kernel.org>
Subject: [PATCH v2 6/6] crypto: arm/crct10dif - Implement plain NEON variant
Date: Tue, 5 Nov 2024 17:09:06 +0100
Message-ID: <20241105160859.1459261-14-ardb+git@google.com>
In-Reply-To: <20241105160859.1459261-8-ardb+git@google.com>
From: Ard Biesheuvel <ardb@kernel.org>
The CRC-T10DIF algorithm produces a 16-bit CRC, and this is reflected in
the folding coefficients, which are also only 16 bits wide.
This means that the polynomial multiplications involving these
coefficients can be carried out in only a few steps using 8-bit long
polynomial multiplication (8x8 -> 16), an instruction that is part of
the base NEON ISA, which is all that most real-world ARMv7 cores
implement. (The 64-bit PMULL instruction is part of the crypto
extensions, which are only implemented by 64-bit cores.)
The final reduction is a bit more involved, but we can delegate that to
the generic CRC-T10DIF implementation after folding the entire input
into a 16-byte vector.
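
For reference, here is a rough C sketch of that decomposition
(illustration only, not part of the patch; clmul8() and clmul16x64()
are made-up helper names): a 16x64-bit carryless multiply built from
8x8 -> 16 bit carryless multiplies, one rank per byte of the 64-bit
operand, mirroring the rank table in the comment added by this patch.

  #include <stdint.h>

  /* 8x8 -> 16 bit carryless multiply, i.e., one lane of vmull.p8 */
  static uint16_t clmul8(uint8_t a, uint8_t b)
  {
          uint16_t r = 0;

          for (int i = 0; i < 8; i++)
                  if (b & (1 << i))
                          r ^= (uint16_t)a << i;
          return r;
  }

  /* 16x64 -> 80 bit carryless multiply; res[0] = low 64 bits, res[1] = high */
  static void clmul16x64(uint16_t w, uint64_t x, uint64_t res[2])
  {
          res[0] = res[1] = 0;

          for (int i = 0; i < 8; i++) {
                  uint8_t xi = x >> (8 * i);
                  /* rank i: w0*xi ^ (w1*xi << 8), each term an 8x8 clmul */
                  uint32_t t = clmul8(w, xi) ^
                               ((uint32_t)clmul8(w >> 8, xi) << 8);

                  res[0] ^= (uint64_t)t << (8 * i);
                  if (i)
                          res[1] ^= (uint64_t)t >> (64 - 8 * i);
          }
  }

The NEON code performs two such 16x64 multiplications at once by
interleaving the operands, as described in the comment block below.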
This results in a speedup of around 6.6x on Cortex-A72 running in 32-bit
mode. On Cortex-A8 (BeagleBone White), the results are substantially
better than that, but not sufficiently reproducible (with tcrypt) to
quote a number here.
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
arch/arm/crypto/crct10dif-ce-core.S | 98 +++++++++++++++++++-
arch/arm/crypto/crct10dif-ce-glue.c | 45 ++++++++-
2 files changed, 134 insertions(+), 9 deletions(-)
diff --git a/arch/arm/crypto/crct10dif-ce-core.S b/arch/arm/crypto/crct10dif-ce-core.S
index 6b72167574b2..2bbf2df9c1e2 100644
--- a/arch/arm/crypto/crct10dif-ce-core.S
+++ b/arch/arm/crypto/crct10dif-ce-core.S
@@ -112,6 +112,82 @@
FOLD_CONST_L .req q10l
FOLD_CONST_H .req q10h
+ /*
+ * Pairwise long polynomial multiplication of two 16-bit values
+ *
+ * { w0, w1 }, { y0, y1 }
+ *
+ * by two 64-bit values
+ *
+ * { x0, x1, x2, x3, x4, x5, x6, x7 }, { z0, z1, z2, z3, z4, z5, z6, z7 }
+ *
+ * where each vector element is a byte, ordered from least to most
+ * significant. The resulting 80-bit vectors are XOR'ed together.
+ *
+ * This can be implemented using 8x8 long polynomial multiplication, by
+ * reorganizing the input so that each pairwise 8x8 multiplication
+ * produces one of the terms from the decomposition below, and
+ * combining the results of each rank and shifting them into place.
+ *
+ * Rank
+ * 0 w0*x0 ^ | y0*z0 ^
+ * 1 (w0*x1 ^ w1*x0) << 8 ^ | (y0*z1 ^ y1*z0) << 8 ^
+ * 2 (w0*x2 ^ w1*x1) << 16 ^ | (y0*z2 ^ y1*z1) << 16 ^
+ * 3 (w0*x3 ^ w1*x2) << 24 ^ | (y0*z3 ^ y1*z2) << 24 ^
+ * 4 (w0*x4 ^ w1*x3) << 32 ^ | (y0*z4 ^ y1*z3) << 32 ^
+ * 5 (w0*x5 ^ w1*x4) << 40 ^ | (y0*z5 ^ y1*z4) << 40 ^
+ * 6 (w0*x6 ^ w1*x5) << 48 ^ | (y0*z6 ^ y1*z5) << 48 ^
+ * 7 (w0*x7 ^ w1*x6) << 56 ^ | (y0*z7 ^ y1*z6) << 56 ^
+ * 8 w1*x7 << 64 | y1*z7 << 64
+ *
+ * The inputs can be reorganized into
+ *
+ * { w0, w0, w0, w0, y0, y0, y0, y0 }, { w1, w1, w1, w1, y1, y1, y1, y1 }
+ * { x0, x2, x4, x6, z0, z2, z4, z6 }, { x1, x3, x5, x7, z1, z3, z5, z7 }
+ *
+ * and after performing 8x8->16 bit long polynomial multiplication of
+ * each of the halves of the first vector with those of the second one,
+ * we obtain the following four vectors of 16-bit elements:
+ *
+ * a := { w0*x0, w0*x2, w0*x4, w0*x6 }, { y0*z0, y0*z2, y0*z4, y0*z6 }
+ * b := { w0*x1, w0*x3, w0*x5, w0*x7 }, { y0*z1, y0*z3, y0*z5, y0*z7 }
+ * c := { w1*x0, w1*x2, w1*x4, w1*x6 }, { y1*z0, y1*z2, y1*z4, y1*z6 }
+ * d := { w1*x1, w1*x3, w1*x5, w1*x7 }, { y1*z1, y1*z3, y1*z5, y1*z7 }
+ *
+ * Results b and c can be XORed together, as the vector elements have
+ * matching ranks. Then, the final XOR can be pulled forward, and
+ * applied between the halves of each of the remaining three vectors,
+ * which are then shifted into place, and XORed together to produce the
+ * final 80-bit result.
+ */
+ .macro pmull16x64_p8, v16, v64
+ vext.8 q11, \v64, \v64, #1
+ vld1.64 {q12}, [r4, :128]
+ vuzp.8 q11, \v64
+ vtbl.8 d24, {\v16\()_L-\v16\()_H}, d24
+ vtbl.8 d25, {\v16\()_L-\v16\()_H}, d25
+ bl __pmull16x64_p8
+ veor \v64, q12, q14
+ .endm
+
+__pmull16x64_p8:
+ vmull.p8 q13, d23, d24
+ vmull.p8 q14, d23, d25
+ vmull.p8 q15, d22, d24
+ vmull.p8 q12, d22, d25
+
+ veor q14, q14, q15
+ veor d24, d24, d25
+ veor d26, d26, d27
+ veor d28, d28, d29
+ vmov.i32 d25, #0
+ vmov.i32 d29, #0
+ vext.8 q12, q12, q12, #14
+ vext.8 q14, q14, q14, #15
+ veor d24, d24, d26
+ bx lr
+ENDPROC(__pmull16x64_p8)
+
.macro pmull16x64_p64, v16, v64
vmull.p64 q11, \v64\()l, \v16\()_L
vmull.p64 \v64, \v64\()h, \v16\()_H
@@ -249,9 +325,9 @@ CPU_LE( vrev64.8 q0, q0 )
vswp q0l, q0h
// q1 = high order part of second chunk: q7 left-shifted by 'len' bytes.
- mov_l r3, .Lbyteshift_table + 16
- sub r3, r3, len
- vld1.8 {q2}, [r3]
+ mov_l r1, .Lbyteshift_table + 16
+ sub r1, r1, len
+ vld1.8 {q2}, [r1]
vtbl.8 q1l, {q7l-q7h}, q2l
vtbl.8 q1h, {q7l-q7h}, q2h
@@ -341,9 +417,20 @@ ENTRY(crc_t10dif_pmull64)
vmov.u16 r0, q0l[0]
bx lr
-
ENDPROC(crc_t10dif_pmull64)
+ENTRY(crc_t10dif_pmull8)
+ push {r4, lr}
+ mov_l r4, .L16x64perm
+
+ crct10dif p8
+
+CPU_LE( vrev64.8 q7, q7 )
+ vswp q7l, q7h
+ vst1.64 {q7}, [r3, :128]
+ pop {r4, pc}
+ENDPROC(crc_t10dif_pmull8)
+
.section ".rodata", "a"
.align 4
@@ -376,3 +463,6 @@ ENDPROC(crc_t10dif_pmull64)
.byte 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
.byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
.byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe , 0x0
+
+.L16x64perm:
+ .quad 0x808080800000000, 0x909090901010101
diff --git a/arch/arm/crypto/crct10dif-ce-glue.c b/arch/arm/crypto/crct10dif-ce-glue.c
index 60aa79c2fcdb..a8b74523729e 100644
--- a/arch/arm/crypto/crct10dif-ce-glue.c
+++ b/arch/arm/crypto/crct10dif-ce-glue.c
@@ -20,6 +20,8 @@
#define CRC_T10DIF_PMULL_CHUNK_SIZE 16U
asmlinkage u16 crc_t10dif_pmull64(u16 init_crc, const u8 *buf, size_t len);
+asmlinkage void crc_t10dif_pmull8(u16 init_crc, const u8 *buf, size_t len,
+ u8 out[16]);
static int crct10dif_init(struct shash_desc *desc)
{
@@ -45,6 +47,27 @@ static int crct10dif_update_ce(struct shash_desc *desc, const u8 *data,
return 0;
}
+static int crct10dif_update_neon(struct shash_desc *desc, const u8 *data,
+ unsigned int length)
+{
+ u16 *crcp = shash_desc_ctx(desc);
+ u8 buf[16] __aligned(16);
+ u16 crc = *crcp;
+
+ if (length > CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) {
+ kernel_neon_begin();
+ crc_t10dif_pmull8(crc, data, length, buf);
+ kernel_neon_end();
+
+ crc = 0;
+ data = buf;
+ length = sizeof(buf);
+ }
+
+ *crcp = crc_t10dif_generic(crc, data, length);
+ return 0;
+}
+
static int crct10dif_final(struct shash_desc *desc, u8 *out)
{
u16 *crc = shash_desc_ctx(desc);
@@ -53,7 +76,19 @@ static int crct10dif_final(struct shash_desc *desc, u8 *out)
return 0;
}
-static struct shash_alg crc_t10dif_alg = {
+static struct shash_alg algs[] = {{
+ .digestsize = CRC_T10DIF_DIGEST_SIZE,
+ .init = crct10dif_init,
+ .update = crct10dif_update_neon,
+ .final = crct10dif_final,
+ .descsize = CRC_T10DIF_DIGEST_SIZE,
+
+ .base.cra_name = "crct10dif",
+ .base.cra_driver_name = "crct10dif-arm-neon",
+ .base.cra_priority = 150,
+ .base.cra_blocksize = CRC_T10DIF_BLOCK_SIZE,
+ .base.cra_module = THIS_MODULE,
+}, {
.digestsize = CRC_T10DIF_DIGEST_SIZE,
.init = crct10dif_init,
.update = crct10dif_update_ce,
@@ -65,19 +100,19 @@ static struct shash_alg crc_t10dif_alg = {
.base.cra_priority = 200,
.base.cra_blocksize = CRC_T10DIF_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
-};
+}};
static int __init crc_t10dif_mod_init(void)
{
- if (!(elf_hwcap2 & HWCAP2_PMULL))
+ if (!(elf_hwcap & HWCAP_NEON))
return -ENODEV;
- return crypto_register_shash(&crc_t10dif_alg);
+ return crypto_register_shashes(algs, 1 + !!(elf_hwcap2 & HWCAP2_PMULL));
}
static void __exit crc_t10dif_mod_exit(void)
{
- crypto_unregister_shash(&crc_t10dif_alg);
+ crypto_unregister_shashes(algs, 1 + !!(elf_hwcap2 & HWCAP2_PMULL));
}
module_init(crc_t10dif_mod_init);
--
2.47.0.199.ga7371fff76-goog