From: Ard Biesheuvel <ardb+git@google.com>
To: linux-crypto@vger.kernel.org
Cc: linux-arm-kernel@lists.infradead.org, ebiggers@kernel.org,
herbert@gondor.apana.org.au, keescook@chromium.org,
Ard Biesheuvel <ardb@kernel.org>
Subject: [PATCH 6/6] crypto: arm/crct10dif - Implement plain NEON variant
Date: Mon, 28 Oct 2024 20:02:14 +0100 [thread overview]
Message-ID: <20241028190207.1394367-14-ardb+git@google.com> (raw)
In-Reply-To: <20241028190207.1394367-8-ardb+git@google.com>
From: Ard Biesheuvel <ardb@kernel.org>
The CRC-T10DIF algorithm produces a 16-bit CRC, and this is reflected in
the folding coefficients, which are also only 16 bits wide.
This means that the polynomial multiplications involving these
coefficients can be performed using 8-bit long polynomial multiplication
(8x8 -> 16) in only a few steps, and this is an instruction that is part
of the base NEON ISA, which is all most real ARMv7 cores implement. (The
64-bit PMULL instruction is part of the crypto extensions, which are
only implemented by 64-bit cores)
The final reduction is a bit more involved, but we can delegate that to
the generic CRC-T10DIF implementation after folding the entire input
into a 16 byte vector.
This results in a speedup of around 6.6x on Cortex-A72 running in 32-bit
mode.
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
arch/arm/crypto/crct10dif-ce-core.S | 50 ++++++++++++++++++--
arch/arm/crypto/crct10dif-ce-glue.c | 44 +++++++++++++++--
2 files changed, 85 insertions(+), 9 deletions(-)
diff --git a/arch/arm/crypto/crct10dif-ce-core.S b/arch/arm/crypto/crct10dif-ce-core.S
index 6b72167574b2..5e103a9a42dd 100644
--- a/arch/arm/crypto/crct10dif-ce-core.S
+++ b/arch/arm/crypto/crct10dif-ce-core.S
@@ -112,6 +112,34 @@
FOLD_CONST_L .req q10l
FOLD_CONST_H .req q10h
+__pmull16x64_p8:
+ vmull.p8 q13, d23, d24
+ vmull.p8 q14, d23, d25
+ vmull.p8 q15, d22, d24
+ vmull.p8 q12, d22, d25
+
+ veor q14, q14, q15
+ veor d24, d24, d25
+ veor d26, d26, d27
+ veor d28, d28, d29
+ vmov.i32 d25, #0
+ vmov.i32 d29, #0
+ vext.8 q12, q12, q12, #14
+ vext.8 q14, q14, q14, #15
+ veor d24, d24, d26
+ bx lr
+ENDPROC(__pmull16x64_p8)
+
+ .macro pmull16x64_p8, v16, v64
+ vext.8 q11, \v64, \v64, #1
+ vld1.64 {q12}, [r4, :128]
+ vuzp.8 q11, \v64
+ vtbl.8 d24, {\v16\()_L-\v16\()_H}, d24
+ vtbl.8 d25, {\v16\()_L-\v16\()_H}, d25
+ bl __pmull16x64_p8
+ veor \v64, q12, q14
+ .endm
+
.macro pmull16x64_p64, v16, v64
vmull.p64 q11, \v64\()l, \v16\()_L
vmull.p64 \v64, \v64\()h, \v16\()_H
@@ -249,9 +277,9 @@ CPU_LE( vrev64.8 q0, q0 )
vswp q0l, q0h
// q1 = high order part of second chunk: q7 left-shifted by 'len' bytes.
- mov_l r3, .Lbyteshift_table + 16
- sub r3, r3, len
- vld1.8 {q2}, [r3]
+ mov_l r1, .Lbyteshift_table + 16
+ sub r1, r1, len
+ vld1.8 {q2}, [r1]
vtbl.8 q1l, {q7l-q7h}, q2l
vtbl.8 q1h, {q7l-q7h}, q2h
@@ -341,9 +369,20 @@ ENTRY(crc_t10dif_pmull64)
vmov.u16 r0, q0l[0]
bx lr
-
ENDPROC(crc_t10dif_pmull64)
+ENTRY(crc_t10dif_pmull8)
+ push {r4, lr}
+ mov_l r4, .L16x64perm
+
+ crct10dif p8
+
+CPU_LE( vrev64.8 q7, q7 )
+ vswp q7l, q7h
+ vst1.64 {q7}, [r3, :128]
+ pop {r4, pc}
+ENDPROC(crc_t10dif_pmull8)
+
.section ".rodata", "a"
.align 4
@@ -376,3 +415,6 @@ ENDPROC(crc_t10dif_pmull64)
.byte 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
.byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
.byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe , 0x0
+
+.L16x64perm:
+ .quad 0x808080800000000, 0x909090901010101
diff --git a/arch/arm/crypto/crct10dif-ce-glue.c b/arch/arm/crypto/crct10dif-ce-glue.c
index 60aa79c2fcdb..4431e4ce2dbe 100644
--- a/arch/arm/crypto/crct10dif-ce-glue.c
+++ b/arch/arm/crypto/crct10dif-ce-glue.c
@@ -20,6 +20,7 @@
#define CRC_T10DIF_PMULL_CHUNK_SIZE 16U
asmlinkage u16 crc_t10dif_pmull64(u16 init_crc, const u8 *buf, size_t len);
+asmlinkage void crc_t10dif_pmull8(u16 init_crc, const u8 *buf, size_t len, u8 *out);
static int crct10dif_init(struct shash_desc *desc)
{
@@ -45,6 +46,27 @@ static int crct10dif_update_ce(struct shash_desc *desc, const u8 *data,
return 0;
}
+static int crct10dif_update_neon(struct shash_desc *desc, const u8 *data,
+ unsigned int length)
+{
+ u16 *crcp = shash_desc_ctx(desc);
+ u8 buf[16] __aligned(16);
+ u16 crc = *crcp;
+
+ if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) {
+ kernel_neon_begin();
+ crc_t10dif_pmull8(crc, data, length, buf);
+ kernel_neon_end();
+
+ crc = 0;
+ data = buf;
+ length = sizeof(buf);
+ }
+
+ *crcp = crc_t10dif_generic(crc, data, length);
+ return 0;
+}
+
static int crct10dif_final(struct shash_desc *desc, u8 *out)
{
u16 *crc = shash_desc_ctx(desc);
@@ -53,7 +75,19 @@ static int crct10dif_final(struct shash_desc *desc, u8 *out)
return 0;
}
-static struct shash_alg crc_t10dif_alg = {
+static struct shash_alg algs[] = {{
+ .digestsize = CRC_T10DIF_DIGEST_SIZE,
+ .init = crct10dif_init,
+ .update = crct10dif_update_neon,
+ .final = crct10dif_final,
+ .descsize = CRC_T10DIF_DIGEST_SIZE,
+
+ .base.cra_name = "crct10dif",
+ .base.cra_driver_name = "crct10dif-arm-neon",
+ .base.cra_priority = 150,
+ .base.cra_blocksize = CRC_T10DIF_BLOCK_SIZE,
+ .base.cra_module = THIS_MODULE,
+}, {
.digestsize = CRC_T10DIF_DIGEST_SIZE,
.init = crct10dif_init,
.update = crct10dif_update_ce,
@@ -65,19 +99,19 @@ static struct shash_alg crc_t10dif_alg = {
.base.cra_priority = 200,
.base.cra_blocksize = CRC_T10DIF_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
-};
+}};
static int __init crc_t10dif_mod_init(void)
{
- if (!(elf_hwcap2 & HWCAP2_PMULL))
+ if (!(elf_hwcap & HWCAP_NEON))
return -ENODEV;
- return crypto_register_shash(&crc_t10dif_alg);
+ return crypto_register_shashes(algs, 1 + !!(elf_hwcap2 & HWCAP2_PMULL));
}
static void __exit crc_t10dif_mod_exit(void)
{
- crypto_unregister_shash(&crc_t10dif_alg);
+ crypto_unregister_shashes(algs, 1 + !!(elf_hwcap2 & HWCAP2_PMULL));
}
module_init(crc_t10dif_mod_init);
--
2.47.0.163.g1226f6d8fa-goog
next prev parent reply other threads:[~2024-10-28 19:18 UTC|newest]
Thread overview: 13+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-10-28 19:02 [PATCH 0/6] Clean up and improve ARM/arm64 CRC-T10DIF code Ard Biesheuvel
2024-10-28 19:02 ` [PATCH 1/6] crypto: arm64/crct10dif - Remove obsolete chunking logic Ard Biesheuvel
2024-10-30 3:54 ` Eric Biggers
2024-10-28 19:02 ` [PATCH 2/6] crypto: arm64/crct10dif - Use faster 16x64 bit polynomial multiply Ard Biesheuvel
2024-10-30 4:01 ` Eric Biggers
2024-10-28 19:02 ` [PATCH 3/6] crypto: arm64/crct10dif - Remove remaining 64x64 PMULL fallback code Ard Biesheuvel
2024-10-30 4:15 ` Eric Biggers
2024-10-28 19:02 ` [PATCH 4/6] crypto: arm/crct10dif - Use existing mov_l macro instead of __adrl Ard Biesheuvel
2024-10-30 4:29 ` Eric Biggers
2024-10-28 19:02 ` [PATCH 5/6] crypto: arm/crct10dif - Macroify PMULL asm code Ard Biesheuvel
2024-10-30 4:31 ` Eric Biggers
2024-10-28 19:02 ` Ard Biesheuvel [this message]
2024-10-30 4:33 ` [PATCH 6/6] crypto: arm/crct10dif - Implement plain NEON variant Eric Biggers
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20241028190207.1394367-14-ardb+git@google.com \
--to=ardb+git@google.com \
--cc=ardb@kernel.org \
--cc=ebiggers@kernel.org \
--cc=herbert@gondor.apana.org.au \
--cc=keescook@chromium.org \
--cc=linux-arm-kernel@lists.infradead.org \
--cc=linux-crypto@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.