From: ard.biesheuvel@linaro.org (Ard Biesheuvel)
To: linux-arm-kernel@lists.infradead.org
Subject: [PATCH] arm64/lib: add optimized implementation of sha_transform
Date: Fri, 14 Mar 2014 16:02:33 +0100 [thread overview]
Message-ID: <1394809353-16707-1-git-send-email-ard.biesheuvel@linaro.org> (raw)
This implementation keeps the 64 bytes of workspace in registers rather than
on the stack, eliminating most of the loads and stores, and reducing the
instruction count by about 25%.
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
Hello all,
No performance numbers I am allowed to share, unfortunately, so if anyone else
(with access to actual, representative hardware) would care to have a go, I
would be very grateful.
This can be done by building the tcrypt.ko module (CONFIG_CRYPTO_TEST=m) and
inserting it with 'mode=303' as a parameter (note that insmod always fails,
but the test output is still written to the kernel log). Also note that
the sha_transform() function will be part of the kernel proper, so just
rebuilding the sha1_generic module is not sufficient.
Cheers,
arch/arm64/kernel/arm64ksyms.c | 3 +
arch/arm64/lib/Makefile | 2 +-
arch/arm64/lib/sha1.S | 256 +++++++++++++++++++++++++++++++++++++++++
3 files changed, 260 insertions(+), 1 deletion(-)
create mode 100644 arch/arm64/lib/sha1.S
diff --git a/arch/arm64/kernel/arm64ksyms.c b/arch/arm64/kernel/arm64ksyms.c
index 338b568cd8ae..1f5693fb5d93 100644
--- a/arch/arm64/kernel/arm64ksyms.c
+++ b/arch/arm64/kernel/arm64ksyms.c
@@ -56,3 +56,6 @@ EXPORT_SYMBOL(clear_bit);
EXPORT_SYMBOL(test_and_clear_bit);
EXPORT_SYMBOL(change_bit);
EXPORT_SYMBOL(test_and_change_bit);
+
+ /* SHA-1 implementation under lib/ */
+EXPORT_SYMBOL(sha_transform);
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
index 328ce1a99daa..ea093ebb9a9a 100644
--- a/arch/arm64/lib/Makefile
+++ b/arch/arm64/lib/Makefile
@@ -1,4 +1,4 @@
lib-y := bitops.o clear_user.o delay.o copy_from_user.o \
copy_to_user.o copy_in_user.o copy_page.o \
clear_page.o memchr.o memcpy.o memmove.o memset.o \
- strchr.o strrchr.o
+ strchr.o strrchr.o sha1.o
diff --git a/arch/arm64/lib/sha1.S b/arch/arm64/lib/sha1.S
new file mode 100644
index 000000000000..877b8d70e992
--- /dev/null
+++ b/arch/arm64/lib/sha1.S
@@ -0,0 +1,256 @@
+/*
+ * linux/arch/arm64/lib/sha1.S
+ *
+ * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+ .text
+
+ /*
+ * Current round constant. Note that this aliases w1, the register that
+ * carries the data pointer on entry to sha_transform, so the pointer is
+ * gone as soon as k is first loaded.
+ */
+ k .req w1
+
+ /* per-round accumulator: 32-bit view and 64-bit view of x2 */
+ res .req w2
+ xres .req x2
+
+ /* the five 32-bit working variables passed to sha1_round */
+ wA .req w3
+ wB .req w4
+ wC .req w5
+ wD .req w6
+ wE .req w7
+
+ /* scratch register: 32-bit view and 64-bit view of x16 */
+ tmp .req w16
+ xtmp .req x16
+
+ /*
+ * out = (b & c) | (~b & d), computed as d ^ (b & (c ^ d)).
+ *
+ * \out is written before \b and \d are read for the last time, so it
+ * must be a different register from both.
+ */
+ .macro sha1_choose, out, b, c, d
+ eor \out, \c, \d
+ and \out, \out, \b
+ eor \out, \out, \d
+ .endm
+
+ /*
+ * out = b ^ c ^ d
+ *
+ * \out is written before \d is read, so it must not be the same
+ * register as \d.
+ */
+ .macro sha1_parity, out, b, c, d
+ eor \out, \b, \c
+ eor \out, \out, \d
+ .endm
+
+ /*
+ * out = (b & c) | ((b ^ c) & d), i.e. the bitwise majority of b, c, d.
+ *
+ * A bit set in (b & c) is always clear in (b ^ c), so the two terms
+ * never overlap and the final add produces the same value as an orr.
+ * Clobbers tmp.
+ */
+ .macro sha1_majority, out, b, c, d
+ eor tmp, \b, \c
+ and \out, \b, \c
+ and tmp, tmp, \d
+ add \out, \out, tmp
+ .endm
+
+ /*
+ * Replace the two 32-bit words held in \st0 with two new ones.
+ *
+ * All arguments are 64-bit registers, each holding a pair of 32-bit
+ * words (lo, hi). The result is
+ *
+ * st0.lo = rol(st0.lo ^ st1.lo ^ st4.lo ^ st6.hi, 1)
+ * st0.hi = rol(st0.hi ^ st1.hi ^ st4.hi ^ st7.lo, 1)
+ *
+ * Clobbers res/xres and tmp/xtmp.
+ */
+ .macro mix_state, st0, st1, st4, st6, st7
+ /* xtmp = { lo: st6.hi, hi: st7.lo } */
+ extr xtmp, \st7, \st6, #32
+ eor \st0, \st0, \st1
+ eor xtmp, xtmp, \st4
+ eor xtmp, xtmp, \st0
+ /* low lane: rotate left by one; the 32-bit write leaves xres[63:32] zero */
+ ror res, tmp, #(32 - 1)
+ /* high lane: bring it down to bits [31:0], then rotate left by one */
+ lsr xtmp, xtmp, #32
+ ror tmp, tmp, #(32 - 1)
+ /* recombine both lanes into \st0 */
+ orr \st0, xres, xtmp, lsl #32
+ .endm
+
+ /*
+ * One round:
+ *
+ * \e = rol(\a, 5) + sha1_\func(\b, \c, \d) + \e + W + k
+ * \b = rol(\b, 30)
+ *
+ * W is the high 32-bit half of x\r when \h is 'h', and the low half
+ * (w\r) for any other value of \h. The new value is left in \e; the
+ * invocations rotate the five register arguments by one position each
+ * time instead of moving data between registers. Clobbers res/xres,
+ * plus whatever sha1_\func clobbers.
+ */
+ .macro sha1_round, func, r, h, a, b, c, d, e
+ sha1_\func res, \b, \c, \d
+ /* fold in the old \e before \e is overwritten below */
+ add res, res, \e
+ ror \e, \a, #(32 - 5)
+ .ifc \h, h
+ /* 64-bit add; any carry out of bit 31 is dropped by the 32-bit add below */
+ add xres, xres, x\r, lsr #32
+ .else
+ add res, res, w\r
+ .endif
+ add \e, \e, k
+ ror \b, \b, #2
+ add \e, \e, res
+ .endm
+
+ /*
+ * void sha_transform(__u32 *digest, const char *data, __u32 *array)
+ *
+ * Run the 80 SHA-1 rounds over the 64-byte block at data (x1) and add
+ * the result to the five 32-bit words at digest (x0).
+ *
+ * The array argument (x2) is never dereferenced: the 16-word message
+ * schedule is kept in x8-x15 (two words per register, earlier word in
+ * the low half) and x2 is reused as scratch.
+ *
+ * Clobbers x1-x16.
+ */
+ENTRY(sha_transform)
+ /* load input into state array */
+ ldp x8, x9, [x1]
+ ldp x10, x11, [x1, #16]
+ ldp x12, x13, [x1, #32]
+ ldp x14, x15, [x1, #48]
+
+ /* load digest input */
+ ldr wA, [x0]
+ ldp wB, wC, [x0, #4]
+ ldp wD, wE, [x0, #12]
+
+ /*
+ * The rounds and mix_state expect each state register to hold two
+ * big-endian words with the earlier word in the low half.
+ *
+ * LE: the 64-bit load already puts the earlier word in the low half,
+ * so only the bytes within each 32-bit word need reversing.
+ * BE: the bytes within each word are already in order, but the 64-bit
+ * load puts the earlier word in the high half, so swap the halves.
+ */
+CPU_LE( rev32 x8, x8 )
+CPU_LE( rev32 x9, x9 )
+CPU_LE( rev32 x10, x10 )
+CPU_LE( rev32 x11, x11 )
+CPU_LE( rev32 x12, x12 )
+CPU_LE( rev32 x13, x13 )
+CPU_LE( rev32 x14, x14 )
+CPU_LE( rev32 x15, x15 )
+CPU_BE( ror x8, x8, #32 )
+CPU_BE( ror x9, x9, #32 )
+CPU_BE( ror x10, x10, #32 )
+CPU_BE( ror x11, x11, #32 )
+CPU_BE( ror x12, x12, #32 )
+CPU_BE( ror x13, x13, #32 )
+CPU_BE( ror x14, x14, #32 )
+CPU_BE( ror x15, x15, #32 )
+
+ /*
+ * round 1 (20 steps, choose): the first 16 steps consume the input
+ * words directly, the remaining 4 use freshly mixed words
+ */
+ ldr k, =0x5a827999
+ sha1_round choose, 8, l, wA, wB, wC, wD, wE
+ sha1_round choose, 8, h, wE, wA, wB, wC, wD
+ sha1_round choose, 9, l, wD, wE, wA, wB, wC
+ sha1_round choose, 9, h, wC, wD, wE, wA, wB
+ sha1_round choose, 10, l, wB, wC, wD, wE, wA
+ sha1_round choose, 10, h, wA, wB, wC, wD, wE
+ sha1_round choose, 11, l, wE, wA, wB, wC, wD
+ sha1_round choose, 11, h, wD, wE, wA, wB, wC
+ sha1_round choose, 12, l, wC, wD, wE, wA, wB
+ sha1_round choose, 12, h, wB, wC, wD, wE, wA
+ sha1_round choose, 13, l, wA, wB, wC, wD, wE
+ sha1_round choose, 13, h, wE, wA, wB, wC, wD
+ sha1_round choose, 14, l, wD, wE, wA, wB, wC
+ sha1_round choose, 14, h, wC, wD, wE, wA, wB
+ sha1_round choose, 15, l, wB, wC, wD, wE, wA
+ sha1_round choose, 15, h, wA, wB, wC, wD, wE
+
+ mix_state x8, x9, x12, x14, x15
+ sha1_round choose, 8, l, wE, wA, wB, wC, wD
+ sha1_round choose, 8, h, wD, wE, wA, wB, wC
+ mix_state x9, x10, x13, x15, x8
+ sha1_round choose, 9, l, wC, wD, wE, wA, wB
+ sha1_round choose, 9, h, wB, wC, wD, wE, wA
+
+ /* round 2 (20 steps, parity) */
+ ldr k, =0x6ed9eba1
+ mix_state x10, x11, x14, x8, x9
+ sha1_round parity, 10, l, wA, wB, wC, wD, wE
+ sha1_round parity, 10, h, wE, wA, wB, wC, wD
+ mix_state x11, x12, x15, x9, x10
+ sha1_round parity, 11, l, wD, wE, wA, wB, wC
+ sha1_round parity, 11, h, wC, wD, wE, wA, wB
+ mix_state x12, x13, x8, x10, x11
+ sha1_round parity, 12, l, wB, wC, wD, wE, wA
+ sha1_round parity, 12, h, wA, wB, wC, wD, wE
+ mix_state x13, x14, x9, x11, x12
+ sha1_round parity, 13, l, wE, wA, wB, wC, wD
+ sha1_round parity, 13, h, wD, wE, wA, wB, wC
+ mix_state x14, x15, x10, x12, x13
+ sha1_round parity, 14, l, wC, wD, wE, wA, wB
+ sha1_round parity, 14, h, wB, wC, wD, wE, wA
+ mix_state x15, x8, x11, x13, x14
+ sha1_round parity, 15, l, wA, wB, wC, wD, wE
+ sha1_round parity, 15, h, wE, wA, wB, wC, wD
+ mix_state x8, x9, x12, x14, x15
+ sha1_round parity, 8, l, wD, wE, wA, wB, wC
+ sha1_round parity, 8, h, wC, wD, wE, wA, wB
+ mix_state x9, x10, x13, x15, x8
+ sha1_round parity, 9, l, wB, wC, wD, wE, wA
+ sha1_round parity, 9, h, wA, wB, wC, wD, wE
+ mix_state x10, x11, x14, x8, x9
+ sha1_round parity, 10, l, wE, wA, wB, wC, wD
+ sha1_round parity, 10, h, wD, wE, wA, wB, wC
+ mix_state x11, x12, x15, x9, x10
+ sha1_round parity, 11, l, wC, wD, wE, wA, wB
+ sha1_round parity, 11, h, wB, wC, wD, wE, wA
+
+ /* round 3 (20 steps, majority) */
+ ldr k, =0x8f1bbcdc
+ mix_state x12, x13, x8, x10, x11
+ sha1_round majority, 12, l, wA, wB, wC, wD, wE
+ sha1_round majority, 12, h, wE, wA, wB, wC, wD
+ mix_state x13, x14, x9, x11, x12
+ sha1_round majority, 13, l, wD, wE, wA, wB, wC
+ sha1_round majority, 13, h, wC, wD, wE, wA, wB
+ mix_state x14, x15, x10, x12, x13
+ sha1_round majority, 14, l, wB, wC, wD, wE, wA
+ sha1_round majority, 14, h, wA, wB, wC, wD, wE
+ mix_state x15, x8, x11, x13, x14
+ sha1_round majority, 15, l, wE, wA, wB, wC, wD
+ sha1_round majority, 15, h, wD, wE, wA, wB, wC
+ mix_state x8, x9, x12, x14, x15
+ sha1_round majority, 8, l, wC, wD, wE, wA, wB
+ sha1_round majority, 8, h, wB, wC, wD, wE, wA
+ mix_state x9, x10, x13, x15, x8
+ sha1_round majority, 9, l, wA, wB, wC, wD, wE
+ sha1_round majority, 9, h, wE, wA, wB, wC, wD
+ mix_state x10, x11, x14, x8, x9
+ sha1_round majority, 10, l, wD, wE, wA, wB, wC
+ sha1_round majority, 10, h, wC, wD, wE, wA, wB
+ mix_state x11, x12, x15, x9, x10
+ sha1_round majority, 11, l, wB, wC, wD, wE, wA
+ sha1_round majority, 11, h, wA, wB, wC, wD, wE
+ mix_state x12, x13, x8, x10, x11
+ sha1_round majority, 12, l, wE, wA, wB, wC, wD
+ sha1_round majority, 12, h, wD, wE, wA, wB, wC
+ mix_state x13, x14, x9, x11, x12
+ sha1_round majority, 13, l, wC, wD, wE, wA, wB
+ sha1_round majority, 13, h, wB, wC, wD, wE, wA
+
+ /* round 4 (20 steps, parity) */
+ ldr k, =0xca62c1d6
+ mix_state x14, x15, x10, x12, x13
+ sha1_round parity, 14, l, wA, wB, wC, wD, wE
+ sha1_round parity, 14, h, wE, wA, wB, wC, wD
+ mix_state x15, x8, x11, x13, x14
+ sha1_round parity, 15, l, wD, wE, wA, wB, wC
+ sha1_round parity, 15, h, wC, wD, wE, wA, wB
+ mix_state x8, x9, x12, x14, x15
+ sha1_round parity, 8, l, wB, wC, wD, wE, wA
+ sha1_round parity, 8, h, wA, wB, wC, wD, wE
+ mix_state x9, x10, x13, x15, x8
+ sha1_round parity, 9, l, wE, wA, wB, wC, wD
+ sha1_round parity, 9, h, wD, wE, wA, wB, wC
+ mix_state x10, x11, x14, x8, x9
+ sha1_round parity, 10, l, wC, wD, wE, wA, wB
+ sha1_round parity, 10, h, wB, wC, wD, wE, wA
+ mix_state x11, x12, x15, x9, x10
+ sha1_round parity, 11, l, wA, wB, wC, wD, wE
+ sha1_round parity, 11, h, wE, wA, wB, wC, wD
+ mix_state x12, x13, x8, x10, x11
+ sha1_round parity, 12, l, wD, wE, wA, wB, wC
+ sha1_round parity, 12, h, wC, wD, wE, wA, wB
+ mix_state x13, x14, x9, x11, x12
+ sha1_round parity, 13, l, wB, wC, wD, wE, wA
+ sha1_round parity, 13, h, wA, wB, wC, wD, wE
+ mix_state x14, x15, x10, x12, x13
+ sha1_round parity, 14, l, wE, wA, wB, wC, wD
+ sha1_round parity, 14, h, wD, wE, wA, wB, wC
+ mix_state x15, x8, x11, x13, x14
+
+ /*
+ * reload digest input; the last two steps only read x15, so x8-x12
+ * are free to hold the old digest words from here on
+ */
+ ldr w8, [x0]
+ ldp w9, w10, [x0, #4]
+ ldp w11, w12, [x0, #12]
+
+ sha1_round parity, 15, l, wC, wD, wE, wA, wB
+ sha1_round parity, 15, h, wB, wC, wD, wE, wA
+
+ /* add this round's output to digest */
+ add wA, wA, w8
+ add wB, wB, w9
+ add wC, wC, w10
+ add wD, wD, w11
+ add wE, wE, w12
+
+ /* store digest */
+ str wA, [x0]
+ stp wB, wC, [x0, #4]
+ stp wD, wE, [x0, #12]
+ ret
+ENDPROC(sha_transform)
+
+ /*
+ * void sha_init(__u32 *buf)
+ *
+ * Write the five 32-bit SHA-1 initial hash values to buf[0] .. buf[4].
+ * Clobbers w1-w5.
+ */
+ENTRY(sha_init)
+ /* buf[0] */
+ ldr w1, =0x67452301
+ str w1, [x0]
+
+ /* buf[1], buf[2] */
+ ldr w2, =0xefcdab89
+ ldr w3, =0x98badcfe
+ stp w2, w3, [x0, #4]
+
+ /* buf[3], buf[4] */
+ ldr w4, =0x10325476
+ ldr w5, =0xc3d2e1f0
+ stp w4, w5, [x0, #12]
+ ret
+ENDPROC(sha_init)
--
1.8.3.2
next reply other threads:[~2014-03-14 15:02 UTC|newest]
Thread overview: 5+ messages / expand[flat|nested] mbox.gz Atom feed top
2014-03-14 15:02 Ard Biesheuvel [this message]
2014-03-17 18:18 ` [PATCH] arm64/lib: add optimized implementation of sha_transform Marek Vasut
2014-03-18 7:26 ` Ard Biesheuvel
2014-03-18 11:51 ` Marek Vasut
-- strict thread matches above, loose matches on Subject: below --
2014-03-17 15:55 Ard Biesheuvel
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1394809353-16707-1-git-send-email-ard.biesheuvel@linaro.org \
--to=ard.biesheuvel@linaro.org \
--cc=linux-arm-kernel@lists.infradead.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).