[PATCH] arm64/lib: add optimized implementation of sha_transform

linux-arm-kernel.lists.infradead.org archive mirror
 help / color / mirror / Atom feed

From: ard.biesheuvel@linaro.org (Ard Biesheuvel)
To: linux-arm-kernel@lists.infradead.org
Subject: [PATCH] arm64/lib: add optimized implementation of sha_transform
Date: Fri, 14 Mar 2014 16:02:33 +0100	[thread overview]
Message-ID: <1394809353-16707-1-git-send-email-ard.biesheuvel@linaro.org> (raw)

This implementation keeps the 64 bytes of workspace in registers rather than
on the stack, eliminating most of the loads and stores, and reducing the
instruction count by about 25%.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
Hello all,

No performance numbers I am allowed to share, unfortunately, so if anyone else
(with access to actual, representative hardware) would care to have a go, I
would be very grateful.

This can be done by building the tcrypt.ko module (CONFIG_CRYPTO_TEST=m) and
inserting it with 'mode=303' as a parameter (note that the insmod always
fails, but it still writes its test output to the kernel log). Also note that
the sha_transform() function will be part of the kernel proper, so just
rebuilding the sha1_generic module is not sufficient.

Cheers,


 arch/arm64/kernel/arm64ksyms.c |   3 +
 arch/arm64/lib/Makefile        |   2 +-
 arch/arm64/lib/sha1.S          | 256 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 260 insertions(+), 1 deletion(-)
 create mode 100644 arch/arm64/lib/sha1.S

diff --git a/arch/arm64/kernel/arm64ksyms.c b/arch/arm64/kernel/arm64ksyms.c
index 338b568cd8ae..1f5693fb5d93 100644
--- a/arch/arm64/kernel/arm64ksyms.c
+++ b/arch/arm64/kernel/arm64ksyms.c
@@ -56,3 +56,6 @@ EXPORT_SYMBOL(clear_bit);
 EXPORT_SYMBOL(test_and_clear_bit);
 EXPORT_SYMBOL(change_bit);
 EXPORT_SYMBOL(test_and_change_bit);
+
+	/* SHA-1 implementation under lib/ */
+EXPORT_SYMBOL(sha_transform);
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
index 328ce1a99daa..ea093ebb9a9a 100644
--- a/arch/arm64/lib/Makefile
+++ b/arch/arm64/lib/Makefile
@@ -1,4 +1,4 @@
 lib-y		:= bitops.o clear_user.o delay.o copy_from_user.o	\
 		   copy_to_user.o copy_in_user.o copy_page.o		\
 		   clear_page.o memchr.o memcpy.o memmove.o memset.o	\
-		   strchr.o strrchr.o
+		   strchr.o strrchr.o sha1.o
diff --git a/arch/arm64/lib/sha1.S b/arch/arm64/lib/sha1.S
new file mode 100644
index 000000000000..877b8d70e992
--- /dev/null
+++ b/arch/arm64/lib/sha1.S
@@ -0,0 +1,256 @@
+/*
+ * linux/arch/arm64/lib/sha1.S
+ *
+ * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+	.text
+
+	k		.req	w1	/* round constant; w1 is free once the input block is loaded via x1 */
+
+	res		.req	w2	/* scratch: output of the sha1_* function macros, per-step sum */
+	xres		.req	x2	/* 64-bit view of res */
+
+	wA		.req	w3	/* working state word, initially digest[0] */
+	wB		.req	w4	/* working state word, initially digest[1] */
+	wC		.req	w5	/* working state word, initially digest[2] */
+	wD		.req	w6	/* working state word, initially digest[3] */
+	wE		.req	w7	/* working state word, initially digest[4] */
+
+	tmp		.req	w16	/* scratch, written by sha1_majority and mix_state */
+	xtmp		.req	x16	/* 64-bit view of tmp */
+	/* out = (b & c) | (~b & d), computed as d ^ (b & (c ^ d)) */
+	.macro		sha1_choose, out, b, c, d
+	eor		\out, \c, \d			/* out is written first: it must not alias b or d */
+	and		\out, \out, \b			/* keep the c ^ d bits only where b is set */
+	eor		\out, \out, \d			/* yields c where b = 1, d where b = 0 */
+	.endm
+	/* out = b ^ c ^ d */
+	.macro		sha1_parity, out, b, c, d
+	eor		\out, \b, \c			/* out is written first: it must not alias d */
+	eor		\out, \out, \d
+	.endm
+	/* out = bitwise majority of b, c and d; clobbers tmp */
+	.macro		sha1_majority, out, b, c, d
+	eor		tmp, \b, \c			/* bits where b and c disagree */
+	and		\out, \b, \c			/* bits where b and c are both set */
+	and		tmp, tmp, \d			/* where they disagree, d decides */
+	add		\out, \out, tmp			/* the two terms share no bits, so add acts as orr */
+	.endm
+	/* each 32-bit half of st0 = rol(st0 ^ st1 ^ st4 ^ (st7:st6 >> 32), 1); clobbers res, tmp */
+	.macro		mix_state, st0, st1, st4, st6, st7
+	extr		xtmp, \st7, \st6, #32		/* xtmp = low half of st7 : high half of st6 */
+	eor		\st0, \st0, \st1
+	eor		xtmp, xtmp, \st4
+	eor		xtmp, xtmp, \st0		/* xtmp = xor of all four inputs */
+	ror		res, tmp, #(32 - 1)		/* res = rol(low word, 1); w-write clears xres[63:32] */
+	lsr		xtmp, xtmp, #32			/* bring the high word down */
+	ror		tmp, tmp, #(32 - 1)		/* tmp = rol(high word, 1) */
+	orr		\st0, xres, xtmp, lsl #32	/* repack both new words into st0 */
+	.endm
+	/* one step: e = rol(a, 5) + f(b, c, d) + e + k + word; b = rol(b, 30); clobbers res */
+	.macro		sha1_round, func, r, h, a, b, c, d, e
+	sha1_\func	res, \b, \c, \d			/* func picks sha1_choose/parity/majority */
+	add		res, res, \e			/* old e is consumed here ... */
+	ror		\e, \a, #(32 - 5)		/* ... so e can now take rol(a, 5) */
+	.ifc		\h, h
+	add		xres, xres, x\r, lsr #32	/* h == "h": word is the upper half of x<r> */
+	.else
+	add		res, res, w\r			/* otherwise: word is w<r>, the lower half */
+	.endif
+	add		\e, \e, k
+	ror		\b, \b, #2			/* rotate right by 2 == rotate left by 30 */
+	add		\e, \e, res			/* 32-bit add: upper bits of xres are ignored */
+	.endm
+
+	/*
+	 * void sha_transform(__u32 *digest, const char *data, __u32 *array)
+	 *
+	 * x0 = digest (updated in place), x1 = 64-byte block; array (x2) is
+	 * not read. The 16-word schedule window lives in x8-x15, two words
+	 * per register with the earlier word in the low half.
+	 */
+ENTRY(sha_transform)
+	/* load input into state array */
+	ldp		x8, x9, [x1]
+	ldp		x10, x11, [x1, #16]
+	ldp		x12, x13, [x1, #32]
+	ldp		x14, x15, [x1, #48]
+
+	/* load digest input */
+	ldr		wA, [x0]
+	ldp		wB, wC, [x0, #4]
+	ldp		wD, wE, [x0, #12]
+
+	/* put the earlier word of each pair in the low half, on LE and BE */
+	.irp		r, 8, 9, 10, 11, 12, 13, 14, 15
+CPU_LE( rev32		x\r, x\r	)	/* LE: fix the byte order of each word */
+CPU_BE( ror		x\r, x\r, #32	)	/* BE: swap the two words */
+	.endr
+
+	/* round 1: steps 0-19, f = choose */
+	ldr		k, =0x5a827999
+	sha1_round	choose,  8, l, wA, wB, wC, wD, wE
+	sha1_round	choose,  8, h, wE, wA, wB, wC, wD
+	sha1_round	choose,  9, l, wD, wE, wA, wB, wC
+	sha1_round	choose,  9, h, wC, wD, wE, wA, wB
+	sha1_round	choose, 10, l, wB, wC, wD, wE, wA
+	sha1_round	choose, 10, h, wA, wB, wC, wD, wE
+	sha1_round	choose, 11, l, wE, wA, wB, wC, wD
+	sha1_round	choose, 11, h, wD, wE, wA, wB, wC
+	sha1_round	choose, 12, l, wC, wD, wE, wA, wB
+	sha1_round	choose, 12, h, wB, wC, wD, wE, wA
+	sha1_round	choose, 13, l, wA, wB, wC, wD, wE
+	sha1_round	choose, 13, h, wE, wA, wB, wC, wD
+	sha1_round	choose, 14, l, wD, wE, wA, wB, wC
+	sha1_round	choose, 14, h, wC, wD, wE, wA, wB
+	sha1_round	choose, 15, l, wB, wC, wD, wE, wA
+	sha1_round	choose, 15, h, wA, wB, wC, wD, wE
+
+	mix_state	x8, x9, x12, x14, x15	/* from here on the window is refilled two words at a time */
+	sha1_round	choose,  8, l, wE, wA, wB, wC, wD
+	sha1_round	choose,  8, h, wD, wE, wA, wB, wC
+	mix_state	x9, x10, x13, x15, x8
+	sha1_round	choose,  9, l, wC, wD, wE, wA, wB
+	sha1_round	choose,  9, h, wB, wC, wD, wE, wA
+
+	/* round 2: steps 20-39, f = parity */
+	ldr		k, =0x6ed9eba1
+	mix_state	x10, x11, x14, x8, x9
+	sha1_round	parity, 10, l, wA, wB, wC, wD, wE
+	sha1_round	parity, 10, h, wE, wA, wB, wC, wD
+	mix_state	x11, x12, x15, x9, x10
+	sha1_round	parity, 11, l, wD, wE, wA, wB, wC
+	sha1_round	parity, 11, h, wC, wD, wE, wA, wB
+	mix_state	x12, x13, x8, x10, x11
+	sha1_round	parity, 12, l, wB, wC, wD, wE, wA
+	sha1_round	parity, 12, h, wA, wB, wC, wD, wE
+	mix_state	x13, x14, x9, x11, x12
+	sha1_round	parity, 13, l, wE, wA, wB, wC, wD
+	sha1_round	parity, 13, h, wD, wE, wA, wB, wC
+	mix_state	x14, x15, x10, x12, x13
+	sha1_round	parity, 14, l, wC, wD, wE, wA, wB
+	sha1_round	parity, 14, h, wB, wC, wD, wE, wA
+	mix_state	x15, x8, x11, x13, x14
+	sha1_round	parity, 15, l, wA, wB, wC, wD, wE
+	sha1_round	parity, 15, h, wE, wA, wB, wC, wD
+	mix_state	x8, x9, x12, x14, x15
+	sha1_round	parity,  8, l, wD, wE, wA, wB, wC
+	sha1_round	parity,  8, h, wC, wD, wE, wA, wB
+	mix_state	x9, x10, x13, x15, x8
+	sha1_round	parity,  9, l, wB, wC, wD, wE, wA
+	sha1_round	parity,  9, h, wA, wB, wC, wD, wE
+	mix_state	x10, x11, x14, x8, x9
+	sha1_round	parity, 10, l, wE, wA, wB, wC, wD
+	sha1_round	parity, 10, h, wD, wE, wA, wB, wC
+	mix_state	x11, x12, x15, x9, x10
+	sha1_round	parity, 11, l, wC, wD, wE, wA, wB
+	sha1_round	parity, 11, h, wB, wC, wD, wE, wA
+
+	/* round 3: steps 40-59, f = majority */
+	ldr		k, =0x8f1bbcdc
+	mix_state	x12, x13, x8, x10, x11
+	sha1_round	majority, 12, l, wA, wB, wC, wD, wE
+	sha1_round	majority, 12, h, wE, wA, wB, wC, wD
+	mix_state	x13, x14, x9, x11, x12
+	sha1_round	majority, 13, l, wD, wE, wA, wB, wC
+	sha1_round	majority, 13, h, wC, wD, wE, wA, wB
+	mix_state	x14, x15, x10, x12, x13
+	sha1_round	majority, 14, l, wB, wC, wD, wE, wA
+	sha1_round	majority, 14, h, wA, wB, wC, wD, wE
+	mix_state	x15, x8, x11, x13, x14
+	sha1_round	majority, 15, l, wE, wA, wB, wC, wD
+	sha1_round	majority, 15, h, wD, wE, wA, wB, wC
+	mix_state	x8, x9, x12, x14, x15
+	sha1_round	majority,  8, l, wC, wD, wE, wA, wB
+	sha1_round	majority,  8, h, wB, wC, wD, wE, wA
+	mix_state	x9, x10, x13, x15, x8
+	sha1_round	majority,  9, l, wA, wB, wC, wD, wE
+	sha1_round	majority,  9, h, wE, wA, wB, wC, wD
+	mix_state	x10, x11, x14, x8, x9
+	sha1_round	majority, 10, l, wD, wE, wA, wB, wC
+	sha1_round	majority, 10, h, wC, wD, wE, wA, wB
+	mix_state	x11, x12, x15, x9, x10
+	sha1_round	majority, 11, l, wB, wC, wD, wE, wA
+	sha1_round	majority, 11, h, wA, wB, wC, wD, wE
+	mix_state	x12, x13, x8, x10, x11
+	sha1_round	majority, 12, l, wE, wA, wB, wC, wD
+	sha1_round	majority, 12, h, wD, wE, wA, wB, wC
+	mix_state	x13, x14, x9, x11, x12
+	sha1_round	majority, 13, l, wC, wD, wE, wA, wB
+	sha1_round	majority, 13, h, wB, wC, wD, wE, wA
+
+	/* round 4: steps 60-79, f = parity */
+	ldr		k, =0xca62c1d6
+	mix_state	x14, x15, x10, x12, x13
+	sha1_round	parity, 14, l, wA, wB, wC, wD, wE
+	sha1_round	parity, 14, h, wE, wA, wB, wC, wD
+	mix_state	x15, x8, x11, x13, x14
+	sha1_round	parity, 15, l, wD, wE, wA, wB, wC
+	sha1_round	parity, 15, h, wC, wD, wE, wA, wB
+	mix_state	x8, x9, x12, x14, x15
+	sha1_round	parity,  8, l, wB, wC, wD, wE, wA
+	sha1_round	parity,  8, h, wA, wB, wC, wD, wE
+	mix_state	x9, x10, x13, x15, x8
+	sha1_round	parity,  9, l, wE, wA, wB, wC, wD
+	sha1_round	parity,  9, h, wD, wE, wA, wB, wC
+	mix_state	x10, x11, x14, x8, x9
+	sha1_round	parity, 10, l, wC, wD, wE, wA, wB
+	sha1_round	parity, 10, h, wB, wC, wD, wE, wA
+	mix_state	x11, x12, x15, x9, x10
+	sha1_round	parity, 11, l, wA, wB, wC, wD, wE
+	sha1_round	parity, 11, h, wE, wA, wB, wC, wD
+	mix_state	x12, x13, x8, x10, x11
+	sha1_round	parity, 12, l, wD, wE, wA, wB, wC
+	sha1_round	parity, 12, h, wC, wD, wE, wA, wB
+	mix_state	x13, x14, x9, x11, x12
+	sha1_round	parity, 13, l, wB, wC, wD, wE, wA
+	sha1_round	parity, 13, h, wA, wB, wC, wD, wE
+	mix_state	x14, x15, x10, x12, x13
+	sha1_round	parity, 14, l, wE, wA, wB, wC, wD
+	sha1_round	parity, 14, h, wD, wE, wA, wB, wC
+	mix_state	x15, x8, x11, x13, x14
+
+	/* reload digest input; x8-x12 are no longer read as schedule words */
+	ldr		w8, [x0]
+	ldp		w9, w10, [x0, #4]
+	ldp		w11, w12, [x0, #12]
+
+	sha1_round	parity, 15, l, wC, wD, wE, wA, wB
+	sha1_round	parity, 15, h, wB, wC, wD, wE, wA
+
+	/* add this round's output to digest */
+	add		wA, wA, w8
+	add		wB, wB, w9
+	add		wC, wC, w10
+	add		wD, wD, w11
+	add		wE, wE, w12
+
+	/* store digest */
+	str		wA, [x0]
+	stp		wB, wC, [x0, #4]
+	stp		wD, wE, [x0, #12]
+	ret
+ENDPROC(sha_transform)
+
+	/*
+	 * void sha_init(__u32 *buf) - store the five initial hash words at buf
+	 */
+ENTRY(sha_init)
+	ldr	w1, =0x67452301
+	ldr	w2, =0xefcdab89
+	stp	w1, w2, [x0]		/* buf[0], buf[1] */
+	ldr	w3, =0x98badcfe
+	ldr	w4, =0x10325476
+	stp	w3, w4, [x0, #8]	/* buf[2], buf[3] */
+	ldr	w5, =0xc3d2e1f0
+	str	w5, [x0, #16]		/* buf[4] */
+	ret
+ENDPROC(sha_init)
-- 
1.8.3.2

next             reply	other threads:[~2014-03-14 15:02 UTC|newest]

Thread overview: 5+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2014-03-14 15:02 Ard Biesheuvel [this message]
2014-03-17 18:18 ` [PATCH] arm64/lib: add optimized implementation of sha_transform Marek Vasut
2014-03-18  7:26   ` Ard Biesheuvel
2014-03-18 11:51     ` Marek Vasut
  -- strict thread matches above, loose matches on Subject: below --
2014-03-17 15:55 Ard Biesheuvel

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:338b568cd8a dfblob:1f5693fb5d9 dfblob:328ce1a99da
dfblob:ea093ebb9a9 dfblob:877b8d70e99 )
 OR (
bs:"[PATCH] arm64/lib: add optimized implementation of sha_transform" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1394809353-16707-1-git-send-email-ard.biesheuvel@linaro.org \
    --to=ard.biesheuvel@linaro.org \
    --cc=linux-arm-kernel@lists.infradead.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).