* [PATCH] arm64/lib: add optimized implementation of sha_transform
From: Ard Biesheuvel @ 2014-03-14 15:02 UTC
To: linux-arm-kernel
This implementation keeps the 64 bytes of workspace in registers rather than
on the stack, eliminating most of the loads and stores, and reducing the
instruction count by about 25%.
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
Hello all,
No performance numbers I am allowed to share, unfortunately, so if anyone else
(with access to actual, representative hardware) would care to have a go, I
would be very grateful.
This can be done by building the tcrypt.ko module (CONFIG_CRYPTO_TEST=m), and
inserting the module using 'mode=303' as a parameter (note that the insmod
always fails, but produces its test output to the kernel log). Also note that
the sha_transform() function will be part of the kernel proper, so just
rebuilding the sha1_generic module is not sufficient.
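Concretely, something like this should do the trick (module path depending on
your build tree):

  $ insmod tcrypt.ko mode=303
  $ dmesg | tail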
Cheers,
arch/arm64/kernel/arm64ksyms.c | 3 +
arch/arm64/lib/Makefile | 2 +-
arch/arm64/lib/sha1.S | 256 +++++++++++++++++++++++++++++++++++++++++
3 files changed, 260 insertions(+), 1 deletion(-)
create mode 100644 arch/arm64/lib/sha1.S
diff --git a/arch/arm64/kernel/arm64ksyms.c b/arch/arm64/kernel/arm64ksyms.c
index 338b568cd8ae..1f5693fb5d93 100644
--- a/arch/arm64/kernel/arm64ksyms.c
+++ b/arch/arm64/kernel/arm64ksyms.c
@@ -56,3 +56,6 @@ EXPORT_SYMBOL(clear_bit);
EXPORT_SYMBOL(test_and_clear_bit);
EXPORT_SYMBOL(change_bit);
EXPORT_SYMBOL(test_and_change_bit);
+
+ /* SHA-1 implementation under lib/ */
+EXPORT_SYMBOL(sha_transform);
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
index 328ce1a99daa..ea093ebb9a9a 100644
--- a/arch/arm64/lib/Makefile
+++ b/arch/arm64/lib/Makefile
@@ -1,4 +1,4 @@
lib-y := bitops.o clear_user.o delay.o copy_from_user.o \
copy_to_user.o copy_in_user.o copy_page.o \
clear_page.o memchr.o memcpy.o memmove.o memset.o \
- strchr.o strrchr.o
+ strchr.o strrchr.o sha1.o
diff --git a/arch/arm64/lib/sha1.S b/arch/arm64/lib/sha1.S
new file mode 100644
index 000000000000..877b8d70e992
--- /dev/null
+++ b/arch/arm64/lib/sha1.S
@@ -0,0 +1,256 @@
+/*
+ * linux/arch/arm64/lib/sha1.S
+ *
+ * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+ .text
+
+ k .req w1
+
+ res .req w2
+ xres .req x2
+
+ wA .req w3
+ wB .req w4
+ wC .req w5
+ wD .req w6
+ wE .req w7
+
+ tmp .req w16
+ xtmp .req x16
+
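+ /*
+ * The boolean round functions, computed into \out:
+ * choose: (b & c) | (~b & d), computed as ((c ^ d) & b) ^ d
+ * parity: b ^ c ^ d
+ * majority: (b & c) | (b & d) | (c & d), computed as
+ * (b & c) + ((b ^ c) & d); the two terms share no set
+ * bits, so the add cannot carry
+ */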
+ .macro sha1_choose, out, b, c, d
+ eor \out, \c, \d
+ and \out, \out, \b
+ eor \out, \out, \d
+ .endm
+
+ .macro sha1_parity, out, b, c, d
+ eor \out, \b, \c
+ eor \out, \out, \d
+ .endm
+
+ .macro sha1_majority, out, b, c, d
+ eor tmp, \b, \c
+ and \out, \b, \c
+ and tmp, tmp, \d
+ add \out, \out, tmp
+ .endm
+
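+ /*
+ * Update two schedule words at once: each x-register holds a pair of
+ * adjacent 32-bit schedule words, and both halves are advanced as
+ * W[t] = rol(W[t-16] ^ W[t-14] ^ W[t-8] ^ W[t-3], 1)
+ */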
+ .macro mix_state, st0, st1, st4, st6, st7
+ extr xtmp, \st7, \st6, #32
+ eor \st0, \st0, \st1
+ eor xtmp, xtmp, \st4
+ eor xtmp, xtmp, \st0
+ ror res, tmp, #(32 - 1)
+ lsr xtmp, xtmp, #32
+ ror tmp, tmp, #(32 - 1)
+ orr \st0, xres, xtmp, lsl #32
+ .endm
+
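+ /*
+ * Perform a single round:
+ * e += rol(a, 5) + func(b, c, d) + k + W; b = ror(b, 2)
+ * where W is the low (\h == l) or high (\h == h) word of schedule
+ * register x\r
+ */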
+ .macro sha1_round, func, r, h, a, b, c, d, e
+ sha1_\func res, \b, \c, \d
+ add res, res, \e
+ ror \e, \a, #(32 - 5)
+ .ifc \h, h
+ add xres, xres, x\r, lsr #32
+ .else
+ add res, res, w\r
+ .endif
+ add \e, \e, k
+ ror \b, \b, #2
+ add \e, \e, res
+ .endm
+
+ /*
+ * void sha_transform(__u32 *digest, const char *data, __u32 *array)
+ */
+ENTRY(sha_transform)
+ /* load input into state array */
+ ldp x8, x9, [x1]
+ ldp x10, x11, [x1, #16]
+ ldp x12, x13, [x1, #32]
+ ldp x14, x15, [x1, #48]
+
+ /* load digest input */
+ ldr wA, [x0]
+ ldp wB, wC, [x0, #4]
+ ldp wD, wE, [x0, #12]
+
+ /* endian-reverse the input on LE builds */
+CPU_LE( rev32 x8, x8 )
+CPU_LE( rev32 x9, x9 )
+CPU_LE( rev32 x10, x10 )
+CPU_LE( rev32 x11, x11 )
+CPU_LE( rev32 x12, x12 )
+CPU_LE( rev32 x13, x13 )
+CPU_LE( rev32 x14, x14 )
+CPU_LE( rev32 x15, x15 )
+
+ /* round 1 */
+ ldr k, =0x5a827999
+ sha1_round choose, 8, l, wA, wB, wC, wD, wE
+ sha1_round choose, 8, h, wE, wA, wB, wC, wD
+ sha1_round choose, 9, l, wD, wE, wA, wB, wC
+ sha1_round choose, 9, h, wC, wD, wE, wA, wB
+ sha1_round choose, 10, l, wB, wC, wD, wE, wA
+ sha1_round choose, 10, h, wA, wB, wC, wD, wE
+ sha1_round choose, 11, l, wE, wA, wB, wC, wD
+ sha1_round choose, 11, h, wD, wE, wA, wB, wC
+ sha1_round choose, 12, l, wC, wD, wE, wA, wB
+ sha1_round choose, 12, h, wB, wC, wD, wE, wA
+ sha1_round choose, 13, l, wA, wB, wC, wD, wE
+ sha1_round choose, 13, h, wE, wA, wB, wC, wD
+ sha1_round choose, 14, l, wD, wE, wA, wB, wC
+ sha1_round choose, 14, h, wC, wD, wE, wA, wB
+ sha1_round choose, 15, l, wB, wC, wD, wE, wA
+ sha1_round choose, 15, h, wA, wB, wC, wD, wE
+
+ mix_state x8, x9, x12, x14, x15
+ sha1_round choose, 8, l, wE, wA, wB, wC, wD
+ sha1_round choose, 8, h, wD, wE, wA, wB, wC
+ mix_state x9, x10, x13, x15, x8
+ sha1_round choose, 9, l, wC, wD, wE, wA, wB
+ sha1_round choose, 9, h, wB, wC, wD, wE, wA
+
+ /* round 2 */
+ ldr k, =0x6ed9eba1
+ mix_state x10, x11, x14, x8, x9
+ sha1_round parity, 10, l, wA, wB, wC, wD, wE
+ sha1_round parity, 10, h, wE, wA, wB, wC, wD
+ mix_state x11, x12, x15, x9, x10
+ sha1_round parity, 11, l, wD, wE, wA, wB, wC
+ sha1_round parity, 11, h, wC, wD, wE, wA, wB
+ mix_state x12, x13, x8, x10, x11
+ sha1_round parity, 12, l, wB, wC, wD, wE, wA
+ sha1_round parity, 12, h, wA, wB, wC, wD, wE
+ mix_state x13, x14, x9, x11, x12
+ sha1_round parity, 13, l, wE, wA, wB, wC, wD
+ sha1_round parity, 13, h, wD, wE, wA, wB, wC
+ mix_state x14, x15, x10, x12, x13
+ sha1_round parity, 14, l, wC, wD, wE, wA, wB
+ sha1_round parity, 14, h, wB, wC, wD, wE, wA
+ mix_state x15, x8, x11, x13, x14
+ sha1_round parity, 15, l, wA, wB, wC, wD, wE
+ sha1_round parity, 15, h, wE, wA, wB, wC, wD
+ mix_state x8, x9, x12, x14, x15
+ sha1_round parity, 8, l, wD, wE, wA, wB, wC
+ sha1_round parity, 8, h, wC, wD, wE, wA, wB
+ mix_state x9, x10, x13, x15, x8
+ sha1_round parity, 9, l, wB, wC, wD, wE, wA
+ sha1_round parity, 9, h, wA, wB, wC, wD, wE
+ mix_state x10, x11, x14, x8, x9
+ sha1_round parity, 10, l, wE, wA, wB, wC, wD
+ sha1_round parity, 10, h, wD, wE, wA, wB, wC
+ mix_state x11, x12, x15, x9, x10
+ sha1_round parity, 11, l, wC, wD, wE, wA, wB
+ sha1_round parity, 11, h, wB, wC, wD, wE, wA
+
+ /* round 3 */
+ ldr k, =0x8f1bbcdc
+ mix_state x12, x13, x8, x10, x11
+ sha1_round majority, 12, l, wA, wB, wC, wD, wE
+ sha1_round majority, 12, h, wE, wA, wB, wC, wD
+ mix_state x13, x14, x9, x11, x12
+ sha1_round majority, 13, l, wD, wE, wA, wB, wC
+ sha1_round majority, 13, h, wC, wD, wE, wA, wB
+ mix_state x14, x15, x10, x12, x13
+ sha1_round majority, 14, l, wB, wC, wD, wE, wA
+ sha1_round majority, 14, h, wA, wB, wC, wD, wE
+ mix_state x15, x8, x11, x13, x14
+ sha1_round majority, 15, l, wE, wA, wB, wC, wD
+ sha1_round majority, 15, h, wD, wE, wA, wB, wC
+ mix_state x8, x9, x12, x14, x15
+ sha1_round majority, 8, l, wC, wD, wE, wA, wB
+ sha1_round majority, 8, h, wB, wC, wD, wE, wA
+ mix_state x9, x10, x13, x15, x8
+ sha1_round majority, 9, l, wA, wB, wC, wD, wE
+ sha1_round majority, 9, h, wE, wA, wB, wC, wD
+ mix_state x10, x11, x14, x8, x9
+ sha1_round majority, 10, l, wD, wE, wA, wB, wC
+ sha1_round majority, 10, h, wC, wD, wE, wA, wB
+ mix_state x11, x12, x15, x9, x10
+ sha1_round majority, 11, l, wB, wC, wD, wE, wA
+ sha1_round majority, 11, h, wA, wB, wC, wD, wE
+ mix_state x12, x13, x8, x10, x11
+ sha1_round majority, 12, l, wE, wA, wB, wC, wD
+ sha1_round majority, 12, h, wD, wE, wA, wB, wC
+ mix_state x13, x14, x9, x11, x12
+ sha1_round majority, 13, l, wC, wD, wE, wA, wB
+ sha1_round majority, 13, h, wB, wC, wD, wE, wA
+
+ /* round 4 */
+ ldr k, =0xca62c1d6
+ mix_state x14, x15, x10, x12, x13
+ sha1_round parity, 14, l, wA, wB, wC, wD, wE
+ sha1_round parity, 14, h, wE, wA, wB, wC, wD
+ mix_state x15, x8, x11, x13, x14
+ sha1_round parity, 15, l, wD, wE, wA, wB, wC
+ sha1_round parity, 15, h, wC, wD, wE, wA, wB
+ mix_state x8, x9, x12, x14, x15
+ sha1_round parity, 8, l, wB, wC, wD, wE, wA
+ sha1_round parity, 8, h, wA, wB, wC, wD, wE
+ mix_state x9, x10, x13, x15, x8
+ sha1_round parity, 9, l, wE, wA, wB, wC, wD
+ sha1_round parity, 9, h, wD, wE, wA, wB, wC
+ mix_state x10, x11, x14, x8, x9
+ sha1_round parity, 10, l, wC, wD, wE, wA, wB
+ sha1_round parity, 10, h, wB, wC, wD, wE, wA
+ mix_state x11, x12, x15, x9, x10
+ sha1_round parity, 11, l, wA, wB, wC, wD, wE
+ sha1_round parity, 11, h, wE, wA, wB, wC, wD
+ mix_state x12, x13, x8, x10, x11
+ sha1_round parity, 12, l, wD, wE, wA, wB, wC
+ sha1_round parity, 12, h, wC, wD, wE, wA, wB
+ mix_state x13, x14, x9, x11, x12
+ sha1_round parity, 13, l, wB, wC, wD, wE, wA
+ sha1_round parity, 13, h, wA, wB, wC, wD, wE
+ mix_state x14, x15, x10, x12, x13
+ sha1_round parity, 14, l, wE, wA, wB, wC, wD
+ sha1_round parity, 14, h, wD, wE, wA, wB, wC
+ mix_state x15, x8, x11, x13, x14
+
+ /* reload digest input */
+ ldr w8, [x0]
+ ldp w9, w10, [x0, #4]
+ ldp w11, w12, [x0, #12]
+
+ sha1_round parity, 15, l, wC, wD, wE, wA, wB
+ sha1_round parity, 15, h, wB, wC, wD, wE, wA
+
+ /* add this round's output to digest */
+ add wA, wA, w8
+ add wB, wB, w9
+ add wC, wC, w10
+ add wD, wD, w11
+ add wE, wE, w12
+
+ /* store digest */
+ str wA, [x0]
+ stp wB, wC, [x0, #4]
+ stp wD, wE, [x0, #12]
+ ret
+ENDPROC(sha_transform)
+
+ /*
+ * void sha_init(__u32 *buf)
+ */
+ENTRY(sha_init)
+ ldr w1, =0x67452301
+ ldr w2, =0xefcdab89
+ ldr w3, =0x98badcfe
+ ldr w4, =0x10325476
+ ldr w5, =0xc3d2e1f0
+ str w1, [x0]
+ stp w2, w3, [x0, #4]
+ stp w4, w5, [x0, #12]
+ ret
+ENDPROC(sha_init)
--
1.8.3.2
* [PATCH] arm64/lib: add optimized implementation of sha_transform
From: Marek Vasut @ 2014-03-17 18:18 UTC
To: linux-arm-kernel
On Friday, March 14, 2014 at 04:02:33 PM, Ard Biesheuvel wrote:
> This implementation keeps the 64 bytes of workspace in registers rather
> than on the stack, eliminating most of the loads and stores, and reducing
> the instruction count by about 25%.
>
> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
> ---
> Hello all,
>
> No performance numbers I am allowed to share, unfortunately, so if anyone
> else (with access to actual, representative hardware) would care to have a
> go, I would be very grateful.
>
> This can be done by building the tcrypt.ko module (CONFIG_CRYPTO_TEST=m),
> and inserting the module using 'mode=303' as a parameter (note that the
> insmod always fails, but produces its test output to the kernel log). Also
> note that the sha_transform() function will be part of the kernel proper,
> so just rebuilding the sha1_generic module is not sufficient.
>
> Cheers,
Won't the function sha_transform() collide with the one in lib/sha1.c? Or will
the one in lib/sha1.c be overridden somehow?
Otherwise:
Reviewed-by: Marek Vasut <marex@denx.de>
Best regards,
Marek Vasut
* [PATCH] arm64/lib: add optimized implementation of sha_transform
From: Ard Biesheuvel @ 2014-03-18 7:26 UTC
To: linux-arm-kernel
On 17 March 2014 22:18, Marek Vasut <marex@denx.de> wrote:
> On Friday, March 14, 2014 at 04:02:33 PM, Ard Biesheuvel wrote:
>> This implementation keeps the 64 bytes of workspace in registers rather
>> than on the stack, eliminating most of the loads and stores, and reducing
>> the instruction count by about 25%.
>>
>> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
>> ---
>> Hello all,
>>
>> No performance numbers I am allowed to share, unfortunately, so if anyone
>> else (with access to actual, representative hardware) would care to have a
>> go, I would be very grateful.
>>
>> This can be done by building the tcrypt.ko module (CONFIG_CRYPTO_TEST=m),
>> and inserting the module using 'mode=303' as a parameter (note that the
>> insmod always fails, but produces its test output to the kernel log). Also
>> note that the sha_transform() function will be part of the kernel proper,
>> so just rebuilding the sha1_generic module is not sufficient.
>>
>> Cheers,
>
> Won't the function sha_transform() collide with the one in lib/sha1.c? Or will
> the one in lib/sha1.c be overridden somehow?
>
No, this works pretty well, in fact: arch/*/lib takes precedence over
lib/, and objects declared with 'lib-y +=' are only pulled in to
satisfy otherwise-unresolved references. So the second (generic)
sha1.o will not get linked.
> Otherwise:
>
> Reviewed-by: Marek Vasut <marex@denx.de>
>
Thanks. I did send a v2 which is actually a lot different from the
version you reviewed, so I won't carry over your Reviewed-by without
your acknowledgement.
Cheers,
Ard.
* [PATCH] arm64/lib: add optimized implementation of sha_transform
From: Marek Vasut @ 2014-03-18 11:51 UTC
To: linux-arm-kernel
On Tuesday, March 18, 2014 at 08:26:00 AM, Ard Biesheuvel wrote:
> On 17 March 2014 22:18, Marek Vasut <marex@denx.de> wrote:
> > On Friday, March 14, 2014 at 04:02:33 PM, Ard Biesheuvel wrote:
> >> This implementation keeps the 64 bytes of workspace in registers rather
> >> than on the stack, eliminating most of the loads and stores, and
> >> reducing the instruction count by about 25%.
> >>
> >> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
> >> ---
> >> Hello all,
> >>
> >> No performance numbers I am allowed to share, unfortunately, so if
> >> anyone else (with access to actual, representative hardware) would care
> >> to have a go, I would be very grateful.
> >>
> >> This can be done by building the tcrypt.ko module
> >> (CONFIG_CRYPTO_TEST=m), and inserting the module using 'mode=303' as a
> >> parameter (note that the insmod always fails, but produces its test
> >> output to the kernel log). Also note that the sha_transform() function
> >> will be part of the kernel proper, so just rebuilding the sha1_generic
> >> module is not sufficient.
> >>
> >> Cheers,
> >
> > Won't the function sha_transform() collide with the one in lib/sha1.c?
> > Or will the one in lib/sha1.c be overridden somehow?
>
> No, this works pretty well, in fact: arch/*/lib has precedence over
> lib/, and objects (declared with lib-y +=) are only included to
> satisfy unresolved dependencies. So the second (generic) sha1.o will
> not get linked.
Thanks for clearing this up!
> > Otherwise:
> >
> > Reviewed-by: Marek Vasut <marex@denx.de>
>
> Thanks. I did send a v2 which is actually a lot different from the
> version you reviewed, so I won't carry over your reviewed-by without
> your acknowledgement.
Thanks!
Best regards,
Marek Vasut
* [PATCH] arm64/lib: add optimized implementation of sha_transform
From: Ard Biesheuvel @ 2014-03-17 15:55 UTC
To: linux-arm-kernel
This implementation keeps the 64 bytes of workspace in registers rather than
on the stack, eliminating most of the loads and stores, and reducing the
instruction count by about 25%.
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
@Catalin: I assumed x18 has no special significance in the kernel, so I am
using it as a temp register without preserving it. Is this correct?
Changes since v1:
- as suggested in feedback I received off list, it makes sense to schedule
more carefully for an in-order pipeline (A53?), so the rounds are now
2-way interleaved and combined with the schedule updates
- use named constants rather than bare numbers
- use ldnp for loading the input (non-temporal hint)
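To make the macros below easier to follow: in C-flavoured pseudocode (a
sketch in my own notation, with rol32() as in <linux/bitops.h>), each of the
80 rounds computes

	e += rol32(a, 5) + f(b, c, d) + K + W[t & 15];
	b = rol32(b, 30);

where f() is choose, parity or majority and K is the round constant, and
from round 16 onwards each round also replaces one schedule word:

	W[t & 15] = rol32(W[(t - 3) & 15] ^ W[(t - 8) & 15] ^
			  W[(t - 14) & 15] ^ W[(t - 16) & 15], 1);

The *_2rounds macros below interleave two such rounds with a two-word
schedule update, keeping all 16 W[] words in registers x8-x15.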
arch/arm64/kernel/arm64ksyms.c | 3 +
arch/arm64/lib/Makefile | 2 +-
arch/arm64/lib/sha1.S | 277 +++++++++++++++++++++++++++++++++++++++++
3 files changed, 281 insertions(+), 1 deletion(-)
create mode 100644 arch/arm64/lib/sha1.S
diff --git a/arch/arm64/kernel/arm64ksyms.c b/arch/arm64/kernel/arm64ksyms.c
index 338b568cd8ae..1f5693fb5d93 100644
--- a/arch/arm64/kernel/arm64ksyms.c
+++ b/arch/arm64/kernel/arm64ksyms.c
@@ -56,3 +56,6 @@ EXPORT_SYMBOL(clear_bit);
EXPORT_SYMBOL(test_and_clear_bit);
EXPORT_SYMBOL(change_bit);
EXPORT_SYMBOL(test_and_change_bit);
+
+ /* SHA-1 implementation under lib/ */
+EXPORT_SYMBOL(sha_transform);
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
index 328ce1a99daa..ea093ebb9a9a 100644
--- a/arch/arm64/lib/Makefile
+++ b/arch/arm64/lib/Makefile
@@ -1,4 +1,4 @@
lib-y := bitops.o clear_user.o delay.o copy_from_user.o \
copy_to_user.o copy_in_user.o copy_page.o \
clear_page.o memchr.o memcpy.o memmove.o memset.o \
- strchr.o strrchr.o
+ strchr.o strrchr.o sha1.o
diff --git a/arch/arm64/lib/sha1.S b/arch/arm64/lib/sha1.S
new file mode 100644
index 000000000000..5c472f32f917
--- /dev/null
+++ b/arch/arm64/lib/sha1.S
@@ -0,0 +1,277 @@
+/*
+ * linux/arch/arm64/lib/sha1.S
+ *
+ * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+ .text
+ .altmacro
+
+ wA .req w2
+ wB .req w3
+ wC .req w4
+ wD .req w5
+ wE .req w6
+
+ k .req w7
+
+ t0 .req w16
+ t1 .req w17
+ t2 .req w18
+ t3 .req w1
+
+ xt0 .req x16
+ xt1 .req x17
+ xt2 .req x18
+ xt3 .req x1
+
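+ /*
+ * Load half of a 32-bit round constant into \reg, but only if one is
+ * actually passed. Emitting the movz/movk halves separately allows
+ * unrelated instructions to be scheduled in between them.
+ */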
+ .macro load_k_hi, reg, rc
+ .ifnb \rc
+ movz \reg, #:abs_g1:\rc
+ .endif
+ .endm
+
+ .macro load_k_lo, reg, rc
+ .ifnb \rc
+ movk \reg, #:abs_g0_nc:\rc
+ .endif
+ .endm
+
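+ /*
+ * Perform two rounds while also taking in the next two words of
+ * input: registers x8-x15 each hold a pair of adjacent 32-bit words
+ * of the block, loaded pairwise with ldnp (non-temporal hint). The
+ * .irp/.ifne trick emits the load only for even values of \in, so
+ * each pair is loaded exactly once.
+ */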
+ .macro inp_2rounds, in, a, b, c, d, e, rc
+ eor t0, \c, \d
+ .irp in2, %(\in | 1)
+ .ifne \in ^ \in2
+ ldnp x\in, x\in2, [x1, #8 * (\in - 8)]
+ .endif
+ .endr
+ load_k_hi k, \rc
+ and t0, t0, \b
+ load_k_lo k, \rc
+ ror \b, \b, #2
+ eor t0, t0, \d
+ eor t1, \b, \c
+CPU_LE( rev32 x\in, x\in )
+ add t0, t0, \e
+ ror \e, \a, #(32 - 5)
+ and t1, t1, \a
+ add \e, \e, k
+ add t0, t0, w\in
+ eor t1, t1, \c
+ add \e, \e, t0
+ add t1, t1, \d
+ ror \d, \e, #(32 - 5)
+ add xt1, xt1, x\in, lsr #32
+ add \d, \d, k
+ ror \a, \a, #2
+ add \d, \d, t1
+ .endm
+
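+ /*
+ * Each of the {cho,par,maj}_2rounds macros below performs two rounds
+ * with the choose, parity or majority function, interleaved with the
+ * update of two schedule words:
+ * W[t] = rol(W[t-16] ^ W[t-14] ^ W[t-8] ^ W[t-3], 1)
+ * The interleaving keeps independent instructions between dependent
+ * ones, which suits in-order pipelines.
+ */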
+ .macro cho_2rounds, a, b, c, d, e, st0, st1, st4, st6, st7
+ extr xt2, x\st7, x\st6, #32
+ eor t0, \c, \d
+ eor x\st0, x\st0, x\st1
+ and t0, t0, \b
+ eor xt2, xt2, x\st4
+ ror \b, \b, #2
+ eor xt2, xt2, x\st0
+ eor t0, t0, \d
+ eor t1, \b, \c
+ ror t3, t2, #(32 - 1)
+ add t0, t0, \e
+ lsr xt2, xt2, #32
+ and t1, t1, \a
+ ror t2, t2, #(32 - 1)
+ ror \e, \a, #(32 - 5)
+ eor t1, t1, \c
+ add \e, \e, k
+ add t0, t0, t3
+ ror \a, \a, #2
+ add \e, \e, t0
+ add t1, t1, \d
+ ror \d, \e, #(32 - 5)
+ add t1, t1, t2
+ add \d, \d, k
+ orr x\st0, xt3, xt2, lsl #32
+ add \d, \d, t1
+ .endm
+
+ .macro par_2rounds, a, b, c, d, e, st0, st1, st4, st6, st7, rc
+ extr xt2, x\st7, x\st6, #32
+ load_k_hi k, \rc
+ eor x\st0, x\st0, x\st1
+ eor t0, \b, \c
+ load_k_lo k, \rc
+ eor xt2, xt2, x\st4
+ ror \b, \b, #2
+ eor xt2, xt2, x\st0
+ eor t0, t0, \d
+ ror t3, t2, #(32 - 1)
+ eor t1, \a, \b
+ lsr xt2, xt2, #32
+ add t0, t0, \e
+ ror t2, t2, #(32 - 1)
+ ror \e, \a, #(32 - 5)
+ eor t1, t1, \c
+ add \e, \e, k
+ add t0, t0, t3
+ ror \a, \a, #2
+ add \e, \e, t0
+ add t1, t1, \d
+ ror \d, \e, #(32 - 5)
+ add t1, t1, t2
+ add \d, \d, k
+ orr x\st0, xt3, xt2, lsl #32
+ add \d, \d, t1
+ .endm
+
+ .macro maj_2rounds, a, b, c, d, e, st0, st1, st4, st6, st7, rc
+ extr xt2, x\st7, x\st6, #32
+ load_k_hi k, \rc
+ eor t1, \b, \c
+ eor x\st0, x\st0, x\st1
+ and t0, \b, \c
+ load_k_lo k, \rc
+ eor xt2, xt2, x\st4
+ ror \b, \b, #2
+ and t1, t1, \d
+ eor t3, \a, \b
+ add t0, t0, t1
+ and t1, \a, \b
+ and t3, t3, \c
+ eor xt2, xt2, x\st0
+ add t1, t1, t3
+ ror t3, t2, #(32 - 1)
+ lsr xt2, xt2, #32
+ add t0, t0, \e
+ ror \e, \a, #(32 - 5)
+ ror t2, t2, #(32 - 1)
+ add \e, \e, k
+ add t0, t0, t3
+ ror \a, \a, #2
+ add \e, \e, t0
+ add t1, t1, \d
+ ror \d, \e, #(32 - 5)
+ add t1, t1, t2
+ add \d, \d, k
+ orr x\st0, xt3, xt2, lsl #32
+ add \d, \d, t1
+ .endm
+
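+ /*
+ * Select the schedule registers for this round pair: the 16 schedule
+ * words rotate through x8-x15, so the indices are computed from \in
+ * modulo 8 at assembly time (%expr requires .altmacro).
+ */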
+ .macro mix_2rounds, in, a, b, c, d, e, f, rc
+ st1 = (\in + 1) % 8 + 8
+ st4 = (\in + 4) % 8 + 8
+ st6 = (\in + 6) % 8 + 8
+ st7 = (\in + 7) % 8 + 8
+ \f\()_2rounds \a, \b, \c, \d, \e, \in, %st1, %st4, %st6, %st7, \rc
+ .endm
+
+ /*
+ * The SHA-1 round constants
+ */
+ .set sha_rcon1, 0x5a827999
+ .set sha_rcon2, 0x6ed9eba1
+ .set sha_rcon3, 0x8f1bbcdc
+ .set sha_rcon4, 0xca62c1d6
+
+ /*
+ * void sha_transform(__u32 *digest, const char *data, __u32 *array)
+ */
+ENTRY(sha_transform)
+ /* load digest input */
+ ldp wC, wD, [x0, #8]
+ ldp wA, wB, [x0]
+ ldr wE, [x0, #16]
+
+ inp_2rounds 8, wA, wB, wC, wD, wE, sha_rcon1
+ inp_2rounds 9, wD, wE, wA, wB, wC
+ inp_2rounds 10, wB, wC, wD, wE, wA
+ inp_2rounds 11, wE, wA, wB, wC, wD
+ inp_2rounds 12, wC, wD, wE, wA, wB
+ inp_2rounds 13, wA, wB, wC, wD, wE
+ inp_2rounds 14, wD, wE, wA, wB, wC
+ inp_2rounds 15, wB, wC, wD, wE, wA
+ mix_2rounds 8, wE, wA, wB, wC, wD, cho
+ mix_2rounds 9, wC, wD, wE, wA, wB, cho
+
+ mix_2rounds 10, wA, wB, wC, wD, wE, par, sha_rcon2
+ mix_2rounds 11, wD, wE, wA, wB, wC, par
+ mix_2rounds 12, wB, wC, wD, wE, wA, par
+ mix_2rounds 13, wE, wA, wB, wC, wD, par
+ mix_2rounds 14, wC, wD, wE, wA, wB, par
+ mix_2rounds 15, wA, wB, wC, wD, wE, par
+ mix_2rounds 8, wD, wE, wA, wB, wC, par
+ mix_2rounds 9, wB, wC, wD, wE, wA, par
+ mix_2rounds 10, wE, wA, wB, wC, wD, par
+ mix_2rounds 11, wC, wD, wE, wA, wB, par
+
+ mix_2rounds 12, wA, wB, wC, wD, wE, maj, sha_rcon3
+ mix_2rounds 13, wD, wE, wA, wB, wC, maj
+ mix_2rounds 14, wB, wC, wD, wE, wA, maj
+ mix_2rounds 15, wE, wA, wB, wC, wD, maj
+ mix_2rounds 8, wC, wD, wE, wA, wB, maj
+ mix_2rounds 9, wA, wB, wC, wD, wE, maj
+ mix_2rounds 10, wD, wE, wA, wB, wC, maj
+ mix_2rounds 11, wB, wC, wD, wE, wA, maj
+ mix_2rounds 12, wE, wA, wB, wC, wD, maj
+ mix_2rounds 13, wC, wD, wE, wA, wB, maj
+
+ mix_2rounds 14, wA, wB, wC, wD, wE, par, sha_rcon4
+ mix_2rounds 15, wD, wE, wA, wB, wC, par
+ mix_2rounds 8, wB, wC, wD, wE, wA, par
+ mix_2rounds 9, wE, wA, wB, wC, wD, par
+ mix_2rounds 10, wC, wD, wE, wA, wB, par
+ mix_2rounds 11, wA, wB, wC, wD, wE, par
+ mix_2rounds 12, wD, wE, wA, wB, wC, par
+ mix_2rounds 13, wB, wC, wD, wE, wA, par
+ mix_2rounds 14, wE, wA, wB, wC, wD, par
+ mix_2rounds 15, wC, wD, wE, wA, wB, par
+
+ /* reload digest input */
+ ldr w8, [x0]
+ ldp w9, w10, [x0, #4]
+ ldp w11, w12, [x0, #12]
+
+ /* add this block's output to digest */
+ add wA, wA, w8
+ add wB, wB, w9
+ add wC, wC, w10
+ add wD, wD, w11
+ add wE, wE, w12
+
+ /* store digest */
+ str wA, [x0]
+ stp wB, wC, [x0, #4]
+ stp wD, wE, [x0, #12]
+ ret
+ENDPROC(sha_transform)
+
+ /*
+ * The SHA-1 digest initial values
+ */
+.Lsha_init:
+ .word 0x67452301
+ .word 0xefcdab89
+ .word 0x98badcfe
+ .word 0x10325476
+ .word 0xc3d2e1f0
+
+ /*
+ * void sha_init(__u32 *buf)
+ */
+ENTRY(sha_init)
+ adr xt0, .Lsha_init
+ ldr wA, [xt0]
+ ldp wB, wC, [xt0, #4]
+ ldp wD, wE, [xt0, #12]
+ str wA, [x0]
+ stp wB, wC, [x0, #4]
+ stp wD, wE, [x0, #12]
+ ret
+ENDPROC(sha_init)
--
1.8.3.2