* [PATCH] arm64/lib: add optimized implementation of sha_transform
From: Ard Biesheuvel @ 2014-03-14 15:02 UTC
To: linux-arm-kernel
This implementation keeps the 64 bytes of workspace in registers rather than
on the stack, eliminating most of the loads and stores, and reducing the
instruction count by about 25%.
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
Hello all,
No performance numbers I am allowed to share, unfortunately, so if anyone else
(with access to actual, representative hardware) would care to have a go, I
would be very grateful.
This can be done by building the tcrypt.ko module (CONFIG_CRYPTO_TEST=m), and
inserting the module using 'mode=303' as a parameter (note that the insmod
always fails, but produces its test output to the kernel log). Also note that
the sha_transform() function will be part of the kernel proper, so just
rebuilding the sha1_generic module is not sufficient.
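Concretely, something like this should do the trick (module path depending on
your build tree):

  $ insmod tcrypt.ko mode=303
  $ dmesg | tail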
Cheers,
arch/arm64/kernel/arm64ksyms.c | 3 +
arch/arm64/lib/Makefile | 2 +-
arch/arm64/lib/sha1.S | 256 +++++++++++++++++++++++++++++++++++++++++
3 files changed, 260 insertions(+), 1 deletion(-)
create mode 100644 arch/arm64/lib/sha1.S
diff --git a/arch/arm64/kernel/arm64ksyms.c b/arch/arm64/kernel/arm64ksyms.c
index 338b568cd8ae..1f5693fb5d93 100644
--- a/arch/arm64/kernel/arm64ksyms.c
+++ b/arch/arm64/kernel/arm64ksyms.c
@@ -56,3 +56,6 @@ EXPORT_SYMBOL(clear_bit);
EXPORT_SYMBOL(test_and_clear_bit);
EXPORT_SYMBOL(change_bit);
EXPORT_SYMBOL(test_and_change_bit);
+
+ /* SHA-1 implementation under lib/ */
+EXPORT_SYMBOL(sha_transform);
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
index 328ce1a99daa..ea093ebb9a9a 100644
--- a/arch/arm64/lib/Makefile
+++ b/arch/arm64/lib/Makefile
@@ -1,4 +1,4 @@
lib-y := bitops.o clear_user.o delay.o copy_from_user.o \
copy_to_user.o copy_in_user.o copy_page.o \
clear_page.o memchr.o memcpy.o memmove.o memset.o \
- strchr.o strrchr.o
+ strchr.o strrchr.o sha1.o
diff --git a/arch/arm64/lib/sha1.S b/arch/arm64/lib/sha1.S
new file mode 100644
index 000000000000..877b8d70e992
--- /dev/null
+++ b/arch/arm64/lib/sha1.S
@@ -0,0 +1,256 @@
+/*
+ * linux/arch/arm64/lib/sha1.S
+ *
+ * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+ .text
+
+ k .req w1
+
+ res .req w2
+ xres .req x2
+
+ wA .req w3
+ wB .req w4
+ wC .req w5
+ wD .req w6
+ wE .req w7
+
+ tmp .req w16
+ xtmp .req x16
+
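+ /*
+ * The boolean round functions, computed into \out:
+ * choose: (b & c) | (~b & d), computed as ((c ^ d) & b) ^ d
+ * parity: b ^ c ^ d
+ * majority: (b & c) | (b & d) | (c & d), computed as
+ * (b & c) + ((b ^ c) & d); the two terms share no set
+ * bits, so the add cannot carry
+ */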
+ .macro sha1_choose, out, b, c, d
+ eor \out, \c, \d
+ and \out, \out, \b
+ eor \out, \out, \d
+ .endm
+
+ .macro sha1_parity, out, b, c, d
+ eor \out, \b, \c
+ eor \out, \out, \d
+ .endm
+
+ .macro sha1_majority, out, b, c, d
+ eor tmp, \b, \c
+ and \out, \b, \c
+ and tmp, tmp, \d
+ add \out, \out, tmp
+ .endm
+
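+ /*
+ * Update two schedule words at once: each x-register holds a pair of
+ * adjacent 32-bit schedule words, and both halves are advanced as
+ * W[t] = rol(W[t-16] ^ W[t-14] ^ W[t-8] ^ W[t-3], 1)
+ */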
+ .macro mix_state, st0, st1, st4, st6, st7
+ extr xtmp, \st7, \st6, #32
+ eor \st0, \st0, \st1
+ eor xtmp, xtmp, \st4
+ eor xtmp, xtmp, \st0
+ ror res, tmp, #(32 - 1)
+ lsr xtmp, xtmp, #32
+ ror tmp, tmp, #(32 - 1)
+ orr \st0, xres, xtmp, lsl #32
+ .endm
+
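+ /*
+ * Perform a single round:
+ * e += rol(a, 5) + func(b, c, d) + k + W; b = ror(b, 2)
+ * where W is the low (\h == l) or high (\h == h) word of schedule
+ * register x\r
+ */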
+ .macro sha1_round, func, r, h, a, b, c, d, e
+ sha1_\func res, \b, \c, \d
+ add res, res, \e
+ ror \e, \a, #(32 - 5)
+ .ifc \h, h
+ add xres, xres, x\r, lsr #32
+ .else
+ add res, res, w\r
+ .endif
+ add \e, \e, k
+ ror \b, \b, #2
+ add \e, \e, res
+ .endm
+
+ /*
+ * void sha_transform(__u32 *digest, const char *data, __u32 *array)
+ */
+ENTRY(sha_transform)
+ /* load input into state array */
+ ldp x8, x9, [x1]
+ ldp x10, x11, [x1, #16]
+ ldp x12, x13, [x1, #32]
+ ldp x14, x15, [x1, #48]
+
+ /* load digest input */
+ ldr wA, [x0]
+ ldp wB, wC, [x0, #4]
+ ldp wD, wE, [x0, #12]
+
+ /* endian-reverse the input on LE builds */
+CPU_LE( rev32 x8, x8 )
+CPU_LE( rev32 x9, x9 )
+CPU_LE( rev32 x10, x10 )
+CPU_LE( rev32 x11, x11 )
+CPU_LE( rev32 x12, x12 )
+CPU_LE( rev32 x13, x13 )
+CPU_LE( rev32 x14, x14 )
+CPU_LE( rev32 x15, x15 )
+
+ /* round 1 */
+ ldr k, =0x5a827999
+ sha1_round choose, 8, l, wA, wB, wC, wD, wE
+ sha1_round choose, 8, h, wE, wA, wB, wC, wD
+ sha1_round choose, 9, l, wD, wE, wA, wB, wC
+ sha1_round choose, 9, h, wC, wD, wE, wA, wB
+ sha1_round choose, 10, l, wB, wC, wD, wE, wA
+ sha1_round choose, 10, h, wA, wB, wC, wD, wE
+ sha1_round choose, 11, l, wE, wA, wB, wC, wD
+ sha1_round choose, 11, h, wD, wE, wA, wB, wC
+ sha1_round choose, 12, l, wC, wD, wE, wA, wB
+ sha1_round choose, 12, h, wB, wC, wD, wE, wA
+ sha1_round choose, 13, l, wA, wB, wC, wD, wE
+ sha1_round choose, 13, h, wE, wA, wB, wC, wD
+ sha1_round choose, 14, l, wD, wE, wA, wB, wC
+ sha1_round choose, 14, h, wC, wD, wE, wA, wB
+ sha1_round choose, 15, l, wB, wC, wD, wE, wA
+ sha1_round choose, 15, h, wA, wB, wC, wD, wE
+
+ mix_state x8, x9, x12, x14, x15
+ sha1_round choose, 8, l, wE, wA, wB, wC, wD
+ sha1_round choose, 8, h, wD, wE, wA, wB, wC
+ mix_state x9, x10, x13, x15, x8
+ sha1_round choose, 9, l, wC, wD, wE, wA, wB
+ sha1_round choose, 9, h, wB, wC, wD, wE, wA
+
+ /* round 2 */
+ ldr k, =0x6ed9eba1
+ mix_state x10, x11, x14, x8, x9
+ sha1_round parity, 10, l, wA, wB, wC, wD, wE
+ sha1_round parity, 10, h, wE, wA, wB, wC, wD
+ mix_state x11, x12, x15, x9, x10
+ sha1_round parity, 11, l, wD, wE, wA, wB, wC
+ sha1_round parity, 11, h, wC, wD, wE, wA, wB
+ mix_state x12, x13, x8, x10, x11
+ sha1_round parity, 12, l, wB, wC, wD, wE, wA
+ sha1_round parity, 12, h, wA, wB, wC, wD, wE
+ mix_state x13, x14, x9, x11, x12
+ sha1_round parity, 13, l, wE, wA, wB, wC, wD
+ sha1_round parity, 13, h, wD, wE, wA, wB, wC
+ mix_state x14, x15, x10, x12, x13
+ sha1_round parity, 14, l, wC, wD, wE, wA, wB
+ sha1_round parity, 14, h, wB, wC, wD, wE, wA
+ mix_state x15, x8, x11, x13, x14
+ sha1_round parity, 15, l, wA, wB, wC, wD, wE
+ sha1_round parity, 15, h, wE, wA, wB, wC, wD
+ mix_state x8, x9, x12, x14, x15
+ sha1_round parity, 8, l, wD, wE, wA, wB, wC
+ sha1_round parity, 8, h, wC, wD, wE, wA, wB
+ mix_state x9, x10, x13, x15, x8
+ sha1_round parity, 9, l, wB, wC, wD, wE, wA
+ sha1_round parity, 9, h, wA, wB, wC, wD, wE
+ mix_state x10, x11, x14, x8, x9
+ sha1_round parity, 10, l, wE, wA, wB, wC, wD
+ sha1_round parity, 10, h, wD, wE, wA, wB, wC
+ mix_state x11, x12, x15, x9, x10
+ sha1_round parity, 11, l, wC, wD, wE, wA, wB
+ sha1_round parity, 11, h, wB, wC, wD, wE, wA
+
+ /* round 3 */
+ ldr k, =0x8f1bbcdc
+ mix_state x12, x13, x8, x10, x11
+ sha1_round majority, 12, l, wA, wB, wC, wD, wE
+ sha1_round majority, 12, h, wE, wA, wB, wC, wD
+ mix_state x13, x14, x9, x11, x12
+ sha1_round majority, 13, l, wD, wE, wA, wB, wC
+ sha1_round majority, 13, h, wC, wD, wE, wA, wB
+ mix_state x14, x15, x10, x12, x13
+ sha1_round majority, 14, l, wB, wC, wD, wE, wA
+ sha1_round majority, 14, h, wA, wB, wC, wD, wE
+ mix_state x15, x8, x11, x13, x14
+ sha1_round majority, 15, l, wE, wA, wB, wC, wD
+ sha1_round majority, 15, h, wD, wE, wA, wB, wC
+ mix_state x8, x9, x12, x14, x15
+ sha1_round majority, 8, l, wC, wD, wE, wA, wB
+ sha1_round majority, 8, h, wB, wC, wD, wE, wA
+ mix_state x9, x10, x13, x15, x8
+ sha1_round majority, 9, l, wA, wB, wC, wD, wE
+ sha1_round majority, 9, h, wE, wA, wB, wC, wD
+ mix_state x10, x11, x14, x8, x9
+ sha1_round majority, 10, l, wD, wE, wA, wB, wC
+ sha1_round majority, 10, h, wC, wD, wE, wA, wB
+ mix_state x11, x12, x15, x9, x10
+ sha1_round majority, 11, l, wB, wC, wD, wE, wA
+ sha1_round majority, 11, h, wA, wB, wC, wD, wE
+ mix_state x12, x13, x8, x10, x11
+ sha1_round majority, 12, l, wE, wA, wB, wC, wD
+ sha1_round majority, 12, h, wD, wE, wA, wB, wC
+ mix_state x13, x14, x9, x11, x12
+ sha1_round majority, 13, l, wC, wD, wE, wA, wB
+ sha1_round majority, 13, h, wB, wC, wD, wE, wA
+
+ /* round 4 */
+ ldr k, =0xca62c1d6
+ mix_state x14, x15, x10, x12, x13
+ sha1_round parity, 14, l, wA, wB, wC, wD, wE
+ sha1_round parity, 14, h, wE, wA, wB, wC, wD
+ mix_state x15, x8, x11, x13, x14
+ sha1_round parity, 15, l, wD, wE, wA, wB, wC
+ sha1_round parity, 15, h, wC, wD, wE, wA, wB
+ mix_state x8, x9, x12, x14, x15
+ sha1_round parity, 8, l, wB, wC, wD, wE, wA
+ sha1_round parity, 8, h, wA, wB, wC, wD, wE
+ mix_state x9, x10, x13, x15, x8
+ sha1_round parity, 9, l, wE, wA, wB, wC, wD
+ sha1_round parity, 9, h, wD, wE, wA, wB, wC
+ mix_state x10, x11, x14, x8, x9
+ sha1_round parity, 10, l, wC, wD, wE, wA, wB
+ sha1_round parity, 10, h, wB, wC, wD, wE, wA
+ mix_state x11, x12, x15, x9, x10
+ sha1_round parity, 11, l, wA, wB, wC, wD, wE
+ sha1_round parity, 11, h, wE, wA, wB, wC, wD
+ mix_state x12, x13, x8, x10, x11
+ sha1_round parity, 12, l, wD, wE, wA, wB, wC
+ sha1_round parity, 12, h, wC, wD, wE, wA, wB
+ mix_state x13, x14, x9, x11, x12
+ sha1_round parity, 13, l, wB, wC, wD, wE, wA
+ sha1_round parity, 13, h, wA, wB, wC, wD, wE
+ mix_state x14, x15, x10, x12, x13
+ sha1_round parity, 14, l, wE, wA, wB, wC, wD
+ sha1_round parity, 14, h, wD, wE, wA, wB, wC
+ mix_state x15, x8, x11, x13, x14
+
+ /* reload digest input */
+ ldr w8, [x0]
+ ldp w9, w10, [x0, #4]
+ ldp w11, w12, [x0, #12]
+
+ sha1_round parity, 15, l, wC, wD, wE, wA, wB
+ sha1_round parity, 15, h, wB, wC, wD, wE, wA
+
+ /* add this round's output to digest */
+ add wA, wA, w8
+ add wB, wB, w9
+ add wC, wC, w10
+ add wD, wD, w11
+ add wE, wE, w12
+
+ /* store digest */
+ str wA, [x0]
+ stp wB, wC, [x0, #4]
+ stp wD, wE, [x0, #12]
+ ret
+ENDPROC(sha_transform)
+
+ /*
+ * void sha_init(__u32 *buf)
+ */
+ENTRY(sha_init)
+ ldr w1, =0x67452301
+ ldr w2, =0xefcdab89
+ ldr w3, =0x98badcfe
+ ldr w4, =0x10325476
+ ldr w5, =0xc3d2e1f0
+ str w1, [x0]
+ stp w2, w3, [x0, #4]
+ stp w4, w5, [x0, #12]
+ ret
+ENDPROC(sha_init)
--
1.8.3.2
* [PATCH] arm64/lib: add optimized implementation of sha_transform
From: Marek Vasut @ 2014-03-17 18:18 UTC
To: linux-arm-kernel
On Friday, March 14, 2014 at 04:02:33 PM, Ard Biesheuvel wrote:
> This implementation keeps the 64 bytes of workspace in registers rather
> than on the stack, eliminating most of the loads and stores, and reducing
> the instruction count by about 25%.
>
> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
> ---
> Hello all,
>
> No performance numbers I am allowed to share, unfortunately, so if anyone
> else (with access to actual, representative hardware) would care to have a
> go, I would be very grateful.
>
> This can be done by building the tcrypt.ko module (CONFIG_CRYPTO_TEST=m),
> and inserting the module using 'mode=303' as a parameter (note that the
> insmod always fails, but produces its test output to the kernel log). Also
> note that the sha_transform() function will be part of the kernel proper,
> so just rebuilding the sha1_generic module is not sufficient.
>
> Cheers,
Won't the function sha_transform() collide with the one in lib/sha1.c? Or will
the one in lib/sha1.c be overridden somehow?
Otherwise:
Reviewed-by: Marek Vasut <marex@denx.de>
Best regards,
Marek Vasut
* [PATCH] arm64/lib: add optimized implementation of sha_transform
From: Ard Biesheuvel @ 2014-03-18 7:26 UTC
To: linux-arm-kernel
On 17 March 2014 22:18, Marek Vasut <marex@denx.de> wrote:
> On Friday, March 14, 2014 at 04:02:33 PM, Ard Biesheuvel wrote:
>> This implementation keeps the 64 bytes of workspace in registers rather
>> than on the stack, eliminating most of the loads and stores, and reducing
>> the instruction count by about 25%.
>>
>> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
>> ---
>> Hello all,
>>
>> No performance numbers I am allowed to share, unfortunately, so if anyone
>> else (with access to actual, representative hardware) would care to have a
>> go, I would be very grateful.
>>
>> This can be done by building the tcrypt.ko module (CONFIG_CRYPTO_TEST=m),
>> and inserting the module using 'mode=303' as a parameter (note that the
>> insmod always fails, but produces its test output to the kernel log). Also
>> note that the sha_transform() function will be part of the kernel proper,
>> so just rebuilding the sha1_generic module is not sufficient.
>>
>> Cheers,
>
> Won't the function sha_transform() collide with the one in lib/sha1.c? Or will
> the one in lib/sha1.c be overridden somehow?
>
No, this works pretty well, in fact: arch/*/lib takes precedence over
lib/, and objects declared with 'lib-y +=' are only pulled in to
satisfy otherwise-unresolved references. So the second (generic)
sha1.o will not get linked.
> Otherwise:
>
> Reviewed-by: Marek Vasut <marex@denx.de>
>
Thanks. I did send a v2 which is actually a lot different from the
version you reviewed, so I won't carry over your Reviewed-by without
your acknowledgement.
Cheers,
Ard.
* [PATCH] arm64/lib: add optimized implementation of sha_transform
From: Marek Vasut @ 2014-03-18 11:51 UTC
To: linux-arm-kernel
On Tuesday, March 18, 2014 at 08:26:00 AM, Ard Biesheuvel wrote:
> On 17 March 2014 22:18, Marek Vasut <marex@denx.de> wrote:
> > On Friday, March 14, 2014 at 04:02:33 PM, Ard Biesheuvel wrote:
> >> This implementation keeps the 64 bytes of workspace in registers rather
> >> than on the stack, eliminating most of the loads and stores, and
> >> reducing the instruction count by about 25%.
> >>
> >> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
> >> ---
> >> Hello all,
> >>
> >> No performance numbers I am allowed to share, unfortunately, so if
> >> anyone else (with access to actual, representative hardware) would care
> >> to have a go, I would be very grateful.
> >>
> >> This can be done by building the tcrypt.ko module
> >> (CONFIG_CRYPTO_TEST=m), and inserting the module using 'mode=303' as a
> >> parameter (note that the insmod always fails, but produces its test
> >> output to the kernel log). Also note that the sha_transform() function
> >> will be part of the kernel proper, so just rebuilding the sha1_generic
> >> module is not sufficient.
> >>
> >> Cheers,
> >
> > Won't the function sha_transform() collide with the one in lib/sha1.c?
> > Or will the one in lib/sha1.c be overridden somehow?
>
> No, this works pretty well, in fact: arch/*/lib has precedence over
> lib/, and objects (declared with lib-y +=) are only included to
> satisfy unresolved dependencies. So the second (generic) sha1.o will
> not get linked.
Thanks for clearing this up!
> > Otherwise:
> >
> > Reviewed-by: Marek Vasut <marex@denx.de>
>
> Thanks. I did send a v2 which is actually a lot different from the
> version you reviewed, so I won't carry over your reviewed-by without
> your acknowledgement.
Thanks!
Best regards,
Marek Vasut
* [PATCH] arm64/lib: add optimized implementation of sha_transform
From: Ard Biesheuvel @ 2014-03-17 15:55 UTC
To: linux-arm-kernel
This implementation keeps the 64 bytes of workspace in registers rather than
on the stack, eliminating most of the loads and stores, and reducing the
instruction count by about 25%.
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
@Catalin: I assumed x18 has no special significance in the kernel, so I am
using it as a temp register without preserving it. Is this correct?
Changes since v1:
- as suggested in feedback I received off list, it makes sense to schedule
more carefully for an in-order pipeline (A53?), so the rounds are now
2-way interleaved and combined with the schedule updates
- use named constants rather than bare numbers
- use ldnp for loading the input (non-temporal hint)
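To make the macros below easier to follow: in C-flavoured pseudocode (a
sketch in my own notation, with rol32() as in <linux/bitops.h>), each of the
80 rounds computes

	e += rol32(a, 5) + f(b, c, d) + K + W[t & 15];
	b = rol32(b, 30);

where f() is choose, parity or majority and K is the round constant, and
from round 16 onwards each round also replaces one schedule word:

	W[t & 15] = rol32(W[(t - 3) & 15] ^ W[(t - 8) & 15] ^
			  W[(t - 14) & 15] ^ W[(t - 16) & 15], 1);

The *_2rounds macros below interleave two such rounds with a two-word
schedule update, keeping all 16 W[] words in registers x8-x15.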
arch/arm64/kernel/arm64ksyms.c | 3 +
arch/arm64/lib/Makefile | 2 +-
arch/arm64/lib/sha1.S | 277 +++++++++++++++++++++++++++++++++++++++++
3 files changed, 281 insertions(+), 1 deletion(-)
create mode 100644 arch/arm64/lib/sha1.S
diff --git a/arch/arm64/kernel/arm64ksyms.c b/arch/arm64/kernel/arm64ksyms.c
index 338b568cd8ae..1f5693fb5d93 100644
--- a/arch/arm64/kernel/arm64ksyms.c
+++ b/arch/arm64/kernel/arm64ksyms.c
@@ -56,3 +56,6 @@ EXPORT_SYMBOL(clear_bit);
EXPORT_SYMBOL(test_and_clear_bit);
EXPORT_SYMBOL(change_bit);
EXPORT_SYMBOL(test_and_change_bit);
+
+ /* SHA-1 implementation under lib/ */
+EXPORT_SYMBOL(sha_transform);
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
index 328ce1a99daa..ea093ebb9a9a 100644
--- a/arch/arm64/lib/Makefile
+++ b/arch/arm64/lib/Makefile
@@ -1,4 +1,4 @@
lib-y := bitops.o clear_user.o delay.o copy_from_user.o \
copy_to_user.o copy_in_user.o copy_page.o \
clear_page.o memchr.o memcpy.o memmove.o memset.o \
- strchr.o strrchr.o
+ strchr.o strrchr.o sha1.o
diff --git a/arch/arm64/lib/sha1.S b/arch/arm64/lib/sha1.S
new file mode 100644
index 000000000000..5c472f32f917
--- /dev/null
+++ b/arch/arm64/lib/sha1.S
@@ -0,0 +1,277 @@
+/*
+ * linux/arch/arm64/lib/sha1.S
+ *
+ * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+ .text
+ .altmacro
+
+ wA .req w2
+ wB .req w3
+ wC .req w4
+ wD .req w5
+ wE .req w6
+
+ k .req w7
+
+ t0 .req w16
+ t1 .req w17
+ t2 .req w18
+ t3 .req w1
+
+ xt0 .req x16
+ xt1 .req x17
+ xt2 .req x18
+ xt3 .req x1
+
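+ /*
+ * Load half of a 32-bit round constant into \reg, but only if one is
+ * actually passed. Emitting the movz/movk halves separately allows
+ * unrelated instructions to be scheduled in between them.
+ */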
+ .macro load_k_hi, reg, rc
+ .ifnb \rc
+ movz \reg, #:abs_g1:\rc
+ .endif
+ .endm
+
+ .macro load_k_lo, reg, rc
+ .ifnb \rc
+ movk \reg, #:abs_g0_nc:\rc
+ .endif
+ .endm
+
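+ /*
+ * Perform two rounds while also taking in the next two words of
+ * input: registers x8-x15 each hold a pair of adjacent 32-bit words
+ * of the block, loaded pairwise with ldnp (non-temporal hint). The
+ * .irp/.ifne trick emits the load only for even values of \in, so
+ * each pair is loaded exactly once.
+ */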
+ .macro inp_2rounds, in, a, b, c, d, e, rc
+ eor t0, \c, \d
+ .irp in2, %(\in | 1)
+ .ifne \in ^ \in2
+ ldnp x\in, x\in2, [x1, #8 * (\in - 8)]
+ .endif
+ .endr
+ load_k_hi k, \rc
+ and t0, t0, \b
+ load_k_lo k, \rc
+ ror \b, \b, #2
+ eor t0, t0, \d
+ eor t1, \b, \c
+CPU_LE( rev32 x\in, x\in )
+ add t0, t0, \e
+ ror \e, \a, #(32 - 5)
+ and t1, t1, \a
+ add \e, \e, k
+ add t0, t0, w\in
+ eor t1, t1, \c
+ add \e, \e, t0
+ add t1, t1, \d
+ ror \d, \e, #(32 - 5)
+ add xt1, xt1, x\in, lsr #32
+ add \d, \d, k
+ ror \a, \a, #2
+ add \d, \d, t1
+ .endm
+
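+ /*
+ * Each of the {cho,par,maj}_2rounds macros below performs two rounds
+ * with the choose, parity or majority function, interleaved with the
+ * update of two schedule words:
+ * W[t] = rol(W[t-16] ^ W[t-14] ^ W[t-8] ^ W[t-3], 1)
+ * The interleaving keeps independent instructions between dependent
+ * ones, which suits in-order pipelines.
+ */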
+ .macro cho_2rounds, a, b, c, d, e, st0, st1, st4, st6, st7
+ extr xt2, x\st7, x\st6, #32
+ eor t0, \c, \d
+ eor x\st0, x\st0, x\st1
+ and t0, t0, \b
+ eor xt2, xt2, x\st4
+ ror \b, \b, #2
+ eor xt2, xt2, x\st0
+ eor t0, t0, \d
+ eor t1, \b, \c
+ ror t3, t2, #(32 - 1)
+ add t0, t0, \e
+ lsr xt2, xt2, #32
+ and t1, t1, \a
+ ror t2, t2, #(32 - 1)
+ ror \e, \a, #(32 - 5)
+ eor t1, t1, \c
+ add \e, \e, k
+ add t0, t0, t3
+ ror \a, \a, #2
+ add \e, \e, t0
+ add t1, t1, \d
+ ror \d, \e, #(32 - 5)
+ add t1, t1, t2
+ add \d, \d, k
+ orr x\st0, xt3, xt2, lsl #32
+ add \d, \d, t1
+ .endm
+
+ .macro par_2rounds, a, b, c, d, e, st0, st1, st4, st6, st7, rc
+ extr xt2, x\st7, x\st6, #32
+ load_k_hi k, \rc
+ eor x\st0, x\st0, x\st1
+ eor t0, \b, \c
+ load_k_lo k, \rc
+ eor xt2, xt2, x\st4
+ ror \b, \b, #2
+ eor xt2, xt2, x\st0
+ eor t0, t0, \d
+ ror t3, t2, #(32 - 1)
+ eor t1, \a, \b
+ lsr xt2, xt2, #32
+ add t0, t0, \e
+ ror t2, t2, #(32 - 1)
+ ror \e, \a, #(32 - 5)
+ eor t1, t1, \c
+ add \e, \e, k
+ add t0, t0, t3
+ ror \a, \a, #2
+ add \e, \e, t0
+ add t1, t1, \d
+ ror \d, \e, #(32 - 5)
+ add t1, t1, t2
+ add \d, \d, k
+ orr x\st0, xt3, xt2, lsl #32
+ add \d, \d, t1
+ .endm
+
+ .macro maj_2rounds, a, b, c, d, e, st0, st1, st4, st6, st7, rc
+ extr xt2, x\st7, x\st6, #32
+ load_k_hi k, \rc
+ eor t1, \b, \c
+ eor x\st0, x\st0, x\st1
+ and t0, \b, \c
+ load_k_lo k, \rc
+ eor xt2, xt2, x\st4
+ ror \b, \b, #2
+ and t1, t1, \d
+ eor t3, \a, \b
+ add t0, t0, t1
+ and t1, \a, \b
+ and t3, t3, \c
+ eor xt2, xt2, x\st0
+ add t1, t1, t3
+ ror t3, t2, #(32 - 1)
+ lsr xt2, xt2, #32
+ add t0, t0, \e
+ ror \e, \a, #(32 - 5)
+ ror t2, t2, #(32 - 1)
+ add \e, \e, k
+ add t0, t0, t3
+ ror \a, \a, #2
+ add \e, \e, t0
+ add t1, t1, \d
+ ror \d, \e, #(32 - 5)
+ add t1, t1, t2
+ add \d, \d, k
+ orr x\st0, xt3, xt2, lsl #32
+ add \d, \d, t1
+ .endm
+
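+ /*
+ * Select the schedule registers for this round pair: the 16 schedule
+ * words rotate through x8-x15, so the indices are computed from \in
+ * modulo 8 at assembly time (%expr requires .altmacro).
+ */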
+ .macro mix_2rounds, in, a, b, c, d, e, f, rc
+ st1 = (\in + 1) % 8 + 8
+ st4 = (\in + 4) % 8 + 8
+ st6 = (\in + 6) % 8 + 8
+ st7 = (\in + 7) % 8 + 8
+ \f\()_2rounds \a, \b, \c, \d, \e, \in, %st1, %st4, %st6, %st7, \rc
+ .endm
+
+ /*
+ * The SHA-1 round constants
+ */
+ .set sha_rcon1, 0x5a827999
+ .set sha_rcon2, 0x6ed9eba1
+ .set sha_rcon3, 0x8f1bbcdc
+ .set sha_rcon4, 0xca62c1d6
+
+ /*
+ * void sha_transform(__u32 *digest, const char *data, __u32 *array)
+ */
+ENTRY(sha_transform)
+ /* load digest input */
+ ldp wC, wD, [x0, #8]
+ ldp wA, wB, [x0]
+ ldr wE, [x0, #16]
+
+ inp_2rounds 8, wA, wB, wC, wD, wE, sha_rcon1
+ inp_2rounds 9, wD, wE, wA, wB, wC
+ inp_2rounds 10, wB, wC, wD, wE, wA
+ inp_2rounds 11, wE, wA, wB, wC, wD
+ inp_2rounds 12, wC, wD, wE, wA, wB
+ inp_2rounds 13, wA, wB, wC, wD, wE
+ inp_2rounds 14, wD, wE, wA, wB, wC
+ inp_2rounds 15, wB, wC, wD, wE, wA
+ mix_2rounds 8, wE, wA, wB, wC, wD, cho
+ mix_2rounds 9, wC, wD, wE, wA, wB, cho
+
+ mix_2rounds 10, wA, wB, wC, wD, wE, par, sha_rcon2
+ mix_2rounds 11, wD, wE, wA, wB, wC, par
+ mix_2rounds 12, wB, wC, wD, wE, wA, par
+ mix_2rounds 13, wE, wA, wB, wC, wD, par
+ mix_2rounds 14, wC, wD, wE, wA, wB, par
+ mix_2rounds 15, wA, wB, wC, wD, wE, par
+ mix_2rounds 8, wD, wE, wA, wB, wC, par
+ mix_2rounds 9, wB, wC, wD, wE, wA, par
+ mix_2rounds 10, wE, wA, wB, wC, wD, par
+ mix_2rounds 11, wC, wD, wE, wA, wB, par
+
+ mix_2rounds 12, wA, wB, wC, wD, wE, maj, sha_rcon3
+ mix_2rounds 13, wD, wE, wA, wB, wC, maj
+ mix_2rounds 14, wB, wC, wD, wE, wA, maj
+ mix_2rounds 15, wE, wA, wB, wC, wD, maj
+ mix_2rounds 8, wC, wD, wE, wA, wB, maj
+ mix_2rounds 9, wA, wB, wC, wD, wE, maj
+ mix_2rounds 10, wD, wE, wA, wB, wC, maj
+ mix_2rounds 11, wB, wC, wD, wE, wA, maj
+ mix_2rounds 12, wE, wA, wB, wC, wD, maj
+ mix_2rounds 13, wC, wD, wE, wA, wB, maj
+
+ mix_2rounds 14, wA, wB, wC, wD, wE, par, sha_rcon4
+ mix_2rounds 15, wD, wE, wA, wB, wC, par
+ mix_2rounds 8, wB, wC, wD, wE, wA, par
+ mix_2rounds 9, wE, wA, wB, wC, wD, par
+ mix_2rounds 10, wC, wD, wE, wA, wB, par
+ mix_2rounds 11, wA, wB, wC, wD, wE, par
+ mix_2rounds 12, wD, wE, wA, wB, wC, par
+ mix_2rounds 13, wB, wC, wD, wE, wA, par
+ mix_2rounds 14, wE, wA, wB, wC, wD, par
+ mix_2rounds 15, wC, wD, wE, wA, wB, par
+
+ /* reload digest input */
+ ldr w8, [x0]
+ ldp w9, w10, [x0, #4]
+ ldp w11, w12, [x0, #12]
+
+ /* add this block's output to digest */
+ add wA, wA, w8
+ add wB, wB, w9
+ add wC, wC, w10
+ add wD, wD, w11
+ add wE, wE, w12
+
+ /* store digest */
+ str wA, [x0]
+ stp wB, wC, [x0, #4]
+ stp wD, wE, [x0, #12]
+ ret
+ENDPROC(sha_transform)
+
+ /*
+ * The SHA-1 digest initial values
+ */
+.Lsha_init:
+ .word 0x67452301
+ .word 0xefcdab89
+ .word 0x98badcfe
+ .word 0x10325476
+ .word 0xc3d2e1f0
+
+ /*
+ * void sha_init(__u32 *buf)
+ */
+ENTRY(sha_init)
+ adr xt0, .Lsha_init
+ ldr wA, [xt0]
+ ldp wB, wC, [xt0, #4]
+ ldp wD, wE, [xt0, #12]
+ str wA, [x0]
+ stp wB, wC, [x0, #4]
+ stp wD, wE, [x0, #12]
+ ret
+ENDPROC(sha_init)
--
1.8.3.2