* [PATCH net-next v6 05/23] zinc: import Andy Polyakov's ChaCha20 ARM and ARM64 implementations
From: Jason A. Donenfeld <Jason@zx2c4.com> @ 2018-09-25 14:56 UTC
To: linux-arm-kernel@lists.infradead.org

These NEON and non-NEON implementations come from Andy Polyakov's
CRYPTOGAMS code, and are included here in raw form without
modification, so that subsequent commits that fix these up for the
kernel can show exactly how they have been changed. This awkward
commit splitting has been requested for the ARM[64] implementations
in particular.

While this is CRYPTOGAMS code, the originating code happens to be the
same as OpenSSL's commit 87cc649f30aaf69b351701875b9dac07c29ce8a2.
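
For reference, both files export a single routine, ChaCha20_ctr32,
using the usual CRYPTOGAMS calling convention; the function itself
checks OPENSSL_armcap_P at run time and branches to the NEON path for
inputs longer than 192 bytes. The prototype and the tiny wrapper below
are only an illustrative sketch of how a caller drives it; the wrapper
name and its block/nonce parameter split are assumptions for
illustration and are not part of this patch:

  #include <stddef.h>

  /*
   * Exported by chacha20-arm-cryptogams.S and chacha20-arm64-cryptogams.S:
   * len is in bytes, key is eight 32-bit words, counter[0] is the 32-bit
   * block counter and counter[1..3] carry the nonce.
   */
  void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp,
                      size_t len, const unsigned int key[8],
                      const unsigned int counter[4]);

  /* Hypothetical helper showing how the counter block is assembled. */
  static inline void chacha20_crypt_example(unsigned char *dst,
                                            const unsigned char *src,
                                            size_t len,
                                            const unsigned int key[8],
                                            const unsigned int nonce[3],
                                            unsigned int block)
  {
          unsigned int counter[4] = { block, nonce[0], nonce[1], nonce[2] };

          ChaCha20_ctr32(dst, src, len, key, counter);
  }
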
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Based-on-code-from: Andy Polyakov <appro@openssl.org>
Cc: Samuel Neves <sneves@dei.uc.pt>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Jean-Philippe Aumasson <jeanphilippe.aumasson@gmail.com>
Cc: Andy Polyakov <appro@openssl.org>
Cc: Russell King <linux@armlinux.org.uk>
Cc: linux-arm-kernel@lists.infradead.org
---
lib/zinc/chacha20/chacha20-arm-cryptogams.S | 1440 ++++++++++++
lib/zinc/chacha20/chacha20-arm64-cryptogams.S | 1973 +++++++++++++++++
2 files changed, 3413 insertions(+)
create mode 100644 lib/zinc/chacha20/chacha20-arm-cryptogams.S
create mode 100644 lib/zinc/chacha20/chacha20-arm64-cryptogams.S
diff --git a/lib/zinc/chacha20/chacha20-arm-cryptogams.S b/lib/zinc/chacha20/chacha20-arm-cryptogams.S
new file mode 100644
index 000000000000..05a3a9e6e93f
--- /dev/null
+++ b/lib/zinc/chacha20/chacha20-arm-cryptogams.S
@@ -0,0 +1,1440 @@
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
+/*
+ * Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
+ */
+
+#include "arm_arch.h"
+
+.text
+#if defined(__thumb2__) || defined(__clang__)
+.syntax unified
+#endif
+#if defined(__thumb2__)
+.thumb
+#else
+.code 32
+#endif
+
+#if defined(__thumb2__) || defined(__clang__)
+#define ldrhsb ldrbhs
+#endif
+
+.align 5
+.Lsigma:
+.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 @ endian-neutral
+.Lone:
+.long 1,0,0,0
+.Lrot8:
+.long 0x02010003,0x06050407
+#if __ARM_MAX_ARCH__>=7
+.LOPENSSL_armcap:
+.word OPENSSL_armcap_P-.LChaCha20_ctr32
+#else
+.word -1
+#endif
+
+.globl ChaCha20_ctr32
+.type ChaCha20_ctr32,%function
+.align 5
+ChaCha20_ctr32:
+.LChaCha20_ctr32:
+ ldr r12,[sp,#0] @ pull pointer to counter and nonce
+ stmdb sp!,{r0-r2,r4-r11,lr}
+#if __ARM_ARCH__<7 && !defined(__thumb2__)
+ sub r14,pc,#16 @ ChaCha20_ctr32
+#else
+ adr r14,.LChaCha20_ctr32
+#endif
+ cmp r2,#0 @ len==0?
+#ifdef __thumb2__
+ itt eq
+#endif
+ addeq sp,sp,#4*3
+ beq .Lno_data
+#if __ARM_MAX_ARCH__>=7
+ cmp r2,#192 @ test len
+ bls .Lshort
+ ldr r4,[r14,#-24]
+ ldr r4,[r14,r4]
+# ifdef __APPLE__
+ ldr r4,[r4]
+# endif
+ tst r4,#ARMV7_NEON
+ bne .LChaCha20_neon
+.Lshort:
+#endif
+ ldmia r12,{r4-r7} @ load counter and nonce
+ sub sp,sp,#4*(16) @ off-load area
+ sub r14,r14,#64 @ .Lsigma
+ stmdb sp!,{r4-r7} @ copy counter and nonce
+ ldmia r3,{r4-r11} @ load key
+ ldmia r14,{r0-r3} @ load sigma
+ stmdb sp!,{r4-r11} @ copy key
+ stmdb sp!,{r0-r3} @ copy sigma
+ str r10,[sp,#4*(16+10)] @ off-load "rx"
+ str r11,[sp,#4*(16+11)] @ off-load "rx"
+ b .Loop_outer_enter
+
+.align 4
+.Loop_outer:
+ ldmia sp,{r0-r9} @ load key material
+ str r11,[sp,#4*(32+2)] @ save len
+ str r12, [sp,#4*(32+1)] @ save inp
+ str r14, [sp,#4*(32+0)] @ save out
+.Loop_outer_enter:
+ ldr r11, [sp,#4*(15)]
+ mov r4,r4,ror#19 @ twist b[0..3]
+ ldr r12,[sp,#4*(12)] @ modulo-scheduled load
+ mov r5,r5,ror#19
+ ldr r10, [sp,#4*(13)]
+ mov r6,r6,ror#19
+ ldr r14,[sp,#4*(14)]
+ mov r7,r7,ror#19
+ mov r11,r11,ror#8 @ twist d[0..3]
+ mov r12,r12,ror#8
+ mov r10,r10,ror#8
+ mov r14,r14,ror#8
+ str r11, [sp,#4*(16+15)]
+ mov r11,#10
+ b .Loop
+
+.align 4
+.Loop:
+ subs r11,r11,#1
+ add r0,r0,r4,ror#13
+ add r1,r1,r5,ror#13
+ eor r12,r0,r12,ror#24
+ eor r10,r1,r10,ror#24
+ add r8,r8,r12,ror#16
+ add r9,r9,r10,ror#16
+ eor r4,r8,r4,ror#13
+ eor r5,r9,r5,ror#13
+ add r0,r0,r4,ror#20
+ add r1,r1,r5,ror#20
+ eor r12,r0,r12,ror#16
+ eor r10,r1,r10,ror#16
+ add r8,r8,r12,ror#24
+ str r10,[sp,#4*(16+13)]
+ add r9,r9,r10,ror#24
+ ldr r10,[sp,#4*(16+15)]
+ str r8,[sp,#4*(16+8)]
+ eor r4,r4,r8,ror#12
+ str r9,[sp,#4*(16+9)]
+ eor r5,r5,r9,ror#12
+ ldr r8,[sp,#4*(16+10)]
+ add r2,r2,r6,ror#13
+ ldr r9,[sp,#4*(16+11)]
+ add r3,r3,r7,ror#13
+ eor r14,r2,r14,ror#24
+ eor r10,r3,r10,ror#24
+ add r8,r8,r14,ror#16
+ add r9,r9,r10,ror#16
+ eor r6,r8,r6,ror#13
+ eor r7,r9,r7,ror#13
+ add r2,r2,r6,ror#20
+ add r3,r3,r7,ror#20
+ eor r14,r2,r14,ror#16
+ eor r10,r3,r10,ror#16
+ add r8,r8,r14,ror#24
+ add r9,r9,r10,ror#24
+ eor r6,r6,r8,ror#12
+ eor r7,r7,r9,ror#12
+ add r0,r0,r5,ror#13
+ add r1,r1,r6,ror#13
+ eor r10,r0,r10,ror#24
+ eor r12,r1,r12,ror#24
+ add r8,r8,r10,ror#16
+ add r9,r9,r12,ror#16
+ eor r5,r8,r5,ror#13
+ eor r6,r9,r6,ror#13
+ add r0,r0,r5,ror#20
+ add r1,r1,r6,ror#20
+ eor r10,r0,r10,ror#16
+ eor r12,r1,r12,ror#16
+ str r10,[sp,#4*(16+15)]
+ add r8,r8,r10,ror#24
+ ldr r10,[sp,#4*(16+13)]
+ add r9,r9,r12,ror#24
+ str r8,[sp,#4*(16+10)]
+ eor r5,r5,r8,ror#12
+ str r9,[sp,#4*(16+11)]
+ eor r6,r6,r9,ror#12
+ ldr r8,[sp,#4*(16+8)]
+ add r2,r2,r7,ror#13
+ ldr r9,[sp,#4*(16+9)]
+ add r3,r3,r4,ror#13
+ eor r10,r2,r10,ror#24
+ eor r14,r3,r14,ror#24
+ add r8,r8,r10,ror#16
+ add r9,r9,r14,ror#16
+ eor r7,r8,r7,ror#13
+ eor r4,r9,r4,ror#13
+ add r2,r2,r7,ror#20
+ add r3,r3,r4,ror#20
+ eor r10,r2,r10,ror#16
+ eor r14,r3,r14,ror#16
+ add r8,r8,r10,ror#24
+ add r9,r9,r14,ror#24
+ eor r7,r7,r8,ror#12
+ eor r4,r4,r9,ror#12
+ bne .Loop
+
+ ldr r11,[sp,#4*(32+2)] @ load len
+
+ str r8, [sp,#4*(16+8)] @ modulo-scheduled store
+ str r9, [sp,#4*(16+9)]
+ str r12,[sp,#4*(16+12)]
+ str r10, [sp,#4*(16+13)]
+ str r14,[sp,#4*(16+14)]
+
+ @ at this point we have first half of 512-bit result in
+ @ rx and second half at sp+4*(16+8)
+
+ cmp r11,#64 @ done yet?
+#ifdef __thumb2__
+ itete lo
+#endif
+ addlo r12,sp,#4*(0) @ shortcut or ...
+ ldrhs r12,[sp,#4*(32+1)] @ ... load inp
+ addlo r14,sp,#4*(0) @ shortcut or ...
+ ldrhs r14,[sp,#4*(32+0)] @ ... load out
+
+ ldr r8,[sp,#4*(0)] @ load key material
+ ldr r9,[sp,#4*(1)]
+
+#if __ARM_ARCH__>=6 || !defined(__ARMEB__)
+# if __ARM_ARCH__<7
+ orr r10,r12,r14
+ tst r10,#3 @ are input and output aligned?
+ ldr r10,[sp,#4*(2)]
+ bne .Lunaligned
+ cmp r11,#64 @ restore flags
+# else
+ ldr r10,[sp,#4*(2)]
+# endif
+ ldr r11,[sp,#4*(3)]
+
+ add r0,r0,r8 @ accumulate key material
+ add r1,r1,r9
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhs r8,[r12],#16 @ load input
+ ldrhs r9,[r12,#-12]
+
+ add r2,r2,r10
+ add r3,r3,r11
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhs r10,[r12,#-8]
+ ldrhs r11,[r12,#-4]
+# if __ARM_ARCH__>=6 && defined(__ARMEB__)
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+# endif
+# ifdef __thumb2__
+ itt hs
+# endif
+ eorhs r0,r0,r8 @ xor with input
+ eorhs r1,r1,r9
+ add r8,sp,#4*(4)
+ str r0,[r14],#16 @ store output
+# ifdef __thumb2__
+ itt hs
+# endif
+ eorhs r2,r2,r10
+ eorhs r3,r3,r11
+ ldmia r8,{r8-r11} @ load key material
+ str r1,[r14,#-12]
+ str r2,[r14,#-8]
+ str r3,[r14,#-4]
+
+ add r4,r8,r4,ror#13 @ accumulate key material
+ add r5,r9,r5,ror#13
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhs r8,[r12],#16 @ load input
+ ldrhs r9,[r12,#-12]
+ add r6,r10,r6,ror#13
+ add r7,r11,r7,ror#13
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhs r10,[r12,#-8]
+ ldrhs r11,[r12,#-4]
+# if __ARM_ARCH__>=6 && defined(__ARMEB__)
+ rev r4,r4
+ rev r5,r5
+ rev r6,r6
+ rev r7,r7
+# endif
+# ifdef __thumb2__
+ itt hs
+# endif
+ eorhs r4,r4,r8
+ eorhs r5,r5,r9
+ add r8,sp,#4*(8)
+ str r4,[r14],#16 @ store output
+# ifdef __thumb2__
+ itt hs
+# endif
+ eorhs r6,r6,r10
+ eorhs r7,r7,r11
+ str r5,[r14,#-12]
+ ldmia r8,{r8-r11} @ load key material
+ str r6,[r14,#-8]
+ add r0,sp,#4*(16+8)
+ str r7,[r14,#-4]
+
+ ldmia r0,{r0-r7} @ load second half
+
+ add r0,r0,r8 @ accumulate key material
+ add r1,r1,r9
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhs r8,[r12],#16 @ load input
+ ldrhs r9,[r12,#-12]
+# ifdef __thumb2__
+ itt hi
+# endif
+ strhi r10,[sp,#4*(16+10)] @ copy "rx" while at it
+ strhi r11,[sp,#4*(16+11)] @ copy "rx" while at it
+ add r2,r2,r10
+ add r3,r3,r11
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhs r10,[r12,#-8]
+ ldrhs r11,[r12,#-4]
+# if __ARM_ARCH__>=6 && defined(__ARMEB__)
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+# endif
+# ifdef __thumb2__
+ itt hs
+# endif
+ eorhs r0,r0,r8
+ eorhs r1,r1,r9
+ add r8,sp,#4*(12)
+ str r0,[r14],#16 @ store output
+# ifdef __thumb2__
+ itt hs
+# endif
+ eorhs r2,r2,r10
+ eorhs r3,r3,r11
+ str r1,[r14,#-12]
+ ldmia r8,{r8-r11} @ load key material
+ str r2,[r14,#-8]
+ str r3,[r14,#-4]
+
+ add r4,r8,r4,ror#24 @ accumulate key material
+ add r5,r9,r5,ror#24
+# ifdef __thumb2__
+ itt hi
+# endif
+ addhi r8,r8,#1 @ next counter value
+ strhi r8,[sp,#4*(12)] @ save next counter value
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhs r8,[r12],#16 @ load input
+ ldrhs r9,[r12,#-12]
+ add r6,r10,r6,ror#24
+ add r7,r11,r7,ror#24
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhs r10,[r12,#-8]
+ ldrhs r11,[r12,#-4]
+# if __ARM_ARCH__>=6 && defined(__ARMEB__)
+ rev r4,r4
+ rev r5,r5
+ rev r6,r6
+ rev r7,r7
+# endif
+# ifdef __thumb2__
+ itt hs
+# endif
+ eorhs r4,r4,r8
+ eorhs r5,r5,r9
+# ifdef __thumb2__
+ it ne
+# endif
+ ldrne r8,[sp,#4*(32+2)] @ re-load len
+# ifdef __thumb2__
+ itt hs
+# endif
+ eorhs r6,r6,r10
+ eorhs r7,r7,r11
+ str r4,[r14],#16 @ store output
+ str r5,[r14,#-12]
+# ifdef __thumb2__
+ it hs
+# endif
+ subhs r11,r8,#64 @ len-=64
+ str r6,[r14,#-8]
+ str r7,[r14,#-4]
+ bhi .Loop_outer
+
+ beq .Ldone
+# if __ARM_ARCH__<7
+ b .Ltail
+
+.align 4
+.Lunaligned: @ unaligned endian-neutral path
+ cmp r11,#64 @ restore flags
+# endif
+#endif
+#if __ARM_ARCH__<7
+ ldr r11,[sp,#4*(3)]
+ add r0,r8,r0 @ accumulate key material
+ add r1,r9,r1
+ add r2,r10,r2
+# ifdef __thumb2__
+ itete lo
+# endif
+ eorlo r8,r8,r8 @ zero or ...
+ ldrhsb r8,[r12],#16 @ ... load input
+ eorlo r9,r9,r9
+ ldrhsb r9,[r12,#-12]
+
+ add r3,r11,r3
+# ifdef __thumb2__
+ itete lo
+# endif
+ eorlo r10,r10,r10
+ ldrhsb r10,[r12,#-8]
+ eorlo r11,r11,r11
+ ldrhsb r11,[r12,#-4]
+
+ eor r0,r8,r0 @ xor with input (or zero)
+ eor r1,r9,r1
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r8,[r12,#-15] @ load more input
+ ldrhsb r9,[r12,#-11]
+ eor r2,r10,r2
+ strb r0,[r14],#16 @ store output
+ eor r3,r11,r3
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r10,[r12,#-7]
+ ldrhsb r11,[r12,#-3]
+ strb r1,[r14,#-12]
+ eor r0,r8,r0,lsr#8
+ strb r2,[r14,#-8]
+ eor r1,r9,r1,lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r8,[r12,#-14] @ load more input
+ ldrhsb r9,[r12,#-10]
+ strb r3,[r14,#-4]
+ eor r2,r10,r2,lsr#8
+ strb r0,[r14,#-15]
+ eor r3,r11,r3,lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r10,[r12,#-6]
+ ldrhsb r11,[r12,#-2]
+ strb r1,[r14,#-11]
+ eor r0,r8,r0,lsr#8
+ strb r2,[r14,#-7]
+ eor r1,r9,r1,lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r8,[r12,#-13] @ load more input
+ ldrhsb r9,[r12,#-9]
+ strb r3,[r14,#-3]
+ eor r2,r10,r2,lsr#8
+ strb r0,[r14,#-14]
+ eor r3,r11,r3,lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r10,[r12,#-5]
+ ldrhsb r11,[r12,#-1]
+ strb r1,[r14,#-10]
+ strb r2,[r14,#-6]
+ eor r0,r8,r0,lsr#8
+ strb r3,[r14,#-2]
+ eor r1,r9,r1,lsr#8
+ strb r0,[r14,#-13]
+ eor r2,r10,r2,lsr#8
+ strb r1,[r14,#-9]
+ eor r3,r11,r3,lsr#8
+ strb r2,[r14,#-5]
+ strb r3,[r14,#-1]
+ add r8,sp,#4*(4+0)
+ ldmia r8,{r8-r11} @ load key material
+ add r0,sp,#4*(16+8)
+ add r4,r8,r4,ror#13 @ accumulate key material
+ add r5,r9,r5,ror#13
+ add r6,r10,r6,ror#13
+# ifdef __thumb2__
+ itete lo
+# endif
+ eorlo r8,r8,r8 @ zero or ...
+ ldrhsb r8,[r12],#16 @ ... load input
+ eorlo r9,r9,r9
+ ldrhsb r9,[r12,#-12]
+
+ add r7,r11,r7,ror#13
+# ifdef __thumb2__
+ itete lo
+# endif
+ eorlo r10,r10,r10
+ ldrhsb r10,[r12,#-8]
+ eorlo r11,r11,r11
+ ldrhsb r11,[r12,#-4]
+
+ eor r4,r8,r4 @ xor with input (or zero)
+ eor r5,r9,r5
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r8,[r12,#-15] @ load more input
+ ldrhsb r9,[r12,#-11]
+ eor r6,r10,r6
+ strb r4,[r14],#16 @ store output
+ eor r7,r11,r7
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r10,[r12,#-7]
+ ldrhsb r11,[r12,#-3]
+ strb r5,[r14,#-12]
+ eor r4,r8,r4,lsr#8
+ strb r6,[r14,#-8]
+ eor r5,r9,r5,lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r8,[r12,#-14] @ load more input
+ ldrhsb r9,[r12,#-10]
+ strb r7,[r14,#-4]
+ eor r6,r10,r6,lsr#8
+ strb r4,[r14,#-15]
+ eor r7,r11,r7,lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r10,[r12,#-6]
+ ldrhsb r11,[r12,#-2]
+ strb r5,[r14,#-11]
+ eor r4,r8,r4,lsr#8
+ strb r6,[r14,#-7]
+ eor r5,r9,r5,lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r8,[r12,#-13] @ load more input
+ ldrhsb r9,[r12,#-9]
+ strb r7,[r14,#-3]
+ eor r6,r10,r6,lsr#8
+ strb r4,[r14,#-14]
+ eor r7,r11,r7,lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r10,[r12,#-5]
+ ldrhsb r11,[r12,#-1]
+ strb r5,[r14,#-10]
+ strb r6,[r14,#-6]
+ eor r4,r8,r4,lsr#8
+ strb r7,[r14,#-2]
+ eor r5,r9,r5,lsr#8
+ strb r4,[r14,#-13]
+ eor r6,r10,r6,lsr#8
+ strb r5,[r14,#-9]
+ eor r7,r11,r7,lsr#8
+ strb r6,[r14,#-5]
+ strb r7,[r14,#-1]
+ add r8,sp,#4*(4+4)
+ ldmia r8,{r8-r11} @ load key material
+ ldmia r0,{r0-r7} @ load second half
+# ifdef __thumb2__
+ itt hi
+# endif
+ strhi r10,[sp,#4*(16+10)] @ copy "rx"
+ strhi r11,[sp,#4*(16+11)] @ copy "rx"
+ add r0,r8,r0 @ accumulate key material
+ add r1,r9,r1
+ add r2,r10,r2
+# ifdef __thumb2__
+ itete lo
+# endif
+ eorlo r8,r8,r8 @ zero or ...
+ ldrhsb r8,[r12],#16 @ ... load input
+ eorlo r9,r9,r9
+ ldrhsb r9,[r12,#-12]
+
+ add r3,r11,r3
+# ifdef __thumb2__
+ itete lo
+# endif
+ eorlo r10,r10,r10
+ ldrhsb r10,[r12,#-8]
+ eorlo r11,r11,r11
+ ldrhsb r11,[r12,#-4]
+
+ eor r0,r8,r0 @ xor with input (or zero)
+ eor r1,r9,r1
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r8,[r12,#-15] @ load more input
+ ldrhsb r9,[r12,#-11]
+ eor r2,r10,r2
+ strb r0,[r14],#16 @ store output
+ eor r3,r11,r3
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r10,[r12,#-7]
+ ldrhsb r11,[r12,#-3]
+ strb r1,[r14,#-12]
+ eor r0,r8,r0,lsr#8
+ strb r2,[r14,#-8]
+ eor r1,r9,r1,lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r8,[r12,#-14] @ load more input
+ ldrhsb r9,[r12,#-10]
+ strb r3,[r14,#-4]
+ eor r2,r10,r2,lsr#8
+ strb r0,[r14,#-15]
+ eor r3,r11,r3,lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r10,[r12,#-6]
+ ldrhsb r11,[r12,#-2]
+ strb r1,[r14,#-11]
+ eor r0,r8,r0,lsr#8
+ strb r2,[r14,#-7]
+ eor r1,r9,r1,lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r8,[r12,#-13] @ load more input
+ ldrhsb r9,[r12,#-9]
+ strb r3,[r14,#-3]
+ eor r2,r10,r2,lsr#8
+ strb r0,[r14,#-14]
+ eor r3,r11,r3,lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r10,[r12,#-5]
+ ldrhsb r11,[r12,#-1]
+ strb r1,[r14,#-10]
+ strb r2,[r14,#-6]
+ eor r0,r8,r0,lsr#8
+ strb r3,[r14,#-2]
+ eor r1,r9,r1,lsr#8
+ strb r0,[r14,#-13]
+ eor r2,r10,r2,lsr#8
+ strb r1,[r14,#-9]
+ eor r3,r11,r3,lsr#8
+ strb r2,[r14,#-5]
+ strb r3,[r14,#-1]
+ add r8,sp,#4*(4+8)
+ ldmia r8,{r8-r11} @ load key material
+ add r4,r8,r4,ror#24 @ accumulate key material
+# ifdef __thumb2__
+ itt hi
+# endif
+ addhi r8,r8,#1 @ next counter value
+ strhi r8,[sp,#4*(12)] @ save next counter value
+ add r5,r9,r5,ror#24
+ add r6,r10,r6,ror#24
+# ifdef __thumb2__
+ itete lo
+# endif
+ eorlo r8,r8,r8 @ zero or ...
+ ldrhsb r8,[r12],#16 @ ... load input
+ eorlo r9,r9,r9
+ ldrhsb r9,[r12,#-12]
+
+ add r7,r11,r7,ror#24
+# ifdef __thumb2__
+ itete lo
+# endif
+ eorlo r10,r10,r10
+ ldrhsb r10,[r12,#-8]
+ eorlo r11,r11,r11
+ ldrhsb r11,[r12,#-4]
+
+ eor r4,r8,r4 @ xor with input (or zero)
+ eor r5,r9,r5
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r8,[r12,#-15] @ load more input
+ ldrhsb r9,[r12,#-11]
+ eor r6,r10,r6
+ strb r4,[r14],#16 @ store output
+ eor r7,r11,r7
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r10,[r12,#-7]
+ ldrhsb r11,[r12,#-3]
+ strb r5,[r14,#-12]
+ eor r4,r8,r4,lsr#8
+ strb r6,[r14,#-8]
+ eor r5,r9,r5,lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r8,[r12,#-14] @ load more input
+ ldrhsb r9,[r12,#-10]
+ strb r7,[r14,#-4]
+ eor r6,r10,r6,lsr#8
+ strb r4,[r14,#-15]
+ eor r7,r11,r7,lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r10,[r12,#-6]
+ ldrhsb r11,[r12,#-2]
+ strb r5,[r14,#-11]
+ eor r4,r8,r4,lsr#8
+ strb r6,[r14,#-7]
+ eor r5,r9,r5,lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r8,[r12,#-13] @ load more input
+ ldrhsb r9,[r12,#-9]
+ strb r7,[r14,#-3]
+ eor r6,r10,r6,lsr#8
+ strb r4,[r14,#-14]
+ eor r7,r11,r7,lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r10,[r12,#-5]
+ ldrhsb r11,[r12,#-1]
+ strb r5,[r14,#-10]
+ strb r6,[r14,#-6]
+ eor r4,r8,r4,lsr#8
+ strb r7,[r14,#-2]
+ eor r5,r9,r5,lsr#8
+ strb r4,[r14,#-13]
+ eor r6,r10,r6,lsr#8
+ strb r5,[r14,#-9]
+ eor r7,r11,r7,lsr#8
+ strb r6,[r14,#-5]
+ strb r7,[r14,#-1]
+# ifdef __thumb2__
+ it ne
+# endif
+ ldrne r8,[sp,#4*(32+2)] @ re-load len
+# ifdef __thumb2__
+ it hs
+# endif
+ subhs r11,r8,#64 @ len-=64
+ bhi .Loop_outer
+
+ beq .Ldone
+#endif
+
+.Ltail:
+ ldr r12,[sp,#4*(32+1)] @ load inp
+ add r9,sp,#4*(0)
+ ldr r14,[sp,#4*(32+0)] @ load out
+
+.Loop_tail:
+ ldrb r10,[r9],#1 @ read buffer on stack
+ ldrb r11,[r12],#1 @ read input
+ subs r8,r8,#1
+ eor r11,r11,r10
+ strb r11,[r14],#1 @ store output
+ bne .Loop_tail
+
+.Ldone:
+ add sp,sp,#4*(32+3)
+.Lno_data:
+ ldmia sp!,{r4-r11,pc}
+.size ChaCha20_ctr32,.-ChaCha20_ctr32
+#if __ARM_MAX_ARCH__>=7
+.arch armv7-a
+.fpu neon
+
+.type ChaCha20_neon,%function
+.align 5
+ChaCha20_neon:
+ ldr r12,[sp,#0] @ pull pointer to counter and nonce
+ stmdb sp!,{r0-r2,r4-r11,lr}
+.LChaCha20_neon:
+ adr r14,.Lsigma
+ vstmdb sp!,{d8-d15} @ ABI spec says so
+ stmdb sp!,{r0-r3}
+
+ vld1.32 {q1-q2},[r3] @ load key
+ ldmia r3,{r4-r11} @ load key
+
+ sub sp,sp,#4*(16+16)
+ vld1.32 {q3},[r12] @ load counter and nonce
+ add r12,sp,#4*8
+ ldmia r14,{r0-r3} @ load sigma
+ vld1.32 {q0},[r14]! @ load sigma
+ vld1.32 {q12},[r14]! @ one
+ @ vld1.32 {d30},[r14] @ rot8
+ vst1.32 {q2-q3},[r12] @ copy 1/2key|counter|nonce
+ vst1.32 {q0-q1},[sp] @ copy sigma|1/2key
+
+ str r10,[sp,#4*(16+10)] @ off-load "rx"
+ str r11,[sp,#4*(16+11)] @ off-load "rx"
+ vshl.i32 d26,d24,#1 @ two
+ vstr d24,[sp,#4*(16+0)]
+ vshl.i32 d28,d24,#2 @ four
+ vstr d26,[sp,#4*(16+2)]
+ vmov q4,q0
+ vstr d28,[sp,#4*(16+4)]
+ vmov q8,q0
+ @ vstr d30,[sp,#4*(16+6)]
+ vmov q5,q1
+ vmov q9,q1
+ b .Loop_neon_enter
+
+.align 4
+.Loop_neon_outer:
+ ldmia sp,{r0-r9} @ load key material
+ cmp r11,#64*2 @ if len<=64*2
+ bls .Lbreak_neon @ switch to integer-only
+ @ vldr d30,[sp,#4*(16+6)] @ rot8
+ vmov q4,q0
+ str r11,[sp,#4*(32+2)] @ save len
+ vmov q8,q0
+ str r12, [sp,#4*(32+1)] @ save inp
+ vmov q5,q1
+ str r14, [sp,#4*(32+0)] @ save out
+ vmov q9,q1
+.Loop_neon_enter:
+ ldr r11, [sp,#4*(15)]
+ mov r4,r4,ror#19 @ twist b[0..3]
+ vadd.i32 q7,q3,q12 @ counter+1
+ ldr r12,[sp,#4*(12)] @ modulo-scheduled load
+ mov r5,r5,ror#19
+ vmov q6,q2
+ ldr r10, [sp,#4*(13)]
+ mov r6,r6,ror#19
+ vmov q10,q2
+ ldr r14,[sp,#4*(14)]
+ mov r7,r7,ror#19
+ vadd.i32 q11,q7,q12 @ counter+2
+ add r12,r12,#3 @ counter+3
+ mov r11,r11,ror#8 @ twist d[0..3]
+ mov r12,r12,ror#8
+ mov r10,r10,ror#8
+ mov r14,r14,ror#8
+ str r11, [sp,#4*(16+15)]
+ mov r11,#10
+ b .Loop_neon
+
+.align 4
+.Loop_neon:
+ subs r11,r11,#1
+ vadd.i32 q0,q0,q1
+ add r0,r0,r4,ror#13
+ vadd.i32 q4,q4,q5
+ add r1,r1,r5,ror#13
+ vadd.i32 q8,q8,q9
+ eor r12,r0,r12,ror#24
+ veor q3,q3,q0
+ eor r10,r1,r10,ror#24
+ veor q7,q7,q4
+ add r8,r8,r12,ror#16
+ veor q11,q11,q8
+ add r9,r9,r10,ror#16
+ vrev32.16 q3,q3
+ eor r4,r8,r4,ror#13
+ vrev32.16 q7,q7
+ eor r5,r9,r5,ror#13
+ vrev32.16 q11,q11
+ add r0,r0,r4,ror#20
+ vadd.i32 q2,q2,q3
+ add r1,r1,r5,ror#20
+ vadd.i32 q6,q6,q7
+ eor r12,r0,r12,ror#16
+ vadd.i32 q10,q10,q11
+ eor r10,r1,r10,ror#16
+ veor q12,q1,q2
+ add r8,r8,r12,ror#24
+ veor q13,q5,q6
+ str r10,[sp,#4*(16+13)]
+ veor q14,q9,q10
+ add r9,r9,r10,ror#24
+ vshr.u32 q1,q12,#20
+ ldr r10,[sp,#4*(16+15)]
+ vshr.u32 q5,q13,#20
+ str r8,[sp,#4*(16+8)]
+ vshr.u32 q9,q14,#20
+ eor r4,r4,r8,ror#12
+ vsli.32 q1,q12,#12
+ str r9,[sp,#4*(16+9)]
+ vsli.32 q5,q13,#12
+ eor r5,r5,r9,ror#12
+ vsli.32 q9,q14,#12
+ ldr r8,[sp,#4*(16+10)]
+ vadd.i32 q0,q0,q1
+ add r2,r2,r6,ror#13
+ vadd.i32 q4,q4,q5
+ ldr r9,[sp,#4*(16+11)]
+ vadd.i32 q8,q8,q9
+ add r3,r3,r7,ror#13
+ veor q12,q3,q0
+ eor r14,r2,r14,ror#24
+ veor q13,q7,q4
+ eor r10,r3,r10,ror#24
+ veor q14,q11,q8
+ add r8,r8,r14,ror#16
+ vshr.u32 q3,q12,#24
+ add r9,r9,r10,ror#16
+ vshr.u32 q7,q13,#24
+ eor r6,r8,r6,ror#13
+ vshr.u32 q11,q14,#24
+ eor r7,r9,r7,ror#13
+ vsli.32 q3,q12,#8
+ add r2,r2,r6,ror#20
+ vsli.32 q7,q13,#8
+ add r3,r3,r7,ror#20
+ vsli.32 q11,q14,#8
+ eor r14,r2,r14,ror#16
+ vadd.i32 q2,q2,q3
+ eor r10,r3,r10,ror#16
+ vadd.i32 q6,q6,q7
+ add r8,r8,r14,ror#24
+ vadd.i32 q10,q10,q11
+ add r9,r9,r10,ror#24
+ veor q12,q1,q2
+ eor r6,r6,r8,ror#12
+ veor q13,q5,q6
+ eor r7,r7,r9,ror#12
+ veor q14,q9,q10
+ vshr.u32 q1,q12,#25
+ vshr.u32 q5,q13,#25
+ vshr.u32 q9,q14,#25
+ vsli.32 q1,q12,#7
+ vsli.32 q5,q13,#7
+ vsli.32 q9,q14,#7
+ vext.8 q2,q2,q2,#8
+ vext.8 q6,q6,q6,#8
+ vext.8 q10,q10,q10,#8
+ vext.8 q1,q1,q1,#4
+ vext.8 q5,q5,q5,#4
+ vext.8 q9,q9,q9,#4
+ vext.8 q3,q3,q3,#12
+ vext.8 q7,q7,q7,#12
+ vext.8 q11,q11,q11,#12
+ vadd.i32 q0,q0,q1
+ add r0,r0,r5,ror#13
+ vadd.i32 q4,q4,q5
+ add r1,r1,r6,ror#13
+ vadd.i32 q8,q8,q9
+ eor r10,r0,r10,ror#24
+ veor q3,q3,q0
+ eor r12,r1,r12,ror#24
+ veor q7,q7,q4
+ add r8,r8,r10,ror#16
+ veor q11,q11,q8
+ add r9,r9,r12,ror#16
+ vrev32.16 q3,q3
+ eor r5,r8,r5,ror#13
+ vrev32.16 q7,q7
+ eor r6,r9,r6,ror#13
+ vrev32.16 q11,q11
+ add r0,r0,r5,ror#20
+ vadd.i32 q2,q2,q3
+ add r1,r1,r6,ror#20
+ vadd.i32 q6,q6,q7
+ eor r10,r0,r10,ror#16
+ vadd.i32 q10,q10,q11
+ eor r12,r1,r12,ror#16
+ veor q12,q1,q2
+ str r10,[sp,#4*(16+15)]
+ veor q13,q5,q6
+ add r8,r8,r10,ror#24
+ veor q14,q9,q10
+ ldr r10,[sp,#4*(16+13)]
+ vshr.u32 q1,q12,#20
+ add r9,r9,r12,ror#24
+ vshr.u32 q5,q13,#20
+ str r8,[sp,#4*(16+10)]
+ vshr.u32 q9,q14,#20
+ eor r5,r5,r8,ror#12
+ vsli.32 q1,q12,#12
+ str r9,[sp,#4*(16+11)]
+ vsli.32 q5,q13,#12
+ eor r6,r6,r9,ror#12
+ vsli.32 q9,q14,#12
+ ldr r8,[sp,#4*(16+8)]
+ vadd.i32 q0,q0,q1
+ add r2,r2,r7,ror#13
+ vadd.i32 q4,q4,q5
+ ldr r9,[sp,#4*(16+9)]
+ vadd.i32 q8,q8,q9
+ add r3,r3,r4,ror#13
+ veor q12,q3,q0
+ eor r10,r2,r10,ror#24
+ veor q13,q7,q4
+ eor r14,r3,r14,ror#24
+ veor q14,q11,q8
+ add r8,r8,r10,ror#16
+ vshr.u32 q3,q12,#24
+ add r9,r9,r14,ror#16
+ vshr.u32 q7,q13,#24
+ eor r7,r8,r7,ror#13
+ vshr.u32 q11,q14,#24
+ eor r4,r9,r4,ror#13
+ vsli.32 q3,q12,#8
+ add r2,r2,r7,ror#20
+ vsli.32 q7,q13,#8
+ add r3,r3,r4,ror#20
+ vsli.32 q11,q14,#8
+ eor r10,r2,r10,ror#16
+ vadd.i32 q2,q2,q3
+ eor r14,r3,r14,ror#16
+ vadd.i32 q6,q6,q7
+ add r8,r8,r10,ror#24
+ vadd.i32 q10,q10,q11
+ add r9,r9,r14,ror#24
+ veor q12,q1,q2
+ eor r7,r7,r8,ror#12
+ veor q13,q5,q6
+ eor r4,r4,r9,ror#12
+ veor q14,q9,q10
+ vshr.u32 q1,q12,#25
+ vshr.u32 q5,q13,#25
+ vshr.u32 q9,q14,#25
+ vsli.32 q1,q12,#7
+ vsli.32 q5,q13,#7
+ vsli.32 q9,q14,#7
+ vext.8 q2,q2,q2,#8
+ vext.8 q6,q6,q6,#8
+ vext.8 q10,q10,q10,#8
+ vext.8 q1,q1,q1,#12
+ vext.8 q5,q5,q5,#12
+ vext.8 q9,q9,q9,#12
+ vext.8 q3,q3,q3,#4
+ vext.8 q7,q7,q7,#4
+ vext.8 q11,q11,q11,#4
+ bne .Loop_neon
+
+ add r11,sp,#32
+ vld1.32 {q12-q13},[sp] @ load key material
+ vld1.32 {q14-q15},[r11]
+
+ ldr r11,[sp,#4*(32+2)] @ load len
+
+ str r8, [sp,#4*(16+8)] @ modulo-scheduled store
+ str r9, [sp,#4*(16+9)]
+ str r12,[sp,#4*(16+12)]
+ str r10, [sp,#4*(16+13)]
+ str r14,[sp,#4*(16+14)]
+
+ @ at this point we have first half of 512-bit result in
+ @ rx and second half at sp+4*(16+8)
+
+ ldr r12,[sp,#4*(32+1)] @ load inp
+ ldr r14,[sp,#4*(32+0)] @ load out
+
+ vadd.i32 q0,q0,q12 @ accumulate key material
+ vadd.i32 q4,q4,q12
+ vadd.i32 q8,q8,q12
+ vldr d24,[sp,#4*(16+0)] @ one
+
+ vadd.i32 q1,q1,q13
+ vadd.i32 q5,q5,q13
+ vadd.i32 q9,q9,q13
+ vldr d26,[sp,#4*(16+2)] @ two
+
+ vadd.i32 q2,q2,q14
+ vadd.i32 q6,q6,q14
+ vadd.i32 q10,q10,q14
+ vadd.i32 d14,d14,d24 @ counter+1
+ vadd.i32 d22,d22,d26 @ counter+2
+
+ vadd.i32 q3,q3,q15
+ vadd.i32 q7,q7,q15
+ vadd.i32 q11,q11,q15
+
+ cmp r11,#64*4
+ blo .Ltail_neon
+
+ vld1.8 {q12-q13},[r12]! @ load input
+ mov r11,sp
+ vld1.8 {q14-q15},[r12]!
+ veor q0,q0,q12 @ xor with input
+ veor q1,q1,q13
+ vld1.8 {q12-q13},[r12]!
+ veor q2,q2,q14
+ veor q3,q3,q15
+ vld1.8 {q14-q15},[r12]!
+
+ veor q4,q4,q12
+ vst1.8 {q0-q1},[r14]! @ store output
+ veor q5,q5,q13
+ vld1.8 {q12-q13},[r12]!
+ veor q6,q6,q14
+ vst1.8 {q2-q3},[r14]!
+ veor q7,q7,q15
+ vld1.8 {q14-q15},[r12]!
+
+ veor q8,q8,q12
+ vld1.32 {q0-q1},[r11]! @ load for next iteration
+ veor d25,d25,d25
+ vldr d24,[sp,#4*(16+4)] @ four
+ veor q9,q9,q13
+ vld1.32 {q2-q3},[r11]
+ veor q10,q10,q14
+ vst1.8 {q4-q5},[r14]!
+ veor q11,q11,q15
+ vst1.8 {q6-q7},[r14]!
+
+ vadd.i32 d6,d6,d24 @ next counter value
+ vldr d24,[sp,#4*(16+0)] @ one
+
+ ldmia sp,{r8-r11} @ load key material
+ add r0,r0,r8 @ accumulate key material
+ ldr r8,[r12],#16 @ load input
+ vst1.8 {q8-q9},[r14]!
+ add r1,r1,r9
+ ldr r9,[r12,#-12]
+ vst1.8 {q10-q11},[r14]!
+ add r2,r2,r10
+ ldr r10,[r12,#-8]
+ add r3,r3,r11
+ ldr r11,[r12,#-4]
+# ifdef __ARMEB__
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+# endif
+ eor r0,r0,r8 @ xor with input
+ add r8,sp,#4*(4)
+ eor r1,r1,r9
+ str r0,[r14],#16 @ store output
+ eor r2,r2,r10
+ str r1,[r14,#-12]
+ eor r3,r3,r11
+ ldmia r8,{r8-r11} @ load key material
+ str r2,[r14,#-8]
+ str r3,[r14,#-4]
+
+ add r4,r8,r4,ror#13 @ accumulate key material
+ ldr r8,[r12],#16 @ load input
+ add r5,r9,r5,ror#13
+ ldr r9,[r12,#-12]
+ add r6,r10,r6,ror#13
+ ldr r10,[r12,#-8]
+ add r7,r11,r7,ror#13
+ ldr r11,[r12,#-4]
+# ifdef __ARMEB__
+ rev r4,r4
+ rev r5,r5
+ rev r6,r6
+ rev r7,r7
+# endif
+ eor r4,r4,r8
+ add r8,sp,#4*(8)
+ eor r5,r5,r9
+ str r4,[r14],#16 @ store output
+ eor r6,r6,r10
+ str r5,[r14,#-12]
+ eor r7,r7,r11
+ ldmia r8,{r8-r11} @ load key material
+ str r6,[r14,#-8]
+ add r0,sp,#4*(16+8)
+ str r7,[r14,#-4]
+
+ ldmia r0,{r0-r7} @ load second half
+
+ add r0,r0,r8 @ accumulate key material
+ ldr r8,[r12],#16 @ load input
+ add r1,r1,r9
+ ldr r9,[r12,#-12]
+# ifdef __thumb2__
+ it hi
+# endif
+ strhi r10,[sp,#4*(16+10)] @ copy "rx" while at it
+ add r2,r2,r10
+ ldr r10,[r12,#-8]
+# ifdef __thumb2__
+ it hi
+# endif
+ strhi r11,[sp,#4*(16+11)] @ copy "rx" while at it
+ add r3,r3,r11
+ ldr r11,[r12,#-4]
+# ifdef __ARMEB__
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+# endif
+ eor r0,r0,r8
+ add r8,sp,#4*(12)
+ eor r1,r1,r9
+ str r0,[r14],#16 @ store output
+ eor r2,r2,r10
+ str r1,[r14,#-12]
+ eor r3,r3,r11
+ ldmia r8,{r8-r11} @ load key material
+ str r2,[r14,#-8]
+ str r3,[r14,#-4]
+
+ add r4,r8,r4,ror#24 @ accumulate key material
+ add r8,r8,#4 @ next counter value
+ add r5,r9,r5,ror#24
+ str r8,[sp,#4*(12)] @ save next counter value
+ ldr r8,[r12],#16 @ load input
+ add r6,r10,r6,ror#24
+ add r4,r4,#3 @ counter+3
+ ldr r9,[r12,#-12]
+ add r7,r11,r7,ror#24
+ ldr r10,[r12,#-8]
+ ldr r11,[r12,#-4]
+# ifdef __ARMEB__
+ rev r4,r4
+ rev r5,r5
+ rev r6,r6
+ rev r7,r7
+# endif
+ eor r4,r4,r8
+# ifdef __thumb2__
+ it hi
+# endif
+ ldrhi r8,[sp,#4*(32+2)] @ re-load len
+ eor r5,r5,r9
+ eor r6,r6,r10
+ str r4,[r14],#16 @ store output
+ eor r7,r7,r11
+ str r5,[r14,#-12]
+ sub r11,r8,#64*4 @ len-=64*4
+ str r6,[r14,#-8]
+ str r7,[r14,#-4]
+ bhi .Loop_neon_outer
+
+ b .Ldone_neon
+
+.align 4
+.Lbreak_neon:
+ @ harmonize NEON and integer-only stack frames: load data
+ @ from NEON frame, but save to integer-only one; distance
+ @ between the two is 4*(32+4+16-32)=4*(20).
+
+ str r11, [sp,#4*(20+32+2)] @ save len
+ add r11,sp,#4*(32+4)
+ str r12, [sp,#4*(20+32+1)] @ save inp
+ str r14, [sp,#4*(20+32+0)] @ save out
+
+ ldr r12,[sp,#4*(16+10)]
+ ldr r14,[sp,#4*(16+11)]
+ vldmia r11,{d8-d15} @ fulfill ABI requirement
+ str r12,[sp,#4*(20+16+10)] @ copy "rx"
+ str r14,[sp,#4*(20+16+11)] @ copy "rx"
+
+ ldr r11, [sp,#4*(15)]
+ mov r4,r4,ror#19 @ twist b[0..3]
+ ldr r12,[sp,#4*(12)] @ modulo-scheduled load
+ mov r5,r5,ror#19
+ ldr r10, [sp,#4*(13)]
+ mov r6,r6,ror#19
+ ldr r14,[sp,#4*(14)]
+ mov r7,r7,ror#19
+ mov r11,r11,ror#8 @ twist d[0..3]
+ mov r12,r12,ror#8
+ mov r10,r10,ror#8
+ mov r14,r14,ror#8
+ str r11, [sp,#4*(20+16+15)]
+ add r11,sp,#4*(20)
+ vst1.32 {q0-q1},[r11]! @ copy key
+ add sp,sp,#4*(20) @ switch frame
+ vst1.32 {q2-q3},[r11]
+ mov r11,#10
+ b .Loop @ go integer-only
+
+.align 4
+.Ltail_neon:
+ cmp r11,#64*3
+ bhs .L192_or_more_neon
+ cmp r11,#64*2
+ bhs .L128_or_more_neon
+ cmp r11,#64*1
+ bhs .L64_or_more_neon
+
+ add r8,sp,#4*(8)
+ vst1.8 {q0-q1},[sp]
+ add r10,sp,#4*(0)
+ vst1.8 {q2-q3},[r8]
+ b .Loop_tail_neon
+
+.align 4
+.L64_or_more_neon:
+ vld1.8 {q12-q13},[r12]!
+ vld1.8 {q14-q15},[r12]!
+ veor q0,q0,q12
+ veor q1,q1,q13
+ veor q2,q2,q14
+ veor q3,q3,q15
+ vst1.8 {q0-q1},[r14]!
+ vst1.8 {q2-q3},[r14]!
+
+ beq .Ldone_neon
+
+ add r8,sp,#4*(8)
+ vst1.8 {q4-q5},[sp]
+ add r10,sp,#4*(0)
+ vst1.8 {q6-q7},[r8]
+ sub r11,r11,#64*1 @ len-=64*1
+ b .Loop_tail_neon
+
+.align 4
+.L128_or_more_neon:
+ vld1.8 {q12-q13},[r12]!
+ vld1.8 {q14-q15},[r12]!
+ veor q0,q0,q12
+ veor q1,q1,q13
+ vld1.8 {q12-q13},[r12]!
+ veor q2,q2,q14
+ veor q3,q3,q15
+ vld1.8 {q14-q15},[r12]!
+
+ veor q4,q4,q12
+ veor q5,q5,q13
+ vst1.8 {q0-q1},[r14]!
+ veor q6,q6,q14
+ vst1.8 {q2-q3},[r14]!
+ veor q7,q7,q15
+ vst1.8 {q4-q5},[r14]!
+ vst1.8 {q6-q7},[r14]!
+
+ beq .Ldone_neon
+
+ add r8,sp,#4*(8)
+ vst1.8 {q8-q9},[sp]
+ add r10,sp,#4*(0)
+ vst1.8 {q10-q11},[r8]
+ sub r11,r11,#64*2 @ len-=64*2
+ b .Loop_tail_neon
+
+.align 4
+.L192_or_more_neon:
+ vld1.8 {q12-q13},[r12]!
+ vld1.8 {q14-q15},[r12]!
+ veor q0,q0,q12
+ veor q1,q1,q13
+ vld1.8 {q12-q13},[r12]!
+ veor q2,q2,q14
+ veor q3,q3,q15
+ vld1.8 {q14-q15},[r12]!
+
+ veor q4,q4,q12
+ veor q5,q5,q13
+ vld1.8 {q12-q13},[r12]!
+ veor q6,q6,q14
+ vst1.8 {q0-q1},[r14]!
+ veor q7,q7,q15
+ vld1.8 {q14-q15},[r12]!
+
+ veor q8,q8,q12
+ vst1.8 {q2-q3},[r14]!
+ veor q9,q9,q13
+ vst1.8 {q4-q5},[r14]!
+ veor q10,q10,q14
+ vst1.8 {q6-q7},[r14]!
+ veor q11,q11,q15
+ vst1.8 {q8-q9},[r14]!
+ vst1.8 {q10-q11},[r14]!
+
+ beq .Ldone_neon
+
+ ldmia sp,{r8-r11} @ load key material
+ add r0,r0,r8 @ accumulate key material
+ add r8,sp,#4*(4)
+ add r1,r1,r9
+ add r2,r2,r10
+ add r3,r3,r11
+ ldmia r8,{r8-r11} @ load key material
+
+ add r4,r8,r4,ror#13 @ accumulate key material
+ add r8,sp,#4*(8)
+ add r5,r9,r5,ror#13
+ add r6,r10,r6,ror#13
+ add r7,r11,r7,ror#13
+ ldmia r8,{r8-r11} @ load key material
+# ifdef __ARMEB__
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+ rev r4,r4
+ rev r5,r5
+ rev r6,r6
+ rev r7,r7
+# endif
+ stmia sp,{r0-r7}
+ add r0,sp,#4*(16+8)
+
+ ldmia r0,{r0-r7} @ load second half
+
+ add r0,r0,r8 @ accumulate key material
+ add r8,sp,#4*(12)
+ add r1,r1,r9
+ add r2,r2,r10
+ add r3,r3,r11
+ ldmia r8,{r8-r11} @ load key material
+
+ add r4,r8,r4,ror#24 @ accumulate key material
+ add r8,sp,#4*(8)
+ add r5,r9,r5,ror#24
+ add r4,r4,#3 @ counter+3
+ add r6,r10,r6,ror#24
+ add r7,r11,r7,ror#24
+ ldr r11,[sp,#4*(32+2)] @ re-load len
+# ifdef __ARMEB__
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+ rev r4,r4
+ rev r5,r5
+ rev r6,r6
+ rev r7,r7
+# endif
+ stmia r8,{r0-r7}
+ add r10,sp,#4*(0)
+ sub r11,r11,#64*3 @ len-=64*3
+
+.Loop_tail_neon:
+ ldrb r8,[r10],#1 @ read buffer on stack
+ ldrb r9,[r12],#1 @ read input
+ subs r11,r11,#1
+ eor r8,r8,r9
+ strb r8,[r14],#1 @ store output
+ bne .Loop_tail_neon
+
+.Ldone_neon:
+ add sp,sp,#4*(32+4)
+ vldmia sp,{d8-d15}
+ add sp,sp,#4*(16+3)
+ ldmia sp!,{r4-r11,pc}
+.size ChaCha20_neon,.-ChaCha20_neon
+.comm OPENSSL_armcap_P,4,4
+#endif
diff --git a/lib/zinc/chacha20/chacha20-arm64-cryptogams.S b/lib/zinc/chacha20/chacha20-arm64-cryptogams.S
new file mode 100644
index 000000000000..4d029bfdad3a
--- /dev/null
+++ b/lib/zinc/chacha20/chacha20-arm64-cryptogams.S
@@ -0,0 +1,1973 @@
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
+/*
+ * Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
+ */
+
+#include "arm_arch.h"
+
+.text
+
+
+
+.align 5
+.Lsigma:
+.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral
+.Lone:
+.long 1,0,0,0
+.LOPENSSL_armcap_P:
+#ifdef __ILP32__
+.long OPENSSL_armcap_P-.
+#else
+.quad OPENSSL_armcap_P-.
+#endif
+.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 2
+
+.globl ChaCha20_ctr32
+.type ChaCha20_ctr32,%function
+.align 5
+ChaCha20_ctr32:
+ cbz x2,.Labort
+ adr x5,.LOPENSSL_armcap_P
+ cmp x2,#192
+ b.lo .Lshort
+#ifdef __ILP32__
+ ldrsw x6,[x5]
+#else
+ ldr x6,[x5]
+#endif
+ ldr w17,[x6,x5]
+ tst w17,#ARMV7_NEON
+ b.ne ChaCha20_neon
+
+.Lshort:
+ stp x29,x30,[sp,#-96]!
+ add x29,sp,#0
+
+ adr x5,.Lsigma
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+ sub sp,sp,#64
+
+ ldp x22,x23,[x5] // load sigma
+ ldp x24,x25,[x3] // load key
+ ldp x26,x27,[x3,#16]
+ ldp x28,x30,[x4] // load counter
+#ifdef __ARMEB__
+ ror x24,x24,#32
+ ror x25,x25,#32
+ ror x26,x26,#32
+ ror x27,x27,#32
+ ror x28,x28,#32
+ ror x30,x30,#32
+#endif
+
+.Loop_outer:
+ mov w5,w22 // unpack key block
+ lsr x6,x22,#32
+ mov w7,w23
+ lsr x8,x23,#32
+ mov w9,w24
+ lsr x10,x24,#32
+ mov w11,w25
+ lsr x12,x25,#32
+ mov w13,w26
+ lsr x14,x26,#32
+ mov w15,w27
+ lsr x16,x27,#32
+ mov w17,w28
+ lsr x19,x28,#32
+ mov w20,w30
+ lsr x21,x30,#32
+
+ mov x4,#10
+ subs x2,x2,#64
+.Loop:
+ sub x4,x4,#1
+ add w5,w5,w9
+ add w6,w6,w10
+ add w7,w7,w11
+ add w8,w8,w12
+ eor w17,w17,w5
+ eor w19,w19,w6
+ eor w20,w20,w7
+ eor w21,w21,w8
+ ror w17,w17,#16
+ ror w19,w19,#16
+ ror w20,w20,#16
+ ror w21,w21,#16
+ add w13,w13,w17
+ add w14,w14,w19
+ add w15,w15,w20
+ add w16,w16,w21
+ eor w9,w9,w13
+ eor w10,w10,w14
+ eor w11,w11,w15
+ eor w12,w12,w16
+ ror w9,w9,#20
+ ror w10,w10,#20
+ ror w11,w11,#20
+ ror w12,w12,#20
+ add w5,w5,w9
+ add w6,w6,w10
+ add w7,w7,w11
+ add w8,w8,w12
+ eor w17,w17,w5
+ eor w19,w19,w6
+ eor w20,w20,w7
+ eor w21,w21,w8
+ ror w17,w17,#24
+ ror w19,w19,#24
+ ror w20,w20,#24
+ ror w21,w21,#24
+ add w13,w13,w17
+ add w14,w14,w19
+ add w15,w15,w20
+ add w16,w16,w21
+ eor w9,w9,w13
+ eor w10,w10,w14
+ eor w11,w11,w15
+ eor w12,w12,w16
+ ror w9,w9,#25
+ ror w10,w10,#25
+ ror w11,w11,#25
+ ror w12,w12,#25
+ add w5,w5,w10
+ add w6,w6,w11
+ add w7,w7,w12
+ add w8,w8,w9
+ eor w21,w21,w5
+ eor w17,w17,w6
+ eor w19,w19,w7
+ eor w20,w20,w8
+ ror w21,w21,#16
+ ror w17,w17,#16
+ ror w19,w19,#16
+ ror w20,w20,#16
+ add w15,w15,w21
+ add w16,w16,w17
+ add w13,w13,w19
+ add w14,w14,w20
+ eor w10,w10,w15
+ eor w11,w11,w16
+ eor w12,w12,w13
+ eor w9,w9,w14
+ ror w10,w10,#20
+ ror w11,w11,#20
+ ror w12,w12,#20
+ ror w9,w9,#20
+ add w5,w5,w10
+ add w6,w6,w11
+ add w7,w7,w12
+ add w8,w8,w9
+ eor w21,w21,w5
+ eor w17,w17,w6
+ eor w19,w19,w7
+ eor w20,w20,w8
+ ror w21,w21,#24
+ ror w17,w17,#24
+ ror w19,w19,#24
+ ror w20,w20,#24
+ add w15,w15,w21
+ add w16,w16,w17
+ add w13,w13,w19
+ add w14,w14,w20
+ eor w10,w10,w15
+ eor w11,w11,w16
+ eor w12,w12,w13
+ eor w9,w9,w14
+ ror w10,w10,#25
+ ror w11,w11,#25
+ ror w12,w12,#25
+ ror w9,w9,#25
+ cbnz x4,.Loop
+
+ add w5,w5,w22 // accumulate key block
+ add x6,x6,x22,lsr#32
+ add w7,w7,w23
+ add x8,x8,x23,lsr#32
+ add w9,w9,w24
+ add x10,x10,x24,lsr#32
+ add w11,w11,w25
+ add x12,x12,x25,lsr#32
+ add w13,w13,w26
+ add x14,x14,x26,lsr#32
+ add w15,w15,w27
+ add x16,x16,x27,lsr#32
+ add w17,w17,w28
+ add x19,x19,x28,lsr#32
+ add w20,w20,w30
+ add x21,x21,x30,lsr#32
+
+ b.lo .Ltail
+
+ add x5,x5,x6,lsl#32 // pack
+ add x7,x7,x8,lsl#32
+ ldp x6,x8,[x1,#0] // load input
+ add x9,x9,x10,lsl#32
+ add x11,x11,x12,lsl#32
+ ldp x10,x12,[x1,#16]
+ add x13,x13,x14,lsl#32
+ add x15,x15,x16,lsl#32
+ ldp x14,x16,[x1,#32]
+ add x17,x17,x19,lsl#32
+ add x20,x20,x21,lsl#32
+ ldp x19,x21,[x1,#48]
+ add x1,x1,#64
+#ifdef __ARMEB__
+ rev x5,x5
+ rev x7,x7
+ rev x9,x9
+ rev x11,x11
+ rev x13,x13
+ rev x15,x15
+ rev x17,x17
+ rev x20,x20
+#endif
+ eor x5,x5,x6
+ eor x7,x7,x8
+ eor x9,x9,x10
+ eor x11,x11,x12
+ eor x13,x13,x14
+ eor x15,x15,x16
+ eor x17,x17,x19
+ eor x20,x20,x21
+
+ stp x5,x7,[x0,#0] // store output
+ add x28,x28,#1 // increment counter
+ stp x9,x11,[x0,#16]
+ stp x13,x15,[x0,#32]
+ stp x17,x20,[x0,#48]
+ add x0,x0,#64
+
+ b.hi .Loop_outer
+
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#64
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+.Labort:
+ ret
+
+.align 4
+.Ltail:
+ add x2,x2,#64
+.Less_than_64:
+ sub x0,x0,#1
+ add x1,x1,x2
+ add x0,x0,x2
+ add x4,sp,x2
+ neg x2,x2
+
+ add x5,x5,x6,lsl#32 // pack
+ add x7,x7,x8,lsl#32
+ add x9,x9,x10,lsl#32
+ add x11,x11,x12,lsl#32
+ add x13,x13,x14,lsl#32
+ add x15,x15,x16,lsl#32
+ add x17,x17,x19,lsl#32
+ add x20,x20,x21,lsl#32
+#ifdef __ARMEB__
+ rev x5,x5
+ rev x7,x7
+ rev x9,x9
+ rev x11,x11
+ rev x13,x13
+ rev x15,x15
+ rev x17,x17
+ rev x20,x20
+#endif
+ stp x5,x7,[sp,#0]
+ stp x9,x11,[sp,#16]
+ stp x13,x15,[sp,#32]
+ stp x17,x20,[sp,#48]
+
+.Loop_tail:
+ ldrb w10,[x1,x2]
+ ldrb w11,[x4,x2]
+ add x2,x2,#1
+ eor w10,w10,w11
+ strb w10,[x0,x2]
+ cbnz x2,.Loop_tail
+
+ stp xzr,xzr,[sp,#0]
+ stp xzr,xzr,[sp,#16]
+ stp xzr,xzr,[sp,#32]
+ stp xzr,xzr,[sp,#48]
+
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#64
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+ ret
+.size ChaCha20_ctr32,.-ChaCha20_ctr32
+
+.type ChaCha20_neon,%function
+.align 5
+ChaCha20_neon:
+ stp x29,x30,[sp,#-96]!
+ add x29,sp,#0
+
+ adr x5,.Lsigma
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+ cmp x2,#512
+ b.hs .L512_or_more_neon
+
+ sub sp,sp,#64
+
+ ldp x22,x23,[x5] // load sigma
+ ld1 {v24.4s},[x5],#16
+ ldp x24,x25,[x3] // load key
+ ldp x26,x27,[x3,#16]
+ ld1 {v25.4s,v26.4s},[x3]
+ ldp x28,x30,[x4] // load counter
+ ld1 {v27.4s},[x4]
+ ld1 {v31.4s},[x5]
+#ifdef __ARMEB__
+ rev64 v24.4s,v24.4s
+ ror x24,x24,#32
+ ror x25,x25,#32
+ ror x26,x26,#32
+ ror x27,x27,#32
+ ror x28,x28,#32
+ ror x30,x30,#32
+#endif
+ add v27.4s,v27.4s,v31.4s // += 1
+ add v28.4s,v27.4s,v31.4s
+ add v29.4s,v28.4s,v31.4s
+ shl v31.4s,v31.4s,#2 // 1 -> 4
+
+.Loop_outer_neon:
+ mov w5,w22 // unpack key block
+ lsr x6,x22,#32
+ mov v0.16b,v24.16b
+ mov w7,w23
+ lsr x8,x23,#32
+ mov v4.16b,v24.16b
+ mov w9,w24
+ lsr x10,x24,#32
+ mov v16.16b,v24.16b
+ mov w11,w25
+ mov v1.16b,v25.16b
+ lsr x12,x25,#32
+ mov v5.16b,v25.16b
+ mov w13,w26
+ mov v17.16b,v25.16b
+ lsr x14,x26,#32
+ mov v3.16b,v27.16b
+ mov w15,w27
+ mov v7.16b,v28.16b
+ lsr x16,x27,#32
+ mov v19.16b,v29.16b
+ mov w17,w28
+ mov v2.16b,v26.16b
+ lsr x19,x28,#32
+ mov v6.16b,v26.16b
+ mov w20,w30
+ mov v18.16b,v26.16b
+ lsr x21,x30,#32
+
+ mov x4,#10
+ subs x2,x2,#256
+.Loop_neon:
+ sub x4,x4,#1
+ add v0.4s,v0.4s,v1.4s
+ add w5,w5,w9
+ add v4.4s,v4.4s,v5.4s
+ add w6,w6,w10
+ add v16.4s,v16.4s,v17.4s
+ add w7,w7,w11
+ eor v3.16b,v3.16b,v0.16b
+ add w8,w8,w12
+ eor v7.16b,v7.16b,v4.16b
+ eor w17,w17,w5
+ eor v19.16b,v19.16b,v16.16b
+ eor w19,w19,w6
+ rev32 v3.8h,v3.8h
+ eor w20,w20,w7
+ rev32 v7.8h,v7.8h
+ eor w21,w21,w8
+ rev32 v19.8h,v19.8h
+ ror w17,w17,#16
+ add v2.4s,v2.4s,v3.4s
+ ror w19,w19,#16
+ add v6.4s,v6.4s,v7.4s
+ ror w20,w20,#16
+ add v18.4s,v18.4s,v19.4s
+ ror w21,w21,#16
+ eor v20.16b,v1.16b,v2.16b
+ add w13,w13,w17
+ eor v21.16b,v5.16b,v6.16b
+ add w14,w14,w19
+ eor v22.16b,v17.16b,v18.16b
+ add w15,w15,w20
+ ushr v1.4s,v20.4s,#20
+ add w16,w16,w21
+ ushr v5.4s,v21.4s,#20
+ eor w9,w9,w13
+ ushr v17.4s,v22.4s,#20
+ eor w10,w10,w14
+ sli v1.4s,v20.4s,#12
+ eor w11,w11,w15
+ sli v5.4s,v21.4s,#12
+ eor w12,w12,w16
+ sli v17.4s,v22.4s,#12
+ ror w9,w9,#20
+ add v0.4s,v0.4s,v1.4s
+ ror w10,w10,#20
+ add v4.4s,v4.4s,v5.4s
+ ror w11,w11,#20
+ add v16.4s,v16.4s,v17.4s
+ ror w12,w12,#20
+ eor v20.16b,v3.16b,v0.16b
+ add w5,w5,w9
+ eor v21.16b,v7.16b,v4.16b
+ add w6,w6,w10
+ eor v22.16b,v19.16b,v16.16b
+ add w7,w7,w11
+ ushr v3.4s,v20.4s,#24
+ add w8,w8,w12
+ ushr v7.4s,v21.4s,#24
+ eor w17,w17,w5
+ ushr v19.4s,v22.4s,#24
+ eor w19,w19,w6
+ sli v3.4s,v20.4s,#8
+ eor w20,w20,w7
+ sli v7.4s,v21.4s,#8
+ eor w21,w21,w8
+ sli v19.4s,v22.4s,#8
+ ror w17,w17,#24
+ add v2.4s,v2.4s,v3.4s
+ ror w19,w19,#24
+ add v6.4s,v6.4s,v7.4s
+ ror w20,w20,#24
+ add v18.4s,v18.4s,v19.4s
+ ror w21,w21,#24
+ eor v20.16b,v1.16b,v2.16b
+ add w13,w13,w17
+ eor v21.16b,v5.16b,v6.16b
+ add w14,w14,w19
+ eor v22.16b,v17.16b,v18.16b
+ add w15,w15,w20
+ ushr v1.4s,v20.4s,#25
+ add w16,w16,w21
+ ushr v5.4s,v21.4s,#25
+ eor w9,w9,w13
+ ushr v17.4s,v22.4s,#25
+ eor w10,w10,w14
+ sli v1.4s,v20.4s,#7
+ eor w11,w11,w15
+ sli v5.4s,v21.4s,#7
+ eor w12,w12,w16
+ sli v17.4s,v22.4s,#7
+ ror w9,w9,#25
+ ext v2.16b,v2.16b,v2.16b,#8
+ ror w10,w10,#25
+ ext v6.16b,v6.16b,v6.16b,#8
+ ror w11,w11,#25
+ ext v18.16b,v18.16b,v18.16b,#8
+ ror w12,w12,#25
+ ext v3.16b,v3.16b,v3.16b,#12
+ ext v7.16b,v7.16b,v7.16b,#12
+ ext v19.16b,v19.16b,v19.16b,#12
+ ext v1.16b,v1.16b,v1.16b,#4
+ ext v5.16b,v5.16b,v5.16b,#4
+ ext v17.16b,v17.16b,v17.16b,#4
+ add v0.4s,v0.4s,v1.4s
+ add w5,w5,w10
+ add v4.4s,v4.4s,v5.4s
+ add w6,w6,w11
+ add v16.4s,v16.4s,v17.4s
+ add w7,w7,w12
+ eor v3.16b,v3.16b,v0.16b
+ add w8,w8,w9
+ eor v7.16b,v7.16b,v4.16b
+ eor w21,w21,w5
+ eor v19.16b,v19.16b,v16.16b
+ eor w17,w17,w6
+ rev32 v3.8h,v3.8h
+ eor w19,w19,w7
+ rev32 v7.8h,v7.8h
+ eor w20,w20,w8
+ rev32 v19.8h,v19.8h
+ ror w21,w21,#16
+ add v2.4s,v2.4s,v3.4s
+ ror w17,w17,#16
+ add v6.4s,v6.4s,v7.4s
+ ror w19,w19,#16
+ add v18.4s,v18.4s,v19.4s
+ ror w20,w20,#16
+ eor v20.16b,v1.16b,v2.16b
+ add w15,w15,w21
+ eor v21.16b,v5.16b,v6.16b
+ add w16,w16,w17
+ eor v22.16b,v17.16b,v18.16b
+ add w13,w13,w19
+ ushr v1.4s,v20.4s,#20
+ add w14,w14,w20
+ ushr v5.4s,v21.4s,#20
+ eor w10,w10,w15
+ ushr v17.4s,v22.4s,#20
+ eor w11,w11,w16
+ sli v1.4s,v20.4s,#12
+ eor w12,w12,w13
+ sli v5.4s,v21.4s,#12
+ eor w9,w9,w14
+ sli v17.4s,v22.4s,#12
+ ror w10,w10,#20
+ add v0.4s,v0.4s,v1.4s
+ ror w11,w11,#20
+ add v4.4s,v4.4s,v5.4s
+ ror w12,w12,#20
+ add v16.4s,v16.4s,v17.4s
+ ror w9,w9,#20
+ eor v20.16b,v3.16b,v0.16b
+ add w5,w5,w10
+ eor v21.16b,v7.16b,v4.16b
+ add w6,w6,w11
+ eor v22.16b,v19.16b,v16.16b
+ add w7,w7,w12
+ ushr v3.4s,v20.4s,#24
+ add w8,w8,w9
+ ushr v7.4s,v21.4s,#24
+ eor w21,w21,w5
+ ushr v19.4s,v22.4s,#24
+ eor w17,w17,w6
+ sli v3.4s,v20.4s,#8
+ eor w19,w19,w7
+ sli v7.4s,v21.4s,#8
+ eor w20,w20,w8
+ sli v19.4s,v22.4s,#8
+ ror w21,w21,#24
+ add v2.4s,v2.4s,v3.4s
+ ror w17,w17,#24
+ add v6.4s,v6.4s,v7.4s
+ ror w19,w19,#24
+ add v18.4s,v18.4s,v19.4s
+ ror w20,w20,#24
+ eor v20.16b,v1.16b,v2.16b
+ add w15,w15,w21
+ eor v21.16b,v5.16b,v6.16b
+ add w16,w16,w17
+ eor v22.16b,v17.16b,v18.16b
+ add w13,w13,w19
+ ushr v1.4s,v20.4s,#25
+ add w14,w14,w20
+ ushr v5.4s,v21.4s,#25
+ eor w10,w10,w15
+ ushr v17.4s,v22.4s,#25
+ eor w11,w11,w16
+ sli v1.4s,v20.4s,#7
+ eor w12,w12,w13
+ sli v5.4s,v21.4s,#7
+ eor w9,w9,w14
+ sli v17.4s,v22.4s,#7
+ ror w10,w10,#25
+ ext v2.16b,v2.16b,v2.16b,#8
+ ror w11,w11,#25
+ ext v6.16b,v6.16b,v6.16b,#8
+ ror w12,w12,#25
+ ext v18.16b,v18.16b,v18.16b,#8
+ ror w9,w9,#25
+ ext v3.16b,v3.16b,v3.16b,#4
+ ext v7.16b,v7.16b,v7.16b,#4
+ ext v19.16b,v19.16b,v19.16b,#4
+ ext v1.16b,v1.16b,v1.16b,#12
+ ext v5.16b,v5.16b,v5.16b,#12
+ ext v17.16b,v17.16b,v17.16b,#12
+ cbnz x4,.Loop_neon
+
+ add w5,w5,w22 // accumulate key block
+ add v0.4s,v0.4s,v24.4s
+ add x6,x6,x22,lsr#32
+ add v4.4s,v4.4s,v24.4s
+ add w7,w7,w23
+ add v16.4s,v16.4s,v24.4s
+ add x8,x8,x23,lsr#32
+ add v2.4s,v2.4s,v26.4s
+ add w9,w9,w24
+ add v6.4s,v6.4s,v26.4s
+ add x10,x10,x24,lsr#32
+ add v18.4s,v18.4s,v26.4s
+ add w11,w11,w25
+ add v3.4s,v3.4s,v27.4s
+ add x12,x12,x25,lsr#32
+ add w13,w13,w26
+ add v7.4s,v7.4s,v28.4s
+ add x14,x14,x26,lsr#32
+ add w15,w15,w27
+ add v19.4s,v19.4s,v29.4s
+ add x16,x16,x27,lsr#32
+ add w17,w17,w28
+ add v1.4s,v1.4s,v25.4s
+ add x19,x19,x28,lsr#32
+ add w20,w20,w30
+ add v5.4s,v5.4s,v25.4s
+ add x21,x21,x30,lsr#32
+ add v17.4s,v17.4s,v25.4s
+
+ b.lo .Ltail_neon
+
+ add x5,x5,x6,lsl#32 // pack
+ add x7,x7,x8,lsl#32
+ ldp x6,x8,[x1,#0] // load input
+ add x9,x9,x10,lsl#32
+ add x11,x11,x12,lsl#32
+ ldp x10,x12,[x1,#16]
+ add x13,x13,x14,lsl#32
+ add x15,x15,x16,lsl#32
+ ldp x14,x16,[x1,#32]
+ add x17,x17,x19,lsl#32
+ add x20,x20,x21,lsl#32
+ ldp x19,x21,[x1,#48]
+ add x1,x1,#64
+#ifdef __ARMEB__
+ rev x5,x5
+ rev x7,x7
+ rev x9,x9
+ rev x11,x11
+ rev x13,x13
+ rev x15,x15
+ rev x17,x17
+ rev x20,x20
+#endif
+ ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+ eor x5,x5,x6
+ eor x7,x7,x8
+ eor x9,x9,x10
+ eor x11,x11,x12
+ eor x13,x13,x14
+ eor v0.16b,v0.16b,v20.16b
+ eor x15,x15,x16
+ eor v1.16b,v1.16b,v21.16b
+ eor x17,x17,x19
+ eor v2.16b,v2.16b,v22.16b
+ eor x20,x20,x21
+ eor v3.16b,v3.16b,v23.16b
+ ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+
+ stp x5,x7,[x0,#0] // store output
+ add x28,x28,#4 // increment counter
+ stp x9,x11,[x0,#16]
+ add v27.4s,v27.4s,v31.4s // += 4
+ stp x13,x15,[x0,#32]
+ add v28.4s,v28.4s,v31.4s
+ stp x17,x20,[x0,#48]
+ add v29.4s,v29.4s,v31.4s
+ add x0,x0,#64
+
+ st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
+ ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
+
+ eor v4.16b,v4.16b,v20.16b
+ eor v5.16b,v5.16b,v21.16b
+ eor v6.16b,v6.16b,v22.16b
+ eor v7.16b,v7.16b,v23.16b
+ st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
+
+ eor v16.16b,v16.16b,v0.16b
+ eor v17.16b,v17.16b,v1.16b
+ eor v18.16b,v18.16b,v2.16b
+ eor v19.16b,v19.16b,v3.16b
+ st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
+
+ b.hi .Loop_outer_neon
+
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#64
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+ ret
+
+.Ltail_neon:
+ add x2,x2,#256
+ cmp x2,#64
+ b.lo .Less_than_64
+
+ add x5,x5,x6,lsl#32 // pack
+ add x7,x7,x8,lsl#32
+ ldp x6,x8,[x1,#0] // load input
+ add x9,x9,x10,lsl#32
+ add x11,x11,x12,lsl#32
+ ldp x10,x12,[x1,#16]
+ add x13,x13,x14,lsl#32
+ add x15,x15,x16,lsl#32
+ ldp x14,x16,[x1,#32]
+ add x17,x17,x19,lsl#32
+ add x20,x20,x21,lsl#32
+ ldp x19,x21,[x1,#48]
+ add x1,x1,#64
+#ifdef __ARMEB__
+ rev x5,x5
+ rev x7,x7
+ rev x9,x9
+ rev x11,x11
+ rev x13,x13
+ rev x15,x15
+ rev x17,x17
+ rev x20,x20
+#endif
+ eor x5,x5,x6
+ eor x7,x7,x8
+ eor x9,x9,x10
+ eor x11,x11,x12
+ eor x13,x13,x14
+ eor x15,x15,x16
+ eor x17,x17,x19
+ eor x20,x20,x21
+
+ stp x5,x7,[x0,#0] // store output
+ add x28,x28,#4 // increment counter
+ stp x9,x11,[x0,#16]
+ stp x13,x15,[x0,#32]
+ stp x17,x20,[x0,#48]
+ add x0,x0,#64
+ b.eq .Ldone_neon
+ sub x2,x2,#64
+ cmp x2,#64
+ b.lo .Less_than_128
+
+ ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+ eor v0.16b,v0.16b,v20.16b
+ eor v1.16b,v1.16b,v21.16b
+ eor v2.16b,v2.16b,v22.16b
+ eor v3.16b,v3.16b,v23.16b
+ st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
+ b.eq .Ldone_neon
+ sub x2,x2,#64
+ cmp x2,#64
+ b.lo .Less_than_192
+
+ ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+ eor v4.16b,v4.16b,v20.16b
+ eor v5.16b,v5.16b,v21.16b
+ eor v6.16b,v6.16b,v22.16b
+ eor v7.16b,v7.16b,v23.16b
+ st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
+ b.eq .Ldone_neon
+ sub x2,x2,#64
+
+ st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp]
+ b .Last_neon
+
+.Less_than_128:
+ st1 {v0.16b,v1.16b,v2.16b,v3.16b},[sp]
+ b .Last_neon
+.Less_than_192:
+ st1 {v4.16b,v5.16b,v6.16b,v7.16b},[sp]
+ b .Last_neon
+
+.align 4
+.Last_neon:
+ sub x0,x0,#1
+ add x1,x1,x2
+ add x0,x0,x2
+ add x4,sp,x2
+ neg x2,x2
+
+.Loop_tail_neon:
+ ldrb w10,[x1,x2]
+ ldrb w11,[x4,x2]
+ add x2,x2,#1
+ eor w10,w10,w11
+ strb w10,[x0,x2]
+ cbnz x2,.Loop_tail_neon
+
+ stp xzr,xzr,[sp,#0]
+ stp xzr,xzr,[sp,#16]
+ stp xzr,xzr,[sp,#32]
+ stp xzr,xzr,[sp,#48]
+
+.Ldone_neon:
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#64
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+ ret
+.size ChaCha20_neon,.-ChaCha20_neon
+.type ChaCha20_512_neon,%function
+.align 5
+ChaCha20_512_neon:
+ stp x29,x30,[sp,#-96]!
+ add x29,sp,#0
+
+ adr x5,.Lsigma
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+
+.L512_or_more_neon:
+ sub sp,sp,#128+64
+
+ ldp x22,x23,[x5] // load sigma
+ ld1 {v24.4s},[x5],#16
+ ldp x24,x25,[x3] // load key
+ ldp x26,x27,[x3,#16]
+ ld1 {v25.4s,v26.4s},[x3]
+ ldp x28,x30,[x4] // load counter
+ ld1 {v27.4s},[x4]
+ ld1 {v31.4s},[x5]
+#ifdef __ARMEB__
+ rev64 v24.4s,v24.4s
+ ror x24,x24,#32
+ ror x25,x25,#32
+ ror x26,x26,#32
+ ror x27,x27,#32
+ ror x28,x28,#32
+ ror x30,x30,#32
+#endif
+ add v27.4s,v27.4s,v31.4s // += 1
+ stp q24,q25,[sp,#0] // off-load key block, invariant part
+ add v27.4s,v27.4s,v31.4s // not typo
+ str q26,[sp,#32]
+ add v28.4s,v27.4s,v31.4s
+ add v29.4s,v28.4s,v31.4s
+ add v30.4s,v29.4s,v31.4s
+ shl v31.4s,v31.4s,#2 // 1 -> 4
+
+ stp d8,d9,[sp,#128+0] // meet ABI requirements
+ stp d10,d11,[sp,#128+16]
+ stp d12,d13,[sp,#128+32]
+ stp d14,d15,[sp,#128+48]
+
+ sub x2,x2,#512 // not typo
+
+.Loop_outer_512_neon:
+ mov v0.16b,v24.16b
+ mov v4.16b,v24.16b
+ mov v8.16b,v24.16b
+ mov v12.16b,v24.16b
+ mov v16.16b,v24.16b
+ mov v20.16b,v24.16b
+ mov v1.16b,v25.16b
+ mov w5,w22 // unpack key block
+ mov v5.16b,v25.16b
+ lsr x6,x22,#32
+ mov v9.16b,v25.16b
+ mov w7,w23
+ mov v13.16b,v25.16b
+ lsr x8,x23,#32
+ mov v17.16b,v25.16b
+ mov w9,w24
+ mov v21.16b,v25.16b
+ lsr x10,x24,#32
+ mov v3.16b,v27.16b
+ mov w11,w25
+ mov v7.16b,v28.16b
+ lsr x12,x25,#32
+ mov v11.16b,v29.16b
+ mov w13,w26
+ mov v15.16b,v30.16b
+ lsr x14,x26,#32
+ mov v2.16b,v26.16b
+ mov w15,w27
+ mov v6.16b,v26.16b
+ lsr x16,x27,#32
+ add v19.4s,v3.4s,v31.4s // +4
+ mov w17,w28
+ add v23.4s,v7.4s,v31.4s // +4
+ lsr x19,x28,#32
+ mov v10.16b,v26.16b
+ mov w20,w30
+ mov v14.16b,v26.16b
+ lsr x21,x30,#32
+ mov v18.16b,v26.16b
+ stp q27,q28,[sp,#48] // off-load key block, variable part
+ mov v22.16b,v26.16b
+ str q29,[sp,#80]
+
+ mov x4,#5
+ subs x2,x2,#512
+.Loop_upper_neon:
+ sub x4,x4,#1
+ add v0.4s,v0.4s,v1.4s
+ add w5,w5,w9
+ add v4.4s,v4.4s,v5.4s
+ add w6,w6,w10
+ add v8.4s,v8.4s,v9.4s
+ add w7,w7,w11
+ add v12.4s,v12.4s,v13.4s
+ add w8,w8,w12
+ add v16.4s,v16.4s,v17.4s
+ eor w17,w17,w5
+ add v20.4s,v20.4s,v21.4s
+ eor w19,w19,w6
+ eor v3.16b,v3.16b,v0.16b
+ eor w20,w20,w7
+ eor v7.16b,v7.16b,v4.16b
+ eor w21,w21,w8
+ eor v11.16b,v11.16b,v8.16b
+ ror w17,w17,#16
+ eor v15.16b,v15.16b,v12.16b
+ ror w19,w19,#16
+ eor v19.16b,v19.16b,v16.16b
+ ror w20,w20,#16
+ eor v23.16b,v23.16b,v20.16b
+ ror w21,w21,#16
+ rev32 v3.8h,v3.8h
+ add w13,w13,w17
+ rev32 v7.8h,v7.8h
+ add w14,w14,w19
+ rev32 v11.8h,v11.8h
+ add w15,w15,w20
+ rev32 v15.8h,v15.8h
+ add w16,w16,w21
+ rev32 v19.8h,v19.8h
+ eor w9,w9,w13
+ rev32 v23.8h,v23.8h
+ eor w10,w10,w14
+ add v2.4s,v2.4s,v3.4s
+ eor w11,w11,w15
+ add v6.4s,v6.4s,v7.4s
+ eor w12,w12,w16
+ add v10.4s,v10.4s,v11.4s
+ ror w9,w9,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w10,w10,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w11,w11,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w12,w12,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w9
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w10
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w11
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w12
+ eor v28.16b,v17.16b,v18.16b
+ eor w17,w17,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w19,w19,w6
+ ushr v1.4s,v24.4s,#20
+ eor w20,w20,w7
+ ushr v5.4s,v25.4s,#20
+ eor w21,w21,w8
+ ushr v9.4s,v26.4s,#20
+ ror w17,w17,#24
+ ushr v13.4s,v27.4s,#20
+ ror w19,w19,#24
+ ushr v17.4s,v28.4s,#20
+ ror w20,w20,#24
+ ushr v21.4s,v29.4s,#20
+ ror w21,w21,#24
+ sli v1.4s,v24.4s,#12
+ add w13,w13,w17
+ sli v5.4s,v25.4s,#12
+ add w14,w14,w19
+ sli v9.4s,v26.4s,#12
+ add w15,w15,w20
+ sli v13.4s,v27.4s,#12
+ add w16,w16,w21
+ sli v17.4s,v28.4s,#12
+ eor w9,w9,w13
+ sli v21.4s,v29.4s,#12
+ eor w10,w10,w14
+ add v0.4s,v0.4s,v1.4s
+ eor w11,w11,w15
+ add v4.4s,v4.4s,v5.4s
+ eor w12,w12,w16
+ add v8.4s,v8.4s,v9.4s
+ ror w9,w9,#25
+ add v12.4s,v12.4s,v13.4s
+ ror w10,w10,#25
+ add v16.4s,v16.4s,v17.4s
+ ror w11,w11,#25
+ add v20.4s,v20.4s,v21.4s
+ ror w12,w12,#25
+ eor v24.16b,v3.16b,v0.16b
+ add w5,w5,w10
+ eor v25.16b,v7.16b,v4.16b
+ add w6,w6,w11
+ eor v26.16b,v11.16b,v8.16b
+ add w7,w7,w12
+ eor v27.16b,v15.16b,v12.16b
+ add w8,w8,w9
+ eor v28.16b,v19.16b,v16.16b
+ eor w21,w21,w5
+ eor v29.16b,v23.16b,v20.16b
+ eor w17,w17,w6
+ ushr v3.4s,v24.4s,#24
+ eor w19,w19,w7
+ ushr v7.4s,v25.4s,#24
+ eor w20,w20,w8
+ ushr v11.4s,v26.4s,#24
+ ror w21,w21,#16
+ ushr v15.4s,v27.4s,#24
+ ror w17,w17,#16
+ ushr v19.4s,v28.4s,#24
+ ror w19,w19,#16
+ ushr v23.4s,v29.4s,#24
+ ror w20,w20,#16
+ sli v3.4s,v24.4s,#8
+ add w15,w15,w21
+ sli v7.4s,v25.4s,#8
+ add w16,w16,w17
+ sli v11.4s,v26.4s,#8
+ add w13,w13,w19
+ sli v15.4s,v27.4s,#8
+ add w14,w14,w20
+ sli v19.4s,v28.4s,#8
+ eor w10,w10,w15
+ sli v23.4s,v29.4s,#8
+ eor w11,w11,w16
+ add v2.4s,v2.4s,v3.4s
+ eor w12,w12,w13
+ add v6.4s,v6.4s,v7.4s
+ eor w9,w9,w14
+ add v10.4s,v10.4s,v11.4s
+ ror w10,w10,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w11,w11,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w12,w12,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w9,w9,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w10
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w11
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w12
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w9
+ eor v28.16b,v17.16b,v18.16b
+ eor w21,w21,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w17,w17,w6
+ ushr v1.4s,v24.4s,#25
+ eor w19,w19,w7
+ ushr v5.4s,v25.4s,#25
+ eor w20,w20,w8
+ ushr v9.4s,v26.4s,#25
+ ror w21,w21,#24
+ ushr v13.4s,v27.4s,#25
+ ror w17,w17,#24
+ ushr v17.4s,v28.4s,#25
+ ror w19,w19,#24
+ ushr v21.4s,v29.4s,#25
+ ror w20,w20,#24
+ sli v1.4s,v24.4s,#7
+ add w15,w15,w21
+ sli v5.4s,v25.4s,#7
+ add w16,w16,w17
+ sli v9.4s,v26.4s,#7
+ add w13,w13,w19
+ sli v13.4s,v27.4s,#7
+ add w14,w14,w20
+ sli v17.4s,v28.4s,#7
+ eor w10,w10,w15
+ sli v21.4s,v29.4s,#7
+ eor w11,w11,w16
+ ext v2.16b,v2.16b,v2.16b,#8
+ eor w12,w12,w13
+ ext v6.16b,v6.16b,v6.16b,#8
+ eor w9,w9,w14
+ ext v10.16b,v10.16b,v10.16b,#8
+ ror w10,w10,#25
+ ext v14.16b,v14.16b,v14.16b,#8
+ ror w11,w11,#25
+ ext v18.16b,v18.16b,v18.16b,#8
+ ror w12,w12,#25
+ ext v22.16b,v22.16b,v22.16b,#8
+ ror w9,w9,#25
+ ext v3.16b,v3.16b,v3.16b,#12
+ ext v7.16b,v7.16b,v7.16b,#12
+ ext v11.16b,v11.16b,v11.16b,#12
+ ext v15.16b,v15.16b,v15.16b,#12
+ ext v19.16b,v19.16b,v19.16b,#12
+ ext v23.16b,v23.16b,v23.16b,#12
+ ext v1.16b,v1.16b,v1.16b,#4
+ ext v5.16b,v5.16b,v5.16b,#4
+ ext v9.16b,v9.16b,v9.16b,#4
+ ext v13.16b,v13.16b,v13.16b,#4
+ ext v17.16b,v17.16b,v17.16b,#4
+ ext v21.16b,v21.16b,v21.16b,#4
+ add v0.4s,v0.4s,v1.4s
+ add w5,w5,w9
+ add v4.4s,v4.4s,v5.4s
+ add w6,w6,w10
+ add v8.4s,v8.4s,v9.4s
+ add w7,w7,w11
+ add v12.4s,v12.4s,v13.4s
+ add w8,w8,w12
+ add v16.4s,v16.4s,v17.4s
+ eor w17,w17,w5
+ add v20.4s,v20.4s,v21.4s
+ eor w19,w19,w6
+ eor v3.16b,v3.16b,v0.16b
+ eor w20,w20,w7
+ eor v7.16b,v7.16b,v4.16b
+ eor w21,w21,w8
+ eor v11.16b,v11.16b,v8.16b
+ ror w17,w17,#16
+ eor v15.16b,v15.16b,v12.16b
+ ror w19,w19,#16
+ eor v19.16b,v19.16b,v16.16b
+ ror w20,w20,#16
+ eor v23.16b,v23.16b,v20.16b
+ ror w21,w21,#16
+ rev32 v3.8h,v3.8h
+ add w13,w13,w17
+ rev32 v7.8h,v7.8h
+ add w14,w14,w19
+ rev32 v11.8h,v11.8h
+ add w15,w15,w20
+ rev32 v15.8h,v15.8h
+ add w16,w16,w21
+ rev32 v19.8h,v19.8h
+ eor w9,w9,w13
+ rev32 v23.8h,v23.8h
+ eor w10,w10,w14
+ add v2.4s,v2.4s,v3.4s
+ eor w11,w11,w15
+ add v6.4s,v6.4s,v7.4s
+ eor w12,w12,w16
+ add v10.4s,v10.4s,v11.4s
+ ror w9,w9,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w10,w10,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w11,w11,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w12,w12,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w9
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w10
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w11
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w12
+ eor v28.16b,v17.16b,v18.16b
+ eor w17,w17,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w19,w19,w6
+ ushr v1.4s,v24.4s,#20
+ eor w20,w20,w7
+ ushr v5.4s,v25.4s,#20
+ eor w21,w21,w8
+ ushr v9.4s,v26.4s,#20
+ ror w17,w17,#24
+ ushr v13.4s,v27.4s,#20
+ ror w19,w19,#24
+ ushr v17.4s,v28.4s,#20
+ ror w20,w20,#24
+ ushr v21.4s,v29.4s,#20
+ ror w21,w21,#24
+ sli v1.4s,v24.4s,#12
+ add w13,w13,w17
+ sli v5.4s,v25.4s,#12
+ add w14,w14,w19
+ sli v9.4s,v26.4s,#12
+ add w15,w15,w20
+ sli v13.4s,v27.4s,#12
+ add w16,w16,w21
+ sli v17.4s,v28.4s,#12
+ eor w9,w9,w13
+ sli v21.4s,v29.4s,#12
+ eor w10,w10,w14
+ add v0.4s,v0.4s,v1.4s
+ eor w11,w11,w15
+ add v4.4s,v4.4s,v5.4s
+ eor w12,w12,w16
+ add v8.4s,v8.4s,v9.4s
+ ror w9,w9,#25
+ add v12.4s,v12.4s,v13.4s
+ ror w10,w10,#25
+ add v16.4s,v16.4s,v17.4s
+ ror w11,w11,#25
+ add v20.4s,v20.4s,v21.4s
+ ror w12,w12,#25
+ eor v24.16b,v3.16b,v0.16b
+ add w5,w5,w10
+ eor v25.16b,v7.16b,v4.16b
+ add w6,w6,w11
+ eor v26.16b,v11.16b,v8.16b
+ add w7,w7,w12
+ eor v27.16b,v15.16b,v12.16b
+ add w8,w8,w9
+ eor v28.16b,v19.16b,v16.16b
+ eor w21,w21,w5
+ eor v29.16b,v23.16b,v20.16b
+ eor w17,w17,w6
+ ushr v3.4s,v24.4s,#24
+ eor w19,w19,w7
+ ushr v7.4s,v25.4s,#24
+ eor w20,w20,w8
+ ushr v11.4s,v26.4s,#24
+ ror w21,w21,#16
+ ushr v15.4s,v27.4s,#24
+ ror w17,w17,#16
+ ushr v19.4s,v28.4s,#24
+ ror w19,w19,#16
+ ushr v23.4s,v29.4s,#24
+ ror w20,w20,#16
+ sli v3.4s,v24.4s,#8
+ add w15,w15,w21
+ sli v7.4s,v25.4s,#8
+ add w16,w16,w17
+ sli v11.4s,v26.4s,#8
+ add w13,w13,w19
+ sli v15.4s,v27.4s,#8
+ add w14,w14,w20
+ sli v19.4s,v28.4s,#8
+ eor w10,w10,w15
+ sli v23.4s,v29.4s,#8
+ eor w11,w11,w16
+ add v2.4s,v2.4s,v3.4s
+ eor w12,w12,w13
+ add v6.4s,v6.4s,v7.4s
+ eor w9,w9,w14
+ add v10.4s,v10.4s,v11.4s
+ ror w10,w10,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w11,w11,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w12,w12,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w9,w9,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w10
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w11
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w12
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w9
+ eor v28.16b,v17.16b,v18.16b
+ eor w21,w21,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w17,w17,w6
+ ushr v1.4s,v24.4s,#25
+ eor w19,w19,w7
+ ushr v5.4s,v25.4s,#25
+ eor w20,w20,w8
+ ushr v9.4s,v26.4s,#25
+ ror w21,w21,#24
+ ushr v13.4s,v27.4s,#25
+ ror w17,w17,#24
+ ushr v17.4s,v28.4s,#25
+ ror w19,w19,#24
+ ushr v21.4s,v29.4s,#25
+ ror w20,w20,#24
+ sli v1.4s,v24.4s,#7
+ add w15,w15,w21
+ sli v5.4s,v25.4s,#7
+ add w16,w16,w17
+ sli v9.4s,v26.4s,#7
+ add w13,w13,w19
+ sli v13.4s,v27.4s,#7
+ add w14,w14,w20
+ sli v17.4s,v28.4s,#7
+ eor w10,w10,w15
+ sli v21.4s,v29.4s,#7
+ eor w11,w11,w16
+ ext v2.16b,v2.16b,v2.16b,#8
+ eor w12,w12,w13
+ ext v6.16b,v6.16b,v6.16b,#8
+ eor w9,w9,w14
+ ext v10.16b,v10.16b,v10.16b,#8
+ ror w10,w10,#25
+ ext v14.16b,v14.16b,v14.16b,#8
+ ror w11,w11,#25
+ ext v18.16b,v18.16b,v18.16b,#8
+ ror w12,w12,#25
+ ext v22.16b,v22.16b,v22.16b,#8
+ ror w9,w9,#25
+ ext v3.16b,v3.16b,v3.16b,#4
+ ext v7.16b,v7.16b,v7.16b,#4
+ ext v11.16b,v11.16b,v11.16b,#4
+ ext v15.16b,v15.16b,v15.16b,#4
+ ext v19.16b,v19.16b,v19.16b,#4
+ ext v23.16b,v23.16b,v23.16b,#4
+ ext v1.16b,v1.16b,v1.16b,#12
+ ext v5.16b,v5.16b,v5.16b,#12
+ ext v9.16b,v9.16b,v9.16b,#12
+ ext v13.16b,v13.16b,v13.16b,#12
+ ext v17.16b,v17.16b,v17.16b,#12
+ ext v21.16b,v21.16b,v21.16b,#12
+ cbnz x4,.Loop_upper_neon
+
+ add w5,w5,w22 // accumulate key block
+ add x6,x6,x22,lsr#32
+ add w7,w7,w23
+ add x8,x8,x23,lsr#32
+ add w9,w9,w24
+ add x10,x10,x24,lsr#32
+ add w11,w11,w25
+ add x12,x12,x25,lsr#32
+ add w13,w13,w26
+ add x14,x14,x26,lsr#32
+ add w15,w15,w27
+ add x16,x16,x27,lsr#32
+ add w17,w17,w28
+ add x19,x19,x28,lsr#32
+ add w20,w20,w30
+ add x21,x21,x30,lsr#32
+
+ add x5,x5,x6,lsl#32 // pack
+ add x7,x7,x8,lsl#32
+ ldp x6,x8,[x1,#0] // load input
+ add x9,x9,x10,lsl#32
+ add x11,x11,x12,lsl#32
+ ldp x10,x12,[x1,#16]
+ add x13,x13,x14,lsl#32
+ add x15,x15,x16,lsl#32
+ ldp x14,x16,[x1,#32]
+ add x17,x17,x19,lsl#32
+ add x20,x20,x21,lsl#32
+ ldp x19,x21,[x1,#48]
+ add x1,x1,#64
+#ifdef __ARMEB__
+ rev x5,x5
+ rev x7,x7
+ rev x9,x9
+ rev x11,x11
+ rev x13,x13
+ rev x15,x15
+ rev x17,x17
+ rev x20,x20
+#endif
+ eor x5,x5,x6
+ eor x7,x7,x8
+ eor x9,x9,x10
+ eor x11,x11,x12
+ eor x13,x13,x14
+ eor x15,x15,x16
+ eor x17,x17,x19
+ eor x20,x20,x21
+
+ stp x5,x7,[x0,#0] // store output
+ add x28,x28,#1 // increment counter
+ mov w5,w22 // unpack key block
+ lsr x6,x22,#32
+ stp x9,x11,[x0,#16]
+ mov w7,w23
+ lsr x8,x23,#32
+ stp x13,x15,[x0,#32]
+ mov w9,w24
+ lsr x10,x24,#32
+ stp x17,x20,[x0,#48]
+ add x0,x0,#64
+ mov w11,w25
+ lsr x12,x25,#32
+ mov w13,w26
+ lsr x14,x26,#32
+ mov w15,w27
+ lsr x16,x27,#32
+ mov w17,w28
+ lsr x19,x28,#32
+ mov w20,w30
+ lsr x21,x30,#32
+
+ mov x4,#5
+.Loop_lower_neon:
+ sub x4,x4,#1
+ add v0.4s,v0.4s,v1.4s
+ add w5,w5,w9
+ add v4.4s,v4.4s,v5.4s
+ add w6,w6,w10
+ add v8.4s,v8.4s,v9.4s
+ add w7,w7,w11
+ add v12.4s,v12.4s,v13.4s
+ add w8,w8,w12
+ add v16.4s,v16.4s,v17.4s
+ eor w17,w17,w5
+ add v20.4s,v20.4s,v21.4s
+ eor w19,w19,w6
+ eor v3.16b,v3.16b,v0.16b
+ eor w20,w20,w7
+ eor v7.16b,v7.16b,v4.16b
+ eor w21,w21,w8
+ eor v11.16b,v11.16b,v8.16b
+ ror w17,w17,#16
+ eor v15.16b,v15.16b,v12.16b
+ ror w19,w19,#16
+ eor v19.16b,v19.16b,v16.16b
+ ror w20,w20,#16
+ eor v23.16b,v23.16b,v20.16b
+ ror w21,w21,#16
+ rev32 v3.8h,v3.8h
+ add w13,w13,w17
+ rev32 v7.8h,v7.8h
+ add w14,w14,w19
+ rev32 v11.8h,v11.8h
+ add w15,w15,w20
+ rev32 v15.8h,v15.8h
+ add w16,w16,w21
+ rev32 v19.8h,v19.8h
+ eor w9,w9,w13
+ rev32 v23.8h,v23.8h
+ eor w10,w10,w14
+ add v2.4s,v2.4s,v3.4s
+ eor w11,w11,w15
+ add v6.4s,v6.4s,v7.4s
+ eor w12,w12,w16
+ add v10.4s,v10.4s,v11.4s
+ ror w9,w9,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w10,w10,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w11,w11,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w12,w12,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w9
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w10
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w11
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w12
+ eor v28.16b,v17.16b,v18.16b
+ eor w17,w17,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w19,w19,w6
+ ushr v1.4s,v24.4s,#20
+ eor w20,w20,w7
+ ushr v5.4s,v25.4s,#20
+ eor w21,w21,w8
+ ushr v9.4s,v26.4s,#20
+ ror w17,w17,#24
+ ushr v13.4s,v27.4s,#20
+ ror w19,w19,#24
+ ushr v17.4s,v28.4s,#20
+ ror w20,w20,#24
+ ushr v21.4s,v29.4s,#20
+ ror w21,w21,#24
+ sli v1.4s,v24.4s,#12
+ add w13,w13,w17
+ sli v5.4s,v25.4s,#12
+ add w14,w14,w19
+ sli v9.4s,v26.4s,#12
+ add w15,w15,w20
+ sli v13.4s,v27.4s,#12
+ add w16,w16,w21
+ sli v17.4s,v28.4s,#12
+ eor w9,w9,w13
+ sli v21.4s,v29.4s,#12
+ eor w10,w10,w14
+ add v0.4s,v0.4s,v1.4s
+ eor w11,w11,w15
+ add v4.4s,v4.4s,v5.4s
+ eor w12,w12,w16
+ add v8.4s,v8.4s,v9.4s
+ ror w9,w9,#25
+ add v12.4s,v12.4s,v13.4s
+ ror w10,w10,#25
+ add v16.4s,v16.4s,v17.4s
+ ror w11,w11,#25
+ add v20.4s,v20.4s,v21.4s
+ ror w12,w12,#25
+ eor v24.16b,v3.16b,v0.16b
+ add w5,w5,w10
+ eor v25.16b,v7.16b,v4.16b
+ add w6,w6,w11
+ eor v26.16b,v11.16b,v8.16b
+ add w7,w7,w12
+ eor v27.16b,v15.16b,v12.16b
+ add w8,w8,w9
+ eor v28.16b,v19.16b,v16.16b
+ eor w21,w21,w5
+ eor v29.16b,v23.16b,v20.16b
+ eor w17,w17,w6
+ ushr v3.4s,v24.4s,#24
+ eor w19,w19,w7
+ ushr v7.4s,v25.4s,#24
+ eor w20,w20,w8
+ ushr v11.4s,v26.4s,#24
+ ror w21,w21,#16
+ ushr v15.4s,v27.4s,#24
+ ror w17,w17,#16
+ ushr v19.4s,v28.4s,#24
+ ror w19,w19,#16
+ ushr v23.4s,v29.4s,#24
+ ror w20,w20,#16
+ sli v3.4s,v24.4s,#8
+ add w15,w15,w21
+ sli v7.4s,v25.4s,#8
+ add w16,w16,w17
+ sli v11.4s,v26.4s,#8
+ add w13,w13,w19
+ sli v15.4s,v27.4s,#8
+ add w14,w14,w20
+ sli v19.4s,v28.4s,#8
+ eor w10,w10,w15
+ sli v23.4s,v29.4s,#8
+ eor w11,w11,w16
+ add v2.4s,v2.4s,v3.4s
+ eor w12,w12,w13
+ add v6.4s,v6.4s,v7.4s
+ eor w9,w9,w14
+ add v10.4s,v10.4s,v11.4s
+ ror w10,w10,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w11,w11,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w12,w12,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w9,w9,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w10
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w11
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w12
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w9
+ eor v28.16b,v17.16b,v18.16b
+ eor w21,w21,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w17,w17,w6
+ ushr v1.4s,v24.4s,#25
+ eor w19,w19,w7
+ ushr v5.4s,v25.4s,#25
+ eor w20,w20,w8
+ ushr v9.4s,v26.4s,#25
+ ror w21,w21,#24
+ ushr v13.4s,v27.4s,#25
+ ror w17,w17,#24
+ ushr v17.4s,v28.4s,#25
+ ror w19,w19,#24
+ ushr v21.4s,v29.4s,#25
+ ror w20,w20,#24
+ sli v1.4s,v24.4s,#7
+ add w15,w15,w21
+ sli v5.4s,v25.4s,#7
+ add w16,w16,w17
+ sli v9.4s,v26.4s,#7
+ add w13,w13,w19
+ sli v13.4s,v27.4s,#7
+ add w14,w14,w20
+ sli v17.4s,v28.4s,#7
+ eor w10,w10,w15
+ sli v21.4s,v29.4s,#7
+ eor w11,w11,w16
+ ext v2.16b,v2.16b,v2.16b,#8
+ eor w12,w12,w13
+ ext v6.16b,v6.16b,v6.16b,#8
+ eor w9,w9,w14
+ ext v10.16b,v10.16b,v10.16b,#8
+ ror w10,w10,#25
+ ext v14.16b,v14.16b,v14.16b,#8
+ ror w11,w11,#25
+ ext v18.16b,v18.16b,v18.16b,#8
+ ror w12,w12,#25
+ ext v22.16b,v22.16b,v22.16b,#8
+ ror w9,w9,#25
+ ext v3.16b,v3.16b,v3.16b,#12
+ ext v7.16b,v7.16b,v7.16b,#12
+ ext v11.16b,v11.16b,v11.16b,#12
+ ext v15.16b,v15.16b,v15.16b,#12
+ ext v19.16b,v19.16b,v19.16b,#12
+ ext v23.16b,v23.16b,v23.16b,#12
+ ext v1.16b,v1.16b,v1.16b,#4
+ ext v5.16b,v5.16b,v5.16b,#4
+ ext v9.16b,v9.16b,v9.16b,#4
+ ext v13.16b,v13.16b,v13.16b,#4
+ ext v17.16b,v17.16b,v17.16b,#4
+ ext v21.16b,v21.16b,v21.16b,#4
+ add v0.4s,v0.4s,v1.4s
+ add w5,w5,w9
+ add v4.4s,v4.4s,v5.4s
+ add w6,w6,w10
+ add v8.4s,v8.4s,v9.4s
+ add w7,w7,w11
+ add v12.4s,v12.4s,v13.4s
+ add w8,w8,w12
+ add v16.4s,v16.4s,v17.4s
+ eor w17,w17,w5
+ add v20.4s,v20.4s,v21.4s
+ eor w19,w19,w6
+ eor v3.16b,v3.16b,v0.16b
+ eor w20,w20,w7
+ eor v7.16b,v7.16b,v4.16b
+ eor w21,w21,w8
+ eor v11.16b,v11.16b,v8.16b
+ ror w17,w17,#16
+ eor v15.16b,v15.16b,v12.16b
+ ror w19,w19,#16
+ eor v19.16b,v19.16b,v16.16b
+ ror w20,w20,#16
+ eor v23.16b,v23.16b,v20.16b
+ ror w21,w21,#16
+ rev32 v3.8h,v3.8h
+ add w13,w13,w17
+ rev32 v7.8h,v7.8h
+ add w14,w14,w19
+ rev32 v11.8h,v11.8h
+ add w15,w15,w20
+ rev32 v15.8h,v15.8h
+ add w16,w16,w21
+ rev32 v19.8h,v19.8h
+ eor w9,w9,w13
+ rev32 v23.8h,v23.8h
+ eor w10,w10,w14
+ add v2.4s,v2.4s,v3.4s
+ eor w11,w11,w15
+ add v6.4s,v6.4s,v7.4s
+ eor w12,w12,w16
+ add v10.4s,v10.4s,v11.4s
+ ror w9,w9,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w10,w10,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w11,w11,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w12,w12,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w9
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w10
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w11
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w12
+ eor v28.16b,v17.16b,v18.16b
+ eor w17,w17,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w19,w19,w6
+ ushr v1.4s,v24.4s,#20
+ eor w20,w20,w7
+ ushr v5.4s,v25.4s,#20
+ eor w21,w21,w8
+ ushr v9.4s,v26.4s,#20
+ ror w17,w17,#24
+ ushr v13.4s,v27.4s,#20
+ ror w19,w19,#24
+ ushr v17.4s,v28.4s,#20
+ ror w20,w20,#24
+ ushr v21.4s,v29.4s,#20
+ ror w21,w21,#24
+ sli v1.4s,v24.4s,#12
+ add w13,w13,w17
+ sli v5.4s,v25.4s,#12
+ add w14,w14,w19
+ sli v9.4s,v26.4s,#12
+ add w15,w15,w20
+ sli v13.4s,v27.4s,#12
+ add w16,w16,w21
+ sli v17.4s,v28.4s,#12
+ eor w9,w9,w13
+ sli v21.4s,v29.4s,#12
+ eor w10,w10,w14
+ add v0.4s,v0.4s,v1.4s
+ eor w11,w11,w15
+ add v4.4s,v4.4s,v5.4s
+ eor w12,w12,w16
+ add v8.4s,v8.4s,v9.4s
+ ror w9,w9,#25
+ add v12.4s,v12.4s,v13.4s
+ ror w10,w10,#25
+ add v16.4s,v16.4s,v17.4s
+ ror w11,w11,#25
+ add v20.4s,v20.4s,v21.4s
+ ror w12,w12,#25
+ eor v24.16b,v3.16b,v0.16b
+ add w5,w5,w10
+ eor v25.16b,v7.16b,v4.16b
+ add w6,w6,w11
+ eor v26.16b,v11.16b,v8.16b
+ add w7,w7,w12
+ eor v27.16b,v15.16b,v12.16b
+ add w8,w8,w9
+ eor v28.16b,v19.16b,v16.16b
+ eor w21,w21,w5
+ eor v29.16b,v23.16b,v20.16b
+ eor w17,w17,w6
+ ushr v3.4s,v24.4s,#24
+ eor w19,w19,w7
+ ushr v7.4s,v25.4s,#24
+ eor w20,w20,w8
+ ushr v11.4s,v26.4s,#24
+ ror w21,w21,#16
+ ushr v15.4s,v27.4s,#24
+ ror w17,w17,#16
+ ushr v19.4s,v28.4s,#24
+ ror w19,w19,#16
+ ushr v23.4s,v29.4s,#24
+ ror w20,w20,#16
+ sli v3.4s,v24.4s,#8
+ add w15,w15,w21
+ sli v7.4s,v25.4s,#8
+ add w16,w16,w17
+ sli v11.4s,v26.4s,#8
+ add w13,w13,w19
+ sli v15.4s,v27.4s,#8
+ add w14,w14,w20
+ sli v19.4s,v28.4s,#8
+ eor w10,w10,w15
+ sli v23.4s,v29.4s,#8
+ eor w11,w11,w16
+ add v2.4s,v2.4s,v3.4s
+ eor w12,w12,w13
+ add v6.4s,v6.4s,v7.4s
+ eor w9,w9,w14
+ add v10.4s,v10.4s,v11.4s
+ ror w10,w10,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w11,w11,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w12,w12,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w9,w9,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w10
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w11
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w12
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w9
+ eor v28.16b,v17.16b,v18.16b
+ eor w21,w21,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w17,w17,w6
+ ushr v1.4s,v24.4s,#25
+ eor w19,w19,w7
+ ushr v5.4s,v25.4s,#25
+ eor w20,w20,w8
+ ushr v9.4s,v26.4s,#25
+ ror w21,w21,#24
+ ushr v13.4s,v27.4s,#25
+ ror w17,w17,#24
+ ushr v17.4s,v28.4s,#25
+ ror w19,w19,#24
+ ushr v21.4s,v29.4s,#25
+ ror w20,w20,#24
+ sli v1.4s,v24.4s,#7
+ add w15,w15,w21
+ sli v5.4s,v25.4s,#7
+ add w16,w16,w17
+ sli v9.4s,v26.4s,#7
+ add w13,w13,w19
+ sli v13.4s,v27.4s,#7
+ add w14,w14,w20
+ sli v17.4s,v28.4s,#7
+ eor w10,w10,w15
+ sli v21.4s,v29.4s,#7
+ eor w11,w11,w16
+ ext v2.16b,v2.16b,v2.16b,#8
+ eor w12,w12,w13
+ ext v6.16b,v6.16b,v6.16b,#8
+ eor w9,w9,w14
+ ext v10.16b,v10.16b,v10.16b,#8
+ ror w10,w10,#25
+ ext v14.16b,v14.16b,v14.16b,#8
+ ror w11,w11,#25
+ ext v18.16b,v18.16b,v18.16b,#8
+ ror w12,w12,#25
+ ext v22.16b,v22.16b,v22.16b,#8
+ ror w9,w9,#25
+ ext v3.16b,v3.16b,v3.16b,#4
+ ext v7.16b,v7.16b,v7.16b,#4
+ ext v11.16b,v11.16b,v11.16b,#4
+ ext v15.16b,v15.16b,v15.16b,#4
+ ext v19.16b,v19.16b,v19.16b,#4
+ ext v23.16b,v23.16b,v23.16b,#4
+ ext v1.16b,v1.16b,v1.16b,#12
+ ext v5.16b,v5.16b,v5.16b,#12
+ ext v9.16b,v9.16b,v9.16b,#12
+ ext v13.16b,v13.16b,v13.16b,#12
+ ext v17.16b,v17.16b,v17.16b,#12
+ ext v21.16b,v21.16b,v21.16b,#12
+ cbnz x4,.Loop_lower_neon
+
+ add w5,w5,w22 // accumulate key block
+ ldp q24,q25,[sp,#0]
+ add x6,x6,x22,lsr#32
+ ldp q26,q27,[sp,#32]
+ add w7,w7,w23
+ ldp q28,q29,[sp,#64]
+ add x8,x8,x23,lsr#32
+ add v0.4s,v0.4s,v24.4s
+ add w9,w9,w24
+ add v4.4s,v4.4s,v24.4s
+ add x10,x10,x24,lsr#32
+ add v8.4s,v8.4s,v24.4s
+ add w11,w11,w25
+ add v12.4s,v12.4s,v24.4s
+ add x12,x12,x25,lsr#32
+ add v16.4s,v16.4s,v24.4s
+ add w13,w13,w26
+ add v20.4s,v20.4s,v24.4s
+ add x14,x14,x26,lsr#32
+ add v2.4s,v2.4s,v26.4s
+ add w15,w15,w27
+ add v6.4s,v6.4s,v26.4s
+ add x16,x16,x27,lsr#32
+ add v10.4s,v10.4s,v26.4s
+ add w17,w17,w28
+ add v14.4s,v14.4s,v26.4s
+ add x19,x19,x28,lsr#32
+ add v18.4s,v18.4s,v26.4s
+ add w20,w20,w30
+ add v22.4s,v22.4s,v26.4s
+ add x21,x21,x30,lsr#32
+ add v19.4s,v19.4s,v31.4s // +4
+ add x5,x5,x6,lsl#32 // pack
+ add v23.4s,v23.4s,v31.4s // +4
+ add x7,x7,x8,lsl#32
+ add v3.4s,v3.4s,v27.4s
+ ldp x6,x8,[x1,#0] // load input
+ add v7.4s,v7.4s,v28.4s
+ add x9,x9,x10,lsl#32
+ add v11.4s,v11.4s,v29.4s
+ add x11,x11,x12,lsl#32
+ add v15.4s,v15.4s,v30.4s
+ ldp x10,x12,[x1,#16]
+ add v19.4s,v19.4s,v27.4s
+ add x13,x13,x14,lsl#32
+ add v23.4s,v23.4s,v28.4s
+ add x15,x15,x16,lsl#32
+ add v1.4s,v1.4s,v25.4s
+ ldp x14,x16,[x1,#32]
+ add v5.4s,v5.4s,v25.4s
+ add x17,x17,x19,lsl#32
+ add v9.4s,v9.4s,v25.4s
+ add x20,x20,x21,lsl#32
+ add v13.4s,v13.4s,v25.4s
+ ldp x19,x21,[x1,#48]
+ add v17.4s,v17.4s,v25.4s
+ add x1,x1,#64
+ add v21.4s,v21.4s,v25.4s
+
+#ifdef __ARMEB__
+ rev x5,x5
+ rev x7,x7
+ rev x9,x9
+ rev x11,x11
+ rev x13,x13
+ rev x15,x15
+ rev x17,x17
+ rev x20,x20
+#endif
+ ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
+ eor x5,x5,x6
+ eor x7,x7,x8
+ eor x9,x9,x10
+ eor x11,x11,x12
+ eor x13,x13,x14
+ eor v0.16b,v0.16b,v24.16b
+ eor x15,x15,x16
+ eor v1.16b,v1.16b,v25.16b
+ eor x17,x17,x19
+ eor v2.16b,v2.16b,v26.16b
+ eor x20,x20,x21
+ eor v3.16b,v3.16b,v27.16b
+ ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
+
+ stp x5,x7,[x0,#0] // store output
+ add x28,x28,#7 // increment counter
+ stp x9,x11,[x0,#16]
+ stp x13,x15,[x0,#32]
+ stp x17,x20,[x0,#48]
+ add x0,x0,#64
+ st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
+
+ ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
+ eor v4.16b,v4.16b,v24.16b
+ eor v5.16b,v5.16b,v25.16b
+ eor v6.16b,v6.16b,v26.16b
+ eor v7.16b,v7.16b,v27.16b
+ st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
+
+ ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
+ eor v8.16b,v8.16b,v0.16b
+ ldp q24,q25,[sp,#0]
+ eor v9.16b,v9.16b,v1.16b
+ ldp q26,q27,[sp,#32]
+ eor v10.16b,v10.16b,v2.16b
+ eor v11.16b,v11.16b,v3.16b
+ st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64
+
+ ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64
+ eor v12.16b,v12.16b,v4.16b
+ eor v13.16b,v13.16b,v5.16b
+ eor v14.16b,v14.16b,v6.16b
+ eor v15.16b,v15.16b,v7.16b
+ st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64
+
+ ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64
+ eor v16.16b,v16.16b,v8.16b
+ eor v17.16b,v17.16b,v9.16b
+ eor v18.16b,v18.16b,v10.16b
+ eor v19.16b,v19.16b,v11.16b
+ st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
+
+ shl v0.4s,v31.4s,#1 // 4 -> 8
+ eor v20.16b,v20.16b,v12.16b
+ eor v21.16b,v21.16b,v13.16b
+ eor v22.16b,v22.16b,v14.16b
+ eor v23.16b,v23.16b,v15.16b
+ st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
+
+ add v27.4s,v27.4s,v0.4s // += 8
+ add v28.4s,v28.4s,v0.4s
+ add v29.4s,v29.4s,v0.4s
+ add v30.4s,v30.4s,v0.4s
+
+ b.hs .Loop_outer_512_neon
+
+ adds x2,x2,#512
+ ushr v0.4s,v31.4s,#2 // 4 -> 1
+
+ ldp d8,d9,[sp,#128+0] // meet ABI requirements
+ ldp d10,d11,[sp,#128+16]
+ ldp d12,d13,[sp,#128+32]
+ ldp d14,d15,[sp,#128+48]
+
+ stp q24,q31,[sp,#0] // wipe off-load area
+ stp q24,q31,[sp,#32]
+ stp q24,q31,[sp,#64]
+
+ b.eq .Ldone_512_neon
+
+ cmp x2,#192
+ sub v27.4s,v27.4s,v0.4s // -= 1
+ sub v28.4s,v28.4s,v0.4s
+ sub v29.4s,v29.4s,v0.4s
+ add sp,sp,#128
+ b.hs .Loop_outer_neon
+
+ eor v25.16b,v25.16b,v25.16b
+ eor v26.16b,v26.16b,v26.16b
+ eor v27.16b,v27.16b,v27.16b
+ eor v28.16b,v28.16b,v28.16b
+ eor v29.16b,v29.16b,v29.16b
+ eor v30.16b,v30.16b,v30.16b
+ b .Loop_outer
+
+.Ldone_512_neon:
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#128+64
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+ ret
+.size ChaCha20_512_neon,.-ChaCha20_512_neon
--
2.19.0
* [PATCH net-next v6 06/23] zinc: port Andy Polyakov's ChaCha20 ARM and ARM64 implementations
[not found] <20180925145622.29959-1-Jason@zx2c4.com>
2018-09-25 14:56 ` [PATCH net-next v6 05/23] zinc: import Andy Polyakov's ChaCha20 ARM and ARM64 implementations Jason A. Donenfeld
@ 2018-09-25 14:56 ` Jason A. Donenfeld
2018-09-25 14:56 ` [PATCH net-next v6 07/23] zinc: " Jason A. Donenfeld
` (3 subsequent siblings)
5 siblings, 0 replies; 47+ messages in thread
From: Jason A. Donenfeld @ 2018-09-25 14:56 UTC (permalink / raw)
To: linux-arm-kernel
These port and prepare Andy Polyakov's implementations for the kernel,
but don't actually wire up any of the code yet. The wiring will be done
in a subsequent commit, since we'll need to merge these implementations
with another one. We make a few small changes to the assembly:
- Entries and exits use the proper kernel convention macros (ENTRY and
ENDPROC).
- CPU feature checking is done in C by the glue code (roughly as sketched
after this list), so it has been removed from the assembly.
- The function names have been renamed to fit kernel conventions.
- Labels have been renamed (prefixed with .L) to fit kernel conventions.
- Constants have been rearranged so that they are closer to the code
that is using them. [ARM only]
- The NEON code can jump to the scalar code when it makes sense to do
so.
- The separate neon_512 function has been removed, leaving the decision
up to the main NEON entry point. [ARM64 only]
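As a rough illustration of the CPU feature checking point above: the
C-side dispatch that replaces the assembly's OPENSSL_armcap_P probe ends
up looking approximately like the sketch below. This is not part of this
patch (the actual glue code arrives in a subsequent commit), and the
names chacha20_dispatch and have_neon are placeholders used purely for
illustration; chacha20_arm and chacha20_neon are the entry points
introduced here.

#include <linux/linkage.h>
#include <linux/types.h>
#include <asm/neon.h>
#include <asm/simd.h>

asmlinkage void chacha20_arm(u8 *out, const u8 *in, size_t len,
                             const u32 key[8], const u32 counter[4]);
asmlinkage void chacha20_neon(u8 *out, const u8 *in, size_t len,
                              const u32 key[8], const u32 counter[4]);

/* Set once at init time, e.g. from elf_hwcap & HWCAP_NEON (HWCAP_ASIMD
 * on arm64), instead of having the assembly read OPENSSL_armcap_P. */
static bool have_neon;

static void chacha20_dispatch(u8 *out, const u8 *in, size_t len,
                              const u32 key[8], const u32 counter[4])
{
#if defined(CONFIG_KERNEL_MODE_NEON)
        /* Only use NEON when the CPU has it and SIMD use is allowed
         * in the current context. */
        if (have_neon && may_use_simd()) {
                kernel_neon_begin();
                chacha20_neon(out, in, len, key, counter);
                kernel_neon_end();
                return;
        }
#endif
        chacha20_arm(out, in, len, key, counter);
}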
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Cc: Samuel Neves <sneves@dei.uc.pt>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Jean-Philippe Aumasson <jeanphilippe.aumasson@gmail.com>
Cc: Russell King <linux@armlinux.org.uk>
Cc: linux-arm-kernel at lists.infradead.org
---
lib/zinc/chacha20/chacha20-arm-cryptogams.S | 367 +++++++++---------
lib/zinc/chacha20/chacha20-arm64-cryptogams.S | 57 +--
2 files changed, 193 insertions(+), 231 deletions(-)
diff --git a/lib/zinc/chacha20/chacha20-arm-cryptogams.S b/lib/zinc/chacha20/chacha20-arm-cryptogams.S
index 05a3a9e6e93f..770bab469171 100644
--- a/lib/zinc/chacha20/chacha20-arm-cryptogams.S
+++ b/lib/zinc/chacha20/chacha20-arm-cryptogams.S
@@ -1,9 +1,12 @@
/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
/*
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
* Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
+ *
+ * This is based in part on Andy Polyakov's implementation from CRYPTOGAMS.
*/
-#include "arm_arch.h"
+#include <linux/linkage.h>
.text
#if defined(__thumb2__) || defined(__clang__)
@@ -24,48 +27,25 @@
.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 @ endian-neutral
.Lone:
.long 1,0,0,0
-.Lrot8:
-.long 0x02010003,0x06050407
-#if __ARM_MAX_ARCH__>=7
-.LOPENSSL_armcap:
-.word OPENSSL_armcap_P-.LChaCha20_ctr32
-#else
.word -1
-#endif
-.globl ChaCha20_ctr32
-.type ChaCha20_ctr32,%function
.align 5
-ChaCha20_ctr32:
-.LChaCha20_ctr32:
+ENTRY(chacha20_arm)
ldr r12,[sp,#0] @ pull pointer to counter and nonce
stmdb sp!,{r0-r2,r4-r11,lr}
-#if __ARM_ARCH__<7 && !defined(__thumb2__)
- sub r14,pc,#16 @ ChaCha20_ctr32
-#else
- adr r14,.LChaCha20_ctr32
-#endif
cmp r2,#0 @ len==0?
-#ifdef __thumb2__
+#ifdef __thumb2__
itt eq
#endif
addeq sp,sp,#4*3
- beq .Lno_data
-#if __ARM_MAX_ARCH__>=7
- cmp r2,#192 @ test len
- bls .Lshort
- ldr r4,[r14,#-24]
- ldr r4,[r14,r4]
-# ifdef __APPLE__
- ldr r4,[r4]
-# endif
- tst r4,#ARMV7_NEON
- bne .LChaCha20_neon
-.Lshort:
-#endif
+ beq .Lno_data_arm
ldmia r12,{r4-r7} @ load counter and nonce
sub sp,sp,#4*(16) @ off-load area
- sub r14,r14,#64 @ .Lsigma
+#if __LINUX_ARM_ARCH__ < 7 && !defined(__thumb2__)
+ sub r14,pc,#100 @ .Lsigma
+#else
+ adr r14,.Lsigma @ .Lsigma
+#endif
stmdb sp!,{r4-r7} @ copy counter and nonce
ldmia r3,{r4-r11} @ load key
ldmia r14,{r0-r3} @ load sigma
@@ -191,7 +171,7 @@ ChaCha20_ctr32:
@ rx and second half@sp+4*(16+8)
cmp r11,#64 @ done yet?
-#ifdef __thumb2__
+#ifdef __thumb2__
itete lo
#endif
addlo r12,sp,#4*(0) @ shortcut or ...
@@ -202,49 +182,49 @@ ChaCha20_ctr32:
ldr r8,[sp,#4*(0)] @ load key material
ldr r9,[sp,#4*(1)]
-#if __ARM_ARCH__>=6 || !defined(__ARMEB__)
-# if __ARM_ARCH__<7
+#if __LINUX_ARM_ARCH__ >= 6 || !defined(__ARMEB__)
+#if __LINUX_ARM_ARCH__ < 7
orr r10,r12,r14
tst r10,#3 @ are input and output aligned?
ldr r10,[sp,#4*(2)]
bne .Lunaligned
cmp r11,#64 @ restore flags
-# else
+#else
ldr r10,[sp,#4*(2)]
-# endif
+#endif
ldr r11,[sp,#4*(3)]
add r0,r0,r8 @ accumulate key material
add r1,r1,r9
-# ifdef __thumb2__
+#ifdef __thumb2__
itt hs
-# endif
+#endif
ldrhs r8,[r12],#16 @ load input
ldrhs r9,[r12,#-12]
add r2,r2,r10
add r3,r3,r11
-# ifdef __thumb2__
+#ifdef __thumb2__
itt hs
-# endif
+#endif
ldrhs r10,[r12,#-8]
ldrhs r11,[r12,#-4]
-# if __ARM_ARCH__>=6 && defined(__ARMEB__)
+#if __LINUX_ARM_ARCH__ >= 6 && defined(__ARMEB__)
rev r0,r0
rev r1,r1
rev r2,r2
rev r3,r3
-# endif
-# ifdef __thumb2__
+#endif
+#ifdef __thumb2__
itt hs
-# endif
+#endif
eorhs r0,r0,r8 @ xor with input
eorhs r1,r1,r9
add r8,sp,#4*(4)
str r0,[r14],#16 @ store output
-# ifdef __thumb2__
+#ifdef __thumb2__
itt hs
-# endif
+#endif
eorhs r2,r2,r10
eorhs r3,r3,r11
ldmia r8,{r8-r11} @ load key material
@@ -254,34 +234,34 @@ ChaCha20_ctr32:
add r4,r8,r4,ror#13 @ accumulate key material
add r5,r9,r5,ror#13
-# ifdef __thumb2__
+#ifdef __thumb2__
itt hs
-# endif
+#endif
ldrhs r8,[r12],#16 @ load input
ldrhs r9,[r12,#-12]
add r6,r10,r6,ror#13
add r7,r11,r7,ror#13
-# ifdef __thumb2__
+#ifdef __thumb2__
itt hs
-# endif
+#endif
ldrhs r10,[r12,#-8]
ldrhs r11,[r12,#-4]
-# if __ARM_ARCH__>=6 && defined(__ARMEB__)
+#if __LINUX_ARM_ARCH__ >= 6 && defined(__ARMEB__)
rev r4,r4
rev r5,r5
rev r6,r6
rev r7,r7
-# endif
-# ifdef __thumb2__
+#endif
+#ifdef __thumb2__
itt hs
-# endif
+#endif
eorhs r4,r4,r8
eorhs r5,r5,r9
add r8,sp,#4*(8)
str r4,[r14],#16 @ store output
-# ifdef __thumb2__
+#ifdef __thumb2__
itt hs
-# endif
+#endif
eorhs r6,r6,r10
eorhs r7,r7,r11
str r5,[r14,#-12]
@@ -294,39 +274,39 @@ ChaCha20_ctr32:
add r0,r0,r8 @ accumulate key material
add r1,r1,r9
-# ifdef __thumb2__
+#ifdef __thumb2__
itt hs
-# endif
+#endif
ldrhs r8,[r12],#16 @ load input
ldrhs r9,[r12,#-12]
-# ifdef __thumb2__
+#ifdef __thumb2__
itt hi
-# endif
+#endif
strhi r10,[sp,#4*(16+10)] @ copy "rx" while at it
strhi r11,[sp,#4*(16+11)] @ copy "rx" while at it
add r2,r2,r10
add r3,r3,r11
-# ifdef __thumb2__
+#ifdef __thumb2__
itt hs
-# endif
+#endif
ldrhs r10,[r12,#-8]
ldrhs r11,[r12,#-4]
-# if __ARM_ARCH__>=6 && defined(__ARMEB__)
+#if __LINUX_ARM_ARCH__ >= 6 && defined(__ARMEB__)
rev r0,r0
rev r1,r1
rev r2,r2
rev r3,r3
-# endif
-# ifdef __thumb2__
+#endif
+#ifdef __thumb2__
itt hs
-# endif
+#endif
eorhs r0,r0,r8
eorhs r1,r1,r9
add r8,sp,#4*(12)
str r0,[r14],#16 @ store output
-# ifdef __thumb2__
+#ifdef __thumb2__
itt hs
-# endif
+#endif
eorhs r2,r2,r10
eorhs r3,r3,r11
str r1,[r14,#-12]
@@ -336,79 +316,79 @@ ChaCha20_ctr32:
add r4,r8,r4,ror#24 @ accumulate key material
add r5,r9,r5,ror#24
-# ifdef __thumb2__
+#ifdef __thumb2__
itt hi
-# endif
+#endif
addhi r8,r8,#1 @ next counter value
strhi r8,[sp,#4*(12)] @ save next counter value
-# ifdef __thumb2__
+#ifdef __thumb2__
itt hs
-# endif
+#endif
ldrhs r8,[r12],#16 @ load input
ldrhs r9,[r12,#-12]
add r6,r10,r6,ror#24
add r7,r11,r7,ror#24
-# ifdef __thumb2__
+#ifdef __thumb2__
itt hs
-# endif
+#endif
ldrhs r10,[r12,#-8]
ldrhs r11,[r12,#-4]
-# if __ARM_ARCH__>=6 && defined(__ARMEB__)
+#if __LINUX_ARM_ARCH__ >= 6 && defined(__ARMEB__)
rev r4,r4
rev r5,r5
rev r6,r6
rev r7,r7
-# endif
-# ifdef __thumb2__
+#endif
+#ifdef __thumb2__
itt hs
-# endif
+#endif
eorhs r4,r4,r8
eorhs r5,r5,r9
-# ifdef __thumb2__
+#ifdef __thumb2__
it ne
-# endif
+#endif
ldrne r8,[sp,#4*(32+2)] @ re-load len
-# ifdef __thumb2__
+#ifdef __thumb2__
itt hs
-# endif
+#endif
eorhs r6,r6,r10
eorhs r7,r7,r11
str r4,[r14],#16 @ store output
str r5,[r14,#-12]
-# ifdef __thumb2__
+#ifdef __thumb2__
it hs
-# endif
+#endif
subhs r11,r8,#64 @ len-=64
str r6,[r14,#-8]
str r7,[r14,#-4]
bhi .Loop_outer
beq .Ldone
-# if __ARM_ARCH__<7
+#if __LINUX_ARM_ARCH__ < 7
b .Ltail
.align 4
.Lunaligned: @ unaligned endian-neutral path
cmp r11,#64 @ restore flags
-# endif
#endif
-#if __ARM_ARCH__<7
+#endif
+#if __LINUX_ARM_ARCH__ < 7
ldr r11,[sp,#4*(3)]
add r0,r8,r0 @ accumulate key material
add r1,r9,r1
add r2,r10,r2
-# ifdef __thumb2__
+#ifdef __thumb2__
itete lo
-# endif
+#endif
eorlo r8,r8,r8 @ zero or ...
ldrhsb r8,[r12],#16 @ ... load input
eorlo r9,r9,r9
ldrhsb r9,[r12,#-12]
add r3,r11,r3
-# ifdef __thumb2__
+#ifdef __thumb2__
itete lo
-# endif
+#endif
eorlo r10,r10,r10
ldrhsb r10,[r12,#-8]
eorlo r11,r11,r11
@@ -416,53 +396,53 @@ ChaCha20_ctr32:
eor r0,r8,r0 @ xor with input (or zero)
eor r1,r9,r1
-# ifdef __thumb2__
+#ifdef __thumb2__
itt hs
-# endif
+#endif
ldrhsb r8,[r12,#-15] @ load more input
ldrhsb r9,[r12,#-11]
eor r2,r10,r2
strb r0,[r14],#16 @ store output
eor r3,r11,r3
-# ifdef __thumb2__
+#ifdef __thumb2__
itt hs
-# endif
+#endif
ldrhsb r10,[r12,#-7]
ldrhsb r11,[r12,#-3]
strb r1,[r14,#-12]
eor r0,r8,r0,lsr#8
strb r2,[r14,#-8]
eor r1,r9,r1,lsr#8
-# ifdef __thumb2__
+#ifdef __thumb2__
itt hs
-# endif
+#endif
ldrhsb r8,[r12,#-14] @ load more input
ldrhsb r9,[r12,#-10]
strb r3,[r14,#-4]
eor r2,r10,r2,lsr#8
strb r0,[r14,#-15]
eor r3,r11,r3,lsr#8
-# ifdef __thumb2__
+#ifdef __thumb2__
itt hs
-# endif
+#endif
ldrhsb r10,[r12,#-6]
ldrhsb r11,[r12,#-2]
strb r1,[r14,#-11]
eor r0,r8,r0,lsr#8
strb r2,[r14,#-7]
eor r1,r9,r1,lsr#8
-# ifdef __thumb2__
+#ifdef __thumb2__
itt hs
-# endif
+#endif
ldrhsb r8,[r12,#-13] @ load more input
ldrhsb r9,[r12,#-9]
strb r3,[r14,#-3]
eor r2,r10,r2,lsr#8
strb r0,[r14,#-14]
eor r3,r11,r3,lsr#8
-# ifdef __thumb2__
+#ifdef __thumb2__
itt hs
-# endif
+#endif
ldrhsb r10,[r12,#-5]
ldrhsb r11,[r12,#-1]
strb r1,[r14,#-10]
@@ -482,18 +462,18 @@ ChaCha20_ctr32:
add r4,r8,r4,ror#13 @ accumulate key material
add r5,r9,r5,ror#13
add r6,r10,r6,ror#13
-# ifdef __thumb2__
+#ifdef __thumb2__
itete lo
-# endif
+#endif
eorlo r8,r8,r8 @ zero or ...
ldrhsb r8,[r12],#16 @ ... load input
eorlo r9,r9,r9
ldrhsb r9,[r12,#-12]
add r7,r11,r7,ror#13
-# ifdef __thumb2__
+#ifdef __thumb2__
itete lo
-# endif
+#endif
eorlo r10,r10,r10
ldrhsb r10,[r12,#-8]
eorlo r11,r11,r11
@@ -501,53 +481,53 @@ ChaCha20_ctr32:
eor r4,r8,r4 @ xor with input (or zero)
eor r5,r9,r5
-# ifdef __thumb2__
+#ifdef __thumb2__
itt hs
-# endif
+#endif
ldrhsb r8,[r12,#-15] @ load more input
ldrhsb r9,[r12,#-11]
eor r6,r10,r6
strb r4,[r14],#16 @ store output
eor r7,r11,r7
-# ifdef __thumb2__
+#ifdef __thumb2__
itt hs
-# endif
+#endif
ldrhsb r10,[r12,#-7]
ldrhsb r11,[r12,#-3]
strb r5,[r14,#-12]
eor r4,r8,r4,lsr#8
strb r6,[r14,#-8]
eor r5,r9,r5,lsr#8
-# ifdef __thumb2__
+#ifdef __thumb2__
itt hs
-# endif
+#endif
ldrhsb r8,[r12,#-14] @ load more input
ldrhsb r9,[r12,#-10]
strb r7,[r14,#-4]
eor r6,r10,r6,lsr#8
strb r4,[r14,#-15]
eor r7,r11,r7,lsr#8
-# ifdef __thumb2__
+#ifdef __thumb2__
itt hs
-# endif
+#endif
ldrhsb r10,[r12,#-6]
ldrhsb r11,[r12,#-2]
strb r5,[r14,#-11]
eor r4,r8,r4,lsr#8
strb r6,[r14,#-7]
eor r5,r9,r5,lsr#8
-# ifdef __thumb2__
+#ifdef __thumb2__
itt hs
-# endif
+#endif
ldrhsb r8,[r12,#-13] @ load more input
ldrhsb r9,[r12,#-9]
strb r7,[r14,#-3]
eor r6,r10,r6,lsr#8
strb r4,[r14,#-14]
eor r7,r11,r7,lsr#8
-# ifdef __thumb2__
+#ifdef __thumb2__
itt hs
-# endif
+#endif
ldrhsb r10,[r12,#-5]
ldrhsb r11,[r12,#-1]
strb r5,[r14,#-10]
@@ -564,26 +544,26 @@ ChaCha20_ctr32:
add r8,sp,#4*(4+4)
ldmia r8,{r8-r11} @ load key material
ldmia r0,{r0-r7} @ load second half
-# ifdef __thumb2__
+#ifdef __thumb2__
itt hi
-# endif
+#endif
strhi r10,[sp,#4*(16+10)] @ copy "rx"
strhi r11,[sp,#4*(16+11)] @ copy "rx"
add r0,r8,r0 @ accumulate key material
add r1,r9,r1
add r2,r10,r2
-# ifdef __thumb2__
+#ifdef __thumb2__
itete lo
-# endif
+#endif
eorlo r8,r8,r8 @ zero or ...
ldrhsb r8,[r12],#16 @ ... load input
eorlo r9,r9,r9
ldrhsb r9,[r12,#-12]
add r3,r11,r3
-# ifdef __thumb2__
+#ifdef __thumb2__
itete lo
-# endif
+#endif
eorlo r10,r10,r10
ldrhsb r10,[r12,#-8]
eorlo r11,r11,r11
@@ -591,53 +571,53 @@ ChaCha20_ctr32:
eor r0,r8,r0 @ xor with input (or zero)
eor r1,r9,r1
-# ifdef __thumb2__
+#ifdef __thumb2__
itt hs
-# endif
+#endif
ldrhsb r8,[r12,#-15] @ load more input
ldrhsb r9,[r12,#-11]
eor r2,r10,r2
strb r0,[r14],#16 @ store output
eor r3,r11,r3
-# ifdef __thumb2__
+#ifdef __thumb2__
itt hs
-# endif
+#endif
ldrhsb r10,[r12,#-7]
ldrhsb r11,[r12,#-3]
strb r1,[r14,#-12]
eor r0,r8,r0,lsr#8
strb r2,[r14,#-8]
eor r1,r9,r1,lsr#8
-# ifdef __thumb2__
+#ifdef __thumb2__
itt hs
-# endif
+#endif
ldrhsb r8,[r12,#-14] @ load more input
ldrhsb r9,[r12,#-10]
strb r3,[r14,#-4]
eor r2,r10,r2,lsr#8
strb r0,[r14,#-15]
eor r3,r11,r3,lsr#8
-# ifdef __thumb2__
+#ifdef __thumb2__
itt hs
-# endif
+#endif
ldrhsb r10,[r12,#-6]
ldrhsb r11,[r12,#-2]
strb r1,[r14,#-11]
eor r0,r8,r0,lsr#8
strb r2,[r14,#-7]
eor r1,r9,r1,lsr#8
-# ifdef __thumb2__
+#ifdef __thumb2__
itt hs
-# endif
+#endif
ldrhsb r8,[r12,#-13] @ load more input
ldrhsb r9,[r12,#-9]
strb r3,[r14,#-3]
eor r2,r10,r2,lsr#8
strb r0,[r14,#-14]
eor r3,r11,r3,lsr#8
-# ifdef __thumb2__
+#ifdef __thumb2__
itt hs
-# endif
+#endif
ldrhsb r10,[r12,#-5]
ldrhsb r11,[r12,#-1]
strb r1,[r14,#-10]
@@ -654,25 +634,25 @@ ChaCha20_ctr32:
add r8,sp,#4*(4+8)
ldmia r8,{r8-r11} @ load key material
add r4,r8,r4,ror#24 @ accumulate key material
-# ifdef __thumb2__
+#ifdef __thumb2__
itt hi
-# endif
+#endif
addhi r8,r8,#1 @ next counter value
strhi r8,[sp,#4*(12)] @ save next counter value
add r5,r9,r5,ror#24
add r6,r10,r6,ror#24
-# ifdef __thumb2__
+#ifdef __thumb2__
itete lo
-# endif
+#endif
eorlo r8,r8,r8 @ zero or ...
ldrhsb r8,[r12],#16 @ ... load input
eorlo r9,r9,r9
ldrhsb r9,[r12,#-12]
add r7,r11,r7,ror#24
-# ifdef __thumb2__
+#ifdef __thumb2__
itete lo
-# endif
+#endif
eorlo r10,r10,r10
ldrhsb r10,[r12,#-8]
eorlo r11,r11,r11
@@ -680,53 +660,53 @@ ChaCha20_ctr32:
eor r4,r8,r4 @ xor with input (or zero)
eor r5,r9,r5
-# ifdef __thumb2__
+#ifdef __thumb2__
itt hs
-# endif
+#endif
ldrhsb r8,[r12,#-15] @ load more input
ldrhsb r9,[r12,#-11]
eor r6,r10,r6
strb r4,[r14],#16 @ store output
eor r7,r11,r7
-# ifdef __thumb2__
+#ifdef __thumb2__
itt hs
-# endif
+#endif
ldrhsb r10,[r12,#-7]
ldrhsb r11,[r12,#-3]
strb r5,[r14,#-12]
eor r4,r8,r4,lsr#8
strb r6,[r14,#-8]
eor r5,r9,r5,lsr#8
-# ifdef __thumb2__
+#ifdef __thumb2__
itt hs
-# endif
+#endif
ldrhsb r8,[r12,#-14] @ load more input
ldrhsb r9,[r12,#-10]
strb r7,[r14,#-4]
eor r6,r10,r6,lsr#8
strb r4,[r14,#-15]
eor r7,r11,r7,lsr#8
-# ifdef __thumb2__
+#ifdef __thumb2__
itt hs
-# endif
+#endif
ldrhsb r10,[r12,#-6]
ldrhsb r11,[r12,#-2]
strb r5,[r14,#-11]
eor r4,r8,r4,lsr#8
strb r6,[r14,#-7]
eor r5,r9,r5,lsr#8
-# ifdef __thumb2__
+#ifdef __thumb2__
itt hs
-# endif
+#endif
ldrhsb r8,[r12,#-13] @ load more input
ldrhsb r9,[r12,#-9]
strb r7,[r14,#-3]
eor r6,r10,r6,lsr#8
strb r4,[r14,#-14]
eor r7,r11,r7,lsr#8
-# ifdef __thumb2__
+#ifdef __thumb2__
itt hs
-# endif
+#endif
ldrhsb r10,[r12,#-5]
ldrhsb r11,[r12,#-1]
strb r5,[r14,#-10]
@@ -740,13 +720,13 @@ ChaCha20_ctr32:
eor r7,r11,r7,lsr#8
strb r6,[r14,#-5]
strb r7,[r14,#-1]
-# ifdef __thumb2__
+#ifdef __thumb2__
it ne
-# endif
+#endif
ldrne r8,[sp,#4*(32+2)] @ re-load len
-# ifdef __thumb2__
+#ifdef __thumb2__
it hs
-# endif
+#endif
subhs r11,r8,#64 @ len-=64
bhi .Loop_outer
@@ -768,20 +748,33 @@ ChaCha20_ctr32:
.Ldone:
add sp,sp,#4*(32+3)
-.Lno_data:
+.Lno_data_arm:
ldmia sp!,{r4-r11,pc}
-.size ChaCha20_ctr32,.-ChaCha20_ctr32
-#if __ARM_MAX_ARCH__>=7
+ENDPROC(chacha20_arm)
+
+#ifdef CONFIG_KERNEL_MODE_NEON
+.align 5
+.Lsigma2:
+.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 @ endian-neutral
+.Lone2:
+.long 1,0,0,0
+.word -1
+
.arch armv7-a
.fpu neon
-.type ChaCha20_neon,%function
.align 5
-ChaCha20_neon:
+ENTRY(chacha20_neon)
ldr r12,[sp,#0] @ pull pointer to counter and nonce
stmdb sp!,{r0-r2,r4-r11,lr}
-.LChaCha20_neon:
- adr r14,.Lsigma
+ cmp r2,#0 @ len==0?
+#ifdef __thumb2__
+ itt eq
+#endif
+ addeq sp,sp,#4*3
+ beq .Lno_data_neon
+.Lchacha20_neon_begin:
+ adr r14,.Lsigma2
vstmdb sp!,{d8-d15} @ ABI spec says so
stmdb sp!,{r0-r3}
@@ -1121,12 +1114,12 @@ ChaCha20_neon:
ldr r10,[r12,#-8]
add r3,r3,r11
ldr r11,[r12,#-4]
-# ifdef __ARMEB__
+#ifdef __ARMEB__
rev r0,r0
rev r1,r1
rev r2,r2
rev r3,r3
-# endif
+#endif
eor r0,r0,r8 @ xor with input
add r8,sp,#4*(4)
eor r1,r1,r9
@@ -1146,12 +1139,12 @@ ChaCha20_neon:
ldr r10,[r12,#-8]
add r7,r11,r7,ror#13
ldr r11,[r12,#-4]
-# ifdef __ARMEB__
+#ifdef __ARMEB__
rev r4,r4
rev r5,r5
rev r6,r6
rev r7,r7
-# endif
+#endif
eor r4,r4,r8
add r8,sp,#4*(8)
eor r5,r5,r9
@@ -1170,24 +1163,24 @@ ChaCha20_neon:
ldr r8,[r12],#16 @ load input
add r1,r1,r9
ldr r9,[r12,#-12]
-# ifdef __thumb2__
+#ifdef __thumb2__
it hi
-# endif
+#endif
strhi r10,[sp,#4*(16+10)] @ copy "rx" while at it
add r2,r2,r10
ldr r10,[r12,#-8]
-# ifdef __thumb2__
+#ifdef __thumb2__
it hi
-# endif
+#endif
strhi r11,[sp,#4*(16+11)] @ copy "rx" while at it
add r3,r3,r11
ldr r11,[r12,#-4]
-# ifdef __ARMEB__
+#ifdef __ARMEB__
rev r0,r0
rev r1,r1
rev r2,r2
rev r3,r3
-# endif
+#endif
eor r0,r0,r8
add r8,sp,#4*(12)
eor r1,r1,r9
@@ -1210,16 +1203,16 @@ ChaCha20_neon:
add r7,r11,r7,ror#24
ldr r10,[r12,#-8]
ldr r11,[r12,#-4]
-# ifdef __ARMEB__
+#ifdef __ARMEB__
rev r4,r4
rev r5,r5
rev r6,r6
rev r7,r7
-# endif
+#endif
eor r4,r4,r8
-# ifdef __thumb2__
+#ifdef __thumb2__
it hi
-# endif
+#endif
ldrhi r8,[sp,#4*(32+2)] @ re-load len
eor r5,r5,r9
eor r6,r6,r10
@@ -1379,7 +1372,7 @@ ChaCha20_neon:
add r6,r10,r6,ror#13
add r7,r11,r7,ror#13
ldmia r8,{r8-r11} @ load key material
-# ifdef __ARMEB__
+#ifdef __ARMEB__
rev r0,r0
rev r1,r1
rev r2,r2
@@ -1388,7 +1381,7 @@ ChaCha20_neon:
rev r5,r5
rev r6,r6
rev r7,r7
-# endif
+#endif
stmia sp,{r0-r7}
add r0,sp,#4*(16+8)
@@ -1408,7 +1401,7 @@ ChaCha20_neon:
add r6,r10,r6,ror#24
add r7,r11,r7,ror#24
ldr r11,[sp,#4*(32+2)] @ re-load len
-# ifdef __ARMEB__
+#ifdef __ARMEB__
rev r0,r0
rev r1,r1
rev r2,r2
@@ -1417,7 +1410,7 @@ ChaCha20_neon:
rev r5,r5
rev r6,r6
rev r7,r7
-# endif
+#endif
stmia r8,{r0-r7}
add r10,sp,#4*(0)
sub r11,r11,#64*3 @ len-=64*3
@@ -1434,7 +1427,7 @@ ChaCha20_neon:
add sp,sp,#4*(32+4)
vldmia sp,{d8-d15}
add sp,sp,#4*(16+3)
+.Lno_data_neon:
ldmia sp!,{r4-r11,pc}
-.size ChaCha20_neon,.-ChaCha20_neon
-.comm OPENSSL_armcap_P,4,4
+ENDPROC(chacha20_neon)
#endif
diff --git a/lib/zinc/chacha20/chacha20-arm64-cryptogams.S b/lib/zinc/chacha20/chacha20-arm64-cryptogams.S
index 4d029bfdad3a..5037510edc6f 100644
--- a/lib/zinc/chacha20/chacha20-arm64-cryptogams.S
+++ b/lib/zinc/chacha20/chacha20-arm64-cryptogams.S
@@ -1,46 +1,24 @@
/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
/*
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
* Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
+ *
+ * This is based in part on Andy Polyakov's implementation from CRYPTOGAMS.
*/
-#include "arm_arch.h"
+#include <linux/linkage.h>
.text
-
-
-
.align 5
.Lsigma:
.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral
.Lone:
.long 1,0,0,0
-.LOPENSSL_armcap_P:
-#ifdef __ILP32__
-.long OPENSSL_armcap_P-.
-#else
-.quad OPENSSL_armcap_P-.
-#endif
-.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.align 2
-.globl ChaCha20_ctr32
-.type ChaCha20_ctr32,%function
.align 5
-ChaCha20_ctr32:
+ENTRY(chacha20_arm)
cbz x2,.Labort
- adr x5,.LOPENSSL_armcap_P
- cmp x2,#192
- b.lo .Lshort
-#ifdef __ILP32__
- ldrsw x6,[x5]
-#else
- ldr x6,[x5]
-#endif
- ldr w17,[x6,x5]
- tst w17,#ARMV7_NEON
- b.ne ChaCha20_neon
-.Lshort:
stp x29,x30,[sp,#-96]!
add x29,sp,#0
@@ -309,11 +287,13 @@ ChaCha20_ctr32:
ldp x27,x28,[x29,#80]
ldp x29,x30,[sp],#96
ret
-.size ChaCha20_ctr32,.-ChaCha20_ctr32
+ENDPROC(chacha20_arm)
-.type ChaCha20_neon,%function
+#ifdef CONFIG_KERNEL_MODE_NEON
.align 5
-ChaCha20_neon:
+ENTRY(chacha20_neon)
+ cbz x2,.Labort_neon
+
stp x29,x30,[sp,#-96]!
add x29,sp,#0
@@ -803,19 +783,6 @@ ChaCha20_neon:
ldp x27,x28,[x29,#80]
ldp x29,x30,[sp],#96
ret
-.size ChaCha20_neon,.-ChaCha20_neon
-.type ChaCha20_512_neon,%function
-.align 5
-ChaCha20_512_neon:
- stp x29,x30,[sp,#-96]!
- add x29,sp,#0
-
- adr x5,.Lsigma
- stp x19,x20,[sp,#16]
- stp x21,x22,[sp,#32]
- stp x23,x24,[sp,#48]
- stp x25,x26,[sp,#64]
- stp x27,x28,[sp,#80]
.L512_or_more_neon:
sub sp,sp,#128+64
@@ -1969,5 +1936,7 @@ ChaCha20_512_neon:
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
ldp x29,x30,[sp],#96
+.Labort_neon:
ret
-.size ChaCha20_512_neon,.-ChaCha20_512_neon
+ENDPROC(chacha20_neon)
+#endif
--
2.19.0
* [PATCH net-next v6 07/23] zinc: ChaCha20 ARM and ARM64 implementations
[not found] <20180925145622.29959-1-Jason@zx2c4.com>
2018-09-25 14:56 ` [PATCH net-next v6 05/23] zinc: import Andy Polyakov's ChaCha20 ARM and ARM64 implementations Jason A. Donenfeld
2018-09-25 14:56 ` [PATCH net-next v6 06/23] zinc: port " Jason A. Donenfeld
@ 2018-09-25 14:56 ` Jason A. Donenfeld
2018-09-26 8:59 ` Ard Biesheuvel
2018-09-28 16:01 ` Ard Biesheuvel
2018-09-25 14:56 ` [PATCH net-next v6 11/23] zinc: import Andy Polyakov's Poly1305 " Jason A. Donenfeld
` (2 subsequent siblings)
5 siblings, 2 replies; 47+ messages in thread
From: Jason A. Donenfeld @ 2018-09-25 14:56 UTC (permalink / raw)
To: linux-arm-kernel
These wire Andy Polyakov's NEON implementations up to the kernel for
ARMv7 and ARMv8, and introduce Eric Biggers' ultra-fast scalar
implementation for CPUs without NEON or with slow NEON (Cortex-A5 and
Cortex-A7).
This commit does the following:
- Adds the glue code for the assembly implementations.
- Renames the ARMv8 code into place, since it can at this point be
used wholesale.
- Merges Andy Polyakov's ARMv7 NEON code with Eric Biggers' <=ARMv7
scalar code.
Commit note: Eric Biggers' scalar code is brand new, and was quite
possibly added to this commit prematurely, so it may require a bit of
revision.
This commit delivers performance approximately the same as, or much
better than, the existing crypto API's code, as measured on:
- ARM1176JZF-S [ARMv6]
- Cortex-A7 [ARMv7]
- Cortex-A8 [ARMv7]
- Cortex-A9 [ARMv7]
- Cortex-A17 [ARMv7]
- Cortex-A53 [ARMv8]
- Cortex-A55 [ARMv8]
- Cortex-A73 [ARMv8]
- Cortex-A75 [ARMv8]
Interestingly, Andy Polyakov's scalar code is slower than Eric Biggers',
but it is also significantly shorter. This has the advantage that it does
not evict other code from the L1 cache -- particularly on ARM11 chips --
and so in certain circumstances it can actually be faster. However, this
was not found to have an effect on any code existing in the kernel today.
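For readers following the assembly, every variant being wired up here
(Eric Biggers' scalar code and Andy Polyakov's NEON code alike) computes
the standard ChaCha20 quarter round. The following is only a minimal C
reference for comparison, not part of the patch; the helper name
chacha20_quarter_round is hypothetical.

#include <linux/bitops.h>
#include <linux/types.h>

/* One ChaCha20 quarter round over four words of the 16-word state.
 * The scalar ARM code computes exactly these operations, but keeps
 * rows 'b' and 'd' in a deliberately rotated representation between
 * rounds so that the rotates fold into the 'add' and 'eor' operands. */
static inline void chacha20_quarter_round(u32 x[16], int a, int b, int c, int d)
{
        x[a] += x[b]; x[d] = rol32(x[d] ^ x[a], 16);
        x[c] += x[d]; x[b] = rol32(x[b] ^ x[c], 12);
        x[a] += x[b]; x[d] = rol32(x[d] ^ x[a], 8);
        x[c] += x[d]; x[b] = rol32(x[b] ^ x[c], 7);
}

A double round applies this to the four columns and then the four
diagonals of the 4x4 state matrix; ChaCha20 performs ten such double
rounds per block.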
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Co-authored-by: Eric Biggers <ebiggers@google.com>
Cc: Samuel Neves <sneves@dei.uc.pt>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Jean-Philippe Aumasson <jeanphilippe.aumasson@gmail.com>
Cc: Russell King <linux@armlinux.org.uk>
Cc: linux-arm-kernel at lists.infradead.org
---
lib/zinc/Makefile | 2 +
lib/zinc/chacha20/chacha20-arm-glue.h | 88 +++
...acha20-arm-cryptogams.S => chacha20-arm.S} | 502 ++++++++++++++++--
...20-arm64-cryptogams.S => chacha20-arm64.S} | 0
lib/zinc/chacha20/chacha20.c | 2 +
5 files changed, 556 insertions(+), 38 deletions(-)
create mode 100644 lib/zinc/chacha20/chacha20-arm-glue.h
rename lib/zinc/chacha20/{chacha20-arm-cryptogams.S => chacha20-arm.S} (71%)
rename lib/zinc/chacha20/{chacha20-arm64-cryptogams.S => chacha20-arm64.S} (100%)
diff --git a/lib/zinc/Makefile b/lib/zinc/Makefile
index 223a0816c918..e47f64e12bbd 100644
--- a/lib/zinc/Makefile
+++ b/lib/zinc/Makefile
@@ -4,4 +4,6 @@ ccflags-$(CONFIG_ZINC_DEBUG) += -DDEBUG
zinc_chacha20-y := chacha20/chacha20.o
zinc_chacha20-$(CONFIG_ZINC_ARCH_X86_64) += chacha20/chacha20-x86_64.o
+zinc_chacha20-$(CONFIG_ZINC_ARCH_ARM) += chacha20/chacha20-arm.o
+zinc_chacha20-$(CONFIG_ZINC_ARCH_ARM64) += chacha20/chacha20-arm64.o
obj-$(CONFIG_ZINC_CHACHA20) += zinc_chacha20.o
diff --git a/lib/zinc/chacha20/chacha20-arm-glue.h b/lib/zinc/chacha20/chacha20-arm-glue.h
new file mode 100644
index 000000000000..86cce851ed02
--- /dev/null
+++ b/lib/zinc/chacha20/chacha20-arm-glue.h
@@ -0,0 +1,88 @@
+/* SPDX-License-Identifier: GPL-2.0 OR MIT */
+/*
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+#if defined(CONFIG_ARM)
+#include <asm/system_info.h>
+#include <asm/cputype.h>
+#endif
+
+asmlinkage void chacha20_arm(u8 *out, const u8 *in, const size_t len,
+ const u32 key[8], const u32 counter[4]);
+#if defined(CONFIG_ARM)
+asmlinkage void hchacha20_arm(const u32 state[16], u32 out[8]);
+#endif
+#if defined(CONFIG_KERNEL_MODE_NEON)
+asmlinkage void chacha20_neon(u8 *out, const u8 *in, const size_t len,
+ const u32 key[8], const u32 counter[4]);
+#endif
+
+static bool chacha20_use_neon __ro_after_init;
+
+static void __init chacha20_fpu_init(void)
+{
+#if defined(CONFIG_ARM64)
+ chacha20_use_neon = elf_hwcap & HWCAP_ASIMD;
+#elif defined(CONFIG_ARM)
+ switch (read_cpuid_part()) {
+ case ARM_CPU_PART_CORTEX_A7:
+ case ARM_CPU_PART_CORTEX_A5:
+ /* The Cortex-A7 and Cortex-A5 do not perform well with the NEON
+ * implementation but do incredibly with the scalar one and use
+ * less power.
+ */
+ break;
+ default:
+ chacha20_use_neon = elf_hwcap & HWCAP_NEON;
+ }
+#endif
+}
+
+static inline bool chacha20_arch(struct chacha20_ctx *state, u8 *dst,
+ const u8 *src, size_t len,
+ simd_context_t *simd_context)
+{
+#if defined(CONFIG_KERNEL_MODE_NEON)
+ if (chacha20_use_neon && len >= CHACHA20_BLOCK_SIZE * 3 &&
+ simd_use(simd_context))
+ chacha20_neon(dst, src, len, state->key, state->counter);
+ else
+#endif
+ chacha20_arm(dst, src, len, state->key, state->counter);
+
+ state->counter[0] += (len + 63) / 64;
+ return true;
+}
+
+static inline bool hchacha20_arch(u32 derived_key[CHACHA20_KEY_WORDS],
+ const u8 nonce[HCHACHA20_NONCE_SIZE],
+ const u8 key[HCHACHA20_KEY_SIZE],
+ simd_context_t *simd_context)
+{
+#if defined(CONFIG_ARM)
+ u32 x[] = { CHACHA20_CONSTANT_EXPA,
+ CHACHA20_CONSTANT_ND_3,
+ CHACHA20_CONSTANT_2_BY,
+ CHACHA20_CONSTANT_TE_K,
+ get_unaligned_le32(key + 0),
+ get_unaligned_le32(key + 4),
+ get_unaligned_le32(key + 8),
+ get_unaligned_le32(key + 12),
+ get_unaligned_le32(key + 16),
+ get_unaligned_le32(key + 20),
+ get_unaligned_le32(key + 24),
+ get_unaligned_le32(key + 28),
+ get_unaligned_le32(nonce + 0),
+ get_unaligned_le32(nonce + 4),
+ get_unaligned_le32(nonce + 8),
+ get_unaligned_le32(nonce + 12)
+ };
+ hchacha20_arm(x, derived_key);
+ return true;
+#else
+ return false;
+#endif
+}
diff --git a/lib/zinc/chacha20/chacha20-arm-cryptogams.S b/lib/zinc/chacha20/chacha20-arm.S
similarity index 71%
rename from lib/zinc/chacha20/chacha20-arm-cryptogams.S
rename to lib/zinc/chacha20/chacha20-arm.S
index 770bab469171..5abedafcf129 100644
--- a/lib/zinc/chacha20/chacha20-arm-cryptogams.S
+++ b/lib/zinc/chacha20/chacha20-arm.S
@@ -1,13 +1,475 @@
/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
/*
+ * Copyright (C) 2018 Google, Inc.
* Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
* Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
- *
- * This is based in part on Andy Polyakov's implementation from CRYPTOGAMS.
*/
#include <linux/linkage.h>
+/*
+ * The following scalar routine was written by Eric Biggers.
+ *
+ * Design notes:
+ *
+ * 16 registers would be needed to hold the state matrix, but only 14 are
+ * available because 'sp' and 'pc' cannot be used. So we spill the elements
+ * (x8, x9) to the stack and swap them out with (x10, x11). This adds one
+ * 'ldrd' and one 'strd' instruction per round.
+ *
+ * All rotates are performed using the implicit rotate operand accepted by the
+ * 'add' and 'eor' instructions. This is faster than using explicit rotate
+ * instructions. To make this work, we allow the values in the second and last
+ * rows of the ChaCha state matrix (rows 'b' and 'd') to temporarily have the
+ * wrong rotation amount. The rotation amount is then fixed up just in time
+ * when the values are used. 'brot' is the number of bits the values in row 'b'
+ * need to be rotated right to arrive at the correct values, and 'drot'
+ * similarly for row 'd'. (brot, drot) start out as (0, 0) but we make it such
+ * that they end up as (25, 24) after every round.
+ */
+
+ // ChaCha state registers
+ X0 .req r0
+ X1 .req r1
+ X2 .req r2
+ X3 .req r3
+ X4 .req r4
+ X5 .req r5
+ X6 .req r6
+ X7 .req r7
+ X8_X10 .req r8 // shared by x8 and x10
+ X9_X11 .req r9 // shared by x9 and x11
+ X12 .req r10
+ X13 .req r11
+ X14 .req r12
+ X15 .req r14
+
+.Lexpand_32byte_k:
+ // "expand 32-byte k"
+ .word 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
+
+#ifdef __thumb2__
+# define adrl adr
+#endif
+
+.macro __rev out, in, t0, t1, t2
+.if __LINUX_ARM_ARCH__ >= 6
+ rev \out, \in
+.else
+ lsl \t0, \in, #24
+ and \t1, \in, #0xff00
+ and \t2, \in, #0xff0000
+ orr \out, \t0, \in, lsr #24
+ orr \out, \out, \t1, lsl #8
+ orr \out, \out, \t2, lsr #8
+.endif
+.endm
+
+.macro _le32_bswap x, t0, t1, t2
+#ifdef __ARMEB__
+ __rev \x, \x, \t0, \t1, \t2
+#endif
+.endm
+
+.macro _le32_bswap_4x a, b, c, d, t0, t1, t2
+ _le32_bswap \a, \t0, \t1, \t2
+ _le32_bswap \b, \t0, \t1, \t2
+ _le32_bswap \c, \t0, \t1, \t2
+ _le32_bswap \d, \t0, \t1, \t2
+.endm
+
+.macro __ldrd a, b, src, offset
+#if __LINUX_ARM_ARCH__ >= 6
+ ldrd \a, \b, [\src, #\offset]
+#else
+ ldr \a, [\src, #\offset]
+ ldr \b, [\src, #\offset + 4]
+#endif
+.endm
+
+.macro __strd a, b, dst, offset
+#if __LINUX_ARM_ARCH__ >= 6
+ strd \a, \b, [\dst, #\offset]
+#else
+ str \a, [\dst, #\offset]
+ str \b, [\dst, #\offset + 4]
+#endif
+.endm
+
+.macro _halfround a1, b1, c1, d1, a2, b2, c2, d2
+
+ // a += b; d ^= a; d = rol(d, 16);
+ add \a1, \a1, \b1, ror #brot
+ add \a2, \a2, \b2, ror #brot
+ eor \d1, \a1, \d1, ror #drot
+ eor \d2, \a2, \d2, ror #drot
+ // drot == 32 - 16 == 16
+
+ // c += d; b ^= c; b = rol(b, 12);
+ add \c1, \c1, \d1, ror #16
+ add \c2, \c2, \d2, ror #16
+ eor \b1, \c1, \b1, ror #brot
+ eor \b2, \c2, \b2, ror #brot
+ // brot == 32 - 12 == 20
+
+ // a += b; d ^= a; d = rol(d, 8);
+ add \a1, \a1, \b1, ror #20
+ add \a2, \a2, \b2, ror #20
+ eor \d1, \a1, \d1, ror #16
+ eor \d2, \a2, \d2, ror #16
+ // drot == 32 - 8 == 24
+
+ // c += d; b ^= c; b = rol(b, 7);
+ add \c1, \c1, \d1, ror #24
+ add \c2, \c2, \d2, ror #24
+ eor \b1, \c1, \b1, ror #20
+ eor \b2, \c2, \b2, ror #20
+ // brot == 32 - 7 == 25
+.endm
+
+.macro _doubleround
+
+ // column round
+
+ // quarterrounds: (x0, x4, x8, x12) and (x1, x5, x9, x13)
+ _halfround X0, X4, X8_X10, X12, X1, X5, X9_X11, X13
+
+ // save (x8, x9); restore (x10, x11)
+ __strd X8_X10, X9_X11, sp, 0
+ __ldrd X8_X10, X9_X11, sp, 8
+
+ // quarterrounds: (x2, x6, x10, x14) and (x3, x7, x11, x15)
+ _halfround X2, X6, X8_X10, X14, X3, X7, X9_X11, X15
+
+ .set brot, 25
+ .set drot, 24
+
+ // diagonal round
+
+ // quarterrounds: (x0, x5, x10, x15) and (x1, x6, x11, x12)
+ _halfround X0, X5, X8_X10, X15, X1, X6, X9_X11, X12
+
+ // save (x10, x11); restore (x8, x9)
+ __strd X8_X10, X9_X11, sp, 8
+ __ldrd X8_X10, X9_X11, sp, 0
+
+ // quarterrounds: (x2, x7, x8, x13) and (x3, x4, x9, x14)
+ _halfround X2, X7, X8_X10, X13, X3, X4, X9_X11, X14
+.endm
+
+.macro _chacha_permute nrounds
+ .set brot, 0
+ .set drot, 0
+ .rept \nrounds / 2
+ _doubleround
+ .endr
+.endm
+
+.macro _chacha nrounds
+
+.Lnext_block\@:
+ // Stack: unused0-unused1 x10-x11 x0-x15 OUT IN LEN
+ // Registers contain x0-x9,x12-x15.
+
+ // Do the core ChaCha permutation to update x0-x15.
+ _chacha_permute \nrounds
+
+ add sp, #8
+ // Stack: x10-x11 orig_x0-orig_x15 OUT IN LEN
+ // Registers contain x0-x9,x12-x15.
+ // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
+
+ // Free up some registers (r8-r12,r14) by pushing (x8-x9,x12-x15).
+ push {X8_X10, X9_X11, X12, X13, X14, X15}
+
+ // Load (OUT, IN, LEN).
+ ldr r14, [sp, #96]
+ ldr r12, [sp, #100]
+ ldr r11, [sp, #104]
+
+ orr r10, r14, r12
+
+ // Use slow path if fewer than 64 bytes remain.
+ cmp r11, #64
+ blt .Lxor_slowpath\@
+
+ // Use slow path if IN and/or OUT isn't 4-byte aligned. Needed even on
+ // ARMv6+, since ldmia and stmia (used below) still require alignment.
+ tst r10, #3
+ bne .Lxor_slowpath\@
+
+ // Fast path: XOR 64 bytes of aligned data.
+
+ // Stack: x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
+ // Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is OUT.
+ // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
+
+ // x0-x3
+ __ldrd r8, r9, sp, 32
+ __ldrd r10, r11, sp, 40
+ add X0, X0, r8
+ add X1, X1, r9
+ add X2, X2, r10
+ add X3, X3, r11
+ _le32_bswap_4x X0, X1, X2, X3, r8, r9, r10
+ ldmia r12!, {r8-r11}
+ eor X0, X0, r8
+ eor X1, X1, r9
+ eor X2, X2, r10
+ eor X3, X3, r11
+ stmia r14!, {X0-X3}
+
+ // x4-x7
+ __ldrd r8, r9, sp, 48
+ __ldrd r10, r11, sp, 56
+ add X4, r8, X4, ror #brot
+ add X5, r9, X5, ror #brot
+ ldmia r12!, {X0-X3}
+ add X6, r10, X6, ror #brot
+ add X7, r11, X7, ror #brot
+ _le32_bswap_4x X4, X5, X6, X7, r8, r9, r10
+ eor X4, X4, X0
+ eor X5, X5, X1
+ eor X6, X6, X2
+ eor X7, X7, X3
+ stmia r14!, {X4-X7}
+
+ // x8-x15
+ pop {r0-r7} // (x8-x9,x12-x15,x10-x11)
+ __ldrd r8, r9, sp, 32
+ __ldrd r10, r11, sp, 40
+ add r0, r0, r8 // x8
+ add r1, r1, r9 // x9
+ add r6, r6, r10 // x10
+ add r7, r7, r11 // x11
+ _le32_bswap_4x r0, r1, r6, r7, r8, r9, r10
+ ldmia r12!, {r8-r11}
+ eor r0, r0, r8 // x8
+ eor r1, r1, r9 // x9
+ eor r6, r6, r10 // x10
+ eor r7, r7, r11 // x11
+ stmia r14!, {r0,r1,r6,r7}
+ ldmia r12!, {r0,r1,r6,r7}
+ __ldrd r8, r9, sp, 48
+ __ldrd r10, r11, sp, 56
+ add r2, r8, r2, ror #drot // x12
+ add r3, r9, r3, ror #drot // x13
+ add r4, r10, r4, ror #drot // x14
+ add r5, r11, r5, ror #drot // x15
+ _le32_bswap_4x r2, r3, r4, r5, r9, r10, r11
+ ldr r9, [sp, #72] // load LEN
+ eor r2, r2, r0 // x12
+ eor r3, r3, r1 // x13
+ eor r4, r4, r6 // x14
+ eor r5, r5, r7 // x15
+ subs r9, #64 // decrement and check LEN
+ stmia r14!, {r2-r5}
+
+ beq .Ldone\@
+
+.Lprepare_for_next_block\@:
+
+ // Stack: x0-x15 OUT IN LEN
+
+ // Increment block counter (x12)
+ add r8, #1
+
+ // Store updated (OUT, IN, LEN)
+ str r14, [sp, #64]
+ str r12, [sp, #68]
+ str r9, [sp, #72]
+
+ mov r14, sp
+
+ // Store updated block counter (x12)
+ str r8, [sp, #48]
+
+ sub sp, #16
+
+ // Reload state and do next block
+ ldmia r14!, {r0-r11} // load x0-x11
+ __strd r10, r11, sp, 8 // store x10-x11 before state
+ ldmia r14, {r10-r12,r14} // load x12-x15
+ b .Lnext_block\@
+
+.Lxor_slowpath\@:
+ // Slow path: < 64 bytes remaining, or unaligned input or output buffer.
+ // We handle it by storing the 64 bytes of keystream to the stack, then
+ // XOR-ing the needed portion with the data.
+
+ // Allocate keystream buffer
+ sub sp, #64
+ mov r14, sp
+
+ // Stack: ks0-ks15 x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
+ // Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is &ks0.
+ // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
+
+ // Save keystream for x0-x3
+ __ldrd r8, r9, sp, 96
+ __ldrd r10, r11, sp, 104
+ add X0, X0, r8
+ add X1, X1, r9
+ add X2, X2, r10
+ add X3, X3, r11
+ _le32_bswap_4x X0, X1, X2, X3, r8, r9, r10
+ stmia r14!, {X0-X3}
+
+ // Save keystream for x4-x7
+ __ldrd r8, r9, sp, 112
+ __ldrd r10, r11, sp, 120
+ add X4, r8, X4, ror #brot
+ add X5, r9, X5, ror #brot
+ add X6, r10, X6, ror #brot
+ add X7, r11, X7, ror #brot
+ _le32_bswap_4x X4, X5, X6, X7, r8, r9, r10
+ add r8, sp, #64
+ stmia r14!, {X4-X7}
+
+ // Save keystream for x8-x15
+ ldm r8, {r0-r7} // (x8-x9,x12-x15,x10-x11)
+ __ldrd r8, r9, sp, 128
+ __ldrd r10, r11, sp, 136
+ add r0, r0, r8 // x8
+ add r1, r1, r9 // x9
+ add r6, r6, r10 // x10
+ add r7, r7, r11 // x11
+ _le32_bswap_4x r0, r1, r6, r7, r8, r9, r10
+ stmia r14!, {r0,r1,r6,r7}
+ __ldrd r8, r9, sp, 144
+ __ldrd r10, r11, sp, 152
+ add r2, r8, r2, ror #drot // x12
+ add r3, r9, r3, ror #drot // x13
+ add r4, r10, r4, ror #drot // x14
+ add r5, r11, r5, ror #drot // x15
+ _le32_bswap_4x r2, r3, r4, r5, r9, r10, r11
+ stmia r14, {r2-r5}
+
+ // Stack: ks0-ks15 unused0-unused7 x0-x15 OUT IN LEN
+ // Registers: r8 is block counter, r12 is IN.
+
+ ldr r9, [sp, #168] // LEN
+ ldr r14, [sp, #160] // OUT
+ cmp r9, #64
+ mov r0, sp
+ movle r1, r9
+ movgt r1, #64
+ // r1 is number of bytes to XOR, in range [1, 64]
+
+.if __LINUX_ARM_ARCH__ < 6
+ orr r2, r12, r14
+ tst r2, #3 // IN or OUT misaligned?
+ bne .Lxor_next_byte\@
+.endif
+
+ // XOR a word at a time
+.rept 16
+ subs r1, #4
+ blt .Lxor_words_done\@
+ ldr r2, [r12], #4
+ ldr r3, [r0], #4
+ eor r2, r2, r3
+ str r2, [r14], #4
+.endr
+ b .Lxor_slowpath_done\@
+.Lxor_words_done\@:
+ ands r1, r1, #3
+ beq .Lxor_slowpath_done\@
+
+ // XOR a byte@a time
+.Lxor_next_byte\@:
+ ldrb r2, [r12], #1
+ ldrb r3, [r0], #1
+ eor r2, r2, r3
+ strb r2, [r14], #1
+ subs r1, #1
+ bne .Lxor_next_byte\@
+
+.Lxor_slowpath_done\@:
+ subs r9, #64
+ add sp, #96
+ bgt .Lprepare_for_next_block\@
+
+.Ldone\@:
+.endm // _chacha
+
+/*
+ * void chacha20_arm(u8 *out, const u8 *in, size_t len, const u32 key[8],
+ * const u32 iv[4]);
+ */
+ENTRY(chacha20_arm)
+ cmp r2, #0 // len == 0?
+ bxeq lr
+
+ push {r0-r2,r4-r11,lr}
+
+ // Push state x0-x15 onto stack.
+ // Also store an extra copy of x10-x11 just before the state.
+
+ ldr r4, [sp, #48] // iv
+ mov r0, sp
+ sub sp, #80
+
+ // iv: x12-x15
+ ldm r4, {X12,X13,X14,X15}
+ stmdb r0!, {X12,X13,X14,X15}
+
+ // key: x4-x11
+ __ldrd X8_X10, X9_X11, r3, 24
+ __strd X8_X10, X9_X11, sp, 8
+ stmdb r0!, {X8_X10, X9_X11}
+ ldm r3, {X4-X9_X11}
+ stmdb r0!, {X4-X9_X11}
+
+ // constants: x0-x3
+ adrl X3, .Lexpand_32byte_k
+ ldm X3, {X0-X3}
+ __strd X0, X1, sp, 16
+ __strd X2, X3, sp, 24
+
+ _chacha 20
+
+ add sp, #76
+ pop {r4-r11, pc}
+ENDPROC(chacha20_arm)
+
+/*
+ * void hchacha20_arm(const u32 state[16], u32 out[8]);
+ */
+ENTRY(hchacha20_arm)
+ push {r1,r4-r11,lr}
+
+ mov r14, r0
+ ldmia r14!, {r0-r11} // load x0-x11
+ push {r10-r11} // store x10-x11 to stack
+ ldm r14, {r10-r12,r14} // load x12-x15
+ sub sp, #8
+
+ _chacha_permute 20
+
+ // Skip over (unused0-unused1, x10-x11)
+ add sp, #16
+
+ // Fix up rotations of x12-x15
+ ror X12, X12, #drot
+ ror X13, X13, #drot
+ pop {r4} // load 'out'
+ ror X14, X14, #drot
+ ror X15, X15, #drot
+
+ // Store (x0-x3,x12-x15) to 'out'
+ stm r4, {X0,X1,X2,X3,X12,X13,X14,X15}
+
+ pop {r4-r11,pc}
+ENDPROC(hchacha20_arm)
+
+#ifdef CONFIG_KERNEL_MODE_NEON
+/*
+ * This following NEON routine was ported from Andy Polyakov's implementation
+ * from CRYPTOGAMS. It begins with parts of the CRYPTOGAMS scalar routine,
+ * since certain NEON code paths actually branch to it.
+ */
+
.text
#if defined(__thumb2__) || defined(__clang__)
.syntax unified
@@ -22,39 +484,6 @@
#define ldrhsb ldrbhs
#endif
-.align 5
-.Lsigma:
-.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 @ endian-neutral
-.Lone:
-.long 1,0,0,0
-.word -1
-
-.align 5
-ENTRY(chacha20_arm)
- ldr r12,[sp,#0] @ pull pointer to counter and nonce
- stmdb sp!,{r0-r2,r4-r11,lr}
- cmp r2,#0 @ len==0?
-#ifdef __thumb2__
- itt eq
-#endif
- addeq sp,sp,#4*3
- beq .Lno_data_arm
- ldmia r12,{r4-r7} @ load counter and nonce
- sub sp,sp,#4*(16) @ off-load area
-#if __LINUX_ARM_ARCH__ < 7 && !defined(__thumb2__)
- sub r14,pc,#100 @ .Lsigma
-#else
- adr r14,.Lsigma @ .Lsigma
-#endif
- stmdb sp!,{r4-r7} @ copy counter and nonce
- ldmia r3,{r4-r11} @ load key
- ldmia r14,{r0-r3} @ load sigma
- stmdb sp!,{r4-r11} @ copy key
- stmdb sp!,{r0-r3} @ copy sigma
- str r10,[sp,#4*(16+10)] @ off-load "rx"
- str r11,[sp,#4*(16+11)] @ off-load "rx"
- b .Loop_outer_enter
-
.align 4
.Loop_outer:
ldmia sp,{r0-r9} @ load key material
@@ -748,11 +1177,8 @@ ENTRY(chacha20_arm)
.Ldone:
add sp,sp,#4*(32+3)
-.Lno_data_arm:
ldmia sp!,{r4-r11,pc}
-ENDPROC(chacha20_arm)
-#ifdef CONFIG_KERNEL_MODE_NEON
.align 5
.Lsigma2:
.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 @ endian-neutral
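The partial-block handling in the _chacha macro above (the unrolled word-at-a-time
loop followed by the .Lxor_next_byte path) is fairly dense in assembly form. As an
illustration only -- not part of the patch -- the logic reads roughly like the C
below, where words_ok stands in for the "ARMv6 or later, or both IN and OUT
word-aligned" condition and the word loop is unrolled 16 times in the real code
(kernel <linux/types.h> and <linux/string.h> assumed):

static void chacha_xor_tail(u8 *out, const u8 *in, const u8 *keystream,
			    unsigned int len /* 1..64 */, bool words_ok)
{
	u32 a, b;

	while (words_ok && len >= 4) {
		memcpy(&a, in, 4);		/* ldr */
		memcpy(&b, keystream, 4);
		a ^= b;				/* eor */
		memcpy(out, &a, 4);		/* str */
		in += 4; out += 4; keystream += 4; len -= 4;
	}
	while (len--)	/* trailing 1-3 bytes, or everything on old cores with misaligned buffers */
		*out++ = *in++ ^ *keystream++;
}
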
diff --git a/lib/zinc/chacha20/chacha20-arm64-cryptogams.S b/lib/zinc/chacha20/chacha20-arm64.S
similarity index 100%
rename from lib/zinc/chacha20/chacha20-arm64-cryptogams.S
rename to lib/zinc/chacha20/chacha20-arm64.S
diff --git a/lib/zinc/chacha20/chacha20.c b/lib/zinc/chacha20/chacha20.c
index 4354b874a6a5..fc4f74fca653 100644
--- a/lib/zinc/chacha20/chacha20.c
+++ b/lib/zinc/chacha20/chacha20.c
@@ -16,6 +16,8 @@
#if defined(CONFIG_ZINC_ARCH_X86_64)
#include "chacha20-x86_64-glue.h"
+#elif defined(CONFIG_ZINC_ARCH_ARM) || defined(CONFIG_ZINC_ARCH_ARM64)
+#include "chacha20-arm-glue.h"
#else
void __init chacha20_fpu_init(void)
{
--
2.19.0
* [PATCH net-next v6 11/23] zinc: import Andy Polyakov's Poly1305 ARM and ARM64 implementations
[not found] <20180925145622.29959-1-Jason@zx2c4.com>
` (2 preceding siblings ...)
2018-09-25 14:56 ` [PATCH net-next v6 07/23] zinc: " Jason A. Donenfeld
@ 2018-09-25 14:56 ` Jason A. Donenfeld
2018-10-03 6:12 ` Eric Biggers
2018-09-25 14:56 ` [PATCH net-next v6 12/23] zinc: " Jason A. Donenfeld
2018-09-25 14:56 ` [PATCH net-next v6 19/23] zinc: Curve25519 ARM implementation Jason A. Donenfeld
5 siblings, 1 reply; 47+ messages in thread
From: Jason A. Donenfeld @ 2018-09-25 14:56 UTC (permalink / raw)
To: linux-arm-kernel
These NEON and non-NEON implementations come from Andy Polyakov's
implementation, and are included here in raw form without modification,
so that subsequent commits that fix these up for the kernel can see how
it has changed. This awkward commit splitting has been requested for the
ARM[64] implementations in particular.
While this is CRYPTOGAMS code, the originating code for this happens to
be the same as OpenSSL's commit 5bb1cd2292b388263a0cc05392bb99141212aa53
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Based-on-code-from: Andy Polyakov <appro@openssl.org>
Cc: Samuel Neves <sneves@dei.uc.pt>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Jean-Philippe Aumasson <jeanphilippe.aumasson@gmail.com>
Cc: Andy Polyakov <appro@openssl.org>
Cc: Russell King <linux@armlinux.org.uk>
Cc: linux-arm-kernel at lists.infradead.org
---
lib/zinc/poly1305/poly1305-arm-cryptogams.S | 1172 +++++++++++++++++
lib/zinc/poly1305/poly1305-arm64-cryptogams.S | 869 ++++++++++++
2 files changed, 2041 insertions(+)
create mode 100644 lib/zinc/poly1305/poly1305-arm-cryptogams.S
create mode 100644 lib/zinc/poly1305/poly1305-arm64-cryptogams.S
diff --git a/lib/zinc/poly1305/poly1305-arm-cryptogams.S b/lib/zinc/poly1305/poly1305-arm-cryptogams.S
new file mode 100644
index 000000000000..884b465030e4
--- /dev/null
+++ b/lib/zinc/poly1305/poly1305-arm-cryptogams.S
@@ -0,0 +1,1172 @@
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
+/*
+ * Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
+ */
+
+#include "arm_arch.h"
+
+.text
+#if defined(__thumb2__)
+.syntax unified
+.thumb
+#else
+.code 32
+#endif
+
+.globl poly1305_emit
+.globl poly1305_blocks
+.globl poly1305_init
+.type poly1305_init,%function
+.align 5
+poly1305_init:
+.Lpoly1305_init:
+ stmdb sp!,{r4-r11}
+
+ eor r3,r3,r3
+ cmp r1,#0
+ str r3,[r0,#0] @ zero hash value
+ str r3,[r0,#4]
+ str r3,[r0,#8]
+ str r3,[r0,#12]
+ str r3,[r0,#16]
+ str r3,[r0,#36] @ is_base2_26
+ add r0,r0,#20
+
+#ifdef __thumb2__
+ it eq
+#endif
+ moveq r0,#0
+ beq .Lno_key
+
+#if __ARM_MAX_ARCH__>=7
+ adr r11,.Lpoly1305_init
+ ldr r12,.LOPENSSL_armcap
+#endif
+ ldrb r4,[r1,#0]
+ mov r10,#0x0fffffff
+ ldrb r5,[r1,#1]
+ and r3,r10,#-4 @ 0x0ffffffc
+ ldrb r6,[r1,#2]
+ ldrb r7,[r1,#3]
+ orr r4,r4,r5,lsl#8
+ ldrb r5,[r1,#4]
+ orr r4,r4,r6,lsl#16
+ ldrb r6,[r1,#5]
+ orr r4,r4,r7,lsl#24
+ ldrb r7,[r1,#6]
+ and r4,r4,r10
+
+#if __ARM_MAX_ARCH__>=7
+ ldr r12,[r11,r12] @ OPENSSL_armcap_P
+# ifdef __APPLE__
+ ldr r12,[r12]
+# endif
+#endif
+ ldrb r8,[r1,#7]
+ orr r5,r5,r6,lsl#8
+ ldrb r6,[r1,#8]
+ orr r5,r5,r7,lsl#16
+ ldrb r7,[r1,#9]
+ orr r5,r5,r8,lsl#24
+ ldrb r8,[r1,#10]
+ and r5,r5,r3
+
+#if __ARM_MAX_ARCH__>=7
+ tst r12,#ARMV7_NEON @ check for NEON
+# ifdef __APPLE__
+ adr r9,poly1305_blocks_neon
+ adr r11,poly1305_blocks
+# ifdef __thumb2__
+ it ne
+# endif
+ movne r11,r9
+ adr r12,poly1305_emit
+ adr r10,poly1305_emit_neon
+# ifdef __thumb2__
+ it ne
+# endif
+ movne r12,r10
+# else
+# ifdef __thumb2__
+ itete eq
+# endif
+ addeq r12,r11,#(poly1305_emit-.Lpoly1305_init)
+ addne r12,r11,#(poly1305_emit_neon-.Lpoly1305_init)
+ addeq r11,r11,#(poly1305_blocks-.Lpoly1305_init)
+ addne r11,r11,#(poly1305_blocks_neon-.Lpoly1305_init)
+# endif
+# ifdef __thumb2__
+ orr r12,r12,#1 @ thumb-ify address
+ orr r11,r11,#1
+# endif
+#endif
+ ldrb r9,[r1,#11]
+ orr r6,r6,r7,lsl#8
+ ldrb r7,[r1,#12]
+ orr r6,r6,r8,lsl#16
+ ldrb r8,[r1,#13]
+ orr r6,r6,r9,lsl#24
+ ldrb r9,[r1,#14]
+ and r6,r6,r3
+
+ ldrb r10,[r1,#15]
+ orr r7,r7,r8,lsl#8
+ str r4,[r0,#0]
+ orr r7,r7,r9,lsl#16
+ str r5,[r0,#4]
+ orr r7,r7,r10,lsl#24
+ str r6,[r0,#8]
+ and r7,r7,r3
+ str r7,[r0,#12]
+#if __ARM_MAX_ARCH__>=7
+ stmia r2,{r11,r12} @ fill functions table
+ mov r0,#1
+#else
+ mov r0,#0
+#endif
+.Lno_key:
+ ldmia sp!,{r4-r11}
+#if __ARM_ARCH__>=5
+ bx lr @ bx lr
+#else
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ .word 0xe12fff1e @ interoperable with Thumb ISA:-)
+#endif
+.size poly1305_init,.-poly1305_init
+.type poly1305_blocks,%function
+.align 5
+poly1305_blocks:
+.Lpoly1305_blocks:
+ stmdb sp!,{r3-r11,lr}
+
+ ands r2,r2,#-16
+ beq .Lno_data
+
+ cmp r3,#0
+ add r2,r2,r1 @ end pointer
+ sub sp,sp,#32
+
+ ldmia r0,{r4-r12} @ load context
+
+ str r0,[sp,#12] @ offload stuff
+ mov lr,r1
+ str r2,[sp,#16]
+ str r10,[sp,#20]
+ str r11,[sp,#24]
+ str r12,[sp,#28]
+ b .Loop
+
+.Loop:
+#if __ARM_ARCH__<7
+ ldrb r0,[lr],#16 @ load input
+# ifdef __thumb2__
+ it hi
+# endif
+ addhi r8,r8,#1 @ 1<<128
+ ldrb r1,[lr,#-15]
+ ldrb r2,[lr,#-14]
+ ldrb r3,[lr,#-13]
+ orr r1,r0,r1,lsl#8
+ ldrb r0,[lr,#-12]
+ orr r2,r1,r2,lsl#16
+ ldrb r1,[lr,#-11]
+ orr r3,r2,r3,lsl#24
+ ldrb r2,[lr,#-10]
+ adds r4,r4,r3 @ accumulate input
+
+ ldrb r3,[lr,#-9]
+ orr r1,r0,r1,lsl#8
+ ldrb r0,[lr,#-8]
+ orr r2,r1,r2,lsl#16
+ ldrb r1,[lr,#-7]
+ orr r3,r2,r3,lsl#24
+ ldrb r2,[lr,#-6]
+ adcs r5,r5,r3
+
+ ldrb r3,[lr,#-5]
+ orr r1,r0,r1,lsl#8
+ ldrb r0,[lr,#-4]
+ orr r2,r1,r2,lsl#16
+ ldrb r1,[lr,#-3]
+ orr r3,r2,r3,lsl#24
+ ldrb r2,[lr,#-2]
+ adcs r6,r6,r3
+
+ ldrb r3,[lr,#-1]
+ orr r1,r0,r1,lsl#8
+ str lr,[sp,#8] @ offload input pointer
+ orr r2,r1,r2,lsl#16
+ add r10,r10,r10,lsr#2
+ orr r3,r2,r3,lsl#24
+#else
+ ldr r0,[lr],#16 @ load input
+# ifdef __thumb2__
+ it hi
+# endif
+ addhi r8,r8,#1 @ padbit
+ ldr r1,[lr,#-12]
+ ldr r2,[lr,#-8]
+ ldr r3,[lr,#-4]
+# ifdef __ARMEB__
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+# endif
+ adds r4,r4,r0 @ accumulate input
+ str lr,[sp,#8] @ offload input pointer
+ adcs r5,r5,r1
+ add r10,r10,r10,lsr#2
+ adcs r6,r6,r2
+#endif
+ add r11,r11,r11,lsr#2
+ adcs r7,r7,r3
+ add r12,r12,r12,lsr#2
+
+ umull r2,r3,r5,r9
+ adc r8,r8,#0
+ umull r0,r1,r4,r9
+ umlal r2,r3,r8,r10
+ umlal r0,r1,r7,r10
+ ldr r10,[sp,#20] @ reload r10
+ umlal r2,r3,r6,r12
+ umlal r0,r1,r5,r12
+ umlal r2,r3,r7,r11
+ umlal r0,r1,r6,r11
+ umlal r2,r3,r4,r10
+ str r0,[sp,#0] @ future r4
+ mul r0,r11,r8
+ ldr r11,[sp,#24] @ reload r11
+ adds r2,r2,r1 @ d1+=d0>>32
+ eor r1,r1,r1
+ adc lr,r3,#0 @ future r6
+ str r2,[sp,#4] @ future r5
+
+ mul r2,r12,r8
+ eor r3,r3,r3
+ umlal r0,r1,r7,r12
+ ldr r12,[sp,#28] @ reload r12
+ umlal r2,r3,r7,r9
+ umlal r0,r1,r6,r9
+ umlal r2,r3,r6,r10
+ umlal r0,r1,r5,r10
+ umlal r2,r3,r5,r11
+ umlal r0,r1,r4,r11
+ umlal r2,r3,r4,r12
+ ldr r4,[sp,#0]
+ mul r8,r9,r8
+ ldr r5,[sp,#4]
+
+ adds r6,lr,r0 @ d2+=d1>>32
+ ldr lr,[sp,#8] @ reload input pointer
+ adc r1,r1,#0
+ adds r7,r2,r1 @ d3+=d2>>32
+ ldr r0,[sp,#16] @ reload end pointer
+ adc r3,r3,#0
+ add r8,r8,r3 @ h4+=d3>>32
+
+ and r1,r8,#-4
+ and r8,r8,#3
+ add r1,r1,r1,lsr#2 @ *=5
+ adds r4,r4,r1
+ adcs r5,r5,#0
+ adcs r6,r6,#0
+ adcs r7,r7,#0
+ adc r8,r8,#0
+
+ cmp r0,lr @ done yet?
+ bhi .Loop
+
+ ldr r0,[sp,#12]
+ add sp,sp,#32
+ stmia r0,{r4-r8} @ store the result
+
+.Lno_data:
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r3-r11,pc}
+#else
+ ldmia sp!,{r3-r11,lr}
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ .word 0xe12fff1e @ interoperable with Thumb ISA:-)
+#endif
+.size poly1305_blocks,.-poly1305_blocks
+.type poly1305_emit,%function
+.align 5
+poly1305_emit:
+ stmdb sp!,{r4-r11}
+.Lpoly1305_emit_enter:
+
+ ldmia r0,{r3-r7}
+ adds r8,r3,#5 @ compare to modulus
+ adcs r9,r4,#0
+ adcs r10,r5,#0
+ adcs r11,r6,#0
+ adc r7,r7,#0
+ tst r7,#4 @ did it carry/borrow?
+
+#ifdef __thumb2__
+ it ne
+#endif
+ movne r3,r8
+ ldr r8,[r2,#0]
+#ifdef __thumb2__
+ it ne
+#endif
+ movne r4,r9
+ ldr r9,[r2,#4]
+#ifdef __thumb2__
+ it ne
+#endif
+ movne r5,r10
+ ldr r10,[r2,#8]
+#ifdef __thumb2__
+ it ne
+#endif
+ movne r6,r11
+ ldr r11,[r2,#12]
+
+ adds r3,r3,r8
+ adcs r4,r4,r9
+ adcs r5,r5,r10
+ adc r6,r6,r11
+
+#if __ARM_ARCH__>=7
+# ifdef __ARMEB__
+ rev r3,r3
+ rev r4,r4
+ rev r5,r5
+ rev r6,r6
+# endif
+ str r3,[r1,#0]
+ str r4,[r1,#4]
+ str r5,[r1,#8]
+ str r6,[r1,#12]
+#else
+ strb r3,[r1,#0]
+ mov r3,r3,lsr#8
+ strb r4,[r1,#4]
+ mov r4,r4,lsr#8
+ strb r5,[r1,#8]
+ mov r5,r5,lsr#8
+ strb r6,[r1,#12]
+ mov r6,r6,lsr#8
+
+ strb r3,[r1,#1]
+ mov r3,r3,lsr#8
+ strb r4,[r1,#5]
+ mov r4,r4,lsr#8
+ strb r5,[r1,#9]
+ mov r5,r5,lsr#8
+ strb r6,[r1,#13]
+ mov r6,r6,lsr#8
+
+ strb r3,[r1,#2]
+ mov r3,r3,lsr#8
+ strb r4,[r1,#6]
+ mov r4,r4,lsr#8
+ strb r5,[r1,#10]
+ mov r5,r5,lsr#8
+ strb r6,[r1,#14]
+ mov r6,r6,lsr#8
+
+ strb r3,[r1,#3]
+ strb r4,[r1,#7]
+ strb r5,[r1,#11]
+ strb r6,[r1,#15]
+#endif
+ ldmia sp!,{r4-r11}
+#if __ARM_ARCH__>=5
+ bx lr @ bx lr
+#else
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ .word 0xe12fff1e @ interoperable with Thumb ISA:-)
+#endif
+.size poly1305_emit,.-poly1305_emit
+#if __ARM_MAX_ARCH__>=7
+.fpu neon
+
+.type poly1305_init_neon,%function
+.align 5
+poly1305_init_neon:
+ ldr r4,[r0,#20] @ load key base 2^32
+ ldr r5,[r0,#24]
+ ldr r6,[r0,#28]
+ ldr r7,[r0,#32]
+
+ and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
+ mov r3,r4,lsr#26
+ mov r4,r5,lsr#20
+ orr r3,r3,r5,lsl#6
+ mov r5,r6,lsr#14
+ orr r4,r4,r6,lsl#12
+ mov r6,r7,lsr#8
+ orr r5,r5,r7,lsl#18
+ and r3,r3,#0x03ffffff
+ and r4,r4,#0x03ffffff
+ and r5,r5,#0x03ffffff
+
+ vdup.32 d0,r2 @ r^1 in both lanes
+ add r2,r3,r3,lsl#2 @ *5
+ vdup.32 d1,r3
+ add r3,r4,r4,lsl#2
+ vdup.32 d2,r2
+ vdup.32 d3,r4
+ add r4,r5,r5,lsl#2
+ vdup.32 d4,r3
+ vdup.32 d5,r5
+ add r5,r6,r6,lsl#2
+ vdup.32 d6,r4
+ vdup.32 d7,r6
+ vdup.32 d8,r5
+
+ mov r5,#2 @ counter
+
+.Lsquare_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
+ @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
+ @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
+ @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
+ @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
+
+ vmull.u32 q5,d0,d0[1]
+ vmull.u32 q6,d1,d0[1]
+ vmull.u32 q7,d3,d0[1]
+ vmull.u32 q8,d5,d0[1]
+ vmull.u32 q9,d7,d0[1]
+
+ vmlal.u32 q5,d7,d2[1]
+ vmlal.u32 q6,d0,d1[1]
+ vmlal.u32 q7,d1,d1[1]
+ vmlal.u32 q8,d3,d1[1]
+ vmlal.u32 q9,d5,d1[1]
+
+ vmlal.u32 q5,d5,d4[1]
+ vmlal.u32 q6,d7,d4[1]
+ vmlal.u32 q8,d1,d3[1]
+ vmlal.u32 q7,d0,d3[1]
+ vmlal.u32 q9,d3,d3[1]
+
+ vmlal.u32 q5,d3,d6[1]
+ vmlal.u32 q8,d0,d5[1]
+ vmlal.u32 q6,d5,d6[1]
+ vmlal.u32 q7,d7,d6[1]
+ vmlal.u32 q9,d1,d5[1]
+
+ vmlal.u32 q8,d7,d8[1]
+ vmlal.u32 q5,d1,d8[1]
+ vmlal.u32 q6,d3,d8[1]
+ vmlal.u32 q7,d5,d8[1]
+ vmlal.u32 q9,d0,d7[1]
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
+ @ and P. Schwabe
+ @
+ @ H0>>+H1>>+H2>>+H3>>+H4
+ @ H3>>+H4>>*5+H0>>+H1
+ @
+ @ Trivia.
+ @
+ @ Result of multiplication of n-bit number by m-bit number is
+ @ n+m bits wide. However! Even though 2^n is a n+1-bit number,
+ @ m-bit number multiplied by 2^n is still n+m bits wide.
+ @
+ @ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2,
+ @ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit
+ @ one is n+1 bits wide.
+ @
+ @ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
+ @ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
+ @ can be 27. However! In cases when their width exceeds 26 bits
+ @ they are limited by 2^26+2^6. This in turn means that *sum*
+ @ of the products with these values can still be viewed as sum
+ @ of 52-bit numbers as long as the amount of addends is not a
+ @ power of 2. For example,
+ @
+ @ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
+ @
+ @ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
+ @ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
+ @ 8 * (2^52) or 2^55. However, the value is then multiplied by
+ @ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
+ @ which is less than 32 * (2^52) or 2^57. And when processing
+ @ data we are looking at triple as many addends...
+ @
+ @ In key setup procedure pre-reduced H0 is limited by 5*4+1 and
+ @ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the
+ @ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while
+ @ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32
+ @ instruction accepts 2x32-bit input and writes 2x64-bit result.
+ @ This means that result of reduction have to be compressed upon
+ @ loop wrap-around. This can be done in the process of reduction
+ @ to minimize amount of instructions [as well as amount of
+ @ 128-bit instructions, which benefits low-end processors], but
+ @ one has to watch for H2 (which is narrower than H0) and 5*H4
+ @ not being wider than 58 bits, so that result of right shift
+ @ by 26 bits fits in 32 bits. This is also useful on x86,
+ @ because it allows to use paddd in place for paddq, which
+ @ benefits Atom, where paddq is ridiculously slow.
+
+ vshr.u64 q15,q8,#26
+ vmovn.i64 d16,q8
+ vshr.u64 q4,q5,#26
+ vmovn.i64 d10,q5
+ vadd.i64 q9,q9,q15 @ h3 -> h4
+ vbic.i32 d16,#0xfc000000 @ &=0x03ffffff
+ vadd.i64 q6,q6,q4 @ h0 -> h1
+ vbic.i32 d10,#0xfc000000
+
+ vshrn.u64 d30,q9,#26
+ vmovn.i64 d18,q9
+ vshr.u64 q4,q6,#26
+ vmovn.i64 d12,q6
+ vadd.i64 q7,q7,q4 @ h1 -> h2
+ vbic.i32 d18,#0xfc000000
+ vbic.i32 d12,#0xfc000000
+
+ vadd.i32 d10,d10,d30
+ vshl.u32 d30,d30,#2
+ vshrn.u64 d8,q7,#26
+ vmovn.i64 d14,q7
+ vadd.i32 d10,d10,d30 @ h4 -> h0
+ vadd.i32 d16,d16,d8 @ h2 -> h3
+ vbic.i32 d14,#0xfc000000
+
+ vshr.u32 d30,d10,#26
+ vbic.i32 d10,#0xfc000000
+ vshr.u32 d8,d16,#26
+ vbic.i32 d16,#0xfc000000
+ vadd.i32 d12,d12,d30 @ h0 -> h1
+ vadd.i32 d18,d18,d8 @ h3 -> h4
+
+ subs r5,r5,#1
+ beq .Lsquare_break_neon
+
+ add r6,r0,#(48+0*9*4)
+ add r7,r0,#(48+1*9*4)
+
+ vtrn.32 d0,d10 @ r^2:r^1
+ vtrn.32 d3,d14
+ vtrn.32 d5,d16
+ vtrn.32 d1,d12
+ vtrn.32 d7,d18
+
+ vshl.u32 d4,d3,#2 @ *5
+ vshl.u32 d6,d5,#2
+ vshl.u32 d2,d1,#2
+ vshl.u32 d8,d7,#2
+ vadd.i32 d4,d4,d3
+ vadd.i32 d2,d2,d1
+ vadd.i32 d6,d6,d5
+ vadd.i32 d8,d8,d7
+
+ vst4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]!
+ vst4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]!
+ vst4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
+ vst4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
+ vst1.32 {d8[0]},[r6,:32]
+ vst1.32 {d8[1]},[r7,:32]
+
+ b .Lsquare_neon
+
+.align 4
+.Lsquare_break_neon:
+ add r6,r0,#(48+2*4*9)
+ add r7,r0,#(48+3*4*9)
+
+ vmov d0,d10 @ r^4:r^3
+ vshl.u32 d2,d12,#2 @ *5
+ vmov d1,d12
+ vshl.u32 d4,d14,#2
+ vmov d3,d14
+ vshl.u32 d6,d16,#2
+ vmov d5,d16
+ vshl.u32 d8,d18,#2
+ vmov d7,d18
+ vadd.i32 d2,d2,d12
+ vadd.i32 d4,d4,d14
+ vadd.i32 d6,d6,d16
+ vadd.i32 d8,d8,d18
+
+ vst4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]!
+ vst4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]!
+ vst4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
+ vst4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
+ vst1.32 {d8[0]},[r6]
+ vst1.32 {d8[1]},[r7]
+
+ bx lr @ bx lr
+.size poly1305_init_neon,.-poly1305_init_neon
+
+.type poly1305_blocks_neon,%function
+.align 5
+poly1305_blocks_neon:
+ ldr ip,[r0,#36] @ is_base2_26
+ ands r2,r2,#-16
+ beq .Lno_data_neon
+
+ cmp r2,#64
+ bhs .Lenter_neon
+ tst ip,ip @ is_base2_26?
+ beq .Lpoly1305_blocks
+
+.Lenter_neon:
+ stmdb sp!,{r4-r7}
+ vstmdb sp!,{d8-d15} @ ABI specification says so
+
+ tst ip,ip @ is_base2_26?
+ bne .Lbase2_26_neon
+
+ stmdb sp!,{r1-r3,lr}
+ bl poly1305_init_neon
+
+ ldr r4,[r0,#0] @ load hash value base 2^32
+ ldr r5,[r0,#4]
+ ldr r6,[r0,#8]
+ ldr r7,[r0,#12]
+ ldr ip,[r0,#16]
+
+ and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
+ mov r3,r4,lsr#26
+ veor d10,d10,d10
+ mov r4,r5,lsr#20
+ orr r3,r3,r5,lsl#6
+ veor d12,d12,d12
+ mov r5,r6,lsr#14
+ orr r4,r4,r6,lsl#12
+ veor d14,d14,d14
+ mov r6,r7,lsr#8
+ orr r5,r5,r7,lsl#18
+ veor d16,d16,d16
+ and r3,r3,#0x03ffffff
+ orr r6,r6,ip,lsl#24
+ veor d18,d18,d18
+ and r4,r4,#0x03ffffff
+ mov r1,#1
+ and r5,r5,#0x03ffffff
+ str r1,[r0,#36] @ is_base2_26
+
+ vmov.32 d10[0],r2
+ vmov.32 d12[0],r3
+ vmov.32 d14[0],r4
+ vmov.32 d16[0],r5
+ vmov.32 d18[0],r6
+ adr r5,.Lzeros
+
+ ldmia sp!,{r1-r3,lr}
+ b .Lbase2_32_neon
+
+.align 4
+.Lbase2_26_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ load hash value
+
+ veor d10,d10,d10
+ veor d12,d12,d12
+ veor d14,d14,d14
+ veor d16,d16,d16
+ veor d18,d18,d18
+ vld4.32 {d10[0],d12[0],d14[0],d16[0]},[r0]!
+ adr r5,.Lzeros
+ vld1.32 {d18[0]},[r0]
+ sub r0,r0,#16 @ rewind
+
+.Lbase2_32_neon:
+ add r4,r1,#32
+ mov r3,r3,lsl#24
+ tst r2,#31
+ beq .Leven
+
+ vld4.32 {d20[0],d22[0],d24[0],d26[0]},[r1]!
+ vmov.32 d28[0],r3
+ sub r2,r2,#16
+ add r4,r1,#32
+
+# ifdef __ARMEB__
+ vrev32.8 q10,q10
+ vrev32.8 q13,q13
+ vrev32.8 q11,q11
+ vrev32.8 q12,q12
+# endif
+ vsri.u32 d28,d26,#8 @ base 2^32 -> base 2^26
+ vshl.u32 d26,d26,#18
+
+ vsri.u32 d26,d24,#14
+ vshl.u32 d24,d24,#12
+ vadd.i32 d29,d28,d18 @ add hash value and move to #hi
+
+ vbic.i32 d26,#0xfc000000
+ vsri.u32 d24,d22,#20
+ vshl.u32 d22,d22,#6
+
+ vbic.i32 d24,#0xfc000000
+ vsri.u32 d22,d20,#26
+ vadd.i32 d27,d26,d16
+
+ vbic.i32 d20,#0xfc000000
+ vbic.i32 d22,#0xfc000000
+ vadd.i32 d25,d24,d14
+
+ vadd.i32 d21,d20,d10
+ vadd.i32 d23,d22,d12
+
+ mov r7,r5
+ add r6,r0,#48
+
+ cmp r2,r2
+ b .Long_tail
+
+.align 4
+.Leven:
+ subs r2,r2,#64
+ it lo
+ movlo r4,r5
+
+ vmov.i32 q14,#1<<24 @ padbit, yes, always
+ vld4.32 {d20,d22,d24,d26},[r1] @ inp[0:1]
+ add r1,r1,#64
+ vld4.32 {d21,d23,d25,d27},[r4] @ inp[2:3] (or 0)
+ add r4,r4,#64
+ itt hi
+ addhi r7,r0,#(48+1*9*4)
+ addhi r6,r0,#(48+3*9*4)
+
+# ifdef __ARMEB__
+ vrev32.8 q10,q10
+ vrev32.8 q13,q13
+ vrev32.8 q11,q11
+ vrev32.8 q12,q12
+# endif
+ vsri.u32 q14,q13,#8 @ base 2^32 -> base 2^26
+ vshl.u32 q13,q13,#18
+
+ vsri.u32 q13,q12,#14
+ vshl.u32 q12,q12,#12
+
+ vbic.i32 q13,#0xfc000000
+ vsri.u32 q12,q11,#20
+ vshl.u32 q11,q11,#6
+
+ vbic.i32 q12,#0xfc000000
+ vsri.u32 q11,q10,#26
+
+ vbic.i32 q10,#0xfc000000
+ vbic.i32 q11,#0xfc000000
+
+ bls .Lskip_loop
+
+ vld4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^2
+ vld4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^4
+ vld4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
+ vld4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
+ b .Loop_neon
+
+.align 5
+.Loop_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
+ @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
+ @ ___________________/
+ @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
+ @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
+ @ ___________________/ ____________________/
+ @
+ @ Note that we start with inp[2:3]*r^2. This is because it
+ @ doesn't depend on reduction in previous iteration.
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
+ @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
+ @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
+ @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
+ @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ inp[2:3]*r^2
+
+ vadd.i32 d24,d24,d14 @ accumulate inp[0:1]
+ vmull.u32 q7,d25,d0[1]
+ vadd.i32 d20,d20,d10
+ vmull.u32 q5,d21,d0[1]
+ vadd.i32 d26,d26,d16
+ vmull.u32 q8,d27,d0[1]
+ vmlal.u32 q7,d23,d1[1]
+ vadd.i32 d22,d22,d12
+ vmull.u32 q6,d23,d0[1]
+
+ vadd.i32 d28,d28,d18
+ vmull.u32 q9,d29,d0[1]
+ subs r2,r2,#64
+ vmlal.u32 q5,d29,d2[1]
+ it lo
+ movlo r4,r5
+ vmlal.u32 q8,d25,d1[1]
+ vld1.32 d8[1],[r7,:32]
+ vmlal.u32 q6,d21,d1[1]
+ vmlal.u32 q9,d27,d1[1]
+
+ vmlal.u32 q5,d27,d4[1]
+ vmlal.u32 q8,d23,d3[1]
+ vmlal.u32 q9,d25,d3[1]
+ vmlal.u32 q6,d29,d4[1]
+ vmlal.u32 q7,d21,d3[1]
+
+ vmlal.u32 q8,d21,d5[1]
+ vmlal.u32 q5,d25,d6[1]
+ vmlal.u32 q9,d23,d5[1]
+ vmlal.u32 q6,d27,d6[1]
+ vmlal.u32 q7,d29,d6[1]
+
+ vmlal.u32 q8,d29,d8[1]
+ vmlal.u32 q5,d23,d8[1]
+ vmlal.u32 q9,d21,d7[1]
+ vmlal.u32 q6,d25,d8[1]
+ vmlal.u32 q7,d27,d8[1]
+
+ vld4.32 {d21,d23,d25,d27},[r4] @ inp[2:3] (or 0)
+ add r4,r4,#64
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ (hash+inp[0:1])*r^4 and accumulate
+
+ vmlal.u32 q8,d26,d0[0]
+ vmlal.u32 q5,d20,d0[0]
+ vmlal.u32 q9,d28,d0[0]
+ vmlal.u32 q6,d22,d0[0]
+ vmlal.u32 q7,d24,d0[0]
+ vld1.32 d8[0],[r6,:32]
+
+ vmlal.u32 q8,d24,d1[0]
+ vmlal.u32 q5,d28,d2[0]
+ vmlal.u32 q9,d26,d1[0]
+ vmlal.u32 q6,d20,d1[0]
+ vmlal.u32 q7,d22,d1[0]
+
+ vmlal.u32 q8,d22,d3[0]
+ vmlal.u32 q5,d26,d4[0]
+ vmlal.u32 q9,d24,d3[0]
+ vmlal.u32 q6,d28,d4[0]
+ vmlal.u32 q7,d20,d3[0]
+
+ vmlal.u32 q8,d20,d5[0]
+ vmlal.u32 q5,d24,d6[0]
+ vmlal.u32 q9,d22,d5[0]
+ vmlal.u32 q6,d26,d6[0]
+ vmlal.u32 q8,d28,d8[0]
+
+ vmlal.u32 q7,d28,d6[0]
+ vmlal.u32 q5,d22,d8[0]
+ vmlal.u32 q9,d20,d7[0]
+ vmov.i32 q14,#1<<24 @ padbit, yes, always
+ vmlal.u32 q6,d24,d8[0]
+ vmlal.u32 q7,d26,d8[0]
+
+ vld4.32 {d20,d22,d24,d26},[r1] @ inp[0:1]
+ add r1,r1,#64
+# ifdef __ARMEB__
+ vrev32.8 q10,q10
+ vrev32.8 q11,q11
+ vrev32.8 q12,q12
+ vrev32.8 q13,q13
+# endif
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ lazy reduction interleaved with base 2^32 -> base 2^26 of
+ @ inp[0:3] previously loaded to q10-q13 and smashed to q10-q14.
+
+ vshr.u64 q15,q8,#26
+ vmovn.i64 d16,q8
+ vshr.u64 q4,q5,#26
+ vmovn.i64 d10,q5
+ vadd.i64 q9,q9,q15 @ h3 -> h4
+ vbic.i32 d16,#0xfc000000
+ vsri.u32 q14,q13,#8 @ base 2^32 -> base 2^26
+ vadd.i64 q6,q6,q4 @ h0 -> h1
+ vshl.u32 q13,q13,#18
+ vbic.i32 d10,#0xfc000000
+
+ vshrn.u64 d30,q9,#26
+ vmovn.i64 d18,q9
+ vshr.u64 q4,q6,#26
+ vmovn.i64 d12,q6
+ vadd.i64 q7,q7,q4 @ h1 -> h2
+ vsri.u32 q13,q12,#14
+ vbic.i32 d18,#0xfc000000
+ vshl.u32 q12,q12,#12
+ vbic.i32 d12,#0xfc000000
+
+ vadd.i32 d10,d10,d30
+ vshl.u32 d30,d30,#2
+ vbic.i32 q13,#0xfc000000
+ vshrn.u64 d8,q7,#26
+ vmovn.i64 d14,q7
+ vaddl.u32 q5,d10,d30 @ h4 -> h0 [widen for a sec]
+ vsri.u32 q12,q11,#20
+ vadd.i32 d16,d16,d8 @ h2 -> h3
+ vshl.u32 q11,q11,#6
+ vbic.i32 d14,#0xfc000000
+ vbic.i32 q12,#0xfc000000
+
+ vshrn.u64 d30,q5,#26 @ re-narrow
+ vmovn.i64 d10,q5
+ vsri.u32 q11,q10,#26
+ vbic.i32 q10,#0xfc000000
+ vshr.u32 d8,d16,#26
+ vbic.i32 d16,#0xfc000000
+ vbic.i32 d10,#0xfc000000
+ vadd.i32 d12,d12,d30 @ h0 -> h1
+ vadd.i32 d18,d18,d8 @ h3 -> h4
+ vbic.i32 q11,#0xfc000000
+
+ bhi .Loop_neon
+
+.Lskip_loop:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
+
+ add r7,r0,#(48+0*9*4)
+ add r6,r0,#(48+1*9*4)
+ adds r2,r2,#32
+ it ne
+ movne r2,#0
+ bne .Long_tail
+
+ vadd.i32 d25,d24,d14 @ add hash value and move to #hi
+ vadd.i32 d21,d20,d10
+ vadd.i32 d27,d26,d16
+ vadd.i32 d23,d22,d12
+ vadd.i32 d29,d28,d18
+
+.Long_tail:
+ vld4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^1
+ vld4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^2
+
+ vadd.i32 d24,d24,d14 @ can be redundant
+ vmull.u32 q7,d25,d0
+ vadd.i32 d20,d20,d10
+ vmull.u32 q5,d21,d0
+ vadd.i32 d26,d26,d16
+ vmull.u32 q8,d27,d0
+ vadd.i32 d22,d22,d12
+ vmull.u32 q6,d23,d0
+ vadd.i32 d28,d28,d18
+ vmull.u32 q9,d29,d0
+
+ vmlal.u32 q5,d29,d2
+ vld4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
+ vmlal.u32 q8,d25,d1
+ vld4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
+ vmlal.u32 q6,d21,d1
+ vmlal.u32 q9,d27,d1
+ vmlal.u32 q7,d23,d1
+
+ vmlal.u32 q8,d23,d3
+ vld1.32 d8[1],[r7,:32]
+ vmlal.u32 q5,d27,d4
+ vld1.32 d8[0],[r6,:32]
+ vmlal.u32 q9,d25,d3
+ vmlal.u32 q6,d29,d4
+ vmlal.u32 q7,d21,d3
+
+ vmlal.u32 q8,d21,d5
+ it ne
+ addne r7,r0,#(48+2*9*4)
+ vmlal.u32 q5,d25,d6
+ it ne
+ addne r6,r0,#(48+3*9*4)
+ vmlal.u32 q9,d23,d5
+ vmlal.u32 q6,d27,d6
+ vmlal.u32 q7,d29,d6
+
+ vmlal.u32 q8,d29,d8
+ vorn q0,q0,q0 @ all-ones, can be redundant
+ vmlal.u32 q5,d23,d8
+ vshr.u64 q0,q0,#38
+ vmlal.u32 q9,d21,d7
+ vmlal.u32 q6,d25,d8
+ vmlal.u32 q7,d27,d8
+
+ beq .Lshort_tail
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ (hash+inp[0:1])*r^4:r^3 and accumulate
+
+ vld4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^3
+ vld4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^4
+
+ vmlal.u32 q7,d24,d0
+ vmlal.u32 q5,d20,d0
+ vmlal.u32 q8,d26,d0
+ vmlal.u32 q6,d22,d0
+ vmlal.u32 q9,d28,d0
+
+ vmlal.u32 q5,d28,d2
+ vld4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
+ vmlal.u32 q8,d24,d1
+ vld4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
+ vmlal.u32 q6,d20,d1
+ vmlal.u32 q9,d26,d1
+ vmlal.u32 q7,d22,d1
+
+ vmlal.u32 q8,d22,d3
+ vld1.32 d8[1],[r7,:32]
+ vmlal.u32 q5,d26,d4
+ vld1.32 d8[0],[r6,:32]
+ vmlal.u32 q9,d24,d3
+ vmlal.u32 q6,d28,d4
+ vmlal.u32 q7,d20,d3
+
+ vmlal.u32 q8,d20,d5
+ vmlal.u32 q5,d24,d6
+ vmlal.u32 q9,d22,d5
+ vmlal.u32 q6,d26,d6
+ vmlal.u32 q7,d28,d6
+
+ vmlal.u32 q8,d28,d8
+ vorn q0,q0,q0 @ all-ones
+ vmlal.u32 q5,d22,d8
+ vshr.u64 q0,q0,#38
+ vmlal.u32 q9,d20,d7
+ vmlal.u32 q6,d24,d8
+ vmlal.u32 q7,d26,d8
+
+.Lshort_tail:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ horizontal addition
+
+ vadd.i64 d16,d16,d17
+ vadd.i64 d10,d10,d11
+ vadd.i64 d18,d18,d19
+ vadd.i64 d12,d12,d13
+ vadd.i64 d14,d14,d15
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ lazy reduction, but without narrowing
+
+ vshr.u64 q15,q8,#26
+ vand.i64 q8,q8,q0
+ vshr.u64 q4,q5,#26
+ vand.i64 q5,q5,q0
+ vadd.i64 q9,q9,q15 @ h3 -> h4
+ vadd.i64 q6,q6,q4 @ h0 -> h1
+
+ vshr.u64 q15,q9,#26
+ vand.i64 q9,q9,q0
+ vshr.u64 q4,q6,#26
+ vand.i64 q6,q6,q0
+ vadd.i64 q7,q7,q4 @ h1 -> h2
+
+ vadd.i64 q5,q5,q15
+ vshl.u64 q15,q15,#2
+ vshr.u64 q4,q7,#26
+ vand.i64 q7,q7,q0
+ vadd.i64 q5,q5,q15 @ h4 -> h0
+ vadd.i64 q8,q8,q4 @ h2 -> h3
+
+ vshr.u64 q15,q5,#26
+ vand.i64 q5,q5,q0
+ vshr.u64 q4,q8,#26
+ vand.i64 q8,q8,q0
+ vadd.i64 q6,q6,q15 @ h0 -> h1
+ vadd.i64 q9,q9,q4 @ h3 -> h4
+
+ cmp r2,#0
+ bne .Leven
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ store hash value
+
+ vst4.32 {d10[0],d12[0],d14[0],d16[0]},[r0]!
+ vst1.32 {d18[0]},[r0]
+
+ vldmia sp!,{d8-d15} @ epilogue
+ ldmia sp!,{r4-r7}
+.Lno_data_neon:
+ bx lr @ bx lr
+.size poly1305_blocks_neon,.-poly1305_blocks_neon
+
+.type poly1305_emit_neon,%function
+.align 5
+poly1305_emit_neon:
+ ldr ip,[r0,#36] @ is_base2_26
+
+ stmdb sp!,{r4-r11}
+
+ tst ip,ip
+ beq .Lpoly1305_emit_enter
+
+ ldmia r0,{r3-r7}
+ eor r8,r8,r8
+
+ adds r3,r3,r4,lsl#26 @ base 2^26 -> base 2^32
+ mov r4,r4,lsr#6
+ adcs r4,r4,r5,lsl#20
+ mov r5,r5,lsr#12
+ adcs r5,r5,r6,lsl#14
+ mov r6,r6,lsr#18
+ adcs r6,r6,r7,lsl#8
+ adc r7,r8,r7,lsr#24 @ can be partially reduced ...
+
+ and r8,r7,#-4 @ ... so reduce
+ and r7,r6,#3
+ add r8,r8,r8,lsr#2 @ *= 5
+ adds r3,r3,r8
+ adcs r4,r4,#0
+ adcs r5,r5,#0
+ adcs r6,r6,#0
+ adc r7,r7,#0
+
+ adds r8,r3,#5 @ compare to modulus
+ adcs r9,r4,#0
+ adcs r10,r5,#0
+ adcs r11,r6,#0
+ adc r7,r7,#0
+ tst r7,#4 @ did it carry/borrow?
+
+ it ne
+ movne r3,r8
+ ldr r8,[r2,#0]
+ it ne
+ movne r4,r9
+ ldr r9,[r2,#4]
+ it ne
+ movne r5,r10
+ ldr r10,[r2,#8]
+ it ne
+ movne r6,r11
+ ldr r11,[r2,#12]
+
+ adds r3,r3,r8 @ accumulate nonce
+ adcs r4,r4,r9
+ adcs r5,r5,r10
+ adc r6,r6,r11
+
+# ifdef __ARMEB__
+ rev r3,r3
+ rev r4,r4
+ rev r5,r5
+ rev r6,r6
+# endif
+ str r3,[r1,#0] @ store the result
+ str r4,[r1,#4]
+ str r5,[r1,#8]
+ str r6,[r1,#12]
+
+ ldmia sp!,{r4-r11}
+ bx lr @ bx lr
+.size poly1305_emit_neon,.-poly1305_emit_neon
+
+.align 5
+.Lzeros:
+.long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+.LOPENSSL_armcap:
+.word OPENSSL_armcap_P-.Lpoly1305_init
+#endif
+.asciz "Poly1305 for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>"
+.align 2
+#if __ARM_MAX_ARCH__>=7
+.comm OPENSSL_armcap_P,4,4
+#endif
diff --git a/lib/zinc/poly1305/poly1305-arm64-cryptogams.S b/lib/zinc/poly1305/poly1305-arm64-cryptogams.S
new file mode 100644
index 000000000000..0ecb50a83ec0
--- /dev/null
+++ b/lib/zinc/poly1305/poly1305-arm64-cryptogams.S
@@ -0,0 +1,869 @@
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
+/*
+ * Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
+ */
+
+#include "arm_arch.h"
+
+.text
+
+// forward "declarations" are required for Apple
+
+.globl poly1305_blocks
+.globl poly1305_emit
+
+.globl poly1305_init
+.type poly1305_init,%function
+.align 5
+poly1305_init:
+ cmp x1,xzr
+ stp xzr,xzr,[x0] // zero hash value
+ stp xzr,xzr,[x0,#16] // [along with is_base2_26]
+
+ csel x0,xzr,x0,eq
+ b.eq .Lno_key
+
+#ifdef __ILP32__
+ ldrsw x11,.LOPENSSL_armcap_P
+#else
+ ldr x11,.LOPENSSL_armcap_P
+#endif
+ adr x10,.LOPENSSL_armcap_P
+
+ ldp x7,x8,[x1] // load key
+ mov x9,#0xfffffffc0fffffff
+ movk x9,#0x0fff,lsl#48
+ ldr w17,[x10,x11]
+#ifdef __ARMEB__
+ rev x7,x7 // flip bytes
+ rev x8,x8
+#endif
+ and x7,x7,x9 // &=0ffffffc0fffffff
+ and x9,x9,#-4
+ and x8,x8,x9 // &=0ffffffc0ffffffc
+ stp x7,x8,[x0,#32] // save key value
+
+ tst w17,#ARMV7_NEON
+
+ adr x12,poly1305_blocks
+ adr x7,poly1305_blocks_neon
+ adr x13,poly1305_emit
+ adr x8,poly1305_emit_neon
+
+ csel x12,x12,x7,eq
+ csel x13,x13,x8,eq
+
+#ifdef __ILP32__
+ stp w12,w13,[x2]
+#else
+ stp x12,x13,[x2]
+#endif
+
+ mov x0,#1
+.Lno_key:
+ ret
+.size poly1305_init,.-poly1305_init
+
+.type poly1305_blocks,%function
+.align 5
+poly1305_blocks:
+ ands x2,x2,#-16
+ b.eq .Lno_data
+
+ ldp x4,x5,[x0] // load hash value
+ ldp x7,x8,[x0,#32] // load key value
+ ldr x6,[x0,#16]
+ add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
+ b .Loop
+
+.align 5
+.Loop:
+ ldp x10,x11,[x1],#16 // load input
+ sub x2,x2,#16
+#ifdef __ARMEB__
+ rev x10,x10
+ rev x11,x11
+#endif
+ adds x4,x4,x10 // accumulate input
+ adcs x5,x5,x11
+
+ mul x12,x4,x7 // h0*r0
+ adc x6,x6,x3
+ umulh x13,x4,x7
+
+ mul x10,x5,x9 // h1*5*r1
+ umulh x11,x5,x9
+
+ adds x12,x12,x10
+ mul x10,x4,x8 // h0*r1
+ adc x13,x13,x11
+ umulh x14,x4,x8
+
+ adds x13,x13,x10
+ mul x10,x5,x7 // h1*r0
+ adc x14,x14,xzr
+ umulh x11,x5,x7
+
+ adds x13,x13,x10
+ mul x10,x6,x9 // h2*5*r1
+ adc x14,x14,x11
+ mul x11,x6,x7 // h2*r0
+
+ adds x13,x13,x10
+ adc x14,x14,x11
+
+ and x10,x14,#-4 // final reduction
+ and x6,x14,#3
+ add x10,x10,x14,lsr#2
+ adds x4,x12,x10
+ adcs x5,x13,xzr
+ adc x6,x6,xzr
+
+ cbnz x2,.Loop
+
+ stp x4,x5,[x0] // store hash value
+ str x6,[x0,#16]
+
+.Lno_data:
+ ret
+.size poly1305_blocks,.-poly1305_blocks
+
+.type poly1305_emit,%function
+.align 5
+poly1305_emit:
+ ldp x4,x5,[x0] // load hash base 2^64
+ ldr x6,[x0,#16]
+ ldp x10,x11,[x2] // load nonce
+
+ adds x12,x4,#5 // compare to modulus
+ adcs x13,x5,xzr
+ adc x14,x6,xzr
+
+ tst x14,#-4 // see if it's carried/borrowed
+
+ csel x4,x4,x12,eq
+ csel x5,x5,x13,eq
+
+#ifdef __ARMEB__
+ ror x10,x10,#32 // flip nonce words
+ ror x11,x11,#32
+#endif
+ adds x4,x4,x10 // accumulate nonce
+ adc x5,x5,x11
+#ifdef __ARMEB__
+ rev x4,x4 // flip output bytes
+ rev x5,x5
+#endif
+ stp x4,x5,[x1] // write result
+
+ ret
+.size poly1305_emit,.-poly1305_emit
+.type poly1305_mult,%function
+.align 5
+poly1305_mult:
+ mul x12,x4,x7 // h0*r0
+ umulh x13,x4,x7
+
+ mul x10,x5,x9 // h1*5*r1
+ umulh x11,x5,x9
+
+ adds x12,x12,x10
+ mul x10,x4,x8 // h0*r1
+ adc x13,x13,x11
+ umulh x14,x4,x8
+
+ adds x13,x13,x10
+ mul x10,x5,x7 // h1*r0
+ adc x14,x14,xzr
+ umulh x11,x5,x7
+
+ adds x13,x13,x10
+ mul x10,x6,x9 // h2*5*r1
+ adc x14,x14,x11
+ mul x11,x6,x7 // h2*r0
+
+ adds x13,x13,x10
+ adc x14,x14,x11
+
+ and x10,x14,#-4 // final reduction
+ and x6,x14,#3
+ add x10,x10,x14,lsr#2
+ adds x4,x12,x10
+ adcs x5,x13,xzr
+ adc x6,x6,xzr
+
+ ret
+.size poly1305_mult,.-poly1305_mult
+
+.type poly1305_splat,%function
+.align 5
+poly1305_splat:
+ and x12,x4,#0x03ffffff // base 2^64 -> base 2^26
+ ubfx x13,x4,#26,#26
+ extr x14,x5,x4,#52
+ and x14,x14,#0x03ffffff
+ ubfx x15,x5,#14,#26
+ extr x16,x6,x5,#40
+
+ str w12,[x0,#16*0] // r0
+ add w12,w13,w13,lsl#2 // r1*5
+ str w13,[x0,#16*1] // r1
+ add w13,w14,w14,lsl#2 // r2*5
+ str w12,[x0,#16*2] // s1
+ str w14,[x0,#16*3] // r2
+ add w14,w15,w15,lsl#2 // r3*5
+ str w13,[x0,#16*4] // s2
+ str w15,[x0,#16*5] // r3
+ add w15,w16,w16,lsl#2 // r4*5
+ str w14,[x0,#16*6] // s3
+ str w16,[x0,#16*7] // r4
+ str w15,[x0,#16*8] // s4
+
+ ret
+.size poly1305_splat,.-poly1305_splat
+
+.type poly1305_blocks_neon,%function
+.align 5
+poly1305_blocks_neon:
+ ldr x17,[x0,#24]
+ cmp x2,#128
+ b.hs .Lblocks_neon
+ cbz x17,poly1305_blocks
+
+.Lblocks_neon:
+ stp x29,x30,[sp,#-80]!
+ add x29,sp,#0
+
+ ands x2,x2,#-16
+ b.eq .Lno_data_neon
+
+ cbz x17,.Lbase2_64_neon
+
+ ldp w10,w11,[x0] // load hash value base 2^26
+ ldp w12,w13,[x0,#8]
+ ldr w14,[x0,#16]
+
+ tst x2,#31
+ b.eq .Leven_neon
+
+ ldp x7,x8,[x0,#32] // load key value
+
+ add x4,x10,x11,lsl#26 // base 2^26 -> base 2^64
+ lsr x5,x12,#12
+ adds x4,x4,x12,lsl#52
+ add x5,x5,x13,lsl#14
+ adc x5,x5,xzr
+ lsr x6,x14,#24
+ adds x5,x5,x14,lsl#40
+ adc x14,x6,xzr // can be partially reduced...
+
+ ldp x12,x13,[x1],#16 // load input
+ sub x2,x2,#16
+ add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
+
+ and x10,x14,#-4 // ... so reduce
+ and x6,x14,#3
+ add x10,x10,x14,lsr#2
+ adds x4,x4,x10
+ adcs x5,x5,xzr
+ adc x6,x6,xzr
+
+#ifdef __ARMEB__
+ rev x12,x12
+ rev x13,x13
+#endif
+ adds x4,x4,x12 // accumulate input
+ adcs x5,x5,x13
+ adc x6,x6,x3
+
+ bl poly1305_mult
+ ldr x30,[sp,#8]
+
+ cbz x3,.Lstore_base2_64_neon
+
+ and x10,x4,#0x03ffffff // base 2^64 -> base 2^26
+ ubfx x11,x4,#26,#26
+ extr x12,x5,x4,#52
+ and x12,x12,#0x03ffffff
+ ubfx x13,x5,#14,#26
+ extr x14,x6,x5,#40
+
+ cbnz x2,.Leven_neon
+
+ stp w10,w11,[x0] // store hash value base 2^26
+ stp w12,w13,[x0,#8]
+ str w14,[x0,#16]
+ b .Lno_data_neon
+
+.align 4
+.Lstore_base2_64_neon:
+ stp x4,x5,[x0] // store hash value base 2^64
+ stp x6,xzr,[x0,#16] // note that is_base2_26 is zeroed
+ b .Lno_data_neon
+
+.align 4
+.Lbase2_64_neon:
+ ldp x7,x8,[x0,#32] // load key value
+
+ ldp x4,x5,[x0] // load hash value base 2^64
+ ldr x6,[x0,#16]
+
+ tst x2,#31
+ b.eq .Linit_neon
+
+ ldp x12,x13,[x1],#16 // load input
+ sub x2,x2,#16
+ add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
+#ifdef __ARMEB__
+ rev x12,x12
+ rev x13,x13
+#endif
+ adds x4,x4,x12 // accumulate input
+ adcs x5,x5,x13
+ adc x6,x6,x3
+
+ bl poly1305_mult
+
+.Linit_neon:
+ and x10,x4,#0x03ffffff // base 2^64 -> base 2^26
+ ubfx x11,x4,#26,#26
+ extr x12,x5,x4,#52
+ and x12,x12,#0x03ffffff
+ ubfx x13,x5,#14,#26
+ extr x14,x6,x5,#40
+
+ stp d8,d9,[sp,#16] // meet ABI requirements
+ stp d10,d11,[sp,#32]
+ stp d12,d13,[sp,#48]
+ stp d14,d15,[sp,#64]
+
+ fmov d24,x10
+ fmov d25,x11
+ fmov d26,x12
+ fmov d27,x13
+ fmov d28,x14
+
+ ////////////////////////////////// initialize r^n table
+ mov x4,x7 // r^1
+ add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
+ mov x5,x8
+ mov x6,xzr
+ add x0,x0,#48+12
+ bl poly1305_splat
+
+ bl poly1305_mult // r^2
+ sub x0,x0,#4
+ bl poly1305_splat
+
+ bl poly1305_mult // r^3
+ sub x0,x0,#4
+ bl poly1305_splat
+
+ bl poly1305_mult // r^4
+ sub x0,x0,#4
+ bl poly1305_splat
+ ldr x30,[sp,#8]
+
+ add x16,x1,#32
+ adr x17,.Lzeros
+ subs x2,x2,#64
+ csel x16,x17,x16,lo
+
+ mov x4,#1
+ str x4,[x0,#-24] // set is_base2_26
+ sub x0,x0,#48 // restore original x0
+ b .Ldo_neon
+
+.align 4
+.Leven_neon:
+ add x16,x1,#32
+ adr x17,.Lzeros
+ subs x2,x2,#64
+ csel x16,x17,x16,lo
+
+ stp d8,d9,[sp,#16] // meet ABI requirements
+ stp d10,d11,[sp,#32]
+ stp d12,d13,[sp,#48]
+ stp d14,d15,[sp,#64]
+
+ fmov d24,x10
+ fmov d25,x11
+ fmov d26,x12
+ fmov d27,x13
+ fmov d28,x14
+
+.Ldo_neon:
+ ldp x8,x12,[x16],#16 // inp[2:3] (or zero)
+ ldp x9,x13,[x16],#48
+
+ lsl x3,x3,#24
+ add x15,x0,#48
+
+#ifdef __ARMEB__
+ rev x8,x8
+ rev x12,x12
+ rev x9,x9
+ rev x13,x13
+#endif
+ and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
+ and x5,x9,#0x03ffffff
+ ubfx x6,x8,#26,#26
+ ubfx x7,x9,#26,#26
+ add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
+ extr x8,x12,x8,#52
+ extr x9,x13,x9,#52
+ add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
+ fmov d14,x4
+ and x8,x8,#0x03ffffff
+ and x9,x9,#0x03ffffff
+ ubfx x10,x12,#14,#26
+ ubfx x11,x13,#14,#26
+ add x12,x3,x12,lsr#40
+ add x13,x3,x13,lsr#40
+ add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
+ fmov d15,x6
+ add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
+ add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
+ fmov d16,x8
+ fmov d17,x10
+ fmov d18,x12
+
+ ldp x8,x12,[x1],#16 // inp[0:1]
+ ldp x9,x13,[x1],#48
+
+ ld1 {v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64
+ ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64
+ ld1 {v8.4s},[x15]
+
+#ifdef __ARMEB__
+ rev x8,x8
+ rev x12,x12
+ rev x9,x9
+ rev x13,x13
+#endif
+ and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
+ and x5,x9,#0x03ffffff
+ ubfx x6,x8,#26,#26
+ ubfx x7,x9,#26,#26
+ add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
+ extr x8,x12,x8,#52
+ extr x9,x13,x9,#52
+ add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
+ fmov d9,x4
+ and x8,x8,#0x03ffffff
+ and x9,x9,#0x03ffffff
+ ubfx x10,x12,#14,#26
+ ubfx x11,x13,#14,#26
+ add x12,x3,x12,lsr#40
+ add x13,x3,x13,lsr#40
+ add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
+ fmov d10,x6
+ add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
+ add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
+ movi v31.2d,#-1
+ fmov d11,x8
+ fmov d12,x10
+ fmov d13,x12
+ ushr v31.2d,v31.2d,#38
+
+ b.ls .Lskip_loop
+
+.align 4
+.Loop_neon:
+ ////////////////////////////////////////////////////////////////
+ // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
+ // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
+ // ___________________/
+ // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
+ // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
+ // ___________________/ ____________________/
+ //
+ // Note that we start with inp[2:3]*r^2. This is because it
+ // doesn't depend on reduction in previous iteration.
+ ////////////////////////////////////////////////////////////////
+ // d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
+ // d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
+ // d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
+ // d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
+ // d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
+
+ subs x2,x2,#64
+ umull v23.2d,v14.2s,v7.s[2]
+ csel x16,x17,x16,lo
+ umull v22.2d,v14.2s,v5.s[2]
+ umull v21.2d,v14.2s,v3.s[2]
+ ldp x8,x12,[x16],#16 // inp[2:3] (or zero)
+ umull v20.2d,v14.2s,v1.s[2]
+ ldp x9,x13,[x16],#48
+ umull v19.2d,v14.2s,v0.s[2]
+#ifdef __ARMEB__
+ rev x8,x8
+ rev x12,x12
+ rev x9,x9
+ rev x13,x13
+#endif
+
+ umlal v23.2d,v15.2s,v5.s[2]
+ and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
+ umlal v22.2d,v15.2s,v3.s[2]
+ and x5,x9,#0x03ffffff
+ umlal v21.2d,v15.2s,v1.s[2]
+ ubfx x6,x8,#26,#26
+ umlal v20.2d,v15.2s,v0.s[2]
+ ubfx x7,x9,#26,#26
+ umlal v19.2d,v15.2s,v8.s[2]
+ add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
+
+ umlal v23.2d,v16.2s,v3.s[2]
+ extr x8,x12,x8,#52
+ umlal v22.2d,v16.2s,v1.s[2]
+ extr x9,x13,x9,#52
+ umlal v21.2d,v16.2s,v0.s[2]
+ add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
+ umlal v20.2d,v16.2s,v8.s[2]
+ fmov d14,x4
+ umlal v19.2d,v16.2s,v6.s[2]
+ and x8,x8,#0x03ffffff
+
+ umlal v23.2d,v17.2s,v1.s[2]
+ and x9,x9,#0x03ffffff
+ umlal v22.2d,v17.2s,v0.s[2]
+ ubfx x10,x12,#14,#26
+ umlal v21.2d,v17.2s,v8.s[2]
+ ubfx x11,x13,#14,#26
+ umlal v20.2d,v17.2s,v6.s[2]
+ add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
+ umlal v19.2d,v17.2s,v4.s[2]
+ fmov d15,x6
+
+ add v11.2s,v11.2s,v26.2s
+ add x12,x3,x12,lsr#40
+ umlal v23.2d,v18.2s,v0.s[2]
+ add x13,x3,x13,lsr#40
+ umlal v22.2d,v18.2s,v8.s[2]
+ add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
+ umlal v21.2d,v18.2s,v6.s[2]
+ add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
+ umlal v20.2d,v18.2s,v4.s[2]
+ fmov d16,x8
+ umlal v19.2d,v18.2s,v2.s[2]
+ fmov d17,x10
+
+ ////////////////////////////////////////////////////////////////
+ // (hash+inp[0:1])*r^4 and accumulate
+
+ add v9.2s,v9.2s,v24.2s
+ fmov d18,x12
+ umlal v22.2d,v11.2s,v1.s[0]
+ ldp x8,x12,[x1],#16 // inp[0:1]
+ umlal v19.2d,v11.2s,v6.s[0]
+ ldp x9,x13,[x1],#48
+ umlal v23.2d,v11.2s,v3.s[0]
+ umlal v20.2d,v11.2s,v8.s[0]
+ umlal v21.2d,v11.2s,v0.s[0]
+#ifdef __ARMEB__
+ rev x8,x8
+ rev x12,x12
+ rev x9,x9
+ rev x13,x13
+#endif
+
+ add v10.2s,v10.2s,v25.2s
+ umlal v22.2d,v9.2s,v5.s[0]
+ umlal v23.2d,v9.2s,v7.s[0]
+ and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
+ umlal v21.2d,v9.2s,v3.s[0]
+ and x5,x9,#0x03ffffff
+ umlal v19.2d,v9.2s,v0.s[0]
+ ubfx x6,x8,#26,#26
+ umlal v20.2d,v9.2s,v1.s[0]
+ ubfx x7,x9,#26,#26
+
+ add v12.2s,v12.2s,v27.2s
+ add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
+ umlal v22.2d,v10.2s,v3.s[0]
+ extr x8,x12,x8,#52
+ umlal v23.2d,v10.2s,v5.s[0]
+ extr x9,x13,x9,#52
+ umlal v19.2d,v10.2s,v8.s[0]
+ add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
+ umlal v21.2d,v10.2s,v1.s[0]
+ fmov d9,x4
+ umlal v20.2d,v10.2s,v0.s[0]
+ and x8,x8,#0x03ffffff
+
+ add v13.2s,v13.2s,v28.2s
+ and x9,x9,#0x03ffffff
+ umlal v22.2d,v12.2s,v0.s[0]
+ ubfx x10,x12,#14,#26
+ umlal v19.2d,v12.2s,v4.s[0]
+ ubfx x11,x13,#14,#26
+ umlal v23.2d,v12.2s,v1.s[0]
+ add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
+ umlal v20.2d,v12.2s,v6.s[0]
+ fmov d10,x6
+ umlal v21.2d,v12.2s,v8.s[0]
+ add x12,x3,x12,lsr#40
+
+ umlal v22.2d,v13.2s,v8.s[0]
+ add x13,x3,x13,lsr#40
+ umlal v19.2d,v13.2s,v2.s[0]
+ add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
+ umlal v23.2d,v13.2s,v0.s[0]
+ add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
+ umlal v20.2d,v13.2s,v4.s[0]
+ fmov d11,x8
+ umlal v21.2d,v13.2s,v6.s[0]
+ fmov d12,x10
+ fmov d13,x12
+
+ /////////////////////////////////////////////////////////////////
+ // lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
+ // and P. Schwabe
+ //
+ // [see discussion in poly1305-armv4 module]
+
+ ushr v29.2d,v22.2d,#26
+ xtn v27.2s,v22.2d
+ ushr v30.2d,v19.2d,#26
+ and v19.16b,v19.16b,v31.16b
+ add v23.2d,v23.2d,v29.2d // h3 -> h4
+ bic v27.2s,#0xfc,lsl#24 // &=0x03ffffff
+ add v20.2d,v20.2d,v30.2d // h0 -> h1
+
+ ushr v29.2d,v23.2d,#26
+ xtn v28.2s,v23.2d
+ ushr v30.2d,v20.2d,#26
+ xtn v25.2s,v20.2d
+ bic v28.2s,#0xfc,lsl#24
+ add v21.2d,v21.2d,v30.2d // h1 -> h2
+
+ add v19.2d,v19.2d,v29.2d
+ shl v29.2d,v29.2d,#2
+ shrn v30.2s,v21.2d,#26
+ xtn v26.2s,v21.2d
+ add v19.2d,v19.2d,v29.2d // h4 -> h0
+ bic v25.2s,#0xfc,lsl#24
+ add v27.2s,v27.2s,v30.2s // h2 -> h3
+ bic v26.2s,#0xfc,lsl#24
+
+ shrn v29.2s,v19.2d,#26
+ xtn v24.2s,v19.2d
+ ushr v30.2s,v27.2s,#26
+ bic v27.2s,#0xfc,lsl#24
+ bic v24.2s,#0xfc,lsl#24
+ add v25.2s,v25.2s,v29.2s // h0 -> h1
+ add v28.2s,v28.2s,v30.2s // h3 -> h4
+
+ b.hi .Loop_neon
+
+.Lskip_loop:
+ dup v16.2d,v16.d[0]
+ add v11.2s,v11.2s,v26.2s
+
+ ////////////////////////////////////////////////////////////////
+ // multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
+
+ adds x2,x2,#32
+ b.ne .Long_tail
+
+ dup v16.2d,v11.d[0]
+ add v14.2s,v9.2s,v24.2s
+ add v17.2s,v12.2s,v27.2s
+ add v15.2s,v10.2s,v25.2s
+ add v18.2s,v13.2s,v28.2s
+
+.Long_tail:
+ dup v14.2d,v14.d[0]
+ umull2 v19.2d,v16.4s,v6.4s
+ umull2 v22.2d,v16.4s,v1.4s
+ umull2 v23.2d,v16.4s,v3.4s
+ umull2 v21.2d,v16.4s,v0.4s
+ umull2 v20.2d,v16.4s,v8.4s
+
+ dup v15.2d,v15.d[0]
+ umlal2 v19.2d,v14.4s,v0.4s
+ umlal2 v21.2d,v14.4s,v3.4s
+ umlal2 v22.2d,v14.4s,v5.4s
+ umlal2 v23.2d,v14.4s,v7.4s
+ umlal2 v20.2d,v14.4s,v1.4s
+
+ dup v17.2d,v17.d[0]
+ umlal2 v19.2d,v15.4s,v8.4s
+ umlal2 v22.2d,v15.4s,v3.4s
+ umlal2 v21.2d,v15.4s,v1.4s
+ umlal2 v23.2d,v15.4s,v5.4s
+ umlal2 v20.2d,v15.4s,v0.4s
+
+ dup v18.2d,v18.d[0]
+ umlal2 v22.2d,v17.4s,v0.4s
+ umlal2 v23.2d,v17.4s,v1.4s
+ umlal2 v19.2d,v17.4s,v4.4s
+ umlal2 v20.2d,v17.4s,v6.4s
+ umlal2 v21.2d,v17.4s,v8.4s
+
+ umlal2 v22.2d,v18.4s,v8.4s
+ umlal2 v19.2d,v18.4s,v2.4s
+ umlal2 v23.2d,v18.4s,v0.4s
+ umlal2 v20.2d,v18.4s,v4.4s
+ umlal2 v21.2d,v18.4s,v6.4s
+
+ b.eq .Lshort_tail
+
+ ////////////////////////////////////////////////////////////////
+ // (hash+inp[0:1])*r^4:r^3 and accumulate
+
+ add v9.2s,v9.2s,v24.2s
+ umlal v22.2d,v11.2s,v1.2s
+ umlal v19.2d,v11.2s,v6.2s
+ umlal v23.2d,v11.2s,v3.2s
+ umlal v20.2d,v11.2s,v8.2s
+ umlal v21.2d,v11.2s,v0.2s
+
+ add v10.2s,v10.2s,v25.2s
+ umlal v22.2d,v9.2s,v5.2s
+ umlal v19.2d,v9.2s,v0.2s
+ umlal v23.2d,v9.2s,v7.2s
+ umlal v20.2d,v9.2s,v1.2s
+ umlal v21.2d,v9.2s,v3.2s
+
+ add v12.2s,v12.2s,v27.2s
+ umlal v22.2d,v10.2s,v3.2s
+ umlal v19.2d,v10.2s,v8.2s
+ umlal v23.2d,v10.2s,v5.2s
+ umlal v20.2d,v10.2s,v0.2s
+ umlal v21.2d,v10.2s,v1.2s
+
+ add v13.2s,v13.2s,v28.2s
+ umlal v22.2d,v12.2s,v0.2s
+ umlal v19.2d,v12.2s,v4.2s
+ umlal v23.2d,v12.2s,v1.2s
+ umlal v20.2d,v12.2s,v6.2s
+ umlal v21.2d,v12.2s,v8.2s
+
+ umlal v22.2d,v13.2s,v8.2s
+ umlal v19.2d,v13.2s,v2.2s
+ umlal v23.2d,v13.2s,v0.2s
+ umlal v20.2d,v13.2s,v4.2s
+ umlal v21.2d,v13.2s,v6.2s
+
+.Lshort_tail:
+ ////////////////////////////////////////////////////////////////
+ // horizontal add
+
+ addp v22.2d,v22.2d,v22.2d
+ ldp d8,d9,[sp,#16] // meet ABI requirements
+ addp v19.2d,v19.2d,v19.2d
+ ldp d10,d11,[sp,#32]
+ addp v23.2d,v23.2d,v23.2d
+ ldp d12,d13,[sp,#48]
+ addp v20.2d,v20.2d,v20.2d
+ ldp d14,d15,[sp,#64]
+ addp v21.2d,v21.2d,v21.2d
+
+ ////////////////////////////////////////////////////////////////
+ // lazy reduction, but without narrowing
+
+ ushr v29.2d,v22.2d,#26
+ and v22.16b,v22.16b,v31.16b
+ ushr v30.2d,v19.2d,#26
+ and v19.16b,v19.16b,v31.16b
+
+ add v23.2d,v23.2d,v29.2d // h3 -> h4
+ add v20.2d,v20.2d,v30.2d // h0 -> h1
+
+ ushr v29.2d,v23.2d,#26
+ and v23.16b,v23.16b,v31.16b
+ ushr v30.2d,v20.2d,#26
+ and v20.16b,v20.16b,v31.16b
+ add v21.2d,v21.2d,v30.2d // h1 -> h2
+
+ add v19.2d,v19.2d,v29.2d
+ shl v29.2d,v29.2d,#2
+ ushr v30.2d,v21.2d,#26
+ and v21.16b,v21.16b,v31.16b
+ add v19.2d,v19.2d,v29.2d // h4 -> h0
+ add v22.2d,v22.2d,v30.2d // h2 -> h3
+
+ ushr v29.2d,v19.2d,#26
+ and v19.16b,v19.16b,v31.16b
+ ushr v30.2d,v22.2d,#26
+ and v22.16b,v22.16b,v31.16b
+ add v20.2d,v20.2d,v29.2d // h0 -> h1
+ add v23.2d,v23.2d,v30.2d // h3 -> h4
+
+ ////////////////////////////////////////////////////////////////
+ // write the result, can be partially reduced
+
+ st4 {v19.s,v20.s,v21.s,v22.s}[0],[x0],#16
+ st1 {v23.s}[0],[x0]
+
+.Lno_data_neon:
+ ldr x29,[sp],#80
+ ret
+.size poly1305_blocks_neon,.-poly1305_blocks_neon
+
+.type poly1305_emit_neon,%function
+.align 5
+poly1305_emit_neon:
+ ldr x17,[x0,#24]
+ cbz x17,poly1305_emit
+
+ ldp w10,w11,[x0] // load hash value base 2^26
+ ldp w12,w13,[x0,#8]
+ ldr w14,[x0,#16]
+
+ add x4,x10,x11,lsl#26 // base 2^26 -> base 2^64
+ lsr x5,x12,#12
+ adds x4,x4,x12,lsl#52
+ add x5,x5,x13,lsl#14
+ adc x5,x5,xzr
+ lsr x6,x14,#24
+ adds x5,x5,x14,lsl#40
+ adc x6,x6,xzr // can be partially reduced...
+
+ ldp x10,x11,[x2] // load nonce
+
+ and x12,x6,#-4 // ... so reduce
+ add x12,x12,x6,lsr#2
+ and x6,x6,#3
+ adds x4,x4,x12
+ adcs x5,x5,xzr
+ adc x6,x6,xzr
+
+ adds x12,x4,#5 // compare to modulus
+ adcs x13,x5,xzr
+ adc x14,x6,xzr
+
+ tst x14,#-4 // see if it's carried/borrowed
+
+ csel x4,x4,x12,eq
+ csel x5,x5,x13,eq
+
+#ifdef __ARMEB__
+ ror x10,x10,#32 // flip nonce words
+ ror x11,x11,#32
+#endif
+ adds x4,x4,x10 // accumulate nonce
+ adc x5,x5,x11
+#ifdef __ARMEB__
+ rev x4,x4 // flip output bytes
+ rev x5,x5
+#endif
+ stp x4,x5,[x1] // write result
+
+ ret
+.size poly1305_emit_neon,.-poly1305_emit_neon
+
+.align 5
+.Lzeros:
+.long 0,0,0,0,0,0,0,0
+.LOPENSSL_armcap_P:
+#ifdef __ILP32__
+.long OPENSSL_armcap_P-.
+#else
+.quad OPENSSL_armcap_P-.
+#endif
+.byte 80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 2
+.align 2
--
2.19.0
* [PATCH net-next v6 12/23] zinc: Poly1305 ARM and ARM64 implementations
[not found] <20180925145622.29959-1-Jason@zx2c4.com>
` (3 preceding siblings ...)
2018-09-25 14:56 ` [PATCH net-next v6 11/23] zinc: import Andy Polyakov's Poly1305 " Jason A. Donenfeld
@ 2018-09-25 14:56 ` Jason A. Donenfeld
2018-09-25 14:56 ` [PATCH net-next v6 19/23] zinc: Curve25519 ARM implementation Jason A. Donenfeld
5 siblings, 0 replies; 47+ messages in thread
From: Jason A. Donenfeld @ 2018-09-25 14:56 UTC (permalink / raw)
To: linux-arm-kernel
These wire Andy Polyakov's implementations up to the kernel. We make a
few small changes to the assembly:
- Entries and exits use the proper kernel convention macro.
- CPU feature checking is done in C by the glue code, so that has been
removed from the assembly.
- The function names have been renamed to fit kernel conventions.
- Labels have been renamed to fit kernel conventions.
- The NEON code can jump to the scalar code when it makes sense to do
so (a short C sketch of that decision follows this list).
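
For orientation, here is roughly what that last point amounts to. At the entry of
poly1305_blocks_neon the assembly checks the input length and the is_base2_26 flag
(cmp r2,#64 / tst ip,ip) and, for short inputs whose state has not yet been
converted to base 2^26, simply branches to the scalar routine. The C below is an
illustration only -- no such wrapper exists in the patch, and the glue-level
NEON-vs-scalar choice is made separately in poly1305-arm-glue.h -- with the
prototypes copied from that glue header and kernel headers (<linux/types.h>,
<linux/linkage.h>) assumed:

asmlinkage void poly1305_blocks_arm(void *ctx, const u8 *inp,
				    const size_t len, const u32 padbit);
asmlinkage void poly1305_blocks_neon(void *ctx, const u8 *inp,
				     const size_t len, const u32 padbit);

/* Hypothetical helper, for illustration; in reality this decision is
 * made inside poly1305_blocks_neon itself, which reads is_base2_26
 * straight out of the context. */
static void poly1305_blocks_dispatch(void *ctx, const u8 *inp, size_t len,
				     u32 padbit, bool is_base2_26)
{
	if (len < 64 && !is_base2_26) {
		poly1305_blocks_arm(ctx, inp, len, padbit);	/* scalar */
		return;
	}
	poly1305_blocks_neon(ctx, inp, len, padbit);		/* NEON */
}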
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Cc: Samuel Neves <sneves@dei.uc.pt>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Jean-Philippe Aumasson <jeanphilippe.aumasson@gmail.com>
Cc: Russell King <linux@armlinux.org.uk>
Cc: linux-arm-kernel at lists.infradead.org
---
lib/zinc/Makefile | 2 +
lib/zinc/poly1305/poly1305-arm-glue.h | 119 ++++++++++++++
...ly1305-arm-cryptogams.S => poly1305-arm.S} | 147 ++++++------------
...05-arm64-cryptogams.S => poly1305-arm64.S} | 103 ++++--------
lib/zinc/poly1305/poly1305.c | 2 +
5 files changed, 198 insertions(+), 175 deletions(-)
create mode 100644 lib/zinc/poly1305/poly1305-arm-glue.h
rename lib/zinc/poly1305/{poly1305-arm-cryptogams.S => poly1305-arm.S} (91%)
rename lib/zinc/poly1305/{poly1305-arm64-cryptogams.S => poly1305-arm64.S} (90%)
diff --git a/lib/zinc/Makefile b/lib/zinc/Makefile
index a8943d960b6a..c09fd3de60f9 100644
--- a/lib/zinc/Makefile
+++ b/lib/zinc/Makefile
@@ -12,4 +12,6 @@ obj-$(CONFIG_ZINC_CHACHA20) += zinc_chacha20.o
zinc_poly1305-y := poly1305/poly1305.o
zinc_poly1305-$(CONFIG_ZINC_ARCH_X86_64) += poly1305/poly1305-x86_64.o
+zinc_poly1305-$(CONFIG_ZINC_ARCH_ARM) += poly1305/poly1305-arm.o
+zinc_poly1305-$(CONFIG_ZINC_ARCH_ARM64) += poly1305/poly1305-arm64.o
obj-$(CONFIG_ZINC_POLY1305) += zinc_poly1305.o
diff --git a/lib/zinc/poly1305/poly1305-arm-glue.h b/lib/zinc/poly1305/poly1305-arm-glue.h
new file mode 100644
index 000000000000..ddeb58a2b547
--- /dev/null
+++ b/lib/zinc/poly1305/poly1305-arm-glue.h
@@ -0,0 +1,119 @@
+/* SPDX-License-Identifier: GPL-2.0 OR MIT */
+/*
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+
+asmlinkage void poly1305_init_arm(void *ctx, const u8 key[16]);
+asmlinkage void poly1305_blocks_arm(void *ctx, const u8 *inp, const size_t len,
+ const u32 padbit);
+asmlinkage void poly1305_emit_arm(void *ctx, u8 mac[16], const u32 nonce[4]);
+#if defined(CONFIG_KERNEL_MODE_NEON)
+asmlinkage void poly1305_blocks_neon(void *ctx, const u8 *inp, const size_t len,
+ const u32 padbit);
+asmlinkage void poly1305_emit_neon(void *ctx, u8 mac[16], const u32 nonce[4]);
+#endif
+
+static bool poly1305_use_neon __ro_after_init;
+
+static void __init poly1305_fpu_init(void)
+{
+#if defined(CONFIG_ARM64)
+ poly1305_use_neon = elf_hwcap & HWCAP_ASIMD;
+#elif defined(CONFIG_ARM)
+ poly1305_use_neon = elf_hwcap & HWCAP_NEON;
+#endif
+}
+
+#if defined(CONFIG_ARM64)
+struct poly1305_arch_internal {
+ union {
+ u32 h[5];
+ struct {
+ u64 h0, h1, h2;
+ };
+ };
+ u32 is_base2_26;
+ u64 r[2];
+};
+#elif defined(CONFIG_ARM)
+struct poly1305_arch_internal {
+ union {
+ u32 h[5];
+ struct {
+ u64 h0, h1;
+ u32 h2;
+ } __packed;
+ };
+ u32 r[4];
+ u32 is_base2_26;
+};
+#endif
+
+#if defined(CONFIG_KERNEL_MODE_NEON)
+static void convert_to_base2_64(void *ctx)
+{
+ struct poly1305_arch_internal *state = ctx;
+ u32 cy;
+
+ if (!state->is_base2_26)
+ return;
+
+ cy = state->h[0] >> 26; state->h[0] &= 0x3ffffff; state->h[1] += cy;
+ cy = state->h[1] >> 26; state->h[1] &= 0x3ffffff; state->h[2] += cy;
+ cy = state->h[2] >> 26; state->h[2] &= 0x3ffffff; state->h[3] += cy;
+ cy = state->h[3] >> 26; state->h[3] &= 0x3ffffff; state->h[4] += cy;
+ state->h0 = ((u64)state->h[2] << 52) | ((u64)state->h[1] << 26) | state->h[0];
+ state->h1 = ((u64)state->h[4] << 40) | ((u64)state->h[3] << 14) | (state->h[2] >> 12);
+ state->h2 = state->h[4] >> 24;
+#define ULT(a, b) ((a ^ ((a ^ b) | ((a - b) ^ b))) >> (sizeof(a) * 8 - 1))
+ cy = (state->h2 >> 2) + (state->h2 & ~3ULL);
+ state->h2 &= 3;
+ state->h0 += cy;
+ state->h1 += (cy = ULT(state->h0, cy));
+ state->h2 += ULT(state->h1, cy);
+#undef ULT
+ state->is_base2_26 = 0;
+}
+#endif
+
+static inline bool poly1305_init_arch(void *ctx,
+ const u8 key[POLY1305_KEY_SIZE])
+{
+ poly1305_init_arm(ctx, key);
+ return true;
+}
+
+static inline bool poly1305_blocks_arch(void *ctx, const u8 *inp,
+ const size_t len, const u32 padbit,
+ simd_context_t *simd_context)
+{
+#if defined(CONFIG_KERNEL_MODE_NEON)
+ if (poly1305_use_neon && simd_use(simd_context)) {
+ poly1305_blocks_neon(ctx, inp, len, padbit);
+ return true;
+ }
+ convert_to_base2_64(ctx);
+#endif
+
+ poly1305_blocks_arm(ctx, inp, len, padbit);
+ return true;
+}
+
+static inline bool poly1305_emit_arch(void *ctx, u8 mac[POLY1305_MAC_SIZE],
+ const u32 nonce[4],
+ simd_context_t *simd_context)
+{
+#if defined(CONFIG_KERNEL_MODE_NEON)
+ if (poly1305_use_neon && simd_use(simd_context)) {
+ poly1305_emit_neon(ctx, mac, nonce);
+ return true;
+ }
+ convert_to_base2_64(ctx);
+#endif
+
+ poly1305_emit_arm(ctx, mac, nonce);
+ return true;
+}
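
A note on convert_to_base2_64() above: the NEON code keeps the Poly1305 accumulator
as five 26-bit limbs, and ULT() is a branch-free unsigned "less than", used so the
carries can be folded back into 64-bit words in constant time. For reference, the
forward direction of that mapping -- splitting a base 2^64 value
h0 + 2^64*h1 + 2^128*h2 into 26-bit limbs, the exact inverse of the shifts used in
convert_to_base2_64() -- looks like this in C (a sketch using the kernel's u32/u64
types, not code from the patch):

static void poly1305_to_base2_26(u32 l[5], u64 h0, u64 h1, u64 h2)
{
	l[0] = h0 & 0x3ffffff;				/* bits   0..25  */
	l[1] = (h0 >> 26) & 0x3ffffff;			/* bits  26..51  */
	l[2] = ((h0 >> 52) | (h1 << 12)) & 0x3ffffff;	/* bits  52..77  */
	l[3] = (h1 >> 14) & 0x3ffffff;			/* bits  78..103 */
	l[4] = (u32)(h1 >> 40) | ((u32)h2 << 24);	/* bits 104..129 */
}
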
diff --git a/lib/zinc/poly1305/poly1305-arm-cryptogams.S b/lib/zinc/poly1305/poly1305-arm.S
similarity index 91%
rename from lib/zinc/poly1305/poly1305-arm-cryptogams.S
rename to lib/zinc/poly1305/poly1305-arm.S
index 884b465030e4..4a0e9d451119 100644
--- a/lib/zinc/poly1305/poly1305-arm-cryptogams.S
+++ b/lib/zinc/poly1305/poly1305-arm.S
@@ -1,9 +1,12 @@
/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
/*
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
* Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
+ *
+ * This is based in part on Andy Polyakov's implementation from CRYPTOGAMS.
*/
-#include "arm_arch.h"
+#include <linux/linkage.h>
.text
#if defined(__thumb2__)
@@ -13,13 +16,8 @@
.code 32
#endif
-.globl poly1305_emit
-.globl poly1305_blocks
-.globl poly1305_init
-.type poly1305_init,%function
.align 5
-poly1305_init:
-.Lpoly1305_init:
+ENTRY(poly1305_init_arm)
stmdb sp!,{r4-r11}
eor r3,r3,r3
@@ -38,10 +36,6 @@ poly1305_init:
moveq r0,#0
beq .Lno_key
-#if __ARM_MAX_ARCH__>=7
- adr r11,.Lpoly1305_init
- ldr r12,.LOPENSSL_armcap
-#endif
ldrb r4,[r1,#0]
mov r10,#0x0fffffff
ldrb r5,[r1,#1]
@@ -56,12 +50,6 @@ poly1305_init:
ldrb r7,[r1,#6]
and r4,r4,r10
-#if __ARM_MAX_ARCH__>=7
- ldr r12,[r11,r12] @ OPENSSL_armcap_P
-# ifdef __APPLE__
- ldr r12,[r12]
-# endif
-#endif
ldrb r8,[r1,#7]
orr r5,r5,r6,lsl#8
ldrb r6,[r1,#8]
@@ -71,35 +59,6 @@ poly1305_init:
ldrb r8,[r1,#10]
and r5,r5,r3
-#if __ARM_MAX_ARCH__>=7
- tst r12,#ARMV7_NEON @ check for NEON
-# ifdef __APPLE__
- adr r9,poly1305_blocks_neon
- adr r11,poly1305_blocks
-# ifdef __thumb2__
- it ne
-# endif
- movne r11,r9
- adr r12,poly1305_emit
- adr r10,poly1305_emit_neon
-# ifdef __thumb2__
- it ne
-# endif
- movne r12,r10
-# else
-# ifdef __thumb2__
- itete eq
-# endif
- addeq r12,r11,#(poly1305_emit-.Lpoly1305_init)
- addne r12,r11,#(poly1305_emit_neon-.Lpoly1305_init)
- addeq r11,r11,#(poly1305_blocks-.Lpoly1305_init)
- addne r11,r11,#(poly1305_blocks_neon-.Lpoly1305_init)
-# endif
-# ifdef __thumb2__
- orr r12,r12,#1 @ thumb-ify address
- orr r11,r11,#1
-# endif
-#endif
ldrb r9,[r1,#11]
orr r6,r6,r7,lsl#8
ldrb r7,[r1,#12]
@@ -118,26 +77,20 @@ poly1305_init:
str r6,[r0,#8]
and r7,r7,r3
str r7,[r0,#12]
-#if __ARM_MAX_ARCH__>=7
- stmia r2,{r11,r12} @ fill functions table
- mov r0,#1
-#else
- mov r0,#0
-#endif
.Lno_key:
ldmia sp!,{r4-r11}
-#if __ARM_ARCH__>=5
+#if __LINUX_ARM_ARCH__ >= 5
bx lr @ bx lr
#else
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-)
#endif
-.size poly1305_init,.-poly1305_init
-.type poly1305_blocks,%function
+ENDPROC(poly1305_init_arm)
+
.align 5
-poly1305_blocks:
-.Lpoly1305_blocks:
+ENTRY(poly1305_blocks_arm)
+.Lpoly1305_blocks_arm:
stmdb sp!,{r3-r11,lr}
ands r2,r2,#-16
@@ -158,11 +111,11 @@ poly1305_blocks:
b .Loop
.Loop:
-#if __ARM_ARCH__<7
+#if __LINUX_ARM_ARCH__ < 7
ldrb r0,[lr],#16 @ load input
-# ifdef __thumb2__
+#ifdef __thumb2__
it hi
-# endif
+#endif
addhi r8,r8,#1 @ 1<<128
ldrb r1,[lr,#-15]
ldrb r2,[lr,#-14]
@@ -201,19 +154,19 @@ poly1305_blocks:
orr r3,r2,r3,lsl#24
#else
ldr r0,[lr],#16 @ load input
-# ifdef __thumb2__
+#ifdef __thumb2__
it hi
-# endif
+#endif
addhi r8,r8,#1 @ padbit
ldr r1,[lr,#-12]
ldr r2,[lr,#-8]
ldr r3,[lr,#-4]
-# ifdef __ARMEB__
+#ifdef __ARMEB__
rev r0,r0
rev r1,r1
rev r2,r2
rev r3,r3
-# endif
+#endif
adds r4,r4,r0 @ accumulate input
str lr,[sp,#8] @ offload input pointer
adcs r5,r5,r1
@@ -283,7 +236,7 @@ poly1305_blocks:
stmia r0,{r4-r8} @ store the result
.Lno_data:
-#if __ARM_ARCH__>=5
+#if __LINUX_ARM_ARCH__ >= 5
ldmia sp!,{r3-r11,pc}
#else
ldmia sp!,{r3-r11,lr}
@@ -291,13 +244,12 @@ poly1305_blocks:
moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-)
#endif
-.size poly1305_blocks,.-poly1305_blocks
-.type poly1305_emit,%function
+ENDPROC(poly1305_blocks_arm)
+
.align 5
-poly1305_emit:
+ENTRY(poly1305_emit_arm)
stmdb sp!,{r4-r11}
.Lpoly1305_emit_enter:
-
ldmia r0,{r3-r7}
adds r8,r3,#5 @ compare to modulus
adcs r9,r4,#0
@@ -332,13 +284,13 @@ poly1305_emit:
adcs r5,r5,r10
adc r6,r6,r11
-#if __ARM_ARCH__>=7
-# ifdef __ARMEB__
+#if __LINUX_ARM_ARCH__ >= 7
+#ifdef __ARMEB__
rev r3,r3
rev r4,r4
rev r5,r5
rev r6,r6
-# endif
+#endif
str r3,[r1,#0]
str r4,[r1,#4]
str r5,[r1,#8]
@@ -377,20 +329,22 @@ poly1305_emit:
strb r6,[r1,#15]
#endif
ldmia sp!,{r4-r11}
-#if __ARM_ARCH__>=5
+#if __LINUX_ARM_ARCH__ >= 5
bx lr @ bx lr
#else
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-)
#endif
-.size poly1305_emit,.-poly1305_emit
-#if __ARM_MAX_ARCH__>=7
+ENDPROC(poly1305_emit_arm)
+
+
+#ifdef CONFIG_KERNEL_MODE_NEON
.fpu neon
-.type poly1305_init_neon,%function
.align 5
-poly1305_init_neon:
+ENTRY(poly1305_init_neon)
+.Lpoly1305_init_neon:
ldr r4,[r0,#20] @ load key base 2^32
ldr r5,[r0,#24]
ldr r6,[r0,#28]
@@ -600,11 +554,10 @@ poly1305_init_neon:
vst1.32 {d8[1]},[r7]
bx lr @ bx lr
-.size poly1305_init_neon,.-poly1305_init_neon
+ENDPROC(poly1305_init_neon)
-.type poly1305_blocks_neon,%function
.align 5
-poly1305_blocks_neon:
+ENTRY(poly1305_blocks_neon)
ldr ip,[r0,#36] @ is_base2_26
ands r2,r2,#-16
beq .Lno_data_neon
@@ -612,7 +565,7 @@ poly1305_blocks_neon:
cmp r2,#64
bhs .Lenter_neon
tst ip,ip @ is_base2_26?
- beq .Lpoly1305_blocks
+ beq .Lpoly1305_blocks_arm
.Lenter_neon:
stmdb sp!,{r4-r7}
@@ -622,7 +575,7 @@ poly1305_blocks_neon:
bne .Lbase2_26_neon
stmdb sp!,{r1-r3,lr}
- bl poly1305_init_neon
+ bl .Lpoly1305_init_neon
ldr r4,[r0,#0] @ load hash value base 2^32
ldr r5,[r0,#4]
@@ -686,12 +639,12 @@ poly1305_blocks_neon:
sub r2,r2,#16
add r4,r1,#32
-# ifdef __ARMEB__
+#ifdef __ARMEB__
vrev32.8 q10,q10
vrev32.8 q13,q13
vrev32.8 q11,q11
vrev32.8 q12,q12
-# endif
+#endif
vsri.u32 d28,d26,#8 @ base 2^32 -> base 2^26
vshl.u32 d26,d26,#18
@@ -735,12 +688,12 @@ poly1305_blocks_neon:
addhi r7,r0,#(48+1*9*4)
addhi r6,r0,#(48+3*9*4)
-# ifdef __ARMEB__
+#ifdef __ARMEB__
vrev32.8 q10,q10
vrev32.8 q13,q13
vrev32.8 q11,q11
vrev32.8 q12,q12
-# endif
+#endif
vsri.u32 q14,q13,#8 @ base 2^32 -> base 2^26
vshl.u32 q13,q13,#18
@@ -866,12 +819,12 @@ poly1305_blocks_neon:
vld4.32 {d20,d22,d24,d26},[r1] @ inp[0:1]
add r1,r1,#64
-# ifdef __ARMEB__
+#ifdef __ARMEB__
vrev32.8 q10,q10
vrev32.8 q11,q11
vrev32.8 q12,q12
vrev32.8 q13,q13
-# endif
+#endif
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ lazy reduction interleaved with base 2^32 -> base 2^26 of
@@ -1086,11 +1039,10 @@ poly1305_blocks_neon:
ldmia sp!,{r4-r7}
.Lno_data_neon:
bx lr @ bx lr
-.size poly1305_blocks_neon,.-poly1305_blocks_neon
+ENDPROC(poly1305_blocks_neon)
-.type poly1305_emit_neon,%function
.align 5
-poly1305_emit_neon:
+ENTRY(poly1305_emit_neon)
ldr ip,[r0,#36] @ is_base2_26
stmdb sp!,{r4-r11}
@@ -1144,12 +1096,12 @@ poly1305_emit_neon:
adcs r5,r5,r10
adc r6,r6,r11
-# ifdef __ARMEB__
+#ifdef __ARMEB__
rev r3,r3
rev r4,r4
rev r5,r5
rev r6,r6
-# endif
+#endif
str r3,[r1,#0] @ store the result
str r4,[r1,#4]
str r5,[r1,#8]
@@ -1157,16 +1109,9 @@ poly1305_emit_neon:
ldmia sp!,{r4-r11}
bx lr @ bx lr
-.size poly1305_emit_neon,.-poly1305_emit_neon
+ENDPROC(poly1305_emit_neon)
.align 5
.Lzeros:
.long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-.LOPENSSL_armcap:
-.word OPENSSL_armcap_P-.Lpoly1305_init
-#endif
-.asciz "Poly1305 for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>"
-.align 2
-#if __ARM_MAX_ARCH__>=7
-.comm OPENSSL_armcap_P,4,4
#endif
diff --git a/lib/zinc/poly1305/poly1305-arm64-cryptogams.S b/lib/zinc/poly1305/poly1305-arm64.S
similarity index 90%
rename from lib/zinc/poly1305/poly1305-arm64-cryptogams.S
rename to lib/zinc/poly1305/poly1305-arm64.S
index 0ecb50a83ec0..84a654479cac 100644
--- a/lib/zinc/poly1305/poly1305-arm64-cryptogams.S
+++ b/lib/zinc/poly1305/poly1305-arm64.S
@@ -1,21 +1,16 @@
/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
/*
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
* Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
+ *
+ * This is based in part on Andy Polyakov's implementation from CRYPTOGAMS.
*/
-#include "arm_arch.h"
-
+#include <linux/linkage.h>
.text
-// forward "declarations" are required for Apple
-
-.globl poly1305_blocks
-.globl poly1305_emit
-
-.globl poly1305_init
-.type poly1305_init,%function
.align 5
-poly1305_init:
+ENTRY(poly1305_init_arm)
cmp x1,xzr
stp xzr,xzr,[x0] // zero hash value
stp xzr,xzr,[x0,#16] // [along with is_base2_26]
@@ -23,17 +18,9 @@ poly1305_init:
csel x0,xzr,x0,eq
b.eq .Lno_key
-#ifdef __ILP32__
- ldrsw x11,.LOPENSSL_armcap_P
-#else
- ldr x11,.LOPENSSL_armcap_P
-#endif
- adr x10,.LOPENSSL_armcap_P
-
ldp x7,x8,[x1] // load key
mov x9,#0xfffffffc0fffffff
movk x9,#0x0fff,lsl#48
- ldr w17,[x10,x11]
#ifdef __ARMEB__
rev x7,x7 // flip bytes
rev x8,x8
@@ -43,30 +30,12 @@ poly1305_init:
and x8,x8,x9 // &=0ffffffc0ffffffc
stp x7,x8,[x0,#32] // save key value
- tst w17,#ARMV7_NEON
-
- adr x12,poly1305_blocks
- adr x7,poly1305_blocks_neon
- adr x13,poly1305_emit
- adr x8,poly1305_emit_neon
-
- csel x12,x12,x7,eq
- csel x13,x13,x8,eq
-
-#ifdef __ILP32__
- stp w12,w13,[x2]
-#else
- stp x12,x13,[x2]
-#endif
-
- mov x0,#1
.Lno_key:
ret
-.size poly1305_init,.-poly1305_init
+ENDPROC(poly1305_init_arm)
-.type poly1305_blocks,%function
.align 5
-poly1305_blocks:
+ENTRY(poly1305_blocks_arm)
ands x2,x2,#-16
b.eq .Lno_data
@@ -126,11 +95,10 @@ poly1305_blocks:
.Lno_data:
ret
-.size poly1305_blocks,.-poly1305_blocks
+ENDPROC(poly1305_blocks_arm)
-.type poly1305_emit,%function
.align 5
-poly1305_emit:
+ENTRY(poly1305_emit_arm)
ldp x4,x5,[x0] // load hash base 2^64
ldr x6,[x0,#16]
ldp x10,x11,[x2] // load nonce
@@ -157,10 +125,10 @@ poly1305_emit:
stp x4,x5,[x1] // write result
ret
-.size poly1305_emit,.-poly1305_emit
-.type poly1305_mult,%function
+ENDPROC(poly1305_emit_arm)
+
.align 5
-poly1305_mult:
+__poly1305_mult:
mul x12,x4,x7 // h0*r0
umulh x13,x4,x7
@@ -193,11 +161,8 @@ poly1305_mult:
adc x6,x6,xzr
ret
-.size poly1305_mult,.-poly1305_mult
-.type poly1305_splat,%function
-.align 5
-poly1305_splat:
+__poly1305_splat:
and x12,x4,#0x03ffffff // base 2^64 -> base 2^26
ubfx x13,x4,#26,#26
extr x14,x5,x4,#52
@@ -220,15 +185,14 @@ poly1305_splat:
str w15,[x0,#16*8] // s4
ret
-.size poly1305_splat,.-poly1305_splat
-.type poly1305_blocks_neon,%function
+#ifdef CONFIG_KERNEL_MODE_NEON
.align 5
-poly1305_blocks_neon:
+ENTRY(poly1305_blocks_neon)
ldr x17,[x0,#24]
cmp x2,#128
b.hs .Lblocks_neon
- cbz x17,poly1305_blocks
+ cbz x17,poly1305_blocks_arm
.Lblocks_neon:
stp x29,x30,[sp,#-80]!
@@ -276,7 +240,7 @@ poly1305_blocks_neon:
adcs x5,x5,x13
adc x6,x6,x3
- bl poly1305_mult
+ bl __poly1305_mult
ldr x30,[sp,#8]
cbz x3,.Lstore_base2_64_neon
@@ -322,7 +286,7 @@ poly1305_blocks_neon:
adcs x5,x5,x13
adc x6,x6,x3
- bl poly1305_mult
+ bl __poly1305_mult
.Linit_neon:
and x10,x4,#0x03ffffff // base 2^64 -> base 2^26
@@ -349,19 +313,19 @@ poly1305_blocks_neon:
mov x5,x8
mov x6,xzr
add x0,x0,#48+12
- bl poly1305_splat
+ bl __poly1305_splat
- bl poly1305_mult // r^2
+ bl __poly1305_mult // r^2
sub x0,x0,#4
- bl poly1305_splat
+ bl __poly1305_splat
- bl poly1305_mult // r^3
+ bl __poly1305_mult // r^3
sub x0,x0,#4
- bl poly1305_splat
+ bl __poly1305_splat
- bl poly1305_mult // r^4
+ bl __poly1305_mult // r^4
sub x0,x0,#4
- bl poly1305_splat
+ bl __poly1305_splat
ldr x30,[sp,#8]
add x16,x1,#32
@@ -801,13 +765,12 @@ poly1305_blocks_neon:
.Lno_data_neon:
ldr x29,[sp],#80
ret
-.size poly1305_blocks_neon,.-poly1305_blocks_neon
+ENDPROC(poly1305_blocks_neon)
-.type poly1305_emit_neon,%function
.align 5
-poly1305_emit_neon:
+ENTRY(poly1305_emit_neon)
ldr x17,[x0,#24]
- cbz x17,poly1305_emit
+ cbz x17,poly1305_emit_arm
ldp w10,w11,[x0] // load hash value base 2^26
ldp w12,w13,[x0,#8]
@@ -853,17 +816,9 @@ poly1305_emit_neon:
stp x4,x5,[x1] // write result
ret
-.size poly1305_emit_neon,.-poly1305_emit_neon
+ENDPROC(poly1305_emit_neon)
.align 5
.Lzeros:
.long 0,0,0,0,0,0,0,0
-.LOPENSSL_armcap_P:
-#ifdef __ILP32__
-.long OPENSSL_armcap_P-.
-#else
-.quad OPENSSL_armcap_P-.
#endif
-.byte 80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.align 2
-.align 2
diff --git a/lib/zinc/poly1305/poly1305.c b/lib/zinc/poly1305/poly1305.c
index 2ae1b3cb66cd..647aa3354d38 100644
--- a/lib/zinc/poly1305/poly1305.c
+++ b/lib/zinc/poly1305/poly1305.c
@@ -17,6 +17,8 @@
#if defined(CONFIG_ZINC_ARCH_X86_64)
#include "poly1305-x86_64-glue.h"
+#elif defined(CONFIG_ZINC_ARCH_ARM) || defined(CONFIG_ZINC_ARCH_ARM64)
+#include "poly1305-arm-glue.h"
#else
static inline bool poly1305_init_arch(void *ctx,
const u8 key[POLY1305_KEY_SIZE])
--
2.19.0
* [PATCH net-next v6 19/23] zinc: Curve25519 ARM implementation
[not found] <20180925145622.29959-1-Jason@zx2c4.com>
` (4 preceding siblings ...)
2018-09-25 14:56 ` [PATCH net-next v6 12/23] zinc: " Jason A. Donenfeld
@ 2018-09-25 14:56 ` Jason A. Donenfeld
2018-10-02 16:59 ` Ard Biesheuvel
5 siblings, 1 reply; 47+ messages in thread
From: Jason A. Donenfeld @ 2018-09-25 14:56 UTC (permalink / raw)
To: linux-arm-kernel
This comes from Dan Bernstein and Peter Schwabe's public domain NEON
code, and has been modified to be friendly for kernel space and to
remove some qhasm strangeness so that it is more idiomatic.
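For context, the ARM glue added below only calls into the NEON routine
when the CPU advertises NEON and SIMD registers may be used in the
current context; otherwise it reports failure so that the portable C
code runs instead. A minimal sketch of that call path follows, where
curve25519_generic is only a stand-in name for the portable fallback
(the actual wiring lives in curve25519.c and is not reproduced here):

    /* Illustrative sketch only, not part of the patch itself. */
    void curve25519(u8 mypublic[CURVE25519_KEY_SIZE],
                    const u8 secret[CURVE25519_KEY_SIZE],
                    const u8 basepoint[CURVE25519_KEY_SIZE])
    {
            /* curve25519_arch() brackets the NEON routine with
             * kernel_neon_begin()/kernel_neon_end() and returns true
             * only when the accelerated path actually ran. */
            if (!curve25519_arch(mypublic, secret, basepoint))
                    curve25519_generic(mypublic, secret, basepoint);
    }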
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Cc: Samuel Neves <sneves@dei.uc.pt>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Jean-Philippe Aumasson <jeanphilippe.aumasson@gmail.com>
Cc: Russell King <linux@armlinux.org.uk>
Cc: linux-arm-kernel@lists.infradead.org
---
lib/zinc/Makefile | 1 +
lib/zinc/curve25519/curve25519-arm-glue.h | 42 +
lib/zinc/curve25519/curve25519-arm.S | 2095 +++++++++++++++++++++
lib/zinc/curve25519/curve25519.c | 2 +
4 files changed, 2140 insertions(+)
create mode 100644 lib/zinc/curve25519/curve25519-arm-glue.h
create mode 100644 lib/zinc/curve25519/curve25519-arm.S
diff --git a/lib/zinc/Makefile b/lib/zinc/Makefile
index 65440438c6e5..be73c342f9ba 100644
--- a/lib/zinc/Makefile
+++ b/lib/zinc/Makefile
@@ -27,4 +27,5 @@ zinc_blake2s-$(CONFIG_ZINC_ARCH_X86_64) += blake2s/blake2s-x86_64.o
obj-$(CONFIG_ZINC_BLAKE2S) += zinc_blake2s.o
zinc_curve25519-y := curve25519/curve25519.o
+zinc_curve25519-$(CONFIG_ZINC_ARCH_ARM) += curve25519/curve25519-arm.o
obj-$(CONFIG_ZINC_CURVE25519) += zinc_curve25519.o
diff --git a/lib/zinc/curve25519/curve25519-arm-glue.h b/lib/zinc/curve25519/curve25519-arm-glue.h
new file mode 100644
index 000000000000..9211bcab5615
--- /dev/null
+++ b/lib/zinc/curve25519/curve25519-arm-glue.h
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: GPL-2.0 OR MIT */
+/*
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+#include <asm/simd.h>
+
+#if defined(CONFIG_KERNEL_MODE_NEON)
+asmlinkage void curve25519_neon(u8 mypublic[CURVE25519_KEY_SIZE],
+ const u8 secret[CURVE25519_KEY_SIZE],
+ const u8 basepoint[CURVE25519_KEY_SIZE]);
+#endif
+
+static bool curve25519_use_neon __ro_after_init;
+
+static void __init curve25519_fpu_init(void)
+{
+ curve25519_use_neon = elf_hwcap & HWCAP_NEON;
+}
+
+static inline bool curve25519_arch(u8 mypublic[CURVE25519_KEY_SIZE],
+ const u8 secret[CURVE25519_KEY_SIZE],
+ const u8 basepoint[CURVE25519_KEY_SIZE])
+{
+#if defined(CONFIG_KERNEL_MODE_NEON)
+ if (curve25519_use_neon && may_use_simd()) {
+ kernel_neon_begin();
+ curve25519_neon(mypublic, secret, basepoint);
+ kernel_neon_end();
+ return true;
+ }
+#endif
+ return false;
+}
+
+static inline bool curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE],
+ const u8 secret[CURVE25519_KEY_SIZE])
+{
+ return false;
+}
diff --git a/lib/zinc/curve25519/curve25519-arm.S b/lib/zinc/curve25519/curve25519-arm.S
new file mode 100644
index 000000000000..db6570c20fd1
--- /dev/null
+++ b/lib/zinc/curve25519/curve25519-arm.S
@@ -0,0 +1,2095 @@
+/* SPDX-License-Identifier: GPL-2.0 OR MIT */
+/*
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ *
+ * Based on public domain code from Daniel J. Bernstein and Peter Schwabe. This
+ * has been built from SUPERCOP's curve25519/neon2/scalarmult.pq using qhasm,
+ * but has subsequently been manually reworked for use in kernel space.
+ */
+
+#ifdef CONFIG_KERNEL_MODE_NEON
+#include <linux/linkage.h>
+
+.text
+.fpu neon
+.arch armv7-a
+.align 4
+
+ENTRY(curve25519_neon)
+ push {r4-r11, lr}
+ mov ip, sp
+ sub r3, sp, #704
+ and r3, r3, #0xfffffff0
+ mov sp, r3
+ movw r4, #0
+ movw r5, #254
+ vmov.i32 q0, #1
+ vshr.u64 q1, q0, #7
+ vshr.u64 q0, q0, #8
+ vmov.i32 d4, #19
+ vmov.i32 d5, #38
+ add r6, sp, #480
+ vst1.8 {d2-d3}, [r6, : 128]
+ add r6, sp, #496
+ vst1.8 {d0-d1}, [r6, : 128]
+ add r6, sp, #512
+ vst1.8 {d4-d5}, [r6, : 128]
+ add r6, r3, #0
+ vmov.i32 q2, #0
+ vst1.8 {d4-d5}, [r6, : 128]!
+ vst1.8 {d4-d5}, [r6, : 128]!
+ vst1.8 d4, [r6, : 64]
+ add r6, r3, #0
+ movw r7, #960
+ sub r7, r7, #2
+ neg r7, r7
+ sub r7, r7, r7, LSL #7
+ str r7, [r6]
+ add r6, sp, #672
+ vld1.8 {d4-d5}, [r1]!
+ vld1.8 {d6-d7}, [r1]
+ vst1.8 {d4-d5}, [r6, : 128]!
+ vst1.8 {d6-d7}, [r6, : 128]
+ sub r1, r6, #16
+ ldrb r6, [r1]
+ and r6, r6, #248
+ strb r6, [r1]
+ ldrb r6, [r1, #31]
+ and r6, r6, #127
+ orr r6, r6, #64
+ strb r6, [r1, #31]
+ vmov.i64 q2, #0xffffffff
+ vshr.u64 q3, q2, #7
+ vshr.u64 q2, q2, #6
+ vld1.8 {d8}, [r2]
+ vld1.8 {d10}, [r2]
+ add r2, r2, #6
+ vld1.8 {d12}, [r2]
+ vld1.8 {d14}, [r2]
+ add r2, r2, #6
+ vld1.8 {d16}, [r2]
+ add r2, r2, #4
+ vld1.8 {d18}, [r2]
+ vld1.8 {d20}, [r2]
+ add r2, r2, #6
+ vld1.8 {d22}, [r2]
+ add r2, r2, #2
+ vld1.8 {d24}, [r2]
+ vld1.8 {d26}, [r2]
+ vshr.u64 q5, q5, #26
+ vshr.u64 q6, q6, #3
+ vshr.u64 q7, q7, #29
+ vshr.u64 q8, q8, #6
+ vshr.u64 q10, q10, #25
+ vshr.u64 q11, q11, #3
+ vshr.u64 q12, q12, #12
+ vshr.u64 q13, q13, #38
+ vand q4, q4, q2
+ vand q6, q6, q2
+ vand q8, q8, q2
+ vand q10, q10, q2
+ vand q2, q12, q2
+ vand q5, q5, q3
+ vand q7, q7, q3
+ vand q9, q9, q3
+ vand q11, q11, q3
+ vand q3, q13, q3
+ add r2, r3, #48
+ vadd.i64 q12, q4, q1
+ vadd.i64 q13, q10, q1
+ vshr.s64 q12, q12, #26
+ vshr.s64 q13, q13, #26
+ vadd.i64 q5, q5, q12
+ vshl.i64 q12, q12, #26
+ vadd.i64 q14, q5, q0
+ vadd.i64 q11, q11, q13
+ vshl.i64 q13, q13, #26
+ vadd.i64 q15, q11, q0
+ vsub.i64 q4, q4, q12
+ vshr.s64 q12, q14, #25
+ vsub.i64 q10, q10, q13
+ vshr.s64 q13, q15, #25
+ vadd.i64 q6, q6, q12
+ vshl.i64 q12, q12, #25
+ vadd.i64 q14, q6, q1
+ vadd.i64 q2, q2, q13
+ vsub.i64 q5, q5, q12
+ vshr.s64 q12, q14, #26
+ vshl.i64 q13, q13, #25
+ vadd.i64 q14, q2, q1
+ vadd.i64 q7, q7, q12
+ vshl.i64 q12, q12, #26
+ vadd.i64 q15, q7, q0
+ vsub.i64 q11, q11, q13
+ vshr.s64 q13, q14, #26
+ vsub.i64 q6, q6, q12
+ vshr.s64 q12, q15, #25
+ vadd.i64 q3, q3, q13
+ vshl.i64 q13, q13, #26
+ vadd.i64 q14, q3, q0
+ vadd.i64 q8, q8, q12
+ vshl.i64 q12, q12, #25
+ vadd.i64 q15, q8, q1
+ add r2, r2, #8
+ vsub.i64 q2, q2, q13
+ vshr.s64 q13, q14, #25
+ vsub.i64 q7, q7, q12
+ vshr.s64 q12, q15, #26
+ vadd.i64 q14, q13, q13
+ vadd.i64 q9, q9, q12
+ vtrn.32 d12, d14
+ vshl.i64 q12, q12, #26
+ vtrn.32 d13, d15
+ vadd.i64 q0, q9, q0
+ vadd.i64 q4, q4, q14
+ vst1.8 d12, [r2, : 64]!
+ vshl.i64 q6, q13, #4
+ vsub.i64 q7, q8, q12
+ vshr.s64 q0, q0, #25
+ vadd.i64 q4, q4, q6
+ vadd.i64 q6, q10, q0
+ vshl.i64 q0, q0, #25
+ vadd.i64 q8, q6, q1
+ vadd.i64 q4, q4, q13
+ vshl.i64 q10, q13, #25
+ vadd.i64 q1, q4, q1
+ vsub.i64 q0, q9, q0
+ vshr.s64 q8, q8, #26
+ vsub.i64 q3, q3, q10
+ vtrn.32 d14, d0
+ vshr.s64 q1, q1, #26
+ vtrn.32 d15, d1
+ vadd.i64 q0, q11, q8
+ vst1.8 d14, [r2, : 64]
+ vshl.i64 q7, q8, #26
+ vadd.i64 q5, q5, q1
+ vtrn.32 d4, d6
+ vshl.i64 q1, q1, #26
+ vtrn.32 d5, d7
+ vsub.i64 q3, q6, q7
+ add r2, r2, #16
+ vsub.i64 q1, q4, q1
+ vst1.8 d4, [r2, : 64]
+ vtrn.32 d6, d0
+ vtrn.32 d7, d1
+ sub r2, r2, #8
+ vtrn.32 d2, d10
+ vtrn.32 d3, d11
+ vst1.8 d6, [r2, : 64]
+ sub r2, r2, #24
+ vst1.8 d2, [r2, : 64]
+ add r2, r3, #96
+ vmov.i32 q0, #0
+ vmov.i64 d2, #0xff
+ vmov.i64 d3, #0
+ vshr.u32 q1, q1, #7
+ vst1.8 {d2-d3}, [r2, : 128]!
+ vst1.8 {d0-d1}, [r2, : 128]!
+ vst1.8 d0, [r2, : 64]
+ add r2, r3, #144
+ vmov.i32 q0, #0
+ vst1.8 {d0-d1}, [r2, : 128]!
+ vst1.8 {d0-d1}, [r2, : 128]!
+ vst1.8 d0, [r2, : 64]
+ add r2, r3, #240
+ vmov.i32 q0, #0
+ vmov.i64 d2, #0xff
+ vmov.i64 d3, #0
+ vshr.u32 q1, q1, #7
+ vst1.8 {d2-d3}, [r2, : 128]!
+ vst1.8 {d0-d1}, [r2, : 128]!
+ vst1.8 d0, [r2, : 64]
+ add r2, r3, #48
+ add r6, r3, #192
+ vld1.8 {d0-d1}, [r2, : 128]!
+ vld1.8 {d2-d3}, [r2, : 128]!
+ vld1.8 {d4}, [r2, : 64]
+ vst1.8 {d0-d1}, [r6, : 128]!
+ vst1.8 {d2-d3}, [r6, : 128]!
+ vst1.8 d4, [r6, : 64]
+.Lmainloop:
+ mov r2, r5, LSR #3
+ and r6, r5, #7
+ ldrb r2, [r1, r2]
+ mov r2, r2, LSR r6
+ and r2, r2, #1
+ str r5, [sp, #456]
+ eor r4, r4, r2
+ str r2, [sp, #460]
+ neg r2, r4
+ add r4, r3, #96
+ add r5, r3, #192
+ add r6, r3, #144
+ vld1.8 {d8-d9}, [r4, : 128]!
+ add r7, r3, #240
+ vld1.8 {d10-d11}, [r5, : 128]!
+ veor q6, q4, q5
+ vld1.8 {d14-d15}, [r6, : 128]!
+ vdup.i32 q8, r2
+ vld1.8 {d18-d19}, [r7, : 128]!
+ veor q10, q7, q9
+ vld1.8 {d22-d23}, [r4, : 128]!
+ vand q6, q6, q8
+ vld1.8 {d24-d25}, [r5, : 128]!
+ vand q10, q10, q8
+ vld1.8 {d26-d27}, [r6, : 128]!
+ veor q4, q4, q6
+ vld1.8 {d28-d29}, [r7, : 128]!
+ veor q5, q5, q6
+ vld1.8 {d0}, [r4, : 64]
+ veor q6, q7, q10
+ vld1.8 {d2}, [r5, : 64]
+ veor q7, q9, q10
+ vld1.8 {d4}, [r6, : 64]
+ veor q9, q11, q12
+ vld1.8 {d6}, [r7, : 64]
+ veor q10, q0, q1
+ sub r2, r4, #32
+ vand q9, q9, q8
+ sub r4, r5, #32
+ vand q10, q10, q8
+ sub r5, r6, #32
+ veor q11, q11, q9
+ sub r6, r7, #32
+ veor q0, q0, q10
+ veor q9, q12, q9
+ veor q1, q1, q10
+ veor q10, q13, q14
+ veor q12, q2, q3
+ vand q10, q10, q8
+ vand q8, q12, q8
+ veor q12, q13, q10
+ veor q2, q2, q8
+ veor q10, q14, q10
+ veor q3, q3, q8
+ vadd.i32 q8, q4, q6
+ vsub.i32 q4, q4, q6
+ vst1.8 {d16-d17}, [r2, : 128]!
+ vadd.i32 q6, q11, q12
+ vst1.8 {d8-d9}, [r5, : 128]!
+ vsub.i32 q4, q11, q12
+ vst1.8 {d12-d13}, [r2, : 128]!
+ vadd.i32 q6, q0, q2
+ vst1.8 {d8-d9}, [r5, : 128]!
+ vsub.i32 q0, q0, q2
+ vst1.8 d12, [r2, : 64]
+ vadd.i32 q2, q5, q7
+ vst1.8 d0, [r5, : 64]
+ vsub.i32 q0, q5, q7
+ vst1.8 {d4-d5}, [r4, : 128]!
+ vadd.i32 q2, q9, q10
+ vst1.8 {d0-d1}, [r6, : 128]!
+ vsub.i32 q0, q9, q10
+ vst1.8 {d4-d5}, [r4, : 128]!
+ vadd.i32 q2, q1, q3
+ vst1.8 {d0-d1}, [r6, : 128]!
+ vsub.i32 q0, q1, q3
+ vst1.8 d4, [r4, : 64]
+ vst1.8 d0, [r6, : 64]
+ add r2, sp, #512
+ add r4, r3, #96
+ add r5, r3, #144
+ vld1.8 {d0-d1}, [r2, : 128]
+ vld1.8 {d2-d3}, [r4, : 128]!
+ vld1.8 {d4-d5}, [r5, : 128]!
+ vzip.i32 q1, q2
+ vld1.8 {d6-d7}, [r4, : 128]!
+ vld1.8 {d8-d9}, [r5, : 128]!
+ vshl.i32 q5, q1, #1
+ vzip.i32 q3, q4
+ vshl.i32 q6, q2, #1
+ vld1.8 {d14}, [r4, : 64]
+ vshl.i32 q8, q3, #1
+ vld1.8 {d15}, [r5, : 64]
+ vshl.i32 q9, q4, #1
+ vmul.i32 d21, d7, d1
+ vtrn.32 d14, d15
+ vmul.i32 q11, q4, q0
+ vmul.i32 q0, q7, q0
+ vmull.s32 q12, d2, d2
+ vmlal.s32 q12, d11, d1
+ vmlal.s32 q12, d12, d0
+ vmlal.s32 q12, d13, d23
+ vmlal.s32 q12, d16, d22
+ vmlal.s32 q12, d7, d21
+ vmull.s32 q10, d2, d11
+ vmlal.s32 q10, d4, d1
+ vmlal.s32 q10, d13, d0
+ vmlal.s32 q10, d6, d23
+ vmlal.s32 q10, d17, d22
+ vmull.s32 q13, d10, d4
+ vmlal.s32 q13, d11, d3
+ vmlal.s32 q13, d13, d1
+ vmlal.s32 q13, d16, d0
+ vmlal.s32 q13, d17, d23
+ vmlal.s32 q13, d8, d22
+ vmull.s32 q1, d10, d5
+ vmlal.s32 q1, d11, d4
+ vmlal.s32 q1, d6, d1
+ vmlal.s32 q1, d17, d0
+ vmlal.s32 q1, d8, d23
+ vmull.s32 q14, d10, d6
+ vmlal.s32 q14, d11, d13
+ vmlal.s32 q14, d4, d4
+ vmlal.s32 q14, d17, d1
+ vmlal.s32 q14, d18, d0
+ vmlal.s32 q14, d9, d23
+ vmull.s32 q11, d10, d7
+ vmlal.s32 q11, d11, d6
+ vmlal.s32 q11, d12, d5
+ vmlal.s32 q11, d8, d1
+ vmlal.s32 q11, d19, d0
+ vmull.s32 q15, d10, d8
+ vmlal.s32 q15, d11, d17
+ vmlal.s32 q15, d12, d6
+ vmlal.s32 q15, d13, d5
+ vmlal.s32 q15, d19, d1
+ vmlal.s32 q15, d14, d0
+ vmull.s32 q2, d10, d9
+ vmlal.s32 q2, d11, d8
+ vmlal.s32 q2, d12, d7
+ vmlal.s32 q2, d13, d6
+ vmlal.s32 q2, d14, d1
+ vmull.s32 q0, d15, d1
+ vmlal.s32 q0, d10, d14
+ vmlal.s32 q0, d11, d19
+ vmlal.s32 q0, d12, d8
+ vmlal.s32 q0, d13, d17
+ vmlal.s32 q0, d6, d6
+ add r2, sp, #480
+ vld1.8 {d18-d19}, [r2, : 128]
+ vmull.s32 q3, d16, d7
+ vmlal.s32 q3, d10, d15
+ vmlal.s32 q3, d11, d14
+ vmlal.s32 q3, d12, d9
+ vmlal.s32 q3, d13, d8
+ add r2, sp, #496
+ vld1.8 {d8-d9}, [r2, : 128]
+ vadd.i64 q5, q12, q9
+ vadd.i64 q6, q15, q9
+ vshr.s64 q5, q5, #26
+ vshr.s64 q6, q6, #26
+ vadd.i64 q7, q10, q5
+ vshl.i64 q5, q5, #26
+ vadd.i64 q8, q7, q4
+ vadd.i64 q2, q2, q6
+ vshl.i64 q6, q6, #26
+ vadd.i64 q10, q2, q4
+ vsub.i64 q5, q12, q5
+ vshr.s64 q8, q8, #25
+ vsub.i64 q6, q15, q6
+ vshr.s64 q10, q10, #25
+ vadd.i64 q12, q13, q8
+ vshl.i64 q8, q8, #25
+ vadd.i64 q13, q12, q9
+ vadd.i64 q0, q0, q10
+ vsub.i64 q7, q7, q8
+ vshr.s64 q8, q13, #26
+ vshl.i64 q10, q10, #25
+ vadd.i64 q13, q0, q9
+ vadd.i64 q1, q1, q8
+ vshl.i64 q8, q8, #26
+ vadd.i64 q15, q1, q4
+ vsub.i64 q2, q2, q10
+ vshr.s64 q10, q13, #26
+ vsub.i64 q8, q12, q8
+ vshr.s64 q12, q15, #25
+ vadd.i64 q3, q3, q10
+ vshl.i64 q10, q10, #26
+ vadd.i64 q13, q3, q4
+ vadd.i64 q14, q14, q12
+ add r2, r3, #288
+ vshl.i64 q12, q12, #25
+ add r4, r3, #336
+ vadd.i64 q15, q14, q9
+ add r2, r2, #8
+ vsub.i64 q0, q0, q10
+ add r4, r4, #8
+ vshr.s64 q10, q13, #25
+ vsub.i64 q1, q1, q12
+ vshr.s64 q12, q15, #26
+ vadd.i64 q13, q10, q10
+ vadd.i64 q11, q11, q12
+ vtrn.32 d16, d2
+ vshl.i64 q12, q12, #26
+ vtrn.32 d17, d3
+ vadd.i64 q1, q11, q4
+ vadd.i64 q4, q5, q13
+ vst1.8 d16, [r2, : 64]!
+ vshl.i64 q5, q10, #4
+ vst1.8 d17, [r4, : 64]!
+ vsub.i64 q8, q14, q12
+ vshr.s64 q1, q1, #25
+ vadd.i64 q4, q4, q5
+ vadd.i64 q5, q6, q1
+ vshl.i64 q1, q1, #25
+ vadd.i64 q6, q5, q9
+ vadd.i64 q4, q4, q10
+ vshl.i64 q10, q10, #25
+ vadd.i64 q9, q4, q9
+ vsub.i64 q1, q11, q1
+ vshr.s64 q6, q6, #26
+ vsub.i64 q3, q3, q10
+ vtrn.32 d16, d2
+ vshr.s64 q9, q9, #26
+ vtrn.32 d17, d3
+ vadd.i64 q1, q2, q6
+ vst1.8 d16, [r2, : 64]
+ vshl.i64 q2, q6, #26
+ vst1.8 d17, [r4, : 64]
+ vadd.i64 q6, q7, q9
+ vtrn.32 d0, d6
+ vshl.i64 q7, q9, #26
+ vtrn.32 d1, d7
+ vsub.i64 q2, q5, q2
+ add r2, r2, #16
+ vsub.i64 q3, q4, q7
+ vst1.8 d0, [r2, : 64]
+ add r4, r4, #16
+ vst1.8 d1, [r4, : 64]
+ vtrn.32 d4, d2
+ vtrn.32 d5, d3
+ sub r2, r2, #8
+ sub r4, r4, #8
+ vtrn.32 d6, d12
+ vtrn.32 d7, d13
+ vst1.8 d4, [r2, : 64]
+ vst1.8 d5, [r4, : 64]
+ sub r2, r2, #24
+ sub r4, r4, #24
+ vst1.8 d6, [r2, : 64]
+ vst1.8 d7, [r4, : 64]
+ add r2, r3, #240
+ add r4, r3, #96
+ vld1.8 {d0-d1}, [r4, : 128]!
+ vld1.8 {d2-d3}, [r4, : 128]!
+ vld1.8 {d4}, [r4, : 64]
+ add r4, r3, #144
+ vld1.8 {d6-d7}, [r4, : 128]!
+ vtrn.32 q0, q3
+ vld1.8 {d8-d9}, [r4, : 128]!
+ vshl.i32 q5, q0, #4
+ vtrn.32 q1, q4
+ vshl.i32 q6, q3, #4
+ vadd.i32 q5, q5, q0
+ vadd.i32 q6, q6, q3
+ vshl.i32 q7, q1, #4
+ vld1.8 {d5}, [r4, : 64]
+ vshl.i32 q8, q4, #4
+ vtrn.32 d4, d5
+ vadd.i32 q7, q7, q1
+ vadd.i32 q8, q8, q4
+ vld1.8 {d18-d19}, [r2, : 128]!
+ vshl.i32 q10, q2, #4
+ vld1.8 {d22-d23}, [r2, : 128]!
+ vadd.i32 q10, q10, q2
+ vld1.8 {d24}, [r2, : 64]
+ vadd.i32 q5, q5, q0
+ add r2, r3, #192
+ vld1.8 {d26-d27}, [r2, : 128]!
+ vadd.i32 q6, q6, q3
+ vld1.8 {d28-d29}, [r2, : 128]!
+ vadd.i32 q8, q8, q4
+ vld1.8 {d25}, [r2, : 64]
+ vadd.i32 q10, q10, q2
+ vtrn.32 q9, q13
+ vadd.i32 q7, q7, q1
+ vadd.i32 q5, q5, q0
+ vtrn.32 q11, q14
+ vadd.i32 q6, q6, q3
+ add r2, sp, #528
+ vadd.i32 q10, q10, q2
+ vtrn.32 d24, d25
+ vst1.8 {d12-d13}, [r2, : 128]
+ vshl.i32 q6, q13, #1
+ add r2, sp, #544
+ vst1.8 {d20-d21}, [r2, : 128]
+ vshl.i32 q10, q14, #1
+ add r2, sp, #560
+ vst1.8 {d12-d13}, [r2, : 128]
+ vshl.i32 q15, q12, #1
+ vadd.i32 q8, q8, q4
+ vext.32 d10, d31, d30, #0
+ vadd.i32 q7, q7, q1
+ add r2, sp, #576
+ vst1.8 {d16-d17}, [r2, : 128]
+ vmull.s32 q8, d18, d5
+ vmlal.s32 q8, d26, d4
+ vmlal.s32 q8, d19, d9
+ vmlal.s32 q8, d27, d3
+ vmlal.s32 q8, d22, d8
+ vmlal.s32 q8, d28, d2
+ vmlal.s32 q8, d23, d7
+ vmlal.s32 q8, d29, d1
+ vmlal.s32 q8, d24, d6
+ vmlal.s32 q8, d25, d0
+ add r2, sp, #592
+ vst1.8 {d14-d15}, [r2, : 128]
+ vmull.s32 q2, d18, d4
+ vmlal.s32 q2, d12, d9
+ vmlal.s32 q2, d13, d8
+ vmlal.s32 q2, d19, d3
+ vmlal.s32 q2, d22, d2
+ vmlal.s32 q2, d23, d1
+ vmlal.s32 q2, d24, d0
+ add r2, sp, #608
+ vst1.8 {d20-d21}, [r2, : 128]
+ vmull.s32 q7, d18, d9
+ vmlal.s32 q7, d26, d3
+ vmlal.s32 q7, d19, d8
+ vmlal.s32 q7, d27, d2
+ vmlal.s32 q7, d22, d7
+ vmlal.s32 q7, d28, d1
+ vmlal.s32 q7, d23, d6
+ vmlal.s32 q7, d29, d0
+ add r2, sp, #624
+ vst1.8 {d10-d11}, [r2, : 128]
+ vmull.s32 q5, d18, d3
+ vmlal.s32 q5, d19, d2
+ vmlal.s32 q5, d22, d1
+ vmlal.s32 q5, d23, d0
+ vmlal.s32 q5, d12, d8
+ add r2, sp, #640
+ vst1.8 {d16-d17}, [r2, : 128]
+ vmull.s32 q4, d18, d8
+ vmlal.s32 q4, d26, d2
+ vmlal.s32 q4, d19, d7
+ vmlal.s32 q4, d27, d1
+ vmlal.s32 q4, d22, d6
+ vmlal.s32 q4, d28, d0
+ vmull.s32 q8, d18, d7
+ vmlal.s32 q8, d26, d1
+ vmlal.s32 q8, d19, d6
+ vmlal.s32 q8, d27, d0
+ add r2, sp, #544
+ vld1.8 {d20-d21}, [r2, : 128]
+ vmlal.s32 q7, d24, d21
+ vmlal.s32 q7, d25, d20
+ vmlal.s32 q4, d23, d21
+ vmlal.s32 q4, d29, d20
+ vmlal.s32 q8, d22, d21
+ vmlal.s32 q8, d28, d20
+ vmlal.s32 q5, d24, d20
+ add r2, sp, #544
+ vst1.8 {d14-d15}, [r2, : 128]
+ vmull.s32 q7, d18, d6
+ vmlal.s32 q7, d26, d0
+ add r2, sp, #624
+ vld1.8 {d30-d31}, [r2, : 128]
+ vmlal.s32 q2, d30, d21
+ vmlal.s32 q7, d19, d21
+ vmlal.s32 q7, d27, d20
+ add r2, sp, #592
+ vld1.8 {d26-d27}, [r2, : 128]
+ vmlal.s32 q4, d25, d27
+ vmlal.s32 q8, d29, d27
+ vmlal.s32 q8, d25, d26
+ vmlal.s32 q7, d28, d27
+ vmlal.s32 q7, d29, d26
+ add r2, sp, #576
+ vld1.8 {d28-d29}, [r2, : 128]
+ vmlal.s32 q4, d24, d29
+ vmlal.s32 q8, d23, d29
+ vmlal.s32 q8, d24, d28
+ vmlal.s32 q7, d22, d29
+ vmlal.s32 q7, d23, d28
+ add r2, sp, #576
+ vst1.8 {d8-d9}, [r2, : 128]
+ add r2, sp, #528
+ vld1.8 {d8-d9}, [r2, : 128]
+ vmlal.s32 q7, d24, d9
+ vmlal.s32 q7, d25, d31
+ vmull.s32 q1, d18, d2
+ vmlal.s32 q1, d19, d1
+ vmlal.s32 q1, d22, d0
+ vmlal.s32 q1, d24, d27
+ vmlal.s32 q1, d23, d20
+ vmlal.s32 q1, d12, d7
+ vmlal.s32 q1, d13, d6
+ vmull.s32 q6, d18, d1
+ vmlal.s32 q6, d19, d0
+ vmlal.s32 q6, d23, d27
+ vmlal.s32 q6, d22, d20
+ vmlal.s32 q6, d24, d26
+ vmull.s32 q0, d18, d0
+ vmlal.s32 q0, d22, d27
+ vmlal.s32 q0, d23, d26
+ vmlal.s32 q0, d24, d31
+ vmlal.s32 q0, d19, d20
+ add r2, sp, #608
+ vld1.8 {d18-d19}, [r2, : 128]
+ vmlal.s32 q2, d18, d7
+ vmlal.s32 q2, d19, d6
+ vmlal.s32 q5, d18, d6
+ vmlal.s32 q5, d19, d21
+ vmlal.s32 q1, d18, d21
+ vmlal.s32 q1, d19, d29
+ vmlal.s32 q0, d18, d28
+ vmlal.s32 q0, d19, d9
+ vmlal.s32 q6, d18, d29
+ vmlal.s32 q6, d19, d28
+ add r2, sp, #560
+ vld1.8 {d18-d19}, [r2, : 128]
+ add r2, sp, #480
+ vld1.8 {d22-d23}, [r2, : 128]
+ vmlal.s32 q5, d19, d7
+ vmlal.s32 q0, d18, d21
+ vmlal.s32 q0, d19, d29
+ vmlal.s32 q6, d18, d6
+ add r2, sp, #496
+ vld1.8 {d6-d7}, [r2, : 128]
+ vmlal.s32 q6, d19, d21
+ add r2, sp, #544
+ vld1.8 {d18-d19}, [r2, : 128]
+ vmlal.s32 q0, d30, d8
+ add r2, sp, #640
+ vld1.8 {d20-d21}, [r2, : 128]
+ vmlal.s32 q5, d30, d29
+ add r2, sp, #576
+ vld1.8 {d24-d25}, [r2, : 128]
+ vmlal.s32 q1, d30, d28
+ vadd.i64 q13, q0, q11
+ vadd.i64 q14, q5, q11
+ vmlal.s32 q6, d30, d9
+ vshr.s64 q4, q13, #26
+ vshr.s64 q13, q14, #26
+ vadd.i64 q7, q7, q4
+ vshl.i64 q4, q4, #26
+ vadd.i64 q14, q7, q3
+ vadd.i64 q9, q9, q13
+ vshl.i64 q13, q13, #26
+ vadd.i64 q15, q9, q3
+ vsub.i64 q0, q0, q4
+ vshr.s64 q4, q14, #25
+ vsub.i64 q5, q5, q13
+ vshr.s64 q13, q15, #25
+ vadd.i64 q6, q6, q4
+ vshl.i64 q4, q4, #25
+ vadd.i64 q14, q6, q11
+ vadd.i64 q2, q2, q13
+ vsub.i64 q4, q7, q4
+ vshr.s64 q7, q14, #26
+ vshl.i64 q13, q13, #25
+ vadd.i64 q14, q2, q11
+ vadd.i64 q8, q8, q7
+ vshl.i64 q7, q7, #26
+ vadd.i64 q15, q8, q3
+ vsub.i64 q9, q9, q13
+ vshr.s64 q13, q14, #26
+ vsub.i64 q6, q6, q7
+ vshr.s64 q7, q15, #25
+ vadd.i64 q10, q10, q13
+ vshl.i64 q13, q13, #26
+ vadd.i64 q14, q10, q3
+ vadd.i64 q1, q1, q7
+ add r2, r3, #144
+ vshl.i64 q7, q7, #25
+ add r4, r3, #96
+ vadd.i64 q15, q1, q11
+ add r2, r2, #8
+ vsub.i64 q2, q2, q13
+ add r4, r4, #8
+ vshr.s64 q13, q14, #25
+ vsub.i64 q7, q8, q7
+ vshr.s64 q8, q15, #26
+ vadd.i64 q14, q13, q13
+ vadd.i64 q12, q12, q8
+ vtrn.32 d12, d14
+ vshl.i64 q8, q8, #26
+ vtrn.32 d13, d15
+ vadd.i64 q3, q12, q3
+ vadd.i64 q0, q0, q14
+ vst1.8 d12, [r2, : 64]!
+ vshl.i64 q7, q13, #4
+ vst1.8 d13, [r4, : 64]!
+ vsub.i64 q1, q1, q8
+ vshr.s64 q3, q3, #25
+ vadd.i64 q0, q0, q7
+ vadd.i64 q5, q5, q3
+ vshl.i64 q3, q3, #25
+ vadd.i64 q6, q5, q11
+ vadd.i64 q0, q0, q13
+ vshl.i64 q7, q13, #25
+ vadd.i64 q8, q0, q11
+ vsub.i64 q3, q12, q3
+ vshr.s64 q6, q6, #26
+ vsub.i64 q7, q10, q7
+ vtrn.32 d2, d6
+ vshr.s64 q8, q8, #26
+ vtrn.32 d3, d7
+ vadd.i64 q3, q9, q6
+ vst1.8 d2, [r2, : 64]
+ vshl.i64 q6, q6, #26
+ vst1.8 d3, [r4, : 64]
+ vadd.i64 q1, q4, q8
+ vtrn.32 d4, d14
+ vshl.i64 q4, q8, #26
+ vtrn.32 d5, d15
+ vsub.i64 q5, q5, q6
+ add r2, r2, #16
+ vsub.i64 q0, q0, q4
+ vst1.8 d4, [r2, : 64]
+ add r4, r4, #16
+ vst1.8 d5, [r4, : 64]
+ vtrn.32 d10, d6
+ vtrn.32 d11, d7
+ sub r2, r2, #8
+ sub r4, r4, #8
+ vtrn.32 d0, d2
+ vtrn.32 d1, d3
+ vst1.8 d10, [r2, : 64]
+ vst1.8 d11, [r4, : 64]
+ sub r2, r2, #24
+ sub r4, r4, #24
+ vst1.8 d0, [r2, : 64]
+ vst1.8 d1, [r4, : 64]
+ add r2, r3, #288
+ add r4, r3, #336
+ vld1.8 {d0-d1}, [r2, : 128]!
+ vld1.8 {d2-d3}, [r4, : 128]!
+ vsub.i32 q0, q0, q1
+ vld1.8 {d2-d3}, [r2, : 128]!
+ vld1.8 {d4-d5}, [r4, : 128]!
+ vsub.i32 q1, q1, q2
+ add r5, r3, #240
+ vld1.8 {d4}, [r2, : 64]
+ vld1.8 {d6}, [r4, : 64]
+ vsub.i32 q2, q2, q3
+ vst1.8 {d0-d1}, [r5, : 128]!
+ vst1.8 {d2-d3}, [r5, : 128]!
+ vst1.8 d4, [r5, : 64]
+ add r2, r3, #144
+ add r4, r3, #96
+ add r5, r3, #144
+ add r6, r3, #192
+ vld1.8 {d0-d1}, [r2, : 128]!
+ vld1.8 {d2-d3}, [r4, : 128]!
+ vsub.i32 q2, q0, q1
+ vadd.i32 q0, q0, q1
+ vld1.8 {d2-d3}, [r2, : 128]!
+ vld1.8 {d6-d7}, [r4, : 128]!
+ vsub.i32 q4, q1, q3
+ vadd.i32 q1, q1, q3
+ vld1.8 {d6}, [r2, : 64]
+ vld1.8 {d10}, [r4, : 64]
+ vsub.i32 q6, q3, q5
+ vadd.i32 q3, q3, q5
+ vst1.8 {d4-d5}, [r5, : 128]!
+ vst1.8 {d0-d1}, [r6, : 128]!
+ vst1.8 {d8-d9}, [r5, : 128]!
+ vst1.8 {d2-d3}, [r6, : 128]!
+ vst1.8 d12, [r5, : 64]
+ vst1.8 d6, [r6, : 64]
+ add r2, r3, #0
+ add r4, r3, #240
+ vld1.8 {d0-d1}, [r4, : 128]!
+ vld1.8 {d2-d3}, [r4, : 128]!
+ vld1.8 {d4}, [r4, : 64]
+ add r4, r3, #336
+ vld1.8 {d6-d7}, [r4, : 128]!
+ vtrn.32 q0, q3
+ vld1.8 {d8-d9}, [r4, : 128]!
+ vshl.i32 q5, q0, #4
+ vtrn.32 q1, q4
+ vshl.i32 q6, q3, #4
+ vadd.i32 q5, q5, q0
+ vadd.i32 q6, q6, q3
+ vshl.i32 q7, q1, #4
+ vld1.8 {d5}, [r4, : 64]
+ vshl.i32 q8, q4, #4
+ vtrn.32 d4, d5
+ vadd.i32 q7, q7, q1
+ vadd.i32 q8, q8, q4
+ vld1.8 {d18-d19}, [r2, : 128]!
+ vshl.i32 q10, q2, #4
+ vld1.8 {d22-d23}, [r2, : 128]!
+ vadd.i32 q10, q10, q2
+ vld1.8 {d24}, [r2, : 64]
+ vadd.i32 q5, q5, q0
+ add r2, r3, #288
+ vld1.8 {d26-d27}, [r2, : 128]!
+ vadd.i32 q6, q6, q3
+ vld1.8 {d28-d29}, [r2, : 128]!
+ vadd.i32 q8, q8, q4
+ vld1.8 {d25}, [r2, : 64]
+ vadd.i32 q10, q10, q2
+ vtrn.32 q9, q13
+ vadd.i32 q7, q7, q1
+ vadd.i32 q5, q5, q0
+ vtrn.32 q11, q14
+ vadd.i32 q6, q6, q3
+ add r2, sp, #528
+ vadd.i32 q10, q10, q2
+ vtrn.32 d24, d25
+ vst1.8 {d12-d13}, [r2, : 128]
+ vshl.i32 q6, q13, #1
+ add r2, sp, #544
+ vst1.8 {d20-d21}, [r2, : 128]
+ vshl.i32 q10, q14, #1
+ add r2, sp, #560
+ vst1.8 {d12-d13}, [r2, : 128]
+ vshl.i32 q15, q12, #1
+ vadd.i32 q8, q8, q4
+ vext.32 d10, d31, d30, #0
+ vadd.i32 q7, q7, q1
+ add r2, sp, #576
+ vst1.8 {d16-d17}, [r2, : 128]
+ vmull.s32 q8, d18, d5
+ vmlal.s32 q8, d26, d4
+ vmlal.s32 q8, d19, d9
+ vmlal.s32 q8, d27, d3
+ vmlal.s32 q8, d22, d8
+ vmlal.s32 q8, d28, d2
+ vmlal.s32 q8, d23, d7
+ vmlal.s32 q8, d29, d1
+ vmlal.s32 q8, d24, d6
+ vmlal.s32 q8, d25, d0
+ add r2, sp, #592
+ vst1.8 {d14-d15}, [r2, : 128]
+ vmull.s32 q2, d18, d4
+ vmlal.s32 q2, d12, d9
+ vmlal.s32 q2, d13, d8
+ vmlal.s32 q2, d19, d3
+ vmlal.s32 q2, d22, d2
+ vmlal.s32 q2, d23, d1
+ vmlal.s32 q2, d24, d0
+ add r2, sp, #608
+ vst1.8 {d20-d21}, [r2, : 128]
+ vmull.s32 q7, d18, d9
+ vmlal.s32 q7, d26, d3
+ vmlal.s32 q7, d19, d8
+ vmlal.s32 q7, d27, d2
+ vmlal.s32 q7, d22, d7
+ vmlal.s32 q7, d28, d1
+ vmlal.s32 q7, d23, d6
+ vmlal.s32 q7, d29, d0
+ add r2, sp, #624
+ vst1.8 {d10-d11}, [r2, : 128]
+ vmull.s32 q5, d18, d3
+ vmlal.s32 q5, d19, d2
+ vmlal.s32 q5, d22, d1
+ vmlal.s32 q5, d23, d0
+ vmlal.s32 q5, d12, d8
+ add r2, sp, #640
+ vst1.8 {d16-d17}, [r2, : 128]
+ vmull.s32 q4, d18, d8
+ vmlal.s32 q4, d26, d2
+ vmlal.s32 q4, d19, d7
+ vmlal.s32 q4, d27, d1
+ vmlal.s32 q4, d22, d6
+ vmlal.s32 q4, d28, d0
+ vmull.s32 q8, d18, d7
+ vmlal.s32 q8, d26, d1
+ vmlal.s32 q8, d19, d6
+ vmlal.s32 q8, d27, d0
+ add r2, sp, #544
+ vld1.8 {d20-d21}, [r2, : 128]
+ vmlal.s32 q7, d24, d21
+ vmlal.s32 q7, d25, d20
+ vmlal.s32 q4, d23, d21
+ vmlal.s32 q4, d29, d20
+ vmlal.s32 q8, d22, d21
+ vmlal.s32 q8, d28, d20
+ vmlal.s32 q5, d24, d20
+ add r2, sp, #544
+ vst1.8 {d14-d15}, [r2, : 128]
+ vmull.s32 q7, d18, d6
+ vmlal.s32 q7, d26, d0
+ add r2, sp, #624
+ vld1.8 {d30-d31}, [r2, : 128]
+ vmlal.s32 q2, d30, d21
+ vmlal.s32 q7, d19, d21
+ vmlal.s32 q7, d27, d20
+ add r2, sp, #592
+ vld1.8 {d26-d27}, [r2, : 128]
+ vmlal.s32 q4, d25, d27
+ vmlal.s32 q8, d29, d27
+ vmlal.s32 q8, d25, d26
+ vmlal.s32 q7, d28, d27
+ vmlal.s32 q7, d29, d26
+ add r2, sp, #576
+ vld1.8 {d28-d29}, [r2, : 128]
+ vmlal.s32 q4, d24, d29
+ vmlal.s32 q8, d23, d29
+ vmlal.s32 q8, d24, d28
+ vmlal.s32 q7, d22, d29
+ vmlal.s32 q7, d23, d28
+ add r2, sp, #576
+ vst1.8 {d8-d9}, [r2, : 128]
+ add r2, sp, #528
+ vld1.8 {d8-d9}, [r2, : 128]
+ vmlal.s32 q7, d24, d9
+ vmlal.s32 q7, d25, d31
+ vmull.s32 q1, d18, d2
+ vmlal.s32 q1, d19, d1
+ vmlal.s32 q1, d22, d0
+ vmlal.s32 q1, d24, d27
+ vmlal.s32 q1, d23, d20
+ vmlal.s32 q1, d12, d7
+ vmlal.s32 q1, d13, d6
+ vmull.s32 q6, d18, d1
+ vmlal.s32 q6, d19, d0
+ vmlal.s32 q6, d23, d27
+ vmlal.s32 q6, d22, d20
+ vmlal.s32 q6, d24, d26
+ vmull.s32 q0, d18, d0
+ vmlal.s32 q0, d22, d27
+ vmlal.s32 q0, d23, d26
+ vmlal.s32 q0, d24, d31
+ vmlal.s32 q0, d19, d20
+ add r2, sp, #608
+ vld1.8 {d18-d19}, [r2, : 128]
+ vmlal.s32 q2, d18, d7
+ vmlal.s32 q2, d19, d6
+ vmlal.s32 q5, d18, d6
+ vmlal.s32 q5, d19, d21
+ vmlal.s32 q1, d18, d21
+ vmlal.s32 q1, d19, d29
+ vmlal.s32 q0, d18, d28
+ vmlal.s32 q0, d19, d9
+ vmlal.s32 q6, d18, d29
+ vmlal.s32 q6, d19, d28
+ add r2, sp, #560
+ vld1.8 {d18-d19}, [r2, : 128]
+ add r2, sp, #480
+ vld1.8 {d22-d23}, [r2, : 128]
+ vmlal.s32 q5, d19, d7
+ vmlal.s32 q0, d18, d21
+ vmlal.s32 q0, d19, d29
+ vmlal.s32 q6, d18, d6
+ add r2, sp, #496
+ vld1.8 {d6-d7}, [r2, : 128]
+ vmlal.s32 q6, d19, d21
+ add r2, sp, #544
+ vld1.8 {d18-d19}, [r2, : 128]
+ vmlal.s32 q0, d30, d8
+ add r2, sp, #640
+ vld1.8 {d20-d21}, [r2, : 128]
+ vmlal.s32 q5, d30, d29
+ add r2, sp, #576
+ vld1.8 {d24-d25}, [r2, : 128]
+ vmlal.s32 q1, d30, d28
+ vadd.i64 q13, q0, q11
+ vadd.i64 q14, q5, q11
+ vmlal.s32 q6, d30, d9
+ vshr.s64 q4, q13, #26
+ vshr.s64 q13, q14, #26
+ vadd.i64 q7, q7, q4
+ vshl.i64 q4, q4, #26
+ vadd.i64 q14, q7, q3
+ vadd.i64 q9, q9, q13
+ vshl.i64 q13, q13, #26
+ vadd.i64 q15, q9, q3
+ vsub.i64 q0, q0, q4
+ vshr.s64 q4, q14, #25
+ vsub.i64 q5, q5, q13
+ vshr.s64 q13, q15, #25
+ vadd.i64 q6, q6, q4
+ vshl.i64 q4, q4, #25
+ vadd.i64 q14, q6, q11
+ vadd.i64 q2, q2, q13
+ vsub.i64 q4, q7, q4
+ vshr.s64 q7, q14, #26
+ vshl.i64 q13, q13, #25
+ vadd.i64 q14, q2, q11
+ vadd.i64 q8, q8, q7
+ vshl.i64 q7, q7, #26
+ vadd.i64 q15, q8, q3
+ vsub.i64 q9, q9, q13
+ vshr.s64 q13, q14, #26
+ vsub.i64 q6, q6, q7
+ vshr.s64 q7, q15, #25
+ vadd.i64 q10, q10, q13
+ vshl.i64 q13, q13, #26
+ vadd.i64 q14, q10, q3
+ vadd.i64 q1, q1, q7
+ add r2, r3, #288
+ vshl.i64 q7, q7, #25
+ add r4, r3, #96
+ vadd.i64 q15, q1, q11
+ add r2, r2, #8
+ vsub.i64 q2, q2, q13
+ add r4, r4, #8
+ vshr.s64 q13, q14, #25
+ vsub.i64 q7, q8, q7
+ vshr.s64 q8, q15, #26
+ vadd.i64 q14, q13, q13
+ vadd.i64 q12, q12, q8
+ vtrn.32 d12, d14
+ vshl.i64 q8, q8, #26
+ vtrn.32 d13, d15
+ vadd.i64 q3, q12, q3
+ vadd.i64 q0, q0, q14
+ vst1.8 d12, [r2, : 64]!
+ vshl.i64 q7, q13, #4
+ vst1.8 d13, [r4, : 64]!
+ vsub.i64 q1, q1, q8
+ vshr.s64 q3, q3, #25
+ vadd.i64 q0, q0, q7
+ vadd.i64 q5, q5, q3
+ vshl.i64 q3, q3, #25
+ vadd.i64 q6, q5, q11
+ vadd.i64 q0, q0, q13
+ vshl.i64 q7, q13, #25
+ vadd.i64 q8, q0, q11
+ vsub.i64 q3, q12, q3
+ vshr.s64 q6, q6, #26
+ vsub.i64 q7, q10, q7
+ vtrn.32 d2, d6
+ vshr.s64 q8, q8, #26
+ vtrn.32 d3, d7
+ vadd.i64 q3, q9, q6
+ vst1.8 d2, [r2, : 64]
+ vshl.i64 q6, q6, #26
+ vst1.8 d3, [r4, : 64]
+ vadd.i64 q1, q4, q8
+ vtrn.32 d4, d14
+ vshl.i64 q4, q8, #26
+ vtrn.32 d5, d15
+ vsub.i64 q5, q5, q6
+ add r2, r2, #16
+ vsub.i64 q0, q0, q4
+ vst1.8 d4, [r2, : 64]
+ add r4, r4, #16
+ vst1.8 d5, [r4, : 64]
+ vtrn.32 d10, d6
+ vtrn.32 d11, d7
+ sub r2, r2, #8
+ sub r4, r4, #8
+ vtrn.32 d0, d2
+ vtrn.32 d1, d3
+ vst1.8 d10, [r2, : 64]
+ vst1.8 d11, [r4, : 64]
+ sub r2, r2, #24
+ sub r4, r4, #24
+ vst1.8 d0, [r2, : 64]
+ vst1.8 d1, [r4, : 64]
+ add r2, sp, #512
+ add r4, r3, #144
+ add r5, r3, #192
+ vld1.8 {d0-d1}, [r2, : 128]
+ vld1.8 {d2-d3}, [r4, : 128]!
+ vld1.8 {d4-d5}, [r5, : 128]!
+ vzip.i32 q1, q2
+ vld1.8 {d6-d7}, [r4, : 128]!
+ vld1.8 {d8-d9}, [r5, : 128]!
+ vshl.i32 q5, q1, #1
+ vzip.i32 q3, q4
+ vshl.i32 q6, q2, #1
+ vld1.8 {d14}, [r4, : 64]
+ vshl.i32 q8, q3, #1
+ vld1.8 {d15}, [r5, : 64]
+ vshl.i32 q9, q4, #1
+ vmul.i32 d21, d7, d1
+ vtrn.32 d14, d15
+ vmul.i32 q11, q4, q0
+ vmul.i32 q0, q7, q0
+ vmull.s32 q12, d2, d2
+ vmlal.s32 q12, d11, d1
+ vmlal.s32 q12, d12, d0
+ vmlal.s32 q12, d13, d23
+ vmlal.s32 q12, d16, d22
+ vmlal.s32 q12, d7, d21
+ vmull.s32 q10, d2, d11
+ vmlal.s32 q10, d4, d1
+ vmlal.s32 q10, d13, d0
+ vmlal.s32 q10, d6, d23
+ vmlal.s32 q10, d17, d22
+ vmull.s32 q13, d10, d4
+ vmlal.s32 q13, d11, d3
+ vmlal.s32 q13, d13, d1
+ vmlal.s32 q13, d16, d0
+ vmlal.s32 q13, d17, d23
+ vmlal.s32 q13, d8, d22
+ vmull.s32 q1, d10, d5
+ vmlal.s32 q1, d11, d4
+ vmlal.s32 q1, d6, d1
+ vmlal.s32 q1, d17, d0
+ vmlal.s32 q1, d8, d23
+ vmull.s32 q14, d10, d6
+ vmlal.s32 q14, d11, d13
+ vmlal.s32 q14, d4, d4
+ vmlal.s32 q14, d17, d1
+ vmlal.s32 q14, d18, d0
+ vmlal.s32 q14, d9, d23
+ vmull.s32 q11, d10, d7
+ vmlal.s32 q11, d11, d6
+ vmlal.s32 q11, d12, d5
+ vmlal.s32 q11, d8, d1
+ vmlal.s32 q11, d19, d0
+ vmull.s32 q15, d10, d8
+ vmlal.s32 q15, d11, d17
+ vmlal.s32 q15, d12, d6
+ vmlal.s32 q15, d13, d5
+ vmlal.s32 q15, d19, d1
+ vmlal.s32 q15, d14, d0
+ vmull.s32 q2, d10, d9
+ vmlal.s32 q2, d11, d8
+ vmlal.s32 q2, d12, d7
+ vmlal.s32 q2, d13, d6
+ vmlal.s32 q2, d14, d1
+ vmull.s32 q0, d15, d1
+ vmlal.s32 q0, d10, d14
+ vmlal.s32 q0, d11, d19
+ vmlal.s32 q0, d12, d8
+ vmlal.s32 q0, d13, d17
+ vmlal.s32 q0, d6, d6
+ add r2, sp, #480
+ vld1.8 {d18-d19}, [r2, : 128]
+ vmull.s32 q3, d16, d7
+ vmlal.s32 q3, d10, d15
+ vmlal.s32 q3, d11, d14
+ vmlal.s32 q3, d12, d9
+ vmlal.s32 q3, d13, d8
+ add r2, sp, #496
+ vld1.8 {d8-d9}, [r2, : 128]
+ vadd.i64 q5, q12, q9
+ vadd.i64 q6, q15, q9
+ vshr.s64 q5, q5, #26
+ vshr.s64 q6, q6, #26
+ vadd.i64 q7, q10, q5
+ vshl.i64 q5, q5, #26
+ vadd.i64 q8, q7, q4
+ vadd.i64 q2, q2, q6
+ vshl.i64 q6, q6, #26
+ vadd.i64 q10, q2, q4
+ vsub.i64 q5, q12, q5
+ vshr.s64 q8, q8, #25
+ vsub.i64 q6, q15, q6
+ vshr.s64 q10, q10, #25
+ vadd.i64 q12, q13, q8
+ vshl.i64 q8, q8, #25
+ vadd.i64 q13, q12, q9
+ vadd.i64 q0, q0, q10
+ vsub.i64 q7, q7, q8
+ vshr.s64 q8, q13, #26
+ vshl.i64 q10, q10, #25
+ vadd.i64 q13, q0, q9
+ vadd.i64 q1, q1, q8
+ vshl.i64 q8, q8, #26
+ vadd.i64 q15, q1, q4
+ vsub.i64 q2, q2, q10
+ vshr.s64 q10, q13, #26
+ vsub.i64 q8, q12, q8
+ vshr.s64 q12, q15, #25
+ vadd.i64 q3, q3, q10
+ vshl.i64 q10, q10, #26
+ vadd.i64 q13, q3, q4
+ vadd.i64 q14, q14, q12
+ add r2, r3, #144
+ vshl.i64 q12, q12, #25
+ add r4, r3, #192
+ vadd.i64 q15, q14, q9
+ add r2, r2, #8
+ vsub.i64 q0, q0, q10
+ add r4, r4, #8
+ vshr.s64 q10, q13, #25
+ vsub.i64 q1, q1, q12
+ vshr.s64 q12, q15, #26
+ vadd.i64 q13, q10, q10
+ vadd.i64 q11, q11, q12
+ vtrn.32 d16, d2
+ vshl.i64 q12, q12, #26
+ vtrn.32 d17, d3
+ vadd.i64 q1, q11, q4
+ vadd.i64 q4, q5, q13
+ vst1.8 d16, [r2, : 64]!
+ vshl.i64 q5, q10, #4
+ vst1.8 d17, [r4, : 64]!
+ vsub.i64 q8, q14, q12
+ vshr.s64 q1, q1, #25
+ vadd.i64 q4, q4, q5
+ vadd.i64 q5, q6, q1
+ vshl.i64 q1, q1, #25
+ vadd.i64 q6, q5, q9
+ vadd.i64 q4, q4, q10
+ vshl.i64 q10, q10, #25
+ vadd.i64 q9, q4, q9
+ vsub.i64 q1, q11, q1
+ vshr.s64 q6, q6, #26
+ vsub.i64 q3, q3, q10
+ vtrn.32 d16, d2
+ vshr.s64 q9, q9, #26
+ vtrn.32 d17, d3
+ vadd.i64 q1, q2, q6
+ vst1.8 d16, [r2, : 64]
+ vshl.i64 q2, q6, #26
+ vst1.8 d17, [r4, : 64]
+ vadd.i64 q6, q7, q9
+ vtrn.32 d0, d6
+ vshl.i64 q7, q9, #26
+ vtrn.32 d1, d7
+ vsub.i64 q2, q5, q2
+ add r2, r2, #16
+ vsub.i64 q3, q4, q7
+ vst1.8 d0, [r2, : 64]
+ add r4, r4, #16
+ vst1.8 d1, [r4, : 64]
+ vtrn.32 d4, d2
+ vtrn.32 d5, d3
+ sub r2, r2, #8
+ sub r4, r4, #8
+ vtrn.32 d6, d12
+ vtrn.32 d7, d13
+ vst1.8 d4, [r2, : 64]
+ vst1.8 d5, [r4, : 64]
+ sub r2, r2, #24
+ sub r4, r4, #24
+ vst1.8 d6, [r2, : 64]
+ vst1.8 d7, [r4, : 64]
+ add r2, r3, #336
+ add r4, r3, #288
+ vld1.8 {d0-d1}, [r2, : 128]!
+ vld1.8 {d2-d3}, [r4, : 128]!
+ vadd.i32 q0, q0, q1
+ vld1.8 {d2-d3}, [r2, : 128]!
+ vld1.8 {d4-d5}, [r4, : 128]!
+ vadd.i32 q1, q1, q2
+ add r5, r3, #288
+ vld1.8 {d4}, [r2, : 64]
+ vld1.8 {d6}, [r4, : 64]
+ vadd.i32 q2, q2, q3
+ vst1.8 {d0-d1}, [r5, : 128]!
+ vst1.8 {d2-d3}, [r5, : 128]!
+ vst1.8 d4, [r5, : 64]
+ add r2, r3, #48
+ add r4, r3, #144
+ vld1.8 {d0-d1}, [r4, : 128]!
+ vld1.8 {d2-d3}, [r4, : 128]!
+ vld1.8 {d4}, [r4, : 64]
+ add r4, r3, #288
+ vld1.8 {d6-d7}, [r4, : 128]!
+ vtrn.32 q0, q3
+ vld1.8 {d8-d9}, [r4, : 128]!
+ vshl.i32 q5, q0, #4
+ vtrn.32 q1, q4
+ vshl.i32 q6, q3, #4
+ vadd.i32 q5, q5, q0
+ vadd.i32 q6, q6, q3
+ vshl.i32 q7, q1, #4
+ vld1.8 {d5}, [r4, : 64]
+ vshl.i32 q8, q4, #4
+ vtrn.32 d4, d5
+ vadd.i32 q7, q7, q1
+ vadd.i32 q8, q8, q4
+ vld1.8 {d18-d19}, [r2, : 128]!
+ vshl.i32 q10, q2, #4
+ vld1.8 {d22-d23}, [r2, : 128]!
+ vadd.i32 q10, q10, q2
+ vld1.8 {d24}, [r2, : 64]
+ vadd.i32 q5, q5, q0
+ add r2, r3, #240
+ vld1.8 {d26-d27}, [r2, : 128]!
+ vadd.i32 q6, q6, q3
+ vld1.8 {d28-d29}, [r2, : 128]!
+ vadd.i32 q8, q8, q4
+ vld1.8 {d25}, [r2, : 64]
+ vadd.i32 q10, q10, q2
+ vtrn.32 q9, q13
+ vadd.i32 q7, q7, q1
+ vadd.i32 q5, q5, q0
+ vtrn.32 q11, q14
+ vadd.i32 q6, q6, q3
+ add r2, sp, #528
+ vadd.i32 q10, q10, q2
+ vtrn.32 d24, d25
+ vst1.8 {d12-d13}, [r2, : 128]
+ vshl.i32 q6, q13, #1
+ add r2, sp, #544
+ vst1.8 {d20-d21}, [r2, : 128]
+ vshl.i32 q10, q14, #1
+ add r2, sp, #560
+ vst1.8 {d12-d13}, [r2, : 128]
+ vshl.i32 q15, q12, #1
+ vadd.i32 q8, q8, q4
+ vext.32 d10, d31, d30, #0
+ vadd.i32 q7, q7, q1
+ add r2, sp, #576
+ vst1.8 {d16-d17}, [r2, : 128]
+ vmull.s32 q8, d18, d5
+ vmlal.s32 q8, d26, d4
+ vmlal.s32 q8, d19, d9
+ vmlal.s32 q8, d27, d3
+ vmlal.s32 q8, d22, d8
+ vmlal.s32 q8, d28, d2
+ vmlal.s32 q8, d23, d7
+ vmlal.s32 q8, d29, d1
+ vmlal.s32 q8, d24, d6
+ vmlal.s32 q8, d25, d0
+ add r2, sp, #592
+ vst1.8 {d14-d15}, [r2, : 128]
+ vmull.s32 q2, d18, d4
+ vmlal.s32 q2, d12, d9
+ vmlal.s32 q2, d13, d8
+ vmlal.s32 q2, d19, d3
+ vmlal.s32 q2, d22, d2
+ vmlal.s32 q2, d23, d1
+ vmlal.s32 q2, d24, d0
+ add r2, sp, #608
+ vst1.8 {d20-d21}, [r2, : 128]
+ vmull.s32 q7, d18, d9
+ vmlal.s32 q7, d26, d3
+ vmlal.s32 q7, d19, d8
+ vmlal.s32 q7, d27, d2
+ vmlal.s32 q7, d22, d7
+ vmlal.s32 q7, d28, d1
+ vmlal.s32 q7, d23, d6
+ vmlal.s32 q7, d29, d0
+ add r2, sp, #624
+ vst1.8 {d10-d11}, [r2, : 128]
+ vmull.s32 q5, d18, d3
+ vmlal.s32 q5, d19, d2
+ vmlal.s32 q5, d22, d1
+ vmlal.s32 q5, d23, d0
+ vmlal.s32 q5, d12, d8
+ add r2, sp, #640
+ vst1.8 {d16-d17}, [r2, : 128]
+ vmull.s32 q4, d18, d8
+ vmlal.s32 q4, d26, d2
+ vmlal.s32 q4, d19, d7
+ vmlal.s32 q4, d27, d1
+ vmlal.s32 q4, d22, d6
+ vmlal.s32 q4, d28, d0
+ vmull.s32 q8, d18, d7
+ vmlal.s32 q8, d26, d1
+ vmlal.s32 q8, d19, d6
+ vmlal.s32 q8, d27, d0
+ add r2, sp, #544
+ vld1.8 {d20-d21}, [r2, : 128]
+ vmlal.s32 q7, d24, d21
+ vmlal.s32 q7, d25, d20
+ vmlal.s32 q4, d23, d21
+ vmlal.s32 q4, d29, d20
+ vmlal.s32 q8, d22, d21
+ vmlal.s32 q8, d28, d20
+ vmlal.s32 q5, d24, d20
+ add r2, sp, #544
+ vst1.8 {d14-d15}, [r2, : 128]
+ vmull.s32 q7, d18, d6
+ vmlal.s32 q7, d26, d0
+ add r2, sp, #624
+ vld1.8 {d30-d31}, [r2, : 128]
+ vmlal.s32 q2, d30, d21
+ vmlal.s32 q7, d19, d21
+ vmlal.s32 q7, d27, d20
+ add r2, sp, #592
+ vld1.8 {d26-d27}, [r2, : 128]
+ vmlal.s32 q4, d25, d27
+ vmlal.s32 q8, d29, d27
+ vmlal.s32 q8, d25, d26
+ vmlal.s32 q7, d28, d27
+ vmlal.s32 q7, d29, d26
+ add r2, sp, #576
+ vld1.8 {d28-d29}, [r2, : 128]
+ vmlal.s32 q4, d24, d29
+ vmlal.s32 q8, d23, d29
+ vmlal.s32 q8, d24, d28
+ vmlal.s32 q7, d22, d29
+ vmlal.s32 q7, d23, d28
+ add r2, sp, #576
+ vst1.8 {d8-d9}, [r2, : 128]
+ add r2, sp, #528
+ vld1.8 {d8-d9}, [r2, : 128]
+ vmlal.s32 q7, d24, d9
+ vmlal.s32 q7, d25, d31
+ vmull.s32 q1, d18, d2
+ vmlal.s32 q1, d19, d1
+ vmlal.s32 q1, d22, d0
+ vmlal.s32 q1, d24, d27
+ vmlal.s32 q1, d23, d20
+ vmlal.s32 q1, d12, d7
+ vmlal.s32 q1, d13, d6
+ vmull.s32 q6, d18, d1
+ vmlal.s32 q6, d19, d0
+ vmlal.s32 q6, d23, d27
+ vmlal.s32 q6, d22, d20
+ vmlal.s32 q6, d24, d26
+ vmull.s32 q0, d18, d0
+ vmlal.s32 q0, d22, d27
+ vmlal.s32 q0, d23, d26
+ vmlal.s32 q0, d24, d31
+ vmlal.s32 q0, d19, d20
+ add r2, sp, #608
+ vld1.8 {d18-d19}, [r2, : 128]
+ vmlal.s32 q2, d18, d7
+ vmlal.s32 q2, d19, d6
+ vmlal.s32 q5, d18, d6
+ vmlal.s32 q5, d19, d21
+ vmlal.s32 q1, d18, d21
+ vmlal.s32 q1, d19, d29
+ vmlal.s32 q0, d18, d28
+ vmlal.s32 q0, d19, d9
+ vmlal.s32 q6, d18, d29
+ vmlal.s32 q6, d19, d28
+ add r2, sp, #560
+ vld1.8 {d18-d19}, [r2, : 128]
+ add r2, sp, #480
+ vld1.8 {d22-d23}, [r2, : 128]
+ vmlal.s32 q5, d19, d7
+ vmlal.s32 q0, d18, d21
+ vmlal.s32 q0, d19, d29
+ vmlal.s32 q6, d18, d6
+ add r2, sp, #496
+ vld1.8 {d6-d7}, [r2, : 128]
+ vmlal.s32 q6, d19, d21
+ add r2, sp, #544
+ vld1.8 {d18-d19}, [r2, : 128]
+ vmlal.s32 q0, d30, d8
+ add r2, sp, #640
+ vld1.8 {d20-d21}, [r2, : 128]
+ vmlal.s32 q5, d30, d29
+ add r2, sp, #576
+ vld1.8 {d24-d25}, [r2, : 128]
+ vmlal.s32 q1, d30, d28
+ vadd.i64 q13, q0, q11
+ vadd.i64 q14, q5, q11
+ vmlal.s32 q6, d30, d9
+ vshr.s64 q4, q13, #26
+ vshr.s64 q13, q14, #26
+ vadd.i64 q7, q7, q4
+ vshl.i64 q4, q4, #26
+ vadd.i64 q14, q7, q3
+ vadd.i64 q9, q9, q13
+ vshl.i64 q13, q13, #26
+ vadd.i64 q15, q9, q3
+ vsub.i64 q0, q0, q4
+ vshr.s64 q4, q14, #25
+ vsub.i64 q5, q5, q13
+ vshr.s64 q13, q15, #25
+ vadd.i64 q6, q6, q4
+ vshl.i64 q4, q4, #25
+ vadd.i64 q14, q6, q11
+ vadd.i64 q2, q2, q13
+ vsub.i64 q4, q7, q4
+ vshr.s64 q7, q14, #26
+ vshl.i64 q13, q13, #25
+ vadd.i64 q14, q2, q11
+ vadd.i64 q8, q8, q7
+ vshl.i64 q7, q7, #26
+ vadd.i64 q15, q8, q3
+ vsub.i64 q9, q9, q13
+ vshr.s64 q13, q14, #26
+ vsub.i64 q6, q6, q7
+ vshr.s64 q7, q15, #25
+ vadd.i64 q10, q10, q13
+ vshl.i64 q13, q13, #26
+ vadd.i64 q14, q10, q3
+ vadd.i64 q1, q1, q7
+ add r2, r3, #240
+ vshl.i64 q7, q7, #25
+ add r4, r3, #144
+ vadd.i64 q15, q1, q11
+ add r2, r2, #8
+ vsub.i64 q2, q2, q13
+ add r4, r4, #8
+ vshr.s64 q13, q14, #25
+ vsub.i64 q7, q8, q7
+ vshr.s64 q8, q15, #26
+ vadd.i64 q14, q13, q13
+ vadd.i64 q12, q12, q8
+ vtrn.32 d12, d14
+ vshl.i64 q8, q8, #26
+ vtrn.32 d13, d15
+ vadd.i64 q3, q12, q3
+ vadd.i64 q0, q0, q14
+ vst1.8 d12, [r2, : 64]!
+ vshl.i64 q7, q13, #4
+ vst1.8 d13, [r4, : 64]!
+ vsub.i64 q1, q1, q8
+ vshr.s64 q3, q3, #25
+ vadd.i64 q0, q0, q7
+ vadd.i64 q5, q5, q3
+ vshl.i64 q3, q3, #25
+ vadd.i64 q6, q5, q11
+ vadd.i64 q0, q0, q13
+ vshl.i64 q7, q13, #25
+ vadd.i64 q8, q0, q11
+ vsub.i64 q3, q12, q3
+ vshr.s64 q6, q6, #26
+ vsub.i64 q7, q10, q7
+ vtrn.32 d2, d6
+ vshr.s64 q8, q8, #26
+ vtrn.32 d3, d7
+ vadd.i64 q3, q9, q6
+ vst1.8 d2, [r2, : 64]
+ vshl.i64 q6, q6, #26
+ vst1.8 d3, [r4, : 64]
+ vadd.i64 q1, q4, q8
+ vtrn.32 d4, d14
+ vshl.i64 q4, q8, #26
+ vtrn.32 d5, d15
+ vsub.i64 q5, q5, q6
+ add r2, r2, #16
+ vsub.i64 q0, q0, q4
+ vst1.8 d4, [r2, : 64]
+ add r4, r4, #16
+ vst1.8 d5, [r4, : 64]
+ vtrn.32 d10, d6
+ vtrn.32 d11, d7
+ sub r2, r2, #8
+ sub r4, r4, #8
+ vtrn.32 d0, d2
+ vtrn.32 d1, d3
+ vst1.8 d10, [r2, : 64]
+ vst1.8 d11, [r4, : 64]
+ sub r2, r2, #24
+ sub r4, r4, #24
+ vst1.8 d0, [r2, : 64]
+ vst1.8 d1, [r4, : 64]
+ ldr r2, [sp, #456]
+ ldr r4, [sp, #460]
+ subs r5, r2, #1
+ bge .Lmainloop
+ add r1, r3, #144
+ add r2, r3, #336
+ vld1.8 {d0-d1}, [r1, : 128]!
+ vld1.8 {d2-d3}, [r1, : 128]!
+ vld1.8 {d4}, [r1, : 64]
+ vst1.8 {d0-d1}, [r2, : 128]!
+ vst1.8 {d2-d3}, [r2, : 128]!
+ vst1.8 d4, [r2, : 64]
+ movw r1, #0
+.Linvertloop:
+ add r2, r3, #144
+ movw r4, #0
+ movw r5, #2
+ cmp r1, #1
+ moveq r5, #1
+ addeq r2, r3, #336
+ addeq r4, r3, #48
+ cmp r1, #2
+ moveq r5, #1
+ addeq r2, r3, #48
+ cmp r1, #3
+ moveq r5, #5
+ addeq r4, r3, #336
+ cmp r1, #4
+ moveq r5, #10
+ cmp r1, #5
+ moveq r5, #20
+ cmp r1, #6
+ moveq r5, #10
+ addeq r2, r3, #336
+ addeq r4, r3, #336
+ cmp r1, #7
+ moveq r5, #50
+ cmp r1, #8
+ moveq r5, #100
+ cmp r1, #9
+ moveq r5, #50
+ addeq r2, r3, #336
+ cmp r1, #10
+ moveq r5, #5
+ addeq r2, r3, #48
+ cmp r1, #11
+ moveq r5, #0
+ addeq r2, r3, #96
+ add r6, r3, #144
+ add r7, r3, #288
+ vld1.8 {d0-d1}, [r6, : 128]!
+ vld1.8 {d2-d3}, [r6, : 128]!
+ vld1.8 {d4}, [r6, : 64]
+ vst1.8 {d0-d1}, [r7, : 128]!
+ vst1.8 {d2-d3}, [r7, : 128]!
+ vst1.8 d4, [r7, : 64]
+ cmp r5, #0
+ beq .Lskipsquaringloop
+.Lsquaringloop:
+ add r6, r3, #288
+ add r7, r3, #288
+ add r8, r3, #288
+ vmov.i32 q0, #19
+ vmov.i32 q1, #0
+ vmov.i32 q2, #1
+ vzip.i32 q1, q2
+ vld1.8 {d4-d5}, [r7, : 128]!
+ vld1.8 {d6-d7}, [r7, : 128]!
+ vld1.8 {d9}, [r7, : 64]
+ vld1.8 {d10-d11}, [r6, : 128]!
+ add r7, sp, #384
+ vld1.8 {d12-d13}, [r6, : 128]!
+ vmul.i32 q7, q2, q0
+ vld1.8 {d8}, [r6, : 64]
+ vext.32 d17, d11, d10, #1
+ vmul.i32 q9, q3, q0
+ vext.32 d16, d10, d8, #1
+ vshl.u32 q10, q5, q1
+ vext.32 d22, d14, d4, #1
+ vext.32 d24, d18, d6, #1
+ vshl.u32 q13, q6, q1
+ vshl.u32 d28, d8, d2
+ vrev64.i32 d22, d22
+ vmul.i32 d1, d9, d1
+ vrev64.i32 d24, d24
+ vext.32 d29, d8, d13, #1
+ vext.32 d0, d1, d9, #1
+ vrev64.i32 d0, d0
+ vext.32 d2, d9, d1, #1
+ vext.32 d23, d15, d5, #1
+ vmull.s32 q4, d20, d4
+ vrev64.i32 d23, d23
+ vmlal.s32 q4, d21, d1
+ vrev64.i32 d2, d2
+ vmlal.s32 q4, d26, d19
+ vext.32 d3, d5, d15, #1
+ vmlal.s32 q4, d27, d18
+ vrev64.i32 d3, d3
+ vmlal.s32 q4, d28, d15
+ vext.32 d14, d12, d11, #1
+ vmull.s32 q5, d16, d23
+ vext.32 d15, d13, d12, #1
+ vmlal.s32 q5, d17, d4
+ vst1.8 d8, [r7, : 64]!
+ vmlal.s32 q5, d14, d1
+ vext.32 d12, d9, d8, #0
+ vmlal.s32 q5, d15, d19
+ vmov.i64 d13, #0
+ vmlal.s32 q5, d29, d18
+ vext.32 d25, d19, d7, #1
+ vmlal.s32 q6, d20, d5
+ vrev64.i32 d25, d25
+ vmlal.s32 q6, d21, d4
+ vst1.8 d11, [r7, : 64]!
+ vmlal.s32 q6, d26, d1
+ vext.32 d9, d10, d10, #0
+ vmlal.s32 q6, d27, d19
+ vmov.i64 d8, #0
+ vmlal.s32 q6, d28, d18
+ vmlal.s32 q4, d16, d24
+ vmlal.s32 q4, d17, d5
+ vmlal.s32 q4, d14, d4
+ vst1.8 d12, [r7, : 64]!
+ vmlal.s32 q4, d15, d1
+ vext.32 d10, d13, d12, #0
+ vmlal.s32 q4, d29, d19
+ vmov.i64 d11, #0
+ vmlal.s32 q5, d20, d6
+ vmlal.s32 q5, d21, d5
+ vmlal.s32 q5, d26, d4
+ vext.32 d13, d8, d8, #0
+ vmlal.s32 q5, d27, d1
+ vmov.i64 d12, #0
+ vmlal.s32 q5, d28, d19
+ vst1.8 d9, [r7, : 64]!
+ vmlal.s32 q6, d16, d25
+ vmlal.s32 q6, d17, d6
+ vst1.8 d10, [r7, : 64]
+ vmlal.s32 q6, d14, d5
+ vext.32 d8, d11, d10, #0
+ vmlal.s32 q6, d15, d4
+ vmov.i64 d9, #0
+ vmlal.s32 q6, d29, d1
+ vmlal.s32 q4, d20, d7
+ vmlal.s32 q4, d21, d6
+ vmlal.s32 q4, d26, d5
+ vext.32 d11, d12, d12, #0
+ vmlal.s32 q4, d27, d4
+ vmov.i64 d10, #0
+ vmlal.s32 q4, d28, d1
+ vmlal.s32 q5, d16, d0
+ sub r6, r7, #32
+ vmlal.s32 q5, d17, d7
+ vmlal.s32 q5, d14, d6
+ vext.32 d30, d9, d8, #0
+ vmlal.s32 q5, d15, d5
+ vld1.8 {d31}, [r6, : 64]!
+ vmlal.s32 q5, d29, d4
+ vmlal.s32 q15, d20, d0
+ vext.32 d0, d6, d18, #1
+ vmlal.s32 q15, d21, d25
+ vrev64.i32 d0, d0
+ vmlal.s32 q15, d26, d24
+ vext.32 d1, d7, d19, #1
+ vext.32 d7, d10, d10, #0
+ vmlal.s32 q15, d27, d23
+ vrev64.i32 d1, d1
+ vld1.8 {d6}, [r6, : 64]
+ vmlal.s32 q15, d28, d22
+ vmlal.s32 q3, d16, d4
+ add r6, r6, #24
+ vmlal.s32 q3, d17, d2
+ vext.32 d4, d31, d30, #0
+ vmov d17, d11
+ vmlal.s32 q3, d14, d1
+ vext.32 d11, d13, d13, #0
+ vext.32 d13, d30, d30, #0
+ vmlal.s32 q3, d15, d0
+ vext.32 d1, d8, d8, #0
+ vmlal.s32 q3, d29, d3
+ vld1.8 {d5}, [r6, : 64]
+ sub r6, r6, #16
+ vext.32 d10, d6, d6, #0
+ vmov.i32 q1, #0xffffffff
+ vshl.i64 q4, q1, #25
+ add r7, sp, #480
+ vld1.8 {d14-d15}, [r7, : 128]
+ vadd.i64 q9, q2, q7
+ vshl.i64 q1, q1, #26
+ vshr.s64 q10, q9, #26
+ vld1.8 {d0}, [r6, : 64]!
+ vadd.i64 q5, q5, q10
+ vand q9, q9, q1
+ vld1.8 {d16}, [r6, : 64]!
+ add r6, sp, #496
+ vld1.8 {d20-d21}, [r6, : 128]
+ vadd.i64 q11, q5, q10
+ vsub.i64 q2, q2, q9
+ vshr.s64 q9, q11, #25
+ vext.32 d12, d5, d4, #0
+ vand q11, q11, q4
+ vadd.i64 q0, q0, q9
+ vmov d19, d7
+ vadd.i64 q3, q0, q7
+ vsub.i64 q5, q5, q11
+ vshr.s64 q11, q3, #26
+ vext.32 d18, d11, d10, #0
+ vand q3, q3, q1
+ vadd.i64 q8, q8, q11
+ vadd.i64 q11, q8, q10
+ vsub.i64 q0, q0, q3
+ vshr.s64 q3, q11, #25
+ vand q11, q11, q4
+ vadd.i64 q3, q6, q3
+ vadd.i64 q6, q3, q7
+ vsub.i64 q8, q8, q11
+ vshr.s64 q11, q6, #26
+ vand q6, q6, q1
+ vadd.i64 q9, q9, q11
+ vadd.i64 d25, d19, d21
+ vsub.i64 q3, q3, q6
+ vshr.s64 d23, d25, #25
+ vand q4, q12, q4
+ vadd.i64 d21, d23, d23
+ vshl.i64 d25, d23, #4
+ vadd.i64 d21, d21, d23
+ vadd.i64 d25, d25, d21
+ vadd.i64 d4, d4, d25
+ vzip.i32 q0, q8
+ vadd.i64 d12, d4, d14
+ add r6, r8, #8
+ vst1.8 d0, [r6, : 64]
+ vsub.i64 d19, d19, d9
+ add r6, r6, #16
+ vst1.8 d16, [r6, : 64]
+ vshr.s64 d22, d12, #26
+ vand q0, q6, q1
+ vadd.i64 d10, d10, d22
+ vzip.i32 q3, q9
+ vsub.i64 d4, d4, d0
+ sub r6, r6, #8
+ vst1.8 d6, [r6, : 64]
+ add r6, r6, #16
+ vst1.8 d18, [r6, : 64]
+ vzip.i32 q2, q5
+ sub r6, r6, #32
+ vst1.8 d4, [r6, : 64]
+ subs r5, r5, #1
+ bhi .Lsquaringloop
+.Lskipsquaringloop:
+ mov r2, r2
+ add r5, r3, #288
+ add r6, r3, #144
+ vmov.i32 q0, #19
+ vmov.i32 q1, #0
+ vmov.i32 q2, #1
+ vzip.i32 q1, q2
+ vld1.8 {d4-d5}, [r5, : 128]!
+ vld1.8 {d6-d7}, [r5, : 128]!
+ vld1.8 {d9}, [r5, : 64]
+ vld1.8 {d10-d11}, [r2, : 128]!
+ add r5, sp, #384
+ vld1.8 {d12-d13}, [r2, : 128]!
+ vmul.i32 q7, q2, q0
+ vld1.8 {d8}, [r2, : 64]
+ vext.32 d17, d11, d10, #1
+ vmul.i32 q9, q3, q0
+ vext.32 d16, d10, d8, #1
+ vshl.u32 q10, q5, q1
+ vext.32 d22, d14, d4, #1
+ vext.32 d24, d18, d6, #1
+ vshl.u32 q13, q6, q1
+ vshl.u32 d28, d8, d2
+ vrev64.i32 d22, d22
+ vmul.i32 d1, d9, d1
+ vrev64.i32 d24, d24
+ vext.32 d29, d8, d13, #1
+ vext.32 d0, d1, d9, #1
+ vrev64.i32 d0, d0
+ vext.32 d2, d9, d1, #1
+ vext.32 d23, d15, d5, #1
+ vmull.s32 q4, d20, d4
+ vrev64.i32 d23, d23
+ vmlal.s32 q4, d21, d1
+ vrev64.i32 d2, d2
+ vmlal.s32 q4, d26, d19
+ vext.32 d3, d5, d15, #1
+ vmlal.s32 q4, d27, d18
+ vrev64.i32 d3, d3
+ vmlal.s32 q4, d28, d15
+ vext.32 d14, d12, d11, #1
+ vmull.s32 q5, d16, d23
+ vext.32 d15, d13, d12, #1
+ vmlal.s32 q5, d17, d4
+ vst1.8 d8, [r5, : 64]!
+ vmlal.s32 q5, d14, d1
+ vext.32 d12, d9, d8, #0
+ vmlal.s32 q5, d15, d19
+ vmov.i64 d13, #0
+ vmlal.s32 q5, d29, d18
+ vext.32 d25, d19, d7, #1
+ vmlal.s32 q6, d20, d5
+ vrev64.i32 d25, d25
+ vmlal.s32 q6, d21, d4
+ vst1.8 d11, [r5, : 64]!
+ vmlal.s32 q6, d26, d1
+ vext.32 d9, d10, d10, #0
+ vmlal.s32 q6, d27, d19
+ vmov.i64 d8, #0
+ vmlal.s32 q6, d28, d18
+ vmlal.s32 q4, d16, d24
+ vmlal.s32 q4, d17, d5
+ vmlal.s32 q4, d14, d4
+ vst1.8 d12, [r5, : 64]!
+ vmlal.s32 q4, d15, d1
+ vext.32 d10, d13, d12, #0
+ vmlal.s32 q4, d29, d19
+ vmov.i64 d11, #0
+ vmlal.s32 q5, d20, d6
+ vmlal.s32 q5, d21, d5
+ vmlal.s32 q5, d26, d4
+ vext.32 d13, d8, d8, #0
+ vmlal.s32 q5, d27, d1
+ vmov.i64 d12, #0
+ vmlal.s32 q5, d28, d19
+ vst1.8 d9, [r5, : 64]!
+ vmlal.s32 q6, d16, d25
+ vmlal.s32 q6, d17, d6
+ vst1.8 d10, [r5, : 64]
+ vmlal.s32 q6, d14, d5
+ vext.32 d8, d11, d10, #0
+ vmlal.s32 q6, d15, d4
+ vmov.i64 d9, #0
+ vmlal.s32 q6, d29, d1
+ vmlal.s32 q4, d20, d7
+ vmlal.s32 q4, d21, d6
+ vmlal.s32 q4, d26, d5
+ vext.32 d11, d12, d12, #0
+ vmlal.s32 q4, d27, d4
+ vmov.i64 d10, #0
+ vmlal.s32 q4, d28, d1
+ vmlal.s32 q5, d16, d0
+ sub r2, r5, #32
+ vmlal.s32 q5, d17, d7
+ vmlal.s32 q5, d14, d6
+ vext.32 d30, d9, d8, #0
+ vmlal.s32 q5, d15, d5
+ vld1.8 {d31}, [r2, : 64]!
+ vmlal.s32 q5, d29, d4
+ vmlal.s32 q15, d20, d0
+ vext.32 d0, d6, d18, #1
+ vmlal.s32 q15, d21, d25
+ vrev64.i32 d0, d0
+ vmlal.s32 q15, d26, d24
+ vext.32 d1, d7, d19, #1
+ vext.32 d7, d10, d10, #0
+ vmlal.s32 q15, d27, d23
+ vrev64.i32 d1, d1
+ vld1.8 {d6}, [r2, : 64]
+ vmlal.s32 q15, d28, d22
+ vmlal.s32 q3, d16, d4
+ add r2, r2, #24
+ vmlal.s32 q3, d17, d2
+ vext.32 d4, d31, d30, #0
+ vmov d17, d11
+ vmlal.s32 q3, d14, d1
+ vext.32 d11, d13, d13, #0
+ vext.32 d13, d30, d30, #0
+ vmlal.s32 q3, d15, d0
+ vext.32 d1, d8, d8, #0
+ vmlal.s32 q3, d29, d3
+ vld1.8 {d5}, [r2, : 64]
+ sub r2, r2, #16
+ vext.32 d10, d6, d6, #0
+ vmov.i32 q1, #0xffffffff
+ vshl.i64 q4, q1, #25
+ add r5, sp, #480
+ vld1.8 {d14-d15}, [r5, : 128]
+ vadd.i64 q9, q2, q7
+ vshl.i64 q1, q1, #26
+ vshr.s64 q10, q9, #26
+ vld1.8 {d0}, [r2, : 64]!
+ vadd.i64 q5, q5, q10
+ vand q9, q9, q1
+ vld1.8 {d16}, [r2, : 64]!
+ add r2, sp, #496
+ vld1.8 {d20-d21}, [r2, : 128]
+ vadd.i64 q11, q5, q10
+ vsub.i64 q2, q2, q9
+ vshr.s64 q9, q11, #25
+ vext.32 d12, d5, d4, #0
+ vand q11, q11, q4
+ vadd.i64 q0, q0, q9
+ vmov d19, d7
+ vadd.i64 q3, q0, q7
+ vsub.i64 q5, q5, q11
+ vshr.s64 q11, q3, #26
+ vext.32 d18, d11, d10, #0
+ vand q3, q3, q1
+ vadd.i64 q8, q8, q11
+ vadd.i64 q11, q8, q10
+ vsub.i64 q0, q0, q3
+ vshr.s64 q3, q11, #25
+ vand q11, q11, q4
+ vadd.i64 q3, q6, q3
+ vadd.i64 q6, q3, q7
+ vsub.i64 q8, q8, q11
+ vshr.s64 q11, q6, #26
+ vand q6, q6, q1
+ vadd.i64 q9, q9, q11
+ vadd.i64 d25, d19, d21
+ vsub.i64 q3, q3, q6
+ vshr.s64 d23, d25, #25
+ vand q4, q12, q4
+ vadd.i64 d21, d23, d23
+ vshl.i64 d25, d23, #4
+ vadd.i64 d21, d21, d23
+ vadd.i64 d25, d25, d21
+ vadd.i64 d4, d4, d25
+ vzip.i32 q0, q8
+ vadd.i64 d12, d4, d14
+ add r2, r6, #8
+ vst1.8 d0, [r2, : 64]
+ vsub.i64 d19, d19, d9
+ add r2, r2, #16
+ vst1.8 d16, [r2, : 64]
+ vshr.s64 d22, d12, #26
+ vand q0, q6, q1
+ vadd.i64 d10, d10, d22
+ vzip.i32 q3, q9
+ vsub.i64 d4, d4, d0
+ sub r2, r2, #8
+ vst1.8 d6, [r2, : 64]
+ add r2, r2, #16
+ vst1.8 d18, [r2, : 64]
+ vzip.i32 q2, q5
+ sub r2, r2, #32
+ vst1.8 d4, [r2, : 64]
+ cmp r4, #0
+ beq .Lskippostcopy
+ add r2, r3, #144
+ mov r4, r4
+ vld1.8 {d0-d1}, [r2, : 128]!
+ vld1.8 {d2-d3}, [r2, : 128]!
+ vld1.8 {d4}, [r2, : 64]
+ vst1.8 {d0-d1}, [r4, : 128]!
+ vst1.8 {d2-d3}, [r4, : 128]!
+ vst1.8 d4, [r4, : 64]
+.Lskippostcopy:
+ cmp r1, #1
+ bne .Lskipfinalcopy
+ add r2, r3, #288
+ add r4, r3, #144
+ vld1.8 {d0-d1}, [r2, : 128]!
+ vld1.8 {d2-d3}, [r2, : 128]!
+ vld1.8 {d4}, [r2, : 64]
+ vst1.8 {d0-d1}, [r4, : 128]!
+ vst1.8 {d2-d3}, [r4, : 128]!
+ vst1.8 d4, [r4, : 64]
+.Lskipfinalcopy:
+ add r1, r1, #1
+ cmp r1, #12
+ blo .Linvertloop
+ add r1, r3, #144
+ ldr r2, [r1], #4
+ ldr r3, [r1], #4
+ ldr r4, [r1], #4
+ ldr r5, [r1], #4
+ ldr r6, [r1], #4
+ ldr r7, [r1], #4
+ ldr r8, [r1], #4
+ ldr r9, [r1], #4
+ ldr r10, [r1], #4
+ ldr r1, [r1]
+ add r11, r1, r1, LSL #4
+ add r11, r11, r1, LSL #1
+ add r11, r11, #16777216
+ mov r11, r11, ASR #25
+ add r11, r11, r2
+ mov r11, r11, ASR #26
+ add r11, r11, r3
+ mov r11, r11, ASR #25
+ add r11, r11, r4
+ mov r11, r11, ASR #26
+ add r11, r11, r5
+ mov r11, r11, ASR #25
+ add r11, r11, r6
+ mov r11, r11, ASR #26
+ add r11, r11, r7
+ mov r11, r11, ASR #25
+ add r11, r11, r8
+ mov r11, r11, ASR #26
+ add r11, r11, r9
+ mov r11, r11, ASR #25
+ add r11, r11, r10
+ mov r11, r11, ASR #26
+ add r11, r11, r1
+ mov r11, r11, ASR #25
+ add r2, r2, r11
+ add r2, r2, r11, LSL #1
+ add r2, r2, r11, LSL #4
+ mov r11, r2, ASR #26
+ add r3, r3, r11
+ sub r2, r2, r11, LSL #26
+ mov r11, r3, ASR #25
+ add r4, r4, r11
+ sub r3, r3, r11, LSL #25
+ mov r11, r4, ASR #26
+ add r5, r5, r11
+ sub r4, r4, r11, LSL #26
+ mov r11, r5, ASR #25
+ add r6, r6, r11
+ sub r5, r5, r11, LSL #25
+ mov r11, r6, ASR #26
+ add r7, r7, r11
+ sub r6, r6, r11, LSL #26
+ mov r11, r7, ASR #25
+ add r8, r8, r11
+ sub r7, r7, r11, LSL #25
+ mov r11, r8, ASR #26
+ add r9, r9, r11
+ sub r8, r8, r11, LSL #26
+ mov r11, r9, ASR #25
+ add r10, r10, r11
+ sub r9, r9, r11, LSL #25
+ mov r11, r10, ASR #26
+ add r1, r1, r11
+ sub r10, r10, r11, LSL #26
+ mov r11, r1, ASR #25
+ sub r1, r1, r11, LSL #25
+ add r2, r2, r3, LSL #26
+ mov r3, r3, LSR #6
+ add r3, r3, r4, LSL #19
+ mov r4, r4, LSR #13
+ add r4, r4, r5, LSL #13
+ mov r5, r5, LSR #19
+ add r5, r5, r6, LSL #6
+ add r6, r7, r8, LSL #25
+ mov r7, r8, LSR #7
+ add r7, r7, r9, LSL #19
+ mov r8, r9, LSR #13
+ add r8, r8, r10, LSL #12
+ mov r9, r10, LSR #20
+ add r1, r9, r1, LSL #6
+ str r2, [r0]
+ str r3, [r0, #4]
+ str r4, [r0, #8]
+ str r5, [r0, #12]
+ str r6, [r0, #16]
+ str r7, [r0, #20]
+ str r8, [r0, #24]
+ str r1, [r0, #28]
+ movw r0, #0
+ mov sp, ip
+ pop {r4-r11, pc}
+ENDPROC(curve25519_neon)
+#endif
diff --git a/lib/zinc/curve25519/curve25519.c b/lib/zinc/curve25519/curve25519.c
index 32536340d39d..0d5ea97762d4 100644
--- a/lib/zinc/curve25519/curve25519.c
+++ b/lib/zinc/curve25519/curve25519.c
@@ -21,6 +21,8 @@
#if defined(CONFIG_ZINC_ARCH_X86_64)
#include "curve25519-x86_64-glue.h"
+#elif defined(CONFIG_ZINC_ARCH_ARM)
+#include "curve25519-arm-glue.h"
#else
void __init curve25519_fpu_init(void)
{
--
2.19.0
^ permalink raw reply related [flat|nested] 47+ messages in thread
* [PATCH net-next v6 07/23] zinc: ChaCha20 ARM and ARM64 implementations
2018-09-25 14:56 ` [PATCH net-next v6 07/23] zinc: " Jason A. Donenfeld
@ 2018-09-26 8:59 ` Ard Biesheuvel
2018-09-26 13:32 ` Jason A. Donenfeld
2018-09-28 16:01 ` Ard Biesheuvel
1 sibling, 1 reply; 47+ messages in thread
From: Ard Biesheuvel @ 2018-09-26 8:59 UTC (permalink / raw)
To: linux-arm-kernel
On Tue, 25 Sep 2018 at 17:00, Jason A. Donenfeld <Jason@zx2c4.com> wrote:
>
> These wire Andy Polyakov's implementations up to the kernel for ARMv7,8
> NEON, and introduce Eric Biggers' ultra-fast scalar implementation for
> CPUs without NEON or for CPUs with slow NEON (Cortex-A5,7).
>
> This commit does the following:
> - Adds the glue code for the assembly implementations.
> - Renames the ARMv8 code into place, since it can at this point be
> used wholesale.
> - Merges Andy Polyakov's ARMv7 NEON code with Eric Biggers' <=ARMv7
> scalar code.
>
> Commit note: Eric Biggers' scalar code is brand new, and quite possibly
> prematurely added to this commit, and so it may require a bit of revision.
>
> This commit delivers approximately the same or much better performance than
> the existing crypto API's code and has been measured to do as such on:
>
> - ARM1176JZF-S [ARMv6]
> - Cortex-A7 [ARMv7]
> - Cortex-A8 [ARMv7]
> - Cortex-A9 [ARMv7]
> - Cortex-A17 [ARMv7]
> - Cortex-A53 [ARMv8]
> - Cortex-A55 [ARMv8]
> - Cortex-A73 [ARMv8]
> - Cortex-A75 [ARMv8]
>
> Interestingly, Andy Polyakov's scalar code is slower than Eric Biggers',
> but is also significantly shorter. This has the advantage that it does
> not evict other code from L1 cache -- particularly on ARM11 chips -- and
> so in certain circumstances it can actually be faster. However, it wasn't
> found that this had an effect on any code existing in the kernel today.
>
> Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
> Co-authored-by: Eric Biggers <ebiggers@google.com>
> Cc: Samuel Neves <sneves@dei.uc.pt>
> Cc: Andy Lutomirski <luto@kernel.org>
> Cc: Greg KH <gregkh@linuxfoundation.org>
> Cc: Jean-Philippe Aumasson <jeanphilippe.aumasson@gmail.com>
> Cc: Russell King <linux@armlinux.org.uk>
> Cc: linux-arm-kernel at lists.infradead.org
> ---
> lib/zinc/Makefile | 2 +
> lib/zinc/chacha20/chacha20-arm-glue.h | 88 +++
> ...acha20-arm-cryptogams.S => chacha20-arm.S} | 502 ++++++++++++++++--
> ...20-arm64-cryptogams.S => chacha20-arm64.S} | 0
> lib/zinc/chacha20/chacha20.c | 2 +
> 5 files changed, 556 insertions(+), 38 deletions(-)
> create mode 100644 lib/zinc/chacha20/chacha20-arm-glue.h
> rename lib/zinc/chacha20/{chacha20-arm-cryptogams.S => chacha20-arm.S} (71%)
> rename lib/zinc/chacha20/{chacha20-arm64-cryptogams.S => chacha20-arm64.S} (100%)
>
> diff --git a/lib/zinc/Makefile b/lib/zinc/Makefile
> index 223a0816c918..e47f64e12bbd 100644
> --- a/lib/zinc/Makefile
> +++ b/lib/zinc/Makefile
> @@ -4,4 +4,6 @@ ccflags-$(CONFIG_ZINC_DEBUG) += -DDEBUG
>
> zinc_chacha20-y := chacha20/chacha20.o
> zinc_chacha20-$(CONFIG_ZINC_ARCH_X86_64) += chacha20/chacha20-x86_64.o
> +zinc_chacha20-$(CONFIG_ZINC_ARCH_ARM) += chacha20/chacha20-arm.o
> +zinc_chacha20-$(CONFIG_ZINC_ARCH_ARM64) += chacha20/chacha20-arm64.o
> obj-$(CONFIG_ZINC_CHACHA20) += zinc_chacha20.o
> diff --git a/lib/zinc/chacha20/chacha20-arm-glue.h b/lib/zinc/chacha20/chacha20-arm-glue.h
> new file mode 100644
> index 000000000000..86cce851ed02
> --- /dev/null
> +++ b/lib/zinc/chacha20/chacha20-arm-glue.h
> @@ -0,0 +1,88 @@
> +/* SPDX-License-Identifier: GPL-2.0 OR MIT */
> +/*
> + * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
> + */
> +
> +#include <asm/hwcap.h>
> +#include <asm/neon.h>
> +#if defined(CONFIG_ARM)
> +#include <asm/system_info.h>
> +#include <asm/cputype.h>
> +#endif
> +
> +asmlinkage void chacha20_arm(u8 *out, const u8 *in, const size_t len,
> + const u32 key[8], const u32 counter[4]);
> +#if defined(CONFIG_ARM)
> +asmlinkage void hchacha20_arm(const u32 state[16], u32 out[8]);
> +#endif
> +#if defined(CONFIG_KERNEL_MODE_NEON)
> +asmlinkage void chacha20_neon(u8 *out, const u8 *in, const size_t len,
> + const u32 key[8], const u32 counter[4]);
> +#endif
> +
> +static bool chacha20_use_neon __ro_after_init;
> +
> +static void __init chacha20_fpu_init(void)
> +{
> +#if defined(CONFIG_ARM64)
> + chacha20_use_neon = elf_hwcap & HWCAP_ASIMD;
> +#elif defined(CONFIG_ARM)
> + switch (read_cpuid_part()) {
> + case ARM_CPU_PART_CORTEX_A7:
> + case ARM_CPU_PART_CORTEX_A5:
> + /* The Cortex-A7 and Cortex-A5 do not perform well with the NEON
> > + * implementation but do incredibly well with the scalar one and use
> + * less power.
> + */
> + break;
> + default:
> + chacha20_use_neon = elf_hwcap & HWCAP_NEON;
> + }
> +#endif
> +}
> +
> +static inline bool chacha20_arch(struct chacha20_ctx *state, u8 *dst,
> + const u8 *src, size_t len,
> + simd_context_t *simd_context)
> +{
> +#if defined(CONFIG_KERNEL_MODE_NEON)
> + if (chacha20_use_neon && len >= CHACHA20_BLOCK_SIZE * 3 &&
> + simd_use(simd_context))
> + chacha20_neon(dst, src, len, state->key, state->counter);
> + else
> +#endif
Better to use IS_ENABLED() here:
> + if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
> + chacha20_use_neon && len >= CHACHA20_BLOCK_SIZE * 3 &&
> + simd_use(simd_context))
Also, this still has unbounded worst case scheduling latency, given
that the outer library function passes its entire input straight into
the NEON routine.
> + chacha20_arm(dst, src, len, state->key, state->counter);
> +
> + state->counter[0] += (len + 63) / 64;
> + return true;
> +}
> +
> +static inline bool hchacha20_arch(u32 derived_key[CHACHA20_KEY_WORDS],
> + const u8 nonce[HCHACHA20_NONCE_SIZE],
> + const u8 key[HCHACHA20_KEY_SIZE],
> + simd_context_t *simd_context)
> +{
> +#if defined(CONFIG_ARM)
> + u32 x[] = { CHACHA20_CONSTANT_EXPA,
> + CHACHA20_CONSTANT_ND_3,
> + CHACHA20_CONSTANT_2_BY,
> + CHACHA20_CONSTANT_TE_K,
> + get_unaligned_le32(key + 0),
> + get_unaligned_le32(key + 4),
> + get_unaligned_le32(key + 8),
> + get_unaligned_le32(key + 12),
> + get_unaligned_le32(key + 16),
> + get_unaligned_le32(key + 20),
> + get_unaligned_le32(key + 24),
> + get_unaligned_le32(key + 28),
> + get_unaligned_le32(nonce + 0),
> + get_unaligned_le32(nonce + 4),
> + get_unaligned_le32(nonce + 8),
> + get_unaligned_le32(nonce + 12)
> + };
> + hchacha20_arm(x, derived_key);
> + return true;
> +#else
> + return false;
> +#endif
> +}
> diff --git a/lib/zinc/chacha20/chacha20-arm-cryptogams.S b/lib/zinc/chacha20/chacha20-arm.S
> similarity index 71%
> rename from lib/zinc/chacha20/chacha20-arm-cryptogams.S
> rename to lib/zinc/chacha20/chacha20-arm.S
> index 770bab469171..5abedafcf129 100644
> --- a/lib/zinc/chacha20/chacha20-arm-cryptogams.S
> +++ b/lib/zinc/chacha20/chacha20-arm.S
> @@ -1,13 +1,475 @@
> /* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
> /*
> + * Copyright (C) 2018 Google, Inc.
> * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
> * Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
> - *
> - * This is based in part on Andy Polyakov's implementation from CRYPTOGAMS.
> */
>
> #include <linux/linkage.h>
>
> +/*
> + * The following scalar routine was written by Eric Biggers.
> + *
> + * Design notes:
> + *
> + * 16 registers would be needed to hold the state matrix, but only 14 are
> + * available because 'sp' and 'pc' cannot be used. So we spill the elements
> + * (x8, x9) to the stack and swap them out with (x10, x11). This adds one
> + * 'ldrd' and one 'strd' instruction per round.
> + *
> + * All rotates are performed using the implicit rotate operand accepted by the
> + * 'add' and 'eor' instructions. This is faster than using explicit rotate
> + * instructions. To make this work, we allow the values in the second and last
> + * rows of the ChaCha state matrix (rows 'b' and 'd') to temporarily have the
> + * wrong rotation amount. The rotation amount is then fixed up just in time
> + * when the values are used. 'brot' is the number of bits the values in row 'b'
> + * need to be rotated right to arrive at the correct values, and 'drot'
> + * similarly for row 'd'. (brot, drot) start out as (0, 0) but we make it such
> + * that they end up as (25, 24) after every round.
> + */
> +
> + // ChaCha state registers
> + X0 .req r0
> + X1 .req r1
> + X2 .req r2
> + X3 .req r3
> + X4 .req r4
> + X5 .req r5
> + X6 .req r6
> + X7 .req r7
> + X8_X10 .req r8 // shared by x8 and x10
> + X9_X11 .req r9 // shared by x9 and x11
> + X12 .req r10
> + X13 .req r11
> + X14 .req r12
> + X15 .req r14
> +
> +.Lexpand_32byte_k:
> + // "expand 32-byte k"
> + .word 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
> +
> +#ifdef __thumb2__
> +# define adrl adr
> +#endif
> +
> +.macro __rev out, in, t0, t1, t2
> +.if __LINUX_ARM_ARCH__ >= 6
> + rev \out, \in
> +.else
> + lsl \t0, \in, #24
> + and \t1, \in, #0xff00
> + and \t2, \in, #0xff0000
> + orr \out, \t0, \in, lsr #24
> + orr \out, \out, \t1, lsl #8
> + orr \out, \out, \t2, lsr #8
> +.endif
> +.endm
> +
> +.macro _le32_bswap x, t0, t1, t2
> +#ifdef __ARMEB__
> + __rev \x, \x, \t0, \t1, \t2
> +#endif
> +.endm
> +
> +.macro _le32_bswap_4x a, b, c, d, t0, t1, t2
> + _le32_bswap \a, \t0, \t1, \t2
> + _le32_bswap \b, \t0, \t1, \t2
> + _le32_bswap \c, \t0, \t1, \t2
> + _le32_bswap \d, \t0, \t1, \t2
> +.endm
> +
> +.macro __ldrd a, b, src, offset
> +#if __LINUX_ARM_ARCH__ >= 6
> + ldrd \a, \b, [\src, #\offset]
> +#else
> + ldr \a, [\src, #\offset]
> + ldr \b, [\src, #\offset + 4]
> +#endif
> +.endm
> +
> +.macro __strd a, b, dst, offset
> +#if __LINUX_ARM_ARCH__ >= 6
> + strd \a, \b, [\dst, #\offset]
> +#else
> + str \a, [\dst, #\offset]
> + str \b, [\dst, #\offset + 4]
> +#endif
> +.endm
> +
> +.macro _halfround a1, b1, c1, d1, a2, b2, c2, d2
> +
> + // a += b; d ^= a; d = rol(d, 16);
> + add \a1, \a1, \b1, ror #brot
> + add \a2, \a2, \b2, ror #brot
> + eor \d1, \a1, \d1, ror #drot
> + eor \d2, \a2, \d2, ror #drot
> + // drot == 32 - 16 == 16
> +
> + // c += d; b ^= c; b = rol(b, 12);
> + add \c1, \c1, \d1, ror #16
> + add \c2, \c2, \d2, ror #16
> + eor \b1, \c1, \b1, ror #brot
> + eor \b2, \c2, \b2, ror #brot
> + // brot == 32 - 12 == 20
> +
> + // a += b; d ^= a; d = rol(d, 8);
> + add \a1, \a1, \b1, ror #20
> + add \a2, \a2, \b2, ror #20
> + eor \d1, \a1, \d1, ror #16
> + eor \d2, \a2, \d2, ror #16
> + // drot == 32 - 8 == 24
> +
> + // c += d; b ^= c; b = rol(b, 7);
> + add \c1, \c1, \d1, ror #24
> + add \c2, \c2, \d2, ror #24
> + eor \b1, \c1, \b1, ror #20
> + eor \b2, \c2, \b2, ror #20
> + // brot == 32 - 7 == 25
> +.endm
> +
> +.macro _doubleround
> +
> + // column round
> +
> + // quarterrounds: (x0, x4, x8, x12) and (x1, x5, x9, x13)
> + _halfround X0, X4, X8_X10, X12, X1, X5, X9_X11, X13
> +
> + // save (x8, x9); restore (x10, x11)
> + __strd X8_X10, X9_X11, sp, 0
> + __ldrd X8_X10, X9_X11, sp, 8
> +
> + // quarterrounds: (x2, x6, x10, x14) and (x3, x7, x11, x15)
> + _halfround X2, X6, X8_X10, X14, X3, X7, X9_X11, X15
> +
> + .set brot, 25
> + .set drot, 24
> +
> + // diagonal round
> +
> + // quarterrounds: (x0, x5, x10, x15) and (x1, x6, x11, x12)
> + _halfround X0, X5, X8_X10, X15, X1, X6, X9_X11, X12
> +
> + // save (x10, x11); restore (x8, x9)
> + __strd X8_X10, X9_X11, sp, 8
> + __ldrd X8_X10, X9_X11, sp, 0
> +
> + // quarterrounds: (x2, x7, x8, x13) and (x3, x4, x9, x14)
> + _halfround X2, X7, X8_X10, X13, X3, X4, X9_X11, X14
> +.endm
> +
> +.macro _chacha_permute nrounds
> + .set brot, 0
> + .set drot, 0
> + .rept \nrounds / 2
> + _doubleround
> + .endr
> +.endm
> +
> +.macro _chacha nrounds
> +
> +.Lnext_block\@:
> + // Stack: unused0-unused1 x10-x11 x0-x15 OUT IN LEN
> + // Registers contain x0-x9,x12-x15.
> +
> + // Do the core ChaCha permutation to update x0-x15.
> + _chacha_permute \nrounds
> +
> + add sp, #8
> + // Stack: x10-x11 orig_x0-orig_x15 OUT IN LEN
> + // Registers contain x0-x9,x12-x15.
> + // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
> +
> + // Free up some registers (r8-r12,r14) by pushing (x8-x9,x12-x15).
> + push {X8_X10, X9_X11, X12, X13, X14, X15}
> +
> + // Load (OUT, IN, LEN).
> + ldr r14, [sp, #96]
> + ldr r12, [sp, #100]
> + ldr r11, [sp, #104]
> +
> + orr r10, r14, r12
> +
> + // Use slow path if fewer than 64 bytes remain.
> + cmp r11, #64
> + blt .Lxor_slowpath\@
> +
> + // Use slow path if IN and/or OUT isn't 4-byte aligned. Needed even on
> + // ARMv6+, since ldmia and stmia (used below) still require alignment.
> + tst r10, #3
> + bne .Lxor_slowpath\@
> +
> + // Fast path: XOR 64 bytes of aligned data.
> +
> + // Stack: x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
> + // Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is OUT.
> + // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
> +
> + // x0-x3
> + __ldrd r8, r9, sp, 32
> + __ldrd r10, r11, sp, 40
> + add X0, X0, r8
> + add X1, X1, r9
> + add X2, X2, r10
> + add X3, X3, r11
> + _le32_bswap_4x X0, X1, X2, X3, r8, r9, r10
> + ldmia r12!, {r8-r11}
> + eor X0, X0, r8
> + eor X1, X1, r9
> + eor X2, X2, r10
> + eor X3, X3, r11
> + stmia r14!, {X0-X3}
> +
> + // x4-x7
> + __ldrd r8, r9, sp, 48
> + __ldrd r10, r11, sp, 56
> + add X4, r8, X4, ror #brot
> + add X5, r9, X5, ror #brot
> + ldmia r12!, {X0-X3}
> + add X6, r10, X6, ror #brot
> + add X7, r11, X7, ror #brot
> + _le32_bswap_4x X4, X5, X6, X7, r8, r9, r10
> + eor X4, X4, X0
> + eor X5, X5, X1
> + eor X6, X6, X2
> + eor X7, X7, X3
> + stmia r14!, {X4-X7}
> +
> + // x8-x15
> + pop {r0-r7} // (x8-x9,x12-x15,x10-x11)
> + __ldrd r8, r9, sp, 32
> + __ldrd r10, r11, sp, 40
> + add r0, r0, r8 // x8
> + add r1, r1, r9 // x9
> + add r6, r6, r10 // x10
> + add r7, r7, r11 // x11
> + _le32_bswap_4x r0, r1, r6, r7, r8, r9, r10
> + ldmia r12!, {r8-r11}
> + eor r0, r0, r8 // x8
> + eor r1, r1, r9 // x9
> + eor r6, r6, r10 // x10
> + eor r7, r7, r11 // x11
> + stmia r14!, {r0,r1,r6,r7}
> + ldmia r12!, {r0,r1,r6,r7}
> + __ldrd r8, r9, sp, 48
> + __ldrd r10, r11, sp, 56
> + add r2, r8, r2, ror #drot // x12
> + add r3, r9, r3, ror #drot // x13
> + add r4, r10, r4, ror #drot // x14
> + add r5, r11, r5, ror #drot // x15
> + _le32_bswap_4x r2, r3, r4, r5, r9, r10, r11
> + ldr r9, [sp, #72] // load LEN
> + eor r2, r2, r0 // x12
> + eor r3, r3, r1 // x13
> + eor r4, r4, r6 // x14
> + eor r5, r5, r7 // x15
> + subs r9, #64 // decrement and check LEN
> + stmia r14!, {r2-r5}
> +
> + beq .Ldone\@
> +
> +.Lprepare_for_next_block\@:
> +
> + // Stack: x0-x15 OUT IN LEN
> +
> + // Increment block counter (x12)
> + add r8, #1
> +
> + // Store updated (OUT, IN, LEN)
> + str r14, [sp, #64]
> + str r12, [sp, #68]
> + str r9, [sp, #72]
> +
> + mov r14, sp
> +
> + // Store updated block counter (x12)
> + str r8, [sp, #48]
> +
> + sub sp, #16
> +
> + // Reload state and do next block
> + ldmia r14!, {r0-r11} // load x0-x11
> + __strd r10, r11, sp, 8 // store x10-x11 before state
> + ldmia r14, {r10-r12,r14} // load x12-x15
> + b .Lnext_block\@
> +
> +.Lxor_slowpath\@:
> + // Slow path: < 64 bytes remaining, or unaligned input or output buffer.
> + // We handle it by storing the 64 bytes of keystream to the stack, then
> + // XOR-ing the needed portion with the data.
> +
> + // Allocate keystream buffer
> + sub sp, #64
> + mov r14, sp
> +
> + // Stack: ks0-ks15 x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
> + // Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is &ks0.
> + // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
> +
> + // Save keystream for x0-x3
> + __ldrd r8, r9, sp, 96
> + __ldrd r10, r11, sp, 104
> + add X0, X0, r8
> + add X1, X1, r9
> + add X2, X2, r10
> + add X3, X3, r11
> + _le32_bswap_4x X0, X1, X2, X3, r8, r9, r10
> + stmia r14!, {X0-X3}
> +
> + // Save keystream for x4-x7
> + __ldrd r8, r9, sp, 112
> + __ldrd r10, r11, sp, 120
> + add X4, r8, X4, ror #brot
> + add X5, r9, X5, ror #brot
> + add X6, r10, X6, ror #brot
> + add X7, r11, X7, ror #brot
> + _le32_bswap_4x X4, X5, X6, X7, r8, r9, r10
> + add r8, sp, #64
> + stmia r14!, {X4-X7}
> +
> + // Save keystream for x8-x15
> + ldm r8, {r0-r7} // (x8-x9,x12-x15,x10-x11)
> + __ldrd r8, r9, sp, 128
> + __ldrd r10, r11, sp, 136
> + add r0, r0, r8 // x8
> + add r1, r1, r9 // x9
> + add r6, r6, r10 // x10
> + add r7, r7, r11 // x11
> + _le32_bswap_4x r0, r1, r6, r7, r8, r9, r10
> + stmia r14!, {r0,r1,r6,r7}
> + __ldrd r8, r9, sp, 144
> + __ldrd r10, r11, sp, 152
> + add r2, r8, r2, ror #drot // x12
> + add r3, r9, r3, ror #drot // x13
> + add r4, r10, r4, ror #drot // x14
> + add r5, r11, r5, ror #drot // x15
> + _le32_bswap_4x r2, r3, r4, r5, r9, r10, r11
> + stmia r14, {r2-r5}
> +
> + // Stack: ks0-ks15 unused0-unused7 x0-x15 OUT IN LEN
> + // Registers: r8 is block counter, r12 is IN.
> +
> + ldr r9, [sp, #168] // LEN
> + ldr r14, [sp, #160] // OUT
> + cmp r9, #64
> + mov r0, sp
> + movle r1, r9
> + movgt r1, #64
> + // r1 is number of bytes to XOR, in range [1, 64]
> +
> +.if __LINUX_ARM_ARCH__ < 6
> + orr r2, r12, r14
> + tst r2, #3 // IN or OUT misaligned?
> + bne .Lxor_next_byte\@
> +.endif
> +
> + // XOR a word at a time
> +.rept 16
> + subs r1, #4
> + blt .Lxor_words_done\@
> + ldr r2, [r12], #4
> + ldr r3, [r0], #4
> + eor r2, r2, r3
> + str r2, [r14], #4
> +.endr
> + b .Lxor_slowpath_done\@
> +.Lxor_words_done\@:
> + ands r1, r1, #3
> + beq .Lxor_slowpath_done\@
> +
> + // XOR a byte at a time
> +.Lxor_next_byte\@:
> + ldrb r2, [r12], #1
> + ldrb r3, [r0], #1
> + eor r2, r2, r3
> + strb r2, [r14], #1
> + subs r1, #1
> + bne .Lxor_next_byte\@
> +
> +.Lxor_slowpath_done\@:
> + subs r9, #64
> + add sp, #96
> + bgt .Lprepare_for_next_block\@
> +
> +.Ldone\@:
> +.endm // _chacha
> +
> +/*
> + * void chacha20_arm(u8 *out, const u8 *in, size_t len, const u32 key[8],
> + * const u32 iv[4]);
> + */
> +ENTRY(chacha20_arm)
> + cmp r2, #0 // len == 0?
> + bxeq lr
> +
> + push {r0-r2,r4-r11,lr}
> +
> + // Push state x0-x15 onto stack.
> + // Also store an extra copy of x10-x11 just before the state.
> +
> + ldr r4, [sp, #48] // iv
> + mov r0, sp
> + sub sp, #80
> +
> + // iv: x12-x15
> + ldm r4, {X12,X13,X14,X15}
> + stmdb r0!, {X12,X13,X14,X15}
> +
> + // key: x4-x11
> + __ldrd X8_X10, X9_X11, r3, 24
> + __strd X8_X10, X9_X11, sp, 8
> + stmdb r0!, {X8_X10, X9_X11}
> + ldm r3, {X4-X9_X11}
> + stmdb r0!, {X4-X9_X11}
> +
> + // constants: x0-x3
> + adrl X3, .Lexpand_32byte_k
> + ldm X3, {X0-X3}
> + __strd X0, X1, sp, 16
> + __strd X2, X3, sp, 24
> +
> + _chacha 20
> +
> + add sp, #76
> + pop {r4-r11, pc}
> +ENDPROC(chacha20_arm)
> +
> +/*
> + * void hchacha20_arm(const u32 state[16], u32 out[8]);
> + */
> +ENTRY(hchacha20_arm)
> + push {r1,r4-r11,lr}
> +
> + mov r14, r0
> + ldmia r14!, {r0-r11} // load x0-x11
> + push {r10-r11} // store x10-x11 to stack
> + ldm r14, {r10-r12,r14} // load x12-x15
> + sub sp, #8
> +
> + _chacha_permute 20
> +
> + // Skip over (unused0-unused1, x10-x11)
> + add sp, #16
> +
> + // Fix up rotations of x12-x15
> + ror X12, X12, #drot
> + ror X13, X13, #drot
> + pop {r4} // load 'out'
> + ror X14, X14, #drot
> + ror X15, X15, #drot
> +
> + // Store (x0-x3,x12-x15) to 'out'
> + stm r4, {X0,X1,X2,X3,X12,X13,X14,X15}
> +
> + pop {r4-r11,pc}
> +ENDPROC(hchacha20_arm)
> +
> +#ifdef CONFIG_KERNEL_MODE_NEON
> +/*
> + * This following NEON routine was ported from Andy Polyakov's implementation
> + * from CRYPTOGAMS. It begins with parts of the CRYPTOGAMS scalar routine,
> + * since certain NEON code paths actually branch to it.
> + */
> +
> .text
> #if defined(__thumb2__) || defined(__clang__)
> .syntax unified
> @@ -22,39 +484,6 @@
> #define ldrhsb ldrbhs
> #endif
>
> -.align 5
> -.Lsigma:
> -.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 @ endian-neutral
> -.Lone:
> -.long 1,0,0,0
> -.word -1
> -
> -.align 5
> -ENTRY(chacha20_arm)
> - ldr r12,[sp,#0] @ pull pointer to counter and nonce
> - stmdb sp!,{r0-r2,r4-r11,lr}
> - cmp r2,#0 @ len==0?
> -#ifdef __thumb2__
> - itt eq
> -#endif
> - addeq sp,sp,#4*3
> - beq .Lno_data_arm
> - ldmia r12,{r4-r7} @ load counter and nonce
> - sub sp,sp,#4*(16) @ off-load area
> -#if __LINUX_ARM_ARCH__ < 7 && !defined(__thumb2__)
> - sub r14,pc,#100 @ .Lsigma
> -#else
> - adr r14,.Lsigma @ .Lsigma
> -#endif
> - stmdb sp!,{r4-r7} @ copy counter and nonce
> - ldmia r3,{r4-r11} @ load key
> - ldmia r14,{r0-r3} @ load sigma
> - stmdb sp!,{r4-r11} @ copy key
> - stmdb sp!,{r0-r3} @ copy sigma
> - str r10,[sp,#4*(16+10)] @ off-load "rx"
> - str r11,[sp,#4*(16+11)] @ off-load "rx"
> - b .Loop_outer_enter
> -
> .align 4
> .Loop_outer:
> ldmia sp,{r0-r9} @ load key material
> @@ -748,11 +1177,8 @@ ENTRY(chacha20_arm)
>
> .Ldone:
> add sp,sp,#4*(32+3)
> -.Lno_data_arm:
> ldmia sp!,{r4-r11,pc}
> -ENDPROC(chacha20_arm)
>
> -#ifdef CONFIG_KERNEL_MODE_NEON
> .align 5
> .Lsigma2:
> .long 0x61707865,0x3320646e,0x79622d32,0x6b206574 @ endian-neutral
> diff --git a/lib/zinc/chacha20/chacha20-arm64-cryptogams.S b/lib/zinc/chacha20/chacha20-arm64.S
> similarity index 100%
> rename from lib/zinc/chacha20/chacha20-arm64-cryptogams.S
> rename to lib/zinc/chacha20/chacha20-arm64.S
> diff --git a/lib/zinc/chacha20/chacha20.c b/lib/zinc/chacha20/chacha20.c
> index 4354b874a6a5..fc4f74fca653 100644
> --- a/lib/zinc/chacha20/chacha20.c
> +++ b/lib/zinc/chacha20/chacha20.c
> @@ -16,6 +16,8 @@
>
> #if defined(CONFIG_ZINC_ARCH_X86_64)
> #include "chacha20-x86_64-glue.h"
> +#elif defined(CONFIG_ZINC_ARCH_ARM) || defined(CONFIG_ZINC_ARCH_ARM64)
> +#include "chacha20-arm-glue.h"
> #else
> void __init chacha20_fpu_init(void)
> {
> --
> 2.19.0
>
^ permalink raw reply [flat|nested] 47+ messages in thread
* [PATCH net-next v6 07/23] zinc: ChaCha20 ARM and ARM64 implementations
2018-09-26 8:59 ` Ard Biesheuvel
@ 2018-09-26 13:32 ` Jason A. Donenfeld
2018-09-26 14:02 ` Ard Biesheuvel
2018-09-26 14:36 ` Andrew Lunn
0 siblings, 2 replies; 47+ messages in thread
From: Jason A. Donenfeld @ 2018-09-26 13:32 UTC (permalink / raw)
To: linux-arm-kernel
Hi Ard,
On Wed, Sep 26, 2018 at 10:59 AM Ard Biesheuvel
<ard.biesheuvel@linaro.org> wrote:
> > +static inline bool chacha20_arch(struct chacha20_ctx *state, u8 *dst,
> > + const u8 *src, size_t len,
> > + simd_context_t *simd_context)
> > +{
> > +#if defined(CONFIG_KERNEL_MODE_NEON)
> > + if (chacha20_use_neon && len >= CHACHA20_BLOCK_SIZE * 3 &&
> > + simd_use(simd_context))
> > + chacha20_neon(dst, src, len, state->key, state->counter);
> > + else
> > +#endif
>
> Better to use IS_ENABLED() here:
>
> > + if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
> > + chacha20_use_neon && len >= CHACHA20_BLOCK_SIZE * 3 &&
> > + simd_use(simd_context))
Good idea. I'll fix that up.
>
> Also, this still has unbounded worst case scheduling latency, given
> that the outer library function passes its entire input straight into
> the NEON routine.
The vast majority of crypto routines in arch/*/crypto/ follow this
same exact pattern, actually. I realize a few don't -- probably the
ones you had a hand in :) -- but I think this is up to the caller to
handle. I made a change so that in chacha20poly1305.c, it calls
simd_relax after handling each scatter-gather element, so a
"construction" will handle this gracefully. But I believe it's up to
the caller to decide on what sizes of information it wants to pass to
primitives. Put differently, this also hasn't ever been an issue
before -- the existing state of the tree indicates this -- and so I
don't anticipate this will be a real issue now. And if it becomes one,
this is something we can address *later*, but certainly there's no use
of adding additional complexity to the initial patchset to do this
now.
Jason
^ permalink raw reply [flat|nested] 47+ messages in thread
* [PATCH net-next v6 07/23] zinc: ChaCha20 ARM and ARM64 implementations
2018-09-26 13:32 ` Jason A. Donenfeld
@ 2018-09-26 14:02 ` Ard Biesheuvel
2018-09-26 15:41 ` Jason A. Donenfeld
` (2 more replies)
2018-09-26 14:36 ` Andrew Lunn
1 sibling, 3 replies; 47+ messages in thread
From: Ard Biesheuvel @ 2018-09-26 14:02 UTC (permalink / raw)
To: linux-arm-kernel
(+ Herbert, Thomas)
On Wed, 26 Sep 2018 at 15:33, Jason A. Donenfeld <Jason@zx2c4.com> wrote:
>
> Hi Ard,
>
> On Wed, Sep 26, 2018 at 10:59 AM Ard Biesheuvel
> <ard.biesheuvel@linaro.org> wrote:
> > > +static inline bool chacha20_arch(struct chacha20_ctx *state, u8 *dst,
> > > + const u8 *src, size_t len,
> > > + simd_context_t *simd_context)
> > > +{
> > > +#if defined(CONFIG_KERNEL_MODE_NEON)
> > > + if (chacha20_use_neon && len >= CHACHA20_BLOCK_SIZE * 3 &&
> > > + simd_use(simd_context))
> > > + chacha20_neon(dst, src, len, state->key, state->counter);
> > > + else
> > > +#endif
> >
> > Better to use IS_ENABLED() here:
> >
> > > + if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
> > > + chacha20_use_neon && len >= CHACHA20_BLOCK_SIZE * 3 &&
> > > + simd_use(simd_context))
>
> Good idea. I'll fix that up.
>
> >
> > Also, this still has unbounded worst case scheduling latency, given
> > that the outer library function passes its entire input straight into
> > the NEON routine.
>
> The vast majority of crypto routines in arch/*/crypto/ follow this
> same exact pattern, actually. I realize a few don't -- probably the
> ones you had a hand in :) -- but I think this is up to the caller to
> handle.
Anything that uses the scatterwalk API (AEADs and skciphers) will
handle at most a page at a time. Hashes are different, which is why
some of them have to handle it explicitly.
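For reference, a typical walk on that side looks roughly like this inside an
skcipher's encrypt() path, given its struct skcipher_request *req (a sketch
from memory; chacha20_docrypt() is a placeholder name, and tail-block and
error handling are omitted):

        struct skcipher_walk walk;
        int err = skcipher_walk_virt(&walk, req, false);

        while (walk.nbytes > 0) {
                /* walk.nbytes is at most one page, so preemption is only
                 * ever disabled for a bounded amount of work at a time.
                 */
                kernel_neon_begin();
                chacha20_docrypt(walk.dst.virt.addr, walk.src.virt.addr,
                                 walk.nbytes);
                kernel_neon_end();
                err = skcipher_walk_done(&walk, 0);
        }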
> I made a change so that in chacha20poly1305.c, it calls
> simd_relax after handling each scatter-gather element, so a
> "construction" will handle this gracefully. But I believe it's up to
> the caller to decide on what sizes of information it wants to pass to
> primitives. Put differently, this also hasn't ever been an issue
> before -- the existing state of the tree indicates this -- and so I
> don't anticipate this will be a real issue now.
The state of the tree does not capture all relevant context or
history. The scheduling latency issue was brought up very recently by
the -rt folks on the mailing lists.
> And if it becomes one,
> this is something we can address *later*, but certainly there's no use
> of adding additional complexity to the initial patchset to do this
> now.
>
You are introducing a very useful SIMD abstraction, but it lets code
run with preemption disabled for unbounded amounts of time, and so now
is the time to ensure we get it right.
Part of the [justified] criticism on the current state of the crypto
API is on its complexity, and so I don't think it makes sense to keep
it simple now and add the complexity later (and the same concern
applies to async support btw).
^ permalink raw reply [flat|nested] 47+ messages in thread
* [PATCH net-next v6 07/23] zinc: ChaCha20 ARM and ARM64 implementations
2018-09-26 13:32 ` Jason A. Donenfeld
2018-09-26 14:02 ` Ard Biesheuvel
@ 2018-09-26 14:36 ` Andrew Lunn
2018-09-26 15:25 ` Jason A. Donenfeld
1 sibling, 1 reply; 47+ messages in thread
From: Andrew Lunn @ 2018-09-26 14:36 UTC (permalink / raw)
To: linux-arm-kernel
> > Also, this still has unbounded worst case scheduling latency, given
> > that the outer library function passes its entire input straight into
> > the NEON routine.
>
> The vast majority of crypto routines in arch/*/crypto/ follow this
> same exact pattern, actually. I realize a few don't -- probably the
> ones you had a hand in :) -- but I think this is up to the caller to
> handle. I made a change so that in chacha20poly1305.c, it calls
> simd_relax after handling each scatter-gather element, so a
> "construction" will handle this gracefully. But I believe it's up to
> the caller to decide on what sizes of information it wants to pass to
> primitives. Put differently, this also hasn't ever been an issue
> before -- the existing state of the tree indicates this -- and so I
> don't anticipate this will be a real issue now. And if it becomes one,
> this is something we can address *later*, but certainly there's no use
> of adding additional complexity to the initial patchset to do this
> now.
Hi Jason
This is not my area of expertise, so you should verify what I'm saying
here...
My guess is, IPSEC will mostly ask the crypto code to work on 1500
byte full MTU packets and 64 byte TCP ACK packets. Disk encryption I
guess works on 4K blocks. So these requests are all quite small,
keeping the latency reasonably bounded.
The wireguard interface claims it is GSO capable. This means the
network stack will pass it big chunks of data and leave it to the
network interface to perform the segmentation into 1500 byte MTU
frames on the wire. I've not looked at how wireguard actually handles
these big chunks. But to get maximum performance, it should try to
keep them whole, just add a header and/or trailer. Will wireguard pass
these big chunks of data to the crypto code? Do we now have 64K blocks
being worked on? Does the latency jump from 4K to 64K? That might be
new, so the existing state of the tree does not help you here.
Andrew
^ permalink raw reply [flat|nested] 47+ messages in thread
* [PATCH net-next v6 07/23] zinc: ChaCha20 ARM and ARM64 implementations
2018-09-26 14:36 ` Andrew Lunn
@ 2018-09-26 15:25 ` Jason A. Donenfeld
0 siblings, 0 replies; 47+ messages in thread
From: Jason A. Donenfeld @ 2018-09-26 15:25 UTC (permalink / raw)
To: linux-arm-kernel
On Wed, Sep 26, 2018 at 4:36 PM Andrew Lunn <andrew@lunn.ch> wrote:
> The wireguard interface claims it is GSO capable. This means the
> network stack will pass it big chunks of data and leave it to the
> network interface to perform the segmentation into 1500 byte MTU
> frames on the wire. I've not looked at how wireguard actually handles
> these big chunks. But to get maximum performance, it should try to
> keep them whole, just add a header and/or trailer. Will wireguard pass
> these big chunks of data to the crypto code? Do we now have 64K blocks
> being worked on? Does the latency jump from 4K to 64K? That might be
> new, so the existing state of the tree does not help you here.
No, it only requests GSO superpackets so that it can group the pieces
and encrypt them on the same core. But they're each encrypted
separately (broken up immediately after ndo_start_xmit), and so they
wind up being ~1420 bytes each to encrypt. I spoke about this at
netdev2.2 if you're interested in the architecture; there's a paper:
https://www.wireguard.com/papers/wireguard-netdev22.pdf
https://www.youtube.com/watch?v=54orFwtQ1XY
https://www.wireguard.com/talks/netdev2017-slides.pdf
^ permalink raw reply [flat|nested] 47+ messages in thread
* [PATCH net-next v6 07/23] zinc: ChaCha20 ARM and ARM64 implementations
2018-09-26 14:02 ` Ard Biesheuvel
@ 2018-09-26 15:41 ` Jason A. Donenfeld
2018-09-26 16:54 ` Ard Biesheuvel
2018-09-26 17:37 ` Eric Biggers
2018-09-26 15:41 ` Ard Biesheuvel
2018-09-26 16:21 ` Andy Lutomirski
2 siblings, 2 replies; 47+ messages in thread
From: Jason A. Donenfeld @ 2018-09-26 15:41 UTC (permalink / raw)
To: linux-arm-kernel
On Wed, Sep 26, 2018 at 4:02 PM Ard Biesheuvel
<ard.biesheuvel@linaro.org> wrote:
> I don't think it makes sense to keep
> it simple now and add the complexity later (and the same concern
> applies to async support btw).
Ugh, no. I don't want to add needless complexity, period. Zinc is
synchronous, not asynchronous. It provides software implementations.
That's what it does. While many of your reviews have been useful, many
of your comments indicate some desire to change and mold the purpose
and focus of Zinc away from Zinc's intents. Stop that. It's not going
to become a bloated mess of "things Ard wanted and quipped about on
LKML." Things like these only serve to filibuster the patchset
indefinitely. But maybe that's what you'd like all along? Hard to
tell, honestly. So, no, sorry, Zinc isn't gaining an async interface
right now.
^ permalink raw reply [flat|nested] 47+ messages in thread
* [PATCH net-next v6 07/23] zinc: ChaCha20 ARM and ARM64 implementations
2018-09-26 14:02 ` Ard Biesheuvel
2018-09-26 15:41 ` Jason A. Donenfeld
@ 2018-09-26 15:41 ` Ard Biesheuvel
2018-09-26 15:45 ` Jason A. Donenfeld
2018-09-26 16:21 ` Andy Lutomirski
2 siblings, 1 reply; 47+ messages in thread
From: Ard Biesheuvel @ 2018-09-26 15:41 UTC (permalink / raw)
To: linux-arm-kernel
On Wed, 26 Sep 2018 at 16:02, Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote:
>
> (+ Herbert, Thomas)
>
> On Wed, 26 Sep 2018 at 15:33, Jason A. Donenfeld <Jason@zx2c4.com> wrote:
> >
> > Hi Ard,
> >
> > On Wed, Sep 26, 2018 at 10:59 AM Ard Biesheuvel
> > <ard.biesheuvel@linaro.org> wrote:
> > > > +static inline bool chacha20_arch(struct chacha20_ctx *state, u8 *dst,
> > > > + const u8 *src, size_t len,
> > > > + simd_context_t *simd_context)
> > > > +{
> > > > +#if defined(CONFIG_KERNEL_MODE_NEON)
> > > > + if (chacha20_use_neon && len >= CHACHA20_BLOCK_SIZE * 3 &&
> > > > + simd_use(simd_context))
> > > > + chacha20_neon(dst, src, len, state->key, state->counter);
> > > > + else
> > > > +#endif
> > >
> > > Better to use IS_ENABLED() here:
> > >
> > > > + if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
> > > > + chacha20_use_neon && len >= CHACHA20_BLOCK_SIZE * 3 &&
> > > > + simd_use(simd_context))
> >
> > Good idea. I'll fix that up.
> >
> > >
> > > Also, this still has unbounded worst case scheduling latency, given
> > > that the outer library function passes its entire input straight into
> > > the NEON routine.
> >
> > The vast majority of crypto routines in arch/*/crypto/ follow this
> > same exact pattern, actually. I realize a few don't -- probably the
> > ones you had a hand in :) -- but I think this is up to the caller to
> > handle.
>
> Anything that uses the scatterwalk API (AEADs and skciphers) will
> handle at most a page at a time. Hashes are different, which is why
> some of them have to handle it explicitly.
>
> > I made a change so that in chacha20poly1305.c, it calls
> > simd_relax after handling each scatter-gather element, so a
> > "construction" will handle this gracefully. But I believe it's up to
> > the caller to decide on what sizes of information it wants to pass to
> > primitives. Put differently, this also hasn't ever been an issue
> > before -- the existing state of the tree indicates this -- and so I
> > don't anticipate this will be a real issue now.
>
> The state of the tree does not capture all relevant context or
> history. The scheduling latency issue was brought up very recently by
> the -rt folks on the mailing lists.
>
> > And if it becomes one,
> > this is something we can address *later*, but certainly there's no use
> > of adding additional complexity to the initial patchset to do this
> > now.
> >
>
> You are introducing a very useful SIMD abstraction, but it lets code
> run with preemption disabled for unbounded amounts of time, and so now
> is the time to ensure we get it right.
>
Actually, looking at the code again, the abstraction does appear to be
fine, it is just the chacha20 code that does not make use of it.
^ permalink raw reply [flat|nested] 47+ messages in thread
* [PATCH net-next v6 07/23] zinc: ChaCha20 ARM and ARM64 implementations
2018-09-26 15:41 ` Ard Biesheuvel
@ 2018-09-26 15:45 ` Jason A. Donenfeld
2018-09-26 15:49 ` Jason A. Donenfeld
0 siblings, 1 reply; 47+ messages in thread
From: Jason A. Donenfeld @ 2018-09-26 15:45 UTC (permalink / raw)
To: linux-arm-kernel
On Wed, Sep 26, 2018 at 5:42 PM Ard Biesheuvel
<ard.biesheuvel@linaro.org> wrote:
>
> On Wed, 26 Sep 2018 at 16:02, Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote:
> Actually, looking at the code again, the abstraction does appear to be
> fine, it is just the chacha20 code that does not make use of it.
So what you have in mind is something like calling simd_relax() every
4096 bytes or so?
^ permalink raw reply [flat|nested] 47+ messages in thread
* [PATCH net-next v6 07/23] zinc: ChaCha20 ARM and ARM64 implementations
2018-09-26 15:45 ` Jason A. Donenfeld
@ 2018-09-26 15:49 ` Jason A. Donenfeld
2018-09-26 15:51 ` Ard Biesheuvel
0 siblings, 1 reply; 47+ messages in thread
From: Jason A. Donenfeld @ 2018-09-26 15:49 UTC (permalink / raw)
To: linux-arm-kernel
On Wed, Sep 26, 2018 at 5:45 PM Jason A. Donenfeld <Jason@zx2c4.com> wrote:
> So what you have in mind is something like calling simd_relax() every
> 4096 bytes or so?
That was actually pretty easy, putting together both of your suggestions:
static inline bool chacha20_arch(struct chacha20_ctx *state, u8 *dst,
                                 u8 *src, size_t len,
                                 simd_context_t *simd_context)
{
        while (len > PAGE_SIZE) {
                chacha20_arch(state, dst, src, PAGE_SIZE, simd_context);
                len -= PAGE_SIZE;
                src += PAGE_SIZE;
                dst += PAGE_SIZE;
                simd_relax(simd_context);
        }
        if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && chacha20_use_neon &&
            len >= CHACHA20_BLOCK_SIZE * 3 && simd_use(simd_context))
                chacha20_neon(dst, src, len, state->key, state->counter);
        else
                chacha20_arm(dst, src, len, state->key, state->counter);

        state->counter[0] += (len + 63) / 64;
        return true;
}
^ permalink raw reply [flat|nested] 47+ messages in thread
* [PATCH net-next v6 07/23] zinc: ChaCha20 ARM and ARM64 implementations
2018-09-26 15:49 ` Jason A. Donenfeld
@ 2018-09-26 15:51 ` Ard Biesheuvel
2018-09-26 15:58 ` Jason A. Donenfeld
2018-09-27 0:04 ` Jason A. Donenfeld
0 siblings, 2 replies; 47+ messages in thread
From: Ard Biesheuvel @ 2018-09-26 15:51 UTC (permalink / raw)
To: linux-arm-kernel
On Wed, 26 Sep 2018 at 17:50, Jason A. Donenfeld <Jason@zx2c4.com> wrote:
>
> On Wed, Sep 26, 2018 at 5:45 PM Jason A. Donenfeld <Jason@zx2c4.com> wrote:
> > So what you have in mind is something like calling simd_relax() every
> > 4096 bytes or so?
>
> That was actually pretty easy, putting together both of your suggestions:
>
> static inline bool chacha20_arch(struct chacha20_ctx *state, u8 *dst,
>                                  u8 *src, size_t len,
>                                  simd_context_t *simd_context)
> {
>         while (len > PAGE_SIZE) {
>                 chacha20_arch(state, dst, src, PAGE_SIZE, simd_context);
>                 len -= PAGE_SIZE;
>                 src += PAGE_SIZE;
>                 dst += PAGE_SIZE;
>                 simd_relax(simd_context);
>         }
>         if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && chacha20_use_neon &&
>             len >= CHACHA20_BLOCK_SIZE * 3 && simd_use(simd_context))
>                 chacha20_neon(dst, src, len, state->key, state->counter);
>         else
>                 chacha20_arm(dst, src, len, state->key, state->counter);
>
>         state->counter[0] += (len + 63) / 64;
>         return true;
> }
Nice one :-)
This works for me (but perhaps add a comment as well)
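Something along these lines, perhaps (only the comment is new; the wording
is just a suggestion):

static inline bool chacha20_arch(struct chacha20_ctx *state, u8 *dst,
                                 u8 *src, size_t len,
                                 simd_context_t *simd_context)
{
        /* SIMD disables preemption, so relax after processing each page. */
        while (len > PAGE_SIZE) {
                chacha20_arch(state, dst, src, PAGE_SIZE, simd_context);
                len -= PAGE_SIZE;
                src += PAGE_SIZE;
                dst += PAGE_SIZE;
                simd_relax(simd_context);
        }
        if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && chacha20_use_neon &&
            len >= CHACHA20_BLOCK_SIZE * 3 && simd_use(simd_context))
                chacha20_neon(dst, src, len, state->key, state->counter);
        else
                chacha20_arm(dst, src, len, state->key, state->counter);

        state->counter[0] += (len + 63) / 64;
        return true;
}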
^ permalink raw reply [flat|nested] 47+ messages in thread
* [PATCH net-next v6 07/23] zinc: ChaCha20 ARM and ARM64 implementations
2018-09-26 15:51 ` Ard Biesheuvel
@ 2018-09-26 15:58 ` Jason A. Donenfeld
2018-09-27 0:04 ` Jason A. Donenfeld
1 sibling, 0 replies; 47+ messages in thread
From: Jason A. Donenfeld @ 2018-09-26 15:58 UTC (permalink / raw)
To: linux-arm-kernel
On Wed, Sep 26, 2018 at 5:52 PM Ard Biesheuvel
<ard.biesheuvel@linaro.org> wrote:
> Nice one :-)
>
> This works for me (but perhaps add a comment as well)
Sure. Just a prototype; it'll be clean for v7.
^ permalink raw reply [flat|nested] 47+ messages in thread
* [PATCH net-next v6 07/23] zinc: ChaCha20 ARM and ARM64 implementations
2018-09-26 14:02 ` Ard Biesheuvel
2018-09-26 15:41 ` Jason A. Donenfeld
2018-09-26 15:41 ` Ard Biesheuvel
@ 2018-09-26 16:21 ` Andy Lutomirski
2018-09-26 17:03 ` Jason A. Donenfeld
2 siblings, 1 reply; 47+ messages in thread
From: Andy Lutomirski @ 2018-09-26 16:21 UTC (permalink / raw)
To: linux-arm-kernel
> On Sep 26, 2018, at 7:02 AM, Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote:
>
> (+ Herbert, Thomas)
>
>> On Wed, 26 Sep 2018 at 15:33, Jason A. Donenfeld <Jason@zx2c4.com> wrote:
>>
>> Hi Ard,
>> .
>
>> And if it becomes one,
>> this is something we can address *later*, but certainly there's no use
>> of adding additional complexity to the initial patchset to do this
>> now.
>>
>
> You are introducing a very useful SIMD abstraction, but it lets code
> run with preemption disabled for unbounded amounts of time, and so now
> is the time to ensure we get it right.
>
> Part of the [justified] criticism on the current state of the crypto
> API is on its complexity, and so I don't think it makes sense to keep
> it simple now and add the complexity later (and the same concern
> applies to async support btw).
Ard, is what you're saying that the Zinc chacha20 functions should call simd_relax() every n bytes automatically for some reasonable value of n? If so, seems sensible, except that some care might be needed to make sure they interact with preemption correctly.
What I mean is: the public Zinc entry points should either be callable in an atomic context or they should not be. I think this should be checked at runtime in an appropriate place with an __might_sleep or similar. Or simd_relax should learn to *not* schedule if the result of preempt_enable() leaves it atomic. (And the latter needs to be done in a way that works even on non-preempt kernels, and I don't remember whether that's possible.) And this should happen regardless of how many bytes are processed. IOW, calling into Zinc should be equally not atomic-safe for 100 bytes and for 10 MB.
As for async, ISTM a really good WireGuard accelerator would expose a different interface than crypto API supports, and it probably makes sense to wait for such hardware to show up before figuring out how to use it. And no matter what form it takes, I don't think it should complicate the basic Zinc crypto entry points.
^ permalink raw reply [flat|nested] 47+ messages in thread
* [PATCH net-next v6 07/23] zinc: ChaCha20 ARM and ARM64 implementations
2018-09-26 15:41 ` Jason A. Donenfeld
@ 2018-09-26 16:54 ` Ard Biesheuvel
2018-09-26 17:07 ` Jason A. Donenfeld
2018-09-26 17:37 ` Eric Biggers
1 sibling, 1 reply; 47+ messages in thread
From: Ard Biesheuvel @ 2018-09-26 16:54 UTC (permalink / raw)
To: linux-arm-kernel
On Wed, 26 Sep 2018 at 17:41, Jason A. Donenfeld <Jason@zx2c4.com> wrote:
>
> On Wed, Sep 26, 2018 at 4:02 PM Ard Biesheuvel
> <ard.biesheuvel@linaro.org> wrote:
> > I don't think it makes sense to keep
> > it simple now and add the complexity later (and the same concern
> > applies to async support btw).
>
> Ugh, no. I don't want to add needless complexity, period. Zinc is
> synchronous, not asynchronous. It provides software implementations.
> That's what it does. While many of your reviews have been useful, many
> of your comments indicate some desire to change and mold the purpose
> and focus of Zinc away from Zinc's intents. Stop that. It's not going
> to become a bloated mess of "things Ard wanted and quipped about on
> LKML." Things like these only serve to filibuster the patchset
> indefinitely. But maybe that's what you'd like all along? Hard to
> tell, honestly. So, no, sorry, Zinc isn't gaining an async interface
> right now.
Framing it as /needless/ complexity does not help at all. The changes
you are proposing are very useful, but nobody wants two crypto
subsystems with two different maintainers in the kernel, so I would
like to understand where this is going in the future. I am not saying
it should block these patches though.
Also, I have spent a *lot* of time looking at your code, and trying to
make it better, especially for use cases that weren't on your radar to
begin with (e.g., 'pet projects' [your words] like the Cortex-A7 which
will be in almost every new 32-bit Android phone). So characterizing
my feedback as some kind of sabotage is not very productive either.
Contrary to what you seem to think, I am not deeply invested in the
crypto API. What I do care about is that the ARM crypto pieces in the
kernel are maintained, supported and improved by someone who
understands the use cases Linaro's members care about, and is willing
to make an effort to gain such understanding if he doesn't. I have no
doubt that your involvement in the kernel's crypto subsystem will have
a significant positive impact when it comes to code quality,
robustness and usability. I'd just like to see a bit more
consideration for other aspects of kernel programming, e.g.,
preemption under -rt, stack size constraints, coding style, importing
code from other projects etc. - please try to be less dismissive of
feedback first time around, but try to understand why people are
raising these issues; I'm sure you will appreciate it when future
contributors to zinc will do the same.
^ permalink raw reply [flat|nested] 47+ messages in thread
* [PATCH net-next v6 07/23] zinc: ChaCha20 ARM and ARM64 implementations
2018-09-26 16:21 ` Andy Lutomirski
@ 2018-09-26 17:03 ` Jason A. Donenfeld
2018-09-26 17:08 ` Ard Biesheuvel
2018-09-26 17:23 ` Andy Lutomirski
0 siblings, 2 replies; 47+ messages in thread
From: Jason A. Donenfeld @ 2018-09-26 17:03 UTC (permalink / raw)
To: linux-arm-kernel
On Wed, Sep 26, 2018 at 6:21 PM Andy Lutomirski <luto@amacapital.net> wrote:
> Ard, is what you're saying that the Zinc chacha20 functions should call simd_relax() every n bytes automatically for some reasonable value of n? If so, seems sensible, except that some care might be needed to make sure they interact with preemption correctly.
>
> What I mean is: the public Zinc entry points should either be callable in an atomic context or they should not be. I think this should be checked at runtime in an appropriate place with an __might_sleep or similar. Or simd_relax should learn to *not* schedule if the result of preempt_enable() leaves it atomic. (And the latter needs to be done in a way that works even on non-preempt kernels, and I don't remember whether that's possible.) And this should happen regardless of how many bytes are processed. IOW, calling into Zinc should be equally not atomic-safe for 100 bytes and for 10 MB.
I'm not sure this is actually a problem. Namely:
preempt_disable();
kernel_fpu_begin();
kernel_fpu_end();
schedule(); <--- bug!
Calling kernel_fpu_begin() disables preemption, but AFAIK, preemption
enabling/disabling is recursive, so kernel_fpu_end's use of
preempt_enable won't actually re-enable preemption until the outer
preempt_enable is called:
preempt_disable();
kernel_fpu_begin();
kernel_fpu_end();
preempt_enable();
schedule(); <--- works!
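In terms of the preempt count, that second sequence goes roughly like this
(assuming x86's kernel_fpu_begin()/kernel_fpu_end(), which wrap their own
preempt_disable()/preempt_enable() pair):

preempt_disable();   // count 0 -> 1
kernel_fpu_begin();  // count 1 -> 2
kernel_fpu_end();    // count 2 -> 1, still not preemptible
preempt_enable();    // count 1 -> 0, may reschedule here
schedule();          // fine, we're fully preemptible again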
Or am I missing some more subtle point?
Jason
^ permalink raw reply [flat|nested] 47+ messages in thread
* [PATCH net-next v6 07/23] zinc: ChaCha20 ARM and ARM64 implementations
2018-09-26 16:54 ` Ard Biesheuvel
@ 2018-09-26 17:07 ` Jason A. Donenfeld
0 siblings, 0 replies; 47+ messages in thread
From: Jason A. Donenfeld @ 2018-09-26 17:07 UTC (permalink / raw)
To: linux-arm-kernel
On Wed, Sep 26, 2018 at 6:55 PM Ard Biesheuvel
<ard.biesheuvel@linaro.org> wrote:
> Framing it as /needless/ complexity does not help at all. The changes
> you are proposing are very useful, but nobody wants two crypto
> subsystems with two different maintainers in the kernel, so I would
> like to understand where this is going in the future. I am not saying
> it should block these patches though.
Thanks for clarifying. I understood you to be intending to block the
patches until they were converted to an async interface, which is not
what Zinc's about. Seeing as you're just curious about future
directions, that seems much more tenable.
> Also, I have spent a *lot* of time looking at your code, and trying to
> make it better, especially for use cases that weren't on your radar to
> begin with
I am extremely grateful for a good portion of your reviews indeed. As
I mentioned earlier, much is very useful. But in other places, I fear
you're steering this in a direction I really am hesitant to go.
> (e.g., 'pet projects' [your words]
Taken out of context.
> consideration for other aspects of kernel programming, e.g.,
> preemption under -rt, stack size constraints, coding style, importing
> code from other projects etc.
And indeed all of these concerns I've been pretty amenable to, and
continue to do so. What I'm commenting on are things outside of these.
> - please try to be less dismissive of
> feedback first time around, but try to understand why people are
> raising these issues
Apologies, and duly noted. I'll give you the benefit of the doubt.
Jason
^ permalink raw reply [flat|nested] 47+ messages in thread
* [PATCH net-next v6 07/23] zinc: ChaCha20 ARM and ARM64 implementations
2018-09-26 17:03 ` Jason A. Donenfeld
@ 2018-09-26 17:08 ` Ard Biesheuvel
2018-09-26 17:23 ` Andy Lutomirski
1 sibling, 0 replies; 47+ messages in thread
From: Ard Biesheuvel @ 2018-09-26 17:08 UTC (permalink / raw)
To: linux-arm-kernel
On Wed, 26 Sep 2018 at 19:03, Jason A. Donenfeld <Jason@zx2c4.com> wrote:
>
> On Wed, Sep 26, 2018 at 6:21 PM Andy Lutomirski <luto@amacapital.net> wrote:
> > Ard, is what you're saying that the Zinc chacha20 functions should call simd_relax() every n bytes automatically for some reasonable value of n? If so, seems sensible, except that some care might be needed to make sure they interact with preemption correctly.
> >
> > What I mean is: the public Zinc entry points should either be callable in an atomic context or they should not be. I think this should be checked at runtime in an appropriate place with an __might_sleep or similar. Or simd_relax should learn to *not* schedule if the result of preempt_enable() leaves it atomic. (And the latter needs to be done in a way that works even on non-preempt kernels, and I don't remember whether that's possible.) And this should happen regardless of how many bytes are processed. IOW, calling into Zinc should be equally not atomic-safe for 100 bytes and for 10 MB.
>
> I'm not sure this is actually a problem. Namely:
>
> preempt_disable();
> kernel_fpu_begin();
> kernel_fpu_end();
> schedule(); <--- bug!
>
> Calling kernel_fpu_begin() disables preemption, but AFAIK, preemption
> enabling/disabling is recursive, so kernel_fpu_end's use of
> preempt_enable won't actually re-enable preemption until the outer
> preempt_enable is called:
>
> preempt_disable();
> kernel_fpu_begin();
> kernel_fpu_end();
> preempt_enable();
> schedule(); <--- works!
>
> Or am I missing some more subtle point?
>
No that seems accurate to me.
^ permalink raw reply [flat|nested] 47+ messages in thread
* [PATCH net-next v6 07/23] zinc: ChaCha20 ARM and ARM64 implementations
2018-09-26 17:03 ` Jason A. Donenfeld
2018-09-26 17:08 ` Ard Biesheuvel
@ 2018-09-26 17:23 ` Andy Lutomirski
1 sibling, 0 replies; 47+ messages in thread
From: Andy Lutomirski @ 2018-09-26 17:23 UTC (permalink / raw)
To: linux-arm-kernel
> On Sep 26, 2018, at 10:03 AM, Jason A. Donenfeld <Jason@zx2c4.com> wrote:
>
>> On Wed, Sep 26, 2018 at 6:21 PM Andy Lutomirski <luto@amacapital.net> wrote:
>> Ard, is what you're saying that the Zinc chacha20 functions should call simd_relax() every n bytes automatically for some reasonable value of n? If so, seems sensible, except that some care might be needed to make sure they interact with preemption correctly.
>>
>> What I mean is: the public Zinc entry points should either be callable in an atomic context or they should not be. I think this should be checked at runtime in an appropriate place with an __might_sleep or similar. Or simd_relax should learn to *not* schedule if the result of preempt_enable() leaves it atomic. (And the latter needs to be done in a way that works even on non-preempt kernels, and I don't remember whether that's possible.) And this should happen regardless of how many bytes are processed. IOW, calling into Zinc should be equally not atomic-safe for 100 bytes and for 10 MB.
>
> I'm not sure this is actually a problem. Namely:
>
> preempt_disable();
> kernel_fpu_begin();
> kernel_fpu_end();
> schedule(); <--- bug!
>
> Calling kernel_fpu_begin() disables preemption, but AFAIK, preemption
> enabling/disabling is recursive, so kernel_fpu_end's use of
> preempt_enable won't actually re-enable preemption until the outer
> preempt_enable is called:
>
> preempt_disable();
> kernel_fpu_begin();
> kernel_fpu_end();
> preempt_enable();
> schedule(); <--- works!
>
> Or am I missing some more subtle point?
>
No, I think you're right. I was mis-remembering precisely how simd_relax() worked.
* [PATCH net-next v6 07/23] zinc: ChaCha20 ARM and ARM64 implementations
2018-09-26 15:41 ` Jason A. Donenfeld
2018-09-26 16:54 ` Ard Biesheuvel
@ 2018-09-26 17:37 ` Eric Biggers
2018-09-26 17:46 ` Jason A. Donenfeld
1 sibling, 1 reply; 47+ messages in thread
From: Eric Biggers @ 2018-09-26 17:37 UTC (permalink / raw)
To: linux-arm-kernel
On Wed, Sep 26, 2018 at 05:41:12PM +0200, Jason A. Donenfeld wrote:
> On Wed, Sep 26, 2018 at 4:02 PM Ard Biesheuvel
> <ard.biesheuvel@linaro.org> wrote:
> > I don't think it makes sense to keep
> > it simple now and add the complexity later (and the same concern
> > applies to async support btw).
>
> Ugh, no. I don't want to add needless complexity, period. Zinc is
> synchronous, not asynchronous. It provides software implementations.
> That's what it does. While many of your reviews have been useful, many
> of your comments indicate some desire to change and mold the purpose
> and focus of Zinc away from Zinc's intents. Stop that. It's not going
> to become a bloated mess of "things Ard wanted and quipped about on
> LKML." Things like these only serve to filibuster the patchset
> indefinitely. But maybe that's what you'd like all along? Hard to
> tell, honestly. So, no, sorry, Zinc isn't gaining an async interface
> right now.
Can you please stop accusing Ard of "filibustering" your patchset? Spending too
long in non-preemptible code is a real problem even on non-RT systems.
syzkaller has been reporting bugs where the kernel spins too long without any
preemption points, both in crypto-related code and elsewhere in the kernel. So
we've had to add explicit preemption points to address those, as otherwise users
can lock up all CPUs for tens of seconds. The issue being discussed here is
basically the same except here preemption is being explicitly disabled via
kernel_neon_begin(), so it becomes a problem even on non-CONFIG_PREEMPT kernels.
It's much better to address this problem (which is a regression from the current
skcipher API which only maps a page at a time) up front rather than have to rush
to patch it after the fact once the syzbot reports start coming in.
- Eric
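
For context, the explicit preemption points Eric mentions are generally just a cond_resched() inside the long-running loop; a generic sketch (the chunk size and process_chunk() helper are illustrative, not taken from any particular fix):

static void process_all(u8 *buf, size_t bytes_left)
{
        while (bytes_left) {
                size_t n = min_t(size_t, bytes_left, SZ_4K); /* chunk size: illustrative */

                process_chunk(buf, n);  /* hypothetical per-chunk work */
                buf += n;
                bytes_left -= n;

                cond_resched();         /* yield here if a reschedule is pending */
        }
}

This only works where sleeping is allowed, which is exactly why the kernel_neon_begin()/kernel_fpu_begin() case needs the simd_relax()-style dance discussed above instead.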
* [PATCH net-next v6 07/23] zinc: ChaCha20 ARM and ARM64 implementations
2018-09-26 17:37 ` Eric Biggers
@ 2018-09-26 17:46 ` Jason A. Donenfeld
0 siblings, 0 replies; 47+ messages in thread
From: Jason A. Donenfeld @ 2018-09-26 17:46 UTC (permalink / raw)
To: linux-arm-kernel
On Wed, Sep 26, 2018 at 7:37 PM Eric Biggers <ebiggers@kernel.org> wrote:
> Can you please stop accusing Ard of "filibustering" your patchset? Spending too
> long in non-preemptible code is a real problem even on non-RT systems.
> syzkaller has been reporting bugs where the kernel spins too long without any
> preemption points, both in crypto-related code and elsewhere in the kernel. So
> we've had to add explicit preemption points to address those, as otherwise users
> can lock up all CPUs for tens of seconds. The issue being discussed here is
> basically the same except here preemption is being explicitly disabled via
> kernel_neon_begin(), so it becomes a problem even on non-CONFIG_PREEMPT kernels.
The async distraction (re: filibustering) and the preempt concern are
two totally different things. I've already posted some code elsewhere
in this thread that addresses the preempt issue, and it looked good to
Ard. This will be part of v7.
* [PATCH net-next v6 07/23] zinc: ChaCha20 ARM and ARM64 implementations
2018-09-26 15:51 ` Ard Biesheuvel
2018-09-26 15:58 ` Jason A. Donenfeld
@ 2018-09-27 0:04 ` Jason A. Donenfeld
2018-09-27 13:26 ` Jason A. Donenfeld
1 sibling, 1 reply; 47+ messages in thread
From: Jason A. Donenfeld @ 2018-09-27 0:04 UTC (permalink / raw)
To: linux-arm-kernel
On Wed, Sep 26, 2018 at 5:52 PM Ard Biesheuvel
<ard.biesheuvel@linaro.org> wrote:
>
> On Wed, 26 Sep 2018 at 17:50, Jason A. Donenfeld <Jason@zx2c4.com> wrote:
> >
> > On Wed, Sep 26, 2018 at 5:45 PM Jason A. Donenfeld <Jason@zx2c4.com> wrote:
> > > So what you have in mind is something like calling simd_relax() every
> > > 4096 bytes or so?
> >
> > That was actually pretty easy, putting together both of your suggestions:
> >
> > static inline bool chacha20_arch(struct chacha20_ctx *state, u8 *dst,
> > u8 *src, size_t len,
> > simd_context_t *simd_context)
> > {
> > while (len > PAGE_SIZE) {
> > chacha20_arch(state, dst, src, PAGE_SIZE, simd_context);
> > len -= PAGE_SIZE;
> > src += PAGE_SIZE;
> > dst += PAGE_SIZE;
> > simd_relax(simd_context);
> > }
> > if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && chacha20_use_neon &&
> > len >= CHACHA20_BLOCK_SIZE * 3 && simd_use(simd_context))
> > chacha20_neon(dst, src, len, state->key, state->counter);
> > else
> > chacha20_arm(dst, src, len, state->key, state->counter);
> >
> > state->counter[0] += (len + 63) / 64;
> > return true;
> > }
>
> Nice one :-)
>
> This works for me (but perhaps add a comment as well)
As elegant as my quick recursive solution was, gcc produced kind of
bad code from it, as you might expect. So I've implemented this using
a boring old loop that works the way it's supposed to. This is marked
for v7.
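
For reference, a loop-based shape along the lines described might look roughly like this (a sketch derived from the recursive snippet quoted above, not necessarily the exact v7 code):

static inline bool chacha20_arch(struct chacha20_ctx *state, u8 *dst,
                                 const u8 *src, size_t len,
                                 simd_context_t *simd_context)
{
        while (len) {
                /* Bound how long NEON (and thus preemption-off) is held. */
                const size_t bytes = min_t(size_t, len, PAGE_SIZE);

                if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && chacha20_use_neon &&
                    bytes >= CHACHA20_BLOCK_SIZE * 3 && simd_use(simd_context))
                        chacha20_neon(dst, src, bytes, state->key,
                                      state->counter);
                else
                        chacha20_arm(dst, src, bytes, state->key,
                                     state->counter);

                state->counter[0] += (bytes + 63) / 64;
                dst += bytes;
                src += bytes;
                len -= bytes;
                simd_relax(simd_context); /* maybe drop and reacquire NEON */
        }
        return true;
}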
* [PATCH net-next v6 07/23] zinc: ChaCha20 ARM and ARM64 implementations
2018-09-27 0:04 ` Jason A. Donenfeld
@ 2018-09-27 13:26 ` Jason A. Donenfeld
2018-09-27 15:19 ` Jason A. Donenfeld
0 siblings, 1 reply; 47+ messages in thread
From: Jason A. Donenfeld @ 2018-09-27 13:26 UTC (permalink / raw)
To: linux-arm-kernel
Hi Thomas,
I'm trying to optimize this for crypto performance while still taking
into account preemption concerns. I'm having a bit of trouble figuring
out a way to determine numerically what the upper bounds for this
stuff look like. I'm sure I could pick a pretty sane number that's
arguably okay -- and way under the limit -- but I still am interested
in determining what that limit actually is. I was hoping there'd be a
debugging option called, "warn if preemption is disabled for too
long", or something, but I couldn't find anything like that. I'm also
not quite sure what the latency limits are, to just compute this with
a formula. Essentially what I'm trying to determine is:
preempt_disable();
asm volatile(".fill N, 1, 0x90;");
preempt_enable();
What is the maximum value of N for which the above is okay? What
technique would you generally use in measuring this?
Thanks,
Jason
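
One crude way to put a number on it, independent of any tracer, is to time the region directly; a sketch (the value of N is chosen purely for illustration):

u64 t0, t1;

preempt_disable();
t0 = ktime_get_ns();
asm volatile(".fill 100000, 1, 0x90;"); /* N = 100000, illustrative only */
t1 = ktime_get_ns();
preempt_enable();

pr_info("preempt-off region: %llu ns\n", t1 - t0);

That gives the wall-clock cost of a given N, though it of course does not answer what budget is acceptable, which is the question being asked here.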
* [PATCH net-next v6 07/23] zinc: ChaCha20 ARM and ARM64 implementations
2018-09-27 13:26 ` Jason A. Donenfeld
@ 2018-09-27 15:19 ` Jason A. Donenfeld
2018-09-27 16:26 ` Andy Lutomirski
0 siblings, 1 reply; 47+ messages in thread
From: Jason A. Donenfeld @ 2018-09-27 15:19 UTC (permalink / raw)
To: linux-arm-kernel
Hey again Thomas,
On Thu, Sep 27, 2018 at 3:26 PM Jason A. Donenfeld <Jason@zx2c4.com> wrote:
>
> Hi Thomas,
>
> I'm trying to optimize this for crypto performance while still taking
> into account preemption concerns. I'm having a bit of trouble figuring
> out a way to determine numerically what the upper bounds for this
> stuff look like. I'm sure I could pick a pretty sane number that's
> arguably okay -- and way under the limit -- but I still am interested
> in determining what that limit actually is. I was hoping there'd be a
> debugging option called, "warn if preemption is disabled for too
> long", or something, but I couldn't find anything like that. I'm also
> not quite sure what the latency limits are, to just compute this with
> a formula. Essentially what I'm trying to determine is:
>
> preempt_disable();
> asm volatile(".fill N, 1, 0x90;");
> preempt_enable();
>
> What is the maximum value of N for which the above is okay? What
> technique would you generally use in measuring this?
>
> Thanks,
> Jason
From talking to Peter (now CC'd) on IRC, it sounds like what you're
mostly interested in is clocktime latency on reasonable hardware, with
a goal of around ~20 µs as a maximum upper bound? I don't expect to get
anywhere near this value at all, but if you can confirm that's a
decent ballpark, it would make for some interesting calculations.
Regards,
Jason
* [PATCH net-next v6 07/23] zinc: ChaCha20 ARM and ARM64 implementations
2018-09-27 15:19 ` Jason A. Donenfeld
@ 2018-09-27 16:26 ` Andy Lutomirski
2018-09-27 17:06 ` Jason A. Donenfeld
0 siblings, 1 reply; 47+ messages in thread
From: Andy Lutomirski @ 2018-09-27 16:26 UTC (permalink / raw)
To: linux-arm-kernel
> On Sep 27, 2018, at 8:19 AM, Jason A. Donenfeld <Jason@zx2c4.com> wrote:
>
> Hey again Thomas,
>
>> On Thu, Sep 27, 2018 at 3:26 PM Jason A. Donenfeld <Jason@zx2c4.com> wrote:
>>
>> Hi Thomas,
>>
>> I'm trying to optimize this for crypto performance while still taking
>> into account preemption concerns. I'm having a bit of trouble figuring
>> out a way to determine numerically what the upper bounds for this
>> stuff look like. I'm sure I could pick a pretty sane number that's
>> arguably okay -- and way under the limit -- but I still am interested
>> in determining what that limit actually is. I was hoping there'd be a
>> debugging option called, "warn if preemption is disabled for too
>> long", or something, but I couldn't find anything like that. I'm also
>> not quite sure what the latency limits are, to just compute this with
>> a formula. Essentially what I'm trying to determine is:
>>
>> preempt_disable();
>> asm volatile(".fill N, 1, 0x90;");
>> preempt_enable();
>>
>> What is the maximum value of N for which the above is okay? What
>> technique would you generally use in measuring this?
>>
>> Thanks,
>> Jason
>
> From talking to Peter (now CC'd) on IRC, it sounds like what you're
> mostly interested in is clocktime latency on reasonable hardware, with
> a goal of around ~20 µs as a maximum upper bound? I don't expect to get
> anywhere near this value at all, but if you can confirm that's a
> decent ballpark, it would make for some interesting calculations.
>
>
I would add another consideration: if you can get better latency with negligible overhead (0.1%? 0.05%), then that might make sense too. For example, it seems plausible that checking need_resched() every few blocks adds basically no overhead, and the SIMD helpers could do this themselves or perhaps only ever do a block at a time.
need_resched() costs a cacheline access, but it's usually a hot cacheline, and the actual check is just whether a certain bit in memory is set.
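
In kernel_neon_begin()/kernel_neon_end() terms, the periodic check Andy describes amounts to something like the following inside the bulk loop (a sketch of the idea, not the actual simd_relax() implementation):

if (need_resched()) {           /* cheap: tests a flag in a usually-hot cacheline */
        kernel_neon_end();      /* re-enables preemption; we may be scheduled out */
        kernel_neon_begin();    /* then reacquire NEON and carry on               */
}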
* [PATCH net-next v6 07/23] zinc: ChaCha20 ARM and ARM64 implementations
2018-09-27 16:26 ` Andy Lutomirski
@ 2018-09-27 17:06 ` Jason A. Donenfeld
0 siblings, 0 replies; 47+ messages in thread
From: Jason A. Donenfeld @ 2018-09-27 17:06 UTC (permalink / raw)
To: linux-arm-kernel
On Thu, Sep 27, 2018 at 6:27 PM Andy Lutomirski <luto@amacapital.net> wrote:
> I would add another consideration: if you can get better latency with negligible overhead (0.1%? 0.05%), then that might make sense too. For example, it seems plausible that checking need_resched() every few blocks adds basically no overhead, and the SIMD helpers could do this themselves or perhaps only ever do a block at a time.
>
> need_resched() costs a cacheline access, but it's usually a hot cacheline, and the actual check is just whether a certain bit in memory is set.
Yes you're right, I do plan to check quite often, rather than seldom,
for this reason. I've been toying with the idea of instead processing
65k (maximum size of a UDP packet) at a time before checking
need_resched(), but armed with the 20 µs figure, this isn't remotely
possible on most hardware. So I'll stick with the original
conservative plan of checking very often, and not making things
different from the aspects worked out by the present crypto API in
this regard.
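
A back-of-the-envelope check of why one maximum-size UDP packet per chunk blows the budget (the cycles-per-byte figure is an assumption for illustration, not a measurement):

    65536 bytes within 20 µs would require ~3.3 GB/s sustained.
    At an optimistic ~4 cycles/byte on a 1.5 GHz core, ChaCha20 runs at
    ~375 MB/s, so 64 KiB takes on the order of 175 µs -- roughly an order
    of magnitude over the ~20 µs figure. Hence the much finer chunking.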
* [PATCH net-next v6 05/23] zinc: import Andy Polyakov's ChaCha20 ARM and ARM64 implementations
[not found] ` <CAKv+Gu8-EwxFhQSUPxjEvTA5ZPz34RieMokM6CUqwURDr74jtg@mail.gmail.com>
@ 2018-09-28 15:51 ` Ard Biesheuvel
2018-09-28 15:57 ` Jason A. Donenfeld
1 sibling, 0 replies; 47+ messages in thread
From: Ard Biesheuvel @ 2018-09-28 15:51 UTC (permalink / raw)
To: linux-arm-kernel
On 28 September 2018 at 17:49, Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote:
> On 25 September 2018 at 16:56, Jason A. Donenfeld <Jason@zx2c4.com> wrote:
>> These NEON and non-NEON implementations come from Andy Polyakov's
>> implementation, and are included here in raw form without modification,
>> so that subsequent commits that fix these up for the kernel can see how
>> it has changed. This awkward commit splitting has been requested for the
>> ARM[64] implementations in particular.
>>
>> While this is CRYPTOGAMS code, the originating code for this happens to
>> be the same as OpenSSL's commit 87cc649f30aaf69b351701875b9dac07c29ce8a2
>>
>> Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
>> Based-on-code-from: Andy Polyakov <appro@openssl.org>
>> Cc: Samuel Neves <sneves@dei.uc.pt>
>> Cc: Andy Lutomirski <luto@kernel.org>
>> Cc: Greg KH <gregkh@linuxfoundation.org>
>> Cc: Jean-Philippe Aumasson <jeanphilippe.aumasson@gmail.com>
>> Cc: Andy Polyakov <appro@openssl.org>
>> Cc: Russell King <linux@armlinux.org.uk>
>> Cc: linux-arm-kernel at lists.infradead.org
>
> As I mentioned before, I'd prefer this to be based on the original .pl
> but if I am the only one objecting to this, I guess I can live with
> it.
>
Note that I am getting bounces from LAKML because the patch is too big.
* [PATCH net-next v6 05/23] zinc: import Andy Polyakov's ChaCha20 ARM and ARM64 implementations
[not found] ` <CAKv+Gu8-EwxFhQSUPxjEvTA5ZPz34RieMokM6CUqwURDr74jtg@mail.gmail.com>
2018-09-28 15:51 ` Ard Biesheuvel
@ 2018-09-28 15:57 ` Jason A. Donenfeld
1 sibling, 0 replies; 47+ messages in thread
From: Jason A. Donenfeld @ 2018-09-28 15:57 UTC (permalink / raw)
To: linux-arm-kernel
Hi Ard,
On Fri, Sep 28, 2018 at 5:49 PM Ard Biesheuvel
<ard.biesheuvel@linaro.org> wrote:
> As I mentioned before, I'd prefer this to be based on the original .pl
> but if I am the only one objecting to this, I guess I can live with
> it.
We're working on that, actually. It's not obvious when it'll be ready
to ship -- perhaps after the initial merge, but perhaps way sooner --
but that is something we're trying to do for arm/arm64/mips64.
Jason
* [PATCH net-next v6 07/23] zinc: ChaCha20 ARM and ARM64 implementations
2018-09-25 14:56 ` [PATCH net-next v6 07/23] zinc: " Jason A. Donenfeld
2018-09-26 8:59 ` Ard Biesheuvel
@ 2018-09-28 16:01 ` Ard Biesheuvel
2018-09-29 2:20 ` Jason A. Donenfeld
1 sibling, 1 reply; 47+ messages in thread
From: Ard Biesheuvel @ 2018-09-28 16:01 UTC (permalink / raw)
To: linux-arm-kernel
On 25 September 2018 at 16:56, Jason A. Donenfeld <Jason@zx2c4.com> wrote:
> These wire Andy Polyakov's implementations up to the kernel for ARMv7,8
> NEON, and introduce Eric Biggers' ultra-fast scalar implementation for
> CPUs without NEON or for CPUs with slow NEON (Cortex-A5,7).
>
> This commit does the following:
> - Adds the glue code for the assembly implementations.
> - Renames the ARMv8 code into place, since it can at this point be
> used wholesale.
> - Merges Andy Polyakov's ARMv7 NEON code with Eric Biggers' <=ARMv7
> scalar code.
>
> Commit note: Eric Biggers' scalar code is brand new, and quite possibly
> prematurely added to this commit, and so it may require a bit of revision.
>
Please put comments like this below the ---
> This commit delivers approximately the same or much better performance than
> the existing crypto API's code and has been measured to do as such on:
>
> - ARM1176JZF-S [ARMv6]
> - Cortex-A7 [ARMv7]
> - Cortex-A8 [ARMv7]
> - Cortex-A9 [ARMv7]
> - Cortex-A17 [ARMv7]
> - Cortex-A53 [ARMv8]
> - Cortex-A55 [ARMv8]
> - Cortex-A73 [ARMv8]
> - Cortex-A75 [ARMv8]
>
> Interestingly, Andy Polyakov's scalar code is slower than Eric Biggers',
> but is also significantly shorter. This has the advantage that it does
> not evict other code from L1 cache -- particularly on ARM11 chips -- and
> so in certain circumstances it can actually be faster. However, it wasn't
> found that this had an effect on any code existing in the kernel today.
>
> Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
> Co-authored-by: Eric Biggers <ebiggers@google.com>
> Cc: Samuel Neves <sneves@dei.uc.pt>
> Cc: Andy Lutomirski <luto@kernel.org>
> Cc: Greg KH <gregkh@linuxfoundation.org>
> Cc: Jean-Philippe Aumasson <jeanphilippe.aumasson@gmail.com>
> Cc: Russell King <linux@armlinux.org.uk>
> Cc: linux-arm-kernel at lists.infradead.org
> ---
> lib/zinc/Makefile | 2 +
> lib/zinc/chacha20/chacha20-arm-glue.h | 88 +++
> ...acha20-arm-cryptogams.S => chacha20-arm.S} | 502 ++++++++++++++++--
> ...20-arm64-cryptogams.S => chacha20-arm64.S} | 0
> lib/zinc/chacha20/chacha20.c | 2 +
> 5 files changed, 556 insertions(+), 38 deletions(-)
> create mode 100644 lib/zinc/chacha20/chacha20-arm-glue.h
> rename lib/zinc/chacha20/{chacha20-arm-cryptogams.S => chacha20-arm.S} (71%)
> rename lib/zinc/chacha20/{chacha20-arm64-cryptogams.S => chacha20-arm64.S} (100%)
>
> diff --git a/lib/zinc/Makefile b/lib/zinc/Makefile
> index 223a0816c918..e47f64e12bbd 100644
> --- a/lib/zinc/Makefile
> +++ b/lib/zinc/Makefile
> @@ -4,4 +4,6 @@ ccflags-$(CONFIG_ZINC_DEBUG) += -DDEBUG
>
> zinc_chacha20-y := chacha20/chacha20.o
> zinc_chacha20-$(CONFIG_ZINC_ARCH_X86_64) += chacha20/chacha20-x86_64.o
> +zinc_chacha20-$(CONFIG_ZINC_ARCH_ARM) += chacha20/chacha20-arm.o
> +zinc_chacha20-$(CONFIG_ZINC_ARCH_ARM64) += chacha20/chacha20-arm64.o
Are these CONFIG_ symbols defined anywhere at this point?
In any case, I don't think there is a reason for these, at least not
on ARM/arm64. The 64-bitness is implied in both cases, and the
dependency on !CPU_32v3 you introduce (looking at the version of
Kconfig at the end of the series) seems spurious to me. Was that added
because of some kbuild robot report? (we don't support ARMv3 in the
kernel but ARCH_RPC is built in v3 mode because of historical reasons
while the actual core is a v4)
> obj-$(CONFIG_ZINC_CHACHA20) += zinc_chacha20.o
> diff --git a/lib/zinc/chacha20/chacha20-arm-glue.h b/lib/zinc/chacha20/chacha20-arm-glue.h
> new file mode 100644
> index 000000000000..86cce851ed02
> --- /dev/null
> +++ b/lib/zinc/chacha20/chacha20-arm-glue.h
> @@ -0,0 +1,88 @@
> +/* SPDX-License-Identifier: GPL-2.0 OR MIT */
> +/*
> + * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
> + */
> +
> +#include <asm/hwcap.h>
> +#include <asm/neon.h>
> +#if defined(CONFIG_ARM)
> +#include <asm/system_info.h>
> +#include <asm/cputype.h>
> +#endif
> +
> +asmlinkage void chacha20_arm(u8 *out, const u8 *in, const size_t len,
> + const u32 key[8], const u32 counter[4]);
> +#if defined(CONFIG_ARM)
> +asmlinkage void hchacha20_arm(const u32 state[16], u32 out[8]);
> +#endif
> +#if defined(CONFIG_KERNEL_MODE_NEON)
> +asmlinkage void chacha20_neon(u8 *out, const u8 *in, const size_t len,
> + const u32 key[8], const u32 counter[4]);
> +#endif
> +
No need to make asmlinkage declarations conditional
> +static bool chacha20_use_neon __ro_after_init;
> +
> +static void __init chacha20_fpu_init(void)
> +{
> +#if defined(CONFIG_ARM64)
> + chacha20_use_neon = elf_hwcap & HWCAP_ASIMD;
> +#elif defined(CONFIG_ARM)
> + switch (read_cpuid_part()) {
> + case ARM_CPU_PART_CORTEX_A7:
> + case ARM_CPU_PART_CORTEX_A5:
> + /* The Cortex-A7 and Cortex-A5 do not perform well with the NEON
> + * implementation but do incredibly with the scalar one and use
> + * less power.
> + */
> + break;
> + default:
> + chacha20_use_neon = elf_hwcap & HWCAP_NEON;
> + }
> +#endif
> +}
> +
> +static inline bool chacha20_arch(struct chacha20_ctx *state, u8 *dst,
> + const u8 *src, size_t len,
> + simd_context_t *simd_context)
> +{
> +#if defined(CONFIG_KERNEL_MODE_NEON)
if (IS_ENABLED())
> + if (chacha20_use_neon && len >= CHACHA20_BLOCK_SIZE * 3 &&
> + simd_use(simd_context))
> + chacha20_neon(dst, src, len, state->key, state->counter);
> + else
> +#endif
> + chacha20_arm(dst, src, len, state->key, state->counter);
> +
> + state->counter[0] += (len + 63) / 64;
> + return true;
> +}
> +
> +static inline bool hchacha20_arch(u32 derived_key[CHACHA20_KEY_WORDS],
> + const u8 nonce[HCHACHA20_NONCE_SIZE],
> + const u8 key[HCHACHA20_KEY_SIZE],
> + simd_context_t *simd_context)
> +{
> +#if defined(CONFIG_ARM)
> + u32 x[] = { CHACHA20_CONSTANT_EXPA,
> + CHACHA20_CONSTANT_ND_3,
> + CHACHA20_CONSTANT_2_BY,
> + CHACHA20_CONSTANT_TE_K,
> + get_unaligned_le32(key + 0),
> + get_unaligned_le32(key + 4),
> + get_unaligned_le32(key + 8),
> + get_unaligned_le32(key + 12),
> + get_unaligned_le32(key + 16),
> + get_unaligned_le32(key + 20),
> + get_unaligned_le32(key + 24),
> + get_unaligned_le32(key + 28),
> + get_unaligned_le32(nonce + 0),
> + get_unaligned_le32(nonce + 4),
> + get_unaligned_le32(nonce + 8),
> + get_unaligned_le32(nonce + 12)
> + };
> + hchacha20_arm(x, derived_key);
"""
if (!IS_ENABLED(CONFIG_ARM))
return false;
hchacha20_arm(x, derived_key);
return true;
"""
and drop the #ifdefs
> + return true;
> +#else
> + return false;
> +#endif
> +}
> diff --git a/lib/zinc/chacha20/chacha20-arm-cryptogams.S b/lib/zinc/chacha20/chacha20-arm.S
> similarity index 71%
> rename from lib/zinc/chacha20/chacha20-arm-cryptogams.S
> rename to lib/zinc/chacha20/chacha20-arm.S
> index 770bab469171..5abedafcf129 100644
> --- a/lib/zinc/chacha20/chacha20-arm-cryptogams.S
> +++ b/lib/zinc/chacha20/chacha20-arm.S
> @@ -1,13 +1,475 @@
> /* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
> /*
> + * Copyright (C) 2018 Google, Inc.
> * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
> * Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
> - *
> - * This is based in part on Andy Polyakov's implementation from CRYPTOGAMS.
> */
>
> #include <linux/linkage.h>
>
> +/*
> + * The following scalar routine was written by Eric Biggers.
> + *
> + * Design notes:
> + *
> + * 16 registers would be needed to hold the state matrix, but only 14 are
> + * available because 'sp' and 'pc' cannot be used. So we spill the elements
> + * (x8, x9) to the stack and swap them out with (x10, x11). This adds one
> + * 'ldrd' and one 'strd' instruction per round.
> + *
> + * All rotates are performed using the implicit rotate operand accepted by the
> + * 'add' and 'eor' instructions. This is faster than using explicit rotate
> + * instructions. To make this work, we allow the values in the second and last
> + * rows of the ChaCha state matrix (rows 'b' and 'd') to temporarily have the
> + * wrong rotation amount. The rotation amount is then fixed up just in time
> + * when the values are used. 'brot' is the number of bits the values in row 'b'
> + * need to be rotated right to arrive at the correct values, and 'drot'
> + * similarly for row 'd'. (brot, drot) start out as (0, 0) but we make it such
> + * that they end up as (25, 24) after every round.
> + */
> +
> + // ChaCha state registers
> + X0 .req r0
> + X1 .req r1
> + X2 .req r2
> + X3 .req r3
> + X4 .req r4
> + X5 .req r5
> + X6 .req r6
> + X7 .req r7
> + X8_X10 .req r8 // shared by x8 and x10
> + X9_X11 .req r9 // shared by x9 and x11
> + X12 .req r10
> + X13 .req r11
> + X14 .req r12
> + X15 .req r14
> +
> +.Lexpand_32byte_k:
> + // "expand 32-byte k"
> + .word 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
> +
> +#ifdef __thumb2__
> +# define adrl adr
> +#endif
> +
> +.macro __rev out, in, t0, t1, t2
> +.if __LINUX_ARM_ARCH__ >= 6
> + rev \out, \in
> +.else
> + lsl \t0, \in, #24
> + and \t1, \in, #0xff00
> + and \t2, \in, #0xff0000
> + orr \out, \t0, \in, lsr #24
> + orr \out, \out, \t1, lsl #8
> + orr \out, \out, \t2, lsr #8
> +.endif
> +.endm
> +
> +.macro _le32_bswap x, t0, t1, t2
> +#ifdef __ARMEB__
> + __rev \x, \x, \t0, \t1, \t2
> +#endif
> +.endm
> +
> +.macro _le32_bswap_4x a, b, c, d, t0, t1, t2
> + _le32_bswap \a, \t0, \t1, \t2
> + _le32_bswap \b, \t0, \t1, \t2
> + _le32_bswap \c, \t0, \t1, \t2
> + _le32_bswap \d, \t0, \t1, \t2
> +.endm
> +
> +.macro __ldrd a, b, src, offset
> +#if __LINUX_ARM_ARCH__ >= 6
> + ldrd \a, \b, [\src, #\offset]
> +#else
> + ldr \a, [\src, #\offset]
> + ldr \b, [\src, #\offset + 4]
> +#endif
> +.endm
> +
> +.macro __strd a, b, dst, offset
> +#if __LINUX_ARM_ARCH__ >= 6
> + strd \a, \b, [\dst, #\offset]
> +#else
> + str \a, [\dst, #\offset]
> + str \b, [\dst, #\offset + 4]
> +#endif
> +.endm
> +
> +.macro _halfround a1, b1, c1, d1, a2, b2, c2, d2
> +
> + // a += b; d ^= a; d = rol(d, 16);
> + add \a1, \a1, \b1, ror #brot
> + add \a2, \a2, \b2, ror #brot
> + eor \d1, \a1, \d1, ror #drot
> + eor \d2, \a2, \d2, ror #drot
> + // drot == 32 - 16 == 16
> +
> + // c += d; b ^= c; b = rol(b, 12);
> + add \c1, \c1, \d1, ror #16
> + add \c2, \c2, \d2, ror #16
> + eor \b1, \c1, \b1, ror #brot
> + eor \b2, \c2, \b2, ror #brot
> + // brot == 32 - 12 == 20
> +
> + // a += b; d ^= a; d = rol(d, 8);
> + add \a1, \a1, \b1, ror #20
> + add \a2, \a2, \b2, ror #20
> + eor \d1, \a1, \d1, ror #16
> + eor \d2, \a2, \d2, ror #16
> + // drot == 32 - 8 == 24
> +
> + // c += d; b ^= c; b = rol(b, 7);
> + add \c1, \c1, \d1, ror #24
> + add \c2, \c2, \d2, ror #24
> + eor \b1, \c1, \b1, ror #20
> + eor \b2, \c2, \b2, ror #20
> + // brot == 32 - 7 == 25
> +.endm
> +
> +.macro _doubleround
> +
> + // column round
> +
> + // quarterrounds: (x0, x4, x8, x12) and (x1, x5, x9, x13)
> + _halfround X0, X4, X8_X10, X12, X1, X5, X9_X11, X13
> +
> + // save (x8, x9); restore (x10, x11)
> + __strd X8_X10, X9_X11, sp, 0
> + __ldrd X8_X10, X9_X11, sp, 8
> +
> + // quarterrounds: (x2, x6, x10, x14) and (x3, x7, x11, x15)
> + _halfround X2, X6, X8_X10, X14, X3, X7, X9_X11, X15
> +
> + .set brot, 25
> + .set drot, 24
> +
> + // diagonal round
> +
> + // quarterrounds: (x0, x5, x10, x15) and (x1, x6, x11, x12)
> + _halfround X0, X5, X8_X10, X15, X1, X6, X9_X11, X12
> +
> + // save (x10, x11); restore (x8, x9)
> + __strd X8_X10, X9_X11, sp, 8
> + __ldrd X8_X10, X9_X11, sp, 0
> +
> + // quarterrounds: (x2, x7, x8, x13) and (x3, x4, x9, x14)
> + _halfround X2, X7, X8_X10, X13, X3, X4, X9_X11, X14
> +.endm
> +
> +.macro _chacha_permute nrounds
> + .set brot, 0
> + .set drot, 0
> + .rept \nrounds / 2
> + _doubleround
> + .endr
> +.endm
> +
> +.macro _chacha nrounds
> +
> +.Lnext_block\@:
> + // Stack: unused0-unused1 x10-x11 x0-x15 OUT IN LEN
> + // Registers contain x0-x9,x12-x15.
> +
> + // Do the core ChaCha permutation to update x0-x15.
> + _chacha_permute \nrounds
> +
> + add sp, #8
> + // Stack: x10-x11 orig_x0-orig_x15 OUT IN LEN
> + // Registers contain x0-x9,x12-x15.
> + // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
> +
> + // Free up some registers (r8-r12,r14) by pushing (x8-x9,x12-x15).
> + push {X8_X10, X9_X11, X12, X13, X14, X15}
> +
> + // Load (OUT, IN, LEN).
> + ldr r14, [sp, #96]
> + ldr r12, [sp, #100]
> + ldr r11, [sp, #104]
> +
> + orr r10, r14, r12
> +
> + // Use slow path if fewer than 64 bytes remain.
> + cmp r11, #64
> + blt .Lxor_slowpath\@
> +
> + // Use slow path if IN and/or OUT isn't 4-byte aligned. Needed even on
> + // ARMv6+, since ldmia and stmia (used below) still require alignment.
> + tst r10, #3
> + bne .Lxor_slowpath\@
> +
> + // Fast path: XOR 64 bytes of aligned data.
> +
> + // Stack: x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
> + // Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is OUT.
> + // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
> +
> + // x0-x3
> + __ldrd r8, r9, sp, 32
> + __ldrd r10, r11, sp, 40
> + add X0, X0, r8
> + add X1, X1, r9
> + add X2, X2, r10
> + add X3, X3, r11
> + _le32_bswap_4x X0, X1, X2, X3, r8, r9, r10
> + ldmia r12!, {r8-r11}
> + eor X0, X0, r8
> + eor X1, X1, r9
> + eor X2, X2, r10
> + eor X3, X3, r11
> + stmia r14!, {X0-X3}
> +
> + // x4-x7
> + __ldrd r8, r9, sp, 48
> + __ldrd r10, r11, sp, 56
> + add X4, r8, X4, ror #brot
> + add X5, r9, X5, ror #brot
> + ldmia r12!, {X0-X3}
> + add X6, r10, X6, ror #brot
> + add X7, r11, X7, ror #brot
> + _le32_bswap_4x X4, X5, X6, X7, r8, r9, r10
> + eor X4, X4, X0
> + eor X5, X5, X1
> + eor X6, X6, X2
> + eor X7, X7, X3
> + stmia r14!, {X4-X7}
> +
> + // x8-x15
> + pop {r0-r7} // (x8-x9,x12-x15,x10-x11)
> + __ldrd r8, r9, sp, 32
> + __ldrd r10, r11, sp, 40
> + add r0, r0, r8 // x8
> + add r1, r1, r9 // x9
> + add r6, r6, r10 // x10
> + add r7, r7, r11 // x11
> + _le32_bswap_4x r0, r1, r6, r7, r8, r9, r10
> + ldmia r12!, {r8-r11}
> + eor r0, r0, r8 // x8
> + eor r1, r1, r9 // x9
> + eor r6, r6, r10 // x10
> + eor r7, r7, r11 // x11
> + stmia r14!, {r0,r1,r6,r7}
> + ldmia r12!, {r0,r1,r6,r7}
> + __ldrd r8, r9, sp, 48
> + __ldrd r10, r11, sp, 56
> + add r2, r8, r2, ror #drot // x12
> + add r3, r9, r3, ror #drot // x13
> + add r4, r10, r4, ror #drot // x14
> + add r5, r11, r5, ror #drot // x15
> + _le32_bswap_4x r2, r3, r4, r5, r9, r10, r11
> + ldr r9, [sp, #72] // load LEN
> + eor r2, r2, r0 // x12
> + eor r3, r3, r1 // x13
> + eor r4, r4, r6 // x14
> + eor r5, r5, r7 // x15
> + subs r9, #64 // decrement and check LEN
> + stmia r14!, {r2-r5}
> +
> + beq .Ldone\@
> +
> +.Lprepare_for_next_block\@:
> +
> + // Stack: x0-x15 OUT IN LEN
> +
> + // Increment block counter (x12)
> + add r8, #1
> +
> + // Store updated (OUT, IN, LEN)
> + str r14, [sp, #64]
> + str r12, [sp, #68]
> + str r9, [sp, #72]
> +
> + mov r14, sp
> +
> + // Store updated block counter (x12)
> + str r8, [sp, #48]
> +
> + sub sp, #16
> +
> + // Reload state and do next block
> + ldmia r14!, {r0-r11} // load x0-x11
> + __strd r10, r11, sp, 8 // store x10-x11 before state
> + ldmia r14, {r10-r12,r14} // load x12-x15
> + b .Lnext_block\@
> +
> +.Lxor_slowpath\@:
> + // Slow path: < 64 bytes remaining, or unaligned input or output buffer.
> + // We handle it by storing the 64 bytes of keystream to the stack, then
> + // XOR-ing the needed portion with the data.
> +
> + // Allocate keystream buffer
> + sub sp, #64
> + mov r14, sp
> +
> + // Stack: ks0-ks15 x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
> + // Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is &ks0.
> + // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
> +
> + // Save keystream for x0-x3
> + __ldrd r8, r9, sp, 96
> + __ldrd r10, r11, sp, 104
> + add X0, X0, r8
> + add X1, X1, r9
> + add X2, X2, r10
> + add X3, X3, r11
> + _le32_bswap_4x X0, X1, X2, X3, r8, r9, r10
> + stmia r14!, {X0-X3}
> +
> + // Save keystream for x4-x7
> + __ldrd r8, r9, sp, 112
> + __ldrd r10, r11, sp, 120
> + add X4, r8, X4, ror #brot
> + add X5, r9, X5, ror #brot
> + add X6, r10, X6, ror #brot
> + add X7, r11, X7, ror #brot
> + _le32_bswap_4x X4, X5, X6, X7, r8, r9, r10
> + add r8, sp, #64
> + stmia r14!, {X4-X7}
> +
> + // Save keystream for x8-x15
> + ldm r8, {r0-r7} // (x8-x9,x12-x15,x10-x11)
> + __ldrd r8, r9, sp, 128
> + __ldrd r10, r11, sp, 136
> + add r0, r0, r8 // x8
> + add r1, r1, r9 // x9
> + add r6, r6, r10 // x10
> + add r7, r7, r11 // x11
> + _le32_bswap_4x r0, r1, r6, r7, r8, r9, r10
> + stmia r14!, {r0,r1,r6,r7}
> + __ldrd r8, r9, sp, 144
> + __ldrd r10, r11, sp, 152
> + add r2, r8, r2, ror #drot // x12
> + add r3, r9, r3, ror #drot // x13
> + add r4, r10, r4, ror #drot // x14
> + add r5, r11, r5, ror #drot // x15
> + _le32_bswap_4x r2, r3, r4, r5, r9, r10, r11
> + stmia r14, {r2-r5}
> +
> + // Stack: ks0-ks15 unused0-unused7 x0-x15 OUT IN LEN
> + // Registers: r8 is block counter, r12 is IN.
> +
> + ldr r9, [sp, #168] // LEN
> + ldr r14, [sp, #160] // OUT
> + cmp r9, #64
> + mov r0, sp
> + movle r1, r9
> + movgt r1, #64
> + // r1 is number of bytes to XOR, in range [1, 64]
> +
> +.if __LINUX_ARM_ARCH__ < 6
> + orr r2, r12, r14
> + tst r2, #3 // IN or OUT misaligned?
> + bne .Lxor_next_byte\@
> +.endif
> +
> + // XOR a word at a time
> +.rept 16
> + subs r1, #4
> + blt .Lxor_words_done\@
> + ldr r2, [r12], #4
> + ldr r3, [r0], #4
> + eor r2, r2, r3
> + str r2, [r14], #4
> +.endr
> + b .Lxor_slowpath_done\@
> +.Lxor_words_done\@:
> + ands r1, r1, #3
> + beq .Lxor_slowpath_done\@
> +
> + // XOR a byte at a time
> +.Lxor_next_byte\@:
> + ldrb r2, [r12], #1
> + ldrb r3, [r0], #1
> + eor r2, r2, r3
> + strb r2, [r14], #1
> + subs r1, #1
> + bne .Lxor_next_byte\@
> +
> +.Lxor_slowpath_done\@:
> + subs r9, #64
> + add sp, #96
> + bgt .Lprepare_for_next_block\@
> +
> +.Ldone\@:
> +.endm // _chacha
> +
> +/*
> + * void chacha20_arm(u8 *out, const u8 *in, size_t len, const u32 key[8],
> + * const u32 iv[4]);
> + */
> +ENTRY(chacha20_arm)
> + cmp r2, #0 // len == 0?
> + bxeq lr
> +
> + push {r0-r2,r4-r11,lr}
> +
> + // Push state x0-x15 onto stack.
> + // Also store an extra copy of x10-x11 just before the state.
> +
> + ldr r4, [sp, #48] // iv
> + mov r0, sp
> + sub sp, #80
> +
> + // iv: x12-x15
> + ldm r4, {X12,X13,X14,X15}
> + stmdb r0!, {X12,X13,X14,X15}
> +
> + // key: x4-x11
> + __ldrd X8_X10, X9_X11, r3, 24
> + __strd X8_X10, X9_X11, sp, 8
> + stmdb r0!, {X8_X10, X9_X11}
> + ldm r3, {X4-X9_X11}
> + stmdb r0!, {X4-X9_X11}
> +
> + // constants: x0-x3
> + adrl X3, .Lexpand_32byte_k
> + ldm X3, {X0-X3}
> + __strd X0, X1, sp, 16
> + __strd X2, X3, sp, 24
> +
> + _chacha 20
> +
> + add sp, #76
> + pop {r4-r11, pc}
> +ENDPROC(chacha20_arm)
> +
> +/*
> + * void hchacha20_arm(const u32 state[16], u32 out[8]);
> + */
> +ENTRY(hchacha20_arm)
> + push {r1,r4-r11,lr}
> +
> + mov r14, r0
> + ldmia r14!, {r0-r11} // load x0-x11
> + push {r10-r11} // store x10-x11 to stack
> + ldm r14, {r10-r12,r14} // load x12-x15
> + sub sp, #8
> +
> + _chacha_permute 20
> +
> + // Skip over (unused0-unused1, x10-x11)
> + add sp, #16
> +
> + // Fix up rotations of x12-x15
> + ror X12, X12, #drot
> + ror X13, X13, #drot
> + pop {r4} // load 'out'
> + ror X14, X14, #drot
> + ror X15, X15, #drot
> +
> + // Store (x0-x3,x12-x15) to 'out'
> + stm r4, {X0,X1,X2,X3,X12,X13,X14,X15}
> +
> + pop {r4-r11,pc}
> +ENDPROC(hchacha20_arm)
> +
> +#ifdef CONFIG_KERNEL_MODE_NEON
> +/*
> + * This following NEON routine was ported from Andy Polyakov's implementation
> + * from CRYPTOGAMS. It begins with parts of the CRYPTOGAMS scalar routine,
> + * since certain NEON code paths actually branch to it.
> + */
> +
> .text
> #if defined(__thumb2__) || defined(__clang__)
> .syntax unified
> @@ -22,39 +484,6 @@
> #define ldrhsb ldrbhs
> #endif
>
> -.align 5
> -.Lsigma:
> -.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 @ endian-neutral
> -.Lone:
> -.long 1,0,0,0
> -.word -1
> -
> -.align 5
> -ENTRY(chacha20_arm)
> - ldr r12,[sp,#0] @ pull pointer to counter and nonce
> - stmdb sp!,{r0-r2,r4-r11,lr}
> - cmp r2,#0 @ len==0?
> -#ifdef __thumb2__
> - itt eq
> -#endif
> - addeq sp,sp,#4*3
> - beq .Lno_data_arm
> - ldmia r12,{r4-r7} @ load counter and nonce
> - sub sp,sp,#4*(16) @ off-load area
> -#if __LINUX_ARM_ARCH__ < 7 && !defined(__thumb2__)
> - sub r14,pc,#100 @ .Lsigma
> -#else
> - adr r14,.Lsigma @ .Lsigma
> -#endif
> - stmdb sp!,{r4-r7} @ copy counter and nonce
> - ldmia r3,{r4-r11} @ load key
> - ldmia r14,{r0-r3} @ load sigma
> - stmdb sp!,{r4-r11} @ copy key
> - stmdb sp!,{r0-r3} @ copy sigma
> - str r10,[sp,#4*(16+10)] @ off-load "rx"
> - str r11,[sp,#4*(16+11)] @ off-load "rx"
> - b .Loop_outer_enter
> -
> .align 4
> .Loop_outer:
> ldmia sp,{r0-r9} @ load key material
> @@ -748,11 +1177,8 @@ ENTRY(chacha20_arm)
>
> .Ldone:
> add sp,sp,#4*(32+3)
> -.Lno_data_arm:
> ldmia sp!,{r4-r11,pc}
> -ENDPROC(chacha20_arm)
>
> -#ifdef CONFIG_KERNEL_MODE_NEON
> .align 5
> .Lsigma2:
> .long 0x61707865,0x3320646e,0x79622d32,0x6b206574 @ endian-neutral
> diff --git a/lib/zinc/chacha20/chacha20-arm64-cryptogams.S b/lib/zinc/chacha20/chacha20-arm64.S
> similarity index 100%
> rename from lib/zinc/chacha20/chacha20-arm64-cryptogams.S
> rename to lib/zinc/chacha20/chacha20-arm64.S
> diff --git a/lib/zinc/chacha20/chacha20.c b/lib/zinc/chacha20/chacha20.c
> index 4354b874a6a5..fc4f74fca653 100644
> --- a/lib/zinc/chacha20/chacha20.c
> +++ b/lib/zinc/chacha20/chacha20.c
> @@ -16,6 +16,8 @@
>
> #if defined(CONFIG_ZINC_ARCH_X86_64)
> #include "chacha20-x86_64-glue.h"
> +#elif defined(CONFIG_ZINC_ARCH_ARM) || defined(CONFIG_ZINC_ARCH_ARM64)
As above, just use CONFIG_ARM / CONFIG_ARM64 directly
> +#include "chacha20-arm-glue.h"
> #else
> void __init chacha20_fpu_init(void)
> {
> --
> 2.19.0
>
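
Applying the IS_ENABLED(CONFIG_ARM) suggestion above, the hchacha20 glue might end up shaped roughly like this (a sketch of the suggested structure, not the final patch):

static inline bool hchacha20_arch(u32 derived_key[CHACHA20_KEY_WORDS],
                                  const u8 nonce[HCHACHA20_NONCE_SIZE],
                                  const u8 key[HCHACHA20_KEY_SIZE],
                                  simd_context_t *simd_context)
{
        if (IS_ENABLED(CONFIG_ARM)) {
                u32 x[] = { CHACHA20_CONSTANT_EXPA,
                            CHACHA20_CONSTANT_ND_3,
                            CHACHA20_CONSTANT_2_BY,
                            CHACHA20_CONSTANT_TE_K,
                            get_unaligned_le32(key + 0),
                            get_unaligned_le32(key + 4),
                            get_unaligned_le32(key + 8),
                            get_unaligned_le32(key + 12),
                            get_unaligned_le32(key + 16),
                            get_unaligned_le32(key + 20),
                            get_unaligned_le32(key + 24),
                            get_unaligned_le32(key + 28),
                            get_unaligned_le32(nonce + 0),
                            get_unaligned_le32(nonce + 4),
                            get_unaligned_le32(nonce + 8),
                            get_unaligned_le32(nonce + 12) };

                hchacha20_arm(x, derived_key);
                return true;
        }
        return false;
}

The compiler still parses the hchacha20_arm() call on arm64 but eliminates it as dead code, which is why the declaration no longer needs to be conditional either.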
* [PATCH net-next v6 07/23] zinc: ChaCha20 ARM and ARM64 implementations
2018-09-28 16:01 ` Ard Biesheuvel
@ 2018-09-29 2:20 ` Jason A. Donenfeld
2018-09-29 6:16 ` Ard Biesheuvel
0 siblings, 1 reply; 47+ messages in thread
From: Jason A. Donenfeld @ 2018-09-29 2:20 UTC (permalink / raw)
To: linux-arm-kernel
Hi Ard,
On Fri, Sep 28, 2018 at 6:02 PM Ard Biesheuvel
<ard.biesheuvel@linaro.org> wrote:
> Please put comments like this below the ---
git-notes is nice for this indeed.
> Are these CONFIG_ symbols defined anywhere at this point?
Yes, they're introduced in the first zinc commit. There's no git-blame
on git.kernel.org, presumably because it's expensive to compute, but
there is on my personal instance, so this might help:
https://git.zx2c4.com/linux-dev/blame/lib/zinc/Kconfig?h=jd/wireguard
> In any case, I don't think there is a reason for these, at least not
> on ARM/arm64. The 64-bitness is implied in both cases
You mean to say that since these knobs are def_bool y and are
essentially "depends on ARM", then I should just straight up use
CONFIG_ARM? I had thought about this, but figured this would make it
easier to later make these optional or have other options block them
if need be, or even if the dependencies and requirements for having them
change (for example, with UML on x86). I think doing it this way
gives us some flexibility later on. So if that's a compelling enough
reason, I'd like to keep those.
> and the
> dependency on !CPU_32v3 you introduce (looking at the version of
> Kconfig at the end of the series) seems spurious to me. Was that added
> because of some kbuild robot report? (we don't support ARMv3 in the
> kernel but ARCH_RPC is built in v3 mode because of historical reasons
> while the actual core is a v4)
I added the !CPU_32v3 in my development tree after posting v6, so good
to hear you're just looking straight at the updated tree. If you see
things jump out in there prior to me posting v7, don't hesitate to let
me know.
The reason it was added was indeed because of:
https://lists.01.org/pipermail/kbuild-all/2018-September/053114.html
-- exactly what you suspected, ARCH_RPC. Have a better suggestion than
!CPU_32v3? It seems to me like so long as the kernel has CPU_32v3 as a
thing in any form, I should mark Zinc as not supporting it, since
we'll certainly be at least v4 and up. (Do you guys have any old Acorn
ARM610 boxes sitting around for old time's sake at LinaroHQ? ;-)
> > +#endif
> > +
>
> No need to make asmlinkage declarations conditional
Yep, addressed in the IS_ENABLED cleanup.
>
> if (IS_ENABLED())
Sorted.
>
> """
> if (!IS_ENABLED(CONFIG_ARM))
> return false;
>
> hchacha20_arm(x, derived_key);
> return true;
> """
>
> and drop the #ifdefs
Also sorted.
Regards,
Jason
* [PATCH net-next v6 07/23] zinc: ChaCha20 ARM and ARM64 implementations
2018-09-29 2:20 ` Jason A. Donenfeld
@ 2018-09-29 6:16 ` Ard Biesheuvel
2018-09-30 2:33 ` Jason A. Donenfeld
0 siblings, 1 reply; 47+ messages in thread
From: Ard Biesheuvel @ 2018-09-29 6:16 UTC (permalink / raw)
To: linux-arm-kernel
On 29 September 2018 at 04:20, Jason A. Donenfeld <Jason@zx2c4.com> wrote:
> Hi Ard,
>
> On Fri, Sep 28, 2018 at 6:02 PM Ard Biesheuvel
> <ard.biesheuvel@linaro.org> wrote:
>> Please put comments like this below the ---
>
> git-notes is nice for this indeed.
>
>> Are these CONFIG_ symbols defined anywhere at this point?
>
> Yes, they're introduced in the first zinc commit. There's no git-blame
> on git.kernel.org, presumably because it's expensive to compute, but
> there is on my personal instance, so this might help:
> https://git.zx2c4.com/linux-dev/blame/lib/zinc/Kconfig?h=jd/wireguard
>
>> In any case, I don't think there is a reason for these, at least not
>> on ARM/arm64. The 64-bitness is implied in both cases
>
> You mean to say that since these knobs are def_bool y and are
> essentially "depends on ARM", then I should just straight up use
> CONFIG_ARM? I had thought about this, but figured this would make it
> easier to later make these optional or have other options block them
> if need be, or even if the dependencies and requirements for having them
> change (for example, with UML on x86). I think doing it this way
> gives us some flexibility later on. So if that's a compelling enough
> reason, I'd like to keep those.
>
Sure. But probably better to be consistent then, and stop using
CONFIG_ARM directly in your code.
>> and the
>> dependency on !CPU_32v3 you introduce (looking at the version of
>> Kconfig at the end of the series) seems spurious to me. Was that added
>> because of some kbuild robot report? (we don't support ARMv3 in the
>> kernel but ARCH_RPC is built in v3 mode because of historical reasons
>> while the actual core is a v4)
>
> I added the !CPU_32v3 in my development tree after posting v6, so good
> to hear you're just looking straight at the updated tree. If you see
> things jump out in there prior to me posting v7, don't hesitate to let
> me know.
>
> The reason it was added was indeed because of:
> https://lists.01.org/pipermail/kbuild-all/2018-September/053114.html
> -- exactly what you suspected, ARCH_RPC. Have a better suggestion than
> !CPU_32v3?
Yes, you could just add
asflags-$(CONFIG_CPU_32v3) += -march=armv4
with a comment stating that we don't actually support ARMv3 but only
use it as a code generation target for reasons unrelated to the ISA
> It seems to me like so long as the kernel has CPU_32v3 as a
> thing in any form, I should mark Zinc as not supporting it, since
> we'll certainly be at least v4 and up. (Do you guys have any old Acorn
> ARM610 boxes sitting around for old time's sake at LinaroHQ? ;-)
>
AFAIK we only support the StrongARM flavor of RiscPC which is ARMv4.
But yes, some people do care deeply about these antiquated platforms,
including Russell, and there is no particular reason to leave this one
behind.
>> > +#endif
>> > +
>>
>> No need to make asmlinkage declarations conditional
>
> Yep, addressed in the IS_ENABLED cleanup.
>
>>
>> if (IS_ENABLED())
>
> Sorted.
>
>>
>> """
>> if (!IS_ENABLED(CONFIG_ARM))
>> return false;
>>
>> hchacha20_arm(x, derived_key);
>> return true;
>> """
>>
>> and drop the #ifdefs
>
> Also sorted.
>
> Regards,
> Jason
* [PATCH net-next v6 07/23] zinc: ChaCha20 ARM and ARM64 implementations
2018-09-29 6:16 ` Ard Biesheuvel
@ 2018-09-30 2:33 ` Jason A. Donenfeld
0 siblings, 0 replies; 47+ messages in thread
From: Jason A. Donenfeld @ 2018-09-30 2:33 UTC (permalink / raw)
To: linux-arm-kernel
Hi Ard,
On Sat, Sep 29, 2018 at 8:16 AM Ard Biesheuvel
<ard.biesheuvel@linaro.org> wrote:
> > You mean to say that since these knobs are def_bool y and are
> > essentially "depends on ARM", then I should just straight up use
> > CONFIG_ARM? I had thought about this, but figured this would make it
> > easier to later make these optional or have other options block them
> > if need be, or even if the dependencies and requirements for having them
> > change (for example, with UML on x86). I think doing it this way
> > gives us some flexibility later on. So if that's a compelling enough
> > reason, I'd like to keep those.
>
> Sure. But probably better to be consistent then, and stop using
> CONFIG_ARM directly in your code.
Ack.
> > The reason it was added was indeed because of:
> > https://lists.01.org/pipermail/kbuild-all/2018-September/053114.html
> > -- exactly what you suspected, ARCH_RPC. Have a better suggestion than
> > !CPU_32v3?
>
> Yes, you could just add
>
> asflags-$(CONFIG_CPU_32v3) += -march=armv4
>
> with a comment stating that we don't actually support ARMv3 but only
> use it as a code generation target for reasons unrelated to the ISA
Alright, I'll do exactly that. Though, if the rationale for this has
to do only with codegen -- with what the C compiler does -- then
shouldn't this be set globally for CONFIG_CPU_32v3? I couldn't find
any macros that test against __LINUX_ARM_ARCH__ being 3 in the
assembly, so I don't think this should be a problem. Maybe I'll
send a patch.
Jason
* [PATCH net-next v6 19/23] zinc: Curve25519 ARM implementation
2018-09-25 14:56 ` [PATCH net-next v6 19/23] zinc: Curve25519 ARM implementation Jason A. Donenfeld
@ 2018-10-02 16:59 ` Ard Biesheuvel
2018-10-02 21:35 ` Richard Weinberger
` (2 more replies)
0 siblings, 3 replies; 47+ messages in thread
From: Ard Biesheuvel @ 2018-10-02 16:59 UTC (permalink / raw)
To: linux-arm-kernel
Hi Jason,
On 25 September 2018 at 16:56, Jason A. Donenfeld <Jason@zx2c4.com> wrote:
> This comes from Dan Bernstein and Peter Schwabe's public domain NEON
> code, and has been modified to be friendly for kernel space, as well as
> removing some qhasm strangeness to be more idiomatic.
>
> Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
> Cc: Samuel Neves <sneves@dei.uc.pt>
> Cc: Andy Lutomirski <luto@kernel.org>
> Cc: Greg KH <gregkh@linuxfoundation.org>
> Cc: Jean-Philippe Aumasson <jeanphilippe.aumasson@gmail.com>
> Cc: Russell King <linux@armlinux.org.uk>
> Cc: linux-arm-kernel at lists.infradead.org
> ---
> lib/zinc/Makefile | 1 +
> lib/zinc/curve25519/curve25519-arm-glue.h | 42 +
> lib/zinc/curve25519/curve25519-arm.S | 2095 +++++++++++++++++++++
> lib/zinc/curve25519/curve25519.c | 2 +
> 4 files changed, 2140 insertions(+)
> create mode 100644 lib/zinc/curve25519/curve25519-arm-glue.h
> create mode 100644 lib/zinc/curve25519/curve25519-arm.S
>
> diff --git a/lib/zinc/Makefile b/lib/zinc/Makefile
> index 65440438c6e5..be73c342f9ba 100644
> --- a/lib/zinc/Makefile
> +++ b/lib/zinc/Makefile
> @@ -27,4 +27,5 @@ zinc_blake2s-$(CONFIG_ZINC_ARCH_X86_64) += blake2s/blake2s-x86_64.o
> obj-$(CONFIG_ZINC_BLAKE2S) += zinc_blake2s.o
>
> zinc_curve25519-y := curve25519/curve25519.o
> +zinc_curve25519-$(CONFIG_ZINC_ARCH_ARM) += curve25519/curve25519-arm.o
> obj-$(CONFIG_ZINC_CURVE25519) += zinc_curve25519.o
> diff --git a/lib/zinc/curve25519/curve25519-arm-glue.h b/lib/zinc/curve25519/curve25519-arm-glue.h
> new file mode 100644
> index 000000000000..9211bcab5615
> --- /dev/null
> +++ b/lib/zinc/curve25519/curve25519-arm-glue.h
> @@ -0,0 +1,42 @@
> +/* SPDX-License-Identifier: GPL-2.0 OR MIT */
> +/*
> + * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
> + */
> +
> +#include <asm/hwcap.h>
> +#include <asm/neon.h>
> +#include <asm/simd.h>
> +
> +#if defined(CONFIG_KERNEL_MODE_NEON)
> +asmlinkage void curve25519_neon(u8 mypublic[CURVE25519_KEY_SIZE],
> + const u8 secret[CURVE25519_KEY_SIZE],
> + const u8 basepoint[CURVE25519_KEY_SIZE]);
> +#endif
> +
> +static bool curve25519_use_neon __ro_after_init;
> +
> +static void __init curve25519_fpu_init(void)
> +{
> + curve25519_use_neon = elf_hwcap & HWCAP_NEON;
> +}
> +
> +static inline bool curve25519_arch(u8 mypublic[CURVE25519_KEY_SIZE],
> + const u8 secret[CURVE25519_KEY_SIZE],
> + const u8 basepoint[CURVE25519_KEY_SIZE])
> +{
> +#if defined(CONFIG_KERNEL_MODE_NEON)
> + if (curve25519_use_neon && may_use_simd()) {
> + kernel_neon_begin();
> + curve25519_neon(mypublic, secret, basepoint);
> + kernel_neon_end();
> + return true;
> + }
> +#endif
> + return false;
> +}
> +
> +static inline bool curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE],
> + const u8 secret[CURVE25519_KEY_SIZE])
> +{
> + return false;
> +}
Shouldn't this use the new simd abstraction as well?
> diff --git a/lib/zinc/curve25519/curve25519-arm.S b/lib/zinc/curve25519/curve25519-arm.S
> new file mode 100644
> index 000000000000..db6570c20fd1
> --- /dev/null
> +++ b/lib/zinc/curve25519/curve25519-arm.S
> @@ -0,0 +1,2095 @@
> +/* SPDX-License-Identifier: GPL-2.0 OR MIT */
> +/*
> + * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
> + *
> + * Based on public domain code from Daniel J. Bernstein and Peter Schwabe. This
> + * has been built from SUPERCOP's curve25519/neon2/scalarmult.pq using qhasm,
> + * but has subsequently been manually reworked for use in kernel space.
> + */
> +
> +#ifdef CONFIG_KERNEL_MODE_NEON
> +#include <linux/linkage.h>
> +
> +.text
> +.fpu neon
> +.arch armv7-a
> +.align 4
> +
> +ENTRY(curve25519_neon)
> + push {r4-r11, lr}
> + mov ip, sp
> + sub r3, sp, #704
> + and r3, r3, #0xfffffff0
> + mov sp, r3
> + movw r4, #0
> + movw r5, #254
> + vmov.i32 q0, #1
> + vshr.u64 q1, q0, #7
> + vshr.u64 q0, q0, #8
> + vmov.i32 d4, #19
> + vmov.i32 d5, #38
> + add r6, sp, #480
> + vst1.8 {d2-d3}, [r6, : 128]
> + add r6, sp, #496
> + vst1.8 {d0-d1}, [r6, : 128]
> + add r6, sp, #512
> + vst1.8 {d4-d5}, [r6, : 128]
I guess qhasm means generated code, right?
Because many of these adds are completely redundant ...
> + add r6, r3, #0
> + vmov.i32 q2, #0
> + vst1.8 {d4-d5}, [r6, : 128]!
> + vst1.8 {d4-d5}, [r6, : 128]!
> + vst1.8 d4, [r6, : 64]
> + add r6, r3, #0
> + movw r7, #960
> + sub r7, r7, #2
> + neg r7, r7
> + sub r7, r7, r7, LSL #7
This looks odd as well.
Could you elaborate on what qhasm is exactly? And, as with the other
patches, I would prefer it if we could have your changes as a separate
patch (although having the qhasm base would be preferred)
> + str r7, [r6]
> + add r6, sp, #672
> + vld1.8 {d4-d5}, [r1]!
> + vld1.8 {d6-d7}, [r1]
> + vst1.8 {d4-d5}, [r6, : 128]!
> + vst1.8 {d6-d7}, [r6, : 128]
> + sub r1, r6, #16
> + ldrb r6, [r1]
> + and r6, r6, #248
> + strb r6, [r1]
> + ldrb r6, [r1, #31]
> + and r6, r6, #127
> + orr r6, r6, #64
> + strb r6, [r1, #31]
> + vmov.i64 q2, #0xffffffff
> + vshr.u64 q3, q2, #7
> + vshr.u64 q2, q2, #6
> + vld1.8 {d8}, [r2]
> + vld1.8 {d10}, [r2]
> + add r2, r2, #6
> + vld1.8 {d12}, [r2]
> + vld1.8 {d14}, [r2]
> + add r2, r2, #6
> + vld1.8 {d16}, [r2]
> + add r2, r2, #4
> + vld1.8 {d18}, [r2]
> + vld1.8 {d20}, [r2]
> + add r2, r2, #6
> + vld1.8 {d22}, [r2]
> + add r2, r2, #2
> + vld1.8 {d24}, [r2]
> + vld1.8 {d26}, [r2]
> + vshr.u64 q5, q5, #26
> + vshr.u64 q6, q6, #3
> + vshr.u64 q7, q7, #29
> + vshr.u64 q8, q8, #6
> + vshr.u64 q10, q10, #25
> + vshr.u64 q11, q11, #3
> + vshr.u64 q12, q12, #12
> + vshr.u64 q13, q13, #38
> + vand q4, q4, q2
> + vand q6, q6, q2
> + vand q8, q8, q2
> + vand q10, q10, q2
> + vand q2, q12, q2
> + vand q5, q5, q3
> + vand q7, q7, q3
> + vand q9, q9, q3
> + vand q11, q11, q3
> + vand q3, q13, q3
> + add r2, r3, #48
> + vadd.i64 q12, q4, q1
> + vadd.i64 q13, q10, q1
> + vshr.s64 q12, q12, #26
> + vshr.s64 q13, q13, #26
> + vadd.i64 q5, q5, q12
> + vshl.i64 q12, q12, #26
> + vadd.i64 q14, q5, q0
> + vadd.i64 q11, q11, q13
> + vshl.i64 q13, q13, #26
> + vadd.i64 q15, q11, q0
> + vsub.i64 q4, q4, q12
> + vshr.s64 q12, q14, #25
> + vsub.i64 q10, q10, q13
> + vshr.s64 q13, q15, #25
> + vadd.i64 q6, q6, q12
> + vshl.i64 q12, q12, #25
> + vadd.i64 q14, q6, q1
> + vadd.i64 q2, q2, q13
> + vsub.i64 q5, q5, q12
> + vshr.s64 q12, q14, #26
> + vshl.i64 q13, q13, #25
> + vadd.i64 q14, q2, q1
> + vadd.i64 q7, q7, q12
> + vshl.i64 q12, q12, #26
> + vadd.i64 q15, q7, q0
> + vsub.i64 q11, q11, q13
> + vshr.s64 q13, q14, #26
> + vsub.i64 q6, q6, q12
> + vshr.s64 q12, q15, #25
> + vadd.i64 q3, q3, q13
> + vshl.i64 q13, q13, #26
> + vadd.i64 q14, q3, q0
> + vadd.i64 q8, q8, q12
> + vshl.i64 q12, q12, #25
> + vadd.i64 q15, q8, q1
> + add r2, r2, #8
> + vsub.i64 q2, q2, q13
> + vshr.s64 q13, q14, #25
> + vsub.i64 q7, q7, q12
> + vshr.s64 q12, q15, #26
> + vadd.i64 q14, q13, q13
> + vadd.i64 q9, q9, q12
> + vtrn.32 d12, d14
> + vshl.i64 q12, q12, #26
> + vtrn.32 d13, d15
> + vadd.i64 q0, q9, q0
> + vadd.i64 q4, q4, q14
> + vst1.8 d12, [r2, : 64]!
> + vshl.i64 q6, q13, #4
> + vsub.i64 q7, q8, q12
> + vshr.s64 q0, q0, #25
> + vadd.i64 q4, q4, q6
> + vadd.i64 q6, q10, q0
> + vshl.i64 q0, q0, #25
> + vadd.i64 q8, q6, q1
> + vadd.i64 q4, q4, q13
> + vshl.i64 q10, q13, #25
> + vadd.i64 q1, q4, q1
> + vsub.i64 q0, q9, q0
> + vshr.s64 q8, q8, #26
> + vsub.i64 q3, q3, q10
> + vtrn.32 d14, d0
> + vshr.s64 q1, q1, #26
> + vtrn.32 d15, d1
> + vadd.i64 q0, q11, q8
> + vst1.8 d14, [r2, : 64]
> + vshl.i64 q7, q8, #26
> + vadd.i64 q5, q5, q1
> + vtrn.32 d4, d6
> + vshl.i64 q1, q1, #26
> + vtrn.32 d5, d7
> + vsub.i64 q3, q6, q7
> + add r2, r2, #16
> + vsub.i64 q1, q4, q1
> + vst1.8 d4, [r2, : 64]
> + vtrn.32 d6, d0
> + vtrn.32 d7, d1
> + sub r2, r2, #8
> + vtrn.32 d2, d10
> + vtrn.32 d3, d11
> + vst1.8 d6, [r2, : 64]
> + sub r2, r2, #24
> + vst1.8 d2, [r2, : 64]
> + add r2, r3, #96
> + vmov.i32 q0, #0
> + vmov.i64 d2, #0xff
> + vmov.i64 d3, #0
> + vshr.u32 q1, q1, #7
> + vst1.8 {d2-d3}, [r2, : 128]!
> + vst1.8 {d0-d1}, [r2, : 128]!
> + vst1.8 d0, [r2, : 64]
> + add r2, r3, #144
> + vmov.i32 q0, #0
> + vst1.8 {d0-d1}, [r2, : 128]!
> + vst1.8 {d0-d1}, [r2, : 128]!
> + vst1.8 d0, [r2, : 64]
> + add r2, r3, #240
> + vmov.i32 q0, #0
> + vmov.i64 d2, #0xff
> + vmov.i64 d3, #0
> + vshr.u32 q1, q1, #7
> + vst1.8 {d2-d3}, [r2, : 128]!
> + vst1.8 {d0-d1}, [r2, : 128]!
> + vst1.8 d0, [r2, : 64]
> + add r2, r3, #48
> + add r6, r3, #192
> + vld1.8 {d0-d1}, [r2, : 128]!
> + vld1.8 {d2-d3}, [r2, : 128]!
> + vld1.8 {d4}, [r2, : 64]
> + vst1.8 {d0-d1}, [r6, : 128]!
> + vst1.8 {d2-d3}, [r6, : 128]!
> + vst1.8 d4, [r6, : 64]
> +.Lmainloop:
> + mov r2, r5, LSR #3
> + and r6, r5, #7
> + ldrb r2, [r1, r2]
> + mov r2, r2, LSR r6
> + and r2, r2, #1
> + str r5, [sp, #456]
> + eor r4, r4, r2
> + str r2, [sp, #460]
> + neg r2, r4
> + add r4, r3, #96
> + add r5, r3, #192
> + add r6, r3, #144
> + vld1.8 {d8-d9}, [r4, : 128]!
> + add r7, r3, #240
> + vld1.8 {d10-d11}, [r5, : 128]!
> + veor q6, q4, q5
> + vld1.8 {d14-d15}, [r6, : 128]!
> + vdup.i32 q8, r2
> + vld1.8 {d18-d19}, [r7, : 128]!
> + veor q10, q7, q9
> + vld1.8 {d22-d23}, [r4, : 128]!
> + vand q6, q6, q8
> + vld1.8 {d24-d25}, [r5, : 128]!
> + vand q10, q10, q8
> + vld1.8 {d26-d27}, [r6, : 128]!
> + veor q4, q4, q6
> + vld1.8 {d28-d29}, [r7, : 128]!
> + veor q5, q5, q6
> + vld1.8 {d0}, [r4, : 64]
> + veor q6, q7, q10
> + vld1.8 {d2}, [r5, : 64]
> + veor q7, q9, q10
> + vld1.8 {d4}, [r6, : 64]
> + veor q9, q11, q12
> + vld1.8 {d6}, [r7, : 64]
> + veor q10, q0, q1
> + sub r2, r4, #32
> + vand q9, q9, q8
> + sub r4, r5, #32
> + vand q10, q10, q8
> + sub r5, r6, #32
> + veor q11, q11, q9
> + sub r6, r7, #32
> + veor q0, q0, q10
> + veor q9, q12, q9
> + veor q1, q1, q10
> + veor q10, q13, q14
> + veor q12, q2, q3
> + vand q10, q10, q8
> + vand q8, q12, q8
> + veor q12, q13, q10
> + veor q2, q2, q8
> + veor q10, q14, q10
> + veor q3, q3, q8
> + vadd.i32 q8, q4, q6
> + vsub.i32 q4, q4, q6
> + vst1.8 {d16-d17}, [r2, : 128]!
> + vadd.i32 q6, q11, q12
> + vst1.8 {d8-d9}, [r5, : 128]!
> + vsub.i32 q4, q11, q12
> + vst1.8 {d12-d13}, [r2, : 128]!
> + vadd.i32 q6, q0, q2
> + vst1.8 {d8-d9}, [r5, : 128]!
> + vsub.i32 q0, q0, q2
> + vst1.8 d12, [r2, : 64]
> + vadd.i32 q2, q5, q7
> + vst1.8 d0, [r5, : 64]
> + vsub.i32 q0, q5, q7
> + vst1.8 {d4-d5}, [r4, : 128]!
> + vadd.i32 q2, q9, q10
> + vst1.8 {d0-d1}, [r6, : 128]!
> + vsub.i32 q0, q9, q10
> + vst1.8 {d4-d5}, [r4, : 128]!
> + vadd.i32 q2, q1, q3
> + vst1.8 {d0-d1}, [r6, : 128]!
> + vsub.i32 q0, q1, q3
> + vst1.8 d4, [r4, : 64]
> + vst1.8 d0, [r6, : 64]
> + add r2, sp, #512
> + add r4, r3, #96
> + add r5, r3, #144
> + vld1.8 {d0-d1}, [r2, : 128]
> + vld1.8 {d2-d3}, [r4, : 128]!
> + vld1.8 {d4-d5}, [r5, : 128]!
> + vzip.i32 q1, q2
> + vld1.8 {d6-d7}, [r4, : 128]!
> + vld1.8 {d8-d9}, [r5, : 128]!
> + vshl.i32 q5, q1, #1
> + vzip.i32 q3, q4
> + vshl.i32 q6, q2, #1
> + vld1.8 {d14}, [r4, : 64]
> + vshl.i32 q8, q3, #1
> + vld1.8 {d15}, [r5, : 64]
> + vshl.i32 q9, q4, #1
> + vmul.i32 d21, d7, d1
> + vtrn.32 d14, d15
> + vmul.i32 q11, q4, q0
> + vmul.i32 q0, q7, q0
> + vmull.s32 q12, d2, d2
> + vmlal.s32 q12, d11, d1
> + vmlal.s32 q12, d12, d0
> + vmlal.s32 q12, d13, d23
> + vmlal.s32 q12, d16, d22
> + vmlal.s32 q12, d7, d21
> + vmull.s32 q10, d2, d11
> + vmlal.s32 q10, d4, d1
> + vmlal.s32 q10, d13, d0
> + vmlal.s32 q10, d6, d23
> + vmlal.s32 q10, d17, d22
> + vmull.s32 q13, d10, d4
> + vmlal.s32 q13, d11, d3
> + vmlal.s32 q13, d13, d1
> + vmlal.s32 q13, d16, d0
> + vmlal.s32 q13, d17, d23
> + vmlal.s32 q13, d8, d22
> + vmull.s32 q1, d10, d5
> + vmlal.s32 q1, d11, d4
> + vmlal.s32 q1, d6, d1
> + vmlal.s32 q1, d17, d0
> + vmlal.s32 q1, d8, d23
> + vmull.s32 q14, d10, d6
> + vmlal.s32 q14, d11, d13
> + vmlal.s32 q14, d4, d4
> + vmlal.s32 q14, d17, d1
> + vmlal.s32 q14, d18, d0
> + vmlal.s32 q14, d9, d23
> + vmull.s32 q11, d10, d7
> + vmlal.s32 q11, d11, d6
> + vmlal.s32 q11, d12, d5
> + vmlal.s32 q11, d8, d1
> + vmlal.s32 q11, d19, d0
> + vmull.s32 q15, d10, d8
> + vmlal.s32 q15, d11, d17
> + vmlal.s32 q15, d12, d6
> + vmlal.s32 q15, d13, d5
> + vmlal.s32 q15, d19, d1
> + vmlal.s32 q15, d14, d0
> + vmull.s32 q2, d10, d9
> + vmlal.s32 q2, d11, d8
> + vmlal.s32 q2, d12, d7
> + vmlal.s32 q2, d13, d6
> + vmlal.s32 q2, d14, d1
> + vmull.s32 q0, d15, d1
> + vmlal.s32 q0, d10, d14
> + vmlal.s32 q0, d11, d19
> + vmlal.s32 q0, d12, d8
> + vmlal.s32 q0, d13, d17
> + vmlal.s32 q0, d6, d6
> + add r2, sp, #480
> + vld1.8 {d18-d19}, [r2, : 128]
If you append a ! here ...
> + vmull.s32 q3, d16, d7
> + vmlal.s32 q3, d10, d15
> + vmlal.s32 q3, d11, d14
> + vmlal.s32 q3, d12, d9
> + vmlal.s32 q3, d13, d8
> + add r2, sp, #496
... you can drop this add
> + vld1.8 {d8-d9}, [r2, : 128]
> + vadd.i64 q5, q12, q9
> + vadd.i64 q6, q15, q9
> + vshr.s64 q5, q5, #26
> + vshr.s64 q6, q6, #26
> + vadd.i64 q7, q10, q5
> + vshl.i64 q5, q5, #26
> + vadd.i64 q8, q7, q4
> + vadd.i64 q2, q2, q6
> + vshl.i64 q6, q6, #26
> + vadd.i64 q10, q2, q4
> + vsub.i64 q5, q12, q5
> + vshr.s64 q8, q8, #25
> + vsub.i64 q6, q15, q6
> + vshr.s64 q10, q10, #25
> + vadd.i64 q12, q13, q8
> + vshl.i64 q8, q8, #25
> + vadd.i64 q13, q12, q9
> + vadd.i64 q0, q0, q10
> + vsub.i64 q7, q7, q8
> + vshr.s64 q8, q13, #26
> + vshl.i64 q10, q10, #25
> + vadd.i64 q13, q0, q9
> + vadd.i64 q1, q1, q8
> + vshl.i64 q8, q8, #26
> + vadd.i64 q15, q1, q4
> + vsub.i64 q2, q2, q10
> + vshr.s64 q10, q13, #26
> + vsub.i64 q8, q12, q8
> + vshr.s64 q12, q15, #25
> + vadd.i64 q3, q3, q10
> + vshl.i64 q10, q10, #26
> + vadd.i64 q13, q3, q4
> + vadd.i64 q14, q14, q12
> + add r2, r3, #288
> + vshl.i64 q12, q12, #25
> + add r4, r3, #336
> + vadd.i64 q15, q14, q9
> + add r2, r2, #8
> + vsub.i64 q0, q0, q10
> + add r4, r4, #8
> + vshr.s64 q10, q13, #25
> + vsub.i64 q1, q1, q12
> + vshr.s64 q12, q15, #26
> + vadd.i64 q13, q10, q10
> + vadd.i64 q11, q11, q12
> + vtrn.32 d16, d2
> + vshl.i64 q12, q12, #26
> + vtrn.32 d17, d3
> + vadd.i64 q1, q11, q4
> + vadd.i64 q4, q5, q13
> + vst1.8 d16, [r2, : 64]!
> + vshl.i64 q5, q10, #4
> + vst1.8 d17, [r4, : 64]!
> + vsub.i64 q8, q14, q12
> + vshr.s64 q1, q1, #25
> + vadd.i64 q4, q4, q5
> + vadd.i64 q5, q6, q1
> + vshl.i64 q1, q1, #25
> + vadd.i64 q6, q5, q9
> + vadd.i64 q4, q4, q10
> + vshl.i64 q10, q10, #25
> + vadd.i64 q9, q4, q9
> + vsub.i64 q1, q11, q1
> + vshr.s64 q6, q6, #26
> + vsub.i64 q3, q3, q10
> + vtrn.32 d16, d2
> + vshr.s64 q9, q9, #26
> + vtrn.32 d17, d3
> + vadd.i64 q1, q2, q6
> + vst1.8 d16, [r2, : 64]
> + vshl.i64 q2, q6, #26
> + vst1.8 d17, [r4, : 64]
> + vadd.i64 q6, q7, q9
> + vtrn.32 d0, d6
> + vshl.i64 q7, q9, #26
> + vtrn.32 d1, d7
> + vsub.i64 q2, q5, q2
> + add r2, r2, #16
> + vsub.i64 q3, q4, q7
> + vst1.8 d0, [r2, : 64]
> + add r4, r4, #16
> + vst1.8 d1, [r4, : 64]
> + vtrn.32 d4, d2
> + vtrn.32 d5, d3
> + sub r2, r2, #8
> + sub r4, r4, #8
> + vtrn.32 d6, d12
> + vtrn.32 d7, d13
> + vst1.8 d4, [r2, : 64]
> + vst1.8 d5, [r4, : 64]
> + sub r2, r2, #24
> + sub r4, r4, #24
> + vst1.8 d6, [r2, : 64]
> + vst1.8 d7, [r4, : 64]
> + add r2, r3, #240
> + add r4, r3, #96
> + vld1.8 {d0-d1}, [r4, : 128]!
> + vld1.8 {d2-d3}, [r4, : 128]!
> + vld1.8 {d4}, [r4, : 64]
> + add r4, r3, #144
> + vld1.8 {d6-d7}, [r4, : 128]!
> + vtrn.32 q0, q3
> + vld1.8 {d8-d9}, [r4, : 128]!
> + vshl.i32 q5, q0, #4
> + vtrn.32 q1, q4
> + vshl.i32 q6, q3, #4
> + vadd.i32 q5, q5, q0
> + vadd.i32 q6, q6, q3
> + vshl.i32 q7, q1, #4
> + vld1.8 {d5}, [r4, : 64]
> + vshl.i32 q8, q4, #4
> + vtrn.32 d4, d5
> + vadd.i32 q7, q7, q1
> + vadd.i32 q8, q8, q4
> + vld1.8 {d18-d19}, [r2, : 128]!
> + vshl.i32 q10, q2, #4
> + vld1.8 {d22-d23}, [r2, : 128]!
> + vadd.i32 q10, q10, q2
> + vld1.8 {d24}, [r2, : 64]
> + vadd.i32 q5, q5, q0
> + add r2, r3, #192
> + vld1.8 {d26-d27}, [r2, : 128]!
> + vadd.i32 q6, q6, q3
> + vld1.8 {d28-d29}, [r2, : 128]!
> + vadd.i32 q8, q8, q4
> + vld1.8 {d25}, [r2, : 64]
> + vadd.i32 q10, q10, q2
> + vtrn.32 q9, q13
> + vadd.i32 q7, q7, q1
> + vadd.i32 q5, q5, q0
> + vtrn.32 q11, q14
> + vadd.i32 q6, q6, q3
> + add r2, sp, #528
> + vadd.i32 q10, q10, q2
> + vtrn.32 d24, d25
> + vst1.8 {d12-d13}, [r2, : 128]
same here
> + vshl.i32 q6, q13, #1
> + add r2, sp, #544
> + vst1.8 {d20-d21}, [r2, : 128]
and here
> + vshl.i32 q10, q14, #1
> + add r2, sp, #560
> + vst1.8 {d12-d13}, [r2, : 128]
and here
> + vshl.i32 q15, q12, #1
> + vadd.i32 q8, q8, q4
> + vext.32 d10, d31, d30, #0
> + vadd.i32 q7, q7, q1
> + add r2, sp, #576
> + vst1.8 {d16-d17}, [r2, : 128]
and here
> + vmull.s32 q8, d18, d5
> + vmlal.s32 q8, d26, d4
> + vmlal.s32 q8, d19, d9
> + vmlal.s32 q8, d27, d3
> + vmlal.s32 q8, d22, d8
> + vmlal.s32 q8, d28, d2
> + vmlal.s32 q8, d23, d7
> + vmlal.s32 q8, d29, d1
> + vmlal.s32 q8, d24, d6
> + vmlal.s32 q8, d25, d0
> + add r2, sp, #592
> + vst1.8 {d14-d15}, [r2, : 128]
and here
> + vmull.s32 q2, d18, d4
> + vmlal.s32 q2, d12, d9
> + vmlal.s32 q2, d13, d8
> + vmlal.s32 q2, d19, d3
> + vmlal.s32 q2, d22, d2
> + vmlal.s32 q2, d23, d1
> + vmlal.s32 q2, d24, d0
> + add r2, sp, #608
> + vst1.8 {d20-d21}, [r2, : 128]
and here
> + vmull.s32 q7, d18, d9
> + vmlal.s32 q7, d26, d3
> + vmlal.s32 q7, d19, d8
> + vmlal.s32 q7, d27, d2
> + vmlal.s32 q7, d22, d7
> + vmlal.s32 q7, d28, d1
> + vmlal.s32 q7, d23, d6
> + vmlal.s32 q7, d29, d0
> + add r2, sp, #624
> + vst1.8 {d10-d11}, [r2, : 128]
and here
> + vmull.s32 q5, d18, d3
> + vmlal.s32 q5, d19, d2
> + vmlal.s32 q5, d22, d1
> + vmlal.s32 q5, d23, d0
> + vmlal.s32 q5, d12, d8
> + add r2, sp, #640
> + vst1.8 {d16-d17}, [r2, : 128]
> + vmull.s32 q4, d18, d8
> + vmlal.s32 q4, d26, d2
> + vmlal.s32 q4, d19, d7
> + vmlal.s32 q4, d27, d1
> + vmlal.s32 q4, d22, d6
> + vmlal.s32 q4, d28, d0
> + vmull.s32 q8, d18, d7
> + vmlal.s32 q8, d26, d1
> + vmlal.s32 q8, d19, d6
> + vmlal.s32 q8, d27, d0
> + add r2, sp, #544
> + vld1.8 {d20-d21}, [r2, : 128]
> + vmlal.s32 q7, d24, d21
> + vmlal.s32 q7, d25, d20
> + vmlal.s32 q4, d23, d21
> + vmlal.s32 q4, d29, d20
> + vmlal.s32 q8, d22, d21
> + vmlal.s32 q8, d28, d20
> + vmlal.s32 q5, d24, d20
> + add r2, sp, #544
redundant add
I'll stop here - let me just note that this code does not strike me as
particularly well optimized for in-order cores (such as A7).
For instance, the sequence
vmlal.s32 q2, d18, d7
vmlal.s32 q2, d19, d6
vmlal.s32 q5, d18, d6
vmlal.s32 q5, d19, d21
vmlal.s32 q1, d18, d21
vmlal.s32 q1, d19, d29
vmlal.s32 q0, d18, d28
vmlal.s32 q0, d19, d9
vmlal.s32 q6, d18, d29
vmlal.s32 q6, d19, d28
can be reordered as
vmlal.s32 q2, d18, d7
vmlal.s32 q5, d18, d6
vmlal.s32 q1, d18, d21
vmlal.s32 q0, d18, d28
vmlal.s32 q6, d18, d29
vmlal.s32 q2, d19, d6
vmlal.s32 q5, d19, d21
vmlal.s32 q1, d19, d29
vmlal.s32 q0, d19, d9
vmlal.s32 q6, d19, d28
and not have every other instruction depend on the output of the previous one.
Obviously, the ultimate truth is in the benchmark numbers, but I
thought I'd mention it anyway.
> + vst1.8 {d14-d15}, [r2, : 128]
> + vmull.s32 q7, d18, d6
> + vmlal.s32 q7, d26, d0
> + add r2, sp, #624
> + vld1.8 {d30-d31}, [r2, : 128]
> + vmlal.s32 q2, d30, d21
> + vmlal.s32 q7, d19, d21
> + vmlal.s32 q7, d27, d20
> + add r2, sp, #592
> + vld1.8 {d26-d27}, [r2, : 128]
> + vmlal.s32 q4, d25, d27
> + vmlal.s32 q8, d29, d27
> + vmlal.s32 q8, d25, d26
> + vmlal.s32 q7, d28, d27
> + vmlal.s32 q7, d29, d26
> + add r2, sp, #576
> + vld1.8 {d28-d29}, [r2, : 128]
> + vmlal.s32 q4, d24, d29
> + vmlal.s32 q8, d23, d29
> + vmlal.s32 q8, d24, d28
> + vmlal.s32 q7, d22, d29
> + vmlal.s32 q7, d23, d28
> + add r2, sp, #576
> + vst1.8 {d8-d9}, [r2, : 128]
> + add r2, sp, #528
> + vld1.8 {d8-d9}, [r2, : 128]
> + vmlal.s32 q7, d24, d9
> + vmlal.s32 q7, d25, d31
> + vmull.s32 q1, d18, d2
> + vmlal.s32 q1, d19, d1
> + vmlal.s32 q1, d22, d0
> + vmlal.s32 q1, d24, d27
> + vmlal.s32 q1, d23, d20
> + vmlal.s32 q1, d12, d7
> + vmlal.s32 q1, d13, d6
> + vmull.s32 q6, d18, d1
> + vmlal.s32 q6, d19, d0
> + vmlal.s32 q6, d23, d27
> + vmlal.s32 q6, d22, d20
> + vmlal.s32 q6, d24, d26
> + vmull.s32 q0, d18, d0
> + vmlal.s32 q0, d22, d27
> + vmlal.s32 q0, d23, d26
> + vmlal.s32 q0, d24, d31
> + vmlal.s32 q0, d19, d20
> + add r2, sp, #608
> + vld1.8 {d18-d19}, [r2, : 128]
> + vmlal.s32 q2, d18, d7
> + vmlal.s32 q2, d19, d6
> + vmlal.s32 q5, d18, d6
> + vmlal.s32 q5, d19, d21
> + vmlal.s32 q1, d18, d21
> + vmlal.s32 q1, d19, d29
> + vmlal.s32 q0, d18, d28
> + vmlal.s32 q0, d19, d9
> + vmlal.s32 q6, d18, d29
> + vmlal.s32 q6, d19, d28
> + add r2, sp, #560
> + vld1.8 {d18-d19}, [r2, : 128]
> + add r2, sp, #480
> + vld1.8 {d22-d23}, [r2, : 128]
> + vmlal.s32 q5, d19, d7
> + vmlal.s32 q0, d18, d21
> + vmlal.s32 q0, d19, d29
> + vmlal.s32 q6, d18, d6
> + add r2, sp, #496
> + vld1.8 {d6-d7}, [r2, : 128]
> + vmlal.s32 q6, d19, d21
> + add r2, sp, #544
> + vld1.8 {d18-d19}, [r2, : 128]
> + vmlal.s32 q0, d30, d8
> + add r2, sp, #640
> + vld1.8 {d20-d21}, [r2, : 128]
> + vmlal.s32 q5, d30, d29
> + add r2, sp, #576
> + vld1.8 {d24-d25}, [r2, : 128]
> + vmlal.s32 q1, d30, d28
> + vadd.i64 q13, q0, q11
> + vadd.i64 q14, q5, q11
> + vmlal.s32 q6, d30, d9
> + vshr.s64 q4, q13, #26
> + vshr.s64 q13, q14, #26
> + vadd.i64 q7, q7, q4
> + vshl.i64 q4, q4, #26
> + vadd.i64 q14, q7, q3
> + vadd.i64 q9, q9, q13
> + vshl.i64 q13, q13, #26
> + vadd.i64 q15, q9, q3
> + vsub.i64 q0, q0, q4
> + vshr.s64 q4, q14, #25
> + vsub.i64 q5, q5, q13
> + vshr.s64 q13, q15, #25
> + vadd.i64 q6, q6, q4
> + vshl.i64 q4, q4, #25
> + vadd.i64 q14, q6, q11
> + vadd.i64 q2, q2, q13
> + vsub.i64 q4, q7, q4
> + vshr.s64 q7, q14, #26
> + vshl.i64 q13, q13, #25
> + vadd.i64 q14, q2, q11
> + vadd.i64 q8, q8, q7
> + vshl.i64 q7, q7, #26
> + vadd.i64 q15, q8, q3
> + vsub.i64 q9, q9, q13
> + vshr.s64 q13, q14, #26
> + vsub.i64 q6, q6, q7
> + vshr.s64 q7, q15, #25
> + vadd.i64 q10, q10, q13
> + vshl.i64 q13, q13, #26
> + vadd.i64 q14, q10, q3
> + vadd.i64 q1, q1, q7
> + add r2, r3, #144
> + vshl.i64 q7, q7, #25
> + add r4, r3, #96
> + vadd.i64 q15, q1, q11
> + add r2, r2, #8
> + vsub.i64 q2, q2, q13
> + add r4, r4, #8
> + vshr.s64 q13, q14, #25
> + vsub.i64 q7, q8, q7
> + vshr.s64 q8, q15, #26
> + vadd.i64 q14, q13, q13
> + vadd.i64 q12, q12, q8
> + vtrn.32 d12, d14
> + vshl.i64 q8, q8, #26
> + vtrn.32 d13, d15
> + vadd.i64 q3, q12, q3
> + vadd.i64 q0, q0, q14
> + vst1.8 d12, [r2, : 64]!
> + vshl.i64 q7, q13, #4
> + vst1.8 d13, [r4, : 64]!
> + vsub.i64 q1, q1, q8
> + vshr.s64 q3, q3, #25
> + vadd.i64 q0, q0, q7
> + vadd.i64 q5, q5, q3
> + vshl.i64 q3, q3, #25
> + vadd.i64 q6, q5, q11
> + vadd.i64 q0, q0, q13
> + vshl.i64 q7, q13, #25
> + vadd.i64 q8, q0, q11
> + vsub.i64 q3, q12, q3
> + vshr.s64 q6, q6, #26
> + vsub.i64 q7, q10, q7
> + vtrn.32 d2, d6
> + vshr.s64 q8, q8, #26
> + vtrn.32 d3, d7
> + vadd.i64 q3, q9, q6
> + vst1.8 d2, [r2, : 64]
> + vshl.i64 q6, q6, #26
> + vst1.8 d3, [r4, : 64]
> + vadd.i64 q1, q4, q8
> + vtrn.32 d4, d14
> + vshl.i64 q4, q8, #26
> + vtrn.32 d5, d15
> + vsub.i64 q5, q5, q6
> + add r2, r2, #16
> + vsub.i64 q0, q0, q4
> + vst1.8 d4, [r2, : 64]
> + add r4, r4, #16
> + vst1.8 d5, [r4, : 64]
> + vtrn.32 d10, d6
> + vtrn.32 d11, d7
> + sub r2, r2, #8
> + sub r4, r4, #8
> + vtrn.32 d0, d2
> + vtrn.32 d1, d3
> + vst1.8 d10, [r2, : 64]
> + vst1.8 d11, [r4, : 64]
> + sub r2, r2, #24
> + sub r4, r4, #24
> + vst1.8 d0, [r2, : 64]
> + vst1.8 d1, [r4, : 64]
> + add r2, r3, #288
> + add r4, r3, #336
> + vld1.8 {d0-d1}, [r2, : 128]!
> + vld1.8 {d2-d3}, [r4, : 128]!
> + vsub.i32 q0, q0, q1
> + vld1.8 {d2-d3}, [r2, : 128]!
> + vld1.8 {d4-d5}, [r4, : 128]!
> + vsub.i32 q1, q1, q2
> + add r5, r3, #240
> + vld1.8 {d4}, [r2, : 64]
> + vld1.8 {d6}, [r4, : 64]
> + vsub.i32 q2, q2, q3
> + vst1.8 {d0-d1}, [r5, : 128]!
> + vst1.8 {d2-d3}, [r5, : 128]!
> + vst1.8 d4, [r5, : 64]
> + add r2, r3, #144
> + add r4, r3, #96
> + add r5, r3, #144
> + add r6, r3, #192
> + vld1.8 {d0-d1}, [r2, : 128]!
> + vld1.8 {d2-d3}, [r4, : 128]!
> + vsub.i32 q2, q0, q1
> + vadd.i32 q0, q0, q1
> + vld1.8 {d2-d3}, [r2, : 128]!
> + vld1.8 {d6-d7}, [r4, : 128]!
> + vsub.i32 q4, q1, q3
> + vadd.i32 q1, q1, q3
> + vld1.8 {d6}, [r2, : 64]
> + vld1.8 {d10}, [r4, : 64]
> + vsub.i32 q6, q3, q5
> + vadd.i32 q3, q3, q5
> + vst1.8 {d4-d5}, [r5, : 128]!
> + vst1.8 {d0-d1}, [r6, : 128]!
> + vst1.8 {d8-d9}, [r5, : 128]!
> + vst1.8 {d2-d3}, [r6, : 128]!
> + vst1.8 d12, [r5, : 64]
> + vst1.8 d6, [r6, : 64]
> + add r2, r3, #0
> + add r4, r3, #240
> + vld1.8 {d0-d1}, [r4, : 128]!
> + vld1.8 {d2-d3}, [r4, : 128]!
> + vld1.8 {d4}, [r4, : 64]
> + add r4, r3, #336
> + vld1.8 {d6-d7}, [r4, : 128]!
> + vtrn.32 q0, q3
> + vld1.8 {d8-d9}, [r4, : 128]!
> + vshl.i32 q5, q0, #4
> + vtrn.32 q1, q4
> + vshl.i32 q6, q3, #4
> + vadd.i32 q5, q5, q0
> + vadd.i32 q6, q6, q3
> + vshl.i32 q7, q1, #4
> + vld1.8 {d5}, [r4, : 64]
> + vshl.i32 q8, q4, #4
> + vtrn.32 d4, d5
> + vadd.i32 q7, q7, q1
> + vadd.i32 q8, q8, q4
> + vld1.8 {d18-d19}, [r2, : 128]!
> + vshl.i32 q10, q2, #4
> + vld1.8 {d22-d23}, [r2, : 128]!
> + vadd.i32 q10, q10, q2
> + vld1.8 {d24}, [r2, : 64]
> + vadd.i32 q5, q5, q0
> + add r2, r3, #288
> + vld1.8 {d26-d27}, [r2, : 128]!
> + vadd.i32 q6, q6, q3
> + vld1.8 {d28-d29}, [r2, : 128]!
> + vadd.i32 q8, q8, q4
> + vld1.8 {d25}, [r2, : 64]
> + vadd.i32 q10, q10, q2
> + vtrn.32 q9, q13
> + vadd.i32 q7, q7, q1
> + vadd.i32 q5, q5, q0
> + vtrn.32 q11, q14
> + vadd.i32 q6, q6, q3
> + add r2, sp, #528
> + vadd.i32 q10, q10, q2
> + vtrn.32 d24, d25
> + vst1.8 {d12-d13}, [r2, : 128]
> + vshl.i32 q6, q13, #1
> + add r2, sp, #544
> + vst1.8 {d20-d21}, [r2, : 128]
> + vshl.i32 q10, q14, #1
> + add r2, sp, #560
> + vst1.8 {d12-d13}, [r2, : 128]
> + vshl.i32 q15, q12, #1
> + vadd.i32 q8, q8, q4
> + vext.32 d10, d31, d30, #0
> + vadd.i32 q7, q7, q1
> + add r2, sp, #576
> + vst1.8 {d16-d17}, [r2, : 128]
> + vmull.s32 q8, d18, d5
> + vmlal.s32 q8, d26, d4
> + vmlal.s32 q8, d19, d9
> + vmlal.s32 q8, d27, d3
> + vmlal.s32 q8, d22, d8
> + vmlal.s32 q8, d28, d2
> + vmlal.s32 q8, d23, d7
> + vmlal.s32 q8, d29, d1
> + vmlal.s32 q8, d24, d6
> + vmlal.s32 q8, d25, d0
> + add r2, sp, #592
> + vst1.8 {d14-d15}, [r2, : 128]
> + vmull.s32 q2, d18, d4
> + vmlal.s32 q2, d12, d9
> + vmlal.s32 q2, d13, d8
> + vmlal.s32 q2, d19, d3
> + vmlal.s32 q2, d22, d2
> + vmlal.s32 q2, d23, d1
> + vmlal.s32 q2, d24, d0
> + add r2, sp, #608
> + vst1.8 {d20-d21}, [r2, : 128]
> + vmull.s32 q7, d18, d9
> + vmlal.s32 q7, d26, d3
> + vmlal.s32 q7, d19, d8
> + vmlal.s32 q7, d27, d2
> + vmlal.s32 q7, d22, d7
> + vmlal.s32 q7, d28, d1
> + vmlal.s32 q7, d23, d6
> + vmlal.s32 q7, d29, d0
> + add r2, sp, #624
> + vst1.8 {d10-d11}, [r2, : 128]
> + vmull.s32 q5, d18, d3
> + vmlal.s32 q5, d19, d2
> + vmlal.s32 q5, d22, d1
> + vmlal.s32 q5, d23, d0
> + vmlal.s32 q5, d12, d8
> + add r2, sp, #640
> + vst1.8 {d16-d17}, [r2, : 128]
> + vmull.s32 q4, d18, d8
> + vmlal.s32 q4, d26, d2
> + vmlal.s32 q4, d19, d7
> + vmlal.s32 q4, d27, d1
> + vmlal.s32 q4, d22, d6
> + vmlal.s32 q4, d28, d0
> + vmull.s32 q8, d18, d7
> + vmlal.s32 q8, d26, d1
> + vmlal.s32 q8, d19, d6
> + vmlal.s32 q8, d27, d0
> + add r2, sp, #544
> + vld1.8 {d20-d21}, [r2, : 128]
> + vmlal.s32 q7, d24, d21
> + vmlal.s32 q7, d25, d20
> + vmlal.s32 q4, d23, d21
> + vmlal.s32 q4, d29, d20
> + vmlal.s32 q8, d22, d21
> + vmlal.s32 q8, d28, d20
> + vmlal.s32 q5, d24, d20
> + add r2, sp, #544
> + vst1.8 {d14-d15}, [r2, : 128]
> + vmull.s32 q7, d18, d6
> + vmlal.s32 q7, d26, d0
> + add r2, sp, #624
> + vld1.8 {d30-d31}, [r2, : 128]
> + vmlal.s32 q2, d30, d21
> + vmlal.s32 q7, d19, d21
> + vmlal.s32 q7, d27, d20
> + add r2, sp, #592
> + vld1.8 {d26-d27}, [r2, : 128]
> + vmlal.s32 q4, d25, d27
> + vmlal.s32 q8, d29, d27
> + vmlal.s32 q8, d25, d26
> + vmlal.s32 q7, d28, d27
> + vmlal.s32 q7, d29, d26
> + add r2, sp, #576
> + vld1.8 {d28-d29}, [r2, : 128]
> + vmlal.s32 q4, d24, d29
> + vmlal.s32 q8, d23, d29
> + vmlal.s32 q8, d24, d28
> + vmlal.s32 q7, d22, d29
> + vmlal.s32 q7, d23, d28
> + add r2, sp, #576
> + vst1.8 {d8-d9}, [r2, : 128]
> + add r2, sp, #528
> + vld1.8 {d8-d9}, [r2, : 128]
> + vmlal.s32 q7, d24, d9
> + vmlal.s32 q7, d25, d31
> + vmull.s32 q1, d18, d2
> + vmlal.s32 q1, d19, d1
> + vmlal.s32 q1, d22, d0
> + vmlal.s32 q1, d24, d27
> + vmlal.s32 q1, d23, d20
> + vmlal.s32 q1, d12, d7
> + vmlal.s32 q1, d13, d6
> + vmull.s32 q6, d18, d1
> + vmlal.s32 q6, d19, d0
> + vmlal.s32 q6, d23, d27
> + vmlal.s32 q6, d22, d20
> + vmlal.s32 q6, d24, d26
> + vmull.s32 q0, d18, d0
> + vmlal.s32 q0, d22, d27
> + vmlal.s32 q0, d23, d26
> + vmlal.s32 q0, d24, d31
> + vmlal.s32 q0, d19, d20
> + add r2, sp, #608
> + vld1.8 {d18-d19}, [r2, : 128]
> + vmlal.s32 q2, d18, d7
> + vmlal.s32 q2, d19, d6
> + vmlal.s32 q5, d18, d6
> + vmlal.s32 q5, d19, d21
> + vmlal.s32 q1, d18, d21
> + vmlal.s32 q1, d19, d29
> + vmlal.s32 q0, d18, d28
> + vmlal.s32 q0, d19, d9
> + vmlal.s32 q6, d18, d29
> + vmlal.s32 q6, d19, d28
> + add r2, sp, #560
> + vld1.8 {d18-d19}, [r2, : 128]
> + add r2, sp, #480
> + vld1.8 {d22-d23}, [r2, : 128]
> + vmlal.s32 q5, d19, d7
> + vmlal.s32 q0, d18, d21
> + vmlal.s32 q0, d19, d29
> + vmlal.s32 q6, d18, d6
> + add r2, sp, #496
> + vld1.8 {d6-d7}, [r2, : 128]
> + vmlal.s32 q6, d19, d21
> + add r2, sp, #544
> + vld1.8 {d18-d19}, [r2, : 128]
> + vmlal.s32 q0, d30, d8
> + add r2, sp, #640
> + vld1.8 {d20-d21}, [r2, : 128]
> + vmlal.s32 q5, d30, d29
> + add r2, sp, #576
> + vld1.8 {d24-d25}, [r2, : 128]
> + vmlal.s32 q1, d30, d28
> + vadd.i64 q13, q0, q11
> + vadd.i64 q14, q5, q11
> + vmlal.s32 q6, d30, d9
> + vshr.s64 q4, q13, #26
> + vshr.s64 q13, q14, #26
> + vadd.i64 q7, q7, q4
> + vshl.i64 q4, q4, #26
> + vadd.i64 q14, q7, q3
> + vadd.i64 q9, q9, q13
> + vshl.i64 q13, q13, #26
> + vadd.i64 q15, q9, q3
> + vsub.i64 q0, q0, q4
> + vshr.s64 q4, q14, #25
> + vsub.i64 q5, q5, q13
> + vshr.s64 q13, q15, #25
> + vadd.i64 q6, q6, q4
> + vshl.i64 q4, q4, #25
> + vadd.i64 q14, q6, q11
> + vadd.i64 q2, q2, q13
> + vsub.i64 q4, q7, q4
> + vshr.s64 q7, q14, #26
> + vshl.i64 q13, q13, #25
> + vadd.i64 q14, q2, q11
> + vadd.i64 q8, q8, q7
> + vshl.i64 q7, q7, #26
> + vadd.i64 q15, q8, q3
> + vsub.i64 q9, q9, q13
> + vshr.s64 q13, q14, #26
> + vsub.i64 q6, q6, q7
> + vshr.s64 q7, q15, #25
> + vadd.i64 q10, q10, q13
> + vshl.i64 q13, q13, #26
> + vadd.i64 q14, q10, q3
> + vadd.i64 q1, q1, q7
> + add r2, r3, #288
> + vshl.i64 q7, q7, #25
> + add r4, r3, #96
> + vadd.i64 q15, q1, q11
> + add r2, r2, #8
> + vsub.i64 q2, q2, q13
> + add r4, r4, #8
> + vshr.s64 q13, q14, #25
> + vsub.i64 q7, q8, q7
> + vshr.s64 q8, q15, #26
> + vadd.i64 q14, q13, q13
> + vadd.i64 q12, q12, q8
> + vtrn.32 d12, d14
> + vshl.i64 q8, q8, #26
> + vtrn.32 d13, d15
> + vadd.i64 q3, q12, q3
> + vadd.i64 q0, q0, q14
> + vst1.8 d12, [r2, : 64]!
> + vshl.i64 q7, q13, #4
> + vst1.8 d13, [r4, : 64]!
> + vsub.i64 q1, q1, q8
> + vshr.s64 q3, q3, #25
> + vadd.i64 q0, q0, q7
> + vadd.i64 q5, q5, q3
> + vshl.i64 q3, q3, #25
> + vadd.i64 q6, q5, q11
> + vadd.i64 q0, q0, q13
> + vshl.i64 q7, q13, #25
> + vadd.i64 q8, q0, q11
> + vsub.i64 q3, q12, q3
> + vshr.s64 q6, q6, #26
> + vsub.i64 q7, q10, q7
> + vtrn.32 d2, d6
> + vshr.s64 q8, q8, #26
> + vtrn.32 d3, d7
> + vadd.i64 q3, q9, q6
> + vst1.8 d2, [r2, : 64]
> + vshl.i64 q6, q6, #26
> + vst1.8 d3, [r4, : 64]
> + vadd.i64 q1, q4, q8
> + vtrn.32 d4, d14
> + vshl.i64 q4, q8, #26
> + vtrn.32 d5, d15
> + vsub.i64 q5, q5, q6
> + add r2, r2, #16
> + vsub.i64 q0, q0, q4
> + vst1.8 d4, [r2, : 64]
> + add r4, r4, #16
> + vst1.8 d5, [r4, : 64]
> + vtrn.32 d10, d6
> + vtrn.32 d11, d7
> + sub r2, r2, #8
> + sub r4, r4, #8
> + vtrn.32 d0, d2
> + vtrn.32 d1, d3
> + vst1.8 d10, [r2, : 64]
> + vst1.8 d11, [r4, : 64]
> + sub r2, r2, #24
> + sub r4, r4, #24
> + vst1.8 d0, [r2, : 64]
> + vst1.8 d1, [r4, : 64]
> + add r2, sp, #512
> + add r4, r3, #144
> + add r5, r3, #192
> + vld1.8 {d0-d1}, [r2, : 128]
> + vld1.8 {d2-d3}, [r4, : 128]!
> + vld1.8 {d4-d5}, [r5, : 128]!
> + vzip.i32 q1, q2
> + vld1.8 {d6-d7}, [r4, : 128]!
> + vld1.8 {d8-d9}, [r5, : 128]!
> + vshl.i32 q5, q1, #1
> + vzip.i32 q3, q4
> + vshl.i32 q6, q2, #1
> + vld1.8 {d14}, [r4, : 64]
> + vshl.i32 q8, q3, #1
> + vld1.8 {d15}, [r5, : 64]
> + vshl.i32 q9, q4, #1
> + vmul.i32 d21, d7, d1
> + vtrn.32 d14, d15
> + vmul.i32 q11, q4, q0
> + vmul.i32 q0, q7, q0
> + vmull.s32 q12, d2, d2
> + vmlal.s32 q12, d11, d1
> + vmlal.s32 q12, d12, d0
> + vmlal.s32 q12, d13, d23
> + vmlal.s32 q12, d16, d22
> + vmlal.s32 q12, d7, d21
> + vmull.s32 q10, d2, d11
> + vmlal.s32 q10, d4, d1
> + vmlal.s32 q10, d13, d0
> + vmlal.s32 q10, d6, d23
> + vmlal.s32 q10, d17, d22
> + vmull.s32 q13, d10, d4
> + vmlal.s32 q13, d11, d3
> + vmlal.s32 q13, d13, d1
> + vmlal.s32 q13, d16, d0
> + vmlal.s32 q13, d17, d23
> + vmlal.s32 q13, d8, d22
> + vmull.s32 q1, d10, d5
> + vmlal.s32 q1, d11, d4
> + vmlal.s32 q1, d6, d1
> + vmlal.s32 q1, d17, d0
> + vmlal.s32 q1, d8, d23
> + vmull.s32 q14, d10, d6
> + vmlal.s32 q14, d11, d13
> + vmlal.s32 q14, d4, d4
> + vmlal.s32 q14, d17, d1
> + vmlal.s32 q14, d18, d0
> + vmlal.s32 q14, d9, d23
> + vmull.s32 q11, d10, d7
> + vmlal.s32 q11, d11, d6
> + vmlal.s32 q11, d12, d5
> + vmlal.s32 q11, d8, d1
> + vmlal.s32 q11, d19, d0
> + vmull.s32 q15, d10, d8
> + vmlal.s32 q15, d11, d17
> + vmlal.s32 q15, d12, d6
> + vmlal.s32 q15, d13, d5
> + vmlal.s32 q15, d19, d1
> + vmlal.s32 q15, d14, d0
> + vmull.s32 q2, d10, d9
> + vmlal.s32 q2, d11, d8
> + vmlal.s32 q2, d12, d7
> + vmlal.s32 q2, d13, d6
> + vmlal.s32 q2, d14, d1
> + vmull.s32 q0, d15, d1
> + vmlal.s32 q0, d10, d14
> + vmlal.s32 q0, d11, d19
> + vmlal.s32 q0, d12, d8
> + vmlal.s32 q0, d13, d17
> + vmlal.s32 q0, d6, d6
> + add r2, sp, #480
> + vld1.8 {d18-d19}, [r2, : 128]
> + vmull.s32 q3, d16, d7
> + vmlal.s32 q3, d10, d15
> + vmlal.s32 q3, d11, d14
> + vmlal.s32 q3, d12, d9
> + vmlal.s32 q3, d13, d8
> + add r2, sp, #496
> + vld1.8 {d8-d9}, [r2, : 128]
> + vadd.i64 q5, q12, q9
> + vadd.i64 q6, q15, q9
> + vshr.s64 q5, q5, #26
> + vshr.s64 q6, q6, #26
> + vadd.i64 q7, q10, q5
> + vshl.i64 q5, q5, #26
> + vadd.i64 q8, q7, q4
> + vadd.i64 q2, q2, q6
> + vshl.i64 q6, q6, #26
> + vadd.i64 q10, q2, q4
> + vsub.i64 q5, q12, q5
> + vshr.s64 q8, q8, #25
> + vsub.i64 q6, q15, q6
> + vshr.s64 q10, q10, #25
> + vadd.i64 q12, q13, q8
> + vshl.i64 q8, q8, #25
> + vadd.i64 q13, q12, q9
> + vadd.i64 q0, q0, q10
> + vsub.i64 q7, q7, q8
> + vshr.s64 q8, q13, #26
> + vshl.i64 q10, q10, #25
> + vadd.i64 q13, q0, q9
> + vadd.i64 q1, q1, q8
> + vshl.i64 q8, q8, #26
> + vadd.i64 q15, q1, q4
> + vsub.i64 q2, q2, q10
> + vshr.s64 q10, q13, #26
> + vsub.i64 q8, q12, q8
> + vshr.s64 q12, q15, #25
> + vadd.i64 q3, q3, q10
> + vshl.i64 q10, q10, #26
> + vadd.i64 q13, q3, q4
> + vadd.i64 q14, q14, q12
> + add r2, r3, #144
> + vshl.i64 q12, q12, #25
> + add r4, r3, #192
> + vadd.i64 q15, q14, q9
> + add r2, r2, #8
> + vsub.i64 q0, q0, q10
> + add r4, r4, #8
> + vshr.s64 q10, q13, #25
> + vsub.i64 q1, q1, q12
> + vshr.s64 q12, q15, #26
> + vadd.i64 q13, q10, q10
> + vadd.i64 q11, q11, q12
> + vtrn.32 d16, d2
> + vshl.i64 q12, q12, #26
> + vtrn.32 d17, d3
> + vadd.i64 q1, q11, q4
> + vadd.i64 q4, q5, q13
> + vst1.8 d16, [r2, : 64]!
> + vshl.i64 q5, q10, #4
> + vst1.8 d17, [r4, : 64]!
> + vsub.i64 q8, q14, q12
> + vshr.s64 q1, q1, #25
> + vadd.i64 q4, q4, q5
> + vadd.i64 q5, q6, q1
> + vshl.i64 q1, q1, #25
> + vadd.i64 q6, q5, q9
> + vadd.i64 q4, q4, q10
> + vshl.i64 q10, q10, #25
> + vadd.i64 q9, q4, q9
> + vsub.i64 q1, q11, q1
> + vshr.s64 q6, q6, #26
> + vsub.i64 q3, q3, q10
> + vtrn.32 d16, d2
> + vshr.s64 q9, q9, #26
> + vtrn.32 d17, d3
> + vadd.i64 q1, q2, q6
> + vst1.8 d16, [r2, : 64]
> + vshl.i64 q2, q6, #26
> + vst1.8 d17, [r4, : 64]
> + vadd.i64 q6, q7, q9
> + vtrn.32 d0, d6
> + vshl.i64 q7, q9, #26
> + vtrn.32 d1, d7
> + vsub.i64 q2, q5, q2
> + add r2, r2, #16
> + vsub.i64 q3, q4, q7
> + vst1.8 d0, [r2, : 64]
> + add r4, r4, #16
> + vst1.8 d1, [r4, : 64]
> + vtrn.32 d4, d2
> + vtrn.32 d5, d3
> + sub r2, r2, #8
> + sub r4, r4, #8
> + vtrn.32 d6, d12
> + vtrn.32 d7, d13
> + vst1.8 d4, [r2, : 64]
> + vst1.8 d5, [r4, : 64]
> + sub r2, r2, #24
> + sub r4, r4, #24
> + vst1.8 d6, [r2, : 64]
> + vst1.8 d7, [r4, : 64]
> + add r2, r3, #336
> + add r4, r3, #288
> + vld1.8 {d0-d1}, [r2, : 128]!
> + vld1.8 {d2-d3}, [r4, : 128]!
> + vadd.i32 q0, q0, q1
> + vld1.8 {d2-d3}, [r2, : 128]!
> + vld1.8 {d4-d5}, [r4, : 128]!
> + vadd.i32 q1, q1, q2
> + add r5, r3, #288
> + vld1.8 {d4}, [r2, : 64]
> + vld1.8 {d6}, [r4, : 64]
> + vadd.i32 q2, q2, q3
> + vst1.8 {d0-d1}, [r5, : 128]!
> + vst1.8 {d2-d3}, [r5, : 128]!
> + vst1.8 d4, [r5, : 64]
> + add r2, r3, #48
> + add r4, r3, #144
> + vld1.8 {d0-d1}, [r4, : 128]!
> + vld1.8 {d2-d3}, [r4, : 128]!
> + vld1.8 {d4}, [r4, : 64]
> + add r4, r3, #288
> + vld1.8 {d6-d7}, [r4, : 128]!
> + vtrn.32 q0, q3
> + vld1.8 {d8-d9}, [r4, : 128]!
> + vshl.i32 q5, q0, #4
> + vtrn.32 q1, q4
> + vshl.i32 q6, q3, #4
> + vadd.i32 q5, q5, q0
> + vadd.i32 q6, q6, q3
> + vshl.i32 q7, q1, #4
> + vld1.8 {d5}, [r4, : 64]
> + vshl.i32 q8, q4, #4
> + vtrn.32 d4, d5
> + vadd.i32 q7, q7, q1
> + vadd.i32 q8, q8, q4
> + vld1.8 {d18-d19}, [r2, : 128]!
> + vshl.i32 q10, q2, #4
> + vld1.8 {d22-d23}, [r2, : 128]!
> + vadd.i32 q10, q10, q2
> + vld1.8 {d24}, [r2, : 64]
> + vadd.i32 q5, q5, q0
> + add r2, r3, #240
> + vld1.8 {d26-d27}, [r2, : 128]!
> + vadd.i32 q6, q6, q3
> + vld1.8 {d28-d29}, [r2, : 128]!
> + vadd.i32 q8, q8, q4
> + vld1.8 {d25}, [r2, : 64]
> + vadd.i32 q10, q10, q2
> + vtrn.32 q9, q13
> + vadd.i32 q7, q7, q1
> + vadd.i32 q5, q5, q0
> + vtrn.32 q11, q14
> + vadd.i32 q6, q6, q3
> + add r2, sp, #528
> + vadd.i32 q10, q10, q2
> + vtrn.32 d24, d25
> + vst1.8 {d12-d13}, [r2, : 128]
> + vshl.i32 q6, q13, #1
> + add r2, sp, #544
> + vst1.8 {d20-d21}, [r2, : 128]
> + vshl.i32 q10, q14, #1
> + add r2, sp, #560
> + vst1.8 {d12-d13}, [r2, : 128]
> + vshl.i32 q15, q12, #1
> + vadd.i32 q8, q8, q4
> + vext.32 d10, d31, d30, #0
> + vadd.i32 q7, q7, q1
> + add r2, sp, #576
> + vst1.8 {d16-d17}, [r2, : 128]
> + vmull.s32 q8, d18, d5
> + vmlal.s32 q8, d26, d4
> + vmlal.s32 q8, d19, d9
> + vmlal.s32 q8, d27, d3
> + vmlal.s32 q8, d22, d8
> + vmlal.s32 q8, d28, d2
> + vmlal.s32 q8, d23, d7
> + vmlal.s32 q8, d29, d1
> + vmlal.s32 q8, d24, d6
> + vmlal.s32 q8, d25, d0
> + add r2, sp, #592
> + vst1.8 {d14-d15}, [r2, : 128]
> + vmull.s32 q2, d18, d4
> + vmlal.s32 q2, d12, d9
> + vmlal.s32 q2, d13, d8
> + vmlal.s32 q2, d19, d3
> + vmlal.s32 q2, d22, d2
> + vmlal.s32 q2, d23, d1
> + vmlal.s32 q2, d24, d0
> + add r2, sp, #608
> + vst1.8 {d20-d21}, [r2, : 128]
> + vmull.s32 q7, d18, d9
> + vmlal.s32 q7, d26, d3
> + vmlal.s32 q7, d19, d8
> + vmlal.s32 q7, d27, d2
> + vmlal.s32 q7, d22, d7
> + vmlal.s32 q7, d28, d1
> + vmlal.s32 q7, d23, d6
> + vmlal.s32 q7, d29, d0
> + add r2, sp, #624
> + vst1.8 {d10-d11}, [r2, : 128]
> + vmull.s32 q5, d18, d3
> + vmlal.s32 q5, d19, d2
> + vmlal.s32 q5, d22, d1
> + vmlal.s32 q5, d23, d0
> + vmlal.s32 q5, d12, d8
> + add r2, sp, #640
> + vst1.8 {d16-d17}, [r2, : 128]
> + vmull.s32 q4, d18, d8
> + vmlal.s32 q4, d26, d2
> + vmlal.s32 q4, d19, d7
> + vmlal.s32 q4, d27, d1
> + vmlal.s32 q4, d22, d6
> + vmlal.s32 q4, d28, d0
> + vmull.s32 q8, d18, d7
> + vmlal.s32 q8, d26, d1
> + vmlal.s32 q8, d19, d6
> + vmlal.s32 q8, d27, d0
> + add r2, sp, #544
> + vld1.8 {d20-d21}, [r2, : 128]
> + vmlal.s32 q7, d24, d21
> + vmlal.s32 q7, d25, d20
> + vmlal.s32 q4, d23, d21
> + vmlal.s32 q4, d29, d20
> + vmlal.s32 q8, d22, d21
> + vmlal.s32 q8, d28, d20
> + vmlal.s32 q5, d24, d20
> + add r2, sp, #544
> + vst1.8 {d14-d15}, [r2, : 128]
> + vmull.s32 q7, d18, d6
> + vmlal.s32 q7, d26, d0
> + add r2, sp, #624
> + vld1.8 {d30-d31}, [r2, : 128]
> + vmlal.s32 q2, d30, d21
> + vmlal.s32 q7, d19, d21
> + vmlal.s32 q7, d27, d20
> + add r2, sp, #592
> + vld1.8 {d26-d27}, [r2, : 128]
> + vmlal.s32 q4, d25, d27
> + vmlal.s32 q8, d29, d27
> + vmlal.s32 q8, d25, d26
> + vmlal.s32 q7, d28, d27
> + vmlal.s32 q7, d29, d26
> + add r2, sp, #576
> + vld1.8 {d28-d29}, [r2, : 128]
> + vmlal.s32 q4, d24, d29
> + vmlal.s32 q8, d23, d29
> + vmlal.s32 q8, d24, d28
> + vmlal.s32 q7, d22, d29
> + vmlal.s32 q7, d23, d28
> + add r2, sp, #576
> + vst1.8 {d8-d9}, [r2, : 128]
> + add r2, sp, #528
> + vld1.8 {d8-d9}, [r2, : 128]
> + vmlal.s32 q7, d24, d9
> + vmlal.s32 q7, d25, d31
> + vmull.s32 q1, d18, d2
> + vmlal.s32 q1, d19, d1
> + vmlal.s32 q1, d22, d0
> + vmlal.s32 q1, d24, d27
> + vmlal.s32 q1, d23, d20
> + vmlal.s32 q1, d12, d7
> + vmlal.s32 q1, d13, d6
> + vmull.s32 q6, d18, d1
> + vmlal.s32 q6, d19, d0
> + vmlal.s32 q6, d23, d27
> + vmlal.s32 q6, d22, d20
> + vmlal.s32 q6, d24, d26
> + vmull.s32 q0, d18, d0
> + vmlal.s32 q0, d22, d27
> + vmlal.s32 q0, d23, d26
> + vmlal.s32 q0, d24, d31
> + vmlal.s32 q0, d19, d20
> + add r2, sp, #608
> + vld1.8 {d18-d19}, [r2, : 128]
> + vmlal.s32 q2, d18, d7
> + vmlal.s32 q2, d19, d6
> + vmlal.s32 q5, d18, d6
> + vmlal.s32 q5, d19, d21
> + vmlal.s32 q1, d18, d21
> + vmlal.s32 q1, d19, d29
> + vmlal.s32 q0, d18, d28
> + vmlal.s32 q0, d19, d9
> + vmlal.s32 q6, d18, d29
> + vmlal.s32 q6, d19, d28
> + add r2, sp, #560
> + vld1.8 {d18-d19}, [r2, : 128]
> + add r2, sp, #480
> + vld1.8 {d22-d23}, [r2, : 128]
> + vmlal.s32 q5, d19, d7
> + vmlal.s32 q0, d18, d21
> + vmlal.s32 q0, d19, d29
> + vmlal.s32 q6, d18, d6
> + add r2, sp, #496
> + vld1.8 {d6-d7}, [r2, : 128]
> + vmlal.s32 q6, d19, d21
> + add r2, sp, #544
> + vld1.8 {d18-d19}, [r2, : 128]
> + vmlal.s32 q0, d30, d8
> + add r2, sp, #640
> + vld1.8 {d20-d21}, [r2, : 128]
> + vmlal.s32 q5, d30, d29
> + add r2, sp, #576
> + vld1.8 {d24-d25}, [r2, : 128]
> + vmlal.s32 q1, d30, d28
> + vadd.i64 q13, q0, q11
> + vadd.i64 q14, q5, q11
> + vmlal.s32 q6, d30, d9
> + vshr.s64 q4, q13, #26
> + vshr.s64 q13, q14, #26
> + vadd.i64 q7, q7, q4
> + vshl.i64 q4, q4, #26
> + vadd.i64 q14, q7, q3
> + vadd.i64 q9, q9, q13
> + vshl.i64 q13, q13, #26
> + vadd.i64 q15, q9, q3
> + vsub.i64 q0, q0, q4
> + vshr.s64 q4, q14, #25
> + vsub.i64 q5, q5, q13
> + vshr.s64 q13, q15, #25
> + vadd.i64 q6, q6, q4
> + vshl.i64 q4, q4, #25
> + vadd.i64 q14, q6, q11
> + vadd.i64 q2, q2, q13
> + vsub.i64 q4, q7, q4
> + vshr.s64 q7, q14, #26
> + vshl.i64 q13, q13, #25
> + vadd.i64 q14, q2, q11
> + vadd.i64 q8, q8, q7
> + vshl.i64 q7, q7, #26
> + vadd.i64 q15, q8, q3
> + vsub.i64 q9, q9, q13
> + vshr.s64 q13, q14, #26
> + vsub.i64 q6, q6, q7
> + vshr.s64 q7, q15, #25
> + vadd.i64 q10, q10, q13
> + vshl.i64 q13, q13, #26
> + vadd.i64 q14, q10, q3
> + vadd.i64 q1, q1, q7
> + add r2, r3, #240
> + vshl.i64 q7, q7, #25
> + add r4, r3, #144
> + vadd.i64 q15, q1, q11
> + add r2, r2, #8
> + vsub.i64 q2, q2, q13
> + add r4, r4, #8
> + vshr.s64 q13, q14, #25
> + vsub.i64 q7, q8, q7
> + vshr.s64 q8, q15, #26
> + vadd.i64 q14, q13, q13
> + vadd.i64 q12, q12, q8
> + vtrn.32 d12, d14
> + vshl.i64 q8, q8, #26
> + vtrn.32 d13, d15
> + vadd.i64 q3, q12, q3
> + vadd.i64 q0, q0, q14
> + vst1.8 d12, [r2, : 64]!
> + vshl.i64 q7, q13, #4
> + vst1.8 d13, [r4, : 64]!
> + vsub.i64 q1, q1, q8
> + vshr.s64 q3, q3, #25
> + vadd.i64 q0, q0, q7
> + vadd.i64 q5, q5, q3
> + vshl.i64 q3, q3, #25
> + vadd.i64 q6, q5, q11
> + vadd.i64 q0, q0, q13
> + vshl.i64 q7, q13, #25
> + vadd.i64 q8, q0, q11
> + vsub.i64 q3, q12, q3
> + vshr.s64 q6, q6, #26
> + vsub.i64 q7, q10, q7
> + vtrn.32 d2, d6
> + vshr.s64 q8, q8, #26
> + vtrn.32 d3, d7
> + vadd.i64 q3, q9, q6
> + vst1.8 d2, [r2, : 64]
> + vshl.i64 q6, q6, #26
> + vst1.8 d3, [r4, : 64]
> + vadd.i64 q1, q4, q8
> + vtrn.32 d4, d14
> + vshl.i64 q4, q8, #26
> + vtrn.32 d5, d15
> + vsub.i64 q5, q5, q6
> + add r2, r2, #16
> + vsub.i64 q0, q0, q4
> + vst1.8 d4, [r2, : 64]
> + add r4, r4, #16
> + vst1.8 d5, [r4, : 64]
> + vtrn.32 d10, d6
> + vtrn.32 d11, d7
> + sub r2, r2, #8
> + sub r4, r4, #8
> + vtrn.32 d0, d2
> + vtrn.32 d1, d3
> + vst1.8 d10, [r2, : 64]
> + vst1.8 d11, [r4, : 64]
> + sub r2, r2, #24
> + sub r4, r4, #24
> + vst1.8 d0, [r2, : 64]
> + vst1.8 d1, [r4, : 64]
> + ldr r2, [sp, #456]
> + ldr r4, [sp, #460]
> + subs r5, r2, #1
> + bge .Lmainloop
> + add r1, r3, #144
> + add r2, r3, #336
> + vld1.8 {d0-d1}, [r1, : 128]!
> + vld1.8 {d2-d3}, [r1, : 128]!
> + vld1.8 {d4}, [r1, : 64]
> + vst1.8 {d0-d1}, [r2, : 128]!
> + vst1.8 {d2-d3}, [r2, : 128]!
> + vst1.8 d4, [r2, : 64]
> + movw r1, #0
> +.Linvertloop:
> + add r2, r3, #144
> + movw r4, #0
> + movw r5, #2
> + cmp r1, #1
> + moveq r5, #1
> + addeq r2, r3, #336
> + addeq r4, r3, #48
> + cmp r1, #2
> + moveq r5, #1
> + addeq r2, r3, #48
> + cmp r1, #3
> + moveq r5, #5
> + addeq r4, r3, #336
> + cmp r1, #4
> + moveq r5, #10
> + cmp r1, #5
> + moveq r5, #20
> + cmp r1, #6
> + moveq r5, #10
> + addeq r2, r3, #336
> + addeq r4, r3, #336
> + cmp r1, #7
> + moveq r5, #50
> + cmp r1, #8
> + moveq r5, #100
> + cmp r1, #9
> + moveq r5, #50
> + addeq r2, r3, #336
> + cmp r1, #10
> + moveq r5, #5
> + addeq r2, r3, #48
> + cmp r1, #11
> + moveq r5, #0
> + addeq r2, r3, #96
> + add r6, r3, #144
> + add r7, r3, #288
> + vld1.8 {d0-d1}, [r6, : 128]!
> + vld1.8 {d2-d3}, [r6, : 128]!
> + vld1.8 {d4}, [r6, : 64]
> + vst1.8 {d0-d1}, [r7, : 128]!
> + vst1.8 {d2-d3}, [r7, : 128]!
> + vst1.8 d4, [r7, : 64]
> + cmp r5, #0
> + beq .Lskipsquaringloop
> +.Lsquaringloop:
> + add r6, r3, #288
> + add r7, r3, #288
> + add r8, r3, #288
> + vmov.i32 q0, #19
> + vmov.i32 q1, #0
> + vmov.i32 q2, #1
> + vzip.i32 q1, q2
> + vld1.8 {d4-d5}, [r7, : 128]!
> + vld1.8 {d6-d7}, [r7, : 128]!
> + vld1.8 {d9}, [r7, : 64]
> + vld1.8 {d10-d11}, [r6, : 128]!
> + add r7, sp, #384
> + vld1.8 {d12-d13}, [r6, : 128]!
> + vmul.i32 q7, q2, q0
> + vld1.8 {d8}, [r6, : 64]
> + vext.32 d17, d11, d10, #1
> + vmul.i32 q9, q3, q0
> + vext.32 d16, d10, d8, #1
> + vshl.u32 q10, q5, q1
> + vext.32 d22, d14, d4, #1
> + vext.32 d24, d18, d6, #1
> + vshl.u32 q13, q6, q1
> + vshl.u32 d28, d8, d2
> + vrev64.i32 d22, d22
> + vmul.i32 d1, d9, d1
> + vrev64.i32 d24, d24
> + vext.32 d29, d8, d13, #1
> + vext.32 d0, d1, d9, #1
> + vrev64.i32 d0, d0
> + vext.32 d2, d9, d1, #1
> + vext.32 d23, d15, d5, #1
> + vmull.s32 q4, d20, d4
> + vrev64.i32 d23, d23
> + vmlal.s32 q4, d21, d1
> + vrev64.i32 d2, d2
> + vmlal.s32 q4, d26, d19
> + vext.32 d3, d5, d15, #1
> + vmlal.s32 q4, d27, d18
> + vrev64.i32 d3, d3
> + vmlal.s32 q4, d28, d15
> + vext.32 d14, d12, d11, #1
> + vmull.s32 q5, d16, d23
> + vext.32 d15, d13, d12, #1
> + vmlal.s32 q5, d17, d4
> + vst1.8 d8, [r7, : 64]!
> + vmlal.s32 q5, d14, d1
> + vext.32 d12, d9, d8, #0
> + vmlal.s32 q5, d15, d19
> + vmov.i64 d13, #0
> + vmlal.s32 q5, d29, d18
> + vext.32 d25, d19, d7, #1
> + vmlal.s32 q6, d20, d5
> + vrev64.i32 d25, d25
> + vmlal.s32 q6, d21, d4
> + vst1.8 d11, [r7, : 64]!
> + vmlal.s32 q6, d26, d1
> + vext.32 d9, d10, d10, #0
> + vmlal.s32 q6, d27, d19
> + vmov.i64 d8, #0
> + vmlal.s32 q6, d28, d18
> + vmlal.s32 q4, d16, d24
> + vmlal.s32 q4, d17, d5
> + vmlal.s32 q4, d14, d4
> + vst1.8 d12, [r7, : 64]!
> + vmlal.s32 q4, d15, d1
> + vext.32 d10, d13, d12, #0
> + vmlal.s32 q4, d29, d19
> + vmov.i64 d11, #0
> + vmlal.s32 q5, d20, d6
> + vmlal.s32 q5, d21, d5
> + vmlal.s32 q5, d26, d4
> + vext.32 d13, d8, d8, #0
> + vmlal.s32 q5, d27, d1
> + vmov.i64 d12, #0
> + vmlal.s32 q5, d28, d19
> + vst1.8 d9, [r7, : 64]!
> + vmlal.s32 q6, d16, d25
> + vmlal.s32 q6, d17, d6
> + vst1.8 d10, [r7, : 64]
> + vmlal.s32 q6, d14, d5
> + vext.32 d8, d11, d10, #0
> + vmlal.s32 q6, d15, d4
> + vmov.i64 d9, #0
> + vmlal.s32 q6, d29, d1
> + vmlal.s32 q4, d20, d7
> + vmlal.s32 q4, d21, d6
> + vmlal.s32 q4, d26, d5
> + vext.32 d11, d12, d12, #0
> + vmlal.s32 q4, d27, d4
> + vmov.i64 d10, #0
> + vmlal.s32 q4, d28, d1
> + vmlal.s32 q5, d16, d0
> + sub r6, r7, #32
> + vmlal.s32 q5, d17, d7
> + vmlal.s32 q5, d14, d6
> + vext.32 d30, d9, d8, #0
> + vmlal.s32 q5, d15, d5
> + vld1.8 {d31}, [r6, : 64]!
> + vmlal.s32 q5, d29, d4
> + vmlal.s32 q15, d20, d0
> + vext.32 d0, d6, d18, #1
> + vmlal.s32 q15, d21, d25
> + vrev64.i32 d0, d0
> + vmlal.s32 q15, d26, d24
> + vext.32 d1, d7, d19, #1
> + vext.32 d7, d10, d10, #0
> + vmlal.s32 q15, d27, d23
> + vrev64.i32 d1, d1
> + vld1.8 {d6}, [r6, : 64]
> + vmlal.s32 q15, d28, d22
> + vmlal.s32 q3, d16, d4
> + add r6, r6, #24
> + vmlal.s32 q3, d17, d2
> + vext.32 d4, d31, d30, #0
> + vmov d17, d11
> + vmlal.s32 q3, d14, d1
> + vext.32 d11, d13, d13, #0
> + vext.32 d13, d30, d30, #0
> + vmlal.s32 q3, d15, d0
> + vext.32 d1, d8, d8, #0
> + vmlal.s32 q3, d29, d3
> + vld1.8 {d5}, [r6, : 64]
> + sub r6, r6, #16
> + vext.32 d10, d6, d6, #0
> + vmov.i32 q1, #0xffffffff
> + vshl.i64 q4, q1, #25
> + add r7, sp, #480
> + vld1.8 {d14-d15}, [r7, : 128]
> + vadd.i64 q9, q2, q7
> + vshl.i64 q1, q1, #26
> + vshr.s64 q10, q9, #26
> + vld1.8 {d0}, [r6, : 64]!
> + vadd.i64 q5, q5, q10
> + vand q9, q9, q1
> + vld1.8 {d16}, [r6, : 64]!
> + add r6, sp, #496
> + vld1.8 {d20-d21}, [r6, : 128]
> + vadd.i64 q11, q5, q10
> + vsub.i64 q2, q2, q9
> + vshr.s64 q9, q11, #25
> + vext.32 d12, d5, d4, #0
> + vand q11, q11, q4
> + vadd.i64 q0, q0, q9
> + vmov d19, d7
> + vadd.i64 q3, q0, q7
> + vsub.i64 q5, q5, q11
> + vshr.s64 q11, q3, #26
> + vext.32 d18, d11, d10, #0
> + vand q3, q3, q1
> + vadd.i64 q8, q8, q11
> + vadd.i64 q11, q8, q10
> + vsub.i64 q0, q0, q3
> + vshr.s64 q3, q11, #25
> + vand q11, q11, q4
> + vadd.i64 q3, q6, q3
> + vadd.i64 q6, q3, q7
> + vsub.i64 q8, q8, q11
> + vshr.s64 q11, q6, #26
> + vand q6, q6, q1
> + vadd.i64 q9, q9, q11
> + vadd.i64 d25, d19, d21
> + vsub.i64 q3, q3, q6
> + vshr.s64 d23, d25, #25
> + vand q4, q12, q4
> + vadd.i64 d21, d23, d23
> + vshl.i64 d25, d23, #4
> + vadd.i64 d21, d21, d23
> + vadd.i64 d25, d25, d21
> + vadd.i64 d4, d4, d25
> + vzip.i32 q0, q8
> + vadd.i64 d12, d4, d14
> + add r6, r8, #8
> + vst1.8 d0, [r6, : 64]
> + vsub.i64 d19, d19, d9
> + add r6, r6, #16
> + vst1.8 d16, [r6, : 64]
> + vshr.s64 d22, d12, #26
> + vand q0, q6, q1
> + vadd.i64 d10, d10, d22
> + vzip.i32 q3, q9
> + vsub.i64 d4, d4, d0
> + sub r6, r6, #8
> + vst1.8 d6, [r6, : 64]
> + add r6, r6, #16
> + vst1.8 d18, [r6, : 64]
> + vzip.i32 q2, q5
> + sub r6, r6, #32
> + vst1.8 d4, [r6, : 64]
> + subs r5, r5, #1
> + bhi .Lsquaringloop
> +.Lskipsquaringloop:
> + mov r2, r2
> + add r5, r3, #288
> + add r6, r3, #144
> + vmov.i32 q0, #19
> + vmov.i32 q1, #0
> + vmov.i32 q2, #1
> + vzip.i32 q1, q2
> + vld1.8 {d4-d5}, [r5, : 128]!
> + vld1.8 {d6-d7}, [r5, : 128]!
> + vld1.8 {d9}, [r5, : 64]
> + vld1.8 {d10-d11}, [r2, : 128]!
> + add r5, sp, #384
> + vld1.8 {d12-d13}, [r2, : 128]!
> + vmul.i32 q7, q2, q0
> + vld1.8 {d8}, [r2, : 64]
> + vext.32 d17, d11, d10, #1
> + vmul.i32 q9, q3, q0
> + vext.32 d16, d10, d8, #1
> + vshl.u32 q10, q5, q1
> + vext.32 d22, d14, d4, #1
> + vext.32 d24, d18, d6, #1
> + vshl.u32 q13, q6, q1
> + vshl.u32 d28, d8, d2
> + vrev64.i32 d22, d22
> + vmul.i32 d1, d9, d1
> + vrev64.i32 d24, d24
> + vext.32 d29, d8, d13, #1
> + vext.32 d0, d1, d9, #1
> + vrev64.i32 d0, d0
> + vext.32 d2, d9, d1, #1
> + vext.32 d23, d15, d5, #1
> + vmull.s32 q4, d20, d4
> + vrev64.i32 d23, d23
> + vmlal.s32 q4, d21, d1
> + vrev64.i32 d2, d2
> + vmlal.s32 q4, d26, d19
> + vext.32 d3, d5, d15, #1
> + vmlal.s32 q4, d27, d18
> + vrev64.i32 d3, d3
> + vmlal.s32 q4, d28, d15
> + vext.32 d14, d12, d11, #1
> + vmull.s32 q5, d16, d23
> + vext.32 d15, d13, d12, #1
> + vmlal.s32 q5, d17, d4
> + vst1.8 d8, [r5, : 64]!
> + vmlal.s32 q5, d14, d1
> + vext.32 d12, d9, d8, #0
> + vmlal.s32 q5, d15, d19
> + vmov.i64 d13, #0
> + vmlal.s32 q5, d29, d18
> + vext.32 d25, d19, d7, #1
> + vmlal.s32 q6, d20, d5
> + vrev64.i32 d25, d25
> + vmlal.s32 q6, d21, d4
> + vst1.8 d11, [r5, : 64]!
> + vmlal.s32 q6, d26, d1
> + vext.32 d9, d10, d10, #0
> + vmlal.s32 q6, d27, d19
> + vmov.i64 d8, #0
> + vmlal.s32 q6, d28, d18
> + vmlal.s32 q4, d16, d24
> + vmlal.s32 q4, d17, d5
> + vmlal.s32 q4, d14, d4
> + vst1.8 d12, [r5, : 64]!
> + vmlal.s32 q4, d15, d1
> + vext.32 d10, d13, d12, #0
> + vmlal.s32 q4, d29, d19
> + vmov.i64 d11, #0
> + vmlal.s32 q5, d20, d6
> + vmlal.s32 q5, d21, d5
> + vmlal.s32 q5, d26, d4
> + vext.32 d13, d8, d8, #0
> + vmlal.s32 q5, d27, d1
> + vmov.i64 d12, #0
> + vmlal.s32 q5, d28, d19
> + vst1.8 d9, [r5, : 64]!
> + vmlal.s32 q6, d16, d25
> + vmlal.s32 q6, d17, d6
> + vst1.8 d10, [r5, : 64]
> + vmlal.s32 q6, d14, d5
> + vext.32 d8, d11, d10, #0
> + vmlal.s32 q6, d15, d4
> + vmov.i64 d9, #0
> + vmlal.s32 q6, d29, d1
> + vmlal.s32 q4, d20, d7
> + vmlal.s32 q4, d21, d6
> + vmlal.s32 q4, d26, d5
> + vext.32 d11, d12, d12, #0
> + vmlal.s32 q4, d27, d4
> + vmov.i64 d10, #0
> + vmlal.s32 q4, d28, d1
> + vmlal.s32 q5, d16, d0
> + sub r2, r5, #32
> + vmlal.s32 q5, d17, d7
> + vmlal.s32 q5, d14, d6
> + vext.32 d30, d9, d8, #0
> + vmlal.s32 q5, d15, d5
> + vld1.8 {d31}, [r2, : 64]!
> + vmlal.s32 q5, d29, d4
> + vmlal.s32 q15, d20, d0
> + vext.32 d0, d6, d18, #1
> + vmlal.s32 q15, d21, d25
> + vrev64.i32 d0, d0
> + vmlal.s32 q15, d26, d24
> + vext.32 d1, d7, d19, #1
> + vext.32 d7, d10, d10, #0
> + vmlal.s32 q15, d27, d23
> + vrev64.i32 d1, d1
> + vld1.8 {d6}, [r2, : 64]
> + vmlal.s32 q15, d28, d22
> + vmlal.s32 q3, d16, d4
> + add r2, r2, #24
> + vmlal.s32 q3, d17, d2
> + vext.32 d4, d31, d30, #0
> + vmov d17, d11
> + vmlal.s32 q3, d14, d1
> + vext.32 d11, d13, d13, #0
> + vext.32 d13, d30, d30, #0
> + vmlal.s32 q3, d15, d0
> + vext.32 d1, d8, d8, #0
> + vmlal.s32 q3, d29, d3
> + vld1.8 {d5}, [r2, : 64]
> + sub r2, r2, #16
> + vext.32 d10, d6, d6, #0
> + vmov.i32 q1, #0xffffffff
> + vshl.i64 q4, q1, #25
> + add r5, sp, #480
> + vld1.8 {d14-d15}, [r5, : 128]
> + vadd.i64 q9, q2, q7
> + vshl.i64 q1, q1, #26
> + vshr.s64 q10, q9, #26
> + vld1.8 {d0}, [r2, : 64]!
> + vadd.i64 q5, q5, q10
> + vand q9, q9, q1
> + vld1.8 {d16}, [r2, : 64]!
> + add r2, sp, #496
> + vld1.8 {d20-d21}, [r2, : 128]
> + vadd.i64 q11, q5, q10
> + vsub.i64 q2, q2, q9
> + vshr.s64 q9, q11, #25
> + vext.32 d12, d5, d4, #0
> + vand q11, q11, q4
> + vadd.i64 q0, q0, q9
> + vmov d19, d7
> + vadd.i64 q3, q0, q7
> + vsub.i64 q5, q5, q11
> + vshr.s64 q11, q3, #26
> + vext.32 d18, d11, d10, #0
> + vand q3, q3, q1
> + vadd.i64 q8, q8, q11
> + vadd.i64 q11, q8, q10
> + vsub.i64 q0, q0, q3
> + vshr.s64 q3, q11, #25
> + vand q11, q11, q4
> + vadd.i64 q3, q6, q3
> + vadd.i64 q6, q3, q7
> + vsub.i64 q8, q8, q11
> + vshr.s64 q11, q6, #26
> + vand q6, q6, q1
> + vadd.i64 q9, q9, q11
> + vadd.i64 d25, d19, d21
> + vsub.i64 q3, q3, q6
> + vshr.s64 d23, d25, #25
> + vand q4, q12, q4
> + vadd.i64 d21, d23, d23
> + vshl.i64 d25, d23, #4
> + vadd.i64 d21, d21, d23
> + vadd.i64 d25, d25, d21
> + vadd.i64 d4, d4, d25
> + vzip.i32 q0, q8
> + vadd.i64 d12, d4, d14
> + add r2, r6, #8
> + vst1.8 d0, [r2, : 64]
> + vsub.i64 d19, d19, d9
> + add r2, r2, #16
> + vst1.8 d16, [r2, : 64]
> + vshr.s64 d22, d12, #26
> + vand q0, q6, q1
> + vadd.i64 d10, d10, d22
> + vzip.i32 q3, q9
> + vsub.i64 d4, d4, d0
> + sub r2, r2, #8
> + vst1.8 d6, [r2, : 64]
> + add r2, r2, #16
> + vst1.8 d18, [r2, : 64]
> + vzip.i32 q2, q5
> + sub r2, r2, #32
> + vst1.8 d4, [r2, : 64]
> + cmp r4, #0
> + beq .Lskippostcopy
> + add r2, r3, #144
> + mov r4, r4
> + vld1.8 {d0-d1}, [r2, : 128]!
> + vld1.8 {d2-d3}, [r2, : 128]!
> + vld1.8 {d4}, [r2, : 64]
> + vst1.8 {d0-d1}, [r4, : 128]!
> + vst1.8 {d2-d3}, [r4, : 128]!
> + vst1.8 d4, [r4, : 64]
> +.Lskippostcopy:
> + cmp r1, #1
> + bne .Lskipfinalcopy
> + add r2, r3, #288
> + add r4, r3, #144
> + vld1.8 {d0-d1}, [r2, : 128]!
> + vld1.8 {d2-d3}, [r2, : 128]!
> + vld1.8 {d4}, [r2, : 64]
> + vst1.8 {d0-d1}, [r4, : 128]!
> + vst1.8 {d2-d3}, [r4, : 128]!
> + vst1.8 d4, [r4, : 64]
> +.Lskipfinalcopy:
> + add r1, r1, #1
> + cmp r1, #12
> + blo .Linvertloop
> + add r1, r3, #144
> + ldr r2, [r1], #4
> + ldr r3, [r1], #4
> + ldr r4, [r1], #4
> + ldr r5, [r1], #4
> + ldr r6, [r1], #4
> + ldr r7, [r1], #4
> + ldr r8, [r1], #4
> + ldr r9, [r1], #4
> + ldr r10, [r1], #4
> + ldr r1, [r1]
> + add r11, r1, r1, LSL #4
> + add r11, r11, r1, LSL #1
> + add r11, r11, #16777216
> + mov r11, r11, ASR #25
> + add r11, r11, r2
> + mov r11, r11, ASR #26
> + add r11, r11, r3
> + mov r11, r11, ASR #25
> + add r11, r11, r4
> + mov r11, r11, ASR #26
> + add r11, r11, r5
> + mov r11, r11, ASR #25
> + add r11, r11, r6
> + mov r11, r11, ASR #26
> + add r11, r11, r7
> + mov r11, r11, ASR #25
> + add r11, r11, r8
> + mov r11, r11, ASR #26
> + add r11, r11, r9
> + mov r11, r11, ASR #25
> + add r11, r11, r10
> + mov r11, r11, ASR #26
> + add r11, r11, r1
> + mov r11, r11, ASR #25
> + add r2, r2, r11
> + add r2, r2, r11, LSL #1
> + add r2, r2, r11, LSL #4
> + mov r11, r2, ASR #26
> + add r3, r3, r11
> + sub r2, r2, r11, LSL #26
> + mov r11, r3, ASR #25
> + add r4, r4, r11
> + sub r3, r3, r11, LSL #25
> + mov r11, r4, ASR #26
> + add r5, r5, r11
> + sub r4, r4, r11, LSL #26
> + mov r11, r5, ASR #25
> + add r6, r6, r11
> + sub r5, r5, r11, LSL #25
> + mov r11, r6, ASR #26
> + add r7, r7, r11
> + sub r6, r6, r11, LSL #26
> + mov r11, r7, ASR #25
> + add r8, r8, r11
> + sub r7, r7, r11, LSL #25
> + mov r11, r8, ASR #26
> + add r9, r9, r11
> + sub r8, r8, r11, LSL #26
> + mov r11, r9, ASR #25
> + add r10, r10, r11
> + sub r9, r9, r11, LSL #25
> + mov r11, r10, ASR #26
> + add r1, r1, r11
> + sub r10, r10, r11, LSL #26
> + mov r11, r1, ASR #25
> + sub r1, r1, r11, LSL #25
> + add r2, r2, r3, LSL #26
> + mov r3, r3, LSR #6
> + add r3, r3, r4, LSL #19
> + mov r4, r4, LSR #13
> + add r4, r4, r5, LSL #13
> + mov r5, r5, LSR #19
> + add r5, r5, r6, LSL #6
> + add r6, r7, r8, LSL #25
> + mov r7, r8, LSR #7
> + add r7, r7, r9, LSL #19
> + mov r8, r9, LSR #13
> + add r8, r8, r10, LSL #12
> + mov r9, r10, LSR #20
> + add r1, r9, r1, LSL #6
> + str r2, [r0]
> + str r3, [r0, #4]
> + str r4, [r0, #8]
> + str r5, [r0, #12]
> + str r6, [r0, #16]
> + str r7, [r0, #20]
> + str r8, [r0, #24]
> + str r1, [r0, #28]
> + movw r0, #0
> + mov sp, ip
> + pop {r4-r11, pc}
> +ENDPROC(curve25519_neon)
> +#endif
> diff --git a/lib/zinc/curve25519/curve25519.c b/lib/zinc/curve25519/curve25519.c
> index 32536340d39d..0d5ea97762d4 100644
> --- a/lib/zinc/curve25519/curve25519.c
> +++ b/lib/zinc/curve25519/curve25519.c
> @@ -21,6 +21,8 @@
>
> #if defined(CONFIG_ZINC_ARCH_X86_64)
> #include "curve25519-x86_64-glue.h"
> +#elif defined(CONFIG_ZINC_ARCH_ARM)
> +#include "curve25519-arm-glue.h"
> #else
> void __init curve25519_fpu_init(void)
> {
> --
> 2.19.0
>
* [PATCH net-next v6 19/23] zinc: Curve25519 ARM implementation
2018-10-02 16:59 ` Ard Biesheuvel
@ 2018-10-02 21:35 ` Richard Weinberger
2018-10-03 1:03 ` Jason A. Donenfeld
2018-10-03 3:10 ` Jason A. Donenfeld
2 siblings, 0 replies; 47+ messages in thread
From: Richard Weinberger @ 2018-10-02 21:35 UTC (permalink / raw)
To: linux-arm-kernel
Ard,
On Tue, Oct 2, 2018 at 7:06 PM Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote:
> I guess qhasm means generated code, right?
Yes.
Please see: http://cr.yp.to/qhasm.html
It generates code from a q-file (the "template").
Actually it is a rather powerful tool, but as you noticed, sometimes
the code is not perfect.
--
Thanks,
//richard
* [PATCH net-next v6 19/23] zinc: Curve25519 ARM implementation
2018-10-02 16:59 ` Ard Biesheuvel
2018-10-02 21:35 ` Richard Weinberger
@ 2018-10-03 1:03 ` Jason A. Donenfeld
2018-10-05 15:05 ` D. J. Bernstein
2018-10-03 3:10 ` Jason A. Donenfeld
2 siblings, 1 reply; 47+ messages in thread
From: Jason A. Donenfeld @ 2018-10-03 1:03 UTC (permalink / raw)
To: linux-arm-kernel
(+Dan, Peter in CC. Replying to:
<https://lore.kernel.org/lkml/CAKv+Gu9FLDRLxHReKcveZYHNYerR5Y2pZd9gn-hWrU0jb2KgfA@mail.gmail.com/>
for context.)
Hi Ard,
On Tue, Oct 2, 2018 at 6:59 PM Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote:
> Shouldn't this use the new simd abstraction as well?
Yes, it probably should, thanks.
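To give an idea, I'm thinking of something along these lines for the
ARM glue -- only a sketch, using the simd_get()/simd_use()/simd_put()
helpers from the simd patch earlier in this series, with the exact
calling convention and the rest of the plumbing to be taken as
approximate rather than as the final code:

/* curve25519-arm-glue.h -- rough sketch, not the final version */
#include <asm/hwcap.h>
#include <asm/neon.h>
#include <asm/simd.h>
#include <linux/linkage.h>
#include <linux/types.h>

asmlinkage void curve25519_neon(u8 mypublic[CURVE25519_KEY_SIZE],
                                const u8 secret[CURVE25519_KEY_SIZE],
                                const u8 basepoint[CURVE25519_KEY_SIZE]);

static bool curve25519_use_neon __ro_after_init;

void __init curve25519_fpu_init(void)
{
        /* Decide once at boot whether the NEON path is usable at all. */
        curve25519_use_neon = elf_hwcap & HWCAP_NEON;
}

static inline bool curve25519_arch(u8 mypublic[CURVE25519_KEY_SIZE],
                                   const u8 secret[CURVE25519_KEY_SIZE],
                                   const u8 basepoint[CURVE25519_KEY_SIZE])
{
        simd_context_t simd_context;
        bool used_arch = false;

        simd_get(&simd_context);
        if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && curve25519_use_neon &&
            simd_use(&simd_context)) {
                curve25519_neon(mypublic, secret, basepoint);
                used_arch = true;
        }
        simd_put(&simd_context);
        /* Returning false tells the caller to use the generic C code. */
        return used_arch;
}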
> I guess qhasm means generated code, right?
> Because many of these adds are completely redundant ...
> This looks odd as well.
> Could you elaborate on what qhasm is exactly? And, as with the other
> patches, I would prefer it if we could have your changes as a separate
> patch (although having the qhasm base would be preferred)
Indeed qhasm converts this --
<https://github.com/floodyberry/supercop/blob/master/crypto_scalarmult/curve25519/neon2/scalarmult.pq>
-- into this. It's a thing from Dan (CC'd now) --
<http://cr.yp.to/qhasm.html>. As you've requested, I can layer the
patches to show our changes on top.
> ... you can drop this add
> same here
> and here
> and here
> and here
> and here
> and here
> and here
> redundant add
> I'll stop here - let me just note that this code does not strike me as
> particularly well optimized for in-order cores (such as A7).
> For instance, the sequence
> can be reordered as
> and not have every other instruction depend on the output of the previous one.
> Obviously, the ultimate truth is in the benchmark numbers, but I
> thought I'd mention it anyway.
Yes indeed the output is suboptimal in a lot of places. We can
gradually clean this up -- slowly and carefully over time -- if you
want. I can also look into producing a new implementation within HACL*
so that it's verified. Assurance-wise, though, I feel pretty good
about this implementation considering its origins, its breadth of use
(in BoringSSL), the fuzzing hours it's incurred, and the actual
implementation itself.
Either way, performance-wise, it's really worth having.
For example, on a Cortex-A7, we get these results (according to get_cycles()):
neon: 23142 cycles per call
fiat32: 49136 cycles per call
donna32: 71988 cycles per call
And on a Cortex-A9, we get these results (according to get_cycles()):
neon: 5020 cycles per call
fiat32: 17326 cycles per call
donna32: 28076 cycles per call
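(For what it's worth, the measurement behind the numbers above is
nothing more elaborate than a loop along these lines -- a simplified
sketch rather than the exact harness, with the trial count and
interrupt handling glossed over:)

#include <asm/timex.h>          /* get_cycles(), cycles_t */
#include <linux/kernel.h>
#include <linux/random.h>
#include <zinc/curve25519.h>

enum { TRIALS = 8192 };

static void __init bench_curve25519(void)
{
        u8 pub[CURVE25519_KEY_SIZE], secret[CURVE25519_KEY_SIZE];
        static const u8 basepoint[CURVE25519_KEY_SIZE] = { 9 };
        cycles_t start, end;
        unsigned int i;
        bool ok = true;

        get_random_bytes(secret, sizeof(secret));

        start = get_cycles();
        for (i = 0; i < TRIALS; ++i)
                ok &= curve25519(pub, secret, basepoint);
        end = get_cycles();

        /* Average over TRIALS calls; ok just keeps the calls live. */
        pr_info("curve25519: %llu cycles per call (ok=%d)\n",
                (unsigned long long)(end - start) / TRIALS, ok);
}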
Jason
* [PATCH net-next v6 19/23] zinc: Curve25519 ARM implementation
2018-10-02 16:59 ` Ard Biesheuvel
2018-10-02 21:35 ` Richard Weinberger
2018-10-03 1:03 ` Jason A. Donenfeld
@ 2018-10-03 3:10 ` Jason A. Donenfeld
2 siblings, 0 replies; 47+ messages in thread
From: Jason A. Donenfeld @ 2018-10-03 3:10 UTC (permalink / raw)
To: linux-arm-kernel
On Tue, Oct 2, 2018 at 6:59 PM Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote:
> Could you elaborate on what qhasm is exactly? And, as with the other
> patches, I would prefer it if we could have your changes as a separate
> patch (although having the qhasm base would be preferred)
By the way, as of a few minutes ago, if you look in the development
tree at the commit called "zinc: Curve25519 ARM implementation", that
now shows the diffs to the original, as you requested. I'll probably
obsess over that a little bit more before v7, but if you see anything
gnarly there beforehand, I'd be happy to hear.
Jason
* [PATCH net-next v6 11/23] zinc: import Andy Polyakov's Poly1305 ARM and ARM64 implementations
2018-09-25 14:56 ` [PATCH net-next v6 11/23] zinc: import Andy Polyakov's Poly1305 " Jason A. Donenfeld
@ 2018-10-03 6:12 ` Eric Biggers
2018-10-03 7:58 ` Ard Biesheuvel
0 siblings, 1 reply; 47+ messages in thread
From: Eric Biggers @ 2018-10-03 6:12 UTC (permalink / raw)
To: linux-arm-kernel
On Tue, Sep 25, 2018 at 04:56:10PM +0200, Jason A. Donenfeld wrote:
> These NEON and non-NEON implementations come from Andy Polyakov's
> implementation, and are included here in raw form without modification,
> so that subsequent commits that fix these up for the kernel can see how
> it has changed. This awkward commit splitting has been requested for the
> ARM[64] implementations in particular.
>
> While this is CRYPTOGAMS code, the originating code for this happens to
> be the same as OpenSSL's commit 5bb1cd2292b388263a0cc05392bb99141212aa53
Sorry to ruin the fun, but actually there are no Poly1305 implementations in
CRYPTOGAMS (https://github.com/dot-asm/cryptogams). Nor are there any ChaCha20
implementations.
Andy P., can you please add your Poly1305 and ChaCha20 implementations to the
CRYPTOGAMS repository, so that they have a clear kernel-compatible license?
It would be great if you'd include a kernel-compatible license directly in the
versions in the OpenSSL tree too...
Thanks!
- Eric
* [PATCH net-next v6 11/23] zinc: import Andy Polyakov's Poly1305 ARM and ARM64 implementations
2018-10-03 6:12 ` Eric Biggers
@ 2018-10-03 7:58 ` Ard Biesheuvel
2018-10-03 14:08 ` Jason A. Donenfeld
0 siblings, 1 reply; 47+ messages in thread
From: Ard Biesheuvel @ 2018-10-03 7:58 UTC (permalink / raw)
To: linux-arm-kernel
On 3 October 2018 at 08:12, Eric Biggers <ebiggers@kernel.org> wrote:
> On Tue, Sep 25, 2018 at 04:56:10PM +0200, Jason A. Donenfeld wrote:
>> These NEON and non-NEON implementations come from Andy Polyakov's
>> implementation, and are included here in raw form without modification,
>> so that subsequent commits that fix these up for the kernel can see how
>> it has changed. This awkward commit splitting has been requested for the
>> ARM[64] implementations in particular.
>>
"This awkward commit splitting"
Seriously?!?
So you really think it is fine to import huge chunks of code like this
from other projects without keeping track of the local changes?
>> While this is CRYPTOGAMS code, the originating code for this happens to
>> be the same as OpenSSL's commit 5bb1cd2292b388263a0cc05392bb99141212aa53
>
> Sorry to ruin the fun, but actually there are no Poly1305 implementations in
> CRYPTOGAMS (https://github.com/dot-asm/cryptogams). Nor are there any ChaCha20
> implementations.
>
So was this code taken directly from the OpenSSL project then? If so,
why do the commit messages claim otherwise?
> Andy P., can you please add your Poly1305 and ChaCha20 implementations to the
> CRYPTOGAMS repository, so that they have a clear kernel-compatible license?
>
> It would be great if you'd include a kernel-compatible license directly in the
> versions in the OpenSSL tree too...
>
Yes please.
* [PATCH net-next v6 11/23] zinc: import Andy Polyakov's Poly1305 ARM and ARM64 implementations
2018-10-03 7:58 ` Ard Biesheuvel
@ 2018-10-03 14:08 ` Jason A. Donenfeld
2018-10-03 14:45 ` Jason A. Donenfeld
0 siblings, 1 reply; 47+ messages in thread
From: Jason A. Donenfeld @ 2018-10-03 14:08 UTC (permalink / raw)
To: linux-arm-kernel
Hi Ard,
On Wed, Oct 3, 2018 at 9:58 AM Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote:
> >> it has changed. This awkward commit splitting has been requested for the
> >> ARM[64] implementations in particular.
> >>
>
> "This awkward commit splitting"
Awkward in the sense that only those two commits are doing it, whereas
the rest of the series is not. Not awkward in any other sense that you
seemed to have divined based on your oversized punctuation below.
Fortunately, at your suggestion for v7, I've now done the splitting
for all of the other places, and so I the comment in the dev tree a
few days ago, since it's now done consistently across the patchset.
>
> Seriously?!?
>
> So you really think it is fine to import huge chunks of code like this
> from other projects without keeping track of the local changes?
As explained above, that's not what I meant at all.
Jason
* [PATCH net-next v6 11/23] zinc: import Andy Polyakov's Poly1305 ARM and ARM64 implementations
2018-10-03 14:08 ` Jason A. Donenfeld
@ 2018-10-03 14:45 ` Jason A. Donenfeld
0 siblings, 0 replies; 47+ messages in thread
From: Jason A. Donenfeld @ 2018-10-03 14:45 UTC (permalink / raw)
To: linux-arm-kernel
On Wed, Oct 3, 2018 at 4:08 PM Jason A. Donenfeld <Jason@zx2c4.com> wrote:
> for all of the other places, and so I the comment in the dev tree a
The missing word between "I" and "the" is "extirpated".
* [PATCH net-next v6 19/23] zinc: Curve25519 ARM implementation
2018-10-03 1:03 ` Jason A. Donenfeld
@ 2018-10-05 15:05 ` D. J. Bernstein
2018-10-05 15:16 ` Ard Biesheuvel
2018-10-05 18:40 ` Jason A. Donenfeld
0 siblings, 2 replies; 47+ messages in thread
From: D. J. Bernstein @ 2018-10-05 15:05 UTC (permalink / raw)
To: linux-arm-kernel
For the in-order ARM Cortex-A8 (the target for this code), adjacent
multiply-add instructions forward summands quickly. A simple in-order
dot-product computation has no latency problems, while interleaving
computations, as suggested in this thread, creates problems. Also, on
this microarchitecture, occasional ARM instructions run in parallel with
NEON, so trying to manually eliminate ARM instructions through global
pointer tracking wouldn't gain speed; it would simply create unnecessary
code-maintenance problems.
See https://cr.yp.to/papers.html#neoncrypto for analysis of the
performance of---and remaining bottlenecks in---this code. Further
speedups should be possible on this microarchitecture, but, for anyone
interested in this, I recommend focusing on building a cycle-accurate
simulator (e.g., fixing inaccuracies in the Sobole simulator) first.
Of course, there are other ARM microarchitectures, and there are many
cases where different microarchitectures prefer different optimizations.
The kernel already has boot-time benchmarks for different optimizations
for raid6, and should do the same for crypto code, so that implementors
can focus on each microarchitecture separately rather than living in the
barbaric world of having to choose which CPUs to favor.
---Dan
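[For illustration only: a minimal userspace sketch of the raid6-style selection Dan describes above -- benchmark each candidate implementation once at startup and keep the fastest. The chacha20_impl struct, the candidate names, the stub bodies, and the iteration count are placeholders of mine, not code from this patchset or from lib/raid6.]

/*
 * Benchmark every candidate once, keep the fastest.  Userspace sketch;
 * a kernel version would run at boot instead of in main().
 */
#define _POSIX_C_SOURCE 200112L
#include <stdint.h>
#include <stdio.h>
#include <time.h>

struct chacha20_impl {
	const char *name;
	void (*permute)(uint32_t state[16]);	/* candidate inner loop */
};

/* Stub bodies; real candidates would be the scalar and NEON routines. */
static void permute_generic(uint32_t state[16]) { state[0] += 1; }
static void permute_neon(uint32_t state[16]) { state[0] += 2; }

static const struct chacha20_impl candidates[] = {
	{ "generic", permute_generic },
	{ "neon", permute_neon },
};

static uint64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ull + ts.tv_nsec;
}

int main(void)
{
	uint32_t state[16] = { 0 };
	const struct chacha20_impl *best = NULL;
	uint64_t best_ns = UINT64_MAX;
	size_t i;

	for (i = 0; i < sizeof(candidates) / sizeof(candidates[0]); i++) {
		uint64_t t0 = now_ns(), elapsed;
		int rep;

		for (rep = 0; rep < 1 << 16; rep++)
			candidates[i].permute(state);
		elapsed = now_ns() - t0;
		if (elapsed < best_ns) {
			best_ns = elapsed;
			best = &candidates[i];
		}
	}
	printf("selected %s (%llu ns)\n", best->name,
	       (unsigned long long)best_ns);
	return 0;
}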
* [PATCH net-next v6 19/23] zinc: Curve25519 ARM implementation
2018-10-05 15:05 ` D. J. Bernstein
@ 2018-10-05 15:16 ` Ard Biesheuvel
2018-10-05 18:40 ` Jason A. Donenfeld
1 sibling, 0 replies; 47+ messages in thread
From: Ard Biesheuvel @ 2018-10-05 15:16 UTC (permalink / raw)
To: linux-arm-kernel
On 5 October 2018 at 17:05, D. J. Bernstein <djb@cr.yp.to> wrote:
> For the in-order ARM Cortex-A8 (the target for this code), adjacent
> multiply-add instructions forward summands quickly. A simple in-order
> dot-product computation has no latency problems, while interleaving
> computations, as suggested in this thread, creates problems. Also, on
> this microarchitecture, occasional ARM instructions run in parallel with
> NEON, so trying to manually eliminate ARM instructions through global
> pointer tracking wouldn't gain speed; it would simply create unnecessary
> code-maintenance problems.
>
> See https://cr.yp.to/papers.html#neoncrypto for analysis of the
> performance of---and remaining bottlenecks in---this code. Further
> speedups should be possible on this microarchitecture, but, for anyone
> interested in this, I recommend focusing on building a cycle-accurate
> simulator (e.g., fixing inaccuracies in the Sobole simulator) first.
>
> Of course, there are other ARM microarchitectures, and there are many
> cases where different microarchitectures prefer different optimizations.
> The kernel already has boot-time benchmarks for different optimizations
> for raid6, and should do the same for crypto code, so that implementors
> can focus on each microarchitecture separately rather than living in the
> barbaric world of having to choose which CPUs to favor.
>
Thanks Dan for the insight.
We have already established in a separate discussion that Cortex-A7,
which is the main optimization target for future development, does not
have the microarchitectural peculiarity you are referring to, namely
that ARM instructions are essentially free when interleaved with NEON code.
But I take your point re benchmarking (as I already indicated in my
reply to Jason): if we optimize towards speed, we should ideally reuse
the existing benchmarking infrastructure we have to select the fastest
implementation at runtime. For instance, it turns out that scalar
ChaCha20 is almost as fast as NEON (or even faster?) on A7, and using
NEON in the kernel has some issues of its own.
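[For illustration only: a hypothetical dispatch sketch for the point above -- even where NEON wins the benchmark, a caller still needs the scalar path for contexts in which SIMD cannot be used. use_neon, simd_usable(), and both chacha20_* names are placeholders of mine, not APIs from this patchset.]

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

static bool use_neon;	/* set once, e.g. from a boot-time benchmark */

/* Stand-in for an architecture/context query; always false here. */
static bool simd_usable(void)
{
	return false;
}

static void chacha20_scalar(uint8_t *dst, const uint8_t *src, size_t len)
{
	/* scalar implementation would go here */
	(void)dst; (void)src; (void)len;
}

static void chacha20_neon(uint8_t *dst, const uint8_t *src, size_t len)
{
	/* NEON implementation would go here */
	(void)dst; (void)src; (void)len;
}

void chacha20(uint8_t *dst, const uint8_t *src, size_t len)
{
	if (use_neon && simd_usable())
		chacha20_neon(dst, src, len);
	else
		chacha20_scalar(dst, src, len);
}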
* [PATCH net-next v6 19/23] zinc: Curve25519 ARM implementation
2018-10-05 15:05 ` D. J. Bernstein
2018-10-05 15:16 ` Ard Biesheuvel
@ 2018-10-05 18:40 ` Jason A. Donenfeld
1 sibling, 0 replies; 47+ messages in thread
From: Jason A. Donenfeld @ 2018-10-05 18:40 UTC (permalink / raw)
To: linux-arm-kernel
Hey Dan,
On Fri, Oct 05, 2018 at 03:05:38PM -0000, D. J. Bernstein wrote:
> Of course, there are other ARM microarchitectures, and there are many
> cases where different microarchitectures prefer different optimizations.
> The kernel already has boot-time benchmarks for different optimizations
> for raid6, and should do the same for crypto code, so that implementors
> can focus on each microarchitecture separately rather than living in the
> barbaric world of having to choose which CPUs to favor.
I've been playing a bit with some code to do this sort of thing,
choosing a set of implementations to enable or disable by trying all the
combinations, and then calculating a quick median. I don't know if I'll
submit that for the initial merge of this patchset -- and in fact all
the current implementations I'm proposing are pretty much okay across
microarchitectures -- but down the line this could be useful as a
mechanism.
Jason
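[For illustration only: a sketch of the "quick median" idea above -- time a candidate several times and compare medians rather than single runs, so a stray interrupt or cache miss does not skew the selection. The sample count and helper names are placeholders of mine, not code from the dev tree.]

#define _POSIX_C_SOURCE 200112L
#include <stdint.h>
#include <stdlib.h>
#include <time.h>

#define SAMPLES 9

static int cmp_u64(const void *a, const void *b)
{
	uint64_t x = *(const uint64_t *)a, y = *(const uint64_t *)b;

	return (x > y) - (x < y);
}

static uint64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ull + ts.tv_nsec;
}

/* Median elapsed time of a caller-supplied workload, in nanoseconds. */
uint64_t median_ns(void (*workload)(void))
{
	uint64_t samples[SAMPLES];
	int i;

	for (i = 0; i < SAMPLES; i++) {
		uint64_t t0 = now_ns();

		workload();
		samples[i] = now_ns() - t0;
	}
	qsort(samples, SAMPLES, sizeof(samples[0]), cmp_u64);
	return samples[SAMPLES / 2];
}

[A combination search like the one described would then call median_ns() once per enabled/disabled set of implementations and keep the set with the lowest median.]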
end of thread, newest message: 2018-10-05 18:40 UTC
Thread overview: 47+ messages
[not found] <20180925145622.29959-1-Jason@zx2c4.com>
2018-09-25 14:56 ` [PATCH net-next v6 05/23] zinc: import Andy Polyakov's ChaCha20 ARM and ARM64 implementations Jason A. Donenfeld
[not found] ` <CAKv+Gu8-EwxFhQSUPxjEvTA5ZPz34RieMokM6CUqwURDr74jtg@mail.gmail.com>
2018-09-28 15:51 ` Ard Biesheuvel
2018-09-28 15:57 ` Jason A. Donenfeld
2018-09-25 14:56 ` [PATCH net-next v6 06/23] zinc: port " Jason A. Donenfeld
2018-09-25 14:56 ` [PATCH net-next v6 07/23] zinc: " Jason A. Donenfeld
2018-09-26 8:59 ` Ard Biesheuvel
2018-09-26 13:32 ` Jason A. Donenfeld
2018-09-26 14:02 ` Ard Biesheuvel
2018-09-26 15:41 ` Jason A. Donenfeld
2018-09-26 16:54 ` Ard Biesheuvel
2018-09-26 17:07 ` Jason A. Donenfeld
2018-09-26 17:37 ` Eric Biggers
2018-09-26 17:46 ` Jason A. Donenfeld
2018-09-26 15:41 ` Ard Biesheuvel
2018-09-26 15:45 ` Jason A. Donenfeld
2018-09-26 15:49 ` Jason A. Donenfeld
2018-09-26 15:51 ` Ard Biesheuvel
2018-09-26 15:58 ` Jason A. Donenfeld
2018-09-27 0:04 ` Jason A. Donenfeld
2018-09-27 13:26 ` Jason A. Donenfeld
2018-09-27 15:19 ` Jason A. Donenfeld
2018-09-27 16:26 ` Andy Lutomirski
2018-09-27 17:06 ` Jason A. Donenfeld
2018-09-26 16:21 ` Andy Lutomirski
2018-09-26 17:03 ` Jason A. Donenfeld
2018-09-26 17:08 ` Ard Biesheuvel
2018-09-26 17:23 ` Andy Lutomirski
2018-09-26 14:36 ` Andrew Lunn
2018-09-26 15:25 ` Jason A. Donenfeld
2018-09-28 16:01 ` Ard Biesheuvel
2018-09-29 2:20 ` Jason A. Donenfeld
2018-09-29 6:16 ` Ard Biesheuvel
2018-09-30 2:33 ` Jason A. Donenfeld
2018-09-25 14:56 ` [PATCH net-next v6 11/23] zinc: import Andy Polyakov's Poly1305 " Jason A. Donenfeld
2018-10-03 6:12 ` Eric Biggers
2018-10-03 7:58 ` Ard Biesheuvel
2018-10-03 14:08 ` Jason A. Donenfeld
2018-10-03 14:45 ` Jason A. Donenfeld
2018-09-25 14:56 ` [PATCH net-next v6 12/23] zinc: " Jason A. Donenfeld
2018-09-25 14:56 ` [PATCH net-next v6 19/23] zinc: Curve25519 ARM implementation Jason A. Donenfeld
2018-10-02 16:59 ` Ard Biesheuvel
2018-10-02 21:35 ` Richard Weinberger
2018-10-03 1:03 ` Jason A. Donenfeld
2018-10-05 15:05 ` D. J. Bernstein
2018-10-05 15:16 ` Ard Biesheuvel
2018-10-05 18:40 ` Jason A. Donenfeld
2018-10-03 3:10 ` Jason A. Donenfeld