* [PATCH net-next v4 09/20] zinc: Poly1305 x86_64 implementation
From: Jason A. Donenfeld @ 2018-09-14 16:22 UTC (permalink / raw)
To: linux-kernel, netdev, linux-crypto, davem, gregkh
Cc: Jason A. Donenfeld, Samuel Neves, Andy Lutomirski,
Jean-Philippe Aumasson, Andy Polyakov, Thomas Gleixner,
Ingo Molnar, x86
In-Reply-To: <20180914162240.7925-1-Jason@zx2c4.com>
This provides AVX, AVX-2, and AVX-512F implementations for Poly1305.
The AVX-512F implementation is disabled on Skylake, due to throttling.
These come from Andy Polyakov's implementation, with the following
modifications from Samuel Neves:
- Some cosmetic changes, like renaming labels to .Lname, constants,
and other Linux conventions.
- CPU feature checking is done in C by the glue code, so that has been
removed from the assembly.
- poly1305_blocks_avx512 jumped to the middle of the poly1305_blocks_avx2
for the final blocks. To appease objtool, the relevant tail avx2 code
was duplicated for the avx512 function.
- The original uses %rbp as a scratch register. However, the kernel
expects %rbp to be a valid frame pointer at any given time in order
to do proper unwinding. Thus we need to alter the code in order to
preserve it. The most straightforward manner in which this was
accomplished was by replacing $d3, formerly %r10, by %rdi, and
replacing %rbp by %r10. Because %rdi, a pointer to the context
structure, does not change and is not used by poly1305_iteration,
it is safe to use it here, and the overhead of saving and restoring
it should be minimal.
- The original hardcodes returns as .byte 0xf3,0xc3, aka "rep ret".
We replace this by "ret". "rep ret" was meant to help with AMD K8
chips, cf. http://repzret.org/p/repzret. It makes no sense to
continue to use this kludge for code that won't even run on ancient
AMD chips.
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Samuel Neves <sneves@dei.uc.pt>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Jean-Philippe Aumasson <jeanphilippe.aumasson@gmail.com>
Cc: Andy Polyakov <appro@openssl.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: x86@kernel.org
---
lib/zinc/Makefile | 4 +
lib/zinc/poly1305/poly1305-x86_64-glue.h | 109 +
lib/zinc/poly1305/poly1305-x86_64.S | 2792 ++++++++++++++++++++++
3 files changed, 2905 insertions(+)
create mode 100644 lib/zinc/poly1305/poly1305-x86_64-glue.h
create mode 100644 lib/zinc/poly1305/poly1305-x86_64.S
diff --git a/lib/zinc/Makefile b/lib/zinc/Makefile
index f37df89a3f87..72112f8ffba1 100644
--- a/lib/zinc/Makefile
+++ b/lib/zinc/Makefile
@@ -25,6 +25,10 @@ endif
ifeq ($(CONFIG_ZINC_POLY1305),y)
zinc-y += poly1305/poly1305.o
+ifeq ($(CONFIG_ZINC_ARCH_X86_64),y)
+zinc-y += poly1305/poly1305-x86_64.o
+CFLAGS_poly1305.o += -include $(srctree)/$(src)/poly1305/poly1305-x86_64-glue.h
+endif
ifeq ($(CONFIG_ZINC_ARCH_ARM),y)
zinc-y += poly1305/poly1305-arm.o
CFLAGS_poly1305.o += -include $(srctree)/$(src)/poly1305/poly1305-arm-glue.h
diff --git a/lib/zinc/poly1305/poly1305-x86_64-glue.h b/lib/zinc/poly1305/poly1305-x86_64-glue.h
new file mode 100644
index 000000000000..4ae028101e7c
--- /dev/null
+++ b/lib/zinc/poly1305/poly1305-x86_64-glue.h
@@ -0,0 +1,109 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <zinc/poly1305.h>
+#include <asm/cpufeature.h>
+#include <asm/processor.h>
+#include <asm/intel-family.h>
+
+asmlinkage void poly1305_init_x86_64(void *ctx,
+ const u8 key[POLY1305_KEY_SIZE]);
+asmlinkage void poly1305_blocks_x86_64(void *ctx, const u8 *inp,
+ const size_t len, const u32 padbit);
+asmlinkage void poly1305_emit_x86_64(void *ctx, u8 mac[POLY1305_MAC_SIZE],
+ const u32 nonce[4]);
+#ifdef CONFIG_AS_AVX
+asmlinkage void poly1305_emit_avx(void *ctx, u8 mac[POLY1305_MAC_SIZE],
+ const u32 nonce[4]);
+asmlinkage void poly1305_blocks_avx(void *ctx, const u8 *inp, const size_t len,
+ const u32 padbit);
+#endif
+#ifdef CONFIG_AS_AVX2
+asmlinkage void poly1305_blocks_avx2(void *ctx, const u8 *inp, const size_t len,
+ const u32 padbit);
+#endif
+#ifdef CONFIG_AS_AVX512
+asmlinkage void poly1305_blocks_avx512(void *ctx, const u8 *inp,
+ const size_t len, const u32 padbit);
+#endif
+
+static bool poly1305_use_avx __ro_after_init;
+static bool poly1305_use_avx2 __ro_after_init;
+static bool poly1305_use_avx512 __ro_after_init;
+
+void __init poly1305_fpu_init(void)
+{
+ poly1305_use_avx =
+ boot_cpu_has(X86_FEATURE_AVX) &&
+ cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
+ poly1305_use_avx2 =
+ boot_cpu_has(X86_FEATURE_AVX) &&
+ boot_cpu_has(X86_FEATURE_AVX2) &&
+ cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
+ poly1305_use_avx512 =
+ boot_cpu_has(X86_FEATURE_AVX) &&
+ boot_cpu_has(X86_FEATURE_AVX2) &&
+ boot_cpu_has(X86_FEATURE_AVX512F) &&
+ cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM |
+ XFEATURE_MASK_AVX512, NULL) &&
+ /* Skylake downclocks unacceptably much when using zmm. */
+ boot_cpu_data.x86_model != INTEL_FAM6_SKYLAKE_X;
+}
+
+static inline bool poly1305_init_arch(void *ctx,
+ const u8 key[POLY1305_KEY_SIZE],
+ simd_context_t simd_context)
+{
+ poly1305_init_x86_64(ctx, key);
+ return true;
+}
+
+static inline bool poly1305_blocks_arch(void *ctx, const u8 *inp,
+ const size_t len, const u32 padbit,
+ simd_context_t simd_context)
+{
+#ifdef CONFIG_AS_AVX512
+ if (poly1305_use_avx512 && simd_context == HAVE_FULL_SIMD)
+ poly1305_blocks_avx512(ctx, inp, len, padbit);
+ else
+#endif
+#ifdef CONFIG_AS_AVX2
+ if (poly1305_use_avx2 && simd_context == HAVE_FULL_SIMD)
+ poly1305_blocks_avx2(ctx, inp, len, padbit);
+ else
+#endif
+#ifdef CONFIG_AS_AVX
+ if (poly1305_use_avx && simd_context == HAVE_FULL_SIMD)
+ poly1305_blocks_avx(ctx, inp, len, padbit);
+ else
+#endif
+ poly1305_blocks_x86_64(ctx, inp, len, padbit);
+ return true;
+}
+
+static inline bool poly1305_emit_arch(void *ctx, u8 mac[POLY1305_MAC_SIZE],
+ const u32 nonce[4],
+ simd_context_t simd_context)
+{
+#ifdef CONFIG_AS_AVX512
+ if (poly1305_use_avx512 && simd_context == HAVE_FULL_SIMD)
+ poly1305_emit_avx(ctx, mac, nonce);
+ else
+#endif
+#ifdef CONFIG_AS_AVX2
+ if (poly1305_use_avx2 && simd_context == HAVE_FULL_SIMD)
+ poly1305_emit_avx(ctx, mac, nonce);
+ else
+#endif
+#ifdef CONFIG_AS_AVX
+ if (poly1305_use_avx && simd_context == HAVE_FULL_SIMD)
+ poly1305_emit_avx(ctx, mac, nonce);
+ else
+#endif
+ poly1305_emit_x86_64(ctx, mac, nonce);
+ return true;
+}
+
+#define HAVE_POLY1305_ARCH_IMPLEMENTATION
diff --git a/lib/zinc/poly1305/poly1305-x86_64.S b/lib/zinc/poly1305/poly1305-x86_64.S
new file mode 100644
index 000000000000..26c852e3c769
--- /dev/null
+++ b/lib/zinc/poly1305/poly1305-x86_64.S
@@ -0,0 +1,2792 @@
+/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
+ *
+ * Copyright (C) 2017 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ * Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
+ *
+ * This is based in part on Andy Polyakov's implementation from CRYPTOGAMS.
+ */
+
+#include <linux/linkage.h>
+
+.section .rodata.cst192.Lconst, "aM", @progbits, 192
+.align 64
+.Lconst:
+.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
+.long 16777216,0,16777216,0,16777216,0,16777216,0
+.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
+.long 2,2,2,3,2,0,2,1
+.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7
+
+.text
+
+.align 32
+ENTRY(poly1305_init_x86_64)
+ xorq %rax,%rax
+ movq %rax,0(%rdi)
+ movq %rax,8(%rdi)
+ movq %rax,16(%rdi)
+
+ cmpq $0,%rsi
+ je .Lno_key
+
+ movq $0x0ffffffc0fffffff,%rax
+ movq $0x0ffffffc0ffffffc,%rcx
+ andq 0(%rsi),%rax
+ andq 8(%rsi),%rcx
+ movq %rax,24(%rdi)
+ movq %rcx,32(%rdi)
+ movl $1,%eax
+.Lno_key:
+ ret
+ENDPROC(poly1305_init_x86_64)
+
+.align 32
+ENTRY(poly1305_blocks_x86_64)
+.Lblocks:
+ shrq $4,%rdx
+ jz .Lno_data
+
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ pushq %rdi
+
+.Lblocks_body:
+
+ movq %rdx,%r15
+
+ movq 24(%rdi),%r11
+ movq 32(%rdi),%r13
+
+ movq 0(%rdi),%r14
+ movq 8(%rdi),%rbx
+ movq 16(%rdi),%r10
+
+ movq %r13,%r12
+ shrq $2,%r13
+ movq %r12,%rax
+ addq %r12,%r13
+ jmp .Loop
+
+.align 32
+.Loop:
+
+ addq 0(%rsi),%r14
+ adcq 8(%rsi),%rbx
+ leaq 16(%rsi),%rsi
+ adcq %rcx,%r10
+ mulq %r14
+ movq %rax,%r9
+ movq %r11,%rax
+ movq %rdx,%rdi
+
+ mulq %r14
+ movq %rax,%r14
+ movq %r11,%rax
+ movq %rdx,%r8
+
+ mulq %rbx
+ addq %rax,%r9
+ movq %r13,%rax
+ adcq %rdx,%rdi
+
+ mulq %rbx
+ movq %r10,%rbx
+ addq %rax,%r14
+ adcq %rdx,%r8
+
+ imulq %r13,%rbx
+ addq %rbx,%r9
+ movq %r8,%rbx
+ adcq $0,%rdi
+
+ imulq %r11,%r10
+ addq %r9,%rbx
+ movq $-4,%rax
+ adcq %r10,%rdi
+
+ andq %rdi,%rax
+ movq %rdi,%r10
+ shrq $2,%rdi
+ andq $3,%r10
+ addq %rdi,%rax
+ addq %rax,%r14
+ adcq $0,%rbx
+ adcq $0,%r10
+
+ movq %r12,%rax
+ decq %r15
+ jnz .Loop
+
+ movq 0(%rsp),%rdi
+
+ movq %r14,0(%rdi)
+ movq %rbx,8(%rdi)
+ movq %r10,16(%rdi)
+
+ movq 8(%rsp),%r15
+ movq 16(%rsp),%r14
+ movq 24(%rsp),%r13
+ movq 32(%rsp),%r12
+ movq 40(%rsp),%rbx
+ leaq 48(%rsp),%rsp
+.Lno_data:
+.Lblocks_epilogue:
+ ret
+ENDPROC(poly1305_blocks_x86_64)
+
+.align 32
+ENTRY(poly1305_emit_x86_64)
+.Lemit:
+ movq 0(%rdi),%r8
+ movq 8(%rdi),%r9
+ movq 16(%rdi),%r10
+
+ movq %r8,%rax
+ addq $5,%r8
+ movq %r9,%rcx
+ adcq $0,%r9
+ adcq $0,%r10
+ shrq $2,%r10
+ cmovnzq %r8,%rax
+ cmovnzq %r9,%rcx
+
+ addq 0(%rdx),%rax
+ adcq 8(%rdx),%rcx
+ movq %rax,0(%rsi)
+ movq %rcx,8(%rsi)
+
+ ret
+ENDPROC(poly1305_emit_x86_64)
+
+.macro __poly1305_block
+ mulq %r14
+ movq %rax,%r9
+ movq %r11,%rax
+ movq %rdx,%rdi
+
+ mulq %r14
+ movq %rax,%r14
+ movq %r11,%rax
+ movq %rdx,%r8
+
+ mulq %rbx
+ addq %rax,%r9
+ movq %r13,%rax
+ adcq %rdx,%rdi
+
+ mulq %rbx
+ movq %r10,%rbx
+ addq %rax,%r14
+ adcq %rdx,%r8
+
+ imulq %r13,%rbx
+ addq %rbx,%r9
+ movq %r8,%rbx
+ adcq $0,%rdi
+
+ imulq %r11,%r10
+ addq %r9,%rbx
+ movq $-4,%rax
+ adcq %r10,%rdi
+
+ andq %rdi,%rax
+ movq %rdi,%r10
+ shrq $2,%rdi
+ andq $3,%r10
+ addq %rdi,%rax
+ addq %rax,%r14
+ adcq $0,%rbx
+ adcq $0,%r10
+.endm
+
+.macro __poly1305_init_avx
+ movq %r11,%r14
+ movq %r12,%rbx
+ xorq %r10,%r10
+
+ leaq 48+64(%rdi),%rdi
+
+ movq %r12,%rax
+ movq %rdi,0(%rsp)
+ __poly1305_block
+ movq 0(%rsp),%rdi
+
+ movl $0x3ffffff,%eax
+ movl $0x3ffffff,%edx
+ movq %r14,%r8
+ andl %r14d,%eax
+ movq %r11,%r9
+ andl %r11d,%edx
+ movl %eax,-64(%rdi)
+ shrq $26,%r8
+ movl %edx,-60(%rdi)
+ shrq $26,%r9
+
+ movl $0x3ffffff,%eax
+ movl $0x3ffffff,%edx
+ andl %r8d,%eax
+ andl %r9d,%edx
+ movl %eax,-48(%rdi)
+ leal (%rax,%rax,4),%eax
+ movl %edx,-44(%rdi)
+ leal (%rdx,%rdx,4),%edx
+ movl %eax,-32(%rdi)
+ shrq $26,%r8
+ movl %edx,-28(%rdi)
+ shrq $26,%r9
+
+ movq %rbx,%rax
+ movq %r12,%rdx
+ shlq $12,%rax
+ shlq $12,%rdx
+ orq %r8,%rax
+ orq %r9,%rdx
+ andl $0x3ffffff,%eax
+ andl $0x3ffffff,%edx
+ movl %eax,-16(%rdi)
+ leal (%rax,%rax,4),%eax
+ movl %edx,-12(%rdi)
+ leal (%rdx,%rdx,4),%edx
+ movl %eax,0(%rdi)
+ movq %rbx,%r8
+ movl %edx,4(%rdi)
+ movq %r12,%r9
+
+ movl $0x3ffffff,%eax
+ movl $0x3ffffff,%edx
+ shrq $14,%r8
+ shrq $14,%r9
+ andl %r8d,%eax
+ andl %r9d,%edx
+ movl %eax,16(%rdi)
+ leal (%rax,%rax,4),%eax
+ movl %edx,20(%rdi)
+ leal (%rdx,%rdx,4),%edx
+ movl %eax,32(%rdi)
+ shrq $26,%r8
+ movl %edx,36(%rdi)
+ shrq $26,%r9
+
+ movq %r10,%rax
+ shlq $24,%rax
+ orq %rax,%r8
+ movl %r8d,48(%rdi)
+ leaq (%r8,%r8,4),%r8
+ movl %r9d,52(%rdi)
+ leaq (%r9,%r9,4),%r9
+ movl %r8d,64(%rdi)
+ movl %r9d,68(%rdi)
+
+ movq %r12,%rax
+ movq %rdi,0(%rsp)
+ __poly1305_block
+ movq 0(%rsp),%rdi
+
+ movl $0x3ffffff,%eax
+ movq %r14,%r8
+ andl %r14d,%eax
+ shrq $26,%r8
+ movl %eax,-52(%rdi)
+
+ movl $0x3ffffff,%edx
+ andl %r8d,%edx
+ movl %edx,-36(%rdi)
+ leal (%rdx,%rdx,4),%edx
+ shrq $26,%r8
+ movl %edx,-20(%rdi)
+
+ movq %rbx,%rax
+ shlq $12,%rax
+ orq %r8,%rax
+ andl $0x3ffffff,%eax
+ movl %eax,-4(%rdi)
+ leal (%rax,%rax,4),%eax
+ movq %rbx,%r8
+ movl %eax,12(%rdi)
+
+ movl $0x3ffffff,%edx
+ shrq $14,%r8
+ andl %r8d,%edx
+ movl %edx,28(%rdi)
+ leal (%rdx,%rdx,4),%edx
+ shrq $26,%r8
+ movl %edx,44(%rdi)
+
+ movq %r10,%rax
+ shlq $24,%rax
+ orq %rax,%r8
+ movl %r8d,60(%rdi)
+ leaq (%r8,%r8,4),%r8
+ movl %r8d,76(%rdi)
+
+ movq %r12,%rax
+ movq %rdi,0(%rsp)
+ __poly1305_block
+ movq 0(%rsp),%rdi
+
+ movl $0x3ffffff,%eax
+ movq %r14,%r8
+ andl %r14d,%eax
+ shrq $26,%r8
+ movl %eax,-56(%rdi)
+
+ movl $0x3ffffff,%edx
+ andl %r8d,%edx
+ movl %edx,-40(%rdi)
+ leal (%rdx,%rdx,4),%edx
+ shrq $26,%r8
+ movl %edx,-24(%rdi)
+
+ movq %rbx,%rax
+ shlq $12,%rax
+ orq %r8,%rax
+ andl $0x3ffffff,%eax
+ movl %eax,-8(%rdi)
+ leal (%rax,%rax,4),%eax
+ movq %rbx,%r8
+ movl %eax,8(%rdi)
+
+ movl $0x3ffffff,%edx
+ shrq $14,%r8
+ andl %r8d,%edx
+ movl %edx,24(%rdi)
+ leal (%rdx,%rdx,4),%edx
+ shrq $26,%r8
+ movl %edx,40(%rdi)
+
+ movq %r10,%rax
+ shlq $24,%rax
+ orq %rax,%r8
+ movl %r8d,56(%rdi)
+ leaq (%r8,%r8,4),%r8
+ movl %r8d,72(%rdi)
+
+ leaq -48-64(%rdi),%rdi
+.endm
+
+#ifdef CONFIG_AS_AVX
+.align 32
+ENTRY(poly1305_blocks_avx)
+
+ movl 20(%rdi),%r8d
+ cmpq $128,%rdx
+ jae .Lblocks_avx
+ testl %r8d,%r8d
+ jz .Lblocks
+
+.Lblocks_avx:
+ andq $-16,%rdx
+ jz .Lno_data_avx
+
+ vzeroupper
+
+ testl %r8d,%r8d
+ jz .Lbase2_64_avx
+
+ testq $31,%rdx
+ jz .Leven_avx
+
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ pushq %rdi
+
+.Lblocks_avx_body:
+
+ movq %rdx,%r15
+
+ movq 0(%rdi),%r8
+ movq 8(%rdi),%r9
+ movl 16(%rdi),%r10d
+
+ movq 24(%rdi),%r11
+ movq 32(%rdi),%r13
+
+
+ movl %r8d,%r14d
+ andq $-2147483648,%r8
+ movq %r9,%r12
+ movl %r9d,%ebx
+ andq $-2147483648,%r9
+
+ shrq $6,%r8
+ shlq $52,%r12
+ addq %r8,%r14
+ shrq $12,%rbx
+ shrq $18,%r9
+ addq %r12,%r14
+ adcq %r9,%rbx
+
+ movq %r10,%r8
+ shlq $40,%r8
+ shrq $24,%r10
+ addq %r8,%rbx
+ adcq $0,%r10
+
+ movq $-4,%r9
+ movq %r10,%r8
+ andq %r10,%r9
+ shrq $2,%r8
+ andq $3,%r10
+ addq %r9,%r8
+ addq %r8,%r14
+ adcq $0,%rbx
+ adcq $0,%r10
+
+ movq %r13,%r12
+ movq %r13,%rax
+ shrq $2,%r13
+ addq %r12,%r13
+
+ addq 0(%rsi),%r14
+ adcq 8(%rsi),%rbx
+ leaq 16(%rsi),%rsi
+ adcq %rcx,%r10
+
+ movq %rdi,0(%rsp)
+ __poly1305_block
+ movq 0(%rsp),%rdi
+
+ testq %rcx,%rcx
+ jz .Lstore_base2_64_avx
+
+
+ movq %r14,%rax
+ movq %r14,%rdx
+ shrq $52,%r14
+ movq %rbx,%r11
+ movq %rbx,%r12
+ shrq $26,%rdx
+ andq $0x3ffffff,%rax
+ shlq $12,%r11
+ andq $0x3ffffff,%rdx
+ shrq $14,%rbx
+ orq %r11,%r14
+ shlq $24,%r10
+ andq $0x3ffffff,%r14
+ shrq $40,%r12
+ andq $0x3ffffff,%rbx
+ orq %r12,%r10
+
+ subq $16,%r15
+ jz .Lstore_base2_26_avx
+
+ vmovd %eax,%xmm0
+ vmovd %edx,%xmm1
+ vmovd %r14d,%xmm2
+ vmovd %ebx,%xmm3
+ vmovd %r10d,%xmm4
+ jmp .Lproceed_avx
+
+.align 32
+.Lstore_base2_64_avx:
+ movq %r14,0(%rdi)
+ movq %rbx,8(%rdi)
+ movq %r10,16(%rdi)
+ jmp .Ldone_avx
+
+.align 16
+.Lstore_base2_26_avx:
+ movl %eax,0(%rdi)
+ movl %edx,4(%rdi)
+ movl %r14d,8(%rdi)
+ movl %ebx,12(%rdi)
+ movl %r10d,16(%rdi)
+.align 16
+.Ldone_avx:
+ movq 8(%rsp),%r15
+ movq 16(%rsp),%r14
+ movq 24(%rsp),%r13
+ movq 32(%rsp),%r12
+ movq 40(%rsp),%rbx
+ leaq 48(%rsp),%rsp
+
+.Lno_data_avx:
+.Lblocks_avx_epilogue:
+ ret
+
+.align 32
+.Lbase2_64_avx:
+
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ pushq %rdi
+
+.Lbase2_64_avx_body:
+
+ movq %rdx,%r15
+
+ movq 24(%rdi),%r11
+ movq 32(%rdi),%r13
+
+ movq 0(%rdi),%r14
+ movq 8(%rdi),%rbx
+ movl 16(%rdi),%r10d
+
+ movq %r13,%r12
+ movq %r13,%rax
+ shrq $2,%r13
+ addq %r12,%r13
+
+ testq $31,%rdx
+ jz .Linit_avx
+
+ addq 0(%rsi),%r14
+ adcq 8(%rsi),%rbx
+ leaq 16(%rsi),%rsi
+ adcq %rcx,%r10
+ subq $16,%r15
+
+ movq %rdi,0(%rsp)
+ __poly1305_block
+ movq 0(%rsp),%rdi
+
+.Linit_avx:
+
+ movq %r14,%rax
+ movq %r14,%rdx
+ shrq $52,%r14
+ movq %rbx,%r8
+ movq %rbx,%r9
+ shrq $26,%rdx
+ andq $0x3ffffff,%rax
+ shlq $12,%r8
+ andq $0x3ffffff,%rdx
+ shrq $14,%rbx
+ orq %r8,%r14
+ shlq $24,%r10
+ andq $0x3ffffff,%r14
+ shrq $40,%r9
+ andq $0x3ffffff,%rbx
+ orq %r9,%r10
+
+ vmovd %eax,%xmm0
+ vmovd %edx,%xmm1
+ vmovd %r14d,%xmm2
+ vmovd %ebx,%xmm3
+ vmovd %r10d,%xmm4
+ movl $1,20(%rdi)
+
+ __poly1305_init_avx
+
+.Lproceed_avx:
+ movq %r15,%rdx
+
+ movq 8(%rsp),%r15
+ movq 16(%rsp),%r14
+ movq 24(%rsp),%r13
+ movq 32(%rsp),%r12
+ movq 40(%rsp),%rbx
+ leaq 48(%rsp),%rax
+ leaq 48(%rsp),%rsp
+
+.Lbase2_64_avx_epilogue:
+ jmp .Ldo_avx
+
+
+.align 32
+.Leven_avx:
+ vmovd 0(%rdi),%xmm0
+ vmovd 4(%rdi),%xmm1
+ vmovd 8(%rdi),%xmm2
+ vmovd 12(%rdi),%xmm3
+ vmovd 16(%rdi),%xmm4
+
+.Ldo_avx:
+ leaq 8(%rsp),%r10
+ andq $-32,%rsp
+ subq $8,%rsp
+ leaq -88(%rsp),%r11
+ subq $0x178,%rsp
+ subq $64,%rdx
+ leaq -32(%rsi),%rax
+ cmovcq %rax,%rsi
+
+ vmovdqu 48(%rdi),%xmm14
+ leaq 112(%rdi),%rdi
+ leaq .Lconst(%rip),%rcx
+
+ vmovdqu 32(%rsi),%xmm5
+ vmovdqu 48(%rsi),%xmm6
+ vmovdqa 64(%rcx),%xmm15
+
+ vpsrldq $6,%xmm5,%xmm7
+ vpsrldq $6,%xmm6,%xmm8
+ vpunpckhqdq %xmm6,%xmm5,%xmm9
+ vpunpcklqdq %xmm6,%xmm5,%xmm5
+ vpunpcklqdq %xmm8,%xmm7,%xmm8
+
+ vpsrlq $40,%xmm9,%xmm9
+ vpsrlq $26,%xmm5,%xmm6
+ vpand %xmm15,%xmm5,%xmm5
+ vpsrlq $4,%xmm8,%xmm7
+ vpand %xmm15,%xmm6,%xmm6
+ vpsrlq $30,%xmm8,%xmm8
+ vpand %xmm15,%xmm7,%xmm7
+ vpand %xmm15,%xmm8,%xmm8
+ vpor 32(%rcx),%xmm9,%xmm9
+
+ jbe .Lskip_loop_avx
+
+
+ vmovdqu -48(%rdi),%xmm11
+ vmovdqu -32(%rdi),%xmm12
+ vpshufd $0xEE,%xmm14,%xmm13
+ vpshufd $0x44,%xmm14,%xmm10
+ vmovdqa %xmm13,-144(%r11)
+ vmovdqa %xmm10,0(%rsp)
+ vpshufd $0xEE,%xmm11,%xmm14
+ vmovdqu -16(%rdi),%xmm10
+ vpshufd $0x44,%xmm11,%xmm11
+ vmovdqa %xmm14,-128(%r11)
+ vmovdqa %xmm11,16(%rsp)
+ vpshufd $0xEE,%xmm12,%xmm13
+ vmovdqu 0(%rdi),%xmm11
+ vpshufd $0x44,%xmm12,%xmm12
+ vmovdqa %xmm13,-112(%r11)
+ vmovdqa %xmm12,32(%rsp)
+ vpshufd $0xEE,%xmm10,%xmm14
+ vmovdqu 16(%rdi),%xmm12
+ vpshufd $0x44,%xmm10,%xmm10
+ vmovdqa %xmm14,-96(%r11)
+ vmovdqa %xmm10,48(%rsp)
+ vpshufd $0xEE,%xmm11,%xmm13
+ vmovdqu 32(%rdi),%xmm10
+ vpshufd $0x44,%xmm11,%xmm11
+ vmovdqa %xmm13,-80(%r11)
+ vmovdqa %xmm11,64(%rsp)
+ vpshufd $0xEE,%xmm12,%xmm14
+ vmovdqu 48(%rdi),%xmm11
+ vpshufd $0x44,%xmm12,%xmm12
+ vmovdqa %xmm14,-64(%r11)
+ vmovdqa %xmm12,80(%rsp)
+ vpshufd $0xEE,%xmm10,%xmm13
+ vmovdqu 64(%rdi),%xmm12
+ vpshufd $0x44,%xmm10,%xmm10
+ vmovdqa %xmm13,-48(%r11)
+ vmovdqa %xmm10,96(%rsp)
+ vpshufd $0xEE,%xmm11,%xmm14
+ vpshufd $0x44,%xmm11,%xmm11
+ vmovdqa %xmm14,-32(%r11)
+ vmovdqa %xmm11,112(%rsp)
+ vpshufd $0xEE,%xmm12,%xmm13
+ vmovdqa 0(%rsp),%xmm14
+ vpshufd $0x44,%xmm12,%xmm12
+ vmovdqa %xmm13,-16(%r11)
+ vmovdqa %xmm12,128(%rsp)
+
+ jmp .Loop_avx
+
+.align 32
+.Loop_avx:
+
+ vpmuludq %xmm5,%xmm14,%xmm10
+ vpmuludq %xmm6,%xmm14,%xmm11
+ vmovdqa %xmm2,32(%r11)
+ vpmuludq %xmm7,%xmm14,%xmm12
+ vmovdqa 16(%rsp),%xmm2
+ vpmuludq %xmm8,%xmm14,%xmm13
+ vpmuludq %xmm9,%xmm14,%xmm14
+
+ vmovdqa %xmm0,0(%r11)
+ vpmuludq 32(%rsp),%xmm9,%xmm0
+ vmovdqa %xmm1,16(%r11)
+ vpmuludq %xmm8,%xmm2,%xmm1
+ vpaddq %xmm0,%xmm10,%xmm10
+ vpaddq %xmm1,%xmm14,%xmm14
+ vmovdqa %xmm3,48(%r11)
+ vpmuludq %xmm7,%xmm2,%xmm0
+ vpmuludq %xmm6,%xmm2,%xmm1
+ vpaddq %xmm0,%xmm13,%xmm13
+ vmovdqa 48(%rsp),%xmm3
+ vpaddq %xmm1,%xmm12,%xmm12
+ vmovdqa %xmm4,64(%r11)
+ vpmuludq %xmm5,%xmm2,%xmm2
+ vpmuludq %xmm7,%xmm3,%xmm0
+ vpaddq %xmm2,%xmm11,%xmm11
+
+ vmovdqa 64(%rsp),%xmm4
+ vpaddq %xmm0,%xmm14,%xmm14
+ vpmuludq %xmm6,%xmm3,%xmm1
+ vpmuludq %xmm5,%xmm3,%xmm3
+ vpaddq %xmm1,%xmm13,%xmm13
+ vmovdqa 80(%rsp),%xmm2
+ vpaddq %xmm3,%xmm12,%xmm12
+ vpmuludq %xmm9,%xmm4,%xmm0
+ vpmuludq %xmm8,%xmm4,%xmm4
+ vpaddq %xmm0,%xmm11,%xmm11
+ vmovdqa 96(%rsp),%xmm3
+ vpaddq %xmm4,%xmm10,%xmm10
+
+ vmovdqa 128(%rsp),%xmm4
+ vpmuludq %xmm6,%xmm2,%xmm1
+ vpmuludq %xmm5,%xmm2,%xmm2
+ vpaddq %xmm1,%xmm14,%xmm14
+ vpaddq %xmm2,%xmm13,%xmm13
+ vpmuludq %xmm9,%xmm3,%xmm0
+ vpmuludq %xmm8,%xmm3,%xmm1
+ vpaddq %xmm0,%xmm12,%xmm12
+ vmovdqu 0(%rsi),%xmm0
+ vpaddq %xmm1,%xmm11,%xmm11
+ vpmuludq %xmm7,%xmm3,%xmm3
+ vpmuludq %xmm7,%xmm4,%xmm7
+ vpaddq %xmm3,%xmm10,%xmm10
+
+ vmovdqu 16(%rsi),%xmm1
+ vpaddq %xmm7,%xmm11,%xmm11
+ vpmuludq %xmm8,%xmm4,%xmm8
+ vpmuludq %xmm9,%xmm4,%xmm9
+ vpsrldq $6,%xmm0,%xmm2
+ vpaddq %xmm8,%xmm12,%xmm12
+ vpaddq %xmm9,%xmm13,%xmm13
+ vpsrldq $6,%xmm1,%xmm3
+ vpmuludq 112(%rsp),%xmm5,%xmm9
+ vpmuludq %xmm6,%xmm4,%xmm5
+ vpunpckhqdq %xmm1,%xmm0,%xmm4
+ vpaddq %xmm9,%xmm14,%xmm14
+ vmovdqa -144(%r11),%xmm9
+ vpaddq %xmm5,%xmm10,%xmm10
+
+ vpunpcklqdq %xmm1,%xmm0,%xmm0
+ vpunpcklqdq %xmm3,%xmm2,%xmm3
+
+
+ vpsrldq $5,%xmm4,%xmm4
+ vpsrlq $26,%xmm0,%xmm1
+ vpand %xmm15,%xmm0,%xmm0
+ vpsrlq $4,%xmm3,%xmm2
+ vpand %xmm15,%xmm1,%xmm1
+ vpand 0(%rcx),%xmm4,%xmm4
+ vpsrlq $30,%xmm3,%xmm3
+ vpand %xmm15,%xmm2,%xmm2
+ vpand %xmm15,%xmm3,%xmm3
+ vpor 32(%rcx),%xmm4,%xmm4
+
+ vpaddq 0(%r11),%xmm0,%xmm0
+ vpaddq 16(%r11),%xmm1,%xmm1
+ vpaddq 32(%r11),%xmm2,%xmm2
+ vpaddq 48(%r11),%xmm3,%xmm3
+ vpaddq 64(%r11),%xmm4,%xmm4
+
+ leaq 32(%rsi),%rax
+ leaq 64(%rsi),%rsi
+ subq $64,%rdx
+ cmovcq %rax,%rsi
+
+ vpmuludq %xmm0,%xmm9,%xmm5
+ vpmuludq %xmm1,%xmm9,%xmm6
+ vpaddq %xmm5,%xmm10,%xmm10
+ vpaddq %xmm6,%xmm11,%xmm11
+ vmovdqa -128(%r11),%xmm7
+ vpmuludq %xmm2,%xmm9,%xmm5
+ vpmuludq %xmm3,%xmm9,%xmm6
+ vpaddq %xmm5,%xmm12,%xmm12
+ vpaddq %xmm6,%xmm13,%xmm13
+ vpmuludq %xmm4,%xmm9,%xmm9
+ vpmuludq -112(%r11),%xmm4,%xmm5
+ vpaddq %xmm9,%xmm14,%xmm14
+
+ vpaddq %xmm5,%xmm10,%xmm10
+ vpmuludq %xmm2,%xmm7,%xmm6
+ vpmuludq %xmm3,%xmm7,%xmm5
+ vpaddq %xmm6,%xmm13,%xmm13
+ vmovdqa -96(%r11),%xmm8
+ vpaddq %xmm5,%xmm14,%xmm14
+ vpmuludq %xmm1,%xmm7,%xmm6
+ vpmuludq %xmm0,%xmm7,%xmm7
+ vpaddq %xmm6,%xmm12,%xmm12
+ vpaddq %xmm7,%xmm11,%xmm11
+
+ vmovdqa -80(%r11),%xmm9
+ vpmuludq %xmm2,%xmm8,%xmm5
+ vpmuludq %xmm1,%xmm8,%xmm6
+ vpaddq %xmm5,%xmm14,%xmm14
+ vpaddq %xmm6,%xmm13,%xmm13
+ vmovdqa -64(%r11),%xmm7
+ vpmuludq %xmm0,%xmm8,%xmm8
+ vpmuludq %xmm4,%xmm9,%xmm5
+ vpaddq %xmm8,%xmm12,%xmm12
+ vpaddq %xmm5,%xmm11,%xmm11
+ vmovdqa -48(%r11),%xmm8
+ vpmuludq %xmm3,%xmm9,%xmm9
+ vpmuludq %xmm1,%xmm7,%xmm6
+ vpaddq %xmm9,%xmm10,%xmm10
+
+ vmovdqa -16(%r11),%xmm9
+ vpaddq %xmm6,%xmm14,%xmm14
+ vpmuludq %xmm0,%xmm7,%xmm7
+ vpmuludq %xmm4,%xmm8,%xmm5
+ vpaddq %xmm7,%xmm13,%xmm13
+ vpaddq %xmm5,%xmm12,%xmm12
+ vmovdqu 32(%rsi),%xmm5
+ vpmuludq %xmm3,%xmm8,%xmm7
+ vpmuludq %xmm2,%xmm8,%xmm8
+ vpaddq %xmm7,%xmm11,%xmm11
+ vmovdqu 48(%rsi),%xmm6
+ vpaddq %xmm8,%xmm10,%xmm10
+
+ vpmuludq %xmm2,%xmm9,%xmm2
+ vpmuludq %xmm3,%xmm9,%xmm3
+ vpsrldq $6,%xmm5,%xmm7
+ vpaddq %xmm2,%xmm11,%xmm11
+ vpmuludq %xmm4,%xmm9,%xmm4
+ vpsrldq $6,%xmm6,%xmm8
+ vpaddq %xmm3,%xmm12,%xmm2
+ vpaddq %xmm4,%xmm13,%xmm3
+ vpmuludq -32(%r11),%xmm0,%xmm4
+ vpmuludq %xmm1,%xmm9,%xmm0
+ vpunpckhqdq %xmm6,%xmm5,%xmm9
+ vpaddq %xmm4,%xmm14,%xmm4
+ vpaddq %xmm0,%xmm10,%xmm0
+
+ vpunpcklqdq %xmm6,%xmm5,%xmm5
+ vpunpcklqdq %xmm8,%xmm7,%xmm8
+
+
+ vpsrldq $5,%xmm9,%xmm9
+ vpsrlq $26,%xmm5,%xmm6
+ vmovdqa 0(%rsp),%xmm14
+ vpand %xmm15,%xmm5,%xmm5
+ vpsrlq $4,%xmm8,%xmm7
+ vpand %xmm15,%xmm6,%xmm6
+ vpand 0(%rcx),%xmm9,%xmm9
+ vpsrlq $30,%xmm8,%xmm8
+ vpand %xmm15,%xmm7,%xmm7
+ vpand %xmm15,%xmm8,%xmm8
+ vpor 32(%rcx),%xmm9,%xmm9
+
+ vpsrlq $26,%xmm3,%xmm13
+ vpand %xmm15,%xmm3,%xmm3
+ vpaddq %xmm13,%xmm4,%xmm4
+
+ vpsrlq $26,%xmm0,%xmm10
+ vpand %xmm15,%xmm0,%xmm0
+ vpaddq %xmm10,%xmm11,%xmm1
+
+ vpsrlq $26,%xmm4,%xmm10
+ vpand %xmm15,%xmm4,%xmm4
+
+ vpsrlq $26,%xmm1,%xmm11
+ vpand %xmm15,%xmm1,%xmm1
+ vpaddq %xmm11,%xmm2,%xmm2
+
+ vpaddq %xmm10,%xmm0,%xmm0
+ vpsllq $2,%xmm10,%xmm10
+ vpaddq %xmm10,%xmm0,%xmm0
+
+ vpsrlq $26,%xmm2,%xmm12
+ vpand %xmm15,%xmm2,%xmm2
+ vpaddq %xmm12,%xmm3,%xmm3
+
+ vpsrlq $26,%xmm0,%xmm10
+ vpand %xmm15,%xmm0,%xmm0
+ vpaddq %xmm10,%xmm1,%xmm1
+
+ vpsrlq $26,%xmm3,%xmm13
+ vpand %xmm15,%xmm3,%xmm3
+ vpaddq %xmm13,%xmm4,%xmm4
+
+ ja .Loop_avx
+
+.Lskip_loop_avx:
+ vpshufd $0x10,%xmm14,%xmm14
+ addq $32,%rdx
+ jnz .Long_tail_avx
+
+ vpaddq %xmm2,%xmm7,%xmm7
+ vpaddq %xmm0,%xmm5,%xmm5
+ vpaddq %xmm1,%xmm6,%xmm6
+ vpaddq %xmm3,%xmm8,%xmm8
+ vpaddq %xmm4,%xmm9,%xmm9
+
+.Long_tail_avx:
+ vmovdqa %xmm2,32(%r11)
+ vmovdqa %xmm0,0(%r11)
+ vmovdqa %xmm1,16(%r11)
+ vmovdqa %xmm3,48(%r11)
+ vmovdqa %xmm4,64(%r11)
+
+ vpmuludq %xmm7,%xmm14,%xmm12
+ vpmuludq %xmm5,%xmm14,%xmm10
+ vpshufd $0x10,-48(%rdi),%xmm2
+ vpmuludq %xmm6,%xmm14,%xmm11
+ vpmuludq %xmm8,%xmm14,%xmm13
+ vpmuludq %xmm9,%xmm14,%xmm14
+
+ vpmuludq %xmm8,%xmm2,%xmm0
+ vpaddq %xmm0,%xmm14,%xmm14
+ vpshufd $0x10,-32(%rdi),%xmm3
+ vpmuludq %xmm7,%xmm2,%xmm1
+ vpaddq %xmm1,%xmm13,%xmm13
+ vpshufd $0x10,-16(%rdi),%xmm4
+ vpmuludq %xmm6,%xmm2,%xmm0
+ vpaddq %xmm0,%xmm12,%xmm12
+ vpmuludq %xmm5,%xmm2,%xmm2
+ vpaddq %xmm2,%xmm11,%xmm11
+ vpmuludq %xmm9,%xmm3,%xmm3
+ vpaddq %xmm3,%xmm10,%xmm10
+
+ vpshufd $0x10,0(%rdi),%xmm2
+ vpmuludq %xmm7,%xmm4,%xmm1
+ vpaddq %xmm1,%xmm14,%xmm14
+ vpmuludq %xmm6,%xmm4,%xmm0
+ vpaddq %xmm0,%xmm13,%xmm13
+ vpshufd $0x10,16(%rdi),%xmm3
+ vpmuludq %xmm5,%xmm4,%xmm4
+ vpaddq %xmm4,%xmm12,%xmm12
+ vpmuludq %xmm9,%xmm2,%xmm1
+ vpaddq %xmm1,%xmm11,%xmm11
+ vpshufd $0x10,32(%rdi),%xmm4
+ vpmuludq %xmm8,%xmm2,%xmm2
+ vpaddq %xmm2,%xmm10,%xmm10
+
+ vpmuludq %xmm6,%xmm3,%xmm0
+ vpaddq %xmm0,%xmm14,%xmm14
+ vpmuludq %xmm5,%xmm3,%xmm3
+ vpaddq %xmm3,%xmm13,%xmm13
+ vpshufd $0x10,48(%rdi),%xmm2
+ vpmuludq %xmm9,%xmm4,%xmm1
+ vpaddq %xmm1,%xmm12,%xmm12
+ vpshufd $0x10,64(%rdi),%xmm3
+ vpmuludq %xmm8,%xmm4,%xmm0
+ vpaddq %xmm0,%xmm11,%xmm11
+ vpmuludq %xmm7,%xmm4,%xmm4
+ vpaddq %xmm4,%xmm10,%xmm10
+
+ vpmuludq %xmm5,%xmm2,%xmm2
+ vpaddq %xmm2,%xmm14,%xmm14
+ vpmuludq %xmm9,%xmm3,%xmm1
+ vpaddq %xmm1,%xmm13,%xmm13
+ vpmuludq %xmm8,%xmm3,%xmm0
+ vpaddq %xmm0,%xmm12,%xmm12
+ vpmuludq %xmm7,%xmm3,%xmm1
+ vpaddq %xmm1,%xmm11,%xmm11
+ vpmuludq %xmm6,%xmm3,%xmm3
+ vpaddq %xmm3,%xmm10,%xmm10
+
+ jz .Lshort_tail_avx
+
+ vmovdqu 0(%rsi),%xmm0
+ vmovdqu 16(%rsi),%xmm1
+
+ vpsrldq $6,%xmm0,%xmm2
+ vpsrldq $6,%xmm1,%xmm3
+ vpunpckhqdq %xmm1,%xmm0,%xmm4
+ vpunpcklqdq %xmm1,%xmm0,%xmm0
+ vpunpcklqdq %xmm3,%xmm2,%xmm3
+
+ vpsrlq $40,%xmm4,%xmm4
+ vpsrlq $26,%xmm0,%xmm1
+ vpand %xmm15,%xmm0,%xmm0
+ vpsrlq $4,%xmm3,%xmm2
+ vpand %xmm15,%xmm1,%xmm1
+ vpsrlq $30,%xmm3,%xmm3
+ vpand %xmm15,%xmm2,%xmm2
+ vpand %xmm15,%xmm3,%xmm3
+ vpor 32(%rcx),%xmm4,%xmm4
+
+ vpshufd $0x32,-64(%rdi),%xmm9
+ vpaddq 0(%r11),%xmm0,%xmm0
+ vpaddq 16(%r11),%xmm1,%xmm1
+ vpaddq 32(%r11),%xmm2,%xmm2
+ vpaddq 48(%r11),%xmm3,%xmm3
+ vpaddq 64(%r11),%xmm4,%xmm4
+
+ vpmuludq %xmm0,%xmm9,%xmm5
+ vpaddq %xmm5,%xmm10,%xmm10
+ vpmuludq %xmm1,%xmm9,%xmm6
+ vpaddq %xmm6,%xmm11,%xmm11
+ vpmuludq %xmm2,%xmm9,%xmm5
+ vpaddq %xmm5,%xmm12,%xmm12
+ vpshufd $0x32,-48(%rdi),%xmm7
+ vpmuludq %xmm3,%xmm9,%xmm6
+ vpaddq %xmm6,%xmm13,%xmm13
+ vpmuludq %xmm4,%xmm9,%xmm9
+ vpaddq %xmm9,%xmm14,%xmm14
+
+ vpmuludq %xmm3,%xmm7,%xmm5
+ vpaddq %xmm5,%xmm14,%xmm14
+ vpshufd $0x32,-32(%rdi),%xmm8
+ vpmuludq %xmm2,%xmm7,%xmm6
+ vpaddq %xmm6,%xmm13,%xmm13
+ vpshufd $0x32,-16(%rdi),%xmm9
+ vpmuludq %xmm1,%xmm7,%xmm5
+ vpaddq %xmm5,%xmm12,%xmm12
+ vpmuludq %xmm0,%xmm7,%xmm7
+ vpaddq %xmm7,%xmm11,%xmm11
+ vpmuludq %xmm4,%xmm8,%xmm8
+ vpaddq %xmm8,%xmm10,%xmm10
+
+ vpshufd $0x32,0(%rdi),%xmm7
+ vpmuludq %xmm2,%xmm9,%xmm6
+ vpaddq %xmm6,%xmm14,%xmm14
+ vpmuludq %xmm1,%xmm9,%xmm5
+ vpaddq %xmm5,%xmm13,%xmm13
+ vpshufd $0x32,16(%rdi),%xmm8
+ vpmuludq %xmm0,%xmm9,%xmm9
+ vpaddq %xmm9,%xmm12,%xmm12
+ vpmuludq %xmm4,%xmm7,%xmm6
+ vpaddq %xmm6,%xmm11,%xmm11
+ vpshufd $0x32,32(%rdi),%xmm9
+ vpmuludq %xmm3,%xmm7,%xmm7
+ vpaddq %xmm7,%xmm10,%xmm10
+
+ vpmuludq %xmm1,%xmm8,%xmm5
+ vpaddq %xmm5,%xmm14,%xmm14
+ vpmuludq %xmm0,%xmm8,%xmm8
+ vpaddq %xmm8,%xmm13,%xmm13
+ vpshufd $0x32,48(%rdi),%xmm7
+ vpmuludq %xmm4,%xmm9,%xmm6
+ vpaddq %xmm6,%xmm12,%xmm12
+ vpshufd $0x32,64(%rdi),%xmm8
+ vpmuludq %xmm3,%xmm9,%xmm5
+ vpaddq %xmm5,%xmm11,%xmm11
+ vpmuludq %xmm2,%xmm9,%xmm9
+ vpaddq %xmm9,%xmm10,%xmm10
+
+ vpmuludq %xmm0,%xmm7,%xmm7
+ vpaddq %xmm7,%xmm14,%xmm14
+ vpmuludq %xmm4,%xmm8,%xmm6
+ vpaddq %xmm6,%xmm13,%xmm13
+ vpmuludq %xmm3,%xmm8,%xmm5
+ vpaddq %xmm5,%xmm12,%xmm12
+ vpmuludq %xmm2,%xmm8,%xmm6
+ vpaddq %xmm6,%xmm11,%xmm11
+ vpmuludq %xmm1,%xmm8,%xmm8
+ vpaddq %xmm8,%xmm10,%xmm10
+
+.Lshort_tail_avx:
+
+ vpsrldq $8,%xmm14,%xmm9
+ vpsrldq $8,%xmm13,%xmm8
+ vpsrldq $8,%xmm11,%xmm6
+ vpsrldq $8,%xmm10,%xmm5
+ vpsrldq $8,%xmm12,%xmm7
+ vpaddq %xmm8,%xmm13,%xmm13
+ vpaddq %xmm9,%xmm14,%xmm14
+ vpaddq %xmm5,%xmm10,%xmm10
+ vpaddq %xmm6,%xmm11,%xmm11
+ vpaddq %xmm7,%xmm12,%xmm12
+
+ vpsrlq $26,%xmm13,%xmm3
+ vpand %xmm15,%xmm13,%xmm13
+ vpaddq %xmm3,%xmm14,%xmm14
+
+ vpsrlq $26,%xmm10,%xmm0
+ vpand %xmm15,%xmm10,%xmm10
+ vpaddq %xmm0,%xmm11,%xmm11
+
+ vpsrlq $26,%xmm14,%xmm4
+ vpand %xmm15,%xmm14,%xmm14
+
+ vpsrlq $26,%xmm11,%xmm1
+ vpand %xmm15,%xmm11,%xmm11
+ vpaddq %xmm1,%xmm12,%xmm12
+
+ vpaddq %xmm4,%xmm10,%xmm10
+ vpsllq $2,%xmm4,%xmm4
+ vpaddq %xmm4,%xmm10,%xmm10
+
+ vpsrlq $26,%xmm12,%xmm2
+ vpand %xmm15,%xmm12,%xmm12
+ vpaddq %xmm2,%xmm13,%xmm13
+
+ vpsrlq $26,%xmm10,%xmm0
+ vpand %xmm15,%xmm10,%xmm10
+ vpaddq %xmm0,%xmm11,%xmm11
+
+ vpsrlq $26,%xmm13,%xmm3
+ vpand %xmm15,%xmm13,%xmm13
+ vpaddq %xmm3,%xmm14,%xmm14
+
+ vmovd %xmm10,-112(%rdi)
+ vmovd %xmm11,-108(%rdi)
+ vmovd %xmm12,-104(%rdi)
+ vmovd %xmm13,-100(%rdi)
+ vmovd %xmm14,-96(%rdi)
+ leaq -8(%r10),%rsp
+
+ vzeroupper
+ ret
+ENDPROC(poly1305_blocks_avx)
+
+.align 32
+ENTRY(poly1305_emit_avx)
+ cmpl $0,20(%rdi)
+ je .Lemit
+
+ movl 0(%rdi),%eax
+ movl 4(%rdi),%ecx
+ movl 8(%rdi),%r8d
+ movl 12(%rdi),%r11d
+ movl 16(%rdi),%r10d
+
+ shlq $26,%rcx
+ movq %r8,%r9
+ shlq $52,%r8
+ addq %rcx,%rax
+ shrq $12,%r9
+ addq %rax,%r8
+ adcq $0,%r9
+
+ shlq $14,%r11
+ movq %r10,%rax
+ shrq $24,%r10
+ addq %r11,%r9
+ shlq $40,%rax
+ addq %rax,%r9
+ adcq $0,%r10
+
+ movq %r10,%rax
+ movq %r10,%rcx
+ andq $3,%r10
+ shrq $2,%rax
+ andq $-4,%rcx
+ addq %rcx,%rax
+ addq %rax,%r8
+ adcq $0,%r9
+ adcq $0,%r10
+
+ movq %r8,%rax
+ addq $5,%r8
+ movq %r9,%rcx
+ adcq $0,%r9
+ adcq $0,%r10
+ shrq $2,%r10
+ cmovnzq %r8,%rax
+ cmovnzq %r9,%rcx
+
+ addq 0(%rdx),%rax
+ adcq 8(%rdx),%rcx
+ movq %rax,0(%rsi)
+ movq %rcx,8(%rsi)
+
+ ret
+ENDPROC(poly1305_emit_avx)
+#endif /* CONFIG_AS_AVX */
+
+#ifdef CONFIG_AS_AVX2
+.align 32
+ENTRY(poly1305_blocks_avx2)
+
+ movl 20(%rdi),%r8d
+ cmpq $128,%rdx
+ jae .Lblocks_avx2
+ testl %r8d,%r8d
+ jz .Lblocks
+
+.Lblocks_avx2:
+ andq $-16,%rdx
+ jz .Lno_data_avx2
+
+ vzeroupper
+
+ testl %r8d,%r8d
+ jz .Lbase2_64_avx2
+
+ testq $63,%rdx
+ jz .Leven_avx2
+
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ pushq %rdi
+
+.Lblocks_avx2_body:
+
+ movq %rdx,%r15
+
+ movq 0(%rdi),%r8
+ movq 8(%rdi),%r9
+ movl 16(%rdi),%r10d
+
+ movq 24(%rdi),%r11
+ movq 32(%rdi),%r13
+
+
+ movl %r8d,%r14d
+ andq $-2147483648,%r8
+ movq %r9,%r12
+ movl %r9d,%ebx
+ andq $-2147483648,%r9
+
+ shrq $6,%r8
+ shlq $52,%r12
+ addq %r8,%r14
+ shrq $12,%rbx
+ shrq $18,%r9
+ addq %r12,%r14
+ adcq %r9,%rbx
+
+ movq %r10,%r8
+ shlq $40,%r8
+ shrq $24,%r10
+ addq %r8,%rbx
+ adcq $0,%r10
+
+ movq $-4,%r9
+ movq %r10,%r8
+ andq %r10,%r9
+ shrq $2,%r8
+ andq $3,%r10
+ addq %r9,%r8
+ addq %r8,%r14
+ adcq $0,%rbx
+ adcq $0,%r10
+
+ movq %r13,%r12
+ movq %r13,%rax
+ shrq $2,%r13
+ addq %r12,%r13
+
+.Lbase2_26_pre_avx2:
+ addq 0(%rsi),%r14
+ adcq 8(%rsi),%rbx
+ leaq 16(%rsi),%rsi
+ adcq %rcx,%r10
+ subq $16,%r15
+
+ movq %rdi,0(%rsp)
+ __poly1305_block
+ movq 0(%rsp),%rdi
+ movq %r12,%rax
+
+ testq $63,%r15
+ jnz .Lbase2_26_pre_avx2
+
+ testq %rcx,%rcx
+ jz .Lstore_base2_64_avx2
+
+
+ movq %r14,%rax
+ movq %r14,%rdx
+ shrq $52,%r14
+ movq %rbx,%r11
+ movq %rbx,%r12
+ shrq $26,%rdx
+ andq $0x3ffffff,%rax
+ shlq $12,%r11
+ andq $0x3ffffff,%rdx
+ shrq $14,%rbx
+ orq %r11,%r14
+ shlq $24,%r10
+ andq $0x3ffffff,%r14
+ shrq $40,%r12
+ andq $0x3ffffff,%rbx
+ orq %r12,%r10
+
+ testq %r15,%r15
+ jz .Lstore_base2_26_avx2
+
+ vmovd %eax,%xmm0
+ vmovd %edx,%xmm1
+ vmovd %r14d,%xmm2
+ vmovd %ebx,%xmm3
+ vmovd %r10d,%xmm4
+ jmp .Lproceed_avx2
+
+.align 32
+.Lstore_base2_64_avx2:
+ movq %r14,0(%rdi)
+ movq %rbx,8(%rdi)
+ movq %r10,16(%rdi)
+ jmp .Ldone_avx2
+
+.align 16
+.Lstore_base2_26_avx2:
+ movl %eax,0(%rdi)
+ movl %edx,4(%rdi)
+ movl %r14d,8(%rdi)
+ movl %ebx,12(%rdi)
+ movl %r10d,16(%rdi)
+.align 16
+.Ldone_avx2:
+ movq 8(%rsp),%r15
+ movq 16(%rsp),%r14
+ movq 24(%rsp),%r13
+ movq 32(%rsp),%r12
+ movq 40(%rsp),%rbx
+ leaq 48(%rsp),%rsp
+
+.Lno_data_avx2:
+.Lblocks_avx2_epilogue:
+ ret
+
+
+.align 32
+.Lbase2_64_avx2:
+
+
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ pushq %rdi
+
+.Lbase2_64_avx2_body:
+
+ movq %rdx,%r15
+
+ movq 24(%rdi),%r11
+ movq 32(%rdi),%r13
+
+ movq 0(%rdi),%r14
+ movq 8(%rdi),%rbx
+ movl 16(%rdi),%r10d
+
+ movq %r13,%r12
+ movq %r13,%rax
+ shrq $2,%r13
+ addq %r12,%r13
+
+ testq $63,%rdx
+ jz .Linit_avx2
+
+.Lbase2_64_pre_avx2:
+ addq 0(%rsi),%r14
+ adcq 8(%rsi),%rbx
+ leaq 16(%rsi),%rsi
+ adcq %rcx,%r10
+ subq $16,%r15
+
+ movq %rdi,0(%rsp)
+ __poly1305_block
+ movq 0(%rsp),%rdi
+ movq %r12,%rax
+
+ testq $63,%r15
+ jnz .Lbase2_64_pre_avx2
+
+.Linit_avx2:
+
+ movq %r14,%rax
+ movq %r14,%rdx
+ shrq $52,%r14
+ movq %rbx,%r8
+ movq %rbx,%r9
+ shrq $26,%rdx
+ andq $0x3ffffff,%rax
+ shlq $12,%r8
+ andq $0x3ffffff,%rdx
+ shrq $14,%rbx
+ orq %r8,%r14
+ shlq $24,%r10
+ andq $0x3ffffff,%r14
+ shrq $40,%r9
+ andq $0x3ffffff,%rbx
+ orq %r9,%r10
+
+ vmovd %eax,%xmm0
+ vmovd %edx,%xmm1
+ vmovd %r14d,%xmm2
+ vmovd %ebx,%xmm3
+ vmovd %r10d,%xmm4
+ movl $1,20(%rdi)
+
+ __poly1305_init_avx
+
+.Lproceed_avx2:
+ movq %r15,%rdx
+
+ movq 8(%rsp),%r15
+ movq 16(%rsp),%r14
+ movq 24(%rsp),%r13
+ movq 32(%rsp),%r12
+ movq 40(%rsp),%rbx
+ leaq 48(%rsp),%rax
+ leaq 48(%rsp),%rsp
+
+.Lbase2_64_avx2_epilogue:
+ jmp .Ldo_avx2
+
+
+.align 32
+.Leven_avx2:
+
+ vmovd 0(%rdi),%xmm0
+ vmovd 4(%rdi),%xmm1
+ vmovd 8(%rdi),%xmm2
+ vmovd 12(%rdi),%xmm3
+ vmovd 16(%rdi),%xmm4
+
+.Ldo_avx2:
+ leaq 8(%rsp),%r10
+ subq $0x128,%rsp
+ leaq .Lconst(%rip),%rcx
+ leaq 48+64(%rdi),%rdi
+ vmovdqa 96(%rcx),%ymm7
+
+
+ vmovdqu -64(%rdi),%xmm9
+ andq $-512,%rsp
+ vmovdqu -48(%rdi),%xmm10
+ vmovdqu -32(%rdi),%xmm6
+ vmovdqu -16(%rdi),%xmm11
+ vmovdqu 0(%rdi),%xmm12
+ vmovdqu 16(%rdi),%xmm13
+ leaq 144(%rsp),%rax
+ vmovdqu 32(%rdi),%xmm14
+ vpermd %ymm9,%ymm7,%ymm9
+ vmovdqu 48(%rdi),%xmm15
+ vpermd %ymm10,%ymm7,%ymm10
+ vmovdqu 64(%rdi),%xmm5
+ vpermd %ymm6,%ymm7,%ymm6
+ vmovdqa %ymm9,0(%rsp)
+ vpermd %ymm11,%ymm7,%ymm11
+ vmovdqa %ymm10,32-144(%rax)
+ vpermd %ymm12,%ymm7,%ymm12
+ vmovdqa %ymm6,64-144(%rax)
+ vpermd %ymm13,%ymm7,%ymm13
+ vmovdqa %ymm11,96-144(%rax)
+ vpermd %ymm14,%ymm7,%ymm14
+ vmovdqa %ymm12,128-144(%rax)
+ vpermd %ymm15,%ymm7,%ymm15
+ vmovdqa %ymm13,160-144(%rax)
+ vpermd %ymm5,%ymm7,%ymm5
+ vmovdqa %ymm14,192-144(%rax)
+ vmovdqa %ymm15,224-144(%rax)
+ vmovdqa %ymm5,256-144(%rax)
+ vmovdqa 64(%rcx),%ymm5
+
+
+
+ vmovdqu 0(%rsi),%xmm7
+ vmovdqu 16(%rsi),%xmm8
+ vinserti128 $1,32(%rsi),%ymm7,%ymm7
+ vinserti128 $1,48(%rsi),%ymm8,%ymm8
+ leaq 64(%rsi),%rsi
+
+ vpsrldq $6,%ymm7,%ymm9
+ vpsrldq $6,%ymm8,%ymm10
+ vpunpckhqdq %ymm8,%ymm7,%ymm6
+ vpunpcklqdq %ymm10,%ymm9,%ymm9
+ vpunpcklqdq %ymm8,%ymm7,%ymm7
+
+ vpsrlq $30,%ymm9,%ymm10
+ vpsrlq $4,%ymm9,%ymm9
+ vpsrlq $26,%ymm7,%ymm8
+ vpsrlq $40,%ymm6,%ymm6
+ vpand %ymm5,%ymm9,%ymm9
+ vpand %ymm5,%ymm7,%ymm7
+ vpand %ymm5,%ymm8,%ymm8
+ vpand %ymm5,%ymm10,%ymm10
+ vpor 32(%rcx),%ymm6,%ymm6
+
+ vpaddq %ymm2,%ymm9,%ymm2
+ subq $64,%rdx
+ jz .Ltail_avx2
+ jmp .Loop_avx2
+
+.align 32
+.Loop_avx2:
+
+ vpaddq %ymm0,%ymm7,%ymm0
+ vmovdqa 0(%rsp),%ymm7
+ vpaddq %ymm1,%ymm8,%ymm1
+ vmovdqa 32(%rsp),%ymm8
+ vpaddq %ymm3,%ymm10,%ymm3
+ vmovdqa 96(%rsp),%ymm9
+ vpaddq %ymm4,%ymm6,%ymm4
+ vmovdqa 48(%rax),%ymm10
+ vmovdqa 112(%rax),%ymm5
+
+ vpmuludq %ymm2,%ymm7,%ymm13
+ vpmuludq %ymm2,%ymm8,%ymm14
+ vpmuludq %ymm2,%ymm9,%ymm15
+ vpmuludq %ymm2,%ymm10,%ymm11
+ vpmuludq %ymm2,%ymm5,%ymm12
+
+ vpmuludq %ymm0,%ymm8,%ymm6
+ vpmuludq %ymm1,%ymm8,%ymm2
+ vpaddq %ymm6,%ymm12,%ymm12
+ vpaddq %ymm2,%ymm13,%ymm13
+ vpmuludq %ymm3,%ymm8,%ymm6
+ vpmuludq 64(%rsp),%ymm4,%ymm2
+ vpaddq %ymm6,%ymm15,%ymm15
+ vpaddq %ymm2,%ymm11,%ymm11
+ vmovdqa -16(%rax),%ymm8
+
+ vpmuludq %ymm0,%ymm7,%ymm6
+ vpmuludq %ymm1,%ymm7,%ymm2
+ vpaddq %ymm6,%ymm11,%ymm11
+ vpaddq %ymm2,%ymm12,%ymm12
+ vpmuludq %ymm3,%ymm7,%ymm6
+ vpmuludq %ymm4,%ymm7,%ymm2
+ vmovdqu 0(%rsi),%xmm7
+ vpaddq %ymm6,%ymm14,%ymm14
+ vpaddq %ymm2,%ymm15,%ymm15
+ vinserti128 $1,32(%rsi),%ymm7,%ymm7
+
+ vpmuludq %ymm3,%ymm8,%ymm6
+ vpmuludq %ymm4,%ymm8,%ymm2
+ vmovdqu 16(%rsi),%xmm8
+ vpaddq %ymm6,%ymm11,%ymm11
+ vpaddq %ymm2,%ymm12,%ymm12
+ vmovdqa 16(%rax),%ymm2
+ vpmuludq %ymm1,%ymm9,%ymm6
+ vpmuludq %ymm0,%ymm9,%ymm9
+ vpaddq %ymm6,%ymm14,%ymm14
+ vpaddq %ymm9,%ymm13,%ymm13
+ vinserti128 $1,48(%rsi),%ymm8,%ymm8
+ leaq 64(%rsi),%rsi
+
+ vpmuludq %ymm1,%ymm2,%ymm6
+ vpmuludq %ymm0,%ymm2,%ymm2
+ vpsrldq $6,%ymm7,%ymm9
+ vpaddq %ymm6,%ymm15,%ymm15
+ vpaddq %ymm2,%ymm14,%ymm14
+ vpmuludq %ymm3,%ymm10,%ymm6
+ vpmuludq %ymm4,%ymm10,%ymm2
+ vpsrldq $6,%ymm8,%ymm10
+ vpaddq %ymm6,%ymm12,%ymm12
+ vpaddq %ymm2,%ymm13,%ymm13
+ vpunpckhqdq %ymm8,%ymm7,%ymm6
+
+ vpmuludq %ymm3,%ymm5,%ymm3
+ vpmuludq %ymm4,%ymm5,%ymm4
+ vpunpcklqdq %ymm8,%ymm7,%ymm7
+ vpaddq %ymm3,%ymm13,%ymm2
+ vpaddq %ymm4,%ymm14,%ymm3
+ vpunpcklqdq %ymm10,%ymm9,%ymm10
+ vpmuludq 80(%rax),%ymm0,%ymm4
+ vpmuludq %ymm1,%ymm5,%ymm0
+ vmovdqa 64(%rcx),%ymm5
+ vpaddq %ymm4,%ymm15,%ymm4
+ vpaddq %ymm0,%ymm11,%ymm0
+
+ vpsrlq $26,%ymm3,%ymm14
+ vpand %ymm5,%ymm3,%ymm3
+ vpaddq %ymm14,%ymm4,%ymm4
+
+ vpsrlq $26,%ymm0,%ymm11
+ vpand %ymm5,%ymm0,%ymm0
+ vpaddq %ymm11,%ymm12,%ymm1
+
+ vpsrlq $26,%ymm4,%ymm15
+ vpand %ymm5,%ymm4,%ymm4
+
+ vpsrlq $4,%ymm10,%ymm9
+
+ vpsrlq $26,%ymm1,%ymm12
+ vpand %ymm5,%ymm1,%ymm1
+ vpaddq %ymm12,%ymm2,%ymm2
+
+ vpaddq %ymm15,%ymm0,%ymm0
+ vpsllq $2,%ymm15,%ymm15
+ vpaddq %ymm15,%ymm0,%ymm0
+
+ vpand %ymm5,%ymm9,%ymm9
+ vpsrlq $26,%ymm7,%ymm8
+
+ vpsrlq $26,%ymm2,%ymm13
+ vpand %ymm5,%ymm2,%ymm2
+ vpaddq %ymm13,%ymm3,%ymm3
+
+ vpaddq %ymm9,%ymm2,%ymm2
+ vpsrlq $30,%ymm10,%ymm10
+
+ vpsrlq $26,%ymm0,%ymm11
+ vpand %ymm5,%ymm0,%ymm0
+ vpaddq %ymm11,%ymm1,%ymm1
+
+ vpsrlq $40,%ymm6,%ymm6
+
+ vpsrlq $26,%ymm3,%ymm14
+ vpand %ymm5,%ymm3,%ymm3
+ vpaddq %ymm14,%ymm4,%ymm4
+
+ vpand %ymm5,%ymm7,%ymm7
+ vpand %ymm5,%ymm8,%ymm8
+ vpand %ymm5,%ymm10,%ymm10
+ vpor 32(%rcx),%ymm6,%ymm6
+
+ subq $64,%rdx
+ jnz .Loop_avx2
+
+.byte 0x66,0x90
+.Ltail_avx2:
+
+ vpaddq %ymm0,%ymm7,%ymm0
+ vmovdqu 4(%rsp),%ymm7
+ vpaddq %ymm1,%ymm8,%ymm1
+ vmovdqu 36(%rsp),%ymm8
+ vpaddq %ymm3,%ymm10,%ymm3
+ vmovdqu 100(%rsp),%ymm9
+ vpaddq %ymm4,%ymm6,%ymm4
+ vmovdqu 52(%rax),%ymm10
+ vmovdqu 116(%rax),%ymm5
+
+ vpmuludq %ymm2,%ymm7,%ymm13
+ vpmuludq %ymm2,%ymm8,%ymm14
+ vpmuludq %ymm2,%ymm9,%ymm15
+ vpmuludq %ymm2,%ymm10,%ymm11
+ vpmuludq %ymm2,%ymm5,%ymm12
+
+ vpmuludq %ymm0,%ymm8,%ymm6
+ vpmuludq %ymm1,%ymm8,%ymm2
+ vpaddq %ymm6,%ymm12,%ymm12
+ vpaddq %ymm2,%ymm13,%ymm13
+ vpmuludq %ymm3,%ymm8,%ymm6
+ vpmuludq 68(%rsp),%ymm4,%ymm2
+ vpaddq %ymm6,%ymm15,%ymm15
+ vpaddq %ymm2,%ymm11,%ymm11
+
+ vpmuludq %ymm0,%ymm7,%ymm6
+ vpmuludq %ymm1,%ymm7,%ymm2
+ vpaddq %ymm6,%ymm11,%ymm11
+ vmovdqu -12(%rax),%ymm8
+ vpaddq %ymm2,%ymm12,%ymm12
+ vpmuludq %ymm3,%ymm7,%ymm6
+ vpmuludq %ymm4,%ymm7,%ymm2
+ vpaddq %ymm6,%ymm14,%ymm14
+ vpaddq %ymm2,%ymm15,%ymm15
+
+ vpmuludq %ymm3,%ymm8,%ymm6
+ vpmuludq %ymm4,%ymm8,%ymm2
+ vpaddq %ymm6,%ymm11,%ymm11
+ vpaddq %ymm2,%ymm12,%ymm12
+ vmovdqu 20(%rax),%ymm2
+ vpmuludq %ymm1,%ymm9,%ymm6
+ vpmuludq %ymm0,%ymm9,%ymm9
+ vpaddq %ymm6,%ymm14,%ymm14
+ vpaddq %ymm9,%ymm13,%ymm13
+
+ vpmuludq %ymm1,%ymm2,%ymm6
+ vpmuludq %ymm0,%ymm2,%ymm2
+ vpaddq %ymm6,%ymm15,%ymm15
+ vpaddq %ymm2,%ymm14,%ymm14
+ vpmuludq %ymm3,%ymm10,%ymm6
+ vpmuludq %ymm4,%ymm10,%ymm2
+ vpaddq %ymm6,%ymm12,%ymm12
+ vpaddq %ymm2,%ymm13,%ymm13
+
+ vpmuludq %ymm3,%ymm5,%ymm3
+ vpmuludq %ymm4,%ymm5,%ymm4
+ vpaddq %ymm3,%ymm13,%ymm2
+ vpaddq %ymm4,%ymm14,%ymm3
+ vpmuludq 84(%rax),%ymm0,%ymm4
+ vpmuludq %ymm1,%ymm5,%ymm0
+ vmovdqa 64(%rcx),%ymm5
+ vpaddq %ymm4,%ymm15,%ymm4
+ vpaddq %ymm0,%ymm11,%ymm0
+
+ vpsrldq $8,%ymm12,%ymm8
+ vpsrldq $8,%ymm2,%ymm9
+ vpsrldq $8,%ymm3,%ymm10
+ vpsrldq $8,%ymm4,%ymm6
+ vpsrldq $8,%ymm0,%ymm7
+ vpaddq %ymm8,%ymm12,%ymm12
+ vpaddq %ymm9,%ymm2,%ymm2
+ vpaddq %ymm10,%ymm3,%ymm3
+ vpaddq %ymm6,%ymm4,%ymm4
+ vpaddq %ymm7,%ymm0,%ymm0
+
+ vpermq $0x2,%ymm3,%ymm10
+ vpermq $0x2,%ymm4,%ymm6
+ vpermq $0x2,%ymm0,%ymm7
+ vpermq $0x2,%ymm12,%ymm8
+ vpermq $0x2,%ymm2,%ymm9
+ vpaddq %ymm10,%ymm3,%ymm3
+ vpaddq %ymm6,%ymm4,%ymm4
+ vpaddq %ymm7,%ymm0,%ymm0
+ vpaddq %ymm8,%ymm12,%ymm12
+ vpaddq %ymm9,%ymm2,%ymm2
+
+ vpsrlq $26,%ymm3,%ymm14
+ vpand %ymm5,%ymm3,%ymm3
+ vpaddq %ymm14,%ymm4,%ymm4
+
+ vpsrlq $26,%ymm0,%ymm11
+ vpand %ymm5,%ymm0,%ymm0
+ vpaddq %ymm11,%ymm12,%ymm1
+
+ vpsrlq $26,%ymm4,%ymm15
+ vpand %ymm5,%ymm4,%ymm4
+
+ vpsrlq $26,%ymm1,%ymm12
+ vpand %ymm5,%ymm1,%ymm1
+ vpaddq %ymm12,%ymm2,%ymm2
+
+ vpaddq %ymm15,%ymm0,%ymm0
+ vpsllq $2,%ymm15,%ymm15
+ vpaddq %ymm15,%ymm0,%ymm0
+
+ vpsrlq $26,%ymm2,%ymm13
+ vpand %ymm5,%ymm2,%ymm2
+ vpaddq %ymm13,%ymm3,%ymm3
+
+ vpsrlq $26,%ymm0,%ymm11
+ vpand %ymm5,%ymm0,%ymm0
+ vpaddq %ymm11,%ymm1,%ymm1
+
+ vpsrlq $26,%ymm3,%ymm14
+ vpand %ymm5,%ymm3,%ymm3
+ vpaddq %ymm14,%ymm4,%ymm4
+
+ vmovd %xmm0,-112(%rdi)
+ vmovd %xmm1,-108(%rdi)
+ vmovd %xmm2,-104(%rdi)
+ vmovd %xmm3,-100(%rdi)
+ vmovd %xmm4,-96(%rdi)
+ leaq -8(%r10),%rsp
+
+ vzeroupper
+ ret
+
+ENDPROC(poly1305_blocks_avx2)
+#endif /* CONFIG_AS_AVX2 */
+
+#ifdef CONFIG_AS_AVX512
+.align 32
+ENTRY(poly1305_blocks_avx512)
+
+ movl 20(%rdi),%r8d
+ cmpq $128,%rdx
+ jae .Lblocks_avx2_512
+ testl %r8d,%r8d
+ jz .Lblocks
+
+.Lblocks_avx2_512:
+ andq $-16,%rdx
+ jz .Lno_data_avx2_512
+
+ vzeroupper
+
+ testl %r8d,%r8d
+ jz .Lbase2_64_avx2_512
+
+ testq $63,%rdx
+ jz .Leven_avx2_512
+
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ pushq %rdi
+
+.Lblocks_avx2_body_512:
+
+ movq %rdx,%r15
+
+ movq 0(%rdi),%r8
+ movq 8(%rdi),%r9
+ movl 16(%rdi),%r10d
+
+ movq 24(%rdi),%r11
+ movq 32(%rdi),%r13
+
+
+ movl %r8d,%r14d
+ andq $-2147483648,%r8
+ movq %r9,%r12
+ movl %r9d,%ebx
+ andq $-2147483648,%r9
+
+ shrq $6,%r8
+ shlq $52,%r12
+ addq %r8,%r14
+ shrq $12,%rbx
+ shrq $18,%r9
+ addq %r12,%r14
+ adcq %r9,%rbx
+
+ movq %r10,%r8
+ shlq $40,%r8
+ shrq $24,%r10
+ addq %r8,%rbx
+ adcq $0,%r10
+
+ movq $-4,%r9
+ movq %r10,%r8
+ andq %r10,%r9
+ shrq $2,%r8
+ andq $3,%r10
+ addq %r9,%r8
+ addq %r8,%r14
+ adcq $0,%rbx
+ adcq $0,%r10
+
+ movq %r13,%r12
+ movq %r13,%rax
+ shrq $2,%r13
+ addq %r12,%r13
+
+.Lbase2_26_pre_avx2_512:
+ addq 0(%rsi),%r14
+ adcq 8(%rsi),%rbx
+ leaq 16(%rsi),%rsi
+ adcq %rcx,%r10
+ subq $16,%r15
+
+ movq %rdi,0(%rsp)
+ __poly1305_block
+ movq 0(%rsp),%rdi
+ movq %r12,%rax
+
+ testq $63,%r15
+ jnz .Lbase2_26_pre_avx2_512
+
+ testq %rcx,%rcx
+ jz .Lstore_base2_64_avx2_512
+
+
+ movq %r14,%rax
+ movq %r14,%rdx
+ shrq $52,%r14
+ movq %rbx,%r11
+ movq %rbx,%r12
+ shrq $26,%rdx
+ andq $0x3ffffff,%rax
+ shlq $12,%r11
+ andq $0x3ffffff,%rdx
+ shrq $14,%rbx
+ orq %r11,%r14
+ shlq $24,%r10
+ andq $0x3ffffff,%r14
+ shrq $40,%r12
+ andq $0x3ffffff,%rbx
+ orq %r12,%r10
+
+ testq %r15,%r15
+ jz .Lstore_base2_26_avx2_512
+
+ vmovd %eax,%xmm0
+ vmovd %edx,%xmm1
+ vmovd %r14d,%xmm2
+ vmovd %ebx,%xmm3
+ vmovd %r10d,%xmm4
+ jmp .Lproceed_avx2_512
+
+.align 32
+.Lstore_base2_64_avx2_512:
+ movq %r14,0(%rdi)
+ movq %rbx,8(%rdi)
+ movq %r10,16(%rdi)
+ jmp .Ldone_avx2_512
+
+.align 16
+.Lstore_base2_26_avx2_512:
+ movl %eax,0(%rdi)
+ movl %edx,4(%rdi)
+ movl %r14d,8(%rdi)
+ movl %ebx,12(%rdi)
+ movl %r10d,16(%rdi)
+.align 16
+.Ldone_avx2_512:
+ movq 8(%rsp),%r15
+ movq 16(%rsp),%r14
+ movq 24(%rsp),%r13
+ movq 32(%rsp),%r12
+ movq 40(%rsp),%rbx
+ leaq 48(%rsp),%rsp
+
+.Lno_data_avx2_512:
+.Lblocks_avx2_epilogue_512:
+ ret
+
+
+.align 32
+.Lbase2_64_avx2_512:
+
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ pushq %rdi
+
+.Lbase2_64_avx2_body_512:
+
+ movq %rdx,%r15
+
+ movq 24(%rdi),%r11
+ movq 32(%rdi),%r13
+
+ movq 0(%rdi),%r14
+ movq 8(%rdi),%rbx
+ movl 16(%rdi),%r10d
+
+ movq %r13,%r12
+ movq %r13,%rax
+ shrq $2,%r13
+ addq %r12,%r13
+
+ testq $63,%rdx
+ jz .Linit_avx2_512
+
+.Lbase2_64_pre_avx2_512:
+ addq 0(%rsi),%r14
+ adcq 8(%rsi),%rbx
+ leaq 16(%rsi),%rsi
+ adcq %rcx,%r10
+ subq $16,%r15
+
+ movq %rdi,0(%rsp)
+ __poly1305_block
+ movq 0(%rsp),%rdi
+ movq %r12,%rax
+
+ testq $63,%r15
+ jnz .Lbase2_64_pre_avx2_512
+
+.Linit_avx2_512:
+
+ movq %r14,%rax
+ movq %r14,%rdx
+ shrq $52,%r14
+ movq %rbx,%r8
+ movq %rbx,%r9
+ shrq $26,%rdx
+ andq $0x3ffffff,%rax
+ shlq $12,%r8
+ andq $0x3ffffff,%rdx
+ shrq $14,%rbx
+ orq %r8,%r14
+ shlq $24,%r10
+ andq $0x3ffffff,%r14
+ shrq $40,%r9
+ andq $0x3ffffff,%rbx
+ orq %r9,%r10
+
+ vmovd %eax,%xmm0
+ vmovd %edx,%xmm1
+ vmovd %r14d,%xmm2
+ vmovd %ebx,%xmm3
+ vmovd %r10d,%xmm4
+ movl $1,20(%rdi)
+
+ __poly1305_init_avx
+
+.Lproceed_avx2_512:
+ movq %r15,%rdx
+
+ movq 8(%rsp),%r15
+ movq 16(%rsp),%r14
+ movq 24(%rsp),%r13
+ movq 32(%rsp),%r12
+ movq 40(%rsp),%rbx
+ leaq 48(%rsp),%rax
+ leaq 48(%rsp),%rsp
+
+.Lbase2_64_avx2_epilogue_512:
+ jmp .Ldo_avx2_512
+
+
+.align 32
+.Leven_avx2_512:
+
+ vmovd 0(%rdi),%xmm0
+ vmovd 4(%rdi),%xmm1
+ vmovd 8(%rdi),%xmm2
+ vmovd 12(%rdi),%xmm3
+ vmovd 16(%rdi),%xmm4
+
+.Ldo_avx2_512:
+ cmpq $512,%rdx
+ jae .Lblocks_avx512
+.Lskip_avx512:
+ leaq 8(%rsp),%r10
+
+ subq $0x128,%rsp
+ leaq .Lconst(%rip),%rcx
+ leaq 48+64(%rdi),%rdi
+ vmovdqa 96(%rcx),%ymm7
+
+
+ vmovdqu -64(%rdi),%xmm9
+ andq $-512,%rsp
+ vmovdqu -48(%rdi),%xmm10
+ vmovdqu -32(%rdi),%xmm6
+ vmovdqu -16(%rdi),%xmm11
+ vmovdqu 0(%rdi),%xmm12
+ vmovdqu 16(%rdi),%xmm13
+ leaq 144(%rsp),%rax
+ vmovdqu 32(%rdi),%xmm14
+ vpermd %ymm9,%ymm7,%ymm9
+ vmovdqu 48(%rdi),%xmm15
+ vpermd %ymm10,%ymm7,%ymm10
+ vmovdqu 64(%rdi),%xmm5
+ vpermd %ymm6,%ymm7,%ymm6
+ vmovdqa %ymm9,0(%rsp)
+ vpermd %ymm11,%ymm7,%ymm11
+ vmovdqa %ymm10,32-144(%rax)
+ vpermd %ymm12,%ymm7,%ymm12
+ vmovdqa %ymm6,64-144(%rax)
+ vpermd %ymm13,%ymm7,%ymm13
+ vmovdqa %ymm11,96-144(%rax)
+ vpermd %ymm14,%ymm7,%ymm14
+ vmovdqa %ymm12,128-144(%rax)
+ vpermd %ymm15,%ymm7,%ymm15
+ vmovdqa %ymm13,160-144(%rax)
+ vpermd %ymm5,%ymm7,%ymm5
+ vmovdqa %ymm14,192-144(%rax)
+ vmovdqa %ymm15,224-144(%rax)
+ vmovdqa %ymm5,256-144(%rax)
+ vmovdqa 64(%rcx),%ymm5
+
+
+
+ vmovdqu 0(%rsi),%xmm7
+ vmovdqu 16(%rsi),%xmm8
+ vinserti128 $1,32(%rsi),%ymm7,%ymm7
+ vinserti128 $1,48(%rsi),%ymm8,%ymm8
+ leaq 64(%rsi),%rsi
+
+ vpsrldq $6,%ymm7,%ymm9
+ vpsrldq $6,%ymm8,%ymm10
+ vpunpckhqdq %ymm8,%ymm7,%ymm6
+ vpunpcklqdq %ymm10,%ymm9,%ymm9
+ vpunpcklqdq %ymm8,%ymm7,%ymm7
+
+ vpsrlq $30,%ymm9,%ymm10
+ vpsrlq $4,%ymm9,%ymm9
+ vpsrlq $26,%ymm7,%ymm8
+ vpsrlq $40,%ymm6,%ymm6
+ vpand %ymm5,%ymm9,%ymm9
+ vpand %ymm5,%ymm7,%ymm7
+ vpand %ymm5,%ymm8,%ymm8
+ vpand %ymm5,%ymm10,%ymm10
+ vpor 32(%rcx),%ymm6,%ymm6
+
+ vpaddq %ymm2,%ymm9,%ymm2
+ subq $64,%rdx
+ jz .Ltail_avx2_512
+ jmp .Loop_avx2_512
+
+.align 32
+.Loop_avx2_512:
+
+ vpaddq %ymm0,%ymm7,%ymm0
+ vmovdqa 0(%rsp),%ymm7
+ vpaddq %ymm1,%ymm8,%ymm1
+ vmovdqa 32(%rsp),%ymm8
+ vpaddq %ymm3,%ymm10,%ymm3
+ vmovdqa 96(%rsp),%ymm9
+ vpaddq %ymm4,%ymm6,%ymm4
+ vmovdqa 48(%rax),%ymm10
+ vmovdqa 112(%rax),%ymm5
+
+ vpmuludq %ymm2,%ymm7,%ymm13
+ vpmuludq %ymm2,%ymm8,%ymm14
+ vpmuludq %ymm2,%ymm9,%ymm15
+ vpmuludq %ymm2,%ymm10,%ymm11
+ vpmuludq %ymm2,%ymm5,%ymm12
+
+ vpmuludq %ymm0,%ymm8,%ymm6
+ vpmuludq %ymm1,%ymm8,%ymm2
+ vpaddq %ymm6,%ymm12,%ymm12
+ vpaddq %ymm2,%ymm13,%ymm13
+ vpmuludq %ymm3,%ymm8,%ymm6
+ vpmuludq 64(%rsp),%ymm4,%ymm2
+ vpaddq %ymm6,%ymm15,%ymm15
+ vpaddq %ymm2,%ymm11,%ymm11
+ vmovdqa -16(%rax),%ymm8
+
+ vpmuludq %ymm0,%ymm7,%ymm6
+ vpmuludq %ymm1,%ymm7,%ymm2
+ vpaddq %ymm6,%ymm11,%ymm11
+ vpaddq %ymm2,%ymm12,%ymm12
+ vpmuludq %ymm3,%ymm7,%ymm6
+ vpmuludq %ymm4,%ymm7,%ymm2
+ vmovdqu 0(%rsi),%xmm7
+ vpaddq %ymm6,%ymm14,%ymm14
+ vpaddq %ymm2,%ymm15,%ymm15
+ vinserti128 $1,32(%rsi),%ymm7,%ymm7
+
+ vpmuludq %ymm3,%ymm8,%ymm6
+ vpmuludq %ymm4,%ymm8,%ymm2
+ vmovdqu 16(%rsi),%xmm8
+ vpaddq %ymm6,%ymm11,%ymm11
+ vpaddq %ymm2,%ymm12,%ymm12
+ vmovdqa 16(%rax),%ymm2
+ vpmuludq %ymm1,%ymm9,%ymm6
+ vpmuludq %ymm0,%ymm9,%ymm9
+ vpaddq %ymm6,%ymm14,%ymm14
+ vpaddq %ymm9,%ymm13,%ymm13
+ vinserti128 $1,48(%rsi),%ymm8,%ymm8
+ leaq 64(%rsi),%rsi
+
+ vpmuludq %ymm1,%ymm2,%ymm6
+ vpmuludq %ymm0,%ymm2,%ymm2
+ vpsrldq $6,%ymm7,%ymm9
+ vpaddq %ymm6,%ymm15,%ymm15
+ vpaddq %ymm2,%ymm14,%ymm14
+ vpmuludq %ymm3,%ymm10,%ymm6
+ vpmuludq %ymm4,%ymm10,%ymm2
+ vpsrldq $6,%ymm8,%ymm10
+ vpaddq %ymm6,%ymm12,%ymm12
+ vpaddq %ymm2,%ymm13,%ymm13
+ vpunpckhqdq %ymm8,%ymm7,%ymm6
+
+ vpmuludq %ymm3,%ymm5,%ymm3
+ vpmuludq %ymm4,%ymm5,%ymm4
+ vpunpcklqdq %ymm8,%ymm7,%ymm7
+ vpaddq %ymm3,%ymm13,%ymm2
+ vpaddq %ymm4,%ymm14,%ymm3
+ vpunpcklqdq %ymm10,%ymm9,%ymm10
+ vpmuludq 80(%rax),%ymm0,%ymm4
+ vpmuludq %ymm1,%ymm5,%ymm0
+ vmovdqa 64(%rcx),%ymm5
+ vpaddq %ymm4,%ymm15,%ymm4
+ vpaddq %ymm0,%ymm11,%ymm0
+
+ vpsrlq $26,%ymm3,%ymm14
+ vpand %ymm5,%ymm3,%ymm3
+ vpaddq %ymm14,%ymm4,%ymm4
+
+ vpsrlq $26,%ymm0,%ymm11
+ vpand %ymm5,%ymm0,%ymm0
+ vpaddq %ymm11,%ymm12,%ymm1
+
+ vpsrlq $26,%ymm4,%ymm15
+ vpand %ymm5,%ymm4,%ymm4
+
+ vpsrlq $4,%ymm10,%ymm9
+
+ vpsrlq $26,%ymm1,%ymm12
+ vpand %ymm5,%ymm1,%ymm1
+ vpaddq %ymm12,%ymm2,%ymm2
+
+ vpaddq %ymm15,%ymm0,%ymm0
+ vpsllq $2,%ymm15,%ymm15
+ vpaddq %ymm15,%ymm0,%ymm0
+
+ vpand %ymm5,%ymm9,%ymm9
+ vpsrlq $26,%ymm7,%ymm8
+
+ vpsrlq $26,%ymm2,%ymm13
+ vpand %ymm5,%ymm2,%ymm2
+ vpaddq %ymm13,%ymm3,%ymm3
+
+ vpaddq %ymm9,%ymm2,%ymm2
+ vpsrlq $30,%ymm10,%ymm10
+
+ vpsrlq $26,%ymm0,%ymm11
+ vpand %ymm5,%ymm0,%ymm0
+ vpaddq %ymm11,%ymm1,%ymm1
+
+ vpsrlq $40,%ymm6,%ymm6
+
+ vpsrlq $26,%ymm3,%ymm14
+ vpand %ymm5,%ymm3,%ymm3
+ vpaddq %ymm14,%ymm4,%ymm4
+
+ vpand %ymm5,%ymm7,%ymm7
+ vpand %ymm5,%ymm8,%ymm8
+ vpand %ymm5,%ymm10,%ymm10
+ vpor 32(%rcx),%ymm6,%ymm6
+
+ subq $64,%rdx
+ jnz .Loop_avx2_512
+
+.byte 0x66,0x90
+.Ltail_avx2_512:
+
+ vpaddq %ymm0,%ymm7,%ymm0
+ vmovdqu 4(%rsp),%ymm7
+ vpaddq %ymm1,%ymm8,%ymm1
+ vmovdqu 36(%rsp),%ymm8
+ vpaddq %ymm3,%ymm10,%ymm3
+ vmovdqu 100(%rsp),%ymm9
+ vpaddq %ymm4,%ymm6,%ymm4
+ vmovdqu 52(%rax),%ymm10
+ vmovdqu 116(%rax),%ymm5
+
+ vpmuludq %ymm2,%ymm7,%ymm13
+ vpmuludq %ymm2,%ymm8,%ymm14
+ vpmuludq %ymm2,%ymm9,%ymm15
+ vpmuludq %ymm2,%ymm10,%ymm11
+ vpmuludq %ymm2,%ymm5,%ymm12
+
+ vpmuludq %ymm0,%ymm8,%ymm6
+ vpmuludq %ymm1,%ymm8,%ymm2
+ vpaddq %ymm6,%ymm12,%ymm12
+ vpaddq %ymm2,%ymm13,%ymm13
+ vpmuludq %ymm3,%ymm8,%ymm6
+ vpmuludq 68(%rsp),%ymm4,%ymm2
+ vpaddq %ymm6,%ymm15,%ymm15
+ vpaddq %ymm2,%ymm11,%ymm11
+
+ vpmuludq %ymm0,%ymm7,%ymm6
+ vpmuludq %ymm1,%ymm7,%ymm2
+ vpaddq %ymm6,%ymm11,%ymm11
+ vmovdqu -12(%rax),%ymm8
+ vpaddq %ymm2,%ymm12,%ymm12
+ vpmuludq %ymm3,%ymm7,%ymm6
+ vpmuludq %ymm4,%ymm7,%ymm2
+ vpaddq %ymm6,%ymm14,%ymm14
+ vpaddq %ymm2,%ymm15,%ymm15
+
+ vpmuludq %ymm3,%ymm8,%ymm6
+ vpmuludq %ymm4,%ymm8,%ymm2
+ vpaddq %ymm6,%ymm11,%ymm11
+ vpaddq %ymm2,%ymm12,%ymm12
+ vmovdqu 20(%rax),%ymm2
+ vpmuludq %ymm1,%ymm9,%ymm6
+ vpmuludq %ymm0,%ymm9,%ymm9
+ vpaddq %ymm6,%ymm14,%ymm14
+ vpaddq %ymm9,%ymm13,%ymm13
+
+ vpmuludq %ymm1,%ymm2,%ymm6
+ vpmuludq %ymm0,%ymm2,%ymm2
+ vpaddq %ymm6,%ymm15,%ymm15
+ vpaddq %ymm2,%ymm14,%ymm14
+ vpmuludq %ymm3,%ymm10,%ymm6
+ vpmuludq %ymm4,%ymm10,%ymm2
+ vpaddq %ymm6,%ymm12,%ymm12
+ vpaddq %ymm2,%ymm13,%ymm13
+
+ vpmuludq %ymm3,%ymm5,%ymm3
+ vpmuludq %ymm4,%ymm5,%ymm4
+ vpaddq %ymm3,%ymm13,%ymm2
+ vpaddq %ymm4,%ymm14,%ymm3
+ vpmuludq 84(%rax),%ymm0,%ymm4
+ vpmuludq %ymm1,%ymm5,%ymm0
+ vmovdqa 64(%rcx),%ymm5
+ vpaddq %ymm4,%ymm15,%ymm4
+ vpaddq %ymm0,%ymm11,%ymm0
+
+ vpsrldq $8,%ymm12,%ymm8
+ vpsrldq $8,%ymm2,%ymm9
+ vpsrldq $8,%ymm3,%ymm10
+ vpsrldq $8,%ymm4,%ymm6
+ vpsrldq $8,%ymm0,%ymm7
+ vpaddq %ymm8,%ymm12,%ymm12
+ vpaddq %ymm9,%ymm2,%ymm2
+ vpaddq %ymm10,%ymm3,%ymm3
+ vpaddq %ymm6,%ymm4,%ymm4
+ vpaddq %ymm7,%ymm0,%ymm0
+
+ vpermq $0x2,%ymm3,%ymm10
+ vpermq $0x2,%ymm4,%ymm6
+ vpermq $0x2,%ymm0,%ymm7
+ vpermq $0x2,%ymm12,%ymm8
+ vpermq $0x2,%ymm2,%ymm9
+ vpaddq %ymm10,%ymm3,%ymm3
+ vpaddq %ymm6,%ymm4,%ymm4
+ vpaddq %ymm7,%ymm0,%ymm0
+ vpaddq %ymm8,%ymm12,%ymm12
+ vpaddq %ymm9,%ymm2,%ymm2
+
+ vpsrlq $26,%ymm3,%ymm14
+ vpand %ymm5,%ymm3,%ymm3
+ vpaddq %ymm14,%ymm4,%ymm4
+
+ vpsrlq $26,%ymm0,%ymm11
+ vpand %ymm5,%ymm0,%ymm0
+ vpaddq %ymm11,%ymm12,%ymm1
+
+ vpsrlq $26,%ymm4,%ymm15
+ vpand %ymm5,%ymm4,%ymm4
+
+ vpsrlq $26,%ymm1,%ymm12
+ vpand %ymm5,%ymm1,%ymm1
+ vpaddq %ymm12,%ymm2,%ymm2
+
+ vpaddq %ymm15,%ymm0,%ymm0
+ vpsllq $2,%ymm15,%ymm15
+ vpaddq %ymm15,%ymm0,%ymm0
+
+ vpsrlq $26,%ymm2,%ymm13
+ vpand %ymm5,%ymm2,%ymm2
+ vpaddq %ymm13,%ymm3,%ymm3
+
+ vpsrlq $26,%ymm0,%ymm11
+ vpand %ymm5,%ymm0,%ymm0
+ vpaddq %ymm11,%ymm1,%ymm1
+
+ vpsrlq $26,%ymm3,%ymm14
+ vpand %ymm5,%ymm3,%ymm3
+ vpaddq %ymm14,%ymm4,%ymm4
+
+ vmovd %xmm0,-112(%rdi)
+ vmovd %xmm1,-108(%rdi)
+ vmovd %xmm2,-104(%rdi)
+ vmovd %xmm3,-100(%rdi)
+ vmovd %xmm4,-96(%rdi)
+ leaq -8(%r10),%rsp
+
+ vzeroupper
+ ret
+
+.Lblocks_avx512:
+
+ movl $15,%eax
+ kmovw %eax,%k2
+ leaq 8(%rsp),%r10
+
+ subq $0x128,%rsp
+ leaq .Lconst(%rip),%rcx
+ leaq 48+64(%rdi),%rdi
+ vmovdqa 96(%rcx),%ymm9
+
+ vmovdqu32 -64(%rdi),%zmm16{%k2}{z}
+ andq $-512,%rsp
+ vmovdqu32 -48(%rdi),%zmm17{%k2}{z}
+ movq $0x20,%rax
+ vmovdqu32 -32(%rdi),%zmm21{%k2}{z}
+ vmovdqu32 -16(%rdi),%zmm18{%k2}{z}
+ vmovdqu32 0(%rdi),%zmm22{%k2}{z}
+ vmovdqu32 16(%rdi),%zmm19{%k2}{z}
+ vmovdqu32 32(%rdi),%zmm23{%k2}{z}
+ vmovdqu32 48(%rdi),%zmm20{%k2}{z}
+ vmovdqu32 64(%rdi),%zmm24{%k2}{z}
+ vpermd %zmm16,%zmm9,%zmm16
+ vpbroadcastq 64(%rcx),%zmm5
+ vpermd %zmm17,%zmm9,%zmm17
+ vpermd %zmm21,%zmm9,%zmm21
+ vpermd %zmm18,%zmm9,%zmm18
+ vmovdqa64 %zmm16,0(%rsp){%k2}
+ vpsrlq $32,%zmm16,%zmm7
+ vpermd %zmm22,%zmm9,%zmm22
+ vmovdqu64 %zmm17,0(%rsp,%rax,1){%k2}
+ vpsrlq $32,%zmm17,%zmm8
+ vpermd %zmm19,%zmm9,%zmm19
+ vmovdqa64 %zmm21,64(%rsp){%k2}
+ vpermd %zmm23,%zmm9,%zmm23
+ vpermd %zmm20,%zmm9,%zmm20
+ vmovdqu64 %zmm18,64(%rsp,%rax,1){%k2}
+ vpermd %zmm24,%zmm9,%zmm24
+ vmovdqa64 %zmm22,128(%rsp){%k2}
+ vmovdqu64 %zmm19,128(%rsp,%rax,1){%k2}
+ vmovdqa64 %zmm23,192(%rsp){%k2}
+ vmovdqu64 %zmm20,192(%rsp,%rax,1){%k2}
+ vmovdqa64 %zmm24,256(%rsp){%k2}
+
+ vpmuludq %zmm7,%zmm16,%zmm11
+ vpmuludq %zmm7,%zmm17,%zmm12
+ vpmuludq %zmm7,%zmm18,%zmm13
+ vpmuludq %zmm7,%zmm19,%zmm14
+ vpmuludq %zmm7,%zmm20,%zmm15
+ vpsrlq $32,%zmm18,%zmm9
+
+ vpmuludq %zmm8,%zmm24,%zmm25
+ vpmuludq %zmm8,%zmm16,%zmm26
+ vpmuludq %zmm8,%zmm17,%zmm27
+ vpmuludq %zmm8,%zmm18,%zmm28
+ vpmuludq %zmm8,%zmm19,%zmm29
+ vpsrlq $32,%zmm19,%zmm10
+ vpaddq %zmm25,%zmm11,%zmm11
+ vpaddq %zmm26,%zmm12,%zmm12
+ vpaddq %zmm27,%zmm13,%zmm13
+ vpaddq %zmm28,%zmm14,%zmm14
+ vpaddq %zmm29,%zmm15,%zmm15
+
+ vpmuludq %zmm9,%zmm23,%zmm25
+ vpmuludq %zmm9,%zmm24,%zmm26
+ vpmuludq %zmm9,%zmm17,%zmm28
+ vpmuludq %zmm9,%zmm18,%zmm29
+ vpmuludq %zmm9,%zmm16,%zmm27
+ vpsrlq $32,%zmm20,%zmm6
+ vpaddq %zmm25,%zmm11,%zmm11
+ vpaddq %zmm26,%zmm12,%zmm12
+ vpaddq %zmm28,%zmm14,%zmm14
+ vpaddq %zmm29,%zmm15,%zmm15
+ vpaddq %zmm27,%zmm13,%zmm13
+
+ vpmuludq %zmm10,%zmm22,%zmm25
+ vpmuludq %zmm10,%zmm16,%zmm28
+ vpmuludq %zmm10,%zmm17,%zmm29
+ vpmuludq %zmm10,%zmm23,%zmm26
+ vpmuludq %zmm10,%zmm24,%zmm27
+ vpaddq %zmm25,%zmm11,%zmm11
+ vpaddq %zmm28,%zmm14,%zmm14
+ vpaddq %zmm29,%zmm15,%zmm15
+ vpaddq %zmm26,%zmm12,%zmm12
+ vpaddq %zmm27,%zmm13,%zmm13
+
+ vpmuludq %zmm6,%zmm24,%zmm28
+ vpmuludq %zmm6,%zmm16,%zmm29
+ vpmuludq %zmm6,%zmm21,%zmm25
+ vpmuludq %zmm6,%zmm22,%zmm26
+ vpmuludq %zmm6,%zmm23,%zmm27
+ vpaddq %zmm28,%zmm14,%zmm14
+ vpaddq %zmm29,%zmm15,%zmm15
+ vpaddq %zmm25,%zmm11,%zmm11
+ vpaddq %zmm26,%zmm12,%zmm12
+ vpaddq %zmm27,%zmm13,%zmm13
+
+ vmovdqu64 0(%rsi),%zmm10
+ vmovdqu64 64(%rsi),%zmm6
+ leaq 128(%rsi),%rsi
+
+ vpsrlq $26,%zmm14,%zmm28
+ vpandq %zmm5,%zmm14,%zmm14
+ vpaddq %zmm28,%zmm15,%zmm15
+
+ vpsrlq $26,%zmm11,%zmm25
+ vpandq %zmm5,%zmm11,%zmm11
+ vpaddq %zmm25,%zmm12,%zmm12
+
+ vpsrlq $26,%zmm15,%zmm29
+ vpandq %zmm5,%zmm15,%zmm15
+
+ vpsrlq $26,%zmm12,%zmm26
+ vpandq %zmm5,%zmm12,%zmm12
+ vpaddq %zmm26,%zmm13,%zmm13
+
+ vpaddq %zmm29,%zmm11,%zmm11
+ vpsllq $2,%zmm29,%zmm29
+ vpaddq %zmm29,%zmm11,%zmm11
+
+ vpsrlq $26,%zmm13,%zmm27
+ vpandq %zmm5,%zmm13,%zmm13
+ vpaddq %zmm27,%zmm14,%zmm14
+
+ vpsrlq $26,%zmm11,%zmm25
+ vpandq %zmm5,%zmm11,%zmm11
+ vpaddq %zmm25,%zmm12,%zmm12
+
+ vpsrlq $26,%zmm14,%zmm28
+ vpandq %zmm5,%zmm14,%zmm14
+ vpaddq %zmm28,%zmm15,%zmm15
+
+ vpunpcklqdq %zmm6,%zmm10,%zmm7
+ vpunpckhqdq %zmm6,%zmm10,%zmm6
+
+ vmovdqa32 128(%rcx),%zmm25
+ movl $0x7777,%eax
+ kmovw %eax,%k1
+
+ vpermd %zmm16,%zmm25,%zmm16
+ vpermd %zmm17,%zmm25,%zmm17
+ vpermd %zmm18,%zmm25,%zmm18
+ vpermd %zmm19,%zmm25,%zmm19
+ vpermd %zmm20,%zmm25,%zmm20
+
+ vpermd %zmm11,%zmm25,%zmm16{%k1}
+ vpermd %zmm12,%zmm25,%zmm17{%k1}
+ vpermd %zmm13,%zmm25,%zmm18{%k1}
+ vpermd %zmm14,%zmm25,%zmm19{%k1}
+ vpermd %zmm15,%zmm25,%zmm20{%k1}
+
+ vpslld $2,%zmm17,%zmm21
+ vpslld $2,%zmm18,%zmm22
+ vpslld $2,%zmm19,%zmm23
+ vpslld $2,%zmm20,%zmm24
+ vpaddd %zmm17,%zmm21,%zmm21
+ vpaddd %zmm18,%zmm22,%zmm22
+ vpaddd %zmm19,%zmm23,%zmm23
+ vpaddd %zmm20,%zmm24,%zmm24
+
+ vpbroadcastq 32(%rcx),%zmm30
+
+ vpsrlq $52,%zmm7,%zmm9
+ vpsllq $12,%zmm6,%zmm10
+ vporq %zmm10,%zmm9,%zmm9
+ vpsrlq $26,%zmm7,%zmm8
+ vpsrlq $14,%zmm6,%zmm10
+ vpsrlq $40,%zmm6,%zmm6
+ vpandq %zmm5,%zmm9,%zmm9
+ vpandq %zmm5,%zmm7,%zmm7
+
+ vpaddq %zmm2,%zmm9,%zmm2
+ subq $192,%rdx
+ jbe .Ltail_avx512
+ jmp .Loop_avx512
+
+.align 32
+.Loop_avx512:
+
+ vpmuludq %zmm2,%zmm17,%zmm14
+ vpaddq %zmm0,%zmm7,%zmm0
+ vpmuludq %zmm2,%zmm18,%zmm15
+ vpandq %zmm5,%zmm8,%zmm8
+ vpmuludq %zmm2,%zmm23,%zmm11
+ vpandq %zmm5,%zmm10,%zmm10
+ vpmuludq %zmm2,%zmm24,%zmm12
+ vporq %zmm30,%zmm6,%zmm6
+ vpmuludq %zmm2,%zmm16,%zmm13
+ vpaddq %zmm1,%zmm8,%zmm1
+ vpaddq %zmm3,%zmm10,%zmm3
+ vpaddq %zmm4,%zmm6,%zmm4
+
+ vmovdqu64 0(%rsi),%zmm10
+ vmovdqu64 64(%rsi),%zmm6
+ leaq 128(%rsi),%rsi
+ vpmuludq %zmm0,%zmm19,%zmm28
+ vpmuludq %zmm0,%zmm20,%zmm29
+ vpmuludq %zmm0,%zmm16,%zmm25
+ vpmuludq %zmm0,%zmm17,%zmm26
+ vpaddq %zmm28,%zmm14,%zmm14
+ vpaddq %zmm29,%zmm15,%zmm15
+ vpaddq %zmm25,%zmm11,%zmm11
+ vpaddq %zmm26,%zmm12,%zmm12
+
+ vpmuludq %zmm1,%zmm18,%zmm28
+ vpmuludq %zmm1,%zmm19,%zmm29
+ vpmuludq %zmm1,%zmm24,%zmm25
+ vpmuludq %zmm0,%zmm18,%zmm27
+ vpaddq %zmm28,%zmm14,%zmm14
+ vpaddq %zmm29,%zmm15,%zmm15
+ vpaddq %zmm25,%zmm11,%zmm11
+ vpaddq %zmm27,%zmm13,%zmm13
+
+ vpunpcklqdq %zmm6,%zmm10,%zmm7
+ vpunpckhqdq %zmm6,%zmm10,%zmm6
+
+ vpmuludq %zmm3,%zmm16,%zmm28
+ vpmuludq %zmm3,%zmm17,%zmm29
+ vpmuludq %zmm1,%zmm16,%zmm26
+ vpmuludq %zmm1,%zmm17,%zmm27
+ vpaddq %zmm28,%zmm14,%zmm14
+ vpaddq %zmm29,%zmm15,%zmm15
+ vpaddq %zmm26,%zmm12,%zmm12
+ vpaddq %zmm27,%zmm13,%zmm13
+
+ vpmuludq %zmm4,%zmm24,%zmm28
+ vpmuludq %zmm4,%zmm16,%zmm29
+ vpmuludq %zmm3,%zmm22,%zmm25
+ vpmuludq %zmm3,%zmm23,%zmm26
+ vpaddq %zmm28,%zmm14,%zmm14
+ vpmuludq %zmm3,%zmm24,%zmm27
+ vpaddq %zmm29,%zmm15,%zmm15
+ vpaddq %zmm25,%zmm11,%zmm11
+ vpaddq %zmm26,%zmm12,%zmm12
+ vpaddq %zmm27,%zmm13,%zmm13
+
+ vpmuludq %zmm4,%zmm21,%zmm25
+ vpmuludq %zmm4,%zmm22,%zmm26
+ vpmuludq %zmm4,%zmm23,%zmm27
+ vpaddq %zmm25,%zmm11,%zmm0
+ vpaddq %zmm26,%zmm12,%zmm1
+ vpaddq %zmm27,%zmm13,%zmm2
+
+ vpsrlq $52,%zmm7,%zmm9
+ vpsllq $12,%zmm6,%zmm10
+
+ vpsrlq $26,%zmm14,%zmm3
+ vpandq %zmm5,%zmm14,%zmm14
+ vpaddq %zmm3,%zmm15,%zmm4
+
+ vporq %zmm10,%zmm9,%zmm9
+
+ vpsrlq $26,%zmm0,%zmm11
+ vpandq %zmm5,%zmm0,%zmm0
+ vpaddq %zmm11,%zmm1,%zmm1
+
+ vpandq %zmm5,%zmm9,%zmm9
+
+ vpsrlq $26,%zmm4,%zmm15
+ vpandq %zmm5,%zmm4,%zmm4
+
+ vpsrlq $26,%zmm1,%zmm12
+ vpandq %zmm5,%zmm1,%zmm1
+ vpaddq %zmm12,%zmm2,%zmm2
+
+ vpaddq %zmm15,%zmm0,%zmm0
+ vpsllq $2,%zmm15,%zmm15
+ vpaddq %zmm15,%zmm0,%zmm0
+
+ vpaddq %zmm9,%zmm2,%zmm2
+ vpsrlq $26,%zmm7,%zmm8
+
+ vpsrlq $26,%zmm2,%zmm13
+ vpandq %zmm5,%zmm2,%zmm2
+ vpaddq %zmm13,%zmm14,%zmm3
+
+ vpsrlq $14,%zmm6,%zmm10
+
+ vpsrlq $26,%zmm0,%zmm11
+ vpandq %zmm5,%zmm0,%zmm0
+ vpaddq %zmm11,%zmm1,%zmm1
+
+ vpsrlq $40,%zmm6,%zmm6
+
+ vpsrlq $26,%zmm3,%zmm14
+ vpandq %zmm5,%zmm3,%zmm3
+ vpaddq %zmm14,%zmm4,%zmm4
+
+ vpandq %zmm5,%zmm7,%zmm7
+
+ subq $128,%rdx
+ ja .Loop_avx512
+
+.Ltail_avx512:
+
+ vpsrlq $32,%zmm16,%zmm16
+ vpsrlq $32,%zmm17,%zmm17
+ vpsrlq $32,%zmm18,%zmm18
+ vpsrlq $32,%zmm23,%zmm23
+ vpsrlq $32,%zmm24,%zmm24
+ vpsrlq $32,%zmm19,%zmm19
+ vpsrlq $32,%zmm20,%zmm20
+ vpsrlq $32,%zmm21,%zmm21
+ vpsrlq $32,%zmm22,%zmm22
+
+ leaq (%rsi,%rdx,1),%rsi
+
+ vpaddq %zmm0,%zmm7,%zmm0
+
+ vpmuludq %zmm2,%zmm17,%zmm14
+ vpmuludq %zmm2,%zmm18,%zmm15
+ vpmuludq %zmm2,%zmm23,%zmm11
+ vpandq %zmm5,%zmm8,%zmm8
+ vpmuludq %zmm2,%zmm24,%zmm12
+ vpandq %zmm5,%zmm10,%zmm10
+ vpmuludq %zmm2,%zmm16,%zmm13
+ vporq %zmm30,%zmm6,%zmm6
+ vpaddq %zmm1,%zmm8,%zmm1
+ vpaddq %zmm3,%zmm10,%zmm3
+ vpaddq %zmm4,%zmm6,%zmm4
+
+ vmovdqu 0(%rsi),%xmm7
+ vpmuludq %zmm0,%zmm19,%zmm28
+ vpmuludq %zmm0,%zmm20,%zmm29
+ vpmuludq %zmm0,%zmm16,%zmm25
+ vpmuludq %zmm0,%zmm17,%zmm26
+ vpaddq %zmm28,%zmm14,%zmm14
+ vpaddq %zmm29,%zmm15,%zmm15
+ vpaddq %zmm25,%zmm11,%zmm11
+ vpaddq %zmm26,%zmm12,%zmm12
+
+ vmovdqu 16(%rsi),%xmm8
+ vpmuludq %zmm1,%zmm18,%zmm28
+ vpmuludq %zmm1,%zmm19,%zmm29
+ vpmuludq %zmm1,%zmm24,%zmm25
+ vpmuludq %zmm0,%zmm18,%zmm27
+ vpaddq %zmm28,%zmm14,%zmm14
+ vpaddq %zmm29,%zmm15,%zmm15
+ vpaddq %zmm25,%zmm11,%zmm11
+ vpaddq %zmm27,%zmm13,%zmm13
+
+ vinserti128 $1,32(%rsi),%ymm7,%ymm7
+ vpmuludq %zmm3,%zmm16,%zmm28
+ vpmuludq %zmm3,%zmm17,%zmm29
+ vpmuludq %zmm1,%zmm16,%zmm26
+ vpmuludq %zmm1,%zmm17,%zmm27
+ vpaddq %zmm28,%zmm14,%zmm14
+ vpaddq %zmm29,%zmm15,%zmm15
+ vpaddq %zmm26,%zmm12,%zmm12
+ vpaddq %zmm27,%zmm13,%zmm13
+
+ vinserti128 $1,48(%rsi),%ymm8,%ymm8
+ vpmuludq %zmm4,%zmm24,%zmm28
+ vpmuludq %zmm4,%zmm16,%zmm29
+ vpmuludq %zmm3,%zmm22,%zmm25
+ vpmuludq %zmm3,%zmm23,%zmm26
+ vpmuludq %zmm3,%zmm24,%zmm27
+ vpaddq %zmm28,%zmm14,%zmm3
+ vpaddq %zmm29,%zmm15,%zmm15
+ vpaddq %zmm25,%zmm11,%zmm11
+ vpaddq %zmm26,%zmm12,%zmm12
+ vpaddq %zmm27,%zmm13,%zmm13
+
+ vpmuludq %zmm4,%zmm21,%zmm25
+ vpmuludq %zmm4,%zmm22,%zmm26
+ vpmuludq %zmm4,%zmm23,%zmm27
+ vpaddq %zmm25,%zmm11,%zmm0
+ vpaddq %zmm26,%zmm12,%zmm1
+ vpaddq %zmm27,%zmm13,%zmm2
+
+ movl $1,%eax
+ vpermq $0xb1,%zmm3,%zmm14
+ vpermq $0xb1,%zmm15,%zmm4
+ vpermq $0xb1,%zmm0,%zmm11
+ vpermq $0xb1,%zmm1,%zmm12
+ vpermq $0xb1,%zmm2,%zmm13
+ vpaddq %zmm14,%zmm3,%zmm3
+ vpaddq %zmm15,%zmm4,%zmm4
+ vpaddq %zmm11,%zmm0,%zmm0
+ vpaddq %zmm12,%zmm1,%zmm1
+ vpaddq %zmm13,%zmm2,%zmm2
+
+ kmovw %eax,%k3
+ vpermq $0x2,%zmm3,%zmm14
+ vpermq $0x2,%zmm4,%zmm15
+ vpermq $0x2,%zmm0,%zmm11
+ vpermq $0x2,%zmm1,%zmm12
+ vpermq $0x2,%zmm2,%zmm13
+ vpaddq %zmm14,%zmm3,%zmm3
+ vpaddq %zmm15,%zmm4,%zmm4
+ vpaddq %zmm11,%zmm0,%zmm0
+ vpaddq %zmm12,%zmm1,%zmm1
+ vpaddq %zmm13,%zmm2,%zmm2
+
+ vextracti64x4 $0x1,%zmm3,%ymm14
+ vextracti64x4 $0x1,%zmm4,%ymm15
+ vextracti64x4 $0x1,%zmm0,%ymm11
+ vextracti64x4 $0x1,%zmm1,%ymm12
+ vextracti64x4 $0x1,%zmm2,%ymm13
+ vpaddq %zmm14,%zmm3,%zmm3{%k3}{z}
+ vpaddq %zmm15,%zmm4,%zmm4{%k3}{z}
+ vpaddq %zmm11,%zmm0,%zmm0{%k3}{z}
+ vpaddq %zmm12,%zmm1,%zmm1{%k3}{z}
+ vpaddq %zmm13,%zmm2,%zmm2{%k3}{z}
+
+ vpsrlq $26,%ymm3,%ymm14
+ vpand %ymm5,%ymm3,%ymm3
+ vpsrldq $6,%ymm7,%ymm9
+ vpsrldq $6,%ymm8,%ymm10
+ vpunpckhqdq %ymm8,%ymm7,%ymm6
+ vpaddq %ymm14,%ymm4,%ymm4
+
+ vpsrlq $26,%ymm0,%ymm11
+ vpand %ymm5,%ymm0,%ymm0
+ vpunpcklqdq %ymm10,%ymm9,%ymm9
+ vpunpcklqdq %ymm8,%ymm7,%ymm7
+ vpaddq %ymm11,%ymm1,%ymm1
+
+ vpsrlq $26,%ymm4,%ymm15
+ vpand %ymm5,%ymm4,%ymm4
+
+ vpsrlq $26,%ymm1,%ymm12
+ vpand %ymm5,%ymm1,%ymm1
+ vpsrlq $30,%ymm9,%ymm10
+ vpsrlq $4,%ymm9,%ymm9
+ vpaddq %ymm12,%ymm2,%ymm2
+
+ vpaddq %ymm15,%ymm0,%ymm0
+ vpsllq $2,%ymm15,%ymm15
+ vpsrlq $26,%ymm7,%ymm8
+ vpsrlq $40,%ymm6,%ymm6
+ vpaddq %ymm15,%ymm0,%ymm0
+
+ vpsrlq $26,%ymm2,%ymm13
+ vpand %ymm5,%ymm2,%ymm2
+ vpand %ymm5,%ymm9,%ymm9
+ vpand %ymm5,%ymm7,%ymm7
+ vpaddq %ymm13,%ymm3,%ymm3
+
+ vpsrlq $26,%ymm0,%ymm11
+ vpand %ymm5,%ymm0,%ymm0
+ vpaddq %ymm2,%ymm9,%ymm2
+ vpand %ymm5,%ymm8,%ymm8
+ vpaddq %ymm11,%ymm1,%ymm1
+
+ vpsrlq $26,%ymm3,%ymm14
+ vpand %ymm5,%ymm3,%ymm3
+ vpand %ymm5,%ymm10,%ymm10
+ vpor 32(%rcx),%ymm6,%ymm6
+ vpaddq %ymm14,%ymm4,%ymm4
+
+ leaq 144(%rsp),%rax
+ addq $64,%rdx
+ jnz .Ltail_avx2_512
+
+ vpsubq %ymm9,%ymm2,%ymm2
+ vmovd %xmm0,-112(%rdi)
+ vmovd %xmm1,-108(%rdi)
+ vmovd %xmm2,-104(%rdi)
+ vmovd %xmm3,-100(%rdi)
+ vmovd %xmm4,-96(%rdi)
+ vzeroall
+ leaq -8(%r10),%rsp
+
+ ret
+
+ENDPROC(poly1305_blocks_avx512)
+#endif /* CONFIG_AS_AVX512 */
--
2.19.0
^ permalink raw reply related
* [PATCH net-next v4 08/20] zinc: Poly1305 ARM and ARM64 implementations
From: Jason A. Donenfeld @ 2018-09-14 16:22 UTC (permalink / raw)
To: linux-kernel, netdev, linux-crypto, davem, gregkh
Cc: Jason A. Donenfeld, Samuel Neves, Andy Lutomirski,
Jean-Philippe Aumasson, Andy Polyakov, Russell King,
linux-arm-kernel
In-Reply-To: <20180914162240.7925-1-Jason@zx2c4.com>
These NEON and non-NEON implementations come from Andy Polyakov's
implementation. They are exactly the same as Andy Polyakov's original,
with the following exceptions:
- Entries and exits use the proper kernel convention macro.
- CPU feature checking is done in C by the glue code, so that has been
removed from the assembly.
- The function names have been renamed to fit kernel conventions.
- Labels have been renamed to fit kernel conventions.
- The neon code can jump to the scalar code when it makes sense to do
so.
After '/^#/d;/^\..*[^:]$/d', the code has the following diff in actual
instructions from the original.
ARM:
-poly1305_init:
-.Lpoly1305_init:
+ENTRY(poly1305_init_arm)
stmdb sp!,{r4-r11}
eor r3,r3,r3
@@ -18,8 +25,6 @@
moveq r0,#0
beq .Lno_key
- adr r11,.Lpoly1305_init
- ldr r12,.LOPENSSL_armcap
ldrb r4,[r1,#0]
mov r10,#0x0fffffff
ldrb r5,[r1,#1]
@@ -34,8 +39,6 @@
ldrb r7,[r1,#6]
and r4,r4,r10
- ldr r12,[r11,r12] @ OPENSSL_armcap_P
- ldr r12,[r12]
ldrb r8,[r1,#7]
orr r5,r5,r6,lsl#8
ldrb r6,[r1,#8]
@@ -45,22 +48,6 @@
ldrb r8,[r1,#10]
and r5,r5,r3
- tst r12,#ARMV7_NEON @ check for NEON
- adr r9,poly1305_blocks_neon
- adr r11,poly1305_blocks
- it ne
- movne r11,r9
- adr r12,poly1305_emit
- adr r10,poly1305_emit_neon
- it ne
- movne r12,r10
- itete eq
- addeq r12,r11,#(poly1305_emit-.Lpoly1305_init)
- addne r12,r11,#(poly1305_emit_neon-.Lpoly1305_init)
- addeq r11,r11,#(poly1305_blocks-.Lpoly1305_init)
- addne r11,r11,#(poly1305_blocks_neon-.Lpoly1305_init)
- orr r12,r12,#1 @ thumb-ify address
- orr r11,r11,#1
ldrb r9,[r1,#11]
orr r6,r6,r7,lsl#8
ldrb r7,[r1,#12]
@@ -79,17 +66,16 @@
str r6,[r0,#8]
and r7,r7,r3
str r7,[r0,#12]
- stmia r2,{r11,r12} @ fill functions table
- mov r0,#1
- mov r0,#0
.Lno_key:
ldmia sp!,{r4-r11}
bx lr @ bx lr
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-)
-poly1305_blocks:
-.Lpoly1305_blocks:
+ENDPROC(poly1305_init_arm)
+
+ENTRY(poly1305_blocks_arm)
+.Lpoly1305_blocks_arm:
stmdb sp!,{r3-r11,lr}
ands r2,r2,#-16
@@ -231,10 +217,11 @@
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-)
-poly1305_emit:
+ENDPROC(poly1305_blocks_arm)
+
+ENTRY(poly1305_emit_arm)
stmdb sp!,{r4-r11}
.Lpoly1305_emit_enter:
-
ldmia r0,{r3-r7}
adds r8,r3,#5 @ compare to modulus
adcs r9,r4,#0
@@ -305,8 +292,12 @@
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-)
+ENDPROC(poly1305_emit_arm)
+
+
-poly1305_init_neon:
+ENTRY(poly1305_init_neon)
+.Lpoly1305_init_neon:
ldr r4,[r0,#20] @ load key base 2^32
ldr r5,[r0,#24]
ldr r6,[r0,#28]
@@ -515,8 +506,9 @@
vst1.32 {d8[1]},[r7]
bx lr @ bx lr
+ENDPROC(poly1305_init_neon)
-poly1305_blocks_neon:
+ENTRY(poly1305_blocks_neon)
ldr ip,[r0,#36] @ is_base2_26
ands r2,r2,#-16
beq .Lno_data_neon
@@ -524,7 +516,7 @@
cmp r2,#64
bhs .Lenter_neon
tst ip,ip @ is_base2_26?
- beq .Lpoly1305_blocks
+ beq .Lpoly1305_blocks_arm
.Lenter_neon:
stmdb sp!,{r4-r7}
@@ -534,7 +526,7 @@
bne .Lbase2_26_neon
stmdb sp!,{r1-r3,lr}
- bl poly1305_init_neon
+ bl .Lpoly1305_init_neon
ldr r4,[r0,#0] @ load hash value base 2^32
ldr r5,[r0,#4]
@@ -989,8 +981,9 @@
ldmia sp!,{r4-r7}
.Lno_data_neon:
bx lr @ bx lr
+ENDPROC(poly1305_blocks_neon)
-poly1305_emit_neon:
+ENTRY(poly1305_emit_neon)
ldr ip,[r0,#36] @ is_base2_26
stmdb sp!,{r4-r11}
@@ -1055,6 +1048,6 @@
ldmia sp!,{r4-r11}
bx lr @ bx lr
+ENDPROC(poly1305_emit_neon)
ARM64:
-poly1305_init:
+ENTRY(poly1305_init_arm)
cmp x1,xzr
stp xzr,xzr,[x0] // zero hash value
stp xzr,xzr,[x0,#16] // [along with is_base2_26]
@@ -11,14 +15,9 @@
csel x0,xzr,x0,eq
b.eq .Lno_key
- ldrsw x11,.LOPENSSL_armcap_P
- ldr x11,.LOPENSSL_armcap_P
- adr x10,.LOPENSSL_armcap_P
-
ldp x7,x8,[x1] // load key
mov x9,#0xfffffffc0fffffff
movk x9,#0x0fff,lsl#48
- ldr w17,[x10,x11]
rev x7,x7 // flip bytes
rev x8,x8
and x7,x7,x9 // &=0ffffffc0fffffff
@@ -26,24 +25,11 @@
and x8,x8,x9 // &=0ffffffc0ffffffc
stp x7,x8,[x0,#32] // save key value
- tst w17,#ARMV7_NEON
-
- adr x12,poly1305_blocks
- adr x7,poly1305_blocks_neon
- adr x13,poly1305_emit
- adr x8,poly1305_emit_neon
-
- csel x12,x12,x7,eq
- csel x13,x13,x8,eq
-
- stp w12,w13,[x2]
- stp x12,x13,[x2]
-
- mov x0,#1
.Lno_key:
ret
+ENDPROC(poly1305_init_arm)
-poly1305_blocks:
+ENTRY(poly1305_blocks_arm)
ands x2,x2,#-16
b.eq .Lno_data
@@ -100,8 +86,9 @@
.Lno_data:
ret
+ENDPROC(poly1305_blocks_arm)
-poly1305_emit:
+ENTRY(poly1305_emit_arm)
ldp x4,x5,[x0] // load hash base 2^64
ldr x6,[x0,#16]
ldp x10,x11,[x2] // load nonce
@@ -124,7 +111,9 @@
stp x4,x5,[x1] // write result
ret
-poly1305_mult:
+ENDPROC(poly1305_emit_arm)
+
+__poly1305_mult:
mul x12,x4,x7 // h0*r0
umulh x13,x4,x7
@@ -158,7 +147,7 @@
ret
-poly1305_splat:
+__poly1305_splat:
and x12,x4,#0x03ffffff // base 2^64 -> base 2^26
ubfx x13,x4,#26,#26
extr x14,x5,x4,#52
@@ -182,11 +171,11 @@
ret
-poly1305_blocks_neon:
+ENTRY(poly1305_blocks_neon)
ldr x17,[x0,#24]
cmp x2,#128
b.hs .Lblocks_neon
- cbz x17,poly1305_blocks
+ cbz x17,poly1305_blocks_arm
.Lblocks_neon:
stp x29,x30,[sp,#-80]!
@@ -232,7 +221,7 @@
adcs x5,x5,x13
adc x6,x6,x3
- bl poly1305_mult
+ bl __poly1305_mult
ldr x30,[sp,#8]
cbz x3,.Lstore_base2_64_neon
@@ -274,7 +263,7 @@
adcs x5,x5,x13
adc x6,x6,x3
- bl poly1305_mult
+ bl __poly1305_mult
.Linit_neon:
and x10,x4,#0x03ffffff // base 2^64 -> base 2^26
@@ -301,19 +290,19 @@
mov x5,x8
mov x6,xzr
add x0,x0,#48+12
- bl poly1305_splat
+ bl __poly1305_splat
- bl poly1305_mult // r^2
+ bl __poly1305_mult // r^2
sub x0,x0,#4
- bl poly1305_splat
+ bl __poly1305_splat
- bl poly1305_mult // r^3
+ bl __poly1305_mult // r^3
sub x0,x0,#4
- bl poly1305_splat
+ bl __poly1305_splat
- bl poly1305_mult // r^4
+ bl __poly1305_mult // r^4
sub x0,x0,#4
- bl poly1305_splat
+ bl __poly1305_splat
ldr x30,[sp,#8]
add x16,x1,#32
@@ -743,10 +732,11 @@
.Lno_data_neon:
ldr x29,[sp],#80
ret
+ENDPROC(poly1305_blocks_neon)
-poly1305_emit_neon:
+ENTRY(poly1305_emit_neon)
ldr x17,[x0,#24]
- cbz x17,poly1305_emit
+ cbz x17,poly1305_emit_arm
ldp w10,w11,[x0] // load hash value base 2^26
ldp w12,w13,[x0,#8]
@@ -788,6 +778,6 @@
stp x4,x5,[x1] // write result
ret
+ENDPROC(poly1305_emit_neon)
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Cc: Samuel Neves <sneves@dei.uc.pt>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Jean-Philippe Aumasson <jeanphilippe.aumasson@gmail.com>
Cc: Andy Polyakov <appro@openssl.org>
Cc: Russell King <linux@armlinux.org.uk>
Cc: linux-arm-kernel@lists.infradead.org
---
lib/zinc/Makefile | 8 +
lib/zinc/poly1305/poly1305-arm-glue.h | 69 ++
lib/zinc/poly1305/poly1305-arm.S | 1117 +++++++++++++++++++++++++
lib/zinc/poly1305/poly1305-arm64.S | 822 ++++++++++++++++++
4 files changed, 2016 insertions(+)
create mode 100644 lib/zinc/poly1305/poly1305-arm-glue.h
create mode 100644 lib/zinc/poly1305/poly1305-arm.S
create mode 100644 lib/zinc/poly1305/poly1305-arm64.S
diff --git a/lib/zinc/Makefile b/lib/zinc/Makefile
index d1e3892e06d9..f37df89a3f87 100644
--- a/lib/zinc/Makefile
+++ b/lib/zinc/Makefile
@@ -25,6 +25,14 @@ endif
ifeq ($(CONFIG_ZINC_POLY1305),y)
zinc-y += poly1305/poly1305.o
+ifeq ($(CONFIG_ZINC_ARCH_ARM),y)
+zinc-y += poly1305/poly1305-arm.o
+CFLAGS_poly1305.o += -include $(srctree)/$(src)/poly1305/poly1305-arm-glue.h
+endif
+ifeq ($(CONFIG_ZINC_ARCH_ARM64),y)
+zinc-y += poly1305/poly1305-arm64.o
+CFLAGS_poly1305.o += -include $(srctree)/$(src)/poly1305/poly1305-arm-glue.h
+endif
endif
zinc-y += main.o
diff --git a/lib/zinc/poly1305/poly1305-arm-glue.h b/lib/zinc/poly1305/poly1305-arm-glue.h
new file mode 100644
index 000000000000..53f8fec7f858
--- /dev/null
+++ b/lib/zinc/poly1305/poly1305-arm-glue.h
@@ -0,0 +1,69 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <zinc/poly1305.h>
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+
+asmlinkage void poly1305_init_arm(void *ctx, const u8 key[16]);
+asmlinkage void poly1305_blocks_arm(void *ctx, const u8 *inp, const size_t len,
+ const u32 padbit);
+asmlinkage void poly1305_emit_arm(void *ctx, u8 mac[16], const u32 nonce[4]);
+#if IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && \
+ (defined(CONFIG_64BIT) || __LINUX_ARM_ARCH__ >= 7)
+#define ARM_USE_NEON
+asmlinkage void poly1305_blocks_neon(void *ctx, const u8 *inp, const size_t len,
+ const u32 padbit);
+asmlinkage void poly1305_emit_neon(void *ctx, u8 mac[16], const u32 nonce[4]);
+#endif
+
+static bool poly1305_use_neon __ro_after_init;
+
+void __init poly1305_fpu_init(void)
+{
+#if defined(CONFIG_ARM64)
+ poly1305_use_neon = elf_hwcap & HWCAP_ASIMD;
+#elif defined(CONFIG_ARM)
+ poly1305_use_neon = elf_hwcap & HWCAP_NEON;
+#endif
+}
+
+static inline bool poly1305_init_arch(void *ctx,
+ const u8 key[POLY1305_KEY_SIZE],
+ simd_context_t simd_context)
+{
+ poly1305_init_arm(ctx, key);
+ return true;
+}
+
+static inline bool poly1305_blocks_arch(void *ctx, const u8 *inp,
+ const size_t len, const u32 padbit,
+ simd_context_t simd_context)
+{
+#if defined(ARM_USE_NEON)
+ if (simd_context == HAVE_FULL_SIMD && poly1305_use_neon) {
+ poly1305_blocks_neon(ctx, inp, len, padbit);
+ return true;
+ }
+#endif
+ poly1305_blocks_arm(ctx, inp, len, padbit);
+ return true;
+}
+
+static inline bool poly1305_emit_arch(void *ctx, u8 mac[POLY1305_MAC_SIZE],
+ const u32 nonce[4],
+ simd_context_t simd_context)
+{
+#if defined(ARM_USE_NEON)
+ if (simd_context == HAVE_FULL_SIMD && poly1305_use_neon) {
+ poly1305_emit_neon(ctx, mac, nonce);
+ return true;
+ }
+#endif
+ poly1305_emit_arm(ctx, mac, nonce);
+ return true;
+}
+
+#define HAVE_POLY1305_ARCH_IMPLEMENTATION
diff --git a/lib/zinc/poly1305/poly1305-arm.S b/lib/zinc/poly1305/poly1305-arm.S
new file mode 100644
index 000000000000..110f4317b5d7
--- /dev/null
+++ b/lib/zinc/poly1305/poly1305-arm.S
@@ -0,0 +1,1117 @@
+/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ * Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
+ *
+ * This is based in part on Andy Polyakov's implementation from CRYPTOGAMS.
+ */
+
+#include <linux/linkage.h>
+
+.text
+#if defined(__thumb2__)
+.syntax unified
+.thumb
+#else
+.code 32
+#endif
+
+.align 5
+ENTRY(poly1305_init_arm)
+ stmdb sp!,{r4-r11}
+
+ eor r3,r3,r3
+ cmp r1,#0
+ str r3,[r0,#0] @ zero hash value
+ str r3,[r0,#4]
+ str r3,[r0,#8]
+ str r3,[r0,#12]
+ str r3,[r0,#16]
+ str r3,[r0,#36] @ is_base2_26
+ add r0,r0,#20
+
+#ifdef __thumb2__
+ it eq
+#endif
+ moveq r0,#0
+ beq .Lno_key
+
+ ldrb r4,[r1,#0]
+ mov r10,#0x0fffffff
+ ldrb r5,[r1,#1]
+ and r3,r10,#-4 @ 0x0ffffffc
+ ldrb r6,[r1,#2]
+ ldrb r7,[r1,#3]
+ orr r4,r4,r5,lsl#8
+ ldrb r5,[r1,#4]
+ orr r4,r4,r6,lsl#16
+ ldrb r6,[r1,#5]
+ orr r4,r4,r7,lsl#24
+ ldrb r7,[r1,#6]
+ and r4,r4,r10
+
+ ldrb r8,[r1,#7]
+ orr r5,r5,r6,lsl#8
+ ldrb r6,[r1,#8]
+ orr r5,r5,r7,lsl#16
+ ldrb r7,[r1,#9]
+ orr r5,r5,r8,lsl#24
+ ldrb r8,[r1,#10]
+ and r5,r5,r3
+
+ ldrb r9,[r1,#11]
+ orr r6,r6,r7,lsl#8
+ ldrb r7,[r1,#12]
+ orr r6,r6,r8,lsl#16
+ ldrb r8,[r1,#13]
+ orr r6,r6,r9,lsl#24
+ ldrb r9,[r1,#14]
+ and r6,r6,r3
+
+ ldrb r10,[r1,#15]
+ orr r7,r7,r8,lsl#8
+ str r4,[r0,#0]
+ orr r7,r7,r9,lsl#16
+ str r5,[r0,#4]
+ orr r7,r7,r10,lsl#24
+ str r6,[r0,#8]
+ and r7,r7,r3
+ str r7,[r0,#12]
+.Lno_key:
+ ldmia sp!,{r4-r11}
+#if __LINUX_ARM_ARCH__ >= 5
+ bx lr @ bx lr
+#else
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ .word 0xe12fff1e @ interoperable with Thumb ISA:-)
+#endif
+ENDPROC(poly1305_init_arm)
+
+.align 5
+ENTRY(poly1305_blocks_arm)
+.Lpoly1305_blocks_arm:
+ stmdb sp!,{r3-r11,lr}
+
+ ands r2,r2,#-16
+ beq .Lno_data
+
+ cmp r3,#0
+ add r2,r2,r1 @ end pointer
+ sub sp,sp,#32
+
+ ldmia r0,{r4-r12} @ load context
+
+ str r0,[sp,#12] @ offload stuff
+ mov lr,r1
+ str r2,[sp,#16]
+ str r10,[sp,#20]
+ str r11,[sp,#24]
+ str r12,[sp,#28]
+ b .Loop
+
+.Loop:
+#if __LINUX_ARM_ARCH__ < 7
+ ldrb r0,[lr],#16 @ load input
+#ifdef __thumb2__
+ it hi
+#endif
+ addhi r8,r8,#1 @ 1<<128
+ ldrb r1,[lr,#-15]
+ ldrb r2,[lr,#-14]
+ ldrb r3,[lr,#-13]
+ orr r1,r0,r1,lsl#8
+ ldrb r0,[lr,#-12]
+ orr r2,r1,r2,lsl#16
+ ldrb r1,[lr,#-11]
+ orr r3,r2,r3,lsl#24
+ ldrb r2,[lr,#-10]
+ adds r4,r4,r3 @ accumulate input
+
+ ldrb r3,[lr,#-9]
+ orr r1,r0,r1,lsl#8
+ ldrb r0,[lr,#-8]
+ orr r2,r1,r2,lsl#16
+ ldrb r1,[lr,#-7]
+ orr r3,r2,r3,lsl#24
+ ldrb r2,[lr,#-6]
+ adcs r5,r5,r3
+
+ ldrb r3,[lr,#-5]
+ orr r1,r0,r1,lsl#8
+ ldrb r0,[lr,#-4]
+ orr r2,r1,r2,lsl#16
+ ldrb r1,[lr,#-3]
+ orr r3,r2,r3,lsl#24
+ ldrb r2,[lr,#-2]
+ adcs r6,r6,r3
+
+ ldrb r3,[lr,#-1]
+ orr r1,r0,r1,lsl#8
+ str lr,[sp,#8] @ offload input pointer
+ orr r2,r1,r2,lsl#16
+ add r10,r10,r10,lsr#2
+ orr r3,r2,r3,lsl#24
+#else
+ ldr r0,[lr],#16 @ load input
+#ifdef __thumb2__
+ it hi
+#endif
+ addhi r8,r8,#1 @ padbit
+ ldr r1,[lr,#-12]
+ ldr r2,[lr,#-8]
+ ldr r3,[lr,#-4]
+#ifdef __ARMEB__
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+#endif
+ adds r4,r4,r0 @ accumulate input
+ str lr,[sp,#8] @ offload input pointer
+ adcs r5,r5,r1
+ add r10,r10,r10,lsr#2
+ adcs r6,r6,r2
+#endif
+ add r11,r11,r11,lsr#2
+ adcs r7,r7,r3
+ add r12,r12,r12,lsr#2
+
+ umull r2,r3,r5,r9
+ adc r8,r8,#0
+ umull r0,r1,r4,r9
+ umlal r2,r3,r8,r10
+ umlal r0,r1,r7,r10
+ ldr r10,[sp,#20] @ reload r10
+ umlal r2,r3,r6,r12
+ umlal r0,r1,r5,r12
+ umlal r2,r3,r7,r11
+ umlal r0,r1,r6,r11
+ umlal r2,r3,r4,r10
+ str r0,[sp,#0] @ future r4
+ mul r0,r11,r8
+ ldr r11,[sp,#24] @ reload r11
+ adds r2,r2,r1 @ d1+=d0>>32
+ eor r1,r1,r1
+ adc lr,r3,#0 @ future r6
+ str r2,[sp,#4] @ future r5
+
+ mul r2,r12,r8
+ eor r3,r3,r3
+ umlal r0,r1,r7,r12
+ ldr r12,[sp,#28] @ reload r12
+ umlal r2,r3,r7,r9
+ umlal r0,r1,r6,r9
+ umlal r2,r3,r6,r10
+ umlal r0,r1,r5,r10
+ umlal r2,r3,r5,r11
+ umlal r0,r1,r4,r11
+ umlal r2,r3,r4,r12
+ ldr r4,[sp,#0]
+ mul r8,r9,r8
+ ldr r5,[sp,#4]
+
+ adds r6,lr,r0 @ d2+=d1>>32
+ ldr lr,[sp,#8] @ reload input pointer
+ adc r1,r1,#0
+ adds r7,r2,r1 @ d3+=d2>>32
+ ldr r0,[sp,#16] @ reload end pointer
+ adc r3,r3,#0
+ add r8,r8,r3 @ h4+=d3>>32
+
+ and r1,r8,#-4
+ and r8,r8,#3
+ add r1,r1,r1,lsr#2 @ *=5
+ adds r4,r4,r1
+ adcs r5,r5,#0
+ adcs r6,r6,#0
+ adcs r7,r7,#0
+ adc r8,r8,#0
+
+ cmp r0,lr @ done yet?
+ bhi .Loop
+
+ ldr r0,[sp,#12]
+ add sp,sp,#32
+ stmia r0,{r4-r8} @ store the result
+
+.Lno_data:
+#if __LINUX_ARM_ARCH__ >= 5
+ ldmia sp!,{r3-r11,pc}
+#else
+ ldmia sp!,{r3-r11,lr}
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ .word 0xe12fff1e @ interoperable with Thumb ISA:-)
+#endif
+ENDPROC(poly1305_blocks_arm)
+
+.align 5
+ENTRY(poly1305_emit_arm)
+ stmdb sp!,{r4-r11}
+.Lpoly1305_emit_enter:
+ ldmia r0,{r3-r7}
+ adds r8,r3,#5 @ compare to modulus
+ adcs r9,r4,#0
+ adcs r10,r5,#0
+ adcs r11,r6,#0
+ adc r7,r7,#0
+ tst r7,#4 @ did it carry/borrow?
+
+#ifdef __thumb2__
+ it ne
+#endif
+ movne r3,r8
+ ldr r8,[r2,#0]
+#ifdef __thumb2__
+ it ne
+#endif
+ movne r4,r9
+ ldr r9,[r2,#4]
+#ifdef __thumb2__
+ it ne
+#endif
+ movne r5,r10
+ ldr r10,[r2,#8]
+#ifdef __thumb2__
+ it ne
+#endif
+ movne r6,r11
+ ldr r11,[r2,#12]
+
+ adds r3,r3,r8
+ adcs r4,r4,r9
+ adcs r5,r5,r10
+ adc r6,r6,r11
+
+#if __LINUX_ARM_ARCH__ >= 7
+#ifdef __ARMEB__
+ rev r3,r3
+ rev r4,r4
+ rev r5,r5
+ rev r6,r6
+#endif
+ str r3,[r1,#0]
+ str r4,[r1,#4]
+ str r5,[r1,#8]
+ str r6,[r1,#12]
+#else
+ strb r3,[r1,#0]
+ mov r3,r3,lsr#8
+ strb r4,[r1,#4]
+ mov r4,r4,lsr#8
+ strb r5,[r1,#8]
+ mov r5,r5,lsr#8
+ strb r6,[r1,#12]
+ mov r6,r6,lsr#8
+
+ strb r3,[r1,#1]
+ mov r3,r3,lsr#8
+ strb r4,[r1,#5]
+ mov r4,r4,lsr#8
+ strb r5,[r1,#9]
+ mov r5,r5,lsr#8
+ strb r6,[r1,#13]
+ mov r6,r6,lsr#8
+
+ strb r3,[r1,#2]
+ mov r3,r3,lsr#8
+ strb r4,[r1,#6]
+ mov r4,r4,lsr#8
+ strb r5,[r1,#10]
+ mov r5,r5,lsr#8
+ strb r6,[r1,#14]
+ mov r6,r6,lsr#8
+
+ strb r3,[r1,#3]
+ strb r4,[r1,#7]
+ strb r5,[r1,#11]
+ strb r6,[r1,#15]
+#endif
+ ldmia sp!,{r4-r11}
+#if __LINUX_ARM_ARCH__ >= 5
+ bx lr @ bx lr
+#else
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ .word 0xe12fff1e @ interoperable with Thumb ISA:-)
+#endif
+ENDPROC(poly1305_emit_arm)
+
+
+#if __LINUX_ARM_ARCH__ >= 7
+.fpu neon
+
+.align 5
+ENTRY(poly1305_init_neon)
+.Lpoly1305_init_neon:
+ ldr r4,[r0,#20] @ load key base 2^32
+ ldr r5,[r0,#24]
+ ldr r6,[r0,#28]
+ ldr r7,[r0,#32]
+
+ and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
+ mov r3,r4,lsr#26
+ mov r4,r5,lsr#20
+ orr r3,r3,r5,lsl#6
+ mov r5,r6,lsr#14
+ orr r4,r4,r6,lsl#12
+ mov r6,r7,lsr#8
+ orr r5,r5,r7,lsl#18
+ and r3,r3,#0x03ffffff
+ and r4,r4,#0x03ffffff
+ and r5,r5,#0x03ffffff
+
+ vdup.32 d0,r2 @ r^1 in both lanes
+ add r2,r3,r3,lsl#2 @ *5
+ vdup.32 d1,r3
+ add r3,r4,r4,lsl#2
+ vdup.32 d2,r2
+ vdup.32 d3,r4
+ add r4,r5,r5,lsl#2
+ vdup.32 d4,r3
+ vdup.32 d5,r5
+ add r5,r6,r6,lsl#2
+ vdup.32 d6,r4
+ vdup.32 d7,r6
+ vdup.32 d8,r5
+
+ mov r5,#2 @ counter
+
+.Lsquare_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
+ @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
+ @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
+ @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
+ @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
+
+ vmull.u32 q5,d0,d0[1]
+ vmull.u32 q6,d1,d0[1]
+ vmull.u32 q7,d3,d0[1]
+ vmull.u32 q8,d5,d0[1]
+ vmull.u32 q9,d7,d0[1]
+
+ vmlal.u32 q5,d7,d2[1]
+ vmlal.u32 q6,d0,d1[1]
+ vmlal.u32 q7,d1,d1[1]
+ vmlal.u32 q8,d3,d1[1]
+ vmlal.u32 q9,d5,d1[1]
+
+ vmlal.u32 q5,d5,d4[1]
+ vmlal.u32 q6,d7,d4[1]
+ vmlal.u32 q8,d1,d3[1]
+ vmlal.u32 q7,d0,d3[1]
+ vmlal.u32 q9,d3,d3[1]
+
+ vmlal.u32 q5,d3,d6[1]
+ vmlal.u32 q8,d0,d5[1]
+ vmlal.u32 q6,d5,d6[1]
+ vmlal.u32 q7,d7,d6[1]
+ vmlal.u32 q9,d1,d5[1]
+
+ vmlal.u32 q8,d7,d8[1]
+ vmlal.u32 q5,d1,d8[1]
+ vmlal.u32 q6,d3,d8[1]
+ vmlal.u32 q7,d5,d8[1]
+ vmlal.u32 q9,d0,d7[1]
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
+ @ and P. Schwabe
+ @
+ @ H0>>+H1>>+H2>>+H3>>+H4
+ @ H3>>+H4>>*5+H0>>+H1
+ @
+ @ Trivia.
+ @
+ @ Result of multiplication of n-bit number by m-bit number is
+ @ n+m bits wide. However! Even though 2^n is a n+1-bit number,
+ @ m-bit number multiplied by 2^n is still n+m bits wide.
+ @
+ @ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2,
+ @ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit
+ @ one is n+1 bits wide.
+ @
+ @ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
+ @ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
+ @ can be 27. However! In cases when their width exceeds 26 bits
+ @ they are limited by 2^26+2^6. This in turn means that *sum*
+ @ of the products with these values can still be viewed as sum
+ @ of 52-bit numbers as long as the amount of addends is not a
+ @ power of 2. For example,
+ @
+ @ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
+ @
+ @ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
+ @ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
+ @ 8 * (2^52) or 2^55. However, the value is then multiplied by
+ @ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
+ @ which is less than 32 * (2^52) or 2^57. And when processing
+ @ data we are looking at triple as many addends...
+ @
+ @ In key setup procedure pre-reduced H0 is limited by 5*4+1 and
+ @ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the
+ @ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while
+ @ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32
+ @ instruction accepts 2x32-bit input and writes 2x64-bit result.
+ @ This means that result of reduction have to be compressed upon
+ @ loop wrap-around. This can be done in the process of reduction
+ @ to minimize amount of instructions [as well as amount of
+ @ 128-bit instructions, which benefits low-end processors], but
+ @ one has to watch for H2 (which is narrower than H0) and 5*H4
+ @ not being wider than 58 bits, so that result of right shift
+ @ by 26 bits fits in 32 bits. This is also useful on x86,
+ @ because it allows to use paddd in place for paddq, which
+ @ benefits Atom, where paddq is ridiculously slow.
+
+ vshr.u64 q15,q8,#26
+ vmovn.i64 d16,q8
+ vshr.u64 q4,q5,#26
+ vmovn.i64 d10,q5
+ vadd.i64 q9,q9,q15 @ h3 -> h4
+ vbic.i32 d16,#0xfc000000 @ &=0x03ffffff
+ vadd.i64 q6,q6,q4 @ h0 -> h1
+ vbic.i32 d10,#0xfc000000
+
+ vshrn.u64 d30,q9,#26
+ vmovn.i64 d18,q9
+ vshr.u64 q4,q6,#26
+ vmovn.i64 d12,q6
+ vadd.i64 q7,q7,q4 @ h1 -> h2
+ vbic.i32 d18,#0xfc000000
+ vbic.i32 d12,#0xfc000000
+
+ vadd.i32 d10,d10,d30
+ vshl.u32 d30,d30,#2
+ vshrn.u64 d8,q7,#26
+ vmovn.i64 d14,q7
+ vadd.i32 d10,d10,d30 @ h4 -> h0
+ vadd.i32 d16,d16,d8 @ h2 -> h3
+ vbic.i32 d14,#0xfc000000
+
+ vshr.u32 d30,d10,#26
+ vbic.i32 d10,#0xfc000000
+ vshr.u32 d8,d16,#26
+ vbic.i32 d16,#0xfc000000
+ vadd.i32 d12,d12,d30 @ h0 -> h1
+ vadd.i32 d18,d18,d8 @ h3 -> h4
+
+ subs r5,r5,#1
+ beq .Lsquare_break_neon
+
+ add r6,r0,#(48+0*9*4)
+ add r7,r0,#(48+1*9*4)
+
+ vtrn.32 d0,d10 @ r^2:r^1
+ vtrn.32 d3,d14
+ vtrn.32 d5,d16
+ vtrn.32 d1,d12
+ vtrn.32 d7,d18
+
+ vshl.u32 d4,d3,#2 @ *5
+ vshl.u32 d6,d5,#2
+ vshl.u32 d2,d1,#2
+ vshl.u32 d8,d7,#2
+ vadd.i32 d4,d4,d3
+ vadd.i32 d2,d2,d1
+ vadd.i32 d6,d6,d5
+ vadd.i32 d8,d8,d7
+
+ vst4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]!
+ vst4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]!
+ vst4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
+ vst4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
+ vst1.32 {d8[0]},[r6,:32]
+ vst1.32 {d8[1]},[r7,:32]
+
+ b .Lsquare_neon
+
+.align 4
+.Lsquare_break_neon:
+ add r6,r0,#(48+2*4*9)
+ add r7,r0,#(48+3*4*9)
+
+ vmov d0,d10 @ r^4:r^3
+ vshl.u32 d2,d12,#2 @ *5
+ vmov d1,d12
+ vshl.u32 d4,d14,#2
+ vmov d3,d14
+ vshl.u32 d6,d16,#2
+ vmov d5,d16
+ vshl.u32 d8,d18,#2
+ vmov d7,d18
+ vadd.i32 d2,d2,d12
+ vadd.i32 d4,d4,d14
+ vadd.i32 d6,d6,d16
+ vadd.i32 d8,d8,d18
+
+ vst4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]!
+ vst4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]!
+ vst4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
+ vst4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
+ vst1.32 {d8[0]},[r6]
+ vst1.32 {d8[1]},[r7]
+
+ bx lr @ bx lr
+ENDPROC(poly1305_init_neon)
+
+.align 5
+ENTRY(poly1305_blocks_neon)
+ ldr ip,[r0,#36] @ is_base2_26
+ ands r2,r2,#-16
+ beq .Lno_data_neon
+
+ cmp r2,#64
+ bhs .Lenter_neon
+ tst ip,ip @ is_base2_26?
+ beq .Lpoly1305_blocks_arm
+
+.Lenter_neon:
+ stmdb sp!,{r4-r7}
+ vstmdb sp!,{d8-d15} @ ABI specification says so
+
+ tst ip,ip @ is_base2_26?
+ bne .Lbase2_26_neon
+
+ stmdb sp!,{r1-r3,lr}
+ bl .Lpoly1305_init_neon
+
+ ldr r4,[r0,#0] @ load hash value base 2^32
+ ldr r5,[r0,#4]
+ ldr r6,[r0,#8]
+ ldr r7,[r0,#12]
+ ldr ip,[r0,#16]
+
+ and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
+ mov r3,r4,lsr#26
+ veor d10,d10,d10
+ mov r4,r5,lsr#20
+ orr r3,r3,r5,lsl#6
+ veor d12,d12,d12
+ mov r5,r6,lsr#14
+ orr r4,r4,r6,lsl#12
+ veor d14,d14,d14
+ mov r6,r7,lsr#8
+ orr r5,r5,r7,lsl#18
+ veor d16,d16,d16
+ and r3,r3,#0x03ffffff
+ orr r6,r6,ip,lsl#24
+ veor d18,d18,d18
+ and r4,r4,#0x03ffffff
+ mov r1,#1
+ and r5,r5,#0x03ffffff
+ str r1,[r0,#36] @ is_base2_26
+
+ vmov.32 d10[0],r2
+ vmov.32 d12[0],r3
+ vmov.32 d14[0],r4
+ vmov.32 d16[0],r5
+ vmov.32 d18[0],r6
+ adr r5,.Lzeros
+
+ ldmia sp!,{r1-r3,lr}
+ b .Lbase2_32_neon
+
+.align 4
+.Lbase2_26_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ load hash value
+
+ veor d10,d10,d10
+ veor d12,d12,d12
+ veor d14,d14,d14
+ veor d16,d16,d16
+ veor d18,d18,d18
+ vld4.32 {d10[0],d12[0],d14[0],d16[0]},[r0]!
+ adr r5,.Lzeros
+ vld1.32 {d18[0]},[r0]
+ sub r0,r0,#16 @ rewind
+
+.Lbase2_32_neon:
+ add r4,r1,#32
+ mov r3,r3,lsl#24
+ tst r2,#31
+ beq .Leven
+
+ vld4.32 {d20[0],d22[0],d24[0],d26[0]},[r1]!
+ vmov.32 d28[0],r3
+ sub r2,r2,#16
+ add r4,r1,#32
+
+#ifdef __ARMEB__
+ vrev32.8 q10,q10
+ vrev32.8 q13,q13
+ vrev32.8 q11,q11
+ vrev32.8 q12,q12
+#endif
+ vsri.u32 d28,d26,#8 @ base 2^32 -> base 2^26
+ vshl.u32 d26,d26,#18
+
+ vsri.u32 d26,d24,#14
+ vshl.u32 d24,d24,#12
+ vadd.i32 d29,d28,d18 @ add hash value and move to #hi
+
+ vbic.i32 d26,#0xfc000000
+ vsri.u32 d24,d22,#20
+ vshl.u32 d22,d22,#6
+
+ vbic.i32 d24,#0xfc000000
+ vsri.u32 d22,d20,#26
+ vadd.i32 d27,d26,d16
+
+ vbic.i32 d20,#0xfc000000
+ vbic.i32 d22,#0xfc000000
+ vadd.i32 d25,d24,d14
+
+ vadd.i32 d21,d20,d10
+ vadd.i32 d23,d22,d12
+
+ mov r7,r5
+ add r6,r0,#48
+
+ cmp r2,r2
+ b .Long_tail
+
+.align 4
+.Leven:
+ subs r2,r2,#64
+ it lo
+ movlo r4,r5
+
+ vmov.i32 q14,#1<<24 @ padbit, yes, always
+ vld4.32 {d20,d22,d24,d26},[r1] @ inp[0:1]
+ add r1,r1,#64
+ vld4.32 {d21,d23,d25,d27},[r4] @ inp[2:3] (or 0)
+ add r4,r4,#64
+ itt hi
+ addhi r7,r0,#(48+1*9*4)
+ addhi r6,r0,#(48+3*9*4)
+
+#ifdef __ARMEB__
+ vrev32.8 q10,q10
+ vrev32.8 q13,q13
+ vrev32.8 q11,q11
+ vrev32.8 q12,q12
+#endif
+ vsri.u32 q14,q13,#8 @ base 2^32 -> base 2^26
+ vshl.u32 q13,q13,#18
+
+ vsri.u32 q13,q12,#14
+ vshl.u32 q12,q12,#12
+
+ vbic.i32 q13,#0xfc000000
+ vsri.u32 q12,q11,#20
+ vshl.u32 q11,q11,#6
+
+ vbic.i32 q12,#0xfc000000
+ vsri.u32 q11,q10,#26
+
+ vbic.i32 q10,#0xfc000000
+ vbic.i32 q11,#0xfc000000
+
+ bls .Lskip_loop
+
+ vld4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^2
+ vld4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^4
+ vld4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
+ vld4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
+ b .Loop_neon
+
+.align 5
+.Loop_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
+ @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
+ @ ___________________/
+ @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
+ @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
+ @ ___________________/ ____________________/
+ @
+ @ Note that we start with inp[2:3]*r^2. This is because it
+ @ doesn't depend on reduction in previous iteration.
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
+ @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
+ @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
+ @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
+ @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ inp[2:3]*r^2
+
+ vadd.i32 d24,d24,d14 @ accumulate inp[0:1]
+ vmull.u32 q7,d25,d0[1]
+ vadd.i32 d20,d20,d10
+ vmull.u32 q5,d21,d0[1]
+ vadd.i32 d26,d26,d16
+ vmull.u32 q8,d27,d0[1]
+ vmlal.u32 q7,d23,d1[1]
+ vadd.i32 d22,d22,d12
+ vmull.u32 q6,d23,d0[1]
+
+ vadd.i32 d28,d28,d18
+ vmull.u32 q9,d29,d0[1]
+ subs r2,r2,#64
+ vmlal.u32 q5,d29,d2[1]
+ it lo
+ movlo r4,r5
+ vmlal.u32 q8,d25,d1[1]
+ vld1.32 d8[1],[r7,:32]
+ vmlal.u32 q6,d21,d1[1]
+ vmlal.u32 q9,d27,d1[1]
+
+ vmlal.u32 q5,d27,d4[1]
+ vmlal.u32 q8,d23,d3[1]
+ vmlal.u32 q9,d25,d3[1]
+ vmlal.u32 q6,d29,d4[1]
+ vmlal.u32 q7,d21,d3[1]
+
+ vmlal.u32 q8,d21,d5[1]
+ vmlal.u32 q5,d25,d6[1]
+ vmlal.u32 q9,d23,d5[1]
+ vmlal.u32 q6,d27,d6[1]
+ vmlal.u32 q7,d29,d6[1]
+
+ vmlal.u32 q8,d29,d8[1]
+ vmlal.u32 q5,d23,d8[1]
+ vmlal.u32 q9,d21,d7[1]
+ vmlal.u32 q6,d25,d8[1]
+ vmlal.u32 q7,d27,d8[1]
+
+ vld4.32 {d21,d23,d25,d27},[r4] @ inp[2:3] (or 0)
+ add r4,r4,#64
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ (hash+inp[0:1])*r^4 and accumulate
+
+ vmlal.u32 q8,d26,d0[0]
+ vmlal.u32 q5,d20,d0[0]
+ vmlal.u32 q9,d28,d0[0]
+ vmlal.u32 q6,d22,d0[0]
+ vmlal.u32 q7,d24,d0[0]
+ vld1.32 d8[0],[r6,:32]
+
+ vmlal.u32 q8,d24,d1[0]
+ vmlal.u32 q5,d28,d2[0]
+ vmlal.u32 q9,d26,d1[0]
+ vmlal.u32 q6,d20,d1[0]
+ vmlal.u32 q7,d22,d1[0]
+
+ vmlal.u32 q8,d22,d3[0]
+ vmlal.u32 q5,d26,d4[0]
+ vmlal.u32 q9,d24,d3[0]
+ vmlal.u32 q6,d28,d4[0]
+ vmlal.u32 q7,d20,d3[0]
+
+ vmlal.u32 q8,d20,d5[0]
+ vmlal.u32 q5,d24,d6[0]
+ vmlal.u32 q9,d22,d5[0]
+ vmlal.u32 q6,d26,d6[0]
+ vmlal.u32 q8,d28,d8[0]
+
+ vmlal.u32 q7,d28,d6[0]
+ vmlal.u32 q5,d22,d8[0]
+ vmlal.u32 q9,d20,d7[0]
+ vmov.i32 q14,#1<<24 @ padbit, yes, always
+ vmlal.u32 q6,d24,d8[0]
+ vmlal.u32 q7,d26,d8[0]
+
+ vld4.32 {d20,d22,d24,d26},[r1] @ inp[0:1]
+ add r1,r1,#64
+#ifdef __ARMEB__
+ vrev32.8 q10,q10
+ vrev32.8 q11,q11
+ vrev32.8 q12,q12
+ vrev32.8 q13,q13
+#endif
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ lazy reduction interleaved with base 2^32 -> base 2^26 of
+ @ inp[0:3] previously loaded to q10-q13 and smashed to q10-q14.
+
+ vshr.u64 q15,q8,#26
+ vmovn.i64 d16,q8
+ vshr.u64 q4,q5,#26
+ vmovn.i64 d10,q5
+ vadd.i64 q9,q9,q15 @ h3 -> h4
+ vbic.i32 d16,#0xfc000000
+ vsri.u32 q14,q13,#8 @ base 2^32 -> base 2^26
+ vadd.i64 q6,q6,q4 @ h0 -> h1
+ vshl.u32 q13,q13,#18
+ vbic.i32 d10,#0xfc000000
+
+ vshrn.u64 d30,q9,#26
+ vmovn.i64 d18,q9
+ vshr.u64 q4,q6,#26
+ vmovn.i64 d12,q6
+ vadd.i64 q7,q7,q4 @ h1 -> h2
+ vsri.u32 q13,q12,#14
+ vbic.i32 d18,#0xfc000000
+ vshl.u32 q12,q12,#12
+ vbic.i32 d12,#0xfc000000
+
+ vadd.i32 d10,d10,d30
+ vshl.u32 d30,d30,#2
+ vbic.i32 q13,#0xfc000000
+ vshrn.u64 d8,q7,#26
+ vmovn.i64 d14,q7
+ vaddl.u32 q5,d10,d30 @ h4 -> h0 [widen for a sec]
+ vsri.u32 q12,q11,#20
+ vadd.i32 d16,d16,d8 @ h2 -> h3
+ vshl.u32 q11,q11,#6
+ vbic.i32 d14,#0xfc000000
+ vbic.i32 q12,#0xfc000000
+
+ vshrn.u64 d30,q5,#26 @ re-narrow
+ vmovn.i64 d10,q5
+ vsri.u32 q11,q10,#26
+ vbic.i32 q10,#0xfc000000
+ vshr.u32 d8,d16,#26
+ vbic.i32 d16,#0xfc000000
+ vbic.i32 d10,#0xfc000000
+ vadd.i32 d12,d12,d30 @ h0 -> h1
+ vadd.i32 d18,d18,d8 @ h3 -> h4
+ vbic.i32 q11,#0xfc000000
+
+ bhi .Loop_neon
+
+.Lskip_loop:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
+
+ add r7,r0,#(48+0*9*4)
+ add r6,r0,#(48+1*9*4)
+ adds r2,r2,#32
+ it ne
+ movne r2,#0
+ bne .Long_tail
+
+ vadd.i32 d25,d24,d14 @ add hash value and move to #hi
+ vadd.i32 d21,d20,d10
+ vadd.i32 d27,d26,d16
+ vadd.i32 d23,d22,d12
+ vadd.i32 d29,d28,d18
+
+.Long_tail:
+ vld4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^1
+ vld4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^2
+
+ vadd.i32 d24,d24,d14 @ can be redundant
+ vmull.u32 q7,d25,d0
+ vadd.i32 d20,d20,d10
+ vmull.u32 q5,d21,d0
+ vadd.i32 d26,d26,d16
+ vmull.u32 q8,d27,d0
+ vadd.i32 d22,d22,d12
+ vmull.u32 q6,d23,d0
+ vadd.i32 d28,d28,d18
+ vmull.u32 q9,d29,d0
+
+ vmlal.u32 q5,d29,d2
+ vld4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
+ vmlal.u32 q8,d25,d1
+ vld4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
+ vmlal.u32 q6,d21,d1
+ vmlal.u32 q9,d27,d1
+ vmlal.u32 q7,d23,d1
+
+ vmlal.u32 q8,d23,d3
+ vld1.32 d8[1],[r7,:32]
+ vmlal.u32 q5,d27,d4
+ vld1.32 d8[0],[r6,:32]
+ vmlal.u32 q9,d25,d3
+ vmlal.u32 q6,d29,d4
+ vmlal.u32 q7,d21,d3
+
+ vmlal.u32 q8,d21,d5
+ it ne
+ addne r7,r0,#(48+2*9*4)
+ vmlal.u32 q5,d25,d6
+ it ne
+ addne r6,r0,#(48+3*9*4)
+ vmlal.u32 q9,d23,d5
+ vmlal.u32 q6,d27,d6
+ vmlal.u32 q7,d29,d6
+
+ vmlal.u32 q8,d29,d8
+ vorn q0,q0,q0 @ all-ones, can be redundant
+ vmlal.u32 q5,d23,d8
+ vshr.u64 q0,q0,#38
+ vmlal.u32 q9,d21,d7
+ vmlal.u32 q6,d25,d8
+ vmlal.u32 q7,d27,d8
+
+ beq .Lshort_tail
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ (hash+inp[0:1])*r^4:r^3 and accumulate
+
+ vld4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^3
+ vld4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^4
+
+ vmlal.u32 q7,d24,d0
+ vmlal.u32 q5,d20,d0
+ vmlal.u32 q8,d26,d0
+ vmlal.u32 q6,d22,d0
+ vmlal.u32 q9,d28,d0
+
+ vmlal.u32 q5,d28,d2
+ vld4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
+ vmlal.u32 q8,d24,d1
+ vld4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
+ vmlal.u32 q6,d20,d1
+ vmlal.u32 q9,d26,d1
+ vmlal.u32 q7,d22,d1
+
+ vmlal.u32 q8,d22,d3
+ vld1.32 d8[1],[r7,:32]
+ vmlal.u32 q5,d26,d4
+ vld1.32 d8[0],[r6,:32]
+ vmlal.u32 q9,d24,d3
+ vmlal.u32 q6,d28,d4
+ vmlal.u32 q7,d20,d3
+
+ vmlal.u32 q8,d20,d5
+ vmlal.u32 q5,d24,d6
+ vmlal.u32 q9,d22,d5
+ vmlal.u32 q6,d26,d6
+ vmlal.u32 q7,d28,d6
+
+ vmlal.u32 q8,d28,d8
+ vorn q0,q0,q0 @ all-ones
+ vmlal.u32 q5,d22,d8
+ vshr.u64 q0,q0,#38
+ vmlal.u32 q9,d20,d7
+ vmlal.u32 q6,d24,d8
+ vmlal.u32 q7,d26,d8
+
+.Lshort_tail:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ horizontal addition
+
+ vadd.i64 d16,d16,d17
+ vadd.i64 d10,d10,d11
+ vadd.i64 d18,d18,d19
+ vadd.i64 d12,d12,d13
+ vadd.i64 d14,d14,d15
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ lazy reduction, but without narrowing
+
+ vshr.u64 q15,q8,#26
+ vand.i64 q8,q8,q0
+ vshr.u64 q4,q5,#26
+ vand.i64 q5,q5,q0
+ vadd.i64 q9,q9,q15 @ h3 -> h4
+ vadd.i64 q6,q6,q4 @ h0 -> h1
+
+ vshr.u64 q15,q9,#26
+ vand.i64 q9,q9,q0
+ vshr.u64 q4,q6,#26
+ vand.i64 q6,q6,q0
+ vadd.i64 q7,q7,q4 @ h1 -> h2
+
+ vadd.i64 q5,q5,q15
+ vshl.u64 q15,q15,#2
+ vshr.u64 q4,q7,#26
+ vand.i64 q7,q7,q0
+ vadd.i64 q5,q5,q15 @ h4 -> h0
+ vadd.i64 q8,q8,q4 @ h2 -> h3
+
+ vshr.u64 q15,q5,#26
+ vand.i64 q5,q5,q0
+ vshr.u64 q4,q8,#26
+ vand.i64 q8,q8,q0
+ vadd.i64 q6,q6,q15 @ h0 -> h1
+ vadd.i64 q9,q9,q4 @ h3 -> h4
+
+ cmp r2,#0
+ bne .Leven
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ store hash value
+
+ vst4.32 {d10[0],d12[0],d14[0],d16[0]},[r0]!
+ vst1.32 {d18[0]},[r0]
+
+ vldmia sp!,{d8-d15} @ epilogue
+ ldmia sp!,{r4-r7}
+.Lno_data_neon:
+ bx lr @ bx lr
+ENDPROC(poly1305_blocks_neon)
+
+.align 5
+ENTRY(poly1305_emit_neon)
+ ldr ip,[r0,#36] @ is_base2_26
+
+ stmdb sp!,{r4-r11}
+
+ tst ip,ip
+ beq .Lpoly1305_emit_enter
+
+ ldmia r0,{r3-r7}
+ eor r8,r8,r8
+
+ adds r3,r3,r4,lsl#26 @ base 2^26 -> base 2^32
+ mov r4,r4,lsr#6
+ adcs r4,r4,r5,lsl#20
+ mov r5,r5,lsr#12
+ adcs r5,r5,r6,lsl#14
+ mov r6,r6,lsr#18
+ adcs r6,r6,r7,lsl#8
+ adc r7,r8,r7,lsr#24 @ can be partially reduced ...
+
+ and r8,r7,#-4 @ ... so reduce
+ and r7,r6,#3
+ add r8,r8,r8,lsr#2 @ *= 5
+ adds r3,r3,r8
+ adcs r4,r4,#0
+ adcs r5,r5,#0
+ adcs r6,r6,#0
+ adc r7,r7,#0
+
+ adds r8,r3,#5 @ compare to modulus
+ adcs r9,r4,#0
+ adcs r10,r5,#0
+ adcs r11,r6,#0
+ adc r7,r7,#0
+ tst r7,#4 @ did it carry/borrow?
+
+ it ne
+ movne r3,r8
+ ldr r8,[r2,#0]
+ it ne
+ movne r4,r9
+ ldr r9,[r2,#4]
+ it ne
+ movne r5,r10
+ ldr r10,[r2,#8]
+ it ne
+ movne r6,r11
+ ldr r11,[r2,#12]
+
+ adds r3,r3,r8 @ accumulate nonce
+ adcs r4,r4,r9
+ adcs r5,r5,r10
+ adc r6,r6,r11
+
+#ifdef __ARMEB__
+ rev r3,r3
+ rev r4,r4
+ rev r5,r5
+ rev r6,r6
+#endif
+ str r3,[r1,#0] @ store the result
+ str r4,[r1,#4]
+ str r5,[r1,#8]
+ str r6,[r1,#12]
+
+ ldmia sp!,{r4-r11}
+ bx lr @ bx lr
+ENDPROC(poly1305_emit_neon)
+
+.align 5
+.Lzeros:
+.long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+#endif
diff --git a/lib/zinc/poly1305/poly1305-arm64.S b/lib/zinc/poly1305/poly1305-arm64.S
new file mode 100644
index 000000000000..c20023544183
--- /dev/null
+++ b/lib/zinc/poly1305/poly1305-arm64.S
@@ -0,0 +1,822 @@
+/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ * Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
+ *
+ * This is based in part on Andy Polyakov's implementation from CRYPTOGAMS.
+ */
+
+#include <linux/linkage.h>
+.text
+
+.align 5
+ENTRY(poly1305_init_arm)
+ cmp x1,xzr
+ stp xzr,xzr,[x0] // zero hash value
+ stp xzr,xzr,[x0,#16] // [along with is_base2_26]
+
+ csel x0,xzr,x0,eq
+ b.eq .Lno_key
+
+ ldp x7,x8,[x1] // load key
+ mov x9,#0xfffffffc0fffffff
+ movk x9,#0x0fff,lsl#48
+#ifdef __ARMEB__
+ rev x7,x7 // flip bytes
+ rev x8,x8
+#endif
+ and x7,x7,x9 // &=0ffffffc0fffffff
+ and x9,x9,#-4
+ and x8,x8,x9 // &=0ffffffc0ffffffc
+ stp x7,x8,[x0,#32] // save key value
+
+.Lno_key:
+ ret
+ENDPROC(poly1305_init_arm)
+
+.align 5
+ENTRY(poly1305_blocks_arm)
+ ands x2,x2,#-16
+ b.eq .Lno_data
+
+ ldp x4,x5,[x0] // load hash value
+ ldp x7,x8,[x0,#32] // load key value
+ ldr x6,[x0,#16]
+ add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
+ b .Loop
+
+.align 5
+.Loop:
+ ldp x10,x11,[x1],#16 // load input
+ sub x2,x2,#16
+#ifdef __ARMEB__
+ rev x10,x10
+ rev x11,x11
+#endif
+ adds x4,x4,x10 // accumulate input
+ adcs x5,x5,x11
+
+ mul x12,x4,x7 // h0*r0
+ adc x6,x6,x3
+ umulh x13,x4,x7
+
+ mul x10,x5,x9 // h1*5*r1
+ umulh x11,x5,x9
+
+ adds x12,x12,x10
+ mul x10,x4,x8 // h0*r1
+ adc x13,x13,x11
+ umulh x14,x4,x8
+
+ adds x13,x13,x10
+ mul x10,x5,x7 // h1*r0
+ adc x14,x14,xzr
+ umulh x11,x5,x7
+
+ adds x13,x13,x10
+ mul x10,x6,x9 // h2*5*r1
+ adc x14,x14,x11
+ mul x11,x6,x7 // h2*r0
+
+ adds x13,x13,x10
+ adc x14,x14,x11
+
+ and x10,x14,#-4 // final reduction
+ and x6,x14,#3
+ add x10,x10,x14,lsr#2
+ adds x4,x12,x10
+ adcs x5,x13,xzr
+ adc x6,x6,xzr
+
+ cbnz x2,.Loop
+
+ stp x4,x5,[x0] // store hash value
+ str x6,[x0,#16]
+
+.Lno_data:
+ ret
+ENDPROC(poly1305_blocks_arm)
+
+.align 5
+ENTRY(poly1305_emit_arm)
+ ldp x4,x5,[x0] // load hash base 2^64
+ ldr x6,[x0,#16]
+ ldp x10,x11,[x2] // load nonce
+
+ adds x12,x4,#5 // compare to modulus
+ adcs x13,x5,xzr
+ adc x14,x6,xzr
+
+ tst x14,#-4 // see if it's carried/borrowed
+
+ csel x4,x4,x12,eq
+ csel x5,x5,x13,eq
+
+#ifdef __ARMEB__
+ ror x10,x10,#32 // flip nonce words
+ ror x11,x11,#32
+#endif
+ adds x4,x4,x10 // accumulate nonce
+ adc x5,x5,x11
+#ifdef __ARMEB__
+ rev x4,x4 // flip output bytes
+ rev x5,x5
+#endif
+ stp x4,x5,[x1] // write result
+
+ ret
+ENDPROC(poly1305_emit_arm)
+
+.align 5
+__poly1305_mult:
+ mul x12,x4,x7 // h0*r0
+ umulh x13,x4,x7
+
+ mul x10,x5,x9 // h1*5*r1
+ umulh x11,x5,x9
+
+ adds x12,x12,x10
+ mul x10,x4,x8 // h0*r1
+ adc x13,x13,x11
+ umulh x14,x4,x8
+
+ adds x13,x13,x10
+ mul x10,x5,x7 // h1*r0
+ adc x14,x14,xzr
+ umulh x11,x5,x7
+
+ adds x13,x13,x10
+ mul x10,x6,x9 // h2*5*r1
+ adc x14,x14,x11
+ mul x11,x6,x7 // h2*r0
+
+ adds x13,x13,x10
+ adc x14,x14,x11
+
+ and x10,x14,#-4 // final reduction
+ and x6,x14,#3
+ add x10,x10,x14,lsr#2
+ adds x4,x12,x10
+ adcs x5,x13,xzr
+ adc x6,x6,xzr
+
+ ret
+
+__poly1305_splat:
+ and x12,x4,#0x03ffffff // base 2^64 -> base 2^26
+ ubfx x13,x4,#26,#26
+ extr x14,x5,x4,#52
+ and x14,x14,#0x03ffffff
+ ubfx x15,x5,#14,#26
+ extr x16,x6,x5,#40
+
+ str w12,[x0,#16*0] // r0
+ add w12,w13,w13,lsl#2 // r1*5
+ str w13,[x0,#16*1] // r1
+ add w13,w14,w14,lsl#2 // r2*5
+ str w12,[x0,#16*2] // s1
+ str w14,[x0,#16*3] // r2
+ add w14,w15,w15,lsl#2 // r3*5
+ str w13,[x0,#16*4] // s2
+ str w15,[x0,#16*5] // r3
+ add w15,w16,w16,lsl#2 // r4*5
+ str w14,[x0,#16*6] // s3
+ str w16,[x0,#16*7] // r4
+ str w15,[x0,#16*8] // s4
+
+ ret
+
+.align 5
+ENTRY(poly1305_blocks_neon)
+ ldr x17,[x0,#24]
+ cmp x2,#128
+ b.hs .Lblocks_neon
+ cbz x17,poly1305_blocks_arm
+
+.Lblocks_neon:
+ stp x29,x30,[sp,#-80]!
+ add x29,sp,#0
+
+ ands x2,x2,#-16
+ b.eq .Lno_data_neon
+
+ cbz x17,.Lbase2_64_neon
+
+ ldp w10,w11,[x0] // load hash value base 2^26
+ ldp w12,w13,[x0,#8]
+ ldr w14,[x0,#16]
+
+ tst x2,#31
+ b.eq .Leven_neon
+
+ ldp x7,x8,[x0,#32] // load key value
+
+ add x4,x10,x11,lsl#26 // base 2^26 -> base 2^64
+ lsr x5,x12,#12
+ adds x4,x4,x12,lsl#52
+ add x5,x5,x13,lsl#14
+ adc x5,x5,xzr
+ lsr x6,x14,#24
+ adds x5,x5,x14,lsl#40
+ adc x14,x6,xzr // can be partially reduced...
+
+ ldp x12,x13,[x1],#16 // load input
+ sub x2,x2,#16
+ add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
+
+ and x10,x14,#-4 // ... so reduce
+ and x6,x14,#3
+ add x10,x10,x14,lsr#2
+ adds x4,x4,x10
+ adcs x5,x5,xzr
+ adc x6,x6,xzr
+
+#ifdef __ARMEB__
+ rev x12,x12
+ rev x13,x13
+#endif
+ adds x4,x4,x12 // accumulate input
+ adcs x5,x5,x13
+ adc x6,x6,x3
+
+ bl __poly1305_mult
+ ldr x30,[sp,#8]
+
+ cbz x3,.Lstore_base2_64_neon
+
+ and x10,x4,#0x03ffffff // base 2^64 -> base 2^26
+ ubfx x11,x4,#26,#26
+ extr x12,x5,x4,#52
+ and x12,x12,#0x03ffffff
+ ubfx x13,x5,#14,#26
+ extr x14,x6,x5,#40
+
+ cbnz x2,.Leven_neon
+
+ stp w10,w11,[x0] // store hash value base 2^26
+ stp w12,w13,[x0,#8]
+ str w14,[x0,#16]
+ b .Lno_data_neon
+
+.align 4
+.Lstore_base2_64_neon:
+ stp x4,x5,[x0] // store hash value base 2^64
+ stp x6,xzr,[x0,#16] // note that is_base2_26 is zeroed
+ b .Lno_data_neon
+
+.align 4
+.Lbase2_64_neon:
+ ldp x7,x8,[x0,#32] // load key value
+
+ ldp x4,x5,[x0] // load hash value base 2^64
+ ldr x6,[x0,#16]
+
+ tst x2,#31
+ b.eq .Linit_neon
+
+ ldp x12,x13,[x1],#16 // load input
+ sub x2,x2,#16
+ add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
+#ifdef __ARMEB__
+ rev x12,x12
+ rev x13,x13
+#endif
+ adds x4,x4,x12 // accumulate input
+ adcs x5,x5,x13
+ adc x6,x6,x3
+
+ bl __poly1305_mult
+
+.Linit_neon:
+ and x10,x4,#0x03ffffff // base 2^64 -> base 2^26
+ ubfx x11,x4,#26,#26
+ extr x12,x5,x4,#52
+ and x12,x12,#0x03ffffff
+ ubfx x13,x5,#14,#26
+ extr x14,x6,x5,#40
+
+ stp d8,d9,[sp,#16] // meet ABI requirements
+ stp d10,d11,[sp,#32]
+ stp d12,d13,[sp,#48]
+ stp d14,d15,[sp,#64]
+
+ fmov d24,x10
+ fmov d25,x11
+ fmov d26,x12
+ fmov d27,x13
+ fmov d28,x14
+
+ ////////////////////////////////// initialize r^n table
+ mov x4,x7 // r^1
+ add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
+ mov x5,x8
+ mov x6,xzr
+ add x0,x0,#48+12
+ bl __poly1305_splat
+
+ bl __poly1305_mult // r^2
+ sub x0,x0,#4
+ bl __poly1305_splat
+
+ bl __poly1305_mult // r^3
+ sub x0,x0,#4
+ bl __poly1305_splat
+
+ bl __poly1305_mult // r^4
+ sub x0,x0,#4
+ bl __poly1305_splat
+ ldr x30,[sp,#8]
+
+ add x16,x1,#32
+ adr x17,.Lzeros
+ subs x2,x2,#64
+ csel x16,x17,x16,lo
+
+ mov x4,#1
+ str x4,[x0,#-24] // set is_base2_26
+ sub x0,x0,#48 // restore original x0
+ b .Ldo_neon
+
+.align 4
+.Leven_neon:
+ add x16,x1,#32
+ adr x17,.Lzeros
+ subs x2,x2,#64
+ csel x16,x17,x16,lo
+
+ stp d8,d9,[sp,#16] // meet ABI requirements
+ stp d10,d11,[sp,#32]
+ stp d12,d13,[sp,#48]
+ stp d14,d15,[sp,#64]
+
+ fmov d24,x10
+ fmov d25,x11
+ fmov d26,x12
+ fmov d27,x13
+ fmov d28,x14
+
+.Ldo_neon:
+ ldp x8,x12,[x16],#16 // inp[2:3] (or zero)
+ ldp x9,x13,[x16],#48
+
+ lsl x3,x3,#24
+ add x15,x0,#48
+
+#ifdef __ARMEB__
+ rev x8,x8
+ rev x12,x12
+ rev x9,x9
+ rev x13,x13
+#endif
+ and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
+ and x5,x9,#0x03ffffff
+ ubfx x6,x8,#26,#26
+ ubfx x7,x9,#26,#26
+ add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
+ extr x8,x12,x8,#52
+ extr x9,x13,x9,#52
+ add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
+ fmov d14,x4
+ and x8,x8,#0x03ffffff
+ and x9,x9,#0x03ffffff
+ ubfx x10,x12,#14,#26
+ ubfx x11,x13,#14,#26
+ add x12,x3,x12,lsr#40
+ add x13,x3,x13,lsr#40
+ add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
+ fmov d15,x6
+ add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
+ add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
+ fmov d16,x8
+ fmov d17,x10
+ fmov d18,x12
+
+ ldp x8,x12,[x1],#16 // inp[0:1]
+ ldp x9,x13,[x1],#48
+
+ ld1 {v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64
+ ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64
+ ld1 {v8.4s},[x15]
+
+#ifdef __ARMEB__
+ rev x8,x8
+ rev x12,x12
+ rev x9,x9
+ rev x13,x13
+#endif
+ and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
+ and x5,x9,#0x03ffffff
+ ubfx x6,x8,#26,#26
+ ubfx x7,x9,#26,#26
+ add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
+ extr x8,x12,x8,#52
+ extr x9,x13,x9,#52
+ add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
+ fmov d9,x4
+ and x8,x8,#0x03ffffff
+ and x9,x9,#0x03ffffff
+ ubfx x10,x12,#14,#26
+ ubfx x11,x13,#14,#26
+ add x12,x3,x12,lsr#40
+ add x13,x3,x13,lsr#40
+ add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
+ fmov d10,x6
+ add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
+ add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
+ movi v31.2d,#-1
+ fmov d11,x8
+ fmov d12,x10
+ fmov d13,x12
+ ushr v31.2d,v31.2d,#38
+
+ b.ls .Lskip_loop
+
+.align 4
+.Loop_neon:
+ ////////////////////////////////////////////////////////////////
+ // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
+ // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
+ // ___________________/
+ // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
+ // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
+ // ___________________/ ____________________/
+ //
+ // Note that we start with inp[2:3]*r^2. This is because it
+ // doesn't depend on reduction in previous iteration.
+ ////////////////////////////////////////////////////////////////
+ // d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
+ // d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
+ // d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
+ // d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
+ // d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
+
+ subs x2,x2,#64
+ umull v23.2d,v14.2s,v7.s[2]
+ csel x16,x17,x16,lo
+ umull v22.2d,v14.2s,v5.s[2]
+ umull v21.2d,v14.2s,v3.s[2]
+ ldp x8,x12,[x16],#16 // inp[2:3] (or zero)
+ umull v20.2d,v14.2s,v1.s[2]
+ ldp x9,x13,[x16],#48
+ umull v19.2d,v14.2s,v0.s[2]
+#ifdef __ARMEB__
+ rev x8,x8
+ rev x12,x12
+ rev x9,x9
+ rev x13,x13
+#endif
+
+ umlal v23.2d,v15.2s,v5.s[2]
+ and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
+ umlal v22.2d,v15.2s,v3.s[2]
+ and x5,x9,#0x03ffffff
+ umlal v21.2d,v15.2s,v1.s[2]
+ ubfx x6,x8,#26,#26
+ umlal v20.2d,v15.2s,v0.s[2]
+ ubfx x7,x9,#26,#26
+ umlal v19.2d,v15.2s,v8.s[2]
+ add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
+
+ umlal v23.2d,v16.2s,v3.s[2]
+ extr x8,x12,x8,#52
+ umlal v22.2d,v16.2s,v1.s[2]
+ extr x9,x13,x9,#52
+ umlal v21.2d,v16.2s,v0.s[2]
+ add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
+ umlal v20.2d,v16.2s,v8.s[2]
+ fmov d14,x4
+ umlal v19.2d,v16.2s,v6.s[2]
+ and x8,x8,#0x03ffffff
+
+ umlal v23.2d,v17.2s,v1.s[2]
+ and x9,x9,#0x03ffffff
+ umlal v22.2d,v17.2s,v0.s[2]
+ ubfx x10,x12,#14,#26
+ umlal v21.2d,v17.2s,v8.s[2]
+ ubfx x11,x13,#14,#26
+ umlal v20.2d,v17.2s,v6.s[2]
+ add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
+ umlal v19.2d,v17.2s,v4.s[2]
+ fmov d15,x6
+
+ add v11.2s,v11.2s,v26.2s
+ add x12,x3,x12,lsr#40
+ umlal v23.2d,v18.2s,v0.s[2]
+ add x13,x3,x13,lsr#40
+ umlal v22.2d,v18.2s,v8.s[2]
+ add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
+ umlal v21.2d,v18.2s,v6.s[2]
+ add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
+ umlal v20.2d,v18.2s,v4.s[2]
+ fmov d16,x8
+ umlal v19.2d,v18.2s,v2.s[2]
+ fmov d17,x10
+
+ ////////////////////////////////////////////////////////////////
+ // (hash+inp[0:1])*r^4 and accumulate
+
+ add v9.2s,v9.2s,v24.2s
+ fmov d18,x12
+ umlal v22.2d,v11.2s,v1.s[0]
+ ldp x8,x12,[x1],#16 // inp[0:1]
+ umlal v19.2d,v11.2s,v6.s[0]
+ ldp x9,x13,[x1],#48
+ umlal v23.2d,v11.2s,v3.s[0]
+ umlal v20.2d,v11.2s,v8.s[0]
+ umlal v21.2d,v11.2s,v0.s[0]
+#ifdef __ARMEB__
+ rev x8,x8
+ rev x12,x12
+ rev x9,x9
+ rev x13,x13
+#endif
+
+ add v10.2s,v10.2s,v25.2s
+ umlal v22.2d,v9.2s,v5.s[0]
+ umlal v23.2d,v9.2s,v7.s[0]
+ and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
+ umlal v21.2d,v9.2s,v3.s[0]
+ and x5,x9,#0x03ffffff
+ umlal v19.2d,v9.2s,v0.s[0]
+ ubfx x6,x8,#26,#26
+ umlal v20.2d,v9.2s,v1.s[0]
+ ubfx x7,x9,#26,#26
+
+ add v12.2s,v12.2s,v27.2s
+ add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
+ umlal v22.2d,v10.2s,v3.s[0]
+ extr x8,x12,x8,#52
+ umlal v23.2d,v10.2s,v5.s[0]
+ extr x9,x13,x9,#52
+ umlal v19.2d,v10.2s,v8.s[0]
+ add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
+ umlal v21.2d,v10.2s,v1.s[0]
+ fmov d9,x4
+ umlal v20.2d,v10.2s,v0.s[0]
+ and x8,x8,#0x03ffffff
+
+ add v13.2s,v13.2s,v28.2s
+ and x9,x9,#0x03ffffff
+ umlal v22.2d,v12.2s,v0.s[0]
+ ubfx x10,x12,#14,#26
+ umlal v19.2d,v12.2s,v4.s[0]
+ ubfx x11,x13,#14,#26
+ umlal v23.2d,v12.2s,v1.s[0]
+ add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
+ umlal v20.2d,v12.2s,v6.s[0]
+ fmov d10,x6
+ umlal v21.2d,v12.2s,v8.s[0]
+ add x12,x3,x12,lsr#40
+
+ umlal v22.2d,v13.2s,v8.s[0]
+ add x13,x3,x13,lsr#40
+ umlal v19.2d,v13.2s,v2.s[0]
+ add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
+ umlal v23.2d,v13.2s,v0.s[0]
+ add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
+ umlal v20.2d,v13.2s,v4.s[0]
+ fmov d11,x8
+ umlal v21.2d,v13.2s,v6.s[0]
+ fmov d12,x10
+ fmov d13,x12
+
+ /////////////////////////////////////////////////////////////////
+ // lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
+ // and P. Schwabe
+ //
+ // [see discussion in poly1305-armv4 module]
+
+ ushr v29.2d,v22.2d,#26
+ xtn v27.2s,v22.2d
+ ushr v30.2d,v19.2d,#26
+ and v19.16b,v19.16b,v31.16b
+ add v23.2d,v23.2d,v29.2d // h3 -> h4
+ bic v27.2s,#0xfc,lsl#24 // &=0x03ffffff
+ add v20.2d,v20.2d,v30.2d // h0 -> h1
+
+ ushr v29.2d,v23.2d,#26
+ xtn v28.2s,v23.2d
+ ushr v30.2d,v20.2d,#26
+ xtn v25.2s,v20.2d
+ bic v28.2s,#0xfc,lsl#24
+ add v21.2d,v21.2d,v30.2d // h1 -> h2
+
+ add v19.2d,v19.2d,v29.2d
+ shl v29.2d,v29.2d,#2
+ shrn v30.2s,v21.2d,#26
+ xtn v26.2s,v21.2d
+ add v19.2d,v19.2d,v29.2d // h4 -> h0
+ bic v25.2s,#0xfc,lsl#24
+ add v27.2s,v27.2s,v30.2s // h2 -> h3
+ bic v26.2s,#0xfc,lsl#24
+
+ shrn v29.2s,v19.2d,#26
+ xtn v24.2s,v19.2d
+ ushr v30.2s,v27.2s,#26
+ bic v27.2s,#0xfc,lsl#24
+ bic v24.2s,#0xfc,lsl#24
+ add v25.2s,v25.2s,v29.2s // h0 -> h1
+ add v28.2s,v28.2s,v30.2s // h3 -> h4
+
+ b.hi .Loop_neon
+
+.Lskip_loop:
+ dup v16.2d,v16.d[0]
+ add v11.2s,v11.2s,v26.2s
+
+ ////////////////////////////////////////////////////////////////
+ // multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
+
+ adds x2,x2,#32
+ b.ne .Long_tail
+
+ dup v16.2d,v11.d[0]
+ add v14.2s,v9.2s,v24.2s
+ add v17.2s,v12.2s,v27.2s
+ add v15.2s,v10.2s,v25.2s
+ add v18.2s,v13.2s,v28.2s
+
+.Long_tail:
+ dup v14.2d,v14.d[0]
+ umull2 v19.2d,v16.4s,v6.4s
+ umull2 v22.2d,v16.4s,v1.4s
+ umull2 v23.2d,v16.4s,v3.4s
+ umull2 v21.2d,v16.4s,v0.4s
+ umull2 v20.2d,v16.4s,v8.4s
+
+ dup v15.2d,v15.d[0]
+ umlal2 v19.2d,v14.4s,v0.4s
+ umlal2 v21.2d,v14.4s,v3.4s
+ umlal2 v22.2d,v14.4s,v5.4s
+ umlal2 v23.2d,v14.4s,v7.4s
+ umlal2 v20.2d,v14.4s,v1.4s
+
+ dup v17.2d,v17.d[0]
+ umlal2 v19.2d,v15.4s,v8.4s
+ umlal2 v22.2d,v15.4s,v3.4s
+ umlal2 v21.2d,v15.4s,v1.4s
+ umlal2 v23.2d,v15.4s,v5.4s
+ umlal2 v20.2d,v15.4s,v0.4s
+
+ dup v18.2d,v18.d[0]
+ umlal2 v22.2d,v17.4s,v0.4s
+ umlal2 v23.2d,v17.4s,v1.4s
+ umlal2 v19.2d,v17.4s,v4.4s
+ umlal2 v20.2d,v17.4s,v6.4s
+ umlal2 v21.2d,v17.4s,v8.4s
+
+ umlal2 v22.2d,v18.4s,v8.4s
+ umlal2 v19.2d,v18.4s,v2.4s
+ umlal2 v23.2d,v18.4s,v0.4s
+ umlal2 v20.2d,v18.4s,v4.4s
+ umlal2 v21.2d,v18.4s,v6.4s
+
+ b.eq .Lshort_tail
+
+ ////////////////////////////////////////////////////////////////
+ // (hash+inp[0:1])*r^4:r^3 and accumulate
+
+ add v9.2s,v9.2s,v24.2s
+ umlal v22.2d,v11.2s,v1.2s
+ umlal v19.2d,v11.2s,v6.2s
+ umlal v23.2d,v11.2s,v3.2s
+ umlal v20.2d,v11.2s,v8.2s
+ umlal v21.2d,v11.2s,v0.2s
+
+ add v10.2s,v10.2s,v25.2s
+ umlal v22.2d,v9.2s,v5.2s
+ umlal v19.2d,v9.2s,v0.2s
+ umlal v23.2d,v9.2s,v7.2s
+ umlal v20.2d,v9.2s,v1.2s
+ umlal v21.2d,v9.2s,v3.2s
+
+ add v12.2s,v12.2s,v27.2s
+ umlal v22.2d,v10.2s,v3.2s
+ umlal v19.2d,v10.2s,v8.2s
+ umlal v23.2d,v10.2s,v5.2s
+ umlal v20.2d,v10.2s,v0.2s
+ umlal v21.2d,v10.2s,v1.2s
+
+ add v13.2s,v13.2s,v28.2s
+ umlal v22.2d,v12.2s,v0.2s
+ umlal v19.2d,v12.2s,v4.2s
+ umlal v23.2d,v12.2s,v1.2s
+ umlal v20.2d,v12.2s,v6.2s
+ umlal v21.2d,v12.2s,v8.2s
+
+ umlal v22.2d,v13.2s,v8.2s
+ umlal v19.2d,v13.2s,v2.2s
+ umlal v23.2d,v13.2s,v0.2s
+ umlal v20.2d,v13.2s,v4.2s
+ umlal v21.2d,v13.2s,v6.2s
+
+.Lshort_tail:
+ ////////////////////////////////////////////////////////////////
+ // horizontal add
+
+ addp v22.2d,v22.2d,v22.2d
+ ldp d8,d9,[sp,#16] // meet ABI requirements
+ addp v19.2d,v19.2d,v19.2d
+ ldp d10,d11,[sp,#32]
+ addp v23.2d,v23.2d,v23.2d
+ ldp d12,d13,[sp,#48]
+ addp v20.2d,v20.2d,v20.2d
+ ldp d14,d15,[sp,#64]
+ addp v21.2d,v21.2d,v21.2d
+
+ ////////////////////////////////////////////////////////////////
+ // lazy reduction, but without narrowing
+
+ ushr v29.2d,v22.2d,#26
+ and v22.16b,v22.16b,v31.16b
+ ushr v30.2d,v19.2d,#26
+ and v19.16b,v19.16b,v31.16b
+
+ add v23.2d,v23.2d,v29.2d // h3 -> h4
+ add v20.2d,v20.2d,v30.2d // h0 -> h1
+
+ ushr v29.2d,v23.2d,#26
+ and v23.16b,v23.16b,v31.16b
+ ushr v30.2d,v20.2d,#26
+ and v20.16b,v20.16b,v31.16b
+ add v21.2d,v21.2d,v30.2d // h1 -> h2
+
+ add v19.2d,v19.2d,v29.2d
+ shl v29.2d,v29.2d,#2
+ ushr v30.2d,v21.2d,#26
+ and v21.16b,v21.16b,v31.16b
+ add v19.2d,v19.2d,v29.2d // h4 -> h0
+ add v22.2d,v22.2d,v30.2d // h2 -> h3
+
+ ushr v29.2d,v19.2d,#26
+ and v19.16b,v19.16b,v31.16b
+ ushr v30.2d,v22.2d,#26
+ and v22.16b,v22.16b,v31.16b
+ add v20.2d,v20.2d,v29.2d // h0 -> h1
+ add v23.2d,v23.2d,v30.2d // h3 -> h4
+
+ ////////////////////////////////////////////////////////////////
+ // write the result, can be partially reduced
+
+ st4 {v19.s,v20.s,v21.s,v22.s}[0],[x0],#16
+ st1 {v23.s}[0],[x0]
+
+.Lno_data_neon:
+ ldr x29,[sp],#80
+ ret
+ENDPROC(poly1305_blocks_neon)
+
+.align 5
+ENTRY(poly1305_emit_neon)
+ ldr x17,[x0,#24]
+ cbz x17,poly1305_emit_arm
+
+ ldp w10,w11,[x0] // load hash value base 2^26
+ ldp w12,w13,[x0,#8]
+ ldr w14,[x0,#16]
+
+ add x4,x10,x11,lsl#26 // base 2^26 -> base 2^64
+ lsr x5,x12,#12
+ adds x4,x4,x12,lsl#52
+ add x5,x5,x13,lsl#14
+ adc x5,x5,xzr
+ lsr x6,x14,#24
+ adds x5,x5,x14,lsl#40
+ adc x6,x6,xzr // can be partially reduced...
+
+ ldp x10,x11,[x2] // load nonce
+
+ and x12,x6,#-4 // ... so reduce
+ add x12,x12,x6,lsr#2
+ and x6,x6,#3
+ adds x4,x4,x12
+ adcs x5,x5,xzr
+ adc x6,x6,xzr
+
+ adds x12,x4,#5 // compare to modulus
+ adcs x13,x5,xzr
+ adc x14,x6,xzr
+
+ tst x14,#-4 // see if it's carried/borrowed
+
+ csel x4,x4,x12,eq
+ csel x5,x5,x13,eq
+
+#ifdef __ARMEB__
+ ror x10,x10,#32 // flip nonce words
+ ror x11,x11,#32
+#endif
+ adds x4,x4,x10 // accumulate nonce
+ adc x5,x5,x11
+#ifdef __ARMEB__
+ rev x4,x4 // flip output bytes
+ rev x5,x5
+#endif
+ stp x4,x5,[x1] // write result
+
+ ret
+ENDPROC(poly1305_emit_neon)
+
+.align 5
+.Lzeros:
+.long 0,0,0,0,0,0,0,0
--
2.19.0
^ permalink raw reply related
* [PATCH net-next v4 07/20] zinc: Poly1305 generic C implementations and selftest
From: Jason A. Donenfeld @ 2018-09-14 16:22 UTC (permalink / raw)
To: linux-kernel, netdev, linux-crypto, davem, gregkh
Cc: Jason A. Donenfeld, Samuel Neves, Andy Lutomirski,
Jean-Philippe Aumasson
In-Reply-To: <20180914162240.7925-1-Jason@zx2c4.com>
These two C implementations -- a 32x32 one and a 64x64 one, depending on
the platform -- come from Andrew Moon's public domain poly1305-donna
portable code, modified for usage in the kernel and for usage with
accelerated primitives.
Information: https://cr.yp.to/mac.html
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Cc: Samuel Neves <sneves@dei.uc.pt>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Jean-Philippe Aumasson <jeanphilippe.aumasson@gmail.com>
---
include/zinc/poly1305.h | 38 ++
lib/zinc/Kconfig | 4 +
lib/zinc/Makefile | 4 +
lib/zinc/main.c | 5 +
lib/zinc/poly1305/poly1305-donna32.h | 205 +++++++
lib/zinc/poly1305/poly1305-donna64.h | 182 ++++++
lib/zinc/poly1305/poly1305.c | 131 ++++
lib/zinc/selftest/poly1305.h | 876 +++++++++++++++++++++++++++
8 files changed, 1445 insertions(+)
create mode 100644 include/zinc/poly1305.h
create mode 100644 lib/zinc/poly1305/poly1305-donna32.h
create mode 100644 lib/zinc/poly1305/poly1305-donna64.h
create mode 100644 lib/zinc/poly1305/poly1305.c
create mode 100644 lib/zinc/selftest/poly1305.h
diff --git a/include/zinc/poly1305.h b/include/zinc/poly1305.h
new file mode 100644
index 000000000000..338430c8477a
--- /dev/null
+++ b/include/zinc/poly1305.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#ifndef _ZINC_POLY1305_H
+#define _ZINC_POLY1305_H
+
+#include <linux/simd.h>
+#include <linux/types.h>
+
+enum poly1305_lengths {
+ POLY1305_BLOCK_SIZE = 16,
+ POLY1305_KEY_SIZE = 32,
+ POLY1305_MAC_SIZE = 16
+};
+
+struct poly1305_ctx {
+ u8 opaque[24 * sizeof(u64)];
+ u32 nonce[4];
+ u8 data[POLY1305_BLOCK_SIZE];
+ size_t num;
+} __aligned(8);
+
+void poly1305_fpu_init(void);
+
+void poly1305_init(struct poly1305_ctx *ctx, const u8 key[POLY1305_KEY_SIZE],
+ simd_context_t simd_context);
+void poly1305_update(struct poly1305_ctx *ctx, const u8 *input, size_t len,
+ simd_context_t simd_context);
+void poly1305_final(struct poly1305_ctx *ctx, u8 mac[POLY1305_MAC_SIZE],
+ simd_context_t simd_context);
+
+#ifdef DEBUG
+bool poly1305_selftest(void);
+#endif
+
+#endif /* _ZINC_POLY1305_H */
diff --git a/lib/zinc/Kconfig b/lib/zinc/Kconfig
index e7d396d61607..bc8c61334362 100644
--- a/lib/zinc/Kconfig
+++ b/lib/zinc/Kconfig
@@ -6,6 +6,10 @@ config ZINC_CHACHA20
select ZINC
select CRYPTO_ALGAPI
+config ZINC_POLY1305
+ bool
+ select ZINC
+
config ZINC_DEBUG
bool "Zinc cryptography library debugging and self-tests"
depends on ZINC
diff --git a/lib/zinc/Makefile b/lib/zinc/Makefile
index 9f6a5e65d729..d1e3892e06d9 100644
--- a/lib/zinc/Makefile
+++ b/lib/zinc/Makefile
@@ -23,6 +23,10 @@ CFLAGS_chacha20.o += -include $(srctree)/$(src)/chacha20/chacha20-mips-glue.h
endif
endif
+ifeq ($(CONFIG_ZINC_POLY1305),y)
+zinc-y += poly1305/poly1305.o
+endif
+
zinc-y += main.o
obj-$(CONFIG_ZINC) := zinc.o
diff --git a/lib/zinc/main.c b/lib/zinc/main.c
index 7e8e84b706b7..d871dd406a5c 100644
--- a/lib/zinc/main.c
+++ b/lib/zinc/main.c
@@ -4,6 +4,7 @@
*/
#include <zinc/chacha20.h>
+#include <zinc/poly1305.h>
#include <linux/init.h>
#include <linux/module.h>
@@ -21,6 +22,10 @@ static int __init mod_init(void)
{
#ifdef CONFIG_ZINC_CHACHA20
chacha20_fpu_init();
+#endif
+#ifdef CONFIG_ZINC_POLY1305
+ poly1305_fpu_init();
+ selftest(poly1305);
#endif
return 0;
}
diff --git a/lib/zinc/poly1305/poly1305-donna32.h b/lib/zinc/poly1305/poly1305-donna32.h
new file mode 100644
index 000000000000..dc32123210f9
--- /dev/null
+++ b/lib/zinc/poly1305/poly1305-donna32.h
@@ -0,0 +1,205 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ *
+ * This is based in part on Andrew Moon's poly1305-donna, which is in the
+ * public domain.
+ */
+
+struct poly1305_internal {
+ u32 h[5];
+ u32 r[5];
+ u32 s[4];
+};
+
+static void poly1305_init_generic(void *ctx, const u8 key[16])
+{
+ struct poly1305_internal *st = (struct poly1305_internal *)ctx;
+
+ /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
+ st->r[0] = (get_unaligned_le32(&key[0])) & 0x3ffffff;
+ st->r[1] = (get_unaligned_le32(&key[3]) >> 2) & 0x3ffff03;
+ st->r[2] = (get_unaligned_le32(&key[6]) >> 4) & 0x3ffc0ff;
+ st->r[3] = (get_unaligned_le32(&key[9]) >> 6) & 0x3f03fff;
+ st->r[4] = (get_unaligned_le32(&key[12]) >> 8) & 0x00fffff;
+
+ /* s = 5*r */
+ st->s[0] = st->r[1] * 5;
+ st->s[1] = st->r[2] * 5;
+ st->s[2] = st->r[3] * 5;
+ st->s[3] = st->r[4] * 5;
+
+ /* h = 0 */
+ st->h[0] = 0;
+ st->h[1] = 0;
+ st->h[2] = 0;
+ st->h[3] = 0;
+ st->h[4] = 0;
+}
+
+static void poly1305_blocks_generic(void *ctx, const u8 *input, size_t len,
+ const u32 padbit)
+{
+ struct poly1305_internal *st = (struct poly1305_internal *)ctx;
+ const u32 hibit = padbit << 24;
+ u32 r0, r1, r2, r3, r4;
+ u32 s1, s2, s3, s4;
+ u32 h0, h1, h2, h3, h4;
+ u64 d0, d1, d2, d3, d4;
+ u32 c;
+
+ r0 = st->r[0];
+ r1 = st->r[1];
+ r2 = st->r[2];
+ r3 = st->r[3];
+ r4 = st->r[4];
+
+ s1 = st->s[0];
+ s2 = st->s[1];
+ s3 = st->s[2];
+ s4 = st->s[3];
+
+ h0 = st->h[0];
+ h1 = st->h[1];
+ h2 = st->h[2];
+ h3 = st->h[3];
+ h4 = st->h[4];
+
+ while (len >= POLY1305_BLOCK_SIZE) {
+ /* h += m[i] */
+ h0 += (get_unaligned_le32(&input[0])) & 0x3ffffff;
+ h1 += (get_unaligned_le32(&input[3]) >> 2) & 0x3ffffff;
+ h2 += (get_unaligned_le32(&input[6]) >> 4) & 0x3ffffff;
+ h3 += (get_unaligned_le32(&input[9]) >> 6) & 0x3ffffff;
+ h4 += (get_unaligned_le32(&input[12]) >> 8) | hibit;
+
+ /* h *= r */
+ d0 = ((u64)h0 * r0) + ((u64)h1 * s4) +
+ ((u64)h2 * s3) + ((u64)h3 * s2) +
+ ((u64)h4 * s1);
+ d1 = ((u64)h0 * r1) + ((u64)h1 * r0) +
+ ((u64)h2 * s4) + ((u64)h3 * s3) +
+ ((u64)h4 * s2);
+ d2 = ((u64)h0 * r2) + ((u64)h1 * r1) +
+ ((u64)h2 * r0) + ((u64)h3 * s4) +
+ ((u64)h4 * s3);
+ d3 = ((u64)h0 * r3) + ((u64)h1 * r2) +
+ ((u64)h2 * r1) + ((u64)h3 * r0) +
+ ((u64)h4 * s4);
+ d4 = ((u64)h0 * r4) + ((u64)h1 * r3) +
+ ((u64)h2 * r2) + ((u64)h3 * r1) +
+ ((u64)h4 * r0);
+
+ /* (partial) h %= p */
+ c = (u32)(d0 >> 26);
+ h0 = (u32)d0 & 0x3ffffff;
+ d1 += c;
+ c = (u32)(d1 >> 26);
+ h1 = (u32)d1 & 0x3ffffff;
+ d2 += c;
+ c = (u32)(d2 >> 26);
+ h2 = (u32)d2 & 0x3ffffff;
+ d3 += c;
+ c = (u32)(d3 >> 26);
+ h3 = (u32)d3 & 0x3ffffff;
+ d4 += c;
+ c = (u32)(d4 >> 26);
+ h4 = (u32)d4 & 0x3ffffff;
+ h0 += c * 5;
+ c = (h0 >> 26);
+ h0 = h0 & 0x3ffffff;
+ h1 += c;
+
+ input += POLY1305_BLOCK_SIZE;
+ len -= POLY1305_BLOCK_SIZE;
+ }
+
+ st->h[0] = h0;
+ st->h[1] = h1;
+ st->h[2] = h2;
+ st->h[3] = h3;
+ st->h[4] = h4;
+}
+
+static void poly1305_emit_generic(void *ctx, u8 mac[16], const u32 nonce[4])
+{
+ struct poly1305_internal *st = (struct poly1305_internal *)ctx;
+ u32 h0, h1, h2, h3, h4, c;
+ u32 g0, g1, g2, g3, g4;
+ u64 f;
+ u32 mask;
+
+ /* fully carry h */
+ h0 = st->h[0];
+ h1 = st->h[1];
+ h2 = st->h[2];
+ h3 = st->h[3];
+ h4 = st->h[4];
+
+ c = h1 >> 26;
+ h1 = h1 & 0x3ffffff;
+ h2 += c;
+ c = h2 >> 26;
+ h2 = h2 & 0x3ffffff;
+ h3 += c;
+ c = h3 >> 26;
+ h3 = h3 & 0x3ffffff;
+ h4 += c;
+ c = h4 >> 26;
+ h4 = h4 & 0x3ffffff;
+ h0 += c * 5;
+ c = h0 >> 26;
+ h0 = h0 & 0x3ffffff;
+ h1 += c;
+
+ /* compute h + -p */
+ g0 = h0 + 5;
+ c = g0 >> 26;
+ g0 &= 0x3ffffff;
+ g1 = h1 + c;
+ c = g1 >> 26;
+ g1 &= 0x3ffffff;
+ g2 = h2 + c;
+ c = g2 >> 26;
+ g2 &= 0x3ffffff;
+ g3 = h3 + c;
+ c = g3 >> 26;
+ g3 &= 0x3ffffff;
+ g4 = h4 + c - (1UL << 26);
+
+ /* select h if h < p, or h + -p if h >= p */
+ mask = (g4 >> ((sizeof(u32) * 8) - 1)) - 1;
+ g0 &= mask;
+ g1 &= mask;
+ g2 &= mask;
+ g3 &= mask;
+ g4 &= mask;
+ mask = ~mask;
+
+ h0 = (h0 & mask) | g0;
+ h1 = (h1 & mask) | g1;
+ h2 = (h2 & mask) | g2;
+ h3 = (h3 & mask) | g3;
+ h4 = (h4 & mask) | g4;
+
+ /* h = h % (2^128) */
+ h0 = ((h0) | (h1 << 26)) & 0xffffffff;
+ h1 = ((h1 >> 6) | (h2 << 20)) & 0xffffffff;
+ h2 = ((h2 >> 12) | (h3 << 14)) & 0xffffffff;
+ h3 = ((h3 >> 18) | (h4 << 8)) & 0xffffffff;
+
+ /* mac = (h + nonce) % (2^128) */
+ f = (u64)h0 + nonce[0];
+ h0 = (u32)f;
+ f = (u64)h1 + nonce[1] + (f >> 32);
+ h1 = (u32)f;
+ f = (u64)h2 + nonce[2] + (f >> 32);
+ h2 = (u32)f;
+ f = (u64)h3 + nonce[3] + (f >> 32);
+ h3 = (u32)f;
+
+ put_unaligned_le32(h0, &mac[0]);
+ put_unaligned_le32(h1, &mac[4]);
+ put_unaligned_le32(h2, &mac[8]);
+ put_unaligned_le32(h3, &mac[12]);
+}
diff --git a/lib/zinc/poly1305/poly1305-donna64.h b/lib/zinc/poly1305/poly1305-donna64.h
new file mode 100644
index 000000000000..de7ab1246024
--- /dev/null
+++ b/lib/zinc/poly1305/poly1305-donna64.h
@@ -0,0 +1,182 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ *
+ * This is based in part on Andrew Moon's poly1305-donna, which is in the
+ * public domain.
+ */
+
+typedef __uint128_t u128;
+
+struct poly1305_internal {
+ u64 r[3];
+ u64 h[3];
+ u64 s[2];
+};
+
+static void poly1305_init_generic(void *ctx, const u8 key[16])
+{
+ struct poly1305_internal *st = (struct poly1305_internal *)ctx;
+ u64 t0, t1;
+
+ /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
+ t0 = get_unaligned_le64(&key[0]);
+ t1 = get_unaligned_le64(&key[8]);
+
+ st->r[0] = t0 & 0xffc0fffffff;
+ st->r[1] = ((t0 >> 44) | (t1 << 20)) & 0xfffffc0ffff;
+ st->r[2] = ((t1 >> 24)) & 0x00ffffffc0f;
+
+ /* s = 20*r */
+ st->s[0] = st->r[1] * 20;
+ st->s[1] = st->r[2] * 20;
+
+ /* h = 0 */
+ st->h[0] = 0;
+ st->h[1] = 0;
+ st->h[2] = 0;
+}
+
+static void poly1305_blocks_generic(void *ctx, const u8 *input, size_t len,
+ const u32 padbit)
+{
+ struct poly1305_internal *st = (struct poly1305_internal *)ctx;
+ const u64 hibit = ((u64)padbit) << 40;
+ u64 r0, r1, r2;
+ u64 s1, s2;
+ u64 h0, h1, h2;
+ u64 c;
+ u128 d0, d1, d2, d;
+
+ r0 = st->r[0];
+ r1 = st->r[1];
+ r2 = st->r[2];
+
+ h0 = st->h[0];
+ h1 = st->h[1];
+ h2 = st->h[2];
+
+ s1 = st->s[0];
+ s2 = st->s[1];
+
+ while (len >= POLY1305_BLOCK_SIZE) {
+ u64 t0, t1;
+
+ /* h += m[i] */
+ t0 = get_unaligned_le64(&input[0]);
+ t1 = get_unaligned_le64(&input[8]);
+
+ h0 += t0 & 0xfffffffffff;
+ h1 += ((t0 >> 44) | (t1 << 20)) & 0xfffffffffff;
+ h2 += (((t1 >> 24)) & 0x3ffffffffff) | hibit;
+
+ /* h *= r */
+ d0 = (u128)h0 * r0;
+ d = (u128)h1 * s2;
+ d0 += d;
+ d = (u128)h2 * s1;
+ d0 += d;
+ d1 = (u128)h0 * r1;
+ d = (u128)h1 * r0;
+ d1 += d;
+ d = (u128)h2 * s2;
+ d1 += d;
+ d2 = (u128)h0 * r2;
+ d = (u128)h1 * r1;
+ d2 += d;
+ d = (u128)h2 * r0;
+ d2 += d;
+
+ /* (partial) h %= p */
+ c = (u64)(d0 >> 44);
+ h0 = (u64)d0 & 0xfffffffffff;
+ d1 += c;
+ c = (u64)(d1 >> 44);
+ h1 = (u64)d1 & 0xfffffffffff;
+ d2 += c;
+ c = (u64)(d2 >> 42);
+ h2 = (u64)d2 & 0x3ffffffffff;
+ h0 += c * 5;
+ c = h0 >> 44;
+ h0 = h0 & 0xfffffffffff;
+ h1 += c;
+
+ input += POLY1305_BLOCK_SIZE;
+ len -= POLY1305_BLOCK_SIZE;
+ }
+
+ st->h[0] = h0;
+ st->h[1] = h1;
+ st->h[2] = h2;
+}
+
+static void poly1305_emit_generic(void *ctx, u8 mac[16], const u32 nonce[4])
+{
+ struct poly1305_internal *st = (struct poly1305_internal *)ctx;
+ u64 h0, h1, h2, c;
+ u64 g0, g1, g2;
+ u64 t0, t1;
+
+ /* fully carry h */
+ h0 = st->h[0];
+ h1 = st->h[1];
+ h2 = st->h[2];
+
+ c = h1 >> 44;
+ h1 &= 0xfffffffffff;
+ h2 += c;
+ c = h2 >> 42;
+ h2 &= 0x3ffffffffff;
+ h0 += c * 5;
+ c = h0 >> 44;
+ h0 &= 0xfffffffffff;
+ h1 += c;
+ c = h1 >> 44;
+ h1 &= 0xfffffffffff;
+ h2 += c;
+ c = h2 >> 42;
+ h2 &= 0x3ffffffffff;
+ h0 += c * 5;
+ c = h0 >> 44;
+ h0 &= 0xfffffffffff;
+ h1 += c;
+
+ /* compute h + -p */
+ g0 = h0 + 5;
+ c = g0 >> 44;
+ g0 &= 0xfffffffffff;
+ g1 = h1 + c;
+ c = g1 >> 44;
+ g1 &= 0xfffffffffff;
+ g2 = h2 + c - (1ULL << 42);
+
+ /* select h if h < p, or h + -p if h >= p */
+ c = (g2 >> ((sizeof(u64) * 8) - 1)) - 1;
+ g0 &= c;
+ g1 &= c;
+ g2 &= c;
+ c = ~c;
+ h0 = (h0 & c) | g0;
+ h1 = (h1 & c) | g1;
+ h2 = (h2 & c) | g2;
+
+ /* h = (h + nonce) */
+ t0 = ((u64)nonce[1] << 32) | nonce[0];
+ t1 = ((u64)nonce[3] << 32) | nonce[2];
+
+ h0 += t0 & 0xfffffffffff;
+ c = h0 >> 44;
+ h0 &= 0xfffffffffff;
+ h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffff) + c;
+ c = h1 >> 44;
+ h1 &= 0xfffffffffff;
+ h2 += (((t1 >> 24)) & 0x3ffffffffff) + c;
+ h2 &= 0x3ffffffffff;
+
+ /* mac = h % (2^128) */
+ h0 = h0 | (h1 << 44);
+ h1 = (h1 >> 20) | (h2 << 24);
+
+ put_unaligned_le64(h0, &mac[0]);
+ put_unaligned_le64(h1, &mac[8]);
+}
diff --git a/lib/zinc/poly1305/poly1305.c b/lib/zinc/poly1305/poly1305.c
new file mode 100644
index 000000000000..9a71ac1d4e39
--- /dev/null
+++ b/lib/zinc/poly1305/poly1305.c
@@ -0,0 +1,131 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ *
+ * Implementation of the Poly1305 message authenticator.
+ *
+ * Information: https://cr.yp.to/mac.html
+ */
+
+#include <zinc/poly1305.h>
+
+#include <asm/unaligned.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+
+#ifndef HAVE_POLY1305_ARCH_IMPLEMENTATION
+static inline bool poly1305_init_arch(void *ctx,
+ const u8 key[POLY1305_KEY_SIZE],
+ simd_context_t simd_context)
+{
+ return false;
+}
+static inline bool poly1305_blocks_arch(void *ctx, const u8 *input,
+ const size_t len, const u32 padbit,
+ simd_context_t simd_context)
+{
+ return false;
+}
+static inline bool poly1305_emit_arch(void *ctx, u8 mac[POLY1305_MAC_SIZE],
+ const u32 nonce[4],
+ simd_context_t simd_context)
+{
+ return false;
+}
+void __init poly1305_fpu_init(void)
+{
+}
+#endif
+
+#if defined(CONFIG_ARCH_SUPPORTS_INT128) && defined(__SIZEOF_INT128__)
+#include "poly1305-donna64.h"
+#else
+#include "poly1305-donna32.h"
+#endif
+
+void poly1305_init(struct poly1305_ctx *ctx, const u8 key[POLY1305_KEY_SIZE],
+ simd_context_t simd_context)
+{
+ ctx->nonce[0] = get_unaligned_le32(&key[16]);
+ ctx->nonce[1] = get_unaligned_le32(&key[20]);
+ ctx->nonce[2] = get_unaligned_le32(&key[24]);
+ ctx->nonce[3] = get_unaligned_le32(&key[28]);
+
+ if (!poly1305_init_arch(ctx->opaque, key, simd_context))
+ poly1305_init_generic(ctx->opaque, key);
+
+ ctx->num = 0;
+}
+EXPORT_SYMBOL(poly1305_init);
+
+static inline void poly1305_blocks(void *ctx, const u8 *input, const size_t len,
+ const u32 padbit,
+ simd_context_t simd_context)
+{
+ if (!poly1305_blocks_arch(ctx, input, len, padbit, simd_context))
+ poly1305_blocks_generic(ctx, input, len, padbit);
+}
+
+static inline void poly1305_emit(void *ctx, u8 mac[POLY1305_KEY_SIZE],
+ const u32 nonce[4],
+ simd_context_t simd_context)
+{
+ if (!poly1305_emit_arch(ctx, mac, nonce, simd_context))
+ poly1305_emit_generic(ctx, mac, nonce);
+}
+
+void poly1305_update(struct poly1305_ctx *ctx, const u8 *input, size_t len,
+ simd_context_t simd_context)
+{
+ const size_t num = ctx->num % POLY1305_BLOCK_SIZE;
+ size_t rem;
+
+ if (num) {
+ rem = POLY1305_BLOCK_SIZE - num;
+ if (len < rem) {
+ memcpy(ctx->data + num, input, len);
+ ctx->num = num + len;
+ return;
+ }
+ memcpy(ctx->data + num, input, rem);
+ poly1305_blocks(ctx->opaque, ctx->data, POLY1305_BLOCK_SIZE, 1,
+ simd_context);
+ input += rem;
+ len -= rem;
+ }
+
+ rem = len % POLY1305_BLOCK_SIZE;
+ len -= rem;
+
+ if (len >= POLY1305_BLOCK_SIZE) {
+ poly1305_blocks(ctx->opaque, input, len, 1, simd_context);
+ input += len;
+ }
+
+ if (rem)
+ memcpy(ctx->data, input, rem);
+
+ ctx->num = rem;
+}
+EXPORT_SYMBOL(poly1305_update);
+
+void poly1305_final(struct poly1305_ctx *ctx, u8 mac[POLY1305_MAC_SIZE],
+ simd_context_t simd_context)
+{
+ size_t num = ctx->num % POLY1305_BLOCK_SIZE;
+
+ if (num) {
+ ctx->data[num++] = 1;
+ while (num < POLY1305_BLOCK_SIZE)
+ ctx->data[num++] = 0;
+ poly1305_blocks(ctx->opaque, ctx->data, POLY1305_BLOCK_SIZE, 0,
+ simd_context);
+ }
+
+ poly1305_emit(ctx->opaque, mac, ctx->nonce, simd_context);
+
+ memzero_explicit(ctx, sizeof(*ctx));
+}
+EXPORT_SYMBOL(poly1305_final);
+
+#include "../selftest/poly1305.h"
diff --git a/lib/zinc/selftest/poly1305.h b/lib/zinc/selftest/poly1305.h
new file mode 100644
index 000000000000..8138a9399aed
--- /dev/null
+++ b/lib/zinc/selftest/poly1305.h
@@ -0,0 +1,876 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#ifdef DEBUG
+struct poly1305_testvec {
+ u8 input[600];
+ u8 output[POLY1305_MAC_SIZE];
+ u8 key[POLY1305_KEY_SIZE];
+ size_t ilen;
+};
+
+static const struct poly1305_testvec poly1305_testvecs[] __initconst = {
+{ /* RFC7539 */
+ .input = { 0x43, 0x72, 0x79, 0x70, 0x74, 0x6f, 0x67, 0x72,
+ 0x61, 0x70, 0x68, 0x69, 0x63, 0x20, 0x46, 0x6f,
+ 0x72, 0x75, 0x6d, 0x20, 0x52, 0x65, 0x73, 0x65,
+ 0x61, 0x72, 0x63, 0x68, 0x20, 0x47, 0x72, 0x6f,
+ 0x75, 0x70 },
+ .ilen = 34,
+ .output = { 0xa8, 0x06, 0x1d, 0xc1, 0x30, 0x51, 0x36, 0xc6,
+ 0xc2, 0x2b, 0x8b, 0xaf, 0x0c, 0x01, 0x27, 0xa9 },
+ .key = { 0x85, 0xd6, 0xbe, 0x78, 0x57, 0x55, 0x6d, 0x33,
+ 0x7f, 0x44, 0x52, 0xfe, 0x42, 0xd5, 0x06, 0xa8,
+ 0x01, 0x03, 0x80, 0x8a, 0xfb, 0x0d, 0xb2, 0xfd,
+ 0x4a, 0xbf, 0xf6, 0xaf, 0x41, 0x49, 0xf5, 0x1b },
+}, { /* "The Poly1305-AES message-authentication code" */
+ .input = { 0xf3, 0xf6 },
+ .ilen = 2,
+ .output = { 0xf4, 0xc6, 0x33, 0xc3, 0x04, 0x4f, 0xc1, 0x45,
+ 0xf8, 0x4f, 0x33, 0x5c, 0xb8, 0x19, 0x53, 0xde },
+ .key = { 0x85, 0x1f, 0xc4, 0x0c, 0x34, 0x67, 0xac, 0x0b,
+ 0xe0, 0x5c, 0xc2, 0x04, 0x04, 0xf3, 0xf7, 0x00,
+ 0x58, 0x0b, 0x3b, 0x0f, 0x94, 0x47, 0xbb, 0x1e,
+ 0x69, 0xd0, 0x95, 0xb5, 0x92, 0x8b, 0x6d, 0xbc },
+}, {
+ .input = "",
+ .ilen = 0,
+ .output = { 0xdd, 0x3f, 0xab, 0x22, 0x51, 0xf1, 0x1a, 0xc7,
+ 0x59, 0xf0, 0x88, 0x71, 0x29, 0xcc, 0x2e, 0xe7 },
+ .key = { 0xa0, 0xf3, 0x08, 0x00, 0x00, 0xf4, 0x64, 0x00,
+ 0xd0, 0xc7, 0xe9, 0x07, 0x6c, 0x83, 0x44, 0x03,
+ 0xdd, 0x3f, 0xab, 0x22, 0x51, 0xf1, 0x1a, 0xc7,
+ 0x59, 0xf0, 0x88, 0x71, 0x29, 0xcc, 0x2e, 0xe7 },
+}, {
+ .input = { 0x66, 0x3c, 0xea, 0x19, 0x0f, 0xfb, 0x83, 0xd8,
+ 0x95, 0x93, 0xf3, 0xf4, 0x76, 0xb6, 0xbc, 0x24,
+ 0xd7, 0xe6, 0x79, 0x10, 0x7e, 0xa2, 0x6a, 0xdb,
+ 0x8c, 0xaf, 0x66, 0x52, 0xd0, 0x65, 0x61, 0x36 },
+ .ilen = 32,
+ .output = { 0x0e, 0xe1, 0xc1, 0x6b, 0xb7, 0x3f, 0x0f, 0x4f,
+ 0xd1, 0x98, 0x81, 0x75, 0x3c, 0x01, 0xcd, 0xbe },
+ .key = { 0x48, 0x44, 0x3d, 0x0b, 0xb0, 0xd2, 0x11, 0x09,
+ 0xc8, 0x9a, 0x10, 0x0b, 0x5c, 0xe2, 0xc2, 0x08,
+ 0x83, 0x14, 0x9c, 0x69, 0xb5, 0x61, 0xdd, 0x88,
+ 0x29, 0x8a, 0x17, 0x98, 0xb1, 0x07, 0x16, 0xef },
+}, {
+ .input = { 0xab, 0x08, 0x12, 0x72, 0x4a, 0x7f, 0x1e, 0x34,
+ 0x27, 0x42, 0xcb, 0xed, 0x37, 0x4d, 0x94, 0xd1,
+ 0x36, 0xc6, 0xb8, 0x79, 0x5d, 0x45, 0xb3, 0x81,
+ 0x98, 0x30, 0xf2, 0xc0, 0x44, 0x91, 0xfa, 0xf0,
+ 0x99, 0x0c, 0x62, 0xe4, 0x8b, 0x80, 0x18, 0xb2,
+ 0xc3, 0xe4, 0xa0, 0xfa, 0x31, 0x34, 0xcb, 0x67,
+ 0xfa, 0x83, 0xe1, 0x58, 0xc9, 0x94, 0xd9, 0x61,
+ 0xc4, 0xcb, 0x21, 0x09, 0x5c, 0x1b, 0xf9 },
+ .ilen = 63,
+ .output = { 0x51, 0x54, 0xad, 0x0d, 0x2c, 0xb2, 0x6e, 0x01,
+ 0x27, 0x4f, 0xc5, 0x11, 0x48, 0x49, 0x1f, 0x1b },
+ .key = { 0x12, 0x97, 0x6a, 0x08, 0xc4, 0x42, 0x6d, 0x0c,
+ 0xe8, 0xa8, 0x24, 0x07, 0xc4, 0xf4, 0x82, 0x07,
+ 0x80, 0xf8, 0xc2, 0x0a, 0xa7, 0x12, 0x02, 0xd1,
+ 0xe2, 0x91, 0x79, 0xcb, 0xcb, 0x55, 0x5a, 0x57 },
+}, { /* self-generated vectors exercise "significant" lengths, such that they
+ * are handled by different code paths */
+ .input = { 0xab, 0x08, 0x12, 0x72, 0x4a, 0x7f, 0x1e, 0x34,
+ 0x27, 0x42, 0xcb, 0xed, 0x37, 0x4d, 0x94, 0xd1,
+ 0x36, 0xc6, 0xb8, 0x79, 0x5d, 0x45, 0xb3, 0x81,
+ 0x98, 0x30, 0xf2, 0xc0, 0x44, 0x91, 0xfa, 0xf0,
+ 0x99, 0x0c, 0x62, 0xe4, 0x8b, 0x80, 0x18, 0xb2,
+ 0xc3, 0xe4, 0xa0, 0xfa, 0x31, 0x34, 0xcb, 0x67,
+ 0xfa, 0x83, 0xe1, 0x58, 0xc9, 0x94, 0xd9, 0x61,
+ 0xc4, 0xcb, 0x21, 0x09, 0x5c, 0x1b, 0xf9, 0xaf },
+ .ilen = 64,
+ .output = { 0x81, 0x20, 0x59, 0xa5, 0xda, 0x19, 0x86, 0x37,
+ 0xca, 0xc7, 0xc4, 0xa6, 0x31, 0xbe, 0xe4, 0x66 },
+ .key = { 0x12, 0x97, 0x6a, 0x08, 0xc4, 0x42, 0x6d, 0x0c,
+ 0xe8, 0xa8, 0x24, 0x07, 0xc4, 0xf4, 0x82, 0x07,
+ 0x80, 0xf8, 0xc2, 0x0a, 0xa7, 0x12, 0x02, 0xd1,
+ 0xe2, 0x91, 0x79, 0xcb, 0xcb, 0x55, 0x5a, 0x57 },
+}, {
+ .input = { 0xab, 0x08, 0x12, 0x72, 0x4a, 0x7f, 0x1e, 0x34,
+ 0x27, 0x42, 0xcb, 0xed, 0x37, 0x4d, 0x94, 0xd1,
+ 0x36, 0xc6, 0xb8, 0x79, 0x5d, 0x45, 0xb3, 0x81,
+ 0x98, 0x30, 0xf2, 0xc0, 0x44, 0x91, 0xfa, 0xf0,
+ 0x99, 0x0c, 0x62, 0xe4, 0x8b, 0x80, 0x18, 0xb2,
+ 0xc3, 0xe4, 0xa0, 0xfa, 0x31, 0x34, 0xcb, 0x67 },
+ .ilen = 48,
+ .output = { 0x5b, 0x88, 0xd7, 0xf6, 0x22, 0x8b, 0x11, 0xe2,
+ 0xe2, 0x85, 0x79, 0xa5, 0xc0, 0xc1, 0xf7, 0x61 },
+ .key = { 0x12, 0x97, 0x6a, 0x08, 0xc4, 0x42, 0x6d, 0x0c,
+ 0xe8, 0xa8, 0x24, 0x07, 0xc4, 0xf4, 0x82, 0x07,
+ 0x80, 0xf8, 0xc2, 0x0a, 0xa7, 0x12, 0x02, 0xd1,
+ 0xe2, 0x91, 0x79, 0xcb, 0xcb, 0x55, 0x5a, 0x57 },
+}, {
+ .input = { 0xab, 0x08, 0x12, 0x72, 0x4a, 0x7f, 0x1e, 0x34,
+ 0x27, 0x42, 0xcb, 0xed, 0x37, 0x4d, 0x94, 0xd1,
+ 0x36, 0xc6, 0xb8, 0x79, 0x5d, 0x45, 0xb3, 0x81,
+ 0x98, 0x30, 0xf2, 0xc0, 0x44, 0x91, 0xfa, 0xf0,
+ 0x99, 0x0c, 0x62, 0xe4, 0x8b, 0x80, 0x18, 0xb2,
+ 0xc3, 0xe4, 0xa0, 0xfa, 0x31, 0x34, 0xcb, 0x67,
+ 0xfa, 0x83, 0xe1, 0x58, 0xc9, 0x94, 0xd9, 0x61,
+ 0xc4, 0xcb, 0x21, 0x09, 0x5c, 0x1b, 0xf9, 0xaf,
+ 0x66, 0x3c, 0xea, 0x19, 0x0f, 0xfb, 0x83, 0xd8,
+ 0x95, 0x93, 0xf3, 0xf4, 0x76, 0xb6, 0xbc, 0x24,
+ 0xd7, 0xe6, 0x79, 0x10, 0x7e, 0xa2, 0x6a, 0xdb,
+ 0x8c, 0xaf, 0x66, 0x52, 0xd0, 0x65, 0x61, 0x36 },
+ .ilen = 96,
+ .output = { 0xbb, 0xb6, 0x13, 0xb2, 0xb6, 0xd7, 0x53, 0xba,
+ 0x07, 0x39, 0x5b, 0x91, 0x6a, 0xae, 0xce, 0x15 },
+ .key = { 0x12, 0x97, 0x6a, 0x08, 0xc4, 0x42, 0x6d, 0x0c,
+ 0xe8, 0xa8, 0x24, 0x07, 0xc4, 0xf4, 0x82, 0x07,
+ 0x80, 0xf8, 0xc2, 0x0a, 0xa7, 0x12, 0x02, 0xd1,
+ 0xe2, 0x91, 0x79, 0xcb, 0xcb, 0x55, 0x5a, 0x57 },
+}, {
+ .input = { 0xab, 0x08, 0x12, 0x72, 0x4a, 0x7f, 0x1e, 0x34,
+ 0x27, 0x42, 0xcb, 0xed, 0x37, 0x4d, 0x94, 0xd1,
+ 0x36, 0xc6, 0xb8, 0x79, 0x5d, 0x45, 0xb3, 0x81,
+ 0x98, 0x30, 0xf2, 0xc0, 0x44, 0x91, 0xfa, 0xf0,
+ 0x99, 0x0c, 0x62, 0xe4, 0x8b, 0x80, 0x18, 0xb2,
+ 0xc3, 0xe4, 0xa0, 0xfa, 0x31, 0x34, 0xcb, 0x67,
+ 0xfa, 0x83, 0xe1, 0x58, 0xc9, 0x94, 0xd9, 0x61,
+ 0xc4, 0xcb, 0x21, 0x09, 0x5c, 0x1b, 0xf9, 0xaf,
+ 0x48, 0x44, 0x3d, 0x0b, 0xb0, 0xd2, 0x11, 0x09,
+ 0xc8, 0x9a, 0x10, 0x0b, 0x5c, 0xe2, 0xc2, 0x08,
+ 0x83, 0x14, 0x9c, 0x69, 0xb5, 0x61, 0xdd, 0x88,
+ 0x29, 0x8a, 0x17, 0x98, 0xb1, 0x07, 0x16, 0xef,
+ 0x66, 0x3c, 0xea, 0x19, 0x0f, 0xfb, 0x83, 0xd8,
+ 0x95, 0x93, 0xf3, 0xf4, 0x76, 0xb6, 0xbc, 0x24 },
+ .ilen = 112,
+ .output = { 0xc7, 0x94, 0xd7, 0x05, 0x7d, 0x17, 0x78, 0xc4,
+ 0xbb, 0xee, 0x0a, 0x39, 0xb3, 0xd9, 0x73, 0x42 },
+ .key = { 0x12, 0x97, 0x6a, 0x08, 0xc4, 0x42, 0x6d, 0x0c,
+ 0xe8, 0xa8, 0x24, 0x07, 0xc4, 0xf4, 0x82, 0x07,
+ 0x80, 0xf8, 0xc2, 0x0a, 0xa7, 0x12, 0x02, 0xd1,
+ 0xe2, 0x91, 0x79, 0xcb, 0xcb, 0x55, 0x5a, 0x57 },
+}, {
+ .input = { 0xab, 0x08, 0x12, 0x72, 0x4a, 0x7f, 0x1e, 0x34,
+ 0x27, 0x42, 0xcb, 0xed, 0x37, 0x4d, 0x94, 0xd1,
+ 0x36, 0xc6, 0xb8, 0x79, 0x5d, 0x45, 0xb3, 0x81,
+ 0x98, 0x30, 0xf2, 0xc0, 0x44, 0x91, 0xfa, 0xf0,
+ 0x99, 0x0c, 0x62, 0xe4, 0x8b, 0x80, 0x18, 0xb2,
+ 0xc3, 0xe4, 0xa0, 0xfa, 0x31, 0x34, 0xcb, 0x67,
+ 0xfa, 0x83, 0xe1, 0x58, 0xc9, 0x94, 0xd9, 0x61,
+ 0xc4, 0xcb, 0x21, 0x09, 0x5c, 0x1b, 0xf9, 0xaf,
+ 0x48, 0x44, 0x3d, 0x0b, 0xb0, 0xd2, 0x11, 0x09,
+ 0xc8, 0x9a, 0x10, 0x0b, 0x5c, 0xe2, 0xc2, 0x08,
+ 0x83, 0x14, 0x9c, 0x69, 0xb5, 0x61, 0xdd, 0x88,
+ 0x29, 0x8a, 0x17, 0x98, 0xb1, 0x07, 0x16, 0xef,
+ 0x66, 0x3c, 0xea, 0x19, 0x0f, 0xfb, 0x83, 0xd8,
+ 0x95, 0x93, 0xf3, 0xf4, 0x76, 0xb6, 0xbc, 0x24,
+ 0xd7, 0xe6, 0x79, 0x10, 0x7e, 0xa2, 0x6a, 0xdb,
+ 0x8c, 0xaf, 0x66, 0x52, 0xd0, 0x65, 0x61, 0x36 },
+ .ilen = 128,
+ .output = { 0xff, 0xbc, 0xb9, 0xb3, 0x71, 0x42, 0x31, 0x52,
+ 0xd7, 0xfc, 0xa5, 0xad, 0x04, 0x2f, 0xba, 0xa9 },
+ .key = { 0x12, 0x97, 0x6a, 0x08, 0xc4, 0x42, 0x6d, 0x0c,
+ 0xe8, 0xa8, 0x24, 0x07, 0xc4, 0xf4, 0x82, 0x07,
+ 0x80, 0xf8, 0xc2, 0x0a, 0xa7, 0x12, 0x02, 0xd1,
+ 0xe2, 0x91, 0x79, 0xcb, 0xcb, 0x55, 0x5a, 0x57 },
+}, {
+ .input = { 0xab, 0x08, 0x12, 0x72, 0x4a, 0x7f, 0x1e, 0x34,
+ 0x27, 0x42, 0xcb, 0xed, 0x37, 0x4d, 0x94, 0xd1,
+ 0x36, 0xc6, 0xb8, 0x79, 0x5d, 0x45, 0xb3, 0x81,
+ 0x98, 0x30, 0xf2, 0xc0, 0x44, 0x91, 0xfa, 0xf0,
+ 0x99, 0x0c, 0x62, 0xe4, 0x8b, 0x80, 0x18, 0xb2,
+ 0xc3, 0xe4, 0xa0, 0xfa, 0x31, 0x34, 0xcb, 0x67,
+ 0xfa, 0x83, 0xe1, 0x58, 0xc9, 0x94, 0xd9, 0x61,
+ 0xc4, 0xcb, 0x21, 0x09, 0x5c, 0x1b, 0xf9, 0xaf,
+ 0x48, 0x44, 0x3d, 0x0b, 0xb0, 0xd2, 0x11, 0x09,
+ 0xc8, 0x9a, 0x10, 0x0b, 0x5c, 0xe2, 0xc2, 0x08,
+ 0x83, 0x14, 0x9c, 0x69, 0xb5, 0x61, 0xdd, 0x88,
+ 0x29, 0x8a, 0x17, 0x98, 0xb1, 0x07, 0x16, 0xef,
+ 0x66, 0x3c, 0xea, 0x19, 0x0f, 0xfb, 0x83, 0xd8,
+ 0x95, 0x93, 0xf3, 0xf4, 0x76, 0xb6, 0xbc, 0x24,
+ 0xd7, 0xe6, 0x79, 0x10, 0x7e, 0xa2, 0x6a, 0xdb,
+ 0x8c, 0xaf, 0x66, 0x52, 0xd0, 0x65, 0x61, 0x36,
+ 0x81, 0x20, 0x59, 0xa5, 0xda, 0x19, 0x86, 0x37,
+ 0xca, 0xc7, 0xc4, 0xa6, 0x31, 0xbe, 0xe4, 0x66 },
+ .ilen = 144,
+ .output = { 0x06, 0x9e, 0xd6, 0xb8, 0xef, 0x0f, 0x20, 0x7b,
+ 0x3e, 0x24, 0x3b, 0xb1, 0x01, 0x9f, 0xe6, 0x32 },
+ .key = { 0x12, 0x97, 0x6a, 0x08, 0xc4, 0x42, 0x6d, 0x0c,
+ 0xe8, 0xa8, 0x24, 0x07, 0xc4, 0xf4, 0x82, 0x07,
+ 0x80, 0xf8, 0xc2, 0x0a, 0xa7, 0x12, 0x02, 0xd1,
+ 0xe2, 0x91, 0x79, 0xcb, 0xcb, 0x55, 0x5a, 0x57 },
+}, {
+ .input = { 0xab, 0x08, 0x12, 0x72, 0x4a, 0x7f, 0x1e, 0x34,
+ 0x27, 0x42, 0xcb, 0xed, 0x37, 0x4d, 0x94, 0xd1,
+ 0x36, 0xc6, 0xb8, 0x79, 0x5d, 0x45, 0xb3, 0x81,
+ 0x98, 0x30, 0xf2, 0xc0, 0x44, 0x91, 0xfa, 0xf0,
+ 0x99, 0x0c, 0x62, 0xe4, 0x8b, 0x80, 0x18, 0xb2,
+ 0xc3, 0xe4, 0xa0, 0xfa, 0x31, 0x34, 0xcb, 0x67,
+ 0xfa, 0x83, 0xe1, 0x58, 0xc9, 0x94, 0xd9, 0x61,
+ 0xc4, 0xcb, 0x21, 0x09, 0x5c, 0x1b, 0xf9, 0xaf,
+ 0x48, 0x44, 0x3d, 0x0b, 0xb0, 0xd2, 0x11, 0x09,
+ 0xc8, 0x9a, 0x10, 0x0b, 0x5c, 0xe2, 0xc2, 0x08,
+ 0x83, 0x14, 0x9c, 0x69, 0xb5, 0x61, 0xdd, 0x88,
+ 0x29, 0x8a, 0x17, 0x98, 0xb1, 0x07, 0x16, 0xef,
+ 0x66, 0x3c, 0xea, 0x19, 0x0f, 0xfb, 0x83, 0xd8,
+ 0x95, 0x93, 0xf3, 0xf4, 0x76, 0xb6, 0xbc, 0x24,
+ 0xd7, 0xe6, 0x79, 0x10, 0x7e, 0xa2, 0x6a, 0xdb,
+ 0x8c, 0xaf, 0x66, 0x52, 0xd0, 0x65, 0x61, 0x36,
+ 0x81, 0x20, 0x59, 0xa5, 0xda, 0x19, 0x86, 0x37,
+ 0xca, 0xc7, 0xc4, 0xa6, 0x31, 0xbe, 0xe4, 0x66,
+ 0x5b, 0x88, 0xd7, 0xf6, 0x22, 0x8b, 0x11, 0xe2,
+ 0xe2, 0x85, 0x79, 0xa5, 0xc0, 0xc1, 0xf7, 0x61 },
+ .ilen = 160,
+ .output = { 0xcc, 0xa3, 0x39, 0xd9, 0xa4, 0x5f, 0xa2, 0x36,
+ 0x8c, 0x2c, 0x68, 0xb3, 0xa4, 0x17, 0x91, 0x33 },
+ .key = { 0x12, 0x97, 0x6a, 0x08, 0xc4, 0x42, 0x6d, 0x0c,
+ 0xe8, 0xa8, 0x24, 0x07, 0xc4, 0xf4, 0x82, 0x07,
+ 0x80, 0xf8, 0xc2, 0x0a, 0xa7, 0x12, 0x02, 0xd1,
+ 0xe2, 0x91, 0x79, 0xcb, 0xcb, 0x55, 0x5a, 0x57 },
+}, {
+ .input = { 0xab, 0x08, 0x12, 0x72, 0x4a, 0x7f, 0x1e, 0x34,
+ 0x27, 0x42, 0xcb, 0xed, 0x37, 0x4d, 0x94, 0xd1,
+ 0x36, 0xc6, 0xb8, 0x79, 0x5d, 0x45, 0xb3, 0x81,
+ 0x98, 0x30, 0xf2, 0xc0, 0x44, 0x91, 0xfa, 0xf0,
+ 0x99, 0x0c, 0x62, 0xe4, 0x8b, 0x80, 0x18, 0xb2,
+ 0xc3, 0xe4, 0xa0, 0xfa, 0x31, 0x34, 0xcb, 0x67,
+ 0xfa, 0x83, 0xe1, 0x58, 0xc9, 0x94, 0xd9, 0x61,
+ 0xc4, 0xcb, 0x21, 0x09, 0x5c, 0x1b, 0xf9, 0xaf,
+ 0x48, 0x44, 0x3d, 0x0b, 0xb0, 0xd2, 0x11, 0x09,
+ 0xc8, 0x9a, 0x10, 0x0b, 0x5c, 0xe2, 0xc2, 0x08,
+ 0x83, 0x14, 0x9c, 0x69, 0xb5, 0x61, 0xdd, 0x88,
+ 0x29, 0x8a, 0x17, 0x98, 0xb1, 0x07, 0x16, 0xef,
+ 0x66, 0x3c, 0xea, 0x19, 0x0f, 0xfb, 0x83, 0xd8,
+ 0x95, 0x93, 0xf3, 0xf4, 0x76, 0xb6, 0xbc, 0x24,
+ 0xd7, 0xe6, 0x79, 0x10, 0x7e, 0xa2, 0x6a, 0xdb,
+ 0x8c, 0xaf, 0x66, 0x52, 0xd0, 0x65, 0x61, 0x36,
+ 0x81, 0x20, 0x59, 0xa5, 0xda, 0x19, 0x86, 0x37,
+ 0xca, 0xc7, 0xc4, 0xa6, 0x31, 0xbe, 0xe4, 0x66,
+ 0x5b, 0x88, 0xd7, 0xf6, 0x22, 0x8b, 0x11, 0xe2,
+ 0xe2, 0x85, 0x79, 0xa5, 0xc0, 0xc1, 0xf7, 0x61,
+ 0xab, 0x08, 0x12, 0x72, 0x4a, 0x7f, 0x1e, 0x34,
+ 0x27, 0x42, 0xcb, 0xed, 0x37, 0x4d, 0x94, 0xd1,
+ 0x36, 0xc6, 0xb8, 0x79, 0x5d, 0x45, 0xb3, 0x81,
+ 0x98, 0x30, 0xf2, 0xc0, 0x44, 0x91, 0xfa, 0xf0,
+ 0x99, 0x0c, 0x62, 0xe4, 0x8b, 0x80, 0x18, 0xb2,
+ 0xc3, 0xe4, 0xa0, 0xfa, 0x31, 0x34, 0xcb, 0x67,
+ 0xfa, 0x83, 0xe1, 0x58, 0xc9, 0x94, 0xd9, 0x61,
+ 0xc4, 0xcb, 0x21, 0x09, 0x5c, 0x1b, 0xf9, 0xaf,
+ 0x48, 0x44, 0x3d, 0x0b, 0xb0, 0xd2, 0x11, 0x09,
+ 0xc8, 0x9a, 0x10, 0x0b, 0x5c, 0xe2, 0xc2, 0x08,
+ 0x83, 0x14, 0x9c, 0x69, 0xb5, 0x61, 0xdd, 0x88,
+ 0x29, 0x8a, 0x17, 0x98, 0xb1, 0x07, 0x16, 0xef,
+ 0x66, 0x3c, 0xea, 0x19, 0x0f, 0xfb, 0x83, 0xd8,
+ 0x95, 0x93, 0xf3, 0xf4, 0x76, 0xb6, 0xbc, 0x24,
+ 0xd7, 0xe6, 0x79, 0x10, 0x7e, 0xa2, 0x6a, 0xdb,
+ 0x8c, 0xaf, 0x66, 0x52, 0xd0, 0x65, 0x61, 0x36 },
+ .ilen = 288,
+ .output = { 0x53, 0xf6, 0xe8, 0x28, 0xa2, 0xf0, 0xfe, 0x0e,
+ 0xe8, 0x15, 0xbf, 0x0b, 0xd5, 0x84, 0x1a, 0x34 },
+ .key = { 0x12, 0x97, 0x6a, 0x08, 0xc4, 0x42, 0x6d, 0x0c,
+ 0xe8, 0xa8, 0x24, 0x07, 0xc4, 0xf4, 0x82, 0x07,
+ 0x80, 0xf8, 0xc2, 0x0a, 0xa7, 0x12, 0x02, 0xd1,
+ 0xe2, 0x91, 0x79, 0xcb, 0xcb, 0x55, 0x5a, 0x57 },
+}, {
+ .input = { 0xab, 0x08, 0x12, 0x72, 0x4a, 0x7f, 0x1e, 0x34,
+ 0x27, 0x42, 0xcb, 0xed, 0x37, 0x4d, 0x94, 0xd1,
+ 0x36, 0xc6, 0xb8, 0x79, 0x5d, 0x45, 0xb3, 0x81,
+ 0x98, 0x30, 0xf2, 0xc0, 0x44, 0x91, 0xfa, 0xf0,
+ 0x99, 0x0c, 0x62, 0xe4, 0x8b, 0x80, 0x18, 0xb2,
+ 0xc3, 0xe4, 0xa0, 0xfa, 0x31, 0x34, 0xcb, 0x67,
+ 0xfa, 0x83, 0xe1, 0x58, 0xc9, 0x94, 0xd9, 0x61,
+ 0xc4, 0xcb, 0x21, 0x09, 0x5c, 0x1b, 0xf9, 0xaf,
+ 0x48, 0x44, 0x3d, 0x0b, 0xb0, 0xd2, 0x11, 0x09,
+ 0xc8, 0x9a, 0x10, 0x0b, 0x5c, 0xe2, 0xc2, 0x08,
+ 0x83, 0x14, 0x9c, 0x69, 0xb5, 0x61, 0xdd, 0x88,
+ 0x29, 0x8a, 0x17, 0x98, 0xb1, 0x07, 0x16, 0xef,
+ 0x66, 0x3c, 0xea, 0x19, 0x0f, 0xfb, 0x83, 0xd8,
+ 0x95, 0x93, 0xf3, 0xf4, 0x76, 0xb6, 0xbc, 0x24,
+ 0xd7, 0xe6, 0x79, 0x10, 0x7e, 0xa2, 0x6a, 0xdb,
+ 0x8c, 0xaf, 0x66, 0x52, 0xd0, 0x65, 0x61, 0x36,
+ 0x81, 0x20, 0x59, 0xa5, 0xda, 0x19, 0x86, 0x37,
+ 0xca, 0xc7, 0xc4, 0xa6, 0x31, 0xbe, 0xe4, 0x66,
+ 0x5b, 0x88, 0xd7, 0xf6, 0x22, 0x8b, 0x11, 0xe2,
+ 0xe2, 0x85, 0x79, 0xa5, 0xc0, 0xc1, 0xf7, 0x61,
+ 0xab, 0x08, 0x12, 0x72, 0x4a, 0x7f, 0x1e, 0x34,
+ 0x27, 0x42, 0xcb, 0xed, 0x37, 0x4d, 0x94, 0xd1,
+ 0x36, 0xc6, 0xb8, 0x79, 0x5d, 0x45, 0xb3, 0x81,
+ 0x98, 0x30, 0xf2, 0xc0, 0x44, 0x91, 0xfa, 0xf0,
+ 0x99, 0x0c, 0x62, 0xe4, 0x8b, 0x80, 0x18, 0xb2,
+ 0xc3, 0xe4, 0xa0, 0xfa, 0x31, 0x34, 0xcb, 0x67,
+ 0xfa, 0x83, 0xe1, 0x58, 0xc9, 0x94, 0xd9, 0x61,
+ 0xc4, 0xcb, 0x21, 0x09, 0x5c, 0x1b, 0xf9, 0xaf,
+ 0x48, 0x44, 0x3d, 0x0b, 0xb0, 0xd2, 0x11, 0x09,
+ 0xc8, 0x9a, 0x10, 0x0b, 0x5c, 0xe2, 0xc2, 0x08,
+ 0x83, 0x14, 0x9c, 0x69, 0xb5, 0x61, 0xdd, 0x88,
+ 0x29, 0x8a, 0x17, 0x98, 0xb1, 0x07, 0x16, 0xef,
+ 0x66, 0x3c, 0xea, 0x19, 0x0f, 0xfb, 0x83, 0xd8,
+ 0x95, 0x93, 0xf3, 0xf4, 0x76, 0xb6, 0xbc, 0x24,
+ 0xd7, 0xe6, 0x79, 0x10, 0x7e, 0xa2, 0x6a, 0xdb,
+ 0x8c, 0xaf, 0x66, 0x52, 0xd0, 0x65, 0x61, 0x36,
+ 0x81, 0x20, 0x59, 0xa5, 0xda, 0x19, 0x86, 0x37,
+ 0xca, 0xc7, 0xc4, 0xa6, 0x31, 0xbe, 0xe4, 0x66,
+ 0x5b, 0x88, 0xd7, 0xf6, 0x22, 0x8b, 0x11, 0xe2,
+ 0xe2, 0x85, 0x79, 0xa5, 0xc0, 0xc1, 0xf7, 0x61 },
+ .ilen = 320,
+ .output = { 0xb8, 0x46, 0xd4, 0x4e, 0x9b, 0xbd, 0x53, 0xce,
+ 0xdf, 0xfb, 0xfb, 0xb6, 0xb7, 0xfa, 0x49, 0x33 },
+ .key = { 0x12, 0x97, 0x6a, 0x08, 0xc4, 0x42, 0x6d, 0x0c,
+ 0xe8, 0xa8, 0x24, 0x07, 0xc4, 0xf4, 0x82, 0x07,
+ 0x80, 0xf8, 0xc2, 0x0a, 0xa7, 0x12, 0x02, 0xd1,
+ 0xe2, 0x91, 0x79, 0xcb, 0xcb, 0x55, 0x5a, 0x57 },
+}, { /* 4th power of the key spills to 131th bit in SIMD key setup */
+ .input = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
+ .ilen = 256,
+ .output = { 0x07, 0x14, 0x5a, 0x4c, 0x02, 0xfe, 0x5f, 0xa3,
+ 0x20, 0x36, 0xde, 0x68, 0xfa, 0xbe, 0x90, 0x66 },
+ .key = { 0xad, 0x62, 0x81, 0x07, 0xe8, 0x35, 0x1d, 0x0f,
+ 0x2c, 0x23, 0x1a, 0x05, 0xdc, 0x4a, 0x41, 0x06,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+}, { /* OpenSSL's poly1305_ieee754.c failed this in final stage */
+ .input = { 0x84, 0x23, 0x64, 0xe1, 0x56, 0x33, 0x6c, 0x09,
+ 0x98, 0xb9, 0x33, 0xa6, 0x23, 0x77, 0x26, 0x18,
+ 0x0d, 0x9e, 0x3f, 0xdc, 0xbd, 0xe4, 0xcd, 0x5d,
+ 0x17, 0x08, 0x0f, 0xc3, 0xbe, 0xb4, 0x96, 0x14,
+ 0xd7, 0x12, 0x2c, 0x03, 0x74, 0x63, 0xff, 0x10,
+ 0x4d, 0x73, 0xf1, 0x9c, 0x12, 0x70, 0x46, 0x28,
+ 0xd4, 0x17, 0xc4, 0xc5, 0x4a, 0x3f, 0xe3, 0x0d,
+ 0x3c, 0x3d, 0x77, 0x14, 0x38, 0x2d, 0x43, 0xb0,
+ 0x38, 0x2a, 0x50, 0xa5, 0xde, 0xe5, 0x4b, 0xe8,
+ 0x44, 0xb0, 0x76, 0xe8, 0xdf, 0x88, 0x20, 0x1a,
+ 0x1c, 0xd4, 0x3b, 0x90, 0xeb, 0x21, 0x64, 0x3f,
+ 0xa9, 0x6f, 0x39, 0xb5, 0x18, 0xaa, 0x83, 0x40,
+ 0xc9, 0x42, 0xff, 0x3c, 0x31, 0xba, 0xf7, 0xc9,
+ 0xbd, 0xbf, 0x0f, 0x31, 0xae, 0x3f, 0xa0, 0x96,
+ 0xbf, 0x8c, 0x63, 0x03, 0x06, 0x09, 0x82, 0x9f,
+ 0xe7, 0x2e, 0x17, 0x98, 0x24, 0x89, 0x0b, 0xc8,
+ 0xe0, 0x8c, 0x31, 0x5c, 0x1c, 0xce, 0x2a, 0x83,
+ 0x14, 0x4d, 0xbb, 0xff, 0x09, 0xf7, 0x4e, 0x3e,
+ 0xfc, 0x77, 0x0b, 0x54, 0xd0, 0x98, 0x4a, 0x8f,
+ 0x19, 0xb1, 0x47, 0x19, 0xe6, 0x36, 0x35, 0x64,
+ 0x1d, 0x6b, 0x1e, 0xed, 0xf6, 0x3e, 0xfb, 0xf0,
+ 0x80, 0xe1, 0x78, 0x3d, 0x32, 0x44, 0x54, 0x12,
+ 0x11, 0x4c, 0x20, 0xde, 0x0b, 0x83, 0x7a, 0x0d,
+ 0xfa, 0x33, 0xd6, 0xb8, 0x28, 0x25, 0xff, 0xf4,
+ 0x4c, 0x9a, 0x70, 0xea, 0x54, 0xce, 0x47, 0xf0,
+ 0x7d, 0xf6, 0x98, 0xe6, 0xb0, 0x33, 0x23, 0xb5,
+ 0x30, 0x79, 0x36, 0x4a, 0x5f, 0xc3, 0xe9, 0xdd,
+ 0x03, 0x43, 0x92, 0xbd, 0xde, 0x86, 0xdc, 0xcd,
+ 0xda, 0x94, 0x32, 0x1c, 0x5e, 0x44, 0x06, 0x04,
+ 0x89, 0x33, 0x6c, 0xb6, 0x5b, 0xf3, 0x98, 0x9c,
+ 0x36, 0xf7, 0x28, 0x2c, 0x2f, 0x5d, 0x2b, 0x88,
+ 0x2c, 0x17, 0x1e, 0x74 },
+ .ilen = 252,
+ .output = { 0xf2, 0x48, 0x31, 0x2e, 0x57, 0x8d, 0x9d, 0x58,
+ 0xf8, 0xb7, 0xbb, 0x4d, 0x19, 0x10, 0x54, 0x31 },
+ .key = { 0x95, 0xd5, 0xc0, 0x05, 0x50, 0x3e, 0x51, 0x0d,
+ 0x8c, 0xd0, 0xaa, 0x07, 0x2c, 0x4a, 0x4d, 0x06,
+ 0x6e, 0xab, 0xc5, 0x2d, 0x11, 0x65, 0x3d, 0xf4,
+ 0x7f, 0xbf, 0x63, 0xab, 0x19, 0x8b, 0xcc, 0x26 },
+}, { /* AVX2 in OpenSSL's poly1305-x86.pl failed this with 176+32 split */
+ .input = { 0x24, 0x8a, 0xc3, 0x10, 0x85, 0xb6, 0xc2, 0xad,
+ 0xaa, 0xa3, 0x82, 0x59, 0xa0, 0xd7, 0x19, 0x2c,
+ 0x5c, 0x35, 0xd1, 0xbb, 0x4e, 0xf3, 0x9a, 0xd9,
+ 0x4c, 0x38, 0xd1, 0xc8, 0x24, 0x79, 0xe2, 0xdd,
+ 0x21, 0x59, 0xa0, 0x77, 0x02, 0x4b, 0x05, 0x89,
+ 0xbc, 0x8a, 0x20, 0x10, 0x1b, 0x50, 0x6f, 0x0a,
+ 0x1a, 0xd0, 0xbb, 0xab, 0x76, 0xe8, 0x3a, 0x83,
+ 0xf1, 0xb9, 0x4b, 0xe6, 0xbe, 0xae, 0x74, 0xe8,
+ 0x74, 0xca, 0xb6, 0x92, 0xc5, 0x96, 0x3a, 0x75,
+ 0x43, 0x6b, 0x77, 0x61, 0x21, 0xec, 0x9f, 0x62,
+ 0x39, 0x9a, 0x3e, 0x66, 0xb2, 0xd2, 0x27, 0x07,
+ 0xda, 0xe8, 0x19, 0x33, 0xb6, 0x27, 0x7f, 0x3c,
+ 0x85, 0x16, 0xbc, 0xbe, 0x26, 0xdb, 0xbd, 0x86,
+ 0xf3, 0x73, 0x10, 0x3d, 0x7c, 0xf4, 0xca, 0xd1,
+ 0x88, 0x8c, 0x95, 0x21, 0x18, 0xfb, 0xfb, 0xd0,
+ 0xd7, 0xb4, 0xbe, 0xdc, 0x4a, 0xe4, 0x93, 0x6a,
+ 0xff, 0x91, 0x15, 0x7e, 0x7a, 0xa4, 0x7c, 0x54,
+ 0x44, 0x2e, 0xa7, 0x8d, 0x6a, 0xc2, 0x51, 0xd3,
+ 0x24, 0xa0, 0xfb, 0xe4, 0x9d, 0x89, 0xcc, 0x35,
+ 0x21, 0xb6, 0x6d, 0x16, 0xe9, 0xc6, 0x6a, 0x37,
+ 0x09, 0x89, 0x4e, 0x4e, 0xb0, 0xa4, 0xee, 0xdc,
+ 0x4a, 0xe1, 0x94, 0x68, 0xe6, 0x6b, 0x81, 0xf2,
+ 0x71, 0x35, 0x1b, 0x1d, 0x92, 0x1e, 0xa5, 0x51,
+ 0x04, 0x7a, 0xbc, 0xc6, 0xb8, 0x7a, 0x90, 0x1f,
+ 0xde, 0x7d, 0xb7, 0x9f, 0xa1, 0x81, 0x8c, 0x11,
+ 0x33, 0x6d, 0xbc, 0x07, 0x24, 0x4a, 0x40, 0xeb },
+ .ilen = 208,
+ .output = { 0xbc, 0x93, 0x9b, 0xc5, 0x28, 0x14, 0x80, 0xfa,
+ 0x99, 0xc6, 0xd6, 0x8c, 0x25, 0x8e, 0xc4, 0x2f },
+ .key = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+}, { /* test vectors from Google */
+ .input = "",
+ .ilen = 0,
+ .output = { 0x47, 0x10, 0x13, 0x0e, 0x9f, 0x6f, 0xea, 0x8d,
+ 0x72, 0x29, 0x38, 0x50, 0xa6, 0x67, 0xd8, 0x6c },
+ .key = { 0xc8, 0xaf, 0xaa, 0xc3, 0x31, 0xee, 0x37, 0x2c,
+ 0xd6, 0x08, 0x2d, 0xe1, 0x34, 0x94, 0x3b, 0x17,
+ 0x47, 0x10, 0x13, 0x0e, 0x9f, 0x6f, 0xea, 0x8d,
+ 0x72, 0x29, 0x38, 0x50, 0xa6, 0x67, 0xd8, 0x6c },
+}, {
+ .input = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f,
+ 0x72, 0x6c, 0x64, 0x21 },
+ .ilen = 12,
+ .output = { 0xa6, 0xf7, 0x45, 0x00, 0x8f, 0x81, 0xc9, 0x16,
+ 0xa2, 0x0d, 0xcc, 0x74, 0xee, 0xf2, 0xb2, 0xf0 },
+ .key = { 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20,
+ 0x33, 0x32, 0x2d, 0x62, 0x79, 0x74, 0x65, 0x20,
+ 0x6b, 0x65, 0x79, 0x20, 0x66, 0x6f, 0x72, 0x20,
+ 0x50, 0x6f, 0x6c, 0x79, 0x31, 0x33, 0x30, 0x35 },
+}, {
+ .input = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ .ilen = 32,
+ .output = { 0x49, 0xec, 0x78, 0x09, 0x0e, 0x48, 0x1e, 0xc6,
+ 0xc2, 0x6b, 0x33, 0xb9, 0x1c, 0xcc, 0x03, 0x07 },
+ .key = { 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20,
+ 0x33, 0x32, 0x2d, 0x62, 0x79, 0x74, 0x65, 0x20,
+ 0x6b, 0x65, 0x79, 0x20, 0x66, 0x6f, 0x72, 0x20,
+ 0x50, 0x6f, 0x6c, 0x79, 0x31, 0x33, 0x30, 0x35 },
+}, {
+ .input = { 0x89, 0xda, 0xb8, 0x0b, 0x77, 0x17, 0xc1, 0xdb,
+ 0x5d, 0xb4, 0x37, 0x86, 0x0a, 0x3f, 0x70, 0x21,
+ 0x8e, 0x93, 0xe1, 0xb8, 0xf4, 0x61, 0xfb, 0x67,
+ 0x7f, 0x16, 0xf3, 0x5f, 0x6f, 0x87, 0xe2, 0xa9,
+ 0x1c, 0x99, 0xbc, 0x3a, 0x47, 0xac, 0xe4, 0x76,
+ 0x40, 0xcc, 0x95, 0xc3, 0x45, 0xbe, 0x5e, 0xcc,
+ 0xa5, 0xa3, 0x52, 0x3c, 0x35, 0xcc, 0x01, 0x89,
+ 0x3a, 0xf0, 0xb6, 0x4a, 0x62, 0x03, 0x34, 0x27,
+ 0x03, 0x72, 0xec, 0x12, 0x48, 0x2d, 0x1b, 0x1e,
+ 0x36, 0x35, 0x61, 0x69, 0x8a, 0x57, 0x8b, 0x35,
+ 0x98, 0x03, 0x49, 0x5b, 0xb4, 0xe2, 0xef, 0x19,
+ 0x30, 0xb1, 0x7a, 0x51, 0x90, 0xb5, 0x80, 0xf1,
+ 0x41, 0x30, 0x0d, 0xf3, 0x0a, 0xdb, 0xec, 0xa2,
+ 0x8f, 0x64, 0x27, 0xa8, 0xbc, 0x1a, 0x99, 0x9f,
+ 0xd5, 0x1c, 0x55, 0x4a, 0x01, 0x7d, 0x09, 0x5d,
+ 0x8c, 0x3e, 0x31, 0x27, 0xda, 0xf9, 0xf5, 0x95 },
+ .ilen = 128,
+ .output = { 0xc8, 0x5d, 0x15, 0xed, 0x44, 0xc3, 0x78, 0xd6,
+ 0xb0, 0x0e, 0x23, 0x06, 0x4c, 0x7b, 0xcd, 0x51 },
+ .key = { 0x2d, 0x77, 0x3b, 0xe3, 0x7a, 0xdb, 0x1e, 0x4d,
+ 0x68, 0x3b, 0xf0, 0x07, 0x5e, 0x79, 0xc4, 0xee,
+ 0x03, 0x79, 0x18, 0x53, 0x5a, 0x7f, 0x99, 0xcc,
+ 0xb7, 0x04, 0x0f, 0xb5, 0xf5, 0xf4, 0x3a, 0xea },
+}, {
+ .input = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0b,
+ 0x17, 0x03, 0x03, 0x02, 0x00, 0x00, 0x00, 0x00,
+ 0x06, 0xdb, 0x1f, 0x1f, 0x36, 0x8d, 0x69, 0x6a,
+ 0x81, 0x0a, 0x34, 0x9c, 0x0c, 0x71, 0x4c, 0x9a,
+ 0x5e, 0x78, 0x50, 0xc2, 0x40, 0x7d, 0x72, 0x1a,
+ 0xcd, 0xed, 0x95, 0xe0, 0x18, 0xd7, 0xa8, 0x52,
+ 0x66, 0xa6, 0xe1, 0x28, 0x9c, 0xdb, 0x4a, 0xeb,
+ 0x18, 0xda, 0x5a, 0xc8, 0xa2, 0xb0, 0x02, 0x6d,
+ 0x24, 0xa5, 0x9a, 0xd4, 0x85, 0x22, 0x7f, 0x3e,
+ 0xae, 0xdb, 0xb2, 0xe7, 0xe3, 0x5e, 0x1c, 0x66,
+ 0xcd, 0x60, 0xf9, 0xab, 0xf7, 0x16, 0xdc, 0xc9,
+ 0xac, 0x42, 0x68, 0x2d, 0xd7, 0xda, 0xb2, 0x87,
+ 0xa7, 0x02, 0x4c, 0x4e, 0xef, 0xc3, 0x21, 0xcc,
+ 0x05, 0x74, 0xe1, 0x67, 0x93, 0xe3, 0x7c, 0xec,
+ 0x03, 0xc5, 0xbd, 0xa4, 0x2b, 0x54, 0xc1, 0x14,
+ 0xa8, 0x0b, 0x57, 0xaf, 0x26, 0x41, 0x6c, 0x7b,
+ 0xe7, 0x42, 0x00, 0x5e, 0x20, 0x85, 0x5c, 0x73,
+ 0xe2, 0x1d, 0xc8, 0xe2, 0xed, 0xc9, 0xd4, 0x35,
+ 0xcb, 0x6f, 0x60, 0x59, 0x28, 0x00, 0x11, 0xc2,
+ 0x70, 0xb7, 0x15, 0x70, 0x05, 0x1c, 0x1c, 0x9b,
+ 0x30, 0x52, 0x12, 0x66, 0x20, 0xbc, 0x1e, 0x27,
+ 0x30, 0xfa, 0x06, 0x6c, 0x7a, 0x50, 0x9d, 0x53,
+ 0xc6, 0x0e, 0x5a, 0xe1, 0xb4, 0x0a, 0xa6, 0xe3,
+ 0x9e, 0x49, 0x66, 0x92, 0x28, 0xc9, 0x0e, 0xec,
+ 0xb4, 0xa5, 0x0d, 0xb3, 0x2a, 0x50, 0xbc, 0x49,
+ 0xe9, 0x0b, 0x4f, 0x4b, 0x35, 0x9a, 0x1d, 0xfd,
+ 0x11, 0x74, 0x9c, 0xd3, 0x86, 0x7f, 0xcf, 0x2f,
+ 0xb7, 0xbb, 0x6c, 0xd4, 0x73, 0x8f, 0x6a, 0x4a,
+ 0xd6, 0xf7, 0xca, 0x50, 0x58, 0xf7, 0x61, 0x88,
+ 0x45, 0xaf, 0x9f, 0x02, 0x0f, 0x6c, 0x3b, 0x96,
+ 0x7b, 0x8f, 0x4c, 0xd4, 0xa9, 0x1e, 0x28, 0x13,
+ 0xb5, 0x07, 0xae, 0x66, 0xf2, 0xd3, 0x5c, 0x18,
+ 0x28, 0x4f, 0x72, 0x92, 0x18, 0x60, 0x62, 0xe1,
+ 0x0f, 0xd5, 0x51, 0x0d, 0x18, 0x77, 0x53, 0x51,
+ 0xef, 0x33, 0x4e, 0x76, 0x34, 0xab, 0x47, 0x43,
+ 0xf5, 0xb6, 0x8f, 0x49, 0xad, 0xca, 0xb3, 0x84,
+ 0xd3, 0xfd, 0x75, 0xf7, 0x39, 0x0f, 0x40, 0x06,
+ 0xef, 0x2a, 0x29, 0x5c, 0x8c, 0x7a, 0x07, 0x6a,
+ 0xd5, 0x45, 0x46, 0xcd, 0x25, 0xd2, 0x10, 0x7f,
+ 0xbe, 0x14, 0x36, 0xc8, 0x40, 0x92, 0x4a, 0xae,
+ 0xbe, 0x5b, 0x37, 0x08, 0x93, 0xcd, 0x63, 0xd1,
+ 0x32, 0x5b, 0x86, 0x16, 0xfc, 0x48, 0x10, 0x88,
+ 0x6b, 0xc1, 0x52, 0xc5, 0x32, 0x21, 0xb6, 0xdf,
+ 0x37, 0x31, 0x19, 0x39, 0x32, 0x55, 0xee, 0x72,
+ 0xbc, 0xaa, 0x88, 0x01, 0x74, 0xf1, 0x71, 0x7f,
+ 0x91, 0x84, 0xfa, 0x91, 0x64, 0x6f, 0x17, 0xa2,
+ 0x4a, 0xc5, 0x5d, 0x16, 0xbf, 0xdd, 0xca, 0x95,
+ 0x81, 0xa9, 0x2e, 0xda, 0x47, 0x92, 0x01, 0xf0,
+ 0xed, 0xbf, 0x63, 0x36, 0x00, 0xd6, 0x06, 0x6d,
+ 0x1a, 0xb3, 0x6d, 0x5d, 0x24, 0x15, 0xd7, 0x13,
+ 0x51, 0xbb, 0xcd, 0x60, 0x8a, 0x25, 0x10, 0x8d,
+ 0x25, 0x64, 0x19, 0x92, 0xc1, 0xf2, 0x6c, 0x53,
+ 0x1c, 0xf9, 0xf9, 0x02, 0x03, 0xbc, 0x4c, 0xc1,
+ 0x9f, 0x59, 0x27, 0xd8, 0x34, 0xb0, 0xa4, 0x71,
+ 0x16, 0xd3, 0x88, 0x4b, 0xbb, 0x16, 0x4b, 0x8e,
+ 0xc8, 0x83, 0xd1, 0xac, 0x83, 0x2e, 0x56, 0xb3,
+ 0x91, 0x8a, 0x98, 0x60, 0x1a, 0x08, 0xd1, 0x71,
+ 0x88, 0x15, 0x41, 0xd5, 0x94, 0xdb, 0x39, 0x9c,
+ 0x6a, 0xe6, 0x15, 0x12, 0x21, 0x74, 0x5a, 0xec,
+ 0x81, 0x4c, 0x45, 0xb0, 0xb0, 0x5b, 0x56, 0x54,
+ 0x36, 0xfd, 0x6f, 0x13, 0x7a, 0xa1, 0x0a, 0x0c,
+ 0x0b, 0x64, 0x37, 0x61, 0xdb, 0xd6, 0xf9, 0xa9,
+ 0xdc, 0xb9, 0x9b, 0x1a, 0x6e, 0x69, 0x08, 0x54,
+ 0xce, 0x07, 0x69, 0xcd, 0xe3, 0x97, 0x61, 0xd8,
+ 0x2f, 0xcd, 0xec, 0x15, 0xf0, 0xd9, 0x2d, 0x7d,
+ 0x8e, 0x94, 0xad, 0xe8, 0xeb, 0x83, 0xfb, 0xe0 },
+ .ilen = 528,
+ .output = { 0x26, 0x37, 0x40, 0x8f, 0xe1, 0x30, 0x86, 0xea,
+ 0x73, 0xf9, 0x71, 0xe3, 0x42, 0x5e, 0x28, 0x20 },
+ .key = { 0x99, 0xe5, 0x82, 0x2d, 0xd4, 0x17, 0x3c, 0x99,
+ 0x5e, 0x3d, 0xae, 0x0d, 0xde, 0xfb, 0x97, 0x74,
+ 0x3f, 0xde, 0x3b, 0x08, 0x01, 0x34, 0xb3, 0x9f,
+ 0x76, 0xe9, 0xbf, 0x8d, 0x0e, 0x88, 0xd5, 0x46 },
+}, { /* test vectors from Hanno Böck */
+ .input = { 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc,
+ 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc,
+ 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc,
+ 0xcc, 0x80, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc,
+ 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc,
+ 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc,
+ 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc,
+ 0xcc, 0xcc, 0xcc, 0xcc, 0xce, 0xcc, 0xcc, 0xcc,
+ 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc,
+ 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xc5,
+ 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc,
+ 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc,
+ 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xe3, 0xcc, 0xcc,
+ 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc,
+ 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc,
+ 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc,
+ 0xcc, 0xcc, 0xcc, 0xcc, 0xac, 0xcc, 0xcc, 0xcc,
+ 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xe6,
+ 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0x00, 0x00, 0x00,
+ 0xaf, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc,
+ 0xcc, 0xcc, 0xff, 0xff, 0xff, 0xf5, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0xff, 0xff, 0xff, 0xe7, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x71, 0x92, 0x05, 0xa8, 0x52, 0x1d,
+ 0xfc },
+ .ilen = 257,
+ .output = { 0x85, 0x59, 0xb8, 0x76, 0xec, 0xee, 0xd6, 0x6e,
+ 0xb3, 0x77, 0x98, 0xc0, 0x45, 0x7b, 0xaf, 0xf9 },
+ .key = { 0x7f, 0x1b, 0x02, 0x64, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc },
+}, {
+ .input = { 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa,
+ 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa,
+ 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa,
+ 0xaa, 0xaa, 0xaa, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x80, 0x02, 0x64 },
+ .ilen = 39,
+ .output = { 0x00, 0xbd, 0x12, 0x58, 0x97, 0x8e, 0x20, 0x54,
+ 0x44, 0xc9, 0xaa, 0xaa, 0x82, 0x00, 0x6f, 0xed },
+ .key = { 0xe0, 0x00, 0x16, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa,
+ 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa },
+}, {
+ .input = { 0x02, 0xfc },
+ .ilen = 2,
+ .output = { 0x06, 0x12, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c,
+ 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c },
+ .key = { 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c,
+ 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c,
+ 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c,
+ 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c },
+}, {
+ .input = { 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+ 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+ 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+ 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+ 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7a, 0x7b,
+ 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+ 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+ 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+ 0x7b, 0x7b, 0x5c, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+ 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+ 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+ 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+ 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+ 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+ 0x7b, 0x7b, 0x7b, 0x7b, 0x6e, 0x7b, 0x00, 0x7b,
+ 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+ 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+ 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+ 0x7b, 0x7b, 0x7b, 0x7a, 0x7b, 0x7b, 0x7b, 0x7b,
+ 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+ 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+ 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x5c,
+ 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+ 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+ 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+ 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+ 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+ 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+ 0x7b, 0x6e, 0x7b, 0x00, 0x13, 0x00, 0x00, 0x00,
+ 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0xf2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x20, 0x00, 0xef, 0xff, 0x00,
+ 0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00,
+ 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x64, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x00,
+ 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf2,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x20, 0x00, 0xef, 0xff, 0x00, 0x09,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x7a, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00,
+ 0x00, 0x09, 0x00, 0x00, 0x00, 0x64, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfc },
+ .ilen = 415,
+ .output = { 0x33, 0x20, 0x5b, 0xbf, 0x9e, 0x9f, 0x8f, 0x72,
+ 0x12, 0xab, 0x9e, 0x2a, 0xb9, 0xb7, 0xe4, 0xa5 },
+ .key = { 0x00, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7b, 0x7b },
+}, {
+ .input = { 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77,
+ 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77,
+ 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77,
+ 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77,
+ 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77,
+ 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77,
+ 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77,
+ 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77,
+ 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77,
+ 0x77, 0x77, 0x77, 0x77, 0xff, 0xff, 0xff, 0xe9,
+ 0xe9, 0xac, 0xac, 0xac, 0xac, 0xac, 0xac, 0xac,
+ 0xac, 0xac, 0xac, 0xac, 0x00, 0x00, 0xac, 0xac,
+ 0xec, 0x01, 0x00, 0xac, 0xac, 0xac, 0x2c, 0xac,
+ 0xa2, 0xac, 0xac, 0xac, 0xac, 0xac, 0xac, 0xac,
+ 0xac, 0xac, 0xac, 0xac, 0x64, 0xf2 },
+ .ilen = 118,
+ .output = { 0x02, 0xee, 0x7c, 0x8c, 0x54, 0x6d, 0xde, 0xb1,
+ 0xa4, 0x67, 0xe4, 0xc3, 0x98, 0x11, 0x58, 0xb9 },
+ .key = { 0x00, 0x00, 0x00, 0x7f, 0x00, 0x00, 0x00, 0x7f,
+ 0x01, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0xcf, 0x77, 0x77, 0x77, 0x77, 0x77,
+ 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77 },
+}, { /* nacl */
+ .input = { 0x8e, 0x99, 0x3b, 0x9f, 0x48, 0x68, 0x12, 0x73,
+ 0xc2, 0x96, 0x50, 0xba, 0x32, 0xfc, 0x76, 0xce,
+ 0x48, 0x33, 0x2e, 0xa7, 0x16, 0x4d, 0x96, 0xa4,
+ 0x47, 0x6f, 0xb8, 0xc5, 0x31, 0xa1, 0x18, 0x6a,
+ 0xc0, 0xdf, 0xc1, 0x7c, 0x98, 0xdc, 0xe8, 0x7b,
+ 0x4d, 0xa7, 0xf0, 0x11, 0xec, 0x48, 0xc9, 0x72,
+ 0x71, 0xd2, 0xc2, 0x0f, 0x9b, 0x92, 0x8f, 0xe2,
+ 0x27, 0x0d, 0x6f, 0xb8, 0x63, 0xd5, 0x17, 0x38,
+ 0xb4, 0x8e, 0xee, 0xe3, 0x14, 0xa7, 0xcc, 0x8a,
+ 0xb9, 0x32, 0x16, 0x45, 0x48, 0xe5, 0x26, 0xae,
+ 0x90, 0x22, 0x43, 0x68, 0x51, 0x7a, 0xcf, 0xea,
+ 0xbd, 0x6b, 0xb3, 0x73, 0x2b, 0xc0, 0xe9, 0xda,
+ 0x99, 0x83, 0x2b, 0x61, 0xca, 0x01, 0xb6, 0xde,
+ 0x56, 0x24, 0x4a, 0x9e, 0x88, 0xd5, 0xf9, 0xb3,
+ 0x79, 0x73, 0xf6, 0x22, 0xa4, 0x3d, 0x14, 0xa6,
+ 0x59, 0x9b, 0x1f, 0x65, 0x4c, 0xb4, 0x5a, 0x74,
+ 0xe3, 0x55, 0xa5 },
+ .ilen = 131,
+ .output = { 0xf3, 0xff, 0xc7, 0x70, 0x3f, 0x94, 0x00, 0xe5,
+ 0x2a, 0x7d, 0xfb, 0x4b, 0x3d, 0x33, 0x05, 0xd9 },
+ .key = { 0xee, 0xa6, 0xa7, 0x25, 0x1c, 0x1e, 0x72, 0x91,
+ 0x6d, 0x11, 0xc2, 0xcb, 0x21, 0x4d, 0x3c, 0x25,
+ 0x25, 0x39, 0x12, 0x1d, 0x8e, 0x23, 0x4e, 0x65,
+ 0x2d, 0x65, 0x1f, 0xa4, 0xc8, 0xcf, 0xf8, 0x80 },
+}, { /* wrap 2^130-5 */
+ .input = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
+ .ilen = 16,
+ .output = { 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ .key = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+}, { /* wrap 2^128 */
+ .input = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ .ilen = 16,
+ .output = { 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ .key = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
+}, { /* limb carry */
+ .input = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xf0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0x11, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ .ilen = 48,
+ .output = { 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ .key = { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+}, { /* 2^130-5 */
+ .input = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xfb, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe,
+ 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 },
+ .ilen = 48,
+ .output = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ .key = { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+}, { /* 2^130-6 */
+ .input = { 0xfd, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
+ .ilen = 16,
+ .output = { 0xfa, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
+ .key = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+}, { /* 5*H+L reduction intermediate */
+ .input = { 0xe3, 0x35, 0x94, 0xd7, 0x50, 0x5e, 0x43, 0xb9,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x33, 0x94, 0xd7, 0x50, 0x5e, 0x43, 0x79, 0xcd,
+ 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ .ilen = 64,
+ .output = { 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x55, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ .key = { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+}, { /* 5*H+L reduction final */
+ .input = { 0xe3, 0x35, 0x94, 0xd7, 0x50, 0x5e, 0x43, 0xb9,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x33, 0x94, 0xd7, 0x50, 0x5e, 0x43, 0x79, 0xcd,
+ 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ .ilen = 48,
+ .output = { 0x13, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ .key = { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+} };
+
+bool __init poly1305_selftest(void)
+{
+ simd_context_t simd_context = simd_get();
+ bool success = true;
+ size_t i, j;
+
+ for (i = 0; i < ARRAY_SIZE(poly1305_testvecs); ++i) {
+ struct poly1305_ctx poly1305;
+ u8 out[POLY1305_MAC_SIZE];
+
+ memset(out, 0, sizeof(out));
+ memset(&poly1305, 0, sizeof(poly1305));
+ poly1305_init(&poly1305, poly1305_testvecs[i].key,
+ simd_context);
+ poly1305_update(&poly1305, poly1305_testvecs[i].input,
+ poly1305_testvecs[i].ilen, simd_context);
+ poly1305_final(&poly1305, out, simd_context);
+ if (memcmp(out, poly1305_testvecs[i].output,
+ POLY1305_MAC_SIZE)) {
+ pr_info("poly1305 self-test %zu: FAIL\n", i + 1);
+ success = false;
+ }
+ simd_context = simd_relax(simd_context);
+
+ if (poly1305_testvecs[i].ilen <= 1)
+ continue;
+
+ for (j = 1; j < poly1305_testvecs[i].ilen - 1; ++j) {
+ memset(out, 0, sizeof(out));
+ memset(&poly1305, 0, sizeof(poly1305));
+ poly1305_init(&poly1305, poly1305_testvecs[i].key,
+ simd_context);
+ poly1305_update(&poly1305, poly1305_testvecs[i].input,
+ j, simd_context);
+ poly1305_update(&poly1305,
+ poly1305_testvecs[i].input + j,
+ poly1305_testvecs[i].ilen - j,
+ simd_context);
+ poly1305_final(&poly1305, out, simd_context);
+ if (memcmp(out, poly1305_testvecs[i].output,
+ POLY1305_MAC_SIZE)) {
+ pr_info("poly1305 self-test %zu (split %zu): FAIL\n",
+ i + 1, j);
+ success = false;
+ }
+ simd_context = simd_relax(simd_context);
+ }
+ }
+ simd_put(simd_context);
+
+ if (success)
+ pr_info("poly1305 self-tests: pass\n");
+
+ return success;
+}
+#endif
--
2.19.0
^ permalink raw reply related
* [PATCH net-next v4 06/20] zinc: ChaCha20 MIPS32r2 implementation
From: Jason A. Donenfeld @ 2018-09-14 16:22 UTC (permalink / raw)
To: linux-kernel, netdev, linux-crypto, davem, gregkh
Cc: Jason A. Donenfeld, René van Dorst, Samuel Neves,
Andy Lutomirski, Jean-Philippe Aumasson, Ralf Baechle,
Paul Burton, James Hogan, linux-mips
In-Reply-To: <20180914162240.7925-1-Jason@zx2c4.com>
This MIPS32r2 implementation comes from René van Dorst and me and
results in a nice speedup on the usual OpenWRT targets.
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: René van Dorst <opensource@vdorst.com>
Cc: Samuel Neves <sneves@dei.uc.pt>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Jean-Philippe Aumasson <jeanphilippe.aumasson@gmail.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Paul Burton <paul.burton@mips.com>
Cc: James Hogan <jhogan@kernel.org>
Cc: linux-mips@linux-mips.org
---
lib/zinc/Makefile | 4 +
lib/zinc/chacha20/chacha20-mips-glue.h | 28 ++
lib/zinc/chacha20/chacha20-mips.S | 474 +++++++++++++++++++++++++
3 files changed, 506 insertions(+)
create mode 100644 lib/zinc/chacha20/chacha20-mips-glue.h
create mode 100644 lib/zinc/chacha20/chacha20-mips.S
diff --git a/lib/zinc/Makefile b/lib/zinc/Makefile
index 32e4bd94ea0b..9f6a5e65d729 100644
--- a/lib/zinc/Makefile
+++ b/lib/zinc/Makefile
@@ -17,6 +17,10 @@ ifeq ($(CONFIG_ZINC_ARCH_ARM64),y)
zinc-y += chacha20/chacha20-arm64.o
CFLAGS_chacha20.o += -include $(srctree)/$(src)/chacha20/chacha20-arm-glue.h
endif
+ifeq ($(CONFIG_ZINC_ARCH_MIPS)$(CONFIG_CPU_MIPS32_R2),yy)
+zinc-y += chacha20/chacha20-mips.o
+CFLAGS_chacha20.o += -include $(srctree)/$(src)/chacha20/chacha20-mips-glue.h
+endif
endif
zinc-y += main.o
diff --git a/lib/zinc/chacha20/chacha20-mips-glue.h b/lib/zinc/chacha20/chacha20-mips-glue.h
new file mode 100644
index 000000000000..5b2c8cec36c8
--- /dev/null
+++ b/lib/zinc/chacha20/chacha20-mips-glue.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <zinc/chacha20.h>
+
+asmlinkage void chacha20_mips(u8 *out, const u8 *in, const size_t len,
+ const u32 key[8], const u32 counter[4]);
+void __init chacha20_fpu_init(void)
+{
+}
+
+static inline bool chacha20_arch(u8 *dst, const u8 *src, const size_t len,
+ const u32 key[8], const u32 counter[4],
+ simd_context_t simd_context)
+{
+ chacha20_mips(dst, src, len, key, counter);
+ return true;
+}
+
+static inline bool hchacha20_arch(u8 *derived_key, const u8 *nonce,
+ const u8 *key, simd_context_t simd_context)
+{
+ return false;
+}
+
+#define HAVE_CHACHA20_ARCH_IMPLEMENTATION
diff --git a/lib/zinc/chacha20/chacha20-mips.S b/lib/zinc/chacha20/chacha20-mips.S
new file mode 100644
index 000000000000..77da2c2fb240
--- /dev/null
+++ b/lib/zinc/chacha20/chacha20-mips.S
@@ -0,0 +1,474 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2016-2018 René van Dorst <opensource@vdorst.com>. All Rights Reserved.
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#define MASK_U32 0x3c
+#define MASK_BYTES 0x03
+#define CHACHA20_BLOCK_SIZE 64
+#define STACK_SIZE 4*16
+
+#define X0 $t0
+#define X1 $t1
+#define X2 $t2
+#define X3 $t3
+#define X4 $t4
+#define X5 $t5
+#define X6 $t6
+#define X7 $t7
+#define X8 $v1
+#define X9 $fp
+#define X10 $s7
+#define X11 $s6
+#define X12 $s5
+#define X13 $s4
+#define X14 $s3
+#define X15 $s2
+/* Use regs which are overwritten on exit for Tx so we don't leak clear data. */
+#define T0 $s1
+#define T1 $s0
+#define T(n) T ## n
+#define X(n) X ## n
+
+/* Input arguments */
+#define OUT $a0
+#define IN $a1
+#define BYTES $a2
+/* KEY and NONCE argument must be u32 aligned */
+#define KEY $a3
+/* NONCE pointer is given via stack */
+#define NONCE $t9
+
+/* Output argument */
+/* NONCE[0] is kept in a register and not in memory.
+ * We don't want to touch original value in memory.
+ * Must be incremented every loop iteration.
+ */
+#define NONCE_0 $v0
+
+/* SAVED_X and SAVED_CA are set in the jump table.
+ * Use regs which are overwritten on exit else we don't leak clear data.
+ * They are used to handling the last bytes which are not multiple of 4.
+ */
+#define SAVED_X X15
+#define SAVED_CA $ra
+
+#define PTR_LAST_ROUND $t8
+
+/* ChaCha20 constants and stack location */
+#define CONSTANT_OFS_SP 48
+#define UNALIGNED_OFS_SP 40
+
+#define CONSTANT_1 0x61707865
+#define CONSTANT_2 0x3320646e
+#define CONSTANT_3 0x79622d32
+#define CONSTANT_4 0x6b206574
+
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+#define MSB 0
+#define LSB 3
+#define ROTx rotl
+#define ROTR(n) rotr n, 24
+#define CPU_TO_LE32(n) \
+ wsbh n; \
+ rotr n, 16;
+#else
+#define MSB 3
+#define LSB 0
+#define ROTx rotr
+#define CPU_TO_LE32(n)
+#define ROTR(n)
+#endif
+
+#define STORE_UNALIGNED(x, a, s, o) \
+.Lchacha20_mips_xor_unaligned_ ## x ## _b: ; \
+ .if ((s != NONCE) || (o != 0)); \
+ lw T0, o(s); \
+ .endif; \
+ lwl T1, x-4+MSB ## (IN); \
+ lwr T1, x-4+LSB ## (IN); \
+ .if ((s == NONCE) && (o == 0)); \
+ addu X ## a, NONCE_0; \
+ .else; \
+ addu X ## a, T0; \
+ .endif; \
+ CPU_TO_LE32(X ## a); \
+ xor X ## a, T1; \
+ swl X ## a, x-4+MSB ## (OUT); \
+ swr X ## a, x-4+LSB ## (OUT);
+
+#define STORE_ALIGNED(x, a, s, o) \
+.Lchacha20_mips_xor_aligned_ ## x ## _b: ; \
+ .if ((s != NONCE) || (o != 0)); \
+ lw T0, o(s); \
+ .endif; \
+ lw T1, x-4 ## (IN); \
+ .if ((s == NONCE) && (o == 0)); \
+ addu X ## a, NONCE_0; \
+ .else; \
+ addu X ## a, T0; \
+ .endif; \
+ CPU_TO_LE32(X ## a); \
+ xor X ## a, T1; \
+ sw X ## a, x-4 ## (OUT);
+
+/* Jump table macro.
+ * Used for setup and handling the last bytes, which are not multiple of 4.
+ * X15 is free to store Xn
+ * Every jumptable entry must be equal in size.
+ */
+#define JMPTBL_ALIGNED(x, a, s, o) \
+.Lchacha20_mips_jmptbl_aligned_ ## a: ; \
+ .if ((s == NONCE) && (o == 0)); \
+ move SAVED_CA, NONCE_0; \
+ .else; \
+ lw SAVED_CA, o(s);\
+ .endif; \
+ b .Lchacha20_mips_xor_aligned_ ## x ## _b; \
+ move SAVED_X, X ## a;
+
+#define JMPTBL_UNALIGNED(x, a, s, o) \
+.Lchacha20_mips_jmptbl_unaligned_ ## a: ; \
+ .if ((s == NONCE) && (o == 0)); \
+ move SAVED_CA, NONCE_0; \
+ .else; \
+ lw SAVED_CA, o(s);\
+ .endif; \
+ b .Lchacha20_mips_xor_unaligned_ ## x ## _b; \
+ move SAVED_X, X ## a;
+
+#define AXR(A, B, C, D, K, L, M, N, V, W, Y, Z, S) \
+ addu X(A), X(K); \
+ addu X(B), X(L); \
+ addu X(C), X(M); \
+ addu X(D), X(N); \
+ xor X(V), X(A); \
+ xor X(W), X(B); \
+ xor X(Y), X(C); \
+ xor X(Z), X(D); \
+ rotl X(V), S; \
+ rotl X(W), S; \
+ rotl X(Y), S; \
+ rotl X(Z), S;
+
+.text
+.set reorder
+.set noat
+.globl chacha20_mips
+.ent chacha20_mips
+chacha20_mips:
+ .frame $sp, STACK_SIZE, $ra
+ /* This is in the fifth argument */
+ lw NONCE, 16($sp)
+
+ /* Return bytes = 0. */
+ .set noreorder
+ beqz BYTES, .Lchacha20_mips_end
+ addiu $sp, -STACK_SIZE
+ .set reorder
+
+ /* Calculate PTR_LAST_ROUND */
+ addiu PTR_LAST_ROUND, BYTES, -1
+ ins PTR_LAST_ROUND, $zero, 0, 6
+ addu PTR_LAST_ROUND, OUT
+
+ /* Save s0-s7, fp, ra. */
+ sw $ra, 0($sp)
+ sw $fp, 4($sp)
+ sw $s0, 8($sp)
+ sw $s1, 12($sp)
+ sw $s2, 16($sp)
+ sw $s3, 20($sp)
+ sw $s4, 24($sp)
+ sw $s5, 28($sp)
+ sw $s6, 32($sp)
+ sw $s7, 36($sp)
+
+ lw NONCE_0, 0(NONCE)
+ /* Test IN or OUT is unaligned.
+ * UNALIGNED (T1) = ( IN | OUT ) & 0x00000003
+ */
+ or T1, IN, OUT
+ andi T1, 0x3
+
+ /* Load constant */
+ lui X0, %hi(CONSTANT_1)
+ lui X1, %hi(CONSTANT_2)
+ lui X2, %hi(CONSTANT_3)
+ lui X3, %hi(CONSTANT_4)
+ ori X0, %lo(CONSTANT_1)
+ ori X1, %lo(CONSTANT_2)
+ ori X2, %lo(CONSTANT_3)
+ ori X3, %lo(CONSTANT_4)
+
+ /* Store constant on stack. */
+ sw X0, 0+CONSTANT_OFS_SP($sp)
+ sw X1, 4+CONSTANT_OFS_SP($sp)
+ sw X2, 8+CONSTANT_OFS_SP($sp)
+ sw X3, 12+CONSTANT_OFS_SP($sp)
+
+ sw T1, UNALIGNED_OFS_SP($sp)
+
+ .set noreorder
+ b .Lchacha20_rounds_start
+ andi BYTES, (CHACHA20_BLOCK_SIZE-1)
+ .set reorder
+
+.align 4
+.Loop_chacha20_rounds:
+ addiu IN, CHACHA20_BLOCK_SIZE
+ addiu OUT, CHACHA20_BLOCK_SIZE
+ addiu NONCE_0, 1
+
+ lw X0, 0+CONSTANT_OFS_SP($sp)
+ lw X1, 4+CONSTANT_OFS_SP($sp)
+ lw X2, 8+CONSTANT_OFS_SP($sp)
+ lw X3, 12+CONSTANT_OFS_SP($sp)
+ lw T1, UNALIGNED_OFS_SP($sp)
+
+.Lchacha20_rounds_start:
+ lw X4, 0(KEY)
+ lw X5, 4(KEY)
+ lw X6, 8(KEY)
+ lw X7, 12(KEY)
+ lw X8, 16(KEY)
+ lw X9, 20(KEY)
+ lw X10, 24(KEY)
+ lw X11, 28(KEY)
+
+ move X12, NONCE_0
+ lw X13, 4(NONCE)
+ lw X14, 8(NONCE)
+ lw X15, 12(NONCE)
+
+ li $at, 9
+.Loop_chacha20_xor_rounds:
+ AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16);
+ AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12);
+ AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8);
+ AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7);
+ AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16);
+ AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12);
+ AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8);
+ AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7);
+ .set noreorder
+ bnez $at, .Loop_chacha20_xor_rounds
+ addiu $at, -1
+
+ /* Unaligned? Jump */
+ bnez T1, .Loop_chacha20_unaligned
+ andi $at, BYTES, MASK_U32
+
+ /* Last round? No jump */
+ bne OUT, PTR_LAST_ROUND, .Lchacha20_mips_xor_aligned_64_b
+ /* Load upper half of jump table addr */
+ lui T0, %hi(.Lchacha20_mips_jmptbl_aligned_0)
+
+ /* Full block? Jump */
+ beqz BYTES, .Lchacha20_mips_xor_aligned_64_b
+ /* Calculate lower half jump table addr and offset */
+ ins T0, $at, 2, 6
+
+ subu T0, $at
+ addiu T0, %lo(.Lchacha20_mips_jmptbl_aligned_0)
+
+ jr T0
+ /* Delay slot */
+ nop
+
+ .set reorder
+
+.Loop_chacha20_unaligned:
+ .set noreorder
+
+ /* Last round? no jump */
+ bne OUT, PTR_LAST_ROUND, .Lchacha20_mips_xor_unaligned_64_b
+ /* Load upper half of jump table addr */
+ lui T0, %hi(.Lchacha20_mips_jmptbl_unaligned_0)
+
+ /* Full block? Jump */
+ beqz BYTES, .Lchacha20_mips_xor_unaligned_64_b
+
+ /* Calculate lower half jump table addr and offset */
+ ins T0, $at, 2, 6
+ subu T0, $at
+ addiu T0, %lo(.Lchacha20_mips_jmptbl_unaligned_0)
+
+ jr T0
+ /* Delay slot */
+ nop
+
+ .set reorder
+
+/* Aligned code path
+ */
+.align 4
+ STORE_ALIGNED(64, 15, NONCE,12)
+ STORE_ALIGNED(60, 14, NONCE, 8)
+ STORE_ALIGNED(56, 13, NONCE, 4)
+ STORE_ALIGNED(52, 12, NONCE, 0)
+ STORE_ALIGNED(48, 11, KEY, 28)
+ STORE_ALIGNED(44, 10, KEY, 24)
+ STORE_ALIGNED(40, 9, KEY, 20)
+ STORE_ALIGNED(36, 8, KEY, 16)
+ STORE_ALIGNED(32, 7, KEY, 12)
+ STORE_ALIGNED(28, 6, KEY, 8)
+ STORE_ALIGNED(24, 5, KEY, 4)
+ STORE_ALIGNED(20, 4, KEY, 0)
+ STORE_ALIGNED(16, 3, $sp, 12+CONSTANT_OFS_SP)
+ STORE_ALIGNED(12, 2, $sp, 8+CONSTANT_OFS_SP)
+ STORE_ALIGNED( 8, 1, $sp, 4+CONSTANT_OFS_SP)
+.Lchacha20_mips_xor_aligned_4_b:
+ /* STORE_ALIGNED( 4, 0, $sp, 0+CONSTANT_OFS_SP) */
+ lw T0, 0+CONSTANT_OFS_SP($sp)
+ lw T1, 0(IN)
+ addu X0, T0
+ CPU_TO_LE32(X0)
+ xor X0, T1
+ .set noreorder
+ bne OUT, PTR_LAST_ROUND, .Loop_chacha20_rounds
+ sw X0, 0(OUT)
+ .set reorder
+
+ .set noreorder
+ bne $at, BYTES, .Lchacha20_mips_xor_bytes
+ /* Empty delayslot, Increase NONCE_0, return NONCE_0 value */
+ addiu NONCE_0, 1
+ .set noreorder
+
+.Lchacha20_mips_xor_done:
+ /* Restore used registers */
+ lw $ra, 0($sp)
+ lw $fp, 4($sp)
+ lw $s0, 8($sp)
+ lw $s1, 12($sp)
+ lw $s2, 16($sp)
+ lw $s3, 20($sp)
+ lw $s4, 24($sp)
+ lw $s5, 28($sp)
+ lw $s6, 32($sp)
+ lw $s7, 36($sp)
+.Lchacha20_mips_end:
+ .set noreorder
+ jr $ra
+ addiu $sp, STACK_SIZE
+ .set reorder
+
+ .set noreorder
+ /* Start jump table */
+ JMPTBL_ALIGNED( 0, 0, $sp, 0+CONSTANT_OFS_SP)
+ JMPTBL_ALIGNED( 4, 1, $sp, 4+CONSTANT_OFS_SP)
+ JMPTBL_ALIGNED( 8, 2, $sp, 8+CONSTANT_OFS_SP)
+ JMPTBL_ALIGNED(12, 3, $sp, 12+CONSTANT_OFS_SP)
+ JMPTBL_ALIGNED(16, 4, KEY, 0)
+ JMPTBL_ALIGNED(20, 5, KEY, 4)
+ JMPTBL_ALIGNED(24, 6, KEY, 8)
+ JMPTBL_ALIGNED(28, 7, KEY, 12)
+ JMPTBL_ALIGNED(32, 8, KEY, 16)
+ JMPTBL_ALIGNED(36, 9, KEY, 20)
+ JMPTBL_ALIGNED(40, 10, KEY, 24)
+ JMPTBL_ALIGNED(44, 11, KEY, 28)
+ JMPTBL_ALIGNED(48, 12, NONCE, 0)
+ JMPTBL_ALIGNED(52, 13, NONCE, 4)
+ JMPTBL_ALIGNED(56, 14, NONCE, 8)
+ JMPTBL_ALIGNED(60, 15, NONCE,12)
+ /* End jump table */
+ .set reorder
+
+/* Unaligned code path
+ */
+ STORE_UNALIGNED(64, 15, NONCE,12)
+ STORE_UNALIGNED(60, 14, NONCE, 8)
+ STORE_UNALIGNED(56, 13, NONCE, 4)
+ STORE_UNALIGNED(52, 12, NONCE, 0)
+ STORE_UNALIGNED(48, 11, KEY, 28)
+ STORE_UNALIGNED(44, 10, KEY, 24)
+ STORE_UNALIGNED(40, 9, KEY, 20)
+ STORE_UNALIGNED(36, 8, KEY, 16)
+ STORE_UNALIGNED(32, 7, KEY, 12)
+ STORE_UNALIGNED(28, 6, KEY, 8)
+ STORE_UNALIGNED(24, 5, KEY, 4)
+ STORE_UNALIGNED(20, 4, KEY, 0)
+ STORE_UNALIGNED(16, 3, $sp, 12+CONSTANT_OFS_SP)
+ STORE_UNALIGNED(12, 2, $sp, 8+CONSTANT_OFS_SP)
+ STORE_UNALIGNED( 8, 1, $sp, 4+CONSTANT_OFS_SP)
+.Lchacha20_mips_xor_unaligned_4_b:
+ /* STORE_UNALIGNED( 4, 0, $sp, 0+CONSTANT_OFS_SP) */
+ lw T0, 0+CONSTANT_OFS_SP($sp)
+ lwl T1, 0+MSB(IN)
+ lwr T1, 0+LSB(IN)
+ addu X0, T0
+ CPU_TO_LE32(X0)
+ xor X0, T1
+ swl X0, 0+MSB(OUT)
+ .set noreorder
+ bne OUT, PTR_LAST_ROUND, .Loop_chacha20_rounds
+ swr X0, 0+LSB(OUT)
+ .set reorder
+
+ /* Fall through to byte handling */
+ .set noreorder
+ beq $at, BYTES, .Lchacha20_mips_xor_done
+ /* Empty delayslot, increase NONCE_0, return NONCE_0 value */
+.Lchacha20_mips_xor_unaligned_0_b:
+.Lchacha20_mips_xor_aligned_0_b:
+ addiu NONCE_0, 1
+ .set reorder
+
+.Lchacha20_mips_xor_bytes:
+ addu OUT, $at
+ addu IN, $at
+ addu SAVED_X, SAVED_CA
+ /* First byte */
+ lbu T1, 0(IN)
+ andi $at, BYTES, 2
+ CPU_TO_LE32(SAVED_X)
+ ROTR(SAVED_X)
+ xor T1, SAVED_X
+ .set noreorder
+ beqz $at, .Lchacha20_mips_xor_done
+ sb T1, 0(OUT)
+ .set reorder
+ /* Second byte */
+ lbu T1, 1(IN)
+ andi $at, BYTES, 1
+ ROTx SAVED_X, 8
+ xor T1, SAVED_X
+ .set noreorder
+ beqz $at, .Lchacha20_mips_xor_done
+ sb T1, 1(OUT)
+ .set reorder
+ /* Third byte */
+ lbu T1, 2(IN)
+ ROTx SAVED_X, 8
+ xor T1, SAVED_X
+ .set noreorder
+ b .Lchacha20_mips_xor_done
+ sb T1, 2(OUT)
+ .set reorder
+.set noreorder
+
+.Lchacha20_mips_jmptbl_unaligned:
+ /* Start jump table */
+ JMPTBL_UNALIGNED( 0, 0, $sp, 0+CONSTANT_OFS_SP)
+ JMPTBL_UNALIGNED( 4, 1, $sp, 4+CONSTANT_OFS_SP)
+ JMPTBL_UNALIGNED( 8, 2, $sp, 8+CONSTANT_OFS_SP)
+ JMPTBL_UNALIGNED(12, 3, $sp, 12+CONSTANT_OFS_SP)
+ JMPTBL_UNALIGNED(16, 4, KEY, 0)
+ JMPTBL_UNALIGNED(20, 5, KEY, 4)
+ JMPTBL_UNALIGNED(24, 6, KEY, 8)
+ JMPTBL_UNALIGNED(28, 7, KEY, 12)
+ JMPTBL_UNALIGNED(32, 8, KEY, 16)
+ JMPTBL_UNALIGNED(36, 9, KEY, 20)
+ JMPTBL_UNALIGNED(40, 10, KEY, 24)
+ JMPTBL_UNALIGNED(44, 11, KEY, 28)
+ JMPTBL_UNALIGNED(48, 12, NONCE, 0)
+ JMPTBL_UNALIGNED(52, 13, NONCE, 4)
+ JMPTBL_UNALIGNED(56, 14, NONCE, 8)
+ JMPTBL_UNALIGNED(60, 15, NONCE,12)
+ /* End jump table */
+.set reorder
+
+.end chacha20_mips
+.set at
--
2.19.0
^ permalink raw reply related
* [PATCH net-next v4 05/20] zinc: ChaCha20 x86_64 implementation
From: Jason A. Donenfeld @ 2018-09-14 16:22 UTC (permalink / raw)
To: linux-kernel, netdev, linux-crypto, davem, gregkh
Cc: Jason A. Donenfeld, Samuel Neves, Andy Lutomirski,
Jean-Philippe Aumasson, Andy Polyakov, Thomas Gleixner,
Ingo Molnar, x86
In-Reply-To: <20180914162240.7925-1-Jason@zx2c4.com>
This provides SSSE3, AVX-2, AVX-512F, and AVX-512VL implementations for
ChaCha20. The AVX-512F implementation is disabled on Skylake, due to
throttling, and the VL ymm implementation is used instead. These come
from Andy Polyakov's implementation, with the following modifications
from Samuel Neves:
- Some cosmetic changes, like renaming labels to .Lname, constants,
and other Linux conventions.
- CPU feature checking is done in C by the glue code, so that has been
removed from the assembly.
- Eliminate translating certain instructions, such as pshufb, palignr,
vprotd, etc, to .byte directives. This is meant for compatibility
with ancient toolchains, but presumably it is unnecessary here,
since the build system already does checks on what GNU as can
assemble.
- When aligning the stack, the original code was saving %rsp to %r9.
To keep objtool happy, we use instead the DRAP idiom to save %rsp
to %r10:
leaq 8(%rsp),%r10
... code here ...
leaq -8(%r10),%rsp
- The original code assumes the stack comes aligned to 16 bytes. This
is not necessarily the case, and to avoid crashes,
`andq $-alignment, %rsp` was added in the prolog of a few functions.
- The original hardcodes returns as .byte 0xf3,0xc3, aka "rep ret".
We replace this by "ret". "rep ret" was meant to help with AMD K8
chips, cf. http://repzret.org/p/repzret. It makes no sense to
continue to use this kludge for code that won't even run on ancient
AMD chips.
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Samuel Neves <sneves@dei.uc.pt>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Jean-Philippe Aumasson <jeanphilippe.aumasson@gmail.com>
Cc: Andy Polyakov <appro@openssl.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: x86@kernel.org
---
lib/zinc/Makefile | 4 +
lib/zinc/chacha20/chacha20-x86_64-glue.h | 102 +
lib/zinc/chacha20/chacha20-x86_64.S | 2632 ++++++++++++++++++++++
3 files changed, 2738 insertions(+)
create mode 100644 lib/zinc/chacha20/chacha20-x86_64-glue.h
create mode 100644 lib/zinc/chacha20/chacha20-x86_64.S
diff --git a/lib/zinc/Makefile b/lib/zinc/Makefile
index 8d14cb13349a..32e4bd94ea0b 100644
--- a/lib/zinc/Makefile
+++ b/lib/zinc/Makefile
@@ -5,6 +5,10 @@ ccflags-$(CONFIG_ZINC_DEBUG) += -DDEBUG
ifeq ($(CONFIG_ZINC_CHACHA20),y)
zinc-y += chacha20/chacha20.o
+ifeq ($(CONFIG_ZINC_ARCH_X86_64),y)
+zinc-y += chacha20/chacha20-x86_64.o
+CFLAGS_chacha20.o += -include $(srctree)/$(src)/chacha20/chacha20-x86_64-glue.h
+endif
ifeq ($(CONFIG_ZINC_ARCH_ARM),y)
zinc-y += chacha20/chacha20-arm.o
CFLAGS_chacha20.o += -include $(srctree)/$(src)/chacha20/chacha20-arm-glue.h
diff --git a/lib/zinc/chacha20/chacha20-x86_64-glue.h b/lib/zinc/chacha20/chacha20-x86_64-glue.h
new file mode 100644
index 000000000000..e4f6c3162d3f
--- /dev/null
+++ b/lib/zinc/chacha20/chacha20-x86_64-glue.h
@@ -0,0 +1,102 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <zinc/chacha20.h>
+#include <asm/fpu/api.h>
+#include <asm/cpufeature.h>
+#include <asm/processor.h>
+#include <asm/intel-family.h>
+
+#ifdef CONFIG_AS_SSSE3
+asmlinkage void hchacha20_ssse3(u8 *derived_key, const u8 *nonce,
+ const u8 *key);
+asmlinkage void chacha20_ssse3(u8 *out, const u8 *in, const size_t len,
+ const u32 key[8], const u32 counter[4]);
+#endif
+#ifdef CONFIG_AS_AVX2
+asmlinkage void chacha20_avx2(u8 *out, const u8 *in, const size_t len,
+ const u32 key[8], const u32 counter[4]);
+#endif
+#ifdef CONFIG_AS_AVX512
+asmlinkage void chacha20_avx512(u8 *out, const u8 *in, const size_t len,
+ const u32 key[8], const u32 counter[4]);
+asmlinkage void chacha20_avx512vl(u8 *out, const u8 *in, const size_t len,
+ const u32 key[8], const u32 counter[4]);
+#endif
+
+static bool chacha20_use_ssse3 __ro_after_init;
+static bool chacha20_use_avx2 __ro_after_init;
+static bool chacha20_use_avx512 __ro_after_init;
+static bool chacha20_use_avx512vl __ro_after_init;
+
+void __init chacha20_fpu_init(void)
+{
+ chacha20_use_ssse3 = boot_cpu_has(X86_FEATURE_SSSE3);
+ chacha20_use_avx2 =
+ boot_cpu_has(X86_FEATURE_AVX) &&
+ boot_cpu_has(X86_FEATURE_AVX2) &&
+ cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
+ chacha20_use_avx512 =
+ boot_cpu_has(X86_FEATURE_AVX) &&
+ boot_cpu_has(X86_FEATURE_AVX2) &&
+ boot_cpu_has(X86_FEATURE_AVX512F) &&
+ cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM |
+ XFEATURE_MASK_AVX512, NULL) &&
+ /* Skylake downclocks unacceptably much when using zmm. */
+ boot_cpu_data.x86_model != INTEL_FAM6_SKYLAKE_X;
+ chacha20_use_avx512vl =
+ boot_cpu_has(X86_FEATURE_AVX) &&
+ boot_cpu_has(X86_FEATURE_AVX2) &&
+ boot_cpu_has(X86_FEATURE_AVX512F) &&
+ boot_cpu_has(X86_FEATURE_AVX512VL) &&
+ cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM |
+ XFEATURE_MASK_AVX512, NULL);
+}
+
+static inline bool chacha20_arch(u8 *dst, const u8 *src, const size_t len,
+ const u32 key[8], const u32 counter[4],
+ simd_context_t simd_context)
+{
+ if (simd_context != HAVE_FULL_SIMD)
+ return false;
+
+#ifdef CONFIG_AS_AVX512
+ if (chacha20_use_avx512) {
+ chacha20_avx512(dst, src, len, key, counter);
+ return true;
+ }
+ if (chacha20_use_avx512vl) {
+ chacha20_avx512vl(dst, src, len, key, counter);
+ return true;
+ }
+#endif
+#ifdef CONFIG_AS_AVX2
+ if (chacha20_use_avx2) {
+ chacha20_avx2(dst, src, len, key, counter);
+ return true;
+ }
+#endif
+#ifdef CONFIG_AS_SSSE3
+ if (chacha20_use_ssse3) {
+ chacha20_ssse3(dst, src, len, key, counter);
+ return true;
+ }
+#endif
+ return false;
+}
+
+static inline bool hchacha20_arch(u8 *derived_key, const u8 *nonce,
+ const u8 *key, simd_context_t simd_context)
+{
+#if defined(CONFIG_AS_SSSE3)
+ if (simd_context == HAVE_FULL_SIMD && chacha20_use_ssse3) {
+ hchacha20_ssse3(derived_key, nonce, key);
+ return true;
+ }
+#endif
+ return false;
+}
+
+#define HAVE_CHACHA20_ARCH_IMPLEMENTATION
diff --git a/lib/zinc/chacha20/chacha20-x86_64.S b/lib/zinc/chacha20/chacha20-x86_64.S
new file mode 100644
index 000000000000..3f503a319692
--- /dev/null
+++ b/lib/zinc/chacha20/chacha20-x86_64.S
@@ -0,0 +1,2632 @@
+/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
+ *
+ * Copyright (C) 2017 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ * Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
+ *
+ * This is based in part on Andy Polyakov's implementation from CRYPTOGAMS.
+ */
+
+#include <linux/linkage.h>
+
+.section .rodata.cst16.Lzero, "aM", @progbits, 16
+.align 16
+.Lzero:
+.long 0,0,0,0
+.section .rodata.cst16.Lone, "aM", @progbits, 16
+.align 16
+.Lone:
+.long 1,0,0,0
+.section .rodata.cst16.Linc, "aM", @progbits, 16
+.align 16
+.Linc:
+.long 0,1,2,3
+.section .rodata.cst16.Lfour, "aM", @progbits, 16
+.align 16
+.Lfour:
+.long 4,4,4,4
+.section .rodata.cst32.Lincy, "aM", @progbits, 32
+.align 32
+.Lincy:
+.long 0,2,4,6,1,3,5,7
+.section .rodata.cst32.Leight, "aM", @progbits, 32
+.align 32
+.Leight:
+.long 8,8,8,8,8,8,8,8
+.section .rodata.cst16.Lrot16, "aM", @progbits, 16
+.align 16
+.Lrot16:
+.byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
+.section .rodata.cst16.Lrot24, "aM", @progbits, 16
+.align 16
+.Lrot24:
+.byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
+.section .rodata.cst16.Lsigma, "aM", @progbits, 16
+.align 16
+.Lsigma:
+.byte 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0
+.section .rodata.cst64.Lzeroz, "aM", @progbits, 64
+.align 64
+.Lzeroz:
+.long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
+.section .rodata.cst64.Lfourz, "aM", @progbits, 64
+.align 64
+.Lfourz:
+.long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
+.section .rodata.cst64.Lincz, "aM", @progbits, 64
+.align 64
+.Lincz:
+.long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
+.section .rodata.cst64.Lsixteen, "aM", @progbits, 64
+.align 64
+.Lsixteen:
+.long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
+.section .rodata.cst32.Ltwoy, "aM", @progbits, 32
+.align 64
+.Ltwoy:
+.long 2,0,0,0, 2,0,0,0
+
+.text
+
+#ifdef CONFIG_AS_SSSE3
+.align 32
+ENTRY(hchacha20_ssse3)
+ movdqa .Lsigma(%rip),%xmm0
+ movdqu (%rdx),%xmm1
+ movdqu 16(%rdx),%xmm2
+ movdqu (%rsi),%xmm3
+ movdqa .Lrot16(%rip),%xmm6
+ movdqa .Lrot24(%rip),%xmm7
+ movq $10,%r8
+ .align 32
+.Loop_hssse3:
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm6,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $20,%xmm1
+ pslld $12,%xmm4
+ por %xmm4,%xmm1
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm7,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $25,%xmm1
+ pslld $7,%xmm4
+ por %xmm4,%xmm1
+ pshufd $78,%xmm2,%xmm2
+ pshufd $57,%xmm1,%xmm1
+ pshufd $147,%xmm3,%xmm3
+ nop
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm6,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $20,%xmm1
+ pslld $12,%xmm4
+ por %xmm4,%xmm1
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm7,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $25,%xmm1
+ pslld $7,%xmm4
+ por %xmm4,%xmm1
+ pshufd $78,%xmm2,%xmm2
+ pshufd $147,%xmm1,%xmm1
+ pshufd $57,%xmm3,%xmm3
+ decq %r8
+ jnz .Loop_hssse3
+ movdqu %xmm0,0(%rdi)
+ movdqu %xmm3,16(%rdi)
+ ret
+ENDPROC(hchacha20_ssse3)
+
+.align 32
+ENTRY(chacha20_ssse3)
+.Lchacha20_ssse3:
+ cmpq $0,%rdx
+ je .Lssse3_epilogue
+ leaq 8(%rsp),%r10
+
+ cmpq $128,%rdx
+ ja .Lchacha20_4x
+
+.Ldo_sse3_after_all:
+ subq $64+8,%rsp
+ andq $-32,%rsp
+ movdqa .Lsigma(%rip),%xmm0
+ movdqu (%rcx),%xmm1
+ movdqu 16(%rcx),%xmm2
+ movdqu (%r8),%xmm3
+ movdqa .Lrot16(%rip),%xmm6
+ movdqa .Lrot24(%rip),%xmm7
+
+ movdqa %xmm0,0(%rsp)
+ movdqa %xmm1,16(%rsp)
+ movdqa %xmm2,32(%rsp)
+ movdqa %xmm3,48(%rsp)
+ movq $10,%r8
+ jmp .Loop_ssse3
+
+.align 32
+.Loop_outer_ssse3:
+ movdqa .Lone(%rip),%xmm3
+ movdqa 0(%rsp),%xmm0
+ movdqa 16(%rsp),%xmm1
+ movdqa 32(%rsp),%xmm2
+ paddd 48(%rsp),%xmm3
+ movq $10,%r8
+ movdqa %xmm3,48(%rsp)
+ jmp .Loop_ssse3
+
+.align 32
+.Loop_ssse3:
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm6,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $20,%xmm1
+ pslld $12,%xmm4
+ por %xmm4,%xmm1
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm7,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $25,%xmm1
+ pslld $7,%xmm4
+ por %xmm4,%xmm1
+ pshufd $78,%xmm2,%xmm2
+ pshufd $57,%xmm1,%xmm1
+ pshufd $147,%xmm3,%xmm3
+ nop
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm6,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $20,%xmm1
+ pslld $12,%xmm4
+ por %xmm4,%xmm1
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm7,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $25,%xmm1
+ pslld $7,%xmm4
+ por %xmm4,%xmm1
+ pshufd $78,%xmm2,%xmm2
+ pshufd $147,%xmm1,%xmm1
+ pshufd $57,%xmm3,%xmm3
+ decq %r8
+ jnz .Loop_ssse3
+ paddd 0(%rsp),%xmm0
+ paddd 16(%rsp),%xmm1
+ paddd 32(%rsp),%xmm2
+ paddd 48(%rsp),%xmm3
+
+ cmpq $64,%rdx
+ jb .Ltail_ssse3
+
+ movdqu 0(%rsi),%xmm4
+ movdqu 16(%rsi),%xmm5
+ pxor %xmm4,%xmm0
+ movdqu 32(%rsi),%xmm4
+ pxor %xmm5,%xmm1
+ movdqu 48(%rsi),%xmm5
+ leaq 64(%rsi),%rsi
+ pxor %xmm4,%xmm2
+ pxor %xmm5,%xmm3
+
+ movdqu %xmm0,0(%rdi)
+ movdqu %xmm1,16(%rdi)
+ movdqu %xmm2,32(%rdi)
+ movdqu %xmm3,48(%rdi)
+ leaq 64(%rdi),%rdi
+
+ subq $64,%rdx
+ jnz .Loop_outer_ssse3
+
+ jmp .Ldone_ssse3
+
+.align 16
+.Ltail_ssse3:
+ movdqa %xmm0,0(%rsp)
+ movdqa %xmm1,16(%rsp)
+ movdqa %xmm2,32(%rsp)
+ movdqa %xmm3,48(%rsp)
+ xorq %r8,%r8
+
+.Loop_tail_ssse3:
+ movzbl (%rsi,%r8,1),%eax
+ movzbl (%rsp,%r8,1),%ecx
+ leaq 1(%r8),%r8
+ xorl %ecx,%eax
+ movb %al,-1(%rdi,%r8,1)
+ decq %rdx
+ jnz .Loop_tail_ssse3
+
+.Ldone_ssse3:
+ leaq -8(%r10),%rsp
+
+.Lssse3_epilogue:
+ ret
+
+.align 32
+.Lchacha20_4x:
+ leaq 8(%rsp),%r10
+
+.Lproceed4x:
+ subq $0x140+8,%rsp
+ andq $-32,%rsp
+ movdqa .Lsigma(%rip),%xmm11
+ movdqu (%rcx),%xmm15
+ movdqu 16(%rcx),%xmm7
+ movdqu (%r8),%xmm3
+ leaq 256(%rsp),%rcx
+ leaq .Lrot16(%rip),%r9
+ leaq .Lrot24(%rip),%r11
+
+ pshufd $0x00,%xmm11,%xmm8
+ pshufd $0x55,%xmm11,%xmm9
+ movdqa %xmm8,64(%rsp)
+ pshufd $0xaa,%xmm11,%xmm10
+ movdqa %xmm9,80(%rsp)
+ pshufd $0xff,%xmm11,%xmm11
+ movdqa %xmm10,96(%rsp)
+ movdqa %xmm11,112(%rsp)
+
+ pshufd $0x00,%xmm15,%xmm12
+ pshufd $0x55,%xmm15,%xmm13
+ movdqa %xmm12,128-256(%rcx)
+ pshufd $0xaa,%xmm15,%xmm14
+ movdqa %xmm13,144-256(%rcx)
+ pshufd $0xff,%xmm15,%xmm15
+ movdqa %xmm14,160-256(%rcx)
+ movdqa %xmm15,176-256(%rcx)
+
+ pshufd $0x00,%xmm7,%xmm4
+ pshufd $0x55,%xmm7,%xmm5
+ movdqa %xmm4,192-256(%rcx)
+ pshufd $0xaa,%xmm7,%xmm6
+ movdqa %xmm5,208-256(%rcx)
+ pshufd $0xff,%xmm7,%xmm7
+ movdqa %xmm6,224-256(%rcx)
+ movdqa %xmm7,240-256(%rcx)
+
+ pshufd $0x00,%xmm3,%xmm0
+ pshufd $0x55,%xmm3,%xmm1
+ paddd .Linc(%rip),%xmm0
+ pshufd $0xaa,%xmm3,%xmm2
+ movdqa %xmm1,272-256(%rcx)
+ pshufd $0xff,%xmm3,%xmm3
+ movdqa %xmm2,288-256(%rcx)
+ movdqa %xmm3,304-256(%rcx)
+
+ jmp .Loop_enter4x
+
+.align 32
+.Loop_outer4x:
+ movdqa 64(%rsp),%xmm8
+ movdqa 80(%rsp),%xmm9
+ movdqa 96(%rsp),%xmm10
+ movdqa 112(%rsp),%xmm11
+ movdqa 128-256(%rcx),%xmm12
+ movdqa 144-256(%rcx),%xmm13
+ movdqa 160-256(%rcx),%xmm14
+ movdqa 176-256(%rcx),%xmm15
+ movdqa 192-256(%rcx),%xmm4
+ movdqa 208-256(%rcx),%xmm5
+ movdqa 224-256(%rcx),%xmm6
+ movdqa 240-256(%rcx),%xmm7
+ movdqa 256-256(%rcx),%xmm0
+ movdqa 272-256(%rcx),%xmm1
+ movdqa 288-256(%rcx),%xmm2
+ movdqa 304-256(%rcx),%xmm3
+ paddd .Lfour(%rip),%xmm0
+
+.Loop_enter4x:
+ movdqa %xmm6,32(%rsp)
+ movdqa %xmm7,48(%rsp)
+ movdqa (%r9),%xmm7
+ movl $10,%eax
+ movdqa %xmm0,256-256(%rcx)
+ jmp .Loop4x
+
+.align 32
+.Loop4x:
+ paddd %xmm12,%xmm8
+ paddd %xmm13,%xmm9
+ pxor %xmm8,%xmm0
+ pxor %xmm9,%xmm1
+ pshufb %xmm7,%xmm0
+ pshufb %xmm7,%xmm1
+ paddd %xmm0,%xmm4
+ paddd %xmm1,%xmm5
+ pxor %xmm4,%xmm12
+ pxor %xmm5,%xmm13
+ movdqa %xmm12,%xmm6
+ pslld $12,%xmm12
+ psrld $20,%xmm6
+ movdqa %xmm13,%xmm7
+ pslld $12,%xmm13
+ por %xmm6,%xmm12
+ psrld $20,%xmm7
+ movdqa (%r11),%xmm6
+ por %xmm7,%xmm13
+ paddd %xmm12,%xmm8
+ paddd %xmm13,%xmm9
+ pxor %xmm8,%xmm0
+ pxor %xmm9,%xmm1
+ pshufb %xmm6,%xmm0
+ pshufb %xmm6,%xmm1
+ paddd %xmm0,%xmm4
+ paddd %xmm1,%xmm5
+ pxor %xmm4,%xmm12
+ pxor %xmm5,%xmm13
+ movdqa %xmm12,%xmm7
+ pslld $7,%xmm12
+ psrld $25,%xmm7
+ movdqa %xmm13,%xmm6
+ pslld $7,%xmm13
+ por %xmm7,%xmm12
+ psrld $25,%xmm6
+ movdqa (%r9),%xmm7
+ por %xmm6,%xmm13
+ movdqa %xmm4,0(%rsp)
+ movdqa %xmm5,16(%rsp)
+ movdqa 32(%rsp),%xmm4
+ movdqa 48(%rsp),%xmm5
+ paddd %xmm14,%xmm10
+ paddd %xmm15,%xmm11
+ pxor %xmm10,%xmm2
+ pxor %xmm11,%xmm3
+ pshufb %xmm7,%xmm2
+ pshufb %xmm7,%xmm3
+ paddd %xmm2,%xmm4
+ paddd %xmm3,%xmm5
+ pxor %xmm4,%xmm14
+ pxor %xmm5,%xmm15
+ movdqa %xmm14,%xmm6
+ pslld $12,%xmm14
+ psrld $20,%xmm6
+ movdqa %xmm15,%xmm7
+ pslld $12,%xmm15
+ por %xmm6,%xmm14
+ psrld $20,%xmm7
+ movdqa (%r11),%xmm6
+ por %xmm7,%xmm15
+ paddd %xmm14,%xmm10
+ paddd %xmm15,%xmm11
+ pxor %xmm10,%xmm2
+ pxor %xmm11,%xmm3
+ pshufb %xmm6,%xmm2
+ pshufb %xmm6,%xmm3
+ paddd %xmm2,%xmm4
+ paddd %xmm3,%xmm5
+ pxor %xmm4,%xmm14
+ pxor %xmm5,%xmm15
+ movdqa %xmm14,%xmm7
+ pslld $7,%xmm14
+ psrld $25,%xmm7
+ movdqa %xmm15,%xmm6
+ pslld $7,%xmm15
+ por %xmm7,%xmm14
+ psrld $25,%xmm6
+ movdqa (%r9),%xmm7
+ por %xmm6,%xmm15
+ paddd %xmm13,%xmm8
+ paddd %xmm14,%xmm9
+ pxor %xmm8,%xmm3
+ pxor %xmm9,%xmm0
+ pshufb %xmm7,%xmm3
+ pshufb %xmm7,%xmm0
+ paddd %xmm3,%xmm4
+ paddd %xmm0,%xmm5
+ pxor %xmm4,%xmm13
+ pxor %xmm5,%xmm14
+ movdqa %xmm13,%xmm6
+ pslld $12,%xmm13
+ psrld $20,%xmm6
+ movdqa %xmm14,%xmm7
+ pslld $12,%xmm14
+ por %xmm6,%xmm13
+ psrld $20,%xmm7
+ movdqa (%r11),%xmm6
+ por %xmm7,%xmm14
+ paddd %xmm13,%xmm8
+ paddd %xmm14,%xmm9
+ pxor %xmm8,%xmm3
+ pxor %xmm9,%xmm0
+ pshufb %xmm6,%xmm3
+ pshufb %xmm6,%xmm0
+ paddd %xmm3,%xmm4
+ paddd %xmm0,%xmm5
+ pxor %xmm4,%xmm13
+ pxor %xmm5,%xmm14
+ movdqa %xmm13,%xmm7
+ pslld $7,%xmm13
+ psrld $25,%xmm7
+ movdqa %xmm14,%xmm6
+ pslld $7,%xmm14
+ por %xmm7,%xmm13
+ psrld $25,%xmm6
+ movdqa (%r9),%xmm7
+ por %xmm6,%xmm14
+ movdqa %xmm4,32(%rsp)
+ movdqa %xmm5,48(%rsp)
+ movdqa 0(%rsp),%xmm4
+ movdqa 16(%rsp),%xmm5
+ paddd %xmm15,%xmm10
+ paddd %xmm12,%xmm11
+ pxor %xmm10,%xmm1
+ pxor %xmm11,%xmm2
+ pshufb %xmm7,%xmm1
+ pshufb %xmm7,%xmm2
+ paddd %xmm1,%xmm4
+ paddd %xmm2,%xmm5
+ pxor %xmm4,%xmm15
+ pxor %xmm5,%xmm12
+ movdqa %xmm15,%xmm6
+ pslld $12,%xmm15
+ psrld $20,%xmm6
+ movdqa %xmm12,%xmm7
+ pslld $12,%xmm12
+ por %xmm6,%xmm15
+ psrld $20,%xmm7
+ movdqa (%r11),%xmm6
+ por %xmm7,%xmm12
+ paddd %xmm15,%xmm10
+ paddd %xmm12,%xmm11
+ pxor %xmm10,%xmm1
+ pxor %xmm11,%xmm2
+ pshufb %xmm6,%xmm1
+ pshufb %xmm6,%xmm2
+ paddd %xmm1,%xmm4
+ paddd %xmm2,%xmm5
+ pxor %xmm4,%xmm15
+ pxor %xmm5,%xmm12
+ movdqa %xmm15,%xmm7
+ pslld $7,%xmm15
+ psrld $25,%xmm7
+ movdqa %xmm12,%xmm6
+ pslld $7,%xmm12
+ por %xmm7,%xmm15
+ psrld $25,%xmm6
+ movdqa (%r9),%xmm7
+ por %xmm6,%xmm12
+ decl %eax
+ jnz .Loop4x
+
+ paddd 64(%rsp),%xmm8
+ paddd 80(%rsp),%xmm9
+ paddd 96(%rsp),%xmm10
+ paddd 112(%rsp),%xmm11
+
+ movdqa %xmm8,%xmm6
+ punpckldq %xmm9,%xmm8
+ movdqa %xmm10,%xmm7
+ punpckldq %xmm11,%xmm10
+ punpckhdq %xmm9,%xmm6
+ punpckhdq %xmm11,%xmm7
+ movdqa %xmm8,%xmm9
+ punpcklqdq %xmm10,%xmm8
+ movdqa %xmm6,%xmm11
+ punpcklqdq %xmm7,%xmm6
+ punpckhqdq %xmm10,%xmm9
+ punpckhqdq %xmm7,%xmm11
+ paddd 128-256(%rcx),%xmm12
+ paddd 144-256(%rcx),%xmm13
+ paddd 160-256(%rcx),%xmm14
+ paddd 176-256(%rcx),%xmm15
+
+ movdqa %xmm8,0(%rsp)
+ movdqa %xmm9,16(%rsp)
+ movdqa 32(%rsp),%xmm8
+ movdqa 48(%rsp),%xmm9
+
+ movdqa %xmm12,%xmm10
+ punpckldq %xmm13,%xmm12
+ movdqa %xmm14,%xmm7
+ punpckldq %xmm15,%xmm14
+ punpckhdq %xmm13,%xmm10
+ punpckhdq %xmm15,%xmm7
+ movdqa %xmm12,%xmm13
+ punpcklqdq %xmm14,%xmm12
+ movdqa %xmm10,%xmm15
+ punpcklqdq %xmm7,%xmm10
+ punpckhqdq %xmm14,%xmm13
+ punpckhqdq %xmm7,%xmm15
+ paddd 192-256(%rcx),%xmm4
+ paddd 208-256(%rcx),%xmm5
+ paddd 224-256(%rcx),%xmm8
+ paddd 240-256(%rcx),%xmm9
+
+ movdqa %xmm6,32(%rsp)
+ movdqa %xmm11,48(%rsp)
+
+ movdqa %xmm4,%xmm14
+ punpckldq %xmm5,%xmm4
+ movdqa %xmm8,%xmm7
+ punpckldq %xmm9,%xmm8
+ punpckhdq %xmm5,%xmm14
+ punpckhdq %xmm9,%xmm7
+ movdqa %xmm4,%xmm5
+ punpcklqdq %xmm8,%xmm4
+ movdqa %xmm14,%xmm9
+ punpcklqdq %xmm7,%xmm14
+ punpckhqdq %xmm8,%xmm5
+ punpckhqdq %xmm7,%xmm9
+ paddd 256-256(%rcx),%xmm0
+ paddd 272-256(%rcx),%xmm1
+ paddd 288-256(%rcx),%xmm2
+ paddd 304-256(%rcx),%xmm3
+
+ movdqa %xmm0,%xmm8
+ punpckldq %xmm1,%xmm0
+ movdqa %xmm2,%xmm7
+ punpckldq %xmm3,%xmm2
+ punpckhdq %xmm1,%xmm8
+ punpckhdq %xmm3,%xmm7
+ movdqa %xmm0,%xmm1
+ punpcklqdq %xmm2,%xmm0
+ movdqa %xmm8,%xmm3
+ punpcklqdq %xmm7,%xmm8
+ punpckhqdq %xmm2,%xmm1
+ punpckhqdq %xmm7,%xmm3
+ cmpq $256,%rdx
+ jb .Ltail4x
+
+ movdqu 0(%rsi),%xmm6
+ movdqu 16(%rsi),%xmm11
+ movdqu 32(%rsi),%xmm2
+ movdqu 48(%rsi),%xmm7
+ pxor 0(%rsp),%xmm6
+ pxor %xmm12,%xmm11
+ pxor %xmm4,%xmm2
+ pxor %xmm0,%xmm7
+
+ movdqu %xmm6,0(%rdi)
+ movdqu 64(%rsi),%xmm6
+ movdqu %xmm11,16(%rdi)
+ movdqu 80(%rsi),%xmm11
+ movdqu %xmm2,32(%rdi)
+ movdqu 96(%rsi),%xmm2
+ movdqu %xmm7,48(%rdi)
+ movdqu 112(%rsi),%xmm7
+ leaq 128(%rsi),%rsi
+ pxor 16(%rsp),%xmm6
+ pxor %xmm13,%xmm11
+ pxor %xmm5,%xmm2
+ pxor %xmm1,%xmm7
+
+ movdqu %xmm6,64(%rdi)
+ movdqu 0(%rsi),%xmm6
+ movdqu %xmm11,80(%rdi)
+ movdqu 16(%rsi),%xmm11
+ movdqu %xmm2,96(%rdi)
+ movdqu 32(%rsi),%xmm2
+ movdqu %xmm7,112(%rdi)
+ leaq 128(%rdi),%rdi
+ movdqu 48(%rsi),%xmm7
+ pxor 32(%rsp),%xmm6
+ pxor %xmm10,%xmm11
+ pxor %xmm14,%xmm2
+ pxor %xmm8,%xmm7
+
+ movdqu %xmm6,0(%rdi)
+ movdqu 64(%rsi),%xmm6
+ movdqu %xmm11,16(%rdi)
+ movdqu 80(%rsi),%xmm11
+ movdqu %xmm2,32(%rdi)
+ movdqu 96(%rsi),%xmm2
+ movdqu %xmm7,48(%rdi)
+ movdqu 112(%rsi),%xmm7
+ leaq 128(%rsi),%rsi
+ pxor 48(%rsp),%xmm6
+ pxor %xmm15,%xmm11
+ pxor %xmm9,%xmm2
+ pxor %xmm3,%xmm7
+ movdqu %xmm6,64(%rdi)
+ movdqu %xmm11,80(%rdi)
+ movdqu %xmm2,96(%rdi)
+ movdqu %xmm7,112(%rdi)
+ leaq 128(%rdi),%rdi
+
+ subq $256,%rdx
+ jnz .Loop_outer4x
+
+ jmp .Ldone4x
+
+.Ltail4x:
+ cmpq $192,%rdx
+ jae .L192_or_more4x
+ cmpq $128,%rdx
+ jae .L128_or_more4x
+ cmpq $64,%rdx
+ jae .L64_or_more4x
+
+
+ xorq %r9,%r9
+
+ movdqa %xmm12,16(%rsp)
+ movdqa %xmm4,32(%rsp)
+ movdqa %xmm0,48(%rsp)
+ jmp .Loop_tail4x
+
+.align 32
+.L64_or_more4x:
+ movdqu 0(%rsi),%xmm6
+ movdqu 16(%rsi),%xmm11
+ movdqu 32(%rsi),%xmm2
+ movdqu 48(%rsi),%xmm7
+ pxor 0(%rsp),%xmm6
+ pxor %xmm12,%xmm11
+ pxor %xmm4,%xmm2
+ pxor %xmm0,%xmm7
+ movdqu %xmm6,0(%rdi)
+ movdqu %xmm11,16(%rdi)
+ movdqu %xmm2,32(%rdi)
+ movdqu %xmm7,48(%rdi)
+ je .Ldone4x
+
+ movdqa 16(%rsp),%xmm6
+ leaq 64(%rsi),%rsi
+ xorq %r9,%r9
+ movdqa %xmm6,0(%rsp)
+ movdqa %xmm13,16(%rsp)
+ leaq 64(%rdi),%rdi
+ movdqa %xmm5,32(%rsp)
+ subq $64,%rdx
+ movdqa %xmm1,48(%rsp)
+ jmp .Loop_tail4x
+
+.align 32
+.L128_or_more4x:
+ movdqu 0(%rsi),%xmm6
+ movdqu 16(%rsi),%xmm11
+ movdqu 32(%rsi),%xmm2
+ movdqu 48(%rsi),%xmm7
+ pxor 0(%rsp),%xmm6
+ pxor %xmm12,%xmm11
+ pxor %xmm4,%xmm2
+ pxor %xmm0,%xmm7
+
+ movdqu %xmm6,0(%rdi)
+ movdqu 64(%rsi),%xmm6
+ movdqu %xmm11,16(%rdi)
+ movdqu 80(%rsi),%xmm11
+ movdqu %xmm2,32(%rdi)
+ movdqu 96(%rsi),%xmm2
+ movdqu %xmm7,48(%rdi)
+ movdqu 112(%rsi),%xmm7
+ pxor 16(%rsp),%xmm6
+ pxor %xmm13,%xmm11
+ pxor %xmm5,%xmm2
+ pxor %xmm1,%xmm7
+ movdqu %xmm6,64(%rdi)
+ movdqu %xmm11,80(%rdi)
+ movdqu %xmm2,96(%rdi)
+ movdqu %xmm7,112(%rdi)
+ je .Ldone4x
+
+ movdqa 32(%rsp),%xmm6
+ leaq 128(%rsi),%rsi
+ xorq %r9,%r9
+ movdqa %xmm6,0(%rsp)
+ movdqa %xmm10,16(%rsp)
+ leaq 128(%rdi),%rdi
+ movdqa %xmm14,32(%rsp)
+ subq $128,%rdx
+ movdqa %xmm8,48(%rsp)
+ jmp .Loop_tail4x
+
+.align 32
+.L192_or_more4x:
+ movdqu 0(%rsi),%xmm6
+ movdqu 16(%rsi),%xmm11
+ movdqu 32(%rsi),%xmm2
+ movdqu 48(%rsi),%xmm7
+ pxor 0(%rsp),%xmm6
+ pxor %xmm12,%xmm11
+ pxor %xmm4,%xmm2
+ pxor %xmm0,%xmm7
+
+ movdqu %xmm6,0(%rdi)
+ movdqu 64(%rsi),%xmm6
+ movdqu %xmm11,16(%rdi)
+ movdqu 80(%rsi),%xmm11
+ movdqu %xmm2,32(%rdi)
+ movdqu 96(%rsi),%xmm2
+ movdqu %xmm7,48(%rdi)
+ movdqu 112(%rsi),%xmm7
+ leaq 128(%rsi),%rsi
+ pxor 16(%rsp),%xmm6
+ pxor %xmm13,%xmm11
+ pxor %xmm5,%xmm2
+ pxor %xmm1,%xmm7
+
+ movdqu %xmm6,64(%rdi)
+ movdqu 0(%rsi),%xmm6
+ movdqu %xmm11,80(%rdi)
+ movdqu 16(%rsi),%xmm11
+ movdqu %xmm2,96(%rdi)
+ movdqu 32(%rsi),%xmm2
+ movdqu %xmm7,112(%rdi)
+ leaq 128(%rdi),%rdi
+ movdqu 48(%rsi),%xmm7
+ pxor 32(%rsp),%xmm6
+ pxor %xmm10,%xmm11
+ pxor %xmm14,%xmm2
+ pxor %xmm8,%xmm7
+ movdqu %xmm6,0(%rdi)
+ movdqu %xmm11,16(%rdi)
+ movdqu %xmm2,32(%rdi)
+ movdqu %xmm7,48(%rdi)
+ je .Ldone4x
+
+ movdqa 48(%rsp),%xmm6
+ leaq 64(%rsi),%rsi
+ xorq %r9,%r9
+ movdqa %xmm6,0(%rsp)
+ movdqa %xmm15,16(%rsp)
+ leaq 64(%rdi),%rdi
+ movdqa %xmm9,32(%rsp)
+ subq $192,%rdx
+ movdqa %xmm3,48(%rsp)
+
+.Loop_tail4x:
+ movzbl (%rsi,%r9,1),%eax
+ movzbl (%rsp,%r9,1),%ecx
+ leaq 1(%r9),%r9
+ xorl %ecx,%eax
+ movb %al,-1(%rdi,%r9,1)
+ decq %rdx
+ jnz .Loop_tail4x
+
+.Ldone4x:
+ leaq -8(%r10),%rsp
+
+.L4x_epilogue:
+ ret
+ENDPROC(chacha20_ssse3)
+#endif /* CONFIG_AS_SSSE3 */
+
+#ifdef CONFIG_AS_AVX2
+.align 32
+ENTRY(chacha20_avx2)
+.Lchacha20_avx2:
+ cmpq $0,%rdx
+ je .L8x_epilogue
+ leaq 8(%rsp),%r10
+
+ subq $0x280+8,%rsp
+ andq $-32,%rsp
+ vzeroupper
+
+ vbroadcasti128 .Lsigma(%rip),%ymm11
+ vbroadcasti128 (%rcx),%ymm3
+ vbroadcasti128 16(%rcx),%ymm15
+ vbroadcasti128 (%r8),%ymm7
+ leaq 256(%rsp),%rcx
+ leaq 512(%rsp),%rax
+ leaq .Lrot16(%rip),%r9
+ leaq .Lrot24(%rip),%r11
+
+ vpshufd $0x00,%ymm11,%ymm8
+ vpshufd $0x55,%ymm11,%ymm9
+ vmovdqa %ymm8,128-256(%rcx)
+ vpshufd $0xaa,%ymm11,%ymm10
+ vmovdqa %ymm9,160-256(%rcx)
+ vpshufd $0xff,%ymm11,%ymm11
+ vmovdqa %ymm10,192-256(%rcx)
+ vmovdqa %ymm11,224-256(%rcx)
+
+ vpshufd $0x00,%ymm3,%ymm0
+ vpshufd $0x55,%ymm3,%ymm1
+ vmovdqa %ymm0,256-256(%rcx)
+ vpshufd $0xaa,%ymm3,%ymm2
+ vmovdqa %ymm1,288-256(%rcx)
+ vpshufd $0xff,%ymm3,%ymm3
+ vmovdqa %ymm2,320-256(%rcx)
+ vmovdqa %ymm3,352-256(%rcx)
+
+ vpshufd $0x00,%ymm15,%ymm12
+ vpshufd $0x55,%ymm15,%ymm13
+ vmovdqa %ymm12,384-512(%rax)
+ vpshufd $0xaa,%ymm15,%ymm14
+ vmovdqa %ymm13,416-512(%rax)
+ vpshufd $0xff,%ymm15,%ymm15
+ vmovdqa %ymm14,448-512(%rax)
+ vmovdqa %ymm15,480-512(%rax)
+
+ vpshufd $0x00,%ymm7,%ymm4
+ vpshufd $0x55,%ymm7,%ymm5
+ vpaddd .Lincy(%rip),%ymm4,%ymm4
+ vpshufd $0xaa,%ymm7,%ymm6
+ vmovdqa %ymm5,544-512(%rax)
+ vpshufd $0xff,%ymm7,%ymm7
+ vmovdqa %ymm6,576-512(%rax)
+ vmovdqa %ymm7,608-512(%rax)
+
+ jmp .Loop_enter8x
+
+.align 32
+.Loop_outer8x:
+ vmovdqa 128-256(%rcx),%ymm8
+ vmovdqa 160-256(%rcx),%ymm9
+ vmovdqa 192-256(%rcx),%ymm10
+ vmovdqa 224-256(%rcx),%ymm11
+ vmovdqa 256-256(%rcx),%ymm0
+ vmovdqa 288-256(%rcx),%ymm1
+ vmovdqa 320-256(%rcx),%ymm2
+ vmovdqa 352-256(%rcx),%ymm3
+ vmovdqa 384-512(%rax),%ymm12
+ vmovdqa 416-512(%rax),%ymm13
+ vmovdqa 448-512(%rax),%ymm14
+ vmovdqa 480-512(%rax),%ymm15
+ vmovdqa 512-512(%rax),%ymm4
+ vmovdqa 544-512(%rax),%ymm5
+ vmovdqa 576-512(%rax),%ymm6
+ vmovdqa 608-512(%rax),%ymm7
+ vpaddd .Leight(%rip),%ymm4,%ymm4
+
+.Loop_enter8x:
+ vmovdqa %ymm14,64(%rsp)
+ vmovdqa %ymm15,96(%rsp)
+ vbroadcasti128 (%r9),%ymm15
+ vmovdqa %ymm4,512-512(%rax)
+ movl $10,%eax
+ jmp .Loop8x
+
+.align 32
+.Loop8x:
+ vpaddd %ymm0,%ymm8,%ymm8
+ vpxor %ymm4,%ymm8,%ymm4
+ vpshufb %ymm15,%ymm4,%ymm4
+ vpaddd %ymm1,%ymm9,%ymm9
+ vpxor %ymm5,%ymm9,%ymm5
+ vpshufb %ymm15,%ymm5,%ymm5
+ vpaddd %ymm4,%ymm12,%ymm12
+ vpxor %ymm0,%ymm12,%ymm0
+ vpslld $12,%ymm0,%ymm14
+ vpsrld $20,%ymm0,%ymm0
+ vpor %ymm0,%ymm14,%ymm0
+ vbroadcasti128 (%r11),%ymm14
+ vpaddd %ymm5,%ymm13,%ymm13
+ vpxor %ymm1,%ymm13,%ymm1
+ vpslld $12,%ymm1,%ymm15
+ vpsrld $20,%ymm1,%ymm1
+ vpor %ymm1,%ymm15,%ymm1
+ vpaddd %ymm0,%ymm8,%ymm8
+ vpxor %ymm4,%ymm8,%ymm4
+ vpshufb %ymm14,%ymm4,%ymm4
+ vpaddd %ymm1,%ymm9,%ymm9
+ vpxor %ymm5,%ymm9,%ymm5
+ vpshufb %ymm14,%ymm5,%ymm5
+ vpaddd %ymm4,%ymm12,%ymm12
+ vpxor %ymm0,%ymm12,%ymm0
+ vpslld $7,%ymm0,%ymm15
+ vpsrld $25,%ymm0,%ymm0
+ vpor %ymm0,%ymm15,%ymm0
+ vbroadcasti128 (%r9),%ymm15
+ vpaddd %ymm5,%ymm13,%ymm13
+ vpxor %ymm1,%ymm13,%ymm1
+ vpslld $7,%ymm1,%ymm14
+ vpsrld $25,%ymm1,%ymm1
+ vpor %ymm1,%ymm14,%ymm1
+ vmovdqa %ymm12,0(%rsp)
+ vmovdqa %ymm13,32(%rsp)
+ vmovdqa 64(%rsp),%ymm12
+ vmovdqa 96(%rsp),%ymm13
+ vpaddd %ymm2,%ymm10,%ymm10
+ vpxor %ymm6,%ymm10,%ymm6
+ vpshufb %ymm15,%ymm6,%ymm6
+ vpaddd %ymm3,%ymm11,%ymm11
+ vpxor %ymm7,%ymm11,%ymm7
+ vpshufb %ymm15,%ymm7,%ymm7
+ vpaddd %ymm6,%ymm12,%ymm12
+ vpxor %ymm2,%ymm12,%ymm2
+ vpslld $12,%ymm2,%ymm14
+ vpsrld $20,%ymm2,%ymm2
+ vpor %ymm2,%ymm14,%ymm2
+ vbroadcasti128 (%r11),%ymm14
+ vpaddd %ymm7,%ymm13,%ymm13
+ vpxor %ymm3,%ymm13,%ymm3
+ vpslld $12,%ymm3,%ymm15
+ vpsrld $20,%ymm3,%ymm3
+ vpor %ymm3,%ymm15,%ymm3
+ vpaddd %ymm2,%ymm10,%ymm10
+ vpxor %ymm6,%ymm10,%ymm6
+ vpshufb %ymm14,%ymm6,%ymm6
+ vpaddd %ymm3,%ymm11,%ymm11
+ vpxor %ymm7,%ymm11,%ymm7
+ vpshufb %ymm14,%ymm7,%ymm7
+ vpaddd %ymm6,%ymm12,%ymm12
+ vpxor %ymm2,%ymm12,%ymm2
+ vpslld $7,%ymm2,%ymm15
+ vpsrld $25,%ymm2,%ymm2
+ vpor %ymm2,%ymm15,%ymm2
+ vbroadcasti128 (%r9),%ymm15
+ vpaddd %ymm7,%ymm13,%ymm13
+ vpxor %ymm3,%ymm13,%ymm3
+ vpslld $7,%ymm3,%ymm14
+ vpsrld $25,%ymm3,%ymm3
+ vpor %ymm3,%ymm14,%ymm3
+ vpaddd %ymm1,%ymm8,%ymm8
+ vpxor %ymm7,%ymm8,%ymm7
+ vpshufb %ymm15,%ymm7,%ymm7
+ vpaddd %ymm2,%ymm9,%ymm9
+ vpxor %ymm4,%ymm9,%ymm4
+ vpshufb %ymm15,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm12,%ymm12
+ vpxor %ymm1,%ymm12,%ymm1
+ vpslld $12,%ymm1,%ymm14
+ vpsrld $20,%ymm1,%ymm1
+ vpor %ymm1,%ymm14,%ymm1
+ vbroadcasti128 (%r11),%ymm14
+ vpaddd %ymm4,%ymm13,%ymm13
+ vpxor %ymm2,%ymm13,%ymm2
+ vpslld $12,%ymm2,%ymm15
+ vpsrld $20,%ymm2,%ymm2
+ vpor %ymm2,%ymm15,%ymm2
+ vpaddd %ymm1,%ymm8,%ymm8
+ vpxor %ymm7,%ymm8,%ymm7
+ vpshufb %ymm14,%ymm7,%ymm7
+ vpaddd %ymm2,%ymm9,%ymm9
+ vpxor %ymm4,%ymm9,%ymm4
+ vpshufb %ymm14,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm12,%ymm12
+ vpxor %ymm1,%ymm12,%ymm1
+ vpslld $7,%ymm1,%ymm15
+ vpsrld $25,%ymm1,%ymm1
+ vpor %ymm1,%ymm15,%ymm1
+ vbroadcasti128 (%r9),%ymm15
+ vpaddd %ymm4,%ymm13,%ymm13
+ vpxor %ymm2,%ymm13,%ymm2
+ vpslld $7,%ymm2,%ymm14
+ vpsrld $25,%ymm2,%ymm2
+ vpor %ymm2,%ymm14,%ymm2
+ vmovdqa %ymm12,64(%rsp)
+ vmovdqa %ymm13,96(%rsp)
+ vmovdqa 0(%rsp),%ymm12
+ vmovdqa 32(%rsp),%ymm13
+ vpaddd %ymm3,%ymm10,%ymm10
+ vpxor %ymm5,%ymm10,%ymm5
+ vpshufb %ymm15,%ymm5,%ymm5
+ vpaddd %ymm0,%ymm11,%ymm11
+ vpxor %ymm6,%ymm11,%ymm6
+ vpshufb %ymm15,%ymm6,%ymm6
+ vpaddd %ymm5,%ymm12,%ymm12
+ vpxor %ymm3,%ymm12,%ymm3
+ vpslld $12,%ymm3,%ymm14
+ vpsrld $20,%ymm3,%ymm3
+ vpor %ymm3,%ymm14,%ymm3
+ vbroadcasti128 (%r11),%ymm14
+ vpaddd %ymm6,%ymm13,%ymm13
+ vpxor %ymm0,%ymm13,%ymm0
+ vpslld $12,%ymm0,%ymm15
+ vpsrld $20,%ymm0,%ymm0
+ vpor %ymm0,%ymm15,%ymm0
+ vpaddd %ymm3,%ymm10,%ymm10
+ vpxor %ymm5,%ymm10,%ymm5
+ vpshufb %ymm14,%ymm5,%ymm5
+ vpaddd %ymm0,%ymm11,%ymm11
+ vpxor %ymm6,%ymm11,%ymm6
+ vpshufb %ymm14,%ymm6,%ymm6
+ vpaddd %ymm5,%ymm12,%ymm12
+ vpxor %ymm3,%ymm12,%ymm3
+ vpslld $7,%ymm3,%ymm15
+ vpsrld $25,%ymm3,%ymm3
+ vpor %ymm3,%ymm15,%ymm3
+ vbroadcasti128 (%r9),%ymm15
+ vpaddd %ymm6,%ymm13,%ymm13
+ vpxor %ymm0,%ymm13,%ymm0
+ vpslld $7,%ymm0,%ymm14
+ vpsrld $25,%ymm0,%ymm0
+ vpor %ymm0,%ymm14,%ymm0
+ decl %eax
+ jnz .Loop8x
+
+ leaq 512(%rsp),%rax
+ vpaddd 128-256(%rcx),%ymm8,%ymm8
+ vpaddd 160-256(%rcx),%ymm9,%ymm9
+ vpaddd 192-256(%rcx),%ymm10,%ymm10
+ vpaddd 224-256(%rcx),%ymm11,%ymm11
+
+ vpunpckldq %ymm9,%ymm8,%ymm14
+ vpunpckldq %ymm11,%ymm10,%ymm15
+ vpunpckhdq %ymm9,%ymm8,%ymm8
+ vpunpckhdq %ymm11,%ymm10,%ymm10
+ vpunpcklqdq %ymm15,%ymm14,%ymm9
+ vpunpckhqdq %ymm15,%ymm14,%ymm14
+ vpunpcklqdq %ymm10,%ymm8,%ymm11
+ vpunpckhqdq %ymm10,%ymm8,%ymm8
+ vpaddd 256-256(%rcx),%ymm0,%ymm0
+ vpaddd 288-256(%rcx),%ymm1,%ymm1
+ vpaddd 320-256(%rcx),%ymm2,%ymm2
+ vpaddd 352-256(%rcx),%ymm3,%ymm3
+
+ vpunpckldq %ymm1,%ymm0,%ymm10
+ vpunpckldq %ymm3,%ymm2,%ymm15
+ vpunpckhdq %ymm1,%ymm0,%ymm0
+ vpunpckhdq %ymm3,%ymm2,%ymm2
+ vpunpcklqdq %ymm15,%ymm10,%ymm1
+ vpunpckhqdq %ymm15,%ymm10,%ymm10
+ vpunpcklqdq %ymm2,%ymm0,%ymm3
+ vpunpckhqdq %ymm2,%ymm0,%ymm0
+ vperm2i128 $0x20,%ymm1,%ymm9,%ymm15
+ vperm2i128 $0x31,%ymm1,%ymm9,%ymm1
+ vperm2i128 $0x20,%ymm10,%ymm14,%ymm9
+ vperm2i128 $0x31,%ymm10,%ymm14,%ymm10
+ vperm2i128 $0x20,%ymm3,%ymm11,%ymm14
+ vperm2i128 $0x31,%ymm3,%ymm11,%ymm3
+ vperm2i128 $0x20,%ymm0,%ymm8,%ymm11
+ vperm2i128 $0x31,%ymm0,%ymm8,%ymm0
+ vmovdqa %ymm15,0(%rsp)
+ vmovdqa %ymm9,32(%rsp)
+ vmovdqa 64(%rsp),%ymm15
+ vmovdqa 96(%rsp),%ymm9
+
+ vpaddd 384-512(%rax),%ymm12,%ymm12
+ vpaddd 416-512(%rax),%ymm13,%ymm13
+ vpaddd 448-512(%rax),%ymm15,%ymm15
+ vpaddd 480-512(%rax),%ymm9,%ymm9
+
+ vpunpckldq %ymm13,%ymm12,%ymm2
+ vpunpckldq %ymm9,%ymm15,%ymm8
+ vpunpckhdq %ymm13,%ymm12,%ymm12
+ vpunpckhdq %ymm9,%ymm15,%ymm15
+ vpunpcklqdq %ymm8,%ymm2,%ymm13
+ vpunpckhqdq %ymm8,%ymm2,%ymm2
+ vpunpcklqdq %ymm15,%ymm12,%ymm9
+ vpunpckhqdq %ymm15,%ymm12,%ymm12
+ vpaddd 512-512(%rax),%ymm4,%ymm4
+ vpaddd 544-512(%rax),%ymm5,%ymm5
+ vpaddd 576-512(%rax),%ymm6,%ymm6
+ vpaddd 608-512(%rax),%ymm7,%ymm7
+
+ vpunpckldq %ymm5,%ymm4,%ymm15
+ vpunpckldq %ymm7,%ymm6,%ymm8
+ vpunpckhdq %ymm5,%ymm4,%ymm4
+ vpunpckhdq %ymm7,%ymm6,%ymm6
+ vpunpcklqdq %ymm8,%ymm15,%ymm5
+ vpunpckhqdq %ymm8,%ymm15,%ymm15
+ vpunpcklqdq %ymm6,%ymm4,%ymm7
+ vpunpckhqdq %ymm6,%ymm4,%ymm4
+ vperm2i128 $0x20,%ymm5,%ymm13,%ymm8
+ vperm2i128 $0x31,%ymm5,%ymm13,%ymm5
+ vperm2i128 $0x20,%ymm15,%ymm2,%ymm13
+ vperm2i128 $0x31,%ymm15,%ymm2,%ymm15
+ vperm2i128 $0x20,%ymm7,%ymm9,%ymm2
+ vperm2i128 $0x31,%ymm7,%ymm9,%ymm7
+ vperm2i128 $0x20,%ymm4,%ymm12,%ymm9
+ vperm2i128 $0x31,%ymm4,%ymm12,%ymm4
+ vmovdqa 0(%rsp),%ymm6
+ vmovdqa 32(%rsp),%ymm12
+
+ cmpq $512,%rdx
+ jb .Ltail8x
+
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ leaq 128(%rsi),%rsi
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ leaq 128(%rdi),%rdi
+
+ vpxor 0(%rsi),%ymm12,%ymm12
+ vpxor 32(%rsi),%ymm13,%ymm13
+ vpxor 64(%rsi),%ymm10,%ymm10
+ vpxor 96(%rsi),%ymm15,%ymm15
+ leaq 128(%rsi),%rsi
+ vmovdqu %ymm12,0(%rdi)
+ vmovdqu %ymm13,32(%rdi)
+ vmovdqu %ymm10,64(%rdi)
+ vmovdqu %ymm15,96(%rdi)
+ leaq 128(%rdi),%rdi
+
+ vpxor 0(%rsi),%ymm14,%ymm14
+ vpxor 32(%rsi),%ymm2,%ymm2
+ vpxor 64(%rsi),%ymm3,%ymm3
+ vpxor 96(%rsi),%ymm7,%ymm7
+ leaq 128(%rsi),%rsi
+ vmovdqu %ymm14,0(%rdi)
+ vmovdqu %ymm2,32(%rdi)
+ vmovdqu %ymm3,64(%rdi)
+ vmovdqu %ymm7,96(%rdi)
+ leaq 128(%rdi),%rdi
+
+ vpxor 0(%rsi),%ymm11,%ymm11
+ vpxor 32(%rsi),%ymm9,%ymm9
+ vpxor 64(%rsi),%ymm0,%ymm0
+ vpxor 96(%rsi),%ymm4,%ymm4
+ leaq 128(%rsi),%rsi
+ vmovdqu %ymm11,0(%rdi)
+ vmovdqu %ymm9,32(%rdi)
+ vmovdqu %ymm0,64(%rdi)
+ vmovdqu %ymm4,96(%rdi)
+ leaq 128(%rdi),%rdi
+
+ subq $512,%rdx
+ jnz .Loop_outer8x
+
+ jmp .Ldone8x
+
+.Ltail8x:
+ cmpq $448,%rdx
+ jae .L448_or_more8x
+ cmpq $384,%rdx
+ jae .L384_or_more8x
+ cmpq $320,%rdx
+ jae .L320_or_more8x
+ cmpq $256,%rdx
+ jae .L256_or_more8x
+ cmpq $192,%rdx
+ jae .L192_or_more8x
+ cmpq $128,%rdx
+ jae .L128_or_more8x
+ cmpq $64,%rdx
+ jae .L64_or_more8x
+
+ xorq %r9,%r9
+ vmovdqa %ymm6,0(%rsp)
+ vmovdqa %ymm8,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L64_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ je .Ldone8x
+
+ leaq 64(%rsi),%rsi
+ xorq %r9,%r9
+ vmovdqa %ymm1,0(%rsp)
+ leaq 64(%rdi),%rdi
+ subq $64,%rdx
+ vmovdqa %ymm5,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L128_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ je .Ldone8x
+
+ leaq 128(%rsi),%rsi
+ xorq %r9,%r9
+ vmovdqa %ymm12,0(%rsp)
+ leaq 128(%rdi),%rdi
+ subq $128,%rdx
+ vmovdqa %ymm13,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L192_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ vpxor 128(%rsi),%ymm12,%ymm12
+ vpxor 160(%rsi),%ymm13,%ymm13
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ vmovdqu %ymm12,128(%rdi)
+ vmovdqu %ymm13,160(%rdi)
+ je .Ldone8x
+
+ leaq 192(%rsi),%rsi
+ xorq %r9,%r9
+ vmovdqa %ymm10,0(%rsp)
+ leaq 192(%rdi),%rdi
+ subq $192,%rdx
+ vmovdqa %ymm15,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L256_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ vpxor 128(%rsi),%ymm12,%ymm12
+ vpxor 160(%rsi),%ymm13,%ymm13
+ vpxor 192(%rsi),%ymm10,%ymm10
+ vpxor 224(%rsi),%ymm15,%ymm15
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ vmovdqu %ymm12,128(%rdi)
+ vmovdqu %ymm13,160(%rdi)
+ vmovdqu %ymm10,192(%rdi)
+ vmovdqu %ymm15,224(%rdi)
+ je .Ldone8x
+
+ leaq 256(%rsi),%rsi
+ xorq %r9,%r9
+ vmovdqa %ymm14,0(%rsp)
+ leaq 256(%rdi),%rdi
+ subq $256,%rdx
+ vmovdqa %ymm2,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L320_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ vpxor 128(%rsi),%ymm12,%ymm12
+ vpxor 160(%rsi),%ymm13,%ymm13
+ vpxor 192(%rsi),%ymm10,%ymm10
+ vpxor 224(%rsi),%ymm15,%ymm15
+ vpxor 256(%rsi),%ymm14,%ymm14
+ vpxor 288(%rsi),%ymm2,%ymm2
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ vmovdqu %ymm12,128(%rdi)
+ vmovdqu %ymm13,160(%rdi)
+ vmovdqu %ymm10,192(%rdi)
+ vmovdqu %ymm15,224(%rdi)
+ vmovdqu %ymm14,256(%rdi)
+ vmovdqu %ymm2,288(%rdi)
+ je .Ldone8x
+
+ leaq 320(%rsi),%rsi
+ xorq %r9,%r9
+ vmovdqa %ymm3,0(%rsp)
+ leaq 320(%rdi),%rdi
+ subq $320,%rdx
+ vmovdqa %ymm7,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L384_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ vpxor 128(%rsi),%ymm12,%ymm12
+ vpxor 160(%rsi),%ymm13,%ymm13
+ vpxor 192(%rsi),%ymm10,%ymm10
+ vpxor 224(%rsi),%ymm15,%ymm15
+ vpxor 256(%rsi),%ymm14,%ymm14
+ vpxor 288(%rsi),%ymm2,%ymm2
+ vpxor 320(%rsi),%ymm3,%ymm3
+ vpxor 352(%rsi),%ymm7,%ymm7
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ vmovdqu %ymm12,128(%rdi)
+ vmovdqu %ymm13,160(%rdi)
+ vmovdqu %ymm10,192(%rdi)
+ vmovdqu %ymm15,224(%rdi)
+ vmovdqu %ymm14,256(%rdi)
+ vmovdqu %ymm2,288(%rdi)
+ vmovdqu %ymm3,320(%rdi)
+ vmovdqu %ymm7,352(%rdi)
+ je .Ldone8x
+
+ leaq 384(%rsi),%rsi
+ xorq %r9,%r9
+ vmovdqa %ymm11,0(%rsp)
+ leaq 384(%rdi),%rdi
+ subq $384,%rdx
+ vmovdqa %ymm9,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L448_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ vpxor 128(%rsi),%ymm12,%ymm12
+ vpxor 160(%rsi),%ymm13,%ymm13
+ vpxor 192(%rsi),%ymm10,%ymm10
+ vpxor 224(%rsi),%ymm15,%ymm15
+ vpxor 256(%rsi),%ymm14,%ymm14
+ vpxor 288(%rsi),%ymm2,%ymm2
+ vpxor 320(%rsi),%ymm3,%ymm3
+ vpxor 352(%rsi),%ymm7,%ymm7
+ vpxor 384(%rsi),%ymm11,%ymm11
+ vpxor 416(%rsi),%ymm9,%ymm9
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ vmovdqu %ymm12,128(%rdi)
+ vmovdqu %ymm13,160(%rdi)
+ vmovdqu %ymm10,192(%rdi)
+ vmovdqu %ymm15,224(%rdi)
+ vmovdqu %ymm14,256(%rdi)
+ vmovdqu %ymm2,288(%rdi)
+ vmovdqu %ymm3,320(%rdi)
+ vmovdqu %ymm7,352(%rdi)
+ vmovdqu %ymm11,384(%rdi)
+ vmovdqu %ymm9,416(%rdi)
+ je .Ldone8x
+
+ leaq 448(%rsi),%rsi
+ xorq %r9,%r9
+ vmovdqa %ymm0,0(%rsp)
+ leaq 448(%rdi),%rdi
+ subq $448,%rdx
+ vmovdqa %ymm4,32(%rsp)
+
+.Loop_tail8x:
+ movzbl (%rsi,%r9,1),%eax
+ movzbl (%rsp,%r9,1),%ecx
+ leaq 1(%r9),%r9
+ xorl %ecx,%eax
+ movb %al,-1(%rdi,%r9,1)
+ decq %rdx
+ jnz .Loop_tail8x
+
+.Ldone8x:
+ vzeroall
+ leaq -8(%r10),%rsp
+
+.L8x_epilogue:
+ ret
+ENDPROC(chacha20_avx2)
+#endif /* CONFIG_AS_AVX2 */
+
+#ifdef CONFIG_AS_AVX512
+.align 32
+ENTRY(chacha20_avx512)
+.Lchacha20_avx512:
+ cmpq $0,%rdx
+ je .Lavx512_epilogue
+ leaq 8(%rsp),%r10
+
+ cmpq $512,%rdx
+ ja .Lchacha20_16x
+
+ subq $64+8,%rsp
+ andq $-64,%rsp
+ vbroadcasti32x4 .Lsigma(%rip),%zmm0
+ vbroadcasti32x4 (%rcx),%zmm1
+ vbroadcasti32x4 16(%rcx),%zmm2
+ vbroadcasti32x4 (%r8),%zmm3
+
+ vmovdqa32 %zmm0,%zmm16
+ vmovdqa32 %zmm1,%zmm17
+ vmovdqa32 %zmm2,%zmm18
+ vpaddd .Lzeroz(%rip),%zmm3,%zmm3
+ vmovdqa32 .Lfourz(%rip),%zmm20
+ movq $10,%r8
+ vmovdqa32 %zmm3,%zmm19
+ jmp .Loop_avx512
+
+.align 16
+.Loop_outer_avx512:
+ vmovdqa32 %zmm16,%zmm0
+ vmovdqa32 %zmm17,%zmm1
+ vmovdqa32 %zmm18,%zmm2
+ vpaddd %zmm20,%zmm19,%zmm3
+ movq $10,%r8
+ vmovdqa32 %zmm3,%zmm19
+ jmp .Loop_avx512
+
+.align 32
+.Loop_avx512:
+ vpaddd %zmm1,%zmm0,%zmm0
+ vpxord %zmm0,%zmm3,%zmm3
+ vprold $16,%zmm3,%zmm3
+ vpaddd %zmm3,%zmm2,%zmm2
+ vpxord %zmm2,%zmm1,%zmm1
+ vprold $12,%zmm1,%zmm1
+ vpaddd %zmm1,%zmm0,%zmm0
+ vpxord %zmm0,%zmm3,%zmm3
+ vprold $8,%zmm3,%zmm3
+ vpaddd %zmm3,%zmm2,%zmm2
+ vpxord %zmm2,%zmm1,%zmm1
+ vprold $7,%zmm1,%zmm1
+ vpshufd $78,%zmm2,%zmm2
+ vpshufd $57,%zmm1,%zmm1
+ vpshufd $147,%zmm3,%zmm3
+ vpaddd %zmm1,%zmm0,%zmm0
+ vpxord %zmm0,%zmm3,%zmm3
+ vprold $16,%zmm3,%zmm3
+ vpaddd %zmm3,%zmm2,%zmm2
+ vpxord %zmm2,%zmm1,%zmm1
+ vprold $12,%zmm1,%zmm1
+ vpaddd %zmm1,%zmm0,%zmm0
+ vpxord %zmm0,%zmm3,%zmm3
+ vprold $8,%zmm3,%zmm3
+ vpaddd %zmm3,%zmm2,%zmm2
+ vpxord %zmm2,%zmm1,%zmm1
+ vprold $7,%zmm1,%zmm1
+ vpshufd $78,%zmm2,%zmm2
+ vpshufd $147,%zmm1,%zmm1
+ vpshufd $57,%zmm3,%zmm3
+ decq %r8
+ jnz .Loop_avx512
+ vpaddd %zmm16,%zmm0,%zmm0
+ vpaddd %zmm17,%zmm1,%zmm1
+ vpaddd %zmm18,%zmm2,%zmm2
+ vpaddd %zmm19,%zmm3,%zmm3
+
+ subq $64,%rdx
+ jb .Ltail64_avx512
+
+ vpxor 0(%rsi),%xmm0,%xmm4
+ vpxor 16(%rsi),%xmm1,%xmm5
+ vpxor 32(%rsi),%xmm2,%xmm6
+ vpxor 48(%rsi),%xmm3,%xmm7
+ leaq 64(%rsi),%rsi
+
+ vmovdqu %xmm4,0(%rdi)
+ vmovdqu %xmm5,16(%rdi)
+ vmovdqu %xmm6,32(%rdi)
+ vmovdqu %xmm7,48(%rdi)
+ leaq 64(%rdi),%rdi
+
+ jz .Ldone_avx512
+
+ vextracti32x4 $1,%zmm0,%xmm4
+ vextracti32x4 $1,%zmm1,%xmm5
+ vextracti32x4 $1,%zmm2,%xmm6
+ vextracti32x4 $1,%zmm3,%xmm7
+
+ subq $64,%rdx
+ jb .Ltail_avx512
+
+ vpxor 0(%rsi),%xmm4,%xmm4
+ vpxor 16(%rsi),%xmm5,%xmm5
+ vpxor 32(%rsi),%xmm6,%xmm6
+ vpxor 48(%rsi),%xmm7,%xmm7
+ leaq 64(%rsi),%rsi
+
+ vmovdqu %xmm4,0(%rdi)
+ vmovdqu %xmm5,16(%rdi)
+ vmovdqu %xmm6,32(%rdi)
+ vmovdqu %xmm7,48(%rdi)
+ leaq 64(%rdi),%rdi
+
+ jz .Ldone_avx512
+
+ vextracti32x4 $2,%zmm0,%xmm4
+ vextracti32x4 $2,%zmm1,%xmm5
+ vextracti32x4 $2,%zmm2,%xmm6
+ vextracti32x4 $2,%zmm3,%xmm7
+
+ subq $64,%rdx
+ jb .Ltail_avx512
+
+ vpxor 0(%rsi),%xmm4,%xmm4
+ vpxor 16(%rsi),%xmm5,%xmm5
+ vpxor 32(%rsi),%xmm6,%xmm6
+ vpxor 48(%rsi),%xmm7,%xmm7
+ leaq 64(%rsi),%rsi
+
+ vmovdqu %xmm4,0(%rdi)
+ vmovdqu %xmm5,16(%rdi)
+ vmovdqu %xmm6,32(%rdi)
+ vmovdqu %xmm7,48(%rdi)
+ leaq 64(%rdi),%rdi
+
+ jz .Ldone_avx512
+
+ vextracti32x4 $3,%zmm0,%xmm4
+ vextracti32x4 $3,%zmm1,%xmm5
+ vextracti32x4 $3,%zmm2,%xmm6
+ vextracti32x4 $3,%zmm3,%xmm7
+
+ subq $64,%rdx
+ jb .Ltail_avx512
+
+ vpxor 0(%rsi),%xmm4,%xmm4
+ vpxor 16(%rsi),%xmm5,%xmm5
+ vpxor 32(%rsi),%xmm6,%xmm6
+ vpxor 48(%rsi),%xmm7,%xmm7
+ leaq 64(%rsi),%rsi
+
+ vmovdqu %xmm4,0(%rdi)
+ vmovdqu %xmm5,16(%rdi)
+ vmovdqu %xmm6,32(%rdi)
+ vmovdqu %xmm7,48(%rdi)
+ leaq 64(%rdi),%rdi
+
+ jnz .Loop_outer_avx512
+
+ jmp .Ldone_avx512
+
+.align 16
+.Ltail64_avx512:
+ vmovdqa %xmm0,0(%rsp)
+ vmovdqa %xmm1,16(%rsp)
+ vmovdqa %xmm2,32(%rsp)
+ vmovdqa %xmm3,48(%rsp)
+ addq $64,%rdx
+ jmp .Loop_tail_avx512
+
+.align 16
+.Ltail_avx512:
+ vmovdqa %xmm4,0(%rsp)
+ vmovdqa %xmm5,16(%rsp)
+ vmovdqa %xmm6,32(%rsp)
+ vmovdqa %xmm7,48(%rsp)
+ addq $64,%rdx
+
+.Loop_tail_avx512:
+ movzbl (%rsi,%r8,1),%eax
+ movzbl (%rsp,%r8,1),%ecx
+ leaq 1(%r8),%r8
+ xorl %ecx,%eax
+ movb %al,-1(%rdi,%r8,1)
+ decq %rdx
+ jnz .Loop_tail_avx512
+
+ vmovdqa32 %zmm16,0(%rsp)
+
+.Ldone_avx512:
+ vzeroall
+ leaq -8(%r10),%rsp
+
+.Lavx512_epilogue:
+ ret
+
+.align 32
+.Lchacha20_16x:
+ leaq 8(%rsp),%r10
+
+ subq $64+8,%rsp
+ andq $-64,%rsp
+ vzeroupper
+
+ leaq .Lsigma(%rip),%r9
+ vbroadcasti32x4 (%r9),%zmm3
+ vbroadcasti32x4 (%rcx),%zmm7
+ vbroadcasti32x4 16(%rcx),%zmm11
+ vbroadcasti32x4 (%r8),%zmm15
+
+ vpshufd $0x00,%zmm3,%zmm0
+ vpshufd $0x55,%zmm3,%zmm1
+ vpshufd $0xaa,%zmm3,%zmm2
+ vpshufd $0xff,%zmm3,%zmm3
+ vmovdqa64 %zmm0,%zmm16
+ vmovdqa64 %zmm1,%zmm17
+ vmovdqa64 %zmm2,%zmm18
+ vmovdqa64 %zmm3,%zmm19
+
+ vpshufd $0x00,%zmm7,%zmm4
+ vpshufd $0x55,%zmm7,%zmm5
+ vpshufd $0xaa,%zmm7,%zmm6
+ vpshufd $0xff,%zmm7,%zmm7
+ vmovdqa64 %zmm4,%zmm20
+ vmovdqa64 %zmm5,%zmm21
+ vmovdqa64 %zmm6,%zmm22
+ vmovdqa64 %zmm7,%zmm23
+
+ vpshufd $0x00,%zmm11,%zmm8
+ vpshufd $0x55,%zmm11,%zmm9
+ vpshufd $0xaa,%zmm11,%zmm10
+ vpshufd $0xff,%zmm11,%zmm11
+ vmovdqa64 %zmm8,%zmm24
+ vmovdqa64 %zmm9,%zmm25
+ vmovdqa64 %zmm10,%zmm26
+ vmovdqa64 %zmm11,%zmm27
+
+ vpshufd $0x00,%zmm15,%zmm12
+ vpshufd $0x55,%zmm15,%zmm13
+ vpshufd $0xaa,%zmm15,%zmm14
+ vpshufd $0xff,%zmm15,%zmm15
+ vpaddd .Lincz(%rip),%zmm12,%zmm12
+ vmovdqa64 %zmm12,%zmm28
+ vmovdqa64 %zmm13,%zmm29
+ vmovdqa64 %zmm14,%zmm30
+ vmovdqa64 %zmm15,%zmm31
+
+ movl $10,%eax
+ jmp .Loop16x
+
+.align 32
+.Loop_outer16x:
+ vpbroadcastd 0(%r9),%zmm0
+ vpbroadcastd 4(%r9),%zmm1
+ vpbroadcastd 8(%r9),%zmm2
+ vpbroadcastd 12(%r9),%zmm3
+ vpaddd .Lsixteen(%rip),%zmm28,%zmm28
+ vmovdqa64 %zmm20,%zmm4
+ vmovdqa64 %zmm21,%zmm5
+ vmovdqa64 %zmm22,%zmm6
+ vmovdqa64 %zmm23,%zmm7
+ vmovdqa64 %zmm24,%zmm8
+ vmovdqa64 %zmm25,%zmm9
+ vmovdqa64 %zmm26,%zmm10
+ vmovdqa64 %zmm27,%zmm11
+ vmovdqa64 %zmm28,%zmm12
+ vmovdqa64 %zmm29,%zmm13
+ vmovdqa64 %zmm30,%zmm14
+ vmovdqa64 %zmm31,%zmm15
+
+ vmovdqa64 %zmm0,%zmm16
+ vmovdqa64 %zmm1,%zmm17
+ vmovdqa64 %zmm2,%zmm18
+ vmovdqa64 %zmm3,%zmm19
+
+ movl $10,%eax
+ jmp .Loop16x
+
+.align 32
+.Loop16x:
+ vpaddd %zmm4,%zmm0,%zmm0
+ vpaddd %zmm5,%zmm1,%zmm1
+ vpaddd %zmm6,%zmm2,%zmm2
+ vpaddd %zmm7,%zmm3,%zmm3
+ vpxord %zmm0,%zmm12,%zmm12
+ vpxord %zmm1,%zmm13,%zmm13
+ vpxord %zmm2,%zmm14,%zmm14
+ vpxord %zmm3,%zmm15,%zmm15
+ vprold $16,%zmm12,%zmm12
+ vprold $16,%zmm13,%zmm13
+ vprold $16,%zmm14,%zmm14
+ vprold $16,%zmm15,%zmm15
+ vpaddd %zmm12,%zmm8,%zmm8
+ vpaddd %zmm13,%zmm9,%zmm9
+ vpaddd %zmm14,%zmm10,%zmm10
+ vpaddd %zmm15,%zmm11,%zmm11
+ vpxord %zmm8,%zmm4,%zmm4
+ vpxord %zmm9,%zmm5,%zmm5
+ vpxord %zmm10,%zmm6,%zmm6
+ vpxord %zmm11,%zmm7,%zmm7
+ vprold $12,%zmm4,%zmm4
+ vprold $12,%zmm5,%zmm5
+ vprold $12,%zmm6,%zmm6
+ vprold $12,%zmm7,%zmm7
+ vpaddd %zmm4,%zmm0,%zmm0
+ vpaddd %zmm5,%zmm1,%zmm1
+ vpaddd %zmm6,%zmm2,%zmm2
+ vpaddd %zmm7,%zmm3,%zmm3
+ vpxord %zmm0,%zmm12,%zmm12
+ vpxord %zmm1,%zmm13,%zmm13
+ vpxord %zmm2,%zmm14,%zmm14
+ vpxord %zmm3,%zmm15,%zmm15
+ vprold $8,%zmm12,%zmm12
+ vprold $8,%zmm13,%zmm13
+ vprold $8,%zmm14,%zmm14
+ vprold $8,%zmm15,%zmm15
+ vpaddd %zmm12,%zmm8,%zmm8
+ vpaddd %zmm13,%zmm9,%zmm9
+ vpaddd %zmm14,%zmm10,%zmm10
+ vpaddd %zmm15,%zmm11,%zmm11
+ vpxord %zmm8,%zmm4,%zmm4
+ vpxord %zmm9,%zmm5,%zmm5
+ vpxord %zmm10,%zmm6,%zmm6
+ vpxord %zmm11,%zmm7,%zmm7
+ vprold $7,%zmm4,%zmm4
+ vprold $7,%zmm5,%zmm5
+ vprold $7,%zmm6,%zmm6
+ vprold $7,%zmm7,%zmm7
+ vpaddd %zmm5,%zmm0,%zmm0
+ vpaddd %zmm6,%zmm1,%zmm1
+ vpaddd %zmm7,%zmm2,%zmm2
+ vpaddd %zmm4,%zmm3,%zmm3
+ vpxord %zmm0,%zmm15,%zmm15
+ vpxord %zmm1,%zmm12,%zmm12
+ vpxord %zmm2,%zmm13,%zmm13
+ vpxord %zmm3,%zmm14,%zmm14
+ vprold $16,%zmm15,%zmm15
+ vprold $16,%zmm12,%zmm12
+ vprold $16,%zmm13,%zmm13
+ vprold $16,%zmm14,%zmm14
+ vpaddd %zmm15,%zmm10,%zmm10
+ vpaddd %zmm12,%zmm11,%zmm11
+ vpaddd %zmm13,%zmm8,%zmm8
+ vpaddd %zmm14,%zmm9,%zmm9
+ vpxord %zmm10,%zmm5,%zmm5
+ vpxord %zmm11,%zmm6,%zmm6
+ vpxord %zmm8,%zmm7,%zmm7
+ vpxord %zmm9,%zmm4,%zmm4
+ vprold $12,%zmm5,%zmm5
+ vprold $12,%zmm6,%zmm6
+ vprold $12,%zmm7,%zmm7
+ vprold $12,%zmm4,%zmm4
+ vpaddd %zmm5,%zmm0,%zmm0
+ vpaddd %zmm6,%zmm1,%zmm1
+ vpaddd %zmm7,%zmm2,%zmm2
+ vpaddd %zmm4,%zmm3,%zmm3
+ vpxord %zmm0,%zmm15,%zmm15
+ vpxord %zmm1,%zmm12,%zmm12
+ vpxord %zmm2,%zmm13,%zmm13
+ vpxord %zmm3,%zmm14,%zmm14
+ vprold $8,%zmm15,%zmm15
+ vprold $8,%zmm12,%zmm12
+ vprold $8,%zmm13,%zmm13
+ vprold $8,%zmm14,%zmm14
+ vpaddd %zmm15,%zmm10,%zmm10
+ vpaddd %zmm12,%zmm11,%zmm11
+ vpaddd %zmm13,%zmm8,%zmm8
+ vpaddd %zmm14,%zmm9,%zmm9
+ vpxord %zmm10,%zmm5,%zmm5
+ vpxord %zmm11,%zmm6,%zmm6
+ vpxord %zmm8,%zmm7,%zmm7
+ vpxord %zmm9,%zmm4,%zmm4
+ vprold $7,%zmm5,%zmm5
+ vprold $7,%zmm6,%zmm6
+ vprold $7,%zmm7,%zmm7
+ vprold $7,%zmm4,%zmm4
+ decl %eax
+ jnz .Loop16x
+
+ vpaddd %zmm16,%zmm0,%zmm0
+ vpaddd %zmm17,%zmm1,%zmm1
+ vpaddd %zmm18,%zmm2,%zmm2
+ vpaddd %zmm19,%zmm3,%zmm3
+
+ vpunpckldq %zmm1,%zmm0,%zmm18
+ vpunpckldq %zmm3,%zmm2,%zmm19
+ vpunpckhdq %zmm1,%zmm0,%zmm0
+ vpunpckhdq %zmm3,%zmm2,%zmm2
+ vpunpcklqdq %zmm19,%zmm18,%zmm1
+ vpunpckhqdq %zmm19,%zmm18,%zmm18
+ vpunpcklqdq %zmm2,%zmm0,%zmm3
+ vpunpckhqdq %zmm2,%zmm0,%zmm0
+ vpaddd %zmm20,%zmm4,%zmm4
+ vpaddd %zmm21,%zmm5,%zmm5
+ vpaddd %zmm22,%zmm6,%zmm6
+ vpaddd %zmm23,%zmm7,%zmm7
+
+ vpunpckldq %zmm5,%zmm4,%zmm2
+ vpunpckldq %zmm7,%zmm6,%zmm19
+ vpunpckhdq %zmm5,%zmm4,%zmm4
+ vpunpckhdq %zmm7,%zmm6,%zmm6
+ vpunpcklqdq %zmm19,%zmm2,%zmm5
+ vpunpckhqdq %zmm19,%zmm2,%zmm2
+ vpunpcklqdq %zmm6,%zmm4,%zmm7
+ vpunpckhqdq %zmm6,%zmm4,%zmm4
+ vshufi32x4 $0x44,%zmm5,%zmm1,%zmm19
+ vshufi32x4 $0xee,%zmm5,%zmm1,%zmm5
+ vshufi32x4 $0x44,%zmm2,%zmm18,%zmm1
+ vshufi32x4 $0xee,%zmm2,%zmm18,%zmm2
+ vshufi32x4 $0x44,%zmm7,%zmm3,%zmm18
+ vshufi32x4 $0xee,%zmm7,%zmm3,%zmm7
+ vshufi32x4 $0x44,%zmm4,%zmm0,%zmm3
+ vshufi32x4 $0xee,%zmm4,%zmm0,%zmm4
+ vpaddd %zmm24,%zmm8,%zmm8
+ vpaddd %zmm25,%zmm9,%zmm9
+ vpaddd %zmm26,%zmm10,%zmm10
+ vpaddd %zmm27,%zmm11,%zmm11
+
+ vpunpckldq %zmm9,%zmm8,%zmm6
+ vpunpckldq %zmm11,%zmm10,%zmm0
+ vpunpckhdq %zmm9,%zmm8,%zmm8
+ vpunpckhdq %zmm11,%zmm10,%zmm10
+ vpunpcklqdq %zmm0,%zmm6,%zmm9
+ vpunpckhqdq %zmm0,%zmm6,%zmm6
+ vpunpcklqdq %zmm10,%zmm8,%zmm11
+ vpunpckhqdq %zmm10,%zmm8,%zmm8
+ vpaddd %zmm28,%zmm12,%zmm12
+ vpaddd %zmm29,%zmm13,%zmm13
+ vpaddd %zmm30,%zmm14,%zmm14
+ vpaddd %zmm31,%zmm15,%zmm15
+
+ vpunpckldq %zmm13,%zmm12,%zmm10
+ vpunpckldq %zmm15,%zmm14,%zmm0
+ vpunpckhdq %zmm13,%zmm12,%zmm12
+ vpunpckhdq %zmm15,%zmm14,%zmm14
+ vpunpcklqdq %zmm0,%zmm10,%zmm13
+ vpunpckhqdq %zmm0,%zmm10,%zmm10
+ vpunpcklqdq %zmm14,%zmm12,%zmm15
+ vpunpckhqdq %zmm14,%zmm12,%zmm12
+ vshufi32x4 $0x44,%zmm13,%zmm9,%zmm0
+ vshufi32x4 $0xee,%zmm13,%zmm9,%zmm13
+ vshufi32x4 $0x44,%zmm10,%zmm6,%zmm9
+ vshufi32x4 $0xee,%zmm10,%zmm6,%zmm10
+ vshufi32x4 $0x44,%zmm15,%zmm11,%zmm6
+ vshufi32x4 $0xee,%zmm15,%zmm11,%zmm15
+ vshufi32x4 $0x44,%zmm12,%zmm8,%zmm11
+ vshufi32x4 $0xee,%zmm12,%zmm8,%zmm12
+ vshufi32x4 $0x88,%zmm0,%zmm19,%zmm16
+ vshufi32x4 $0xdd,%zmm0,%zmm19,%zmm19
+ vshufi32x4 $0x88,%zmm13,%zmm5,%zmm0
+ vshufi32x4 $0xdd,%zmm13,%zmm5,%zmm13
+ vshufi32x4 $0x88,%zmm9,%zmm1,%zmm17
+ vshufi32x4 $0xdd,%zmm9,%zmm1,%zmm1
+ vshufi32x4 $0x88,%zmm10,%zmm2,%zmm9
+ vshufi32x4 $0xdd,%zmm10,%zmm2,%zmm10
+ vshufi32x4 $0x88,%zmm6,%zmm18,%zmm14
+ vshufi32x4 $0xdd,%zmm6,%zmm18,%zmm18
+ vshufi32x4 $0x88,%zmm15,%zmm7,%zmm6
+ vshufi32x4 $0xdd,%zmm15,%zmm7,%zmm15
+ vshufi32x4 $0x88,%zmm11,%zmm3,%zmm8
+ vshufi32x4 $0xdd,%zmm11,%zmm3,%zmm3
+ vshufi32x4 $0x88,%zmm12,%zmm4,%zmm11
+ vshufi32x4 $0xdd,%zmm12,%zmm4,%zmm12
+ cmpq $1024,%rdx
+ jb .Ltail16x
+
+ vpxord 0(%rsi),%zmm16,%zmm16
+ vpxord 64(%rsi),%zmm17,%zmm17
+ vpxord 128(%rsi),%zmm14,%zmm14
+ vpxord 192(%rsi),%zmm8,%zmm8
+ vmovdqu32 %zmm16,0(%rdi)
+ vmovdqu32 %zmm17,64(%rdi)
+ vmovdqu32 %zmm14,128(%rdi)
+ vmovdqu32 %zmm8,192(%rdi)
+
+ vpxord 256(%rsi),%zmm19,%zmm19
+ vpxord 320(%rsi),%zmm1,%zmm1
+ vpxord 384(%rsi),%zmm18,%zmm18
+ vpxord 448(%rsi),%zmm3,%zmm3
+ vmovdqu32 %zmm19,256(%rdi)
+ vmovdqu32 %zmm1,320(%rdi)
+ vmovdqu32 %zmm18,384(%rdi)
+ vmovdqu32 %zmm3,448(%rdi)
+
+ vpxord 512(%rsi),%zmm0,%zmm0
+ vpxord 576(%rsi),%zmm9,%zmm9
+ vpxord 640(%rsi),%zmm6,%zmm6
+ vpxord 704(%rsi),%zmm11,%zmm11
+ vmovdqu32 %zmm0,512(%rdi)
+ vmovdqu32 %zmm9,576(%rdi)
+ vmovdqu32 %zmm6,640(%rdi)
+ vmovdqu32 %zmm11,704(%rdi)
+
+ vpxord 768(%rsi),%zmm13,%zmm13
+ vpxord 832(%rsi),%zmm10,%zmm10
+ vpxord 896(%rsi),%zmm15,%zmm15
+ vpxord 960(%rsi),%zmm12,%zmm12
+ leaq 1024(%rsi),%rsi
+ vmovdqu32 %zmm13,768(%rdi)
+ vmovdqu32 %zmm10,832(%rdi)
+ vmovdqu32 %zmm15,896(%rdi)
+ vmovdqu32 %zmm12,960(%rdi)
+ leaq 1024(%rdi),%rdi
+
+ subq $1024,%rdx
+ jnz .Loop_outer16x
+
+ jmp .Ldone16x
+
+.align 32
+.Ltail16x:
+ xorq %r9,%r9
+ subq %rsi,%rdi
+ cmpq $64,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm16,%zmm16
+ vmovdqu32 %zmm16,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm17,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $128,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm17,%zmm17
+ vmovdqu32 %zmm17,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm14,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $192,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm14,%zmm14
+ vmovdqu32 %zmm14,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm8,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $256,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm8,%zmm8
+ vmovdqu32 %zmm8,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm19,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $320,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm19,%zmm19
+ vmovdqu32 %zmm19,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm1,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $384,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm1,%zmm1
+ vmovdqu32 %zmm1,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm18,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $448,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm18,%zmm18
+ vmovdqu32 %zmm18,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm3,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $512,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm3,%zmm3
+ vmovdqu32 %zmm3,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm0,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $576,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm0,%zmm0
+ vmovdqu32 %zmm0,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm9,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $640,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm9,%zmm9
+ vmovdqu32 %zmm9,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm6,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $704,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm6,%zmm6
+ vmovdqu32 %zmm6,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm11,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $768,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm11,%zmm11
+ vmovdqu32 %zmm11,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm13,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $832,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm13,%zmm13
+ vmovdqu32 %zmm13,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm10,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $896,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm10,%zmm10
+ vmovdqu32 %zmm10,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm15,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $960,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm15,%zmm15
+ vmovdqu32 %zmm15,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm12,%zmm16
+ leaq 64(%rsi),%rsi
+
+.Less_than_64_16x:
+ vmovdqa32 %zmm16,0(%rsp)
+ leaq (%rdi,%rsi,1),%rdi
+ andq $63,%rdx
+
+.Loop_tail16x:
+ movzbl (%rsi,%r9,1),%eax
+ movzbl (%rsp,%r9,1),%ecx
+ leaq 1(%r9),%r9
+ xorl %ecx,%eax
+ movb %al,-1(%rdi,%r9,1)
+ decq %rdx
+ jnz .Loop_tail16x
+
+ vpxord %zmm16,%zmm16,%zmm16
+ vmovdqa32 %zmm16,0(%rsp)
+
+.Ldone16x:
+ vzeroall
+ leaq -8(%r10),%rsp
+
+.L16x_epilogue:
+ ret
+ENDPROC(chacha20_avx512)
+
+.align 32
+ENTRY(chacha20_avx512vl)
+ cmpq $0,%rdx
+ je .Lavx512vl_epilogue
+
+ leaq 8(%rsp),%r10
+
+ cmpq $128,%rdx
+ ja .Lchacha20_8xvl
+
+ subq $64+8,%rsp
+ andq $-64,%rsp
+ vbroadcasti128 .Lsigma(%rip),%ymm0
+ vbroadcasti128 (%rcx),%ymm1
+ vbroadcasti128 16(%rcx),%ymm2
+ vbroadcasti128 (%r8),%ymm3
+
+ vmovdqa32 %ymm0,%ymm16
+ vmovdqa32 %ymm1,%ymm17
+ vmovdqa32 %ymm2,%ymm18
+ vpaddd .Lzeroz(%rip),%ymm3,%ymm3
+ vmovdqa32 .Ltwoy(%rip),%ymm20
+ movq $10,%r8
+ vmovdqa32 %ymm3,%ymm19
+ jmp .Loop_avx512vl
+
+.align 16
+.Loop_outer_avx512vl:
+ vmovdqa32 %ymm18,%ymm2
+ vpaddd %ymm20,%ymm19,%ymm3
+ movq $10,%r8
+ vmovdqa32 %ymm3,%ymm19
+ jmp .Loop_avx512vl
+
+.align 32
+.Loop_avx512vl:
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vprold $16,%ymm3,%ymm3
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vprold $12,%ymm1,%ymm1
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vprold $8,%ymm3,%ymm3
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vprold $7,%ymm1,%ymm1
+ vpshufd $78,%ymm2,%ymm2
+ vpshufd $57,%ymm1,%ymm1
+ vpshufd $147,%ymm3,%ymm3
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vprold $16,%ymm3,%ymm3
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vprold $12,%ymm1,%ymm1
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vprold $8,%ymm3,%ymm3
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vprold $7,%ymm1,%ymm1
+ vpshufd $78,%ymm2,%ymm2
+ vpshufd $147,%ymm1,%ymm1
+ vpshufd $57,%ymm3,%ymm3
+ decq %r8
+ jnz .Loop_avx512vl
+ vpaddd %ymm16,%ymm0,%ymm0
+ vpaddd %ymm17,%ymm1,%ymm1
+ vpaddd %ymm18,%ymm2,%ymm2
+ vpaddd %ymm19,%ymm3,%ymm3
+
+ subq $64,%rdx
+ jb .Ltail64_avx512vl
+
+ vpxor 0(%rsi),%xmm0,%xmm4
+ vpxor 16(%rsi),%xmm1,%xmm5
+ vpxor 32(%rsi),%xmm2,%xmm6
+ vpxor 48(%rsi),%xmm3,%xmm7
+ leaq 64(%rsi),%rsi
+
+ vmovdqu %xmm4,0(%rdi)
+ vmovdqu %xmm5,16(%rdi)
+ vmovdqu %xmm6,32(%rdi)
+ vmovdqu %xmm7,48(%rdi)
+ leaq 64(%rdi),%rdi
+
+ jz .Ldone_avx512vl
+
+ vextracti128 $1,%ymm0,%xmm4
+ vextracti128 $1,%ymm1,%xmm5
+ vextracti128 $1,%ymm2,%xmm6
+ vextracti128 $1,%ymm3,%xmm7
+
+ subq $64,%rdx
+ jb .Ltail_avx512vl
+
+ vpxor 0(%rsi),%xmm4,%xmm4
+ vpxor 16(%rsi),%xmm5,%xmm5
+ vpxor 32(%rsi),%xmm6,%xmm6
+ vpxor 48(%rsi),%xmm7,%xmm7
+ leaq 64(%rsi),%rsi
+
+ vmovdqu %xmm4,0(%rdi)
+ vmovdqu %xmm5,16(%rdi)
+ vmovdqu %xmm6,32(%rdi)
+ vmovdqu %xmm7,48(%rdi)
+ leaq 64(%rdi),%rdi
+
+ vmovdqa32 %ymm16,%ymm0
+ vmovdqa32 %ymm17,%ymm1
+ jnz .Loop_outer_avx512vl
+
+ jmp .Ldone_avx512vl
+
+.align 16
+.Ltail64_avx512vl:
+ vmovdqa %xmm0,0(%rsp)
+ vmovdqa %xmm1,16(%rsp)
+ vmovdqa %xmm2,32(%rsp)
+ vmovdqa %xmm3,48(%rsp)
+ addq $64,%rdx
+ jmp .Loop_tail_avx512vl
+
+.align 16
+.Ltail_avx512vl:
+ vmovdqa %xmm4,0(%rsp)
+ vmovdqa %xmm5,16(%rsp)
+ vmovdqa %xmm6,32(%rsp)
+ vmovdqa %xmm7,48(%rsp)
+ addq $64,%rdx
+
+.Loop_tail_avx512vl:
+ movzbl (%rsi,%r8,1),%eax
+ movzbl (%rsp,%r8,1),%ecx
+ leaq 1(%r8),%r8
+ xorl %ecx,%eax
+ movb %al,-1(%rdi,%r8,1)
+ decq %rdx
+ jnz .Loop_tail_avx512vl
+
+ vmovdqa32 %ymm16,0(%rsp)
+ vmovdqa32 %ymm16,32(%rsp)
+
+.Ldone_avx512vl:
+ vzeroall
+ leaq -8(%r10),%rsp
+.Lavx512vl_epilogue:
+ ret
+
+.align 32
+.Lchacha20_8xvl:
+ leaq 8(%rsp),%r10
+ subq $64+8,%rsp
+ andq $-64,%rsp
+ vzeroupper
+
+ leaq .Lsigma(%rip),%r9
+ vbroadcasti128 (%r9),%ymm3
+ vbroadcasti128 (%rcx),%ymm7
+ vbroadcasti128 16(%rcx),%ymm11
+ vbroadcasti128 (%r8),%ymm15
+
+ vpshufd $0x00,%ymm3,%ymm0
+ vpshufd $0x55,%ymm3,%ymm1
+ vpshufd $0xaa,%ymm3,%ymm2
+ vpshufd $0xff,%ymm3,%ymm3
+ vmovdqa64 %ymm0,%ymm16
+ vmovdqa64 %ymm1,%ymm17
+ vmovdqa64 %ymm2,%ymm18
+ vmovdqa64 %ymm3,%ymm19
+
+ vpshufd $0x00,%ymm7,%ymm4
+ vpshufd $0x55,%ymm7,%ymm5
+ vpshufd $0xaa,%ymm7,%ymm6
+ vpshufd $0xff,%ymm7,%ymm7
+ vmovdqa64 %ymm4,%ymm20
+ vmovdqa64 %ymm5,%ymm21
+ vmovdqa64 %ymm6,%ymm22
+ vmovdqa64 %ymm7,%ymm23
+
+ vpshufd $0x00,%ymm11,%ymm8
+ vpshufd $0x55,%ymm11,%ymm9
+ vpshufd $0xaa,%ymm11,%ymm10
+ vpshufd $0xff,%ymm11,%ymm11
+ vmovdqa64 %ymm8,%ymm24
+ vmovdqa64 %ymm9,%ymm25
+ vmovdqa64 %ymm10,%ymm26
+ vmovdqa64 %ymm11,%ymm27
+
+ vpshufd $0x00,%ymm15,%ymm12
+ vpshufd $0x55,%ymm15,%ymm13
+ vpshufd $0xaa,%ymm15,%ymm14
+ vpshufd $0xff,%ymm15,%ymm15
+ vpaddd .Lincy(%rip),%ymm12,%ymm12
+ vmovdqa64 %ymm12,%ymm28
+ vmovdqa64 %ymm13,%ymm29
+ vmovdqa64 %ymm14,%ymm30
+ vmovdqa64 %ymm15,%ymm31
+
+ movl $10,%eax
+ jmp .Loop8xvl
+
+.align 32
+.Loop_outer8xvl:
+
+
+ vpbroadcastd 8(%r9),%ymm2
+ vpbroadcastd 12(%r9),%ymm3
+ vpaddd .Leight(%rip),%ymm28,%ymm28
+ vmovdqa64 %ymm20,%ymm4
+ vmovdqa64 %ymm21,%ymm5
+ vmovdqa64 %ymm22,%ymm6
+ vmovdqa64 %ymm23,%ymm7
+ vmovdqa64 %ymm24,%ymm8
+ vmovdqa64 %ymm25,%ymm9
+ vmovdqa64 %ymm26,%ymm10
+ vmovdqa64 %ymm27,%ymm11
+ vmovdqa64 %ymm28,%ymm12
+ vmovdqa64 %ymm29,%ymm13
+ vmovdqa64 %ymm30,%ymm14
+ vmovdqa64 %ymm31,%ymm15
+
+ vmovdqa64 %ymm0,%ymm16
+ vmovdqa64 %ymm1,%ymm17
+ vmovdqa64 %ymm2,%ymm18
+ vmovdqa64 %ymm3,%ymm19
+
+ movl $10,%eax
+ jmp .Loop8xvl
+
+.align 32
+.Loop8xvl:
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpxor %ymm0,%ymm12,%ymm12
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm3,%ymm15,%ymm15
+ vprold $16,%ymm12,%ymm12
+ vprold $16,%ymm13,%ymm13
+ vprold $16,%ymm14,%ymm14
+ vprold $16,%ymm15,%ymm15
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpxor %ymm8,%ymm4,%ymm4
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm11,%ymm7,%ymm7
+ vprold $12,%ymm4,%ymm4
+ vprold $12,%ymm5,%ymm5
+ vprold $12,%ymm6,%ymm6
+ vprold $12,%ymm7,%ymm7
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpxor %ymm0,%ymm12,%ymm12
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm3,%ymm15,%ymm15
+ vprold $8,%ymm12,%ymm12
+ vprold $8,%ymm13,%ymm13
+ vprold $8,%ymm14,%ymm14
+ vprold $8,%ymm15,%ymm15
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpxor %ymm8,%ymm4,%ymm4
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm11,%ymm7,%ymm7
+ vprold $7,%ymm4,%ymm4
+ vprold $7,%ymm5,%ymm5
+ vprold $7,%ymm6,%ymm6
+ vprold $7,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm0,%ymm0
+ vpaddd %ymm6,%ymm1,%ymm1
+ vpaddd %ymm7,%ymm2,%ymm2
+ vpaddd %ymm4,%ymm3,%ymm3
+ vpxor %ymm0,%ymm15,%ymm15
+ vpxor %ymm1,%ymm12,%ymm12
+ vpxor %ymm2,%ymm13,%ymm13
+ vpxor %ymm3,%ymm14,%ymm14
+ vprold $16,%ymm15,%ymm15
+ vprold $16,%ymm12,%ymm12
+ vprold $16,%ymm13,%ymm13
+ vprold $16,%ymm14,%ymm14
+ vpaddd %ymm15,%ymm10,%ymm10
+ vpaddd %ymm12,%ymm11,%ymm11
+ vpaddd %ymm13,%ymm8,%ymm8
+ vpaddd %ymm14,%ymm9,%ymm9
+ vpxor %ymm10,%ymm5,%ymm5
+ vpxor %ymm11,%ymm6,%ymm6
+ vpxor %ymm8,%ymm7,%ymm7
+ vpxor %ymm9,%ymm4,%ymm4
+ vprold $12,%ymm5,%ymm5
+ vprold $12,%ymm6,%ymm6
+ vprold $12,%ymm7,%ymm7
+ vprold $12,%ymm4,%ymm4
+ vpaddd %ymm5,%ymm0,%ymm0
+ vpaddd %ymm6,%ymm1,%ymm1
+ vpaddd %ymm7,%ymm2,%ymm2
+ vpaddd %ymm4,%ymm3,%ymm3
+ vpxor %ymm0,%ymm15,%ymm15
+ vpxor %ymm1,%ymm12,%ymm12
+ vpxor %ymm2,%ymm13,%ymm13
+ vpxor %ymm3,%ymm14,%ymm14
+ vprold $8,%ymm15,%ymm15
+ vprold $8,%ymm12,%ymm12
+ vprold $8,%ymm13,%ymm13
+ vprold $8,%ymm14,%ymm14
+ vpaddd %ymm15,%ymm10,%ymm10
+ vpaddd %ymm12,%ymm11,%ymm11
+ vpaddd %ymm13,%ymm8,%ymm8
+ vpaddd %ymm14,%ymm9,%ymm9
+ vpxor %ymm10,%ymm5,%ymm5
+ vpxor %ymm11,%ymm6,%ymm6
+ vpxor %ymm8,%ymm7,%ymm7
+ vpxor %ymm9,%ymm4,%ymm4
+ vprold $7,%ymm5,%ymm5
+ vprold $7,%ymm6,%ymm6
+ vprold $7,%ymm7,%ymm7
+ vprold $7,%ymm4,%ymm4
+ decl %eax
+ jnz .Loop8xvl
+
+ vpaddd %ymm16,%ymm0,%ymm0
+ vpaddd %ymm17,%ymm1,%ymm1
+ vpaddd %ymm18,%ymm2,%ymm2
+ vpaddd %ymm19,%ymm3,%ymm3
+
+ vpunpckldq %ymm1,%ymm0,%ymm18
+ vpunpckldq %ymm3,%ymm2,%ymm19
+ vpunpckhdq %ymm1,%ymm0,%ymm0
+ vpunpckhdq %ymm3,%ymm2,%ymm2
+ vpunpcklqdq %ymm19,%ymm18,%ymm1
+ vpunpckhqdq %ymm19,%ymm18,%ymm18
+ vpunpcklqdq %ymm2,%ymm0,%ymm3
+ vpunpckhqdq %ymm2,%ymm0,%ymm0
+ vpaddd %ymm20,%ymm4,%ymm4
+ vpaddd %ymm21,%ymm5,%ymm5
+ vpaddd %ymm22,%ymm6,%ymm6
+ vpaddd %ymm23,%ymm7,%ymm7
+
+ vpunpckldq %ymm5,%ymm4,%ymm2
+ vpunpckldq %ymm7,%ymm6,%ymm19
+ vpunpckhdq %ymm5,%ymm4,%ymm4
+ vpunpckhdq %ymm7,%ymm6,%ymm6
+ vpunpcklqdq %ymm19,%ymm2,%ymm5
+ vpunpckhqdq %ymm19,%ymm2,%ymm2
+ vpunpcklqdq %ymm6,%ymm4,%ymm7
+ vpunpckhqdq %ymm6,%ymm4,%ymm4
+ vshufi32x4 $0,%ymm5,%ymm1,%ymm19
+ vshufi32x4 $3,%ymm5,%ymm1,%ymm5
+ vshufi32x4 $0,%ymm2,%ymm18,%ymm1
+ vshufi32x4 $3,%ymm2,%ymm18,%ymm2
+ vshufi32x4 $0,%ymm7,%ymm3,%ymm18
+ vshufi32x4 $3,%ymm7,%ymm3,%ymm7
+ vshufi32x4 $0,%ymm4,%ymm0,%ymm3
+ vshufi32x4 $3,%ymm4,%ymm0,%ymm4
+ vpaddd %ymm24,%ymm8,%ymm8
+ vpaddd %ymm25,%ymm9,%ymm9
+ vpaddd %ymm26,%ymm10,%ymm10
+ vpaddd %ymm27,%ymm11,%ymm11
+
+ vpunpckldq %ymm9,%ymm8,%ymm6
+ vpunpckldq %ymm11,%ymm10,%ymm0
+ vpunpckhdq %ymm9,%ymm8,%ymm8
+ vpunpckhdq %ymm11,%ymm10,%ymm10
+ vpunpcklqdq %ymm0,%ymm6,%ymm9
+ vpunpckhqdq %ymm0,%ymm6,%ymm6
+ vpunpcklqdq %ymm10,%ymm8,%ymm11
+ vpunpckhqdq %ymm10,%ymm8,%ymm8
+ vpaddd %ymm28,%ymm12,%ymm12
+ vpaddd %ymm29,%ymm13,%ymm13
+ vpaddd %ymm30,%ymm14,%ymm14
+ vpaddd %ymm31,%ymm15,%ymm15
+
+ vpunpckldq %ymm13,%ymm12,%ymm10
+ vpunpckldq %ymm15,%ymm14,%ymm0
+ vpunpckhdq %ymm13,%ymm12,%ymm12
+ vpunpckhdq %ymm15,%ymm14,%ymm14
+ vpunpcklqdq %ymm0,%ymm10,%ymm13
+ vpunpckhqdq %ymm0,%ymm10,%ymm10
+ vpunpcklqdq %ymm14,%ymm12,%ymm15
+ vpunpckhqdq %ymm14,%ymm12,%ymm12
+ vperm2i128 $0x20,%ymm13,%ymm9,%ymm0
+ vperm2i128 $0x31,%ymm13,%ymm9,%ymm13
+ vperm2i128 $0x20,%ymm10,%ymm6,%ymm9
+ vperm2i128 $0x31,%ymm10,%ymm6,%ymm10
+ vperm2i128 $0x20,%ymm15,%ymm11,%ymm6
+ vperm2i128 $0x31,%ymm15,%ymm11,%ymm15
+ vperm2i128 $0x20,%ymm12,%ymm8,%ymm11
+ vperm2i128 $0x31,%ymm12,%ymm8,%ymm12
+ cmpq $512,%rdx
+ jb .Ltail8xvl
+
+ movl $0x80,%eax
+ vpxord 0(%rsi),%ymm19,%ymm19
+ vpxor 32(%rsi),%ymm0,%ymm0
+ vpxor 64(%rsi),%ymm5,%ymm5
+ vpxor 96(%rsi),%ymm13,%ymm13
+ leaq (%rsi,%rax,1),%rsi
+ vmovdqu32 %ymm19,0(%rdi)
+ vmovdqu %ymm0,32(%rdi)
+ vmovdqu %ymm5,64(%rdi)
+ vmovdqu %ymm13,96(%rdi)
+ leaq (%rdi,%rax,1),%rdi
+
+ vpxor 0(%rsi),%ymm1,%ymm1
+ vpxor 32(%rsi),%ymm9,%ymm9
+ vpxor 64(%rsi),%ymm2,%ymm2
+ vpxor 96(%rsi),%ymm10,%ymm10
+ leaq (%rsi,%rax,1),%rsi
+ vmovdqu %ymm1,0(%rdi)
+ vmovdqu %ymm9,32(%rdi)
+ vmovdqu %ymm2,64(%rdi)
+ vmovdqu %ymm10,96(%rdi)
+ leaq (%rdi,%rax,1),%rdi
+
+ vpxord 0(%rsi),%ymm18,%ymm18
+ vpxor 32(%rsi),%ymm6,%ymm6
+ vpxor 64(%rsi),%ymm7,%ymm7
+ vpxor 96(%rsi),%ymm15,%ymm15
+ leaq (%rsi,%rax,1),%rsi
+ vmovdqu32 %ymm18,0(%rdi)
+ vmovdqu %ymm6,32(%rdi)
+ vmovdqu %ymm7,64(%rdi)
+ vmovdqu %ymm15,96(%rdi)
+ leaq (%rdi,%rax,1),%rdi
+
+ vpxor 0(%rsi),%ymm3,%ymm3
+ vpxor 32(%rsi),%ymm11,%ymm11
+ vpxor 64(%rsi),%ymm4,%ymm4
+ vpxor 96(%rsi),%ymm12,%ymm12
+ leaq (%rsi,%rax,1),%rsi
+ vmovdqu %ymm3,0(%rdi)
+ vmovdqu %ymm11,32(%rdi)
+ vmovdqu %ymm4,64(%rdi)
+ vmovdqu %ymm12,96(%rdi)
+ leaq (%rdi,%rax,1),%rdi
+
+ vpbroadcastd 0(%r9),%ymm0
+ vpbroadcastd 4(%r9),%ymm1
+
+ subq $512,%rdx
+ jnz .Loop_outer8xvl
+
+ jmp .Ldone8xvl
+
+.align 32
+.Ltail8xvl:
+ vmovdqa64 %ymm19,%ymm8
+ xorq %r9,%r9
+ subq %rsi,%rdi
+ cmpq $64,%rdx
+ jb .Less_than_64_8xvl
+ vpxor 0(%rsi),%ymm8,%ymm8
+ vpxor 32(%rsi),%ymm0,%ymm0
+ vmovdqu %ymm8,0(%rdi,%rsi,1)
+ vmovdqu %ymm0,32(%rdi,%rsi,1)
+ je .Ldone8xvl
+ vmovdqa %ymm5,%ymm8
+ vmovdqa %ymm13,%ymm0
+ leaq 64(%rsi),%rsi
+
+ cmpq $128,%rdx
+ jb .Less_than_64_8xvl
+ vpxor 0(%rsi),%ymm5,%ymm5
+ vpxor 32(%rsi),%ymm13,%ymm13
+ vmovdqu %ymm5,0(%rdi,%rsi,1)
+ vmovdqu %ymm13,32(%rdi,%rsi,1)
+ je .Ldone8xvl
+ vmovdqa %ymm1,%ymm8
+ vmovdqa %ymm9,%ymm0
+ leaq 64(%rsi),%rsi
+
+ cmpq $192,%rdx
+ jb .Less_than_64_8xvl
+ vpxor 0(%rsi),%ymm1,%ymm1
+ vpxor 32(%rsi),%ymm9,%ymm9
+ vmovdqu %ymm1,0(%rdi,%rsi,1)
+ vmovdqu %ymm9,32(%rdi,%rsi,1)
+ je .Ldone8xvl
+ vmovdqa %ymm2,%ymm8
+ vmovdqa %ymm10,%ymm0
+ leaq 64(%rsi),%rsi
+
+ cmpq $256,%rdx
+ jb .Less_than_64_8xvl
+ vpxor 0(%rsi),%ymm2,%ymm2
+ vpxor 32(%rsi),%ymm10,%ymm10
+ vmovdqu %ymm2,0(%rdi,%rsi,1)
+ vmovdqu %ymm10,32(%rdi,%rsi,1)
+ je .Ldone8xvl
+ vmovdqa32 %ymm18,%ymm8
+ vmovdqa %ymm6,%ymm0
+ leaq 64(%rsi),%rsi
+
+ cmpq $320,%rdx
+ jb .Less_than_64_8xvl
+ vpxord 0(%rsi),%ymm18,%ymm18
+ vpxor 32(%rsi),%ymm6,%ymm6
+ vmovdqu32 %ymm18,0(%rdi,%rsi,1)
+ vmovdqu %ymm6,32(%rdi,%rsi,1)
+ je .Ldone8xvl
+ vmovdqa %ymm7,%ymm8
+ vmovdqa %ymm15,%ymm0
+ leaq 64(%rsi),%rsi
+
+ cmpq $384,%rdx
+ jb .Less_than_64_8xvl
+ vpxor 0(%rsi),%ymm7,%ymm7
+ vpxor 32(%rsi),%ymm15,%ymm15
+ vmovdqu %ymm7,0(%rdi,%rsi,1)
+ vmovdqu %ymm15,32(%rdi,%rsi,1)
+ je .Ldone8xvl
+ vmovdqa %ymm3,%ymm8
+ vmovdqa %ymm11,%ymm0
+ leaq 64(%rsi),%rsi
+
+ cmpq $448,%rdx
+ jb .Less_than_64_8xvl
+ vpxor 0(%rsi),%ymm3,%ymm3
+ vpxor 32(%rsi),%ymm11,%ymm11
+ vmovdqu %ymm3,0(%rdi,%rsi,1)
+ vmovdqu %ymm11,32(%rdi,%rsi,1)
+ je .Ldone8xvl
+ vmovdqa %ymm4,%ymm8
+ vmovdqa %ymm12,%ymm0
+ leaq 64(%rsi),%rsi
+
+.Less_than_64_8xvl:
+ vmovdqa %ymm8,0(%rsp)
+ vmovdqa %ymm0,32(%rsp)
+ leaq (%rdi,%rsi,1),%rdi
+ andq $63,%rdx
+
+.Loop_tail8xvl:
+ movzbl (%rsi,%r9,1),%eax
+ movzbl (%rsp,%r9,1),%ecx
+ leaq 1(%r9),%r9
+ xorl %ecx,%eax
+ movb %al,-1(%rdi,%r9,1)
+ decq %rdx
+ jnz .Loop_tail8xvl
+
+ vpxor %ymm8,%ymm8,%ymm8
+ vmovdqa %ymm8,0(%rsp)
+ vmovdqa %ymm8,32(%rsp)
+
+.Ldone8xvl:
+ vzeroall
+ leaq -8(%r10),%rsp
+.L8xvl_epilogue:
+ ret
+ENDPROC(chacha20_avx512vl)
+
+#endif /* CONFIG_AS_AVX512 */
--
2.19.0
^ permalink raw reply related
* [PATCH net-next v4 04/20] zinc: ChaCha20 ARM and ARM64 implementations
From: Jason A. Donenfeld @ 2018-09-14 16:22 UTC (permalink / raw)
To: linux-kernel, netdev, linux-crypto, davem, gregkh
Cc: Jason A. Donenfeld, Samuel Neves, Andy Lutomirski,
Jean-Philippe Aumasson, Andy Polyakov, Russell King,
linux-arm-kernel
In-Reply-To: <20180914162240.7925-1-Jason@zx2c4.com>
These NEON and non-NEON implementations come from Andy Polyakov's
implementation. They are exactly the same as Andy Polyakov's original,
with the following exceptions:
- Entries and exits use the proper kernel convention macro.
- CPU feature checking is done in C by the glue code, so that has been
removed from the assembly.
- The function names have been renamed to fit kernel conventions.
- Labels have been renamed (prefixed with .L) to fit kernel conventions.
- Constants have been rearranged so that they are closer to the code
that is using them. [ARM only]
- The neon code can jump to the scalar code when it makes sense to do
so.
- The neon_512 function as a separate function has been removed, leaving
the decision up to the main neon entry point. [ARM64 only]
After '/^#/d;/^\..*[^:]$/d', the code has the following diff in actual
instructions from the original.
ARM:
-ChaCha20_ctr32:
-.LChaCha20_ctr32:
+ENTRY(chacha20_arm)
ldr r12,[sp,#0] @ pull pointer to counter and nonce
stmdb sp!,{r0-r2,r4-r11,lr}
- sub r14,pc,#16 @ ChaCha20_ctr32
- adr r14,.LChaCha20_ctr32
cmp r2,#0 @ len==0?
itt eq
addeq sp,sp,#4*3
- beq .Lno_data
- cmp r2,#192 @ test len
- bls .Lshort
- ldr r4,[r14,#-32]
- ldr r4,[r14,r4]
- ldr r4,[r4]
- tst r4,#ARMV7_NEON
- bne .LChaCha20_neon
+ beq .Lno_data_arm
.Lshort:
ldmia r12,{r4-r7} @ load counter and nonce
sub sp,sp,#4*(16) @ off-load area
- sub r14,r14,#64 @ .Lsigma
+ sub r14,pc,#100 @ .Lsigma
+ adr r14,.Lsigma @ .Lsigma
stmdb sp!,{r4-r7} @ copy counter and nonce
ldmia r3,{r4-r11} @ load key
ldmia r14,{r0-r3} @ load sigma
@@ -617,14 +615,25 @@
.Ldone:
add sp,sp,#4*(32+3)
-.Lno_data:
+.Lno_data_arm:
ldmia sp!,{r4-r11,pc}
+ENDPROC(chacha20_arm)
-ChaCha20_neon:
+ENTRY(chacha20_neon)
ldr r12,[sp,#0] @ pull pointer to counter and nonce
stmdb sp!,{r0-r2,r4-r11,lr}
-.LChaCha20_neon:
- adr r14,.Lsigma
+ cmp r2,#0 @ len==0?
+ itt eq
+ addeq sp,sp,#4*3
+ beq .Lno_data_neon
+ cmp r2,#192 @ test len
+ bls .Lshort
+.Lchacha20_neon_begin:
+ adr r14,.Lsigma2
vstmdb sp!,{d8-d15} @ ABI spec says so
stmdb sp!,{r0-r3}
@@ -1265,4 +1274,6 @@
add sp,sp,#4*(32+4)
vldmia sp,{d8-d15}
add sp,sp,#4*(16+3)
+.Lno_data_neon:
ldmia sp!,{r4-r11,pc}
+ENDPROC(chacha20_neon)
ARM64:
-ChaCha20_ctr32:
+ENTRY(chacha20_arm)
cbz x2,.Labort
- adr x5,.LOPENSSL_armcap_P
- cmp x2,#192
- b.lo .Lshort
- ldrsw x6,[x5]
- ldr x6,[x5]
- ldr w17,[x6,x5]
- tst w17,#ARMV7_NEON
- b.ne ChaCha20_neon
-
.Lshort:
stp x29,x30,[sp,#-96]!
add x29,sp,#0
@@ -279,8 +274,13 @@
ldp x27,x28,[x29,#80]
ldp x29,x30,[sp],#96
ret
+ENDPROC(chacha20_arm)
+
+ENTRY(chacha20_neon)
+ cbz x2,.Labort_neon
+ cmp x2,#192
+ b.lo .Lshort
-ChaCha20_neon:
stp x29,x30,[sp,#-96]!
add x29,sp,#0
@@ -763,16 +763,6 @@
ldp x27,x28,[x29,#80]
ldp x29,x30,[sp],#96
ret
-ChaCha20_512_neon:
- stp x29,x30,[sp,#-96]!
- add x29,sp,#0
-
- adr x5,.Lsigma
- stp x19,x20,[sp,#16]
- stp x21,x22,[sp,#32]
- stp x23,x24,[sp,#48]
- stp x25,x26,[sp,#64]
- stp x27,x28,[sp,#80]
.L512_or_more_neon:
sub sp,sp,#128+64
@@ -1920,4 +1910,6 @@
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
ldp x29,x30,[sp],#96
+.Labort_neon:
ret
+ENDPROC(chacha20_neon)
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Cc: Samuel Neves <sneves@dei.uc.pt>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Jean-Philippe Aumasson <jeanphilippe.aumasson@gmail.com>
Cc: Andy Polyakov <appro@openssl.org>
Cc: Russell King <linux@armlinux.org.uk>
Cc: linux-arm-kernel@lists.infradead.org
---
lib/zinc/Makefile | 8 +
lib/zinc/chacha20/chacha20-arm-glue.h | 50 +
lib/zinc/chacha20/chacha20-arm.S | 1473 +++++++++++++++++++
lib/zinc/chacha20/chacha20-arm64.S | 1942 +++++++++++++++++++++++++
4 files changed, 3473 insertions(+)
create mode 100644 lib/zinc/chacha20/chacha20-arm-glue.h
create mode 100644 lib/zinc/chacha20/chacha20-arm.S
create mode 100644 lib/zinc/chacha20/chacha20-arm64.S
diff --git a/lib/zinc/Makefile b/lib/zinc/Makefile
index 0b5a964bfba6..8d14cb13349a 100644
--- a/lib/zinc/Makefile
+++ b/lib/zinc/Makefile
@@ -5,6 +5,14 @@ ccflags-$(CONFIG_ZINC_DEBUG) += -DDEBUG
ifeq ($(CONFIG_ZINC_CHACHA20),y)
zinc-y += chacha20/chacha20.o
+ifeq ($(CONFIG_ZINC_ARCH_ARM),y)
+zinc-y += chacha20/chacha20-arm.o
+CFLAGS_chacha20.o += -include $(srctree)/$(src)/chacha20/chacha20-arm-glue.h
+endif
+ifeq ($(CONFIG_ZINC_ARCH_ARM64),y)
+zinc-y += chacha20/chacha20-arm64.o
+CFLAGS_chacha20.o += -include $(srctree)/$(src)/chacha20/chacha20-arm-glue.h
+endif
endif
zinc-y += main.o
diff --git a/lib/zinc/chacha20/chacha20-arm-glue.h b/lib/zinc/chacha20/chacha20-arm-glue.h
new file mode 100644
index 000000000000..d32361514f3a
--- /dev/null
+++ b/lib/zinc/chacha20/chacha20-arm-glue.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <zinc/chacha20.h>
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+
+asmlinkage void chacha20_arm(u8 *out, const u8 *in, const size_t len,
+ const u32 key[8], const u32 counter[4]);
+#if IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && \
+ (defined(CONFIG_64BIT) || __LINUX_ARM_ARCH__ >= 7)
+#define ARM_USE_NEON
+asmlinkage void chacha20_neon(u8 *out, const u8 *in, const size_t len,
+ const u32 key[8], const u32 counter[4]);
+#endif
+
+static bool chacha20_use_neon __ro_after_init;
+
+void __init chacha20_fpu_init(void)
+{
+#if defined(CONFIG_ARM64)
+ chacha20_use_neon = elf_hwcap & HWCAP_ASIMD;
+#elif defined(CONFIG_ARM)
+ chacha20_use_neon = elf_hwcap & HWCAP_NEON;
+#endif
+}
+
+static inline bool chacha20_arch(u8 *dst, const u8 *src, const size_t len,
+ const u32 key[8], const u32 counter[4],
+ simd_context_t simd_context)
+{
+#if defined(ARM_USE_NEON)
+ if (simd_context == HAVE_FULL_SIMD && chacha20_use_neon) {
+ chacha20_neon(dst, src, len, key, counter);
+ return true;
+ }
+#endif
+ chacha20_arm(dst, src, len, key, counter);
+ return true;
+}
+
+static inline bool hchacha20_arch(u8 *derived_key, const u8 *nonce,
+ const u8 *key, simd_context_t simd_context)
+{
+ return false;
+}
+
+#define HAVE_CHACHA20_ARCH_IMPLEMENTATION
diff --git a/lib/zinc/chacha20/chacha20-arm.S b/lib/zinc/chacha20/chacha20-arm.S
new file mode 100644
index 000000000000..0ea1db1492eb
--- /dev/null
+++ b/lib/zinc/chacha20/chacha20-arm.S
@@ -0,0 +1,1473 @@
+/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ * Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
+ *
+ * This is based in part on Andy Polyakov's implementation from CRYPTOGAMS.
+ */
+
+#include <linux/linkage.h>
+
+.text
+#if defined(__thumb2__) || defined(__clang__)
+.syntax unified
+#endif
+#if defined(__thumb2__)
+.thumb
+#else
+.code 32
+#endif
+
+#if defined(__thumb2__) || defined(__clang__)
+#define ldrhsb ldrbhs
+#endif
+
+.align 5
+.Lsigma:
+.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 @ endian-neutral
+.Lone:
+.long 1,0,0,0
+.word -1
+
+.align 5
+ENTRY(chacha20_arm)
+ ldr r12,[sp,#0] @ pull pointer to counter and nonce
+ stmdb sp!,{r0-r2,r4-r11,lr}
+ cmp r2,#0 @ len==0?
+#ifdef __thumb2__
+ itt eq
+#endif
+ addeq sp,sp,#4*3
+ beq .Lno_data_arm
+.Lshort:
+ ldmia r12,{r4-r7} @ load counter and nonce
+ sub sp,sp,#4*(16) @ off-load area
+#if __LINUX_ARM_ARCH__ < 7 && !defined(__thumb2__)
+ sub r14,pc,#100 @ .Lsigma
+#else
+ adr r14,.Lsigma @ .Lsigma
+#endif
+ stmdb sp!,{r4-r7} @ copy counter and nonce
+ ldmia r3,{r4-r11} @ load key
+ ldmia r14,{r0-r3} @ load sigma
+ stmdb sp!,{r4-r11} @ copy key
+ stmdb sp!,{r0-r3} @ copy sigma
+ str r10,[sp,#4*(16+10)] @ off-load "rx"
+ str r11,[sp,#4*(16+11)] @ off-load "rx"
+ b .Loop_outer_enter
+
+.align 4
+.Loop_outer:
+ ldmia sp,{r0-r9} @ load key material
+ str r11,[sp,#4*(32+2)] @ save len
+ str r12, [sp,#4*(32+1)] @ save inp
+ str r14, [sp,#4*(32+0)] @ save out
+.Loop_outer_enter:
+ ldr r11, [sp,#4*(15)]
+ ldr r12,[sp,#4*(12)] @ modulo-scheduled load
+ ldr r10, [sp,#4*(13)]
+ ldr r14,[sp,#4*(14)]
+ str r11, [sp,#4*(16+15)]
+ mov r11,#10
+ b .Loop
+
+.align 4
+.Loop:
+ subs r11,r11,#1
+ add r0,r0,r4
+ mov r12,r12,ror#16
+ add r1,r1,r5
+ mov r10,r10,ror#16
+ eor r12,r12,r0,ror#16
+ eor r10,r10,r1,ror#16
+ add r8,r8,r12
+ mov r4,r4,ror#20
+ add r9,r9,r10
+ mov r5,r5,ror#20
+ eor r4,r4,r8,ror#20
+ eor r5,r5,r9,ror#20
+ add r0,r0,r4
+ mov r12,r12,ror#24
+ add r1,r1,r5
+ mov r10,r10,ror#24
+ eor r12,r12,r0,ror#24
+ eor r10,r10,r1,ror#24
+ add r8,r8,r12
+ mov r4,r4,ror#25
+ add r9,r9,r10
+ mov r5,r5,ror#25
+ str r10,[sp,#4*(16+13)]
+ ldr r10,[sp,#4*(16+15)]
+ eor r4,r4,r8,ror#25
+ eor r5,r5,r9,ror#25
+ str r8,[sp,#4*(16+8)]
+ ldr r8,[sp,#4*(16+10)]
+ add r2,r2,r6
+ mov r14,r14,ror#16
+ str r9,[sp,#4*(16+9)]
+ ldr r9,[sp,#4*(16+11)]
+ add r3,r3,r7
+ mov r10,r10,ror#16
+ eor r14,r14,r2,ror#16
+ eor r10,r10,r3,ror#16
+ add r8,r8,r14
+ mov r6,r6,ror#20
+ add r9,r9,r10
+ mov r7,r7,ror#20
+ eor r6,r6,r8,ror#20
+ eor r7,r7,r9,ror#20
+ add r2,r2,r6
+ mov r14,r14,ror#24
+ add r3,r3,r7
+ mov r10,r10,ror#24
+ eor r14,r14,r2,ror#24
+ eor r10,r10,r3,ror#24
+ add r8,r8,r14
+ mov r6,r6,ror#25
+ add r9,r9,r10
+ mov r7,r7,ror#25
+ eor r6,r6,r8,ror#25
+ eor r7,r7,r9,ror#25
+ add r0,r0,r5
+ mov r10,r10,ror#16
+ add r1,r1,r6
+ mov r12,r12,ror#16
+ eor r10,r10,r0,ror#16
+ eor r12,r12,r1,ror#16
+ add r8,r8,r10
+ mov r5,r5,ror#20
+ add r9,r9,r12
+ mov r6,r6,ror#20
+ eor r5,r5,r8,ror#20
+ eor r6,r6,r9,ror#20
+ add r0,r0,r5
+ mov r10,r10,ror#24
+ add r1,r1,r6
+ mov r12,r12,ror#24
+ eor r10,r10,r0,ror#24
+ eor r12,r12,r1,ror#24
+ add r8,r8,r10
+ mov r5,r5,ror#25
+ str r10,[sp,#4*(16+15)]
+ ldr r10,[sp,#4*(16+13)]
+ add r9,r9,r12
+ mov r6,r6,ror#25
+ eor r5,r5,r8,ror#25
+ eor r6,r6,r9,ror#25
+ str r8,[sp,#4*(16+10)]
+ ldr r8,[sp,#4*(16+8)]
+ add r2,r2,r7
+ mov r10,r10,ror#16
+ str r9,[sp,#4*(16+11)]
+ ldr r9,[sp,#4*(16+9)]
+ add r3,r3,r4
+ mov r14,r14,ror#16
+ eor r10,r10,r2,ror#16
+ eor r14,r14,r3,ror#16
+ add r8,r8,r10
+ mov r7,r7,ror#20
+ add r9,r9,r14
+ mov r4,r4,ror#20
+ eor r7,r7,r8,ror#20
+ eor r4,r4,r9,ror#20
+ add r2,r2,r7
+ mov r10,r10,ror#24
+ add r3,r3,r4
+ mov r14,r14,ror#24
+ eor r10,r10,r2,ror#24
+ eor r14,r14,r3,ror#24
+ add r8,r8,r10
+ mov r7,r7,ror#25
+ add r9,r9,r14
+ mov r4,r4,ror#25
+ eor r7,r7,r8,ror#25
+ eor r4,r4,r9,ror#25
+ bne .Loop
+
+ ldr r11,[sp,#4*(32+2)] @ load len
+
+ str r8, [sp,#4*(16+8)] @ modulo-scheduled store
+ str r9, [sp,#4*(16+9)]
+ str r12,[sp,#4*(16+12)]
+ str r10, [sp,#4*(16+13)]
+ str r14,[sp,#4*(16+14)]
+
+ @ at this point we have first half of 512-bit result in
+ @ rx and second half at sp+4*(16+8)
+
+ cmp r11,#64 @ done yet?
+#ifdef __thumb2__
+ itete lo
+#endif
+ addlo r12,sp,#4*(0) @ shortcut or ...
+ ldrhs r12,[sp,#4*(32+1)] @ ... load inp
+ addlo r14,sp,#4*(0) @ shortcut or ...
+ ldrhs r14,[sp,#4*(32+0)] @ ... load out
+
+ ldr r8,[sp,#4*(0)] @ load key material
+ ldr r9,[sp,#4*(1)]
+
+#if __LINUX_ARM_ARCH__ >= 6 || !defined(__ARMEB__)
+#if __LINUX_ARM_ARCH__ < 7
+ orr r10,r12,r14
+ tst r10,#3 @ are input and output aligned?
+ ldr r10,[sp,#4*(2)]
+ bne .Lunaligned
+ cmp r11,#64 @ restore flags
+#else
+ ldr r10,[sp,#4*(2)]
+#endif
+ ldr r11,[sp,#4*(3)]
+
+ add r0,r0,r8 @ accumulate key material
+ add r1,r1,r9
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhs r8,[r12],#16 @ load input
+ ldrhs r9,[r12,#-12]
+
+ add r2,r2,r10
+ add r3,r3,r11
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhs r10,[r12,#-8]
+ ldrhs r11,[r12,#-4]
+#if __LINUX_ARM_ARCH__ >= 6 && defined(__ARMEB__)
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+#endif
+#ifdef __thumb2__
+ itt hs
+#endif
+ eorhs r0,r0,r8 @ xor with input
+ eorhs r1,r1,r9
+ add r8,sp,#4*(4)
+ str r0,[r14],#16 @ store output
+#ifdef __thumb2__
+ itt hs
+#endif
+ eorhs r2,r2,r10
+ eorhs r3,r3,r11
+ ldmia r8,{r8-r11} @ load key material
+ str r1,[r14,#-12]
+ str r2,[r14,#-8]
+ str r3,[r14,#-4]
+
+ add r4,r4,r8 @ accumulate key material
+ add r5,r5,r9
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhs r8,[r12],#16 @ load input
+ ldrhs r9,[r12,#-12]
+ add r6,r6,r10
+ add r7,r7,r11
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhs r10,[r12,#-8]
+ ldrhs r11,[r12,#-4]
+#if __LINUX_ARM_ARCH__ >= 6 && defined(__ARMEB__)
+ rev r4,r4
+ rev r5,r5
+ rev r6,r6
+ rev r7,r7
+#endif
+#ifdef __thumb2__
+ itt hs
+#endif
+ eorhs r4,r4,r8
+ eorhs r5,r5,r9
+ add r8,sp,#4*(8)
+ str r4,[r14],#16 @ store output
+#ifdef __thumb2__
+ itt hs
+#endif
+ eorhs r6,r6,r10
+ eorhs r7,r7,r11
+ str r5,[r14,#-12]
+ ldmia r8,{r8-r11} @ load key material
+ str r6,[r14,#-8]
+ add r0,sp,#4*(16+8)
+ str r7,[r14,#-4]
+
+ ldmia r0,{r0-r7} @ load second half
+
+ add r0,r0,r8 @ accumulate key material
+ add r1,r1,r9
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhs r8,[r12],#16 @ load input
+ ldrhs r9,[r12,#-12]
+#ifdef __thumb2__
+ itt hi
+#endif
+ strhi r10,[sp,#4*(16+10)] @ copy "rx" while at it
+ strhi r11,[sp,#4*(16+11)] @ copy "rx" while at it
+ add r2,r2,r10
+ add r3,r3,r11
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhs r10,[r12,#-8]
+ ldrhs r11,[r12,#-4]
+#if __LINUX_ARM_ARCH__ >= 6 && defined(__ARMEB__)
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+#endif
+#ifdef __thumb2__
+ itt hs
+#endif
+ eorhs r0,r0,r8
+ eorhs r1,r1,r9
+ add r8,sp,#4*(12)
+ str r0,[r14],#16 @ store output
+#ifdef __thumb2__
+ itt hs
+#endif
+ eorhs r2,r2,r10
+ eorhs r3,r3,r11
+ str r1,[r14,#-12]
+ ldmia r8,{r8-r11} @ load key material
+ str r2,[r14,#-8]
+ str r3,[r14,#-4]
+
+ add r4,r4,r8 @ accumulate key material
+ add r5,r5,r9
+#ifdef __thumb2__
+ itt hi
+#endif
+ addhi r8,r8,#1 @ next counter value
+ strhi r8,[sp,#4*(12)] @ save next counter value
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhs r8,[r12],#16 @ load input
+ ldrhs r9,[r12,#-12]
+ add r6,r6,r10
+ add r7,r7,r11
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhs r10,[r12,#-8]
+ ldrhs r11,[r12,#-4]
+#if __LINUX_ARM_ARCH__ >= 6 && defined(__ARMEB__)
+ rev r4,r4
+ rev r5,r5
+ rev r6,r6
+ rev r7,r7
+#endif
+#ifdef __thumb2__
+ itt hs
+#endif
+ eorhs r4,r4,r8
+ eorhs r5,r5,r9
+#ifdef __thumb2__
+ it ne
+#endif
+ ldrne r8,[sp,#4*(32+2)] @ re-load len
+#ifdef __thumb2__
+ itt hs
+#endif
+ eorhs r6,r6,r10
+ eorhs r7,r7,r11
+ str r4,[r14],#16 @ store output
+ str r5,[r14,#-12]
+#ifdef __thumb2__
+ it hs
+#endif
+ subhs r11,r8,#64 @ len-=64
+ str r6,[r14,#-8]
+ str r7,[r14,#-4]
+ bhi .Loop_outer
+
+ beq .Ldone
+#if __LINUX_ARM_ARCH__ < 7
+ b .Ltail
+
+.align 4
+.Lunaligned: @ unaligned endian-neutral path
+ cmp r11,#64 @ restore flags
+#endif
+#endif
+#if __LINUX_ARM_ARCH__ < 7
+ ldr r11,[sp,#4*(3)]
+ add r0,r0,r8 @ accumulate key material
+ add r1,r1,r9
+ add r2,r2,r10
+#ifdef __thumb2__
+ itete lo
+#endif
+ eorlo r8,r8,r8 @ zero or ...
+ ldrhsb r8,[r12],#16 @ ... load input
+ eorlo r9,r9,r9
+ ldrhsb r9,[r12,#-12]
+
+ add r3,r3,r11
+#ifdef __thumb2__
+ itete lo
+#endif
+ eorlo r10,r10,r10
+ ldrhsb r10,[r12,#-8]
+ eorlo r11,r11,r11
+ ldrhsb r11,[r12,#-4]
+
+ eor r0,r8,r0 @ xor with input (or zero)
+ eor r1,r9,r1
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r8,[r12,#-15] @ load more input
+ ldrhsb r9,[r12,#-11]
+ eor r2,r10,r2
+ strb r0,[r14],#16 @ store output
+ eor r3,r11,r3
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r10,[r12,#-7]
+ ldrhsb r11,[r12,#-3]
+ strb r1,[r14,#-12]
+ eor r0,r8,r0,lsr#8
+ strb r2,[r14,#-8]
+ eor r1,r9,r1,lsr#8
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r8,[r12,#-14] @ load more input
+ ldrhsb r9,[r12,#-10]
+ strb r3,[r14,#-4]
+ eor r2,r10,r2,lsr#8
+ strb r0,[r14,#-15]
+ eor r3,r11,r3,lsr#8
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r10,[r12,#-6]
+ ldrhsb r11,[r12,#-2]
+ strb r1,[r14,#-11]
+ eor r0,r8,r0,lsr#8
+ strb r2,[r14,#-7]
+ eor r1,r9,r1,lsr#8
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r8,[r12,#-13] @ load more input
+ ldrhsb r9,[r12,#-9]
+ strb r3,[r14,#-3]
+ eor r2,r10,r2,lsr#8
+ strb r0,[r14,#-14]
+ eor r3,r11,r3,lsr#8
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r10,[r12,#-5]
+ ldrhsb r11,[r12,#-1]
+ strb r1,[r14,#-10]
+ strb r2,[r14,#-6]
+ eor r0,r8,r0,lsr#8
+ strb r3,[r14,#-2]
+ eor r1,r9,r1,lsr#8
+ strb r0,[r14,#-13]
+ eor r2,r10,r2,lsr#8
+ strb r1,[r14,#-9]
+ eor r3,r11,r3,lsr#8
+ strb r2,[r14,#-5]
+ strb r3,[r14,#-1]
+ add r8,sp,#4*(4+0)
+ ldmia r8,{r8-r11} @ load key material
+ add r0,sp,#4*(16+8)
+ add r4,r4,r8 @ accumulate key material
+ add r5,r5,r9
+ add r6,r6,r10
+#ifdef __thumb2__
+ itete lo
+#endif
+ eorlo r8,r8,r8 @ zero or ...
+ ldrhsb r8,[r12],#16 @ ... load input
+ eorlo r9,r9,r9
+ ldrhsb r9,[r12,#-12]
+
+ add r7,r7,r11
+#ifdef __thumb2__
+ itete lo
+#endif
+ eorlo r10,r10,r10
+ ldrhsb r10,[r12,#-8]
+ eorlo r11,r11,r11
+ ldrhsb r11,[r12,#-4]
+
+ eor r4,r8,r4 @ xor with input (or zero)
+ eor r5,r9,r5
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r8,[r12,#-15] @ load more input
+ ldrhsb r9,[r12,#-11]
+ eor r6,r10,r6
+ strb r4,[r14],#16 @ store output
+ eor r7,r11,r7
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r10,[r12,#-7]
+ ldrhsb r11,[r12,#-3]
+ strb r5,[r14,#-12]
+ eor r4,r8,r4,lsr#8
+ strb r6,[r14,#-8]
+ eor r5,r9,r5,lsr#8
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r8,[r12,#-14] @ load more input
+ ldrhsb r9,[r12,#-10]
+ strb r7,[r14,#-4]
+ eor r6,r10,r6,lsr#8
+ strb r4,[r14,#-15]
+ eor r7,r11,r7,lsr#8
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r10,[r12,#-6]
+ ldrhsb r11,[r12,#-2]
+ strb r5,[r14,#-11]
+ eor r4,r8,r4,lsr#8
+ strb r6,[r14,#-7]
+ eor r5,r9,r5,lsr#8
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r8,[r12,#-13] @ load more input
+ ldrhsb r9,[r12,#-9]
+ strb r7,[r14,#-3]
+ eor r6,r10,r6,lsr#8
+ strb r4,[r14,#-14]
+ eor r7,r11,r7,lsr#8
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r10,[r12,#-5]
+ ldrhsb r11,[r12,#-1]
+ strb r5,[r14,#-10]
+ strb r6,[r14,#-6]
+ eor r4,r8,r4,lsr#8
+ strb r7,[r14,#-2]
+ eor r5,r9,r5,lsr#8
+ strb r4,[r14,#-13]
+ eor r6,r10,r6,lsr#8
+ strb r5,[r14,#-9]
+ eor r7,r11,r7,lsr#8
+ strb r6,[r14,#-5]
+ strb r7,[r14,#-1]
+ add r8,sp,#4*(4+4)
+ ldmia r8,{r8-r11} @ load key material
+ ldmia r0,{r0-r7} @ load second half
+#ifdef __thumb2__
+ itt hi
+#endif
+ strhi r10,[sp,#4*(16+10)] @ copy "rx"
+ strhi r11,[sp,#4*(16+11)] @ copy "rx"
+ add r0,r0,r8 @ accumulate key material
+ add r1,r1,r9
+ add r2,r2,r10
+#ifdef __thumb2__
+ itete lo
+#endif
+ eorlo r8,r8,r8 @ zero or ...
+ ldrhsb r8,[r12],#16 @ ... load input
+ eorlo r9,r9,r9
+ ldrhsb r9,[r12,#-12]
+
+ add r3,r3,r11
+#ifdef __thumb2__
+ itete lo
+#endif
+ eorlo r10,r10,r10
+ ldrhsb r10,[r12,#-8]
+ eorlo r11,r11,r11
+ ldrhsb r11,[r12,#-4]
+
+ eor r0,r8,r0 @ xor with input (or zero)
+ eor r1,r9,r1
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r8,[r12,#-15] @ load more input
+ ldrhsb r9,[r12,#-11]
+ eor r2,r10,r2
+ strb r0,[r14],#16 @ store output
+ eor r3,r11,r3
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r10,[r12,#-7]
+ ldrhsb r11,[r12,#-3]
+ strb r1,[r14,#-12]
+ eor r0,r8,r0,lsr#8
+ strb r2,[r14,#-8]
+ eor r1,r9,r1,lsr#8
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r8,[r12,#-14] @ load more input
+ ldrhsb r9,[r12,#-10]
+ strb r3,[r14,#-4]
+ eor r2,r10,r2,lsr#8
+ strb r0,[r14,#-15]
+ eor r3,r11,r3,lsr#8
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r10,[r12,#-6]
+ ldrhsb r11,[r12,#-2]
+ strb r1,[r14,#-11]
+ eor r0,r8,r0,lsr#8
+ strb r2,[r14,#-7]
+ eor r1,r9,r1,lsr#8
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r8,[r12,#-13] @ load more input
+ ldrhsb r9,[r12,#-9]
+ strb r3,[r14,#-3]
+ eor r2,r10,r2,lsr#8
+ strb r0,[r14,#-14]
+ eor r3,r11,r3,lsr#8
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r10,[r12,#-5]
+ ldrhsb r11,[r12,#-1]
+ strb r1,[r14,#-10]
+ strb r2,[r14,#-6]
+ eor r0,r8,r0,lsr#8
+ strb r3,[r14,#-2]
+ eor r1,r9,r1,lsr#8
+ strb r0,[r14,#-13]
+ eor r2,r10,r2,lsr#8
+ strb r1,[r14,#-9]
+ eor r3,r11,r3,lsr#8
+ strb r2,[r14,#-5]
+ strb r3,[r14,#-1]
+ add r8,sp,#4*(4+8)
+ ldmia r8,{r8-r11} @ load key material
+ add r4,r4,r8 @ accumulate key material
+#ifdef __thumb2__
+ itt hi
+#endif
+ addhi r8,r8,#1 @ next counter value
+ strhi r8,[sp,#4*(12)] @ save next counter value
+ add r5,r5,r9
+ add r6,r6,r10
+#ifdef __thumb2__
+ itete lo
+#endif
+ eorlo r8,r8,r8 @ zero or ...
+ ldrhsb r8,[r12],#16 @ ... load input
+ eorlo r9,r9,r9
+ ldrhsb r9,[r12,#-12]
+
+ add r7,r7,r11
+#ifdef __thumb2__
+ itete lo
+#endif
+ eorlo r10,r10,r10
+ ldrhsb r10,[r12,#-8]
+ eorlo r11,r11,r11
+ ldrhsb r11,[r12,#-4]
+
+ eor r4,r8,r4 @ xor with input (or zero)
+ eor r5,r9,r5
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r8,[r12,#-15] @ load more input
+ ldrhsb r9,[r12,#-11]
+ eor r6,r10,r6
+ strb r4,[r14],#16 @ store output
+ eor r7,r11,r7
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r10,[r12,#-7]
+ ldrhsb r11,[r12,#-3]
+ strb r5,[r14,#-12]
+ eor r4,r8,r4,lsr#8
+ strb r6,[r14,#-8]
+ eor r5,r9,r5,lsr#8
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r8,[r12,#-14] @ load more input
+ ldrhsb r9,[r12,#-10]
+ strb r7,[r14,#-4]
+ eor r6,r10,r6,lsr#8
+ strb r4,[r14,#-15]
+ eor r7,r11,r7,lsr#8
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r10,[r12,#-6]
+ ldrhsb r11,[r12,#-2]
+ strb r5,[r14,#-11]
+ eor r4,r8,r4,lsr#8
+ strb r6,[r14,#-7]
+ eor r5,r9,r5,lsr#8
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r8,[r12,#-13] @ load more input
+ ldrhsb r9,[r12,#-9]
+ strb r7,[r14,#-3]
+ eor r6,r10,r6,lsr#8
+ strb r4,[r14,#-14]
+ eor r7,r11,r7,lsr#8
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r10,[r12,#-5]
+ ldrhsb r11,[r12,#-1]
+ strb r5,[r14,#-10]
+ strb r6,[r14,#-6]
+ eor r4,r8,r4,lsr#8
+ strb r7,[r14,#-2]
+ eor r5,r9,r5,lsr#8
+ strb r4,[r14,#-13]
+ eor r6,r10,r6,lsr#8
+ strb r5,[r14,#-9]
+ eor r7,r11,r7,lsr#8
+ strb r6,[r14,#-5]
+ strb r7,[r14,#-1]
+#ifdef __thumb2__
+ it ne
+#endif
+ ldrne r8,[sp,#4*(32+2)] @ re-load len
+#ifdef __thumb2__
+ it hs
+#endif
+ subhs r11,r8,#64 @ len-=64
+ bhi .Loop_outer
+
+ beq .Ldone
+#endif
+
+.Ltail:
+ ldr r12,[sp,#4*(32+1)] @ load inp
+ add r9,sp,#4*(0)
+ ldr r14,[sp,#4*(32+0)] @ load out
+
+.Loop_tail:
+ ldrb r10,[r9],#1 @ read buffer on stack
+ ldrb r11,[r12],#1 @ read input
+ subs r8,r8,#1
+ eor r11,r11,r10
+ strb r11,[r14],#1 @ store output
+ bne .Loop_tail
+
+.Ldone:
+ add sp,sp,#4*(32+3)
+.Lno_data_arm:
+ ldmia sp!,{r4-r11,pc}
+ENDPROC(chacha20_arm)
+
+#if __LINUX_ARM_ARCH__ >= 7 && IS_ENABLED(CONFIG_KERNEL_MODE_NEON)
+.align 5
+.Lsigma2:
+.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 @ endian-neutral
+.Lone2:
+.long 1,0,0,0
+.word -1
+
+.arch armv7-a
+.fpu neon
+
+.align 5
+ENTRY(chacha20_neon)
+ ldr r12,[sp,#0] @ pull pointer to counter and nonce
+ stmdb sp!,{r0-r2,r4-r11,lr}
+ cmp r2,#0 @ len==0?
+#ifdef __thumb2__
+ itt eq
+#endif
+ addeq sp,sp,#4*3
+ beq .Lno_data_neon
+ cmp r2,#192 @ test len
+ bls .Lshort
+.Lchacha20_neon_begin:
+ adr r14,.Lsigma2
+ vstmdb sp!,{d8-d15} @ ABI spec says so
+ stmdb sp!,{r0-r3}
+
+ vld1.32 {q1-q2},[r3] @ load key
+ ldmia r3,{r4-r11} @ load key
+
+ sub sp,sp,#4*(16+16)
+ vld1.32 {q3},[r12] @ load counter and nonce
+ add r12,sp,#4*8
+ ldmia r14,{r0-r3} @ load sigma
+ vld1.32 {q0},[r14]! @ load sigma
+ vld1.32 {q12},[r14] @ one
+ vst1.32 {q2-q3},[r12] @ copy 1/2key|counter|nonce
+ vst1.32 {q0-q1},[sp] @ copy sigma|1/2key
+
+ str r10,[sp,#4*(16+10)] @ off-load "rx"
+ str r11,[sp,#4*(16+11)] @ off-load "rx"
+ vshl.i32 d26,d24,#1 @ two
+ vstr d24,[sp,#4*(16+0)]
+ vshl.i32 d28,d24,#2 @ four
+ vstr d26,[sp,#4*(16+2)]
+ vmov q4,q0
+ vstr d28,[sp,#4*(16+4)]
+ vmov q8,q0
+ vmov q5,q1
+ vmov q9,q1
+ b .Loop_neon_enter
+
+.align 4
+.Loop_neon_outer:
+ ldmia sp,{r0-r9} @ load key material
+ cmp r11,#64*2 @ if len<=64*2
+ bls .Lbreak_neon @ switch to integer-only
+ vmov q4,q0
+ str r11,[sp,#4*(32+2)] @ save len
+ vmov q8,q0
+ str r12, [sp,#4*(32+1)] @ save inp
+ vmov q5,q1
+ str r14, [sp,#4*(32+0)] @ save out
+ vmov q9,q1
+.Loop_neon_enter:
+ ldr r11, [sp,#4*(15)]
+ vadd.i32 q7,q3,q12 @ counter+1
+ ldr r12,[sp,#4*(12)] @ modulo-scheduled load
+ vmov q6,q2
+ ldr r10, [sp,#4*(13)]
+ vmov q10,q2
+ ldr r14,[sp,#4*(14)]
+ vadd.i32 q11,q7,q12 @ counter+2
+ str r11, [sp,#4*(16+15)]
+ mov r11,#10
+ add r12,r12,#3 @ counter+3
+ b .Loop_neon
+
+.align 4
+.Loop_neon:
+ subs r11,r11,#1
+ vadd.i32 q0,q0,q1
+ add r0,r0,r4
+ vadd.i32 q4,q4,q5
+ mov r12,r12,ror#16
+ vadd.i32 q8,q8,q9
+ add r1,r1,r5
+ veor q3,q3,q0
+ mov r10,r10,ror#16
+ veor q7,q7,q4
+ eor r12,r12,r0,ror#16
+ veor q11,q11,q8
+ eor r10,r10,r1,ror#16
+ vrev32.16 q3,q3
+ add r8,r8,r12
+ vrev32.16 q7,q7
+ mov r4,r4,ror#20
+ vrev32.16 q11,q11
+ add r9,r9,r10
+ vadd.i32 q2,q2,q3
+ mov r5,r5,ror#20
+ vadd.i32 q6,q6,q7
+ eor r4,r4,r8,ror#20
+ vadd.i32 q10,q10,q11
+ eor r5,r5,r9,ror#20
+ veor q12,q1,q2
+ add r0,r0,r4
+ veor q13,q5,q6
+ mov r12,r12,ror#24
+ veor q14,q9,q10
+ add r1,r1,r5
+ vshr.u32 q1,q12,#20
+ mov r10,r10,ror#24
+ vshr.u32 q5,q13,#20
+ eor r12,r12,r0,ror#24
+ vshr.u32 q9,q14,#20
+ eor r10,r10,r1,ror#24
+ vsli.32 q1,q12,#12
+ add r8,r8,r12
+ vsli.32 q5,q13,#12
+ mov r4,r4,ror#25
+ vsli.32 q9,q14,#12
+ add r9,r9,r10
+ vadd.i32 q0,q0,q1
+ mov r5,r5,ror#25
+ vadd.i32 q4,q4,q5
+ str r10,[sp,#4*(16+13)]
+ vadd.i32 q8,q8,q9
+ ldr r10,[sp,#4*(16+15)]
+ veor q12,q3,q0
+ eor r4,r4,r8,ror#25
+ veor q13,q7,q4
+ eor r5,r5,r9,ror#25
+ veor q14,q11,q8
+ str r8,[sp,#4*(16+8)]
+ vshr.u32 q3,q12,#24
+ ldr r8,[sp,#4*(16+10)]
+ vshr.u32 q7,q13,#24
+ add r2,r2,r6
+ vshr.u32 q11,q14,#24
+ mov r14,r14,ror#16
+ vsli.32 q3,q12,#8
+ str r9,[sp,#4*(16+9)]
+ vsli.32 q7,q13,#8
+ ldr r9,[sp,#4*(16+11)]
+ vsli.32 q11,q14,#8
+ add r3,r3,r7
+ vadd.i32 q2,q2,q3
+ mov r10,r10,ror#16
+ vadd.i32 q6,q6,q7
+ eor r14,r14,r2,ror#16
+ vadd.i32 q10,q10,q11
+ eor r10,r10,r3,ror#16
+ veor q12,q1,q2
+ add r8,r8,r14
+ veor q13,q5,q6
+ mov r6,r6,ror#20
+ veor q14,q9,q10
+ add r9,r9,r10
+ vshr.u32 q1,q12,#25
+ mov r7,r7,ror#20
+ vshr.u32 q5,q13,#25
+ eor r6,r6,r8,ror#20
+ vshr.u32 q9,q14,#25
+ eor r7,r7,r9,ror#20
+ vsli.32 q1,q12,#7
+ add r2,r2,r6
+ vsli.32 q5,q13,#7
+ mov r14,r14,ror#24
+ vsli.32 q9,q14,#7
+ add r3,r3,r7
+ vext.8 q2,q2,q2,#8
+ mov r10,r10,ror#24
+ vext.8 q6,q6,q6,#8
+ eor r14,r14,r2,ror#24
+ vext.8 q10,q10,q10,#8
+ eor r10,r10,r3,ror#24
+ vext.8 q1,q1,q1,#4
+ add r8,r8,r14
+ vext.8 q5,q5,q5,#4
+ mov r6,r6,ror#25
+ vext.8 q9,q9,q9,#4
+ add r9,r9,r10
+ vext.8 q3,q3,q3,#12
+ mov r7,r7,ror#25
+ vext.8 q7,q7,q7,#12
+ eor r6,r6,r8,ror#25
+ vext.8 q11,q11,q11,#12
+ eor r7,r7,r9,ror#25
+ vadd.i32 q0,q0,q1
+ add r0,r0,r5
+ vadd.i32 q4,q4,q5
+ mov r10,r10,ror#16
+ vadd.i32 q8,q8,q9
+ add r1,r1,r6
+ veor q3,q3,q0
+ mov r12,r12,ror#16
+ veor q7,q7,q4
+ eor r10,r10,r0,ror#16
+ veor q11,q11,q8
+ eor r12,r12,r1,ror#16
+ vrev32.16 q3,q3
+ add r8,r8,r10
+ vrev32.16 q7,q7
+ mov r5,r5,ror#20
+ vrev32.16 q11,q11
+ add r9,r9,r12
+ vadd.i32 q2,q2,q3
+ mov r6,r6,ror#20
+ vadd.i32 q6,q6,q7
+ eor r5,r5,r8,ror#20
+ vadd.i32 q10,q10,q11
+ eor r6,r6,r9,ror#20
+ veor q12,q1,q2
+ add r0,r0,r5
+ veor q13,q5,q6
+ mov r10,r10,ror#24
+ veor q14,q9,q10
+ add r1,r1,r6
+ vshr.u32 q1,q12,#20
+ mov r12,r12,ror#24
+ vshr.u32 q5,q13,#20
+ eor r10,r10,r0,ror#24
+ vshr.u32 q9,q14,#20
+ eor r12,r12,r1,ror#24
+ vsli.32 q1,q12,#12
+ add r8,r8,r10
+ vsli.32 q5,q13,#12
+ mov r5,r5,ror#25
+ vsli.32 q9,q14,#12
+ str r10,[sp,#4*(16+15)]
+ vadd.i32 q0,q0,q1
+ ldr r10,[sp,#4*(16+13)]
+ vadd.i32 q4,q4,q5
+ add r9,r9,r12
+ vadd.i32 q8,q8,q9
+ mov r6,r6,ror#25
+ veor q12,q3,q0
+ eor r5,r5,r8,ror#25
+ veor q13,q7,q4
+ eor r6,r6,r9,ror#25
+ veor q14,q11,q8
+ str r8,[sp,#4*(16+10)]
+ vshr.u32 q3,q12,#24
+ ldr r8,[sp,#4*(16+8)]
+ vshr.u32 q7,q13,#24
+ add r2,r2,r7
+ vshr.u32 q11,q14,#24
+ mov r10,r10,ror#16
+ vsli.32 q3,q12,#8
+ str r9,[sp,#4*(16+11)]
+ vsli.32 q7,q13,#8
+ ldr r9,[sp,#4*(16+9)]
+ vsli.32 q11,q14,#8
+ add r3,r3,r4
+ vadd.i32 q2,q2,q3
+ mov r14,r14,ror#16
+ vadd.i32 q6,q6,q7
+ eor r10,r10,r2,ror#16
+ vadd.i32 q10,q10,q11
+ eor r14,r14,r3,ror#16
+ veor q12,q1,q2
+ add r8,r8,r10
+ veor q13,q5,q6
+ mov r7,r7,ror#20
+ veor q14,q9,q10
+ add r9,r9,r14
+ vshr.u32 q1,q12,#25
+ mov r4,r4,ror#20
+ vshr.u32 q5,q13,#25
+ eor r7,r7,r8,ror#20
+ vshr.u32 q9,q14,#25
+ eor r4,r4,r9,ror#20
+ vsli.32 q1,q12,#7
+ add r2,r2,r7
+ vsli.32 q5,q13,#7
+ mov r10,r10,ror#24
+ vsli.32 q9,q14,#7
+ add r3,r3,r4
+ vext.8 q2,q2,q2,#8
+ mov r14,r14,ror#24
+ vext.8 q6,q6,q6,#8
+ eor r10,r10,r2,ror#24
+ vext.8 q10,q10,q10,#8
+ eor r14,r14,r3,ror#24
+ vext.8 q1,q1,q1,#12
+ add r8,r8,r10
+ vext.8 q5,q5,q5,#12
+ mov r7,r7,ror#25
+ vext.8 q9,q9,q9,#12
+ add r9,r9,r14
+ vext.8 q3,q3,q3,#4
+ mov r4,r4,ror#25
+ vext.8 q7,q7,q7,#4
+ eor r7,r7,r8,ror#25
+ vext.8 q11,q11,q11,#4
+ eor r4,r4,r9,ror#25
+ bne .Loop_neon
+
+ add r11,sp,#32
+ vld1.32 {q12-q13},[sp] @ load key material
+ vld1.32 {q14-q15},[r11]
+
+ ldr r11,[sp,#4*(32+2)] @ load len
+
+ str r8, [sp,#4*(16+8)] @ modulo-scheduled store
+ str r9, [sp,#4*(16+9)]
+ str r12,[sp,#4*(16+12)]
+ str r10, [sp,#4*(16+13)]
+ str r14,[sp,#4*(16+14)]
+
+ @ at this point we have first half of 512-bit result in
+ @ rx and second half at sp+4*(16+8)
+
+ ldr r12,[sp,#4*(32+1)] @ load inp
+ ldr r14,[sp,#4*(32+0)] @ load out
+
+ vadd.i32 q0,q0,q12 @ accumulate key material
+ vadd.i32 q4,q4,q12
+ vadd.i32 q8,q8,q12
+ vldr d24,[sp,#4*(16+0)] @ one
+
+ vadd.i32 q1,q1,q13
+ vadd.i32 q5,q5,q13
+ vadd.i32 q9,q9,q13
+ vldr d26,[sp,#4*(16+2)] @ two
+
+ vadd.i32 q2,q2,q14
+ vadd.i32 q6,q6,q14
+ vadd.i32 q10,q10,q14
+ vadd.i32 d14,d14,d24 @ counter+1
+ vadd.i32 d22,d22,d26 @ counter+2
+
+ vadd.i32 q3,q3,q15
+ vadd.i32 q7,q7,q15
+ vadd.i32 q11,q11,q15
+
+ cmp r11,#64*4
+ blo .Ltail_neon
+
+ vld1.8 {q12-q13},[r12]! @ load input
+ mov r11,sp
+ vld1.8 {q14-q15},[r12]!
+ veor q0,q0,q12 @ xor with input
+ veor q1,q1,q13
+ vld1.8 {q12-q13},[r12]!
+ veor q2,q2,q14
+ veor q3,q3,q15
+ vld1.8 {q14-q15},[r12]!
+
+ veor q4,q4,q12
+ vst1.8 {q0-q1},[r14]! @ store output
+ veor q5,q5,q13
+ vld1.8 {q12-q13},[r12]!
+ veor q6,q6,q14
+ vst1.8 {q2-q3},[r14]!
+ veor q7,q7,q15
+ vld1.8 {q14-q15},[r12]!
+
+ veor q8,q8,q12
+ vld1.32 {q0-q1},[r11]! @ load for next iteration
+ veor d25,d25,d25
+ vldr d24,[sp,#4*(16+4)] @ four
+ veor q9,q9,q13
+ vld1.32 {q2-q3},[r11]
+ veor q10,q10,q14
+ vst1.8 {q4-q5},[r14]!
+ veor q11,q11,q15
+ vst1.8 {q6-q7},[r14]!
+
+ vadd.i32 d6,d6,d24 @ next counter value
+ vldr d24,[sp,#4*(16+0)] @ one
+
+ ldmia sp,{r8-r11} @ load key material
+ add r0,r0,r8 @ accumulate key material
+ ldr r8,[r12],#16 @ load input
+ vst1.8 {q8-q9},[r14]!
+ add r1,r1,r9
+ ldr r9,[r12,#-12]
+ vst1.8 {q10-q11},[r14]!
+ add r2,r2,r10
+ ldr r10,[r12,#-8]
+ add r3,r3,r11
+ ldr r11,[r12,#-4]
+#ifdef __ARMEB__
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+#endif
+ eor r0,r0,r8 @ xor with input
+ add r8,sp,#4*(4)
+ eor r1,r1,r9
+ str r0,[r14],#16 @ store output
+ eor r2,r2,r10
+ str r1,[r14,#-12]
+ eor r3,r3,r11
+ ldmia r8,{r8-r11} @ load key material
+ str r2,[r14,#-8]
+ str r3,[r14,#-4]
+
+ add r4,r4,r8 @ accumulate key material
+ ldr r8,[r12],#16 @ load input
+ add r5,r5,r9
+ ldr r9,[r12,#-12]
+ add r6,r6,r10
+ ldr r10,[r12,#-8]
+ add r7,r7,r11
+ ldr r11,[r12,#-4]
+#ifdef __ARMEB__
+ rev r4,r4
+ rev r5,r5
+ rev r6,r6
+ rev r7,r7
+#endif
+ eor r4,r4,r8
+ add r8,sp,#4*(8)
+ eor r5,r5,r9
+ str r4,[r14],#16 @ store output
+ eor r6,r6,r10
+ str r5,[r14,#-12]
+ eor r7,r7,r11
+ ldmia r8,{r8-r11} @ load key material
+ str r6,[r14,#-8]
+ add r0,sp,#4*(16+8)
+ str r7,[r14,#-4]
+
+ ldmia r0,{r0-r7} @ load second half
+
+ add r0,r0,r8 @ accumulate key material
+ ldr r8,[r12],#16 @ load input
+ add r1,r1,r9
+ ldr r9,[r12,#-12]
+#ifdef __thumb2__
+ it hi
+#endif
+ strhi r10,[sp,#4*(16+10)] @ copy "rx" while at it
+ add r2,r2,r10
+ ldr r10,[r12,#-8]
+#ifdef __thumb2__
+ it hi
+#endif
+ strhi r11,[sp,#4*(16+11)] @ copy "rx" while at it
+ add r3,r3,r11
+ ldr r11,[r12,#-4]
+#ifdef __ARMEB__
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+#endif
+ eor r0,r0,r8
+ add r8,sp,#4*(12)
+ eor r1,r1,r9
+ str r0,[r14],#16 @ store output
+ eor r2,r2,r10
+ str r1,[r14,#-12]
+ eor r3,r3,r11
+ ldmia r8,{r8-r11} @ load key material
+ str r2,[r14,#-8]
+ str r3,[r14,#-4]
+
+ add r4,r4,r8 @ accumulate key material
+ add r8,r8,#4 @ next counter value
+ add r5,r5,r9
+ str r8,[sp,#4*(12)] @ save next counter value
+ ldr r8,[r12],#16 @ load input
+ add r6,r6,r10
+ add r4,r4,#3 @ counter+3
+ ldr r9,[r12,#-12]
+ add r7,r7,r11
+ ldr r10,[r12,#-8]
+ ldr r11,[r12,#-4]
+#ifdef __ARMEB__
+ rev r4,r4
+ rev r5,r5
+ rev r6,r6
+ rev r7,r7
+#endif
+ eor r4,r4,r8
+#ifdef __thumb2__
+ it hi
+#endif
+ ldrhi r8,[sp,#4*(32+2)] @ re-load len
+ eor r5,r5,r9
+ eor r6,r6,r10
+ str r4,[r14],#16 @ store output
+ eor r7,r7,r11
+ str r5,[r14,#-12]
+ sub r11,r8,#64*4 @ len-=64*4
+ str r6,[r14,#-8]
+ str r7,[r14,#-4]
+ bhi .Loop_neon_outer
+
+ b .Ldone_neon
+
+.align 4
+.Lbreak_neon:
+ @ harmonize NEON and integer-only stack frames: load data
+ @ from NEON frame, but save to integer-only one; distance
+ @ between the two is 4*(32+4+16-32)=4*(20).
+
+ str r11, [sp,#4*(20+32+2)] @ save len
+ add r11,sp,#4*(32+4)
+ str r12, [sp,#4*(20+32+1)] @ save inp
+ str r14, [sp,#4*(20+32+0)] @ save out
+
+ ldr r12,[sp,#4*(16+10)]
+ ldr r14,[sp,#4*(16+11)]
+ vldmia r11,{d8-d15} @ fulfill ABI requirement
+ str r12,[sp,#4*(20+16+10)] @ copy "rx"
+ str r14,[sp,#4*(20+16+11)] @ copy "rx"
+
+ ldr r11, [sp,#4*(15)]
+ ldr r12,[sp,#4*(12)] @ modulo-scheduled load
+ ldr r10, [sp,#4*(13)]
+ ldr r14,[sp,#4*(14)]
+ str r11, [sp,#4*(20+16+15)]
+ add r11,sp,#4*(20)
+ vst1.32 {q0-q1},[r11]! @ copy key
+ add sp,sp,#4*(20) @ switch frame
+ vst1.32 {q2-q3},[r11]
+ mov r11,#10
+ b .Loop @ go integer-only
+
+.align 4
+.Ltail_neon:
+ cmp r11,#64*3
+ bhs .L192_or_more_neon
+ cmp r11,#64*2
+ bhs .L128_or_more_neon
+ cmp r11,#64*1
+ bhs .L64_or_more_neon
+
+ add r8,sp,#4*(8)
+ vst1.8 {q0-q1},[sp]
+ add r10,sp,#4*(0)
+ vst1.8 {q2-q3},[r8]
+ b .Loop_tail_neon
+
+.align 4
+.L64_or_more_neon:
+ vld1.8 {q12-q13},[r12]!
+ vld1.8 {q14-q15},[r12]!
+ veor q0,q0,q12
+ veor q1,q1,q13
+ veor q2,q2,q14
+ veor q3,q3,q15
+ vst1.8 {q0-q1},[r14]!
+ vst1.8 {q2-q3},[r14]!
+
+ beq .Ldone_neon
+
+ add r8,sp,#4*(8)
+ vst1.8 {q4-q5},[sp]
+ add r10,sp,#4*(0)
+ vst1.8 {q6-q7},[r8]
+ sub r11,r11,#64*1 @ len-=64*1
+ b .Loop_tail_neon
+
+.align 4
+.L128_or_more_neon:
+ vld1.8 {q12-q13},[r12]!
+ vld1.8 {q14-q15},[r12]!
+ veor q0,q0,q12
+ veor q1,q1,q13
+ vld1.8 {q12-q13},[r12]!
+ veor q2,q2,q14
+ veor q3,q3,q15
+ vld1.8 {q14-q15},[r12]!
+
+ veor q4,q4,q12
+ veor q5,q5,q13
+ vst1.8 {q0-q1},[r14]!
+ veor q6,q6,q14
+ vst1.8 {q2-q3},[r14]!
+ veor q7,q7,q15
+ vst1.8 {q4-q5},[r14]!
+ vst1.8 {q6-q7},[r14]!
+
+ beq .Ldone_neon
+
+ add r8,sp,#4*(8)
+ vst1.8 {q8-q9},[sp]
+ add r10,sp,#4*(0)
+ vst1.8 {q10-q11},[r8]
+ sub r11,r11,#64*2 @ len-=64*2
+ b .Loop_tail_neon
+
+.align 4
+.L192_or_more_neon:
+ vld1.8 {q12-q13},[r12]!
+ vld1.8 {q14-q15},[r12]!
+ veor q0,q0,q12
+ veor q1,q1,q13
+ vld1.8 {q12-q13},[r12]!
+ veor q2,q2,q14
+ veor q3,q3,q15
+ vld1.8 {q14-q15},[r12]!
+
+ veor q4,q4,q12
+ veor q5,q5,q13
+ vld1.8 {q12-q13},[r12]!
+ veor q6,q6,q14
+ vst1.8 {q0-q1},[r14]!
+ veor q7,q7,q15
+ vld1.8 {q14-q15},[r12]!
+
+ veor q8,q8,q12
+ vst1.8 {q2-q3},[r14]!
+ veor q9,q9,q13
+ vst1.8 {q4-q5},[r14]!
+ veor q10,q10,q14
+ vst1.8 {q6-q7},[r14]!
+ veor q11,q11,q15
+ vst1.8 {q8-q9},[r14]!
+ vst1.8 {q10-q11},[r14]!
+
+ beq .Ldone_neon
+
+ ldmia sp,{r8-r11} @ load key material
+ add r0,r0,r8 @ accumulate key material
+ add r8,sp,#4*(4)
+ add r1,r1,r9
+ add r2,r2,r10
+ add r3,r3,r11
+ ldmia r8,{r8-r11} @ load key material
+
+ add r4,r4,r8 @ accumulate key material
+ add r8,sp,#4*(8)
+ add r5,r5,r9
+ add r6,r6,r10
+ add r7,r7,r11
+ ldmia r8,{r8-r11} @ load key material
+#ifdef __ARMEB__
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+ rev r4,r4
+ rev r5,r5
+ rev r6,r6
+ rev r7,r7
+#endif
+ stmia sp,{r0-r7}
+ add r0,sp,#4*(16+8)
+
+ ldmia r0,{r0-r7} @ load second half
+
+ add r0,r0,r8 @ accumulate key material
+ add r8,sp,#4*(12)
+ add r1,r1,r9
+ add r2,r2,r10
+ add r3,r3,r11
+ ldmia r8,{r8-r11} @ load key material
+
+ add r4,r4,r8 @ accumulate key material
+ add r8,sp,#4*(8)
+ add r5,r5,r9
+ add r4,r4,#3 @ counter+3
+ add r6,r6,r10
+ add r7,r7,r11
+ ldr r11,[sp,#4*(32+2)] @ re-load len
+#ifdef __ARMEB__
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+ rev r4,r4
+ rev r5,r5
+ rev r6,r6
+ rev r7,r7
+#endif
+ stmia r8,{r0-r7}
+ add r10,sp,#4*(0)
+ sub r11,r11,#64*3 @ len-=64*3
+
+.Loop_tail_neon:
+ ldrb r8,[r10],#1 @ read buffer on stack
+ ldrb r9,[r12],#1 @ read input
+ subs r11,r11,#1
+ eor r8,r8,r9
+ strb r8,[r14],#1 @ store output
+ bne .Loop_tail_neon
+
+.Ldone_neon:
+ add sp,sp,#4*(32+4)
+ vldmia sp,{d8-d15}
+ add sp,sp,#4*(16+3)
+.Lno_data_neon:
+ ldmia sp!,{r4-r11,pc}
+ENDPROC(chacha20_neon)
+#endif
diff --git a/lib/zinc/chacha20/chacha20-arm64.S b/lib/zinc/chacha20/chacha20-arm64.S
new file mode 100644
index 000000000000..f90162c32a33
--- /dev/null
+++ b/lib/zinc/chacha20/chacha20-arm64.S
@@ -0,0 +1,1942 @@
+/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ * Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
+ *
+ * This is based in part on Andy Polyakov's implementation from CRYPTOGAMS.
+ */
+
+#include <linux/linkage.h>
+
+.text
+.align 5
+.Lsigma:
+.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral
+.Lone:
+.long 1,0,0,0
+
+.align 5
+ENTRY(chacha20_arm)
+ cbz x2,.Labort
+.Lshort:
+ stp x29,x30,[sp,#-96]!
+ add x29,sp,#0
+
+ adr x5,.Lsigma
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+ sub sp,sp,#64
+
+ ldp x22,x23,[x5] // load sigma
+ ldp x24,x25,[x3] // load key
+ ldp x26,x27,[x3,#16]
+ ldp x28,x30,[x4] // load counter
+#ifdef __ARMEB__
+ ror x24,x24,#32
+ ror x25,x25,#32
+ ror x26,x26,#32
+ ror x27,x27,#32
+ ror x28,x28,#32
+ ror x30,x30,#32
+#endif
+
+.Loop_outer:
+ mov w5,w22 // unpack key block
+ lsr x6,x22,#32
+ mov w7,w23
+ lsr x8,x23,#32
+ mov w9,w24
+ lsr x10,x24,#32
+ mov w11,w25
+ lsr x12,x25,#32
+ mov w13,w26
+ lsr x14,x26,#32
+ mov w15,w27
+ lsr x16,x27,#32
+ mov w17,w28
+ lsr x19,x28,#32
+ mov w20,w30
+ lsr x21,x30,#32
+
+ mov x4,#10
+ subs x2,x2,#64
+.Loop:
+ sub x4,x4,#1
+ add w5,w5,w9
+ add w6,w6,w10
+ add w7,w7,w11
+ add w8,w8,w12
+ eor w17,w17,w5
+ eor w19,w19,w6
+ eor w20,w20,w7
+ eor w21,w21,w8
+ ror w17,w17,#16
+ ror w19,w19,#16
+ ror w20,w20,#16
+ ror w21,w21,#16
+ add w13,w13,w17
+ add w14,w14,w19
+ add w15,w15,w20
+ add w16,w16,w21
+ eor w9,w9,w13
+ eor w10,w10,w14
+ eor w11,w11,w15
+ eor w12,w12,w16
+ ror w9,w9,#20
+ ror w10,w10,#20
+ ror w11,w11,#20
+ ror w12,w12,#20
+ add w5,w5,w9
+ add w6,w6,w10
+ add w7,w7,w11
+ add w8,w8,w12
+ eor w17,w17,w5
+ eor w19,w19,w6
+ eor w20,w20,w7
+ eor w21,w21,w8
+ ror w17,w17,#24
+ ror w19,w19,#24
+ ror w20,w20,#24
+ ror w21,w21,#24
+ add w13,w13,w17
+ add w14,w14,w19
+ add w15,w15,w20
+ add w16,w16,w21
+ eor w9,w9,w13
+ eor w10,w10,w14
+ eor w11,w11,w15
+ eor w12,w12,w16
+ ror w9,w9,#25
+ ror w10,w10,#25
+ ror w11,w11,#25
+ ror w12,w12,#25
+ add w5,w5,w10
+ add w6,w6,w11
+ add w7,w7,w12
+ add w8,w8,w9
+ eor w21,w21,w5
+ eor w17,w17,w6
+ eor w19,w19,w7
+ eor w20,w20,w8
+ ror w21,w21,#16
+ ror w17,w17,#16
+ ror w19,w19,#16
+ ror w20,w20,#16
+ add w15,w15,w21
+ add w16,w16,w17
+ add w13,w13,w19
+ add w14,w14,w20
+ eor w10,w10,w15
+ eor w11,w11,w16
+ eor w12,w12,w13
+ eor w9,w9,w14
+ ror w10,w10,#20
+ ror w11,w11,#20
+ ror w12,w12,#20
+ ror w9,w9,#20
+ add w5,w5,w10
+ add w6,w6,w11
+ add w7,w7,w12
+ add w8,w8,w9
+ eor w21,w21,w5
+ eor w17,w17,w6
+ eor w19,w19,w7
+ eor w20,w20,w8
+ ror w21,w21,#24
+ ror w17,w17,#24
+ ror w19,w19,#24
+ ror w20,w20,#24
+ add w15,w15,w21
+ add w16,w16,w17
+ add w13,w13,w19
+ add w14,w14,w20
+ eor w10,w10,w15
+ eor w11,w11,w16
+ eor w12,w12,w13
+ eor w9,w9,w14
+ ror w10,w10,#25
+ ror w11,w11,#25
+ ror w12,w12,#25
+ ror w9,w9,#25
+ cbnz x4,.Loop
+
+ add w5,w5,w22 // accumulate key block
+ add x6,x6,x22,lsr#32
+ add w7,w7,w23
+ add x8,x8,x23,lsr#32
+ add w9,w9,w24
+ add x10,x10,x24,lsr#32
+ add w11,w11,w25
+ add x12,x12,x25,lsr#32
+ add w13,w13,w26
+ add x14,x14,x26,lsr#32
+ add w15,w15,w27
+ add x16,x16,x27,lsr#32
+ add w17,w17,w28
+ add x19,x19,x28,lsr#32
+ add w20,w20,w30
+ add x21,x21,x30,lsr#32
+
+ b.lo .Ltail
+
+ add x5,x5,x6,lsl#32 // pack
+ add x7,x7,x8,lsl#32
+ ldp x6,x8,[x1,#0] // load input
+ add x9,x9,x10,lsl#32
+ add x11,x11,x12,lsl#32
+ ldp x10,x12,[x1,#16]
+ add x13,x13,x14,lsl#32
+ add x15,x15,x16,lsl#32
+ ldp x14,x16,[x1,#32]
+ add x17,x17,x19,lsl#32
+ add x20,x20,x21,lsl#32
+ ldp x19,x21,[x1,#48]
+ add x1,x1,#64
+#ifdef __ARMEB__
+ rev x5,x5
+ rev x7,x7
+ rev x9,x9
+ rev x11,x11
+ rev x13,x13
+ rev x15,x15
+ rev x17,x17
+ rev x20,x20
+#endif
+ eor x5,x5,x6
+ eor x7,x7,x8
+ eor x9,x9,x10
+ eor x11,x11,x12
+ eor x13,x13,x14
+ eor x15,x15,x16
+ eor x17,x17,x19
+ eor x20,x20,x21
+
+ stp x5,x7,[x0,#0] // store output
+ add x28,x28,#1 // increment counter
+ stp x9,x11,[x0,#16]
+ stp x13,x15,[x0,#32]
+ stp x17,x20,[x0,#48]
+ add x0,x0,#64
+
+ b.hi .Loop_outer
+
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#64
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+.Labort:
+ ret
+
+.align 4
+.Ltail:
+ add x2,x2,#64
+.Less_than_64:
+ sub x0,x0,#1
+ add x1,x1,x2
+ add x0,x0,x2
+ add x4,sp,x2
+ neg x2,x2
+
+ add x5,x5,x6,lsl#32 // pack
+ add x7,x7,x8,lsl#32
+ add x9,x9,x10,lsl#32
+ add x11,x11,x12,lsl#32
+ add x13,x13,x14,lsl#32
+ add x15,x15,x16,lsl#32
+ add x17,x17,x19,lsl#32
+ add x20,x20,x21,lsl#32
+#ifdef __ARMEB__
+ rev x5,x5
+ rev x7,x7
+ rev x9,x9
+ rev x11,x11
+ rev x13,x13
+ rev x15,x15
+ rev x17,x17
+ rev x20,x20
+#endif
+ stp x5,x7,[sp,#0]
+ stp x9,x11,[sp,#16]
+ stp x13,x15,[sp,#32]
+ stp x17,x20,[sp,#48]
+
+.Loop_tail:
+ ldrb w10,[x1,x2]
+ ldrb w11,[x4,x2]
+ add x2,x2,#1
+ eor w10,w10,w11
+ strb w10,[x0,x2]
+ cbnz x2,.Loop_tail
+
+ stp xzr,xzr,[sp,#0]
+ stp xzr,xzr,[sp,#16]
+ stp xzr,xzr,[sp,#32]
+ stp xzr,xzr,[sp,#48]
+
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#64
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+ ret
+ENDPROC(chacha20_arm)
+
+.align 5
+ENTRY(chacha20_neon)
+ cbz x2,.Labort_neon
+ cmp x2,#192
+ b.lo .Lshort
+
+ stp x29,x30,[sp,#-96]!
+ add x29,sp,#0
+
+ adr x5,.Lsigma
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+ cmp x2,#512
+ b.hs .L512_or_more_neon
+
+ sub sp,sp,#64
+
+ ldp x22,x23,[x5] // load sigma
+ ld1 {v24.4s},[x5],#16
+ ldp x24,x25,[x3] // load key
+ ldp x26,x27,[x3,#16]
+ ld1 {v25.4s,v26.4s},[x3]
+ ldp x28,x30,[x4] // load counter
+ ld1 {v27.4s},[x4]
+ ld1 {v31.4s},[x5]
+#ifdef __ARMEB__
+ rev64 v24.4s,v24.4s
+ ror x24,x24,#32
+ ror x25,x25,#32
+ ror x26,x26,#32
+ ror x27,x27,#32
+ ror x28,x28,#32
+ ror x30,x30,#32
+#endif
+ add v27.4s,v27.4s,v31.4s // += 1
+ add v28.4s,v27.4s,v31.4s
+ add v29.4s,v28.4s,v31.4s
+ shl v31.4s,v31.4s,#2 // 1 -> 4
+
+.Loop_outer_neon:
+ mov w5,w22 // unpack key block
+ lsr x6,x22,#32
+ mov v0.16b,v24.16b
+ mov w7,w23
+ lsr x8,x23,#32
+ mov v4.16b,v24.16b
+ mov w9,w24
+ lsr x10,x24,#32
+ mov v16.16b,v24.16b
+ mov w11,w25
+ mov v1.16b,v25.16b
+ lsr x12,x25,#32
+ mov v5.16b,v25.16b
+ mov w13,w26
+ mov v17.16b,v25.16b
+ lsr x14,x26,#32
+ mov v3.16b,v27.16b
+ mov w15,w27
+ mov v7.16b,v28.16b
+ lsr x16,x27,#32
+ mov v19.16b,v29.16b
+ mov w17,w28
+ mov v2.16b,v26.16b
+ lsr x19,x28,#32
+ mov v6.16b,v26.16b
+ mov w20,w30
+ mov v18.16b,v26.16b
+ lsr x21,x30,#32
+
+ mov x4,#10
+ subs x2,x2,#256
+.Loop_neon:
+ sub x4,x4,#1
+ add v0.4s,v0.4s,v1.4s
+ add w5,w5,w9
+ add v4.4s,v4.4s,v5.4s
+ add w6,w6,w10
+ add v16.4s,v16.4s,v17.4s
+ add w7,w7,w11
+ eor v3.16b,v3.16b,v0.16b
+ add w8,w8,w12
+ eor v7.16b,v7.16b,v4.16b
+ eor w17,w17,w5
+ eor v19.16b,v19.16b,v16.16b
+ eor w19,w19,w6
+ rev32 v3.8h,v3.8h
+ eor w20,w20,w7
+ rev32 v7.8h,v7.8h
+ eor w21,w21,w8
+ rev32 v19.8h,v19.8h
+ ror w17,w17,#16
+ add v2.4s,v2.4s,v3.4s
+ ror w19,w19,#16
+ add v6.4s,v6.4s,v7.4s
+ ror w20,w20,#16
+ add v18.4s,v18.4s,v19.4s
+ ror w21,w21,#16
+ eor v20.16b,v1.16b,v2.16b
+ add w13,w13,w17
+ eor v21.16b,v5.16b,v6.16b
+ add w14,w14,w19
+ eor v22.16b,v17.16b,v18.16b
+ add w15,w15,w20
+ ushr v1.4s,v20.4s,#20
+ add w16,w16,w21
+ ushr v5.4s,v21.4s,#20
+ eor w9,w9,w13
+ ushr v17.4s,v22.4s,#20
+ eor w10,w10,w14
+ sli v1.4s,v20.4s,#12
+ eor w11,w11,w15
+ sli v5.4s,v21.4s,#12
+ eor w12,w12,w16
+ sli v17.4s,v22.4s,#12
+ ror w9,w9,#20
+ add v0.4s,v0.4s,v1.4s
+ ror w10,w10,#20
+ add v4.4s,v4.4s,v5.4s
+ ror w11,w11,#20
+ add v16.4s,v16.4s,v17.4s
+ ror w12,w12,#20
+ eor v20.16b,v3.16b,v0.16b
+ add w5,w5,w9
+ eor v21.16b,v7.16b,v4.16b
+ add w6,w6,w10
+ eor v22.16b,v19.16b,v16.16b
+ add w7,w7,w11
+ ushr v3.4s,v20.4s,#24
+ add w8,w8,w12
+ ushr v7.4s,v21.4s,#24
+ eor w17,w17,w5
+ ushr v19.4s,v22.4s,#24
+ eor w19,w19,w6
+ sli v3.4s,v20.4s,#8
+ eor w20,w20,w7
+ sli v7.4s,v21.4s,#8
+ eor w21,w21,w8
+ sli v19.4s,v22.4s,#8
+ ror w17,w17,#24
+ add v2.4s,v2.4s,v3.4s
+ ror w19,w19,#24
+ add v6.4s,v6.4s,v7.4s
+ ror w20,w20,#24
+ add v18.4s,v18.4s,v19.4s
+ ror w21,w21,#24
+ eor v20.16b,v1.16b,v2.16b
+ add w13,w13,w17
+ eor v21.16b,v5.16b,v6.16b
+ add w14,w14,w19
+ eor v22.16b,v17.16b,v18.16b
+ add w15,w15,w20
+ ushr v1.4s,v20.4s,#25
+ add w16,w16,w21
+ ushr v5.4s,v21.4s,#25
+ eor w9,w9,w13
+ ushr v17.4s,v22.4s,#25
+ eor w10,w10,w14
+ sli v1.4s,v20.4s,#7
+ eor w11,w11,w15
+ sli v5.4s,v21.4s,#7
+ eor w12,w12,w16
+ sli v17.4s,v22.4s,#7
+ ror w9,w9,#25
+ ext v2.16b,v2.16b,v2.16b,#8
+ ror w10,w10,#25
+ ext v6.16b,v6.16b,v6.16b,#8
+ ror w11,w11,#25
+ ext v18.16b,v18.16b,v18.16b,#8
+ ror w12,w12,#25
+ ext v3.16b,v3.16b,v3.16b,#12
+ ext v7.16b,v7.16b,v7.16b,#12
+ ext v19.16b,v19.16b,v19.16b,#12
+ ext v1.16b,v1.16b,v1.16b,#4
+ ext v5.16b,v5.16b,v5.16b,#4
+ ext v17.16b,v17.16b,v17.16b,#4
+ add v0.4s,v0.4s,v1.4s
+ add w5,w5,w10
+ add v4.4s,v4.4s,v5.4s
+ add w6,w6,w11
+ add v16.4s,v16.4s,v17.4s
+ add w7,w7,w12
+ eor v3.16b,v3.16b,v0.16b
+ add w8,w8,w9
+ eor v7.16b,v7.16b,v4.16b
+ eor w21,w21,w5
+ eor v19.16b,v19.16b,v16.16b
+ eor w17,w17,w6
+ rev32 v3.8h,v3.8h
+ eor w19,w19,w7
+ rev32 v7.8h,v7.8h
+ eor w20,w20,w8
+ rev32 v19.8h,v19.8h
+ ror w21,w21,#16
+ add v2.4s,v2.4s,v3.4s
+ ror w17,w17,#16
+ add v6.4s,v6.4s,v7.4s
+ ror w19,w19,#16
+ add v18.4s,v18.4s,v19.4s
+ ror w20,w20,#16
+ eor v20.16b,v1.16b,v2.16b
+ add w15,w15,w21
+ eor v21.16b,v5.16b,v6.16b
+ add w16,w16,w17
+ eor v22.16b,v17.16b,v18.16b
+ add w13,w13,w19
+ ushr v1.4s,v20.4s,#20
+ add w14,w14,w20
+ ushr v5.4s,v21.4s,#20
+ eor w10,w10,w15
+ ushr v17.4s,v22.4s,#20
+ eor w11,w11,w16
+ sli v1.4s,v20.4s,#12
+ eor w12,w12,w13
+ sli v5.4s,v21.4s,#12
+ eor w9,w9,w14
+ sli v17.4s,v22.4s,#12
+ ror w10,w10,#20
+ add v0.4s,v0.4s,v1.4s
+ ror w11,w11,#20
+ add v4.4s,v4.4s,v5.4s
+ ror w12,w12,#20
+ add v16.4s,v16.4s,v17.4s
+ ror w9,w9,#20
+ eor v20.16b,v3.16b,v0.16b
+ add w5,w5,w10
+ eor v21.16b,v7.16b,v4.16b
+ add w6,w6,w11
+ eor v22.16b,v19.16b,v16.16b
+ add w7,w7,w12
+ ushr v3.4s,v20.4s,#24
+ add w8,w8,w9
+ ushr v7.4s,v21.4s,#24
+ eor w21,w21,w5
+ ushr v19.4s,v22.4s,#24
+ eor w17,w17,w6
+ sli v3.4s,v20.4s,#8
+ eor w19,w19,w7
+ sli v7.4s,v21.4s,#8
+ eor w20,w20,w8
+ sli v19.4s,v22.4s,#8
+ ror w21,w21,#24
+ add v2.4s,v2.4s,v3.4s
+ ror w17,w17,#24
+ add v6.4s,v6.4s,v7.4s
+ ror w19,w19,#24
+ add v18.4s,v18.4s,v19.4s
+ ror w20,w20,#24
+ eor v20.16b,v1.16b,v2.16b
+ add w15,w15,w21
+ eor v21.16b,v5.16b,v6.16b
+ add w16,w16,w17
+ eor v22.16b,v17.16b,v18.16b
+ add w13,w13,w19
+ ushr v1.4s,v20.4s,#25
+ add w14,w14,w20
+ ushr v5.4s,v21.4s,#25
+ eor w10,w10,w15
+ ushr v17.4s,v22.4s,#25
+ eor w11,w11,w16
+ sli v1.4s,v20.4s,#7
+ eor w12,w12,w13
+ sli v5.4s,v21.4s,#7
+ eor w9,w9,w14
+ sli v17.4s,v22.4s,#7
+ ror w10,w10,#25
+ ext v2.16b,v2.16b,v2.16b,#8
+ ror w11,w11,#25
+ ext v6.16b,v6.16b,v6.16b,#8
+ ror w12,w12,#25
+ ext v18.16b,v18.16b,v18.16b,#8
+ ror w9,w9,#25
+ ext v3.16b,v3.16b,v3.16b,#4
+ ext v7.16b,v7.16b,v7.16b,#4
+ ext v19.16b,v19.16b,v19.16b,#4
+ ext v1.16b,v1.16b,v1.16b,#12
+ ext v5.16b,v5.16b,v5.16b,#12
+ ext v17.16b,v17.16b,v17.16b,#12
+ cbnz x4,.Loop_neon
+
+ add w5,w5,w22 // accumulate key block
+ add v0.4s,v0.4s,v24.4s
+ add x6,x6,x22,lsr#32
+ add v4.4s,v4.4s,v24.4s
+ add w7,w7,w23
+ add v16.4s,v16.4s,v24.4s
+ add x8,x8,x23,lsr#32
+ add v2.4s,v2.4s,v26.4s
+ add w9,w9,w24
+ add v6.4s,v6.4s,v26.4s
+ add x10,x10,x24,lsr#32
+ add v18.4s,v18.4s,v26.4s
+ add w11,w11,w25
+ add v3.4s,v3.4s,v27.4s
+ add x12,x12,x25,lsr#32
+ add w13,w13,w26
+ add v7.4s,v7.4s,v28.4s
+ add x14,x14,x26,lsr#32
+ add w15,w15,w27
+ add v19.4s,v19.4s,v29.4s
+ add x16,x16,x27,lsr#32
+ add w17,w17,w28
+ add v1.4s,v1.4s,v25.4s
+ add x19,x19,x28,lsr#32
+ add w20,w20,w30
+ add v5.4s,v5.4s,v25.4s
+ add x21,x21,x30,lsr#32
+ add v17.4s,v17.4s,v25.4s
+
+ b.lo .Ltail_neon
+
+ add x5,x5,x6,lsl#32 // pack
+ add x7,x7,x8,lsl#32
+ ldp x6,x8,[x1,#0] // load input
+ add x9,x9,x10,lsl#32
+ add x11,x11,x12,lsl#32
+ ldp x10,x12,[x1,#16]
+ add x13,x13,x14,lsl#32
+ add x15,x15,x16,lsl#32
+ ldp x14,x16,[x1,#32]
+ add x17,x17,x19,lsl#32
+ add x20,x20,x21,lsl#32
+ ldp x19,x21,[x1,#48]
+ add x1,x1,#64
+#ifdef __ARMEB__
+ rev x5,x5
+ rev x7,x7
+ rev x9,x9
+ rev x11,x11
+ rev x13,x13
+ rev x15,x15
+ rev x17,x17
+ rev x20,x20
+#endif
+ ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+ eor x5,x5,x6
+ eor x7,x7,x8
+ eor x9,x9,x10
+ eor x11,x11,x12
+ eor x13,x13,x14
+ eor v0.16b,v0.16b,v20.16b
+ eor x15,x15,x16
+ eor v1.16b,v1.16b,v21.16b
+ eor x17,x17,x19
+ eor v2.16b,v2.16b,v22.16b
+ eor x20,x20,x21
+ eor v3.16b,v3.16b,v23.16b
+ ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+
+ stp x5,x7,[x0,#0] // store output
+ add x28,x28,#4 // increment counter
+ stp x9,x11,[x0,#16]
+ add v27.4s,v27.4s,v31.4s // += 4
+ stp x13,x15,[x0,#32]
+ add v28.4s,v28.4s,v31.4s
+ stp x17,x20,[x0,#48]
+ add v29.4s,v29.4s,v31.4s
+ add x0,x0,#64
+
+ st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
+ ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
+
+ eor v4.16b,v4.16b,v20.16b
+ eor v5.16b,v5.16b,v21.16b
+ eor v6.16b,v6.16b,v22.16b
+ eor v7.16b,v7.16b,v23.16b
+ st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
+
+ eor v16.16b,v16.16b,v0.16b
+ eor v17.16b,v17.16b,v1.16b
+ eor v18.16b,v18.16b,v2.16b
+ eor v19.16b,v19.16b,v3.16b
+ st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
+
+ b.hi .Loop_outer_neon
+
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#64
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+ ret
+
+.Ltail_neon:
+ add x2,x2,#256
+ cmp x2,#64
+ b.lo .Less_than_64
+
+ add x5,x5,x6,lsl#32 // pack
+ add x7,x7,x8,lsl#32
+ ldp x6,x8,[x1,#0] // load input
+ add x9,x9,x10,lsl#32
+ add x11,x11,x12,lsl#32
+ ldp x10,x12,[x1,#16]
+ add x13,x13,x14,lsl#32
+ add x15,x15,x16,lsl#32
+ ldp x14,x16,[x1,#32]
+ add x17,x17,x19,lsl#32
+ add x20,x20,x21,lsl#32
+ ldp x19,x21,[x1,#48]
+ add x1,x1,#64
+#ifdef __ARMEB__
+ rev x5,x5
+ rev x7,x7
+ rev x9,x9
+ rev x11,x11
+ rev x13,x13
+ rev x15,x15
+ rev x17,x17
+ rev x20,x20
+#endif
+ eor x5,x5,x6
+ eor x7,x7,x8
+ eor x9,x9,x10
+ eor x11,x11,x12
+ eor x13,x13,x14
+ eor x15,x15,x16
+ eor x17,x17,x19
+ eor x20,x20,x21
+
+ stp x5,x7,[x0,#0] // store output
+ add x28,x28,#4 // increment counter
+ stp x9,x11,[x0,#16]
+ stp x13,x15,[x0,#32]
+ stp x17,x20,[x0,#48]
+ add x0,x0,#64
+ b.eq .Ldone_neon
+ sub x2,x2,#64
+ cmp x2,#64
+ b.lo .Less_than_128
+
+ ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+ eor v0.16b,v0.16b,v20.16b
+ eor v1.16b,v1.16b,v21.16b
+ eor v2.16b,v2.16b,v22.16b
+ eor v3.16b,v3.16b,v23.16b
+ st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
+ b.eq .Ldone_neon
+ sub x2,x2,#64
+ cmp x2,#64
+ b.lo .Less_than_192
+
+ ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+ eor v4.16b,v4.16b,v20.16b
+ eor v5.16b,v5.16b,v21.16b
+ eor v6.16b,v6.16b,v22.16b
+ eor v7.16b,v7.16b,v23.16b
+ st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
+ b.eq .Ldone_neon
+ sub x2,x2,#64
+
+ st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp]
+ b .Last_neon
+
+.Less_than_128:
+ st1 {v0.16b,v1.16b,v2.16b,v3.16b},[sp]
+ b .Last_neon
+.Less_than_192:
+ st1 {v4.16b,v5.16b,v6.16b,v7.16b},[sp]
+ b .Last_neon
+
+.align 4
+.Last_neon:
+ sub x0,x0,#1
+ add x1,x1,x2
+ add x0,x0,x2
+ add x4,sp,x2
+ neg x2,x2
+
+.Loop_tail_neon:
+ ldrb w10,[x1,x2]
+ ldrb w11,[x4,x2]
+ add x2,x2,#1
+ eor w10,w10,w11
+ strb w10,[x0,x2]
+ cbnz x2,.Loop_tail_neon
+
+ stp xzr,xzr,[sp,#0]
+ stp xzr,xzr,[sp,#16]
+ stp xzr,xzr,[sp,#32]
+ stp xzr,xzr,[sp,#48]
+
+.Ldone_neon:
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#64
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+ ret
+
+.L512_or_more_neon:
+ sub sp,sp,#128+64
+
+ ldp x22,x23,[x5] // load sigma
+ ld1 {v24.4s},[x5],#16
+ ldp x24,x25,[x3] // load key
+ ldp x26,x27,[x3,#16]
+ ld1 {v25.4s,v26.4s},[x3]
+ ldp x28,x30,[x4] // load counter
+ ld1 {v27.4s},[x4]
+ ld1 {v31.4s},[x5]
+#ifdef __ARMEB__
+ rev64 v24.4s,v24.4s
+ ror x24,x24,#32
+ ror x25,x25,#32
+ ror x26,x26,#32
+ ror x27,x27,#32
+ ror x28,x28,#32
+ ror x30,x30,#32
+#endif
+ add v27.4s,v27.4s,v31.4s // += 1
+ stp q24,q25,[sp,#0] // off-load key block, invariant part
+ add v27.4s,v27.4s,v31.4s // not typo
+ str q26,[sp,#32]
+ add v28.4s,v27.4s,v31.4s
+ add v29.4s,v28.4s,v31.4s
+ add v30.4s,v29.4s,v31.4s
+ shl v31.4s,v31.4s,#2 // 1 -> 4
+
+ stp d8,d9,[sp,#128+0] // meet ABI requirements
+ stp d10,d11,[sp,#128+16]
+ stp d12,d13,[sp,#128+32]
+ stp d14,d15,[sp,#128+48]
+
+ sub x2,x2,#512 // not typo
+
+.Loop_outer_512_neon:
+ mov v0.16b,v24.16b
+ mov v4.16b,v24.16b
+ mov v8.16b,v24.16b
+ mov v12.16b,v24.16b
+ mov v16.16b,v24.16b
+ mov v20.16b,v24.16b
+ mov v1.16b,v25.16b
+ mov w5,w22 // unpack key block
+ mov v5.16b,v25.16b
+ lsr x6,x22,#32
+ mov v9.16b,v25.16b
+ mov w7,w23
+ mov v13.16b,v25.16b
+ lsr x8,x23,#32
+ mov v17.16b,v25.16b
+ mov w9,w24
+ mov v21.16b,v25.16b
+ lsr x10,x24,#32
+ mov v3.16b,v27.16b
+ mov w11,w25
+ mov v7.16b,v28.16b
+ lsr x12,x25,#32
+ mov v11.16b,v29.16b
+ mov w13,w26
+ mov v15.16b,v30.16b
+ lsr x14,x26,#32
+ mov v2.16b,v26.16b
+ mov w15,w27
+ mov v6.16b,v26.16b
+ lsr x16,x27,#32
+ add v19.4s,v3.4s,v31.4s // +4
+ mov w17,w28
+ add v23.4s,v7.4s,v31.4s // +4
+ lsr x19,x28,#32
+ mov v10.16b,v26.16b
+ mov w20,w30
+ mov v14.16b,v26.16b
+ lsr x21,x30,#32
+ mov v18.16b,v26.16b
+ stp q27,q28,[sp,#48] // off-load key block, variable part
+ mov v22.16b,v26.16b
+ str q29,[sp,#80]
+
+ mov x4,#5
+ subs x2,x2,#512
+.Loop_upper_neon:
+ sub x4,x4,#1
+ add v0.4s,v0.4s,v1.4s
+ add w5,w5,w9
+ add v4.4s,v4.4s,v5.4s
+ add w6,w6,w10
+ add v8.4s,v8.4s,v9.4s
+ add w7,w7,w11
+ add v12.4s,v12.4s,v13.4s
+ add w8,w8,w12
+ add v16.4s,v16.4s,v17.4s
+ eor w17,w17,w5
+ add v20.4s,v20.4s,v21.4s
+ eor w19,w19,w6
+ eor v3.16b,v3.16b,v0.16b
+ eor w20,w20,w7
+ eor v7.16b,v7.16b,v4.16b
+ eor w21,w21,w8
+ eor v11.16b,v11.16b,v8.16b
+ ror w17,w17,#16
+ eor v15.16b,v15.16b,v12.16b
+ ror w19,w19,#16
+ eor v19.16b,v19.16b,v16.16b
+ ror w20,w20,#16
+ eor v23.16b,v23.16b,v20.16b
+ ror w21,w21,#16
+ rev32 v3.8h,v3.8h
+ add w13,w13,w17
+ rev32 v7.8h,v7.8h
+ add w14,w14,w19
+ rev32 v11.8h,v11.8h
+ add w15,w15,w20
+ rev32 v15.8h,v15.8h
+ add w16,w16,w21
+ rev32 v19.8h,v19.8h
+ eor w9,w9,w13
+ rev32 v23.8h,v23.8h
+ eor w10,w10,w14
+ add v2.4s,v2.4s,v3.4s
+ eor w11,w11,w15
+ add v6.4s,v6.4s,v7.4s
+ eor w12,w12,w16
+ add v10.4s,v10.4s,v11.4s
+ ror w9,w9,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w10,w10,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w11,w11,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w12,w12,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w9
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w10
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w11
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w12
+ eor v28.16b,v17.16b,v18.16b
+ eor w17,w17,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w19,w19,w6
+ ushr v1.4s,v24.4s,#20
+ eor w20,w20,w7
+ ushr v5.4s,v25.4s,#20
+ eor w21,w21,w8
+ ushr v9.4s,v26.4s,#20
+ ror w17,w17,#24
+ ushr v13.4s,v27.4s,#20
+ ror w19,w19,#24
+ ushr v17.4s,v28.4s,#20
+ ror w20,w20,#24
+ ushr v21.4s,v29.4s,#20
+ ror w21,w21,#24
+ sli v1.4s,v24.4s,#12
+ add w13,w13,w17
+ sli v5.4s,v25.4s,#12
+ add w14,w14,w19
+ sli v9.4s,v26.4s,#12
+ add w15,w15,w20
+ sli v13.4s,v27.4s,#12
+ add w16,w16,w21
+ sli v17.4s,v28.4s,#12
+ eor w9,w9,w13
+ sli v21.4s,v29.4s,#12
+ eor w10,w10,w14
+ add v0.4s,v0.4s,v1.4s
+ eor w11,w11,w15
+ add v4.4s,v4.4s,v5.4s
+ eor w12,w12,w16
+ add v8.4s,v8.4s,v9.4s
+ ror w9,w9,#25
+ add v12.4s,v12.4s,v13.4s
+ ror w10,w10,#25
+ add v16.4s,v16.4s,v17.4s
+ ror w11,w11,#25
+ add v20.4s,v20.4s,v21.4s
+ ror w12,w12,#25
+ eor v24.16b,v3.16b,v0.16b
+ add w5,w5,w10
+ eor v25.16b,v7.16b,v4.16b
+ add w6,w6,w11
+ eor v26.16b,v11.16b,v8.16b
+ add w7,w7,w12
+ eor v27.16b,v15.16b,v12.16b
+ add w8,w8,w9
+ eor v28.16b,v19.16b,v16.16b
+ eor w21,w21,w5
+ eor v29.16b,v23.16b,v20.16b
+ eor w17,w17,w6
+ ushr v3.4s,v24.4s,#24
+ eor w19,w19,w7
+ ushr v7.4s,v25.4s,#24
+ eor w20,w20,w8
+ ushr v11.4s,v26.4s,#24
+ ror w21,w21,#16
+ ushr v15.4s,v27.4s,#24
+ ror w17,w17,#16
+ ushr v19.4s,v28.4s,#24
+ ror w19,w19,#16
+ ushr v23.4s,v29.4s,#24
+ ror w20,w20,#16
+ sli v3.4s,v24.4s,#8
+ add w15,w15,w21
+ sli v7.4s,v25.4s,#8
+ add w16,w16,w17
+ sli v11.4s,v26.4s,#8
+ add w13,w13,w19
+ sli v15.4s,v27.4s,#8
+ add w14,w14,w20
+ sli v19.4s,v28.4s,#8
+ eor w10,w10,w15
+ sli v23.4s,v29.4s,#8
+ eor w11,w11,w16
+ add v2.4s,v2.4s,v3.4s
+ eor w12,w12,w13
+ add v6.4s,v6.4s,v7.4s
+ eor w9,w9,w14
+ add v10.4s,v10.4s,v11.4s
+ ror w10,w10,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w11,w11,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w12,w12,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w9,w9,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w10
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w11
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w12
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w9
+ eor v28.16b,v17.16b,v18.16b
+ eor w21,w21,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w17,w17,w6
+ ushr v1.4s,v24.4s,#25
+ eor w19,w19,w7
+ ushr v5.4s,v25.4s,#25
+ eor w20,w20,w8
+ ushr v9.4s,v26.4s,#25
+ ror w21,w21,#24
+ ushr v13.4s,v27.4s,#25
+ ror w17,w17,#24
+ ushr v17.4s,v28.4s,#25
+ ror w19,w19,#24
+ ushr v21.4s,v29.4s,#25
+ ror w20,w20,#24
+ sli v1.4s,v24.4s,#7
+ add w15,w15,w21
+ sli v5.4s,v25.4s,#7
+ add w16,w16,w17
+ sli v9.4s,v26.4s,#7
+ add w13,w13,w19
+ sli v13.4s,v27.4s,#7
+ add w14,w14,w20
+ sli v17.4s,v28.4s,#7
+ eor w10,w10,w15
+ sli v21.4s,v29.4s,#7
+ eor w11,w11,w16
+ ext v2.16b,v2.16b,v2.16b,#8
+ eor w12,w12,w13
+ ext v6.16b,v6.16b,v6.16b,#8
+ eor w9,w9,w14
+ ext v10.16b,v10.16b,v10.16b,#8
+ ror w10,w10,#25
+ ext v14.16b,v14.16b,v14.16b,#8
+ ror w11,w11,#25
+ ext v18.16b,v18.16b,v18.16b,#8
+ ror w12,w12,#25
+ ext v22.16b,v22.16b,v22.16b,#8
+ ror w9,w9,#25
+ ext v3.16b,v3.16b,v3.16b,#12
+ ext v7.16b,v7.16b,v7.16b,#12
+ ext v11.16b,v11.16b,v11.16b,#12
+ ext v15.16b,v15.16b,v15.16b,#12
+ ext v19.16b,v19.16b,v19.16b,#12
+ ext v23.16b,v23.16b,v23.16b,#12
+ ext v1.16b,v1.16b,v1.16b,#4
+ ext v5.16b,v5.16b,v5.16b,#4
+ ext v9.16b,v9.16b,v9.16b,#4
+ ext v13.16b,v13.16b,v13.16b,#4
+ ext v17.16b,v17.16b,v17.16b,#4
+ ext v21.16b,v21.16b,v21.16b,#4
+ add v0.4s,v0.4s,v1.4s
+ add w5,w5,w9
+ add v4.4s,v4.4s,v5.4s
+ add w6,w6,w10
+ add v8.4s,v8.4s,v9.4s
+ add w7,w7,w11
+ add v12.4s,v12.4s,v13.4s
+ add w8,w8,w12
+ add v16.4s,v16.4s,v17.4s
+ eor w17,w17,w5
+ add v20.4s,v20.4s,v21.4s
+ eor w19,w19,w6
+ eor v3.16b,v3.16b,v0.16b
+ eor w20,w20,w7
+ eor v7.16b,v7.16b,v4.16b
+ eor w21,w21,w8
+ eor v11.16b,v11.16b,v8.16b
+ ror w17,w17,#16
+ eor v15.16b,v15.16b,v12.16b
+ ror w19,w19,#16
+ eor v19.16b,v19.16b,v16.16b
+ ror w20,w20,#16
+ eor v23.16b,v23.16b,v20.16b
+ ror w21,w21,#16
+ rev32 v3.8h,v3.8h
+ add w13,w13,w17
+ rev32 v7.8h,v7.8h
+ add w14,w14,w19
+ rev32 v11.8h,v11.8h
+ add w15,w15,w20
+ rev32 v15.8h,v15.8h
+ add w16,w16,w21
+ rev32 v19.8h,v19.8h
+ eor w9,w9,w13
+ rev32 v23.8h,v23.8h
+ eor w10,w10,w14
+ add v2.4s,v2.4s,v3.4s
+ eor w11,w11,w15
+ add v6.4s,v6.4s,v7.4s
+ eor w12,w12,w16
+ add v10.4s,v10.4s,v11.4s
+ ror w9,w9,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w10,w10,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w11,w11,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w12,w12,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w9
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w10
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w11
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w12
+ eor v28.16b,v17.16b,v18.16b
+ eor w17,w17,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w19,w19,w6
+ ushr v1.4s,v24.4s,#20
+ eor w20,w20,w7
+ ushr v5.4s,v25.4s,#20
+ eor w21,w21,w8
+ ushr v9.4s,v26.4s,#20
+ ror w17,w17,#24
+ ushr v13.4s,v27.4s,#20
+ ror w19,w19,#24
+ ushr v17.4s,v28.4s,#20
+ ror w20,w20,#24
+ ushr v21.4s,v29.4s,#20
+ ror w21,w21,#24
+ sli v1.4s,v24.4s,#12
+ add w13,w13,w17
+ sli v5.4s,v25.4s,#12
+ add w14,w14,w19
+ sli v9.4s,v26.4s,#12
+ add w15,w15,w20
+ sli v13.4s,v27.4s,#12
+ add w16,w16,w21
+ sli v17.4s,v28.4s,#12
+ eor w9,w9,w13
+ sli v21.4s,v29.4s,#12
+ eor w10,w10,w14
+ add v0.4s,v0.4s,v1.4s
+ eor w11,w11,w15
+ add v4.4s,v4.4s,v5.4s
+ eor w12,w12,w16
+ add v8.4s,v8.4s,v9.4s
+ ror w9,w9,#25
+ add v12.4s,v12.4s,v13.4s
+ ror w10,w10,#25
+ add v16.4s,v16.4s,v17.4s
+ ror w11,w11,#25
+ add v20.4s,v20.4s,v21.4s
+ ror w12,w12,#25
+ eor v24.16b,v3.16b,v0.16b
+ add w5,w5,w10
+ eor v25.16b,v7.16b,v4.16b
+ add w6,w6,w11
+ eor v26.16b,v11.16b,v8.16b
+ add w7,w7,w12
+ eor v27.16b,v15.16b,v12.16b
+ add w8,w8,w9
+ eor v28.16b,v19.16b,v16.16b
+ eor w21,w21,w5
+ eor v29.16b,v23.16b,v20.16b
+ eor w17,w17,w6
+ ushr v3.4s,v24.4s,#24
+ eor w19,w19,w7
+ ushr v7.4s,v25.4s,#24
+ eor w20,w20,w8
+ ushr v11.4s,v26.4s,#24
+ ror w21,w21,#16
+ ushr v15.4s,v27.4s,#24
+ ror w17,w17,#16
+ ushr v19.4s,v28.4s,#24
+ ror w19,w19,#16
+ ushr v23.4s,v29.4s,#24
+ ror w20,w20,#16
+ sli v3.4s,v24.4s,#8
+ add w15,w15,w21
+ sli v7.4s,v25.4s,#8
+ add w16,w16,w17
+ sli v11.4s,v26.4s,#8
+ add w13,w13,w19
+ sli v15.4s,v27.4s,#8
+ add w14,w14,w20
+ sli v19.4s,v28.4s,#8
+ eor w10,w10,w15
+ sli v23.4s,v29.4s,#8
+ eor w11,w11,w16
+ add v2.4s,v2.4s,v3.4s
+ eor w12,w12,w13
+ add v6.4s,v6.4s,v7.4s
+ eor w9,w9,w14
+ add v10.4s,v10.4s,v11.4s
+ ror w10,w10,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w11,w11,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w12,w12,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w9,w9,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w10
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w11
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w12
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w9
+ eor v28.16b,v17.16b,v18.16b
+ eor w21,w21,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w17,w17,w6
+ ushr v1.4s,v24.4s,#25
+ eor w19,w19,w7
+ ushr v5.4s,v25.4s,#25
+ eor w20,w20,w8
+ ushr v9.4s,v26.4s,#25
+ ror w21,w21,#24
+ ushr v13.4s,v27.4s,#25
+ ror w17,w17,#24
+ ushr v17.4s,v28.4s,#25
+ ror w19,w19,#24
+ ushr v21.4s,v29.4s,#25
+ ror w20,w20,#24
+ sli v1.4s,v24.4s,#7
+ add w15,w15,w21
+ sli v5.4s,v25.4s,#7
+ add w16,w16,w17
+ sli v9.4s,v26.4s,#7
+ add w13,w13,w19
+ sli v13.4s,v27.4s,#7
+ add w14,w14,w20
+ sli v17.4s,v28.4s,#7
+ eor w10,w10,w15
+ sli v21.4s,v29.4s,#7
+ eor w11,w11,w16
+ ext v2.16b,v2.16b,v2.16b,#8
+ eor w12,w12,w13
+ ext v6.16b,v6.16b,v6.16b,#8
+ eor w9,w9,w14
+ ext v10.16b,v10.16b,v10.16b,#8
+ ror w10,w10,#25
+ ext v14.16b,v14.16b,v14.16b,#8
+ ror w11,w11,#25
+ ext v18.16b,v18.16b,v18.16b,#8
+ ror w12,w12,#25
+ ext v22.16b,v22.16b,v22.16b,#8
+ ror w9,w9,#25
+ ext v3.16b,v3.16b,v3.16b,#4
+ ext v7.16b,v7.16b,v7.16b,#4
+ ext v11.16b,v11.16b,v11.16b,#4
+ ext v15.16b,v15.16b,v15.16b,#4
+ ext v19.16b,v19.16b,v19.16b,#4
+ ext v23.16b,v23.16b,v23.16b,#4
+ ext v1.16b,v1.16b,v1.16b,#12
+ ext v5.16b,v5.16b,v5.16b,#12
+ ext v9.16b,v9.16b,v9.16b,#12
+ ext v13.16b,v13.16b,v13.16b,#12
+ ext v17.16b,v17.16b,v17.16b,#12
+ ext v21.16b,v21.16b,v21.16b,#12
+ cbnz x4,.Loop_upper_neon
+
+ add w5,w5,w22 // accumulate key block
+ add x6,x6,x22,lsr#32
+ add w7,w7,w23
+ add x8,x8,x23,lsr#32
+ add w9,w9,w24
+ add x10,x10,x24,lsr#32
+ add w11,w11,w25
+ add x12,x12,x25,lsr#32
+ add w13,w13,w26
+ add x14,x14,x26,lsr#32
+ add w15,w15,w27
+ add x16,x16,x27,lsr#32
+ add w17,w17,w28
+ add x19,x19,x28,lsr#32
+ add w20,w20,w30
+ add x21,x21,x30,lsr#32
+
+ add x5,x5,x6,lsl#32 // pack
+ add x7,x7,x8,lsl#32
+ ldp x6,x8,[x1,#0] // load input
+ add x9,x9,x10,lsl#32
+ add x11,x11,x12,lsl#32
+ ldp x10,x12,[x1,#16]
+ add x13,x13,x14,lsl#32
+ add x15,x15,x16,lsl#32
+ ldp x14,x16,[x1,#32]
+ add x17,x17,x19,lsl#32
+ add x20,x20,x21,lsl#32
+ ldp x19,x21,[x1,#48]
+ add x1,x1,#64
+#ifdef __ARMEB__
+ rev x5,x5
+ rev x7,x7
+ rev x9,x9
+ rev x11,x11
+ rev x13,x13
+ rev x15,x15
+ rev x17,x17
+ rev x20,x20
+#endif
+ eor x5,x5,x6
+ eor x7,x7,x8
+ eor x9,x9,x10
+ eor x11,x11,x12
+ eor x13,x13,x14
+ eor x15,x15,x16
+ eor x17,x17,x19
+ eor x20,x20,x21
+
+ stp x5,x7,[x0,#0] // store output
+ add x28,x28,#1 // increment counter
+ mov w5,w22 // unpack key block
+ lsr x6,x22,#32
+ stp x9,x11,[x0,#16]
+ mov w7,w23
+ lsr x8,x23,#32
+ stp x13,x15,[x0,#32]
+ mov w9,w24
+ lsr x10,x24,#32
+ stp x17,x20,[x0,#48]
+ add x0,x0,#64
+ mov w11,w25
+ lsr x12,x25,#32
+ mov w13,w26
+ lsr x14,x26,#32
+ mov w15,w27
+ lsr x16,x27,#32
+ mov w17,w28
+ lsr x19,x28,#32
+ mov w20,w30
+ lsr x21,x30,#32
+
+ mov x4,#5
+.Loop_lower_neon:
+ sub x4,x4,#1
+ add v0.4s,v0.4s,v1.4s
+ add w5,w5,w9
+ add v4.4s,v4.4s,v5.4s
+ add w6,w6,w10
+ add v8.4s,v8.4s,v9.4s
+ add w7,w7,w11
+ add v12.4s,v12.4s,v13.4s
+ add w8,w8,w12
+ add v16.4s,v16.4s,v17.4s
+ eor w17,w17,w5
+ add v20.4s,v20.4s,v21.4s
+ eor w19,w19,w6
+ eor v3.16b,v3.16b,v0.16b
+ eor w20,w20,w7
+ eor v7.16b,v7.16b,v4.16b
+ eor w21,w21,w8
+ eor v11.16b,v11.16b,v8.16b
+ ror w17,w17,#16
+ eor v15.16b,v15.16b,v12.16b
+ ror w19,w19,#16
+ eor v19.16b,v19.16b,v16.16b
+ ror w20,w20,#16
+ eor v23.16b,v23.16b,v20.16b
+ ror w21,w21,#16
+ rev32 v3.8h,v3.8h
+ add w13,w13,w17
+ rev32 v7.8h,v7.8h
+ add w14,w14,w19
+ rev32 v11.8h,v11.8h
+ add w15,w15,w20
+ rev32 v15.8h,v15.8h
+ add w16,w16,w21
+ rev32 v19.8h,v19.8h
+ eor w9,w9,w13
+ rev32 v23.8h,v23.8h
+ eor w10,w10,w14
+ add v2.4s,v2.4s,v3.4s
+ eor w11,w11,w15
+ add v6.4s,v6.4s,v7.4s
+ eor w12,w12,w16
+ add v10.4s,v10.4s,v11.4s
+ ror w9,w9,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w10,w10,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w11,w11,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w12,w12,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w9
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w10
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w11
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w12
+ eor v28.16b,v17.16b,v18.16b
+ eor w17,w17,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w19,w19,w6
+ ushr v1.4s,v24.4s,#20
+ eor w20,w20,w7
+ ushr v5.4s,v25.4s,#20
+ eor w21,w21,w8
+ ushr v9.4s,v26.4s,#20
+ ror w17,w17,#24
+ ushr v13.4s,v27.4s,#20
+ ror w19,w19,#24
+ ushr v17.4s,v28.4s,#20
+ ror w20,w20,#24
+ ushr v21.4s,v29.4s,#20
+ ror w21,w21,#24
+ sli v1.4s,v24.4s,#12
+ add w13,w13,w17
+ sli v5.4s,v25.4s,#12
+ add w14,w14,w19
+ sli v9.4s,v26.4s,#12
+ add w15,w15,w20
+ sli v13.4s,v27.4s,#12
+ add w16,w16,w21
+ sli v17.4s,v28.4s,#12
+ eor w9,w9,w13
+ sli v21.4s,v29.4s,#12
+ eor w10,w10,w14
+ add v0.4s,v0.4s,v1.4s
+ eor w11,w11,w15
+ add v4.4s,v4.4s,v5.4s
+ eor w12,w12,w16
+ add v8.4s,v8.4s,v9.4s
+ ror w9,w9,#25
+ add v12.4s,v12.4s,v13.4s
+ ror w10,w10,#25
+ add v16.4s,v16.4s,v17.4s
+ ror w11,w11,#25
+ add v20.4s,v20.4s,v21.4s
+ ror w12,w12,#25
+ eor v24.16b,v3.16b,v0.16b
+ add w5,w5,w10
+ eor v25.16b,v7.16b,v4.16b
+ add w6,w6,w11
+ eor v26.16b,v11.16b,v8.16b
+ add w7,w7,w12
+ eor v27.16b,v15.16b,v12.16b
+ add w8,w8,w9
+ eor v28.16b,v19.16b,v16.16b
+ eor w21,w21,w5
+ eor v29.16b,v23.16b,v20.16b
+ eor w17,w17,w6
+ ushr v3.4s,v24.4s,#24
+ eor w19,w19,w7
+ ushr v7.4s,v25.4s,#24
+ eor w20,w20,w8
+ ushr v11.4s,v26.4s,#24
+ ror w21,w21,#16
+ ushr v15.4s,v27.4s,#24
+ ror w17,w17,#16
+ ushr v19.4s,v28.4s,#24
+ ror w19,w19,#16
+ ushr v23.4s,v29.4s,#24
+ ror w20,w20,#16
+ sli v3.4s,v24.4s,#8
+ add w15,w15,w21
+ sli v7.4s,v25.4s,#8
+ add w16,w16,w17
+ sli v11.4s,v26.4s,#8
+ add w13,w13,w19
+ sli v15.4s,v27.4s,#8
+ add w14,w14,w20
+ sli v19.4s,v28.4s,#8
+ eor w10,w10,w15
+ sli v23.4s,v29.4s,#8
+ eor w11,w11,w16
+ add v2.4s,v2.4s,v3.4s
+ eor w12,w12,w13
+ add v6.4s,v6.4s,v7.4s
+ eor w9,w9,w14
+ add v10.4s,v10.4s,v11.4s
+ ror w10,w10,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w11,w11,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w12,w12,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w9,w9,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w10
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w11
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w12
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w9
+ eor v28.16b,v17.16b,v18.16b
+ eor w21,w21,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w17,w17,w6
+ ushr v1.4s,v24.4s,#25
+ eor w19,w19,w7
+ ushr v5.4s,v25.4s,#25
+ eor w20,w20,w8
+ ushr v9.4s,v26.4s,#25
+ ror w21,w21,#24
+ ushr v13.4s,v27.4s,#25
+ ror w17,w17,#24
+ ushr v17.4s,v28.4s,#25
+ ror w19,w19,#24
+ ushr v21.4s,v29.4s,#25
+ ror w20,w20,#24
+ sli v1.4s,v24.4s,#7
+ add w15,w15,w21
+ sli v5.4s,v25.4s,#7
+ add w16,w16,w17
+ sli v9.4s,v26.4s,#7
+ add w13,w13,w19
+ sli v13.4s,v27.4s,#7
+ add w14,w14,w20
+ sli v17.4s,v28.4s,#7
+ eor w10,w10,w15
+ sli v21.4s,v29.4s,#7
+ eor w11,w11,w16
+ ext v2.16b,v2.16b,v2.16b,#8
+ eor w12,w12,w13
+ ext v6.16b,v6.16b,v6.16b,#8
+ eor w9,w9,w14
+ ext v10.16b,v10.16b,v10.16b,#8
+ ror w10,w10,#25
+ ext v14.16b,v14.16b,v14.16b,#8
+ ror w11,w11,#25
+ ext v18.16b,v18.16b,v18.16b,#8
+ ror w12,w12,#25
+ ext v22.16b,v22.16b,v22.16b,#8
+ ror w9,w9,#25
+ ext v3.16b,v3.16b,v3.16b,#12
+ ext v7.16b,v7.16b,v7.16b,#12
+ ext v11.16b,v11.16b,v11.16b,#12
+ ext v15.16b,v15.16b,v15.16b,#12
+ ext v19.16b,v19.16b,v19.16b,#12
+ ext v23.16b,v23.16b,v23.16b,#12
+ ext v1.16b,v1.16b,v1.16b,#4
+ ext v5.16b,v5.16b,v5.16b,#4
+ ext v9.16b,v9.16b,v9.16b,#4
+ ext v13.16b,v13.16b,v13.16b,#4
+ ext v17.16b,v17.16b,v17.16b,#4
+ ext v21.16b,v21.16b,v21.16b,#4
+ add v0.4s,v0.4s,v1.4s
+ add w5,w5,w9
+ add v4.4s,v4.4s,v5.4s
+ add w6,w6,w10
+ add v8.4s,v8.4s,v9.4s
+ add w7,w7,w11
+ add v12.4s,v12.4s,v13.4s
+ add w8,w8,w12
+ add v16.4s,v16.4s,v17.4s
+ eor w17,w17,w5
+ add v20.4s,v20.4s,v21.4s
+ eor w19,w19,w6
+ eor v3.16b,v3.16b,v0.16b
+ eor w20,w20,w7
+ eor v7.16b,v7.16b,v4.16b
+ eor w21,w21,w8
+ eor v11.16b,v11.16b,v8.16b
+ ror w17,w17,#16
+ eor v15.16b,v15.16b,v12.16b
+ ror w19,w19,#16
+ eor v19.16b,v19.16b,v16.16b
+ ror w20,w20,#16
+ eor v23.16b,v23.16b,v20.16b
+ ror w21,w21,#16
+ rev32 v3.8h,v3.8h
+ add w13,w13,w17
+ rev32 v7.8h,v7.8h
+ add w14,w14,w19
+ rev32 v11.8h,v11.8h
+ add w15,w15,w20
+ rev32 v15.8h,v15.8h
+ add w16,w16,w21
+ rev32 v19.8h,v19.8h
+ eor w9,w9,w13
+ rev32 v23.8h,v23.8h
+ eor w10,w10,w14
+ add v2.4s,v2.4s,v3.4s
+ eor w11,w11,w15
+ add v6.4s,v6.4s,v7.4s
+ eor w12,w12,w16
+ add v10.4s,v10.4s,v11.4s
+ ror w9,w9,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w10,w10,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w11,w11,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w12,w12,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w9
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w10
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w11
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w12
+ eor v28.16b,v17.16b,v18.16b
+ eor w17,w17,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w19,w19,w6
+ ushr v1.4s,v24.4s,#20
+ eor w20,w20,w7
+ ushr v5.4s,v25.4s,#20
+ eor w21,w21,w8
+ ushr v9.4s,v26.4s,#20
+ ror w17,w17,#24
+ ushr v13.4s,v27.4s,#20
+ ror w19,w19,#24
+ ushr v17.4s,v28.4s,#20
+ ror w20,w20,#24
+ ushr v21.4s,v29.4s,#20
+ ror w21,w21,#24
+ sli v1.4s,v24.4s,#12
+ add w13,w13,w17
+ sli v5.4s,v25.4s,#12
+ add w14,w14,w19
+ sli v9.4s,v26.4s,#12
+ add w15,w15,w20
+ sli v13.4s,v27.4s,#12
+ add w16,w16,w21
+ sli v17.4s,v28.4s,#12
+ eor w9,w9,w13
+ sli v21.4s,v29.4s,#12
+ eor w10,w10,w14
+ add v0.4s,v0.4s,v1.4s
+ eor w11,w11,w15
+ add v4.4s,v4.4s,v5.4s
+ eor w12,w12,w16
+ add v8.4s,v8.4s,v9.4s
+ ror w9,w9,#25
+ add v12.4s,v12.4s,v13.4s
+ ror w10,w10,#25
+ add v16.4s,v16.4s,v17.4s
+ ror w11,w11,#25
+ add v20.4s,v20.4s,v21.4s
+ ror w12,w12,#25
+ eor v24.16b,v3.16b,v0.16b
+ add w5,w5,w10
+ eor v25.16b,v7.16b,v4.16b
+ add w6,w6,w11
+ eor v26.16b,v11.16b,v8.16b
+ add w7,w7,w12
+ eor v27.16b,v15.16b,v12.16b
+ add w8,w8,w9
+ eor v28.16b,v19.16b,v16.16b
+ eor w21,w21,w5
+ eor v29.16b,v23.16b,v20.16b
+ eor w17,w17,w6
+ ushr v3.4s,v24.4s,#24
+ eor w19,w19,w7
+ ushr v7.4s,v25.4s,#24
+ eor w20,w20,w8
+ ushr v11.4s,v26.4s,#24
+ ror w21,w21,#16
+ ushr v15.4s,v27.4s,#24
+ ror w17,w17,#16
+ ushr v19.4s,v28.4s,#24
+ ror w19,w19,#16
+ ushr v23.4s,v29.4s,#24
+ ror w20,w20,#16
+ sli v3.4s,v24.4s,#8
+ add w15,w15,w21
+ sli v7.4s,v25.4s,#8
+ add w16,w16,w17
+ sli v11.4s,v26.4s,#8
+ add w13,w13,w19
+ sli v15.4s,v27.4s,#8
+ add w14,w14,w20
+ sli v19.4s,v28.4s,#8
+ eor w10,w10,w15
+ sli v23.4s,v29.4s,#8
+ eor w11,w11,w16
+ add v2.4s,v2.4s,v3.4s
+ eor w12,w12,w13
+ add v6.4s,v6.4s,v7.4s
+ eor w9,w9,w14
+ add v10.4s,v10.4s,v11.4s
+ ror w10,w10,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w11,w11,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w12,w12,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w9,w9,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w10
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w11
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w12
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w9
+ eor v28.16b,v17.16b,v18.16b
+ eor w21,w21,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w17,w17,w6
+ ushr v1.4s,v24.4s,#25
+ eor w19,w19,w7
+ ushr v5.4s,v25.4s,#25
+ eor w20,w20,w8
+ ushr v9.4s,v26.4s,#25
+ ror w21,w21,#24
+ ushr v13.4s,v27.4s,#25
+ ror w17,w17,#24
+ ushr v17.4s,v28.4s,#25
+ ror w19,w19,#24
+ ushr v21.4s,v29.4s,#25
+ ror w20,w20,#24
+ sli v1.4s,v24.4s,#7
+ add w15,w15,w21
+ sli v5.4s,v25.4s,#7
+ add w16,w16,w17
+ sli v9.4s,v26.4s,#7
+ add w13,w13,w19
+ sli v13.4s,v27.4s,#7
+ add w14,w14,w20
+ sli v17.4s,v28.4s,#7
+ eor w10,w10,w15
+ sli v21.4s,v29.4s,#7
+ eor w11,w11,w16
+ ext v2.16b,v2.16b,v2.16b,#8
+ eor w12,w12,w13
+ ext v6.16b,v6.16b,v6.16b,#8
+ eor w9,w9,w14
+ ext v10.16b,v10.16b,v10.16b,#8
+ ror w10,w10,#25
+ ext v14.16b,v14.16b,v14.16b,#8
+ ror w11,w11,#25
+ ext v18.16b,v18.16b,v18.16b,#8
+ ror w12,w12,#25
+ ext v22.16b,v22.16b,v22.16b,#8
+ ror w9,w9,#25
+ ext v3.16b,v3.16b,v3.16b,#4
+ ext v7.16b,v7.16b,v7.16b,#4
+ ext v11.16b,v11.16b,v11.16b,#4
+ ext v15.16b,v15.16b,v15.16b,#4
+ ext v19.16b,v19.16b,v19.16b,#4
+ ext v23.16b,v23.16b,v23.16b,#4
+ ext v1.16b,v1.16b,v1.16b,#12
+ ext v5.16b,v5.16b,v5.16b,#12
+ ext v9.16b,v9.16b,v9.16b,#12
+ ext v13.16b,v13.16b,v13.16b,#12
+ ext v17.16b,v17.16b,v17.16b,#12
+ ext v21.16b,v21.16b,v21.16b,#12
+ cbnz x4,.Loop_lower_neon
+
+ add w5,w5,w22 // accumulate key block
+ ldp q24,q25,[sp,#0]
+ add x6,x6,x22,lsr#32
+ ldp q26,q27,[sp,#32]
+ add w7,w7,w23
+ ldp q28,q29,[sp,#64]
+ add x8,x8,x23,lsr#32
+ add v0.4s,v0.4s,v24.4s
+ add w9,w9,w24
+ add v4.4s,v4.4s,v24.4s
+ add x10,x10,x24,lsr#32
+ add v8.4s,v8.4s,v24.4s
+ add w11,w11,w25
+ add v12.4s,v12.4s,v24.4s
+ add x12,x12,x25,lsr#32
+ add v16.4s,v16.4s,v24.4s
+ add w13,w13,w26
+ add v20.4s,v20.4s,v24.4s
+ add x14,x14,x26,lsr#32
+ add v2.4s,v2.4s,v26.4s
+ add w15,w15,w27
+ add v6.4s,v6.4s,v26.4s
+ add x16,x16,x27,lsr#32
+ add v10.4s,v10.4s,v26.4s
+ add w17,w17,w28
+ add v14.4s,v14.4s,v26.4s
+ add x19,x19,x28,lsr#32
+ add v18.4s,v18.4s,v26.4s
+ add w20,w20,w30
+ add v22.4s,v22.4s,v26.4s
+ add x21,x21,x30,lsr#32
+ add v19.4s,v19.4s,v31.4s // +4
+ add x5,x5,x6,lsl#32 // pack
+ add v23.4s,v23.4s,v31.4s // +4
+ add x7,x7,x8,lsl#32
+ add v3.4s,v3.4s,v27.4s
+ ldp x6,x8,[x1,#0] // load input
+ add v7.4s,v7.4s,v28.4s
+ add x9,x9,x10,lsl#32
+ add v11.4s,v11.4s,v29.4s
+ add x11,x11,x12,lsl#32
+ add v15.4s,v15.4s,v30.4s
+ ldp x10,x12,[x1,#16]
+ add v19.4s,v19.4s,v27.4s
+ add x13,x13,x14,lsl#32
+ add v23.4s,v23.4s,v28.4s
+ add x15,x15,x16,lsl#32
+ add v1.4s,v1.4s,v25.4s
+ ldp x14,x16,[x1,#32]
+ add v5.4s,v5.4s,v25.4s
+ add x17,x17,x19,lsl#32
+ add v9.4s,v9.4s,v25.4s
+ add x20,x20,x21,lsl#32
+ add v13.4s,v13.4s,v25.4s
+ ldp x19,x21,[x1,#48]
+ add v17.4s,v17.4s,v25.4s
+ add x1,x1,#64
+ add v21.4s,v21.4s,v25.4s
+
+#ifdef __ARMEB__
+ rev x5,x5
+ rev x7,x7
+ rev x9,x9
+ rev x11,x11
+ rev x13,x13
+ rev x15,x15
+ rev x17,x17
+ rev x20,x20
+#endif
+ ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
+ eor x5,x5,x6
+ eor x7,x7,x8
+ eor x9,x9,x10
+ eor x11,x11,x12
+ eor x13,x13,x14
+ eor v0.16b,v0.16b,v24.16b
+ eor x15,x15,x16
+ eor v1.16b,v1.16b,v25.16b
+ eor x17,x17,x19
+ eor v2.16b,v2.16b,v26.16b
+ eor x20,x20,x21
+ eor v3.16b,v3.16b,v27.16b
+ ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
+
+ stp x5,x7,[x0,#0] // store output
+ add x28,x28,#7 // increment counter
+ stp x9,x11,[x0,#16]
+ stp x13,x15,[x0,#32]
+ stp x17,x20,[x0,#48]
+ add x0,x0,#64
+ st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
+
+ ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
+ eor v4.16b,v4.16b,v24.16b
+ eor v5.16b,v5.16b,v25.16b
+ eor v6.16b,v6.16b,v26.16b
+ eor v7.16b,v7.16b,v27.16b
+ st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
+
+ ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
+ eor v8.16b,v8.16b,v0.16b
+ ldp q24,q25,[sp,#0]
+ eor v9.16b,v9.16b,v1.16b
+ ldp q26,q27,[sp,#32]
+ eor v10.16b,v10.16b,v2.16b
+ eor v11.16b,v11.16b,v3.16b
+ st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64
+
+ ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64
+ eor v12.16b,v12.16b,v4.16b
+ eor v13.16b,v13.16b,v5.16b
+ eor v14.16b,v14.16b,v6.16b
+ eor v15.16b,v15.16b,v7.16b
+ st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64
+
+ ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64
+ eor v16.16b,v16.16b,v8.16b
+ eor v17.16b,v17.16b,v9.16b
+ eor v18.16b,v18.16b,v10.16b
+ eor v19.16b,v19.16b,v11.16b
+ st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
+
+ shl v0.4s,v31.4s,#1 // 4 -> 8
+ eor v20.16b,v20.16b,v12.16b
+ eor v21.16b,v21.16b,v13.16b
+ eor v22.16b,v22.16b,v14.16b
+ eor v23.16b,v23.16b,v15.16b
+ st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
+
+ add v27.4s,v27.4s,v0.4s // += 8
+ add v28.4s,v28.4s,v0.4s
+ add v29.4s,v29.4s,v0.4s
+ add v30.4s,v30.4s,v0.4s
+
+ b.hs .Loop_outer_512_neon
+
+ adds x2,x2,#512
+ ushr v0.4s,v31.4s,#2 // 4 -> 1
+
+ ldp d8,d9,[sp,#128+0] // meet ABI requirements
+ ldp d10,d11,[sp,#128+16]
+ ldp d12,d13,[sp,#128+32]
+ ldp d14,d15,[sp,#128+48]
+
+ stp q24,q31,[sp,#0] // wipe off-load area
+ stp q24,q31,[sp,#32]
+ stp q24,q31,[sp,#64]
+
+ b.eq .Ldone_512_neon
+
+ cmp x2,#192
+ sub v27.4s,v27.4s,v0.4s // -= 1
+ sub v28.4s,v28.4s,v0.4s
+ sub v29.4s,v29.4s,v0.4s
+ add sp,sp,#128
+ b.hs .Loop_outer_neon
+
+ eor v25.16b,v25.16b,v25.16b
+ eor v26.16b,v26.16b,v26.16b
+ eor v27.16b,v27.16b,v27.16b
+ eor v28.16b,v28.16b,v28.16b
+ eor v29.16b,v29.16b,v29.16b
+ eor v30.16b,v30.16b,v30.16b
+ b .Loop_outer
+
+.Ldone_512_neon:
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#128+64
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+.Labort_neon:
+ ret
+ENDPROC(chacha20_neon)
--
2.19.0
^ permalink raw reply related
* [PATCH net-next v4 03/20] zinc: ChaCha20 generic C implementation
From: Jason A. Donenfeld @ 2018-09-14 16:22 UTC (permalink / raw)
To: linux-kernel, netdev, linux-crypto, davem, gregkh
Cc: Jason A. Donenfeld, Samuel Neves, Andy Lutomirski,
Jean-Philippe Aumasson
In-Reply-To: <20180914162240.7925-1-Jason@zx2c4.com>
This implements the ChaCha20 permutation as a single C statement, by way
of the comma operator, which the compiler is able to simplify
terrifically.
Information: https://cr.yp.to/chacha.html
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Cc: Samuel Neves <sneves@dei.uc.pt>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Jean-Philippe Aumasson <jeanphilippe.aumasson@gmail.com>
---
include/zinc/chacha20.h | 54 +++++++++++
lib/zinc/Kconfig | 5 ++
lib/zinc/Makefile | 4 +
lib/zinc/chacha20/chacha20.c | 168 +++++++++++++++++++++++++++++++++++
lib/zinc/main.c | 5 ++
5 files changed, 236 insertions(+)
create mode 100644 include/zinc/chacha20.h
create mode 100644 lib/zinc/chacha20/chacha20.c
diff --git a/include/zinc/chacha20.h b/include/zinc/chacha20.h
new file mode 100644
index 000000000000..3c2c2f72d88a
--- /dev/null
+++ b/include/zinc/chacha20.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#ifndef _ZINC_CHACHA20_H
+#define _ZINC_CHACHA20_H
+
+#include <asm/unaligned.h>
+#include <linux/simd.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+
+enum {
+ CHACHA20_IV_SIZE = 16,
+ CHACHA20_KEY_SIZE = 32,
+ CHACHA20_BLOCK_SIZE = 64,
+ CHACHA20_BLOCK_WORDS = CHACHA20_BLOCK_SIZE / sizeof(u32),
+ HCHACHA20_KEY_SIZE = 32,
+ HCHACHA20_NONCE_SIZE = 16
+};
+
+struct chacha20_ctx {
+ u32 key[8];
+ u32 counter[4];
+} __aligned(32);
+
+void chacha20_fpu_init(void);
+
+static inline void chacha20_init(struct chacha20_ctx *state,
+ const u8 key[CHACHA20_KEY_SIZE],
+ const u64 nonce)
+{
+ state->key[0] = get_unaligned_le32(key + 0);
+ state->key[1] = get_unaligned_le32(key + 4);
+ state->key[2] = get_unaligned_le32(key + 8);
+ state->key[3] = get_unaligned_le32(key + 12);
+ state->key[4] = get_unaligned_le32(key + 16);
+ state->key[5] = get_unaligned_le32(key + 20);
+ state->key[6] = get_unaligned_le32(key + 24);
+ state->key[7] = get_unaligned_le32(key + 28);
+ state->counter[0] = state->counter[1] = 0;
+ state->counter[2] = nonce & U32_MAX;
+ state->counter[3] = nonce >> 32;
+}
+void chacha20(struct chacha20_ctx *state, u8 *dst, const u8 *src, u32 len,
+ simd_context_t simd_context);
+
+/* Derived key should be 32-bit aligned */
+void hchacha20(u8 derived_key[CHACHA20_KEY_SIZE],
+ const u8 nonce[HCHACHA20_NONCE_SIZE],
+ const u8 key[HCHACHA20_KEY_SIZE], simd_context_t simd_context);
+
+#endif /* _ZINC_CHACHA20_H */
diff --git a/lib/zinc/Kconfig b/lib/zinc/Kconfig
index 5980c411af0d..e7d396d61607 100644
--- a/lib/zinc/Kconfig
+++ b/lib/zinc/Kconfig
@@ -1,6 +1,11 @@
config ZINC
tristate
+config ZINC_CHACHA20
+ bool
+ select ZINC
+ select CRYPTO_ALGAPI
+
config ZINC_DEBUG
bool "Zinc cryptography library debugging and self-tests"
depends on ZINC
diff --git a/lib/zinc/Makefile b/lib/zinc/Makefile
index dad47573de42..0b5a964bfba6 100644
--- a/lib/zinc/Makefile
+++ b/lib/zinc/Makefile
@@ -3,6 +3,10 @@ ccflags-y += -Wframe-larger-than=8192
ccflags-y += -D'pr_fmt(fmt)=KBUILD_MODNAME ": " fmt'
ccflags-$(CONFIG_ZINC_DEBUG) += -DDEBUG
+ifeq ($(CONFIG_ZINC_CHACHA20),y)
+zinc-y += chacha20/chacha20.o
+endif
+
zinc-y += main.o
obj-$(CONFIG_ZINC) := zinc.o
diff --git a/lib/zinc/chacha20/chacha20.c b/lib/zinc/chacha20/chacha20.c
new file mode 100644
index 000000000000..1d9168e6c142
--- /dev/null
+++ b/lib/zinc/chacha20/chacha20.c
@@ -0,0 +1,168 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ *
+ * Implementation of the ChaCha20 stream cipher.
+ *
+ * Information: https://cr.yp.to/chacha.html
+ */
+
+#include <zinc/chacha20.h>
+
+#include <linux/kernel.h>
+#include <crypto/algapi.h>
+
+#ifndef HAVE_CHACHA20_ARCH_IMPLEMENTATION
+void __init chacha20_fpu_init(void)
+{
+}
+static inline bool chacha20_arch(u8 *out, const u8 *in, const size_t len,
+ const u32 key[8], const u32 counter[4],
+ simd_context_t simd_context)
+{
+ return false;
+}
+static inline bool hchacha20_arch(u8 *derived_key, const u8 *nonce,
+ const u8 *key, simd_context_t simd_context)
+{
+ return false;
+}
+#endif
+
+#define EXPAND_32_BYTE_K 0x61707865U, 0x3320646eU, 0x79622d32U, 0x6b206574U
+
+#define QUARTER_ROUND(x, a, b, c, d) ( \
+ x[a] += x[b], \
+ x[d] = rol32((x[d] ^ x[a]), 16), \
+ x[c] += x[d], \
+ x[b] = rol32((x[b] ^ x[c]), 12), \
+ x[a] += x[b], \
+ x[d] = rol32((x[d] ^ x[a]), 8), \
+ x[c] += x[d], \
+ x[b] = rol32((x[b] ^ x[c]), 7) \
+)
+
+#define C(i, j) (i * 4 + j)
+
+#define DOUBLE_ROUND(x) ( \
+ /* Column Round */ \
+ QUARTER_ROUND(x, C(0, 0), C(1, 0), C(2, 0), C(3, 0)), \
+ QUARTER_ROUND(x, C(0, 1), C(1, 1), C(2, 1), C(3, 1)), \
+ QUARTER_ROUND(x, C(0, 2), C(1, 2), C(2, 2), C(3, 2)), \
+ QUARTER_ROUND(x, C(0, 3), C(1, 3), C(2, 3), C(3, 3)), \
+ /* Diagonal Round */ \
+ QUARTER_ROUND(x, C(0, 0), C(1, 1), C(2, 2), C(3, 3)), \
+ QUARTER_ROUND(x, C(0, 1), C(1, 2), C(2, 3), C(3, 0)), \
+ QUARTER_ROUND(x, C(0, 2), C(1, 3), C(2, 0), C(3, 1)), \
+ QUARTER_ROUND(x, C(0, 3), C(1, 0), C(2, 1), C(3, 2)) \
+)
+
+#define TWENTY_ROUNDS(x) ( \
+ DOUBLE_ROUND(x), \
+ DOUBLE_ROUND(x), \
+ DOUBLE_ROUND(x), \
+ DOUBLE_ROUND(x), \
+ DOUBLE_ROUND(x), \
+ DOUBLE_ROUND(x), \
+ DOUBLE_ROUND(x), \
+ DOUBLE_ROUND(x), \
+ DOUBLE_ROUND(x), \
+ DOUBLE_ROUND(x) \
+)
+
+static void chacha20_block_generic(__le32 *stream, u32 *state)
+{
+ u32 x[CHACHA20_BLOCK_WORDS];
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(x); ++i)
+ x[i] = state[i];
+
+ TWENTY_ROUNDS(x);
+
+ for (i = 0; i < ARRAY_SIZE(x); ++i)
+ stream[i] = cpu_to_le32(x[i] + state[i]);
+
+ ++state[12];
+}
+
+static void chacha20_generic(u8 *out, const u8 *in, u32 len, const u32 key[8],
+ const u32 counter[4])
+{
+ __le32 buf[CHACHA20_BLOCK_WORDS];
+ u32 x[] = {
+ EXPAND_32_BYTE_K,
+ key[0], key[1], key[2], key[3],
+ key[4], key[5], key[6], key[7],
+ counter[0], counter[1], counter[2], counter[3]
+ };
+
+ if (out != in)
+ memmove(out, in, len);
+
+ while (len >= CHACHA20_BLOCK_SIZE) {
+ chacha20_block_generic(buf, x);
+ crypto_xor(out, (u8 *)buf, CHACHA20_BLOCK_SIZE);
+ len -= CHACHA20_BLOCK_SIZE;
+ out += CHACHA20_BLOCK_SIZE;
+ }
+ if (len) {
+ chacha20_block_generic(buf, x);
+ crypto_xor(out, (u8 *)buf, len);
+ }
+}
+
+void chacha20(struct chacha20_ctx *state, u8 *dst, const u8 *src, u32 len,
+ simd_context_t simd_context)
+{
+ if (!chacha20_arch(dst, src, len, state->key, state->counter,
+ simd_context))
+ chacha20_generic(dst, src, len, state->key, state->counter);
+ state->counter[0] += (len + 63) / 64;
+}
+EXPORT_SYMBOL(chacha20);
+
+static void hchacha20_generic(u8 derived_key[CHACHA20_KEY_SIZE],
+ const u8 nonce[HCHACHA20_NONCE_SIZE],
+ const u8 key[HCHACHA20_KEY_SIZE])
+{
+ __le32 *out = (__force __le32 *)derived_key;
+ u32 x[] = { EXPAND_32_BYTE_K,
+ get_unaligned_le32(key + 0),
+ get_unaligned_le32(key + 4),
+ get_unaligned_le32(key + 8),
+ get_unaligned_le32(key + 12),
+ get_unaligned_le32(key + 16),
+ get_unaligned_le32(key + 20),
+ get_unaligned_le32(key + 24),
+ get_unaligned_le32(key + 28),
+ get_unaligned_le32(nonce + 0),
+ get_unaligned_le32(nonce + 4),
+ get_unaligned_le32(nonce + 8),
+ get_unaligned_le32(nonce + 12)
+ };
+
+ TWENTY_ROUNDS(x);
+
+ out[0] = cpu_to_le32(x[0]);
+ out[1] = cpu_to_le32(x[1]);
+ out[2] = cpu_to_le32(x[2]);
+ out[3] = cpu_to_le32(x[3]);
+ out[4] = cpu_to_le32(x[12]);
+ out[5] = cpu_to_le32(x[13]);
+ out[6] = cpu_to_le32(x[14]);
+ out[7] = cpu_to_le32(x[15]);
+}
+
+/* Derived key should be 32-bit aligned */
+void hchacha20(u8 derived_key[CHACHA20_KEY_SIZE],
+ const u8 nonce[HCHACHA20_NONCE_SIZE],
+ const u8 key[HCHACHA20_KEY_SIZE], simd_context_t simd_context)
+{
+ if (!hchacha20_arch(derived_key, nonce, key, simd_context))
+ hchacha20_generic(derived_key, nonce, key);
+}
+/* Deliberately not EXPORT_SYMBOL'd, since there are few reasons why somebody
+ * should be using this directly, rather than via xchacha20. Revisit only in
+ * the unlikely event that somebody has a good reason to export this.
+ */
diff --git a/lib/zinc/main.c b/lib/zinc/main.c
index ceece33ff5a7..7e8e84b706b7 100644
--- a/lib/zinc/main.c
+++ b/lib/zinc/main.c
@@ -3,6 +3,8 @@
* Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
*/
+#include <zinc/chacha20.h>
+
#include <linux/init.h>
#include <linux/module.h>
@@ -17,6 +19,9 @@
static int __init mod_init(void)
{
+#ifdef CONFIG_ZINC_CHACHA20
+ chacha20_fpu_init();
+#endif
return 0;
}
--
2.19.0
^ permalink raw reply related
* [PATCH net-next v4 02/20] zinc: introduce minimal cryptography library
From: Jason A. Donenfeld @ 2018-09-14 16:22 UTC (permalink / raw)
To: linux-kernel, netdev, linux-crypto, davem, gregkh
Cc: Jason A. Donenfeld, Samuel Neves, Andy Lutomirski,
Jean-Philippe Aumasson
In-Reply-To: <20180914162240.7925-1-Jason@zx2c4.com>
Zinc stands for "Zinc Is Neat Crypto" or "Zinc as IN Crypto" or maybe
just "Zx2c4's INsane Cryptolib." It's also short, easy to type, and
plays nicely with the recent trend of naming crypto libraries after
elements. The guiding principle is "don't overdo it". It's less of a
library and more of a directory tree for organizing well-curated direct
implementations of cryptography primitives.
Zinc is a new cryptography API that is much more minimal and lower-level
than the current one. It intends to complement it and provide a basis
upon which the current crypto API might build, as the provider of
software implementations of cryptographic primitives. It is motivated by
three primary observations in crypto API design:
* Highly composable "cipher modes" and related abstractions from
90s cryptographers did not turn out to be as terrific an idea as
hoped, leading to a host of API misuse problems.
* Most programmers are afraid of crypto code, and so prefer to
integrate it into libraries in a highly abstracted manner, so as to
shield themselves from implementation details. Cryptographers, on
the other hand, prefer simple direct implementations, which they're
able to verify for high assurance and optimize in accordance with
their expertise.
* Overly abstracted and flexible cryptography APIs lead to a host of
dangerous problems and performance issues. The kernel is in the
business usually not of coming up with new uses of crypto, but
rather implementing various constructions, which means it essentially
needs a library of primitives, not a highly abstracted enterprise-ready
pluggable system, with a few particular exceptions.
This last observation has seen itself play out several times over and
over again within the kernel:
* The perennial move of actual primitives away from crypto/ and into
lib/, so that users can actually call these functions directly with
no overhead and without lots of allocations, function pointers,
string specifier parsing, and general clunkiness. For example:
sha256, chacha20, siphash, sha1, and so forth live in lib/ rather
than in crypto/. Zinc intends to stop the cluttering of lib/ and
introduce these direct primitives into their proper place, lib/zinc/.
* An abundance of misuse bugs with the present crypto API that have
been very unpleasant to clean up.
* A hesitance to even use cryptography, because of the overhead and
headaches involved in accessing the routines.
Zinc goes in a rather different direction. Rather than providing a
thoroughly designed and abstracted API, Zinc gives you simple functions,
which implement some primitive, or some particular and specific
construction of primitives. It is not dynamic in the least, though one
could imagine implementing a complex dynamic dispatch mechanism (such as
the current crypto API) on top of these basic functions. After all,
dynamic dispatch is usually needed for applications with cipher agility,
such as IPsec, dm-crypt, AF_ALG, and so forth, and the existing crypto
API will continue to play that role. However, Zinc will provide a non-
haphazard way of directly utilizing crypto routines in applications
that do have neither the need nor desire for abstraction and dynamic
dispatch.
It also organizes the implementations in a simple, straight-forward,
and direct manner, making it enjoyable and intuitive to work on.
Rather than moving optimized assembly implementations into arch/, it
keeps them all together in lib/zinc/, making it simple and obvious to
compare and contrast what's happening. This is, notably, exactly what
the lib/raid6/ tree does, and that seems to work out rather well. It's
also the pattern of most successful crypto libraries. The architecture-
specific glue-code is made a part of each translation unit, rather than
being in a separate one, so that generic and architecture-optimized code
are combined at compile-time, and incompatibility branches compiled out by
the optimizer.
All implementations have been extensively tested and fuzzed, and are
selected for their quality, trustworthiness, and performance. Wherever
possible and performant, formally verified implementations are used,
such as those from HACL* [1] and Fiat-Crypto [2]. The routines also take
special care to zero out secrets using memzero_explicit (and future work
is planned to have gcc do this more reliably and performantly with
compiler plugins). The performance of the selected implementations is
state-of-the-art and unrivaled on a broad array of hardware, though of
course we will continue to fine tune these to the hardware demands
needed by kernel contributors. Each implementation also comes with
extensive self-tests and crafted test vectors, pulled from various
places such as Wycheproof [9].
Regularity of function signatures is important, so that users can easily
"guess" the name of the function they want. Though, individual
primitives are oftentimes not trivially interchangeable, having been
designed for different things and requiring different parameters and
semantics, and so the function signatures they provide will directly
reflect the realities of the primitives' usages, rather than hiding it
behind (inevitably leaky) abstractions. Also, in contrast to the current
crypto API, Zinc functions can work on stack buffers, and can be called
with different keys, without requiring allocations or locking.
SIMD is used automatically when available, though some routines may
benefit from either having their SIMD disabled for particular
invocations, or to have the SIMD initialization calls amortized over
several invocations of the function, and so Zinc utilizes function
signatures enabling that in conjunction with the recently introduced
simd_context_t.
More generally, Zinc provides function signatures that allow just what
is required by the various callers. This isn't to say that users of the
functions will be permitted to pollute the function semantics with weird
particular needs, but we are trying very hard not to overdo it, and that
means looking carefully at what's actually necessary, and doing just that,
and not much more than that. Remember: practicality and cleanliness rather
than over-zealous infrastructure.
Zinc provides also an opening for the best implementers in academia to
contribute their time and effort to the kernel, by being sufficiently
simple and inviting. In discussing this commit with some of the best and
brightest over the last few years, there are many who are eager to
devote rare talent and energy to this effort.
Following the merging of this, I expect for the primitives that
currently exist in lib/ to work their way into lib/zinc/, after intense
scrutiny of each implementation, potentially replacing them with either
formally-verified implementations, or better studied and faster
state-of-the-art implementations.
Also following the merging of this, I expect for the old crypto API
implementations to be ported over to use Zinc for their software-based
implementations.
As Zinc is simply library code, its config options are un-menued, with
the exception of CONFIG_ZINC_DEBUG, which enables various selftests and
BUG_ONs.
[1] https://github.com/project-everest/hacl-star
[2] https://github.com/mit-plv/fiat-crypto
[3] https://cr.yp.to/ecdh.html
[4] https://cr.yp.to/chacha.html
[5] https://cr.yp.to/snuffle/xsalsa-20081128.pdf
[6] https://cr.yp.to/mac.html
[7] https://blake2.net/
[8] https://tools.ietf.org/html/rfc8439
[9] https://github.com/google/wycheproof
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Cc: Samuel Neves <sneves@dei.uc.pt>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Jean-Philippe Aumasson <jeanphilippe.aumasson@gmail.com>
---
MAINTAINERS | 8 ++++++++
lib/Kconfig | 2 ++
lib/Makefile | 2 ++
lib/zinc/Kconfig | 46 ++++++++++++++++++++++++++++++++++++++++++++++
lib/zinc/Makefile | 8 ++++++++
lib/zinc/main.c | 31 +++++++++++++++++++++++++++++++
6 files changed, 97 insertions(+)
create mode 100644 lib/zinc/Kconfig
create mode 100644 lib/zinc/Makefile
create mode 100644 lib/zinc/main.c
diff --git a/MAINTAINERS b/MAINTAINERS
index 2ef884b883c3..d2092e52320d 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -16160,6 +16160,14 @@ Q: https://patchwork.linuxtv.org/project/linux-media/list/
S: Maintained
F: drivers/media/dvb-frontends/zd1301_demod*
+ZINC CRYPTOGRAPHY LIBRARY
+M: Jason A. Donenfeld <Jason@zx2c4.com>
+M: Samuel Neves <sneves@dei.uc.pt>
+S: Maintained
+F: lib/zinc/
+F: include/zinc/
+L: linux-crypto@vger.kernel.org
+
ZPOOL COMPRESSED PAGE STORAGE API
M: Dan Streetman <ddstreet@ieee.org>
L: linux-mm@kvack.org
diff --git a/lib/Kconfig b/lib/Kconfig
index a3928d4438b5..3e6848269c66 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -485,6 +485,8 @@ config GLOB_SELFTEST
module load) by a small amount, so you're welcome to play with
it, but you probably don't need it.
+source "lib/zinc/Kconfig"
+
#
# Netlink attribute parsing support is select'ed if needed
#
diff --git a/lib/Makefile b/lib/Makefile
index ca3f7ebb900d..3f16e35d2c11 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -214,6 +214,8 @@ obj-$(CONFIG_PERCPU_TEST) += percpu_test.o
obj-$(CONFIG_ASN1) += asn1_decoder.o
+obj-$(CONFIG_ZINC) += zinc/
+
obj-$(CONFIG_FONT_SUPPORT) += fonts/
obj-$(CONFIG_PRIME_NUMBERS) += prime_numbers.o
diff --git a/lib/zinc/Kconfig b/lib/zinc/Kconfig
new file mode 100644
index 000000000000..5980c411af0d
--- /dev/null
+++ b/lib/zinc/Kconfig
@@ -0,0 +1,46 @@
+config ZINC
+ tristate
+
+config ZINC_DEBUG
+ bool "Zinc cryptography library debugging and self-tests"
+ depends on ZINC
+ help
+ This builds a series of self-tests for the Zinc crypto library, which
+ help diagnose any cryptographic algorithm implementation issues that
+ might be at the root cause of potential bugs. It also adds various
+ debugging traps.
+
+ Unless you're developing and testing cryptographic routines, or are
+ especially paranoid about correctness on your hardware, you may say
+ N here.
+
+config ZINC_ARCH_ARM
+ def_bool y
+ depends on ARM
+ depends on ZINC
+ imply VFP
+ imply VFPv3 if CPU_V7
+ imply NEON if CPU_V7
+ imply KERNEL_MODE_NEON if CPU_V7
+
+config ZINC_ARCH_ARM64
+ def_bool y
+ depends on ARM64
+ depends on ZINC
+
+config ZINC_ARCH_X86_64
+ def_bool y
+ depends on X86_64
+ depends on !UML
+ depends on ZINC
+
+config ZINC_ARCH_MIPS
+ def_bool y
+ depends on MIPS
+ depends on ZINC
+
+config ZINC_ARCH_MIPS64
+ def_bool y
+ depends on MIPS
+ depends on 64BIT
+ depends on ZINC
diff --git a/lib/zinc/Makefile b/lib/zinc/Makefile
new file mode 100644
index 000000000000..dad47573de42
--- /dev/null
+++ b/lib/zinc/Makefile
@@ -0,0 +1,8 @@
+ccflags-y := -O3
+ccflags-y += -Wframe-larger-than=8192
+ccflags-y += -D'pr_fmt(fmt)=KBUILD_MODNAME ": " fmt'
+ccflags-$(CONFIG_ZINC_DEBUG) += -DDEBUG
+
+zinc-y += main.o
+
+obj-$(CONFIG_ZINC) := zinc.o
diff --git a/lib/zinc/main.c b/lib/zinc/main.c
new file mode 100644
index 000000000000..ceece33ff5a7
--- /dev/null
+++ b/lib/zinc/main.c
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+
+#ifdef DEBUG
+#define selftest(which) do { \
+ if (!which ## _selftest()) \
+ return -ENOTRECOVERABLE; \
+} while (0)
+#else
+#define selftest(which)
+#endif
+
+static int __init mod_init(void)
+{
+ return 0;
+}
+
+static void __exit mod_exit(void)
+{
+}
+
+module_init(mod_init);
+module_exit(mod_exit);
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("Zinc cryptography library");
+MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
--
2.19.0
^ permalink raw reply related
* [PATCH net-next v4 01/20] asm: simd context helper API
From: Jason A. Donenfeld @ 2018-09-14 16:22 UTC (permalink / raw)
To: linux-kernel, netdev, linux-crypto, davem, gregkh
Cc: Jason A. Donenfeld, Samuel Neves, Andy Lutomirski,
Thomas Gleixner, linux-arch
In-Reply-To: <20180914162240.7925-1-Jason@zx2c4.com>
Sometimes it's useful to amortize calls to XSAVE/XRSTOR and the related
FPU/SIMD functions over a number of calls, because FPU restoration is
quite expensive. This adds a simple header for carrying out this pattern:
simd_context_t simd_context = simd_get();
while ((item = get_item_from_queue()) != NULL) {
encrypt_item(item, simd_context);
simd_context = simd_relax(simd_context);
}
simd_put(simd_context);
The relaxation step ensures that we don't trample over preemption, and
the get/put API should be a familiar paradigm in the kernel.
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Reviewed-by: Palmer Dabbelt <palmer@sifive.com>
Cc: Samuel Neves <sneves@dei.uc.pt>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: linux-arch@vger.kernel.org
---
arch/alpha/include/asm/Kbuild | 5 ++--
arch/arc/include/asm/Kbuild | 1 +
arch/arm/include/asm/simd.h | 42 ++++++++++++++++++++++++++++++
arch/arm64/include/asm/simd.h | 37 +++++++++++++++++++++-----
arch/c6x/include/asm/Kbuild | 3 ++-
arch/h8300/include/asm/Kbuild | 3 ++-
arch/hexagon/include/asm/Kbuild | 1 +
arch/ia64/include/asm/Kbuild | 1 +
arch/m68k/include/asm/Kbuild | 1 +
arch/microblaze/include/asm/Kbuild | 1 +
arch/mips/include/asm/Kbuild | 1 +
arch/nds32/include/asm/Kbuild | 7 ++---
arch/nios2/include/asm/Kbuild | 1 +
arch/openrisc/include/asm/Kbuild | 7 ++---
arch/parisc/include/asm/Kbuild | 1 +
arch/powerpc/include/asm/Kbuild | 3 ++-
arch/riscv/include/asm/Kbuild | 3 ++-
arch/s390/include/asm/Kbuild | 3 ++-
arch/sh/include/asm/Kbuild | 1 +
arch/sparc/include/asm/Kbuild | 1 +
arch/um/include/asm/Kbuild | 3 ++-
arch/unicore32/include/asm/Kbuild | 1 +
arch/x86/include/asm/simd.h | 30 ++++++++++++++++++++-
arch/xtensa/include/asm/Kbuild | 1 +
include/asm-generic/simd.h | 15 +++++++++++
include/linux/simd.h | 28 ++++++++++++++++++++
26 files changed, 180 insertions(+), 21 deletions(-)
create mode 100644 arch/arm/include/asm/simd.h
create mode 100644 include/linux/simd.h
diff --git a/arch/alpha/include/asm/Kbuild b/arch/alpha/include/asm/Kbuild
index 0580cb8c84b2..07b2c1025d34 100644
--- a/arch/alpha/include/asm/Kbuild
+++ b/arch/alpha/include/asm/Kbuild
@@ -2,14 +2,15 @@
generic-y += compat.h
+generic-y += current.h
generic-y += exec.h
generic-y += export.h
generic-y += fb.h
generic-y += irq_work.h
+generic-y += kprobes.h
generic-y += mcs_spinlock.h
generic-y += mm-arch-hooks.h
generic-y += preempt.h
generic-y += sections.h
+generic-y += simd.h
generic-y += trace_clock.h
-generic-y += current.h
-generic-y += kprobes.h
diff --git a/arch/arc/include/asm/Kbuild b/arch/arc/include/asm/Kbuild
index feed50ce89fa..a7f4255f1649 100644
--- a/arch/arc/include/asm/Kbuild
+++ b/arch/arc/include/asm/Kbuild
@@ -22,6 +22,7 @@ generic-y += parport.h
generic-y += pci.h
generic-y += percpu.h
generic-y += preempt.h
+generic-y += simd.h
generic-y += topology.h
generic-y += trace_clock.h
generic-y += user.h
diff --git a/arch/arm/include/asm/simd.h b/arch/arm/include/asm/simd.h
new file mode 100644
index 000000000000..bf468993bbef
--- /dev/null
+++ b/arch/arm/include/asm/simd.h
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <linux/simd.h>
+#ifndef _ASM_SIMD_H
+#define _ASM_SIMD_H
+
+static __must_check inline bool may_use_simd(void)
+{
+ return !in_interrupt();
+}
+
+#ifdef CONFIG_KERNEL_MODE_NEON
+#include <asm/neon.h>
+
+static inline simd_context_t simd_get(void)
+{
+ bool have_simd = may_use_simd();
+ if (have_simd)
+ kernel_neon_begin();
+ return have_simd ? HAVE_FULL_SIMD : HAVE_NO_SIMD;
+}
+
+static inline void simd_put(simd_context_t prior_context)
+{
+ if (prior_context != HAVE_NO_SIMD)
+ kernel_neon_end();
+}
+#else
+static inline simd_context_t simd_get(void)
+{
+ return HAVE_NO_SIMD;
+}
+
+static inline void simd_put(simd_context_t prior_context)
+{
+}
+#endif
+
+#endif /* _ASM_SIMD_H */
diff --git a/arch/arm64/include/asm/simd.h b/arch/arm64/include/asm/simd.h
index 6495cc51246f..058c336de38d 100644
--- a/arch/arm64/include/asm/simd.h
+++ b/arch/arm64/include/asm/simd.h
@@ -1,11 +1,10 @@
-/*
- * Copyright (C) 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
+/* SPDX-License-Identifier: GPL-2.0
*
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
+ * Copyright (C) 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
*/
+#include <linux/simd.h>
#ifndef __ASM_SIMD_H
#define __ASM_SIMD_H
@@ -16,6 +15,8 @@
#include <linux/types.h>
#ifdef CONFIG_KERNEL_MODE_NEON
+#include <asm/neon.h>
+#include <asm/simd.h>
DECLARE_PER_CPU(bool, kernel_neon_busy);
@@ -40,12 +41,36 @@ static __must_check inline bool may_use_simd(void)
!this_cpu_read(kernel_neon_busy);
}
+static inline simd_context_t simd_get(void)
+{
+ bool have_simd = may_use_simd();
+ if (have_simd)
+ kernel_neon_begin();
+ return have_simd ? HAVE_FULL_SIMD : HAVE_NO_SIMD;
+}
+
+static inline void simd_put(simd_context_t prior_context)
+{
+ if (prior_context != HAVE_NO_SIMD)
+ kernel_neon_end();
+}
+
#else /* ! CONFIG_KERNEL_MODE_NEON */
-static __must_check inline bool may_use_simd(void) {
+static __must_check inline bool may_use_simd(void)
+{
return false;
}
+static inline simd_context_t simd_get(void)
+{
+ return HAVE_NO_SIMD;
+}
+
+static inline void simd_put(simd_context_t prior_context)
+{
+}
+
#endif /* ! CONFIG_KERNEL_MODE_NEON */
#endif
diff --git a/arch/c6x/include/asm/Kbuild b/arch/c6x/include/asm/Kbuild
index 33a2c94fed0d..22f3d8333c74 100644
--- a/arch/c6x/include/asm/Kbuild
+++ b/arch/c6x/include/asm/Kbuild
@@ -5,8 +5,8 @@ generic-y += compat.h
generic-y += current.h
generic-y += device.h
generic-y += div64.h
-generic-y += dma.h
generic-y += dma-mapping.h
+generic-y += dma.h
generic-y += emergency-restart.h
generic-y += exec.h
generic-y += extable.h
@@ -30,6 +30,7 @@ generic-y += pgalloc.h
generic-y += preempt.h
generic-y += segment.h
generic-y += serial.h
+generic-y += simd.h
generic-y += tlbflush.h
generic-y += topology.h
generic-y += trace_clock.h
diff --git a/arch/h8300/include/asm/Kbuild b/arch/h8300/include/asm/Kbuild
index a5d0b2991f47..f5c2f12d593e 100644
--- a/arch/h8300/include/asm/Kbuild
+++ b/arch/h8300/include/asm/Kbuild
@@ -8,8 +8,8 @@ generic-y += current.h
generic-y += delay.h
generic-y += device.h
generic-y += div64.h
-generic-y += dma.h
generic-y += dma-mapping.h
+generic-y += dma.h
generic-y += emergency-restart.h
generic-y += exec.h
generic-y += extable.h
@@ -39,6 +39,7 @@ generic-y += preempt.h
generic-y += scatterlist.h
generic-y += sections.h
generic-y += serial.h
+generic-y += simd.h
generic-y += sizes.h
generic-y += spinlock.h
generic-y += timex.h
diff --git a/arch/hexagon/include/asm/Kbuild b/arch/hexagon/include/asm/Kbuild
index dd2fd9c0d292..217d4695fd8a 100644
--- a/arch/hexagon/include/asm/Kbuild
+++ b/arch/hexagon/include/asm/Kbuild
@@ -29,6 +29,7 @@ generic-y += rwsem.h
generic-y += sections.h
generic-y += segment.h
generic-y += serial.h
+generic-y += simd.h
generic-y += sizes.h
generic-y += topology.h
generic-y += trace_clock.h
diff --git a/arch/ia64/include/asm/Kbuild b/arch/ia64/include/asm/Kbuild
index 557bbc8ba9f5..41c5ebdf79e5 100644
--- a/arch/ia64/include/asm/Kbuild
+++ b/arch/ia64/include/asm/Kbuild
@@ -4,6 +4,7 @@ generic-y += irq_work.h
generic-y += mcs_spinlock.h
generic-y += mm-arch-hooks.h
generic-y += preempt.h
+generic-y += simd.h
generic-y += trace_clock.h
generic-y += vtime.h
generic-y += word-at-a-time.h
diff --git a/arch/m68k/include/asm/Kbuild b/arch/m68k/include/asm/Kbuild
index a4b8d3331a9e..73898dd1a4d0 100644
--- a/arch/m68k/include/asm/Kbuild
+++ b/arch/m68k/include/asm/Kbuild
@@ -19,6 +19,7 @@ generic-y += mm-arch-hooks.h
generic-y += percpu.h
generic-y += preempt.h
generic-y += sections.h
+generic-y += simd.h
generic-y += spinlock.h
generic-y += topology.h
generic-y += trace_clock.h
diff --git a/arch/microblaze/include/asm/Kbuild b/arch/microblaze/include/asm/Kbuild
index 569ba9e670c1..7a877eea99d3 100644
--- a/arch/microblaze/include/asm/Kbuild
+++ b/arch/microblaze/include/asm/Kbuild
@@ -25,6 +25,7 @@ generic-y += parport.h
generic-y += percpu.h
generic-y += preempt.h
generic-y += serial.h
+generic-y += simd.h
generic-y += syscalls.h
generic-y += topology.h
generic-y += trace_clock.h
diff --git a/arch/mips/include/asm/Kbuild b/arch/mips/include/asm/Kbuild
index 58351e48421e..e8868e0fb2c3 100644
--- a/arch/mips/include/asm/Kbuild
+++ b/arch/mips/include/asm/Kbuild
@@ -16,6 +16,7 @@ generic-y += qrwlock.h
generic-y += qspinlock.h
generic-y += sections.h
generic-y += segment.h
+generic-y += simd.h
generic-y += trace_clock.h
generic-y += unaligned.h
generic-y += user.h
diff --git a/arch/nds32/include/asm/Kbuild b/arch/nds32/include/asm/Kbuild
index dbc4e5422550..603c1d020620 100644
--- a/arch/nds32/include/asm/Kbuild
+++ b/arch/nds32/include/asm/Kbuild
@@ -7,14 +7,14 @@ generic-y += bug.h
generic-y += bugs.h
generic-y += checksum.h
generic-y += clkdev.h
-generic-y += cmpxchg.h
generic-y += cmpxchg-local.h
+generic-y += cmpxchg.h
generic-y += compat.h
generic-y += cputime.h
generic-y += device.h
generic-y += div64.h
-generic-y += dma.h
generic-y += dma-mapping.h
+generic-y += dma.h
generic-y += emergency-restart.h
generic-y += errno.h
generic-y += exec.h
@@ -46,14 +46,15 @@ generic-y += sections.h
generic-y += segment.h
generic-y += serial.h
generic-y += shmbuf.h
+generic-y += simd.h
generic-y += sizes.h
generic-y += stat.h
generic-y += switch_to.h
generic-y += timex.h
generic-y += topology.h
generic-y += trace_clock.h
-generic-y += xor.h
generic-y += unaligned.h
generic-y += user.h
generic-y += vga.h
generic-y += word-at-a-time.h
+generic-y += xor.h
diff --git a/arch/nios2/include/asm/Kbuild b/arch/nios2/include/asm/Kbuild
index 8fde4fa2c34f..571a9d9ad107 100644
--- a/arch/nios2/include/asm/Kbuild
+++ b/arch/nios2/include/asm/Kbuild
@@ -33,6 +33,7 @@ generic-y += preempt.h
generic-y += sections.h
generic-y += segment.h
generic-y += serial.h
+generic-y += simd.h
generic-y += spinlock.h
generic-y += topology.h
generic-y += trace_clock.h
diff --git a/arch/openrisc/include/asm/Kbuild b/arch/openrisc/include/asm/Kbuild
index eb87cd8327c8..5e9f2f4c4d39 100644
--- a/arch/openrisc/include/asm/Kbuild
+++ b/arch/openrisc/include/asm/Kbuild
@@ -28,12 +28,13 @@ generic-y += module.h
generic-y += pci.h
generic-y += percpu.h
generic-y += preempt.h
-generic-y += qspinlock_types.h
-generic-y += qspinlock.h
-generic-y += qrwlock_types.h
generic-y += qrwlock.h
+generic-y += qrwlock_types.h
+generic-y += qspinlock.h
+generic-y += qspinlock_types.h
generic-y += sections.h
generic-y += segment.h
+generic-y += simd.h
generic-y += string.h
generic-y += switch_to.h
generic-y += topology.h
diff --git a/arch/parisc/include/asm/Kbuild b/arch/parisc/include/asm/Kbuild
index 2013d639e735..97970b4d05ab 100644
--- a/arch/parisc/include/asm/Kbuild
+++ b/arch/parisc/include/asm/Kbuild
@@ -17,6 +17,7 @@ generic-y += percpu.h
generic-y += preempt.h
generic-y += seccomp.h
generic-y += segment.h
+generic-y += simd.h
generic-y += topology.h
generic-y += trace_clock.h
generic-y += user.h
diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild
index 3196d227e351..64290f48e733 100644
--- a/arch/powerpc/include/asm/Kbuild
+++ b/arch/powerpc/include/asm/Kbuild
@@ -4,7 +4,8 @@ generic-y += irq_regs.h
generic-y += irq_work.h
generic-y += local64.h
generic-y += mcs_spinlock.h
+generic-y += msi.h
generic-y += preempt.h
generic-y += rwsem.h
+generic-y += simd.h
generic-y += vtime.h
-generic-y += msi.h
diff --git a/arch/riscv/include/asm/Kbuild b/arch/riscv/include/asm/Kbuild
index efdbe311e936..6669b7374c0a 100644
--- a/arch/riscv/include/asm/Kbuild
+++ b/arch/riscv/include/asm/Kbuild
@@ -5,9 +5,9 @@ generic-y += compat.h
generic-y += cputime.h
generic-y += device.h
generic-y += div64.h
-generic-y += dma.h
generic-y += dma-contiguous.h
generic-y += dma-mapping.h
+generic-y += dma.h
generic-y += emergency-restart.h
generic-y += errno.h
generic-y += exec.h
@@ -46,6 +46,7 @@ generic-y += setup.h
generic-y += shmbuf.h
generic-y += shmparam.h
generic-y += signal.h
+generic-y += simd.h
generic-y += socket.h
generic-y += sockios.h
generic-y += stat.h
diff --git a/arch/s390/include/asm/Kbuild b/arch/s390/include/asm/Kbuild
index e3239772887a..7a26dc6ce815 100644
--- a/arch/s390/include/asm/Kbuild
+++ b/arch/s390/include/asm/Kbuild
@@ -7,9 +7,9 @@ generated-y += unistd_nr.h
generic-y += asm-offsets.h
generic-y += cacheflush.h
generic-y += device.h
+generic-y += div64.h
generic-y += dma-contiguous.h
generic-y += dma-mapping.h
-generic-y += div64.h
generic-y += emergency-restart.h
generic-y += export.h
generic-y += fb.h
@@ -22,6 +22,7 @@ generic-y += mcs_spinlock.h
generic-y += mm-arch-hooks.h
generic-y += preempt.h
generic-y += rwsem.h
+generic-y += simd.h
generic-y += trace_clock.h
generic-y += unaligned.h
generic-y += word-at-a-time.h
diff --git a/arch/sh/include/asm/Kbuild b/arch/sh/include/asm/Kbuild
index 6a5609a55965..8e64ff35a933 100644
--- a/arch/sh/include/asm/Kbuild
+++ b/arch/sh/include/asm/Kbuild
@@ -16,6 +16,7 @@ generic-y += percpu.h
generic-y += preempt.h
generic-y += rwsem.h
generic-y += serial.h
+generic-y += simd.h
generic-y += sizes.h
generic-y += trace_clock.h
generic-y += xor.h
diff --git a/arch/sparc/include/asm/Kbuild b/arch/sparc/include/asm/Kbuild
index 410b263ef5c8..72b9e08fb350 100644
--- a/arch/sparc/include/asm/Kbuild
+++ b/arch/sparc/include/asm/Kbuild
@@ -17,5 +17,6 @@ generic-y += msi.h
generic-y += preempt.h
generic-y += rwsem.h
generic-y += serial.h
+generic-y += simd.h
generic-y += trace_clock.h
generic-y += word-at-a-time.h
diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild
index b10dde6cb793..d37288b08dd2 100644
--- a/arch/um/include/asm/Kbuild
+++ b/arch/um/include/asm/Kbuild
@@ -16,15 +16,16 @@ generic-y += io.h
generic-y += irq_regs.h
generic-y += irq_work.h
generic-y += kdebug.h
+generic-y += kprobes.h
generic-y += mcs_spinlock.h
generic-y += mm-arch-hooks.h
generic-y += param.h
generic-y += pci.h
generic-y += percpu.h
generic-y += preempt.h
+generic-y += simd.h
generic-y += switch_to.h
generic-y += topology.h
generic-y += trace_clock.h
generic-y += word-at-a-time.h
generic-y += xor.h
-generic-y += kprobes.h
diff --git a/arch/unicore32/include/asm/Kbuild b/arch/unicore32/include/asm/Kbuild
index bfc7abe77905..98a908720bbd 100644
--- a/arch/unicore32/include/asm/Kbuild
+++ b/arch/unicore32/include/asm/Kbuild
@@ -27,6 +27,7 @@ generic-y += preempt.h
generic-y += sections.h
generic-y += segment.h
generic-y += serial.h
+generic-y += simd.h
generic-y += sizes.h
generic-y += syscalls.h
generic-y += topology.h
diff --git a/arch/x86/include/asm/simd.h b/arch/x86/include/asm/simd.h
index a341c878e977..79411178988a 100644
--- a/arch/x86/include/asm/simd.h
+++ b/arch/x86/include/asm/simd.h
@@ -1,4 +1,11 @@
-/* SPDX-License-Identifier: GPL-2.0 */
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <linux/simd.h>
+#ifndef _ASM_SIMD_H
+#define _ASM_SIMD_H
#include <asm/fpu/api.h>
@@ -10,3 +17,24 @@ static __must_check inline bool may_use_simd(void)
{
return irq_fpu_usable();
}
+
+static inline simd_context_t simd_get(void)
+{
+ bool have_simd = false;
+#if !defined(CONFIG_UML)
+ have_simd = may_use_simd();
+ if (have_simd)
+ kernel_fpu_begin();
+#endif
+ return have_simd ? HAVE_FULL_SIMD : HAVE_NO_SIMD;
+}
+
+static inline void simd_put(simd_context_t prior_context)
+{
+#if !defined(CONFIG_UML)
+ if (prior_context != HAVE_NO_SIMD)
+ kernel_fpu_end();
+#endif
+}
+
+#endif /* _ASM_SIMD_H */
diff --git a/arch/xtensa/include/asm/Kbuild b/arch/xtensa/include/asm/Kbuild
index 82c756431b49..7950f359649d 100644
--- a/arch/xtensa/include/asm/Kbuild
+++ b/arch/xtensa/include/asm/Kbuild
@@ -24,6 +24,7 @@ generic-y += percpu.h
generic-y += preempt.h
generic-y += rwsem.h
generic-y += sections.h
+generic-y += simd.h
generic-y += topology.h
generic-y += trace_clock.h
generic-y += word-at-a-time.h
diff --git a/include/asm-generic/simd.h b/include/asm-generic/simd.h
index d0343d58a74a..fad899a5a92d 100644
--- a/include/asm-generic/simd.h
+++ b/include/asm-generic/simd.h
@@ -1,5 +1,9 @@
/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/simd.h>
+#ifndef _ASM_SIMD_H
+#define _ASM_SIMD_H
+
#include <linux/hardirq.h>
/*
@@ -13,3 +17,14 @@ static __must_check inline bool may_use_simd(void)
{
return !in_interrupt();
}
+
+static inline simd_context_t simd_get(void)
+{
+ return HAVE_NO_SIMD;
+}
+
+static inline void simd_put(simd_context_t prior_context)
+{
+}
+
+#endif /* _ASM_SIMD_H */
diff --git a/include/linux/simd.h b/include/linux/simd.h
new file mode 100644
index 000000000000..f62d047188bf
--- /dev/null
+++ b/include/linux/simd.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#ifndef _SIMD_H
+#define _SIMD_H
+
+typedef enum {
+ HAVE_NO_SIMD,
+ HAVE_FULL_SIMD
+} simd_context_t;
+
+#include <linux/sched.h>
+#include <asm/simd.h>
+
+static inline simd_context_t simd_relax(simd_context_t prior_context)
+{
+#ifdef CONFIG_PREEMPT
+ if (prior_context != HAVE_NO_SIMD && need_resched()) {
+ simd_put(prior_context);
+ return simd_get();
+ }
+#endif
+ return prior_context;
+}
+
+#endif /* _SIMD_H */
--
2.19.0
^ permalink raw reply related
* [PATCH net-next v4 00/20] WireGuard: Secure Network Tunnel
From: Jason A. Donenfeld @ 2018-09-14 16:22 UTC (permalink / raw)
To: linux-kernel, netdev, linux-crypto, davem, gregkh; +Cc: Jason A. Donenfeld
Changes v3->v4:
- Remove mistaken double 07/17 patch.
- Fix whitespace issues in blake2s assembly.
- It's not possible to put compound literals into __initconst, so
we now instead just use boring fixed size struct members.
- Move away from makefile ifdef maze and instead prefer kconfig values,
which also makes the design a bit more modular too, which could help
in the future.
- Port old crypto API implementations (ChaCha20 and Poly1305) to Zinc.
- Port security/keys/big_key to Zinc as second example of a good usage of
Zinc.
- Document precisely what is different between the kernel code and
CRYPTOGAMS code when the CRYPTOGAMS code is used.
- Move changelog to top of 00/20 message so that people can
actually find it.
-----------------------------------------------------------
This patchset is available on git.kernel.org in this branch, where it may be
pulled directly for inclusion into net-next:
* https://git.kernel.org/pub/scm/linux/kernel/git/zx2c4/linux.git/log/?h=jd/wireguard
-----------------------------------------------------------
WireGuard is a secure network tunnel written especially for Linux, which
has faced around three years of serious development, deployment, and
scrutiny. It delivers excellent performance and is extremely easy to
use and configure. It has been designed with the primary goal of being
both easy to audit by virtue of being small and highly secure from a
cryptography and systems security perspective. WireGuard is used by some
massive companies pushing enormous amounts of traffic, and likely
already today you've consumed bytes that at some point transited through
a WireGuard tunnel. Even as an out-of-tree module, WireGuard has been
integrated into various userspace tools, Linux distributions, mobile
phones, and data centers. There are ports in several languages to
several operating systems, and even commercial hardware and services
sold integrating WireGuard. It is time, therefore, for WireGuard to be
properly integrated into Linux.
Ample information, including documentation, installation instructions,
and project details, is available at:
* https://www.wireguard.com/
* https://www.wireguard.com/papers/wireguard.pdf
As it is currently an out-of-tree module, it lives in its own git repo
and has its own mailing list, and every commit for the module is tested
against every stable kernel since 3.10 on a variety of architectures
using an extensive test suite:
* https://git.zx2c4.com/WireGuard
https://git.kernel.org/pub/scm/linux/kernel/git/zx2c4/WireGuard.git/
* https://lists.zx2c4.com/mailman/listinfo/wireguard
* https://www.wireguard.com/build-status/
The project has been broadly discussed at conferences, and was presented
to the Netdev developers in Seoul last November, where a paper was
released detailing some interesting aspects of the project. Dave asked
me after the talk if I would consider sending in a v1 "sooner rather
than later", hence this patchset. A decision is still waiting from the
Linux Plumbers Conference, but an update on these topics may be presented
in Vancouver in a few months. Prior presentations:
* https://www.wireguard.com/presentations/
* https://www.wireguard.com/papers/wireguard-netdev22.pdf
The cryptography in the protocol itself has been formally verified by
several independent academic teams with positive results, and I know of
two additional efforts on their way to further corroborate those
findings. The version 1 protocol is "complete", and so the purpose of
this review is to assess the implementation of the protocol. However, it
still may be of interest to know that the thing you're reviewing uses a
protocol with various nice security properties:
* https://www.wireguard.com/formal-verification/
This patchset is divided into four segments. The first introduces a very
simple helper for working with the FPU state for the purposes of amortizing
SIMD operations. The second segment is a small collection of cryptographic
primitives, split up into several commits by primitive and by hardware. The
third shows usage of Zinc within the existing crypto API and as a replacement
to the existing crypto API. The last is WireGuard itself, presented as an
unintrusive and self-contained virtual network driver.
It is intended that this entire patch series enter the kernel through
DaveM's net-next tree. Subsequently, WireGuard patches will go through
DaveM's net-next tree, while Zinc patches will go through Greg KH's tree.
Enjoy,
Jason
^ permalink raw reply
* [PATCH net-next v4 00/20] WireGuard: Secure Network Tunnel
From: Jason A. Donenfeld @ 2018-09-14 16:19 UTC (permalink / raw)
To: linux-kernel, netdev, linux-crypto, davem, gregkh; +Cc: Jason A. Donenfeld
Changes v3->v4:
- Remove mistaken double 07/17 patch.
- Fix whitespace issues in blake2s assembly.
- It's not possible to put compound literals into __initconst, so
we now instead just use boring fixed size struct members.
- Move away from makefile ifdef maze and instead prefer kconfig values,
which also makes the design a bit more modular too, which could help
in the future.
- Port old crypto API implementations (ChaCha20 and Poly1305) to Zinc.
- Port security/keys/big_key to Zinc as second example of a good usage of
Zinc.
- Document precisely what is different between the kernel code and
CRYPTOGAMS code when the CRYPTOGAMS code is used.
- Move changelog to top of 00/20 message so that people can
actually find it.
-----------------------------------------------------------
This patchset is available on git.kernel.org in this branch, where it may be
pulled directly for inclusion into net-next:
* https://git.kernel.org/pub/scm/linux/kernel/git/zx2c4/linux.git/log/?h=jd/wireguard
-----------------------------------------------------------
WireGuard is a secure network tunnel written especially for Linux, which
has faced around three years of serious development, deployment, and
scrutiny. It delivers excellent performance and is extremely easy to
use and configure. It has been designed with the primary goal of being
both easy to audit by virtue of being small and highly secure from a
cryptography and systems security perspective. WireGuard is used by some
massive companies pushing enormous amounts of traffic, and likely
already today you've consumed bytes that at some point transited through
a WireGuard tunnel. Even as an out-of-tree module, WireGuard has been
integrated into various userspace tools, Linux distributions, mobile
phones, and data centers. There are ports in several languages to
several operating systems, and even commercial hardware and services
sold integrating WireGuard. It is time, therefore, for WireGuard to be
properly integrated into Linux.
Ample information, including documentation, installation instructions,
and project details, is available at:
* https://www.wireguard.com/
* https://www.wireguard.com/papers/wireguard.pdf
As it is currently an out-of-tree module, it lives in its own git repo
and has its own mailing list, and every commit for the module is tested
against every stable kernel since 3.10 on a variety of architectures
using an extensive test suite:
* https://git.zx2c4.com/WireGuard
https://git.kernel.org/pub/scm/linux/kernel/git/zx2c4/WireGuard.git/
* https://lists.zx2c4.com/mailman/listinfo/wireguard
* https://www.wireguard.com/build-status/
The project has been broadly discussed at conferences, and was presented
to the Netdev developers in Seoul last November, where a paper was
released detailing some interesting aspects of the project. Dave asked
me after the talk if I would consider sending in a v1 "sooner rather
than later", hence this patchset. A decision is still waiting from the
Linux Plumbers Conference, but an update on these topics may be presented
in Vancouver in a few months. Prior presentations:
* https://www.wireguard.com/presentations/
* https://www.wireguard.com/papers/wireguard-netdev22.pdf
The cryptography in the protocol itself has been formally verified by
several independent academic teams with positive results, and I know of
two additional efforts on their way to further corroborate those
findings. The version 1 protocol is "complete", and so the purpose of
this review is to assess the implementation of the protocol. However, it
still may be of interest to know that the thing you're reviewing uses a
protocol with various nice security properties:
* https://www.wireguard.com/formal-verification/
This patchset is divided into four segments. The first introduces a very
simple helper for working with the FPU state for the purposes of amortizing
SIMD operations. The second segment is a small collection of cryptographic
primitives, split up into several commits by primitive and by hardware. The
third shows usage of Zinc within the existing crypto API and as a replacement
to the existing crypto API. The last is WireGuard itself, presented as an
unintrusive and self-contained virtual network driver.
It is intended that this entire patch series enter the kernel through
DaveM's net-next tree. Subsequently, WireGuard patches will go through
DaveM's net-next tree, while Zinc patches will go through Greg KH's tree.
Enjoy,
Jason
^ permalink raw reply
* Re: [PATCH net-next v2] net: sched: change tcf_del_walker() to take idrinfo->lock
From: Vlad Buslov @ 2018-09-14 10:46 UTC (permalink / raw)
To: Cong Wang
Cc: Linux Kernel Network Developers, Jamal Hadi Salim, Jiri Pirko,
David Miller
In-Reply-To: <CAM_iQpUQeu0xeurA6e4pC8C7Ha04srZEGcZVCAwstj-NLg-Xmw@mail.gmail.com>
On Thu 13 Sep 2018 at 17:13, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> On Wed, Sep 12, 2018 at 1:51 AM Vlad Buslov <vladbu@mellanox.com> wrote:
>>
>>
>> On Fri 07 Sep 2018 at 19:12, Cong Wang <xiyou.wangcong@gmail.com> wrote:
>> > On Fri, Sep 7, 2018 at 6:52 AM Vlad Buslov <vladbu@mellanox.com> wrote:
>> >>
>> >> Action API was changed to work with actions and action_idr in concurrency
>> >> safe manner, however tcf_del_walker() still uses actions without taking a
>> >> reference or idrinfo->lock first, and deletes them directly, disregarding
>> >> possible concurrent delete.
>> >>
>> >> Add tc_action_wq workqueue to action API. Implement
>> >> tcf_idr_release_unsafe() that assumes external synchronization by caller
>> >> and delays blocking action cleanup part to tc_action_wq workqueue. Extend
>> >> tcf_action_cleanup() with 'async' argument to indicate that function should
>> >> free action asynchronously.
>> >
>> > Where exactly is blocking in tcf_action_cleanup()?
>> >
>> > From your code, it looks like free_tcf(), but from my observation,
>> > the only blocking function inside is tcf_action_goto_chain_fini()
>> > which calls __tcf_chain_put(). But, __tcf_chain_put() is blocking
>> > _ONLY_ when tc_chain_notify() is called, for tc action it is never
>> > called.
>> >
>> > So, what else is blocking?
>>
>> __tcf_chain_put() calls tc_chain_tmplt_del(), which calls
>> ops->tmplt_destroy(). This last function uses hw offload API, which is
>> blocking.
>
> Good to know.
>
> Can we just make ops->tmplt_destroy() to use workqueue?
> Making tc action to workqueue seems overkill, for me.
How about changing tcf_chain_put_by_act() to use tc_filter_wq, instead
of directly calling __tcf_chain_put()? IMO it is a better solution
because it benefits all classifiers, instead of requiring every
classifier with templates support to implement non-blocking
ops->tmplt_destroy().
^ permalink raw reply
* Re: [PATCH net-next 08/13] net: sched: rename tcf_block_get{_ext}() and tcf_block_put{_ext}()
From: Vlad Buslov @ 2018-09-14 10:38 UTC (permalink / raw)
To: Cong Wang, Jiri Pirko
Cc: Linux Kernel Network Developers, Jamal Hadi Salim, Jiri Pirko,
David Miller, Stephen Hemminger, Kirill Tkhai, Paul E. McKenney,
Nicolas Dichtel, Leon Romanovsky, Greg KH, mark.rutland,
Florian Westphal, David Ahern, lucien xin, Jakub Kicinski,
Christian Brauner, Jiri Benc
In-Reply-To: <CAM_iQpV5b4asaUOR1Ly7s_y7x1M-Y7q3Z2DDe6y_r1eKqx8ZDA@mail.gmail.com>
On Thu 13 Sep 2018 at 17:21, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> On Wed, Sep 12, 2018 at 1:24 AM Vlad Buslov <vladbu@mellanox.com> wrote:
>>
>>
>> On Fri 07 Sep 2018 at 20:09, Cong Wang <xiyou.wangcong@gmail.com> wrote:
>> > On Thu, Sep 6, 2018 at 12:59 AM Vlad Buslov <vladbu@mellanox.com> wrote:
>> >>
>> >> Functions tcf_block_get{_ext}() and tcf_block_put{_ext}() actually
>> >> attach/detach block to specific Qdisc besides just taking/putting
>> >> reference. Rename them according to their purpose.
>> >
>> > Where exactly does it attach to?
>> >
>> > Each qdisc provides a pointer to a pointer of a block, like
>> > &cl->block. It is where the result is saved to. It takes a parameter
>> > of Qdisc* merely for read-only purpose.
>>
>> tcf_block_attach_ext() passes qdisc parameter to tcf_block_owner_add()
>> which saves qdisc to new tcf_block_owner_item and adds the item to
>> block's owner list. I proposed several naming options for these
>> functions to Jiri on internal review and he suggested "attach" as better
>> option.
>
> But that is merely item->q = q, this is why I said it is read-only,
> hard to claim this is attaching.
>
>
>>
>> >
>> > So, renaming it to *attach() is even confusing, at least not
>> > any better. Please find other names or leave them as they are.
>>
>> What would you recommend?
>
> I don't know, perhaps "acquire"?
>
> Or, leaving tcf_block_get() as it is but rename your refcnt
> increment function to be something like tcf_block_refcnt_get()?
Cong, I'm okay with both options.
Jiri, which naming would you prefer?
^ permalink raw reply
* Re: [PATCH 0/2] net: ethernet: neterion: use linux/io-64-nonatomic-lo-hi.h
From: David Miller @ 2018-09-14 15:49 UTC (permalink / raw)
To: clabbe; +Cc: jdmason, linux-kernel, netdev
In-Reply-To: <1536921190-38619-1-git-send-email-clabbe@baylibre.com>
From: Corentin Labbe <clabbe@baylibre.com>
Date: Fri, 14 Sep 2018 10:33:08 +0000
> This serie remove usage of custom writeq/readq in favor of ones
> defined in linux/io-64-nonatomic-lo-hi.h
>
> This serie is only compile tested.
The vxge patch doesn't even apply cleanly to net-next.
^ permalink raw reply
* [PATCH] ARM: dts: at91: add new compatibility string for macb on sama5d3
From: Nicolas Ferre @ 2018-09-14 15:48 UTC (permalink / raw)
To: netdev
Cc: linux-kernel, linux-arm-kernel, Claudiu Beznea, Alexandre Belloni,
Ludovic Desroches, devicetree, Nicolas Ferre
In-Reply-To: <20180914154811.12090-1-nicolas.ferre@microchip.com>
We need this new compatibility string as we experienced different behavior
for this 10/100Mbits/s macb interface on this particular SoC.
Backward compatibility is preserved as we keep the alternative strings.
Signed-off-by: Nicolas Ferre <nicolas.ferre@microchip.com>
---
Documentation/devicetree/bindings/net/macb.txt | 1 +
arch/arm/boot/dts/sama5d3_emac.dtsi | 2 +-
2 files changed, 2 insertions(+), 1 deletion(-)
diff --git a/Documentation/devicetree/bindings/net/macb.txt b/Documentation/devicetree/bindings/net/macb.txt
index 457d5ae16f23..3e17ac1d5d58 100644
--- a/Documentation/devicetree/bindings/net/macb.txt
+++ b/Documentation/devicetree/bindings/net/macb.txt
@@ -10,6 +10,7 @@ Required properties:
Use "cdns,pc302-gem" for Picochip picoXcell pc302 and later devices based on
the Cadence GEM, or the generic form: "cdns,gem".
Use "atmel,sama5d2-gem" for the GEM IP (10/100) available on Atmel sama5d2 SoCs.
+ Use "atmel,sama5d3-macb" for the 10/100Mbit IP available on Atmel sama5d3 SoCs.
Use "atmel,sama5d3-gem" for the Gigabit IP available on Atmel sama5d3 SoCs.
Use "atmel,sama5d4-gem" for the GEM IP (10/100) available on Atmel sama5d4 SoCs.
Use "cdns,zynq-gem" Xilinx Zynq-7xxx SoC.
diff --git a/arch/arm/boot/dts/sama5d3_emac.dtsi b/arch/arm/boot/dts/sama5d3_emac.dtsi
index 7cb235ef0fb6..6e9e1c2f9def 100644
--- a/arch/arm/boot/dts/sama5d3_emac.dtsi
+++ b/arch/arm/boot/dts/sama5d3_emac.dtsi
@@ -41,7 +41,7 @@
};
macb1: ethernet@f802c000 {
- compatible = "cdns,at91sam9260-macb", "cdns,macb";
+ compatible = "atmel,sama5d3-macb", "cdns,at91sam9260-macb", "cdns,macb";
reg = <0xf802c000 0x100>;
interrupts = <35 IRQ_TYPE_LEVEL_HIGH 3>;
pinctrl-names = "default";
--
2.15.1
^ permalink raw reply related
* [PATCH] net: macb: disable scatter-gather for macb on sama5d3
From: Nicolas Ferre @ 2018-09-14 15:48 UTC (permalink / raw)
To: netdev
Cc: linux-kernel, linux-arm-kernel, Claudiu Beznea, Alexandre Belloni,
Ludovic Desroches, devicetree, Nicolas Ferre
Create a new configuration for the sama5d3-macb new compatibility string.
This configuration disables scatter-gather because we experienced lock down
of the macb interface of this particular SoC under very high load.
Signed-off-by: Nicolas Ferre <nicolas.ferre@microchip.com>
---
drivers/net/ethernet/cadence/macb_main.c | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c
index 16e4ef7d7185..f1a86b422617 100644
--- a/drivers/net/ethernet/cadence/macb_main.c
+++ b/drivers/net/ethernet/cadence/macb_main.c
@@ -3837,6 +3837,13 @@ static const struct macb_config at91sam9260_config = {
.init = macb_init,
};
+static const struct macb_config sama5d3macb_config = {
+ .caps = MACB_CAPS_SG_DISABLED
+ | MACB_CAPS_USRIO_HAS_CLKEN | MACB_CAPS_USRIO_DEFAULT_IS_MII_GMII,
+ .clk_init = macb_clk_init,
+ .init = macb_init,
+};
+
static const struct macb_config pc302gem_config = {
.caps = MACB_CAPS_SG_DISABLED | MACB_CAPS_GIGABIT_MODE_AVAILABLE,
.dma_burst_length = 16,
@@ -3904,6 +3911,7 @@ static const struct of_device_id macb_dt_ids[] = {
{ .compatible = "cdns,gem", .data = &pc302gem_config },
{ .compatible = "atmel,sama5d2-gem", .data = &sama5d2_config },
{ .compatible = "atmel,sama5d3-gem", .data = &sama5d3_config },
+ { .compatible = "atmel,sama5d3-macb", .data = &sama5d3macb_config },
{ .compatible = "atmel,sama5d4-gem", .data = &sama5d4_config },
{ .compatible = "cdns,at91rm9200-emac", .data = &emac_config },
{ .compatible = "cdns,emac", .data = &emac_config },
--
2.15.1
^ permalink raw reply related
* [PATCH 2/2] net: neterion: s2io: Use linux/io-64-nonatomic-lo-hi.h
From: Corentin Labbe @ 2018-09-14 10:33 UTC (permalink / raw)
To: davem, jdmason; +Cc: linux-kernel, netdev, Corentin Labbe
In-Reply-To: <1536921190-38619-1-git-send-email-clabbe@baylibre.com>
This patch replace the custom definition of writeq/read and use ones
defined in linux/io-64-nonatomic-lo-hi.h.
Signed-off-by: Corentin Labbe <clabbe@baylibre.com>
---
drivers/net/ethernet/neterion/s2io.c | 1 +
drivers/net/ethernet/neterion/s2io.h | 22 +---------------------
2 files changed, 2 insertions(+), 21 deletions(-)
diff --git a/drivers/net/ethernet/neterion/s2io.c b/drivers/net/ethernet/neterion/s2io.c
index b8983e7..44acc63 100644
--- a/drivers/net/ethernet/neterion/s2io.c
+++ b/drivers/net/ethernet/neterion/s2io.c
@@ -75,6 +75,7 @@
#include <linux/tcp.h>
#include <linux/uaccess.h>
#include <linux/io.h>
+#include <linux/io-64-nonatomic-lo-hi.h>
#include <linux/slab.h>
#include <linux/prefetch.h>
#include <net/tcp.h>
diff --git a/drivers/net/ethernet/neterion/s2io.h b/drivers/net/ethernet/neterion/s2io.h
index 1a24a72..0a921f3 100644
--- a/drivers/net/ethernet/neterion/s2io.h
+++ b/drivers/net/ethernet/neterion/s2io.h
@@ -10,6 +10,7 @@
* system is licensed under the GPL.
* See the file COPYING in this distribution for more information.
************************************************************************/
+#include <linux/io-64-nonatomic-lo-hi.h>
#ifndef _S2IO_H
#define _S2IO_H
@@ -970,27 +971,6 @@ struct s2io_nic {
#define RESET_ERROR 1
#define CMD_ERROR 2
-/* OS related system calls */
-#ifndef readq
-static inline u64 readq(void __iomem *addr)
-{
- u64 ret = 0;
- ret = readl(addr + 4);
- ret <<= 32;
- ret |= readl(addr);
-
- return ret;
-}
-#endif
-
-#ifndef writeq
-static inline void writeq(u64 val, void __iomem *addr)
-{
- writel((u32) (val), addr);
- writel((u32) (val >> 32), (addr + 4));
-}
-#endif
-
/*
* Some registers have to be written in a particular order to
* expect correct hardware operation. The macro SPECIAL_REG_WRITE
--
2.7.4
^ permalink raw reply related
* [PATCH 1/2] net: neterion: vxge: use linux/io-64-nonatomic-lo-hi.h
From: Corentin Labbe @ 2018-09-14 10:33 UTC (permalink / raw)
To: davem, jdmason; +Cc: linux-kernel, netdev, Corentin Labbe
In-Reply-To: <1536921190-38619-1-git-send-email-clabbe@baylibre.com>
This patch replace the custom definition of writeq/read and use ones
defined in linux/io-64-nonatomic-lo-hi.h.
Signed-off-by: Corentin Labbe <clabbe@baylibre.com>
---
drivers/net/ethernet/neterion/vxge/vxge-config.c | 1 +
drivers/net/ethernet/neterion/vxge/vxge-config.h | 20 --------------------
drivers/net/ethernet/neterion/vxge/vxge-traffic.c | 1 +
3 files changed, 2 insertions(+), 20 deletions(-)
diff --git a/drivers/net/ethernet/neterion/vxge/vxge-config.c b/drivers/net/ethernet/neterion/vxge/vxge-config.c
index 358ed61..2b422c5 100644
--- a/drivers/net/ethernet/neterion/vxge/vxge-config.c
+++ b/drivers/net/ethernet/neterion/vxge/vxge-config.c
@@ -13,6 +13,7 @@
******************************************************************************/
#include <linux/vmalloc.h>
#include <linux/etherdevice.h>
+#include <linux/io-64-nonatomic-lo-hi.h>
#include <linux/pci.h>
#include <linux/pci_hotplug.h>
#include <linux/slab.h>
diff --git a/drivers/net/ethernet/neterion/vxge/vxge-config.h b/drivers/net/ethernet/neterion/vxge/vxge-config.h
index d743a37..e678ba3 100644
--- a/drivers/net/ethernet/neterion/vxge/vxge-config.h
+++ b/drivers/net/ethernet/neterion/vxge/vxge-config.h
@@ -2011,26 +2011,6 @@ enum vxge_hw_status vxge_hw_vpath_mtu_set(
void
vxge_hw_vpath_rx_doorbell_init(struct __vxge_hw_vpath_handle *vp);
-#ifndef readq
-static inline u64 readq(void __iomem *addr)
-{
- u64 ret = 0;
- ret = readl(addr + 4);
- ret <<= 32;
- ret |= readl(addr);
-
- return ret;
-}
-#endif
-
-#ifndef writeq
-static inline void writeq(u64 val, void __iomem *addr)
-{
- writel((u32) (val), addr);
- writel((u32) (val >> 32), (addr + 4));
-}
-#endif
-
static inline void __vxge_hw_pio_mem_write32_upper(u32 val, void __iomem *addr)
{
writel(val, addr + 4);
diff --git a/drivers/net/ethernet/neterion/vxge/vxge-traffic.c b/drivers/net/ethernet/neterion/vxge/vxge-traffic.c
index 0c3b5de..30e5cdc 100644
--- a/drivers/net/ethernet/neterion/vxge/vxge-traffic.c
+++ b/drivers/net/ethernet/neterion/vxge/vxge-traffic.c
@@ -12,6 +12,7 @@
* Copyright(c) 2002-2010 Exar Corp.
******************************************************************************/
#include <linux/etherdevice.h>
+#include <linux/io-64-nonatomic-lo-hi.h>
#include <linux/prefetch.h>
#include "vxge-traffic.h"
--
2.7.4
^ permalink raw reply related
* Re: [PATCH net] net/sched: act_sample: fix NULL dereference in the data path
From: Jiri Pirko @ 2018-09-14 10:02 UTC (permalink / raw)
To: Davide Caratti
Cc: mcroce, Jamal Hadi Salim, Cong Wang, David S. Miller, netdev
In-Reply-To: <2fcef6665503c5e3b17676daf45c4166cf130cdb.1536919144.git.dcaratti@redhat.com>
Fri, Sep 14, 2018 at 12:03:18PM CEST, dcaratti@redhat.com wrote:
>Matteo reported the following splat, testing the datapath of TC 'sample':
>
> BUG: KASAN: null-ptr-deref in tcf_sample_act+0xc4/0x310
> Read of size 8 at addr 0000000000000000 by task nc/433
>
> CPU: 0 PID: 433 Comm: nc Not tainted 4.19.0-rc3-kvm #17
> Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS ?-20180531_142017-buildhw-08.phx2.fedoraproject.org-1.fc28 04/01/2014
> Call Trace:
> kasan_report.cold.6+0x6c/0x2fa
> tcf_sample_act+0xc4/0x310
> ? dev_hard_start_xmit+0x117/0x180
> tcf_action_exec+0xa3/0x160
> tcf_classify+0xdd/0x1d0
> htb_enqueue+0x18e/0x6b0
> ? deref_stack_reg+0x7a/0xb0
> ? htb_delete+0x4b0/0x4b0
> ? unwind_next_frame+0x819/0x8f0
> ? entry_SYSCALL_64_after_hwframe+0x44/0xa9
> __dev_queue_xmit+0x722/0xca0
> ? unwind_get_return_address_ptr+0x50/0x50
> ? netdev_pick_tx+0xe0/0xe0
> ? save_stack+0x8c/0xb0
> ? kasan_kmalloc+0xbe/0xd0
> ? __kmalloc_track_caller+0xe4/0x1c0
> ? __kmalloc_reserve.isra.45+0x24/0x70
> ? __alloc_skb+0xdd/0x2e0
> ? sk_stream_alloc_skb+0x91/0x3b0
> ? tcp_sendmsg_locked+0x71b/0x15a0
> ? tcp_sendmsg+0x22/0x40
> ? __sys_sendto+0x1b0/0x250
> ? __x64_sys_sendto+0x6f/0x80
> ? do_syscall_64+0x5d/0x150
> ? entry_SYSCALL_64_after_hwframe+0x44/0xa9
> ? __sys_sendto+0x1b0/0x250
> ? __x64_sys_sendto+0x6f/0x80
> ? do_syscall_64+0x5d/0x150
> ? entry_SYSCALL_64_after_hwframe+0x44/0xa9
> ip_finish_output2+0x495/0x590
> ? ip_copy_metadata+0x2e0/0x2e0
> ? skb_gso_validate_network_len+0x6f/0x110
> ? ip_finish_output+0x174/0x280
> __tcp_transmit_skb+0xb17/0x12b0
> ? __tcp_select_window+0x380/0x380
> tcp_write_xmit+0x913/0x1de0
> ? __sk_mem_schedule+0x50/0x80
> tcp_sendmsg_locked+0x49d/0x15a0
> ? tcp_rcv_established+0x8da/0xa30
> ? tcp_set_state+0x220/0x220
> ? clear_user+0x1f/0x50
> ? iov_iter_zero+0x1ae/0x590
> ? __fget_light+0xa0/0xe0
> tcp_sendmsg+0x22/0x40
> __sys_sendto+0x1b0/0x250
> ? __ia32_sys_getpeername+0x40/0x40
> ? _copy_to_user+0x58/0x70
> ? poll_select_copy_remaining+0x176/0x200
> ? __pollwait+0x1c0/0x1c0
> ? ktime_get_ts64+0x11f/0x140
> ? kern_select+0x108/0x150
> ? core_sys_select+0x360/0x360
> ? vfs_read+0x127/0x150
> ? kernel_write+0x90/0x90
> __x64_sys_sendto+0x6f/0x80
> do_syscall_64+0x5d/0x150
> entry_SYSCALL_64_after_hwframe+0x44/0xa9
> RIP: 0033:0x7fefef2b129d
> Code: ff ff ff ff eb b6 0f 1f 80 00 00 00 00 48 8d 05 51 37 0c 00 41 89 ca 8b 00 85 c0 75 20 45 31 c9 45 31 c0 b8 2c 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 6b f3 c3 66 0f 1f 84 00 00 00 00 00 41 56 41
> RSP: 002b:00007fff2f5350c8 EFLAGS: 00000246 ORIG_RAX: 000000000000002c
> RAX: ffffffffffffffda RBX: 000056118d60c120 RCX: 00007fefef2b129d
> RDX: 0000000000002000 RSI: 000056118d629320 RDI: 0000000000000003
> RBP: 000056118d530370 R08: 0000000000000000 R09: 0000000000000000
> R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000002000
> R13: 000056118d5c2a10 R14: 000056118d5c2a10 R15: 000056118d5303b8
>
>tcf_sample_act() tried to update its per-cpu stats, but tcf_sample_init()
>forgot to allocate them, because tcf_idr_create() was called with a wrong
>value of 'cpustats'. Setting it to true proved to fix the reported crash.
>
>Reported-by: Matteo Croce <mcroce@redhat.com>
>Fixes: 65a206c01e8e ("net/sched: Change act_api and act_xxx modules to use IDR")
>Fixes: 5c5670fae430 ("net/sched: Introduce sample tc action")
>Tested-by: Matteo Croce <mcroce@redhat.com>
>Signed-off-by: Davide Caratti <dcaratti@redhat.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
^ permalink raw reply
* Re: [PATCH net-next 0/8] bnxt_en: devlink param updates
From: Jiri Pirko @ 2018-09-14 10:01 UTC (permalink / raw)
To: Vasundhara Volam
Cc: jakub.kicinski, David Miller, michael.chan@broadcom.com, Netdev,
alexander.duyck
In-Reply-To: <CAACQVJp0O8Tb+d9kF9K773okqrCg7XG-JLSiMNu9GMLjtKEA6Q@mail.gmail.com>
Fri, Sep 14, 2018 at 06:17:07AM CEST, vasundhara-v.volam@broadcom.com wrote:
>On Wed, Sep 12, 2018 at 3:20 PM Jakub Kicinski
><jakub.kicinski@netronome.com> wrote:
>>
>> On Wed, 12 Sep 2018 12:09:37 +0530, Vasundhara Volam wrote:
>> > On Tue, Sep 11, 2018 at 5:04 PM Jakub Kicinski wrote:
>> > > On Tue, 11 Sep 2018 14:14:57 +0530, Vasundhara Volam wrote:
>> > > > This patchset adds support for 4 generic and 1 driver-specific devlink
>> > > > parameters.
>> > > >
>> > > > Also, this patchset adds support to return proper error code if
>> > > > HWRM_NVM_GET/SET_VARIABLE commands return error code
>> > > > HWRM_ERR_CODE_RESOURCE_ACCESS_DENIED.
>> > > >
>> > > > Vasundhara Volam (8):
>> > > > devlink: Add generic parameter hw_tc_offload
>> > >
>> > > Much like Jiri, I can't help but wonder why do you need this?
>> >
>> > There is a request from our customer for a way to toggle tc_offload
>> > feature in our adapter.
>>
>> Vasundhara, again, we don't need to know who asked you to do this, but
>> _why_. What problem are you solving? What is the customer trying to
>> achieve?
>For Brand new big features like TC_offload, few customers are not willing
>to enable it by default in the adapter(Firmware). This was a subjective decision
>to disable TC_offload by default in the adapter.
Again, why? Why it cannot be enabled in FW and just enabled/disabled by
ethtool flag? Don't say that "customers want it" please...
>>
>> > > > devlink: Add generic parameter ignore_ari
>> > > > devlink: Add generic parameter msix_vec_per_pf_max
>> > > > devlink: Add generic parameter msix_vec_per_pf_min
>> > >
>> > > IMHO more structured API would be preferable if possible. The string
>> > > keys won't scale if you want to set the parameters per PF, and
>> > > creating more structured API for PCIe which is a relatively slow
>> > > moving HW spec seems tractable.
>> >
>> > Sorry, could you please suggest an example? We will try to adapt.
>>
>> My thinking was that the same way devlink device has ports, it should
>> have PCIe functions as objects which then have attributes. Instead of
>> making everything a string-identified device attribute. But I'm not
>> dead set on this if others don't think its a good idea.
>Actually this parameters are for the port but the value given to this param
>is applicable for individual PF. That's the reason I have added "per_pf" string.
>If you think this is not a good idea, I can move this params to driver-specific.
^ permalink raw reply
* [PATCH net] net/sched: act_sample: fix NULL dereference in the data path
From: Davide Caratti @ 2018-09-14 10:03 UTC (permalink / raw)
To: mcroce, Jamal Hadi Salim, Cong Wang, Jiri Pirko, David S. Miller; +Cc: netdev
Matteo reported the following splat, testing the datapath of TC 'sample':
BUG: KASAN: null-ptr-deref in tcf_sample_act+0xc4/0x310
Read of size 8 at addr 0000000000000000 by task nc/433
CPU: 0 PID: 433 Comm: nc Not tainted 4.19.0-rc3-kvm #17
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS ?-20180531_142017-buildhw-08.phx2.fedoraproject.org-1.fc28 04/01/2014
Call Trace:
kasan_report.cold.6+0x6c/0x2fa
tcf_sample_act+0xc4/0x310
? dev_hard_start_xmit+0x117/0x180
tcf_action_exec+0xa3/0x160
tcf_classify+0xdd/0x1d0
htb_enqueue+0x18e/0x6b0
? deref_stack_reg+0x7a/0xb0
? htb_delete+0x4b0/0x4b0
? unwind_next_frame+0x819/0x8f0
? entry_SYSCALL_64_after_hwframe+0x44/0xa9
__dev_queue_xmit+0x722/0xca0
? unwind_get_return_address_ptr+0x50/0x50
? netdev_pick_tx+0xe0/0xe0
? save_stack+0x8c/0xb0
? kasan_kmalloc+0xbe/0xd0
? __kmalloc_track_caller+0xe4/0x1c0
? __kmalloc_reserve.isra.45+0x24/0x70
? __alloc_skb+0xdd/0x2e0
? sk_stream_alloc_skb+0x91/0x3b0
? tcp_sendmsg_locked+0x71b/0x15a0
? tcp_sendmsg+0x22/0x40
? __sys_sendto+0x1b0/0x250
? __x64_sys_sendto+0x6f/0x80
? do_syscall_64+0x5d/0x150
? entry_SYSCALL_64_after_hwframe+0x44/0xa9
? __sys_sendto+0x1b0/0x250
? __x64_sys_sendto+0x6f/0x80
? do_syscall_64+0x5d/0x150
? entry_SYSCALL_64_after_hwframe+0x44/0xa9
ip_finish_output2+0x495/0x590
? ip_copy_metadata+0x2e0/0x2e0
? skb_gso_validate_network_len+0x6f/0x110
? ip_finish_output+0x174/0x280
__tcp_transmit_skb+0xb17/0x12b0
? __tcp_select_window+0x380/0x380
tcp_write_xmit+0x913/0x1de0
? __sk_mem_schedule+0x50/0x80
tcp_sendmsg_locked+0x49d/0x15a0
? tcp_rcv_established+0x8da/0xa30
? tcp_set_state+0x220/0x220
? clear_user+0x1f/0x50
? iov_iter_zero+0x1ae/0x590
? __fget_light+0xa0/0xe0
tcp_sendmsg+0x22/0x40
__sys_sendto+0x1b0/0x250
? __ia32_sys_getpeername+0x40/0x40
? _copy_to_user+0x58/0x70
? poll_select_copy_remaining+0x176/0x200
? __pollwait+0x1c0/0x1c0
? ktime_get_ts64+0x11f/0x140
? kern_select+0x108/0x150
? core_sys_select+0x360/0x360
? vfs_read+0x127/0x150
? kernel_write+0x90/0x90
__x64_sys_sendto+0x6f/0x80
do_syscall_64+0x5d/0x150
entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x7fefef2b129d
Code: ff ff ff ff eb b6 0f 1f 80 00 00 00 00 48 8d 05 51 37 0c 00 41 89 ca 8b 00 85 c0 75 20 45 31 c9 45 31 c0 b8 2c 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 6b f3 c3 66 0f 1f 84 00 00 00 00 00 41 56 41
RSP: 002b:00007fff2f5350c8 EFLAGS: 00000246 ORIG_RAX: 000000000000002c
RAX: ffffffffffffffda RBX: 000056118d60c120 RCX: 00007fefef2b129d
RDX: 0000000000002000 RSI: 000056118d629320 RDI: 0000000000000003
RBP: 000056118d530370 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000002000
R13: 000056118d5c2a10 R14: 000056118d5c2a10 R15: 000056118d5303b8
tcf_sample_act() tried to update its per-cpu stats, but tcf_sample_init()
forgot to allocate them, because tcf_idr_create() was called with a wrong
value of 'cpustats'. Setting it to true proved to fix the reported crash.
Reported-by: Matteo Croce <mcroce@redhat.com>
Fixes: 65a206c01e8e ("net/sched: Change act_api and act_xxx modules to use IDR")
Fixes: 5c5670fae430 ("net/sched: Introduce sample tc action")
Tested-by: Matteo Croce <mcroce@redhat.com>
Signed-off-by: Davide Caratti <dcaratti@redhat.com>
---
net/sched/act_sample.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c
index 44e9c00657bc..6b67aa13d2dd 100644
--- a/net/sched/act_sample.c
+++ b/net/sched/act_sample.c
@@ -69,7 +69,7 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla,
if (!exists) {
ret = tcf_idr_create(tn, parm->index, est, a,
- &act_sample_ops, bind, false);
+ &act_sample_ops, bind, true);
if (ret) {
tcf_idr_cleanup(tn, parm->index);
return ret;
--
2.17.1
^ permalink raw reply related
* Re: [net-next, RFC PATCH] net: sched: cls_range: Introduce Range classifier
From: Jiri Pirko @ 2018-09-14 9:58 UTC (permalink / raw)
To: Amritha Nambiar
Cc: netdev, davem, alexander.h.duyck, jakub.kicinski,
sridhar.samudrala, jhs, jesse.brandeburg, xiyou.wangcong
In-Reply-To: <153687192654.43503.1433255216543560934.stgit@anamhost.jf.intel.com>
Thu, Sep 13, 2018 at 10:52:06PM CEST, amritha.nambiar@intel.com wrote:
[...]
>+static struct cls_range_filter *range_lookup(struct cls_range_head *head,
>+ struct range_flow_key *key,
>+ struct range_flow_key *mkey,
>+ bool is_skb)
>+{
>+ struct cls_range_filter *filter, *next_filter;
>+ struct range_params range;
>+ int ret;
>+ size_t cmp_size;
>+
>+ list_for_each_entry_safe(filter, next_filter, &head->filters, flist) {
This really should be list_for_each_entry_rcu()
also, as I wrote in the previous email, this should be done in
cls_flower. Look at fl_lookup() it looks-up hashtable. You just need to
add linked list traversal and range comparison to that function for the
hit in the hashtable.
>+ if (!is_skb) {
>+ /* Existing filter comparison */
>+ cmp_size = sizeof(filter->mkey);
>+ } else {
>+ /* skb classification */
>+ ret = range_compare_params(&range, filter, key,
>+ RANGE_PORT_DST);
>+ if (ret < 0)
>+ continue;
>+
>+ ret = range_compare_params(&range, filter, key,
>+ RANGE_PORT_SRC);
>+ if (ret < 0)
>+ continue;
>+
>+ /* skb does not have min and max values */
>+ cmp_size = RANGE_KEY_MEMBER_OFFSET(tp_min);
>+ }
>+ if (!memcmp(mkey, &filter->mkey, cmp_size))
>+ return filter;
>+ }
>+ return NULL;
[...]
^ permalink raw reply
* Re: [net-next,RFC PATCH] Introduce TC Range classifier
From: Jiri Pirko @ 2018-09-14 9:49 UTC (permalink / raw)
To: Amritha Nambiar
Cc: netdev, davem, alexander.h.duyck, jakub.kicinski,
sridhar.samudrala, jhs, jesse.brandeburg, xiyou.wangcong
In-Reply-To: <153687160312.43503.11156697286063840163.stgit@anamhost.jf.intel.com>
Thu, Sep 13, 2018 at 10:52:01PM CEST, amritha.nambiar@intel.com wrote:
>This patch introduces a TC range classifier to support filtering based
>on ranges. Only port-range filters are supported currently. This can
>be combined with flower classifier to support filters that are a
>combination of port-ranges and other parameters based on existing
>fields supported by cls_flower. The 'goto chain' action can be used to
>combine the flower and range filter.
>The filter precedence is decided based on the 'prio' value.
For example Spectrum ASIC supports mask-based and range-based matching
in a single TCAM rule. No chains needed. Also, I don't really understand
why is this a separate cls. I believe that this functionality should be
put as an extension of existing cls_flower.
^ permalink raw reply
* Re: [PATCH 7/7] MIPS: mscc: add PCB120 to the ocelot fitImage
From: Alexandre Belloni @ 2018-09-14 15:00 UTC (permalink / raw)
To: Quentin Schulz
Cc: ralf, paul.burton, jhogan, robh+dt, mark.rutland, davem, andrew,
f.fainelli, allan.nielsen, linux-mips, devicetree, linux-kernel,
netdev, thomas.petazzoni, antoine.tenart
In-Reply-To: <87ab2f80e3942dfca4eab896ba087e7b69bb7b12.1536916714.git-series.quentin.schulz@bootlin.com>
On 14/09/2018 11:44:28+0200, Quentin Schulz wrote:
> PCB120 and PCB123 are both development boards based on Microsemi Ocelot
> so let's use the same fitImage for both.
>
> Signed-off-by: Quentin Schulz <quentin.schulz@bootlin.com>
Reviewed-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
> ---
> arch/mips/generic/Kconfig | 6 +--
> arch/mips/generic/Platform | 2 +-
> arch/mips/generic/board-ocelot.its.S | 40 ++++++++++++++++++++++-
> arch/mips/generic/board-ocelot_pcb123.its.S | 23 +-------------
> 4 files changed, 44 insertions(+), 27 deletions(-)
> create mode 100644 arch/mips/generic/board-ocelot.its.S
> delete mode 100644 arch/mips/generic/board-ocelot_pcb123.its.S
>
> diff --git a/arch/mips/generic/Kconfig b/arch/mips/generic/Kconfig
> index 08e33c6..fd60198 100644
> --- a/arch/mips/generic/Kconfig
> +++ b/arch/mips/generic/Kconfig
> @@ -65,11 +65,11 @@ config FIT_IMAGE_FDT_XILFPGA
> Enable this to include the FDT for the MIPSfpga platform
> from Imagination Technologies in the FIT kernel image.
>
> -config FIT_IMAGE_FDT_OCELOT_PCB123
> - bool "Include FDT for Microsemi Ocelot PCB123"
> +config FIT_IMAGE_FDT_OCELOT
> + bool "Include FDT for Microsemi Ocelot development platforms"
> select MSCC_OCELOT
> help
> - Enable this to include the FDT for the Ocelot PCB123 platform
> + Enable this to include the FDT for the Ocelot development platforms
> from Microsemi in the FIT kernel image.
> This requires u-boot on the platform.
>
> diff --git a/arch/mips/generic/Platform b/arch/mips/generic/Platform
> index 879cb80..eaa19d1 100644
> --- a/arch/mips/generic/Platform
> +++ b/arch/mips/generic/Platform
> @@ -16,5 +16,5 @@ all-$(CONFIG_MIPS_GENERIC) := vmlinux.gz.itb
> its-y := vmlinux.its.S
> its-$(CONFIG_FIT_IMAGE_FDT_BOSTON) += board-boston.its.S
> its-$(CONFIG_FIT_IMAGE_FDT_NI169445) += board-ni169445.its.S
> -its-$(CONFIG_FIT_IMAGE_FDT_OCELOT_PCB123) += board-ocelot_pcb123.its.S
> +its-$(CONFIG_FIT_IMAGE_FDT_OCELOT) += board-ocelot.its.S
> its-$(CONFIG_FIT_IMAGE_FDT_XILFPGA) += board-xilfpga.its.S
> diff --git a/arch/mips/generic/board-ocelot.its.S b/arch/mips/generic/board-ocelot.its.S
> new file mode 100644
> index 0000000..3da2398
> --- /dev/null
> +++ b/arch/mips/generic/board-ocelot.its.S
> @@ -0,0 +1,40 @@
> +/* SPDX-License-Identifier: (GPL-2.0 OR MIT) */
> +/ {
> + images {
> + fdt@ocelot_pcb123 {
> + description = "MSCC Ocelot PCB123 Device Tree";
> + data = /incbin/("boot/dts/mscc/ocelot_pcb123.dtb");
> + type = "flat_dt";
> + arch = "mips";
> + compression = "none";
> + hash@0 {
> + algo = "sha1";
> + };
> + };
> +
> + fdt@ocelot_pcb120 {
> + description = "MSCC Ocelot PCB120 Device Tree";
> + data = /incbin/("boot/dts/mscc/ocelot_pcb120.dtb");
> + type = "flat_dt";
> + arch = "mips";
> + compression = "none";
> + hash@0 {
> + algo = "sha1";
> + };
> + };
> + };
> +
> + configurations {
> + conf@ocelot_pcb123 {
> + description = "Ocelot Linux kernel";
> + kernel = "kernel@0";
> + fdt = "fdt@ocelot_pcb123";
> + };
> +
> + conf@ocelot_pcb120 {
> + description = "Ocelot Linux kernel";
> + kernel = "kernel@0";
> + fdt = "fdt@ocelot_pcb120";
> + };
> + };
> +};
> diff --git a/arch/mips/generic/board-ocelot_pcb123.its.S b/arch/mips/generic/board-ocelot_pcb123.its.S
> deleted file mode 100644
> index 5a7d5e1..0000000
> --- a/arch/mips/generic/board-ocelot_pcb123.its.S
> +++ /dev/null
> @@ -1,23 +0,0 @@
> -/* SPDX-License-Identifier: (GPL-2.0 OR MIT) */
> -/ {
> - images {
> - fdt@ocelot_pcb123 {
> - description = "MSCC Ocelot PCB123 Device Tree";
> - data = /incbin/("boot/dts/mscc/ocelot_pcb123.dtb");
> - type = "flat_dt";
> - arch = "mips";
> - compression = "none";
> - hash@0 {
> - algo = "sha1";
> - };
> - };
> - };
> -
> - configurations {
> - conf@ocelot_pcb123 {
> - description = "Ocelot Linux kernel";
> - kernel = "kernel@0";
> - fdt = "fdt@ocelot_pcb123";
> - };
> - };
> -};
> --
> git-series 0.9.1
--
Alexandre Belloni, Bootlin
Embedded Linux and Kernel engineering
https://bootlin.com
^ permalink raw reply
* Re: [PATCH 6/7] MIPS: mscc: add DT for Ocelot PCB120
From: Alexandre Belloni @ 2018-09-14 14:58 UTC (permalink / raw)
To: Quentin Schulz
Cc: ralf, paul.burton, jhogan, robh+dt, mark.rutland, davem, andrew,
f.fainelli, allan.nielsen, linux-mips, devicetree, linux-kernel,
netdev, thomas.petazzoni, antoine.tenart
In-Reply-To: <f2ef1137991cabde5e9529403982d84b5b0fe0a6.1536916714.git-series.quentin.schulz@bootlin.com>
On 14/09/2018 11:44:27+0200, Quentin Schulz wrote:
> The Ocelot PCB120 evaluation board is different from the PCB123 in that
> it has 4 external VSC8584 (or VSC8574) PHYs.
>
> It uses the SoC's second MDIO bus for external PHYs which have a
> reversed address on the bus (i.e. PHY4 is on address 3, PHY5 is on
> address 2, PHY6 on 1 and PHY7 on 0).
>
> Here is how the PHYs are connected to the switch ports:
> port 0: phy0 (internal)
> port 1: phy1 (internal)
> port 2: phy2 (internal)
> port 3: phy3 (internal)
> port 4: phy7
> port 5: phy4
> port 6: phy6
> port 9: phy5
>
> Signed-off-by: Quentin Schulz <quentin.schulz@bootlin.com>
Reviewed-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
> ---
> arch/mips/boot/dts/mscc/Makefile | 2 +-
> arch/mips/boot/dts/mscc/ocelot_pcb120.dts | 100 +++++++++++++++++++++++-
> 2 files changed, 101 insertions(+), 1 deletion(-)
> create mode 100644 arch/mips/boot/dts/mscc/ocelot_pcb120.dts
>
> diff --git a/arch/mips/boot/dts/mscc/Makefile b/arch/mips/boot/dts/mscc/Makefile
> index 9a9bb7e..ec6f5b2 100644
> --- a/arch/mips/boot/dts/mscc/Makefile
> +++ b/arch/mips/boot/dts/mscc/Makefile
> @@ -1,3 +1,3 @@
> -dtb-$(CONFIG_MSCC_OCELOT) += ocelot_pcb123.dtb
> +dtb-$(CONFIG_MSCC_OCELOT) += ocelot_pcb123.dtb ocelot_pcb120.dtb
>
> obj-$(CONFIG_BUILTIN_DTB) += $(addsuffix .o, $(dtb-y))
> diff --git a/arch/mips/boot/dts/mscc/ocelot_pcb120.dts b/arch/mips/boot/dts/mscc/ocelot_pcb120.dts
> new file mode 100644
> index 0000000..8eb03a5
> --- /dev/null
> +++ b/arch/mips/boot/dts/mscc/ocelot_pcb120.dts
> @@ -0,0 +1,100 @@
> +// SPDX-License-Identifier: (GPL-2.0 OR MIT)
> +/* Copyright (c) 2017 Microsemi Corporation */
> +
> +/dts-v1/;
> +
> +#include <dt-bindings/interrupt-controller/irq.h>
> +#include <dt-bindings/phy/phy-ocelot-serdes.h>
> +#include "ocelot.dtsi"
> +
> +/ {
> + compatible = "mscc,ocelot-pcb120", "mscc,ocelot";
> +
> + chosen {
> + stdout-path = "serial0:115200n8";
> + };
> +
> + memory@0 {
> + device_type = "memory";
> + reg = <0x0 0x0e000000>;
> + };
> +};
> +
> +&mdio0 {
> + status = "okay";
> +};
> +
> +&mdio1 {
> + status = "okay";
> + pinctrl-names = "default";
> + pinctrl-0 = <&miim1>, <&gpio4>;
> +
> + phy7: ethernet-phy@0 {
> + reg = <0>;
> + interrupts = <4 IRQ_TYPE_LEVEL_HIGH>;
> + interrupt-parent = <&gpio>;
> + };
> + phy6: ethernet-phy@1 {
> + reg = <1>;
> + interrupts = <4 IRQ_TYPE_LEVEL_HIGH>;
> + interrupt-parent = <&gpio>;
> + };
> + phy5: ethernet-phy@2 {
> + reg = <2>;
> + interrupts = <4 IRQ_TYPE_LEVEL_HIGH>;
> + interrupt-parent = <&gpio>;
> + };
> + phy4: ethernet-phy@3 {
> + reg = <3>;
> + interrupts = <4 IRQ_TYPE_LEVEL_HIGH>;
> + interrupt-parent = <&gpio>;
> + };
> +};
> +
> +&port0 {
> + phy-handle = <&phy0>;
> +};
> +
> +&port1 {
> + phy-handle = <&phy1>;
> +};
> +
> +&port2 {
> + phy-handle = <&phy2>;
> +};
> +
> +&port3 {
> + phy-handle = <&phy3>;
> +};
> +
> +&port4 {
> + phy-handle = <&phy7>;
> + phy-mode = "sgmii";
> + phys = <&serdes 4 SERDES1G_2>;
> +};
> +
> +&port5 {
> + phy-handle = <&phy4>;
> + phy-mode = "sgmii";
> + phys = <&serdes 5 SERDES1G_5>;
> +};
> +
> +&port6 {
> + phy-handle = <&phy6>;
> + phy-mode = "sgmii";
> + phys = <&serdes 6 SERDES1G_3>;
> +};
> +
> +&port9 {
> + phy-handle = <&phy5>;
> + phy-mode = "sgmii";
> + phys = <&serdes 9 SERDES1G_4>;
> +};
> +
> +&uart0 {
> + status = "okay";
> +};
> +
> +&uart2 {
> + status = "okay";
> +};
> --
> git-series 0.9.1
--
Alexandre Belloni, Bootlin
Embedded Linux and Kernel engineering
https://bootlin.com
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox