Netdev List
 help / color / mirror / Atom feed
* [PATCH net-next v4 09/20] zinc: Poly1305 x86_64 implementation
From: Jason A. Donenfeld @ 2018-09-14 16:22 UTC (permalink / raw)
  To: linux-kernel, netdev, linux-crypto, davem, gregkh
  Cc: Jason A. Donenfeld, Samuel Neves, Andy Lutomirski,
	Jean-Philippe Aumasson, Andy Polyakov, Thomas Gleixner,
	Ingo Molnar, x86
In-Reply-To: <20180914162240.7925-1-Jason@zx2c4.com>

This provides AVX, AVX-2, and AVX-512F implementations for Poly1305.
The AVX-512F implementation is disabled on Skylake, due to throttling.
These come from Andy Polyakov's implementation, with the following
modifications from Samuel Neves:

  - Some cosmetic changes, like renaming labels to .Lname, constants,
    and other Linux conventions.

  - CPU feature checking is done in C by the glue code, so that has been
    removed from the assembly.

  - poly1305_blocks_avx512 jumped into the middle of poly1305_blocks_avx2
    for the final blocks. To appease objtool, the relevant avx2 tail code
    was duplicated for the avx512 function.

  - The original uses %rbp as a scratch register. However, the kernel
    expects %rbp to be a valid frame pointer at any given time in order
    to do proper unwinding. Thus we need to alter the code in order to
    preserve it. The most straightforward manner in which this was
    accomplished was by replacing $d3, formerly %r10, by %rdi, and
    replacing %rbp by %r10. Because %rdi, a pointer to the context
    structure, does not change and is not used by poly1305_iteration,
    it is safe to use it here, and the overhead of saving and restoring
    it should be minimal.

  - The original hardcodes returns as .byte 0xf3,0xc3, aka "rep ret".
    We replace this by "ret". "rep ret" was meant to help with AMD K8
    chips, cf. http://repzret.org/p/repzret. It makes no sense to
    continue to use this kludge for code that won't even run on ancient
    AMD chips.

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Samuel Neves <sneves@dei.uc.pt>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Jean-Philippe Aumasson <jeanphilippe.aumasson@gmail.com>
Cc: Andy Polyakov <appro@openssl.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: x86@kernel.org
---
 lib/zinc/Makefile                        |    4 +
 lib/zinc/poly1305/poly1305-x86_64-glue.h |  109 +
 lib/zinc/poly1305/poly1305-x86_64.S      | 2792 ++++++++++++++++++++++
 3 files changed, 2905 insertions(+)
 create mode 100644 lib/zinc/poly1305/poly1305-x86_64-glue.h
 create mode 100644 lib/zinc/poly1305/poly1305-x86_64.S

diff --git a/lib/zinc/Makefile b/lib/zinc/Makefile
index f37df89a3f87..72112f8ffba1 100644
--- a/lib/zinc/Makefile
+++ b/lib/zinc/Makefile
@@ -25,6 +25,10 @@ endif
 
 ifeq ($(CONFIG_ZINC_POLY1305),y)
 zinc-y += poly1305/poly1305.o
+ifeq ($(CONFIG_ZINC_ARCH_X86_64),y)
+zinc-y += poly1305/poly1305-x86_64.o
+CFLAGS_poly1305.o += -include $(srctree)/$(src)/poly1305/poly1305-x86_64-glue.h
+endif
 ifeq ($(CONFIG_ZINC_ARCH_ARM),y)
 zinc-y += poly1305/poly1305-arm.o
 CFLAGS_poly1305.o += -include $(srctree)/$(src)/poly1305/poly1305-arm-glue.h
diff --git a/lib/zinc/poly1305/poly1305-x86_64-glue.h b/lib/zinc/poly1305/poly1305-x86_64-glue.h
new file mode 100644
index 000000000000..4ae028101e7c
--- /dev/null
+++ b/lib/zinc/poly1305/poly1305-x86_64-glue.h
@@ -0,0 +1,109 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <zinc/poly1305.h>
+#include <asm/cpufeature.h>
+#include <asm/processor.h>
+#include <asm/intel-family.h>
+
+asmlinkage void poly1305_init_x86_64(void *ctx,
+				     const u8 key[POLY1305_KEY_SIZE]);
+asmlinkage void poly1305_blocks_x86_64(void *ctx, const u8 *inp,
+				       const size_t len, const u32 padbit);
+asmlinkage void poly1305_emit_x86_64(void *ctx, u8 mac[POLY1305_MAC_SIZE],
+				     const u32 nonce[4]);
+#ifdef CONFIG_AS_AVX
+asmlinkage void poly1305_emit_avx(void *ctx, u8 mac[POLY1305_MAC_SIZE],
+				  const u32 nonce[4]);
+asmlinkage void poly1305_blocks_avx(void *ctx, const u8 *inp, const size_t len,
+				    const u32 padbit);
+#endif
+#ifdef CONFIG_AS_AVX2
+asmlinkage void poly1305_blocks_avx2(void *ctx, const u8 *inp, const size_t len,
+				     const u32 padbit);
+#endif
+#ifdef CONFIG_AS_AVX512
+asmlinkage void poly1305_blocks_avx512(void *ctx, const u8 *inp,
+				       const size_t len, const u32 padbit);
+#endif
+
+static bool poly1305_use_avx __ro_after_init;
+static bool poly1305_use_avx2 __ro_after_init;
+static bool poly1305_use_avx512 __ro_after_init;
+
+/*
+ * Probe the boot CPU once and record which vector implementations may be
+ * dispatched to. Each level requires the CPUID feature bits of all lower
+ * levels plus the matching extended-state components reported by
+ * cpu_has_xfeatures().
+ */
+void __init poly1305_fpu_init(void)
+{
+	const bool avx = boot_cpu_has(X86_FEATURE_AVX);
+	const bool avx2 = avx && boot_cpu_has(X86_FEATURE_AVX2);
+	const bool avx512f = avx2 && boot_cpu_has(X86_FEATURE_AVX512F);
+
+	poly1305_use_avx = avx &&
+		cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
+	poly1305_use_avx2 = avx2 &&
+		cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
+	poly1305_use_avx512 = avx512f &&
+		cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM |
+				  XFEATURE_MASK_AVX512, NULL) &&
+		/* Skylake downclocks unacceptably much when using zmm. */
+		boot_cpu_data.x86_model != INTEL_FAM6_SKYLAKE_X;
+}
+
+/*
+ * Key setup. Always handled by the scalar assembly routine, so simd_context is
+ * not consulted. Returns true to tell the caller the arch code did the work.
+ */
+static inline bool
+poly1305_init_arch(void *ctx, const u8 key[POLY1305_KEY_SIZE],
+		   simd_context_t simd_context)
+{
+	poly1305_init_x86_64(ctx, key);
+	return true;
+}
+
+/*
+ * View of the first 24 bytes of the opaque context, as used by the routines in
+ * poly1305-x86_64.S. The accumulator h is stored either as three 64-bit words
+ * (base 2^64, the scalar code) or as five 32-bit limbs (base 2^26, the vector
+ * code); the word at byte offset 20 is non-zero while h is in base 2^26.
+ */
+struct poly1305_x86_64_acc {
+	union {
+		struct {
+			u32 h[5];
+			u32 is_base2_26;
+		};
+		u64 hs[3];
+	};
+};
+
+/* Branch-free unsigned "a < b": evaluates to 1 when a < b, to 0 otherwise. */
+static inline u64 poly1305_x86_64_ult(const u64 a, const u64 b)
+{
+	return (a ^ ((a ^ b) | ((a - b) ^ b))) >> 63;
+}
+
+/*
+ * Rewrite an accumulator left in base 2^26 by the vector code into the base
+ * 2^64 form that poly1305_blocks_x86_64 expects. Does nothing when h already
+ * is in base 2^64.
+ */
+static inline void poly1305_x86_64_to_base2_64(void *ctx)
+{
+	struct poly1305_x86_64_acc *acc = ctx;
+	u64 h0, h1, h2, h3, h4, lo, mid, hi, fold;
+
+	if (!acc->is_base2_26)
+		return;
+
+	/* Read every limb before any 64-bit word is written over them. */
+	h0 = acc->h[0];
+	h1 = acc->h[1];
+	h2 = acc->h[2];
+	h3 = acc->h[3];
+	h4 = acc->h[4];
+
+	/* Limbs may hold a few excess bits; normalize h0..h3 to 26 bits. */
+	h1 += h0 >> 26;
+	h0 &= 0x3ffffff;
+	h2 += h1 >> 26;
+	h1 &= 0x3ffffff;
+	h3 += h2 >> 26;
+	h2 &= 0x3ffffff;
+	h4 += h3 >> 26;
+	h3 &= 0x3ffffff;
+
+	/* h = h0 + h1*2^26 + h2*2^52 + h3*2^78 + h4*2^104, repacked. */
+	lo = h0 | (h1 << 26) | (h2 << 52);
+	mid = (h2 >> 12) | (h3 << 14) | (h4 << 40);
+	hi = h4 >> 24;
+
+	/* Fold the bits at and above 2^130 back in: 2^130 = 5 (mod 2^130 - 5). */
+	fold = (hi >> 2) + (hi & ~3ULL);
+	hi &= 3;
+	lo += fold;
+	fold = poly1305_x86_64_ult(lo, fold);
+	mid += fold;
+	hi += poly1305_x86_64_ult(mid, fold);
+
+	/* hs[2] overlaps the flag word; hi is tiny, so the flag ends up zero. */
+	acc->hs[0] = lo;
+	acc->hs[1] = mid;
+	acc->hs[2] = hi;
+	acc->is_base2_26 = 0;
+}
+
+/*
+ * Absorb whole 16-byte blocks from inp into the accumulator in ctx, using the
+ * widest vector implementation that the CPU supports and that simd_context
+ * permits, otherwise the scalar one. Always returns true.
+ *
+ * The scalar routine never looks at the base 2^26 flag, so an accumulator
+ * that an earlier vector call left in base 2^26 is converted first. Without
+ * this, a caller that has SIMD for one call and not for the next would have
+ * the five 26-bit limbs misread as three 64-bit words.
+ */
+static inline bool poly1305_blocks_arch(void *ctx, const u8 *inp,
+					const size_t len, const u32 padbit,
+					simd_context_t simd_context)
+{
+#ifdef CONFIG_AS_AVX512
+	if (poly1305_use_avx512 && simd_context == HAVE_FULL_SIMD)
+		poly1305_blocks_avx512(ctx, inp, len, padbit);
+	else
+#endif
+#ifdef CONFIG_AS_AVX2
+	if (poly1305_use_avx2 && simd_context == HAVE_FULL_SIMD)
+		poly1305_blocks_avx2(ctx, inp, len, padbit);
+	else
+#endif
+#ifdef CONFIG_AS_AVX
+	if (poly1305_use_avx && simd_context == HAVE_FULL_SIMD)
+		poly1305_blocks_avx(ctx, inp, len, padbit);
+	else
+#endif
+	{
+		poly1305_x86_64_to_base2_64(ctx);
+		poly1305_blocks_x86_64(ctx, inp, len, padbit);
+	}
+	return true;
+}
+
+/*
+ * Finalize: write the 16-byte tag for the accumulator in ctx plus nonce to mac.
+ *
+ * poly1305_emit_avx uses general-purpose registers only, and tests the base
+ * 2^26 flag in the context itself before falling back to the base 2^64 path.
+ * It is therefore chosen whenever the AVX block functions may have run (each
+ * of poly1305_use_avx{2,512} implies poly1305_use_avx), whatever simd_context
+ * says: selecting poly1305_emit_x86_64 merely because SIMD is unavailable at
+ * finalization time would misread an accumulator left in base 2^26.
+ */
+static inline bool poly1305_emit_arch(void *ctx, u8 mac[POLY1305_MAC_SIZE],
+				      const u32 nonce[4],
+				      simd_context_t simd_context)
+{
+#ifdef CONFIG_AS_AVX
+	if (poly1305_use_avx)
+		poly1305_emit_avx(ctx, mac, nonce);
+	else
+#endif
+		poly1305_emit_x86_64(ctx, mac, nonce);
+	return true;
+}
+
+#define HAVE_POLY1305_ARCH_IMPLEMENTATION
diff --git a/lib/zinc/poly1305/poly1305-x86_64.S b/lib/zinc/poly1305/poly1305-x86_64.S
new file mode 100644
index 000000000000..26c852e3c769
--- /dev/null
+++ b/lib/zinc/poly1305/poly1305-x86_64.S
@@ -0,0 +1,2792 @@
+/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
+ *
+ * Copyright (C) 2017 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ * Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
+ *
+ * This is based in part on Andy Polyakov's implementation from CRYPTOGAMS.
+ */
+
+#include <linux/linkage.h>
+
+.section .rodata.cst192.Lconst, "aM", @progbits, 192	/* 32+32+32+32+64 = 192 bytes of read-only constants */
+.align	64
+.Lconst:
+.long	0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0	/* +0: 24-bit mask in each 64-bit lane */
+.long	16777216,0,16777216,0,16777216,0,16777216,0	/* +32: 1<<24 per lane, ORed into the top limb of each input block */
+.long	0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0	/* +64: 26-bit limb mask in each 64-bit lane */
+.long	2,2,2,3,2,0,2,1	/* +96: dword table loaded by the AVX2 code; its use is not visible in this excerpt */
+.long	0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7	/* +128: 64-byte dword table; consumer not visible in this excerpt */
+
+.text
+
+.align	32
+ENTRY(poly1305_init_x86_64)	/* (ctx=%rdi, key=%rsi): zero the accumulator and store the masked first 16 key bytes as r */
+	xorq	%rax,%rax
+	movq	%rax,0(%rdi)	/* clear the 24 bytes at ctx+0..23 */
+	movq	%rax,8(%rdi)
+	movq	%rax,16(%rdi)
+	/* a NULL key leaves r untouched and returns with %rax == 0 */
+	cmpq	$0,%rsi
+	je	.Lno_key
+	/* keep only the key bits selected by the two masks below */
+	movq	$0x0ffffffc0fffffff,%rax
+	movq	$0x0ffffffc0ffffffc,%rcx
+	andq	0(%rsi),%rax
+	andq	8(%rsi),%rcx
+	movq	%rax,24(%rdi)	/* r0 = key[0..7] & mask */
+	movq	%rcx,32(%rdi)	/* r1 = key[8..15] & mask; its low two bits are always zero */
+	movl	$1,%eax	/* %eax = 1 on this path; the C prototype is void, so it is not consumed from C */
+.Lno_key:
+	ret
+ENDPROC(poly1305_init_x86_64)
+
+.align	32
+ENTRY(poly1305_blocks_x86_64)	/* (ctx=%rdi, inp=%rsi, len=%rdx, padbit=%rcx): scalar path, h held as three 64-bit words */
+.Lblocks:
+	shrq	$4,%rdx	/* len / 16 = number of whole 16-byte blocks; a partial tail is ignored */
+	jz	.Lno_data
+	/* save the callee-saved registers used below; ctx is pushed last so it sits at 0(%rsp) */
+	pushq	%rbx
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+	pushq	%rdi
+
+.Lblocks_body:
+
+	movq	%rdx,%r15	/* r15 = remaining block count */
+
+	movq	24(%rdi),%r11	/* r11 = r0 */
+	movq	32(%rdi),%r13	/* r13 = r1 */
+
+	movq	0(%rdi),%r14	/* r14 = h0 */
+	movq	8(%rdi),%rbx	/* rbx = h1 */
+	movq	16(%rdi),%r10	/* r10 = h2 */
+
+	movq	%r13,%r12	/* r12 keeps r1 */
+	shrq	$2,%r13
+	movq	%r12,%rax	/* the first mulq of the loop expects r1 in %rax */
+	addq	%r12,%r13	/* r13 = s1 = r1 + (r1 >> 2) */
+	jmp	.Loop
+
+.align	32
+.Loop:
+	/* h += next 16-byte block, with padbit added into the top word */
+	addq	0(%rsi),%r14
+	adcq	8(%rsi),%rbx
+	leaq	16(%rsi),%rsi
+	adcq	%rcx,%r10
+	mulq	%r14	/* h0 * r1 */
+	movq	%rax,%r9
+	movq	%r11,%rax
+	movq	%rdx,%rdi	/* %rdi is reused as a scratch word; ctx is reloaded from the stack after the loop */
+
+	mulq	%r14	/* h0 * r0 */
+	movq	%rax,%r14
+	movq	%r11,%rax
+	movq	%rdx,%r8
+
+	mulq	%rbx	/* h1 * r0 */
+	addq	%rax,%r9
+	movq	%r13,%rax
+	adcq	%rdx,%rdi
+
+	mulq	%rbx	/* h1 * s1 */
+	movq	%r10,%rbx
+	addq	%rax,%r14
+	adcq	%rdx,%r8
+
+	imulq	%r13,%rbx	/* h2 * s1 */
+	addq	%rbx,%r9
+	movq	%r8,%rbx
+	adcq	$0,%rdi
+
+	imulq	%r11,%r10	/* h2 * r0 */
+	addq	%r9,%rbx
+	movq	$-4,%rax
+	adcq	%r10,%rdi
+	/* partial reduction: top word keeps its low 2 bits, (top & ~3) + (top >> 2) = 5 * (top >> 2) is added back at the bottom */
+	andq	%rdi,%rax
+	movq	%rdi,%r10
+	shrq	$2,%rdi
+	andq	$3,%r10
+	addq	%rdi,%rax
+	addq	%rax,%r14
+	adcq	$0,%rbx
+	adcq	$0,%r10
+
+	movq	%r12,%rax	/* reload r1 for the next iteration's first mulq */
+	decq	%r15
+	jnz	.Loop
+
+	movq	0(%rsp),%rdi	/* recover ctx saved by the last pushq */
+
+	movq	%r14,0(%rdi)	/* write h back as three 64-bit words */
+	movq	%rbx,8(%rdi)
+	movq	%r10,16(%rdi)
+	/* restore callee-saved registers and drop the 48-byte save area */
+	movq	8(%rsp),%r15
+	movq	16(%rsp),%r14
+	movq	24(%rsp),%r13
+	movq	32(%rsp),%r12
+	movq	40(%rsp),%rbx
+	leaq	48(%rsp),%rsp
+.Lno_data:
+.Lblocks_epilogue:
+	ret
+ENDPROC(poly1305_blocks_x86_64)
+
+.align	32
+ENTRY(poly1305_emit_x86_64)	/* (ctx=%rdi, mac=%rsi, nonce=%rdx): final reduction of h (three 64-bit words), add nonce, store 16 bytes */
+.Lemit:
+	movq	0(%rdi),%r8	/* r8 = h0 */
+	movq	8(%rdi),%r9	/* r9 = h1 */
+	movq	16(%rdi),%r10	/* r10 = h2 */
+	/* compute h + 5 in r8:r9:r10 while keeping the original low 128 bits in rax:rcx */
+	movq	%r8,%rax
+	addq	$5,%r8
+	movq	%r9,%rcx
+	adcq	$0,%r9
+	adcq	$0,%r10
+	shrq	$2,%r10	/* non-zero iff h + 5 reached 2^130; ZF from this shift drives both cmovs */
+	cmovnzq	%r8,%rax	/* then take the low 128 bits of h + 5 instead of h */
+	cmovnzq	%r9,%rcx
+	/* add the 128-bit nonce; the carry out of bit 127 is dropped */
+	addq	0(%rdx),%rax
+	adcq	8(%rdx),%rcx
+	movq	%rax,0(%rsi)
+	movq	%rcx,8(%rsi)
+
+	ret
+ENDPROC(poly1305_emit_x86_64)
+
+.macro __poly1305_block	/* h = h * r, partially reduced. In: h = %r14:%rbx:%r10, r0 = %r11, r1 = %rax, s1 = %r13. Clobbers %rax %rdx %rdi %r8 %r9 */
+	mulq	%r14	/* h0 * r1 */
+	movq	%rax,%r9
+	movq	%r11,%rax
+	movq	%rdx,%rdi	/* %rdi is used as scratch, so callers save and reload their ctx pointer around the macro */
+
+	mulq	%r14	/* h0 * r0 */
+	movq	%rax,%r14
+	movq	%r11,%rax
+	movq	%rdx,%r8
+
+	mulq	%rbx	/* h1 * r0 */
+	addq	%rax,%r9
+	movq	%r13,%rax
+	adcq	%rdx,%rdi
+
+	mulq	%rbx	/* h1 * s1 */
+	movq	%r10,%rbx
+	addq	%rax,%r14
+	adcq	%rdx,%r8
+
+	imulq	%r13,%rbx	/* h2 * s1 */
+	addq	%rbx,%r9
+	movq	%r8,%rbx
+	adcq	$0,%rdi
+
+	imulq	%r11,%r10	/* h2 * r0 */
+	addq	%r9,%rbx
+	movq	$-4,%rax
+	adcq	%r10,%rdi
+	/* partial reduction: keep the low 2 bits of the top word, add (top & ~3) + (top >> 2) = 5 * (top >> 2) at the bottom */
+	andq	%rdi,%rax
+	movq	%rdi,%r10
+	shrq	$2,%rdi
+	andq	$3,%r10
+	addq	%rdi,%rax
+	addq	%rax,%r14
+	adcq	$0,%rbx
+	adcq	$0,%r10
+.endm
+
+.macro __poly1305_init_avx	/* fill the table at ctx+48..ctx+191 with 26-bit limbs of r, r^2, r^3, r^4. In: r0 = %r11, r1 = %r12, s1 = %r13, ctx = %rdi; 0(%rsp) is used as a save slot for %rdi */
+	movq	%r11,%r14	/* start with h = r */
+	movq	%r12,%rbx
+	xorq	%r10,%r10
+
+	leaq	48+64(%rdi),%rdi	/* bias the pointer to ctx+112 so the table is addressed as -64..+76(%rdi) */
+
+	movq	%r12,%rax
+	movq	%rdi,0(%rsp)
+	__poly1305_block	/* h = r * r = r^2 */
+	movq	0(%rsp),%rdi
+	/* each 16-byte table row holds one limb for {r^2, r^1, r^4, r^3}; this pass fills the r^2 (from h) and r^1 (from %r11:%r12) columns */
+	movl	$0x3ffffff,%eax
+	movl	$0x3ffffff,%edx
+	movq	%r14,%r8
+	andl	%r14d,%eax
+	movq	%r11,%r9
+	andl	%r11d,%edx
+	movl	%eax,-64(%rdi)	/* limb 0 */
+	shrq	$26,%r8
+	movl	%edx,-60(%rdi)
+	shrq	$26,%r9
+
+	movl	$0x3ffffff,%eax
+	movl	$0x3ffffff,%edx
+	andl	%r8d,%eax
+	andl	%r9d,%edx
+	movl	%eax,-48(%rdi)	/* limb 1 */
+	leal	(%rax,%rax,4),%eax
+	movl	%edx,-44(%rdi)
+	leal	(%rdx,%rdx,4),%edx
+	movl	%eax,-32(%rdi)	/* 5 * limb 1 */
+	shrq	$26,%r8
+	movl	%edx,-28(%rdi)
+	shrq	$26,%r9
+
+	movq	%rbx,%rax
+	movq	%r12,%rdx
+	shlq	$12,%rax
+	shlq	$12,%rdx
+	orq	%r8,%rax
+	orq	%r9,%rdx
+	andl	$0x3ffffff,%eax
+	andl	$0x3ffffff,%edx
+	movl	%eax,-16(%rdi)	/* limb 2: 12 bits from the low word, 14 bits from the middle word */
+	leal	(%rax,%rax,4),%eax
+	movl	%edx,-12(%rdi)
+	leal	(%rdx,%rdx,4),%edx
+	movl	%eax,0(%rdi)	/* 5 * limb 2 */
+	movq	%rbx,%r8
+	movl	%edx,4(%rdi)
+	movq	%r12,%r9
+
+	movl	$0x3ffffff,%eax
+	movl	$0x3ffffff,%edx
+	shrq	$14,%r8
+	shrq	$14,%r9
+	andl	%r8d,%eax
+	andl	%r9d,%edx
+	movl	%eax,16(%rdi)	/* limb 3 */
+	leal	(%rax,%rax,4),%eax
+	movl	%edx,20(%rdi)
+	leal	(%rdx,%rdx,4),%edx
+	movl	%eax,32(%rdi)	/* 5 * limb 3 */
+	shrq	$26,%r8
+	movl	%edx,36(%rdi)
+	shrq	$26,%r9
+
+	movq	%r10,%rax
+	shlq	$24,%rax
+	orq	%rax,%r8
+	movl	%r8d,48(%rdi)	/* limb 4: top 24 bits of the middle word plus the top word shifted to bit 24 */
+	leaq	(%r8,%r8,4),%r8
+	movl	%r9d,52(%rdi)
+	leaq	(%r9,%r9,4),%r9
+	movl	%r8d,64(%rdi)	/* 5 * limb 4 */
+	movl	%r9d,68(%rdi)
+
+	movq	%r12,%rax
+	movq	%rdi,0(%rsp)
+	__poly1305_block	/* h = r^2 * r = r^3 */
+	movq	0(%rsp),%rdi
+	/* r^3 goes into the fourth dword of each row (offsets -52, -36, ..., 76) */
+	movl	$0x3ffffff,%eax
+	movq	%r14,%r8
+	andl	%r14d,%eax
+	shrq	$26,%r8
+	movl	%eax,-52(%rdi)
+
+	movl	$0x3ffffff,%edx
+	andl	%r8d,%edx
+	movl	%edx,-36(%rdi)
+	leal	(%rdx,%rdx,4),%edx
+	shrq	$26,%r8
+	movl	%edx,-20(%rdi)
+
+	movq	%rbx,%rax
+	shlq	$12,%rax
+	orq	%r8,%rax
+	andl	$0x3ffffff,%eax
+	movl	%eax,-4(%rdi)
+	leal	(%rax,%rax,4),%eax
+	movq	%rbx,%r8
+	movl	%eax,12(%rdi)
+
+	movl	$0x3ffffff,%edx
+	shrq	$14,%r8
+	andl	%r8d,%edx
+	movl	%edx,28(%rdi)
+	leal	(%rdx,%rdx,4),%edx
+	shrq	$26,%r8
+	movl	%edx,44(%rdi)
+
+	movq	%r10,%rax
+	shlq	$24,%rax
+	orq	%rax,%r8
+	movl	%r8d,60(%rdi)
+	leaq	(%r8,%r8,4),%r8
+	movl	%r8d,76(%rdi)
+
+	movq	%r12,%rax
+	movq	%rdi,0(%rsp)
+	__poly1305_block	/* h = r^3 * r = r^4 */
+	movq	0(%rsp),%rdi
+	/* r^4 goes into the third dword of each row (offsets -56, -40, ..., 72) */
+	movl	$0x3ffffff,%eax
+	movq	%r14,%r8
+	andl	%r14d,%eax
+	shrq	$26,%r8
+	movl	%eax,-56(%rdi)
+
+	movl	$0x3ffffff,%edx
+	andl	%r8d,%edx
+	movl	%edx,-40(%rdi)
+	leal	(%rdx,%rdx,4),%edx
+	shrq	$26,%r8
+	movl	%edx,-24(%rdi)
+
+	movq	%rbx,%rax
+	shlq	$12,%rax
+	orq	%r8,%rax
+	andl	$0x3ffffff,%eax
+	movl	%eax,-8(%rdi)
+	leal	(%rax,%rax,4),%eax
+	movq	%rbx,%r8
+	movl	%eax,8(%rdi)
+
+	movl	$0x3ffffff,%edx
+	shrq	$14,%r8
+	andl	%r8d,%edx
+	movl	%edx,24(%rdi)
+	leal	(%rdx,%rdx,4),%edx
+	shrq	$26,%r8
+	movl	%edx,40(%rdi)
+
+	movq	%r10,%rax
+	shlq	$24,%rax
+	orq	%rax,%r8
+	movl	%r8d,56(%rdi)
+	leaq	(%r8,%r8,4),%r8
+	movl	%r8d,72(%rdi)
+
+	leaq	-48-64(%rdi),%rdi	/* undo the bias: %rdi is ctx again */
+.endm
+
+#ifdef CONFIG_AS_AVX
+.align	32
+ENTRY(poly1305_blocks_avx)	/* (ctx=%rdi, inp=%rsi, len=%rdx, padbit=%rcx): 128-bit vector path, two blocks per lane pair, h kept as five 26-bit limbs */
+
+	movl	20(%rdi),%r8d	/* r8d = flag at ctx+20: non-zero while h is stored in base 2^26 */
+	cmpq	$128,%rdx
+	jae	.Lblocks_avx
+	testl	%r8d,%r8d
+	jz	.Lblocks	/* short input and h still in base 2^64: use the scalar routine */
+
+.Lblocks_avx:
+	andq	$-16,%rdx	/* round len down to whole blocks */
+	jz	.Lno_data_avx
+
+	vzeroupper
+
+	testl	%r8d,%r8d
+	jz	.Lbase2_64_avx	/* h not yet converted: convert it and build the power table first */
+
+	testq	$31,%rdx
+	jz	.Leven_avx	/* even number of blocks: go straight to the vector code */
+	/* odd number of blocks with h in base 2^26: absorb one block with the scalar multiplier first */
+	pushq	%rbx
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+	pushq	%rdi
+
+.Lblocks_avx_body:
+
+	movq	%rdx,%r15	/* r15 = remaining length in bytes */
+
+	movq	0(%rdi),%r8	/* r8 = limb0 | limb1 << 32 */
+	movq	8(%rdi),%r9	/* r9 = limb2 | limb3 << 32 */
+	movl	16(%rdi),%r10d	/* r10 = limb4 */
+
+	movq	24(%rdi),%r11	/* r11 = r0 */
+	movq	32(%rdi),%r13	/* r13 = r1 */
+
+	/* reassemble the five limbs into three 64-bit words in %r14:%rbx:%r10 */
+	movl	%r8d,%r14d
+	andq	$-2147483648,%r8
+	movq	%r9,%r12
+	movl	%r9d,%ebx
+	andq	$-2147483648,%r9
+
+	shrq	$6,%r8
+	shlq	$52,%r12
+	addq	%r8,%r14
+	shrq	$12,%rbx
+	shrq	$18,%r9
+	addq	%r12,%r14
+	adcq	%r9,%rbx
+
+	movq	%r10,%r8
+	shlq	$40,%r8
+	shrq	$24,%r10
+	addq	%r8,%rbx
+	adcq	$0,%r10
+	/* partial reduction of the top word, as in the scalar code */
+	movq	$-4,%r9
+	movq	%r10,%r8
+	andq	%r10,%r9
+	shrq	$2,%r8
+	andq	$3,%r10
+	addq	%r9,%r8
+	addq	%r8,%r14
+	adcq	$0,%rbx
+	adcq	$0,%r10
+
+	movq	%r13,%r12	/* r12 = r1 */
+	movq	%r13,%rax	/* rax = r1 for __poly1305_block */
+	shrq	$2,%r13
+	addq	%r12,%r13	/* r13 = s1 = r1 + (r1 >> 2) */
+
+	addq	0(%rsi),%r14	/* h += first block, padbit into the top word */
+	adcq	8(%rsi),%rbx
+	leaq	16(%rsi),%rsi
+	adcq	%rcx,%r10
+
+	movq	%rdi,0(%rsp)	/* the macro clobbers %rdi; keep ctx in the slot pushed above */
+	__poly1305_block
+	movq	0(%rsp),%rdi
+
+	testq	%rcx,%rcx
+	jz	.Lstore_base2_64_avx	/* padbit == 0: store h as 64-bit words and return. NOTE(review): remaining length is not consulted on this path; presumably padbit == 0 only comes with a single final block - confirm against callers */
+
+	/* split h back into five 26-bit limbs: %rax, %rdx, %r14, %rbx, %r10 */
+	movq	%r14,%rax
+	movq	%r14,%rdx
+	shrq	$52,%r14
+	movq	%rbx,%r11
+	movq	%rbx,%r12
+	shrq	$26,%rdx
+	andq	$0x3ffffff,%rax
+	shlq	$12,%r11
+	andq	$0x3ffffff,%rdx
+	shrq	$14,%rbx
+	orq	%r11,%r14
+	shlq	$24,%r10
+	andq	$0x3ffffff,%r14
+	shrq	$40,%r12
+	andq	$0x3ffffff,%rbx
+	orq	%r12,%r10
+
+	subq	$16,%r15
+	jz	.Lstore_base2_26_avx	/* that was the only block: store the limbs and return */
+
+	vmovd	%eax,%xmm0	/* hand the limbs to the vector code in %xmm0..%xmm4 */
+	vmovd	%edx,%xmm1
+	vmovd	%r14d,%xmm2
+	vmovd	%ebx,%xmm3
+	vmovd	%r10d,%xmm4
+	jmp	.Lproceed_avx
+
+.align	32
+.Lstore_base2_64_avx:
+	movq	%r14,0(%rdi)
+	movq	%rbx,8(%rdi)
+	movq	%r10,16(%rdi)	/* this 8-byte store also overwrites the flag word at ctx+20 */
+	jmp	.Ldone_avx
+
+.align	16
+.Lstore_base2_26_avx:
+	movl	%eax,0(%rdi)
+	movl	%edx,4(%rdi)
+	movl	%r14d,8(%rdi)
+	movl	%ebx,12(%rdi)
+	movl	%r10d,16(%rdi)
+.align	16
+.Ldone_avx:
+	movq	8(%rsp),%r15	/* restore callee-saved registers and drop the 48-byte save area */
+	movq	16(%rsp),%r14
+	movq	24(%rsp),%r13
+	movq	32(%rsp),%r12
+	movq	40(%rsp),%rbx
+	leaq	48(%rsp),%rsp
+
+.Lno_data_avx:
+.Lblocks_avx_epilogue:
+	ret
+
+.align	32
+.Lbase2_64_avx:
+	/* h is still three 64-bit words: optionally absorb one block, convert to 26-bit limbs, set the flag, build the r^1..r^4 table */
+	pushq	%rbx
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+	pushq	%rdi
+
+.Lbase2_64_avx_body:
+
+	movq	%rdx,%r15	/* r15 = remaining length in bytes */
+
+	movq	24(%rdi),%r11	/* r11 = r0 */
+	movq	32(%rdi),%r13	/* r13 = r1 */
+
+	movq	0(%rdi),%r14	/* h0 */
+	movq	8(%rdi),%rbx	/* h1 */
+	movl	16(%rdi),%r10d	/* h2 (32-bit load, zero-extended) */
+
+	movq	%r13,%r12	/* r12 = r1 */
+	movq	%r13,%rax
+	shrq	$2,%r13
+	addq	%r12,%r13	/* r13 = s1 = r1 + (r1 >> 2) */
+
+	testq	$31,%rdx
+	jz	.Linit_avx	/* even block count: no scalar block needed */
+
+	addq	0(%rsi),%r14	/* odd block count: absorb one block in scalar code */
+	adcq	8(%rsi),%rbx
+	leaq	16(%rsi),%rsi
+	adcq	%rcx,%r10
+	subq	$16,%r15
+
+	movq	%rdi,0(%rsp)
+	__poly1305_block
+	movq	0(%rsp),%rdi
+
+.Linit_avx:
+	/* split h into five 26-bit limbs: %rax, %rdx, %r14, %rbx, %r10 */
+	movq	%r14,%rax
+	movq	%r14,%rdx
+	shrq	$52,%r14
+	movq	%rbx,%r8
+	movq	%rbx,%r9
+	shrq	$26,%rdx
+	andq	$0x3ffffff,%rax
+	shlq	$12,%r8
+	andq	$0x3ffffff,%rdx
+	shrq	$14,%rbx
+	orq	%r8,%r14
+	shlq	$24,%r10
+	andq	$0x3ffffff,%r14
+	shrq	$40,%r9
+	andq	$0x3ffffff,%rbx
+	orq	%r9,%r10
+
+	vmovd	%eax,%xmm0
+	vmovd	%edx,%xmm1
+	vmovd	%r14d,%xmm2
+	vmovd	%ebx,%xmm3
+	vmovd	%r10d,%xmm4
+	movl	$1,20(%rdi)	/* mark the stored accumulator as base 2^26 from now on */
+
+	__poly1305_init_avx
+
+.Lproceed_avx:
+	movq	%r15,%rdx	/* rdx = bytes left for the vector code */
+
+	movq	8(%rsp),%r15
+	movq	16(%rsp),%r14
+	movq	24(%rsp),%r13
+	movq	32(%rsp),%r12
+	movq	40(%rsp),%rbx
+	leaq	48(%rsp),%rax	/* value is not read before %rax is rewritten in .Ldo_avx */
+	leaq	48(%rsp),%rsp
+
+.Lbase2_64_avx_epilogue:
+	jmp	.Ldo_avx
+
+
+.align	32
+.Leven_avx:
+	vmovd	0(%rdi),%xmm0	/* load the five stored limbs of h */
+	vmovd	4(%rdi),%xmm1
+	vmovd	8(%rdi),%xmm2
+	vmovd	12(%rdi),%xmm3
+	vmovd	16(%rdi),%xmm4
+
+.Ldo_avx:
+	leaq	8(%rsp),%r10	/* remember the incoming stack pointer (+8); undone by leaq -8(%r10),%rsp at the end */
+	andq	$-32,%rsp
+	subq	$8,%rsp
+	leaq	-88(%rsp),%r11	/* r11 = second base pointer into the scratch frame */
+	subq	$0x178,%rsp
+	subq	$64,%rdx	/* flags from this subq stay live down to the jbe below: lea/vmov/vp* do not write EFLAGS */
+	leaq	-32(%rsi),%rax
+	cmovcq	%rax,%rsi	/* fewer than 64 bytes left: step inp back so 32(%rsi) below addresses the first two blocks */
+
+	vmovdqu	48(%rdi),%xmm14	/* first row of the power table */
+	leaq	112(%rdi),%rdi	/* bias: table rows are now at -64..+64(%rdi), h at -112(%rdi) */
+	leaq	.Lconst(%rip),%rcx	/* padbit argument is no longer needed; %rcx now points at the constants */
+
+	vmovdqu	32(%rsi),%xmm5	/* load two input blocks */
+	vmovdqu	48(%rsi),%xmm6
+	vmovdqa	64(%rcx),%xmm15	/* xmm15 = 26-bit limb mask */
+	/* split the two blocks into five 26-bit limbs each, in %xmm5..%xmm9 */
+	vpsrldq	$6,%xmm5,%xmm7
+	vpsrldq	$6,%xmm6,%xmm8
+	vpunpckhqdq	%xmm6,%xmm5,%xmm9
+	vpunpcklqdq	%xmm6,%xmm5,%xmm5
+	vpunpcklqdq	%xmm8,%xmm7,%xmm8
+
+	vpsrlq	$40,%xmm9,%xmm9
+	vpsrlq	$26,%xmm5,%xmm6
+	vpand	%xmm15,%xmm5,%xmm5
+	vpsrlq	$4,%xmm8,%xmm7
+	vpand	%xmm15,%xmm6,%xmm6
+	vpsrlq	$30,%xmm8,%xmm8
+	vpand	%xmm15,%xmm7,%xmm7
+	vpand	%xmm15,%xmm8,%xmm8
+	vpor	32(%rcx),%xmm9,%xmm9	/* set bit 24 of the top limb (2^128) in both blocks */
+
+	jbe	.Lskip_loop_avx	/* at most 64 bytes in total: no main-loop iteration */
+
+	/* spread the table rows over the stack: shuffle 0x44 repeats the low qword of a row, 0xEE the high qword */
+	vmovdqu	-48(%rdi),%xmm11
+	vmovdqu	-32(%rdi),%xmm12
+	vpshufd	$0xEE,%xmm14,%xmm13
+	vpshufd	$0x44,%xmm14,%xmm10
+	vmovdqa	%xmm13,-144(%r11)
+	vmovdqa	%xmm10,0(%rsp)
+	vpshufd	$0xEE,%xmm11,%xmm14
+	vmovdqu	-16(%rdi),%xmm10
+	vpshufd	$0x44,%xmm11,%xmm11
+	vmovdqa	%xmm14,-128(%r11)
+	vmovdqa	%xmm11,16(%rsp)
+	vpshufd	$0xEE,%xmm12,%xmm13
+	vmovdqu	0(%rdi),%xmm11
+	vpshufd	$0x44,%xmm12,%xmm12
+	vmovdqa	%xmm13,-112(%r11)
+	vmovdqa	%xmm12,32(%rsp)
+	vpshufd	$0xEE,%xmm10,%xmm14
+	vmovdqu	16(%rdi),%xmm12
+	vpshufd	$0x44,%xmm10,%xmm10
+	vmovdqa	%xmm14,-96(%r11)
+	vmovdqa	%xmm10,48(%rsp)
+	vpshufd	$0xEE,%xmm11,%xmm13
+	vmovdqu	32(%rdi),%xmm10
+	vpshufd	$0x44,%xmm11,%xmm11
+	vmovdqa	%xmm13,-80(%r11)
+	vmovdqa	%xmm11,64(%rsp)
+	vpshufd	$0xEE,%xmm12,%xmm14
+	vmovdqu	48(%rdi),%xmm11
+	vpshufd	$0x44,%xmm12,%xmm12
+	vmovdqa	%xmm14,-64(%r11)
+	vmovdqa	%xmm12,80(%rsp)
+	vpshufd	$0xEE,%xmm10,%xmm13
+	vmovdqu	64(%rdi),%xmm12
+	vpshufd	$0x44,%xmm10,%xmm10
+	vmovdqa	%xmm13,-48(%r11)
+	vmovdqa	%xmm10,96(%rsp)
+	vpshufd	$0xEE,%xmm11,%xmm14
+	vpshufd	$0x44,%xmm11,%xmm11
+	vmovdqa	%xmm14,-32(%r11)
+	vmovdqa	%xmm11,112(%rsp)
+	vpshufd	$0xEE,%xmm12,%xmm13
+	vmovdqa	0(%rsp),%xmm14
+	vpshufd	$0x44,%xmm12,%xmm12
+	vmovdqa	%xmm13,-16(%r11)
+	vmovdqa	%xmm12,128(%rsp)
+
+	jmp	.Loop_avx
+
+.align	32
+.Loop_avx:
+	/* main loop, 64 input bytes per iteration; first multiply the block pair in %xmm5..%xmm9 by the rows stored at 0..128(%rsp) */
+	vpmuludq	%xmm5,%xmm14,%xmm10
+	vpmuludq	%xmm6,%xmm14,%xmm11
+	vmovdqa	%xmm2,32(%r11)	/* spill the accumulator limbs %xmm0..%xmm4 to 0..64(%r11) along the way */
+	vpmuludq	%xmm7,%xmm14,%xmm12
+	vmovdqa	16(%rsp),%xmm2
+	vpmuludq	%xmm8,%xmm14,%xmm13
+	vpmuludq	%xmm9,%xmm14,%xmm14
+
+	vmovdqa	%xmm0,0(%r11)
+	vpmuludq	32(%rsp),%xmm9,%xmm0
+	vmovdqa	%xmm1,16(%r11)
+	vpmuludq	%xmm8,%xmm2,%xmm1
+	vpaddq	%xmm0,%xmm10,%xmm10
+	vpaddq	%xmm1,%xmm14,%xmm14
+	vmovdqa	%xmm3,48(%r11)
+	vpmuludq	%xmm7,%xmm2,%xmm0
+	vpmuludq	%xmm6,%xmm2,%xmm1
+	vpaddq	%xmm0,%xmm13,%xmm13
+	vmovdqa	48(%rsp),%xmm3
+	vpaddq	%xmm1,%xmm12,%xmm12
+	vmovdqa	%xmm4,64(%r11)
+	vpmuludq	%xmm5,%xmm2,%xmm2
+	vpmuludq	%xmm7,%xmm3,%xmm0
+	vpaddq	%xmm2,%xmm11,%xmm11
+
+	vmovdqa	64(%rsp),%xmm4
+	vpaddq	%xmm0,%xmm14,%xmm14
+	vpmuludq	%xmm6,%xmm3,%xmm1
+	vpmuludq	%xmm5,%xmm3,%xmm3
+	vpaddq	%xmm1,%xmm13,%xmm13
+	vmovdqa	80(%rsp),%xmm2
+	vpaddq	%xmm3,%xmm12,%xmm12
+	vpmuludq	%xmm9,%xmm4,%xmm0
+	vpmuludq	%xmm8,%xmm4,%xmm4
+	vpaddq	%xmm0,%xmm11,%xmm11
+	vmovdqa	96(%rsp),%xmm3
+	vpaddq	%xmm4,%xmm10,%xmm10
+
+	vmovdqa	128(%rsp),%xmm4
+	vpmuludq	%xmm6,%xmm2,%xmm1
+	vpmuludq	%xmm5,%xmm2,%xmm2
+	vpaddq	%xmm1,%xmm14,%xmm14
+	vpaddq	%xmm2,%xmm13,%xmm13
+	vpmuludq	%xmm9,%xmm3,%xmm0
+	vpmuludq	%xmm8,%xmm3,%xmm1
+	vpaddq	%xmm0,%xmm12,%xmm12
+	vmovdqu	0(%rsi),%xmm0	/* start loading the next block pair while the products are summed */
+	vpaddq	%xmm1,%xmm11,%xmm11
+	vpmuludq	%xmm7,%xmm3,%xmm3
+	vpmuludq	%xmm7,%xmm4,%xmm7
+	vpaddq	%xmm3,%xmm10,%xmm10
+
+	vmovdqu	16(%rsi),%xmm1
+	vpaddq	%xmm7,%xmm11,%xmm11
+	vpmuludq	%xmm8,%xmm4,%xmm8
+	vpmuludq	%xmm9,%xmm4,%xmm9
+	vpsrldq	$6,%xmm0,%xmm2
+	vpaddq	%xmm8,%xmm12,%xmm12
+	vpaddq	%xmm9,%xmm13,%xmm13
+	vpsrldq	$6,%xmm1,%xmm3
+	vpmuludq	112(%rsp),%xmm5,%xmm9
+	vpmuludq	%xmm6,%xmm4,%xmm5
+	vpunpckhqdq	%xmm1,%xmm0,%xmm4
+	vpaddq	%xmm9,%xmm14,%xmm14
+	vmovdqa	-144(%r11),%xmm9
+	vpaddq	%xmm5,%xmm10,%xmm10
+
+	vpunpcklqdq	%xmm1,%xmm0,%xmm0
+	vpunpcklqdq	%xmm3,%xmm2,%xmm3
+
+	/* split the newly loaded pair into limbs %xmm0..%xmm4 */
+	vpsrldq	$5,%xmm4,%xmm4
+	vpsrlq	$26,%xmm0,%xmm1
+	vpand	%xmm15,%xmm0,%xmm0
+	vpsrlq	$4,%xmm3,%xmm2
+	vpand	%xmm15,%xmm1,%xmm1
+	vpand	0(%rcx),%xmm4,%xmm4
+	vpsrlq	$30,%xmm3,%xmm3
+	vpand	%xmm15,%xmm2,%xmm2
+	vpand	%xmm15,%xmm3,%xmm3
+	vpor	32(%rcx),%xmm4,%xmm4	/* bit 24 of the top limb, as for the first pair */
+
+	vpaddq	0(%r11),%xmm0,%xmm0	/* add the spilled accumulator limbs to this pair */
+	vpaddq	16(%r11),%xmm1,%xmm1
+	vpaddq	32(%r11),%xmm2,%xmm2
+	vpaddq	48(%r11),%xmm3,%xmm3
+	vpaddq	64(%r11),%xmm4,%xmm4
+
+	leaq	32(%rsi),%rax
+	leaq	64(%rsi),%rsi
+	subq	$64,%rdx	/* flags from this subq feed the cmovc below and the ja at the bottom of the loop */
+	cmovcq	%rax,%rsi	/* fewer than 64 bytes left: advance by 32 only */
+	/* multiply (accumulator + pair) by the rows stored at -144..-16(%r11) and add to the running sums */
+	vpmuludq	%xmm0,%xmm9,%xmm5
+	vpmuludq	%xmm1,%xmm9,%xmm6
+	vpaddq	%xmm5,%xmm10,%xmm10
+	vpaddq	%xmm6,%xmm11,%xmm11
+	vmovdqa	-128(%r11),%xmm7
+	vpmuludq	%xmm2,%xmm9,%xmm5
+	vpmuludq	%xmm3,%xmm9,%xmm6
+	vpaddq	%xmm5,%xmm12,%xmm12
+	vpaddq	%xmm6,%xmm13,%xmm13
+	vpmuludq	%xmm4,%xmm9,%xmm9
+	vpmuludq	-112(%r11),%xmm4,%xmm5
+	vpaddq	%xmm9,%xmm14,%xmm14
+
+	vpaddq	%xmm5,%xmm10,%xmm10
+	vpmuludq	%xmm2,%xmm7,%xmm6
+	vpmuludq	%xmm3,%xmm7,%xmm5
+	vpaddq	%xmm6,%xmm13,%xmm13
+	vmovdqa	-96(%r11),%xmm8
+	vpaddq	%xmm5,%xmm14,%xmm14
+	vpmuludq	%xmm1,%xmm7,%xmm6
+	vpmuludq	%xmm0,%xmm7,%xmm7
+	vpaddq	%xmm6,%xmm12,%xmm12
+	vpaddq	%xmm7,%xmm11,%xmm11
+
+	vmovdqa	-80(%r11),%xmm9
+	vpmuludq	%xmm2,%xmm8,%xmm5
+	vpmuludq	%xmm1,%xmm8,%xmm6
+	vpaddq	%xmm5,%xmm14,%xmm14
+	vpaddq	%xmm6,%xmm13,%xmm13
+	vmovdqa	-64(%r11),%xmm7
+	vpmuludq	%xmm0,%xmm8,%xmm8
+	vpmuludq	%xmm4,%xmm9,%xmm5
+	vpaddq	%xmm8,%xmm12,%xmm12
+	vpaddq	%xmm5,%xmm11,%xmm11
+	vmovdqa	-48(%r11),%xmm8
+	vpmuludq	%xmm3,%xmm9,%xmm9
+	vpmuludq	%xmm1,%xmm7,%xmm6
+	vpaddq	%xmm9,%xmm10,%xmm10
+
+	vmovdqa	-16(%r11),%xmm9
+	vpaddq	%xmm6,%xmm14,%xmm14
+	vpmuludq	%xmm0,%xmm7,%xmm7
+	vpmuludq	%xmm4,%xmm8,%xmm5
+	vpaddq	%xmm7,%xmm13,%xmm13
+	vpaddq	%xmm5,%xmm12,%xmm12
+	vmovdqu	32(%rsi),%xmm5	/* load the block pair for the next iteration (or the tail) */
+	vpmuludq	%xmm3,%xmm8,%xmm7
+	vpmuludq	%xmm2,%xmm8,%xmm8
+	vpaddq	%xmm7,%xmm11,%xmm11
+	vmovdqu	48(%rsi),%xmm6
+	vpaddq	%xmm8,%xmm10,%xmm10
+
+	vpmuludq	%xmm2,%xmm9,%xmm2
+	vpmuludq	%xmm3,%xmm9,%xmm3
+	vpsrldq	$6,%xmm5,%xmm7
+	vpaddq	%xmm2,%xmm11,%xmm11
+	vpmuludq	%xmm4,%xmm9,%xmm4
+	vpsrldq	$6,%xmm6,%xmm8
+	vpaddq	%xmm3,%xmm12,%xmm2
+	vpaddq	%xmm4,%xmm13,%xmm3
+	vpmuludq	-32(%r11),%xmm0,%xmm4
+	vpmuludq	%xmm1,%xmm9,%xmm0
+	vpunpckhqdq	%xmm6,%xmm5,%xmm9
+	vpaddq	%xmm4,%xmm14,%xmm4
+	vpaddq	%xmm0,%xmm10,%xmm0
+
+	vpunpcklqdq	%xmm6,%xmm5,%xmm5
+	vpunpcklqdq	%xmm8,%xmm7,%xmm8
+
+	/* split that pair into limbs %xmm5..%xmm9 */
+	vpsrldq	$5,%xmm9,%xmm9
+	vpsrlq	$26,%xmm5,%xmm6
+	vmovdqa	0(%rsp),%xmm14	/* reload the first stored row for the next iteration */
+	vpand	%xmm15,%xmm5,%xmm5
+	vpsrlq	$4,%xmm8,%xmm7
+	vpand	%xmm15,%xmm6,%xmm6
+	vpand	0(%rcx),%xmm9,%xmm9
+	vpsrlq	$30,%xmm8,%xmm8
+	vpand	%xmm15,%xmm7,%xmm7
+	vpand	%xmm15,%xmm8,%xmm8
+	vpor	32(%rcx),%xmm9,%xmm9
+	/* carry propagation over the accumulator limbs %xmm0..%xmm4; the carry out of limb 4 re-enters limb 0 times 5 */
+	vpsrlq	$26,%xmm3,%xmm13
+	vpand	%xmm15,%xmm3,%xmm3
+	vpaddq	%xmm13,%xmm4,%xmm4
+
+	vpsrlq	$26,%xmm0,%xmm10
+	vpand	%xmm15,%xmm0,%xmm0
+	vpaddq	%xmm10,%xmm11,%xmm1
+
+	vpsrlq	$26,%xmm4,%xmm10
+	vpand	%xmm15,%xmm4,%xmm4
+
+	vpsrlq	$26,%xmm1,%xmm11
+	vpand	%xmm15,%xmm1,%xmm1
+	vpaddq	%xmm11,%xmm2,%xmm2
+
+	vpaddq	%xmm10,%xmm0,%xmm0	/* limb0 += carry ... */
+	vpsllq	$2,%xmm10,%xmm10
+	vpaddq	%xmm10,%xmm0,%xmm0	/* ... + 4 * carry, i.e. 5 * carry in total */
+
+	vpsrlq	$26,%xmm2,%xmm12
+	vpand	%xmm15,%xmm2,%xmm2
+	vpaddq	%xmm12,%xmm3,%xmm3
+
+	vpsrlq	$26,%xmm0,%xmm10
+	vpand	%xmm15,%xmm0,%xmm0
+	vpaddq	%xmm10,%xmm1,%xmm1
+
+	vpsrlq	$26,%xmm3,%xmm13
+	vpand	%xmm15,%xmm3,%xmm3
+	vpaddq	%xmm13,%xmm4,%xmm4
+
+	ja	.Loop_avx	/* flags still come from the subq $64,%rdx above; only vector instructions ran in between */
+
+.Lskip_loop_avx:
+	vpshufd	$0x10,%xmm14,%xmm14
+	addq	$32,%rdx	/* zero iff exactly one 32-byte pair remains; ZF is tested here and again at the jz further down */
+	jnz	.Long_tail_avx
+
+	vpaddq	%xmm2,%xmm7,%xmm7	/* last pair: fold the accumulator into it before the final multiply */
+	vpaddq	%xmm0,%xmm5,%xmm5
+	vpaddq	%xmm1,%xmm6,%xmm6
+	vpaddq	%xmm3,%xmm8,%xmm8
+	vpaddq	%xmm4,%xmm9,%xmm9
+
+.Long_tail_avx:
+	vmovdqa	%xmm2,32(%r11)	/* spill the accumulator limbs; they are re-read at 0..64(%r11) when a further pair follows */
+	vmovdqa	%xmm0,0(%r11)
+	vmovdqa	%xmm1,16(%r11)
+	vmovdqa	%xmm3,48(%r11)
+	vmovdqa	%xmm4,64(%r11)
+	/* multiply the pair in %xmm5..%xmm9 by table rows shuffled with 0x10 (row dwords 0 and 1 into the two lanes) */
+	vpmuludq	%xmm7,%xmm14,%xmm12
+	vpmuludq	%xmm5,%xmm14,%xmm10
+	vpshufd	$0x10,-48(%rdi),%xmm2
+	vpmuludq	%xmm6,%xmm14,%xmm11
+	vpmuludq	%xmm8,%xmm14,%xmm13
+	vpmuludq	%xmm9,%xmm14,%xmm14
+
+	vpmuludq	%xmm8,%xmm2,%xmm0
+	vpaddq	%xmm0,%xmm14,%xmm14
+	vpshufd	$0x10,-32(%rdi),%xmm3
+	vpmuludq	%xmm7,%xmm2,%xmm1
+	vpaddq	%xmm1,%xmm13,%xmm13
+	vpshufd	$0x10,-16(%rdi),%xmm4
+	vpmuludq	%xmm6,%xmm2,%xmm0
+	vpaddq	%xmm0,%xmm12,%xmm12
+	vpmuludq	%xmm5,%xmm2,%xmm2
+	vpaddq	%xmm2,%xmm11,%xmm11
+	vpmuludq	%xmm9,%xmm3,%xmm3
+	vpaddq	%xmm3,%xmm10,%xmm10
+
+	vpshufd	$0x10,0(%rdi),%xmm2
+	vpmuludq	%xmm7,%xmm4,%xmm1
+	vpaddq	%xmm1,%xmm14,%xmm14
+	vpmuludq	%xmm6,%xmm4,%xmm0
+	vpaddq	%xmm0,%xmm13,%xmm13
+	vpshufd	$0x10,16(%rdi),%xmm3
+	vpmuludq	%xmm5,%xmm4,%xmm4
+	vpaddq	%xmm4,%xmm12,%xmm12
+	vpmuludq	%xmm9,%xmm2,%xmm1
+	vpaddq	%xmm1,%xmm11,%xmm11
+	vpshufd	$0x10,32(%rdi),%xmm4
+	vpmuludq	%xmm8,%xmm2,%xmm2
+	vpaddq	%xmm2,%xmm10,%xmm10
+
+	vpmuludq	%xmm6,%xmm3,%xmm0
+	vpaddq	%xmm0,%xmm14,%xmm14
+	vpmuludq	%xmm5,%xmm3,%xmm3
+	vpaddq	%xmm3,%xmm13,%xmm13
+	vpshufd	$0x10,48(%rdi),%xmm2
+	vpmuludq	%xmm9,%xmm4,%xmm1
+	vpaddq	%xmm1,%xmm12,%xmm12
+	vpshufd	$0x10,64(%rdi),%xmm3
+	vpmuludq	%xmm8,%xmm4,%xmm0
+	vpaddq	%xmm0,%xmm11,%xmm11
+	vpmuludq	%xmm7,%xmm4,%xmm4
+	vpaddq	%xmm4,%xmm10,%xmm10
+
+	vpmuludq	%xmm5,%xmm2,%xmm2
+	vpaddq	%xmm2,%xmm14,%xmm14
+	vpmuludq	%xmm9,%xmm3,%xmm1
+	vpaddq	%xmm1,%xmm13,%xmm13
+	vpmuludq	%xmm8,%xmm3,%xmm0
+	vpaddq	%xmm0,%xmm12,%xmm12
+	vpmuludq	%xmm7,%xmm3,%xmm1
+	vpaddq	%xmm1,%xmm11,%xmm11
+	vpmuludq	%xmm6,%xmm3,%xmm3
+	vpaddq	%xmm3,%xmm10,%xmm10
+
+	jz	.Lshort_tail_avx	/* ZF still comes from addq $32,%rdx above: no second pair to fold in */
+
+	vmovdqu	0(%rsi),%xmm0	/* load and split the remaining pair */
+	vmovdqu	16(%rsi),%xmm1
+
+	vpsrldq	$6,%xmm0,%xmm2
+	vpsrldq	$6,%xmm1,%xmm3
+	vpunpckhqdq	%xmm1,%xmm0,%xmm4
+	vpunpcklqdq	%xmm1,%xmm0,%xmm0
+	vpunpcklqdq	%xmm3,%xmm2,%xmm3
+
+	vpsrlq	$40,%xmm4,%xmm4
+	vpsrlq	$26,%xmm0,%xmm1
+	vpand	%xmm15,%xmm0,%xmm0
+	vpsrlq	$4,%xmm3,%xmm2
+	vpand	%xmm15,%xmm1,%xmm1
+	vpsrlq	$30,%xmm3,%xmm3
+	vpand	%xmm15,%xmm2,%xmm2
+	vpand	%xmm15,%xmm3,%xmm3
+	vpor	32(%rcx),%xmm4,%xmm4
+
+	vpshufd	$0x32,-64(%rdi),%xmm9	/* 0x32 selects row dwords 2 and 3 into the two lanes */
+	vpaddq	0(%r11),%xmm0,%xmm0	/* add the spilled accumulator limbs */
+	vpaddq	16(%r11),%xmm1,%xmm1
+	vpaddq	32(%r11),%xmm2,%xmm2
+	vpaddq	48(%r11),%xmm3,%xmm3
+	vpaddq	64(%r11),%xmm4,%xmm4
+
+	vpmuludq	%xmm0,%xmm9,%xmm5
+	vpaddq	%xmm5,%xmm10,%xmm10
+	vpmuludq	%xmm1,%xmm9,%xmm6
+	vpaddq	%xmm6,%xmm11,%xmm11
+	vpmuludq	%xmm2,%xmm9,%xmm5
+	vpaddq	%xmm5,%xmm12,%xmm12
+	vpshufd	$0x32,-48(%rdi),%xmm7
+	vpmuludq	%xmm3,%xmm9,%xmm6
+	vpaddq	%xmm6,%xmm13,%xmm13
+	vpmuludq	%xmm4,%xmm9,%xmm9
+	vpaddq	%xmm9,%xmm14,%xmm14
+
+	vpmuludq	%xmm3,%xmm7,%xmm5
+	vpaddq	%xmm5,%xmm14,%xmm14
+	vpshufd	$0x32,-32(%rdi),%xmm8
+	vpmuludq	%xmm2,%xmm7,%xmm6
+	vpaddq	%xmm6,%xmm13,%xmm13
+	vpshufd	$0x32,-16(%rdi),%xmm9
+	vpmuludq	%xmm1,%xmm7,%xmm5
+	vpaddq	%xmm5,%xmm12,%xmm12
+	vpmuludq	%xmm0,%xmm7,%xmm7
+	vpaddq	%xmm7,%xmm11,%xmm11
+	vpmuludq	%xmm4,%xmm8,%xmm8
+	vpaddq	%xmm8,%xmm10,%xmm10
+
+	vpshufd	$0x32,0(%rdi),%xmm7
+	vpmuludq	%xmm2,%xmm9,%xmm6
+	vpaddq	%xmm6,%xmm14,%xmm14
+	vpmuludq	%xmm1,%xmm9,%xmm5
+	vpaddq	%xmm5,%xmm13,%xmm13
+	vpshufd	$0x32,16(%rdi),%xmm8
+	vpmuludq	%xmm0,%xmm9,%xmm9
+	vpaddq	%xmm9,%xmm12,%xmm12
+	vpmuludq	%xmm4,%xmm7,%xmm6
+	vpaddq	%xmm6,%xmm11,%xmm11
+	vpshufd	$0x32,32(%rdi),%xmm9
+	vpmuludq	%xmm3,%xmm7,%xmm7
+	vpaddq	%xmm7,%xmm10,%xmm10
+
+	vpmuludq	%xmm1,%xmm8,%xmm5
+	vpaddq	%xmm5,%xmm14,%xmm14
+	vpmuludq	%xmm0,%xmm8,%xmm8
+	vpaddq	%xmm8,%xmm13,%xmm13
+	vpshufd	$0x32,48(%rdi),%xmm7
+	vpmuludq	%xmm4,%xmm9,%xmm6
+	vpaddq	%xmm6,%xmm12,%xmm12
+	vpshufd	$0x32,64(%rdi),%xmm8
+	vpmuludq	%xmm3,%xmm9,%xmm5
+	vpaddq	%xmm5,%xmm11,%xmm11
+	vpmuludq	%xmm2,%xmm9,%xmm9
+	vpaddq	%xmm9,%xmm10,%xmm10
+
+	vpmuludq	%xmm0,%xmm7,%xmm7
+	vpaddq	%xmm7,%xmm14,%xmm14
+	vpmuludq	%xmm4,%xmm8,%xmm6
+	vpaddq	%xmm6,%xmm13,%xmm13
+	vpmuludq	%xmm3,%xmm8,%xmm5
+	vpaddq	%xmm5,%xmm12,%xmm12
+	vpmuludq	%xmm2,%xmm8,%xmm6
+	vpaddq	%xmm6,%xmm11,%xmm11
+	vpmuludq	%xmm1,%xmm8,%xmm8
+	vpaddq	%xmm8,%xmm10,%xmm10
+
+.Lshort_tail_avx:
+	/* horizontal step: add the high 64-bit lane of every limb sum to its low lane */
+	vpsrldq	$8,%xmm14,%xmm9
+	vpsrldq	$8,%xmm13,%xmm8
+	vpsrldq	$8,%xmm11,%xmm6
+	vpsrldq	$8,%xmm10,%xmm5
+	vpsrldq	$8,%xmm12,%xmm7
+	vpaddq	%xmm8,%xmm13,%xmm13
+	vpaddq	%xmm9,%xmm14,%xmm14
+	vpaddq	%xmm5,%xmm10,%xmm10
+	vpaddq	%xmm6,%xmm11,%xmm11
+	vpaddq	%xmm7,%xmm12,%xmm12
+	/* carry propagation over limbs %xmm10..%xmm14, same pattern as in the main loop */
+	vpsrlq	$26,%xmm13,%xmm3
+	vpand	%xmm15,%xmm13,%xmm13
+	vpaddq	%xmm3,%xmm14,%xmm14
+
+	vpsrlq	$26,%xmm10,%xmm0
+	vpand	%xmm15,%xmm10,%xmm10
+	vpaddq	%xmm0,%xmm11,%xmm11
+
+	vpsrlq	$26,%xmm14,%xmm4
+	vpand	%xmm15,%xmm14,%xmm14
+
+	vpsrlq	$26,%xmm11,%xmm1
+	vpand	%xmm15,%xmm11,%xmm11
+	vpaddq	%xmm1,%xmm12,%xmm12
+
+	vpaddq	%xmm4,%xmm10,%xmm10	/* limb0 += carry out of limb 4 ... */
+	vpsllq	$2,%xmm4,%xmm4
+	vpaddq	%xmm4,%xmm10,%xmm10	/* ... + 4 * carry, i.e. 5 * carry in total */
+
+	vpsrlq	$26,%xmm12,%xmm2
+	vpand	%xmm15,%xmm12,%xmm12
+	vpaddq	%xmm2,%xmm13,%xmm13
+
+	vpsrlq	$26,%xmm10,%xmm0
+	vpand	%xmm15,%xmm10,%xmm10
+	vpaddq	%xmm0,%xmm11,%xmm11
+
+	vpsrlq	$26,%xmm13,%xmm3
+	vpand	%xmm15,%xmm13,%xmm13
+	vpaddq	%xmm3,%xmm14,%xmm14
+
+	vmovd	%xmm10,-112(%rdi)	/* %rdi is ctx+112 here, so these five stores write the limbs of h to ctx+0..ctx+19 */
+	vmovd	%xmm11,-108(%rdi)
+	vmovd	%xmm12,-104(%rdi)
+	vmovd	%xmm13,-100(%rdi)
+	vmovd	%xmm14,-96(%rdi)
+	leaq	-8(%r10),%rsp	/* release the aligned scratch frame: back to the stack pointer seen at .Ldo_avx */
+
+	vzeroupper
+	ret
+ENDPROC(poly1305_blocks_avx)
+
+.align	32
+ENTRY(poly1305_emit_avx)	/* (ctx=%rdi, mac=%rsi, nonce=%rdx): as poly1305_emit_x86_64, but also accepts h stored as five 26-bit limbs; general-purpose registers only */
+	cmpl	$0,20(%rdi)	/* flag at ctx+20 clear: h is three 64-bit words, use the scalar emit code */
+	je	.Lemit
+
+	movl	0(%rdi),%eax	/* load the five limbs (zero-extended) */
+	movl	4(%rdi),%ecx
+	movl	8(%rdi),%r8d
+	movl	12(%rdi),%r11d
+	movl	16(%rdi),%r10d
+	/* low word %r8 = limb0 + (limb1 << 26) + (limb2 << 52); the carry and limb2 >> 12 go to %r9 */
+	shlq	$26,%rcx
+	movq	%r8,%r9
+	shlq	$52,%r8
+	addq	%rcx,%rax
+	shrq	$12,%r9
+	addq	%rax,%r8
+	adcq	$0,%r9
+	/* middle word %r9 += (limb3 << 14) + (limb4 << 40); top word %r10 = limb4 >> 24 plus carry */
+	shlq	$14,%r11
+	movq	%r10,%rax
+	shrq	$24,%r10
+	addq	%r11,%r9
+	shlq	$40,%rax
+	addq	%rax,%r9
+	adcq	$0,%r10
+	/* partial reduction: keep the low 2 bits of the top word, add (top & ~3) + (top >> 2) = 5 * (top >> 2) at the bottom */
+	movq	%r10,%rax
+	movq	%r10,%rcx
+	andq	$3,%r10
+	shrq	$2,%rax
+	andq	$-4,%rcx
+	addq	%rcx,%rax
+	addq	%rax,%r8
+	adcq	$0,%r9
+	adcq	$0,%r10
+	/* from here on identical to the scalar emit: choose h or h + 5, add the nonce, store 16 bytes */
+	movq	%r8,%rax
+	addq	$5,%r8
+	movq	%r9,%rcx
+	adcq	$0,%r9
+	adcq	$0,%r10
+	shrq	$2,%r10	/* non-zero iff h + 5 reached 2^130 */
+	cmovnzq	%r8,%rax
+	cmovnzq	%r9,%rcx
+
+	addq	0(%rdx),%rax
+	adcq	8(%rdx),%rcx
+	movq	%rax,0(%rsi)
+	movq	%rcx,8(%rsi)
+
+	ret
+ENDPROC(poly1305_emit_avx)
+#endif /* CONFIG_AS_AVX */
+
+#ifdef CONFIG_AS_AVX2
+.align	32
+ENTRY(poly1305_blocks_avx2)
+
+	movl	20(%rdi),%r8d
+	cmpq	$128,%rdx
+	jae	.Lblocks_avx2
+	testl	%r8d,%r8d
+	jz	.Lblocks
+
+.Lblocks_avx2:
+	andq	$-16,%rdx
+	jz	.Lno_data_avx2
+
+	vzeroupper
+
+	testl	%r8d,%r8d
+	jz	.Lbase2_64_avx2
+
+	testq	$63,%rdx
+	jz	.Leven_avx2
+
+	pushq	%rbx
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+	pushq	%rdi
+
+.Lblocks_avx2_body:
+
+	movq	%rdx,%r15
+
+	movq	0(%rdi),%r8
+	movq	8(%rdi),%r9
+	movl	16(%rdi),%r10d
+
+	movq	24(%rdi),%r11
+	movq	32(%rdi),%r13
+
+
+	movl	%r8d,%r14d
+	andq	$-2147483648,%r8
+	movq	%r9,%r12
+	movl	%r9d,%ebx
+	andq	$-2147483648,%r9
+
+	shrq	$6,%r8
+	shlq	$52,%r12
+	addq	%r8,%r14
+	shrq	$12,%rbx
+	shrq	$18,%r9
+	addq	%r12,%r14
+	adcq	%r9,%rbx
+
+	movq	%r10,%r8
+	shlq	$40,%r8
+	shrq	$24,%r10
+	addq	%r8,%rbx
+	adcq	$0,%r10
+
+	movq	$-4,%r9
+	movq	%r10,%r8
+	andq	%r10,%r9
+	shrq	$2,%r8
+	andq	$3,%r10
+	addq	%r9,%r8
+	addq	%r8,%r14
+	adcq	$0,%rbx
+	adcq	$0,%r10
+
+	movq	%r13,%r12
+	movq	%r13,%rax
+	shrq	$2,%r13
+	addq	%r12,%r13
+
+.Lbase2_26_pre_avx2:	/* scalar loop: absorb one 16-byte block per pass until the remaining length is a multiple of 64 */
+	addq	0(%rsi),%r14
+	adcq	8(%rsi),%rbx
+	leaq	16(%rsi),%rsi
+	adcq	%rcx,%r10
+	subq	$16,%r15
+
+	movq	%rdi,0(%rsp)	/* save %rdi: __poly1305_block (defined elsewhere) presumably clobbers it */
+	__poly1305_block
+	movq	0(%rsp),%rdi
+	movq	%r12,%rax
+
+	testq	$63,%r15
+	jnz	.Lbase2_26_pre_avx2
+
+	testq	%rcx,%rcx
+	jz	.Lstore_base2_64_avx2	/* %rcx == 0: write the accumulator back as 64-bit words */
+
+	/* Split %r14:%rbx:%r10 into five 26-bit limbs, left in %rax, %rdx, %r14, %rbx, %r10. */
+	movq	%r14,%rax
+	movq	%r14,%rdx
+	shrq	$52,%r14
+	movq	%rbx,%r11
+	movq	%rbx,%r12
+	shrq	$26,%rdx
+	andq	$0x3ffffff,%rax
+	shlq	$12,%r11
+	andq	$0x3ffffff,%rdx
+	shrq	$14,%rbx
+	orq	%r11,%r14
+	shlq	$24,%r10
+	andq	$0x3ffffff,%r14
+	shrq	$40,%r12
+	andq	$0x3ffffff,%rbx
+	orq	%r12,%r10
+
+	testq	%r15,%r15
+	jz	.Lstore_base2_26_avx2	/* no input left for the vector code: store the limbs and return */
+
+	vmovd	%eax,%xmm0	/* hand the five limbs to the vector code in %xmm0..%xmm4 */
+	vmovd	%edx,%xmm1
+	vmovd	%r14d,%xmm2
+	vmovd	%ebx,%xmm3
+	vmovd	%r10d,%xmm4
+	jmp	.Lproceed_avx2
+
+.align	32
+.Lstore_base2_64_avx2:
+	movq	%r14,0(%rdi)
+	movq	%rbx,8(%rdi)
+	movq	%r10,16(%rdi)	/* this 8-byte store also covers the flag word at ctx+20 */
+	jmp	.Ldone_avx2
+
+.align	16
+.Lstore_base2_26_avx2:
+	movl	%eax,0(%rdi)
+	movl	%edx,4(%rdi)
+	movl	%r14d,8(%rdi)
+	movl	%ebx,12(%rdi)
+	movl	%r10d,16(%rdi)
+.align	16
+.Ldone_avx2:	/* reload the registers pushed in the prologue and release the six stack slots */
+	movq	8(%rsp),%r15
+	movq	16(%rsp),%r14
+	movq	24(%rsp),%r13
+	movq	32(%rsp),%r12
+	movq	40(%rsp),%rbx
+	leaq	48(%rsp),%rsp
+
+.Lno_data_avx2:
+.Lblocks_avx2_epilogue:
+	ret
+
+
+.align	32
+.Lbase2_64_avx2:
+	/* Reached when the ctx+20 flag is zero: read the accumulator as 64-bit words, split it into 26-bit limbs, set the flag and run __poly1305_init_avx. */
+
+	pushq	%rbx
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+	pushq	%rdi	/* creates the 0(%rsp) slot used to save %rdi around __poly1305_block */
+
+.Lbase2_64_avx2_body:
+
+	movq	%rdx,%r15	/* %r15 = remaining length */
+
+	movq	24(%rdi),%r11
+	movq	32(%rdi),%r13
+
+	movq	0(%rdi),%r14
+	movq	8(%rdi),%rbx
+	movl	16(%rdi),%r10d
+
+	movq	%r13,%r12	/* %r12 = %rax = word from ctx+32; %r13 = that word + (that word >> 2) */
+	movq	%r13,%rax
+	shrq	$2,%r13
+	addq	%r12,%r13
+
+	testq	$63,%rdx
+	jz	.Linit_avx2	/* length already a multiple of 64: skip the scalar loop */
+
+.Lbase2_64_pre_avx2:	/* scalar loop: absorb one 16-byte block per pass until the remaining length is a multiple of 64 */
+	addq	0(%rsi),%r14
+	adcq	8(%rsi),%rbx
+	leaq	16(%rsi),%rsi
+	adcq	%rcx,%r10
+	subq	$16,%r15
+
+	movq	%rdi,0(%rsp)	/* save %rdi: __poly1305_block (defined elsewhere) presumably clobbers it */
+	__poly1305_block
+	movq	0(%rsp),%rdi
+	movq	%r12,%rax
+
+	testq	$63,%r15
+	jnz	.Lbase2_64_pre_avx2
+
+.Linit_avx2:
+	/* Split %r14:%rbx:%r10 into five 26-bit limbs, left in %rax, %rdx, %r14, %rbx, %r10. */
+	movq	%r14,%rax
+	movq	%r14,%rdx
+	shrq	$52,%r14
+	movq	%rbx,%r8
+	movq	%rbx,%r9
+	shrq	$26,%rdx
+	andq	$0x3ffffff,%rax
+	shlq	$12,%r8
+	andq	$0x3ffffff,%rdx
+	shrq	$14,%rbx
+	orq	%r8,%r14
+	shlq	$24,%r10
+	andq	$0x3ffffff,%r14
+	shrq	$40,%r9
+	andq	$0x3ffffff,%rbx
+	orq	%r9,%r10
+
+	vmovd	%eax,%xmm0
+	vmovd	%edx,%xmm1
+	vmovd	%r14d,%xmm2
+	vmovd	%ebx,%xmm3
+	vmovd	%r10d,%xmm4
+	movl	$1,20(%rdi)	/* set the ctx+20 flag that is tested at function entry */
+
+	__poly1305_init_avx
+
+.Lproceed_avx2:
+	movq	%r15,%rdx	/* remaining length back into %rdx for the vector code */
+
+	movq	8(%rsp),%r15
+	movq	16(%rsp),%r14
+	movq	24(%rsp),%r13
+	movq	32(%rsp),%r12
+	movq	40(%rsp),%rbx
+	leaq	48(%rsp),%rax
+	leaq	48(%rsp),%rsp
+
+.Lbase2_64_avx2_epilogue:
+	jmp	.Ldo_avx2
+
+
+.align	32
+.Leven_avx2:
+	/* Load the five 32-bit accumulator words from ctx+0..+16 into %xmm0..%xmm4. */
+	vmovd	0(%rdi),%xmm0
+	vmovd	4(%rdi),%xmm1
+	vmovd	8(%rdi),%xmm2
+	vmovd	12(%rdi),%xmm3
+	vmovd	16(%rdi),%xmm4
+
+.Ldo_avx2:
+	leaq	8(%rsp),%r10	/* remember the incoming stack pointer (+8); undone by leaq -8(%r10),%rsp before ret */
+	subq	$0x128,%rsp
+	leaq	.Lconst(%rip),%rcx	/* %rcx now points at the constant table .Lconst (defined elsewhere) */
+	leaq	48+64(%rdi),%rdi	/* %rdi += 112; the final stores at -112(%rdi)..-96(%rdi) therefore hit ctx+0..+16 */
+	vmovdqa	96(%rcx),%ymm7	/* index vector for the vpermd shuffles below */
+
+	/* Load nine 16-byte rows from ctx+48..ctx+191, shuffle each with vpermd and spill them to the aligned stack frame. */
+	vmovdqu	-64(%rdi),%xmm9
+	andq	$-512,%rsp	/* align the frame to 512 bytes */
+	vmovdqu	-48(%rdi),%xmm10
+	vmovdqu	-32(%rdi),%xmm6
+	vmovdqu	-16(%rdi),%xmm11
+	vmovdqu	0(%rdi),%xmm12
+	vmovdqu	16(%rdi),%xmm13
+	leaq	144(%rsp),%rax	/* %rax = frame base + 144; the frame is addressed through both %rsp and %rax */
+	vmovdqu	32(%rdi),%xmm14
+	vpermd	%ymm9,%ymm7,%ymm9
+	vmovdqu	48(%rdi),%xmm15
+	vpermd	%ymm10,%ymm7,%ymm10
+	vmovdqu	64(%rdi),%xmm5
+	vpermd	%ymm6,%ymm7,%ymm6
+	vmovdqa	%ymm9,0(%rsp)
+	vpermd	%ymm11,%ymm7,%ymm11
+	vmovdqa	%ymm10,32-144(%rax)
+	vpermd	%ymm12,%ymm7,%ymm12
+	vmovdqa	%ymm6,64-144(%rax)
+	vpermd	%ymm13,%ymm7,%ymm13
+	vmovdqa	%ymm11,96-144(%rax)
+	vpermd	%ymm14,%ymm7,%ymm14
+	vmovdqa	%ymm12,128-144(%rax)
+	vpermd	%ymm15,%ymm7,%ymm15
+	vmovdqa	%ymm13,160-144(%rax)
+	vpermd	%ymm5,%ymm7,%ymm5
+	vmovdqa	%ymm14,192-144(%rax)
+	vmovdqa	%ymm15,224-144(%rax)
+	vmovdqa	%ymm5,256-144(%rax)
+	vmovdqa	64(%rcx),%ymm5	/* NOTE(review): presumably the 26-bit limb mask; .Lconst is not visible here -- confirm */
+
+	/* Load the first 64 input bytes (four 16-byte blocks) and split them into five limb vectors: %ymm7, %ymm8, %ymm9, %ymm10, %ymm6. */
+
+	vmovdqu	0(%rsi),%xmm7
+	vmovdqu	16(%rsi),%xmm8
+	vinserti128	$1,32(%rsi),%ymm7,%ymm7
+	vinserti128	$1,48(%rsi),%ymm8,%ymm8
+	leaq	64(%rsi),%rsi
+
+	vpsrldq	$6,%ymm7,%ymm9
+	vpsrldq	$6,%ymm8,%ymm10
+	vpunpckhqdq	%ymm8,%ymm7,%ymm6
+	vpunpcklqdq	%ymm10,%ymm9,%ymm9
+	vpunpcklqdq	%ymm8,%ymm7,%ymm7
+
+	vpsrlq	$30,%ymm9,%ymm10
+	vpsrlq	$4,%ymm9,%ymm9
+	vpsrlq	$26,%ymm7,%ymm8
+	vpsrlq	$40,%ymm6,%ymm6
+	vpand	%ymm5,%ymm9,%ymm9
+	vpand	%ymm5,%ymm7,%ymm7
+	vpand	%ymm5,%ymm8,%ymm8
+	vpand	%ymm5,%ymm10,%ymm10
+	vpor	32(%rcx),%ymm6,%ymm6	/* NOTE(review): ORs in a constant from .Lconst+32, presumably the per-block pad bit -- confirm */
+
+	vpaddq	%ymm2,%ymm9,%ymm2	/* the middle input limb is added to the accumulator here, ahead of the others */
+	subq	$64,%rdx
+	jz	.Ltail_avx2	/* exactly 64 bytes in total: go straight to the tail */
+	jmp	.Loop_avx2
+
+.align	32
+.Loop_avx2:
+	/* Per pass (64 input bytes): add the pending input limbs to %ymm0..%ymm4, multiply by the table rows spilled on the stack, propagate carries, and meanwhile load and split the next 64 input bytes. */
+	vpaddq	%ymm0,%ymm7,%ymm0
+	vmovdqa	0(%rsp),%ymm7
+	vpaddq	%ymm1,%ymm8,%ymm1
+	vmovdqa	32(%rsp),%ymm8
+	vpaddq	%ymm3,%ymm10,%ymm3
+	vmovdqa	96(%rsp),%ymm9
+	vpaddq	%ymm4,%ymm6,%ymm4
+	vmovdqa	48(%rax),%ymm10
+	vmovdqa	112(%rax),%ymm5
+
+	vpmuludq	%ymm2,%ymm7,%ymm13	/* partial products are accumulated in %ymm11..%ymm15 */
+	vpmuludq	%ymm2,%ymm8,%ymm14
+	vpmuludq	%ymm2,%ymm9,%ymm15
+	vpmuludq	%ymm2,%ymm10,%ymm11
+	vpmuludq	%ymm2,%ymm5,%ymm12
+
+	vpmuludq	%ymm0,%ymm8,%ymm6
+	vpmuludq	%ymm1,%ymm8,%ymm2
+	vpaddq	%ymm6,%ymm12,%ymm12
+	vpaddq	%ymm2,%ymm13,%ymm13
+	vpmuludq	%ymm3,%ymm8,%ymm6
+	vpmuludq	64(%rsp),%ymm4,%ymm2
+	vpaddq	%ymm6,%ymm15,%ymm15
+	vpaddq	%ymm2,%ymm11,%ymm11
+	vmovdqa	-16(%rax),%ymm8
+
+	vpmuludq	%ymm0,%ymm7,%ymm6
+	vpmuludq	%ymm1,%ymm7,%ymm2
+	vpaddq	%ymm6,%ymm11,%ymm11
+	vpaddq	%ymm2,%ymm12,%ymm12
+	vpmuludq	%ymm3,%ymm7,%ymm6
+	vpmuludq	%ymm4,%ymm7,%ymm2
+	vmovdqu	0(%rsi),%xmm7	/* start loading the next 64 input bytes */
+	vpaddq	%ymm6,%ymm14,%ymm14
+	vpaddq	%ymm2,%ymm15,%ymm15
+	vinserti128	$1,32(%rsi),%ymm7,%ymm7
+
+	vpmuludq	%ymm3,%ymm8,%ymm6
+	vpmuludq	%ymm4,%ymm8,%ymm2
+	vmovdqu	16(%rsi),%xmm8
+	vpaddq	%ymm6,%ymm11,%ymm11
+	vpaddq	%ymm2,%ymm12,%ymm12
+	vmovdqa	16(%rax),%ymm2
+	vpmuludq	%ymm1,%ymm9,%ymm6
+	vpmuludq	%ymm0,%ymm9,%ymm9
+	vpaddq	%ymm6,%ymm14,%ymm14
+	vpaddq	%ymm9,%ymm13,%ymm13
+	vinserti128	$1,48(%rsi),%ymm8,%ymm8
+	leaq	64(%rsi),%rsi
+
+	vpmuludq	%ymm1,%ymm2,%ymm6
+	vpmuludq	%ymm0,%ymm2,%ymm2
+	vpsrldq	$6,%ymm7,%ymm9
+	vpaddq	%ymm6,%ymm15,%ymm15
+	vpaddq	%ymm2,%ymm14,%ymm14
+	vpmuludq	%ymm3,%ymm10,%ymm6
+	vpmuludq	%ymm4,%ymm10,%ymm2
+	vpsrldq	$6,%ymm8,%ymm10
+	vpaddq	%ymm6,%ymm12,%ymm12
+	vpaddq	%ymm2,%ymm13,%ymm13
+	vpunpckhqdq	%ymm8,%ymm7,%ymm6
+
+	vpmuludq	%ymm3,%ymm5,%ymm3
+	vpmuludq	%ymm4,%ymm5,%ymm4
+	vpunpcklqdq	%ymm8,%ymm7,%ymm7
+	vpaddq	%ymm3,%ymm13,%ymm2
+	vpaddq	%ymm4,%ymm14,%ymm3
+	vpunpcklqdq	%ymm10,%ymm9,%ymm10
+	vpmuludq	80(%rax),%ymm0,%ymm4
+	vpmuludq	%ymm1,%ymm5,%ymm0
+	vmovdqa	64(%rcx),%ymm5	/* reload the AND mask from .Lconst+64 (%ymm5 was used for a table row above) */
+	vpaddq	%ymm4,%ymm15,%ymm4
+	vpaddq	%ymm0,%ymm11,%ymm0
+	/* Carry propagation: each limb is masked with %ymm5 and its bits from bit 26 up are added to the next limb; interleaved with splitting the new input. */
+	vpsrlq	$26,%ymm3,%ymm14
+	vpand	%ymm5,%ymm3,%ymm3
+	vpaddq	%ymm14,%ymm4,%ymm4
+
+	vpsrlq	$26,%ymm0,%ymm11
+	vpand	%ymm5,%ymm0,%ymm0
+	vpaddq	%ymm11,%ymm12,%ymm1
+
+	vpsrlq	$26,%ymm4,%ymm15
+	vpand	%ymm5,%ymm4,%ymm4
+
+	vpsrlq	$4,%ymm10,%ymm9
+
+	vpsrlq	$26,%ymm1,%ymm12
+	vpand	%ymm5,%ymm1,%ymm1
+	vpaddq	%ymm12,%ymm2,%ymm2
+
+	vpaddq	%ymm15,%ymm0,%ymm0	/* carry out of the top limb re-enters limb 0 multiplied by 5 (c + 4*c) */
+	vpsllq	$2,%ymm15,%ymm15
+	vpaddq	%ymm15,%ymm0,%ymm0
+
+	vpand	%ymm5,%ymm9,%ymm9
+	vpsrlq	$26,%ymm7,%ymm8
+
+	vpsrlq	$26,%ymm2,%ymm13
+	vpand	%ymm5,%ymm2,%ymm2
+	vpaddq	%ymm13,%ymm3,%ymm3
+
+	vpaddq	%ymm9,%ymm2,%ymm2	/* add the middle limb of the new input ahead of the next pass */
+	vpsrlq	$30,%ymm10,%ymm10
+
+	vpsrlq	$26,%ymm0,%ymm11
+	vpand	%ymm5,%ymm0,%ymm0
+	vpaddq	%ymm11,%ymm1,%ymm1
+
+	vpsrlq	$40,%ymm6,%ymm6
+
+	vpsrlq	$26,%ymm3,%ymm14
+	vpand	%ymm5,%ymm3,%ymm3
+	vpaddq	%ymm14,%ymm4,%ymm4
+
+	vpand	%ymm5,%ymm7,%ymm7
+	vpand	%ymm5,%ymm8,%ymm8
+	vpand	%ymm5,%ymm10,%ymm10
+	vpor	32(%rcx),%ymm6,%ymm6
+
+	subq	$64,%rdx
+	jnz	.Loop_avx2
+
+.byte	0x66,0x90	/* two-byte NOP encoding (66 90) */
+.Ltail_avx2:
+	/* Tail: same multiply as .Loop_avx2, but the table rows are read 4 bytes further into the frame (unaligned vmovdqu); then the lanes are summed, carries propagated and the five limbs stored to the context. */
+	vpaddq	%ymm0,%ymm7,%ymm0
+	vmovdqu	4(%rsp),%ymm7
+	vpaddq	%ymm1,%ymm8,%ymm1
+	vmovdqu	36(%rsp),%ymm8
+	vpaddq	%ymm3,%ymm10,%ymm3
+	vmovdqu	100(%rsp),%ymm9
+	vpaddq	%ymm4,%ymm6,%ymm4
+	vmovdqu	52(%rax),%ymm10
+	vmovdqu	116(%rax),%ymm5
+
+	vpmuludq	%ymm2,%ymm7,%ymm13
+	vpmuludq	%ymm2,%ymm8,%ymm14
+	vpmuludq	%ymm2,%ymm9,%ymm15
+	vpmuludq	%ymm2,%ymm10,%ymm11
+	vpmuludq	%ymm2,%ymm5,%ymm12
+
+	vpmuludq	%ymm0,%ymm8,%ymm6
+	vpmuludq	%ymm1,%ymm8,%ymm2
+	vpaddq	%ymm6,%ymm12,%ymm12
+	vpaddq	%ymm2,%ymm13,%ymm13
+	vpmuludq	%ymm3,%ymm8,%ymm6
+	vpmuludq	68(%rsp),%ymm4,%ymm2
+	vpaddq	%ymm6,%ymm15,%ymm15
+	vpaddq	%ymm2,%ymm11,%ymm11
+
+	vpmuludq	%ymm0,%ymm7,%ymm6
+	vpmuludq	%ymm1,%ymm7,%ymm2
+	vpaddq	%ymm6,%ymm11,%ymm11
+	vmovdqu	-12(%rax),%ymm8
+	vpaddq	%ymm2,%ymm12,%ymm12
+	vpmuludq	%ymm3,%ymm7,%ymm6
+	vpmuludq	%ymm4,%ymm7,%ymm2
+	vpaddq	%ymm6,%ymm14,%ymm14
+	vpaddq	%ymm2,%ymm15,%ymm15
+
+	vpmuludq	%ymm3,%ymm8,%ymm6
+	vpmuludq	%ymm4,%ymm8,%ymm2
+	vpaddq	%ymm6,%ymm11,%ymm11
+	vpaddq	%ymm2,%ymm12,%ymm12
+	vmovdqu	20(%rax),%ymm2
+	vpmuludq	%ymm1,%ymm9,%ymm6
+	vpmuludq	%ymm0,%ymm9,%ymm9
+	vpaddq	%ymm6,%ymm14,%ymm14
+	vpaddq	%ymm9,%ymm13,%ymm13
+
+	vpmuludq	%ymm1,%ymm2,%ymm6
+	vpmuludq	%ymm0,%ymm2,%ymm2
+	vpaddq	%ymm6,%ymm15,%ymm15
+	vpaddq	%ymm2,%ymm14,%ymm14
+	vpmuludq	%ymm3,%ymm10,%ymm6
+	vpmuludq	%ymm4,%ymm10,%ymm2
+	vpaddq	%ymm6,%ymm12,%ymm12
+	vpaddq	%ymm2,%ymm13,%ymm13
+
+	vpmuludq	%ymm3,%ymm5,%ymm3
+	vpmuludq	%ymm4,%ymm5,%ymm4
+	vpaddq	%ymm3,%ymm13,%ymm2
+	vpaddq	%ymm4,%ymm14,%ymm3
+	vpmuludq	84(%rax),%ymm0,%ymm4
+	vpmuludq	%ymm1,%ymm5,%ymm0
+	vmovdqa	64(%rcx),%ymm5	/* reload the AND mask from .Lconst+64 */
+	vpaddq	%ymm4,%ymm15,%ymm4
+	vpaddq	%ymm0,%ymm11,%ymm0
+	/* Horizontal sum: first add the upper qword of each 128-bit lane to the lower one, then add the upper lane's result to the lowest qword. */
+	vpsrldq	$8,%ymm12,%ymm8
+	vpsrldq	$8,%ymm2,%ymm9
+	vpsrldq	$8,%ymm3,%ymm10
+	vpsrldq	$8,%ymm4,%ymm6
+	vpsrldq	$8,%ymm0,%ymm7
+	vpaddq	%ymm8,%ymm12,%ymm12
+	vpaddq	%ymm9,%ymm2,%ymm2
+	vpaddq	%ymm10,%ymm3,%ymm3
+	vpaddq	%ymm6,%ymm4,%ymm4
+	vpaddq	%ymm7,%ymm0,%ymm0
+
+	vpermq	$0x2,%ymm3,%ymm10
+	vpermq	$0x2,%ymm4,%ymm6
+	vpermq	$0x2,%ymm0,%ymm7
+	vpermq	$0x2,%ymm12,%ymm8
+	vpermq	$0x2,%ymm2,%ymm9
+	vpaddq	%ymm10,%ymm3,%ymm3
+	vpaddq	%ymm6,%ymm4,%ymm4
+	vpaddq	%ymm7,%ymm0,%ymm0
+	vpaddq	%ymm8,%ymm12,%ymm12
+	vpaddq	%ymm9,%ymm2,%ymm2
+	/* Carry propagation: mask each limb with %ymm5 and add its bits from bit 26 up to the next limb. */
+	vpsrlq	$26,%ymm3,%ymm14
+	vpand	%ymm5,%ymm3,%ymm3
+	vpaddq	%ymm14,%ymm4,%ymm4
+
+	vpsrlq	$26,%ymm0,%ymm11
+	vpand	%ymm5,%ymm0,%ymm0
+	vpaddq	%ymm11,%ymm12,%ymm1
+
+	vpsrlq	$26,%ymm4,%ymm15
+	vpand	%ymm5,%ymm4,%ymm4
+
+	vpsrlq	$26,%ymm1,%ymm12
+	vpand	%ymm5,%ymm1,%ymm1
+	vpaddq	%ymm12,%ymm2,%ymm2
+
+	vpaddq	%ymm15,%ymm0,%ymm0	/* carry out of the top limb re-enters limb 0 multiplied by 5 (c + 4*c) */
+	vpsllq	$2,%ymm15,%ymm15
+	vpaddq	%ymm15,%ymm0,%ymm0
+
+	vpsrlq	$26,%ymm2,%ymm13
+	vpand	%ymm5,%ymm2,%ymm2
+	vpaddq	%ymm13,%ymm3,%ymm3
+
+	vpsrlq	$26,%ymm0,%ymm11
+	vpand	%ymm5,%ymm0,%ymm0
+	vpaddq	%ymm11,%ymm1,%ymm1
+
+	vpsrlq	$26,%ymm3,%ymm14
+	vpand	%ymm5,%ymm3,%ymm3
+	vpaddq	%ymm14,%ymm4,%ymm4
+
+	vmovd	%xmm0,-112(%rdi)	/* %rdi was advanced by 112 at .Ldo_avx2, so these five stores write ctx+0..+16 */
+	vmovd	%xmm1,-108(%rdi)
+	vmovd	%xmm2,-104(%rdi)
+	vmovd	%xmm3,-100(%rdi)
+	vmovd	%xmm4,-96(%rdi)
+	leaq	-8(%r10),%rsp	/* restore the stack pointer remembered in %r10 at .Ldo_avx2 */
+
+	vzeroupper
+	ret
+
+ENDPROC(poly1305_blocks_avx2)
+#endif /* CONFIG_AS_AVX2 */
+
+#ifdef CONFIG_AS_AVX512
+.align	32
+ENTRY(poly1305_blocks_avx512)
+	/* In: %rdi = context, %rsi = input, %rdx = length in bytes, %rcx = value added to the top accumulator word in the scalar pre-loops. */
+	movl	20(%rdi),%r8d	/* flag word at ctx+20; .Linit_avx2_512 sets it to 1 */
+	cmpq	$128,%rdx
+	jae	.Lblocks_avx2_512	/* 128 bytes or more: always take the vector path */
+	testl	%r8d,%r8d
+	jz	.Lblocks	/* short input and flag clear: continue at .Lblocks (defined outside this function) */
+
+.Lblocks_avx2_512:
+	andq	$-16,%rdx	/* round the length down to a multiple of 16 */
+	jz	.Lno_data_avx2_512
+
+	vzeroupper
+
+	testl	%r8d,%r8d
+	jz	.Lbase2_64_avx2_512	/* flag clear: accumulator is read as 64-bit words */
+
+	testq	$63,%rdx
+	jz	.Leven_avx2_512	/* length is a multiple of 64: no scalar pre-processing */
+
+	pushq	%rbx
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+	pushq	%rdi	/* creates the 0(%rsp) slot used to save %rdi around __poly1305_block */
+
+.Lblocks_avx2_body_512:
+
+	movq	%rdx,%r15	/* %r15 = remaining length */
+
+	movq	0(%rdi),%r8
+	movq	8(%rdi),%r9
+	movl	16(%rdi),%r10d
+
+	movq	24(%rdi),%r11
+	movq	32(%rdi),%r13
+
+	/* Merge the limbs read from ctx+0..+19 into a 64-bit-word accumulator: %r14 (low), %rbx, %r10 (top). */
+	/* NOTE(review): assumes ctx+0..+19 holds five 32-bit limbs as written by .Lstore_base2_26_avx2_512 -- confirm. */
+	movl	%r8d,%r14d
+	andq	$-2147483648,%r8
+	movq	%r9,%r12
+	movl	%r9d,%ebx
+	andq	$-2147483648,%r9
+
+	shrq	$6,%r8
+	shlq	$52,%r12
+	addq	%r8,%r14
+	shrq	$12,%rbx
+	shrq	$18,%r9
+	addq	%r12,%r14
+	adcq	%r9,%rbx
+
+	movq	%r10,%r8
+	shlq	$40,%r8
+	shrq	$24,%r10
+	addq	%r8,%rbx
+	adcq	$0,%r10
+	/* Add 5*(%r10 >> 2) into the low words ((x & -4) + (x >> 2) == 5*(x >> 2)); %r10 keeps only its low 2 bits plus carry. */
+	movq	$-4,%r9
+	movq	%r10,%r8
+	andq	%r10,%r9
+	shrq	$2,%r8
+	andq	$3,%r10
+	addq	%r9,%r8
+	addq	%r8,%r14
+	adcq	$0,%rbx
+	adcq	$0,%r10
+	/* %r12 = %rax = word loaded from ctx+32; %r13 = that word + (that word >> 2). */
+	movq	%r13,%r12
+	movq	%r13,%rax
+	shrq	$2,%r13
+	addq	%r12,%r13
+
+.Lbase2_26_pre_avx2_512:	/* scalar loop: absorb one 16-byte block per pass until the remaining length is a multiple of 64 */
+	addq	0(%rsi),%r14
+	adcq	8(%rsi),%rbx
+	leaq	16(%rsi),%rsi
+	adcq	%rcx,%r10
+	subq	$16,%r15
+
+	movq	%rdi,0(%rsp)	/* save %rdi: __poly1305_block (defined elsewhere) presumably clobbers it */
+	__poly1305_block
+	movq	0(%rsp),%rdi
+	movq	%r12,%rax
+
+	testq	$63,%r15
+	jnz	.Lbase2_26_pre_avx2_512
+
+	testq	%rcx,%rcx
+	jz	.Lstore_base2_64_avx2_512	/* %rcx == 0: write the accumulator back as 64-bit words */
+
+	/* Split %r14:%rbx:%r10 into five 26-bit limbs, left in %rax, %rdx, %r14, %rbx, %r10. */
+	movq	%r14,%rax
+	movq	%r14,%rdx
+	shrq	$52,%r14
+	movq	%rbx,%r11
+	movq	%rbx,%r12
+	shrq	$26,%rdx
+	andq	$0x3ffffff,%rax
+	shlq	$12,%r11
+	andq	$0x3ffffff,%rdx
+	shrq	$14,%rbx
+	orq	%r11,%r14
+	shlq	$24,%r10
+	andq	$0x3ffffff,%r14
+	shrq	$40,%r12
+	andq	$0x3ffffff,%rbx
+	orq	%r12,%r10
+
+	testq	%r15,%r15
+	jz	.Lstore_base2_26_avx2_512	/* no input left for the vector code: store the limbs and return */
+
+	vmovd	%eax,%xmm0	/* hand the five limbs to the vector code in %xmm0..%xmm4 */
+	vmovd	%edx,%xmm1
+	vmovd	%r14d,%xmm2
+	vmovd	%ebx,%xmm3
+	vmovd	%r10d,%xmm4
+	jmp	.Lproceed_avx2_512
+
+.align	32
+.Lstore_base2_64_avx2_512:
+	movq	%r14,0(%rdi)
+	movq	%rbx,8(%rdi)
+	movq	%r10,16(%rdi)	/* this 8-byte store also covers the flag word at ctx+20 */
+	jmp	.Ldone_avx2_512
+
+.align	16
+.Lstore_base2_26_avx2_512:
+	movl	%eax,0(%rdi)
+	movl	%edx,4(%rdi)
+	movl	%r14d,8(%rdi)
+	movl	%ebx,12(%rdi)
+	movl	%r10d,16(%rdi)
+.align	16
+.Ldone_avx2_512:	/* reload the registers pushed in the prologue and release the six stack slots */
+	movq	8(%rsp),%r15
+	movq	16(%rsp),%r14
+	movq	24(%rsp),%r13
+	movq	32(%rsp),%r12
+	movq	40(%rsp),%rbx
+	leaq	48(%rsp),%rsp
+
+.Lno_data_avx2_512:
+.Lblocks_avx2_epilogue_512:
+	ret
+
+
+.align	32
+.Lbase2_64_avx2_512:
+	/* Reached when the ctx+20 flag is zero: read the accumulator as 64-bit words, split it into 26-bit limbs, set the flag and run __poly1305_init_avx. */
+	pushq	%rbx
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+	pushq	%rdi	/* creates the 0(%rsp) slot used to save %rdi around __poly1305_block */
+
+.Lbase2_64_avx2_body_512:
+
+	movq	%rdx,%r15	/* %r15 = remaining length */
+
+	movq	24(%rdi),%r11
+	movq	32(%rdi),%r13
+
+	movq	0(%rdi),%r14
+	movq	8(%rdi),%rbx
+	movl	16(%rdi),%r10d
+
+	movq	%r13,%r12	/* %r12 = %rax = word from ctx+32; %r13 = that word + (that word >> 2) */
+	movq	%r13,%rax
+	shrq	$2,%r13
+	addq	%r12,%r13
+
+	testq	$63,%rdx
+	jz	.Linit_avx2_512	/* length already a multiple of 64: skip the scalar loop */
+
+.Lbase2_64_pre_avx2_512:	/* scalar loop: absorb one 16-byte block per pass until the remaining length is a multiple of 64 */
+	addq	0(%rsi),%r14
+	adcq	8(%rsi),%rbx
+	leaq	16(%rsi),%rsi
+	adcq	%rcx,%r10
+	subq	$16,%r15
+
+	movq	%rdi,0(%rsp)	/* save %rdi: __poly1305_block (defined elsewhere) presumably clobbers it */
+	__poly1305_block
+	movq	0(%rsp),%rdi
+	movq	%r12,%rax
+
+	testq	$63,%r15
+	jnz	.Lbase2_64_pre_avx2_512
+
+.Linit_avx2_512:
+	/* Split %r14:%rbx:%r10 into five 26-bit limbs, left in %rax, %rdx, %r14, %rbx, %r10. */
+	movq	%r14,%rax
+	movq	%r14,%rdx
+	shrq	$52,%r14
+	movq	%rbx,%r8
+	movq	%rbx,%r9
+	shrq	$26,%rdx
+	andq	$0x3ffffff,%rax
+	shlq	$12,%r8
+	andq	$0x3ffffff,%rdx
+	shrq	$14,%rbx
+	orq	%r8,%r14
+	shlq	$24,%r10
+	andq	$0x3ffffff,%r14
+	shrq	$40,%r9
+	andq	$0x3ffffff,%rbx
+	orq	%r9,%r10
+
+	vmovd	%eax,%xmm0
+	vmovd	%edx,%xmm1
+	vmovd	%r14d,%xmm2
+	vmovd	%ebx,%xmm3
+	vmovd	%r10d,%xmm4
+	movl	$1,20(%rdi)	/* set the ctx+20 flag that is tested at function entry */
+
+	__poly1305_init_avx
+
+.Lproceed_avx2_512:
+	movq	%r15,%rdx	/* remaining length back into %rdx for the vector code */
+
+	movq	8(%rsp),%r15
+	movq	16(%rsp),%r14
+	movq	24(%rsp),%r13
+	movq	32(%rsp),%r12
+	movq	40(%rsp),%rbx
+	leaq	48(%rsp),%rax
+	leaq	48(%rsp),%rsp
+
+.Lbase2_64_avx2_epilogue_512:
+	jmp	.Ldo_avx2_512
+
+
+.align	32
+.Leven_avx2_512:
+	/* Load the five 32-bit accumulator words from ctx+0..+16 into %xmm0..%xmm4. */
+	vmovd	0(%rdi),%xmm0
+	vmovd	4(%rdi),%xmm1
+	vmovd	8(%rdi),%xmm2
+	vmovd	12(%rdi),%xmm3
+	vmovd	16(%rdi),%xmm4
+
+.Ldo_avx2_512:
+	cmpq	$512,%rdx
+	jae	.Lblocks_avx512	/* 512 bytes or more: use the AVX-512 code; otherwise fall through to the AVX2 code */
+.Lskip_avx512:
+	leaq	8(%rsp),%r10	/* remember the incoming stack pointer (+8); undone by leaq -8(%r10),%rsp before ret */
+
+	subq	$0x128,%rsp
+	leaq	.Lconst(%rip),%rcx	/* %rcx now points at the constant table .Lconst (defined elsewhere) */
+	leaq	48+64(%rdi),%rdi	/* %rdi += 112; the final stores at -112(%rdi)..-96(%rdi) therefore hit ctx+0..+16 */
+	vmovdqa	96(%rcx),%ymm7	/* index vector for the vpermd shuffles below */
+
+	/* Load nine 16-byte rows from ctx+48..ctx+191, shuffle each with vpermd and spill them to the aligned stack frame. */
+	vmovdqu	-64(%rdi),%xmm9
+	andq	$-512,%rsp	/* align the frame to 512 bytes */
+	vmovdqu	-48(%rdi),%xmm10
+	vmovdqu	-32(%rdi),%xmm6
+	vmovdqu	-16(%rdi),%xmm11
+	vmovdqu	0(%rdi),%xmm12
+	vmovdqu	16(%rdi),%xmm13
+	leaq	144(%rsp),%rax	/* %rax = frame base + 144; the frame is addressed through both %rsp and %rax */
+	vmovdqu	32(%rdi),%xmm14
+	vpermd	%ymm9,%ymm7,%ymm9
+	vmovdqu	48(%rdi),%xmm15
+	vpermd	%ymm10,%ymm7,%ymm10
+	vmovdqu	64(%rdi),%xmm5
+	vpermd	%ymm6,%ymm7,%ymm6
+	vmovdqa	%ymm9,0(%rsp)
+	vpermd	%ymm11,%ymm7,%ymm11
+	vmovdqa	%ymm10,32-144(%rax)
+	vpermd	%ymm12,%ymm7,%ymm12
+	vmovdqa	%ymm6,64-144(%rax)
+	vpermd	%ymm13,%ymm7,%ymm13
+	vmovdqa	%ymm11,96-144(%rax)
+	vpermd	%ymm14,%ymm7,%ymm14
+	vmovdqa	%ymm12,128-144(%rax)
+	vpermd	%ymm15,%ymm7,%ymm15
+	vmovdqa	%ymm13,160-144(%rax)
+	vpermd	%ymm5,%ymm7,%ymm5
+	vmovdqa	%ymm14,192-144(%rax)
+	vmovdqa	%ymm15,224-144(%rax)
+	vmovdqa	%ymm5,256-144(%rax)
+	vmovdqa	64(%rcx),%ymm5	/* NOTE(review): presumably the 26-bit limb mask; .Lconst is not visible here -- confirm */
+
+	/* Load the first 64 input bytes (four 16-byte blocks) and split them into five limb vectors: %ymm7, %ymm8, %ymm9, %ymm10, %ymm6. */
+
+	vmovdqu	0(%rsi),%xmm7
+	vmovdqu	16(%rsi),%xmm8
+	vinserti128	$1,32(%rsi),%ymm7,%ymm7
+	vinserti128	$1,48(%rsi),%ymm8,%ymm8
+	leaq	64(%rsi),%rsi
+
+	vpsrldq	$6,%ymm7,%ymm9
+	vpsrldq	$6,%ymm8,%ymm10
+	vpunpckhqdq	%ymm8,%ymm7,%ymm6
+	vpunpcklqdq	%ymm10,%ymm9,%ymm9
+	vpunpcklqdq	%ymm8,%ymm7,%ymm7
+
+	vpsrlq	$30,%ymm9,%ymm10
+	vpsrlq	$4,%ymm9,%ymm9
+	vpsrlq	$26,%ymm7,%ymm8
+	vpsrlq	$40,%ymm6,%ymm6
+	vpand	%ymm5,%ymm9,%ymm9
+	vpand	%ymm5,%ymm7,%ymm7
+	vpand	%ymm5,%ymm8,%ymm8
+	vpand	%ymm5,%ymm10,%ymm10
+	vpor	32(%rcx),%ymm6,%ymm6	/* NOTE(review): ORs in a constant from .Lconst+32, presumably the per-block pad bit -- confirm */
+
+	vpaddq	%ymm2,%ymm9,%ymm2	/* the middle input limb is added to the accumulator here, ahead of the others */
+	subq	$64,%rdx
+	jz	.Ltail_avx2_512	/* exactly 64 bytes in total: go straight to the tail */
+	jmp	.Loop_avx2_512
+
+.align	32
+.Loop_avx2_512:
+	/* Per pass (64 input bytes): add the pending input limbs to %ymm0..%ymm4, multiply by the table rows spilled on the stack, propagate carries, and meanwhile load and split the next 64 input bytes. */
+	vpaddq	%ymm0,%ymm7,%ymm0
+	vmovdqa	0(%rsp),%ymm7
+	vpaddq	%ymm1,%ymm8,%ymm1
+	vmovdqa	32(%rsp),%ymm8
+	vpaddq	%ymm3,%ymm10,%ymm3
+	vmovdqa	96(%rsp),%ymm9
+	vpaddq	%ymm4,%ymm6,%ymm4
+	vmovdqa	48(%rax),%ymm10
+	vmovdqa	112(%rax),%ymm5
+
+	vpmuludq	%ymm2,%ymm7,%ymm13	/* partial products are accumulated in %ymm11..%ymm15 */
+	vpmuludq	%ymm2,%ymm8,%ymm14
+	vpmuludq	%ymm2,%ymm9,%ymm15
+	vpmuludq	%ymm2,%ymm10,%ymm11
+	vpmuludq	%ymm2,%ymm5,%ymm12
+
+	vpmuludq	%ymm0,%ymm8,%ymm6
+	vpmuludq	%ymm1,%ymm8,%ymm2
+	vpaddq	%ymm6,%ymm12,%ymm12
+	vpaddq	%ymm2,%ymm13,%ymm13
+	vpmuludq	%ymm3,%ymm8,%ymm6
+	vpmuludq	64(%rsp),%ymm4,%ymm2
+	vpaddq	%ymm6,%ymm15,%ymm15
+	vpaddq	%ymm2,%ymm11,%ymm11
+	vmovdqa	-16(%rax),%ymm8
+
+	vpmuludq	%ymm0,%ymm7,%ymm6
+	vpmuludq	%ymm1,%ymm7,%ymm2
+	vpaddq	%ymm6,%ymm11,%ymm11
+	vpaddq	%ymm2,%ymm12,%ymm12
+	vpmuludq	%ymm3,%ymm7,%ymm6
+	vpmuludq	%ymm4,%ymm7,%ymm2
+	vmovdqu	0(%rsi),%xmm7	/* start loading the next 64 input bytes */
+	vpaddq	%ymm6,%ymm14,%ymm14
+	vpaddq	%ymm2,%ymm15,%ymm15
+	vinserti128	$1,32(%rsi),%ymm7,%ymm7
+
+	vpmuludq	%ymm3,%ymm8,%ymm6
+	vpmuludq	%ymm4,%ymm8,%ymm2
+	vmovdqu	16(%rsi),%xmm8
+	vpaddq	%ymm6,%ymm11,%ymm11
+	vpaddq	%ymm2,%ymm12,%ymm12
+	vmovdqa	16(%rax),%ymm2
+	vpmuludq	%ymm1,%ymm9,%ymm6
+	vpmuludq	%ymm0,%ymm9,%ymm9
+	vpaddq	%ymm6,%ymm14,%ymm14
+	vpaddq	%ymm9,%ymm13,%ymm13
+	vinserti128	$1,48(%rsi),%ymm8,%ymm8
+	leaq	64(%rsi),%rsi
+
+	vpmuludq	%ymm1,%ymm2,%ymm6
+	vpmuludq	%ymm0,%ymm2,%ymm2
+	vpsrldq	$6,%ymm7,%ymm9
+	vpaddq	%ymm6,%ymm15,%ymm15
+	vpaddq	%ymm2,%ymm14,%ymm14
+	vpmuludq	%ymm3,%ymm10,%ymm6
+	vpmuludq	%ymm4,%ymm10,%ymm2
+	vpsrldq	$6,%ymm8,%ymm10
+	vpaddq	%ymm6,%ymm12,%ymm12
+	vpaddq	%ymm2,%ymm13,%ymm13
+	vpunpckhqdq	%ymm8,%ymm7,%ymm6
+
+	vpmuludq	%ymm3,%ymm5,%ymm3
+	vpmuludq	%ymm4,%ymm5,%ymm4
+	vpunpcklqdq	%ymm8,%ymm7,%ymm7
+	vpaddq	%ymm3,%ymm13,%ymm2
+	vpaddq	%ymm4,%ymm14,%ymm3
+	vpunpcklqdq	%ymm10,%ymm9,%ymm10
+	vpmuludq	80(%rax),%ymm0,%ymm4
+	vpmuludq	%ymm1,%ymm5,%ymm0
+	vmovdqa	64(%rcx),%ymm5	/* reload the AND mask from .Lconst+64 (%ymm5 was used for a table row above) */
+	vpaddq	%ymm4,%ymm15,%ymm4
+	vpaddq	%ymm0,%ymm11,%ymm0
+	/* Carry propagation: each limb is masked with %ymm5 and its bits from bit 26 up are added to the next limb; interleaved with splitting the new input. */
+	vpsrlq	$26,%ymm3,%ymm14
+	vpand	%ymm5,%ymm3,%ymm3
+	vpaddq	%ymm14,%ymm4,%ymm4
+
+	vpsrlq	$26,%ymm0,%ymm11
+	vpand	%ymm5,%ymm0,%ymm0
+	vpaddq	%ymm11,%ymm12,%ymm1
+
+	vpsrlq	$26,%ymm4,%ymm15
+	vpand	%ymm5,%ymm4,%ymm4
+
+	vpsrlq	$4,%ymm10,%ymm9
+
+	vpsrlq	$26,%ymm1,%ymm12
+	vpand	%ymm5,%ymm1,%ymm1
+	vpaddq	%ymm12,%ymm2,%ymm2
+
+	vpaddq	%ymm15,%ymm0,%ymm0	/* carry out of the top limb re-enters limb 0 multiplied by 5 (c + 4*c) */
+	vpsllq	$2,%ymm15,%ymm15
+	vpaddq	%ymm15,%ymm0,%ymm0
+
+	vpand	%ymm5,%ymm9,%ymm9
+	vpsrlq	$26,%ymm7,%ymm8
+
+	vpsrlq	$26,%ymm2,%ymm13
+	vpand	%ymm5,%ymm2,%ymm2
+	vpaddq	%ymm13,%ymm3,%ymm3
+
+	vpaddq	%ymm9,%ymm2,%ymm2	/* add the middle limb of the new input ahead of the next pass */
+	vpsrlq	$30,%ymm10,%ymm10
+
+	vpsrlq	$26,%ymm0,%ymm11
+	vpand	%ymm5,%ymm0,%ymm0
+	vpaddq	%ymm11,%ymm1,%ymm1
+
+	vpsrlq	$40,%ymm6,%ymm6
+
+	vpsrlq	$26,%ymm3,%ymm14
+	vpand	%ymm5,%ymm3,%ymm3
+	vpaddq	%ymm14,%ymm4,%ymm4
+
+	vpand	%ymm5,%ymm7,%ymm7
+	vpand	%ymm5,%ymm8,%ymm8
+	vpand	%ymm5,%ymm10,%ymm10
+	vpor	32(%rcx),%ymm6,%ymm6
+
+	subq	$64,%rdx
+	jnz	.Loop_avx2_512
+
+.byte	0x66,0x90	/* two-byte NOP encoding (66 90) */
+.Ltail_avx2_512:
+	/* Tail (also entered from .Ltail_avx512): same multiply as .Loop_avx2_512, but the table rows are read 4 bytes further into the frame; then the lanes are summed, carries propagated and the five limbs stored to the context. */
+	vpaddq	%ymm0,%ymm7,%ymm0
+	vmovdqu	4(%rsp),%ymm7
+	vpaddq	%ymm1,%ymm8,%ymm1
+	vmovdqu	36(%rsp),%ymm8
+	vpaddq	%ymm3,%ymm10,%ymm3
+	vmovdqu	100(%rsp),%ymm9
+	vpaddq	%ymm4,%ymm6,%ymm4
+	vmovdqu	52(%rax),%ymm10
+	vmovdqu	116(%rax),%ymm5
+
+	vpmuludq	%ymm2,%ymm7,%ymm13
+	vpmuludq	%ymm2,%ymm8,%ymm14
+	vpmuludq	%ymm2,%ymm9,%ymm15
+	vpmuludq	%ymm2,%ymm10,%ymm11
+	vpmuludq	%ymm2,%ymm5,%ymm12
+
+	vpmuludq	%ymm0,%ymm8,%ymm6
+	vpmuludq	%ymm1,%ymm8,%ymm2
+	vpaddq	%ymm6,%ymm12,%ymm12
+	vpaddq	%ymm2,%ymm13,%ymm13
+	vpmuludq	%ymm3,%ymm8,%ymm6
+	vpmuludq	68(%rsp),%ymm4,%ymm2
+	vpaddq	%ymm6,%ymm15,%ymm15
+	vpaddq	%ymm2,%ymm11,%ymm11
+
+	vpmuludq	%ymm0,%ymm7,%ymm6
+	vpmuludq	%ymm1,%ymm7,%ymm2
+	vpaddq	%ymm6,%ymm11,%ymm11
+	vmovdqu	-12(%rax),%ymm8
+	vpaddq	%ymm2,%ymm12,%ymm12
+	vpmuludq	%ymm3,%ymm7,%ymm6
+	vpmuludq	%ymm4,%ymm7,%ymm2
+	vpaddq	%ymm6,%ymm14,%ymm14
+	vpaddq	%ymm2,%ymm15,%ymm15
+
+	vpmuludq	%ymm3,%ymm8,%ymm6
+	vpmuludq	%ymm4,%ymm8,%ymm2
+	vpaddq	%ymm6,%ymm11,%ymm11
+	vpaddq	%ymm2,%ymm12,%ymm12
+	vmovdqu	20(%rax),%ymm2
+	vpmuludq	%ymm1,%ymm9,%ymm6
+	vpmuludq	%ymm0,%ymm9,%ymm9
+	vpaddq	%ymm6,%ymm14,%ymm14
+	vpaddq	%ymm9,%ymm13,%ymm13
+
+	vpmuludq	%ymm1,%ymm2,%ymm6
+	vpmuludq	%ymm0,%ymm2,%ymm2
+	vpaddq	%ymm6,%ymm15,%ymm15
+	vpaddq	%ymm2,%ymm14,%ymm14
+	vpmuludq	%ymm3,%ymm10,%ymm6
+	vpmuludq	%ymm4,%ymm10,%ymm2
+	vpaddq	%ymm6,%ymm12,%ymm12
+	vpaddq	%ymm2,%ymm13,%ymm13
+
+	vpmuludq	%ymm3,%ymm5,%ymm3
+	vpmuludq	%ymm4,%ymm5,%ymm4
+	vpaddq	%ymm3,%ymm13,%ymm2
+	vpaddq	%ymm4,%ymm14,%ymm3
+	vpmuludq	84(%rax),%ymm0,%ymm4
+	vpmuludq	%ymm1,%ymm5,%ymm0
+	vmovdqa	64(%rcx),%ymm5	/* reload the AND mask from .Lconst+64 */
+	vpaddq	%ymm4,%ymm15,%ymm4
+	vpaddq	%ymm0,%ymm11,%ymm0
+	/* Horizontal sum: first add the upper qword of each 128-bit lane to the lower one, then add the upper lane's result to the lowest qword. */
+	vpsrldq	$8,%ymm12,%ymm8
+	vpsrldq	$8,%ymm2,%ymm9
+	vpsrldq	$8,%ymm3,%ymm10
+	vpsrldq	$8,%ymm4,%ymm6
+	vpsrldq	$8,%ymm0,%ymm7
+	vpaddq	%ymm8,%ymm12,%ymm12
+	vpaddq	%ymm9,%ymm2,%ymm2
+	vpaddq	%ymm10,%ymm3,%ymm3
+	vpaddq	%ymm6,%ymm4,%ymm4
+	vpaddq	%ymm7,%ymm0,%ymm0
+
+	vpermq	$0x2,%ymm3,%ymm10
+	vpermq	$0x2,%ymm4,%ymm6
+	vpermq	$0x2,%ymm0,%ymm7
+	vpermq	$0x2,%ymm12,%ymm8
+	vpermq	$0x2,%ymm2,%ymm9
+	vpaddq	%ymm10,%ymm3,%ymm3
+	vpaddq	%ymm6,%ymm4,%ymm4
+	vpaddq	%ymm7,%ymm0,%ymm0
+	vpaddq	%ymm8,%ymm12,%ymm12
+	vpaddq	%ymm9,%ymm2,%ymm2
+	/* Carry propagation: mask each limb with %ymm5 and add its bits from bit 26 up to the next limb. */
+	vpsrlq	$26,%ymm3,%ymm14
+	vpand	%ymm5,%ymm3,%ymm3
+	vpaddq	%ymm14,%ymm4,%ymm4
+
+	vpsrlq	$26,%ymm0,%ymm11
+	vpand	%ymm5,%ymm0,%ymm0
+	vpaddq	%ymm11,%ymm12,%ymm1
+
+	vpsrlq	$26,%ymm4,%ymm15
+	vpand	%ymm5,%ymm4,%ymm4
+
+	vpsrlq	$26,%ymm1,%ymm12
+	vpand	%ymm5,%ymm1,%ymm1
+	vpaddq	%ymm12,%ymm2,%ymm2
+
+	vpaddq	%ymm15,%ymm0,%ymm0	/* carry out of the top limb re-enters limb 0 multiplied by 5 (c + 4*c) */
+	vpsllq	$2,%ymm15,%ymm15
+	vpaddq	%ymm15,%ymm0,%ymm0
+
+	vpsrlq	$26,%ymm2,%ymm13
+	vpand	%ymm5,%ymm2,%ymm2
+	vpaddq	%ymm13,%ymm3,%ymm3
+
+	vpsrlq	$26,%ymm0,%ymm11
+	vpand	%ymm5,%ymm0,%ymm0
+	vpaddq	%ymm11,%ymm1,%ymm1
+
+	vpsrlq	$26,%ymm3,%ymm14
+	vpand	%ymm5,%ymm3,%ymm3
+	vpaddq	%ymm14,%ymm4,%ymm4
+
+	vmovd	%xmm0,-112(%rdi)	/* %rdi was advanced by 112 during setup, so these five stores write ctx+0..+16 */
+	vmovd	%xmm1,-108(%rdi)
+	vmovd	%xmm2,-104(%rdi)
+	vmovd	%xmm3,-100(%rdi)
+	vmovd	%xmm4,-96(%rdi)
+	leaq	-8(%r10),%rsp	/* restore the stack pointer remembered in %r10 during setup */
+
+	vzeroupper
+	ret
+
+.Lblocks_avx512:
+	/* AVX-512 setup (entered with at least 512 bytes): load and shuffle the table rows, derive further table vectors, load the first 128 input bytes and split them into limbs. */
+	movl	$15,%eax
+	kmovw	%eax,%k2	/* %k2 = 0x000f: selects the four lowest elements of the masked loads and stores below */
+	leaq	8(%rsp),%r10	/* remember the incoming stack pointer (+8); undone by leaq -8(%r10),%rsp before ret */
+
+	subq	$0x128,%rsp
+	leaq	.Lconst(%rip),%rcx	/* %rcx now points at the constant table .Lconst (defined elsewhere) */
+	leaq	48+64(%rdi),%rdi	/* %rdi += 112; the final stores at -112(%rdi)..-96(%rdi) therefore hit ctx+0..+16 */
+	vmovdqa	96(%rcx),%ymm9	/* index vector for the vpermd shuffles below */
+
+	vmovdqu32	-64(%rdi),%zmm16{%k2}{z}
+	andq	$-512,%rsp	/* align the frame to 512 bytes */
+	vmovdqu32	-48(%rdi),%zmm17{%k2}{z}
+	movq	$0x20,%rax	/* %rax = 32: index register for the odd 32-byte frame slots stored below */
+	vmovdqu32	-32(%rdi),%zmm21{%k2}{z}
+	vmovdqu32	-16(%rdi),%zmm18{%k2}{z}
+	vmovdqu32	0(%rdi),%zmm22{%k2}{z}
+	vmovdqu32	16(%rdi),%zmm19{%k2}{z}
+	vmovdqu32	32(%rdi),%zmm23{%k2}{z}
+	vmovdqu32	48(%rdi),%zmm20{%k2}{z}
+	vmovdqu32	64(%rdi),%zmm24{%k2}{z}
+	vpermd	%zmm16,%zmm9,%zmm16
+	vpbroadcastq	64(%rcx),%zmm5	/* NOTE(review): presumably the 26-bit limb mask, broadcast to every lane -- confirm against .Lconst */
+	vpermd	%zmm17,%zmm9,%zmm17
+	vpermd	%zmm21,%zmm9,%zmm21
+	vpermd	%zmm18,%zmm9,%zmm18
+	vmovdqa64	%zmm16,0(%rsp){%k2}
+	vpsrlq	$32,%zmm16,%zmm7
+	vpermd	%zmm22,%zmm9,%zmm22
+	vmovdqu64	%zmm17,0(%rsp,%rax,1){%k2}
+	vpsrlq	$32,%zmm17,%zmm8
+	vpermd	%zmm19,%zmm9,%zmm19
+	vmovdqa64	%zmm21,64(%rsp){%k2}
+	vpermd	%zmm23,%zmm9,%zmm23
+	vpermd	%zmm20,%zmm9,%zmm20
+	vmovdqu64	%zmm18,64(%rsp,%rax,1){%k2}
+	vpermd	%zmm24,%zmm9,%zmm24
+	vmovdqa64	%zmm22,128(%rsp){%k2}
+	vmovdqu64	%zmm19,128(%rsp,%rax,1){%k2}
+	vmovdqa64	%zmm23,192(%rsp){%k2}
+	vmovdqu64	%zmm20,192(%rsp,%rax,1){%k2}
+	vmovdqa64	%zmm24,256(%rsp){%k2}
+	/* Multiply the shuffled table vectors by their own upper-dword copies (%zmm7, %zmm8, %zmm9, %zmm10, %zmm6); sums go to %zmm11..%zmm15. NOTE(review): presumably extends the key-power table -- confirm. */
+	vpmuludq	%zmm7,%zmm16,%zmm11
+	vpmuludq	%zmm7,%zmm17,%zmm12
+	vpmuludq	%zmm7,%zmm18,%zmm13
+	vpmuludq	%zmm7,%zmm19,%zmm14
+	vpmuludq	%zmm7,%zmm20,%zmm15
+	vpsrlq	$32,%zmm18,%zmm9
+
+	vpmuludq	%zmm8,%zmm24,%zmm25
+	vpmuludq	%zmm8,%zmm16,%zmm26
+	vpmuludq	%zmm8,%zmm17,%zmm27
+	vpmuludq	%zmm8,%zmm18,%zmm28
+	vpmuludq	%zmm8,%zmm19,%zmm29
+	vpsrlq	$32,%zmm19,%zmm10
+	vpaddq	%zmm25,%zmm11,%zmm11
+	vpaddq	%zmm26,%zmm12,%zmm12
+	vpaddq	%zmm27,%zmm13,%zmm13
+	vpaddq	%zmm28,%zmm14,%zmm14
+	vpaddq	%zmm29,%zmm15,%zmm15
+
+	vpmuludq	%zmm9,%zmm23,%zmm25
+	vpmuludq	%zmm9,%zmm24,%zmm26
+	vpmuludq	%zmm9,%zmm17,%zmm28
+	vpmuludq	%zmm9,%zmm18,%zmm29
+	vpmuludq	%zmm9,%zmm16,%zmm27
+	vpsrlq	$32,%zmm20,%zmm6
+	vpaddq	%zmm25,%zmm11,%zmm11
+	vpaddq	%zmm26,%zmm12,%zmm12
+	vpaddq	%zmm28,%zmm14,%zmm14
+	vpaddq	%zmm29,%zmm15,%zmm15
+	vpaddq	%zmm27,%zmm13,%zmm13
+
+	vpmuludq	%zmm10,%zmm22,%zmm25
+	vpmuludq	%zmm10,%zmm16,%zmm28
+	vpmuludq	%zmm10,%zmm17,%zmm29
+	vpmuludq	%zmm10,%zmm23,%zmm26
+	vpmuludq	%zmm10,%zmm24,%zmm27
+	vpaddq	%zmm25,%zmm11,%zmm11
+	vpaddq	%zmm28,%zmm14,%zmm14
+	vpaddq	%zmm29,%zmm15,%zmm15
+	vpaddq	%zmm26,%zmm12,%zmm12
+	vpaddq	%zmm27,%zmm13,%zmm13
+
+	vpmuludq	%zmm6,%zmm24,%zmm28
+	vpmuludq	%zmm6,%zmm16,%zmm29
+	vpmuludq	%zmm6,%zmm21,%zmm25
+	vpmuludq	%zmm6,%zmm22,%zmm26
+	vpmuludq	%zmm6,%zmm23,%zmm27
+	vpaddq	%zmm28,%zmm14,%zmm14
+	vpaddq	%zmm29,%zmm15,%zmm15
+	vpaddq	%zmm25,%zmm11,%zmm11
+	vpaddq	%zmm26,%zmm12,%zmm12
+	vpaddq	%zmm27,%zmm13,%zmm13
+
+	vmovdqu64	0(%rsi),%zmm10	/* load the first 128 input bytes */
+	vmovdqu64	64(%rsi),%zmm6
+	leaq	128(%rsi),%rsi
+	/* Carry propagation over %zmm11..%zmm15: mask each with %zmm5 and add its bits from bit 26 up to the next. */
+	vpsrlq	$26,%zmm14,%zmm28
+	vpandq	%zmm5,%zmm14,%zmm14
+	vpaddq	%zmm28,%zmm15,%zmm15
+
+	vpsrlq	$26,%zmm11,%zmm25
+	vpandq	%zmm5,%zmm11,%zmm11
+	vpaddq	%zmm25,%zmm12,%zmm12
+
+	vpsrlq	$26,%zmm15,%zmm29
+	vpandq	%zmm5,%zmm15,%zmm15
+
+	vpsrlq	$26,%zmm12,%zmm26
+	vpandq	%zmm5,%zmm12,%zmm12
+	vpaddq	%zmm26,%zmm13,%zmm13
+
+	vpaddq	%zmm29,%zmm11,%zmm11	/* carry out of the top limb re-enters limb 0 multiplied by 5 (c + 4*c) */
+	vpsllq	$2,%zmm29,%zmm29
+	vpaddq	%zmm29,%zmm11,%zmm11
+
+	vpsrlq	$26,%zmm13,%zmm27
+	vpandq	%zmm5,%zmm13,%zmm13
+	vpaddq	%zmm27,%zmm14,%zmm14
+
+	vpsrlq	$26,%zmm11,%zmm25
+	vpandq	%zmm5,%zmm11,%zmm11
+	vpaddq	%zmm25,%zmm12,%zmm12
+
+	vpsrlq	$26,%zmm14,%zmm28
+	vpandq	%zmm5,%zmm14,%zmm14
+	vpaddq	%zmm28,%zmm15,%zmm15
+
+	vpunpcklqdq	%zmm6,%zmm10,%zmm7
+	vpunpckhqdq	%zmm6,%zmm10,%zmm6
+
+	vmovdqa32	128(%rcx),%zmm25	/* second vpermd index vector, from .Lconst+128 */
+	movl	$0x7777,%eax
+	kmovw	%eax,%k1	/* %k1 = 0x7777: dword elements that receive the newly computed values in the merges below */
+
+	vpermd	%zmm16,%zmm25,%zmm16
+	vpermd	%zmm17,%zmm25,%zmm17
+	vpermd	%zmm18,%zmm25,%zmm18
+	vpermd	%zmm19,%zmm25,%zmm19
+	vpermd	%zmm20,%zmm25,%zmm20
+
+	vpermd	%zmm11,%zmm25,%zmm16{%k1}
+	vpermd	%zmm12,%zmm25,%zmm17{%k1}
+	vpermd	%zmm13,%zmm25,%zmm18{%k1}
+	vpermd	%zmm14,%zmm25,%zmm19{%k1}
+	vpermd	%zmm15,%zmm25,%zmm20{%k1}
+
+	vpslld	$2,%zmm17,%zmm21	/* %zmm21..%zmm24 = 5 * %zmm17..%zmm20 (4*x + x), per dword */
+	vpslld	$2,%zmm18,%zmm22
+	vpslld	$2,%zmm19,%zmm23
+	vpslld	$2,%zmm20,%zmm24
+	vpaddd	%zmm17,%zmm21,%zmm21
+	vpaddd	%zmm18,%zmm22,%zmm22
+	vpaddd	%zmm19,%zmm23,%zmm23
+	vpaddd	%zmm20,%zmm24,%zmm24
+
+	vpbroadcastq	32(%rcx),%zmm30	/* NOTE(review): constant from .Lconst+32, ORed into the top input limb later; presumably the pad bit -- confirm */
+
+	vpsrlq	$52,%zmm7,%zmm9	/* split the loaded input into limbs; the remaining masks are applied at the top of the loop / tail */
+	vpsllq	$12,%zmm6,%zmm10
+	vporq	%zmm10,%zmm9,%zmm9
+	vpsrlq	$26,%zmm7,%zmm8
+	vpsrlq	$14,%zmm6,%zmm10
+	vpsrlq	$40,%zmm6,%zmm6
+	vpandq	%zmm5,%zmm9,%zmm9
+	vpandq	%zmm5,%zmm7,%zmm7
+
+	vpaddq	%zmm2,%zmm9,%zmm2	/* the middle input limb is added to the accumulator here, ahead of the others */
+	subq	$192,%rdx
+	jbe	.Ltail_avx512
+	jmp	.Loop_avx512
+
+.align	32
+.Loop_avx512:
+	/* Per pass (128 input bytes): finish masking the pending input limbs and add them to %zmm0..%zmm4, multiply by the table vectors %zmm16..%zmm24, propagate carries, and meanwhile load and split the next 128 bytes. */
+	vpmuludq	%zmm2,%zmm17,%zmm14
+	vpaddq	%zmm0,%zmm7,%zmm0
+	vpmuludq	%zmm2,%zmm18,%zmm15
+	vpandq	%zmm5,%zmm8,%zmm8
+	vpmuludq	%zmm2,%zmm23,%zmm11
+	vpandq	%zmm5,%zmm10,%zmm10
+	vpmuludq	%zmm2,%zmm24,%zmm12
+	vporq	%zmm30,%zmm6,%zmm6
+	vpmuludq	%zmm2,%zmm16,%zmm13
+	vpaddq	%zmm1,%zmm8,%zmm1
+	vpaddq	%zmm3,%zmm10,%zmm3
+	vpaddq	%zmm4,%zmm6,%zmm4
+
+	vmovdqu64	0(%rsi),%zmm10	/* load the next 128 input bytes */
+	vmovdqu64	64(%rsi),%zmm6
+	leaq	128(%rsi),%rsi
+	vpmuludq	%zmm0,%zmm19,%zmm28
+	vpmuludq	%zmm0,%zmm20,%zmm29
+	vpmuludq	%zmm0,%zmm16,%zmm25
+	vpmuludq	%zmm0,%zmm17,%zmm26
+	vpaddq	%zmm28,%zmm14,%zmm14
+	vpaddq	%zmm29,%zmm15,%zmm15
+	vpaddq	%zmm25,%zmm11,%zmm11
+	vpaddq	%zmm26,%zmm12,%zmm12
+
+	vpmuludq	%zmm1,%zmm18,%zmm28
+	vpmuludq	%zmm1,%zmm19,%zmm29
+	vpmuludq	%zmm1,%zmm24,%zmm25
+	vpmuludq	%zmm0,%zmm18,%zmm27
+	vpaddq	%zmm28,%zmm14,%zmm14
+	vpaddq	%zmm29,%zmm15,%zmm15
+	vpaddq	%zmm25,%zmm11,%zmm11
+	vpaddq	%zmm27,%zmm13,%zmm13
+
+	vpunpcklqdq	%zmm6,%zmm10,%zmm7
+	vpunpckhqdq	%zmm6,%zmm10,%zmm6
+
+	vpmuludq	%zmm3,%zmm16,%zmm28
+	vpmuludq	%zmm3,%zmm17,%zmm29
+	vpmuludq	%zmm1,%zmm16,%zmm26
+	vpmuludq	%zmm1,%zmm17,%zmm27
+	vpaddq	%zmm28,%zmm14,%zmm14
+	vpaddq	%zmm29,%zmm15,%zmm15
+	vpaddq	%zmm26,%zmm12,%zmm12
+	vpaddq	%zmm27,%zmm13,%zmm13
+
+	vpmuludq	%zmm4,%zmm24,%zmm28
+	vpmuludq	%zmm4,%zmm16,%zmm29
+	vpmuludq	%zmm3,%zmm22,%zmm25
+	vpmuludq	%zmm3,%zmm23,%zmm26
+	vpaddq	%zmm28,%zmm14,%zmm14
+	vpmuludq	%zmm3,%zmm24,%zmm27
+	vpaddq	%zmm29,%zmm15,%zmm15
+	vpaddq	%zmm25,%zmm11,%zmm11
+	vpaddq	%zmm26,%zmm12,%zmm12
+	vpaddq	%zmm27,%zmm13,%zmm13
+
+	vpmuludq	%zmm4,%zmm21,%zmm25
+	vpmuludq	%zmm4,%zmm22,%zmm26
+	vpmuludq	%zmm4,%zmm23,%zmm27
+	vpaddq	%zmm25,%zmm11,%zmm0
+	vpaddq	%zmm26,%zmm12,%zmm1
+	vpaddq	%zmm27,%zmm13,%zmm2
+
+	vpsrlq	$52,%zmm7,%zmm9
+	vpsllq	$12,%zmm6,%zmm10
+	/* Carry propagation (interleaved with splitting the new input): mask each limb with %zmm5 and add its bits from bit 26 up to the next limb. */
+	vpsrlq	$26,%zmm14,%zmm3
+	vpandq	%zmm5,%zmm14,%zmm14
+	vpaddq	%zmm3,%zmm15,%zmm4
+
+	vporq	%zmm10,%zmm9,%zmm9
+
+	vpsrlq	$26,%zmm0,%zmm11
+	vpandq	%zmm5,%zmm0,%zmm0
+	vpaddq	%zmm11,%zmm1,%zmm1
+
+	vpandq	%zmm5,%zmm9,%zmm9
+
+	vpsrlq	$26,%zmm4,%zmm15
+	vpandq	%zmm5,%zmm4,%zmm4
+
+	vpsrlq	$26,%zmm1,%zmm12
+	vpandq	%zmm5,%zmm1,%zmm1
+	vpaddq	%zmm12,%zmm2,%zmm2
+
+	vpaddq	%zmm15,%zmm0,%zmm0	/* carry out of the top limb re-enters limb 0 multiplied by 5 (c + 4*c) */
+	vpsllq	$2,%zmm15,%zmm15
+	vpaddq	%zmm15,%zmm0,%zmm0
+
+	vpaddq	%zmm9,%zmm2,%zmm2	/* add the middle limb of the new input ahead of the next pass */
+	vpsrlq	$26,%zmm7,%zmm8
+
+	vpsrlq	$26,%zmm2,%zmm13
+	vpandq	%zmm5,%zmm2,%zmm2
+	vpaddq	%zmm13,%zmm14,%zmm3
+
+	vpsrlq	$14,%zmm6,%zmm10
+
+	vpsrlq	$26,%zmm0,%zmm11
+	vpandq	%zmm5,%zmm0,%zmm0
+	vpaddq	%zmm11,%zmm1,%zmm1
+
+	vpsrlq	$40,%zmm6,%zmm6
+
+	vpsrlq	$26,%zmm3,%zmm14
+	vpandq	%zmm5,%zmm3,%zmm3
+	vpaddq	%zmm14,%zmm4,%zmm4
+
+	vpandq	%zmm5,%zmm7,%zmm7
+
+	subq	$128,%rdx
+	ja	.Loop_avx512	/* unsigned test: loop while the subtraction neither borrowed nor reached zero */
+
+.Ltail_avx512:
+	/* Last AVX-512 multiply, using the upper dword of every table qword; then the lanes are summed to 256 bits, carries propagated, and either the AVX2 tail takes over for a final 64 bytes or the limbs are stored. */
+	vpsrlq	$32,%zmm16,%zmm16
+	vpsrlq	$32,%zmm17,%zmm17
+	vpsrlq	$32,%zmm18,%zmm18
+	vpsrlq	$32,%zmm23,%zmm23
+	vpsrlq	$32,%zmm24,%zmm24
+	vpsrlq	$32,%zmm19,%zmm19
+	vpsrlq	$32,%zmm20,%zmm20
+	vpsrlq	$32,%zmm21,%zmm21
+	vpsrlq	$32,%zmm22,%zmm22
+
+	leaq	(%rsi,%rdx,1),%rsi	/* %rdx is zero or has wrapped negative here, so this leaves %rsi unchanged or moves it back */
+
+	vpaddq	%zmm0,%zmm7,%zmm0
+
+	vpmuludq	%zmm2,%zmm17,%zmm14
+	vpmuludq	%zmm2,%zmm18,%zmm15
+	vpmuludq	%zmm2,%zmm23,%zmm11
+	vpandq	%zmm5,%zmm8,%zmm8
+	vpmuludq	%zmm2,%zmm24,%zmm12
+	vpandq	%zmm5,%zmm10,%zmm10
+	vpmuludq	%zmm2,%zmm16,%zmm13
+	vporq	%zmm30,%zmm6,%zmm6
+	vpaddq	%zmm1,%zmm8,%zmm1
+	vpaddq	%zmm3,%zmm10,%zmm3
+	vpaddq	%zmm4,%zmm6,%zmm4
+
+	vmovdqu	0(%rsi),%xmm7	/* start loading 64 bytes at the adjusted %rsi for a possible AVX2 tail pass */
+	vpmuludq	%zmm0,%zmm19,%zmm28
+	vpmuludq	%zmm0,%zmm20,%zmm29
+	vpmuludq	%zmm0,%zmm16,%zmm25
+	vpmuludq	%zmm0,%zmm17,%zmm26
+	vpaddq	%zmm28,%zmm14,%zmm14
+	vpaddq	%zmm29,%zmm15,%zmm15
+	vpaddq	%zmm25,%zmm11,%zmm11
+	vpaddq	%zmm26,%zmm12,%zmm12
+
+	vmovdqu	16(%rsi),%xmm8
+	vpmuludq	%zmm1,%zmm18,%zmm28
+	vpmuludq	%zmm1,%zmm19,%zmm29
+	vpmuludq	%zmm1,%zmm24,%zmm25
+	vpmuludq	%zmm0,%zmm18,%zmm27
+	vpaddq	%zmm28,%zmm14,%zmm14
+	vpaddq	%zmm29,%zmm15,%zmm15
+	vpaddq	%zmm25,%zmm11,%zmm11
+	vpaddq	%zmm27,%zmm13,%zmm13
+
+	vinserti128	$1,32(%rsi),%ymm7,%ymm7
+	vpmuludq	%zmm3,%zmm16,%zmm28
+	vpmuludq	%zmm3,%zmm17,%zmm29
+	vpmuludq	%zmm1,%zmm16,%zmm26
+	vpmuludq	%zmm1,%zmm17,%zmm27
+	vpaddq	%zmm28,%zmm14,%zmm14
+	vpaddq	%zmm29,%zmm15,%zmm15
+	vpaddq	%zmm26,%zmm12,%zmm12
+	vpaddq	%zmm27,%zmm13,%zmm13
+
+	vinserti128	$1,48(%rsi),%ymm8,%ymm8
+	vpmuludq	%zmm4,%zmm24,%zmm28
+	vpmuludq	%zmm4,%zmm16,%zmm29
+	vpmuludq	%zmm3,%zmm22,%zmm25
+	vpmuludq	%zmm3,%zmm23,%zmm26
+	vpmuludq	%zmm3,%zmm24,%zmm27
+	vpaddq	%zmm28,%zmm14,%zmm3
+	vpaddq	%zmm29,%zmm15,%zmm15
+	vpaddq	%zmm25,%zmm11,%zmm11
+	vpaddq	%zmm26,%zmm12,%zmm12
+	vpaddq	%zmm27,%zmm13,%zmm13
+
+	vpmuludq	%zmm4,%zmm21,%zmm25
+	vpmuludq	%zmm4,%zmm22,%zmm26
+	vpmuludq	%zmm4,%zmm23,%zmm27
+	vpaddq	%zmm25,%zmm11,%zmm0
+	vpaddq	%zmm26,%zmm12,%zmm1
+	vpaddq	%zmm27,%zmm13,%zmm2
+	/* Horizontal sum, step 1: add each qword to its neighbour within a qword pair (vpermq $0xb1 swaps the pairs). */
+	movl	$1,%eax
+	vpermq	$0xb1,%zmm3,%zmm14
+	vpermq	$0xb1,%zmm15,%zmm4
+	vpermq	$0xb1,%zmm0,%zmm11
+	vpermq	$0xb1,%zmm1,%zmm12
+	vpermq	$0xb1,%zmm2,%zmm13
+	vpaddq	%zmm14,%zmm3,%zmm3
+	vpaddq	%zmm15,%zmm4,%zmm4
+	vpaddq	%zmm11,%zmm0,%zmm0
+	vpaddq	%zmm12,%zmm1,%zmm1
+	vpaddq	%zmm13,%zmm2,%zmm2
+
+	kmovw	%eax,%k3	/* %k3 = 1: the zeroing-masked adds below keep only the lowest qword */
+	vpermq	$0x2,%zmm3,%zmm14
+	vpermq	$0x2,%zmm4,%zmm15
+	vpermq	$0x2,%zmm0,%zmm11
+	vpermq	$0x2,%zmm1,%zmm12
+	vpermq	$0x2,%zmm2,%zmm13
+	vpaddq	%zmm14,%zmm3,%zmm3
+	vpaddq	%zmm15,%zmm4,%zmm4
+	vpaddq	%zmm11,%zmm0,%zmm0
+	vpaddq	%zmm12,%zmm1,%zmm1
+	vpaddq	%zmm13,%zmm2,%zmm2
+
+	vextracti64x4	$0x1,%zmm3,%ymm14
+	vextracti64x4	$0x1,%zmm4,%ymm15
+	vextracti64x4	$0x1,%zmm0,%ymm11
+	vextracti64x4	$0x1,%zmm1,%ymm12
+	vextracti64x4	$0x1,%zmm2,%ymm13
+	vpaddq	%zmm14,%zmm3,%zmm3{%k3}{z}
+	vpaddq	%zmm15,%zmm4,%zmm4{%k3}{z}
+	vpaddq	%zmm11,%zmm0,%zmm0{%k3}{z}
+	vpaddq	%zmm12,%zmm1,%zmm1{%k3}{z}
+	vpaddq	%zmm13,%zmm2,%zmm2{%k3}{z}
+	/* Carry propagation on the 256-bit halves, interleaved with splitting the 64 bytes loaded above into limb vectors %ymm7, %ymm8, %ymm9, %ymm10, %ymm6. */
+	vpsrlq	$26,%ymm3,%ymm14
+	vpand	%ymm5,%ymm3,%ymm3
+	vpsrldq	$6,%ymm7,%ymm9
+	vpsrldq	$6,%ymm8,%ymm10
+	vpunpckhqdq	%ymm8,%ymm7,%ymm6
+	vpaddq	%ymm14,%ymm4,%ymm4
+
+	vpsrlq	$26,%ymm0,%ymm11
+	vpand	%ymm5,%ymm0,%ymm0
+	vpunpcklqdq	%ymm10,%ymm9,%ymm9
+	vpunpcklqdq	%ymm8,%ymm7,%ymm7
+	vpaddq	%ymm11,%ymm1,%ymm1
+
+	vpsrlq	$26,%ymm4,%ymm15
+	vpand	%ymm5,%ymm4,%ymm4
+
+	vpsrlq	$26,%ymm1,%ymm12
+	vpand	%ymm5,%ymm1,%ymm1
+	vpsrlq	$30,%ymm9,%ymm10
+	vpsrlq	$4,%ymm9,%ymm9
+	vpaddq	%ymm12,%ymm2,%ymm2
+
+	vpaddq	%ymm15,%ymm0,%ymm0	/* carry out of the top limb re-enters limb 0 multiplied by 5 (c + 4*c) */
+	vpsllq	$2,%ymm15,%ymm15
+	vpsrlq	$26,%ymm7,%ymm8
+	vpsrlq	$40,%ymm6,%ymm6
+	vpaddq	%ymm15,%ymm0,%ymm0
+
+	vpsrlq	$26,%ymm2,%ymm13
+	vpand	%ymm5,%ymm2,%ymm2
+	vpand	%ymm5,%ymm9,%ymm9
+	vpand	%ymm5,%ymm7,%ymm7
+	vpaddq	%ymm13,%ymm3,%ymm3
+
+	vpsrlq	$26,%ymm0,%ymm11
+	vpand	%ymm5,%ymm0,%ymm0
+	vpaddq	%ymm2,%ymm9,%ymm2	/* speculatively add the middle input limb; undone by vpsubq below if no input is left */
+	vpand	%ymm5,%ymm8,%ymm8
+	vpaddq	%ymm11,%ymm1,%ymm1
+
+	vpsrlq	$26,%ymm3,%ymm14
+	vpand	%ymm5,%ymm3,%ymm3
+	vpand	%ymm5,%ymm10,%ymm10
+	vpor	32(%rcx),%ymm6,%ymm6
+	vpaddq	%ymm14,%ymm4,%ymm4
+
+	leaq	144(%rsp),%rax	/* .Ltail_avx2_512 addresses part of the frame through %rax = frame base + 144 */
+	addq	$64,%rdx
+	jnz	.Ltail_avx2_512	/* non-zero: input remains, finish it in the AVX2 tail */
+
+	vpsubq	%ymm9,%ymm2,%ymm2	/* no input left: take the speculatively added limb back out */
+	vmovd	%xmm0,-112(%rdi)	/* %rdi was advanced by 112 at .Lblocks_avx512, so these five stores write ctx+0..+16 */
+	vmovd	%xmm1,-108(%rdi)
+	vmovd	%xmm2,-104(%rdi)
+	vmovd	%xmm3,-100(%rdi)
+	vmovd	%xmm4,-96(%rdi)
+	vzeroall
+	leaq	-8(%r10),%rsp	/* restore the stack pointer remembered in %r10 at .Lblocks_avx512 */
+
+	ret
+
+ENDPROC(poly1305_blocks_avx512)
+#endif /* CONFIG_AS_AVX512 */
-- 
2.19.0

^ permalink raw reply related

* [PATCH net-next v4 08/20] zinc: Poly1305 ARM and ARM64 implementations
From: Jason A. Donenfeld @ 2018-09-14 16:22 UTC (permalink / raw)
  To: linux-kernel, netdev, linux-crypto, davem, gregkh
  Cc: Jason A. Donenfeld, Samuel Neves, Andy Lutomirski,
	Jean-Philippe Aumasson, Andy Polyakov, Russell King,
	linux-arm-kernel
In-Reply-To: <20180914162240.7925-1-Jason@zx2c4.com>

These NEON and non-NEON implementations come from Andy Polyakov's
implementation. They are exactly the same as Andy Polyakov's original,
with the following exceptions:

- Entries and exits use the proper kernel convention macro.
- CPU feature checking is done in C by the glue code, so that has been
  removed from the assembly.
- The function names have been renamed to fit kernel conventions.
- Labels have been renamed to fit kernel conventions.
- The NEON code can jump to the scalar code when it makes sense to do
  so.

After '/^#/d;/^\..*[^:]$/d', the code has the following diff in actual
instructions from the original.

ARM:

-poly1305_init:
-.Lpoly1305_init:
+ENTRY(poly1305_init_arm)
 	stmdb	sp!,{r4-r11}

 	eor	r3,r3,r3
@@ -18,8 +25,6 @@
 	moveq	r0,#0
 	beq	.Lno_key

-	adr	r11,.Lpoly1305_init
-	ldr	r12,.LOPENSSL_armcap
 	ldrb	r4,[r1,#0]
 	mov	r10,#0x0fffffff
 	ldrb	r5,[r1,#1]
@@ -34,8 +39,6 @@
 	ldrb	r7,[r1,#6]
 	and	r4,r4,r10

-	ldr	r12,[r11,r12]		@ OPENSSL_armcap_P
-	ldr	r12,[r12]
 	ldrb	r8,[r1,#7]
 	orr	r5,r5,r6,lsl#8
 	ldrb	r6,[r1,#8]
@@ -45,22 +48,6 @@
 	ldrb	r8,[r1,#10]
 	and	r5,r5,r3

-	tst	r12,#ARMV7_NEON		@ check for NEON
-	adr	r9,poly1305_blocks_neon
-	adr	r11,poly1305_blocks
-	it	ne
-	movne	r11,r9
-	adr	r12,poly1305_emit
-	adr	r10,poly1305_emit_neon
-	it	ne
-	movne	r12,r10
-	itete	eq
-	addeq	r12,r11,#(poly1305_emit-.Lpoly1305_init)
-	addne	r12,r11,#(poly1305_emit_neon-.Lpoly1305_init)
-	addeq	r11,r11,#(poly1305_blocks-.Lpoly1305_init)
-	addne	r11,r11,#(poly1305_blocks_neon-.Lpoly1305_init)
-	orr	r12,r12,#1	@ thumb-ify address
-	orr	r11,r11,#1
 	ldrb	r9,[r1,#11]
 	orr	r6,r6,r7,lsl#8
 	ldrb	r7,[r1,#12]
@@ -79,17 +66,16 @@
 	str	r6,[r0,#8]
 	and	r7,r7,r3
 	str	r7,[r0,#12]
-	stmia	r2,{r11,r12}		@ fill functions table
-	mov	r0,#1
-	mov	r0,#0
 .Lno_key:
 	ldmia	sp!,{r4-r11}
 	bx	lr				@ bx	lr
 	tst	lr,#1
 	moveq	pc,lr			@ be binary compatible with V4, yet
 	.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
-poly1305_blocks:
-.Lpoly1305_blocks:
+ENDPROC(poly1305_init_arm)
+
+ENTRY(poly1305_blocks_arm)
+.Lpoly1305_blocks_arm:
 	stmdb	sp!,{r3-r11,lr}

 	ands	r2,r2,#-16
@@ -231,10 +217,11 @@
 	tst	lr,#1
 	moveq	pc,lr			@ be binary compatible with V4, yet
 	.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
-poly1305_emit:
+ENDPROC(poly1305_blocks_arm)
+
+ENTRY(poly1305_emit_arm)
 	stmdb	sp!,{r4-r11}
 .Lpoly1305_emit_enter:
-
 	ldmia	r0,{r3-r7}
 	adds	r8,r3,#5		@ compare to modulus
 	adcs	r9,r4,#0
@@ -305,8 +292,12 @@
 	tst	lr,#1
 	moveq	pc,lr			@ be binary compatible with V4, yet
 	.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
+ENDPROC(poly1305_emit_arm)
+
+

-poly1305_init_neon:
+ENTRY(poly1305_init_neon)
+.Lpoly1305_init_neon:
 	ldr	r4,[r0,#20]		@ load key base 2^32
 	ldr	r5,[r0,#24]
 	ldr	r6,[r0,#28]
@@ -515,8 +506,9 @@
 	vst1.32		{d8[1]},[r7]

 	bx	lr				@ bx	lr
+ENDPROC(poly1305_init_neon)

-poly1305_blocks_neon:
+ENTRY(poly1305_blocks_neon)
 	ldr	ip,[r0,#36]		@ is_base2_26
 	ands	r2,r2,#-16
 	beq	.Lno_data_neon
@@ -524,7 +516,7 @@
 	cmp	r2,#64
 	bhs	.Lenter_neon
 	tst	ip,ip			@ is_base2_26?
-	beq	.Lpoly1305_blocks
+	beq	.Lpoly1305_blocks_arm

 .Lenter_neon:
 	stmdb	sp!,{r4-r7}
@@ -534,7 +526,7 @@
 	bne	.Lbase2_26_neon

 	stmdb	sp!,{r1-r3,lr}
-	bl	poly1305_init_neon
+	bl	.Lpoly1305_init_neon

 	ldr	r4,[r0,#0]		@ load hash value base 2^32
 	ldr	r5,[r0,#4]
@@ -989,8 +981,9 @@
 	ldmia	sp!,{r4-r7}
 .Lno_data_neon:
 	bx	lr					@ bx	lr
+ENDPROC(poly1305_blocks_neon)

-poly1305_emit_neon:
+ENTRY(poly1305_emit_neon)
 	ldr	ip,[r0,#36]		@ is_base2_26

 	stmdb	sp!,{r4-r11}
@@ -1055,6 +1048,6 @@

 	ldmia	sp!,{r4-r11}
 	bx	lr				@ bx	lr
+ENDPROC(poly1305_emit_neon)

ARM64:

-poly1305_init:
+ENTRY(poly1305_init_arm)
 	cmp	x1,xzr
 	stp	xzr,xzr,[x0]		// zero hash value
 	stp	xzr,xzr,[x0,#16]	// [along with is_base2_26]
@@ -11,14 +15,9 @@
 	csel	x0,xzr,x0,eq
 	b.eq	.Lno_key

-	ldrsw	x11,.LOPENSSL_armcap_P
-	ldr	x11,.LOPENSSL_armcap_P
-	adr	x10,.LOPENSSL_armcap_P
-
 	ldp	x7,x8,[x1]		// load key
 	mov	x9,#0xfffffffc0fffffff
 	movk	x9,#0x0fff,lsl#48
-	ldr	w17,[x10,x11]
 	rev	x7,x7			// flip bytes
 	rev	x8,x8
 	and	x7,x7,x9		// &=0ffffffc0fffffff
@@ -26,24 +25,11 @@
 	and	x8,x8,x9		// &=0ffffffc0ffffffc
 	stp	x7,x8,[x0,#32]	// save key value

-	tst	w17,#ARMV7_NEON
-
-	adr	x12,poly1305_blocks
-	adr	x7,poly1305_blocks_neon
-	adr	x13,poly1305_emit
-	adr	x8,poly1305_emit_neon
-
-	csel	x12,x12,x7,eq
-	csel	x13,x13,x8,eq
-
-	stp	w12,w13,[x2]
-	stp	x12,x13,[x2]
-
-	mov	x0,#1
 .Lno_key:
 	ret
+ENDPROC(poly1305_init_arm)

-poly1305_blocks:
+ENTRY(poly1305_blocks_arm)
 	ands	x2,x2,#-16
 	b.eq	.Lno_data

@@ -100,8 +86,9 @@

 .Lno_data:
 	ret
+ENDPROC(poly1305_blocks_arm)

-poly1305_emit:
+ENTRY(poly1305_emit_arm)
 	ldp	x4,x5,[x0]		// load hash base 2^64
 	ldr	x6,[x0,#16]
 	ldp	x10,x11,[x2]	// load nonce
@@ -124,7 +111,9 @@
 	stp	x4,x5,[x1]		// write result

 	ret
-poly1305_mult:
+ENDPROC(poly1305_emit_arm)
+
+__poly1305_mult:
 	mul	x12,x4,x7		// h0*r0
 	umulh	x13,x4,x7

@@ -158,7 +147,7 @@

 	ret

-poly1305_splat:
+__poly1305_splat:
 	and	x12,x4,#0x03ffffff	// base 2^64 -> base 2^26
 	ubfx	x13,x4,#26,#26
 	extr	x14,x5,x4,#52
@@ -182,11 +171,11 @@

 	ret

-poly1305_blocks_neon:
+ENTRY(poly1305_blocks_neon)
 	ldr	x17,[x0,#24]
 	cmp	x2,#128
 	b.hs	.Lblocks_neon
-	cbz	x17,poly1305_blocks
+	cbz	x17,poly1305_blocks_arm

 .Lblocks_neon:
 	stp	x29,x30,[sp,#-80]!
@@ -232,7 +221,7 @@
 	adcs	x5,x5,x13
 	adc	x6,x6,x3

-	bl	poly1305_mult
+	bl	__poly1305_mult
 	ldr	x30,[sp,#8]

 	cbz	x3,.Lstore_base2_64_neon
@@ -274,7 +263,7 @@
 	adcs	x5,x5,x13
 	adc	x6,x6,x3

-	bl	poly1305_mult
+	bl	__poly1305_mult

 .Linit_neon:
 	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
@@ -301,19 +290,19 @@
 	mov	x5,x8
 	mov	x6,xzr
 	add	x0,x0,#48+12
-	bl	poly1305_splat
+	bl	__poly1305_splat

-	bl	poly1305_mult		// r^2
+	bl	__poly1305_mult		// r^2
 	sub	x0,x0,#4
-	bl	poly1305_splat
+	bl	__poly1305_splat

-	bl	poly1305_mult		// r^3
+	bl	__poly1305_mult		// r^3
 	sub	x0,x0,#4
-	bl	poly1305_splat
+	bl	__poly1305_splat

-	bl	poly1305_mult		// r^4
+	bl	__poly1305_mult		// r^4
 	sub	x0,x0,#4
-	bl	poly1305_splat
+	bl	__poly1305_splat
 	ldr	x30,[sp,#8]

 	add	x16,x1,#32
@@ -743,10 +732,11 @@
 .Lno_data_neon:
 	ldr	x29,[sp],#80
 	ret
+ENDPROC(poly1305_blocks_neon)

-poly1305_emit_neon:
+ENTRY(poly1305_emit_neon)
 	ldr	x17,[x0,#24]
-	cbz	x17,poly1305_emit
+	cbz	x17,poly1305_emit_arm

 	ldp	w10,w11,[x0]		// load hash value base 2^26
 	ldp	w12,w13,[x0,#8]
@@ -788,6 +778,6 @@
 	stp	x4,x5,[x1]		// write result

 	ret
+ENDPROC(poly1305_emit_neon)

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Cc: Samuel Neves <sneves@dei.uc.pt>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Jean-Philippe Aumasson <jeanphilippe.aumasson@gmail.com>
Cc: Andy Polyakov <appro@openssl.org>
Cc: Russell King <linux@armlinux.org.uk>
Cc: linux-arm-kernel@lists.infradead.org
---
 lib/zinc/Makefile                     |    8 +
 lib/zinc/poly1305/poly1305-arm-glue.h |   69 ++
 lib/zinc/poly1305/poly1305-arm.S      | 1117 +++++++++++++++++++++++++
 lib/zinc/poly1305/poly1305-arm64.S    |  822 ++++++++++++++++++
 4 files changed, 2016 insertions(+)
 create mode 100644 lib/zinc/poly1305/poly1305-arm-glue.h
 create mode 100644 lib/zinc/poly1305/poly1305-arm.S
 create mode 100644 lib/zinc/poly1305/poly1305-arm64.S

diff --git a/lib/zinc/Makefile b/lib/zinc/Makefile
index d1e3892e06d9..f37df89a3f87 100644
--- a/lib/zinc/Makefile
+++ b/lib/zinc/Makefile
@@ -25,6 +25,14 @@ endif
 
 ifeq ($(CONFIG_ZINC_POLY1305),y)
 zinc-y += poly1305/poly1305.o
+ifeq ($(CONFIG_ZINC_ARCH_ARM),y)
+zinc-y += poly1305/poly1305-arm.o
+CFLAGS_poly1305.o += -include $(srctree)/$(src)/poly1305/poly1305-arm-glue.h
+endif
+ifeq ($(CONFIG_ZINC_ARCH_ARM64),y)
+zinc-y += poly1305/poly1305-arm64.o
+CFLAGS_poly1305.o += -include $(srctree)/$(src)/poly1305/poly1305-arm-glue.h
+endif
 endif
 
 zinc-y += main.o
diff --git a/lib/zinc/poly1305/poly1305-arm-glue.h b/lib/zinc/poly1305/poly1305-arm-glue.h
new file mode 100644
index 000000000000..53f8fec7f858
--- /dev/null
+++ b/lib/zinc/poly1305/poly1305-arm-glue.h
@@ -0,0 +1,69 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <zinc/poly1305.h>
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+
+asmlinkage void poly1305_init_arm(void *ctx, const u8 key[16]);
+asmlinkage void poly1305_blocks_arm(void *ctx, const u8 *inp, const size_t len,
+				    const u32 padbit);
+asmlinkage void poly1305_emit_arm(void *ctx, u8 mac[16], const u32 nonce[4]);
+#if IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&                                     \
+	(defined(CONFIG_64BIT) || __LINUX_ARM_ARCH__ >= 7)
+#define ARM_USE_NEON
+asmlinkage void poly1305_blocks_neon(void *ctx, const u8 *inp, const size_t len,
+				     const u32 padbit);
+asmlinkage void poly1305_emit_neon(void *ctx, u8 mac[16], const u32 nonce[4]);
+#endif
+
+static bool poly1305_use_neon __ro_after_init;
+
+void __init poly1305_fpu_init(void) /* caches the SIMD hwcap test in poly1305_use_neon */
+{
+#if defined(CONFIG_ARM64)
+	poly1305_use_neon = elf_hwcap & HWCAP_ASIMD; /* true iff the ASIMD bit is set in elf_hwcap */
+#elif defined(CONFIG_ARM)
+	poly1305_use_neon = elf_hwcap & HWCAP_NEON; /* true iff the NEON bit is set in elf_hwcap */
+#endif
+}
+
+static inline bool poly1305_init_arch(void *ctx,
+				      const u8 key[POLY1305_KEY_SIZE],
+				      simd_context_t simd_context)
+{
+	poly1305_init_arm(ctx, key); /* simd_context is not consulted: init always uses the scalar routine */
+	return true; /* unconditionally reports the request as handled */
+}
+
+static inline bool poly1305_blocks_arch(void *ctx, const u8 *inp,
+					const size_t len, const u32 padbit,
+					simd_context_t simd_context)
+{
+#if defined(ARM_USE_NEON)
+	if (simd_context == HAVE_FULL_SIMD && poly1305_use_neon) { /* NEON needs both caller permission and CPU support */
+		poly1305_blocks_neon(ctx, inp, len, padbit);
+		return true;
+	}
+#endif
+	poly1305_blocks_arm(ctx, inp, len, padbit); /* scalar fallback; also the only path without ARM_USE_NEON */
+	return true; /* both paths report the request as handled */
+}
+
+static inline bool poly1305_emit_arch(void *ctx, u8 mac[POLY1305_MAC_SIZE],
+				      const u32 nonce[4],
+				      simd_context_t simd_context)
+{
+#if defined(ARM_USE_NEON)
+	if (simd_context == HAVE_FULL_SIMD && poly1305_use_neon) { /* same gating as poly1305_blocks_arch */
+		poly1305_emit_neon(ctx, mac, nonce);
+		return true;
+	}
+#endif
+	poly1305_emit_arm(ctx, mac, nonce); /* scalar fallback; also the only path without ARM_USE_NEON */
+	return true; /* both paths report the request as handled */
+}
+
+#define HAVE_POLY1305_ARCH_IMPLEMENTATION
diff --git a/lib/zinc/poly1305/poly1305-arm.S b/lib/zinc/poly1305/poly1305-arm.S
new file mode 100644
index 000000000000..110f4317b5d7
--- /dev/null
+++ b/lib/zinc/poly1305/poly1305-arm.S
@@ -0,0 +1,1117 @@
+/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ * Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
+ *
+ * This is based in part on Andy Polyakov's implementation from CRYPTOGAMS.
+ */
+
+#include <linux/linkage.h>
+
+.text
+#if defined(__thumb2__)
+.syntax	unified
+.thumb
+#else
+.code	32
+#endif
+
+.align	5
+ENTRY(poly1305_init_arm)	/* r0 = ctx, r1 = key (may be NULL) */
+	stmdb	sp!,{r4-r11}
+	/* Clear the five hash words at ctx+0..16 and the flag word at ctx+36. */
+	eor	r3,r3,r3
+	cmp	r1,#0
+	str	r3,[r0,#0]		@ zero hash value
+	str	r3,[r0,#4]
+	str	r3,[r0,#8]
+	str	r3,[r0,#12]
+	str	r3,[r0,#16]
+	str	r3,[r0,#36]		@ is_base2_26
+	add	r0,r0,#20		/* r0 now addresses ctx+20, where the key words are stored */
+	/* NULL key: leave with r0 = 0 and the key words untouched. */
+#ifdef	__thumb2__
+	it	eq
+#endif
+	moveq	r0,#0
+	beq	.Lno_key
+	/* Build four little-endian key words from bytes; mask with r10 = 0x0fffffff or r3 = 0x0ffffffc. */
+	ldrb	r4,[r1,#0]
+	mov	r10,#0x0fffffff
+	ldrb	r5,[r1,#1]
+	and	r3,r10,#-4		@ 0x0ffffffc
+	ldrb	r6,[r1,#2]
+	ldrb	r7,[r1,#3]
+	orr	r4,r4,r5,lsl#8
+	ldrb	r5,[r1,#4]
+	orr	r4,r4,r6,lsl#16
+	ldrb	r6,[r1,#5]
+	orr	r4,r4,r7,lsl#24
+	ldrb	r7,[r1,#6]
+	and	r4,r4,r10
+
+	ldrb	r8,[r1,#7]
+	orr	r5,r5,r6,lsl#8
+	ldrb	r6,[r1,#8]
+	orr	r5,r5,r7,lsl#16
+	ldrb	r7,[r1,#9]
+	orr	r5,r5,r8,lsl#24
+	ldrb	r8,[r1,#10]
+	and	r5,r5,r3
+
+	ldrb	r9,[r1,#11]
+	orr	r6,r6,r7,lsl#8
+	ldrb	r7,[r1,#12]
+	orr	r6,r6,r8,lsl#16
+	ldrb	r8,[r1,#13]
+	orr	r6,r6,r9,lsl#24
+	ldrb	r9,[r1,#14]
+	and	r6,r6,r3
+
+	ldrb	r10,[r1,#15]
+	orr	r7,r7,r8,lsl#8
+	str	r4,[r0,#0]
+	orr	r7,r7,r9,lsl#16
+	str	r5,[r0,#4]
+	orr	r7,r7,r10,lsl#24
+	str	r6,[r0,#8]
+	and	r7,r7,r3
+	str	r7,[r0,#12]
+.Lno_key:
+	ldmia	sp!,{r4-r11}
+#if __LINUX_ARM_ARCH__ >= 5
+	bx	lr				@ bx	lr
+#else
+	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+	.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
+#endif
+ENDPROC(poly1305_init_arm)
+
+.align	5
+ENTRY(poly1305_blocks_arm)	/* r0 = ctx, r1 = inp, r2 = len, r3 = padbit */
+.Lpoly1305_blocks_arm:
+	stmdb	sp!,{r3-r11,lr}
+	/* Only whole 16-byte blocks are consumed: len is rounded down, and 0 means nothing to do. */
+	ands	r2,r2,#-16
+	beq	.Lno_data
+	/* padbit test: nothing between here and the first addhi in .Loop alters the flags. */
+	cmp	r3,#0
+	add	r2,r2,r1		@ end pointer
+	sub	sp,sp,#32
+
+	ldmia	r0,{r4-r12}		@ load context
+
+	str	r0,[sp,#12]		@ offload stuff
+	mov	lr,r1
+	str	r2,[sp,#16]
+	str	r10,[sp,#20]
+	str	r11,[sp,#24]
+	str	r12,[sp,#28]
+	b	.Loop
+	/* NOTE(review): later passes take HI from the loop-end cmp (always true when looping), so padbit==0 seems to apply to the first block only; confirm callers. */
+.Loop:
+#if __LINUX_ARM_ARCH__ < 7
+	ldrb	r0,[lr],#16		@ load input
+#ifdef	__thumb2__
+	it	hi
+#endif
+	addhi	r8,r8,#1		@ 1<<128
+	ldrb	r1,[lr,#-15]
+	ldrb	r2,[lr,#-14]
+	ldrb	r3,[lr,#-13]
+	orr	r1,r0,r1,lsl#8
+	ldrb	r0,[lr,#-12]
+	orr	r2,r1,r2,lsl#16
+	ldrb	r1,[lr,#-11]
+	orr	r3,r2,r3,lsl#24
+	ldrb	r2,[lr,#-10]
+	adds	r4,r4,r3		@ accumulate input
+
+	ldrb	r3,[lr,#-9]
+	orr	r1,r0,r1,lsl#8
+	ldrb	r0,[lr,#-8]
+	orr	r2,r1,r2,lsl#16
+	ldrb	r1,[lr,#-7]
+	orr	r3,r2,r3,lsl#24
+	ldrb	r2,[lr,#-6]
+	adcs	r5,r5,r3
+
+	ldrb	r3,[lr,#-5]
+	orr	r1,r0,r1,lsl#8
+	ldrb	r0,[lr,#-4]
+	orr	r2,r1,r2,lsl#16
+	ldrb	r1,[lr,#-3]
+	orr	r3,r2,r3,lsl#24
+	ldrb	r2,[lr,#-2]
+	adcs	r6,r6,r3
+
+	ldrb	r3,[lr,#-1]
+	orr	r1,r0,r1,lsl#8
+	str	lr,[sp,#8]		@ offload input pointer
+	orr	r2,r1,r2,lsl#16
+	add	r10,r10,r10,lsr#2
+	orr	r3,r2,r3,lsl#24
+#else
+	ldr	r0,[lr],#16		@ load input
+#ifdef	__thumb2__
+	it	hi
+#endif
+	addhi	r8,r8,#1		@ padbit
+	ldr	r1,[lr,#-12]
+	ldr	r2,[lr,#-8]
+	ldr	r3,[lr,#-4]
+#ifdef	__ARMEB__
+	rev	r0,r0
+	rev	r1,r1
+	rev	r2,r2
+	rev	r3,r3
+#endif
+	adds	r4,r4,r0		@ accumulate input
+	str	lr,[sp,#8]		@ offload input pointer
+	adcs	r5,r5,r1
+	add	r10,r10,r10,lsr#2
+	adcs	r6,r6,r2
+#endif
+	add	r11,r11,r11,lsr#2
+	adcs	r7,r7,r3
+	add	r12,r12,r12,lsr#2
+	/* r10-r12 now hold key word + (key word >> 2); the plain key words are reloaded from the stack below. */
+	umull	r2,r3,r5,r9
+	 adc	r8,r8,#0
+	umull	r0,r1,r4,r9
+	umlal	r2,r3,r8,r10
+	umlal	r0,r1,r7,r10
+	ldr	r10,[sp,#20]		@ reload r10
+	umlal	r2,r3,r6,r12
+	umlal	r0,r1,r5,r12
+	umlal	r2,r3,r7,r11
+	umlal	r0,r1,r6,r11
+	umlal	r2,r3,r4,r10
+	str	r0,[sp,#0]		@ future r4
+	 mul	r0,r11,r8
+	ldr	r11,[sp,#24]		@ reload r11
+	adds	r2,r2,r1		@ d1+=d0>>32
+	 eor	r1,r1,r1
+	adc	lr,r3,#0		@ future r6
+	str	r2,[sp,#4]		@ future r5
+
+	mul	r2,r12,r8
+	eor	r3,r3,r3
+	umlal	r0,r1,r7,r12
+	ldr	r12,[sp,#28]		@ reload r12
+	umlal	r2,r3,r7,r9
+	umlal	r0,r1,r6,r9
+	umlal	r2,r3,r6,r10
+	umlal	r0,r1,r5,r10
+	umlal	r2,r3,r5,r11
+	umlal	r0,r1,r4,r11
+	umlal	r2,r3,r4,r12
+	ldr	r4,[sp,#0]
+	mul	r8,r9,r8
+	ldr	r5,[sp,#4]
+
+	adds	r6,lr,r0		@ d2+=d1>>32
+	ldr	lr,[sp,#8]		@ reload input pointer
+	adc	r1,r1,#0
+	adds	r7,r2,r1		@ d3+=d2>>32
+	ldr	r0,[sp,#16]		@ reload end pointer
+	adc	r3,r3,#0
+	add	r8,r8,r3		@ h4+=d3>>32
+	/* Fold everything above bit 1 of the top word (r8) back into the low word times 5; r8 keeps its low 2 bits. */
+	and	r1,r8,#-4
+	and	r8,r8,#3
+	add	r1,r1,r1,lsr#2		@ *=5
+	adds	r4,r4,r1
+	adcs	r5,r5,#0
+	adcs	r6,r6,#0
+	adcs	r7,r7,#0
+	adc	r8,r8,#0
+
+	cmp	r0,lr			@ done yet?
+	bhi	.Loop
+
+	ldr	r0,[sp,#12]
+	add	sp,sp,#32
+	stmia	r0,{r4-r8}		@ store the result
+
+.Lno_data:
+#if __LINUX_ARM_ARCH__ >= 5
+	ldmia	sp!,{r3-r11,pc}
+#else
+	ldmia	sp!,{r3-r11,lr}
+	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+	.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
+#endif
+ENDPROC(poly1305_blocks_arm)
+
+.align	5
+ENTRY(poly1305_emit_arm)	/* r0 = ctx, r1 = mac, r2 = nonce */
+	stmdb	sp!,{r4-r11}
+.Lpoly1305_emit_enter:
+	ldmia	r0,{r3-r7}
+	adds	r8,r3,#5		@ compare to modulus
+	adcs	r9,r4,#0
+	adcs	r10,r5,#0
+	adcs	r11,r6,#0
+	adc	r7,r7,#0
+	tst	r7,#4			@ did it carry/borrow?
+	/* If bit 2 of the top word is set after adding 5, take the sum in r8-r11 instead of r3-r6. */
+#ifdef	__thumb2__
+	it	ne
+#endif
+	movne	r3,r8
+	ldr	r8,[r2,#0]
+#ifdef	__thumb2__
+	it	ne
+#endif
+	movne	r4,r9
+	ldr	r9,[r2,#4]
+#ifdef	__thumb2__
+	it	ne
+#endif
+	movne	r5,r10
+	ldr	r10,[r2,#8]
+#ifdef	__thumb2__
+	it	ne
+#endif
+	movne	r6,r11
+	ldr	r11,[r2,#12]
+	/* Add the four nonce words; the carry out of the top word is dropped. */
+	adds	r3,r3,r8
+	adcs	r4,r4,r9
+	adcs	r5,r5,r10
+	adc	r6,r6,r11
+	/* ARMv7+: word stores (byte-swapped first when __ARMEB__); older cores: byte-wise little-endian stores. */
+#if __LINUX_ARM_ARCH__ >= 7
+#ifdef __ARMEB__
+	rev	r3,r3
+	rev	r4,r4
+	rev	r5,r5
+	rev	r6,r6
+#endif
+	str	r3,[r1,#0]
+	str	r4,[r1,#4]
+	str	r5,[r1,#8]
+	str	r6,[r1,#12]
+#else
+	strb	r3,[r1,#0]
+	mov	r3,r3,lsr#8
+	strb	r4,[r1,#4]
+	mov	r4,r4,lsr#8
+	strb	r5,[r1,#8]
+	mov	r5,r5,lsr#8
+	strb	r6,[r1,#12]
+	mov	r6,r6,lsr#8
+
+	strb	r3,[r1,#1]
+	mov	r3,r3,lsr#8
+	strb	r4,[r1,#5]
+	mov	r4,r4,lsr#8
+	strb	r5,[r1,#9]
+	mov	r5,r5,lsr#8
+	strb	r6,[r1,#13]
+	mov	r6,r6,lsr#8
+
+	strb	r3,[r1,#2]
+	mov	r3,r3,lsr#8
+	strb	r4,[r1,#6]
+	mov	r4,r4,lsr#8
+	strb	r5,[r1,#10]
+	mov	r5,r5,lsr#8
+	strb	r6,[r1,#14]
+	mov	r6,r6,lsr#8
+
+	strb	r3,[r1,#3]
+	strb	r4,[r1,#7]
+	strb	r5,[r1,#11]
+	strb	r6,[r1,#15]
+#endif
+	ldmia	sp!,{r4-r11}
+#if __LINUX_ARM_ARCH__ >= 5
+	bx	lr				@ bx	lr
+#else
+	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+	.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
+#endif
+ENDPROC(poly1305_emit_arm)
+
+
+#if __LINUX_ARM_ARCH__ >= 7
+.fpu	neon
+
+.align	5
+ENTRY(poly1305_init_neon)
+.Lpoly1305_init_neon:
+	ldr	r4,[r0,#20]		@ load key base 2^32
+	ldr	r5,[r0,#24]
+	ldr	r6,[r0,#28]
+	ldr	r7,[r0,#32]
+
+	and	r2,r4,#0x03ffffff	@ base 2^32 -> base 2^26
+	mov	r3,r4,lsr#26
+	mov	r4,r5,lsr#20
+	orr	r3,r3,r5,lsl#6
+	mov	r5,r6,lsr#14
+	orr	r4,r4,r6,lsl#12
+	mov	r6,r7,lsr#8
+	orr	r5,r5,r7,lsl#18
+	and	r3,r3,#0x03ffffff
+	and	r4,r4,#0x03ffffff
+	and	r5,r5,#0x03ffffff
+
+	vdup.32	d0,r2			@ r^1 in both lanes
+	add	r2,r3,r3,lsl#2		@ *5
+	vdup.32	d1,r3
+	add	r3,r4,r4,lsl#2
+	vdup.32	d2,r2
+	vdup.32	d3,r4
+	add	r4,r5,r5,lsl#2
+	vdup.32	d4,r3
+	vdup.32	d5,r5
+	add	r5,r6,r6,lsl#2
+	vdup.32	d6,r4
+	vdup.32	d7,r6
+	vdup.32	d8,r5
+
+	mov	r5,#2		@ counter
+
+.Lsquare_neon:
+	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+	@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
+	@ d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
+	@ d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
+	@ d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
+	@ d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
+
+	vmull.u32	q5,d0,d0[1]
+	vmull.u32	q6,d1,d0[1]
+	vmull.u32	q7,d3,d0[1]
+	vmull.u32	q8,d5,d0[1]
+	vmull.u32	q9,d7,d0[1]
+
+	vmlal.u32	q5,d7,d2[1]
+	vmlal.u32	q6,d0,d1[1]
+	vmlal.u32	q7,d1,d1[1]
+	vmlal.u32	q8,d3,d1[1]
+	vmlal.u32	q9,d5,d1[1]
+
+	vmlal.u32	q5,d5,d4[1]
+	vmlal.u32	q6,d7,d4[1]
+	vmlal.u32	q8,d1,d3[1]
+	vmlal.u32	q7,d0,d3[1]
+	vmlal.u32	q9,d3,d3[1]
+
+	vmlal.u32	q5,d3,d6[1]
+	vmlal.u32	q8,d0,d5[1]
+	vmlal.u32	q6,d5,d6[1]
+	vmlal.u32	q7,d7,d6[1]
+	vmlal.u32	q9,d1,d5[1]
+
+	vmlal.u32	q8,d7,d8[1]
+	vmlal.u32	q5,d1,d8[1]
+	vmlal.u32	q6,d3,d8[1]
+	vmlal.u32	q7,d5,d8[1]
+	vmlal.u32	q9,d0,d7[1]
+
+	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+	@ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
+	@ and P. Schwabe
+	@
+	@ H0>>+H1>>+H2>>+H3>>+H4
+	@ H3>>+H4>>*5+H0>>+H1
+	@
+	@ Trivia.
+	@
+	@ Result of multiplication of n-bit number by m-bit number is
+	@ n+m bits wide. However! Even though 2^n is a n+1-bit number,
+	@ m-bit number multiplied by 2^n is still n+m bits wide.
+	@
+	@ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2,
+	@ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit
+	@ one is n+1 bits wide.
+	@
+	@ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
+	@ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
+	@ can be 27. However! In cases when their width exceeds 26 bits
+	@ they are limited by 2^26+2^6. This in turn means that *sum*
+	@ of the products with these values can still be viewed as sum
+	@ of 52-bit numbers as long as the amount of addends is not a
+	@ power of 2. For example,
+	@
+	@ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
+	@
+	@ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
+	@ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
+	@ 8 * (2^52) or 2^55. However, the value is then multiplied by
+	@ 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
+	@ which is less than 32 * (2^52) or 2^57. And when processing
+	@ data we are looking at triple as many addends...
+	@
+	@ In key setup procedure pre-reduced H0 is limited by 5*4+1 and
+	@ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the
+	@ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while
+	@ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32
+	@ instruction accepts 2x32-bit input and writes 2x64-bit result.
+	@ This means that the result of reduction has to be compressed upon
+	@ loop wrap-around. This can be done in the process of reduction
+	@ to minimize amount of instructions [as well as amount of
+	@ 128-bit instructions, which benefits low-end processors], but
+	@ one has to watch for H2 (which is narrower than H0) and 5*H4
+	@ not being wider than 58 bits, so that result of right shift
+	@ by 26 bits fits in 32 bits. This is also useful on x86,
+	@ because it allows paddd to be used in place of paddq, which
+	@ benefits Atom, where paddq is ridiculously slow.
+
+	vshr.u64	q15,q8,#26
+	vmovn.i64	d16,q8
+	 vshr.u64	q4,q5,#26
+	 vmovn.i64	d10,q5
+	vadd.i64	q9,q9,q15		@ h3 -> h4
+	vbic.i32	d16,#0xfc000000	@ &=0x03ffffff
+	 vadd.i64	q6,q6,q4		@ h0 -> h1
+	 vbic.i32	d10,#0xfc000000
+
+	vshrn.u64	d30,q9,#26
+	vmovn.i64	d18,q9
+	 vshr.u64	q4,q6,#26
+	 vmovn.i64	d12,q6
+	 vadd.i64	q7,q7,q4		@ h1 -> h2
+	vbic.i32	d18,#0xfc000000
+	 vbic.i32	d12,#0xfc000000
+
+	vadd.i32	d10,d10,d30
+	vshl.u32	d30,d30,#2
+	 vshrn.u64	d8,q7,#26
+	 vmovn.i64	d14,q7
+	vadd.i32	d10,d10,d30	@ h4 -> h0
+	 vadd.i32	d16,d16,d8	@ h2 -> h3
+	 vbic.i32	d14,#0xfc000000
+
+	vshr.u32	d30,d10,#26
+	vbic.i32	d10,#0xfc000000
+	 vshr.u32	d8,d16,#26
+	 vbic.i32	d16,#0xfc000000
+	vadd.i32	d12,d12,d30	@ h0 -> h1
+	 vadd.i32	d18,d18,d8	@ h3 -> h4
+
+	subs		r5,r5,#1
+	beq		.Lsquare_break_neon
+
+	add		r6,r0,#(48+0*9*4)
+	add		r7,r0,#(48+1*9*4)
+
+	vtrn.32		d0,d10		@ r^2:r^1
+	vtrn.32		d3,d14
+	vtrn.32		d5,d16
+	vtrn.32		d1,d12
+	vtrn.32		d7,d18
+
+	vshl.u32	d4,d3,#2		@ *5
+	vshl.u32	d6,d5,#2
+	vshl.u32	d2,d1,#2
+	vshl.u32	d8,d7,#2
+	vadd.i32	d4,d4,d3
+	vadd.i32	d2,d2,d1
+	vadd.i32	d6,d6,d5
+	vadd.i32	d8,d8,d7
+
+	vst4.32		{d0[0],d1[0],d2[0],d3[0]},[r6]!
+	vst4.32		{d0[1],d1[1],d2[1],d3[1]},[r7]!
+	vst4.32		{d4[0],d5[0],d6[0],d7[0]},[r6]!
+	vst4.32		{d4[1],d5[1],d6[1],d7[1]},[r7]!
+	vst1.32		{d8[0]},[r6,:32]
+	vst1.32		{d8[1]},[r7,:32]
+
+	b		.Lsquare_neon
+
+.align	4
+.Lsquare_break_neon:
+	add		r6,r0,#(48+2*4*9)
+	add		r7,r0,#(48+3*4*9)
+
+	vmov		d0,d10		@ r^4:r^3
+	vshl.u32	d2,d12,#2		@ *5
+	vmov		d1,d12
+	vshl.u32	d4,d14,#2
+	vmov		d3,d14
+	vshl.u32	d6,d16,#2
+	vmov		d5,d16
+	vshl.u32	d8,d18,#2
+	vmov		d7,d18
+	vadd.i32	d2,d2,d12
+	vadd.i32	d4,d4,d14
+	vadd.i32	d6,d6,d16
+	vadd.i32	d8,d8,d18
+
+	vst4.32		{d0[0],d1[0],d2[0],d3[0]},[r6]!
+	vst4.32		{d0[1],d1[1],d2[1],d3[1]},[r7]!
+	vst4.32		{d4[0],d5[0],d6[0],d7[0]},[r6]!
+	vst4.32		{d4[1],d5[1],d6[1],d7[1]},[r7]!
+	vst1.32		{d8[0]},[r6]
+	vst1.32		{d8[1]},[r7]
+
+	bx	lr				@ bx	lr
+ENDPROC(poly1305_init_neon)
+
+.align	5
+ENTRY(poly1305_blocks_neon)
+	ldr	ip,[r0,#36]		@ is_base2_26
+	ands	r2,r2,#-16
+	beq	.Lno_data_neon
+
+	cmp	r2,#64
+	bhs	.Lenter_neon
+	tst	ip,ip			@ is_base2_26?
+	beq	.Lpoly1305_blocks_arm
+
+.Lenter_neon:
+	stmdb	sp!,{r4-r7}
+	vstmdb	sp!,{d8-d15}		@ ABI specification says so
+
+	tst	ip,ip			@ is_base2_26?
+	bne	.Lbase2_26_neon
+
+	stmdb	sp!,{r1-r3,lr}
+	bl	.Lpoly1305_init_neon
+
+	ldr	r4,[r0,#0]		@ load hash value base 2^32
+	ldr	r5,[r0,#4]
+	ldr	r6,[r0,#8]
+	ldr	r7,[r0,#12]
+	ldr	ip,[r0,#16]
+
+	and	r2,r4,#0x03ffffff	@ base 2^32 -> base 2^26
+	mov	r3,r4,lsr#26
+	 veor	d10,d10,d10
+	mov	r4,r5,lsr#20
+	orr	r3,r3,r5,lsl#6
+	 veor	d12,d12,d12
+	mov	r5,r6,lsr#14
+	orr	r4,r4,r6,lsl#12
+	 veor	d14,d14,d14
+	mov	r6,r7,lsr#8
+	orr	r5,r5,r7,lsl#18
+	 veor	d16,d16,d16
+	and	r3,r3,#0x03ffffff
+	orr	r6,r6,ip,lsl#24
+	 veor	d18,d18,d18
+	and	r4,r4,#0x03ffffff
+	mov	r1,#1
+	and	r5,r5,#0x03ffffff
+	str	r1,[r0,#36]		@ is_base2_26
+
+	vmov.32	d10[0],r2
+	vmov.32	d12[0],r3
+	vmov.32	d14[0],r4
+	vmov.32	d16[0],r5
+	vmov.32	d18[0],r6
+	adr	r5,.Lzeros
+
+	ldmia	sp!,{r1-r3,lr}
+	b	.Lbase2_32_neon
+
+.align	4
+.Lbase2_26_neon:
+	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+	@ load hash value
+
+	veor		d10,d10,d10
+	veor		d12,d12,d12
+	veor		d14,d14,d14
+	veor		d16,d16,d16
+	veor		d18,d18,d18
+	vld4.32		{d10[0],d12[0],d14[0],d16[0]},[r0]!
+	adr		r5,.Lzeros
+	vld1.32		{d18[0]},[r0]
+	sub		r0,r0,#16		@ rewind
+
+.Lbase2_32_neon:
+	add		r4,r1,#32
+	mov		r3,r3,lsl#24
+	tst		r2,#31
+	beq		.Leven
+
+	vld4.32		{d20[0],d22[0],d24[0],d26[0]},[r1]!
+	vmov.32		d28[0],r3
+	sub		r2,r2,#16
+	add		r4,r1,#32
+
+#ifdef	__ARMEB__
+	vrev32.8	q10,q10
+	vrev32.8	q13,q13
+	vrev32.8	q11,q11
+	vrev32.8	q12,q12
+#endif
+	vsri.u32	d28,d26,#8	@ base 2^32 -> base 2^26
+	vshl.u32	d26,d26,#18
+
+	vsri.u32	d26,d24,#14
+	vshl.u32	d24,d24,#12
+	vadd.i32	d29,d28,d18	@ add hash value and move to #hi
+
+	vbic.i32	d26,#0xfc000000
+	vsri.u32	d24,d22,#20
+	vshl.u32	d22,d22,#6
+
+	vbic.i32	d24,#0xfc000000
+	vsri.u32	d22,d20,#26
+	vadd.i32	d27,d26,d16
+
+	vbic.i32	d20,#0xfc000000
+	vbic.i32	d22,#0xfc000000
+	vadd.i32	d25,d24,d14
+
+	vadd.i32	d21,d20,d10
+	vadd.i32	d23,d22,d12
+
+	mov		r7,r5
+	add		r6,r0,#48
+
+	cmp		r2,r2
+	b		.Long_tail
+
+.align	4
+.Leven:
+	subs		r2,r2,#64
+	it		lo
+	movlo		r4,r5
+
+	vmov.i32	q14,#1<<24		@ padbit, yes, always
+	vld4.32		{d20,d22,d24,d26},[r1]	@ inp[0:1]
+	add		r1,r1,#64
+	vld4.32		{d21,d23,d25,d27},[r4]	@ inp[2:3] (or 0)
+	add		r4,r4,#64
+	itt		hi
+	addhi		r7,r0,#(48+1*9*4)
+	addhi		r6,r0,#(48+3*9*4)
+
+#ifdef	__ARMEB__
+	vrev32.8	q10,q10
+	vrev32.8	q13,q13
+	vrev32.8	q11,q11
+	vrev32.8	q12,q12
+#endif
+	vsri.u32	q14,q13,#8		@ base 2^32 -> base 2^26
+	vshl.u32	q13,q13,#18
+
+	vsri.u32	q13,q12,#14
+	vshl.u32	q12,q12,#12
+
+	vbic.i32	q13,#0xfc000000
+	vsri.u32	q12,q11,#20
+	vshl.u32	q11,q11,#6
+
+	vbic.i32	q12,#0xfc000000
+	vsri.u32	q11,q10,#26
+
+	vbic.i32	q10,#0xfc000000
+	vbic.i32	q11,#0xfc000000
+
+	bls		.Lskip_loop
+
+	vld4.32		{d0[1],d1[1],d2[1],d3[1]},[r7]!	@ load r^2
+	vld4.32		{d0[0],d1[0],d2[0],d3[0]},[r6]!	@ load r^4
+	vld4.32		{d4[1],d5[1],d6[1],d7[1]},[r7]!
+	vld4.32		{d4[0],d5[0],d6[0],d7[0]},[r6]!
+	b		.Loop_neon
+
+.align	5
+.Loop_neon:
+	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+	@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
+	@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
+	@   ___________________/
+	@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
+	@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
+	@   ___________________/ ____________________/
+	@
+	@ Note that we start with inp[2:3]*r^2. This is because it
+	@ doesn't depend on reduction in previous iteration.
+	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+	@ d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
+	@ d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
+	@ d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
+	@ d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
+	@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
+
+	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+	@ inp[2:3]*r^2
+
+	vadd.i32	d24,d24,d14	@ accumulate inp[0:1]
+	vmull.u32	q7,d25,d0[1]
+	vadd.i32	d20,d20,d10
+	vmull.u32	q5,d21,d0[1]
+	vadd.i32	d26,d26,d16
+	vmull.u32	q8,d27,d0[1]
+	vmlal.u32	q7,d23,d1[1]
+	vadd.i32	d22,d22,d12
+	vmull.u32	q6,d23,d0[1]
+
+	vadd.i32	d28,d28,d18
+	vmull.u32	q9,d29,d0[1]
+	subs		r2,r2,#64
+	vmlal.u32	q5,d29,d2[1]
+	it		lo
+	movlo		r4,r5
+	vmlal.u32	q8,d25,d1[1]
+	vld1.32		d8[1],[r7,:32]
+	vmlal.u32	q6,d21,d1[1]
+	vmlal.u32	q9,d27,d1[1]
+
+	vmlal.u32	q5,d27,d4[1]
+	vmlal.u32	q8,d23,d3[1]
+	vmlal.u32	q9,d25,d3[1]
+	vmlal.u32	q6,d29,d4[1]
+	vmlal.u32	q7,d21,d3[1]
+
+	vmlal.u32	q8,d21,d5[1]
+	vmlal.u32	q5,d25,d6[1]
+	vmlal.u32	q9,d23,d5[1]
+	vmlal.u32	q6,d27,d6[1]
+	vmlal.u32	q7,d29,d6[1]
+
+	vmlal.u32	q8,d29,d8[1]
+	vmlal.u32	q5,d23,d8[1]
+	vmlal.u32	q9,d21,d7[1]
+	vmlal.u32	q6,d25,d8[1]
+	vmlal.u32	q7,d27,d8[1]
+
+	vld4.32		{d21,d23,d25,d27},[r4]	@ inp[2:3] (or 0)
+	add		r4,r4,#64
+
+	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+	@ (hash+inp[0:1])*r^4 and accumulate
+
+	vmlal.u32	q8,d26,d0[0]
+	vmlal.u32	q5,d20,d0[0]
+	vmlal.u32	q9,d28,d0[0]
+	vmlal.u32	q6,d22,d0[0]
+	vmlal.u32	q7,d24,d0[0]
+	vld1.32		d8[0],[r6,:32]
+
+	vmlal.u32	q8,d24,d1[0]
+	vmlal.u32	q5,d28,d2[0]
+	vmlal.u32	q9,d26,d1[0]
+	vmlal.u32	q6,d20,d1[0]
+	vmlal.u32	q7,d22,d1[0]
+
+	vmlal.u32	q8,d22,d3[0]
+	vmlal.u32	q5,d26,d4[0]
+	vmlal.u32	q9,d24,d3[0]
+	vmlal.u32	q6,d28,d4[0]
+	vmlal.u32	q7,d20,d3[0]
+
+	vmlal.u32	q8,d20,d5[0]
+	vmlal.u32	q5,d24,d6[0]
+	vmlal.u32	q9,d22,d5[0]
+	vmlal.u32	q6,d26,d6[0]
+	vmlal.u32	q8,d28,d8[0]
+
+	vmlal.u32	q7,d28,d6[0]
+	vmlal.u32	q5,d22,d8[0]
+	vmlal.u32	q9,d20,d7[0]
+	vmov.i32	q14,#1<<24		@ padbit, yes, always
+	vmlal.u32	q6,d24,d8[0]
+	vmlal.u32	q7,d26,d8[0]
+
+	vld4.32		{d20,d22,d24,d26},[r1]	@ inp[0:1]
+	add		r1,r1,#64
+#ifdef	__ARMEB__
+	vrev32.8	q10,q10
+	vrev32.8	q11,q11
+	vrev32.8	q12,q12
+	vrev32.8	q13,q13
+#endif
+
+	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+	@ lazy reduction interleaved with base 2^32 -> base 2^26 of
+	@ inp[0:3] previously loaded to q10-q13 and smashed to q10-q14.
+
+	vshr.u64	q15,q8,#26
+	vmovn.i64	d16,q8
+	 vshr.u64	q4,q5,#26
+	 vmovn.i64	d10,q5
+	vadd.i64	q9,q9,q15		@ h3 -> h4
+	vbic.i32	d16,#0xfc000000
+	  vsri.u32	q14,q13,#8		@ base 2^32 -> base 2^26
+	 vadd.i64	q6,q6,q4		@ h0 -> h1
+	  vshl.u32	q13,q13,#18
+	 vbic.i32	d10,#0xfc000000
+
+	vshrn.u64	d30,q9,#26
+	vmovn.i64	d18,q9
+	 vshr.u64	q4,q6,#26
+	 vmovn.i64	d12,q6
+	 vadd.i64	q7,q7,q4		@ h1 -> h2
+	  vsri.u32	q13,q12,#14
+	vbic.i32	d18,#0xfc000000
+	  vshl.u32	q12,q12,#12
+	 vbic.i32	d12,#0xfc000000
+
+	vadd.i32	d10,d10,d30
+	vshl.u32	d30,d30,#2
+	  vbic.i32	q13,#0xfc000000
+	 vshrn.u64	d8,q7,#26
+	 vmovn.i64	d14,q7
+	vaddl.u32	q5,d10,d30	@ h4 -> h0 [widen for a sec]
+	  vsri.u32	q12,q11,#20
+	 vadd.i32	d16,d16,d8	@ h2 -> h3
+	  vshl.u32	q11,q11,#6
+	 vbic.i32	d14,#0xfc000000
+	  vbic.i32	q12,#0xfc000000
+
+	vshrn.u64	d30,q5,#26		@ re-narrow
+	vmovn.i64	d10,q5
+	  vsri.u32	q11,q10,#26
+	  vbic.i32	q10,#0xfc000000
+	 vshr.u32	d8,d16,#26
+	 vbic.i32	d16,#0xfc000000
+	vbic.i32	d10,#0xfc000000
+	vadd.i32	d12,d12,d30	@ h0 -> h1
+	 vadd.i32	d18,d18,d8	@ h3 -> h4
+	  vbic.i32	q11,#0xfc000000
+
+	bhi		.Loop_neon
+
+.Lskip_loop:
+	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+	@ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
+
+	add		r7,r0,#(48+0*9*4)
+	add		r6,r0,#(48+1*9*4)
+	adds		r2,r2,#32
+	it		ne
+	movne		r2,#0
+	bne		.Long_tail
+
+	vadd.i32	d25,d24,d14	@ add hash value and move to #hi
+	vadd.i32	d21,d20,d10
+	vadd.i32	d27,d26,d16
+	vadd.i32	d23,d22,d12
+	vadd.i32	d29,d28,d18
+
+.Long_tail:
+	vld4.32		{d0[1],d1[1],d2[1],d3[1]},[r7]!	@ load r^1
+	vld4.32		{d0[0],d1[0],d2[0],d3[0]},[r6]!	@ load r^2
+
+	vadd.i32	d24,d24,d14	@ can be redundant
+	vmull.u32	q7,d25,d0
+	vadd.i32	d20,d20,d10
+	vmull.u32	q5,d21,d0
+	vadd.i32	d26,d26,d16
+	vmull.u32	q8,d27,d0
+	vadd.i32	d22,d22,d12
+	vmull.u32	q6,d23,d0
+	vadd.i32	d28,d28,d18
+	vmull.u32	q9,d29,d0
+
+	vmlal.u32	q5,d29,d2
+	vld4.32		{d4[1],d5[1],d6[1],d7[1]},[r7]!
+	vmlal.u32	q8,d25,d1
+	vld4.32		{d4[0],d5[0],d6[0],d7[0]},[r6]!
+	vmlal.u32	q6,d21,d1
+	vmlal.u32	q9,d27,d1
+	vmlal.u32	q7,d23,d1
+
+	vmlal.u32	q8,d23,d3
+	vld1.32		d8[1],[r7,:32]
+	vmlal.u32	q5,d27,d4
+	vld1.32		d8[0],[r6,:32]
+	vmlal.u32	q9,d25,d3
+	vmlal.u32	q6,d29,d4
+	vmlal.u32	q7,d21,d3
+
+	vmlal.u32	q8,d21,d5
+	 it		ne
+	 addne		r7,r0,#(48+2*9*4)
+	vmlal.u32	q5,d25,d6
+	 it		ne
+	 addne		r6,r0,#(48+3*9*4)
+	vmlal.u32	q9,d23,d5
+	vmlal.u32	q6,d27,d6
+	vmlal.u32	q7,d29,d6
+
+	vmlal.u32	q8,d29,d8
+	 vorn		q0,q0,q0	@ all-ones, can be redundant
+	vmlal.u32	q5,d23,d8
+	 vshr.u64	q0,q0,#38
+	vmlal.u32	q9,d21,d7
+	vmlal.u32	q6,d25,d8
+	vmlal.u32	q7,d27,d8
+
+	beq		.Lshort_tail
+
+	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+	@ (hash+inp[0:1])*r^4:r^3 and accumulate
+
+	vld4.32		{d0[1],d1[1],d2[1],d3[1]},[r7]!	@ load r^3
+	vld4.32		{d0[0],d1[0],d2[0],d3[0]},[r6]!	@ load r^4
+
+	vmlal.u32	q7,d24,d0
+	vmlal.u32	q5,d20,d0
+	vmlal.u32	q8,d26,d0
+	vmlal.u32	q6,d22,d0
+	vmlal.u32	q9,d28,d0
+
+	vmlal.u32	q5,d28,d2
+	vld4.32		{d4[1],d5[1],d6[1],d7[1]},[r7]!
+	vmlal.u32	q8,d24,d1
+	vld4.32		{d4[0],d5[0],d6[0],d7[0]},[r6]!
+	vmlal.u32	q6,d20,d1
+	vmlal.u32	q9,d26,d1
+	vmlal.u32	q7,d22,d1
+
+	vmlal.u32	q8,d22,d3
+	vld1.32		d8[1],[r7,:32]
+	vmlal.u32	q5,d26,d4
+	vld1.32		d8[0],[r6,:32]
+	vmlal.u32	q9,d24,d3
+	vmlal.u32	q6,d28,d4
+	vmlal.u32	q7,d20,d3
+
+	vmlal.u32	q8,d20,d5
+	vmlal.u32	q5,d24,d6
+	vmlal.u32	q9,d22,d5
+	vmlal.u32	q6,d26,d6
+	vmlal.u32	q7,d28,d6
+
+	vmlal.u32	q8,d28,d8
+	 vorn		q0,q0,q0	@ all-ones
+	vmlal.u32	q5,d22,d8
+	 vshr.u64	q0,q0,#38
+	vmlal.u32	q9,d20,d7
+	vmlal.u32	q6,d24,d8
+	vmlal.u32	q7,d26,d8
+
+.Lshort_tail:
+	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+	@ horizontal addition
+
+	vadd.i64	d16,d16,d17
+	vadd.i64	d10,d10,d11
+	vadd.i64	d18,d18,d19
+	vadd.i64	d12,d12,d13
+	vadd.i64	d14,d14,d15
+
+	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+	@ lazy reduction, but without narrowing
+
+	vshr.u64	q15,q8,#26
+	vand.i64	q8,q8,q0
+	 vshr.u64	q4,q5,#26
+	 vand.i64	q5,q5,q0
+	vadd.i64	q9,q9,q15		@ h3 -> h4
+	 vadd.i64	q6,q6,q4		@ h0 -> h1
+
+	vshr.u64	q15,q9,#26
+	vand.i64	q9,q9,q0
+	 vshr.u64	q4,q6,#26
+	 vand.i64	q6,q6,q0
+	 vadd.i64	q7,q7,q4		@ h1 -> h2
+
+	vadd.i64	q5,q5,q15
+	vshl.u64	q15,q15,#2
+	 vshr.u64	q4,q7,#26
+	 vand.i64	q7,q7,q0
+	vadd.i64	q5,q5,q15		@ h4 -> h0
+	 vadd.i64	q8,q8,q4		@ h2 -> h3
+
+	vshr.u64	q15,q5,#26
+	vand.i64	q5,q5,q0
+	 vshr.u64	q4,q8,#26
+	 vand.i64	q8,q8,q0
+	vadd.i64	q6,q6,q15		@ h0 -> h1
+	 vadd.i64	q9,q9,q4		@ h3 -> h4
+
+	cmp		r2,#0
+	bne		.Leven
+
+	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+	@ store hash value
+
+	vst4.32		{d10[0],d12[0],d14[0],d16[0]},[r0]!
+	vst1.32		{d18[0]},[r0]
+
+	vldmia	sp!,{d8-d15}			@ epilogue
+	ldmia	sp!,{r4-r7}
+.Lno_data_neon:
+	bx	lr					@ bx	lr
+ENDPROC(poly1305_blocks_neon)
+
+.align	5
+ENTRY(poly1305_emit_neon)
+	ldr	ip,[r0,#36]		@ is_base2_26
+	@ r0 = ctx, r1 = mac, r2 = nonce
+	stmdb	sp!,{r4-r11}
+	@ flag clear: hash is not in base 2^26, share the scalar emit path
+	tst	ip,ip
+	beq	.Lpoly1305_emit_enter
+
+	ldmia	r0,{r3-r7}		@ five base 2^26 limbs
+	eor	r8,r8,r8
+
+	adds	r3,r3,r4,lsl#26	@ base 2^26 -> base 2^32
+	mov	r4,r4,lsr#6
+	adcs	r4,r4,r5,lsl#20
+	mov	r5,r5,lsr#12
+	adcs	r5,r5,r6,lsl#14
+	mov	r6,r6,lsr#18
+	adcs	r6,r6,r7,lsl#8
+	adc	r7,r8,r7,lsr#24	@ can be partially reduced ...
+	@ r7 holds bits 128 and up; fold everything above bit 129 back in, times 5
+	and	r8,r7,#-4		@ ... so reduce
+	and	r7,r7,#3		@ top word keeps only its own low 2 bits
+	add	r8,r8,r8,lsr#2	@ *= 5
+	adds	r3,r3,r8
+	adcs	r4,r4,#0
+	adcs	r5,r5,#0
+	adcs	r6,r6,#0
+	adc	r7,r7,#0
+
+	adds	r8,r3,#5		@ compare to modulus
+	adcs	r9,r4,#0
+	adcs	r10,r5,#0
+	adcs	r11,r6,#0
+	adc	r7,r7,#0
+	tst	r7,#4			@ did it carry/borrow?
+	@ bit 2 of the top word set: take the sum in r8-r11 instead of r3-r6
+	it	ne
+	movne	r3,r8
+	ldr	r8,[r2,#0]
+	it	ne
+	movne	r4,r9
+	ldr	r9,[r2,#4]
+	it	ne
+	movne	r5,r10
+	ldr	r10,[r2,#8]
+	it	ne
+	movne	r6,r11
+	ldr	r11,[r2,#12]
+
+	adds	r3,r3,r8		@ accumulate nonce
+	adcs	r4,r4,r9
+	adcs	r5,r5,r10
+	adc	r6,r6,r11		@ carry out of the top word is dropped
+
+#ifdef __ARMEB__
+	rev	r3,r3
+	rev	r4,r4
+	rev	r5,r5
+	rev	r6,r6
+#endif
+	str	r3,[r1,#0]		@ store the result
+	str	r4,[r1,#4]
+	str	r5,[r1,#8]
+	str	r6,[r1,#12]
+
+	ldmia	sp!,{r4-r11}
+	bx	lr				@ bx	lr
+ENDPROC(poly1305_emit_neon)
+
+.align	5
+.Lzeros:
+.long	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+#endif
diff --git a/lib/zinc/poly1305/poly1305-arm64.S b/lib/zinc/poly1305/poly1305-arm64.S
new file mode 100644
index 000000000000..c20023544183
--- /dev/null
+++ b/lib/zinc/poly1305/poly1305-arm64.S
@@ -0,0 +1,822 @@
+/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ * Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
+ *
+ * This is based in part on Andy Polyakov's implementation from CRYPTOGAMS.
+ */
+
+#include <linux/linkage.h>
+.text
+
+.align	5
+ENTRY(poly1305_init_arm)		// x0 = ctx, x1 = key (may be NULL)
+	cmp	x1,xzr			// key == NULL?
+	stp	xzr,xzr,[x0]		// zero hash value
+	stp	xzr,xzr,[x0,#16]	// [along with is_base2_26]
+
+	csel	x0,xzr,x0,eq		// x0 = 0 when no key was given
+	b.eq	.Lno_key
+
+	ldp	x7,x8,[x1]		// load key
+	mov	x9,#0xfffffffc0fffffff
+	movk	x9,#0x0fff,lsl#48	// x9 = 0x0ffffffc0fffffff
+#ifdef	__ARMEB__
+	rev	x7,x7			// flip bytes
+	rev	x8,x8
+#endif
+	and	x7,x7,x9		// &=0ffffffc0fffffff
+	and	x9,x9,#-4		// also clear the low 2 bits for the upper half
+	and	x8,x8,x9		// &=0ffffffc0ffffffc
+	stp	x7,x8,[x0,#32]	// save key value
+
+.Lno_key:
+	ret
+ENDPROC(poly1305_init_arm)
+
+.align	5
+ENTRY(poly1305_blocks_arm)		// x0 = ctx, x1 = inp, x2 = len, x3 = value added into h2 (presumably padbit)
+	ands	x2,x2,#-16		// round len down to whole 16-byte blocks
+	b.eq	.Lno_data		// nothing to do
+
+	ldp	x4,x5,[x0]		// load hash value
+	ldp	x7,x8,[x0,#32]	// load key value
+	ldr	x6,[x0,#16]		// h2, top word of the hash
+	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)
+	b	.Loop
+
+.align	5
+.Loop:
+	ldp	x10,x11,[x1],#16	// load input
+	sub	x2,x2,#16		// one block consumed
+#ifdef	__ARMEB__
+	rev	x10,x10
+	rev	x11,x11
+#endif
+	adds	x4,x4,x10		// accumulate input
+	adcs	x5,x5,x11
+
+	mul	x12,x4,x7		// h0*r0
+	adc	x6,x6,x3		// h2 += x3 + carry
+	umulh	x13,x4,x7
+
+	mul	x10,x5,x9		// h1*5*r1
+	umulh	x11,x5,x9
+
+	adds	x12,x12,x10
+	mul	x10,x4,x8		// h0*r1
+	adc	x13,x13,x11
+	umulh	x14,x4,x8
+
+	adds	x13,x13,x10
+	mul	x10,x5,x7		// h1*r0
+	adc	x14,x14,xzr
+	umulh	x11,x5,x7
+
+	adds	x13,x13,x10
+	mul	x10,x6,x9		// h2*5*r1
+	adc	x14,x14,x11
+	mul	x11,x6,x7		// h2*r0
+
+	adds	x13,x13,x10
+	adc	x14,x14,x11
+
+	and	x10,x14,#-4		// final reduction
+	and	x6,x14,#3		// keep bits 128..129 in h2
+	add	x10,x10,x14,lsr#2	// x10 = 5 * (x14 >> 2)
+	adds	x4,x12,x10
+	adcs	x5,x13,xzr
+	adc	x6,x6,xzr
+
+	cbnz	x2,.Loop		// more blocks left
+
+	stp	x4,x5,[x0]		// store hash value
+	str	x6,[x0,#16]
+
+.Lno_data:
+	ret
+ENDPROC(poly1305_blocks_arm)
+
+.align	5
+ENTRY(poly1305_emit_arm)		// x0 = ctx, x1 = mac out, x2 = nonce
+	ldp	x4,x5,[x0]		// load hash base 2^64
+	ldr	x6,[x0,#16]
+	ldp	x10,x11,[x2]	// load nonce
+
+	adds	x12,x4,#5		// compare to modulus
+	adcs	x13,x5,xzr
+	adc	x14,x6,xzr		// x14 bit 2 set when h + 5 reaches 2^130
+
+	tst	x14,#-4			// see if it's carried/borrowed
+
+	csel	x4,x4,x12,eq		// no carry: keep h, otherwise take h + 5
+	csel	x5,x5,x13,eq
+
+#ifdef	__ARMEB__
+	ror	x10,x10,#32		// flip nonce words
+	ror	x11,x11,#32
+#endif
+	adds	x4,x4,x10		// accumulate nonce
+	adc	x5,x5,x11		// carry out of bit 127 is dropped
+#ifdef	__ARMEB__
+	rev	x4,x4			// flip output bytes
+	rev	x5,x5
+#endif
+	stp	x4,x5,[x1]		// write result
+
+	ret
+ENDPROC(poly1305_emit_arm)
+
+.align	5
+__poly1305_mult:			// x4:x5:x6 (h) *= x7:x8 (r), x9 = s1; partially reduced; clobbers x10-x14
+	mul	x12,x4,x7		// h0*r0
+	umulh	x13,x4,x7
+
+	mul	x10,x5,x9		// h1*5*r1
+	umulh	x11,x5,x9
+
+	adds	x12,x12,x10
+	mul	x10,x4,x8		// h0*r1
+	adc	x13,x13,x11
+	umulh	x14,x4,x8
+
+	adds	x13,x13,x10
+	mul	x10,x5,x7		// h1*r0
+	adc	x14,x14,xzr
+	umulh	x11,x5,x7
+
+	adds	x13,x13,x10
+	mul	x10,x6,x9		// h2*5*r1
+	adc	x14,x14,x11
+	mul	x11,x6,x7		// h2*r0
+
+	adds	x13,x13,x10
+	adc	x14,x14,x11
+
+	and	x10,x14,#-4		// final reduction
+	and	x6,x14,#3		// keep bits 128..129 in h2
+	add	x10,x10,x14,lsr#2	// x10 = 5 * (x14 >> 2)
+	adds	x4,x12,x10
+	adcs	x5,x13,xzr
+	adc	x6,x6,xzr
+
+	ret
+
+__poly1305_splat:			// store x4:x5:x6 as 26-bit limbs plus 5x multiples at [x0], 16-byte stride; clobbers x12-x16
+	and	x12,x4,#0x03ffffff	// base 2^64 -> base 2^26
+	ubfx	x13,x4,#26,#26
+	extr	x14,x5,x4,#52
+	and	x14,x14,#0x03ffffff
+	ubfx	x15,x5,#14,#26
+	extr	x16,x6,x5,#40
+
+	str	w12,[x0,#16*0]	// r0
+	add	w12,w13,w13,lsl#2	// r1*5
+	str	w13,[x0,#16*1]	// r1
+	add	w13,w14,w14,lsl#2	// r2*5
+	str	w12,[x0,#16*2]	// s1
+	str	w14,[x0,#16*3]	// r2
+	add	w14,w15,w15,lsl#2	// r3*5
+	str	w13,[x0,#16*4]	// s2
+	str	w15,[x0,#16*5]	// r3
+	add	w15,w16,w16,lsl#2	// r4*5
+	str	w14,[x0,#16*6]	// s3
+	str	w16,[x0,#16*7]	// r4
+	str	w15,[x0,#16*8]	// s4
+
+	ret
+
+.align	5
+ENTRY(poly1305_blocks_neon)		// x0 = ctx, x1 = inp, x2 = len, x3 = presumably padbit
+	ldr	x17,[x0,#24]		// is_base2_26
+	cmp	x2,#128
+	b.hs	.Lblocks_neon		// len >= 128: use NEON
+	cbz	x17,poly1305_blocks_arm	// short input and hash in base 2^64: scalar code
+
+.Lblocks_neon:
+	stp	x29,x30,[sp,#-80]!	// 80-byte frame: x29,x30 then d8-d15
+	add	x29,sp,#0
+
+	ands	x2,x2,#-16		// round len down to whole 16-byte blocks
+	b.eq	.Lno_data_neon
+
+	cbz	x17,.Lbase2_64_neon	// hash still stored in base 2^64
+
+	ldp	w10,w11,[x0]		// load hash value base 2^26
+	ldp	w12,w13,[x0,#8]
+	ldr	w14,[x0,#16]
+
+	tst	x2,#31			// even number of blocks?
+	b.eq	.Leven_neon
+
+	ldp	x7,x8,[x0,#32]	// load key value
+
+	add	x4,x10,x11,lsl#26	// base 2^26 -> base 2^64
+	lsr	x5,x12,#12
+	adds	x4,x4,x12,lsl#52
+	add	x5,x5,x13,lsl#14
+	adc	x5,x5,xzr
+	lsr	x6,x14,#24
+	adds	x5,x5,x14,lsl#40
+	adc	x14,x6,xzr		// can be partially reduced...
+
+	ldp	x12,x13,[x1],#16	// load input
+	sub	x2,x2,#16
+	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)
+
+	and	x10,x14,#-4		// ... so reduce
+	and	x6,x14,#3
+	add	x10,x10,x14,lsr#2
+	adds	x4,x4,x10
+	adcs	x5,x5,xzr
+	adc	x6,x6,xzr
+
+#ifdef	__ARMEB__
+	rev	x12,x12
+	rev	x13,x13
+#endif
+	adds	x4,x4,x12		// accumulate input
+	adcs	x5,x5,x13
+	adc	x6,x6,x3
+
+	bl	__poly1305_mult
+	ldr	x30,[sp,#8]		// reload lr clobbered by bl
+
+	cbz	x3,.Lstore_base2_64_neon	// x3 == 0: store the hash in base 2^64
+
+	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
+	ubfx	x11,x4,#26,#26
+	extr	x12,x5,x4,#52
+	and	x12,x12,#0x03ffffff
+	ubfx	x13,x5,#14,#26
+	extr	x14,x6,x5,#40
+
+	cbnz	x2,.Leven_neon
+
+	stp	w10,w11,[x0]		// store hash value base 2^26
+	stp	w12,w13,[x0,#8]
+	str	w14,[x0,#16]
+	b	.Lno_data_neon
+
+.align	4
+.Lstore_base2_64_neon:
+	stp	x4,x5,[x0]		// store hash value base 2^64
+	stp	x6,xzr,[x0,#16]	// note that is_base2_26 is zeroed
+	b	.Lno_data_neon
+
+.align	4
+.Lbase2_64_neon:
+	ldp	x7,x8,[x0,#32]	// load key value
+
+	ldp	x4,x5,[x0]		// load hash value base 2^64
+	ldr	x6,[x0,#16]
+
+	tst	x2,#31			// even number of blocks?
+	b.eq	.Linit_neon
+
+	ldp	x12,x13,[x1],#16	// load input
+	sub	x2,x2,#16
+	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)
+#ifdef	__ARMEB__
+	rev	x12,x12
+	rev	x13,x13
+#endif
+	adds	x4,x4,x12		// accumulate input
+	adcs	x5,x5,x13
+	adc	x6,x6,x3
+
+	bl	__poly1305_mult
+
+.Linit_neon:
+	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
+	ubfx	x11,x4,#26,#26
+	extr	x12,x5,x4,#52
+	and	x12,x12,#0x03ffffff
+	ubfx	x13,x5,#14,#26
+	extr	x14,x6,x5,#40
+
+	stp	d8,d9,[sp,#16]		// meet ABI requirements
+	stp	d10,d11,[sp,#32]
+	stp	d12,d13,[sp,#48]
+	stp	d14,d15,[sp,#64]
+
+	fmov	d24,x10
+	fmov	d25,x11
+	fmov	d26,x12
+	fmov	d27,x13
+	fmov	d28,x14
+
+	////////////////////////////////// initialize r^n table
+	mov	x4,x7			// r^1
+	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)
+	mov	x5,x8
+	mov	x6,xzr
+	add	x0,x0,#48+12
+	bl	__poly1305_splat
+
+	bl	__poly1305_mult		// r^2
+	sub	x0,x0,#4
+	bl	__poly1305_splat
+
+	bl	__poly1305_mult		// r^3
+	sub	x0,x0,#4
+	bl	__poly1305_splat
+
+	bl	__poly1305_mult		// r^4
+	sub	x0,x0,#4
+	bl	__poly1305_splat
+	ldr	x30,[sp,#8]		// reload lr clobbered by bl
+
+	add	x16,x1,#32
+	adr	x17,.Lzeros
+	subs	x2,x2,#64
+	csel	x16,x17,x16,lo		// under 64 bytes left: read inp[2:3] from .Lzeros
+
+	mov	x4,#1
+	str	x4,[x0,#-24]		// set is_base2_26
+	sub	x0,x0,#48		// restore original x0
+	b	.Ldo_neon
+
+.align	4
+.Leven_neon:
+	add	x16,x1,#32
+	adr	x17,.Lzeros
+	subs	x2,x2,#64
+	csel	x16,x17,x16,lo		// under 64 bytes left: read inp[2:3] from .Lzeros
+
+	stp	d8,d9,[sp,#16]		// meet ABI requirements
+	stp	d10,d11,[sp,#32]
+	stp	d12,d13,[sp,#48]
+	stp	d14,d15,[sp,#64]
+
+	fmov	d24,x10
+	fmov	d25,x11
+	fmov	d26,x12
+	fmov	d27,x13
+	fmov	d28,x14
+
+.Ldo_neon:
+	ldp	x8,x12,[x16],#16	// inp[2:3] (or zero)
+	ldp	x9,x13,[x16],#48
+
+	lsl	x3,x3,#24		// x3 moved to bit 24 of the top 26-bit limb (2^128)
+	add	x15,x0,#48		// x15 -> table written via __poly1305_splat
+
+#ifdef	__ARMEB__
+	rev	x8,x8
+	rev	x12,x12
+	rev	x9,x9
+	rev	x13,x13
+#endif
+	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
+	and	x5,x9,#0x03ffffff
+	ubfx	x6,x8,#26,#26
+	ubfx	x7,x9,#26,#26
+	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
+	extr	x8,x12,x8,#52
+	extr	x9,x13,x9,#52
+	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
+	fmov	d14,x4
+	and	x8,x8,#0x03ffffff
+	and	x9,x9,#0x03ffffff
+	ubfx	x10,x12,#14,#26
+	ubfx	x11,x13,#14,#26
+	add	x12,x3,x12,lsr#40
+	add	x13,x3,x13,lsr#40
+	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
+	fmov	d15,x6
+	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
+	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
+	fmov	d16,x8
+	fmov	d17,x10
+	fmov	d18,x12
+
+	ldp	x8,x12,[x1],#16	// inp[0:1]
+	ldp	x9,x13,[x1],#48
+
+	ld1	{v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64
+	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64
+	ld1	{v8.4s},[x15]
+
+#ifdef	__ARMEB__
+	rev	x8,x8
+	rev	x12,x12
+	rev	x9,x9
+	rev	x13,x13
+#endif
+	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
+	and	x5,x9,#0x03ffffff
+	ubfx	x6,x8,#26,#26
+	ubfx	x7,x9,#26,#26
+	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
+	extr	x8,x12,x8,#52
+	extr	x9,x13,x9,#52
+	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
+	fmov	d9,x4
+	and	x8,x8,#0x03ffffff
+	and	x9,x9,#0x03ffffff
+	ubfx	x10,x12,#14,#26
+	ubfx	x11,x13,#14,#26
+	add	x12,x3,x12,lsr#40
+	add	x13,x3,x13,lsr#40
+	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
+	fmov	d10,x6
+	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
+	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
+	movi	v31.2d,#-1
+	fmov	d11,x8
+	fmov	d12,x10
+	fmov	d13,x12
+	ushr	v31.2d,v31.2d,#38	// v31 = 0x03ffffff in each 64-bit lane
+
+	b.ls	.Lskip_loop		// flags still from the last subs x2,x2,#64
+
+.align	4
+.Loop_neon:
+	////////////////////////////////////////////////////////////////
+	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
+	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
+	//   ___________________/
+	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
+	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
+	//   ___________________/ ____________________/
+	//
+	// Note that we start with inp[2:3]*r^2. This is because it
+	// doesn't depend on reduction in previous iteration.
+	////////////////////////////////////////////////////////////////
+	// d4 = h0*r4 + h1*r3   + h2*r2   + h3*r1   + h4*r0
+	// d3 = h0*r3 + h1*r2   + h2*r1   + h3*r0   + h4*5*r4
+	// d2 = h0*r2 + h1*r1   + h2*r0   + h3*5*r4 + h4*5*r3
+	// d1 = h0*r1 + h1*r0   + h2*5*r4 + h3*5*r3 + h4*5*r2
+	// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
+
+	subs	x2,x2,#64
+	umull	v23.2d,v14.2s,v7.s[2]
+	csel	x16,x17,x16,lo
+	umull	v22.2d,v14.2s,v5.s[2]
+	umull	v21.2d,v14.2s,v3.s[2]
+	ldp	x8,x12,[x16],#16	// inp[2:3] (or zero)
+	umull	v20.2d,v14.2s,v1.s[2]
+	ldp	x9,x13,[x16],#48
+	umull	v19.2d,v14.2s,v0.s[2]
+#ifdef	__ARMEB__
+	rev	x8,x8
+	rev	x12,x12
+	rev	x9,x9
+	rev	x13,x13
+#endif
+
+	umlal	v23.2d,v15.2s,v5.s[2]
+	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
+	umlal	v22.2d,v15.2s,v3.s[2]
+	and	x5,x9,#0x03ffffff
+	umlal	v21.2d,v15.2s,v1.s[2]
+	ubfx	x6,x8,#26,#26
+	umlal	v20.2d,v15.2s,v0.s[2]
+	ubfx	x7,x9,#26,#26
+	umlal	v19.2d,v15.2s,v8.s[2]
+	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
+
+	umlal	v23.2d,v16.2s,v3.s[2]
+	extr	x8,x12,x8,#52
+	umlal	v22.2d,v16.2s,v1.s[2]
+	extr	x9,x13,x9,#52
+	umlal	v21.2d,v16.2s,v0.s[2]
+	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
+	umlal	v20.2d,v16.2s,v8.s[2]
+	fmov	d14,x4
+	umlal	v19.2d,v16.2s,v6.s[2]
+	and	x8,x8,#0x03ffffff
+
+	umlal	v23.2d,v17.2s,v1.s[2]
+	and	x9,x9,#0x03ffffff
+	umlal	v22.2d,v17.2s,v0.s[2]
+	ubfx	x10,x12,#14,#26
+	umlal	v21.2d,v17.2s,v8.s[2]
+	ubfx	x11,x13,#14,#26
+	umlal	v20.2d,v17.2s,v6.s[2]
+	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
+	umlal	v19.2d,v17.2s,v4.s[2]
+	fmov	d15,x6
+
+	add	v11.2s,v11.2s,v26.2s
+	add	x12,x3,x12,lsr#40
+	umlal	v23.2d,v18.2s,v0.s[2]
+	add	x13,x3,x13,lsr#40
+	umlal	v22.2d,v18.2s,v8.s[2]
+	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
+	umlal	v21.2d,v18.2s,v6.s[2]
+	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
+	umlal	v20.2d,v18.2s,v4.s[2]
+	fmov	d16,x8
+	umlal	v19.2d,v18.2s,v2.s[2]
+	fmov	d17,x10
+
+	////////////////////////////////////////////////////////////////
+	// (hash+inp[0:1])*r^4 and accumulate
+
+	add	v9.2s,v9.2s,v24.2s
+	fmov	d18,x12
+	umlal	v22.2d,v11.2s,v1.s[0]
+	ldp	x8,x12,[x1],#16	// inp[0:1]
+	umlal	v19.2d,v11.2s,v6.s[0]
+	ldp	x9,x13,[x1],#48
+	umlal	v23.2d,v11.2s,v3.s[0]
+	umlal	v20.2d,v11.2s,v8.s[0]
+	umlal	v21.2d,v11.2s,v0.s[0]
+#ifdef	__ARMEB__
+	rev	x8,x8
+	rev	x12,x12
+	rev	x9,x9
+	rev	x13,x13
+#endif
+
+	add	v10.2s,v10.2s,v25.2s
+	umlal	v22.2d,v9.2s,v5.s[0]
+	umlal	v23.2d,v9.2s,v7.s[0]
+	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
+	umlal	v21.2d,v9.2s,v3.s[0]
+	and	x5,x9,#0x03ffffff
+	umlal	v19.2d,v9.2s,v0.s[0]
+	ubfx	x6,x8,#26,#26
+	umlal	v20.2d,v9.2s,v1.s[0]
+	ubfx	x7,x9,#26,#26
+
+	add	v12.2s,v12.2s,v27.2s
+	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
+	umlal	v22.2d,v10.2s,v3.s[0]
+	extr	x8,x12,x8,#52
+	umlal	v23.2d,v10.2s,v5.s[0]
+	extr	x9,x13,x9,#52
+	umlal	v19.2d,v10.2s,v8.s[0]
+	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
+	umlal	v21.2d,v10.2s,v1.s[0]
+	fmov	d9,x4
+	umlal	v20.2d,v10.2s,v0.s[0]
+	and	x8,x8,#0x03ffffff
+
+	add	v13.2s,v13.2s,v28.2s
+	and	x9,x9,#0x03ffffff
+	umlal	v22.2d,v12.2s,v0.s[0]
+	ubfx	x10,x12,#14,#26
+	umlal	v19.2d,v12.2s,v4.s[0]
+	ubfx	x11,x13,#14,#26
+	umlal	v23.2d,v12.2s,v1.s[0]
+	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
+	umlal	v20.2d,v12.2s,v6.s[0]
+	fmov	d10,x6
+	umlal	v21.2d,v12.2s,v8.s[0]
+	add	x12,x3,x12,lsr#40
+
+	umlal	v22.2d,v13.2s,v8.s[0]
+	add	x13,x3,x13,lsr#40
+	umlal	v19.2d,v13.2s,v2.s[0]
+	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
+	umlal	v23.2d,v13.2s,v0.s[0]
+	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
+	umlal	v20.2d,v13.2s,v4.s[0]
+	fmov	d11,x8
+	umlal	v21.2d,v13.2s,v6.s[0]
+	fmov	d12,x10
+	fmov	d13,x12
+
+	/////////////////////////////////////////////////////////////////
+	// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
+	// and P. Schwabe
+	//
+	// [see discussion in poly1305-armv4 module]
+
+	ushr	v29.2d,v22.2d,#26
+	xtn	v27.2s,v22.2d
+	ushr	v30.2d,v19.2d,#26
+	and	v19.16b,v19.16b,v31.16b
+	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
+	bic	v27.2s,#0xfc,lsl#24	// &=0x03ffffff
+	add	v20.2d,v20.2d,v30.2d	// h0 -> h1
+
+	ushr	v29.2d,v23.2d,#26
+	xtn	v28.2s,v23.2d
+	ushr	v30.2d,v20.2d,#26
+	xtn	v25.2s,v20.2d
+	bic	v28.2s,#0xfc,lsl#24
+	add	v21.2d,v21.2d,v30.2d	// h1 -> h2
+
+	add	v19.2d,v19.2d,v29.2d
+	shl	v29.2d,v29.2d,#2
+	shrn	v30.2s,v21.2d,#26
+	xtn	v26.2s,v21.2d
+	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
+	bic	v25.2s,#0xfc,lsl#24
+	add	v27.2s,v27.2s,v30.2s		// h2 -> h3
+	bic	v26.2s,#0xfc,lsl#24
+
+	shrn	v29.2s,v19.2d,#26
+	xtn	v24.2s,v19.2d
+	ushr	v30.2s,v27.2s,#26
+	bic	v27.2s,#0xfc,lsl#24
+	bic	v24.2s,#0xfc,lsl#24
+	add	v25.2s,v25.2s,v29.2s		// h0 -> h1
+	add	v28.2s,v28.2s,v30.2s		// h3 -> h4
+
+	b.hi	.Loop_neon		// flags from subs x2,x2,#64 at the loop head
+
+.Lskip_loop:
+	dup	v16.2d,v16.d[0]
+	add	v11.2s,v11.2s,v26.2s
+
+	////////////////////////////////////////////////////////////////
+	// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
+
+	adds	x2,x2,#32
+	b.ne	.Long_tail
+
+	dup	v16.2d,v11.d[0]
+	add	v14.2s,v9.2s,v24.2s
+	add	v17.2s,v12.2s,v27.2s
+	add	v15.2s,v10.2s,v25.2s
+	add	v18.2s,v13.2s,v28.2s
+
+.Long_tail:
+	dup	v14.2d,v14.d[0]
+	umull2	v19.2d,v16.4s,v6.4s
+	umull2	v22.2d,v16.4s,v1.4s
+	umull2	v23.2d,v16.4s,v3.4s
+	umull2	v21.2d,v16.4s,v0.4s
+	umull2	v20.2d,v16.4s,v8.4s
+
+	dup	v15.2d,v15.d[0]
+	umlal2	v19.2d,v14.4s,v0.4s
+	umlal2	v21.2d,v14.4s,v3.4s
+	umlal2	v22.2d,v14.4s,v5.4s
+	umlal2	v23.2d,v14.4s,v7.4s
+	umlal2	v20.2d,v14.4s,v1.4s
+
+	dup	v17.2d,v17.d[0]
+	umlal2	v19.2d,v15.4s,v8.4s
+	umlal2	v22.2d,v15.4s,v3.4s
+	umlal2	v21.2d,v15.4s,v1.4s
+	umlal2	v23.2d,v15.4s,v5.4s
+	umlal2	v20.2d,v15.4s,v0.4s
+
+	dup	v18.2d,v18.d[0]
+	umlal2	v22.2d,v17.4s,v0.4s
+	umlal2	v23.2d,v17.4s,v1.4s
+	umlal2	v19.2d,v17.4s,v4.4s
+	umlal2	v20.2d,v17.4s,v6.4s
+	umlal2	v21.2d,v17.4s,v8.4s
+
+	umlal2	v22.2d,v18.4s,v8.4s
+	umlal2	v19.2d,v18.4s,v2.4s
+	umlal2	v23.2d,v18.4s,v0.4s
+	umlal2	v20.2d,v18.4s,v4.4s
+	umlal2	v21.2d,v18.4s,v6.4s
+
+	b.eq	.Lshort_tail		// flags still from adds x2,x2,#32 above
+
+	////////////////////////////////////////////////////////////////
+	// (hash+inp[0:1])*r^4:r^3 and accumulate
+
+	add	v9.2s,v9.2s,v24.2s
+	umlal	v22.2d,v11.2s,v1.2s
+	umlal	v19.2d,v11.2s,v6.2s
+	umlal	v23.2d,v11.2s,v3.2s
+	umlal	v20.2d,v11.2s,v8.2s
+	umlal	v21.2d,v11.2s,v0.2s
+
+	add	v10.2s,v10.2s,v25.2s
+	umlal	v22.2d,v9.2s,v5.2s
+	umlal	v19.2d,v9.2s,v0.2s
+	umlal	v23.2d,v9.2s,v7.2s
+	umlal	v20.2d,v9.2s,v1.2s
+	umlal	v21.2d,v9.2s,v3.2s
+
+	add	v12.2s,v12.2s,v27.2s
+	umlal	v22.2d,v10.2s,v3.2s
+	umlal	v19.2d,v10.2s,v8.2s
+	umlal	v23.2d,v10.2s,v5.2s
+	umlal	v20.2d,v10.2s,v0.2s
+	umlal	v21.2d,v10.2s,v1.2s
+
+	add	v13.2s,v13.2s,v28.2s
+	umlal	v22.2d,v12.2s,v0.2s
+	umlal	v19.2d,v12.2s,v4.2s
+	umlal	v23.2d,v12.2s,v1.2s
+	umlal	v20.2d,v12.2s,v6.2s
+	umlal	v21.2d,v12.2s,v8.2s
+
+	umlal	v22.2d,v13.2s,v8.2s
+	umlal	v19.2d,v13.2s,v2.2s
+	umlal	v23.2d,v13.2s,v0.2s
+	umlal	v20.2d,v13.2s,v4.2s
+	umlal	v21.2d,v13.2s,v6.2s
+
+.Lshort_tail:
+	////////////////////////////////////////////////////////////////
+	// horizontal add
+
+	addp	v22.2d,v22.2d,v22.2d
+	ldp	d8,d9,[sp,#16]		// meet ABI requirements
+	addp	v19.2d,v19.2d,v19.2d
+	ldp	d10,d11,[sp,#32]
+	addp	v23.2d,v23.2d,v23.2d
+	ldp	d12,d13,[sp,#48]
+	addp	v20.2d,v20.2d,v20.2d
+	ldp	d14,d15,[sp,#64]
+	addp	v21.2d,v21.2d,v21.2d
+
+	////////////////////////////////////////////////////////////////
+	// lazy reduction, but without narrowing
+
+	ushr	v29.2d,v22.2d,#26
+	and	v22.16b,v22.16b,v31.16b
+	ushr	v30.2d,v19.2d,#26
+	and	v19.16b,v19.16b,v31.16b
+
+	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
+	add	v20.2d,v20.2d,v30.2d	// h0 -> h1
+
+	ushr	v29.2d,v23.2d,#26
+	and	v23.16b,v23.16b,v31.16b
+	ushr	v30.2d,v20.2d,#26
+	and	v20.16b,v20.16b,v31.16b
+	add	v21.2d,v21.2d,v30.2d	// h1 -> h2
+
+	add	v19.2d,v19.2d,v29.2d
+	shl	v29.2d,v29.2d,#2
+	ushr	v30.2d,v21.2d,#26
+	and	v21.16b,v21.16b,v31.16b
+	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
+	add	v22.2d,v22.2d,v30.2d	// h2 -> h3
+
+	ushr	v29.2d,v19.2d,#26
+	and	v19.16b,v19.16b,v31.16b
+	ushr	v30.2d,v22.2d,#26
+	and	v22.16b,v22.16b,v31.16b
+	add	v20.2d,v20.2d,v29.2d	// h0 -> h1
+	add	v23.2d,v23.2d,v30.2d	// h3 -> h4
+
+	////////////////////////////////////////////////////////////////
+	// write the result, can be partially reduced
+
+	st4	{v19.s,v20.s,v21.s,v22.s}[0],[x0],#16
+	st1	{v23.s}[0],[x0]
+
+.Lno_data_neon:
+	ldr	x29,[sp],#80		// restore x29 and pop the 80-byte frame
+	ret
+ENDPROC(poly1305_blocks_neon)
+
+.align	5
+ENTRY(poly1305_emit_neon)		// x0 = ctx, x1 = mac out, x2 = nonce
+	ldr	x17,[x0,#24]		// is_base2_26
+	cbz	x17,poly1305_emit_arm	// hash in base 2^64: use the scalar emit
+
+	ldp	w10,w11,[x0]		// load hash value base 2^26
+	ldp	w12,w13,[x0,#8]
+	ldr	w14,[x0,#16]
+
+	add	x4,x10,x11,lsl#26	// base 2^26 -> base 2^64
+	lsr	x5,x12,#12
+	adds	x4,x4,x12,lsl#52
+	add	x5,x5,x13,lsl#14
+	adc	x5,x5,xzr
+	lsr	x6,x14,#24
+	adds	x5,x5,x14,lsl#40
+	adc	x6,x6,xzr		// can be partially reduced...
+
+	ldp	x10,x11,[x2]	// load nonce
+
+	and	x12,x6,#-4		// ... so reduce
+	add	x12,x12,x6,lsr#2	// x12 = 5 * (x6 >> 2)
+	and	x6,x6,#3		// keep bits 128..129
+	adds	x4,x4,x12
+	adcs	x5,x5,xzr
+	adc	x6,x6,xzr
+
+	adds	x12,x4,#5		// compare to modulus
+	adcs	x13,x5,xzr
+	adc	x14,x6,xzr		// x14 bit 2 set when h + 5 reaches 2^130
+
+	tst	x14,#-4			// see if it's carried/borrowed
+
+	csel	x4,x4,x12,eq		// no carry: keep h, otherwise take h + 5
+	csel	x5,x5,x13,eq
+
+#ifdef	__ARMEB__
+	ror	x10,x10,#32		// flip nonce words
+	ror	x11,x11,#32
+#endif
+	adds	x4,x4,x10		// accumulate nonce
+	adc	x5,x5,x11		// carry out of bit 127 is dropped
+#ifdef	__ARMEB__
+	rev	x4,x4			// flip output bytes
+	rev	x5,x5
+#endif
+	stp	x4,x5,[x1]		// write result
+
+	ret
+ENDPROC(poly1305_emit_neon)
+
+.align	5
+.Lzeros:
+.long	0,0,0,0,0,0,0,0
-- 
2.19.0

^ permalink raw reply related

* [PATCH net-next v4 07/20] zinc: Poly1305 generic C implementations and selftest
From: Jason A. Donenfeld @ 2018-09-14 16:22 UTC (permalink / raw)
  To: linux-kernel, netdev, linux-crypto, davem, gregkh
  Cc: Jason A. Donenfeld, Samuel Neves, Andy Lutomirski,
	Jean-Philippe Aumasson
In-Reply-To: <20180914162240.7925-1-Jason@zx2c4.com>

These two C implementations -- a 32x32 one and a 64x64 one, depending on
the platform -- come from Andrew Moon's public domain poly1305-donna
portable code, modified for usage in the kernel and for usage with
accelerated primitives.

Information: https://cr.yp.to/mac.html

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Cc: Samuel Neves <sneves@dei.uc.pt>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Jean-Philippe Aumasson <jeanphilippe.aumasson@gmail.com>
---
 include/zinc/poly1305.h              |  38 ++
 lib/zinc/Kconfig                     |   4 +
 lib/zinc/Makefile                    |   4 +
 lib/zinc/main.c                      |   5 +
 lib/zinc/poly1305/poly1305-donna32.h | 205 +++++++
 lib/zinc/poly1305/poly1305-donna64.h | 182 ++++++
 lib/zinc/poly1305/poly1305.c         | 131 ++++
 lib/zinc/selftest/poly1305.h         | 876 +++++++++++++++++++++++++++
 8 files changed, 1445 insertions(+)
 create mode 100644 include/zinc/poly1305.h
 create mode 100644 lib/zinc/poly1305/poly1305-donna32.h
 create mode 100644 lib/zinc/poly1305/poly1305-donna64.h
 create mode 100644 lib/zinc/poly1305/poly1305.c
 create mode 100644 lib/zinc/selftest/poly1305.h

diff --git a/include/zinc/poly1305.h b/include/zinc/poly1305.h
new file mode 100644
index 000000000000..338430c8477a
--- /dev/null
+++ b/include/zinc/poly1305.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#ifndef _ZINC_POLY1305_H
+#define _ZINC_POLY1305_H
+
+#include <linux/simd.h>
+#include <linux/types.h>
+
+enum poly1305_lengths {	/* all sizes in bytes */
+	POLY1305_BLOCK_SIZE = 16,	/* message block consumed per step */
+	POLY1305_KEY_SIZE = 32,	/* 16 bytes of r followed by 16 bytes of nonce */
+	POLY1305_MAC_SIZE = 16	/* authenticator output */
+};
+
+struct poly1305_ctx {
+	u8 opaque[24 * sizeof(u64)];	/* state owned by the generic or arch implementation */
+	u32 nonce[4];			/* key bytes 16..31, read as little-endian words */
+	u8 data[POLY1305_BLOCK_SIZE];	/* presumably a buffered partial block - confirm in poly1305_update */
+	size_t num;			/* zeroed by poly1305_init; presumably bytes held in data[] */
+} __aligned(8);
+
+void poly1305_fpu_init(void);
+
+void poly1305_init(struct poly1305_ctx *ctx, const u8 key[POLY1305_KEY_SIZE],
+		   simd_context_t simd_context);
+void poly1305_update(struct poly1305_ctx *ctx, const u8 *input, size_t len,
+		     simd_context_t simd_context);
+void poly1305_final(struct poly1305_ctx *ctx, u8 mac[POLY1305_MAC_SIZE],
+		    simd_context_t simd_context);
+
+#ifdef DEBUG
+bool poly1305_selftest(void);
+#endif
+
+#endif /* _ZINC_POLY1305_H */
diff --git a/lib/zinc/Kconfig b/lib/zinc/Kconfig
index e7d396d61607..bc8c61334362 100644
--- a/lib/zinc/Kconfig
+++ b/lib/zinc/Kconfig
@@ -6,6 +6,10 @@ config ZINC_CHACHA20
 	select ZINC
 	select CRYPTO_ALGAPI
 
+config ZINC_POLY1305
+	bool
+	select ZINC
+
 config ZINC_DEBUG
 	bool "Zinc cryptography library debugging and self-tests"
 	depends on ZINC
diff --git a/lib/zinc/Makefile b/lib/zinc/Makefile
index 9f6a5e65d729..d1e3892e06d9 100644
--- a/lib/zinc/Makefile
+++ b/lib/zinc/Makefile
@@ -23,6 +23,10 @@ CFLAGS_chacha20.o += -include $(srctree)/$(src)/chacha20/chacha20-mips-glue.h
 endif
 endif
 
+ifeq ($(CONFIG_ZINC_POLY1305),y)
+zinc-y += poly1305/poly1305.o
+endif
+
 zinc-y += main.o
 
 obj-$(CONFIG_ZINC) := zinc.o
diff --git a/lib/zinc/main.c b/lib/zinc/main.c
index 7e8e84b706b7..d871dd406a5c 100644
--- a/lib/zinc/main.c
+++ b/lib/zinc/main.c
@@ -4,6 +4,7 @@
  */
 
 #include <zinc/chacha20.h>
+#include <zinc/poly1305.h>
 
 #include <linux/init.h>
 #include <linux/module.h>
@@ -21,6 +22,10 @@ static int __init mod_init(void)
 {
 #ifdef CONFIG_ZINC_CHACHA20
 	chacha20_fpu_init();
+#endif
+#ifdef CONFIG_ZINC_POLY1305
+	poly1305_fpu_init();
+	selftest(poly1305);
 #endif
 	return 0;
 }
diff --git a/lib/zinc/poly1305/poly1305-donna32.h b/lib/zinc/poly1305/poly1305-donna32.h
new file mode 100644
index 000000000000..dc32123210f9
--- /dev/null
+++ b/lib/zinc/poly1305/poly1305-donna32.h
@@ -0,0 +1,205 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ *
+ * This is based in part on Andrew Moon's poly1305-donna, which is in the
+ * public domain.
+ */
+
+struct poly1305_internal {	/* 32-bit layout of poly1305_ctx.opaque */
+	u32 h[5];	/* accumulator, 26-bit limbs */
+	u32 r[5];	/* masked key r, 26-bit limbs */
+	u32 s[4];	/* 5 * r[1..4] */
+};
+
+static void poly1305_init_generic(void *ctx, const u8 key[16])
+{	/* load r from key[0..15] into the state at ctx and reset the accumulator */
+	struct poly1305_internal *st = (struct poly1305_internal *)ctx;
+
+	/* r &= 0x0ffffffc0ffffffc0ffffffc0fffffff, split into 26-bit limbs */
+	st->r[0] = (get_unaligned_le32(&key[0])) & 0x3ffffff;
+	st->r[1] = (get_unaligned_le32(&key[3]) >> 2) & 0x3ffff03;
+	st->r[2] = (get_unaligned_le32(&key[6]) >> 4) & 0x3ffc0ff;
+	st->r[3] = (get_unaligned_le32(&key[9]) >> 6) & 0x3f03fff;
+	st->r[4] = (get_unaligned_le32(&key[12]) >> 8) & 0x00fffff;
+
+	/* s = 5*r, precomputed for the products that wrap past 2^130 */
+	st->s[0] = st->r[1] * 5;
+	st->s[1] = st->r[2] * 5;
+	st->s[2] = st->r[3] * 5;
+	st->s[3] = st->r[4] * 5;
+
+	/* h = 0 */
+	st->h[0] = 0;
+	st->h[1] = 0;
+	st->h[2] = 0;
+	st->h[3] = 0;
+	st->h[4] = 0;
+}
+
+static void poly1305_blocks_generic(void *ctx, const u8 *input, size_t len,
+				    const u32 padbit)
+{	/* for each whole 16-byte block m: h = (h + m + padbit * 2^128) * r, partially reduced */
+	struct poly1305_internal *st = (struct poly1305_internal *)ctx;
+	const u32 hibit = padbit << 24;	/* limb 4 starts at bit 104, so bit 24 is 2^128 */
+	u32 r0, r1, r2, r3, r4;
+	u32 s1, s2, s3, s4;
+	u32 h0, h1, h2, h3, h4;
+	u64 d0, d1, d2, d3, d4;
+	u32 c;
+
+	r0 = st->r[0];
+	r1 = st->r[1];
+	r2 = st->r[2];
+	r3 = st->r[3];
+	r4 = st->r[4];
+
+	s1 = st->s[0];
+	s2 = st->s[1];
+	s3 = st->s[2];
+	s4 = st->s[3];
+
+	h0 = st->h[0];
+	h1 = st->h[1];
+	h2 = st->h[2];
+	h3 = st->h[3];
+	h4 = st->h[4];
+
+	while (len >= POLY1305_BLOCK_SIZE) {	/* a trailing partial block is not read */
+		/* h += m[i], overlapping loads pick out 26 bits each */
+		h0 += (get_unaligned_le32(&input[0])) & 0x3ffffff;
+		h1 += (get_unaligned_le32(&input[3]) >> 2) & 0x3ffffff;
+		h2 += (get_unaligned_le32(&input[6]) >> 4) & 0x3ffffff;
+		h3 += (get_unaligned_le32(&input[9]) >> 6) & 0x3ffffff;
+		h4 += (get_unaligned_le32(&input[12]) >> 8) | hibit;
+
+		/* h *= r, using s = 5*r for terms at or above 2^130 */
+		d0 = ((u64)h0 * r0) + ((u64)h1 * s4) +
+		     ((u64)h2 * s3) + ((u64)h3 * s2) +
+		     ((u64)h4 * s1);
+		d1 = ((u64)h0 * r1) + ((u64)h1 * r0) +
+		     ((u64)h2 * s4) + ((u64)h3 * s3) +
+		     ((u64)h4 * s2);
+		d2 = ((u64)h0 * r2) + ((u64)h1 * r1) +
+		     ((u64)h2 * r0) + ((u64)h3 * s4) +
+		     ((u64)h4 * s3);
+		d3 = ((u64)h0 * r3) + ((u64)h1 * r2) +
+		     ((u64)h2 * r1) + ((u64)h3 * r0) +
+		     ((u64)h4 * s4);
+		d4 = ((u64)h0 * r4) + ((u64)h1 * r3) +
+		     ((u64)h2 * r2) + ((u64)h3 * r1) +
+		     ((u64)h4 * r0);
+
+		/* (partial) h %= p: carry limb to limb, fold the top carry back as *5 */
+		c = (u32)(d0 >> 26);
+		h0 = (u32)d0 & 0x3ffffff;
+		d1 += c;
+		c = (u32)(d1 >> 26);
+		h1 = (u32)d1 & 0x3ffffff;
+		d2 += c;
+		c = (u32)(d2 >> 26);
+		h2 = (u32)d2 & 0x3ffffff;
+		d3 += c;
+		c = (u32)(d3 >> 26);
+		h3 = (u32)d3 & 0x3ffffff;
+		d4 += c;
+		c = (u32)(d4 >> 26);
+		h4 = (u32)d4 & 0x3ffffff;
+		h0 += c * 5;
+		c = (h0 >> 26);
+		h0 = h0 & 0x3ffffff;
+		h1 += c;	/* h1 may exceed 26 bits here; emit finishes the carry */
+
+		input += POLY1305_BLOCK_SIZE;
+		len -= POLY1305_BLOCK_SIZE;
+	}
+
+	st->h[0] = h0;	/* write the accumulator back */
+	st->h[1] = h1;
+	st->h[2] = h2;
+	st->h[3] = h3;
+	st->h[4] = h4;
+}
+
+static void poly1305_emit_generic(void *ctx, u8 mac[16], const u32 nonce[4])
+{	/* mac = ((h mod 2^130 - 5) + nonce) mod 2^128, written little-endian; ctx is not modified */
+	struct poly1305_internal *st = (struct poly1305_internal *)ctx;
+	u32 h0, h1, h2, h3, h4, c;
+	u32 g0, g1, g2, g3, g4;
+	u64 f;
+	u32 mask;
+
+	/* fully carry h */
+	h0 = st->h[0];
+	h1 = st->h[1];
+	h2 = st->h[2];
+	h3 = st->h[3];
+	h4 = st->h[4];
+
+	c = h1 >> 26;
+	h1 = h1 & 0x3ffffff;
+	h2 += c;
+	c = h2 >> 26;
+	h2 = h2 & 0x3ffffff;
+	h3 += c;
+	c = h3 >> 26;
+	h3 = h3 & 0x3ffffff;
+	h4 += c;
+	c = h4 >> 26;
+	h4 = h4 & 0x3ffffff;
+	h0 += c * 5;	/* 2^130 = 5 mod p */
+	c = h0 >> 26;
+	h0 = h0 & 0x3ffffff;
+	h1 += c;
+
+	/* compute h + -p, i.e. g = h + 5 - 2^130 */
+	g0 = h0 + 5;
+	c = g0 >> 26;
+	g0 &= 0x3ffffff;
+	g1 = h1 + c;
+	c = g1 >> 26;
+	g1 &= 0x3ffffff;
+	g2 = h2 + c;
+	c = g2 >> 26;
+	g2 &= 0x3ffffff;
+	g3 = h3 + c;
+	c = g3 >> 26;
+	g3 &= 0x3ffffff;
+	g4 = h4 + c - (1UL << 26);	/* wraps, setting the top bit, when h < p */
+
+	/* select h if h < p, or h + -p if h >= p */
+	mask = (g4 >> ((sizeof(u32) * 8) - 1)) - 1;	/* all ones if h >= p, else 0 */
+	g0 &= mask;
+	g1 &= mask;
+	g2 &= mask;
+	g3 &= mask;
+	g4 &= mask;
+	mask = ~mask;
+
+	h0 = (h0 & mask) | g0;	/* branch-free select between h and g */
+	h1 = (h1 & mask) | g1;
+	h2 = (h2 & mask) | g2;
+	h3 = (h3 & mask) | g3;
+	h4 = (h4 & mask) | g4;
+
+	/* h = h % (2^128), repacked from 26-bit limbs into 32-bit words */
+	h0 = ((h0) | (h1 << 26)) & 0xffffffff;
+	h1 = ((h1 >> 6) | (h2 << 20)) & 0xffffffff;
+	h2 = ((h2 >> 12) | (h3 << 14)) & 0xffffffff;
+	h3 = ((h3 >> 18) | (h4 << 8)) & 0xffffffff;
+
+	/* mac = (h + nonce) % (2^128) */
+	f = (u64)h0 + nonce[0];
+	h0 = (u32)f;
+	f = (u64)h1 + nonce[1] + (f >> 32);	/* f >> 32 is the carry from the previous word */
+	h1 = (u32)f;
+	f = (u64)h2 + nonce[2] + (f >> 32);
+	h2 = (u32)f;
+	f = (u64)h3 + nonce[3] + (f >> 32);
+	h3 = (u32)f;	/* the final carry is discarded */
+
+	put_unaligned_le32(h0, &mac[0]);
+	put_unaligned_le32(h1, &mac[4]);
+	put_unaligned_le32(h2, &mac[8]);
+	put_unaligned_le32(h3, &mac[12]);
+}
diff --git a/lib/zinc/poly1305/poly1305-donna64.h b/lib/zinc/poly1305/poly1305-donna64.h
new file mode 100644
index 000000000000..de7ab1246024
--- /dev/null
+++ b/lib/zinc/poly1305/poly1305-donna64.h
@@ -0,0 +1,182 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ *
+ * This is based in part on Andrew Moon's poly1305-donna, which is in the
+ * public domain.
+ */
+
+typedef __uint128_t u128;	/* holds the 64x64-bit limb products */
+
+struct poly1305_internal {	/* 64-bit layout of poly1305_ctx.opaque */
+	u64 r[3];	/* masked key r in 44-, 44- and 42-bit limbs */
+	u64 h[3];	/* accumulator, same limb split */
+	u64 s[2];	/* 20 * r[1..2] */
+};
+
+static void poly1305_init_generic(void *ctx, const u8 key[16])
+{	/* load r from key[0..15] into the state at ctx and reset the accumulator */
+	struct poly1305_internal *st = (struct poly1305_internal *)ctx;
+	u64 t0, t1;
+
+	/* r &= 0x0ffffffc0ffffffc0ffffffc0fffffff, split into 44/44/42-bit limbs */
+	t0 = get_unaligned_le64(&key[0]);
+	t1 = get_unaligned_le64(&key[8]);
+
+	st->r[0] = t0 & 0xffc0fffffff;
+	st->r[1] = ((t0 >> 44) | (t1 << 20)) & 0xfffffc0ffff;
+	st->r[2] = ((t1 >> 24)) & 0x00ffffffc0f;
+
+	/* s = 20*r: limb products wrap at 2^132 = 4 * 2^130, and 2^130 = 5 mod p */
+	st->s[0] = st->r[1] * 20;
+	st->s[1] = st->r[2] * 20;
+
+	/* h = 0 */
+	st->h[0] = 0;
+	st->h[1] = 0;
+	st->h[2] = 0;
+}
+
+static void poly1305_blocks_generic(void *ctx, const u8 *input, size_t len,
+				    const u32 padbit)
+{	/* for each whole 16-byte block m: h = (h + m + padbit * 2^128) * r, partially reduced */
+	struct poly1305_internal *st = (struct poly1305_internal *)ctx;
+	const u64 hibit = ((u64)padbit) << 40;	/* limb 2 starts at bit 88, so bit 40 is 2^128 */
+	u64 r0, r1, r2;
+	u64 s1, s2;
+	u64 h0, h1, h2;
+	u64 c;
+	u128 d0, d1, d2, d;
+
+	r0 = st->r[0];
+	r1 = st->r[1];
+	r2 = st->r[2];
+
+	h0 = st->h[0];
+	h1 = st->h[1];
+	h2 = st->h[2];
+
+	s1 = st->s[0];
+	s2 = st->s[1];
+
+	while (len >= POLY1305_BLOCK_SIZE) {	/* a trailing partial block is not read */
+		u64 t0, t1;
+
+		/* h += m[i] */
+		t0 = get_unaligned_le64(&input[0]);
+		t1 = get_unaligned_le64(&input[8]);
+
+		h0 += t0 & 0xfffffffffff;
+		h1 += ((t0 >> 44) | (t1 << 20)) & 0xfffffffffff;
+		h2 += (((t1 >> 24)) & 0x3ffffffffff) | hibit;
+
+		/* h *= r, using s = 20*r for terms at or above 2^132 */
+		d0 = (u128)h0 * r0;
+		d = (u128)h1 * s2;
+		d0 += d;
+		d = (u128)h2 * s1;
+		d0 += d;
+		d1 = (u128)h0 * r1;
+		d = (u128)h1 * r0;
+		d1 += d;
+		d = (u128)h2 * s2;
+		d1 += d;
+		d2 = (u128)h0 * r2;
+		d = (u128)h1 * r1;
+		d2 += d;
+		d = (u128)h2 * r0;
+		d2 += d;
+
+		/* (partial) h %= p: carry limb to limb, fold the top carry back as *5 */
+		c = (u64)(d0 >> 44);
+		h0 = (u64)d0 & 0xfffffffffff;
+		d1 += c;
+		c = (u64)(d1 >> 44);
+		h1 = (u64)d1 & 0xfffffffffff;
+		d2 += c;
+		c = (u64)(d2 >> 42);
+		h2 = (u64)d2 & 0x3ffffffffff;
+		h0 += c * 5;
+		c = h0 >> 44;
+		h0 = h0 & 0xfffffffffff;
+		h1 += c;	/* h1 may exceed 44 bits here; emit finishes the carry */
+
+		input += POLY1305_BLOCK_SIZE;
+		len -= POLY1305_BLOCK_SIZE;
+	}
+
+	st->h[0] = h0;	/* write the accumulator back */
+	st->h[1] = h1;
+	st->h[2] = h2;
+}
+
+static void poly1305_emit_generic(void *ctx, u8 mac[16], const u32 nonce[4])
+{	/* mac = ((h mod 2^130 - 5) + nonce) mod 2^128, written little-endian; ctx is not modified */
+	struct poly1305_internal *st = (struct poly1305_internal *)ctx;
+	u64 h0, h1, h2, c;
+	u64 g0, g1, g2;
+	u64 t0, t1;
+
+	/* fully carry h (the carry chain is run twice) */
+	h0 = st->h[0];
+	h1 = st->h[1];
+	h2 = st->h[2];
+
+	c = h1 >> 44;
+	h1 &= 0xfffffffffff;
+	h2 += c;
+	c = h2 >> 42;
+	h2 &= 0x3ffffffffff;
+	h0 += c * 5;	/* 2^130 = 5 mod p */
+	c = h0 >> 44;
+	h0 &= 0xfffffffffff;
+	h1 += c;
+	c = h1 >> 44;
+	h1 &= 0xfffffffffff;
+	h2 += c;
+	c = h2 >> 42;
+	h2 &= 0x3ffffffffff;
+	h0 += c * 5;
+	c = h0 >> 44;
+	h0 &= 0xfffffffffff;
+	h1 += c;
+
+	/* compute h + -p, i.e. g = h + 5 - 2^130 */
+	g0 = h0 + 5;
+	c  = g0 >> 44;
+	g0 &= 0xfffffffffff;
+	g1 = h1 + c;
+	c  = g1 >> 44;
+	g1 &= 0xfffffffffff;
+	g2 = h2 + c - (1ULL << 42);	/* wraps, setting the top bit, when h < p */
+
+	/* select h if h < p, or h + -p if h >= p */
+	c = (g2 >> ((sizeof(u64) * 8) - 1)) - 1;	/* all ones if h >= p, else 0 */
+	g0 &= c;
+	g1 &= c;
+	g2 &= c;
+	c  = ~c;
+	h0 = (h0 & c) | g0;	/* branch-free select between h and g */
+	h1 = (h1 & c) | g1;
+	h2 = (h2 & c) | g2;
+
+	/* h = (h + nonce), nonce first reassembled into two 64-bit words */
+	t0 = ((u64)nonce[1] << 32) | nonce[0];
+	t1 = ((u64)nonce[3] << 32) | nonce[2];
+
+	h0 += t0 & 0xfffffffffff;
+	c = h0 >> 44;
+	h0 &= 0xfffffffffff;
+	h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffff) + c;
+	c = h1 >> 44;
+	h1 &= 0xfffffffffff;
+	h2 += (((t1 >> 24)) & 0x3ffffffffff) + c;
+	h2 &= 0x3ffffffffff;
+
+	/* mac = h % (2^128), repacked from 44/44/42-bit limbs into two 64-bit words */
+	h0 = h0 | (h1 << 44);
+	h1 = (h1 >> 20) | (h2 << 24);
+
+	put_unaligned_le64(h0, &mac[0]);
+	put_unaligned_le64(h1, &mac[8]);
+}
diff --git a/lib/zinc/poly1305/poly1305.c b/lib/zinc/poly1305/poly1305.c
new file mode 100644
index 000000000000..9a71ac1d4e39
--- /dev/null
+++ b/lib/zinc/poly1305/poly1305.c
@@ -0,0 +1,131 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ *
+ * Implementation of the Poly1305 message authenticator.
+ *
+ * Information: https://cr.yp.to/mac.html
+ */
+
+#include <zinc/poly1305.h>
+
+#include <asm/unaligned.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+
+#ifndef HAVE_POLY1305_ARCH_IMPLEMENTATION
+/*
+ * No arch backend: every *_arch() hook reports false ("not handled").
+ */
+static inline bool poly1305_init_arch(void *ctx,
+		const u8 key[POLY1305_KEY_SIZE], simd_context_t simd_context)
+{
+	return false;
+}
+static inline bool poly1305_blocks_arch(void *ctx, const u8 *input,
+		const size_t len, const u32 padbit, simd_context_t simd_context)
+{
+	return false;
+}
+static inline bool poly1305_emit_arch(void *ctx, u8 mac[POLY1305_MAC_SIZE],
+		const u32 nonce[4], simd_context_t simd_context)
+{
+	return false;
+}
+void __init poly1305_fpu_init(void)
+{
+}
+#endif
+
+#if defined(CONFIG_ARCH_SUPPORTS_INT128) && defined(__SIZEOF_INT128__)
+#include "poly1305-donna64.h"
+#else
+#include "poly1305-donna32.h"
+#endif
+
+/* Begin a MAC: keep key[16..31] as four LE nonce words, then set up state. */
+void poly1305_init(struct poly1305_ctx *ctx, const u8 key[POLY1305_KEY_SIZE],
+		   simd_context_t simd_context)
+{
+	unsigned int i;
+
+	for (i = 0; i < 4; ++i)
+		ctx->nonce[i] = get_unaligned_le32(key + 16 + 4 * i);
+
+	if (!poly1305_init_arch(ctx->opaque, key, simd_context))
+		poly1305_init_generic(ctx->opaque, key);
+	ctx->num = 0;
+}
+EXPORT_SYMBOL(poly1305_init);
+
+static inline void poly1305_blocks(void *ctx, const u8 *input, const size_t len,
+		const u32 padbit, simd_context_t simd_context)
+{
+	if (poly1305_blocks_arch(ctx, input, len, padbit, simd_context))
+		return;	/* the arch backend consumed the blocks */
+	poly1305_blocks_generic(ctx, input, len, padbit);	/* fallback */
+}
+
+/* Emit the MAC (POLY1305_MAC_SIZE bytes): arch code, else generic code. */
+static inline void poly1305_emit(void *ctx, u8 mac[POLY1305_MAC_SIZE],
+		const u32 nonce[4], simd_context_t simd_context)
+{
+	if (!poly1305_emit_arch(ctx, mac, nonce, simd_context))
+		poly1305_emit_generic(ctx, mac, nonce);
+}
+
+/* Absorb len bytes of input; a trailing partial block waits in ctx->data. */
+void poly1305_update(struct poly1305_ctx *ctx, const u8 *input, size_t len,
+		     simd_context_t simd_context)
+{
+	const size_t held = ctx->num % POLY1305_BLOCK_SIZE;
+	size_t tail;
+
+	if (held) {
+		const size_t room = POLY1305_BLOCK_SIZE - held;
+
+		if (len < room) {
+			/* Not enough to fill the pending block: buffer it. */
+			memcpy(ctx->data + held, input, len);
+			ctx->num = held + len;
+			return;
+		}
+		/* Top up the pending block and absorb it. */
+		memcpy(ctx->data + held, input, room);
+		poly1305_blocks(ctx->opaque, ctx->data, POLY1305_BLOCK_SIZE, 1,
+				simd_context);
+		input += room;
+		len -= room;
+	}
+
+	/* Whole blocks are absorbed in place; only the tail is copied. */
+	tail = len % POLY1305_BLOCK_SIZE;
+	len -= tail;
+	if (len)
+		poly1305_blocks(ctx->opaque, input, len, 1, simd_context);
+	if (tail)
+		memcpy(ctx->data, input + len, tail);
+	ctx->num = tail;
+}
+EXPORT_SYMBOL(poly1305_update);
+
+/* Pad and absorb any buffered tail, write the MAC, then wipe the context. */
+void poly1305_final(struct poly1305_ctx *ctx, u8 mac[POLY1305_MAC_SIZE],
+		    simd_context_t simd_context)
+{
+	size_t pos = ctx->num % POLY1305_BLOCK_SIZE;
+
+	if (pos) {
+		/* Append a 1 byte, zero-fill the rest; absorbed with padbit 0. */
+		ctx->data[pos] = 1;
+		for (++pos; pos < POLY1305_BLOCK_SIZE; ++pos)
+			ctx->data[pos] = 0;
+		poly1305_blocks(ctx->opaque, ctx->data, POLY1305_BLOCK_SIZE, 0,
+				simd_context);
+	}
+	poly1305_emit(ctx->opaque, mac, ctx->nonce, simd_context);
+	memzero_explicit(ctx, sizeof(*ctx));	/* ctx holds key material */
+}
+EXPORT_SYMBOL(poly1305_final);
+
+#include "../selftest/poly1305.h"
diff --git a/lib/zinc/selftest/poly1305.h b/lib/zinc/selftest/poly1305.h
new file mode 100644
index 000000000000..8138a9399aed
--- /dev/null
+++ b/lib/zinc/selftest/poly1305.h
@@ -0,0 +1,876 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#ifdef DEBUG
+struct poly1305_testvec {
+	u8 input[600];			/* message to authenticate */
+	u8 output[POLY1305_MAC_SIZE];	/* expected MAC of input under key */
+	u8 key[POLY1305_KEY_SIZE];	/* one-time Poly1305 key */
+	size_t ilen;			/* number of message bytes in input */
+};
+
+static const struct poly1305_testvec poly1305_testvecs[] __initconst = {
+{ /* RFC7539 */
+	.input	= { 0x43, 0x72, 0x79, 0x70, 0x74, 0x6f, 0x67, 0x72,
+		    0x61, 0x70, 0x68, 0x69, 0x63, 0x20, 0x46, 0x6f,
+		    0x72, 0x75, 0x6d, 0x20, 0x52, 0x65, 0x73, 0x65,
+		    0x61, 0x72, 0x63, 0x68, 0x20, 0x47, 0x72, 0x6f,
+		    0x75, 0x70 },
+	.ilen	= 34,
+	.output	= { 0xa8, 0x06, 0x1d, 0xc1, 0x30, 0x51, 0x36, 0xc6,
+		    0xc2, 0x2b, 0x8b, 0xaf, 0x0c, 0x01, 0x27, 0xa9 },
+	.key	= { 0x85, 0xd6, 0xbe, 0x78, 0x57, 0x55, 0x6d, 0x33,
+		    0x7f, 0x44, 0x52, 0xfe, 0x42, 0xd5, 0x06, 0xa8,
+		    0x01, 0x03, 0x80, 0x8a, 0xfb, 0x0d, 0xb2, 0xfd,
+		    0x4a, 0xbf, 0xf6, 0xaf, 0x41, 0x49, 0xf5, 0x1b },
+}, { /* "The Poly1305-AES message-authentication code" */
+	.input	= { 0xf3, 0xf6 },
+	.ilen	= 2,
+	.output	= { 0xf4, 0xc6, 0x33, 0xc3, 0x04, 0x4f, 0xc1, 0x45,
+		    0xf8, 0x4f, 0x33, 0x5c, 0xb8, 0x19, 0x53, 0xde },
+	.key	= { 0x85, 0x1f, 0xc4, 0x0c, 0x34, 0x67, 0xac, 0x0b,
+		    0xe0, 0x5c, 0xc2, 0x04, 0x04, 0xf3, 0xf7, 0x00,
+		    0x58, 0x0b, 0x3b, 0x0f, 0x94, 0x47, 0xbb, 0x1e,
+		    0x69, 0xd0, 0x95, 0xb5, 0x92, 0x8b, 0x6d, 0xbc },
+}, {
+	.input	= "",
+	.ilen	= 0,
+	.output	= { 0xdd, 0x3f, 0xab, 0x22, 0x51, 0xf1, 0x1a, 0xc7,
+		    0x59, 0xf0, 0x88, 0x71, 0x29, 0xcc, 0x2e, 0xe7 },
+	.key	= { 0xa0, 0xf3, 0x08, 0x00, 0x00, 0xf4, 0x64, 0x00,
+		    0xd0, 0xc7, 0xe9, 0x07, 0x6c, 0x83, 0x44, 0x03,
+		    0xdd, 0x3f, 0xab, 0x22, 0x51, 0xf1, 0x1a, 0xc7,
+		    0x59, 0xf0, 0x88, 0x71, 0x29, 0xcc, 0x2e, 0xe7 },
+}, {
+	.input	= { 0x66, 0x3c, 0xea, 0x19, 0x0f, 0xfb, 0x83, 0xd8,
+		    0x95, 0x93, 0xf3, 0xf4, 0x76, 0xb6, 0xbc, 0x24,
+		    0xd7, 0xe6, 0x79, 0x10, 0x7e, 0xa2, 0x6a, 0xdb,
+		    0x8c, 0xaf, 0x66, 0x52, 0xd0, 0x65, 0x61, 0x36 },
+	.ilen	= 32,
+	.output	= { 0x0e, 0xe1, 0xc1, 0x6b, 0xb7, 0x3f, 0x0f, 0x4f,
+		    0xd1, 0x98, 0x81, 0x75, 0x3c, 0x01, 0xcd, 0xbe },
+	.key	= { 0x48, 0x44, 0x3d, 0x0b, 0xb0, 0xd2, 0x11, 0x09,
+		    0xc8, 0x9a, 0x10, 0x0b, 0x5c, 0xe2, 0xc2, 0x08,
+		    0x83, 0x14, 0x9c, 0x69, 0xb5, 0x61, 0xdd, 0x88,
+		    0x29, 0x8a, 0x17, 0x98, 0xb1, 0x07, 0x16, 0xef },
+}, {
+	.input	= { 0xab, 0x08, 0x12, 0x72, 0x4a, 0x7f, 0x1e, 0x34,
+		    0x27, 0x42, 0xcb, 0xed, 0x37, 0x4d, 0x94, 0xd1,
+		    0x36, 0xc6, 0xb8, 0x79, 0x5d, 0x45, 0xb3, 0x81,
+		    0x98, 0x30, 0xf2, 0xc0, 0x44, 0x91, 0xfa, 0xf0,
+		    0x99, 0x0c, 0x62, 0xe4, 0x8b, 0x80, 0x18, 0xb2,
+		    0xc3, 0xe4, 0xa0, 0xfa, 0x31, 0x34, 0xcb, 0x67,
+		    0xfa, 0x83, 0xe1, 0x58, 0xc9, 0x94, 0xd9, 0x61,
+		    0xc4, 0xcb, 0x21, 0x09, 0x5c, 0x1b, 0xf9 },
+	.ilen	= 63,
+	.output	= { 0x51, 0x54, 0xad, 0x0d, 0x2c, 0xb2, 0x6e, 0x01,
+		    0x27, 0x4f, 0xc5, 0x11, 0x48, 0x49, 0x1f, 0x1b },
+	.key	= { 0x12, 0x97, 0x6a, 0x08, 0xc4, 0x42, 0x6d, 0x0c,
+		    0xe8, 0xa8, 0x24, 0x07, 0xc4, 0xf4, 0x82, 0x07,
+		    0x80, 0xf8, 0xc2, 0x0a, 0xa7, 0x12, 0x02, 0xd1,
+		    0xe2, 0x91, 0x79, 0xcb, 0xcb, 0x55, 0x5a, 0x57 },
+}, { /* self-generated vectors exercise "significant" lengths, such that they
+      * are handled by different code paths */
+	.input	= { 0xab, 0x08, 0x12, 0x72, 0x4a, 0x7f, 0x1e, 0x34,
+		    0x27, 0x42, 0xcb, 0xed, 0x37, 0x4d, 0x94, 0xd1,
+		    0x36, 0xc6, 0xb8, 0x79, 0x5d, 0x45, 0xb3, 0x81,
+		    0x98, 0x30, 0xf2, 0xc0, 0x44, 0x91, 0xfa, 0xf0,
+		    0x99, 0x0c, 0x62, 0xe4, 0x8b, 0x80, 0x18, 0xb2,
+		    0xc3, 0xe4, 0xa0, 0xfa, 0x31, 0x34, 0xcb, 0x67,
+		    0xfa, 0x83, 0xe1, 0x58, 0xc9, 0x94, 0xd9, 0x61,
+		    0xc4, 0xcb, 0x21, 0x09, 0x5c, 0x1b, 0xf9, 0xaf },
+	.ilen	= 64,
+	.output	= { 0x81, 0x20, 0x59, 0xa5, 0xda, 0x19, 0x86, 0x37,
+		    0xca, 0xc7, 0xc4, 0xa6, 0x31, 0xbe, 0xe4, 0x66 },
+	.key	= { 0x12, 0x97, 0x6a, 0x08, 0xc4, 0x42, 0x6d, 0x0c,
+		    0xe8, 0xa8, 0x24, 0x07, 0xc4, 0xf4, 0x82, 0x07,
+		    0x80, 0xf8, 0xc2, 0x0a, 0xa7, 0x12, 0x02, 0xd1,
+		    0xe2, 0x91, 0x79, 0xcb, 0xcb, 0x55, 0x5a, 0x57 },
+}, {
+	.input	= { 0xab, 0x08, 0x12, 0x72, 0x4a, 0x7f, 0x1e, 0x34,
+		    0x27, 0x42, 0xcb, 0xed, 0x37, 0x4d, 0x94, 0xd1,
+		    0x36, 0xc6, 0xb8, 0x79, 0x5d, 0x45, 0xb3, 0x81,
+		    0x98, 0x30, 0xf2, 0xc0, 0x44, 0x91, 0xfa, 0xf0,
+		    0x99, 0x0c, 0x62, 0xe4, 0x8b, 0x80, 0x18, 0xb2,
+		    0xc3, 0xe4, 0xa0, 0xfa, 0x31, 0x34, 0xcb, 0x67 },
+	.ilen	= 48,
+	.output	= { 0x5b, 0x88, 0xd7, 0xf6, 0x22, 0x8b, 0x11, 0xe2,
+		    0xe2, 0x85, 0x79, 0xa5, 0xc0, 0xc1, 0xf7, 0x61 },
+	.key	= { 0x12, 0x97, 0x6a, 0x08, 0xc4, 0x42, 0x6d, 0x0c,
+		    0xe8, 0xa8, 0x24, 0x07, 0xc4, 0xf4, 0x82, 0x07,
+		    0x80, 0xf8, 0xc2, 0x0a, 0xa7, 0x12, 0x02, 0xd1,
+		    0xe2, 0x91, 0x79, 0xcb, 0xcb, 0x55, 0x5a, 0x57 },
+}, {
+	.input	= { 0xab, 0x08, 0x12, 0x72, 0x4a, 0x7f, 0x1e, 0x34,
+		    0x27, 0x42, 0xcb, 0xed, 0x37, 0x4d, 0x94, 0xd1,
+		    0x36, 0xc6, 0xb8, 0x79, 0x5d, 0x45, 0xb3, 0x81,
+		    0x98, 0x30, 0xf2, 0xc0, 0x44, 0x91, 0xfa, 0xf0,
+		    0x99, 0x0c, 0x62, 0xe4, 0x8b, 0x80, 0x18, 0xb2,
+		    0xc3, 0xe4, 0xa0, 0xfa, 0x31, 0x34, 0xcb, 0x67,
+		    0xfa, 0x83, 0xe1, 0x58, 0xc9, 0x94, 0xd9, 0x61,
+		    0xc4, 0xcb, 0x21, 0x09, 0x5c, 0x1b, 0xf9, 0xaf,
+		    0x66, 0x3c, 0xea, 0x19, 0x0f, 0xfb, 0x83, 0xd8,
+		    0x95, 0x93, 0xf3, 0xf4, 0x76, 0xb6, 0xbc, 0x24,
+		    0xd7, 0xe6, 0x79, 0x10, 0x7e, 0xa2, 0x6a, 0xdb,
+		    0x8c, 0xaf, 0x66, 0x52, 0xd0, 0x65, 0x61, 0x36 },
+	.ilen	= 96,
+	.output	= { 0xbb, 0xb6, 0x13, 0xb2, 0xb6, 0xd7, 0x53, 0xba,
+		    0x07, 0x39, 0x5b, 0x91, 0x6a, 0xae, 0xce, 0x15 },
+	.key	= { 0x12, 0x97, 0x6a, 0x08, 0xc4, 0x42, 0x6d, 0x0c,
+		    0xe8, 0xa8, 0x24, 0x07, 0xc4, 0xf4, 0x82, 0x07,
+		    0x80, 0xf8, 0xc2, 0x0a, 0xa7, 0x12, 0x02, 0xd1,
+		    0xe2, 0x91, 0x79, 0xcb, 0xcb, 0x55, 0x5a, 0x57 },
+}, {
+	.input	= { 0xab, 0x08, 0x12, 0x72, 0x4a, 0x7f, 0x1e, 0x34,
+		    0x27, 0x42, 0xcb, 0xed, 0x37, 0x4d, 0x94, 0xd1,
+		    0x36, 0xc6, 0xb8, 0x79, 0x5d, 0x45, 0xb3, 0x81,
+		    0x98, 0x30, 0xf2, 0xc0, 0x44, 0x91, 0xfa, 0xf0,
+		    0x99, 0x0c, 0x62, 0xe4, 0x8b, 0x80, 0x18, 0xb2,
+		    0xc3, 0xe4, 0xa0, 0xfa, 0x31, 0x34, 0xcb, 0x67,
+		    0xfa, 0x83, 0xe1, 0x58, 0xc9, 0x94, 0xd9, 0x61,
+		    0xc4, 0xcb, 0x21, 0x09, 0x5c, 0x1b, 0xf9, 0xaf,
+		    0x48, 0x44, 0x3d, 0x0b, 0xb0, 0xd2, 0x11, 0x09,
+		    0xc8, 0x9a, 0x10, 0x0b, 0x5c, 0xe2, 0xc2, 0x08,
+		    0x83, 0x14, 0x9c, 0x69, 0xb5, 0x61, 0xdd, 0x88,
+		    0x29, 0x8a, 0x17, 0x98, 0xb1, 0x07, 0x16, 0xef,
+		    0x66, 0x3c, 0xea, 0x19, 0x0f, 0xfb, 0x83, 0xd8,
+		    0x95, 0x93, 0xf3, 0xf4, 0x76, 0xb6, 0xbc, 0x24 },
+	.ilen	= 112,
+	.output	= { 0xc7, 0x94, 0xd7, 0x05, 0x7d, 0x17, 0x78, 0xc4,
+		    0xbb, 0xee, 0x0a, 0x39, 0xb3, 0xd9, 0x73, 0x42 },
+	.key	= { 0x12, 0x97, 0x6a, 0x08, 0xc4, 0x42, 0x6d, 0x0c,
+		    0xe8, 0xa8, 0x24, 0x07, 0xc4, 0xf4, 0x82, 0x07,
+		    0x80, 0xf8, 0xc2, 0x0a, 0xa7, 0x12, 0x02, 0xd1,
+		    0xe2, 0x91, 0x79, 0xcb, 0xcb, 0x55, 0x5a, 0x57 },
+}, {
+	.input	= { 0xab, 0x08, 0x12, 0x72, 0x4a, 0x7f, 0x1e, 0x34,
+		    0x27, 0x42, 0xcb, 0xed, 0x37, 0x4d, 0x94, 0xd1,
+		    0x36, 0xc6, 0xb8, 0x79, 0x5d, 0x45, 0xb3, 0x81,
+		    0x98, 0x30, 0xf2, 0xc0, 0x44, 0x91, 0xfa, 0xf0,
+		    0x99, 0x0c, 0x62, 0xe4, 0x8b, 0x80, 0x18, 0xb2,
+		    0xc3, 0xe4, 0xa0, 0xfa, 0x31, 0x34, 0xcb, 0x67,
+		    0xfa, 0x83, 0xe1, 0x58, 0xc9, 0x94, 0xd9, 0x61,
+		    0xc4, 0xcb, 0x21, 0x09, 0x5c, 0x1b, 0xf9, 0xaf,
+		    0x48, 0x44, 0x3d, 0x0b, 0xb0, 0xd2, 0x11, 0x09,
+		    0xc8, 0x9a, 0x10, 0x0b, 0x5c, 0xe2, 0xc2, 0x08,
+		    0x83, 0x14, 0x9c, 0x69, 0xb5, 0x61, 0xdd, 0x88,
+		    0x29, 0x8a, 0x17, 0x98, 0xb1, 0x07, 0x16, 0xef,
+		    0x66, 0x3c, 0xea, 0x19, 0x0f, 0xfb, 0x83, 0xd8,
+		    0x95, 0x93, 0xf3, 0xf4, 0x76, 0xb6, 0xbc, 0x24,
+		    0xd7, 0xe6, 0x79, 0x10, 0x7e, 0xa2, 0x6a, 0xdb,
+		    0x8c, 0xaf, 0x66, 0x52, 0xd0, 0x65, 0x61, 0x36 },
+	.ilen	= 128,
+	.output	= { 0xff, 0xbc, 0xb9, 0xb3, 0x71, 0x42, 0x31, 0x52,
+		    0xd7, 0xfc, 0xa5, 0xad, 0x04, 0x2f, 0xba, 0xa9 },
+	.key	= { 0x12, 0x97, 0x6a, 0x08, 0xc4, 0x42, 0x6d, 0x0c,
+		    0xe8, 0xa8, 0x24, 0x07, 0xc4, 0xf4, 0x82, 0x07,
+		    0x80, 0xf8, 0xc2, 0x0a, 0xa7, 0x12, 0x02, 0xd1,
+		    0xe2, 0x91, 0x79, 0xcb, 0xcb, 0x55, 0x5a, 0x57 },
+}, {
+	.input	= { 0xab, 0x08, 0x12, 0x72, 0x4a, 0x7f, 0x1e, 0x34,
+		    0x27, 0x42, 0xcb, 0xed, 0x37, 0x4d, 0x94, 0xd1,
+		    0x36, 0xc6, 0xb8, 0x79, 0x5d, 0x45, 0xb3, 0x81,
+		    0x98, 0x30, 0xf2, 0xc0, 0x44, 0x91, 0xfa, 0xf0,
+		    0x99, 0x0c, 0x62, 0xe4, 0x8b, 0x80, 0x18, 0xb2,
+		    0xc3, 0xe4, 0xa0, 0xfa, 0x31, 0x34, 0xcb, 0x67,
+		    0xfa, 0x83, 0xe1, 0x58, 0xc9, 0x94, 0xd9, 0x61,
+		    0xc4, 0xcb, 0x21, 0x09, 0x5c, 0x1b, 0xf9, 0xaf,
+		    0x48, 0x44, 0x3d, 0x0b, 0xb0, 0xd2, 0x11, 0x09,
+		    0xc8, 0x9a, 0x10, 0x0b, 0x5c, 0xe2, 0xc2, 0x08,
+		    0x83, 0x14, 0x9c, 0x69, 0xb5, 0x61, 0xdd, 0x88,
+		    0x29, 0x8a, 0x17, 0x98, 0xb1, 0x07, 0x16, 0xef,
+		    0x66, 0x3c, 0xea, 0x19, 0x0f, 0xfb, 0x83, 0xd8,
+		    0x95, 0x93, 0xf3, 0xf4, 0x76, 0xb6, 0xbc, 0x24,
+		    0xd7, 0xe6, 0x79, 0x10, 0x7e, 0xa2, 0x6a, 0xdb,
+		    0x8c, 0xaf, 0x66, 0x52, 0xd0, 0x65, 0x61, 0x36,
+		    0x81, 0x20, 0x59, 0xa5, 0xda, 0x19, 0x86, 0x37,
+		    0xca, 0xc7, 0xc4, 0xa6, 0x31, 0xbe, 0xe4, 0x66 },
+	.ilen	= 144,
+	.output	= { 0x06, 0x9e, 0xd6, 0xb8, 0xef, 0x0f, 0x20, 0x7b,
+		    0x3e, 0x24, 0x3b, 0xb1, 0x01, 0x9f, 0xe6, 0x32 },
+	.key	= { 0x12, 0x97, 0x6a, 0x08, 0xc4, 0x42, 0x6d, 0x0c,
+		    0xe8, 0xa8, 0x24, 0x07, 0xc4, 0xf4, 0x82, 0x07,
+		    0x80, 0xf8, 0xc2, 0x0a, 0xa7, 0x12, 0x02, 0xd1,
+		    0xe2, 0x91, 0x79, 0xcb, 0xcb, 0x55, 0x5a, 0x57 },
+}, {
+	.input	= { 0xab, 0x08, 0x12, 0x72, 0x4a, 0x7f, 0x1e, 0x34,
+		    0x27, 0x42, 0xcb, 0xed, 0x37, 0x4d, 0x94, 0xd1,
+		    0x36, 0xc6, 0xb8, 0x79, 0x5d, 0x45, 0xb3, 0x81,
+		    0x98, 0x30, 0xf2, 0xc0, 0x44, 0x91, 0xfa, 0xf0,
+		    0x99, 0x0c, 0x62, 0xe4, 0x8b, 0x80, 0x18, 0xb2,
+		    0xc3, 0xe4, 0xa0, 0xfa, 0x31, 0x34, 0xcb, 0x67,
+		    0xfa, 0x83, 0xe1, 0x58, 0xc9, 0x94, 0xd9, 0x61,
+		    0xc4, 0xcb, 0x21, 0x09, 0x5c, 0x1b, 0xf9, 0xaf,
+		    0x48, 0x44, 0x3d, 0x0b, 0xb0, 0xd2, 0x11, 0x09,
+		    0xc8, 0x9a, 0x10, 0x0b, 0x5c, 0xe2, 0xc2, 0x08,
+		    0x83, 0x14, 0x9c, 0x69, 0xb5, 0x61, 0xdd, 0x88,
+		    0x29, 0x8a, 0x17, 0x98, 0xb1, 0x07, 0x16, 0xef,
+		    0x66, 0x3c, 0xea, 0x19, 0x0f, 0xfb, 0x83, 0xd8,
+		    0x95, 0x93, 0xf3, 0xf4, 0x76, 0xb6, 0xbc, 0x24,
+		    0xd7, 0xe6, 0x79, 0x10, 0x7e, 0xa2, 0x6a, 0xdb,
+		    0x8c, 0xaf, 0x66, 0x52, 0xd0, 0x65, 0x61, 0x36,
+		    0x81, 0x20, 0x59, 0xa5, 0xda, 0x19, 0x86, 0x37,
+		    0xca, 0xc7, 0xc4, 0xa6, 0x31, 0xbe, 0xe4, 0x66,
+		    0x5b, 0x88, 0xd7, 0xf6, 0x22, 0x8b, 0x11, 0xe2,
+		    0xe2, 0x85, 0x79, 0xa5, 0xc0, 0xc1, 0xf7, 0x61 },
+	.ilen	= 160,
+	.output	= { 0xcc, 0xa3, 0x39, 0xd9, 0xa4, 0x5f, 0xa2, 0x36,
+		    0x8c, 0x2c, 0x68, 0xb3, 0xa4, 0x17, 0x91, 0x33 },
+	.key	= { 0x12, 0x97, 0x6a, 0x08, 0xc4, 0x42, 0x6d, 0x0c,
+		    0xe8, 0xa8, 0x24, 0x07, 0xc4, 0xf4, 0x82, 0x07,
+		    0x80, 0xf8, 0xc2, 0x0a, 0xa7, 0x12, 0x02, 0xd1,
+		    0xe2, 0x91, 0x79, 0xcb, 0xcb, 0x55, 0x5a, 0x57 },
+}, {
+	.input	= { 0xab, 0x08, 0x12, 0x72, 0x4a, 0x7f, 0x1e, 0x34,
+		    0x27, 0x42, 0xcb, 0xed, 0x37, 0x4d, 0x94, 0xd1,
+		    0x36, 0xc6, 0xb8, 0x79, 0x5d, 0x45, 0xb3, 0x81,
+		    0x98, 0x30, 0xf2, 0xc0, 0x44, 0x91, 0xfa, 0xf0,
+		    0x99, 0x0c, 0x62, 0xe4, 0x8b, 0x80, 0x18, 0xb2,
+		    0xc3, 0xe4, 0xa0, 0xfa, 0x31, 0x34, 0xcb, 0x67,
+		    0xfa, 0x83, 0xe1, 0x58, 0xc9, 0x94, 0xd9, 0x61,
+		    0xc4, 0xcb, 0x21, 0x09, 0x5c, 0x1b, 0xf9, 0xaf,
+		    0x48, 0x44, 0x3d, 0x0b, 0xb0, 0xd2, 0x11, 0x09,
+		    0xc8, 0x9a, 0x10, 0x0b, 0x5c, 0xe2, 0xc2, 0x08,
+		    0x83, 0x14, 0x9c, 0x69, 0xb5, 0x61, 0xdd, 0x88,
+		    0x29, 0x8a, 0x17, 0x98, 0xb1, 0x07, 0x16, 0xef,
+		    0x66, 0x3c, 0xea, 0x19, 0x0f, 0xfb, 0x83, 0xd8,
+		    0x95, 0x93, 0xf3, 0xf4, 0x76, 0xb6, 0xbc, 0x24,
+		    0xd7, 0xe6, 0x79, 0x10, 0x7e, 0xa2, 0x6a, 0xdb,
+		    0x8c, 0xaf, 0x66, 0x52, 0xd0, 0x65, 0x61, 0x36,
+		    0x81, 0x20, 0x59, 0xa5, 0xda, 0x19, 0x86, 0x37,
+		    0xca, 0xc7, 0xc4, 0xa6, 0x31, 0xbe, 0xe4, 0x66,
+		    0x5b, 0x88, 0xd7, 0xf6, 0x22, 0x8b, 0x11, 0xe2,
+		    0xe2, 0x85, 0x79, 0xa5, 0xc0, 0xc1, 0xf7, 0x61,
+		    0xab, 0x08, 0x12, 0x72, 0x4a, 0x7f, 0x1e, 0x34,
+		    0x27, 0x42, 0xcb, 0xed, 0x37, 0x4d, 0x94, 0xd1,
+		    0x36, 0xc6, 0xb8, 0x79, 0x5d, 0x45, 0xb3, 0x81,
+		    0x98, 0x30, 0xf2, 0xc0, 0x44, 0x91, 0xfa, 0xf0,
+		    0x99, 0x0c, 0x62, 0xe4, 0x8b, 0x80, 0x18, 0xb2,
+		    0xc3, 0xe4, 0xa0, 0xfa, 0x31, 0x34, 0xcb, 0x67,
+		    0xfa, 0x83, 0xe1, 0x58, 0xc9, 0x94, 0xd9, 0x61,
+		    0xc4, 0xcb, 0x21, 0x09, 0x5c, 0x1b, 0xf9, 0xaf,
+		    0x48, 0x44, 0x3d, 0x0b, 0xb0, 0xd2, 0x11, 0x09,
+		    0xc8, 0x9a, 0x10, 0x0b, 0x5c, 0xe2, 0xc2, 0x08,
+		    0x83, 0x14, 0x9c, 0x69, 0xb5, 0x61, 0xdd, 0x88,
+		    0x29, 0x8a, 0x17, 0x98, 0xb1, 0x07, 0x16, 0xef,
+		    0x66, 0x3c, 0xea, 0x19, 0x0f, 0xfb, 0x83, 0xd8,
+		    0x95, 0x93, 0xf3, 0xf4, 0x76, 0xb6, 0xbc, 0x24,
+		    0xd7, 0xe6, 0x79, 0x10, 0x7e, 0xa2, 0x6a, 0xdb,
+		    0x8c, 0xaf, 0x66, 0x52, 0xd0, 0x65, 0x61, 0x36 },
+	.ilen	= 288,
+	.output	= { 0x53, 0xf6, 0xe8, 0x28, 0xa2, 0xf0, 0xfe, 0x0e,
+		    0xe8, 0x15, 0xbf, 0x0b, 0xd5, 0x84, 0x1a, 0x34 },
+	.key	= { 0x12, 0x97, 0x6a, 0x08, 0xc4, 0x42, 0x6d, 0x0c,
+		    0xe8, 0xa8, 0x24, 0x07, 0xc4, 0xf4, 0x82, 0x07,
+		    0x80, 0xf8, 0xc2, 0x0a, 0xa7, 0x12, 0x02, 0xd1,
+		    0xe2, 0x91, 0x79, 0xcb, 0xcb, 0x55, 0x5a, 0x57 },
+}, {
+	.input	= { 0xab, 0x08, 0x12, 0x72, 0x4a, 0x7f, 0x1e, 0x34,
+		    0x27, 0x42, 0xcb, 0xed, 0x37, 0x4d, 0x94, 0xd1,
+		    0x36, 0xc6, 0xb8, 0x79, 0x5d, 0x45, 0xb3, 0x81,
+		    0x98, 0x30, 0xf2, 0xc0, 0x44, 0x91, 0xfa, 0xf0,
+		    0x99, 0x0c, 0x62, 0xe4, 0x8b, 0x80, 0x18, 0xb2,
+		    0xc3, 0xe4, 0xa0, 0xfa, 0x31, 0x34, 0xcb, 0x67,
+		    0xfa, 0x83, 0xe1, 0x58, 0xc9, 0x94, 0xd9, 0x61,
+		    0xc4, 0xcb, 0x21, 0x09, 0x5c, 0x1b, 0xf9, 0xaf,
+		    0x48, 0x44, 0x3d, 0x0b, 0xb0, 0xd2, 0x11, 0x09,
+		    0xc8, 0x9a, 0x10, 0x0b, 0x5c, 0xe2, 0xc2, 0x08,
+		    0x83, 0x14, 0x9c, 0x69, 0xb5, 0x61, 0xdd, 0x88,
+		    0x29, 0x8a, 0x17, 0x98, 0xb1, 0x07, 0x16, 0xef,
+		    0x66, 0x3c, 0xea, 0x19, 0x0f, 0xfb, 0x83, 0xd8,
+		    0x95, 0x93, 0xf3, 0xf4, 0x76, 0xb6, 0xbc, 0x24,
+		    0xd7, 0xe6, 0x79, 0x10, 0x7e, 0xa2, 0x6a, 0xdb,
+		    0x8c, 0xaf, 0x66, 0x52, 0xd0, 0x65, 0x61, 0x36,
+		    0x81, 0x20, 0x59, 0xa5, 0xda, 0x19, 0x86, 0x37,
+		    0xca, 0xc7, 0xc4, 0xa6, 0x31, 0xbe, 0xe4, 0x66,
+		    0x5b, 0x88, 0xd7, 0xf6, 0x22, 0x8b, 0x11, 0xe2,
+		    0xe2, 0x85, 0x79, 0xa5, 0xc0, 0xc1, 0xf7, 0x61,
+		    0xab, 0x08, 0x12, 0x72, 0x4a, 0x7f, 0x1e, 0x34,
+		    0x27, 0x42, 0xcb, 0xed, 0x37, 0x4d, 0x94, 0xd1,
+		    0x36, 0xc6, 0xb8, 0x79, 0x5d, 0x45, 0xb3, 0x81,
+		    0x98, 0x30, 0xf2, 0xc0, 0x44, 0x91, 0xfa, 0xf0,
+		    0x99, 0x0c, 0x62, 0xe4, 0x8b, 0x80, 0x18, 0xb2,
+		    0xc3, 0xe4, 0xa0, 0xfa, 0x31, 0x34, 0xcb, 0x67,
+		    0xfa, 0x83, 0xe1, 0x58, 0xc9, 0x94, 0xd9, 0x61,
+		    0xc4, 0xcb, 0x21, 0x09, 0x5c, 0x1b, 0xf9, 0xaf,
+		    0x48, 0x44, 0x3d, 0x0b, 0xb0, 0xd2, 0x11, 0x09,
+		    0xc8, 0x9a, 0x10, 0x0b, 0x5c, 0xe2, 0xc2, 0x08,
+		    0x83, 0x14, 0x9c, 0x69, 0xb5, 0x61, 0xdd, 0x88,
+		    0x29, 0x8a, 0x17, 0x98, 0xb1, 0x07, 0x16, 0xef,
+		    0x66, 0x3c, 0xea, 0x19, 0x0f, 0xfb, 0x83, 0xd8,
+		    0x95, 0x93, 0xf3, 0xf4, 0x76, 0xb6, 0xbc, 0x24,
+		    0xd7, 0xe6, 0x79, 0x10, 0x7e, 0xa2, 0x6a, 0xdb,
+		    0x8c, 0xaf, 0x66, 0x52, 0xd0, 0x65, 0x61, 0x36,
+		    0x81, 0x20, 0x59, 0xa5, 0xda, 0x19, 0x86, 0x37,
+		    0xca, 0xc7, 0xc4, 0xa6, 0x31, 0xbe, 0xe4, 0x66,
+		    0x5b, 0x88, 0xd7, 0xf6, 0x22, 0x8b, 0x11, 0xe2,
+		    0xe2, 0x85, 0x79, 0xa5, 0xc0, 0xc1, 0xf7, 0x61 },
+	.ilen	= 320,
+	.output	= { 0xb8, 0x46, 0xd4, 0x4e, 0x9b, 0xbd, 0x53, 0xce,
+		    0xdf, 0xfb, 0xfb, 0xb6, 0xb7, 0xfa, 0x49, 0x33 },
+	.key	= { 0x12, 0x97, 0x6a, 0x08, 0xc4, 0x42, 0x6d, 0x0c,
+		    0xe8, 0xa8, 0x24, 0x07, 0xc4, 0xf4, 0x82, 0x07,
+		    0x80, 0xf8, 0xc2, 0x0a, 0xa7, 0x12, 0x02, 0xd1,
+		    0xe2, 0x91, 0x79, 0xcb, 0xcb, 0x55, 0x5a, 0x57 },
+}, { /* 4th power of the key spills to 131th bit in SIMD key setup */
+	.input	= { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+		    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+		    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+		    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+		    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+		    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+		    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+		    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+		    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+		    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+		    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+		    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+		    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+		    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+		    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+		    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+		    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+		    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+		    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+		    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+		    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+		    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+		    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+		    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+		    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+		    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+		    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+		    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+		    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+		    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+		    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+		    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
+	.ilen	= 256,
+	.output	= { 0x07, 0x14, 0x5a, 0x4c, 0x02, 0xfe, 0x5f, 0xa3,
+		    0x20, 0x36, 0xde, 0x68, 0xfa, 0xbe, 0x90, 0x66 },
+	.key	= { 0xad, 0x62, 0x81, 0x07, 0xe8, 0x35, 0x1d, 0x0f,
+		    0x2c, 0x23, 0x1a, 0x05, 0xdc, 0x4a, 0x41, 0x06,
+		    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+}, { /* OpenSSL's poly1305_ieee754.c failed this in final stage */
+	.input	= { 0x84, 0x23, 0x64, 0xe1, 0x56, 0x33, 0x6c, 0x09,
+		    0x98, 0xb9, 0x33, 0xa6, 0x23, 0x77, 0x26, 0x18,
+		    0x0d, 0x9e, 0x3f, 0xdc, 0xbd, 0xe4, 0xcd, 0x5d,
+		    0x17, 0x08, 0x0f, 0xc3, 0xbe, 0xb4, 0x96, 0x14,
+		    0xd7, 0x12, 0x2c, 0x03, 0x74, 0x63, 0xff, 0x10,
+		    0x4d, 0x73, 0xf1, 0x9c, 0x12, 0x70, 0x46, 0x28,
+		    0xd4, 0x17, 0xc4, 0xc5, 0x4a, 0x3f, 0xe3, 0x0d,
+		    0x3c, 0x3d, 0x77, 0x14, 0x38, 0x2d, 0x43, 0xb0,
+		    0x38, 0x2a, 0x50, 0xa5, 0xde, 0xe5, 0x4b, 0xe8,
+		    0x44, 0xb0, 0x76, 0xe8, 0xdf, 0x88, 0x20, 0x1a,
+		    0x1c, 0xd4, 0x3b, 0x90, 0xeb, 0x21, 0x64, 0x3f,
+		    0xa9, 0x6f, 0x39, 0xb5, 0x18, 0xaa, 0x83, 0x40,
+		    0xc9, 0x42, 0xff, 0x3c, 0x31, 0xba, 0xf7, 0xc9,
+		    0xbd, 0xbf, 0x0f, 0x31, 0xae, 0x3f, 0xa0, 0x96,
+		    0xbf, 0x8c, 0x63, 0x03, 0x06, 0x09, 0x82, 0x9f,
+		    0xe7, 0x2e, 0x17, 0x98, 0x24, 0x89, 0x0b, 0xc8,
+		    0xe0, 0x8c, 0x31, 0x5c, 0x1c, 0xce, 0x2a, 0x83,
+		    0x14, 0x4d, 0xbb, 0xff, 0x09, 0xf7, 0x4e, 0x3e,
+		    0xfc, 0x77, 0x0b, 0x54, 0xd0, 0x98, 0x4a, 0x8f,
+		    0x19, 0xb1, 0x47, 0x19, 0xe6, 0x36, 0x35, 0x64,
+		    0x1d, 0x6b, 0x1e, 0xed, 0xf6, 0x3e, 0xfb, 0xf0,
+		    0x80, 0xe1, 0x78, 0x3d, 0x32, 0x44, 0x54, 0x12,
+		    0x11, 0x4c, 0x20, 0xde, 0x0b, 0x83, 0x7a, 0x0d,
+		    0xfa, 0x33, 0xd6, 0xb8, 0x28, 0x25, 0xff, 0xf4,
+		    0x4c, 0x9a, 0x70, 0xea, 0x54, 0xce, 0x47, 0xf0,
+		    0x7d, 0xf6, 0x98, 0xe6, 0xb0, 0x33, 0x23, 0xb5,
+		    0x30, 0x79, 0x36, 0x4a, 0x5f, 0xc3, 0xe9, 0xdd,
+		    0x03, 0x43, 0x92, 0xbd, 0xde, 0x86, 0xdc, 0xcd,
+		    0xda, 0x94, 0x32, 0x1c, 0x5e, 0x44, 0x06, 0x04,
+		    0x89, 0x33, 0x6c, 0xb6, 0x5b, 0xf3, 0x98, 0x9c,
+		    0x36, 0xf7, 0x28, 0x2c, 0x2f, 0x5d, 0x2b, 0x88,
+		    0x2c, 0x17, 0x1e, 0x74 },
+	.ilen	= 252,
+	.output	= { 0xf2, 0x48, 0x31, 0x2e, 0x57, 0x8d, 0x9d, 0x58,
+		    0xf8, 0xb7, 0xbb, 0x4d, 0x19, 0x10, 0x54, 0x31 },
+	.key	= { 0x95, 0xd5, 0xc0, 0x05, 0x50, 0x3e, 0x51, 0x0d,
+		    0x8c, 0xd0, 0xaa, 0x07, 0x2c, 0x4a, 0x4d, 0x06,
+		    0x6e, 0xab, 0xc5, 0x2d, 0x11, 0x65, 0x3d, 0xf4,
+		    0x7f, 0xbf, 0x63, 0xab, 0x19, 0x8b, 0xcc, 0x26 },
+}, { /* AVX2 in OpenSSL's poly1305-x86.pl failed this with 176+32 split */
+	.input	= { 0x24, 0x8a, 0xc3, 0x10, 0x85, 0xb6, 0xc2, 0xad,
+		    0xaa, 0xa3, 0x82, 0x59, 0xa0, 0xd7, 0x19, 0x2c,
+		    0x5c, 0x35, 0xd1, 0xbb, 0x4e, 0xf3, 0x9a, 0xd9,
+		    0x4c, 0x38, 0xd1, 0xc8, 0x24, 0x79, 0xe2, 0xdd,
+		    0x21, 0x59, 0xa0, 0x77, 0x02, 0x4b, 0x05, 0x89,
+		    0xbc, 0x8a, 0x20, 0x10, 0x1b, 0x50, 0x6f, 0x0a,
+		    0x1a, 0xd0, 0xbb, 0xab, 0x76, 0xe8, 0x3a, 0x83,
+		    0xf1, 0xb9, 0x4b, 0xe6, 0xbe, 0xae, 0x74, 0xe8,
+		    0x74, 0xca, 0xb6, 0x92, 0xc5, 0x96, 0x3a, 0x75,
+		    0x43, 0x6b, 0x77, 0x61, 0x21, 0xec, 0x9f, 0x62,
+		    0x39, 0x9a, 0x3e, 0x66, 0xb2, 0xd2, 0x27, 0x07,
+		    0xda, 0xe8, 0x19, 0x33, 0xb6, 0x27, 0x7f, 0x3c,
+		    0x85, 0x16, 0xbc, 0xbe, 0x26, 0xdb, 0xbd, 0x86,
+		    0xf3, 0x73, 0x10, 0x3d, 0x7c, 0xf4, 0xca, 0xd1,
+		    0x88, 0x8c, 0x95, 0x21, 0x18, 0xfb, 0xfb, 0xd0,
+		    0xd7, 0xb4, 0xbe, 0xdc, 0x4a, 0xe4, 0x93, 0x6a,
+		    0xff, 0x91, 0x15, 0x7e, 0x7a, 0xa4, 0x7c, 0x54,
+		    0x44, 0x2e, 0xa7, 0x8d, 0x6a, 0xc2, 0x51, 0xd3,
+		    0x24, 0xa0, 0xfb, 0xe4, 0x9d, 0x89, 0xcc, 0x35,
+		    0x21, 0xb6, 0x6d, 0x16, 0xe9, 0xc6, 0x6a, 0x37,
+		    0x09, 0x89, 0x4e, 0x4e, 0xb0, 0xa4, 0xee, 0xdc,
+		    0x4a, 0xe1, 0x94, 0x68, 0xe6, 0x6b, 0x81, 0xf2,
+		    0x71, 0x35, 0x1b, 0x1d, 0x92, 0x1e, 0xa5, 0x51,
+		    0x04, 0x7a, 0xbc, 0xc6, 0xb8, 0x7a, 0x90, 0x1f,
+		    0xde, 0x7d, 0xb7, 0x9f, 0xa1, 0x81, 0x8c, 0x11,
+		    0x33, 0x6d, 0xbc, 0x07, 0x24, 0x4a, 0x40, 0xeb },
+	.ilen	= 208,
+	.output	= { 0xbc, 0x93, 0x9b, 0xc5, 0x28, 0x14, 0x80, 0xfa,
+		    0x99, 0xc6, 0xd6, 0x8c, 0x25, 0x8e, 0xc4, 0x2f },
+	.key	= { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+		    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+		    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+}, { /* test vectors from Google */
+	.input	= "",
+	.ilen	= 0,
+	.output	= { 0x47, 0x10, 0x13, 0x0e, 0x9f, 0x6f, 0xea, 0x8d,
+		    0x72, 0x29, 0x38, 0x50, 0xa6, 0x67, 0xd8, 0x6c },
+	.key	= { 0xc8, 0xaf, 0xaa, 0xc3, 0x31, 0xee, 0x37, 0x2c,
+		    0xd6, 0x08, 0x2d, 0xe1, 0x34, 0x94, 0x3b, 0x17,
+		    0x47, 0x10, 0x13, 0x0e, 0x9f, 0x6f, 0xea, 0x8d,
+		    0x72, 0x29, 0x38, 0x50, 0xa6, 0x67, 0xd8, 0x6c },
+}, {
+	.input	= { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f,
+		    0x72, 0x6c, 0x64, 0x21 },
+	.ilen	= 12,
+	.output	= { 0xa6, 0xf7, 0x45, 0x00, 0x8f, 0x81, 0xc9, 0x16,
+		    0xa2, 0x0d, 0xcc, 0x74, 0xee, 0xf2, 0xb2, 0xf0 },
+	.key	= { 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20,
+		    0x33, 0x32, 0x2d, 0x62, 0x79, 0x74, 0x65, 0x20,
+		    0x6b, 0x65, 0x79, 0x20, 0x66, 0x6f, 0x72, 0x20,
+		    0x50, 0x6f, 0x6c, 0x79, 0x31, 0x33, 0x30, 0x35 },
+}, {
+	.input	= { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+	.ilen	= 32,
+	.output	= { 0x49, 0xec, 0x78, 0x09, 0x0e, 0x48, 0x1e, 0xc6,
+		    0xc2, 0x6b, 0x33, 0xb9, 0x1c, 0xcc, 0x03, 0x07 },
+	.key	= { 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20,
+		    0x33, 0x32, 0x2d, 0x62, 0x79, 0x74, 0x65, 0x20,
+		    0x6b, 0x65, 0x79, 0x20, 0x66, 0x6f, 0x72, 0x20,
+		    0x50, 0x6f, 0x6c, 0x79, 0x31, 0x33, 0x30, 0x35 },
+}, {
+	.input	= { 0x89, 0xda, 0xb8, 0x0b, 0x77, 0x17, 0xc1, 0xdb,
+		    0x5d, 0xb4, 0x37, 0x86, 0x0a, 0x3f, 0x70, 0x21,
+		    0x8e, 0x93, 0xe1, 0xb8, 0xf4, 0x61, 0xfb, 0x67,
+		    0x7f, 0x16, 0xf3, 0x5f, 0x6f, 0x87, 0xe2, 0xa9,
+		    0x1c, 0x99, 0xbc, 0x3a, 0x47, 0xac, 0xe4, 0x76,
+		    0x40, 0xcc, 0x95, 0xc3, 0x45, 0xbe, 0x5e, 0xcc,
+		    0xa5, 0xa3, 0x52, 0x3c, 0x35, 0xcc, 0x01, 0x89,
+		    0x3a, 0xf0, 0xb6, 0x4a, 0x62, 0x03, 0x34, 0x27,
+		    0x03, 0x72, 0xec, 0x12, 0x48, 0x2d, 0x1b, 0x1e,
+		    0x36, 0x35, 0x61, 0x69, 0x8a, 0x57, 0x8b, 0x35,
+		    0x98, 0x03, 0x49, 0x5b, 0xb4, 0xe2, 0xef, 0x19,
+		    0x30, 0xb1, 0x7a, 0x51, 0x90, 0xb5, 0x80, 0xf1,
+		    0x41, 0x30, 0x0d, 0xf3, 0x0a, 0xdb, 0xec, 0xa2,
+		    0x8f, 0x64, 0x27, 0xa8, 0xbc, 0x1a, 0x99, 0x9f,
+		    0xd5, 0x1c, 0x55, 0x4a, 0x01, 0x7d, 0x09, 0x5d,
+		    0x8c, 0x3e, 0x31, 0x27, 0xda, 0xf9, 0xf5, 0x95 },
+	.ilen	= 128,
+	.output	= { 0xc8, 0x5d, 0x15, 0xed, 0x44, 0xc3, 0x78, 0xd6,
+		    0xb0, 0x0e, 0x23, 0x06, 0x4c, 0x7b, 0xcd, 0x51 },
+	.key	= { 0x2d, 0x77, 0x3b, 0xe3, 0x7a, 0xdb, 0x1e, 0x4d,
+		    0x68, 0x3b, 0xf0, 0x07, 0x5e, 0x79, 0xc4, 0xee,
+		    0x03, 0x79, 0x18, 0x53, 0x5a, 0x7f, 0x99, 0xcc,
+		    0xb7, 0x04, 0x0f, 0xb5, 0xf5, 0xf4, 0x3a, 0xea },
+}, {
+	.input	= { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0b,
+		    0x17, 0x03, 0x03, 0x02, 0x00, 0x00, 0x00, 0x00,
+		    0x06, 0xdb, 0x1f, 0x1f, 0x36, 0x8d, 0x69, 0x6a,
+		    0x81, 0x0a, 0x34, 0x9c, 0x0c, 0x71, 0x4c, 0x9a,
+		    0x5e, 0x78, 0x50, 0xc2, 0x40, 0x7d, 0x72, 0x1a,
+		    0xcd, 0xed, 0x95, 0xe0, 0x18, 0xd7, 0xa8, 0x52,
+		    0x66, 0xa6, 0xe1, 0x28, 0x9c, 0xdb, 0x4a, 0xeb,
+		    0x18, 0xda, 0x5a, 0xc8, 0xa2, 0xb0, 0x02, 0x6d,
+		    0x24, 0xa5, 0x9a, 0xd4, 0x85, 0x22, 0x7f, 0x3e,
+		    0xae, 0xdb, 0xb2, 0xe7, 0xe3, 0x5e, 0x1c, 0x66,
+		    0xcd, 0x60, 0xf9, 0xab, 0xf7, 0x16, 0xdc, 0xc9,
+		    0xac, 0x42, 0x68, 0x2d, 0xd7, 0xda, 0xb2, 0x87,
+		    0xa7, 0x02, 0x4c, 0x4e, 0xef, 0xc3, 0x21, 0xcc,
+		    0x05, 0x74, 0xe1, 0x67, 0x93, 0xe3, 0x7c, 0xec,
+		    0x03, 0xc5, 0xbd, 0xa4, 0x2b, 0x54, 0xc1, 0x14,
+		    0xa8, 0x0b, 0x57, 0xaf, 0x26, 0x41, 0x6c, 0x7b,
+		    0xe7, 0x42, 0x00, 0x5e, 0x20, 0x85, 0x5c, 0x73,
+		    0xe2, 0x1d, 0xc8, 0xe2, 0xed, 0xc9, 0xd4, 0x35,
+		    0xcb, 0x6f, 0x60, 0x59, 0x28, 0x00, 0x11, 0xc2,
+		    0x70, 0xb7, 0x15, 0x70, 0x05, 0x1c, 0x1c, 0x9b,
+		    0x30, 0x52, 0x12, 0x66, 0x20, 0xbc, 0x1e, 0x27,
+		    0x30, 0xfa, 0x06, 0x6c, 0x7a, 0x50, 0x9d, 0x53,
+		    0xc6, 0x0e, 0x5a, 0xe1, 0xb4, 0x0a, 0xa6, 0xe3,
+		    0x9e, 0x49, 0x66, 0x92, 0x28, 0xc9, 0x0e, 0xec,
+		    0xb4, 0xa5, 0x0d, 0xb3, 0x2a, 0x50, 0xbc, 0x49,
+		    0xe9, 0x0b, 0x4f, 0x4b, 0x35, 0x9a, 0x1d, 0xfd,
+		    0x11, 0x74, 0x9c, 0xd3, 0x86, 0x7f, 0xcf, 0x2f,
+		    0xb7, 0xbb, 0x6c, 0xd4, 0x73, 0x8f, 0x6a, 0x4a,
+		    0xd6, 0xf7, 0xca, 0x50, 0x58, 0xf7, 0x61, 0x88,
+		    0x45, 0xaf, 0x9f, 0x02, 0x0f, 0x6c, 0x3b, 0x96,
+		    0x7b, 0x8f, 0x4c, 0xd4, 0xa9, 0x1e, 0x28, 0x13,
+		    0xb5, 0x07, 0xae, 0x66, 0xf2, 0xd3, 0x5c, 0x18,
+		    0x28, 0x4f, 0x72, 0x92, 0x18, 0x60, 0x62, 0xe1,
+		    0x0f, 0xd5, 0x51, 0x0d, 0x18, 0x77, 0x53, 0x51,
+		    0xef, 0x33, 0x4e, 0x76, 0x34, 0xab, 0x47, 0x43,
+		    0xf5, 0xb6, 0x8f, 0x49, 0xad, 0xca, 0xb3, 0x84,
+		    0xd3, 0xfd, 0x75, 0xf7, 0x39, 0x0f, 0x40, 0x06,
+		    0xef, 0x2a, 0x29, 0x5c, 0x8c, 0x7a, 0x07, 0x6a,
+		    0xd5, 0x45, 0x46, 0xcd, 0x25, 0xd2, 0x10, 0x7f,
+		    0xbe, 0x14, 0x36, 0xc8, 0x40, 0x92, 0x4a, 0xae,
+		    0xbe, 0x5b, 0x37, 0x08, 0x93, 0xcd, 0x63, 0xd1,
+		    0x32, 0x5b, 0x86, 0x16, 0xfc, 0x48, 0x10, 0x88,
+		    0x6b, 0xc1, 0x52, 0xc5, 0x32, 0x21, 0xb6, 0xdf,
+		    0x37, 0x31, 0x19, 0x39, 0x32, 0x55, 0xee, 0x72,
+		    0xbc, 0xaa, 0x88, 0x01, 0x74, 0xf1, 0x71, 0x7f,
+		    0x91, 0x84, 0xfa, 0x91, 0x64, 0x6f, 0x17, 0xa2,
+		    0x4a, 0xc5, 0x5d, 0x16, 0xbf, 0xdd, 0xca, 0x95,
+		    0x81, 0xa9, 0x2e, 0xda, 0x47, 0x92, 0x01, 0xf0,
+		    0xed, 0xbf, 0x63, 0x36, 0x00, 0xd6, 0x06, 0x6d,
+		    0x1a, 0xb3, 0x6d, 0x5d, 0x24, 0x15, 0xd7, 0x13,
+		    0x51, 0xbb, 0xcd, 0x60, 0x8a, 0x25, 0x10, 0x8d,
+		    0x25, 0x64, 0x19, 0x92, 0xc1, 0xf2, 0x6c, 0x53,
+		    0x1c, 0xf9, 0xf9, 0x02, 0x03, 0xbc, 0x4c, 0xc1,
+		    0x9f, 0x59, 0x27, 0xd8, 0x34, 0xb0, 0xa4, 0x71,
+		    0x16, 0xd3, 0x88, 0x4b, 0xbb, 0x16, 0x4b, 0x8e,
+		    0xc8, 0x83, 0xd1, 0xac, 0x83, 0x2e, 0x56, 0xb3,
+		    0x91, 0x8a, 0x98, 0x60, 0x1a, 0x08, 0xd1, 0x71,
+		    0x88, 0x15, 0x41, 0xd5, 0x94, 0xdb, 0x39, 0x9c,
+		    0x6a, 0xe6, 0x15, 0x12, 0x21, 0x74, 0x5a, 0xec,
+		    0x81, 0x4c, 0x45, 0xb0, 0xb0, 0x5b, 0x56, 0x54,
+		    0x36, 0xfd, 0x6f, 0x13, 0x7a, 0xa1, 0x0a, 0x0c,
+		    0x0b, 0x64, 0x37, 0x61, 0xdb, 0xd6, 0xf9, 0xa9,
+		    0xdc, 0xb9, 0x9b, 0x1a, 0x6e, 0x69, 0x08, 0x54,
+		    0xce, 0x07, 0x69, 0xcd, 0xe3, 0x97, 0x61, 0xd8,
+		    0x2f, 0xcd, 0xec, 0x15, 0xf0, 0xd9, 0x2d, 0x7d,
+		    0x8e, 0x94, 0xad, 0xe8, 0xeb, 0x83, 0xfb, 0xe0 },
+	.ilen	= 528,
+	.output	= { 0x26, 0x37, 0x40, 0x8f, 0xe1, 0x30, 0x86, 0xea,
+		    0x73, 0xf9, 0x71, 0xe3, 0x42, 0x5e, 0x28, 0x20 },
+	.key	= { 0x99, 0xe5, 0x82, 0x2d, 0xd4, 0x17, 0x3c, 0x99,
+		    0x5e, 0x3d, 0xae, 0x0d, 0xde, 0xfb, 0x97, 0x74,
+		    0x3f, 0xde, 0x3b, 0x08, 0x01, 0x34, 0xb3, 0x9f,
+		    0x76, 0xe9, 0xbf, 0x8d, 0x0e, 0x88, 0xd5, 0x46 },
+}, { /* test vectors from Hanno Böck */
+	.input	= { 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc,
+		    0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc,
+		    0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc,
+		    0xcc, 0x80, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc,
+		    0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc,
+		    0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc,
+		    0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc,
+		    0xcc, 0xcc, 0xcc, 0xcc, 0xce, 0xcc, 0xcc, 0xcc,
+		    0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc,
+		    0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xc5,
+		    0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc,
+		    0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc,
+		    0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xe3, 0xcc, 0xcc,
+		    0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc,
+		    0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc,
+		    0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc,
+		    0xcc, 0xcc, 0xcc, 0xcc, 0xac, 0xcc, 0xcc, 0xcc,
+		    0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xe6,
+		    0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0x00, 0x00, 0x00,
+		    0xaf, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc,
+		    0xcc, 0xcc, 0xff, 0xff, 0xff, 0xf5, 0x00, 0x00,
+		    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0x00, 0xff, 0xff, 0xff, 0xe7, 0x00, 0x00, 0x00,
+		    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0x00, 0x00, 0x71, 0x92, 0x05, 0xa8, 0x52, 0x1d,
+		    0xfc },
+	.ilen	= 257,
+	.output	= { 0x85, 0x59, 0xb8, 0x76, 0xec, 0xee, 0xd6, 0x6e,
+		    0xb3, 0x77, 0x98, 0xc0, 0x45, 0x7b, 0xaf, 0xf9 },
+	.key	= { 0x7f, 0x1b, 0x02, 0x64, 0x00, 0x00, 0x00, 0x00,
+		    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc },
+}, {
+	.input	= { 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa,
+		    0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa,
+		    0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa,
+		    0xaa, 0xaa, 0xaa, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0x00, 0x00, 0x00, 0x00, 0x80, 0x02, 0x64 },
+	.ilen	= 39,
+	.output	= { 0x00, 0xbd, 0x12, 0x58, 0x97, 0x8e, 0x20, 0x54,
+		    0x44, 0xc9, 0xaa, 0xaa, 0x82, 0x00, 0x6f, 0xed },
+	.key	= { 0xe0, 0x00, 0x16, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0x00, 0x00, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa,
+		    0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa },
+}, {
+	.input	= { 0x02, 0xfc },
+	.ilen	= 2,
+	.output	= { 0x06, 0x12, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c,
+		    0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c },
+	.key	= { 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c,
+		    0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c,
+		    0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c,
+		    0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c },
+}, {
+	.input	= { 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+		    0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+		    0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+		    0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+		    0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7a, 0x7b,
+		    0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+		    0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+		    0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+		    0x7b, 0x7b, 0x5c, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+		    0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+		    0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+		    0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+		    0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+		    0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+		    0x7b, 0x7b, 0x7b, 0x7b, 0x6e, 0x7b, 0x00, 0x7b,
+		    0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+		    0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+		    0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+		    0x7b, 0x7b, 0x7b, 0x7a, 0x7b, 0x7b, 0x7b, 0x7b,
+		    0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+		    0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+		    0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x5c,
+		    0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+		    0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+		    0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+		    0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+		    0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+		    0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+		    0x7b, 0x6e, 0x7b, 0x00, 0x13, 0x00, 0x00, 0x00,
+		    0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0xf2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0x00, 0x00, 0x00, 0x20, 0x00, 0xef, 0xff, 0x00,
+		    0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00,
+		    0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x64, 0x00,
+		    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x00,
+		    0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf2,
+		    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0x00, 0x00, 0x20, 0x00, 0xef, 0xff, 0x00, 0x09,
+		    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0x00, 0x7a, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00,
+		    0x00, 0x09, 0x00, 0x00, 0x00, 0x64, 0x00, 0x00,
+		    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfc },
+	.ilen	= 415,
+	.output	= { 0x33, 0x20, 0x5b, 0xbf, 0x9e, 0x9f, 0x8f, 0x72,
+		    0x12, 0xab, 0x9e, 0x2a, 0xb9, 0xb7, 0xe4, 0xa5 },
+	.key	= { 0x00, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0x00, 0x00, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00,
+		    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7b, 0x7b },
+}, {
+	.input	= { 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77,
+		    0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77,
+		    0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77,
+		    0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77,
+		    0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77,
+		    0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77,
+		    0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77,
+		    0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77,
+		    0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77,
+		    0x77, 0x77, 0x77, 0x77, 0xff, 0xff, 0xff, 0xe9,
+		    0xe9, 0xac, 0xac, 0xac, 0xac, 0xac, 0xac, 0xac,
+		    0xac, 0xac, 0xac, 0xac, 0x00, 0x00, 0xac, 0xac,
+		    0xec, 0x01, 0x00, 0xac, 0xac, 0xac, 0x2c, 0xac,
+		    0xa2, 0xac, 0xac, 0xac, 0xac, 0xac, 0xac, 0xac,
+		    0xac, 0xac, 0xac, 0xac, 0x64, 0xf2 },
+	.ilen	= 118,
+	.output	= { 0x02, 0xee, 0x7c, 0x8c, 0x54, 0x6d, 0xde, 0xb1,
+		    0xa4, 0x67, 0xe4, 0xc3, 0x98, 0x11, 0x58, 0xb9 },
+	.key	= { 0x00, 0x00, 0x00, 0x7f, 0x00, 0x00, 0x00, 0x7f,
+		    0x01, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x00,
+		    0x00, 0x00, 0xcf, 0x77, 0x77, 0x77, 0x77, 0x77,
+		    0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77 },
+}, { /* nacl */
+	.input	= { 0x8e, 0x99, 0x3b, 0x9f, 0x48, 0x68, 0x12, 0x73,
+		    0xc2, 0x96, 0x50, 0xba, 0x32, 0xfc, 0x76, 0xce,
+		    0x48, 0x33, 0x2e, 0xa7, 0x16, 0x4d, 0x96, 0xa4,
+		    0x47, 0x6f, 0xb8, 0xc5, 0x31, 0xa1, 0x18, 0x6a,
+		    0xc0, 0xdf, 0xc1, 0x7c, 0x98, 0xdc, 0xe8, 0x7b,
+		    0x4d, 0xa7, 0xf0, 0x11, 0xec, 0x48, 0xc9, 0x72,
+		    0x71, 0xd2, 0xc2, 0x0f, 0x9b, 0x92, 0x8f, 0xe2,
+		    0x27, 0x0d, 0x6f, 0xb8, 0x63, 0xd5, 0x17, 0x38,
+		    0xb4, 0x8e, 0xee, 0xe3, 0x14, 0xa7, 0xcc, 0x8a,
+		    0xb9, 0x32, 0x16, 0x45, 0x48, 0xe5, 0x26, 0xae,
+		    0x90, 0x22, 0x43, 0x68, 0x51, 0x7a, 0xcf, 0xea,
+		    0xbd, 0x6b, 0xb3, 0x73, 0x2b, 0xc0, 0xe9, 0xda,
+		    0x99, 0x83, 0x2b, 0x61, 0xca, 0x01, 0xb6, 0xde,
+		    0x56, 0x24, 0x4a, 0x9e, 0x88, 0xd5, 0xf9, 0xb3,
+		    0x79, 0x73, 0xf6, 0x22, 0xa4, 0x3d, 0x14, 0xa6,
+		    0x59, 0x9b, 0x1f, 0x65, 0x4c, 0xb4, 0x5a, 0x74,
+		    0xe3, 0x55, 0xa5 },
+	.ilen	= 131,
+	.output	= { 0xf3, 0xff, 0xc7, 0x70, 0x3f, 0x94, 0x00, 0xe5,
+		    0x2a, 0x7d, 0xfb, 0x4b, 0x3d, 0x33, 0x05, 0xd9 },
+	.key	= { 0xee, 0xa6, 0xa7, 0x25, 0x1c, 0x1e, 0x72, 0x91,
+		    0x6d, 0x11, 0xc2, 0xcb, 0x21, 0x4d, 0x3c, 0x25,
+		    0x25, 0x39, 0x12, 0x1d, 0x8e, 0x23, 0x4e, 0x65,
+		    0x2d, 0x65, 0x1f, 0xa4, 0xc8, 0xcf, 0xf8, 0x80 },
+}, { /* wrap 2^130-5 */
+	.input	= { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+		    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
+	.ilen	= 16,
+	.output	= { 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+	.key	= { 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+}, { /* wrap 2^128 */
+	.input	= { 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+	.ilen	= 16,
+	.output	= { 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+	.key	= { 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+		    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
+}, { /* limb carry */
+	.input	= { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+		    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+		    0xf0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+		    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+		    0x11, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+	.ilen	= 48,
+	.output	= { 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+	.key	= { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+}, { /* 2^130-5 */
+	.input	= { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+		    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+		    0xfb, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe,
+		    0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe,
+		    0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+		    0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 },
+	.ilen	= 48,
+	.output	= { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+	.key	= { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+}, { /* 2^130-6 */
+	.input	= { 0xfd, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+		    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
+	.ilen	= 16,
+	.output	= { 0xfa, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+		    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
+	.key	= { 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+}, { /* 5*H+L reduction intermediate */
+	.input	= { 0xe3, 0x35, 0x94, 0xd7, 0x50, 0x5e, 0x43, 0xb9,
+		    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0x33, 0x94, 0xd7, 0x50, 0x5e, 0x43, 0x79, 0xcd,
+		    0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+	.ilen	= 64,
+	.output	= { 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0x55, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+	.key	= { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+}, { /* 5*H+L reduction final */
+	.input	= { 0xe3, 0x35, 0x94, 0xd7, 0x50, 0x5e, 0x43, 0xb9,
+		    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0x33, 0x94, 0xd7, 0x50, 0x5e, 0x43, 0x79, 0xcd,
+		    0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+	.ilen	= 48,
+	.output	= { 0x13, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+	.key	= { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+} };
+
+/*
+ * Run every Poly1305 test vector once in a single update and then again with
+ * the input split in two at each offset j in [1, ilen - 2]. Mismatches are
+ * logged at error level. Returns true only if every computed MAC matched.
+ */
+bool __init poly1305_selftest(void)
+{
+	simd_context_t simd_context = simd_get();
+	bool success = true;
+	size_t i, j;
+
+	for (i = 0; i < ARRAY_SIZE(poly1305_testvecs); ++i) {
+		typeof(&poly1305_testvecs[0]) tv = &poly1305_testvecs[i];
+		struct poly1305_ctx poly1305;
+		u8 out[POLY1305_MAC_SIZE];
+
+		memset(out, 0, sizeof(out));
+		memset(&poly1305, 0, sizeof(poly1305));
+		poly1305_init(&poly1305, tv->key, simd_context);
+		poly1305_update(&poly1305, tv->input, tv->ilen, simd_context);
+		poly1305_final(&poly1305, out, simd_context);
+		if (memcmp(out, tv->output, POLY1305_MAC_SIZE)) {
+			pr_err("poly1305 self-test %zu: FAIL\n", i + 1);
+			success = false;
+		}
+		simd_context = simd_relax(simd_context);
+
+		/* Inputs of length <= 1 have no interior split point. */
+		if (tv->ilen <= 1)
+			continue;
+
+		for (j = 1; j < tv->ilen - 1; ++j) {
+			memset(out, 0, sizeof(out));
+			memset(&poly1305, 0, sizeof(poly1305));
+			poly1305_init(&poly1305, tv->key, simd_context);
+			poly1305_update(&poly1305, tv->input, j, simd_context);
+			poly1305_update(&poly1305, tv->input + j, tv->ilen - j,
+					simd_context);
+			poly1305_final(&poly1305, out, simd_context);
+			if (memcmp(out, tv->output, POLY1305_MAC_SIZE)) {
+				pr_err("poly1305 self-test %zu (split %zu): FAIL\n",
+				       i + 1, j);
+				success = false;
+			}
+			/* Carry on with whatever context simd_relax() returns. */
+			simd_context = simd_relax(simd_context);
+		}
+	}
+	simd_put(simd_context);
+
+	if (success)
+		pr_info("poly1305 self-tests: pass\n");
+
+	return success;
+}
+#endif
-- 
2.19.0

^ permalink raw reply related

* [PATCH net-next v4 06/20] zinc: ChaCha20 MIPS32r2 implementation
From: Jason A. Donenfeld @ 2018-09-14 16:22 UTC (permalink / raw)
  To: linux-kernel, netdev, linux-crypto, davem, gregkh
  Cc: Jason A. Donenfeld, René van Dorst, Samuel Neves,
	Andy Lutomirski, Jean-Philippe Aumasson, Ralf Baechle,
	Paul Burton, James Hogan, linux-mips
In-Reply-To: <20180914162240.7925-1-Jason@zx2c4.com>

This MIPS32r2 implementation comes from René van Dorst and me and
results in a nice speedup on the usual OpenWRT targets.

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: René van Dorst <opensource@vdorst.com>
Cc: Samuel Neves <sneves@dei.uc.pt>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Jean-Philippe Aumasson <jeanphilippe.aumasson@gmail.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Paul Burton <paul.burton@mips.com>
Cc: James Hogan <jhogan@kernel.org>
Cc: linux-mips@linux-mips.org
---
 lib/zinc/Makefile                      |   4 +
 lib/zinc/chacha20/chacha20-mips-glue.h |  28 ++
 lib/zinc/chacha20/chacha20-mips.S      | 474 +++++++++++++++++++++++++
 3 files changed, 506 insertions(+)
 create mode 100644 lib/zinc/chacha20/chacha20-mips-glue.h
 create mode 100644 lib/zinc/chacha20/chacha20-mips.S

diff --git a/lib/zinc/Makefile b/lib/zinc/Makefile
index 32e4bd94ea0b..9f6a5e65d729 100644
--- a/lib/zinc/Makefile
+++ b/lib/zinc/Makefile
@@ -17,6 +17,10 @@ ifeq ($(CONFIG_ZINC_ARCH_ARM64),y)
 zinc-y += chacha20/chacha20-arm64.o
 CFLAGS_chacha20.o += -include $(srctree)/$(src)/chacha20/chacha20-arm-glue.h
 endif
+ifeq ($(CONFIG_ZINC_ARCH_MIPS)$(CONFIG_CPU_MIPS32_R2),yy)
+zinc-y += chacha20/chacha20-mips.o
+CFLAGS_chacha20.o += -include $(srctree)/$(src)/chacha20/chacha20-mips-glue.h
+endif
 endif
 
 zinc-y += main.o
diff --git a/lib/zinc/chacha20/chacha20-mips-glue.h b/lib/zinc/chacha20/chacha20-mips-glue.h
new file mode 100644
index 000000000000..5b2c8cec36c8
--- /dev/null
+++ b/lib/zinc/chacha20/chacha20-mips-glue.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <zinc/chacha20.h>
+
+asmlinkage void chacha20_mips(u8 *out, const u8 *in, const size_t len,
+			      const u32 key[8], const u32 counter[4]);
+void __init chacha20_fpu_init(void)
+{	/* Empty body: no work is done here for the MIPS implementation. */
+}
+
+static inline bool chacha20_arch(u8 *dst, const u8 *src, const size_t len,
+				 const u32 key[8], const u32 counter[4],
+				 simd_context_t simd_context)
+{	/* Always hands the request to the assembly routine; simd_context is not read. */
+	chacha20_mips(dst, src, len, key, counter);
+	return true;	/* unconditional: chacha20_mips() was called for every input */
+}
+
+static inline bool hchacha20_arch(u8 *derived_key, const u8 *nonce,
+				  const u8 *key, simd_context_t simd_context)
+{	/* No MIPS HChaCha20 code in this glue: none of the arguments is touched. */
+	return false;	/* presumably makes the caller use its generic path -- confirm */
+}
+
+#define HAVE_CHACHA20_ARCH_IMPLEMENTATION
diff --git a/lib/zinc/chacha20/chacha20-mips.S b/lib/zinc/chacha20/chacha20-mips.S
new file mode 100644
index 000000000000..77da2c2fb240
--- /dev/null
+++ b/lib/zinc/chacha20/chacha20-mips.S
@@ -0,0 +1,474 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2016-2018 René van Dorst <opensource@vdorst.com>. All Rights Reserved.
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#define MASK_U32	0x3c /* bits 2-5: the whole-word part of a byte count below 64 */
+#define MASK_BYTES	0x03 /* low two bits; not referenced anywhere else in this file */
+#define CHACHA20_BLOCK_SIZE 64
+#define STACK_SIZE	4*16 /* saved registers at 0-36, alignment flag at 40, constants at 48-60 */
+
+#define X0  $t0 /* X0-X15 hold the 16 state words while a block is computed */
+#define X1  $t1
+#define X2  $t2
+#define X3  $t3
+#define X4  $t4
+#define X5  $t5
+#define X6  $t6
+#define X7  $t7
+#define X8  $v1
+#define X9  $fp
+#define X10 $s7
+#define X11 $s6
+#define X12 $s5
+#define X13 $s4
+#define X14 $s3
+#define X15 $s2
+/* Use regs which are overwritten on exit for Tx so we don't leak clear data. */
+#define T0  $s1
+#define T1  $s0
+#define T(n) T ## n
+#define X(n) X ## n
+
+/* Input arguments */
+#define OUT		$a0
+#define IN		$a1
+#define BYTES		$a2
+/* KEY and NONCE argument must be u32 aligned */
+#define KEY		$a3
+/* NONCE pointer is given via stack */
+#define NONCE		$t9
+
+/* Output argument */
+/* NONCE[0] is kept in a register and not in memory.
+ * We don't want to touch original value in memory.
+ * Must be incremented every loop iteration.
+ */
+#define NONCE_0		$v0 /* the C prototype in chacha20-mips-glue.h is declared void */
+
+/* SAVED_X and SAVED_CA are set in the jump table.
+ * Use regs which are overwritten on exit so we don't leak clear data.
+ * They are used to handle the last bytes which are not a multiple of 4.
+ */
+#define SAVED_X		X15
+#define SAVED_CA	$ra /* $ra is reloaded from the stack before returning */
+
+#define PTR_LAST_ROUND	$t8 /* value of OUT at the start of the final block */
+
+/* ChaCha20 constants and stack location */
+#define CONSTANT_OFS_SP	48 /* four constant words stored at 48-60($sp) */
+#define UNALIGNED_OFS_SP 40 /* saved copy of (IN | OUT) & 3 */
+
+#define CONSTANT_1	0x61707865 /* "expa" when stored little-endian */
+#define CONSTANT_2	0x3320646e /* "nd 3" */
+#define CONSTANT_3	0x79622d32 /* "2-by" */
+#define CONSTANT_4	0x6b206574 /* "te k" */
+
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+#define MSB 0 /* byte offset used with lwl/swl */
+#define LSB 3 /* byte offset used with lwr/swr */
+#define ROTx rotl /* steps to the next byte in the trailing-byte code */
+#define ROTR(n) rotr n, 24
+#define	CPU_TO_LE32(n) /* full 32-bit byte swap of n */ \
+	wsbh	n; /* swap the two bytes inside each halfword */ \
+	rotr	n, 16; /* then swap the halfwords */
+#else
+#define MSB 3 /* byte offset used with lwl/swl */
+#define LSB 0 /* byte offset used with lwr/swr */
+#define ROTx rotr /* steps to the next byte in the trailing-byte code */
+#define CPU_TO_LE32(n) /* expands to nothing on little-endian */
+#define ROTR(n) /* expands to nothing on little-endian */
+#endif
+
+#define STORE_UNALIGNED(x, a, s, o) /* OUT word at x-4 = IN word at x-4 ^ (X(a) + word at o(s)) */ \
+.Lchacha20_mips_xor_unaligned_ ## x ## _b: ; \
+	.if ((s != NONCE) || (o != 0)); \
+		lw	T0, o(s); \
+	.endif; \
+	lwl	T1, x-4+MSB ## (IN); \
+	lwr	T1, x-4+LSB ## (IN); \
+	.if ((s == NONCE) && (o == 0)); \
+		addu	X ## a, NONCE_0; /* NONCE word 0 comes from the NONCE_0 register */ \
+	.else; \
+		addu	X ## a, T0; \
+	.endif; \
+	CPU_TO_LE32(X ## a); \
+	xor	X ## a, T1; \
+	swl	X ## a, x-4+MSB ## (OUT); \
+	swr	X ## a, x-4+LSB ## (OUT);
+
+#define STORE_ALIGNED(x, a, s, o) /* OUT word at x-4 = IN word at x-4 ^ (X(a) + word at o(s)) */ \
+.Lchacha20_mips_xor_aligned_ ## x ## _b: ; \
+	.if ((s != NONCE) || (o != 0)); \
+		lw	T0, o(s); \
+	.endif; \
+	lw	T1, x-4 ## (IN); \
+	.if ((s == NONCE) && (o == 0)); \
+		addu	X ## a, NONCE_0; /* NONCE word 0 comes from the NONCE_0 register */ \
+	.else; \
+		addu	X ## a, T0; \
+	.endif; \
+	CPU_TO_LE32(X ## a); \
+	xor	X ## a, T1; \
+	sw	X ## a, x-4 ## (OUT);
+
+/* Jump table entry for a tail block holding `a` whole words: keep the original
+ * state word in SAVED_CA and X(a) in SAVED_X (X15, free to hold Xn) for any
+ * trailing 1-3 bytes, then branch into the store chain at byte offset x (the
+ * move sits in the branch delay slot). Every entry must be equal in size (3
+ * instructions): the dispatch code advances 3 * (BYTES & MASK_U32) bytes. */
+#define JMPTBL_ALIGNED(x, a, s, o) \
+.Lchacha20_mips_jmptbl_aligned_ ## a: ; \
+	.if ((s == NONCE) && (o == 0)); \
+		move	SAVED_CA, NONCE_0; \
+	.else; \
+		lw	SAVED_CA, o(s);\
+	.endif; \
+	b	.Lchacha20_mips_xor_aligned_ ## x ## _b; \
+	move	SAVED_X, X ## a;
+
+#define JMPTBL_UNALIGNED(x, a, s, o) /* as JMPTBL_ALIGNED, targeting the unaligned chain */ \
+.Lchacha20_mips_jmptbl_unaligned_ ## a: ; \
+	.if ((s == NONCE) && (o == 0)); \
+		move	SAVED_CA, NONCE_0; \
+	.else; \
+		lw	SAVED_CA, o(s);\
+	.endif; \
+	b	.Lchacha20_mips_xor_unaligned_ ## x ## _b; \
+	move	SAVED_X, X ## a;
+
+#define AXR(A, B, C, D,  K, L, M, N,  V, W, Y, Z,  S) /* 4 lanes: add, xor, rotate left by S */ \
+	addu	X(A), X(K); \
+	addu	X(B), X(L); \
+	addu	X(C), X(M); \
+	addu	X(D), X(N); \
+	xor	X(V), X(A); \
+	xor	X(W), X(B); \
+	xor	X(Y), X(C); \
+	xor	X(Z), X(D); \
+	rotl	X(V), S;    \
+	rotl	X(W), S;    \
+	rotl	X(Y), S;    \
+	rotl	X(Z), S;
+
+.text
+.set reorder
+.set noat
+.globl chacha20_mips
+.ent   chacha20_mips
+chacha20_mips:
+	.frame $sp, STACK_SIZE, $ra
+	/* NONCE pointer is the fifth argument: fetch it from the caller's stack. */
+	lw	NONCE, 16($sp)
+
+	/* Nothing to do when BYTES == 0 (the delay slot still sets up the frame). */
+	.set noreorder
+	beqz	BYTES, .Lchacha20_mips_end
+	addiu	$sp, -STACK_SIZE
+	.set reorder
+
+	/* PTR_LAST_ROUND = OUT + ((BYTES - 1) & ~63), i.e. OUT for the final block */
+	addiu	PTR_LAST_ROUND, BYTES, -1
+	ins	PTR_LAST_ROUND, $zero, 0, 6
+	addu	PTR_LAST_ROUND, OUT
+
+	/* Save s0-s7, fp, ra. */
+	sw	$ra,  0($sp)
+	sw	$fp,  4($sp)
+	sw	$s0,  8($sp)
+	sw	$s1, 12($sp)
+	sw	$s2, 16($sp)
+	sw	$s3, 20($sp)
+	sw	$s4, 24($sp)
+	sw	$s5, 28($sp)
+	sw	$s6, 32($sp)
+	sw	$s7, 36($sp)
+
+	lw	NONCE_0, 0(NONCE)
+	/* Test IN or OUT is unaligned.
+	 * UNALIGNED (T1) = ( IN | OUT ) & 0x00000003
+	 */
+	or	T1, IN, OUT
+	andi	T1, 0x3
+
+	/* Load constant */
+	lui	X0, %hi(CONSTANT_1)
+	lui	X1, %hi(CONSTANT_2)
+	lui	X2, %hi(CONSTANT_3)
+	lui	X3, %hi(CONSTANT_4)
+	ori	X0, %lo(CONSTANT_1)
+	ori	X1, %lo(CONSTANT_2)
+	ori	X2, %lo(CONSTANT_3)
+	ori	X3, %lo(CONSTANT_4)
+
+	/* Store constant on stack. */
+	sw	X0,  0+CONSTANT_OFS_SP($sp)
+	sw	X1,  4+CONSTANT_OFS_SP($sp)
+	sw	X2,  8+CONSTANT_OFS_SP($sp)
+	sw	X3, 12+CONSTANT_OFS_SP($sp)
+
+	sw	T1, UNALIGNED_OFS_SP($sp) /* T1 doubles as scratch; reloaded for each later block */
+
+	.set	noreorder
+	b	.Lchacha20_rounds_start
+	andi	BYTES, (CHACHA20_BLOCK_SIZE-1) /* delay slot: BYTES = length mod 64 */
+	.set	reorder
+
+.align 4
+.Loop_chacha20_rounds: /* entered for every block after the first */
+	addiu	IN,  CHACHA20_BLOCK_SIZE
+	addiu	OUT, CHACHA20_BLOCK_SIZE
+	addiu	NONCE_0, 1
+
+	lw	X0,  0+CONSTANT_OFS_SP($sp)
+	lw	X1,  4+CONSTANT_OFS_SP($sp)
+	lw	X2,  8+CONSTANT_OFS_SP($sp)
+	lw	X3, 12+CONSTANT_OFS_SP($sp)
+	lw	T1,   UNALIGNED_OFS_SP($sp)
+
+.Lchacha20_rounds_start:
+	lw	X4,   0(KEY)
+	lw	X5,   4(KEY)
+	lw	X6,   8(KEY)
+	lw	X7,  12(KEY)
+	lw	X8,  16(KEY)
+	lw	X9,  20(KEY)
+	lw	X10, 24(KEY)
+	lw	X11, 28(KEY)
+
+	move	X12, NONCE_0
+	lw	X13,  4(NONCE)
+	lw	X14,  8(NONCE)
+	lw	X15, 12(NONCE)
+
+	li	$at, 9 /* the loop body runs 10 times: bnez tests $at before the decrement */
+.Loop_chacha20_xor_rounds:
+	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15, 16); /* words (n, n+4, n+8, n+12) */
+	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7, 12);
+	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15,  8);
+	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7,  7);
+	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14, 16); /* same steps on the diagonals */
+	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4, 12);
+	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14,  8);
+	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4,  7);
+	.set noreorder
+	bnez	$at, .Loop_chacha20_xor_rounds
+	addiu	$at, -1
+
+	/* Unaligned? Jump */
+	bnez	T1, .Loop_chacha20_unaligned
+	andi	$at, BYTES, MASK_U32 /* delay slot: $at = tail length rounded down to words */
+
+	/* Not the last block? Store all 64 bytes */
+	bne	OUT, PTR_LAST_ROUND, .Lchacha20_mips_xor_aligned_64_b
+	/* Delay slot: load upper half of jump table addr */
+	lui	T0, %hi(.Lchacha20_mips_jmptbl_aligned_0)
+
+	/* Full block? Jump */
+	beqz	BYTES, .Lchacha20_mips_xor_aligned_64_b
+	/* Delay slot: T0 += 4 * $at; the subu below leaves 3 * $at (12 bytes per entry) */
+	ins	T0, $at, 2, 6
+
+	subu	T0, $at
+	addiu	T0, %lo(.Lchacha20_mips_jmptbl_aligned_0)
+
+	jr	T0
+	/* Delay slot */
+	nop
+
+	.set	reorder
+
+.Loop_chacha20_unaligned:
+	.set noreorder
+
+	/* Not the last block? Store all 64 bytes */
+	bne	OUT, PTR_LAST_ROUND, .Lchacha20_mips_xor_unaligned_64_b
+	/* Delay slot: load upper half of jump table addr */
+	lui	T0, %hi(.Lchacha20_mips_jmptbl_unaligned_0)
+
+	/* Full block? Jump */
+	beqz	BYTES, .Lchacha20_mips_xor_unaligned_64_b
+
+	/* Delay slot: T0 += 4 * $at; the subu below leaves 3 * $at (12 bytes per entry) */
+	ins     T0, $at, 2, 6
+	subu	T0, $at
+	addiu	T0, %lo(.Lchacha20_mips_jmptbl_unaligned_0)
+
+	jr	T0
+	/* Delay slot */
+	nop
+
+	.set	reorder
+
+/* Aligned code path: word-at-a-time stores from byte 60 down to 0 of the block;
+ * the jump table further down enters this chain part-way for a short tail. */
+.align 4
+	STORE_ALIGNED(64, 15, NONCE,12)
+	STORE_ALIGNED(60, 14, NONCE, 8)
+	STORE_ALIGNED(56, 13, NONCE, 4)
+	STORE_ALIGNED(52, 12, NONCE, 0)
+	STORE_ALIGNED(48, 11, KEY, 28)
+	STORE_ALIGNED(44, 10, KEY, 24)
+	STORE_ALIGNED(40,  9, KEY, 20)
+	STORE_ALIGNED(36,  8, KEY, 16)
+	STORE_ALIGNED(32,  7, KEY, 12)
+	STORE_ALIGNED(28,  6, KEY,  8)
+	STORE_ALIGNED(24,  5, KEY,  4)
+	STORE_ALIGNED(20,  4, KEY,  0)
+	STORE_ALIGNED(16,  3, $sp, 12+CONSTANT_OFS_SP)
+	STORE_ALIGNED(12,  2, $sp,  8+CONSTANT_OFS_SP)
+	STORE_ALIGNED( 8,  1, $sp,  4+CONSTANT_OFS_SP)
+.Lchacha20_mips_xor_aligned_4_b:
+	/* STORE_ALIGNED( 4,  0, $sp, 0+CONSTANT_OFS_SP) */
+	lw	T0, 0+CONSTANT_OFS_SP($sp)
+	lw	T1, 0(IN)
+	addu	X0, T0
+	CPU_TO_LE32(X0)
+	xor	X0, T1
+	.set noreorder
+	bne	OUT, PTR_LAST_ROUND, .Loop_chacha20_rounds
+	sw	X0, 0(OUT)
+	.set reorder
+
+	.set noreorder
+	bne	$at, BYTES, .Lchacha20_mips_xor_bytes
+	/* Delay slot: increment NONCE_0, which is left in $v0 at exit */
+	addiu	NONCE_0, 1
+	.set noreorder /* NOTE(review): the pairing used elsewhere suggests ".set reorder" -- confirm */
+
+.Lchacha20_mips_xor_done:
+	/* Restore used registers */
+	lw	$ra,  0($sp)
+	lw	$fp,  4($sp)
+	lw	$s0,  8($sp)
+	lw	$s1, 12($sp)
+	lw	$s2, 16($sp)
+	lw	$s3, 20($sp)
+	lw	$s4, 24($sp)
+	lw	$s5, 28($sp)
+	lw	$s6, 32($sp)
+	lw	$s7, 36($sp)
+.Lchacha20_mips_end:
+	.set noreorder
+	jr	$ra
+	addiu	$sp, STACK_SIZE
+	.set reorder
+
+	.set noreorder
+	/* Start jump table */
+	JMPTBL_ALIGNED( 0,  0, $sp,  0+CONSTANT_OFS_SP)
+	JMPTBL_ALIGNED( 4,  1, $sp,  4+CONSTANT_OFS_SP)
+	JMPTBL_ALIGNED( 8,  2, $sp,  8+CONSTANT_OFS_SP)
+	JMPTBL_ALIGNED(12,  3, $sp, 12+CONSTANT_OFS_SP)
+	JMPTBL_ALIGNED(16,  4, KEY,  0)
+	JMPTBL_ALIGNED(20,  5, KEY,  4)
+	JMPTBL_ALIGNED(24,  6, KEY,  8)
+	JMPTBL_ALIGNED(28,  7, KEY, 12)
+	JMPTBL_ALIGNED(32,  8, KEY, 16)
+	JMPTBL_ALIGNED(36,  9, KEY, 20)
+	JMPTBL_ALIGNED(40, 10, KEY, 24)
+	JMPTBL_ALIGNED(44, 11, KEY, 28)
+	JMPTBL_ALIGNED(48, 12, NONCE, 0)
+	JMPTBL_ALIGNED(52, 13, NONCE, 4)
+	JMPTBL_ALIGNED(56, 14, NONCE, 8)
+	JMPTBL_ALIGNED(60, 15, NONCE,12)
+	/* End jump table */
+	.set reorder
+
+/* Unaligned code path: same chain as the aligned one, but using lwl/lwr and
+ * swl/swr for the accesses to IN and OUT. */
+	STORE_UNALIGNED(64, 15, NONCE,12)
+	STORE_UNALIGNED(60, 14, NONCE, 8)
+	STORE_UNALIGNED(56, 13, NONCE, 4)
+	STORE_UNALIGNED(52, 12, NONCE, 0)
+	STORE_UNALIGNED(48, 11, KEY, 28)
+	STORE_UNALIGNED(44, 10, KEY, 24)
+	STORE_UNALIGNED(40,  9, KEY, 20)
+	STORE_UNALIGNED(36,  8, KEY, 16)
+	STORE_UNALIGNED(32,  7, KEY, 12)
+	STORE_UNALIGNED(28,  6, KEY,  8)
+	STORE_UNALIGNED(24,  5, KEY,  4)
+	STORE_UNALIGNED(20,  4, KEY,  0)
+	STORE_UNALIGNED(16,  3, $sp, 12+CONSTANT_OFS_SP)
+	STORE_UNALIGNED(12,  2, $sp,  8+CONSTANT_OFS_SP)
+	STORE_UNALIGNED( 8,  1, $sp,  4+CONSTANT_OFS_SP)
+.Lchacha20_mips_xor_unaligned_4_b:
+	/* STORE_UNALIGNED( 4,  0, $sp, 0+CONSTANT_OFS_SP) */
+	lw	T0, 0+CONSTANT_OFS_SP($sp)
+	lwl	T1, 0+MSB(IN)
+	lwr	T1, 0+LSB(IN)
+	addu	X0, T0
+	CPU_TO_LE32(X0)
+	xor	X0, T1
+	swl	X0, 0+MSB(OUT)
+	.set noreorder
+	bne	OUT, PTR_LAST_ROUND, .Loop_chacha20_rounds
+	swr	X0, 0+LSB(OUT)
+	.set reorder
+
+	/* Fall through to byte handling */
+	.set noreorder
+	beq	$at, BYTES, .Lchacha20_mips_xor_done
+	/* Delay slot, and entry point for a tail of only 1-3 bytes: NONCE_0 += 1 */
+.Lchacha20_mips_xor_unaligned_0_b:
+.Lchacha20_mips_xor_aligned_0_b:
+	addiu	NONCE_0, 1
+	.set reorder
+
+.Lchacha20_mips_xor_bytes: /* 1-3 bytes left after the last whole word */
+	addu	OUT, $at
+	addu	IN, $at
+	addu	SAVED_X, SAVED_CA /* X(a) + original state word, as in the STORE_* macros */
+	/* First byte */
+	lbu	T1, 0(IN)
+	andi	$at, BYTES, 2
+	CPU_TO_LE32(SAVED_X)
+	ROTR(SAVED_X)
+	xor	T1, SAVED_X
+	.set noreorder
+	beqz	$at, .Lchacha20_mips_xor_done
+	sb	T1, 0(OUT)
+	.set reorder
+	/* Second byte */
+	lbu	T1, 1(IN)
+	andi	$at, BYTES, 1
+	ROTx	SAVED_X, 8
+	xor	T1, SAVED_X
+	.set noreorder
+	beqz	$at, .Lchacha20_mips_xor_done
+	sb	T1, 1(OUT)
+	.set reorder
+	/* Third byte */
+	lbu	T1, 2(IN)
+	ROTx	SAVED_X, 8
+	xor	T1, SAVED_X
+	.set noreorder
+	b	.Lchacha20_mips_xor_done
+	sb	T1, 2(OUT)
+	.set reorder
+.set noreorder
+
+.Lchacha20_mips_jmptbl_unaligned:
+	/* Start jump table */
+	JMPTBL_UNALIGNED( 0,  0, $sp,  0+CONSTANT_OFS_SP)
+	JMPTBL_UNALIGNED( 4,  1, $sp,  4+CONSTANT_OFS_SP)
+	JMPTBL_UNALIGNED( 8,  2, $sp,  8+CONSTANT_OFS_SP)
+	JMPTBL_UNALIGNED(12,  3, $sp, 12+CONSTANT_OFS_SP)
+	JMPTBL_UNALIGNED(16,  4, KEY,  0)
+	JMPTBL_UNALIGNED(20,  5, KEY,  4)
+	JMPTBL_UNALIGNED(24,  6, KEY,  8)
+	JMPTBL_UNALIGNED(28,  7, KEY, 12)
+	JMPTBL_UNALIGNED(32,  8, KEY, 16)
+	JMPTBL_UNALIGNED(36,  9, KEY, 20)
+	JMPTBL_UNALIGNED(40, 10, KEY, 24)
+	JMPTBL_UNALIGNED(44, 11, KEY, 28)
+	JMPTBL_UNALIGNED(48, 12, NONCE, 0)
+	JMPTBL_UNALIGNED(52, 13, NONCE, 4)
+	JMPTBL_UNALIGNED(56, 14, NONCE, 8)
+	JMPTBL_UNALIGNED(60, 15, NONCE,12)
+	/* End jump table */
+.set reorder
+
+.end chacha20_mips
+.set at
-- 
2.19.0

^ permalink raw reply related

* [PATCH net-next v4 05/20] zinc: ChaCha20 x86_64 implementation
From: Jason A. Donenfeld @ 2018-09-14 16:22 UTC (permalink / raw)
  To: linux-kernel, netdev, linux-crypto, davem, gregkh
  Cc: Jason A. Donenfeld, Samuel Neves, Andy Lutomirski,
	Jean-Philippe Aumasson, Andy Polyakov, Thomas Gleixner,
	Ingo Molnar, x86
In-Reply-To: <20180914162240.7925-1-Jason@zx2c4.com>

This provides SSSE3, AVX-2, AVX-512F, and AVX-512VL implementations for
ChaCha20. The AVX-512F implementation is disabled on Skylake, due to
throttling, and the VL ymm implementation is used instead. These come
from Andy Polyakov's implementation, with the following modifications
from Samuel Neves:

  - Some cosmetic changes, like renaming labels to .Lname, constants,
    and other Linux conventions.

  - CPU feature checking is done in C by the glue code, so that has been
    removed from the assembly.

  - Eliminate translating certain instructions, such as pshufb, palignr,
    vprotd, etc, to .byte directives. That translation is meant for
    compatibility with ancient toolchains, but presumably it is
    unnecessary here, since the build system already does checks on what
    GNU as can assemble.

  - When aligning the stack, the original code was saving %rsp to %r9.
    To keep objtool happy, we instead use the DRAP idiom to save %rsp
    to %r10:

      leaq    8(%rsp),%r10
      ... code here ...
      leaq    -8(%r10),%rsp

  - The original code assumes the stack comes aligned to 16 bytes. This
    is not necessarily the case, and to avoid crashes,
    `andq $-alignment, %rsp` was added in the prolog of a few functions.

  - The original hardcodes returns as .byte 0xf3,0xc3, aka "rep ret".
    We replace this by "ret". "rep ret" was meant to help with AMD K8
    chips, cf. http://repzret.org/p/repzret. It makes no sense to
    continue to use this kludge for code that won't even run on ancient
    AMD chips.

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Samuel Neves <sneves@dei.uc.pt>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Jean-Philippe Aumasson <jeanphilippe.aumasson@gmail.com>
Cc: Andy Polyakov <appro@openssl.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: x86@kernel.org
---
 lib/zinc/Makefile                        |    4 +
 lib/zinc/chacha20/chacha20-x86_64-glue.h |  102 +
 lib/zinc/chacha20/chacha20-x86_64.S      | 2632 ++++++++++++++++++++++
 3 files changed, 2738 insertions(+)
 create mode 100644 lib/zinc/chacha20/chacha20-x86_64-glue.h
 create mode 100644 lib/zinc/chacha20/chacha20-x86_64.S

diff --git a/lib/zinc/Makefile b/lib/zinc/Makefile
index 8d14cb13349a..32e4bd94ea0b 100644
--- a/lib/zinc/Makefile
+++ b/lib/zinc/Makefile
@@ -5,6 +5,10 @@ ccflags-$(CONFIG_ZINC_DEBUG) += -DDEBUG
 
 ifeq ($(CONFIG_ZINC_CHACHA20),y)
 zinc-y += chacha20/chacha20.o
+# x86_64: add the assembly object and pass the glue header via -include
+# when compiling chacha20.o.
+ifeq ($(CONFIG_ZINC_ARCH_X86_64),y)
+zinc-y += chacha20/chacha20-x86_64.o
+CFLAGS_chacha20.o += -include $(srctree)/$(src)/chacha20/chacha20-x86_64-glue.h
+endif
 ifeq ($(CONFIG_ZINC_ARCH_ARM),y)
 zinc-y += chacha20/chacha20-arm.o
 CFLAGS_chacha20.o += -include $(srctree)/$(src)/chacha20/chacha20-arm-glue.h
diff --git a/lib/zinc/chacha20/chacha20-x86_64-glue.h b/lib/zinc/chacha20/chacha20-x86_64-glue.h
new file mode 100644
index 000000000000..e4f6c3162d3f
--- /dev/null
+++ b/lib/zinc/chacha20/chacha20-x86_64-glue.h
@@ -0,0 +1,102 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <zinc/chacha20.h>
+#include <asm/fpu/api.h>
+#include <asm/cpufeature.h>
+#include <asm/processor.h>
+#include <asm/intel-family.h>
+
+/*
+ * Prototypes of the assembly routines.  Each one is only declared when the
+ * matching CONFIG_AS_* symbol is defined, so callers must use the same
+ * guards.
+ */
+#ifdef CONFIG_AS_SSSE3
+asmlinkage void hchacha20_ssse3(u8 *derived_key, const u8 *nonce,
+				const u8 *key);
+asmlinkage void chacha20_ssse3(u8 *out, const u8 *in, const size_t len,
+			       const u32 key[8], const u32 counter[4]);
+#endif
+#ifdef CONFIG_AS_AVX2
+asmlinkage void chacha20_avx2(u8 *out, const u8 *in, const size_t len,
+			      const u32 key[8], const u32 counter[4]);
+#endif
+#ifdef CONFIG_AS_AVX512
+asmlinkage void chacha20_avx512(u8 *out, const u8 *in, const size_t len,
+				const u32 key[8], const u32 counter[4]);
+asmlinkage void chacha20_avx512vl(u8 *out, const u8 *in, const size_t len,
+				  const u32 key[8], const u32 counter[4]);
+#endif
+
+/* One flag per routine family; assigned by chacha20_fpu_init(). */
+static bool chacha20_use_ssse3 __ro_after_init;
+static bool chacha20_use_avx2 __ro_after_init;
+static bool chacha20_use_avx512 __ro_after_init;
+static bool chacha20_use_avx512vl __ro_after_init;
+
+/*
+ * Decide which SIMD ChaCha20 routines may be used and record the answer in
+ * the chacha20_use_* flags.  SSSE3 depends on its CPU feature bit alone; the
+ * AVX flags additionally require cpu_has_xfeatures() to accept the XFEATURE
+ * masks listed below.
+ */
+void __init chacha20_fpu_init(void)
+{
+	const bool has_avx2 = boot_cpu_has(X86_FEATURE_AVX) &&
+			      boot_cpu_has(X86_FEATURE_AVX2);
+	const bool has_avx512f =
+		has_avx2 && boot_cpu_has(X86_FEATURE_AVX512F) &&
+		cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM |
+				  XFEATURE_MASK_AVX512, NULL);
+
+	chacha20_use_ssse3 = boot_cpu_has(X86_FEATURE_SSSE3);
+	chacha20_use_avx2 =
+		has_avx2 &&
+		cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
+	/* Skylake downclocks unacceptably much when using zmm. */
+	chacha20_use_avx512 =
+		has_avx512f && boot_cpu_data.x86_model != INTEL_FAM6_SKYLAKE_X;
+	/* No CPU model exclusion for the VL flag, unlike the one above. */
+	chacha20_use_avx512vl =
+		has_avx512f && boot_cpu_has(X86_FEATURE_AVX512VL);
+}
+
+/*
+ * Process len bytes from src into dst with the first usable routine, tried
+ * in the order AVX-512, AVX-512VL, AVX2, SSSE3.
+ *
+ * Returns true if one of the assembly routines was called.  Returns false,
+ * without calling any of them, when simd_context is not HAVE_FULL_SIMD or
+ * when no routine is both compiled in and enabled by its chacha20_use_* flag.
+ */
+static inline bool chacha20_arch(u8 *dst, const u8 *src, const size_t len,
+				 const u32 key[8], const u32 counter[4],
+				 simd_context_t simd_context)
+{
+	if (simd_context != HAVE_FULL_SIMD)
+		return false;
+#ifdef CONFIG_AS_AVX512
+	else if (chacha20_use_avx512)
+		chacha20_avx512(dst, src, len, key, counter);
+	else if (chacha20_use_avx512vl)
+		chacha20_avx512vl(dst, src, len, key, counter);
+#endif
+#ifdef CONFIG_AS_AVX2
+	else if (chacha20_use_avx2)
+		chacha20_avx2(dst, src, len, key, counter);
+#endif
+#ifdef CONFIG_AS_SSSE3
+	else if (chacha20_use_ssse3)
+		chacha20_ssse3(dst, src, len, key, counter);
+#endif
+	else
+		return false;
+
+	/* Exactly one branch of the chain above ran a routine. */
+	return true;
+}
+
+/*
+ * Calls hchacha20_ssse3() and returns true when that routine is compiled in,
+ * simd_context is HAVE_FULL_SIMD and chacha20_use_ssse3 is set.  In every
+ * other case returns false without calling it.
+ */
+static inline bool hchacha20_arch(u8 *derived_key, const u8 *nonce,
+				  const u8 *key, simd_context_t simd_context)
+{
+#if defined(CONFIG_AS_SSSE3)
+	if (simd_context != HAVE_FULL_SIMD || !chacha20_use_ssse3)
+		return false;
+	hchacha20_ssse3(derived_key, nonce, key);
+	return true;
+#else
+	return false;
+#endif
+}
+
+#define HAVE_CHACHA20_ARCH_IMPLEMENTATION
diff --git a/lib/zinc/chacha20/chacha20-x86_64.S b/lib/zinc/chacha20/chacha20-x86_64.S
new file mode 100644
index 000000000000..3f503a319692
--- /dev/null
+++ b/lib/zinc/chacha20/chacha20-x86_64.S
@@ -0,0 +1,2632 @@
+/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
+ *
+ * Copyright (C) 2017 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ * Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
+ *
+ * This is based in part on Andy Polyakov's implementation from CRYPTOGAMS.
+ */
+
+#include <linux/linkage.h>
+
+/*
+ * Constant pool.  Every constant sits in its own mergeable ("aM") section
+ * whose last .section argument is the entity size, so the data emitted
+ * under each label must be exactly that many bytes and should not ask for
+ * more alignment than that.
+ */
+.section .rodata.cst16.Lzero, "aM", @progbits, 16
+.align	16
+.Lzero:
+.long	0,0,0,0
+/* Added to the last state row to bump its first word by one per block. */
+.section .rodata.cst16.Lone, "aM", @progbits, 16
+.align	16
+.Lone:
+.long	1,0,0,0
+/* Per-lane offsets and stride for the four-lane counter vector. */
+.section .rodata.cst16.Linc, "aM", @progbits, 16
+.align	16
+.Linc:
+.long	0,1,2,3
+.section .rodata.cst16.Lfour, "aM", @progbits, 16
+.align	16
+.Lfour:
+.long	4,4,4,4
+/* Per-lane offsets and stride for the eight-lane counter vector. */
+.section .rodata.cst32.Lincy, "aM", @progbits, 32
+.align	32
+.Lincy:
+.long	0,2,4,6,1,3,5,7
+.section .rodata.cst32.Leight, "aM", @progbits, 32
+.align	32
+.Leight:
+.long	8,8,8,8,8,8,8,8
+/* pshufb masks: rotate every 32-bit lane left by 16 and by 8 bits. */
+.section .rodata.cst16.Lrot16, "aM", @progbits, 16
+.align	16
+.Lrot16:
+.byte	0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
+.section .rodata.cst16.Lrot24, "aM", @progbits, 16
+.align	16
+.Lrot24:
+.byte	0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
+/*
+ * "expand 32-byte k".  Exactly 16 bytes: it is only ever loaded as one
+ * 128-bit vector, and a trailing NUL would make the section 17 bytes long,
+ * which is not a multiple of the 16-byte entity size.
+ */
+.section .rodata.cst16.Lsigma, "aM", @progbits, 16
+.align	16
+.Lsigma:
+.byte	101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107
+.section .rodata.cst64.Lzeroz, "aM", @progbits, 64
+.align	64
+.Lzeroz:
+.long	0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
+.section .rodata.cst64.Lfourz, "aM", @progbits, 64
+.align	64
+.Lfourz:
+.long	4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
+.section .rodata.cst64.Lincz, "aM", @progbits, 64
+.align	64
+.Lincz:
+.long	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
+.section .rodata.cst64.Lsixteen, "aM", @progbits, 64
+.align	64
+.Lsixteen:
+.long	16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
+/* 32-byte entity: aligned to 32 like the other cst32 constants. */
+.section .rodata.cst32.Ltwoy, "aM", @progbits, 32
+.align	32
+.Ltwoy:
+.long	2,0,0,0, 2,0,0,0
+
+.text
+
+#ifdef CONFIG_AS_SSSE3
+.align	32
+/*
+ * void hchacha20_ssse3(u8 *derived_key, const u8 *nonce, const u8 *key)
+ *
+ * %rdi = derived_key, %rsi = nonce, %rdx = key.
+ * Builds a four-row state in %xmm0-%xmm3 from .Lsigma, 32 bytes at key and
+ * 16 bytes at nonce, runs the loop below ten times and stores rows 0 and 3
+ * (32 bytes in total) to derived_key.  The initial state is not added back
+ * in after the rounds.  Uses %xmm0-%xmm4, %xmm6, %xmm7 and %r8; no stack.
+ */
+ENTRY(hchacha20_ssse3)
+	movdqa	.Lsigma(%rip),%xmm0
+	movdqu	(%rdx),%xmm1
+	movdqu	16(%rdx),%xmm2
+	movdqu	(%rsi),%xmm3
+	/* pshufb masks for the 16-bit and 8-bit lane rotations */
+	movdqa	.Lrot16(%rip),%xmm6
+	movdqa	.Lrot24(%rip),%xmm7
+	/* loop counter: ten iterations of two rounds each */
+	movq	$10,%r8
+	.align	32
+.Loop_hssse3:
+	/* first round: a += b; d ^= a; d <<<= 16 */
+	paddd	%xmm1,%xmm0
+	pxor	%xmm0,%xmm3
+	pshufb	%xmm6,%xmm3
+	/* c += d; b ^= c; b <<<= 12 (shift pair, %xmm4 is scratch) */
+	paddd	%xmm3,%xmm2
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm1,%xmm4
+	psrld	$20,%xmm1
+	pslld	$12,%xmm4
+	por	%xmm4,%xmm1
+	/* a += b; d ^= a; d <<<= 8 */
+	paddd	%xmm1,%xmm0
+	pxor	%xmm0,%xmm3
+	pshufb	%xmm7,%xmm3
+	/* c += d; b ^= c; b <<<= 7 */
+	paddd	%xmm3,%xmm2
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm1,%xmm4
+	psrld	$25,%xmm1
+	pslld	$7,%xmm4
+	por	%xmm4,%xmm1
+	/* rotate rows 1, 2, 3 by 1, 2, 3 lanes: next round works on diagonals */
+	pshufd	$78,%xmm2,%xmm2
+	pshufd	$57,%xmm1,%xmm1
+	pshufd	$147,%xmm3,%xmm3
+	nop
+	/* second round: same operation sequence on the rotated rows */
+	paddd	%xmm1,%xmm0
+	pxor	%xmm0,%xmm3
+	pshufb	%xmm6,%xmm3
+	paddd	%xmm3,%xmm2
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm1,%xmm4
+	psrld	$20,%xmm1
+	pslld	$12,%xmm4
+	por	%xmm4,%xmm1
+	paddd	%xmm1,%xmm0
+	pxor	%xmm0,%xmm3
+	pshufb	%xmm7,%xmm3
+	paddd	%xmm3,%xmm2
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm1,%xmm4
+	psrld	$25,%xmm1
+	pslld	$7,%xmm4
+	por	%xmm4,%xmm1
+	/* undo the lane rotation of rows 1-3 */
+	pshufd	$78,%xmm2,%xmm2
+	pshufd	$147,%xmm1,%xmm1
+	pshufd	$57,%xmm3,%xmm3
+	decq	%r8
+	jnz	.Loop_hssse3
+	/* output = row 0 followed by row 3 */
+	movdqu	%xmm0,0(%rdi)
+	movdqu	%xmm3,16(%rdi)
+	ret
+ENDPROC(hchacha20_ssse3)
+
+.align	32
+/*
+ * void chacha20_ssse3(u8 *out, const u8 *in, const size_t len,
+ *		       const u32 key[8], const u32 counter[4])
+ *
+ * %rdi = out, %rsi = in, %rdx = len, %rcx = key, %r8 = counter.
+ * Returns immediately when len is 0.  Inputs of up to 128 bytes are handled
+ * one 64-byte block per iteration; longer inputs take the .Lchacha20_4x
+ * path, which computes four blocks per iteration.  A final partial block is
+ * XORed byte by byte against keystream spilled to the stack.  The stack is
+ * realigned to 32 bytes; %r10 keeps the entry %rsp + 8 so that the original
+ * %rsp can be restored before returning.
+ */
+ENTRY(chacha20_ssse3)
+.Lchacha20_ssse3:
+	cmpq	$0,%rdx
+	je	.Lssse3_epilogue
+	leaq	8(%rsp),%r10
+
+	cmpq	$128,%rdx
+	ja	.Lchacha20_4x
+
+.Ldo_sse3_after_all:
+	/* one-block path: 64 bytes of aligned stack hold the input state */
+	subq	$64+8,%rsp
+	andq	$-32,%rsp
+	movdqa	.Lsigma(%rip),%xmm0
+	movdqu	(%rcx),%xmm1
+	movdqu	16(%rcx),%xmm2
+	movdqu	(%r8),%xmm3
+	movdqa	.Lrot16(%rip),%xmm6
+	movdqa	.Lrot24(%rip),%xmm7
+
+	movdqa	%xmm0,0(%rsp)
+	movdqa	%xmm1,16(%rsp)
+	movdqa	%xmm2,32(%rsp)
+	movdqa	%xmm3,48(%rsp)
+	movq	$10,%r8
+	jmp	.Loop_ssse3
+
+.align	32
+.Loop_outer_ssse3:
+	/* next block: reload the saved state, adding .Lone to the last row */
+	movdqa	.Lone(%rip),%xmm3
+	movdqa	0(%rsp),%xmm0
+	movdqa	16(%rsp),%xmm1
+	movdqa	32(%rsp),%xmm2
+	paddd	48(%rsp),%xmm3
+	movq	$10,%r8
+	movdqa	%xmm3,48(%rsp)
+	jmp	.Loop_ssse3
+
+.align	32
+.Loop_ssse3:
+	/* ten iterations; same two-round body as .Loop_hssse3 */
+	paddd	%xmm1,%xmm0
+	pxor	%xmm0,%xmm3
+	pshufb	%xmm6,%xmm3
+	paddd	%xmm3,%xmm2
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm1,%xmm4
+	psrld	$20,%xmm1
+	pslld	$12,%xmm4
+	por	%xmm4,%xmm1
+	paddd	%xmm1,%xmm0
+	pxor	%xmm0,%xmm3
+	pshufb	%xmm7,%xmm3
+	paddd	%xmm3,%xmm2
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm1,%xmm4
+	psrld	$25,%xmm1
+	pslld	$7,%xmm4
+	por	%xmm4,%xmm1
+	pshufd	$78,%xmm2,%xmm2
+	pshufd	$57,%xmm1,%xmm1
+	pshufd	$147,%xmm3,%xmm3
+	nop
+	paddd	%xmm1,%xmm0
+	pxor	%xmm0,%xmm3
+	pshufb	%xmm6,%xmm3
+	paddd	%xmm3,%xmm2
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm1,%xmm4
+	psrld	$20,%xmm1
+	pslld	$12,%xmm4
+	por	%xmm4,%xmm1
+	paddd	%xmm1,%xmm0
+	pxor	%xmm0,%xmm3
+	pshufb	%xmm7,%xmm3
+	paddd	%xmm3,%xmm2
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm1,%xmm4
+	psrld	$25,%xmm1
+	pslld	$7,%xmm4
+	por	%xmm4,%xmm1
+	pshufd	$78,%xmm2,%xmm2
+	pshufd	$147,%xmm1,%xmm1
+	pshufd	$57,%xmm3,%xmm3
+	decq	%r8
+	jnz	.Loop_ssse3
+	/* keystream block = permuted state + saved input state */
+	paddd	0(%rsp),%xmm0
+	paddd	16(%rsp),%xmm1
+	paddd	32(%rsp),%xmm2
+	paddd	48(%rsp),%xmm3
+
+	cmpq	$64,%rdx
+	jb	.Ltail_ssse3
+
+	/* full block: out = in ^ keystream, then advance both pointers */
+	movdqu	0(%rsi),%xmm4
+	movdqu	16(%rsi),%xmm5
+	pxor	%xmm4,%xmm0
+	movdqu	32(%rsi),%xmm4
+	pxor	%xmm5,%xmm1
+	movdqu	48(%rsi),%xmm5
+	leaq	64(%rsi),%rsi
+	pxor	%xmm4,%xmm2
+	pxor	%xmm5,%xmm3
+
+	movdqu	%xmm0,0(%rdi)
+	movdqu	%xmm1,16(%rdi)
+	movdqu	%xmm2,32(%rdi)
+	movdqu	%xmm3,48(%rdi)
+	leaq	64(%rdi),%rdi
+
+	subq	$64,%rdx
+	jnz	.Loop_outer_ssse3
+
+	jmp	.Ldone_ssse3
+
+.align	16
+.Ltail_ssse3:
+	/* fewer than 64 bytes left: spill the keystream over the saved state */
+	movdqa	%xmm0,0(%rsp)
+	movdqa	%xmm1,16(%rsp)
+	movdqa	%xmm2,32(%rsp)
+	movdqa	%xmm3,48(%rsp)
+	xorq	%r8,%r8
+
+.Loop_tail_ssse3:
+	/* out[i] = in[i] ^ keystream[i], one byte per iteration */
+	movzbl	(%rsi,%r8,1),%eax
+	movzbl	(%rsp,%r8,1),%ecx
+	leaq	1(%r8),%r8
+	xorl	%ecx,%eax
+	movb	%al,-1(%rdi,%r8,1)
+	decq	%rdx
+	jnz	.Loop_tail_ssse3
+
+.Ldone_ssse3:
+	/* restore the entry %rsp */
+	leaq	-8(%r10),%rsp
+
+.Lssse3_epilogue:
+	ret
+
+.align	32
+.Lchacha20_4x:
+	/* %rsp is still the entry value here, so %r10 gets the same value again */
+	leaq	8(%rsp),%r10
+
+.Lproceed4x:
+	/*
+	 * Four-block path.  Stack layout after alignment:
+	 *   0..63    scratch (spilled rows, later transposed output)
+	 *   64..127  the four .Lsigma words, each broadcast to all lanes
+	 *   128..255 the eight key words, broadcast
+	 *   256..319 the four counter words, broadcast; the first one has
+	 *            .Linc added so that each lane gets its own value
+	 * %rcx is repointed to %rsp + 256 and used as the base for 128..319.
+	 */
+	subq	$0x140+8,%rsp
+	andq	$-32,%rsp
+	movdqa	.Lsigma(%rip),%xmm11
+	movdqu	(%rcx),%xmm15
+	movdqu	16(%rcx),%xmm7
+	movdqu	(%r8),%xmm3
+	leaq	256(%rsp),%rcx
+	leaq	.Lrot16(%rip),%r9
+	leaq	.Lrot24(%rip),%r11
+
+	pshufd	$0x00,%xmm11,%xmm8
+	pshufd	$0x55,%xmm11,%xmm9
+	movdqa	%xmm8,64(%rsp)
+	pshufd	$0xaa,%xmm11,%xmm10
+	movdqa	%xmm9,80(%rsp)
+	pshufd	$0xff,%xmm11,%xmm11
+	movdqa	%xmm10,96(%rsp)
+	movdqa	%xmm11,112(%rsp)
+
+	pshufd	$0x00,%xmm15,%xmm12
+	pshufd	$0x55,%xmm15,%xmm13
+	movdqa	%xmm12,128-256(%rcx)
+	pshufd	$0xaa,%xmm15,%xmm14
+	movdqa	%xmm13,144-256(%rcx)
+	pshufd	$0xff,%xmm15,%xmm15
+	movdqa	%xmm14,160-256(%rcx)
+	movdqa	%xmm15,176-256(%rcx)
+
+	pshufd	$0x00,%xmm7,%xmm4
+	pshufd	$0x55,%xmm7,%xmm5
+	movdqa	%xmm4,192-256(%rcx)
+	pshufd	$0xaa,%xmm7,%xmm6
+	movdqa	%xmm5,208-256(%rcx)
+	pshufd	$0xff,%xmm7,%xmm7
+	movdqa	%xmm6,224-256(%rcx)
+	movdqa	%xmm7,240-256(%rcx)
+
+	pshufd	$0x00,%xmm3,%xmm0
+	pshufd	$0x55,%xmm3,%xmm1
+	paddd	.Linc(%rip),%xmm0
+	pshufd	$0xaa,%xmm3,%xmm2
+	movdqa	%xmm1,272-256(%rcx)
+	pshufd	$0xff,%xmm3,%xmm3
+	movdqa	%xmm2,288-256(%rcx)
+	movdqa	%xmm3,304-256(%rcx)
+
+	jmp	.Loop_enter4x
+
+.align	32
+.Loop_outer4x:
+	/* next four blocks: reload the broadcast state, add 4 to each counter */
+	movdqa	64(%rsp),%xmm8
+	movdqa	80(%rsp),%xmm9
+	movdqa	96(%rsp),%xmm10
+	movdqa	112(%rsp),%xmm11
+	movdqa	128-256(%rcx),%xmm12
+	movdqa	144-256(%rcx),%xmm13
+	movdqa	160-256(%rcx),%xmm14
+	movdqa	176-256(%rcx),%xmm15
+	movdqa	192-256(%rcx),%xmm4
+	movdqa	208-256(%rcx),%xmm5
+	movdqa	224-256(%rcx),%xmm6
+	movdqa	240-256(%rcx),%xmm7
+	movdqa	256-256(%rcx),%xmm0
+	movdqa	272-256(%rcx),%xmm1
+	movdqa	288-256(%rcx),%xmm2
+	movdqa	304-256(%rcx),%xmm3
+	paddd	.Lfour(%rip),%xmm0
+
+.Loop_enter4x:
+	/*
+	 * Sixteen state vectors do not fit next to the scratch registers, so
+	 * %xmm6/%xmm7 are spilled to 32/48(%rsp) and become temporaries;
+	 * %xmm7 starts out as the .Lrot16 mask.  The current counter vector
+	 * is saved for the final addition and for the next outer iteration.
+	 */
+	movdqa	%xmm6,32(%rsp)
+	movdqa	%xmm7,48(%rsp)
+	movdqa	(%r9),%xmm7
+	movl	$10,%eax
+	movdqa	%xmm0,256-256(%rcx)
+	jmp	.Loop4x
+
+.align	32
+.Loop4x:
+	/* column quarter-rounds, two at a time, on (8,12,4,0) and (9,13,5,1) */
+	paddd	%xmm12,%xmm8
+	paddd	%xmm13,%xmm9
+	pxor	%xmm8,%xmm0
+	pxor	%xmm9,%xmm1
+	pshufb	%xmm7,%xmm0
+	pshufb	%xmm7,%xmm1
+	paddd	%xmm0,%xmm4
+	paddd	%xmm1,%xmm5
+	pxor	%xmm4,%xmm12
+	pxor	%xmm5,%xmm13
+	movdqa	%xmm12,%xmm6
+	pslld	$12,%xmm12
+	psrld	$20,%xmm6
+	movdqa	%xmm13,%xmm7
+	pslld	$12,%xmm13
+	por	%xmm6,%xmm12
+	psrld	$20,%xmm7
+	movdqa	(%r11),%xmm6
+	por	%xmm7,%xmm13
+	paddd	%xmm12,%xmm8
+	paddd	%xmm13,%xmm9
+	pxor	%xmm8,%xmm0
+	pxor	%xmm9,%xmm1
+	pshufb	%xmm6,%xmm0
+	pshufb	%xmm6,%xmm1
+	paddd	%xmm0,%xmm4
+	paddd	%xmm1,%xmm5
+	pxor	%xmm4,%xmm12
+	pxor	%xmm5,%xmm13
+	movdqa	%xmm12,%xmm7
+	pslld	$7,%xmm12
+	psrld	$25,%xmm7
+	movdqa	%xmm13,%xmm6
+	pslld	$7,%xmm13
+	por	%xmm7,%xmm12
+	psrld	$25,%xmm6
+	movdqa	(%r9),%xmm7
+	por	%xmm6,%xmm13
+	/* swap which pair of third-row vectors lives in %xmm4/%xmm5 */
+	movdqa	%xmm4,0(%rsp)
+	movdqa	%xmm5,16(%rsp)
+	movdqa	32(%rsp),%xmm4
+	movdqa	48(%rsp),%xmm5
+	/* remaining two column quarter-rounds: (10,14,4,2) and (11,15,5,3) */
+	paddd	%xmm14,%xmm10
+	paddd	%xmm15,%xmm11
+	pxor	%xmm10,%xmm2
+	pxor	%xmm11,%xmm3
+	pshufb	%xmm7,%xmm2
+	pshufb	%xmm7,%xmm3
+	paddd	%xmm2,%xmm4
+	paddd	%xmm3,%xmm5
+	pxor	%xmm4,%xmm14
+	pxor	%xmm5,%xmm15
+	movdqa	%xmm14,%xmm6
+	pslld	$12,%xmm14
+	psrld	$20,%xmm6
+	movdqa	%xmm15,%xmm7
+	pslld	$12,%xmm15
+	por	%xmm6,%xmm14
+	psrld	$20,%xmm7
+	movdqa	(%r11),%xmm6
+	por	%xmm7,%xmm15
+	paddd	%xmm14,%xmm10
+	paddd	%xmm15,%xmm11
+	pxor	%xmm10,%xmm2
+	pxor	%xmm11,%xmm3
+	pshufb	%xmm6,%xmm2
+	pshufb	%xmm6,%xmm3
+	paddd	%xmm2,%xmm4
+	paddd	%xmm3,%xmm5
+	pxor	%xmm4,%xmm14
+	pxor	%xmm5,%xmm15
+	movdqa	%xmm14,%xmm7
+	pslld	$7,%xmm14
+	psrld	$25,%xmm7
+	movdqa	%xmm15,%xmm6
+	pslld	$7,%xmm15
+	por	%xmm7,%xmm14
+	psrld	$25,%xmm6
+	movdqa	(%r9),%xmm7
+	por	%xmm6,%xmm15
+	/* diagonal quarter-rounds: (8,13,4,3) and (9,14,5,0) */
+	paddd	%xmm13,%xmm8
+	paddd	%xmm14,%xmm9
+	pxor	%xmm8,%xmm3
+	pxor	%xmm9,%xmm0
+	pshufb	%xmm7,%xmm3
+	pshufb	%xmm7,%xmm0
+	paddd	%xmm3,%xmm4
+	paddd	%xmm0,%xmm5
+	pxor	%xmm4,%xmm13
+	pxor	%xmm5,%xmm14
+	movdqa	%xmm13,%xmm6
+	pslld	$12,%xmm13
+	psrld	$20,%xmm6
+	movdqa	%xmm14,%xmm7
+	pslld	$12,%xmm14
+	por	%xmm6,%xmm13
+	psrld	$20,%xmm7
+	movdqa	(%r11),%xmm6
+	por	%xmm7,%xmm14
+	paddd	%xmm13,%xmm8
+	paddd	%xmm14,%xmm9
+	pxor	%xmm8,%xmm3
+	pxor	%xmm9,%xmm0
+	pshufb	%xmm6,%xmm3
+	pshufb	%xmm6,%xmm0
+	paddd	%xmm3,%xmm4
+	paddd	%xmm0,%xmm5
+	pxor	%xmm4,%xmm13
+	pxor	%xmm5,%xmm14
+	movdqa	%xmm13,%xmm7
+	pslld	$7,%xmm13
+	psrld	$25,%xmm7
+	movdqa	%xmm14,%xmm6
+	pslld	$7,%xmm14
+	por	%xmm7,%xmm13
+	psrld	$25,%xmm6
+	movdqa	(%r9),%xmm7
+	por	%xmm6,%xmm14
+	/* swap the third-row pairs back */
+	movdqa	%xmm4,32(%rsp)
+	movdqa	%xmm5,48(%rsp)
+	movdqa	0(%rsp),%xmm4
+	movdqa	16(%rsp),%xmm5
+	/* remaining diagonal quarter-rounds: (10,15,4,1) and (11,12,5,2) */
+	paddd	%xmm15,%xmm10
+	paddd	%xmm12,%xmm11
+	pxor	%xmm10,%xmm1
+	pxor	%xmm11,%xmm2
+	pshufb	%xmm7,%xmm1
+	pshufb	%xmm7,%xmm2
+	paddd	%xmm1,%xmm4
+	paddd	%xmm2,%xmm5
+	pxor	%xmm4,%xmm15
+	pxor	%xmm5,%xmm12
+	movdqa	%xmm15,%xmm6
+	pslld	$12,%xmm15
+	psrld	$20,%xmm6
+	movdqa	%xmm12,%xmm7
+	pslld	$12,%xmm12
+	por	%xmm6,%xmm15
+	psrld	$20,%xmm7
+	movdqa	(%r11),%xmm6
+	por	%xmm7,%xmm12
+	paddd	%xmm15,%xmm10
+	paddd	%xmm12,%xmm11
+	pxor	%xmm10,%xmm1
+	pxor	%xmm11,%xmm2
+	pshufb	%xmm6,%xmm1
+	pshufb	%xmm6,%xmm2
+	paddd	%xmm1,%xmm4
+	paddd	%xmm2,%xmm5
+	pxor	%xmm4,%xmm15
+	pxor	%xmm5,%xmm12
+	movdqa	%xmm15,%xmm7
+	pslld	$7,%xmm15
+	psrld	$25,%xmm7
+	movdqa	%xmm12,%xmm6
+	pslld	$7,%xmm12
+	por	%xmm7,%xmm15
+	psrld	$25,%xmm6
+	movdqa	(%r9),%xmm7
+	por	%xmm6,%xmm12
+	decl	%eax
+	jnz	.Loop4x
+
+	/*
+	 * Add the saved input words, then transpose each group of four
+	 * vectors with punpck* so that a register holds 16 consecutive
+	 * keystream bytes of a single block.  Resulting layout:
+	 *   block 0: 0(%rsp),  %xmm12, %xmm4,  %xmm0
+	 *   block 1: 16(%rsp), %xmm13, %xmm5,  %xmm1
+	 *   block 2: 32(%rsp), %xmm10, %xmm14, %xmm8
+	 *   block 3: 48(%rsp), %xmm15, %xmm9,  %xmm3
+	 */
+	paddd	64(%rsp),%xmm8
+	paddd	80(%rsp),%xmm9
+	paddd	96(%rsp),%xmm10
+	paddd	112(%rsp),%xmm11
+
+	movdqa	%xmm8,%xmm6
+	punpckldq	%xmm9,%xmm8
+	movdqa	%xmm10,%xmm7
+	punpckldq	%xmm11,%xmm10
+	punpckhdq	%xmm9,%xmm6
+	punpckhdq	%xmm11,%xmm7
+	movdqa	%xmm8,%xmm9
+	punpcklqdq	%xmm10,%xmm8
+	movdqa	%xmm6,%xmm11
+	punpcklqdq	%xmm7,%xmm6
+	punpckhqdq	%xmm10,%xmm9
+	punpckhqdq	%xmm7,%xmm11
+	paddd	128-256(%rcx),%xmm12
+	paddd	144-256(%rcx),%xmm13
+	paddd	160-256(%rcx),%xmm14
+	paddd	176-256(%rcx),%xmm15
+
+	movdqa	%xmm8,0(%rsp)
+	movdqa	%xmm9,16(%rsp)
+	movdqa	32(%rsp),%xmm8
+	movdqa	48(%rsp),%xmm9
+
+	movdqa	%xmm12,%xmm10
+	punpckldq	%xmm13,%xmm12
+	movdqa	%xmm14,%xmm7
+	punpckldq	%xmm15,%xmm14
+	punpckhdq	%xmm13,%xmm10
+	punpckhdq	%xmm15,%xmm7
+	movdqa	%xmm12,%xmm13
+	punpcklqdq	%xmm14,%xmm12
+	movdqa	%xmm10,%xmm15
+	punpcklqdq	%xmm7,%xmm10
+	punpckhqdq	%xmm14,%xmm13
+	punpckhqdq	%xmm7,%xmm15
+	paddd	192-256(%rcx),%xmm4
+	paddd	208-256(%rcx),%xmm5
+	paddd	224-256(%rcx),%xmm8
+	paddd	240-256(%rcx),%xmm9
+
+	movdqa	%xmm6,32(%rsp)
+	movdqa	%xmm11,48(%rsp)
+
+	movdqa	%xmm4,%xmm14
+	punpckldq	%xmm5,%xmm4
+	movdqa	%xmm8,%xmm7
+	punpckldq	%xmm9,%xmm8
+	punpckhdq	%xmm5,%xmm14
+	punpckhdq	%xmm9,%xmm7
+	movdqa	%xmm4,%xmm5
+	punpcklqdq	%xmm8,%xmm4
+	movdqa	%xmm14,%xmm9
+	punpcklqdq	%xmm7,%xmm14
+	punpckhqdq	%xmm8,%xmm5
+	punpckhqdq	%xmm7,%xmm9
+	paddd	256-256(%rcx),%xmm0
+	paddd	272-256(%rcx),%xmm1
+	paddd	288-256(%rcx),%xmm2
+	paddd	304-256(%rcx),%xmm3
+
+	movdqa	%xmm0,%xmm8
+	punpckldq	%xmm1,%xmm0
+	movdqa	%xmm2,%xmm7
+	punpckldq	%xmm3,%xmm2
+	punpckhdq	%xmm1,%xmm8
+	punpckhdq	%xmm3,%xmm7
+	movdqa	%xmm0,%xmm1
+	punpcklqdq	%xmm2,%xmm0
+	movdqa	%xmm8,%xmm3
+	punpcklqdq	%xmm7,%xmm8
+	punpckhqdq	%xmm2,%xmm1
+	punpckhqdq	%xmm7,%xmm3
+	cmpq	$256,%rdx
+	jb	.Ltail4x
+
+	/* at least 256 bytes left: XOR all four blocks, 64 bytes at a time */
+	movdqu	0(%rsi),%xmm6
+	movdqu	16(%rsi),%xmm11
+	movdqu	32(%rsi),%xmm2
+	movdqu	48(%rsi),%xmm7
+	pxor	0(%rsp),%xmm6
+	pxor	%xmm12,%xmm11
+	pxor	%xmm4,%xmm2
+	pxor	%xmm0,%xmm7
+
+	movdqu	%xmm6,0(%rdi)
+	movdqu	64(%rsi),%xmm6
+	movdqu	%xmm11,16(%rdi)
+	movdqu	80(%rsi),%xmm11
+	movdqu	%xmm2,32(%rdi)
+	movdqu	96(%rsi),%xmm2
+	movdqu	%xmm7,48(%rdi)
+	movdqu	112(%rsi),%xmm7
+	leaq	128(%rsi),%rsi
+	pxor	16(%rsp),%xmm6
+	pxor	%xmm13,%xmm11
+	pxor	%xmm5,%xmm2
+	pxor	%xmm1,%xmm7
+
+	movdqu	%xmm6,64(%rdi)
+	movdqu	0(%rsi),%xmm6
+	movdqu	%xmm11,80(%rdi)
+	movdqu	16(%rsi),%xmm11
+	movdqu	%xmm2,96(%rdi)
+	movdqu	32(%rsi),%xmm2
+	movdqu	%xmm7,112(%rdi)
+	leaq	128(%rdi),%rdi
+	movdqu	48(%rsi),%xmm7
+	pxor	32(%rsp),%xmm6
+	pxor	%xmm10,%xmm11
+	pxor	%xmm14,%xmm2
+	pxor	%xmm8,%xmm7
+
+	movdqu	%xmm6,0(%rdi)
+	movdqu	64(%rsi),%xmm6
+	movdqu	%xmm11,16(%rdi)
+	movdqu	80(%rsi),%xmm11
+	movdqu	%xmm2,32(%rdi)
+	movdqu	96(%rsi),%xmm2
+	movdqu	%xmm7,48(%rdi)
+	movdqu	112(%rsi),%xmm7
+	leaq	128(%rsi),%rsi
+	pxor	48(%rsp),%xmm6
+	pxor	%xmm15,%xmm11
+	pxor	%xmm9,%xmm2
+	pxor	%xmm3,%xmm7
+	movdqu	%xmm6,64(%rdi)
+	movdqu	%xmm11,80(%rdi)
+	movdqu	%xmm2,96(%rdi)
+	movdqu	%xmm7,112(%rdi)
+	leaq	128(%rdi),%rdi
+
+	subq	$256,%rdx
+	jnz	.Loop_outer4x
+
+	jmp	.Ldone4x
+
+.Ltail4x:
+	/* fewer than 256 bytes left: pick the branch by whole blocks remaining */
+	cmpq	$192,%rdx
+	jae	.L192_or_more4x
+	cmpq	$128,%rdx
+	jae	.L128_or_more4x
+	cmpq	$64,%rdx
+	jae	.L64_or_more4x
+
+
+	xorq	%r9,%r9
+
+	/* under 64 bytes: complete block 0 at 0..63(%rsp) for the byte loop */
+	movdqa	%xmm12,16(%rsp)
+	movdqa	%xmm4,32(%rsp)
+	movdqa	%xmm0,48(%rsp)
+	jmp	.Loop_tail4x
+
+.align	32
+.L64_or_more4x:
+	movdqu	0(%rsi),%xmm6
+	movdqu	16(%rsi),%xmm11
+	movdqu	32(%rsi),%xmm2
+	movdqu	48(%rsi),%xmm7
+	pxor	0(%rsp),%xmm6
+	pxor	%xmm12,%xmm11
+	pxor	%xmm4,%xmm2
+	pxor	%xmm0,%xmm7
+	movdqu	%xmm6,0(%rdi)
+	movdqu	%xmm11,16(%rdi)
+	movdqu	%xmm2,32(%rdi)
+	movdqu	%xmm7,48(%rdi)
+	/* flags are still those of cmpq $64: equal means nothing is left */
+	je	.Ldone4x
+
+	/* move block 1 to 0..63(%rsp) for the byte loop */
+	movdqa	16(%rsp),%xmm6
+	leaq	64(%rsi),%rsi
+	xorq	%r9,%r9
+	movdqa	%xmm6,0(%rsp)
+	movdqa	%xmm13,16(%rsp)
+	leaq	64(%rdi),%rdi
+	movdqa	%xmm5,32(%rsp)
+	subq	$64,%rdx
+	movdqa	%xmm1,48(%rsp)
+	jmp	.Loop_tail4x
+
+.align	32
+.L128_or_more4x:
+	movdqu	0(%rsi),%xmm6
+	movdqu	16(%rsi),%xmm11
+	movdqu	32(%rsi),%xmm2
+	movdqu	48(%rsi),%xmm7
+	pxor	0(%rsp),%xmm6
+	pxor	%xmm12,%xmm11
+	pxor	%xmm4,%xmm2
+	pxor	%xmm0,%xmm7
+
+	movdqu	%xmm6,0(%rdi)
+	movdqu	64(%rsi),%xmm6
+	movdqu	%xmm11,16(%rdi)
+	movdqu	80(%rsi),%xmm11
+	movdqu	%xmm2,32(%rdi)
+	movdqu	96(%rsi),%xmm2
+	movdqu	%xmm7,48(%rdi)
+	movdqu	112(%rsi),%xmm7
+	pxor	16(%rsp),%xmm6
+	pxor	%xmm13,%xmm11
+	pxor	%xmm5,%xmm2
+	pxor	%xmm1,%xmm7
+	movdqu	%xmm6,64(%rdi)
+	movdqu	%xmm11,80(%rdi)
+	movdqu	%xmm2,96(%rdi)
+	movdqu	%xmm7,112(%rdi)
+	/* flags are still those of cmpq $128: equal means nothing is left */
+	je	.Ldone4x
+
+	/* move block 2 to 0..63(%rsp) for the byte loop */
+	movdqa	32(%rsp),%xmm6
+	leaq	128(%rsi),%rsi
+	xorq	%r9,%r9
+	movdqa	%xmm6,0(%rsp)
+	movdqa	%xmm10,16(%rsp)
+	leaq	128(%rdi),%rdi
+	movdqa	%xmm14,32(%rsp)
+	subq	$128,%rdx
+	movdqa	%xmm8,48(%rsp)
+	jmp	.Loop_tail4x
+
+.align	32
+.L192_or_more4x:
+	movdqu	0(%rsi),%xmm6
+	movdqu	16(%rsi),%xmm11
+	movdqu	32(%rsi),%xmm2
+	movdqu	48(%rsi),%xmm7
+	pxor	0(%rsp),%xmm6
+	pxor	%xmm12,%xmm11
+	pxor	%xmm4,%xmm2
+	pxor	%xmm0,%xmm7
+
+	movdqu	%xmm6,0(%rdi)
+	movdqu	64(%rsi),%xmm6
+	movdqu	%xmm11,16(%rdi)
+	movdqu	80(%rsi),%xmm11
+	movdqu	%xmm2,32(%rdi)
+	movdqu	96(%rsi),%xmm2
+	movdqu	%xmm7,48(%rdi)
+	movdqu	112(%rsi),%xmm7
+	leaq	128(%rsi),%rsi
+	pxor	16(%rsp),%xmm6
+	pxor	%xmm13,%xmm11
+	pxor	%xmm5,%xmm2
+	pxor	%xmm1,%xmm7
+
+	movdqu	%xmm6,64(%rdi)
+	movdqu	0(%rsi),%xmm6
+	movdqu	%xmm11,80(%rdi)
+	movdqu	16(%rsi),%xmm11
+	movdqu	%xmm2,96(%rdi)
+	movdqu	32(%rsi),%xmm2
+	movdqu	%xmm7,112(%rdi)
+	leaq	128(%rdi),%rdi
+	movdqu	48(%rsi),%xmm7
+	pxor	32(%rsp),%xmm6
+	pxor	%xmm10,%xmm11
+	pxor	%xmm14,%xmm2
+	pxor	%xmm8,%xmm7
+	movdqu	%xmm6,0(%rdi)
+	movdqu	%xmm11,16(%rdi)
+	movdqu	%xmm2,32(%rdi)
+	movdqu	%xmm7,48(%rdi)
+	/* flags are still those of cmpq $192: equal means nothing is left */
+	je	.Ldone4x
+
+	/* pointers were already advanced by 128; move block 3 into place */
+	movdqa	48(%rsp),%xmm6
+	leaq	64(%rsi),%rsi
+	xorq	%r9,%r9
+	movdqa	%xmm6,0(%rsp)
+	movdqa	%xmm15,16(%rsp)
+	leaq	64(%rdi),%rdi
+	movdqa	%xmm9,32(%rsp)
+	subq	$192,%rdx
+	movdqa	%xmm3,48(%rsp)
+
+.Loop_tail4x:
+	/* out[i] = in[i] ^ keystream[i] for the remaining 1..63 bytes */
+	movzbl	(%rsi,%r9,1),%eax
+	movzbl	(%rsp,%r9,1),%ecx
+	leaq	1(%r9),%r9
+	xorl	%ecx,%eax
+	movb	%al,-1(%rdi,%r9,1)
+	decq	%rdx
+	jnz	.Loop_tail4x
+
+.Ldone4x:
+	/* restore the entry %rsp */
+	leaq	-8(%r10),%rsp
+
+.L4x_epilogue:
+	ret
+ENDPROC(chacha20_ssse3)
+#endif /* CONFIG_AS_SSSE3 */
+
+#ifdef CONFIG_AS_AVX2
+.align	32
+ENTRY(chacha20_avx2)
+.Lchacha20_avx2:
+	cmpq	$0,%rdx
+	je	.L8x_epilogue
+	leaq	8(%rsp),%r10
+
+	subq	$0x280+8,%rsp
+	andq	$-32,%rsp
+	vzeroupper
+
+	vbroadcasti128	.Lsigma(%rip),%ymm11
+	vbroadcasti128	(%rcx),%ymm3
+	vbroadcasti128	16(%rcx),%ymm15
+	vbroadcasti128	(%r8),%ymm7
+	leaq	256(%rsp),%rcx
+	leaq	512(%rsp),%rax
+	leaq	.Lrot16(%rip),%r9
+	leaq	.Lrot24(%rip),%r11
+
+	vpshufd	$0x00,%ymm11,%ymm8
+	vpshufd	$0x55,%ymm11,%ymm9
+	vmovdqa	%ymm8,128-256(%rcx)
+	vpshufd	$0xaa,%ymm11,%ymm10
+	vmovdqa	%ymm9,160-256(%rcx)
+	vpshufd	$0xff,%ymm11,%ymm11
+	vmovdqa	%ymm10,192-256(%rcx)
+	vmovdqa	%ymm11,224-256(%rcx)
+
+	vpshufd	$0x00,%ymm3,%ymm0
+	vpshufd	$0x55,%ymm3,%ymm1
+	vmovdqa	%ymm0,256-256(%rcx)
+	vpshufd	$0xaa,%ymm3,%ymm2
+	vmovdqa	%ymm1,288-256(%rcx)
+	vpshufd	$0xff,%ymm3,%ymm3
+	vmovdqa	%ymm2,320-256(%rcx)
+	vmovdqa	%ymm3,352-256(%rcx)
+
+	vpshufd	$0x00,%ymm15,%ymm12
+	vpshufd	$0x55,%ymm15,%ymm13
+	vmovdqa	%ymm12,384-512(%rax)
+	vpshufd	$0xaa,%ymm15,%ymm14
+	vmovdqa	%ymm13,416-512(%rax)
+	vpshufd	$0xff,%ymm15,%ymm15
+	vmovdqa	%ymm14,448-512(%rax)
+	vmovdqa	%ymm15,480-512(%rax)
+
+	vpshufd	$0x00,%ymm7,%ymm4
+	vpshufd	$0x55,%ymm7,%ymm5
+	vpaddd	.Lincy(%rip),%ymm4,%ymm4
+	vpshufd	$0xaa,%ymm7,%ymm6
+	vmovdqa	%ymm5,544-512(%rax)
+	vpshufd	$0xff,%ymm7,%ymm7
+	vmovdqa	%ymm6,576-512(%rax)
+	vmovdqa	%ymm7,608-512(%rax)
+
+	jmp	.Loop_enter8x
+
+.align	32
+.Loop_outer8x:
+	vmovdqa	128-256(%rcx),%ymm8
+	vmovdqa	160-256(%rcx),%ymm9
+	vmovdqa	192-256(%rcx),%ymm10
+	vmovdqa	224-256(%rcx),%ymm11
+	vmovdqa	256-256(%rcx),%ymm0
+	vmovdqa	288-256(%rcx),%ymm1
+	vmovdqa	320-256(%rcx),%ymm2
+	vmovdqa	352-256(%rcx),%ymm3
+	vmovdqa	384-512(%rax),%ymm12
+	vmovdqa	416-512(%rax),%ymm13
+	vmovdqa	448-512(%rax),%ymm14
+	vmovdqa	480-512(%rax),%ymm15
+	vmovdqa	512-512(%rax),%ymm4
+	vmovdqa	544-512(%rax),%ymm5
+	vmovdqa	576-512(%rax),%ymm6
+	vmovdqa	608-512(%rax),%ymm7
+	vpaddd	.Leight(%rip),%ymm4,%ymm4
+
+.Loop_enter8x:
+	vmovdqa	%ymm14,64(%rsp)
+	vmovdqa	%ymm15,96(%rsp)
+	vbroadcasti128	(%r9),%ymm15
+	vmovdqa	%ymm4,512-512(%rax)
+	movl	$10,%eax
+	jmp	.Loop8x
+
+.align	32
+.Loop8x:
+	vpaddd	%ymm0,%ymm8,%ymm8
+	vpxor	%ymm4,%ymm8,%ymm4
+	vpshufb	%ymm15,%ymm4,%ymm4
+	vpaddd	%ymm1,%ymm9,%ymm9
+	vpxor	%ymm5,%ymm9,%ymm5
+	vpshufb	%ymm15,%ymm5,%ymm5
+	vpaddd	%ymm4,%ymm12,%ymm12
+	vpxor	%ymm0,%ymm12,%ymm0
+	vpslld	$12,%ymm0,%ymm14
+	vpsrld	$20,%ymm0,%ymm0
+	vpor	%ymm0,%ymm14,%ymm0
+	vbroadcasti128	(%r11),%ymm14
+	vpaddd	%ymm5,%ymm13,%ymm13
+	vpxor	%ymm1,%ymm13,%ymm1
+	vpslld	$12,%ymm1,%ymm15
+	vpsrld	$20,%ymm1,%ymm1
+	vpor	%ymm1,%ymm15,%ymm1
+	vpaddd	%ymm0,%ymm8,%ymm8
+	vpxor	%ymm4,%ymm8,%ymm4
+	vpshufb	%ymm14,%ymm4,%ymm4
+	vpaddd	%ymm1,%ymm9,%ymm9
+	vpxor	%ymm5,%ymm9,%ymm5
+	vpshufb	%ymm14,%ymm5,%ymm5
+	vpaddd	%ymm4,%ymm12,%ymm12
+	vpxor	%ymm0,%ymm12,%ymm0
+	vpslld	$7,%ymm0,%ymm15
+	vpsrld	$25,%ymm0,%ymm0
+	vpor	%ymm0,%ymm15,%ymm0
+	vbroadcasti128	(%r9),%ymm15
+	vpaddd	%ymm5,%ymm13,%ymm13
+	vpxor	%ymm1,%ymm13,%ymm1
+	vpslld	$7,%ymm1,%ymm14
+	vpsrld	$25,%ymm1,%ymm1
+	vpor	%ymm1,%ymm14,%ymm1
+	vmovdqa	%ymm12,0(%rsp)
+	vmovdqa	%ymm13,32(%rsp)
+	vmovdqa	64(%rsp),%ymm12
+	vmovdqa	96(%rsp),%ymm13
+	vpaddd	%ymm2,%ymm10,%ymm10
+	vpxor	%ymm6,%ymm10,%ymm6
+	vpshufb	%ymm15,%ymm6,%ymm6
+	vpaddd	%ymm3,%ymm11,%ymm11
+	vpxor	%ymm7,%ymm11,%ymm7
+	vpshufb	%ymm15,%ymm7,%ymm7
+	vpaddd	%ymm6,%ymm12,%ymm12
+	vpxor	%ymm2,%ymm12,%ymm2
+	vpslld	$12,%ymm2,%ymm14
+	vpsrld	$20,%ymm2,%ymm2
+	vpor	%ymm2,%ymm14,%ymm2
+	vbroadcasti128	(%r11),%ymm14
+	vpaddd	%ymm7,%ymm13,%ymm13
+	vpxor	%ymm3,%ymm13,%ymm3
+	vpslld	$12,%ymm3,%ymm15
+	vpsrld	$20,%ymm3,%ymm3
+	vpor	%ymm3,%ymm15,%ymm3
+	vpaddd	%ymm2,%ymm10,%ymm10
+	vpxor	%ymm6,%ymm10,%ymm6
+	vpshufb	%ymm14,%ymm6,%ymm6
+	vpaddd	%ymm3,%ymm11,%ymm11
+	vpxor	%ymm7,%ymm11,%ymm7
+	vpshufb	%ymm14,%ymm7,%ymm7
+	vpaddd	%ymm6,%ymm12,%ymm12
+	vpxor	%ymm2,%ymm12,%ymm2
+	vpslld	$7,%ymm2,%ymm15
+	vpsrld	$25,%ymm2,%ymm2
+	vpor	%ymm2,%ymm15,%ymm2
+	vbroadcasti128	(%r9),%ymm15
+	vpaddd	%ymm7,%ymm13,%ymm13
+	vpxor	%ymm3,%ymm13,%ymm3
+	vpslld	$7,%ymm3,%ymm14
+	vpsrld	$25,%ymm3,%ymm3
+	vpor	%ymm3,%ymm14,%ymm3
+	vpaddd	%ymm1,%ymm8,%ymm8
+	vpxor	%ymm7,%ymm8,%ymm7
+	vpshufb	%ymm15,%ymm7,%ymm7
+	vpaddd	%ymm2,%ymm9,%ymm9
+	vpxor	%ymm4,%ymm9,%ymm4
+	vpshufb	%ymm15,%ymm4,%ymm4
+	vpaddd	%ymm7,%ymm12,%ymm12
+	vpxor	%ymm1,%ymm12,%ymm1
+	vpslld	$12,%ymm1,%ymm14
+	vpsrld	$20,%ymm1,%ymm1
+	vpor	%ymm1,%ymm14,%ymm1
+	vbroadcasti128	(%r11),%ymm14
+	vpaddd	%ymm4,%ymm13,%ymm13
+	vpxor	%ymm2,%ymm13,%ymm2
+	vpslld	$12,%ymm2,%ymm15
+	vpsrld	$20,%ymm2,%ymm2
+	vpor	%ymm2,%ymm15,%ymm2
+	vpaddd	%ymm1,%ymm8,%ymm8
+	vpxor	%ymm7,%ymm8,%ymm7
+	vpshufb	%ymm14,%ymm7,%ymm7
+	vpaddd	%ymm2,%ymm9,%ymm9
+	vpxor	%ymm4,%ymm9,%ymm4
+	vpshufb	%ymm14,%ymm4,%ymm4
+	vpaddd	%ymm7,%ymm12,%ymm12
+	vpxor	%ymm1,%ymm12,%ymm1
+	vpslld	$7,%ymm1,%ymm15
+	vpsrld	$25,%ymm1,%ymm1
+	vpor	%ymm1,%ymm15,%ymm1
+	vbroadcasti128	(%r9),%ymm15
+	vpaddd	%ymm4,%ymm13,%ymm13
+	vpxor	%ymm2,%ymm13,%ymm2
+	vpslld	$7,%ymm2,%ymm14
+	vpsrld	$25,%ymm2,%ymm2
+	vpor	%ymm2,%ymm14,%ymm2
+	vmovdqa	%ymm12,64(%rsp)
+	vmovdqa	%ymm13,96(%rsp)
+	vmovdqa	0(%rsp),%ymm12
+	vmovdqa	32(%rsp),%ymm13
+	vpaddd	%ymm3,%ymm10,%ymm10
+	vpxor	%ymm5,%ymm10,%ymm5
+	vpshufb	%ymm15,%ymm5,%ymm5
+	vpaddd	%ymm0,%ymm11,%ymm11
+	vpxor	%ymm6,%ymm11,%ymm6
+	vpshufb	%ymm15,%ymm6,%ymm6
+	vpaddd	%ymm5,%ymm12,%ymm12
+	vpxor	%ymm3,%ymm12,%ymm3
+	vpslld	$12,%ymm3,%ymm14
+	vpsrld	$20,%ymm3,%ymm3
+	vpor	%ymm3,%ymm14,%ymm3
+	vbroadcasti128	(%r11),%ymm14
+	vpaddd	%ymm6,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm13,%ymm0
+	vpslld	$12,%ymm0,%ymm15
+	vpsrld	$20,%ymm0,%ymm0
+	vpor	%ymm0,%ymm15,%ymm0
+	vpaddd	%ymm3,%ymm10,%ymm10
+	vpxor	%ymm5,%ymm10,%ymm5
+	vpshufb	%ymm14,%ymm5,%ymm5
+	vpaddd	%ymm0,%ymm11,%ymm11
+	vpxor	%ymm6,%ymm11,%ymm6
+	vpshufb	%ymm14,%ymm6,%ymm6
+	vpaddd	%ymm5,%ymm12,%ymm12
+	vpxor	%ymm3,%ymm12,%ymm3
+	vpslld	$7,%ymm3,%ymm15
+	vpsrld	$25,%ymm3,%ymm3
+	vpor	%ymm3,%ymm15,%ymm3
+	vbroadcasti128	(%r9),%ymm15
+	vpaddd	%ymm6,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm13,%ymm0
+	vpslld	$7,%ymm0,%ymm14
+	vpsrld	$25,%ymm0,%ymm0
+	vpor	%ymm0,%ymm14,%ymm0
+	decl	%eax
+	jnz	.Loop8x
+
+	leaq	512(%rsp),%rax
+	vpaddd	128-256(%rcx),%ymm8,%ymm8
+	vpaddd	160-256(%rcx),%ymm9,%ymm9
+	vpaddd	192-256(%rcx),%ymm10,%ymm10
+	vpaddd	224-256(%rcx),%ymm11,%ymm11
+
+	vpunpckldq	%ymm9,%ymm8,%ymm14
+	vpunpckldq	%ymm11,%ymm10,%ymm15
+	vpunpckhdq	%ymm9,%ymm8,%ymm8
+	vpunpckhdq	%ymm11,%ymm10,%ymm10
+	vpunpcklqdq	%ymm15,%ymm14,%ymm9
+	vpunpckhqdq	%ymm15,%ymm14,%ymm14
+	vpunpcklqdq	%ymm10,%ymm8,%ymm11
+	vpunpckhqdq	%ymm10,%ymm8,%ymm8
+	vpaddd	256-256(%rcx),%ymm0,%ymm0
+	vpaddd	288-256(%rcx),%ymm1,%ymm1
+	vpaddd	320-256(%rcx),%ymm2,%ymm2
+	vpaddd	352-256(%rcx),%ymm3,%ymm3
+
+	vpunpckldq	%ymm1,%ymm0,%ymm10
+	vpunpckldq	%ymm3,%ymm2,%ymm15
+	vpunpckhdq	%ymm1,%ymm0,%ymm0
+	vpunpckhdq	%ymm3,%ymm2,%ymm2
+	vpunpcklqdq	%ymm15,%ymm10,%ymm1
+	vpunpckhqdq	%ymm15,%ymm10,%ymm10
+	vpunpcklqdq	%ymm2,%ymm0,%ymm3
+	vpunpckhqdq	%ymm2,%ymm0,%ymm0
+	vperm2i128	$0x20,%ymm1,%ymm9,%ymm15
+	vperm2i128	$0x31,%ymm1,%ymm9,%ymm1
+	vperm2i128	$0x20,%ymm10,%ymm14,%ymm9
+	vperm2i128	$0x31,%ymm10,%ymm14,%ymm10
+	vperm2i128	$0x20,%ymm3,%ymm11,%ymm14
+	vperm2i128	$0x31,%ymm3,%ymm11,%ymm3
+	vperm2i128	$0x20,%ymm0,%ymm8,%ymm11
+	vperm2i128	$0x31,%ymm0,%ymm8,%ymm0
+	vmovdqa	%ymm15,0(%rsp)
+	vmovdqa	%ymm9,32(%rsp)
+	vmovdqa	64(%rsp),%ymm15
+	vmovdqa	96(%rsp),%ymm9
+
+	vpaddd	384-512(%rax),%ymm12,%ymm12
+	vpaddd	416-512(%rax),%ymm13,%ymm13
+	vpaddd	448-512(%rax),%ymm15,%ymm15
+	vpaddd	480-512(%rax),%ymm9,%ymm9
+
+	vpunpckldq	%ymm13,%ymm12,%ymm2
+	vpunpckldq	%ymm9,%ymm15,%ymm8
+	vpunpckhdq	%ymm13,%ymm12,%ymm12
+	vpunpckhdq	%ymm9,%ymm15,%ymm15
+	vpunpcklqdq	%ymm8,%ymm2,%ymm13
+	vpunpckhqdq	%ymm8,%ymm2,%ymm2
+	vpunpcklqdq	%ymm15,%ymm12,%ymm9
+	vpunpckhqdq	%ymm15,%ymm12,%ymm12
+	vpaddd	512-512(%rax),%ymm4,%ymm4
+	vpaddd	544-512(%rax),%ymm5,%ymm5
+	vpaddd	576-512(%rax),%ymm6,%ymm6
+	vpaddd	608-512(%rax),%ymm7,%ymm7
+
+	vpunpckldq	%ymm5,%ymm4,%ymm15
+	vpunpckldq	%ymm7,%ymm6,%ymm8
+	vpunpckhdq	%ymm5,%ymm4,%ymm4
+	vpunpckhdq	%ymm7,%ymm6,%ymm6
+	vpunpcklqdq	%ymm8,%ymm15,%ymm5
+	vpunpckhqdq	%ymm8,%ymm15,%ymm15
+	vpunpcklqdq	%ymm6,%ymm4,%ymm7
+	vpunpckhqdq	%ymm6,%ymm4,%ymm4
+	vperm2i128	$0x20,%ymm5,%ymm13,%ymm8
+	vperm2i128	$0x31,%ymm5,%ymm13,%ymm5
+	vperm2i128	$0x20,%ymm15,%ymm2,%ymm13
+	vperm2i128	$0x31,%ymm15,%ymm2,%ymm15
+	vperm2i128	$0x20,%ymm7,%ymm9,%ymm2
+	vperm2i128	$0x31,%ymm7,%ymm9,%ymm7
+	vperm2i128	$0x20,%ymm4,%ymm12,%ymm9
+	vperm2i128	$0x31,%ymm4,%ymm12,%ymm4
+	vmovdqa	0(%rsp),%ymm6
+	vmovdqa	32(%rsp),%ymm12
+
+	cmpq	$512,%rdx
+	jb	.Ltail8x
+
+	vpxor	0(%rsi),%ymm6,%ymm6
+	vpxor	32(%rsi),%ymm8,%ymm8
+	vpxor	64(%rsi),%ymm1,%ymm1
+	vpxor	96(%rsi),%ymm5,%ymm5
+	leaq	128(%rsi),%rsi
+	vmovdqu	%ymm6,0(%rdi)
+	vmovdqu	%ymm8,32(%rdi)
+	vmovdqu	%ymm1,64(%rdi)
+	vmovdqu	%ymm5,96(%rdi)
+	leaq	128(%rdi),%rdi
+
+	vpxor	0(%rsi),%ymm12,%ymm12
+	vpxor	32(%rsi),%ymm13,%ymm13
+	vpxor	64(%rsi),%ymm10,%ymm10
+	vpxor	96(%rsi),%ymm15,%ymm15
+	leaq	128(%rsi),%rsi
+	vmovdqu	%ymm12,0(%rdi)
+	vmovdqu	%ymm13,32(%rdi)
+	vmovdqu	%ymm10,64(%rdi)
+	vmovdqu	%ymm15,96(%rdi)
+	leaq	128(%rdi),%rdi
+
+	vpxor	0(%rsi),%ymm14,%ymm14
+	vpxor	32(%rsi),%ymm2,%ymm2
+	vpxor	64(%rsi),%ymm3,%ymm3
+	vpxor	96(%rsi),%ymm7,%ymm7
+	leaq	128(%rsi),%rsi
+	vmovdqu	%ymm14,0(%rdi)
+	vmovdqu	%ymm2,32(%rdi)
+	vmovdqu	%ymm3,64(%rdi)
+	vmovdqu	%ymm7,96(%rdi)
+	leaq	128(%rdi),%rdi
+
+	vpxor	0(%rsi),%ymm11,%ymm11
+	vpxor	32(%rsi),%ymm9,%ymm9
+	vpxor	64(%rsi),%ymm0,%ymm0
+	vpxor	96(%rsi),%ymm4,%ymm4
+	leaq	128(%rsi),%rsi
+	vmovdqu	%ymm11,0(%rdi)
+	vmovdqu	%ymm9,32(%rdi)
+	vmovdqu	%ymm0,64(%rdi)
+	vmovdqu	%ymm4,96(%rdi)
+	leaq	128(%rdi),%rdi
+
+	subq	$512,%rdx
+	jnz	.Loop_outer8x
+
+	jmp	.Ldone8x
+
+.Ltail8x:
+	cmpq	$448,%rdx
+	jae	.L448_or_more8x
+	cmpq	$384,%rdx
+	jae	.L384_or_more8x
+	cmpq	$320,%rdx
+	jae	.L320_or_more8x
+	cmpq	$256,%rdx
+	jae	.L256_or_more8x
+	cmpq	$192,%rdx
+	jae	.L192_or_more8x
+	cmpq	$128,%rdx
+	jae	.L128_or_more8x
+	cmpq	$64,%rdx
+	jae	.L64_or_more8x
+
+	xorq	%r9,%r9
+	vmovdqa	%ymm6,0(%rsp)
+	vmovdqa	%ymm8,32(%rsp)
+	jmp	.Loop_tail8x
+
+.align	32
+.L64_or_more8x:
+	vpxor	0(%rsi),%ymm6,%ymm6
+	vpxor	32(%rsi),%ymm8,%ymm8
+	vmovdqu	%ymm6,0(%rdi)
+	vmovdqu	%ymm8,32(%rdi)
+	je	.Ldone8x
+
+	leaq	64(%rsi),%rsi
+	xorq	%r9,%r9
+	vmovdqa	%ymm1,0(%rsp)
+	leaq	64(%rdi),%rdi
+	subq	$64,%rdx
+	vmovdqa	%ymm5,32(%rsp)
+	jmp	.Loop_tail8x
+
+.align	32
+.L128_or_more8x:
+	vpxor	0(%rsi),%ymm6,%ymm6
+	vpxor	32(%rsi),%ymm8,%ymm8
+	vpxor	64(%rsi),%ymm1,%ymm1
+	vpxor	96(%rsi),%ymm5,%ymm5
+	vmovdqu	%ymm6,0(%rdi)
+	vmovdqu	%ymm8,32(%rdi)
+	vmovdqu	%ymm1,64(%rdi)
+	vmovdqu	%ymm5,96(%rdi)
+	je	.Ldone8x
+
+	leaq	128(%rsi),%rsi
+	xorq	%r9,%r9
+	vmovdqa	%ymm12,0(%rsp)
+	leaq	128(%rdi),%rdi
+	subq	$128,%rdx
+	vmovdqa	%ymm13,32(%rsp)
+	jmp	.Loop_tail8x
+
+.align	32
+.L192_or_more8x:
+	vpxor	0(%rsi),%ymm6,%ymm6
+	vpxor	32(%rsi),%ymm8,%ymm8
+	vpxor	64(%rsi),%ymm1,%ymm1
+	vpxor	96(%rsi),%ymm5,%ymm5
+	vpxor	128(%rsi),%ymm12,%ymm12
+	vpxor	160(%rsi),%ymm13,%ymm13
+	vmovdqu	%ymm6,0(%rdi)
+	vmovdqu	%ymm8,32(%rdi)
+	vmovdqu	%ymm1,64(%rdi)
+	vmovdqu	%ymm5,96(%rdi)
+	vmovdqu	%ymm12,128(%rdi)
+	vmovdqu	%ymm13,160(%rdi)
+	je	.Ldone8x
+
+	leaq	192(%rsi),%rsi
+	xorq	%r9,%r9
+	vmovdqa	%ymm10,0(%rsp)
+	leaq	192(%rdi),%rdi
+	subq	$192,%rdx
+	vmovdqa	%ymm15,32(%rsp)
+	jmp	.Loop_tail8x
+
+.align	32
+.L256_or_more8x:
+	vpxor	0(%rsi),%ymm6,%ymm6
+	vpxor	32(%rsi),%ymm8,%ymm8
+	vpxor	64(%rsi),%ymm1,%ymm1
+	vpxor	96(%rsi),%ymm5,%ymm5
+	vpxor	128(%rsi),%ymm12,%ymm12
+	vpxor	160(%rsi),%ymm13,%ymm13
+	vpxor	192(%rsi),%ymm10,%ymm10
+	vpxor	224(%rsi),%ymm15,%ymm15
+	vmovdqu	%ymm6,0(%rdi)
+	vmovdqu	%ymm8,32(%rdi)
+	vmovdqu	%ymm1,64(%rdi)
+	vmovdqu	%ymm5,96(%rdi)
+	vmovdqu	%ymm12,128(%rdi)
+	vmovdqu	%ymm13,160(%rdi)
+	vmovdqu	%ymm10,192(%rdi)
+	vmovdqu	%ymm15,224(%rdi)
+	je	.Ldone8x
+
+	leaq	256(%rsi),%rsi
+	xorq	%r9,%r9
+	vmovdqa	%ymm14,0(%rsp)
+	leaq	256(%rdi),%rdi
+	subq	$256,%rdx
+	vmovdqa	%ymm2,32(%rsp)
+	jmp	.Loop_tail8x
+
+.align	32
+.L320_or_more8x:
+	vpxor	0(%rsi),%ymm6,%ymm6
+	vpxor	32(%rsi),%ymm8,%ymm8
+	vpxor	64(%rsi),%ymm1,%ymm1
+	vpxor	96(%rsi),%ymm5,%ymm5
+	vpxor	128(%rsi),%ymm12,%ymm12
+	vpxor	160(%rsi),%ymm13,%ymm13
+	vpxor	192(%rsi),%ymm10,%ymm10
+	vpxor	224(%rsi),%ymm15,%ymm15
+	vpxor	256(%rsi),%ymm14,%ymm14
+	vpxor	288(%rsi),%ymm2,%ymm2
+	vmovdqu	%ymm6,0(%rdi)
+	vmovdqu	%ymm8,32(%rdi)
+	vmovdqu	%ymm1,64(%rdi)
+	vmovdqu	%ymm5,96(%rdi)
+	vmovdqu	%ymm12,128(%rdi)
+	vmovdqu	%ymm13,160(%rdi)
+	vmovdqu	%ymm10,192(%rdi)
+	vmovdqu	%ymm15,224(%rdi)
+	vmovdqu	%ymm14,256(%rdi)
+	vmovdqu	%ymm2,288(%rdi)
+	je	.Ldone8x
+
+	leaq	320(%rsi),%rsi
+	xorq	%r9,%r9
+	vmovdqa	%ymm3,0(%rsp)
+	leaq	320(%rdi),%rdi
+	subq	$320,%rdx
+	vmovdqa	%ymm7,32(%rsp)
+	jmp	.Loop_tail8x
+
+.align	32
+.L384_or_more8x:
+	vpxor	0(%rsi),%ymm6,%ymm6
+	vpxor	32(%rsi),%ymm8,%ymm8
+	vpxor	64(%rsi),%ymm1,%ymm1
+	vpxor	96(%rsi),%ymm5,%ymm5
+	vpxor	128(%rsi),%ymm12,%ymm12
+	vpxor	160(%rsi),%ymm13,%ymm13
+	vpxor	192(%rsi),%ymm10,%ymm10
+	vpxor	224(%rsi),%ymm15,%ymm15
+	vpxor	256(%rsi),%ymm14,%ymm14
+	vpxor	288(%rsi),%ymm2,%ymm2
+	vpxor	320(%rsi),%ymm3,%ymm3
+	vpxor	352(%rsi),%ymm7,%ymm7
+	vmovdqu	%ymm6,0(%rdi)
+	vmovdqu	%ymm8,32(%rdi)
+	vmovdqu	%ymm1,64(%rdi)
+	vmovdqu	%ymm5,96(%rdi)
+	vmovdqu	%ymm12,128(%rdi)
+	vmovdqu	%ymm13,160(%rdi)
+	vmovdqu	%ymm10,192(%rdi)
+	vmovdqu	%ymm15,224(%rdi)
+	vmovdqu	%ymm14,256(%rdi)
+	vmovdqu	%ymm2,288(%rdi)
+	vmovdqu	%ymm3,320(%rdi)
+	vmovdqu	%ymm7,352(%rdi)
+	je	.Ldone8x
+
+	leaq	384(%rsi),%rsi
+	xorq	%r9,%r9
+	vmovdqa	%ymm11,0(%rsp)
+	leaq	384(%rdi),%rdi
+	subq	$384,%rdx
+	vmovdqa	%ymm9,32(%rsp)
+	jmp	.Loop_tail8x
+
+.align	32
+.L448_or_more8x:
+	vpxor	0(%rsi),%ymm6,%ymm6
+	vpxor	32(%rsi),%ymm8,%ymm8
+	vpxor	64(%rsi),%ymm1,%ymm1
+	vpxor	96(%rsi),%ymm5,%ymm5
+	vpxor	128(%rsi),%ymm12,%ymm12
+	vpxor	160(%rsi),%ymm13,%ymm13
+	vpxor	192(%rsi),%ymm10,%ymm10
+	vpxor	224(%rsi),%ymm15,%ymm15
+	vpxor	256(%rsi),%ymm14,%ymm14
+	vpxor	288(%rsi),%ymm2,%ymm2
+	vpxor	320(%rsi),%ymm3,%ymm3
+	vpxor	352(%rsi),%ymm7,%ymm7
+	vpxor	384(%rsi),%ymm11,%ymm11
+	vpxor	416(%rsi),%ymm9,%ymm9
+	vmovdqu	%ymm6,0(%rdi)
+	vmovdqu	%ymm8,32(%rdi)
+	vmovdqu	%ymm1,64(%rdi)
+	vmovdqu	%ymm5,96(%rdi)
+	vmovdqu	%ymm12,128(%rdi)
+	vmovdqu	%ymm13,160(%rdi)
+	vmovdqu	%ymm10,192(%rdi)
+	vmovdqu	%ymm15,224(%rdi)
+	vmovdqu	%ymm14,256(%rdi)
+	vmovdqu	%ymm2,288(%rdi)
+	vmovdqu	%ymm3,320(%rdi)
+	vmovdqu	%ymm7,352(%rdi)
+	vmovdqu	%ymm11,384(%rdi)
+	vmovdqu	%ymm9,416(%rdi)
+	je	.Ldone8x
+
+	leaq	448(%rsi),%rsi
+	xorq	%r9,%r9
+	vmovdqa	%ymm0,0(%rsp)
+	leaq	448(%rdi),%rdi
+	subq	$448,%rdx
+	vmovdqa	%ymm4,32(%rsp)
+
+.Loop_tail8x:
+	movzbl	(%rsi,%r9,1),%eax
+	movzbl	(%rsp,%r9,1),%ecx
+	leaq	1(%r9),%r9
+	xorl	%ecx,%eax
+	movb	%al,-1(%rdi,%r9,1)
+	decq	%rdx
+	jnz	.Loop_tail8x
+
+.Ldone8x:
+	vzeroall
+	leaq	-8(%r10),%rsp
+
+.L8x_epilogue:
+	ret
+ENDPROC(chacha20_avx2)
+#endif /* CONFIG_AS_AVX2 */
+
+#ifdef CONFIG_AS_AVX512
+.align	32
+ENTRY(chacha20_avx512)	/* XOR %rdx bytes read from (%rsi) with ChaCha20 keystream and store them at (%rdi) */
+.Lchacha20_avx512:	/* state inputs: 32 bytes at (%rcx) and 16 bytes at (%r8); presumably key and counter/nonce - confirm against the C glue */
+	cmpq	$0,%rdx
+	je	.Lavx512_epilogue	/* zero length: return before the stack is touched */
+	leaq	8(%rsp),%r10	/* entry %rsp + 8, used by the epilogues to undo the alignment below */
+
+	cmpq	$512,%rdx
+	ja	.Lchacha20_16x	/* more than 512 bytes: take the 16-blocks-per-iteration path */
+	/* 4-block path: each zmm holds one 16-byte state row, one block per 128-bit lane */
+	subq	$64+8,%rsp
+	andq	$-64,%rsp	/* 64-byte aligned scratch area, used only by the byte-wise tail */
+	vbroadcasti32x4	.Lsigma(%rip),%zmm0	/* row 0: .Lsigma replicated into all four lanes */
+	vbroadcasti32x4	(%rcx),%zmm1	/* row 1 */
+	vbroadcasti32x4	16(%rcx),%zmm2	/* row 2 */
+	vbroadcasti32x4	(%r8),%zmm3	/* row 3 */
+
+	vmovdqa32	%zmm0,%zmm16	/* zmm16-zmm19 keep the input state for the addition after the rounds */
+	vmovdqa32	%zmm1,%zmm17
+	vmovdqa32	%zmm2,%zmm18
+	vpaddd	.Lzeroz(%rip),%zmm3,%zmm3	/* per-lane offset for row 3; presumably counters +0..+3 (.Lzeroz is defined outside this excerpt) */
+	vmovdqa32	.Lfourz(%rip),%zmm20	/* added to row 3 on every outer iteration; presumably +4 per lane */
+	movq	$10,%r8	/* (%r8) has been loaded, so %r8 is reused as the round-loop counter */
+	vmovdqa32	%zmm3,%zmm19
+	jmp	.Loop_avx512
+
+.align	16
+.Loop_outer_avx512:	/* next group of four blocks: reload rows 0-2, advance row 3 by zmm20 */
+	vmovdqa32	%zmm16,%zmm0
+	vmovdqa32	%zmm17,%zmm1
+	vmovdqa32	%zmm18,%zmm2
+	vpaddd	%zmm20,%zmm19,%zmm3
+	movq	$10,%r8
+	vmovdqa32	%zmm3,%zmm19
+	jmp	.Loop_avx512
+
+.align	32
+.Loop_avx512:	/* one iteration = two rounds; each is add/xor/rotate by 16, 12, 8, 7 */
+	vpaddd	%zmm1,%zmm0,%zmm0
+	vpxord	%zmm0,%zmm3,%zmm3
+	vprold	$16,%zmm3,%zmm3
+	vpaddd	%zmm3,%zmm2,%zmm2
+	vpxord	%zmm2,%zmm1,%zmm1
+	vprold	$12,%zmm1,%zmm1
+	vpaddd	%zmm1,%zmm0,%zmm0
+	vpxord	%zmm0,%zmm3,%zmm3
+	vprold	$8,%zmm3,%zmm3
+	vpaddd	%zmm3,%zmm2,%zmm2
+	vpxord	%zmm2,%zmm1,%zmm1
+	vprold	$7,%zmm1,%zmm1
+	vpshufd	$78,%zmm2,%zmm2	/* rotate the dwords of rows 2, 1, 3 within each lane by 2, 1, 3 positions */
+	vpshufd	$57,%zmm1,%zmm1
+	vpshufd	$147,%zmm3,%zmm3
+	vpaddd	%zmm1,%zmm0,%zmm0	/* second round, on the rotated rows */
+	vpxord	%zmm0,%zmm3,%zmm3
+	vprold	$16,%zmm3,%zmm3
+	vpaddd	%zmm3,%zmm2,%zmm2
+	vpxord	%zmm2,%zmm1,%zmm1
+	vprold	$12,%zmm1,%zmm1
+	vpaddd	%zmm1,%zmm0,%zmm0
+	vpxord	%zmm0,%zmm3,%zmm3
+	vprold	$8,%zmm3,%zmm3
+	vpaddd	%zmm3,%zmm2,%zmm2
+	vpxord	%zmm2,%zmm1,%zmm1
+	vprold	$7,%zmm1,%zmm1
+	vpshufd	$78,%zmm2,%zmm2	/* rotate the rows back to their original dword order */
+	vpshufd	$147,%zmm1,%zmm1
+	vpshufd	$57,%zmm3,%zmm3
+	decq	%r8
+	jnz	.Loop_avx512	/* falls through with %r8 == 0, which the byte-wise tail relies on */
+	vpaddd	%zmm16,%zmm0,%zmm0	/* add the saved input state to the round output */
+	vpaddd	%zmm17,%zmm1,%zmm1
+	vpaddd	%zmm18,%zmm2,%zmm2
+	vpaddd	%zmm19,%zmm3,%zmm3
+	/* block 0 of this group is lane 0 (xmm0-xmm3) */
+	subq	$64,%rdx
+	jb	.Ltail64_avx512	/* fewer than 64 bytes left */
+
+	vpxor	0(%rsi),%xmm0,%xmm4
+	vpxor	16(%rsi),%xmm1,%xmm5
+	vpxor	32(%rsi),%xmm2,%xmm6
+	vpxor	48(%rsi),%xmm3,%xmm7
+	leaq	64(%rsi),%rsi
+
+	vmovdqu	%xmm4,0(%rdi)
+	vmovdqu	%xmm5,16(%rdi)
+	vmovdqu	%xmm6,32(%rdi)
+	vmovdqu	%xmm7,48(%rdi)
+	leaq	64(%rdi),%rdi
+
+	jz	.Ldone_avx512	/* ZF is still from the subq above; the vector ops and leaq do not write flags */
+	/* block 1 of this group is lane 1 */
+	vextracti32x4	$1,%zmm0,%xmm4
+	vextracti32x4	$1,%zmm1,%xmm5
+	vextracti32x4	$1,%zmm2,%xmm6
+	vextracti32x4	$1,%zmm3,%xmm7
+
+	subq	$64,%rdx
+	jb	.Ltail_avx512
+
+	vpxor	0(%rsi),%xmm4,%xmm4
+	vpxor	16(%rsi),%xmm5,%xmm5
+	vpxor	32(%rsi),%xmm6,%xmm6
+	vpxor	48(%rsi),%xmm7,%xmm7
+	leaq	64(%rsi),%rsi
+
+	vmovdqu	%xmm4,0(%rdi)
+	vmovdqu	%xmm5,16(%rdi)
+	vmovdqu	%xmm6,32(%rdi)
+	vmovdqu	%xmm7,48(%rdi)
+	leaq	64(%rdi),%rdi
+
+	jz	.Ldone_avx512
+	/* block 2 of this group is lane 2 */
+	vextracti32x4	$2,%zmm0,%xmm4
+	vextracti32x4	$2,%zmm1,%xmm5
+	vextracti32x4	$2,%zmm2,%xmm6
+	vextracti32x4	$2,%zmm3,%xmm7
+
+	subq	$64,%rdx
+	jb	.Ltail_avx512
+
+	vpxor	0(%rsi),%xmm4,%xmm4
+	vpxor	16(%rsi),%xmm5,%xmm5
+	vpxor	32(%rsi),%xmm6,%xmm6
+	vpxor	48(%rsi),%xmm7,%xmm7
+	leaq	64(%rsi),%rsi
+
+	vmovdqu	%xmm4,0(%rdi)
+	vmovdqu	%xmm5,16(%rdi)
+	vmovdqu	%xmm6,32(%rdi)
+	vmovdqu	%xmm7,48(%rdi)
+	leaq	64(%rdi),%rdi
+
+	jz	.Ldone_avx512
+	/* block 3 of this group is lane 3 */
+	vextracti32x4	$3,%zmm0,%xmm4
+	vextracti32x4	$3,%zmm1,%xmm5
+	vextracti32x4	$3,%zmm2,%xmm6
+	vextracti32x4	$3,%zmm3,%xmm7
+
+	subq	$64,%rdx
+	jb	.Ltail_avx512
+
+	vpxor	0(%rsi),%xmm4,%xmm4
+	vpxor	16(%rsi),%xmm5,%xmm5
+	vpxor	32(%rsi),%xmm6,%xmm6
+	vpxor	48(%rsi),%xmm7,%xmm7
+	leaq	64(%rsi),%rsi
+
+	vmovdqu	%xmm4,0(%rdi)
+	vmovdqu	%xmm5,16(%rdi)
+	vmovdqu	%xmm6,32(%rdi)
+	vmovdqu	%xmm7,48(%rdi)
+	leaq	64(%rdi),%rdi
+
+	jnz	.Loop_outer_avx512	/* bytes remain after all four lanes: generate the next group */
+
+	jmp	.Ldone_avx512
+
+.align	16
+.Ltail64_avx512:	/* partial block in lane 0: spill its keystream to the stack scratch */
+	vmovdqa	%xmm0,0(%rsp)
+	vmovdqa	%xmm1,16(%rsp)
+	vmovdqa	%xmm2,32(%rsp)
+	vmovdqa	%xmm3,48(%rsp)
+	addq	$64,%rdx	/* undo the subq that went negative: %rdx is the remaining byte count again */
+	jmp	.Loop_tail_avx512
+
+.align	16
+.Ltail_avx512:	/* partial block in lanes 1-3: the extracted keystream is in xmm4-xmm7 */
+	vmovdqa	%xmm4,0(%rsp)
+	vmovdqa	%xmm5,16(%rsp)
+	vmovdqa	%xmm6,32(%rsp)
+	vmovdqa	%xmm7,48(%rsp)
+	addq	$64,%rdx
+
+.Loop_tail_avx512:	/* byte-wise XOR; %r8 is the index and starts at 0 (left there by the round loop) */
+	movzbl	(%rsi,%r8,1),%eax
+	movzbl	(%rsp,%r8,1),%ecx
+	leaq	1(%r8),%r8
+	xorl	%ecx,%eax
+	movb	%al,-1(%rdi,%r8,1)
+	decq	%rdx
+	jnz	.Loop_tail_avx512
+
+	vmovdqa32	%zmm16,0(%rsp)	/* overwrite the spilled keystream with zmm16 (the .Lsigma copy) */
+
+.Ldone_avx512:
+	vzeroall	/* NOTE(review): vzeroall covers zmm0-zmm15 only; the state copies in zmm16-zmm19 are left as is - confirm that is acceptable */
+	leaq	-8(%r10),%rsp	/* restore the entry stack pointer */
+
+.Lavx512_epilogue:
+	ret
+
+.align	32
+.Lchacha20_16x:	/* 16-block path: each zmm holds one state word for 16 blocks, one block per dword */
+	leaq	8(%rsp),%r10	/* same value as computed before the branch here; %rsp has not changed since */
+
+	subq	$64+8,%rsp
+	andq	$-64,%rsp	/* 64-byte aligned scratch area for the byte-wise tail */
+	vzeroupper
+
+	leaq	.Lsigma(%rip),%r9	/* %r9 stays pointed at .Lsigma so the outer loop can re-broadcast row 0 */
+	vbroadcasti32x4	(%r9),%zmm3
+	vbroadcasti32x4	(%rcx),%zmm7
+	vbroadcasti32x4	16(%rcx),%zmm11
+	vbroadcasti32x4	(%r8),%zmm15
+
+	vpshufd	$0x00,%zmm3,%zmm0	/* splat dwords 0-3 of row 0 into zmm0-zmm3 */
+	vpshufd	$0x55,%zmm3,%zmm1
+	vpshufd	$0xaa,%zmm3,%zmm2
+	vpshufd	$0xff,%zmm3,%zmm3
+	vmovdqa64	%zmm0,%zmm16	/* zmm16-zmm31 keep the 16 input words for the addition after the rounds */
+	vmovdqa64	%zmm1,%zmm17
+	vmovdqa64	%zmm2,%zmm18
+	vmovdqa64	%zmm3,%zmm19
+
+	vpshufd	$0x00,%zmm7,%zmm4	/* splat dwords of row 1 into zmm4-zmm7 */
+	vpshufd	$0x55,%zmm7,%zmm5
+	vpshufd	$0xaa,%zmm7,%zmm6
+	vpshufd	$0xff,%zmm7,%zmm7
+	vmovdqa64	%zmm4,%zmm20
+	vmovdqa64	%zmm5,%zmm21
+	vmovdqa64	%zmm6,%zmm22
+	vmovdqa64	%zmm7,%zmm23
+
+	vpshufd	$0x00,%zmm11,%zmm8	/* splat dwords of row 2 into zmm8-zmm11 */
+	vpshufd	$0x55,%zmm11,%zmm9
+	vpshufd	$0xaa,%zmm11,%zmm10
+	vpshufd	$0xff,%zmm11,%zmm11
+	vmovdqa64	%zmm8,%zmm24
+	vmovdqa64	%zmm9,%zmm25
+	vmovdqa64	%zmm10,%zmm26
+	vmovdqa64	%zmm11,%zmm27
+
+	vpshufd	$0x00,%zmm15,%zmm12	/* splat dwords of row 3 into zmm12-zmm15 */
+	vpshufd	$0x55,%zmm15,%zmm13
+	vpshufd	$0xaa,%zmm15,%zmm14
+	vpshufd	$0xff,%zmm15,%zmm15
+	vpaddd	.Lincz(%rip),%zmm12,%zmm12	/* per-block offset for word 12; presumably +0..+15 (.Lincz is defined outside this excerpt) */
+	vmovdqa64	%zmm12,%zmm28
+	vmovdqa64	%zmm13,%zmm29
+	vmovdqa64	%zmm14,%zmm30
+	vmovdqa64	%zmm15,%zmm31
+
+	movl	$10,%eax	/* 10 iterations of the two-round loop */
+	jmp	.Loop16x
+
+.align	32
+.Loop_outer16x:	/* set up the next 16 blocks */
+	vpbroadcastd	0(%r9),%zmm0	/* rebuild words 0-3 from .Lsigma */
+	vpbroadcastd	4(%r9),%zmm1
+	vpbroadcastd	8(%r9),%zmm2
+	vpbroadcastd	12(%r9),%zmm3
+	vpaddd	.Lsixteen(%rip),%zmm28,%zmm28	/* advance word 12; presumably +16 per block (.Lsixteen is defined outside this excerpt) */
+	vmovdqa64	%zmm20,%zmm4
+	vmovdqa64	%zmm21,%zmm5
+	vmovdqa64	%zmm22,%zmm6
+	vmovdqa64	%zmm23,%zmm7
+	vmovdqa64	%zmm24,%zmm8
+	vmovdqa64	%zmm25,%zmm9
+	vmovdqa64	%zmm26,%zmm10
+	vmovdqa64	%zmm27,%zmm11
+	vmovdqa64	%zmm28,%zmm12
+	vmovdqa64	%zmm29,%zmm13
+	vmovdqa64	%zmm30,%zmm14
+	vmovdqa64	%zmm31,%zmm15
+	/* zmm16-zmm19 were overwritten by the transposition after the rounds, so save words 0-3 again */
+	vmovdqa64	%zmm0,%zmm16
+	vmovdqa64	%zmm1,%zmm17
+	vmovdqa64	%zmm2,%zmm18
+	vmovdqa64	%zmm3,%zmm19
+
+	movl	$10,%eax
+	jmp	.Loop16x
+
+.align	32
+.Loop16x:	/* first round: word groups (0,4,8,12) (1,5,9,13) (2,6,10,14) (3,7,11,15), four at a time */
+	vpaddd	%zmm4,%zmm0,%zmm0
+	vpaddd	%zmm5,%zmm1,%zmm1
+	vpaddd	%zmm6,%zmm2,%zmm2
+	vpaddd	%zmm7,%zmm3,%zmm3
+	vpxord	%zmm0,%zmm12,%zmm12
+	vpxord	%zmm1,%zmm13,%zmm13
+	vpxord	%zmm2,%zmm14,%zmm14
+	vpxord	%zmm3,%zmm15,%zmm15
+	vprold	$16,%zmm12,%zmm12
+	vprold	$16,%zmm13,%zmm13
+	vprold	$16,%zmm14,%zmm14
+	vprold	$16,%zmm15,%zmm15
+	vpaddd	%zmm12,%zmm8,%zmm8
+	vpaddd	%zmm13,%zmm9,%zmm9
+	vpaddd	%zmm14,%zmm10,%zmm10
+	vpaddd	%zmm15,%zmm11,%zmm11
+	vpxord	%zmm8,%zmm4,%zmm4
+	vpxord	%zmm9,%zmm5,%zmm5
+	vpxord	%zmm10,%zmm6,%zmm6
+	vpxord	%zmm11,%zmm7,%zmm7
+	vprold	$12,%zmm4,%zmm4
+	vprold	$12,%zmm5,%zmm5
+	vprold	$12,%zmm6,%zmm6
+	vprold	$12,%zmm7,%zmm7
+	vpaddd	%zmm4,%zmm0,%zmm0
+	vpaddd	%zmm5,%zmm1,%zmm1
+	vpaddd	%zmm6,%zmm2,%zmm2
+	vpaddd	%zmm7,%zmm3,%zmm3
+	vpxord	%zmm0,%zmm12,%zmm12
+	vpxord	%zmm1,%zmm13,%zmm13
+	vpxord	%zmm2,%zmm14,%zmm14
+	vpxord	%zmm3,%zmm15,%zmm15
+	vprold	$8,%zmm12,%zmm12
+	vprold	$8,%zmm13,%zmm13
+	vprold	$8,%zmm14,%zmm14
+	vprold	$8,%zmm15,%zmm15
+	vpaddd	%zmm12,%zmm8,%zmm8
+	vpaddd	%zmm13,%zmm9,%zmm9
+	vpaddd	%zmm14,%zmm10,%zmm10
+	vpaddd	%zmm15,%zmm11,%zmm11
+	vpxord	%zmm8,%zmm4,%zmm4
+	vpxord	%zmm9,%zmm5,%zmm5
+	vpxord	%zmm10,%zmm6,%zmm6
+	vpxord	%zmm11,%zmm7,%zmm7
+	vprold	$7,%zmm4,%zmm4
+	vprold	$7,%zmm5,%zmm5
+	vprold	$7,%zmm6,%zmm6
+	vprold	$7,%zmm7,%zmm7
+	vpaddd	%zmm5,%zmm0,%zmm0	/* second round: word groups (0,5,10,15) (1,6,11,12) (2,7,8,13) (3,4,9,14) */
+	vpaddd	%zmm6,%zmm1,%zmm1
+	vpaddd	%zmm7,%zmm2,%zmm2
+	vpaddd	%zmm4,%zmm3,%zmm3
+	vpxord	%zmm0,%zmm15,%zmm15
+	vpxord	%zmm1,%zmm12,%zmm12
+	vpxord	%zmm2,%zmm13,%zmm13
+	vpxord	%zmm3,%zmm14,%zmm14
+	vprold	$16,%zmm15,%zmm15
+	vprold	$16,%zmm12,%zmm12
+	vprold	$16,%zmm13,%zmm13
+	vprold	$16,%zmm14,%zmm14
+	vpaddd	%zmm15,%zmm10,%zmm10
+	vpaddd	%zmm12,%zmm11,%zmm11
+	vpaddd	%zmm13,%zmm8,%zmm8
+	vpaddd	%zmm14,%zmm9,%zmm9
+	vpxord	%zmm10,%zmm5,%zmm5
+	vpxord	%zmm11,%zmm6,%zmm6
+	vpxord	%zmm8,%zmm7,%zmm7
+	vpxord	%zmm9,%zmm4,%zmm4
+	vprold	$12,%zmm5,%zmm5
+	vprold	$12,%zmm6,%zmm6
+	vprold	$12,%zmm7,%zmm7
+	vprold	$12,%zmm4,%zmm4
+	vpaddd	%zmm5,%zmm0,%zmm0
+	vpaddd	%zmm6,%zmm1,%zmm1
+	vpaddd	%zmm7,%zmm2,%zmm2
+	vpaddd	%zmm4,%zmm3,%zmm3
+	vpxord	%zmm0,%zmm15,%zmm15
+	vpxord	%zmm1,%zmm12,%zmm12
+	vpxord	%zmm2,%zmm13,%zmm13
+	vpxord	%zmm3,%zmm14,%zmm14
+	vprold	$8,%zmm15,%zmm15
+	vprold	$8,%zmm12,%zmm12
+	vprold	$8,%zmm13,%zmm13
+	vprold	$8,%zmm14,%zmm14
+	vpaddd	%zmm15,%zmm10,%zmm10
+	vpaddd	%zmm12,%zmm11,%zmm11
+	vpaddd	%zmm13,%zmm8,%zmm8
+	vpaddd	%zmm14,%zmm9,%zmm9
+	vpxord	%zmm10,%zmm5,%zmm5
+	vpxord	%zmm11,%zmm6,%zmm6
+	vpxord	%zmm8,%zmm7,%zmm7
+	vpxord	%zmm9,%zmm4,%zmm4
+	vprold	$7,%zmm5,%zmm5
+	vprold	$7,%zmm6,%zmm6
+	vprold	$7,%zmm7,%zmm7
+	vprold	$7,%zmm4,%zmm4
+	decl	%eax
+	jnz	.Loop16x
+	/* add the saved input words, then transpose so each zmm holds 64 contiguous keystream bytes */
+	vpaddd	%zmm16,%zmm0,%zmm0
+	vpaddd	%zmm17,%zmm1,%zmm1
+	vpaddd	%zmm18,%zmm2,%zmm2
+	vpaddd	%zmm19,%zmm3,%zmm3
+
+	vpunpckldq	%zmm1,%zmm0,%zmm18	/* dword/qword unpacks: 4x4 transpose of words 0-3 within each 128-bit lane (zmm18/zmm19 become temporaries) */
+	vpunpckldq	%zmm3,%zmm2,%zmm19
+	vpunpckhdq	%zmm1,%zmm0,%zmm0
+	vpunpckhdq	%zmm3,%zmm2,%zmm2
+	vpunpcklqdq	%zmm19,%zmm18,%zmm1
+	vpunpckhqdq	%zmm19,%zmm18,%zmm18
+	vpunpcklqdq	%zmm2,%zmm0,%zmm3
+	vpunpckhqdq	%zmm2,%zmm0,%zmm0
+	vpaddd	%zmm20,%zmm4,%zmm4
+	vpaddd	%zmm21,%zmm5,%zmm5
+	vpaddd	%zmm22,%zmm6,%zmm6
+	vpaddd	%zmm23,%zmm7,%zmm7
+
+	vpunpckldq	%zmm5,%zmm4,%zmm2	/* same 4x4 transpose for words 4-7 */
+	vpunpckldq	%zmm7,%zmm6,%zmm19
+	vpunpckhdq	%zmm5,%zmm4,%zmm4
+	vpunpckhdq	%zmm7,%zmm6,%zmm6
+	vpunpcklqdq	%zmm19,%zmm2,%zmm5
+	vpunpckhqdq	%zmm19,%zmm2,%zmm2
+	vpunpcklqdq	%zmm6,%zmm4,%zmm7
+	vpunpckhqdq	%zmm6,%zmm4,%zmm4
+	vshufi32x4	$0x44,%zmm5,%zmm1,%zmm19	/* $0x44 / $0xee: combine the low / high pairs of 128-bit lanes of the two sources */
+	vshufi32x4	$0xee,%zmm5,%zmm1,%zmm5
+	vshufi32x4	$0x44,%zmm2,%zmm18,%zmm1
+	vshufi32x4	$0xee,%zmm2,%zmm18,%zmm2
+	vshufi32x4	$0x44,%zmm7,%zmm3,%zmm18
+	vshufi32x4	$0xee,%zmm7,%zmm3,%zmm7
+	vshufi32x4	$0x44,%zmm4,%zmm0,%zmm3
+	vshufi32x4	$0xee,%zmm4,%zmm0,%zmm4
+	vpaddd	%zmm24,%zmm8,%zmm8
+	vpaddd	%zmm25,%zmm9,%zmm9
+	vpaddd	%zmm26,%zmm10,%zmm10
+	vpaddd	%zmm27,%zmm11,%zmm11
+
+	vpunpckldq	%zmm9,%zmm8,%zmm6	/* 4x4 transpose for words 8-11 */
+	vpunpckldq	%zmm11,%zmm10,%zmm0
+	vpunpckhdq	%zmm9,%zmm8,%zmm8
+	vpunpckhdq	%zmm11,%zmm10,%zmm10
+	vpunpcklqdq	%zmm0,%zmm6,%zmm9
+	vpunpckhqdq	%zmm0,%zmm6,%zmm6
+	vpunpcklqdq	%zmm10,%zmm8,%zmm11
+	vpunpckhqdq	%zmm10,%zmm8,%zmm8
+	vpaddd	%zmm28,%zmm12,%zmm12
+	vpaddd	%zmm29,%zmm13,%zmm13
+	vpaddd	%zmm30,%zmm14,%zmm14
+	vpaddd	%zmm31,%zmm15,%zmm15
+
+	vpunpckldq	%zmm13,%zmm12,%zmm10	/* 4x4 transpose for words 12-15 */
+	vpunpckldq	%zmm15,%zmm14,%zmm0
+	vpunpckhdq	%zmm13,%zmm12,%zmm12
+	vpunpckhdq	%zmm15,%zmm14,%zmm14
+	vpunpcklqdq	%zmm0,%zmm10,%zmm13
+	vpunpckhqdq	%zmm0,%zmm10,%zmm10
+	vpunpcklqdq	%zmm14,%zmm12,%zmm15
+	vpunpckhqdq	%zmm14,%zmm12,%zmm12
+	vshufi32x4	$0x44,%zmm13,%zmm9,%zmm0
+	vshufi32x4	$0xee,%zmm13,%zmm9,%zmm13
+	vshufi32x4	$0x44,%zmm10,%zmm6,%zmm9
+	vshufi32x4	$0xee,%zmm10,%zmm6,%zmm10
+	vshufi32x4	$0x44,%zmm15,%zmm11,%zmm6
+	vshufi32x4	$0xee,%zmm15,%zmm11,%zmm15
+	vshufi32x4	$0x44,%zmm12,%zmm8,%zmm11
+	vshufi32x4	$0xee,%zmm12,%zmm8,%zmm12
+	vshufi32x4	$0x88,%zmm0,%zmm19,%zmm16	/* $0x88 / $0xdd: pick the even / odd 128-bit lanes, finishing the transpose */
+	vshufi32x4	$0xdd,%zmm0,%zmm19,%zmm19
+	vshufi32x4	$0x88,%zmm13,%zmm5,%zmm0
+	vshufi32x4	$0xdd,%zmm13,%zmm5,%zmm13
+	vshufi32x4	$0x88,%zmm9,%zmm1,%zmm17
+	vshufi32x4	$0xdd,%zmm9,%zmm1,%zmm1
+	vshufi32x4	$0x88,%zmm10,%zmm2,%zmm9
+	vshufi32x4	$0xdd,%zmm10,%zmm2,%zmm10
+	vshufi32x4	$0x88,%zmm6,%zmm18,%zmm14
+	vshufi32x4	$0xdd,%zmm6,%zmm18,%zmm18
+	vshufi32x4	$0x88,%zmm15,%zmm7,%zmm6
+	vshufi32x4	$0xdd,%zmm15,%zmm7,%zmm15
+	vshufi32x4	$0x88,%zmm11,%zmm3,%zmm8
+	vshufi32x4	$0xdd,%zmm11,%zmm3,%zmm3
+	vshufi32x4	$0x88,%zmm12,%zmm4,%zmm11
+	vshufi32x4	$0xdd,%zmm12,%zmm4,%zmm12
+	cmpq	$1024,%rdx
+	jb	.Ltail16x	/* fewer than 16 full blocks left */
+	/* full 1024 bytes; keystream register order is zmm16,17,14,8,19,1,18,3,0,9,6,11,13,10,15,12 */
+	vpxord	0(%rsi),%zmm16,%zmm16
+	vpxord	64(%rsi),%zmm17,%zmm17
+	vpxord	128(%rsi),%zmm14,%zmm14
+	vpxord	192(%rsi),%zmm8,%zmm8
+	vmovdqu32	%zmm16,0(%rdi)
+	vmovdqu32	%zmm17,64(%rdi)
+	vmovdqu32	%zmm14,128(%rdi)
+	vmovdqu32	%zmm8,192(%rdi)
+
+	vpxord	256(%rsi),%zmm19,%zmm19
+	vpxord	320(%rsi),%zmm1,%zmm1
+	vpxord	384(%rsi),%zmm18,%zmm18
+	vpxord	448(%rsi),%zmm3,%zmm3
+	vmovdqu32	%zmm19,256(%rdi)
+	vmovdqu32	%zmm1,320(%rdi)
+	vmovdqu32	%zmm18,384(%rdi)
+	vmovdqu32	%zmm3,448(%rdi)
+
+	vpxord	512(%rsi),%zmm0,%zmm0
+	vpxord	576(%rsi),%zmm9,%zmm9
+	vpxord	640(%rsi),%zmm6,%zmm6
+	vpxord	704(%rsi),%zmm11,%zmm11
+	vmovdqu32	%zmm0,512(%rdi)
+	vmovdqu32	%zmm9,576(%rdi)
+	vmovdqu32	%zmm6,640(%rdi)
+	vmovdqu32	%zmm11,704(%rdi)
+
+	vpxord	768(%rsi),%zmm13,%zmm13
+	vpxord	832(%rsi),%zmm10,%zmm10
+	vpxord	896(%rsi),%zmm15,%zmm15
+	vpxord	960(%rsi),%zmm12,%zmm12
+	leaq	1024(%rsi),%rsi
+	vmovdqu32	%zmm13,768(%rdi)
+	vmovdqu32	%zmm10,832(%rdi)
+	vmovdqu32	%zmm15,896(%rdi)
+	vmovdqu32	%zmm12,960(%rdi)
+	leaq	1024(%rdi),%rdi
+
+	subq	$1024,%rdx
+	jnz	.Loop_outer16x
+
+	jmp	.Ldone16x
+
+.align	32
+.Ltail16x:	/* 1..1023 bytes left: whole blocks first, then a byte-wise remainder */
+	xorq	%r9,%r9	/* %r9 becomes the byte index for .Loop_tail16x; the .Lsigma pointer is no longer needed */
+	subq	%rsi,%rdi	/* %rdi = out - in, so (%rdi,%rsi,1) is the output address while only %rsi advances */
+	cmpq	$64,%rdx
+	jb	.Less_than_64_16x	/* zmm16 always holds the next unused keystream block at this point */
+	vpxord	(%rsi),%zmm16,%zmm16
+	vmovdqu32	%zmm16,(%rdi,%rsi,1)
+	je	.Ldone16x	/* flags are still from the cmpq: length was exactly the threshold */
+	vmovdqa32	%zmm17,%zmm16	/* stage the next block in zmm16 in case the next step is the partial one */
+	leaq	64(%rsi),%rsi
+
+	cmpq	$128,%rdx	/* %rdx is not decremented; each step compares against a cumulative threshold */
+	jb	.Less_than_64_16x
+	vpxord	(%rsi),%zmm17,%zmm17
+	vmovdqu32	%zmm17,(%rdi,%rsi,1)
+	je	.Ldone16x
+	vmovdqa32	%zmm14,%zmm16
+	leaq	64(%rsi),%rsi
+
+	cmpq	$192,%rdx
+	jb	.Less_than_64_16x
+	vpxord	(%rsi),%zmm14,%zmm14
+	vmovdqu32	%zmm14,(%rdi,%rsi,1)
+	je	.Ldone16x
+	vmovdqa32	%zmm8,%zmm16
+	leaq	64(%rsi),%rsi
+
+	cmpq	$256,%rdx
+	jb	.Less_than_64_16x
+	vpxord	(%rsi),%zmm8,%zmm8
+	vmovdqu32	%zmm8,(%rdi,%rsi,1)
+	je	.Ldone16x
+	vmovdqa32	%zmm19,%zmm16
+	leaq	64(%rsi),%rsi
+
+	cmpq	$320,%rdx
+	jb	.Less_than_64_16x
+	vpxord	(%rsi),%zmm19,%zmm19
+	vmovdqu32	%zmm19,(%rdi,%rsi,1)
+	je	.Ldone16x
+	vmovdqa32	%zmm1,%zmm16
+	leaq	64(%rsi),%rsi
+
+	cmpq	$384,%rdx
+	jb	.Less_than_64_16x
+	vpxord	(%rsi),%zmm1,%zmm1
+	vmovdqu32	%zmm1,(%rdi,%rsi,1)
+	je	.Ldone16x
+	vmovdqa32	%zmm18,%zmm16
+	leaq	64(%rsi),%rsi
+
+	cmpq	$448,%rdx
+	jb	.Less_than_64_16x
+	vpxord	(%rsi),%zmm18,%zmm18
+	vmovdqu32	%zmm18,(%rdi,%rsi,1)
+	je	.Ldone16x
+	vmovdqa32	%zmm3,%zmm16
+	leaq	64(%rsi),%rsi
+
+	cmpq	$512,%rdx
+	jb	.Less_than_64_16x
+	vpxord	(%rsi),%zmm3,%zmm3
+	vmovdqu32	%zmm3,(%rdi,%rsi,1)
+	je	.Ldone16x
+	vmovdqa32	%zmm0,%zmm16
+	leaq	64(%rsi),%rsi
+
+	cmpq	$576,%rdx
+	jb	.Less_than_64_16x
+	vpxord	(%rsi),%zmm0,%zmm0
+	vmovdqu32	%zmm0,(%rdi,%rsi,1)
+	je	.Ldone16x
+	vmovdqa32	%zmm9,%zmm16
+	leaq	64(%rsi),%rsi
+
+	cmpq	$640,%rdx
+	jb	.Less_than_64_16x
+	vpxord	(%rsi),%zmm9,%zmm9
+	vmovdqu32	%zmm9,(%rdi,%rsi,1)
+	je	.Ldone16x
+	vmovdqa32	%zmm6,%zmm16
+	leaq	64(%rsi),%rsi
+
+	cmpq	$704,%rdx
+	jb	.Less_than_64_16x
+	vpxord	(%rsi),%zmm6,%zmm6
+	vmovdqu32	%zmm6,(%rdi,%rsi,1)
+	je	.Ldone16x
+	vmovdqa32	%zmm11,%zmm16
+	leaq	64(%rsi),%rsi
+
+	cmpq	$768,%rdx
+	jb	.Less_than_64_16x
+	vpxord	(%rsi),%zmm11,%zmm11
+	vmovdqu32	%zmm11,(%rdi,%rsi,1)
+	je	.Ldone16x
+	vmovdqa32	%zmm13,%zmm16
+	leaq	64(%rsi),%rsi
+
+	cmpq	$832,%rdx
+	jb	.Less_than_64_16x
+	vpxord	(%rsi),%zmm13,%zmm13
+	vmovdqu32	%zmm13,(%rdi,%rsi,1)
+	je	.Ldone16x
+	vmovdqa32	%zmm10,%zmm16
+	leaq	64(%rsi),%rsi
+
+	cmpq	$896,%rdx
+	jb	.Less_than_64_16x
+	vpxord	(%rsi),%zmm10,%zmm10
+	vmovdqu32	%zmm10,(%rdi,%rsi,1)
+	je	.Ldone16x
+	vmovdqa32	%zmm15,%zmm16
+	leaq	64(%rsi),%rsi
+
+	cmpq	$960,%rdx
+	jb	.Less_than_64_16x
+	vpxord	(%rsi),%zmm15,%zmm15
+	vmovdqu32	%zmm15,(%rdi,%rsi,1)
+	je	.Ldone16x
+	vmovdqa32	%zmm12,%zmm16
+	leaq	64(%rsi),%rsi
+
+.Less_than_64_16x:	/* 1..63 bytes left; their keystream block is in zmm16 */
+	vmovdqa32	%zmm16,0(%rsp)	/* spill it to the aligned stack scratch for byte access */
+	leaq	(%rdi,%rsi,1),%rdi	/* turn %rdi back into an absolute output pointer */
+	andq	$63,%rdx	/* remaining byte count */
+
+.Loop_tail16x:	/* byte-wise XOR of the final partial block, indexed by %r9 */
+	movzbl	(%rsi,%r9,1),%eax
+	movzbl	(%rsp,%r9,1),%ecx
+	leaq	1(%r9),%r9
+	xorl	%ecx,%eax
+	movb	%al,-1(%rdi,%r9,1)
+	decq	%rdx
+	jnz	.Loop_tail16x
+
+	vpxord	%zmm16,%zmm16,%zmm16
+	vmovdqa32	%zmm16,0(%rsp)	/* zero the keystream that was spilled to the stack */
+
+.Ldone16x:
+	vzeroall	/* NOTE(review): vzeroall covers zmm0-zmm15 only; the state copies in zmm17-zmm31 are left as is - confirm that is acceptable */
+	leaq	-8(%r10),%rsp	/* restore the entry stack pointer */
+
+.L16x_epilogue:
+	ret
+ENDPROC(chacha20_avx512)
+
+.align	32
+ENTRY(chacha20_avx512vl)
+	cmpq	$0,%rdx
+	je	.Lavx512vl_epilogue
+
+	leaq	8(%rsp),%r10
+
+	cmpq	$128,%rdx
+	ja	.Lchacha20_8xvl
+
+	subq	$64+8,%rsp
+	andq	$-64,%rsp
+	vbroadcasti128	.Lsigma(%rip),%ymm0
+	vbroadcasti128	(%rcx),%ymm1
+	vbroadcasti128	16(%rcx),%ymm2
+	vbroadcasti128	(%r8),%ymm3
+
+	vmovdqa32	%ymm0,%ymm16
+	vmovdqa32	%ymm1,%ymm17
+	vmovdqa32	%ymm2,%ymm18
+	vpaddd	.Lzeroz(%rip),%ymm3,%ymm3
+	vmovdqa32	.Ltwoy(%rip),%ymm20
+	movq	$10,%r8
+	vmovdqa32	%ymm3,%ymm19
+	jmp	.Loop_avx512vl
+
+.align	16
+.Loop_outer_avx512vl:
+	vmovdqa32	%ymm18,%ymm2
+	vpaddd	%ymm20,%ymm19,%ymm3
+	movq	$10,%r8
+	vmovdqa32	%ymm3,%ymm19
+	jmp	.Loop_avx512vl
+
+.align	32
+.Loop_avx512vl:
+	vpaddd	%ymm1,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm3,%ymm3
+	vprold	$16,%ymm3,%ymm3
+	vpaddd	%ymm3,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm1,%ymm1
+	vprold	$12,%ymm1,%ymm1
+	vpaddd	%ymm1,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm3,%ymm3
+	vprold	$8,%ymm3,%ymm3
+	vpaddd	%ymm3,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm1,%ymm1
+	vprold	$7,%ymm1,%ymm1
+	vpshufd	$78,%ymm2,%ymm2
+	vpshufd	$57,%ymm1,%ymm1
+	vpshufd	$147,%ymm3,%ymm3
+	vpaddd	%ymm1,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm3,%ymm3
+	vprold	$16,%ymm3,%ymm3
+	vpaddd	%ymm3,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm1,%ymm1
+	vprold	$12,%ymm1,%ymm1
+	vpaddd	%ymm1,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm3,%ymm3
+	vprold	$8,%ymm3,%ymm3
+	vpaddd	%ymm3,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm1,%ymm1
+	vprold	$7,%ymm1,%ymm1
+	vpshufd	$78,%ymm2,%ymm2
+	vpshufd	$147,%ymm1,%ymm1
+	vpshufd	$57,%ymm3,%ymm3
+	decq	%r8
+	jnz	.Loop_avx512vl
+	vpaddd	%ymm16,%ymm0,%ymm0
+	vpaddd	%ymm17,%ymm1,%ymm1
+	vpaddd	%ymm18,%ymm2,%ymm2
+	vpaddd	%ymm19,%ymm3,%ymm3
+
+	subq	$64,%rdx
+	jb	.Ltail64_avx512vl
+
+	vpxor	0(%rsi),%xmm0,%xmm4
+	vpxor	16(%rsi),%xmm1,%xmm5
+	vpxor	32(%rsi),%xmm2,%xmm6
+	vpxor	48(%rsi),%xmm3,%xmm7
+	leaq	64(%rsi),%rsi
+
+	vmovdqu	%xmm4,0(%rdi)
+	vmovdqu	%xmm5,16(%rdi)
+	vmovdqu	%xmm6,32(%rdi)
+	vmovdqu	%xmm7,48(%rdi)
+	leaq	64(%rdi),%rdi
+
+	jz	.Ldone_avx512vl
+
+	vextracti128	$1,%ymm0,%xmm4
+	vextracti128	$1,%ymm1,%xmm5
+	vextracti128	$1,%ymm2,%xmm6
+	vextracti128	$1,%ymm3,%xmm7
+
+	subq	$64,%rdx
+	jb	.Ltail_avx512vl
+
+	vpxor	0(%rsi),%xmm4,%xmm4
+	vpxor	16(%rsi),%xmm5,%xmm5
+	vpxor	32(%rsi),%xmm6,%xmm6
+	vpxor	48(%rsi),%xmm7,%xmm7
+	leaq	64(%rsi),%rsi
+
+	vmovdqu	%xmm4,0(%rdi)
+	vmovdqu	%xmm5,16(%rdi)
+	vmovdqu	%xmm6,32(%rdi)
+	vmovdqu	%xmm7,48(%rdi)
+	leaq	64(%rdi),%rdi
+
+	vmovdqa32	%ymm16,%ymm0
+	vmovdqa32	%ymm17,%ymm1
+	jnz	.Loop_outer_avx512vl
+
+	jmp	.Ldone_avx512vl
+
+.align	16
+.Ltail64_avx512vl:
+	vmovdqa	%xmm0,0(%rsp)
+	vmovdqa	%xmm1,16(%rsp)
+	vmovdqa	%xmm2,32(%rsp)
+	vmovdqa	%xmm3,48(%rsp)
+	addq	$64,%rdx
+	jmp	.Loop_tail_avx512vl
+
+.align	16
+.Ltail_avx512vl:
+	vmovdqa	%xmm4,0(%rsp)
+	vmovdqa	%xmm5,16(%rsp)
+	vmovdqa	%xmm6,32(%rsp)
+	vmovdqa	%xmm7,48(%rsp)
+	addq	$64,%rdx
+
+.Loop_tail_avx512vl:
+	movzbl	(%rsi,%r8,1),%eax
+	movzbl	(%rsp,%r8,1),%ecx
+	leaq	1(%r8),%r8
+	xorl	%ecx,%eax
+	movb	%al,-1(%rdi,%r8,1)
+	decq	%rdx
+	jnz	.Loop_tail_avx512vl
+
+	vmovdqa32	%ymm16,0(%rsp)
+	vmovdqa32	%ymm16,32(%rsp)
+
+.Ldone_avx512vl:
+	vzeroall
+	leaq	-8(%r10),%rsp
+.Lavx512vl_epilogue:
+	ret
+
+.align	32
+.Lchacha20_8xvl:
+	leaq	8(%rsp),%r10
+	subq	$64+8,%rsp
+	andq	$-64,%rsp
+	vzeroupper
+
+	leaq	.Lsigma(%rip),%r9
+	vbroadcasti128	(%r9),%ymm3
+	vbroadcasti128	(%rcx),%ymm7
+	vbroadcasti128	16(%rcx),%ymm11
+	vbroadcasti128	(%r8),%ymm15
+
+	vpshufd	$0x00,%ymm3,%ymm0
+	vpshufd	$0x55,%ymm3,%ymm1
+	vpshufd	$0xaa,%ymm3,%ymm2
+	vpshufd	$0xff,%ymm3,%ymm3
+	vmovdqa64	%ymm0,%ymm16
+	vmovdqa64	%ymm1,%ymm17
+	vmovdqa64	%ymm2,%ymm18
+	vmovdqa64	%ymm3,%ymm19
+
+	vpshufd	$0x00,%ymm7,%ymm4
+	vpshufd	$0x55,%ymm7,%ymm5
+	vpshufd	$0xaa,%ymm7,%ymm6
+	vpshufd	$0xff,%ymm7,%ymm7
+	vmovdqa64	%ymm4,%ymm20
+	vmovdqa64	%ymm5,%ymm21
+	vmovdqa64	%ymm6,%ymm22
+	vmovdqa64	%ymm7,%ymm23
+
+	vpshufd	$0x00,%ymm11,%ymm8
+	vpshufd	$0x55,%ymm11,%ymm9
+	vpshufd	$0xaa,%ymm11,%ymm10
+	vpshufd	$0xff,%ymm11,%ymm11
+	vmovdqa64	%ymm8,%ymm24
+	vmovdqa64	%ymm9,%ymm25
+	vmovdqa64	%ymm10,%ymm26
+	vmovdqa64	%ymm11,%ymm27
+
+	vpshufd	$0x00,%ymm15,%ymm12
+	vpshufd	$0x55,%ymm15,%ymm13
+	vpshufd	$0xaa,%ymm15,%ymm14
+	vpshufd	$0xff,%ymm15,%ymm15
+	vpaddd	.Lincy(%rip),%ymm12,%ymm12
+	vmovdqa64	%ymm12,%ymm28
+	vmovdqa64	%ymm13,%ymm29
+	vmovdqa64	%ymm14,%ymm30
+	vmovdqa64	%ymm15,%ymm31
+
+	movl	$10,%eax
+	jmp	.Loop8xvl
+
+.align	32
+.Loop_outer8xvl:
+
+
+	vpbroadcastd	8(%r9),%ymm2
+	vpbroadcastd	12(%r9),%ymm3
+	vpaddd	.Leight(%rip),%ymm28,%ymm28
+	vmovdqa64	%ymm20,%ymm4
+	vmovdqa64	%ymm21,%ymm5
+	vmovdqa64	%ymm22,%ymm6
+	vmovdqa64	%ymm23,%ymm7
+	vmovdqa64	%ymm24,%ymm8
+	vmovdqa64	%ymm25,%ymm9
+	vmovdqa64	%ymm26,%ymm10
+	vmovdqa64	%ymm27,%ymm11
+	vmovdqa64	%ymm28,%ymm12
+	vmovdqa64	%ymm29,%ymm13
+	vmovdqa64	%ymm30,%ymm14
+	vmovdqa64	%ymm31,%ymm15
+
+	vmovdqa64	%ymm0,%ymm16
+	vmovdqa64	%ymm1,%ymm17
+	vmovdqa64	%ymm2,%ymm18
+	vmovdqa64	%ymm3,%ymm19
+
+	movl	$10,%eax
+	jmp	.Loop8xvl
+
+.align	32
+.Loop8xvl:
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm3,%ymm15,%ymm15
+	vprold	$16,%ymm12,%ymm12
+	vprold	$16,%ymm13,%ymm13
+	vprold	$16,%ymm14,%ymm14
+	vprold	$16,%ymm15,%ymm15
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm11,%ymm7,%ymm7
+	vprold	$12,%ymm4,%ymm4
+	vprold	$12,%ymm5,%ymm5
+	vprold	$12,%ymm6,%ymm6
+	vprold	$12,%ymm7,%ymm7
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm3,%ymm15,%ymm15
+	vprold	$8,%ymm12,%ymm12
+	vprold	$8,%ymm13,%ymm13
+	vprold	$8,%ymm14,%ymm14
+	vprold	$8,%ymm15,%ymm15
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm11,%ymm7,%ymm7
+	vprold	$7,%ymm4,%ymm4
+	vprold	$7,%ymm5,%ymm5
+	vprold	$7,%ymm6,%ymm6
+	vprold	$7,%ymm7,%ymm7
+	vpaddd	%ymm5,%ymm0,%ymm0
+	vpaddd	%ymm6,%ymm1,%ymm1
+	vpaddd	%ymm7,%ymm2,%ymm2
+	vpaddd	%ymm4,%ymm3,%ymm3
+	vpxor	%ymm0,%ymm15,%ymm15
+	vpxor	%ymm1,%ymm12,%ymm12
+	vpxor	%ymm2,%ymm13,%ymm13
+	vpxor	%ymm3,%ymm14,%ymm14
+	vprold	$16,%ymm15,%ymm15
+	vprold	$16,%ymm12,%ymm12
+	vprold	$16,%ymm13,%ymm13
+	vprold	$16,%ymm14,%ymm14
+	vpaddd	%ymm15,%ymm10,%ymm10
+	vpaddd	%ymm12,%ymm11,%ymm11
+	vpaddd	%ymm13,%ymm8,%ymm8
+	vpaddd	%ymm14,%ymm9,%ymm9
+	vpxor	%ymm10,%ymm5,%ymm5
+	vpxor	%ymm11,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpxor	%ymm9,%ymm4,%ymm4
+	vprold	$12,%ymm5,%ymm5
+	vprold	$12,%ymm6,%ymm6
+	vprold	$12,%ymm7,%ymm7
+	vprold	$12,%ymm4,%ymm4
+	vpaddd	%ymm5,%ymm0,%ymm0
+	vpaddd	%ymm6,%ymm1,%ymm1
+	vpaddd	%ymm7,%ymm2,%ymm2
+	vpaddd	%ymm4,%ymm3,%ymm3
+	vpxor	%ymm0,%ymm15,%ymm15
+	vpxor	%ymm1,%ymm12,%ymm12
+	vpxor	%ymm2,%ymm13,%ymm13
+	vpxor	%ymm3,%ymm14,%ymm14
+	vprold	$8,%ymm15,%ymm15
+	vprold	$8,%ymm12,%ymm12
+	vprold	$8,%ymm13,%ymm13
+	vprold	$8,%ymm14,%ymm14
+	vpaddd	%ymm15,%ymm10,%ymm10
+	vpaddd	%ymm12,%ymm11,%ymm11
+	vpaddd	%ymm13,%ymm8,%ymm8
+	vpaddd	%ymm14,%ymm9,%ymm9
+	vpxor	%ymm10,%ymm5,%ymm5
+	vpxor	%ymm11,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpxor	%ymm9,%ymm4,%ymm4
+	vprold	$7,%ymm5,%ymm5
+	vprold	$7,%ymm6,%ymm6
+	vprold	$7,%ymm7,%ymm7
+	vprold	$7,%ymm4,%ymm4
+	decl	%eax
+	jnz	.Loop8xvl
+
+	vpaddd	%ymm16,%ymm0,%ymm0
+	vpaddd	%ymm17,%ymm1,%ymm1
+	vpaddd	%ymm18,%ymm2,%ymm2
+	vpaddd	%ymm19,%ymm3,%ymm3
+
+	vpunpckldq	%ymm1,%ymm0,%ymm18
+	vpunpckldq	%ymm3,%ymm2,%ymm19
+	vpunpckhdq	%ymm1,%ymm0,%ymm0
+	vpunpckhdq	%ymm3,%ymm2,%ymm2
+	vpunpcklqdq	%ymm19,%ymm18,%ymm1
+	vpunpckhqdq	%ymm19,%ymm18,%ymm18
+	vpunpcklqdq	%ymm2,%ymm0,%ymm3
+	vpunpckhqdq	%ymm2,%ymm0,%ymm0
+	vpaddd	%ymm20,%ymm4,%ymm4
+	vpaddd	%ymm21,%ymm5,%ymm5
+	vpaddd	%ymm22,%ymm6,%ymm6
+	vpaddd	%ymm23,%ymm7,%ymm7
+
+	vpunpckldq	%ymm5,%ymm4,%ymm2
+	vpunpckldq	%ymm7,%ymm6,%ymm19
+	vpunpckhdq	%ymm5,%ymm4,%ymm4
+	vpunpckhdq	%ymm7,%ymm6,%ymm6
+	vpunpcklqdq	%ymm19,%ymm2,%ymm5
+	vpunpckhqdq	%ymm19,%ymm2,%ymm2
+	vpunpcklqdq	%ymm6,%ymm4,%ymm7
+	vpunpckhqdq	%ymm6,%ymm4,%ymm4
+	vshufi32x4	$0,%ymm5,%ymm1,%ymm19
+	vshufi32x4	$3,%ymm5,%ymm1,%ymm5
+	vshufi32x4	$0,%ymm2,%ymm18,%ymm1
+	vshufi32x4	$3,%ymm2,%ymm18,%ymm2
+	vshufi32x4	$0,%ymm7,%ymm3,%ymm18
+	vshufi32x4	$3,%ymm7,%ymm3,%ymm7
+	vshufi32x4	$0,%ymm4,%ymm0,%ymm3
+	vshufi32x4	$3,%ymm4,%ymm0,%ymm4
+	vpaddd	%ymm24,%ymm8,%ymm8
+	vpaddd	%ymm25,%ymm9,%ymm9
+	vpaddd	%ymm26,%ymm10,%ymm10
+	vpaddd	%ymm27,%ymm11,%ymm11
+
+	vpunpckldq	%ymm9,%ymm8,%ymm6
+	vpunpckldq	%ymm11,%ymm10,%ymm0
+	vpunpckhdq	%ymm9,%ymm8,%ymm8
+	vpunpckhdq	%ymm11,%ymm10,%ymm10
+	vpunpcklqdq	%ymm0,%ymm6,%ymm9
+	vpunpckhqdq	%ymm0,%ymm6,%ymm6
+	vpunpcklqdq	%ymm10,%ymm8,%ymm11
+	vpunpckhqdq	%ymm10,%ymm8,%ymm8
+	vpaddd	%ymm28,%ymm12,%ymm12
+	vpaddd	%ymm29,%ymm13,%ymm13
+	vpaddd	%ymm30,%ymm14,%ymm14
+	vpaddd	%ymm31,%ymm15,%ymm15
+
+	vpunpckldq	%ymm13,%ymm12,%ymm10
+	vpunpckldq	%ymm15,%ymm14,%ymm0
+	vpunpckhdq	%ymm13,%ymm12,%ymm12
+	vpunpckhdq	%ymm15,%ymm14,%ymm14
+	vpunpcklqdq	%ymm0,%ymm10,%ymm13
+	vpunpckhqdq	%ymm0,%ymm10,%ymm10
+	vpunpcklqdq	%ymm14,%ymm12,%ymm15
+	vpunpckhqdq	%ymm14,%ymm12,%ymm12
+	vperm2i128	$0x20,%ymm13,%ymm9,%ymm0
+	vperm2i128	$0x31,%ymm13,%ymm9,%ymm13
+	vperm2i128	$0x20,%ymm10,%ymm6,%ymm9
+	vperm2i128	$0x31,%ymm10,%ymm6,%ymm10
+	vperm2i128	$0x20,%ymm15,%ymm11,%ymm6
+	vperm2i128	$0x31,%ymm15,%ymm11,%ymm15
+	vperm2i128	$0x20,%ymm12,%ymm8,%ymm11
+	vperm2i128	$0x31,%ymm12,%ymm8,%ymm12
+	cmpq	$512,%rdx
+	jb	.Ltail8xvl
+
+	movl	$0x80,%eax
+	vpxord	0(%rsi),%ymm19,%ymm19
+	vpxor	32(%rsi),%ymm0,%ymm0
+	vpxor	64(%rsi),%ymm5,%ymm5
+	vpxor	96(%rsi),%ymm13,%ymm13
+	leaq	(%rsi,%rax,1),%rsi
+	vmovdqu32	%ymm19,0(%rdi)
+	vmovdqu	%ymm0,32(%rdi)
+	vmovdqu	%ymm5,64(%rdi)
+	vmovdqu	%ymm13,96(%rdi)
+	leaq	(%rdi,%rax,1),%rdi
+
+	vpxor	0(%rsi),%ymm1,%ymm1
+	vpxor	32(%rsi),%ymm9,%ymm9
+	vpxor	64(%rsi),%ymm2,%ymm2
+	vpxor	96(%rsi),%ymm10,%ymm10
+	leaq	(%rsi,%rax,1),%rsi
+	vmovdqu	%ymm1,0(%rdi)
+	vmovdqu	%ymm9,32(%rdi)
+	vmovdqu	%ymm2,64(%rdi)
+	vmovdqu	%ymm10,96(%rdi)
+	leaq	(%rdi,%rax,1),%rdi
+
+	vpxord	0(%rsi),%ymm18,%ymm18
+	vpxor	32(%rsi),%ymm6,%ymm6
+	vpxor	64(%rsi),%ymm7,%ymm7
+	vpxor	96(%rsi),%ymm15,%ymm15
+	leaq	(%rsi,%rax,1),%rsi
+	vmovdqu32	%ymm18,0(%rdi)
+	vmovdqu	%ymm6,32(%rdi)
+	vmovdqu	%ymm7,64(%rdi)
+	vmovdqu	%ymm15,96(%rdi)
+	leaq	(%rdi,%rax,1),%rdi
+
+	vpxor	0(%rsi),%ymm3,%ymm3
+	vpxor	32(%rsi),%ymm11,%ymm11
+	vpxor	64(%rsi),%ymm4,%ymm4
+	vpxor	96(%rsi),%ymm12,%ymm12
+	leaq	(%rsi,%rax,1),%rsi
+	vmovdqu	%ymm3,0(%rdi)
+	vmovdqu	%ymm11,32(%rdi)
+	vmovdqu	%ymm4,64(%rdi)
+	vmovdqu	%ymm12,96(%rdi)
+	leaq	(%rdi,%rax,1),%rdi
+
+	vpbroadcastd	0(%r9),%ymm0
+	vpbroadcastd	4(%r9),%ymm1
+
+	subq	$512,%rdx
+	jnz	.Loop_outer8xvl
+
+	jmp	.Ldone8xvl
+
+.align	32
+.Ltail8xvl:
+	vmovdqa64	%ymm19,%ymm8
+	xorq	%r9,%r9
+	subq	%rsi,%rdi
+	cmpq	$64,%rdx
+	jb	.Less_than_64_8xvl
+	vpxor	0(%rsi),%ymm8,%ymm8
+	vpxor	32(%rsi),%ymm0,%ymm0
+	vmovdqu	%ymm8,0(%rdi,%rsi,1)
+	vmovdqu	%ymm0,32(%rdi,%rsi,1)
+	je	.Ldone8xvl
+	vmovdqa	%ymm5,%ymm8
+	vmovdqa	%ymm13,%ymm0
+	leaq	64(%rsi),%rsi
+
+	cmpq	$128,%rdx
+	jb	.Less_than_64_8xvl
+	vpxor	0(%rsi),%ymm5,%ymm5
+	vpxor	32(%rsi),%ymm13,%ymm13
+	vmovdqu	%ymm5,0(%rdi,%rsi,1)
+	vmovdqu	%ymm13,32(%rdi,%rsi,1)
+	je	.Ldone8xvl
+	vmovdqa	%ymm1,%ymm8
+	vmovdqa	%ymm9,%ymm0
+	leaq	64(%rsi),%rsi
+
+	cmpq	$192,%rdx
+	jb	.Less_than_64_8xvl
+	vpxor	0(%rsi),%ymm1,%ymm1
+	vpxor	32(%rsi),%ymm9,%ymm9
+	vmovdqu	%ymm1,0(%rdi,%rsi,1)
+	vmovdqu	%ymm9,32(%rdi,%rsi,1)
+	je	.Ldone8xvl
+	vmovdqa	%ymm2,%ymm8
+	vmovdqa	%ymm10,%ymm0
+	leaq	64(%rsi),%rsi
+
+	cmpq	$256,%rdx
+	jb	.Less_than_64_8xvl
+	vpxor	0(%rsi),%ymm2,%ymm2
+	vpxor	32(%rsi),%ymm10,%ymm10
+	vmovdqu	%ymm2,0(%rdi,%rsi,1)
+	vmovdqu	%ymm10,32(%rdi,%rsi,1)
+	je	.Ldone8xvl
+	vmovdqa32	%ymm18,%ymm8
+	vmovdqa	%ymm6,%ymm0
+	leaq	64(%rsi),%rsi
+
+	cmpq	$320,%rdx
+	jb	.Less_than_64_8xvl
+	vpxord	0(%rsi),%ymm18,%ymm18
+	vpxor	32(%rsi),%ymm6,%ymm6
+	vmovdqu32	%ymm18,0(%rdi,%rsi,1)
+	vmovdqu	%ymm6,32(%rdi,%rsi,1)
+	je	.Ldone8xvl
+	vmovdqa	%ymm7,%ymm8
+	vmovdqa	%ymm15,%ymm0
+	leaq	64(%rsi),%rsi
+
+	cmpq	$384,%rdx
+	jb	.Less_than_64_8xvl
+	vpxor	0(%rsi),%ymm7,%ymm7
+	vpxor	32(%rsi),%ymm15,%ymm15
+	vmovdqu	%ymm7,0(%rdi,%rsi,1)
+	vmovdqu	%ymm15,32(%rdi,%rsi,1)
+	je	.Ldone8xvl
+	vmovdqa	%ymm3,%ymm8
+	vmovdqa	%ymm11,%ymm0
+	leaq	64(%rsi),%rsi
+
+	cmpq	$448,%rdx
+	jb	.Less_than_64_8xvl
+	vpxor	0(%rsi),%ymm3,%ymm3
+	vpxor	32(%rsi),%ymm11,%ymm11
+	vmovdqu	%ymm3,0(%rdi,%rsi,1)
+	vmovdqu	%ymm11,32(%rdi,%rsi,1)
+	je	.Ldone8xvl
+	vmovdqa	%ymm4,%ymm8
+	vmovdqa	%ymm12,%ymm0
+	leaq	64(%rsi),%rsi
+
+.Less_than_64_8xvl:
+	vmovdqa	%ymm8,0(%rsp)
+	vmovdqa	%ymm0,32(%rsp)
+	leaq	(%rdi,%rsi,1),%rdi
+	andq	$63,%rdx
+
+.Loop_tail8xvl:
+	movzbl	(%rsi,%r9,1),%eax
+	movzbl	(%rsp,%r9,1),%ecx
+	leaq	1(%r9),%r9
+	xorl	%ecx,%eax
+	movb	%al,-1(%rdi,%r9,1)
+	decq	%rdx
+	jnz	.Loop_tail8xvl
+
+	vpxor	%ymm8,%ymm8,%ymm8
+	vmovdqa	%ymm8,0(%rsp)
+	vmovdqa	%ymm8,32(%rsp)
+
+.Ldone8xvl:
+	vzeroall
+	leaq	-8(%r10),%rsp
+.L8xvl_epilogue:
+	ret
+ENDPROC(chacha20_avx512vl)
+
+#endif /* CONFIG_AS_AVX512 */
-- 
2.19.0

^ permalink raw reply related

* [PATCH net-next v4 04/20] zinc: ChaCha20 ARM and ARM64 implementations
From: Jason A. Donenfeld @ 2018-09-14 16:22 UTC (permalink / raw)
  To: linux-kernel, netdev, linux-crypto, davem, gregkh
  Cc: Jason A. Donenfeld, Samuel Neves, Andy Lutomirski,
	Jean-Philippe Aumasson, Andy Polyakov, Russell King,
	linux-arm-kernel
In-Reply-To: <20180914162240.7925-1-Jason@zx2c4.com>

These NEON and non-NEON implementations come from Andy Polyakov's
implementation. They are exactly the same as Andy Polyakov's original,
with the following exceptions:

- Entries and exits use the proper kernel convention macro.
- CPU feature checking is done in C by the glue code, so that has been
  removed from the assembly.
- The function names have been renamed to fit kernel conventions.
- Labels have been renamed (prefixed with .L) to fit kernel conventions.
- Constants have been rearranged so that they are closer to the code
  that is using them. [ARM only]
- The neon code can jump to the scalar code when it makes sense to do
  so.
- The neon_512 function as a separate function has been removed, leaving
  the decision up to the main neon entry point. [ARM64 only]

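After filtering both versions through sed with '/^#/d;/^\..*[^:]$/d'
(which drops preprocessor lines and assembler directives but keeps
labels), the actual instructions differ from the original as follows.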

ARM:

-ChaCha20_ctr32:
-.LChaCha20_ctr32:
+ENTRY(chacha20_arm)
 	ldr	r12,[sp,#0]		@ pull pointer to counter and nonce
 	stmdb	sp!,{r0-r2,r4-r11,lr}
-	sub	r14,pc,#16		@ ChaCha20_ctr32
-	adr	r14,.LChaCha20_ctr32
 	cmp	r2,#0			@ len==0?
 	itt	eq
 	addeq	sp,sp,#4*3
-	beq	.Lno_data
-	cmp	r2,#192			@ test len
-	bls	.Lshort
-	ldr	r4,[r14,#-32]
-	ldr	r4,[r14,r4]
-	ldr	r4,[r4]
-	tst	r4,#ARMV7_NEON
-	bne	.LChaCha20_neon
+	beq	.Lno_data_arm
 .Lshort:
 	ldmia	r12,{r4-r7}		@ load counter and nonce
 	sub	sp,sp,#4*(16)		@ off-load area
-	sub	r14,r14,#64		@ .Lsigma
+	sub	r14,pc,#100		@ .Lsigma
+	adr	r14,.Lsigma		@ .Lsigma
 	stmdb	sp!,{r4-r7}		@ copy counter and nonce
 	ldmia	r3,{r4-r11}		@ load key
 	ldmia	r14,{r0-r3}		@ load sigma
@@ -617,14 +615,25 @@

 .Ldone:
 	add	sp,sp,#4*(32+3)
-.Lno_data:
+.Lno_data_arm:
 	ldmia	sp!,{r4-r11,pc}
+ENDPROC(chacha20_arm)

-ChaCha20_neon:
+ENTRY(chacha20_neon)
 	ldr		r12,[sp,#0]		@ pull pointer to counter and nonce
 	stmdb		sp!,{r0-r2,r4-r11,lr}
-.LChaCha20_neon:
-	adr		r14,.Lsigma
+	cmp		r2,#0			@ len==0?
+	itt		eq
+	addeq		sp,sp,#4*3
+	beq		.Lno_data_neon
+	cmp		r2,#192			@ test len
+	bls		.Lshort
+.Lchacha20_neon_begin:
+	adr		r14,.Lsigma2
 	vstmdb		sp!,{d8-d15}		@ ABI spec says so
 	stmdb		sp!,{r0-r3}

@@ -1265,4 +1274,6 @@
 	add		sp,sp,#4*(32+4)
 	vldmia		sp,{d8-d15}
 	add		sp,sp,#4*(16+3)
+.Lno_data_neon:
 	ldmia		sp!,{r4-r11,pc}
+ENDPROC(chacha20_neon)

ARM64:

-ChaCha20_ctr32:
+ENTRY(chacha20_arm)
 	cbz	x2,.Labort
-	adr	x5,.LOPENSSL_armcap_P
-	cmp	x2,#192
-	b.lo	.Lshort
-	ldrsw	x6,[x5]
-	ldr	x6,[x5]
-	ldr	w17,[x6,x5]
-	tst	w17,#ARMV7_NEON
-	b.ne	ChaCha20_neon
-
 .Lshort:
 	stp	x29,x30,[sp,#-96]!
 	add	x29,sp,#0
@@ -279,8 +274,13 @@
 	ldp	x27,x28,[x29,#80]
 	ldp	x29,x30,[sp],#96
 	ret
+ENDPROC(chacha20_arm)
+
+ENTRY(chacha20_neon)
+	cbz	x2,.Labort_neon
+	cmp	x2,#192
+	b.lo	.Lshort

-ChaCha20_neon:
 	stp	x29,x30,[sp,#-96]!
 	add	x29,sp,#0

@@ -763,16 +763,6 @@
 	ldp	x27,x28,[x29,#80]
 	ldp	x29,x30,[sp],#96
 	ret
-ChaCha20_512_neon:
-	stp	x29,x30,[sp,#-96]!
-	add	x29,sp,#0
-
-	adr	x5,.Lsigma
-	stp	x19,x20,[sp,#16]
-	stp	x21,x22,[sp,#32]
-	stp	x23,x24,[sp,#48]
-	stp	x25,x26,[sp,#64]
-	stp	x27,x28,[sp,#80]

 .L512_or_more_neon:
 	sub	sp,sp,#128+64
@@ -1920,4 +1910,6 @@
 	ldp	x25,x26,[x29,#64]
 	ldp	x27,x28,[x29,#80]
 	ldp	x29,x30,[sp],#96
+.Labort_neon:
 	ret
+ENDPROC(chacha20_neon)

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Cc: Samuel Neves <sneves@dei.uc.pt>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Jean-Philippe Aumasson <jeanphilippe.aumasson@gmail.com>
Cc: Andy Polyakov <appro@openssl.org>
Cc: Russell King <linux@armlinux.org.uk>
Cc: linux-arm-kernel@lists.infradead.org
---
 lib/zinc/Makefile                     |    8 +
 lib/zinc/chacha20/chacha20-arm-glue.h |   50 +
 lib/zinc/chacha20/chacha20-arm.S      | 1473 +++++++++++++++++++
 lib/zinc/chacha20/chacha20-arm64.S    | 1942 +++++++++++++++++++++++++
 4 files changed, 3473 insertions(+)
 create mode 100644 lib/zinc/chacha20/chacha20-arm-glue.h
 create mode 100644 lib/zinc/chacha20/chacha20-arm.S
 create mode 100644 lib/zinc/chacha20/chacha20-arm64.S

diff --git a/lib/zinc/Makefile b/lib/zinc/Makefile
index 0b5a964bfba6..8d14cb13349a 100644
--- a/lib/zinc/Makefile
+++ b/lib/zinc/Makefile
@@ -5,6 +5,14 @@ ccflags-$(CONFIG_ZINC_DEBUG) += -DDEBUG
 
 ifeq ($(CONFIG_ZINC_CHACHA20),y)
 zinc-y += chacha20/chacha20.o
+ifeq ($(CONFIG_ZINC_ARCH_ARM),y)
+zinc-y += chacha20/chacha20-arm.o
+CFLAGS_chacha20.o += -include $(srctree)/$(src)/chacha20/chacha20-arm-glue.h
+endif
+ifeq ($(CONFIG_ZINC_ARCH_ARM64),y)
+zinc-y += chacha20/chacha20-arm64.o
+CFLAGS_chacha20.o += -include $(srctree)/$(src)/chacha20/chacha20-arm-glue.h
+endif
 endif
 
 zinc-y += main.o
diff --git a/lib/zinc/chacha20/chacha20-arm-glue.h b/lib/zinc/chacha20/chacha20-arm-glue.h
new file mode 100644
index 000000000000..d32361514f3a
--- /dev/null
+++ b/lib/zinc/chacha20/chacha20-arm-glue.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <zinc/chacha20.h>
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+
+asmlinkage void chacha20_arm(u8 *out, const u8 *in, const size_t len,
+			     const u32 key[8], const u32 counter[4]);	/* integer-only (non-NEON) assembly routine */
+#if IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&                                     \
+	(defined(CONFIG_64BIT) || __LINUX_ARM_ARCH__ >= 7)
+#define ARM_USE_NEON	/* kernel-mode NEON is configured and the target is 64-bit or ARMv7+ */
+asmlinkage void chacha20_neon(u8 *out, const u8 *in, const size_t len,
+			      const u32 key[8], const u32 counter[4]);	/* NEON assembly routine, same argument order */
+#endif
+
+static bool chacha20_use_neon __ro_after_init;	/* written only by chacha20_fpu_init(), read by chacha20_arch() */
+
+void __init chacha20_fpu_init(void)
+{	/* init-time probe: latch the SIMD capability bit of elf_hwcap into chacha20_use_neon */
+#if defined(CONFIG_ARM64)
+	chacha20_use_neon = elf_hwcap & HWCAP_ASIMD;	/* ARM64 build: test the ASIMD bit */
+#elif defined(CONFIG_ARM)
+	chacha20_use_neon = elf_hwcap & HWCAP_NEON;	/* 32-bit ARM build: test the NEON bit */
+#endif
+}
+
+static inline bool chacha20_arch(u8 *dst, const u8 *src, const size_t len,
+				 const u32 key[8], const u32 counter[4],
+				 simd_context_t simd_context)
+{	/* hand the request to one of the two assembly routines; every path returns true */
+#if defined(ARM_USE_NEON)
+	if (simd_context == HAVE_FULL_SIMD && chacha20_use_neon) {	/* only when the context is HAVE_FULL_SIMD and the init-time probe set the flag */
+		chacha20_neon(dst, src, len, key, counter);
+		return true;
+	}
+#endif
+	chacha20_arm(dst, src, len, key, counter);	/* integer-only routine is the fallback in all other cases */
+	return true;	/* NOTE(review): presumably means the arch code handled the request - confirm against the caller in chacha20.c */
+}
+
+static inline bool hchacha20_arch(u8 *derived_key, const u8 *nonce,
+				  const u8 *key, simd_context_t simd_context)
+{	/* stub: no argument is read and nothing is written to derived_key */
+	return false;	/* NOTE(review): presumably tells the caller to use its generic HChaCha20 - confirm in chacha20.c */
+}
+
+#define HAVE_CHACHA20_ARCH_IMPLEMENTATION
diff --git a/lib/zinc/chacha20/chacha20-arm.S b/lib/zinc/chacha20/chacha20-arm.S
new file mode 100644
index 000000000000..0ea1db1492eb
--- /dev/null
+++ b/lib/zinc/chacha20/chacha20-arm.S
@@ -0,0 +1,1473 @@
+/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ * Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
+ *
+ * This is based in part on Andy Polyakov's implementation from CRYPTOGAMS.
+ */
+
+#include <linux/linkage.h>
+
+.text
+#if defined(__thumb2__) || defined(__clang__)
+.syntax	unified
+#endif
+#if defined(__thumb2__)
+.thumb
+#else
+.code	32
+#endif
+
+#if defined(__thumb2__) || defined(__clang__)
+#define ldrhsb	ldrbhs
+#endif
+
+.align	5
+.Lsigma:
+.long	0x61707865,0x3320646e,0x79622d32,0x6b206574	@ endian-neutral
+.Lone:
+.long	1,0,0,0
+.word -1
+
+.align	5
+ENTRY(chacha20_arm)
+	ldr	r12,[sp,#0]		@ pull pointer to counter and nonce (fifth argument, read from the caller stack)
+	stmdb	sp!,{r0-r2,r4-r11,lr}	@ push out/inp/len (r0-r2), r4-r11 and lr: 12 words
+	cmp	r2,#0			@ len==0?
+#ifdef	__thumb2__
+	itt	eq
+#endif
+	addeq	sp,sp,#4*3		@ len==0: discard the saved r0-r2 so the pop at .Lno_data_arm lines up
+	beq	.Lno_data_arm
+.Lshort:				@ also branched to from chacha20_neon when len <= 192
+	ldmia	r12,{r4-r7}		@ load counter and nonce
+	sub	sp,sp,#4*(16)		@ off-load area (16 words, addressed below as sp+4*(16+n))
+#if __LINUX_ARM_ARCH__ < 7 && !defined(__thumb2__)
+	sub	r14,pc,#100		@ .Lsigma (hand-computed pc-relative offset, tied to the code layout above)
+#else
+	adr	r14,.Lsigma		@ .Lsigma
+#endif
+	stmdb	sp!,{r4-r7}		@ copy counter and nonce
+	ldmia	r3,{r4-r11}		@ load key
+	ldmia	r14,{r0-r3}		@ load sigma
+	stmdb	sp!,{r4-r11}		@ copy key
+	stmdb	sp!,{r0-r3}		@ copy sigma (sp+0..63 now holds sigma, key, counter and nonce)
+	str	r10,[sp,#4*(16+10)]	@ off-load "rx"
+	str	r11,[sp,#4*(16+11)]	@ off-load "rx"
+	b	.Loop_outer_enter	@ first block: r0-r9 are already loaded, skip the reload in .Loop_outer
+
+.align	4
+.Loop_outer:				@ reached by bhi from the output code once per further 64-byte block
+	ldmia	sp,{r0-r9}		@ load key material
+	str	r11,[sp,#4*(32+2)]	@ save len
+	str	r12,  [sp,#4*(32+1)]	@ save inp
+	str	r14,  [sp,#4*(32+0)]	@ save out
+.Loop_outer_enter:
+	ldr	r11, [sp,#4*(15)]	@ word 15 of the stack copy passes through r11 ...
+	ldr	r12,[sp,#4*(12)]	@ modulo-scheduled load
+	ldr	r10, [sp,#4*(13)]	@ r10 starts out holding word 13
+	ldr	r14,[sp,#4*(14)]
+	str	r11, [sp,#4*(16+15)]	@ ... into its off-load slot, freeing r11
+	mov	r11,#10			@ r11 = pass counter for .Loop
+	b	.Loop
+
+.align	4
+.Loop:
+	subs	r11,r11,#1		@ pass counter; no later instruction sets flags, so bne below tests this
+	add	r0,r0,r4		@ column step on words (0,4,8,12) and (1,5,9,13), interleaved; r10 = word 13
+	mov	r12,r12,ror#16		@ ror on the register and on the eor operand rotates the xor result
+	add	r1,r1,r5
+	mov	r10,r10,ror#16
+	eor	r12,r12,r0,ror#16
+	eor	r10,r10,r1,ror#16
+	add	r8,r8,r12
+	mov	r4,r4,ror#20
+	add	r9,r9,r10
+	mov	r5,r5,ror#20
+	eor	r4,r4,r8,ror#20
+	eor	r5,r5,r9,ror#20
+	add	r0,r0,r4
+	mov	r12,r12,ror#24
+	add	r1,r1,r5
+	mov	r10,r10,ror#24
+	eor	r12,r12,r0,ror#24
+	eor	r10,r10,r1,ror#24
+	add	r8,r8,r12
+	mov	r4,r4,ror#25
+	add	r9,r9,r10
+	mov	r5,r5,ror#25
+	str	r10,[sp,#4*(16+13)]	@ park word 13 ...
+	ldr	r10,[sp,#4*(16+15)]	@ ... and bring word 15 into r10
+	eor	r4,r4,r8,ror#25
+	eor	r5,r5,r9,ror#25
+	str	r8,[sp,#4*(16+8)]	@ park word 8 ...
+	ldr	r8,[sp,#4*(16+10)]	@ ... r8 now holds word 10
+	add	r2,r2,r6		@ column step on words (2,6,10,14) and (3,7,11,15)
+	mov	r14,r14,ror#16
+	str	r9,[sp,#4*(16+9)]	@ park word 9 ...
+	ldr	r9,[sp,#4*(16+11)]	@ ... r9 now holds word 11
+	add	r3,r3,r7
+	mov	r10,r10,ror#16
+	eor	r14,r14,r2,ror#16
+	eor	r10,r10,r3,ror#16
+	add	r8,r8,r14
+	mov	r6,r6,ror#20
+	add	r9,r9,r10
+	mov	r7,r7,ror#20
+	eor	r6,r6,r8,ror#20
+	eor	r7,r7,r9,ror#20
+	add	r2,r2,r6
+	mov	r14,r14,ror#24
+	add	r3,r3,r7
+	mov	r10,r10,ror#24
+	eor	r14,r14,r2,ror#24
+	eor	r10,r10,r3,ror#24
+	add	r8,r8,r14
+	mov	r6,r6,ror#25
+	add	r9,r9,r10
+	mov	r7,r7,ror#25
+	eor	r6,r6,r8,ror#25
+	eor	r7,r7,r9,ror#25
+	add	r0,r0,r5		@ diagonal step on words (0,5,10,15) and (1,6,11,12); r10 = word 15
+	mov	r10,r10,ror#16
+	add	r1,r1,r6
+	mov	r12,r12,ror#16
+	eor	r10,r10,r0,ror#16
+	eor	r12,r12,r1,ror#16
+	add	r8,r8,r10
+	mov	r5,r5,ror#20
+	add	r9,r9,r12
+	mov	r6,r6,ror#20
+	eor	r5,r5,r8,ror#20
+	eor	r6,r6,r9,ror#20
+	add	r0,r0,r5
+	mov	r10,r10,ror#24
+	add	r1,r1,r6
+	mov	r12,r12,ror#24
+	eor	r10,r10,r0,ror#24
+	eor	r12,r12,r1,ror#24
+	add	r8,r8,r10
+	mov	r5,r5,ror#25
+	str	r10,[sp,#4*(16+15)]	@ park word 15 ...
+	ldr	r10,[sp,#4*(16+13)]	@ ... and bring word 13 back into r10
+	add	r9,r9,r12
+	mov	r6,r6,ror#25
+	eor	r5,r5,r8,ror#25
+	eor	r6,r6,r9,ror#25
+	str	r8,[sp,#4*(16+10)]	@ park word 10 ...
+	ldr	r8,[sp,#4*(16+8)]	@ ... r8 holds word 8 again
+	add	r2,r2,r7		@ diagonal step on words (2,7,8,13) and (3,4,9,14)
+	mov	r10,r10,ror#16
+	str	r9,[sp,#4*(16+11)]	@ park word 11 ...
+	ldr	r9,[sp,#4*(16+9)]	@ ... r9 holds word 9 again
+	add	r3,r3,r4
+	mov	r14,r14,ror#16
+	eor	r10,r10,r2,ror#16
+	eor	r14,r14,r3,ror#16
+	add	r8,r8,r10
+	mov	r7,r7,ror#20
+	add	r9,r9,r14
+	mov	r4,r4,ror#20
+	eor	r7,r7,r8,ror#20
+	eor	r4,r4,r9,ror#20
+	add	r2,r2,r7
+	mov	r10,r10,ror#24
+	add	r3,r3,r4
+	mov	r14,r14,ror#24
+	eor	r10,r10,r2,ror#24
+	eor	r14,r14,r3,ror#24
+	add	r8,r8,r10
+	mov	r7,r7,ror#25
+	add	r9,r9,r14
+	mov	r4,r4,ror#25
+	eor	r7,r7,r8,ror#25
+	eor	r4,r4,r9,ror#25
+	bne	.Loop			@ repeat until the counter set to 10 before the loop reaches zero
+
+	ldr	r11,[sp,#4*(32+2)]	@ load len
+
+	str	r8, [sp,#4*(16+8)]	@ modulo-scheduled store
+	str	r9, [sp,#4*(16+9)]
+	str	r12,[sp,#4*(16+12)]
+	str	r10, [sp,#4*(16+13)]
+	str	r14,[sp,#4*(16+14)]
+
+	@ at this point we have first half of 512-bit result in
+	@ rx and second half at sp+4*(16+8)
+
+	cmp	r11,#64		@ done yet? (lo: under 64 bytes remain, block is staged at sp for .Ltail)
+#ifdef	__thumb2__
+	itete	lo
+#endif
+	addlo	r12,sp,#4*(0)		@ shortcut or ...
+	ldrhs	r12,[sp,#4*(32+1)]	@ ... load inp
+	addlo	r14,sp,#4*(0)		@ shortcut or ...
+	ldrhs	r14,[sp,#4*(32+0)]	@ ... load out
+
+	ldr	r8,[sp,#4*(0)]	@ load key material
+	ldr	r9,[sp,#4*(1)]
+
+#if __LINUX_ARM_ARCH__ >= 6 || !defined(__ARMEB__)
+#if __LINUX_ARM_ARCH__ < 7
+	orr	r10,r12,r14
+	tst	r10,#3		@ are input and output aligned?
+	ldr	r10,[sp,#4*(2)]
+	bne	.Lunaligned
+	cmp	r11,#64		@ restore flags
+#else
+	ldr	r10,[sp,#4*(2)]
+#endif
+	ldr	r11,[sp,#4*(3)]
+
+	add	r0,r0,r8	@ accumulate key material
+	add	r1,r1,r9
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	ldrhs	r8,[r12],#16		@ load input
+	ldrhs	r9,[r12,#-12]
+
+	add	r2,r2,r10
+	add	r3,r3,r11
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	ldrhs	r10,[r12,#-8]
+	ldrhs	r11,[r12,#-4]
+#if __LINUX_ARM_ARCH__ >= 6 && defined(__ARMEB__)
+	rev	r0,r0
+	rev	r1,r1
+	rev	r2,r2
+	rev	r3,r3
+#endif
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	eorhs	r0,r0,r8	@ xor with input
+	eorhs	r1,r1,r9
+	 add	r8,sp,#4*(4)
+	str	r0,[r14],#16		@ store output
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	eorhs	r2,r2,r10
+	eorhs	r3,r3,r11
+	 ldmia	r8,{r8-r11}	@ load key material
+	str	r1,[r14,#-12]
+	str	r2,[r14,#-8]
+	str	r3,[r14,#-4]
+
+	add	r4,r4,r8	@ accumulate key material
+	add	r5,r5,r9
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	ldrhs	r8,[r12],#16		@ load input
+	ldrhs	r9,[r12,#-12]
+	add	r6,r6,r10
+	add	r7,r7,r11
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	ldrhs	r10,[r12,#-8]
+	ldrhs	r11,[r12,#-4]
+#if __LINUX_ARM_ARCH__ >= 6 && defined(__ARMEB__)
+	rev	r4,r4
+	rev	r5,r5
+	rev	r6,r6
+	rev	r7,r7
+#endif
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	eorhs	r4,r4,r8
+	eorhs	r5,r5,r9
+	 add	r8,sp,#4*(8)
+	str	r4,[r14],#16		@ store output
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	eorhs	r6,r6,r10
+	eorhs	r7,r7,r11
+	str	r5,[r14,#-12]
+	 ldmia	r8,{r8-r11}	@ load key material
+	str	r6,[r14,#-8]
+	 add	r0,sp,#4*(16+8)
+	str	r7,[r14,#-4]
+
+	ldmia	r0,{r0-r7}	@ load second half
+
+	add	r0,r0,r8	@ accumulate key material
+	add	r1,r1,r9
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	ldrhs	r8,[r12],#16		@ load input
+	ldrhs	r9,[r12,#-12]
+#ifdef	__thumb2__
+	itt	hi
+#endif
+	 strhi	r10,[sp,#4*(16+10)]	@ copy "rx" while at it
+	 strhi	r11,[sp,#4*(16+11)]	@ copy "rx" while at it
+	add	r2,r2,r10
+	add	r3,r3,r11
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	ldrhs	r10,[r12,#-8]
+	ldrhs	r11,[r12,#-4]
+#if __LINUX_ARM_ARCH__ >= 6 && defined(__ARMEB__)
+	rev	r0,r0
+	rev	r1,r1
+	rev	r2,r2
+	rev	r3,r3
+#endif
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	eorhs	r0,r0,r8
+	eorhs	r1,r1,r9
+	 add	r8,sp,#4*(12)
+	str	r0,[r14],#16		@ store output
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	eorhs	r2,r2,r10
+	eorhs	r3,r3,r11
+	str	r1,[r14,#-12]
+	 ldmia	r8,{r8-r11}	@ load key material
+	str	r2,[r14,#-8]
+	str	r3,[r14,#-4]
+
+	add	r4,r4,r8	@ accumulate key material
+	add	r5,r5,r9
+#ifdef	__thumb2__
+	itt	hi
+#endif
+	 addhi	r8,r8,#1		@ next counter value
+	 strhi	r8,[sp,#4*(12)]	@ save next counter value
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	ldrhs	r8,[r12],#16		@ load input
+	ldrhs	r9,[r12,#-12]
+	add	r6,r6,r10
+	add	r7,r7,r11
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	ldrhs	r10,[r12,#-8]
+	ldrhs	r11,[r12,#-4]
+#if __LINUX_ARM_ARCH__ >= 6 && defined(__ARMEB__)
+	rev	r4,r4
+	rev	r5,r5
+	rev	r6,r6
+	rev	r7,r7
+#endif
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	eorhs	r4,r4,r8
+	eorhs	r5,r5,r9
+#ifdef	__thumb2__
+	 it	ne
+#endif
+	 ldrne	r8,[sp,#4*(32+2)]	@ re-load len
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	eorhs	r6,r6,r10
+	eorhs	r7,r7,r11
+	str	r4,[r14],#16		@ store output
+	str	r5,[r14,#-12]
+#ifdef	__thumb2__
+	it	hs
+#endif
+	 subhs	r11,r8,#64		@ len-=64
+	str	r6,[r14,#-8]
+	str	r7,[r14,#-4]
+	bhi	.Loop_outer
+
+	beq	.Ldone
+#if __LINUX_ARM_ARCH__ < 7
+	b	.Ltail
+
+.align	4
+.Lunaligned:				@ unaligned endian-neutral path
+	cmp	r11,#64		@ restore flags
+#endif
+#endif
+#if __LINUX_ARM_ARCH__ < 7
+	ldr	r11,[sp,#4*(3)]
+	add	r0,r0,r8		@ accumulate key material
+	add	r1,r1,r9
+	add	r2,r2,r10
+#ifdef	__thumb2__
+	itete	lo
+#endif
+	eorlo	r8,r8,r8		@ zero or ...
+	ldrhsb	r8,[r12],#16			@ ... load input
+	eorlo	r9,r9,r9
+	ldrhsb	r9,[r12,#-12]
+
+	add	r3,r3,r11
+#ifdef	__thumb2__
+	itete	lo
+#endif
+	eorlo	r10,r10,r10
+	ldrhsb	r10,[r12,#-8]
+	eorlo	r11,r11,r11
+	ldrhsb	r11,[r12,#-4]
+
+	eor	r0,r8,r0		@ xor with input (or zero)
+	eor	r1,r9,r1
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	ldrhsb	r8,[r12,#-15]		@ load more input
+	ldrhsb	r9,[r12,#-11]
+	eor	r2,r10,r2
+	 strb	r0,[r14],#16		@ store output
+	eor	r3,r11,r3
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	ldrhsb	r10,[r12,#-7]
+	ldrhsb	r11,[r12,#-3]
+	 strb	r1,[r14,#-12]
+	eor	r0,r8,r0,lsr#8
+	 strb	r2,[r14,#-8]
+	eor	r1,r9,r1,lsr#8
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	ldrhsb	r8,[r12,#-14]		@ load more input
+	ldrhsb	r9,[r12,#-10]
+	 strb	r3,[r14,#-4]
+	eor	r2,r10,r2,lsr#8
+	 strb	r0,[r14,#-15]
+	eor	r3,r11,r3,lsr#8
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	ldrhsb	r10,[r12,#-6]
+	ldrhsb	r11,[r12,#-2]
+	 strb	r1,[r14,#-11]
+	eor	r0,r8,r0,lsr#8
+	 strb	r2,[r14,#-7]
+	eor	r1,r9,r1,lsr#8
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	ldrhsb	r8,[r12,#-13]		@ load more input
+	ldrhsb	r9,[r12,#-9]
+	 strb	r3,[r14,#-3]
+	eor	r2,r10,r2,lsr#8
+	 strb	r0,[r14,#-14]
+	eor	r3,r11,r3,lsr#8
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	ldrhsb	r10,[r12,#-5]
+	ldrhsb	r11,[r12,#-1]
+	 strb	r1,[r14,#-10]
+	 strb	r2,[r14,#-6]
+	eor	r0,r8,r0,lsr#8
+	 strb	r3,[r14,#-2]
+	eor	r1,r9,r1,lsr#8
+	 strb	r0,[r14,#-13]
+	eor	r2,r10,r2,lsr#8
+	 strb	r1,[r14,#-9]
+	eor	r3,r11,r3,lsr#8
+	 strb	r2,[r14,#-5]
+	 strb	r3,[r14,#-1]
+	add	r8,sp,#4*(4+0)
+	ldmia	r8,{r8-r11}		@ load key material
+	add	r0,sp,#4*(16+8)
+	add	r4,r4,r8		@ accumulate key material
+	add	r5,r5,r9
+	add	r6,r6,r10
+#ifdef	__thumb2__
+	itete	lo
+#endif
+	eorlo	r8,r8,r8		@ zero or ...
+	ldrhsb	r8,[r12],#16			@ ... load input
+	eorlo	r9,r9,r9
+	ldrhsb	r9,[r12,#-12]
+
+	add	r7,r7,r11
+#ifdef	__thumb2__
+	itete	lo
+#endif
+	eorlo	r10,r10,r10
+	ldrhsb	r10,[r12,#-8]
+	eorlo	r11,r11,r11
+	ldrhsb	r11,[r12,#-4]
+
+	eor	r4,r8,r4		@ xor with input (or zero)
+	eor	r5,r9,r5
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	ldrhsb	r8,[r12,#-15]		@ load more input
+	ldrhsb	r9,[r12,#-11]
+	eor	r6,r10,r6
+	 strb	r4,[r14],#16		@ store output
+	eor	r7,r11,r7
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	ldrhsb	r10,[r12,#-7]
+	ldrhsb	r11,[r12,#-3]
+	 strb	r5,[r14,#-12]
+	eor	r4,r8,r4,lsr#8
+	 strb	r6,[r14,#-8]
+	eor	r5,r9,r5,lsr#8
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	ldrhsb	r8,[r12,#-14]		@ load more input
+	ldrhsb	r9,[r12,#-10]
+	 strb	r7,[r14,#-4]
+	eor	r6,r10,r6,lsr#8
+	 strb	r4,[r14,#-15]
+	eor	r7,r11,r7,lsr#8
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	ldrhsb	r10,[r12,#-6]
+	ldrhsb	r11,[r12,#-2]
+	 strb	r5,[r14,#-11]
+	eor	r4,r8,r4,lsr#8
+	 strb	r6,[r14,#-7]
+	eor	r5,r9,r5,lsr#8
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	ldrhsb	r8,[r12,#-13]		@ load more input
+	ldrhsb	r9,[r12,#-9]
+	 strb	r7,[r14,#-3]
+	eor	r6,r10,r6,lsr#8
+	 strb	r4,[r14,#-14]
+	eor	r7,r11,r7,lsr#8
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	ldrhsb	r10,[r12,#-5]
+	ldrhsb	r11,[r12,#-1]
+	 strb	r5,[r14,#-10]
+	 strb	r6,[r14,#-6]
+	eor	r4,r8,r4,lsr#8
+	 strb	r7,[r14,#-2]
+	eor	r5,r9,r5,lsr#8
+	 strb	r4,[r14,#-13]
+	eor	r6,r10,r6,lsr#8
+	 strb	r5,[r14,#-9]
+	eor	r7,r11,r7,lsr#8
+	 strb	r6,[r14,#-5]
+	 strb	r7,[r14,#-1]
+	add	r8,sp,#4*(4+4)
+	ldmia	r8,{r8-r11}		@ load key material
+	ldmia	r0,{r0-r7}		@ load second half
+#ifdef	__thumb2__
+	itt	hi
+#endif
+	strhi	r10,[sp,#4*(16+10)]		@ copy "rx"
+	strhi	r11,[sp,#4*(16+11)]		@ copy "rx"
+	add	r0,r0,r8		@ accumulate key material
+	add	r1,r1,r9
+	add	r2,r2,r10
+#ifdef	__thumb2__
+	itete	lo
+#endif
+	eorlo	r8,r8,r8		@ zero or ...
+	ldrhsb	r8,[r12],#16			@ ... load input
+	eorlo	r9,r9,r9
+	ldrhsb	r9,[r12,#-12]
+
+	add	r3,r3,r11
+#ifdef	__thumb2__
+	itete	lo
+#endif
+	eorlo	r10,r10,r10
+	ldrhsb	r10,[r12,#-8]
+	eorlo	r11,r11,r11
+	ldrhsb	r11,[r12,#-4]
+
+	eor	r0,r8,r0		@ xor with input (or zero)
+	eor	r1,r9,r1
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	ldrhsb	r8,[r12,#-15]		@ load more input
+	ldrhsb	r9,[r12,#-11]
+	eor	r2,r10,r2
+	 strb	r0,[r14],#16		@ store output
+	eor	r3,r11,r3
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	ldrhsb	r10,[r12,#-7]
+	ldrhsb	r11,[r12,#-3]
+	 strb	r1,[r14,#-12]
+	eor	r0,r8,r0,lsr#8
+	 strb	r2,[r14,#-8]
+	eor	r1,r9,r1,lsr#8
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	ldrhsb	r8,[r12,#-14]		@ load more input
+	ldrhsb	r9,[r12,#-10]
+	 strb	r3,[r14,#-4]
+	eor	r2,r10,r2,lsr#8
+	 strb	r0,[r14,#-15]
+	eor	r3,r11,r3,lsr#8
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	ldrhsb	r10,[r12,#-6]
+	ldrhsb	r11,[r12,#-2]
+	 strb	r1,[r14,#-11]
+	eor	r0,r8,r0,lsr#8
+	 strb	r2,[r14,#-7]
+	eor	r1,r9,r1,lsr#8
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	ldrhsb	r8,[r12,#-13]		@ load more input
+	ldrhsb	r9,[r12,#-9]
+	 strb	r3,[r14,#-3]
+	eor	r2,r10,r2,lsr#8
+	 strb	r0,[r14,#-14]
+	eor	r3,r11,r3,lsr#8
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	ldrhsb	r10,[r12,#-5]
+	ldrhsb	r11,[r12,#-1]
+	 strb	r1,[r14,#-10]
+	 strb	r2,[r14,#-6]
+	eor	r0,r8,r0,lsr#8
+	 strb	r3,[r14,#-2]
+	eor	r1,r9,r1,lsr#8
+	 strb	r0,[r14,#-13]
+	eor	r2,r10,r2,lsr#8
+	 strb	r1,[r14,#-9]
+	eor	r3,r11,r3,lsr#8
+	 strb	r2,[r14,#-5]
+	 strb	r3,[r14,#-1]
+	add	r8,sp,#4*(4+8)
+	ldmia	r8,{r8-r11}		@ load key material
+	add	r4,r4,r8		@ accumulate key material
+#ifdef	__thumb2__
+	itt	hi
+#endif
+	addhi	r8,r8,#1			@ next counter value
+	strhi	r8,[sp,#4*(12)]		@ save next counter value
+	add	r5,r5,r9
+	add	r6,r6,r10
+#ifdef	__thumb2__
+	itete	lo
+#endif
+	eorlo	r8,r8,r8		@ zero or ...
+	ldrhsb	r8,[r12],#16			@ ... load input
+	eorlo	r9,r9,r9
+	ldrhsb	r9,[r12,#-12]
+
+	add	r7,r7,r11
+#ifdef	__thumb2__
+	itete	lo
+#endif
+	eorlo	r10,r10,r10
+	ldrhsb	r10,[r12,#-8]
+	eorlo	r11,r11,r11
+	ldrhsb	r11,[r12,#-4]
+
+	eor	r4,r8,r4		@ xor with input (or zero)
+	eor	r5,r9,r5
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	ldrhsb	r8,[r12,#-15]		@ load more input
+	ldrhsb	r9,[r12,#-11]
+	eor	r6,r10,r6
+	 strb	r4,[r14],#16		@ store output
+	eor	r7,r11,r7
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	ldrhsb	r10,[r12,#-7]
+	ldrhsb	r11,[r12,#-3]
+	 strb	r5,[r14,#-12]
+	eor	r4,r8,r4,lsr#8
+	 strb	r6,[r14,#-8]
+	eor	r5,r9,r5,lsr#8
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	ldrhsb	r8,[r12,#-14]		@ load more input
+	ldrhsb	r9,[r12,#-10]
+	 strb	r7,[r14,#-4]
+	eor	r6,r10,r6,lsr#8
+	 strb	r4,[r14,#-15]
+	eor	r7,r11,r7,lsr#8
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	ldrhsb	r10,[r12,#-6]
+	ldrhsb	r11,[r12,#-2]
+	 strb	r5,[r14,#-11]
+	eor	r4,r8,r4,lsr#8
+	 strb	r6,[r14,#-7]
+	eor	r5,r9,r5,lsr#8
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	ldrhsb	r8,[r12,#-13]		@ load more input
+	ldrhsb	r9,[r12,#-9]
+	 strb	r7,[r14,#-3]
+	eor	r6,r10,r6,lsr#8
+	 strb	r4,[r14,#-14]
+	eor	r7,r11,r7,lsr#8
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	ldrhsb	r10,[r12,#-5]
+	ldrhsb	r11,[r12,#-1]
+	 strb	r5,[r14,#-10]
+	 strb	r6,[r14,#-6]
+	eor	r4,r8,r4,lsr#8
+	 strb	r7,[r14,#-2]
+	eor	r5,r9,r5,lsr#8
+	 strb	r4,[r14,#-13]
+	eor	r6,r10,r6,lsr#8
+	 strb	r5,[r14,#-9]
+	eor	r7,r11,r7,lsr#8
+	 strb	r6,[r14,#-5]
+	 strb	r7,[r14,#-1]
+#ifdef	__thumb2__
+	it	ne
+#endif
+	ldrne	r8,[sp,#4*(32+2)]		@ re-load len
+#ifdef	__thumb2__
+	it	hs
+#endif
+	subhs	r11,r8,#64			@ len-=64
+	bhi	.Loop_outer
+
+	beq	.Ldone
+#endif
+
+.Ltail:					@ final partial block: xor byte by byte against the block staged at sp
+	ldr	r12,[sp,#4*(32+1)]	@ load inp
+	add	r9,sp,#4*(0)		@ r9 walks the staged bytes at the bottom of the frame
+	ldr	r14,[sp,#4*(32+0)]	@ load out
+
+.Loop_tail:
+	ldrb	r10,[r9],#1	@ read buffer on stack
+	ldrb	r11,[r12],#1		@ read input
+	subs	r8,r8,#1		@ r8 = bytes left (len re-loaded before falling into .Ltail)
+	eor	r11,r11,r10
+	strb	r11,[r14],#1		@ store output
+	bne	.Loop_tail
+
+.Ldone:					@ NOTE(review): the work area (key copy, keystream) is released without being wiped
+	add	sp,sp,#4*(32+3)		@ drop the 32-word work area plus the saved r0-r2
+.Lno_data_arm:
+	ldmia	sp!,{r4-r11,pc}		@ restore r4-r11 and return by popping the saved lr into pc
+ENDPROC(chacha20_arm)
+
+#if __LINUX_ARM_ARCH__ >= 7 && IS_ENABLED(CONFIG_KERNEL_MODE_NEON)
+.align	5
+.Lsigma2:
+.long	0x61707865,0x3320646e,0x79622d32,0x6b206574	@ endian-neutral
+.Lone2:
+.long	1,0,0,0
+.word -1
+
+.arch	armv7-a
+.fpu	neon
+
+.align	5
+ENTRY(chacha20_neon)
+	ldr		r12,[sp,#0]		@ pull pointer to counter and nonce
+	stmdb		sp!,{r0-r2,r4-r11,lr}
+	cmp		r2,#0			@ len==0?
+#ifdef	__thumb2__
+	itt		eq
+#endif
+	addeq		sp,sp,#4*3
+	beq		.Lno_data_neon
+	cmp		r2,#192			@ test len
+	bls		.Lshort
+.Lchacha20_neon_begin:
+	adr		r14,.Lsigma2
+	vstmdb		sp!,{d8-d15}		@ ABI spec says so
+	stmdb		sp!,{r0-r3}
+
+	vld1.32		{q1-q2},[r3]		@ load key
+	ldmia		r3,{r4-r11}		@ load key
+
+	sub		sp,sp,#4*(16+16)
+	vld1.32		{q3},[r12]		@ load counter and nonce
+	add		r12,sp,#4*8
+	ldmia		r14,{r0-r3}		@ load sigma
+	vld1.32		{q0},[r14]!		@ load sigma
+	vld1.32		{q12},[r14]		@ one
+	vst1.32		{q2-q3},[r12]		@ copy 1/2key|counter|nonce
+	vst1.32		{q0-q1},[sp]		@ copy sigma|1/2key
+
+	str		r10,[sp,#4*(16+10)]	@ off-load "rx"
+	str		r11,[sp,#4*(16+11)]	@ off-load "rx"
+	vshl.i32	d26,d24,#1	@ two
+	vstr		d24,[sp,#4*(16+0)]
+	vshl.i32	d28,d24,#2	@ four
+	vstr		d26,[sp,#4*(16+2)]
+	vmov		q4,q0
+	vstr		d28,[sp,#4*(16+4)]
+	vmov		q8,q0
+	vmov		q5,q1
+	vmov		q9,q1
+	b		.Loop_neon_enter
+
+.align	4
+.Loop_neon_outer:
+	ldmia		sp,{r0-r9}		@ load key material
+	cmp		r11,#64*2		@ if len<=64*2
+	bls		.Lbreak_neon		@ switch to integer-only
+	vmov		q4,q0
+	str		r11,[sp,#4*(32+2)]	@ save len
+	vmov		q8,q0
+	str		r12,  [sp,#4*(32+1)]	@ save inp
+	vmov		q5,q1
+	str		r14,  [sp,#4*(32+0)]	@ save out
+	vmov		q9,q1
+.Loop_neon_enter:
+	ldr		r11, [sp,#4*(15)]
+	vadd.i32	q7,q3,q12		@ counter+1
+	ldr		r12,[sp,#4*(12)]	@ modulo-scheduled load
+	vmov		q6,q2
+	ldr		r10, [sp,#4*(13)]
+	vmov		q10,q2
+	ldr		r14,[sp,#4*(14)]
+	vadd.i32	q11,q7,q12		@ counter+2
+	str		r11, [sp,#4*(16+15)]
+	mov		r11,#10
+	add		r12,r12,#3	@ counter+3
+	b		.Loop_neon
+
+.align	4
+.Loop_neon:
+	subs		r11,r11,#1
+	vadd.i32	q0,q0,q1
+	add	r0,r0,r4
+	vadd.i32	q4,q4,q5
+	mov	r12,r12,ror#16
+	vadd.i32	q8,q8,q9
+	add	r1,r1,r5
+	veor	q3,q3,q0
+	mov	r10,r10,ror#16
+	veor	q7,q7,q4
+	eor	r12,r12,r0,ror#16
+	veor	q11,q11,q8
+	eor	r10,r10,r1,ror#16
+	vrev32.16	q3,q3
+	add	r8,r8,r12
+	vrev32.16	q7,q7
+	mov	r4,r4,ror#20
+	vrev32.16	q11,q11
+	add	r9,r9,r10
+	vadd.i32	q2,q2,q3
+	mov	r5,r5,ror#20
+	vadd.i32	q6,q6,q7
+	eor	r4,r4,r8,ror#20
+	vadd.i32	q10,q10,q11
+	eor	r5,r5,r9,ror#20
+	veor	q12,q1,q2
+	add	r0,r0,r4
+	veor	q13,q5,q6
+	mov	r12,r12,ror#24
+	veor	q14,q9,q10
+	add	r1,r1,r5
+	vshr.u32	q1,q12,#20
+	mov	r10,r10,ror#24
+	vshr.u32	q5,q13,#20
+	eor	r12,r12,r0,ror#24
+	vshr.u32	q9,q14,#20
+	eor	r10,r10,r1,ror#24
+	vsli.32	q1,q12,#12
+	add	r8,r8,r12
+	vsli.32	q5,q13,#12
+	mov	r4,r4,ror#25
+	vsli.32	q9,q14,#12
+	add	r9,r9,r10
+	vadd.i32	q0,q0,q1
+	mov	r5,r5,ror#25
+	vadd.i32	q4,q4,q5
+	str	r10,[sp,#4*(16+13)]
+	vadd.i32	q8,q8,q9
+	ldr	r10,[sp,#4*(16+15)]
+	veor	q12,q3,q0
+	eor	r4,r4,r8,ror#25
+	veor	q13,q7,q4
+	eor	r5,r5,r9,ror#25
+	veor	q14,q11,q8
+	str	r8,[sp,#4*(16+8)]
+	vshr.u32	q3,q12,#24
+	ldr	r8,[sp,#4*(16+10)]
+	vshr.u32	q7,q13,#24
+	add	r2,r2,r6
+	vshr.u32	q11,q14,#24
+	mov	r14,r14,ror#16
+	vsli.32	q3,q12,#8
+	str	r9,[sp,#4*(16+9)]
+	vsli.32	q7,q13,#8
+	ldr	r9,[sp,#4*(16+11)]
+	vsli.32	q11,q14,#8
+	add	r3,r3,r7
+	vadd.i32	q2,q2,q3
+	mov	r10,r10,ror#16
+	vadd.i32	q6,q6,q7
+	eor	r14,r14,r2,ror#16
+	vadd.i32	q10,q10,q11
+	eor	r10,r10,r3,ror#16
+	veor	q12,q1,q2
+	add	r8,r8,r14
+	veor	q13,q5,q6
+	mov	r6,r6,ror#20
+	veor	q14,q9,q10
+	add	r9,r9,r10
+	vshr.u32	q1,q12,#25
+	mov	r7,r7,ror#20
+	vshr.u32	q5,q13,#25
+	eor	r6,r6,r8,ror#20
+	vshr.u32	q9,q14,#25
+	eor	r7,r7,r9,ror#20
+	vsli.32	q1,q12,#7
+	add	r2,r2,r6
+	vsli.32	q5,q13,#7
+	mov	r14,r14,ror#24
+	vsli.32	q9,q14,#7
+	add	r3,r3,r7
+	vext.8	q2,q2,q2,#8
+	mov	r10,r10,ror#24
+	vext.8	q6,q6,q6,#8
+	eor	r14,r14,r2,ror#24
+	vext.8	q10,q10,q10,#8
+	eor	r10,r10,r3,ror#24
+	vext.8	q1,q1,q1,#4
+	add	r8,r8,r14
+	vext.8	q5,q5,q5,#4
+	mov	r6,r6,ror#25
+	vext.8	q9,q9,q9,#4
+	add	r9,r9,r10
+	vext.8	q3,q3,q3,#12
+	mov	r7,r7,ror#25
+	vext.8	q7,q7,q7,#12
+	eor	r6,r6,r8,ror#25
+	vext.8	q11,q11,q11,#12
+	eor	r7,r7,r9,ror#25
+	vadd.i32	q0,q0,q1
+	add	r0,r0,r5
+	vadd.i32	q4,q4,q5
+	mov	r10,r10,ror#16
+	vadd.i32	q8,q8,q9
+	add	r1,r1,r6
+	veor	q3,q3,q0
+	mov	r12,r12,ror#16
+	veor	q7,q7,q4
+	eor	r10,r10,r0,ror#16
+	veor	q11,q11,q8
+	eor	r12,r12,r1,ror#16
+	vrev32.16	q3,q3
+	add	r8,r8,r10
+	vrev32.16	q7,q7
+	mov	r5,r5,ror#20
+	vrev32.16	q11,q11
+	add	r9,r9,r12
+	vadd.i32	q2,q2,q3
+	mov	r6,r6,ror#20
+	vadd.i32	q6,q6,q7
+	eor	r5,r5,r8,ror#20
+	vadd.i32	q10,q10,q11
+	eor	r6,r6,r9,ror#20
+	veor	q12,q1,q2
+	add	r0,r0,r5
+	veor	q13,q5,q6
+	mov	r10,r10,ror#24
+	veor	q14,q9,q10
+	add	r1,r1,r6
+	vshr.u32	q1,q12,#20
+	mov	r12,r12,ror#24
+	vshr.u32	q5,q13,#20
+	eor	r10,r10,r0,ror#24
+	vshr.u32	q9,q14,#20
+	eor	r12,r12,r1,ror#24
+	vsli.32	q1,q12,#12
+	add	r8,r8,r10
+	vsli.32	q5,q13,#12
+	mov	r5,r5,ror#25
+	vsli.32	q9,q14,#12
+	str	r10,[sp,#4*(16+15)]
+	vadd.i32	q0,q0,q1
+	ldr	r10,[sp,#4*(16+13)]
+	vadd.i32	q4,q4,q5
+	add	r9,r9,r12
+	vadd.i32	q8,q8,q9
+	mov	r6,r6,ror#25
+	veor	q12,q3,q0
+	eor	r5,r5,r8,ror#25
+	veor	q13,q7,q4
+	eor	r6,r6,r9,ror#25
+	veor	q14,q11,q8
+	str	r8,[sp,#4*(16+10)]
+	vshr.u32	q3,q12,#24
+	ldr	r8,[sp,#4*(16+8)]
+	vshr.u32	q7,q13,#24
+	add	r2,r2,r7
+	vshr.u32	q11,q14,#24
+	mov	r10,r10,ror#16
+	vsli.32	q3,q12,#8
+	str	r9,[sp,#4*(16+11)]
+	vsli.32	q7,q13,#8
+	ldr	r9,[sp,#4*(16+9)]
+	vsli.32	q11,q14,#8
+	add	r3,r3,r4
+	vadd.i32	q2,q2,q3
+	mov	r14,r14,ror#16
+	vadd.i32	q6,q6,q7
+	eor	r10,r10,r2,ror#16
+	vadd.i32	q10,q10,q11
+	eor	r14,r14,r3,ror#16
+	veor	q12,q1,q2
+	add	r8,r8,r10
+	veor	q13,q5,q6
+	mov	r7,r7,ror#20
+	veor	q14,q9,q10
+	add	r9,r9,r14
+	vshr.u32	q1,q12,#25
+	mov	r4,r4,ror#20
+	vshr.u32	q5,q13,#25
+	eor	r7,r7,r8,ror#20
+	vshr.u32	q9,q14,#25
+	eor	r4,r4,r9,ror#20
+	vsli.32	q1,q12,#7
+	add	r2,r2,r7
+	vsli.32	q5,q13,#7
+	mov	r10,r10,ror#24
+	vsli.32	q9,q14,#7
+	add	r3,r3,r4
+	vext.8	q2,q2,q2,#8
+	mov	r14,r14,ror#24
+	vext.8	q6,q6,q6,#8
+	eor	r10,r10,r2,ror#24
+	vext.8	q10,q10,q10,#8
+	eor	r14,r14,r3,ror#24
+	vext.8	q1,q1,q1,#12
+	add	r8,r8,r10
+	vext.8	q5,q5,q5,#12
+	mov	r7,r7,ror#25
+	vext.8	q9,q9,q9,#12
+	add	r9,r9,r14
+	vext.8	q3,q3,q3,#4
+	mov	r4,r4,ror#25
+	vext.8	q7,q7,q7,#4
+	eor	r7,r7,r8,ror#25
+	vext.8	q11,q11,q11,#4
+	eor	r4,r4,r9,ror#25
+	bne		.Loop_neon
+
+	add		r11,sp,#32
+	vld1.32		{q12-q13},[sp]		@ load key material
+	vld1.32		{q14-q15},[r11]
+
+	ldr		r11,[sp,#4*(32+2)]	@ load len
+
+	str		r8, [sp,#4*(16+8)]	@ modulo-scheduled store
+	str		r9, [sp,#4*(16+9)]
+	str		r12,[sp,#4*(16+12)]
+	str		r10, [sp,#4*(16+13)]
+	str		r14,[sp,#4*(16+14)]
+
+	@ at this point we have first half of 512-bit result in
+	@ rx and second half at sp+4*(16+8)
+
+	ldr		r12,[sp,#4*(32+1)]	@ load inp
+	ldr		r14,[sp,#4*(32+0)]	@ load out
+
+	vadd.i32	q0,q0,q12		@ accumulate key material
+	vadd.i32	q4,q4,q12
+	vadd.i32	q8,q8,q12
+	vldr		d24,[sp,#4*(16+0)]	@ one
+
+	vadd.i32	q1,q1,q13
+	vadd.i32	q5,q5,q13
+	vadd.i32	q9,q9,q13
+	vldr		d26,[sp,#4*(16+2)]	@ two
+
+	vadd.i32	q2,q2,q14
+	vadd.i32	q6,q6,q14
+	vadd.i32	q10,q10,q14
+	vadd.i32	d14,d14,d24	@ counter+1
+	vadd.i32	d22,d22,d26	@ counter+2
+
+	vadd.i32	q3,q3,q15
+	vadd.i32	q7,q7,q15
+	vadd.i32	q11,q11,q15
+
+	cmp		r11,#64*4
+	blo		.Ltail_neon
+
+	vld1.8		{q12-q13},[r12]!	@ load input
+	 mov		r11,sp
+	vld1.8		{q14-q15},[r12]!
+	veor		q0,q0,q12		@ xor with input
+	veor		q1,q1,q13
+	vld1.8		{q12-q13},[r12]!
+	veor		q2,q2,q14
+	veor		q3,q3,q15
+	vld1.8		{q14-q15},[r12]!
+
+	veor		q4,q4,q12
+	 vst1.8		{q0-q1},[r14]!	@ store output
+	veor		q5,q5,q13
+	vld1.8		{q12-q13},[r12]!
+	veor		q6,q6,q14
+	 vst1.8		{q2-q3},[r14]!
+	veor		q7,q7,q15
+	vld1.8		{q14-q15},[r12]!
+
+	veor		q8,q8,q12
+	 vld1.32	{q0-q1},[r11]!	@ load for next iteration
+	 veor		d25,d25,d25
+	 vldr		d24,[sp,#4*(16+4)]	@ four
+	veor		q9,q9,q13
+	 vld1.32	{q2-q3},[r11]
+	veor		q10,q10,q14
+	 vst1.8		{q4-q5},[r14]!
+	veor		q11,q11,q15
+	 vst1.8		{q6-q7},[r14]!
+
+	vadd.i32	d6,d6,d24	@ next counter value
+	vldr		d24,[sp,#4*(16+0)]	@ one
+
+	ldmia		sp,{r8-r11}	@ load key material
+	add		r0,r0,r8	@ accumulate key material
+	ldr		r8,[r12],#16		@ load input
+	 vst1.8		{q8-q9},[r14]!
+	add		r1,r1,r9
+	ldr		r9,[r12,#-12]
+	 vst1.8		{q10-q11},[r14]!
+	add		r2,r2,r10
+	ldr		r10,[r12,#-8]
+	add		r3,r3,r11
+	ldr		r11,[r12,#-4]
+#ifdef	__ARMEB__
+	rev		r0,r0
+	rev		r1,r1
+	rev		r2,r2
+	rev		r3,r3
+#endif
+	eor		r0,r0,r8	@ xor with input
+	 add		r8,sp,#4*(4)
+	eor		r1,r1,r9
+	str		r0,[r14],#16		@ store output
+	eor		r2,r2,r10
+	str		r1,[r14,#-12]
+	eor		r3,r3,r11
+	 ldmia		r8,{r8-r11}	@ load key material
+	str		r2,[r14,#-8]
+	str		r3,[r14,#-4]
+
+	add		r4,r4,r8	@ accumulate key material
+	ldr		r8,[r12],#16		@ load input
+	add		r5,r5,r9
+	ldr		r9,[r12,#-12]
+	add		r6,r6,r10
+	ldr		r10,[r12,#-8]
+	add		r7,r7,r11
+	ldr		r11,[r12,#-4]
+#ifdef	__ARMEB__
+	rev		r4,r4
+	rev		r5,r5
+	rev		r6,r6
+	rev		r7,r7
+#endif
+	eor		r4,r4,r8
+	 add		r8,sp,#4*(8)
+	eor		r5,r5,r9
+	str		r4,[r14],#16		@ store output
+	eor		r6,r6,r10
+	str		r5,[r14,#-12]
+	eor		r7,r7,r11
+	 ldmia		r8,{r8-r11}	@ load key material
+	str		r6,[r14,#-8]
+	 add		r0,sp,#4*(16+8)
+	str		r7,[r14,#-4]
+
+	ldmia		r0,{r0-r7}	@ load second half
+
+	add		r0,r0,r8	@ accumulate key material
+	ldr		r8,[r12],#16		@ load input
+	add		r1,r1,r9
+	ldr		r9,[r12,#-12]
+#ifdef	__thumb2__
+	it	hi
+#endif
+	 strhi		r10,[sp,#4*(16+10)]	@ copy "rx" while at it
+	add		r2,r2,r10
+	ldr		r10,[r12,#-8]
+#ifdef	__thumb2__
+	it	hi
+#endif
+	 strhi		r11,[sp,#4*(16+11)]	@ copy "rx" while at it
+	add		r3,r3,r11
+	ldr		r11,[r12,#-4]
+#ifdef	__ARMEB__
+	rev		r0,r0
+	rev		r1,r1
+	rev		r2,r2
+	rev		r3,r3
+#endif
+	eor		r0,r0,r8
+	 add		r8,sp,#4*(12)
+	eor		r1,r1,r9
+	str		r0,[r14],#16		@ store output
+	eor		r2,r2,r10
+	str		r1,[r14,#-12]
+	eor		r3,r3,r11
+	 ldmia		r8,{r8-r11}	@ load key material
+	str		r2,[r14,#-8]
+	str		r3,[r14,#-4]
+
+	add		r4,r4,r8	@ accumulate key material
+	 add		r8,r8,#4		@ next counter value
+	add		r5,r5,r9
+	 str		r8,[sp,#4*(12)]	@ save next counter value
+	ldr		r8,[r12],#16		@ load input
+	add		r6,r6,r10
+	 add		r4,r4,#3		@ counter+3
+	ldr		r9,[r12,#-12]
+	add		r7,r7,r11
+	ldr		r10,[r12,#-8]
+	ldr		r11,[r12,#-4]
+#ifdef	__ARMEB__
+	rev		r4,r4
+	rev		r5,r5
+	rev		r6,r6
+	rev		r7,r7
+#endif
+	eor		r4,r4,r8
+#ifdef	__thumb2__
+	it	hi
+#endif
+	 ldrhi		r8,[sp,#4*(32+2)]	@ re-load len
+	eor		r5,r5,r9
+	eor		r6,r6,r10
+	str		r4,[r14],#16		@ store output
+	eor		r7,r7,r11
+	str		r5,[r14,#-12]
+	 sub		r11,r8,#64*4	@ len-=64*4
+	str		r6,[r14,#-8]
+	str		r7,[r14,#-4]
+	bhi		.Loop_neon_outer
+
+	b		.Ldone_neon
+
+.align	4
+.Lbreak_neon:
+	@ harmonize NEON and integer-only stack frames: load data
+	@ from NEON frame, but save to integer-only one; distance
+	@ between the two is 4*(32+4+16-32)=4*(20).
+
+	str		r11, [sp,#4*(20+32+2)]	@ save len
+	 add		r11,sp,#4*(32+4)
+	str		r12,   [sp,#4*(20+32+1)]	@ save inp
+	str		r14,   [sp,#4*(20+32+0)]	@ save out
+
+	ldr		r12,[sp,#4*(16+10)]
+	ldr		r14,[sp,#4*(16+11)]
+	 vldmia		r11,{d8-d15}			@ fulfill ABI requirement
+	str		r12,[sp,#4*(20+16+10)]	@ copy "rx"
+	str		r14,[sp,#4*(20+16+11)]	@ copy "rx"
+
+	ldr		r11, [sp,#4*(15)]
+	ldr		r12,[sp,#4*(12)]		@ modulo-scheduled load
+	ldr		r10, [sp,#4*(13)]
+	ldr		r14,[sp,#4*(14)]
+	str		r11, [sp,#4*(20+16+15)]
+	add		r11,sp,#4*(20)
+	vst1.32		{q0-q1},[r11]!		@ copy key
+	add		sp,sp,#4*(20)			@ switch frame
+	vst1.32		{q2-q3},[r11]
+	mov		r11,#10
+	b		.Loop				@ go integer-only
+
+.align	4
+.Ltail_neon:
+	cmp		r11,#64*3
+	bhs		.L192_or_more_neon
+	cmp		r11,#64*2
+	bhs		.L128_or_more_neon
+	cmp		r11,#64*1
+	bhs		.L64_or_more_neon
+
+	add		r8,sp,#4*(8)
+	vst1.8		{q0-q1},[sp]
+	add		r10,sp,#4*(0)
+	vst1.8		{q2-q3},[r8]
+	b		.Loop_tail_neon
+
+.align	4
+.L64_or_more_neon:
+	vld1.8		{q12-q13},[r12]!
+	vld1.8		{q14-q15},[r12]!
+	veor		q0,q0,q12
+	veor		q1,q1,q13
+	veor		q2,q2,q14
+	veor		q3,q3,q15
+	vst1.8		{q0-q1},[r14]!
+	vst1.8		{q2-q3},[r14]!
+
+	beq		.Ldone_neon
+
+	add		r8,sp,#4*(8)
+	vst1.8		{q4-q5},[sp]
+	add		r10,sp,#4*(0)
+	vst1.8		{q6-q7},[r8]
+	sub		r11,r11,#64*1	@ len-=64*1
+	b		.Loop_tail_neon
+
+.align	4
+.L128_or_more_neon:
+	vld1.8		{q12-q13},[r12]!
+	vld1.8		{q14-q15},[r12]!
+	veor		q0,q0,q12
+	veor		q1,q1,q13
+	vld1.8		{q12-q13},[r12]!
+	veor		q2,q2,q14
+	veor		q3,q3,q15
+	vld1.8		{q14-q15},[r12]!
+
+	veor		q4,q4,q12
+	veor		q5,q5,q13
+	 vst1.8		{q0-q1},[r14]!
+	veor		q6,q6,q14
+	 vst1.8		{q2-q3},[r14]!
+	veor		q7,q7,q15
+	vst1.8		{q4-q5},[r14]!
+	vst1.8		{q6-q7},[r14]!
+
+	beq		.Ldone_neon
+
+	add		r8,sp,#4*(8)
+	vst1.8		{q8-q9},[sp]
+	add		r10,sp,#4*(0)
+	vst1.8		{q10-q11},[r8]
+	sub		r11,r11,#64*2	@ len-=64*2
+	b		.Loop_tail_neon
+
+.align	4
+.L192_or_more_neon:
+	vld1.8		{q12-q13},[r12]!
+	vld1.8		{q14-q15},[r12]!
+	veor		q0,q0,q12
+	veor		q1,q1,q13
+	vld1.8		{q12-q13},[r12]!
+	veor		q2,q2,q14
+	veor		q3,q3,q15
+	vld1.8		{q14-q15},[r12]!
+
+	veor		q4,q4,q12
+	veor		q5,q5,q13
+	vld1.8		{q12-q13},[r12]!
+	veor		q6,q6,q14
+	 vst1.8		{q0-q1},[r14]!
+	veor		q7,q7,q15
+	vld1.8		{q14-q15},[r12]!
+
+	veor		q8,q8,q12
+	 vst1.8		{q2-q3},[r14]!
+	veor		q9,q9,q13
+	 vst1.8		{q4-q5},[r14]!
+	veor		q10,q10,q14
+	 vst1.8		{q6-q7},[r14]!
+	veor		q11,q11,q15
+	vst1.8		{q8-q9},[r14]!
+	vst1.8		{q10-q11},[r14]!
+
+	beq		.Ldone_neon
+
+	ldmia		sp,{r8-r11}	@ load key material
+	add		r0,r0,r8	@ accumulate key material
+	 add		r8,sp,#4*(4)
+	add		r1,r1,r9
+	add		r2,r2,r10
+	add		r3,r3,r11
+	 ldmia		r8,{r8-r11}	@ load key material
+
+	add		r4,r4,r8	@ accumulate key material
+	 add		r8,sp,#4*(8)
+	add		r5,r5,r9
+	add		r6,r6,r10
+	add		r7,r7,r11
+	 ldmia		r8,{r8-r11}	@ load key material
+#ifdef	__ARMEB__
+	rev		r0,r0
+	rev		r1,r1
+	rev		r2,r2
+	rev		r3,r3
+	rev		r4,r4
+	rev		r5,r5
+	rev		r6,r6
+	rev		r7,r7
+#endif
+	stmia		sp,{r0-r7}
+	 add		r0,sp,#4*(16+8)
+
+	ldmia		r0,{r0-r7}	@ load second half
+
+	add		r0,r0,r8	@ accumulate key material
+	 add		r8,sp,#4*(12)
+	add		r1,r1,r9
+	add		r2,r2,r10
+	add		r3,r3,r11
+	 ldmia		r8,{r8-r11}	@ load key material
+
+	add		r4,r4,r8	@ accumulate key material
+	 add		r8,sp,#4*(8)
+	add		r5,r5,r9
+	 add		r4,r4,#3		@ counter+3
+	add		r6,r6,r10
+	add		r7,r7,r11
+	 ldr		r11,[sp,#4*(32+2)]	@ re-load len
+#ifdef	__ARMEB__
+	rev		r0,r0
+	rev		r1,r1
+	rev		r2,r2
+	rev		r3,r3
+	rev		r4,r4
+	rev		r5,r5
+	rev		r6,r6
+	rev		r7,r7
+#endif
+	stmia		r8,{r0-r7}
+	 add		r10,sp,#4*(0)
+	 sub		r11,r11,#64*3	@ len-=64*3
+
+.Loop_tail_neon:
+	ldrb		r8,[r10],#1	@ read buffer on stack
+	ldrb		r9,[r12],#1		@ read input
+	subs		r11,r11,#1
+	eor		r8,r8,r9
+	strb		r8,[r14],#1		@ store output
+	bne		.Loop_tail_neon
+
+.Ldone_neon:
+	add		sp,sp,#4*(32+4)
+	vldmia		sp,{d8-d15}
+	add		sp,sp,#4*(16+3)
+.Lno_data_neon:
+	ldmia		sp!,{r4-r11,pc}
+ENDPROC(chacha20_neon)
+#endif
diff --git a/lib/zinc/chacha20/chacha20-arm64.S b/lib/zinc/chacha20/chacha20-arm64.S
new file mode 100644
index 000000000000..f90162c32a33
--- /dev/null
+++ b/lib/zinc/chacha20/chacha20-arm64.S
@@ -0,0 +1,1942 @@
+/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ * Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
+ *
+ * This is based in part on Andy Polyakov's implementation from CRYPTOGAMS.
+ */
+
+#include <linux/linkage.h>
+
+.text
+.align	5
+.Lsigma:				// constant row of the state: words 0x61707865,0x3320646e,0x79622d32,0x6b206574
+.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral; two 32-bit words per quad, low half is the first word
+.Lone:					// vector {1,0,0,0}; chacha20_neon loads it right after .Lsigma to step the block counter
+.long	1,0,0,0
+.align	5
+ENTRY(chacha20_arm)		// scalar ChaCha20 xor: x0 = out, x1 = inp, x2 = len in bytes, x3 = key, x4 = counter
+	cbz	x2,.Labort		// zero length: return before touching the stack
+.Lshort:				// chacha20_neon also branches here when len < 192
+	stp	x29,x30,[sp,#-96]!	// push frame record and reserve slots for x19-x28
+	add	x29,sp,#0		// x29 = frame base, used by both epilogues below
+
+	adr	x5,.Lsigma
+	stp	x19,x20,[sp,#16]	// save the callee-saved registers used as state
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+	sub	sp,sp,#64		// 64-byte scratch, holds one keystream block for a partial tail
+
+	ldp	x22,x23,[x5]		// load sigma
+	ldp	x24,x25,[x3]		// load key
+	ldp	x26,x27,[x3,#16]
+	ldp	x28,x30,[x4]		// load counter (x30 is reused as data; it was saved above and is restored on exit)
+#ifdef	__ARMEB__
+	ror	x24,x24,#32		// big-endian only: swap the two 32-bit halves of each pair
+	ror	x25,x25,#32
+	ror	x26,x26,#32
+	ror	x27,x27,#32
+	ror	x28,x28,#32
+	ror	x30,x30,#32
+#endif
+
+.Loop_outer:				// one pass per 64-byte block; x22-x28,x30 keep the input state
+	mov	w5,w22			// unpack key block
+	lsr	x6,x22,#32		// working state is w5-w17,w19-w21 (x18 is not used); low half via mov, high half via lsr
+	mov	w7,w23
+	lsr	x8,x23,#32
+	mov	w9,w24
+	lsr	x10,x24,#32
+	mov	w11,w25
+	lsr	x12,x25,#32
+	mov	w13,w26
+	lsr	x14,x26,#32
+	mov	w15,w27
+	lsr	x16,x27,#32
+	mov	w17,w28
+	lsr	x19,x28,#32
+	mov	w20,w30
+	lsr	x21,x30,#32
+
+	mov	x4,#10			// 10 iterations of a double round
+	subs	x2,x2,#64		// len -= 64; these flags are consumed by b.lo/b.hi after the loop (nothing in between sets flags)
+.Loop:					// one iteration = column round followed by diagonal round
+	sub	x4,x4,#1
+	add	w5,w5,w9		// column round: a += b
+	add	w6,w6,w10
+	add	w7,w7,w11
+	add	w8,w8,w12
+	eor	w17,w17,w5		// d ^= a
+	eor	w19,w19,w6
+	eor	w20,w20,w7
+	eor	w21,w21,w8
+	ror	w17,w17,#16		// d = rotate left by 16
+	ror	w19,w19,#16
+	ror	w20,w20,#16
+	ror	w21,w21,#16
+	add	w13,w13,w17		// c += d
+	add	w14,w14,w19
+	add	w15,w15,w20
+	add	w16,w16,w21
+	eor	w9,w9,w13		// b ^= c
+	eor	w10,w10,w14
+	eor	w11,w11,w15
+	eor	w12,w12,w16
+	ror	w9,w9,#20		// b = rotate left by 12 (ror 20)
+	ror	w10,w10,#20
+	ror	w11,w11,#20
+	ror	w12,w12,#20
+	add	w5,w5,w9		// a += b
+	add	w6,w6,w10
+	add	w7,w7,w11
+	add	w8,w8,w12
+	eor	w17,w17,w5		// d ^= a
+	eor	w19,w19,w6
+	eor	w20,w20,w7
+	eor	w21,w21,w8
+	ror	w17,w17,#24		// d = rotate left by 8 (ror 24)
+	ror	w19,w19,#24
+	ror	w20,w20,#24
+	ror	w21,w21,#24
+	add	w13,w13,w17		// c += d
+	add	w14,w14,w19
+	add	w15,w15,w20
+	add	w16,w16,w21
+	eor	w9,w9,w13		// b ^= c
+	eor	w10,w10,w14
+	eor	w11,w11,w15
+	eor	w12,w12,w16
+	ror	w9,w9,#25		// b = rotate left by 7 (ror 25)
+	ror	w10,w10,#25
+	ror	w11,w11,#25
+	ror	w12,w12,#25
+	add	w5,w5,w10		// diagonal round: same steps, operands paired as (w5,w10,w15,w21) and so on
+	add	w6,w6,w11
+	add	w7,w7,w12
+	add	w8,w8,w9
+	eor	w21,w21,w5		// d ^= a
+	eor	w17,w17,w6
+	eor	w19,w19,w7
+	eor	w20,w20,w8
+	ror	w21,w21,#16		// d = rotate left by 16
+	ror	w17,w17,#16
+	ror	w19,w19,#16
+	ror	w20,w20,#16
+	add	w15,w15,w21		// c += d
+	add	w16,w16,w17
+	add	w13,w13,w19
+	add	w14,w14,w20
+	eor	w10,w10,w15		// b ^= c
+	eor	w11,w11,w16
+	eor	w12,w12,w13
+	eor	w9,w9,w14
+	ror	w10,w10,#20		// b = rotate left by 12
+	ror	w11,w11,#20
+	ror	w12,w12,#20
+	ror	w9,w9,#20
+	add	w5,w5,w10		// a += b
+	add	w6,w6,w11
+	add	w7,w7,w12
+	add	w8,w8,w9
+	eor	w21,w21,w5		// d ^= a
+	eor	w17,w17,w6
+	eor	w19,w19,w7
+	eor	w20,w20,w8
+	ror	w21,w21,#24		// d = rotate left by 8
+	ror	w17,w17,#24
+	ror	w19,w19,#24
+	ror	w20,w20,#24
+	add	w15,w15,w21		// c += d
+	add	w16,w16,w17
+	add	w13,w13,w19
+	add	w14,w14,w20
+	eor	w10,w10,w15		// b ^= c
+	eor	w11,w11,w16
+	eor	w12,w12,w13
+	eor	w9,w9,w14
+	ror	w10,w10,#25		// b = rotate left by 7
+	ror	w11,w11,#25
+	ror	w12,w12,#25
+	ror	w9,w9,#25
+	cbnz	x4,.Loop		// cbnz does not touch the flags set by subs above
+
+	add	w5,w5,w22		// accumulate key block
+	add	x6,x6,x22,lsr#32	// 64-bit add for the odd words; any bits above 31 are shifted out by the lsl#32 when packing
+	add	w7,w7,w23
+	add	x8,x8,x23,lsr#32
+	add	w9,w9,w24
+	add	x10,x10,x24,lsr#32
+	add	w11,w11,w25
+	add	x12,x12,x25,lsr#32
+	add	w13,w13,w26
+	add	x14,x14,x26,lsr#32
+	add	w15,w15,w27
+	add	x16,x16,x27,lsr#32
+	add	w17,w17,w28
+	add	x19,x19,x28,lsr#32
+	add	w20,w20,w30
+	add	x21,x21,x30,lsr#32
+
+	b.lo	.Ltail			// flags from subs: fewer than 64 bytes were left, handle them bytewise
+
+	add	x5,x5,x6,lsl#32	// pack
+	add	x7,x7,x8,lsl#32
+	ldp	x6,x8,[x1,#0]		// load input
+	add	x9,x9,x10,lsl#32
+	add	x11,x11,x12,lsl#32
+	ldp	x10,x12,[x1,#16]
+	add	x13,x13,x14,lsl#32
+	add	x15,x15,x16,lsl#32
+	ldp	x14,x16,[x1,#32]
+	add	x17,x17,x19,lsl#32
+	add	x20,x20,x21,lsl#32
+	ldp	x19,x21,[x1,#48]
+	add	x1,x1,#64		// inp += 64
+#ifdef	__ARMEB__
+	rev	x5,x5			// big-endian only: byte-reverse the packed keystream before the xor
+	rev	x7,x7
+	rev	x9,x9
+	rev	x11,x11
+	rev	x13,x13
+	rev	x15,x15
+	rev	x17,x17
+	rev	x20,x20
+#endif
+	eor	x5,x5,x6		// keystream ^= input
+	eor	x7,x7,x8
+	eor	x9,x9,x10
+	eor	x11,x11,x12
+	eor	x13,x13,x14
+	eor	x15,x15,x16
+	eor	x17,x17,x19
+	eor	x20,x20,x21
+
+	stp	x5,x7,[x0,#0]		// store output
+	add	x28,x28,#1			// increment counter (64-bit add, so a carry reaches the upper word of x28)
+	stp	x9,x11,[x0,#16]
+	stp	x13,x15,[x0,#32]
+	stp	x17,x20,[x0,#48]
+	add	x0,x0,#64		// out += 64
+
+	b.hi	.Loop_outer		// flags from subs: more input remains
+
+	ldp	x19,x20,[x29,#16]	// input fully consumed: restore callee-saved registers
+	add	sp,sp,#64		// drop the scratch area (not written on this path)
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldp	x29,x30,[sp],#96	// restore frame pointer and the real return address, pop the frame
+.Labort:
+	ret
+
+.align	4
+.Ltail:
+	add	x2,x2,#64		// undo the subs: x2 = number of remaining bytes (below 64)
+.Less_than_64:				// also entered from .Ltail_neon with x2 already below 64
+	sub	x0,x0,#1		// bias out by -1: the store in the loop indexes after x2 is incremented
+	add	x1,x1,x2		// inp -> one past the last input byte
+	add	x0,x0,x2
+	add	x4,sp,x2		// x4 -> one past the last keystream byte needed from the scratch
+	neg	x2,x2			// x2 = negative index that counts up to zero
+
+	add	x5,x5,x6,lsl#32	// pack
+	add	x7,x7,x8,lsl#32
+	add	x9,x9,x10,lsl#32
+	add	x11,x11,x12,lsl#32
+	add	x13,x13,x14,lsl#32
+	add	x15,x15,x16,lsl#32
+	add	x17,x17,x19,lsl#32
+	add	x20,x20,x21,lsl#32
+#ifdef	__ARMEB__
+	rev	x5,x5			// big-endian only: byte-reverse before spilling
+	rev	x7,x7
+	rev	x9,x9
+	rev	x11,x11
+	rev	x13,x13
+	rev	x15,x15
+	rev	x17,x17
+	rev	x20,x20
+#endif
+	stp	x5,x7,[sp,#0]		// spill the whole keystream block to the stack scratch
+	stp	x9,x11,[sp,#16]
+	stp	x13,x15,[sp,#32]
+	stp	x17,x20,[sp,#48]
+
+.Loop_tail:				// xor the remaining bytes one at a time
+	ldrb	w10,[x1,x2]		// input byte
+	ldrb	w11,[x4,x2]		// keystream byte
+	add	x2,x2,#1
+	eor	w10,w10,w11
+	strb	w10,[x0,x2]		// x0 carries the -1 bias, so this hits the matching output byte
+	cbnz	x2,.Loop_tail
+
+	stp	xzr,xzr,[sp,#0]		// overwrite the spilled keystream with zeros
+	stp	xzr,xzr,[sp,#16]
+	stp	xzr,xzr,[sp,#32]
+	stp	xzr,xzr,[sp,#48]
+
+	ldp	x19,x20,[x29,#16]	// restore callee-saved registers
+	add	sp,sp,#64		// drop the scratch area
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldp	x29,x30,[sp],#96	// restore frame pointer and the real return address, pop the frame
+	ret
+ENDPROC(chacha20_arm)
+
+.align	5
+ENTRY(chacha20_neon)
+	cbz	x2,.Labort_neon
+	cmp	x2,#192
+	b.lo	.Lshort
+
+	stp	x29,x30,[sp,#-96]!
+	add	x29,sp,#0
+
+	adr	x5,.Lsigma
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+	cmp	x2,#512
+	b.hs	.L512_or_more_neon
+
+	sub	sp,sp,#64
+
+	ldp	x22,x23,[x5]		// load sigma
+	ld1	{v24.4s},[x5],#16
+	ldp	x24,x25,[x3]		// load key
+	ldp	x26,x27,[x3,#16]
+	ld1	{v25.4s,v26.4s},[x3]
+	ldp	x28,x30,[x4]		// load counter
+	ld1	{v27.4s},[x4]
+	ld1	{v31.4s},[x5]
+#ifdef	__ARMEB__
+	rev64	v24.4s,v24.4s
+	ror	x24,x24,#32
+	ror	x25,x25,#32
+	ror	x26,x26,#32
+	ror	x27,x27,#32
+	ror	x28,x28,#32
+	ror	x30,x30,#32
+#endif
+	add	v27.4s,v27.4s,v31.4s		// += 1
+	add	v28.4s,v27.4s,v31.4s
+	add	v29.4s,v28.4s,v31.4s
+	shl	v31.4s,v31.4s,#2			// 1 -> 4
+
+.Loop_outer_neon:
+	mov	w5,w22			// unpack key block
+	lsr	x6,x22,#32
+	mov	v0.16b,v24.16b
+	mov	w7,w23
+	lsr	x8,x23,#32
+	mov	v4.16b,v24.16b
+	mov	w9,w24
+	lsr	x10,x24,#32
+	mov	v16.16b,v24.16b
+	mov	w11,w25
+	mov	v1.16b,v25.16b
+	lsr	x12,x25,#32
+	mov	v5.16b,v25.16b
+	mov	w13,w26
+	mov	v17.16b,v25.16b
+	lsr	x14,x26,#32
+	mov	v3.16b,v27.16b
+	mov	w15,w27
+	mov	v7.16b,v28.16b
+	lsr	x16,x27,#32
+	mov	v19.16b,v29.16b
+	mov	w17,w28
+	mov	v2.16b,v26.16b
+	lsr	x19,x28,#32
+	mov	v6.16b,v26.16b
+	mov	w20,w30
+	mov	v18.16b,v26.16b
+	lsr	x21,x30,#32
+
+	mov	x4,#10
+	subs	x2,x2,#256
+.Loop_neon:
+	sub	x4,x4,#1
+	add	v0.4s,v0.4s,v1.4s
+	add	w5,w5,w9
+	add	v4.4s,v4.4s,v5.4s
+	add	w6,w6,w10
+	add	v16.4s,v16.4s,v17.4s
+	add	w7,w7,w11
+	eor	v3.16b,v3.16b,v0.16b
+	add	w8,w8,w12
+	eor	v7.16b,v7.16b,v4.16b
+	eor	w17,w17,w5
+	eor	v19.16b,v19.16b,v16.16b
+	eor	w19,w19,w6
+	rev32	v3.8h,v3.8h
+	eor	w20,w20,w7
+	rev32	v7.8h,v7.8h
+	eor	w21,w21,w8
+	rev32	v19.8h,v19.8h
+	ror	w17,w17,#16
+	add	v2.4s,v2.4s,v3.4s
+	ror	w19,w19,#16
+	add	v6.4s,v6.4s,v7.4s
+	ror	w20,w20,#16
+	add	v18.4s,v18.4s,v19.4s
+	ror	w21,w21,#16
+	eor	v20.16b,v1.16b,v2.16b
+	add	w13,w13,w17
+	eor	v21.16b,v5.16b,v6.16b
+	add	w14,w14,w19
+	eor	v22.16b,v17.16b,v18.16b
+	add	w15,w15,w20
+	ushr	v1.4s,v20.4s,#20
+	add	w16,w16,w21
+	ushr	v5.4s,v21.4s,#20
+	eor	w9,w9,w13
+	ushr	v17.4s,v22.4s,#20
+	eor	w10,w10,w14
+	sli	v1.4s,v20.4s,#12
+	eor	w11,w11,w15
+	sli	v5.4s,v21.4s,#12
+	eor	w12,w12,w16
+	sli	v17.4s,v22.4s,#12
+	ror	w9,w9,#20
+	add	v0.4s,v0.4s,v1.4s
+	ror	w10,w10,#20
+	add	v4.4s,v4.4s,v5.4s
+	ror	w11,w11,#20
+	add	v16.4s,v16.4s,v17.4s
+	ror	w12,w12,#20
+	eor	v20.16b,v3.16b,v0.16b
+	add	w5,w5,w9
+	eor	v21.16b,v7.16b,v4.16b
+	add	w6,w6,w10
+	eor	v22.16b,v19.16b,v16.16b
+	add	w7,w7,w11
+	ushr	v3.4s,v20.4s,#24
+	add	w8,w8,w12
+	ushr	v7.4s,v21.4s,#24
+	eor	w17,w17,w5
+	ushr	v19.4s,v22.4s,#24
+	eor	w19,w19,w6
+	sli	v3.4s,v20.4s,#8
+	eor	w20,w20,w7
+	sli	v7.4s,v21.4s,#8
+	eor	w21,w21,w8
+	sli	v19.4s,v22.4s,#8
+	ror	w17,w17,#24
+	add	v2.4s,v2.4s,v3.4s
+	ror	w19,w19,#24
+	add	v6.4s,v6.4s,v7.4s
+	ror	w20,w20,#24
+	add	v18.4s,v18.4s,v19.4s
+	ror	w21,w21,#24
+	eor	v20.16b,v1.16b,v2.16b
+	add	w13,w13,w17
+	eor	v21.16b,v5.16b,v6.16b
+	add	w14,w14,w19
+	eor	v22.16b,v17.16b,v18.16b
+	add	w15,w15,w20
+	ushr	v1.4s,v20.4s,#25
+	add	w16,w16,w21
+	ushr	v5.4s,v21.4s,#25
+	eor	w9,w9,w13
+	ushr	v17.4s,v22.4s,#25
+	eor	w10,w10,w14
+	sli	v1.4s,v20.4s,#7
+	eor	w11,w11,w15
+	sli	v5.4s,v21.4s,#7
+	eor	w12,w12,w16
+	sli	v17.4s,v22.4s,#7
+	ror	w9,w9,#25
+	ext	v2.16b,v2.16b,v2.16b,#8
+	ror	w10,w10,#25
+	ext	v6.16b,v6.16b,v6.16b,#8
+	ror	w11,w11,#25
+	ext	v18.16b,v18.16b,v18.16b,#8
+	ror	w12,w12,#25
+	ext	v3.16b,v3.16b,v3.16b,#12
+	ext	v7.16b,v7.16b,v7.16b,#12
+	ext	v19.16b,v19.16b,v19.16b,#12
+	ext	v1.16b,v1.16b,v1.16b,#4
+	ext	v5.16b,v5.16b,v5.16b,#4
+	ext	v17.16b,v17.16b,v17.16b,#4
+	add	v0.4s,v0.4s,v1.4s
+	add	w5,w5,w10
+	add	v4.4s,v4.4s,v5.4s
+	add	w6,w6,w11
+	add	v16.4s,v16.4s,v17.4s
+	add	w7,w7,w12
+	eor	v3.16b,v3.16b,v0.16b
+	add	w8,w8,w9
+	eor	v7.16b,v7.16b,v4.16b
+	eor	w21,w21,w5
+	eor	v19.16b,v19.16b,v16.16b
+	eor	w17,w17,w6
+	rev32	v3.8h,v3.8h
+	eor	w19,w19,w7
+	rev32	v7.8h,v7.8h
+	eor	w20,w20,w8
+	rev32	v19.8h,v19.8h
+	ror	w21,w21,#16
+	add	v2.4s,v2.4s,v3.4s
+	ror	w17,w17,#16
+	add	v6.4s,v6.4s,v7.4s
+	ror	w19,w19,#16
+	add	v18.4s,v18.4s,v19.4s
+	ror	w20,w20,#16
+	eor	v20.16b,v1.16b,v2.16b
+	add	w15,w15,w21
+	eor	v21.16b,v5.16b,v6.16b
+	add	w16,w16,w17
+	eor	v22.16b,v17.16b,v18.16b
+	add	w13,w13,w19
+	ushr	v1.4s,v20.4s,#20
+	add	w14,w14,w20
+	ushr	v5.4s,v21.4s,#20
+	eor	w10,w10,w15
+	ushr	v17.4s,v22.4s,#20
+	eor	w11,w11,w16
+	sli	v1.4s,v20.4s,#12
+	eor	w12,w12,w13
+	sli	v5.4s,v21.4s,#12
+	eor	w9,w9,w14
+	sli	v17.4s,v22.4s,#12
+	ror	w10,w10,#20
+	add	v0.4s,v0.4s,v1.4s
+	ror	w11,w11,#20
+	add	v4.4s,v4.4s,v5.4s
+	ror	w12,w12,#20
+	add	v16.4s,v16.4s,v17.4s
+	ror	w9,w9,#20
+	eor	v20.16b,v3.16b,v0.16b
+	add	w5,w5,w10
+	eor	v21.16b,v7.16b,v4.16b
+	add	w6,w6,w11
+	eor	v22.16b,v19.16b,v16.16b
+	add	w7,w7,w12
+	ushr	v3.4s,v20.4s,#24
+	add	w8,w8,w9
+	ushr	v7.4s,v21.4s,#24
+	eor	w21,w21,w5
+	ushr	v19.4s,v22.4s,#24
+	eor	w17,w17,w6
+	sli	v3.4s,v20.4s,#8
+	eor	w19,w19,w7
+	sli	v7.4s,v21.4s,#8
+	eor	w20,w20,w8
+	sli	v19.4s,v22.4s,#8
+	ror	w21,w21,#24
+	add	v2.4s,v2.4s,v3.4s
+	ror	w17,w17,#24
+	add	v6.4s,v6.4s,v7.4s
+	ror	w19,w19,#24
+	add	v18.4s,v18.4s,v19.4s
+	ror	w20,w20,#24
+	eor	v20.16b,v1.16b,v2.16b
+	add	w15,w15,w21
+	eor	v21.16b,v5.16b,v6.16b
+	add	w16,w16,w17
+	eor	v22.16b,v17.16b,v18.16b
+	add	w13,w13,w19
+	ushr	v1.4s,v20.4s,#25
+	add	w14,w14,w20
+	ushr	v5.4s,v21.4s,#25
+	eor	w10,w10,w15
+	ushr	v17.4s,v22.4s,#25
+	eor	w11,w11,w16
+	sli	v1.4s,v20.4s,#7
+	eor	w12,w12,w13
+	sli	v5.4s,v21.4s,#7
+	eor	w9,w9,w14
+	sli	v17.4s,v22.4s,#7
+	ror	w10,w10,#25
+	ext	v2.16b,v2.16b,v2.16b,#8
+	ror	w11,w11,#25
+	ext	v6.16b,v6.16b,v6.16b,#8
+	ror	w12,w12,#25
+	ext	v18.16b,v18.16b,v18.16b,#8
+	ror	w9,w9,#25
+	ext	v3.16b,v3.16b,v3.16b,#4
+	ext	v7.16b,v7.16b,v7.16b,#4
+	ext	v19.16b,v19.16b,v19.16b,#4
+	ext	v1.16b,v1.16b,v1.16b,#12
+	ext	v5.16b,v5.16b,v5.16b,#12
+	ext	v17.16b,v17.16b,v17.16b,#12
+	cbnz	x4,.Loop_neon
+
+	add	w5,w5,w22		// accumulate key block
+	add	v0.4s,v0.4s,v24.4s
+	add	x6,x6,x22,lsr#32
+	add	v4.4s,v4.4s,v24.4s
+	add	w7,w7,w23
+	add	v16.4s,v16.4s,v24.4s
+	add	x8,x8,x23,lsr#32
+	add	v2.4s,v2.4s,v26.4s
+	add	w9,w9,w24
+	add	v6.4s,v6.4s,v26.4s
+	add	x10,x10,x24,lsr#32
+	add	v18.4s,v18.4s,v26.4s
+	add	w11,w11,w25
+	add	v3.4s,v3.4s,v27.4s
+	add	x12,x12,x25,lsr#32
+	add	w13,w13,w26
+	add	v7.4s,v7.4s,v28.4s
+	add	x14,x14,x26,lsr#32
+	add	w15,w15,w27
+	add	v19.4s,v19.4s,v29.4s
+	add	x16,x16,x27,lsr#32
+	add	w17,w17,w28
+	add	v1.4s,v1.4s,v25.4s
+	add	x19,x19,x28,lsr#32
+	add	w20,w20,w30
+	add	v5.4s,v5.4s,v25.4s
+	add	x21,x21,x30,lsr#32
+	add	v17.4s,v17.4s,v25.4s
+
+	b.lo	.Ltail_neon
+
+	add	x5,x5,x6,lsl#32	// pack
+	add	x7,x7,x8,lsl#32
+	ldp	x6,x8,[x1,#0]		// load input
+	add	x9,x9,x10,lsl#32
+	add	x11,x11,x12,lsl#32
+	ldp	x10,x12,[x1,#16]
+	add	x13,x13,x14,lsl#32
+	add	x15,x15,x16,lsl#32
+	ldp	x14,x16,[x1,#32]
+	add	x17,x17,x19,lsl#32
+	add	x20,x20,x21,lsl#32
+	ldp	x19,x21,[x1,#48]
+	add	x1,x1,#64
+#ifdef	__ARMEB__
+	rev	x5,x5
+	rev	x7,x7
+	rev	x9,x9
+	rev	x11,x11
+	rev	x13,x13
+	rev	x15,x15
+	rev	x17,x17
+	rev	x20,x20
+#endif
+	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+	eor	x5,x5,x6
+	eor	x7,x7,x8
+	eor	x9,x9,x10
+	eor	x11,x11,x12
+	eor	x13,x13,x14
+	eor	v0.16b,v0.16b,v20.16b
+	eor	x15,x15,x16
+	eor	v1.16b,v1.16b,v21.16b
+	eor	x17,x17,x19
+	eor	v2.16b,v2.16b,v22.16b
+	eor	x20,x20,x21
+	eor	v3.16b,v3.16b,v23.16b
+	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+
+	stp	x5,x7,[x0,#0]		// store output
+	add	x28,x28,#4			// increment counter
+	stp	x9,x11,[x0,#16]
+	add	v27.4s,v27.4s,v31.4s		// += 4
+	stp	x13,x15,[x0,#32]
+	add	v28.4s,v28.4s,v31.4s
+	stp	x17,x20,[x0,#48]
+	add	v29.4s,v29.4s,v31.4s
+	add	x0,x0,#64
+
+	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
+	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
+
+	eor	v4.16b,v4.16b,v20.16b
+	eor	v5.16b,v5.16b,v21.16b
+	eor	v6.16b,v6.16b,v22.16b
+	eor	v7.16b,v7.16b,v23.16b
+	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
+
+	eor	v16.16b,v16.16b,v0.16b
+	eor	v17.16b,v17.16b,v1.16b
+	eor	v18.16b,v18.16b,v2.16b
+	eor	v19.16b,v19.16b,v3.16b
+	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
+
+	b.hi	.Loop_outer_neon
+
+	ldp	x19,x20,[x29,#16]
+	add	sp,sp,#64
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldp	x29,x30,[sp],#96
+	ret
+
+.Ltail_neon:
+	add	x2,x2,#256
+	cmp	x2,#64
+	b.lo	.Less_than_64
+
+	add	x5,x5,x6,lsl#32	// pack
+	add	x7,x7,x8,lsl#32
+	ldp	x6,x8,[x1,#0]		// load input
+	add	x9,x9,x10,lsl#32
+	add	x11,x11,x12,lsl#32
+	ldp	x10,x12,[x1,#16]
+	add	x13,x13,x14,lsl#32
+	add	x15,x15,x16,lsl#32
+	ldp	x14,x16,[x1,#32]
+	add	x17,x17,x19,lsl#32
+	add	x20,x20,x21,lsl#32
+	ldp	x19,x21,[x1,#48]
+	add	x1,x1,#64
+#ifdef	__ARMEB__
+	rev	x5,x5
+	rev	x7,x7
+	rev	x9,x9
+	rev	x11,x11
+	rev	x13,x13
+	rev	x15,x15
+	rev	x17,x17
+	rev	x20,x20
+#endif
+	eor	x5,x5,x6
+	eor	x7,x7,x8
+	eor	x9,x9,x10
+	eor	x11,x11,x12
+	eor	x13,x13,x14
+	eor	x15,x15,x16
+	eor	x17,x17,x19
+	eor	x20,x20,x21
+
+	stp	x5,x7,[x0,#0]		// store output
+	add	x28,x28,#4			// increment counter
+	stp	x9,x11,[x0,#16]
+	stp	x13,x15,[x0,#32]
+	stp	x17,x20,[x0,#48]
+	add	x0,x0,#64
+	b.eq	.Ldone_neon
+	sub	x2,x2,#64
+	cmp	x2,#64
+	b.lo	.Less_than_128
+
+	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+	eor	v0.16b,v0.16b,v20.16b
+	eor	v1.16b,v1.16b,v21.16b
+	eor	v2.16b,v2.16b,v22.16b
+	eor	v3.16b,v3.16b,v23.16b
+	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
+	b.eq	.Ldone_neon
+	sub	x2,x2,#64
+	cmp	x2,#64
+	b.lo	.Less_than_192
+
+	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+	eor	v4.16b,v4.16b,v20.16b
+	eor	v5.16b,v5.16b,v21.16b
+	eor	v6.16b,v6.16b,v22.16b
+	eor	v7.16b,v7.16b,v23.16b
+	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
+	b.eq	.Ldone_neon
+	sub	x2,x2,#64
+
+	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[sp]
+	b	.Last_neon
+
+.Less_than_128:
+	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[sp]
+	b	.Last_neon
+.Less_than_192:
+	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[sp]
+	b	.Last_neon
+
+.align	4
+.Last_neon:
+	sub	x0,x0,#1
+	add	x1,x1,x2
+	add	x0,x0,x2
+	add	x4,sp,x2
+	neg	x2,x2
+
+.Loop_tail_neon:
+	ldrb	w10,[x1,x2]
+	ldrb	w11,[x4,x2]
+	add	x2,x2,#1
+	eor	w10,w10,w11
+	strb	w10,[x0,x2]
+	cbnz	x2,.Loop_tail_neon
+
+	stp	xzr,xzr,[sp,#0]
+	stp	xzr,xzr,[sp,#16]
+	stp	xzr,xzr,[sp,#32]
+	stp	xzr,xzr,[sp,#48]
+
+.Ldone_neon:
+	ldp	x19,x20,[x29,#16]
+	add	sp,sp,#64
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldp	x29,x30,[sp],#96
+	ret
+
+.L512_or_more_neon:
+	sub	sp,sp,#128+64
+
+	ldp	x22,x23,[x5]		// load sigma
+	ld1	{v24.4s},[x5],#16
+	ldp	x24,x25,[x3]		// load key
+	ldp	x26,x27,[x3,#16]
+	ld1	{v25.4s,v26.4s},[x3]
+	ldp	x28,x30,[x4]		// load counter
+	ld1	{v27.4s},[x4]
+	ld1	{v31.4s},[x5]
+#ifdef	__ARMEB__
+	rev64	v24.4s,v24.4s
+	ror	x24,x24,#32
+	ror	x25,x25,#32
+	ror	x26,x26,#32
+	ror	x27,x27,#32
+	ror	x28,x28,#32
+	ror	x30,x30,#32
+#endif
+	add	v27.4s,v27.4s,v31.4s		// += 1
+	stp	q24,q25,[sp,#0]		// off-load key block, invariant part
+	add	v27.4s,v27.4s,v31.4s		// not typo
+	str	q26,[sp,#32]
+	add	v28.4s,v27.4s,v31.4s
+	add	v29.4s,v28.4s,v31.4s
+	add	v30.4s,v29.4s,v31.4s
+	shl	v31.4s,v31.4s,#2			// 1 -> 4
+
+	stp	d8,d9,[sp,#128+0]		// meet ABI requirements
+	stp	d10,d11,[sp,#128+16]
+	stp	d12,d13,[sp,#128+32]
+	stp	d14,d15,[sp,#128+48]
+
+	sub	x2,x2,#512			// not typo
+
+.Loop_outer_512_neon:
+	mov	v0.16b,v24.16b
+	mov	v4.16b,v24.16b
+	mov	v8.16b,v24.16b
+	mov	v12.16b,v24.16b
+	mov	v16.16b,v24.16b
+	mov	v20.16b,v24.16b
+	mov	v1.16b,v25.16b
+	mov	w5,w22			// unpack key block
+	mov	v5.16b,v25.16b
+	lsr	x6,x22,#32
+	mov	v9.16b,v25.16b
+	mov	w7,w23
+	mov	v13.16b,v25.16b
+	lsr	x8,x23,#32
+	mov	v17.16b,v25.16b
+	mov	w9,w24
+	mov	v21.16b,v25.16b
+	lsr	x10,x24,#32
+	mov	v3.16b,v27.16b
+	mov	w11,w25
+	mov	v7.16b,v28.16b
+	lsr	x12,x25,#32
+	mov	v11.16b,v29.16b
+	mov	w13,w26
+	mov	v15.16b,v30.16b
+	lsr	x14,x26,#32
+	mov	v2.16b,v26.16b
+	mov	w15,w27
+	mov	v6.16b,v26.16b
+	lsr	x16,x27,#32
+	add	v19.4s,v3.4s,v31.4s			// +4
+	mov	w17,w28
+	add	v23.4s,v7.4s,v31.4s			// +4
+	lsr	x19,x28,#32
+	mov	v10.16b,v26.16b
+	mov	w20,w30
+	mov	v14.16b,v26.16b
+	lsr	x21,x30,#32
+	mov	v18.16b,v26.16b
+	stp	q27,q28,[sp,#48]		// off-load key block, variable part
+	mov	v22.16b,v26.16b
+	str	q29,[sp,#80]
+
+	mov	x4,#5
+	subs	x2,x2,#512
+.Loop_upper_neon:
+	sub	x4,x4,#1
+	add	v0.4s,v0.4s,v1.4s
+	add	w5,w5,w9
+	add	v4.4s,v4.4s,v5.4s
+	add	w6,w6,w10
+	add	v8.4s,v8.4s,v9.4s
+	add	w7,w7,w11
+	add	v12.4s,v12.4s,v13.4s
+	add	w8,w8,w12
+	add	v16.4s,v16.4s,v17.4s
+	eor	w17,w17,w5
+	add	v20.4s,v20.4s,v21.4s
+	eor	w19,w19,w6
+	eor	v3.16b,v3.16b,v0.16b
+	eor	w20,w20,w7
+	eor	v7.16b,v7.16b,v4.16b
+	eor	w21,w21,w8
+	eor	v11.16b,v11.16b,v8.16b
+	ror	w17,w17,#16
+	eor	v15.16b,v15.16b,v12.16b
+	ror	w19,w19,#16
+	eor	v19.16b,v19.16b,v16.16b
+	ror	w20,w20,#16
+	eor	v23.16b,v23.16b,v20.16b
+	ror	w21,w21,#16
+	rev32	v3.8h,v3.8h
+	add	w13,w13,w17
+	rev32	v7.8h,v7.8h
+	add	w14,w14,w19
+	rev32	v11.8h,v11.8h
+	add	w15,w15,w20
+	rev32	v15.8h,v15.8h
+	add	w16,w16,w21
+	rev32	v19.8h,v19.8h
+	eor	w9,w9,w13
+	rev32	v23.8h,v23.8h
+	eor	w10,w10,w14
+	add	v2.4s,v2.4s,v3.4s
+	eor	w11,w11,w15
+	add	v6.4s,v6.4s,v7.4s
+	eor	w12,w12,w16
+	add	v10.4s,v10.4s,v11.4s
+	ror	w9,w9,#20
+	add	v14.4s,v14.4s,v15.4s
+	ror	w10,w10,#20
+	add	v18.4s,v18.4s,v19.4s
+	ror	w11,w11,#20
+	add	v22.4s,v22.4s,v23.4s
+	ror	w12,w12,#20
+	eor	v24.16b,v1.16b,v2.16b
+	add	w5,w5,w9
+	eor	v25.16b,v5.16b,v6.16b
+	add	w6,w6,w10
+	eor	v26.16b,v9.16b,v10.16b
+	add	w7,w7,w11
+	eor	v27.16b,v13.16b,v14.16b
+	add	w8,w8,w12
+	eor	v28.16b,v17.16b,v18.16b
+	eor	w17,w17,w5
+	eor	v29.16b,v21.16b,v22.16b
+	eor	w19,w19,w6
+	ushr	v1.4s,v24.4s,#20
+	eor	w20,w20,w7
+	ushr	v5.4s,v25.4s,#20
+	eor	w21,w21,w8
+	ushr	v9.4s,v26.4s,#20
+	ror	w17,w17,#24
+	ushr	v13.4s,v27.4s,#20
+	ror	w19,w19,#24
+	ushr	v17.4s,v28.4s,#20
+	ror	w20,w20,#24
+	ushr	v21.4s,v29.4s,#20
+	ror	w21,w21,#24
+	sli	v1.4s,v24.4s,#12
+	add	w13,w13,w17
+	sli	v5.4s,v25.4s,#12
+	add	w14,w14,w19
+	sli	v9.4s,v26.4s,#12
+	add	w15,w15,w20
+	sli	v13.4s,v27.4s,#12
+	add	w16,w16,w21
+	sli	v17.4s,v28.4s,#12
+	eor	w9,w9,w13
+	sli	v21.4s,v29.4s,#12
+	eor	w10,w10,w14
+	add	v0.4s,v0.4s,v1.4s
+	eor	w11,w11,w15
+	add	v4.4s,v4.4s,v5.4s
+	eor	w12,w12,w16
+	add	v8.4s,v8.4s,v9.4s
+	ror	w9,w9,#25
+	add	v12.4s,v12.4s,v13.4s
+	ror	w10,w10,#25
+	add	v16.4s,v16.4s,v17.4s
+	ror	w11,w11,#25
+	add	v20.4s,v20.4s,v21.4s
+	ror	w12,w12,#25
+	eor	v24.16b,v3.16b,v0.16b
+	add	w5,w5,w10
+	eor	v25.16b,v7.16b,v4.16b
+	add	w6,w6,w11
+	eor	v26.16b,v11.16b,v8.16b
+	add	w7,w7,w12
+	eor	v27.16b,v15.16b,v12.16b
+	add	w8,w8,w9
+	eor	v28.16b,v19.16b,v16.16b
+	eor	w21,w21,w5
+	eor	v29.16b,v23.16b,v20.16b
+	eor	w17,w17,w6
+	ushr	v3.4s,v24.4s,#24
+	eor	w19,w19,w7
+	ushr	v7.4s,v25.4s,#24
+	eor	w20,w20,w8
+	ushr	v11.4s,v26.4s,#24
+	ror	w21,w21,#16
+	ushr	v15.4s,v27.4s,#24
+	ror	w17,w17,#16
+	ushr	v19.4s,v28.4s,#24
+	ror	w19,w19,#16
+	ushr	v23.4s,v29.4s,#24
+	ror	w20,w20,#16
+	sli	v3.4s,v24.4s,#8
+	add	w15,w15,w21
+	sli	v7.4s,v25.4s,#8
+	add	w16,w16,w17
+	sli	v11.4s,v26.4s,#8
+	add	w13,w13,w19
+	sli	v15.4s,v27.4s,#8
+	add	w14,w14,w20
+	sli	v19.4s,v28.4s,#8
+	eor	w10,w10,w15
+	sli	v23.4s,v29.4s,#8
+	eor	w11,w11,w16
+	add	v2.4s,v2.4s,v3.4s
+	eor	w12,w12,w13
+	add	v6.4s,v6.4s,v7.4s
+	eor	w9,w9,w14
+	add	v10.4s,v10.4s,v11.4s
+	ror	w10,w10,#20
+	add	v14.4s,v14.4s,v15.4s
+	ror	w11,w11,#20
+	add	v18.4s,v18.4s,v19.4s
+	ror	w12,w12,#20
+	add	v22.4s,v22.4s,v23.4s
+	ror	w9,w9,#20
+	eor	v24.16b,v1.16b,v2.16b
+	add	w5,w5,w10
+	eor	v25.16b,v5.16b,v6.16b
+	add	w6,w6,w11
+	eor	v26.16b,v9.16b,v10.16b
+	add	w7,w7,w12
+	eor	v27.16b,v13.16b,v14.16b
+	add	w8,w8,w9
+	eor	v28.16b,v17.16b,v18.16b
+	eor	w21,w21,w5
+	eor	v29.16b,v21.16b,v22.16b
+	eor	w17,w17,w6
+	ushr	v1.4s,v24.4s,#25
+	eor	w19,w19,w7
+	ushr	v5.4s,v25.4s,#25
+	eor	w20,w20,w8
+	ushr	v9.4s,v26.4s,#25
+	ror	w21,w21,#24
+	ushr	v13.4s,v27.4s,#25
+	ror	w17,w17,#24
+	ushr	v17.4s,v28.4s,#25
+	ror	w19,w19,#24
+	ushr	v21.4s,v29.4s,#25
+	ror	w20,w20,#24
+	sli	v1.4s,v24.4s,#7
+	add	w15,w15,w21
+	sli	v5.4s,v25.4s,#7
+	add	w16,w16,w17
+	sli	v9.4s,v26.4s,#7
+	add	w13,w13,w19
+	sli	v13.4s,v27.4s,#7
+	add	w14,w14,w20
+	sli	v17.4s,v28.4s,#7
+	eor	w10,w10,w15
+	sli	v21.4s,v29.4s,#7
+	eor	w11,w11,w16
+	ext	v2.16b,v2.16b,v2.16b,#8
+	eor	w12,w12,w13
+	ext	v6.16b,v6.16b,v6.16b,#8
+	eor	w9,w9,w14
+	ext	v10.16b,v10.16b,v10.16b,#8
+	ror	w10,w10,#25
+	ext	v14.16b,v14.16b,v14.16b,#8
+	ror	w11,w11,#25
+	ext	v18.16b,v18.16b,v18.16b,#8
+	ror	w12,w12,#25
+	ext	v22.16b,v22.16b,v22.16b,#8
+	ror	w9,w9,#25
+	ext	v3.16b,v3.16b,v3.16b,#12
+	ext	v7.16b,v7.16b,v7.16b,#12
+	ext	v11.16b,v11.16b,v11.16b,#12
+	ext	v15.16b,v15.16b,v15.16b,#12
+	ext	v19.16b,v19.16b,v19.16b,#12
+	ext	v23.16b,v23.16b,v23.16b,#12
+	ext	v1.16b,v1.16b,v1.16b,#4
+	ext	v5.16b,v5.16b,v5.16b,#4
+	ext	v9.16b,v9.16b,v9.16b,#4
+	ext	v13.16b,v13.16b,v13.16b,#4
+	ext	v17.16b,v17.16b,v17.16b,#4
+	ext	v21.16b,v21.16b,v21.16b,#4
+	add	v0.4s,v0.4s,v1.4s
+	add	w5,w5,w9
+	add	v4.4s,v4.4s,v5.4s
+	add	w6,w6,w10
+	add	v8.4s,v8.4s,v9.4s
+	add	w7,w7,w11
+	add	v12.4s,v12.4s,v13.4s
+	add	w8,w8,w12
+	add	v16.4s,v16.4s,v17.4s
+	eor	w17,w17,w5
+	add	v20.4s,v20.4s,v21.4s
+	eor	w19,w19,w6
+	eor	v3.16b,v3.16b,v0.16b
+	eor	w20,w20,w7
+	eor	v7.16b,v7.16b,v4.16b
+	eor	w21,w21,w8
+	eor	v11.16b,v11.16b,v8.16b
+	ror	w17,w17,#16
+	eor	v15.16b,v15.16b,v12.16b
+	ror	w19,w19,#16
+	eor	v19.16b,v19.16b,v16.16b
+	ror	w20,w20,#16
+	eor	v23.16b,v23.16b,v20.16b
+	ror	w21,w21,#16
+	rev32	v3.8h,v3.8h
+	add	w13,w13,w17
+	rev32	v7.8h,v7.8h
+	add	w14,w14,w19
+	rev32	v11.8h,v11.8h
+	add	w15,w15,w20
+	rev32	v15.8h,v15.8h
+	add	w16,w16,w21
+	rev32	v19.8h,v19.8h
+	eor	w9,w9,w13
+	rev32	v23.8h,v23.8h
+	eor	w10,w10,w14
+	add	v2.4s,v2.4s,v3.4s
+	eor	w11,w11,w15
+	add	v6.4s,v6.4s,v7.4s
+	eor	w12,w12,w16
+	add	v10.4s,v10.4s,v11.4s
+	ror	w9,w9,#20
+	add	v14.4s,v14.4s,v15.4s
+	ror	w10,w10,#20
+	add	v18.4s,v18.4s,v19.4s
+	ror	w11,w11,#20
+	add	v22.4s,v22.4s,v23.4s
+	ror	w12,w12,#20
+	eor	v24.16b,v1.16b,v2.16b
+	add	w5,w5,w9
+	eor	v25.16b,v5.16b,v6.16b
+	add	w6,w6,w10
+	eor	v26.16b,v9.16b,v10.16b
+	add	w7,w7,w11
+	eor	v27.16b,v13.16b,v14.16b
+	add	w8,w8,w12
+	eor	v28.16b,v17.16b,v18.16b
+	eor	w17,w17,w5
+	eor	v29.16b,v21.16b,v22.16b
+	eor	w19,w19,w6
+	ushr	v1.4s,v24.4s,#20
+	eor	w20,w20,w7
+	ushr	v5.4s,v25.4s,#20
+	eor	w21,w21,w8
+	ushr	v9.4s,v26.4s,#20
+	ror	w17,w17,#24
+	ushr	v13.4s,v27.4s,#20
+	ror	w19,w19,#24
+	ushr	v17.4s,v28.4s,#20
+	ror	w20,w20,#24
+	ushr	v21.4s,v29.4s,#20
+	ror	w21,w21,#24
+	sli	v1.4s,v24.4s,#12
+	add	w13,w13,w17
+	sli	v5.4s,v25.4s,#12
+	add	w14,w14,w19
+	sli	v9.4s,v26.4s,#12
+	add	w15,w15,w20
+	sli	v13.4s,v27.4s,#12
+	add	w16,w16,w21
+	sli	v17.4s,v28.4s,#12
+	eor	w9,w9,w13
+	sli	v21.4s,v29.4s,#12
+	eor	w10,w10,w14
+	add	v0.4s,v0.4s,v1.4s
+	eor	w11,w11,w15
+	add	v4.4s,v4.4s,v5.4s
+	eor	w12,w12,w16
+	add	v8.4s,v8.4s,v9.4s
+	ror	w9,w9,#25
+	add	v12.4s,v12.4s,v13.4s
+	ror	w10,w10,#25
+	add	v16.4s,v16.4s,v17.4s
+	ror	w11,w11,#25
+	add	v20.4s,v20.4s,v21.4s
+	ror	w12,w12,#25
+	eor	v24.16b,v3.16b,v0.16b
+	add	w5,w5,w10
+	eor	v25.16b,v7.16b,v4.16b
+	add	w6,w6,w11
+	eor	v26.16b,v11.16b,v8.16b
+	add	w7,w7,w12
+	eor	v27.16b,v15.16b,v12.16b
+	add	w8,w8,w9
+	eor	v28.16b,v19.16b,v16.16b
+	eor	w21,w21,w5
+	eor	v29.16b,v23.16b,v20.16b
+	eor	w17,w17,w6
+	ushr	v3.4s,v24.4s,#24
+	eor	w19,w19,w7
+	ushr	v7.4s,v25.4s,#24
+	eor	w20,w20,w8
+	ushr	v11.4s,v26.4s,#24
+	ror	w21,w21,#16
+	ushr	v15.4s,v27.4s,#24
+	ror	w17,w17,#16
+	ushr	v19.4s,v28.4s,#24
+	ror	w19,w19,#16
+	ushr	v23.4s,v29.4s,#24
+	ror	w20,w20,#16
+	sli	v3.4s,v24.4s,#8
+	add	w15,w15,w21
+	sli	v7.4s,v25.4s,#8
+	add	w16,w16,w17
+	sli	v11.4s,v26.4s,#8
+	add	w13,w13,w19
+	sli	v15.4s,v27.4s,#8
+	add	w14,w14,w20
+	sli	v19.4s,v28.4s,#8
+	eor	w10,w10,w15
+	sli	v23.4s,v29.4s,#8
+	eor	w11,w11,w16
+	add	v2.4s,v2.4s,v3.4s
+	eor	w12,w12,w13
+	add	v6.4s,v6.4s,v7.4s
+	eor	w9,w9,w14
+	add	v10.4s,v10.4s,v11.4s
+	ror	w10,w10,#20
+	add	v14.4s,v14.4s,v15.4s
+	ror	w11,w11,#20
+	add	v18.4s,v18.4s,v19.4s
+	ror	w12,w12,#20
+	add	v22.4s,v22.4s,v23.4s
+	ror	w9,w9,#20
+	eor	v24.16b,v1.16b,v2.16b
+	add	w5,w5,w10
+	eor	v25.16b,v5.16b,v6.16b
+	add	w6,w6,w11
+	eor	v26.16b,v9.16b,v10.16b
+	add	w7,w7,w12
+	eor	v27.16b,v13.16b,v14.16b
+	add	w8,w8,w9
+	eor	v28.16b,v17.16b,v18.16b
+	eor	w21,w21,w5
+	eor	v29.16b,v21.16b,v22.16b
+	eor	w17,w17,w6
+	ushr	v1.4s,v24.4s,#25
+	eor	w19,w19,w7
+	ushr	v5.4s,v25.4s,#25
+	eor	w20,w20,w8
+	ushr	v9.4s,v26.4s,#25
+	ror	w21,w21,#24
+	ushr	v13.4s,v27.4s,#25
+	ror	w17,w17,#24
+	ushr	v17.4s,v28.4s,#25
+	ror	w19,w19,#24
+	ushr	v21.4s,v29.4s,#25
+	ror	w20,w20,#24
+	sli	v1.4s,v24.4s,#7
+	add	w15,w15,w21
+	sli	v5.4s,v25.4s,#7
+	add	w16,w16,w17
+	sli	v9.4s,v26.4s,#7
+	add	w13,w13,w19
+	sli	v13.4s,v27.4s,#7
+	add	w14,w14,w20
+	sli	v17.4s,v28.4s,#7
+	eor	w10,w10,w15
+	sli	v21.4s,v29.4s,#7
+	eor	w11,w11,w16
+	ext	v2.16b,v2.16b,v2.16b,#8
+	eor	w12,w12,w13
+	ext	v6.16b,v6.16b,v6.16b,#8
+	eor	w9,w9,w14
+	ext	v10.16b,v10.16b,v10.16b,#8
+	ror	w10,w10,#25
+	ext	v14.16b,v14.16b,v14.16b,#8
+	ror	w11,w11,#25
+	ext	v18.16b,v18.16b,v18.16b,#8
+	ror	w12,w12,#25
+	ext	v22.16b,v22.16b,v22.16b,#8
+	ror	w9,w9,#25
+	ext	v3.16b,v3.16b,v3.16b,#4
+	ext	v7.16b,v7.16b,v7.16b,#4
+	ext	v11.16b,v11.16b,v11.16b,#4
+	ext	v15.16b,v15.16b,v15.16b,#4
+	ext	v19.16b,v19.16b,v19.16b,#4
+	ext	v23.16b,v23.16b,v23.16b,#4
+	ext	v1.16b,v1.16b,v1.16b,#12
+	ext	v5.16b,v5.16b,v5.16b,#12
+	ext	v9.16b,v9.16b,v9.16b,#12
+	ext	v13.16b,v13.16b,v13.16b,#12
+	ext	v17.16b,v17.16b,v17.16b,#12
+	ext	v21.16b,v21.16b,v21.16b,#12
+	cbnz	x4,.Loop_upper_neon
+
+	add	w5,w5,w22		// accumulate key block
+	add	x6,x6,x22,lsr#32
+	add	w7,w7,w23
+	add	x8,x8,x23,lsr#32
+	add	w9,w9,w24
+	add	x10,x10,x24,lsr#32
+	add	w11,w11,w25
+	add	x12,x12,x25,lsr#32
+	add	w13,w13,w26
+	add	x14,x14,x26,lsr#32
+	add	w15,w15,w27
+	add	x16,x16,x27,lsr#32
+	add	w17,w17,w28
+	add	x19,x19,x28,lsr#32
+	add	w20,w20,w30
+	add	x21,x21,x30,lsr#32
+
+	add	x5,x5,x6,lsl#32	// pack
+	add	x7,x7,x8,lsl#32
+	ldp	x6,x8,[x1,#0]		// load input
+	add	x9,x9,x10,lsl#32
+	add	x11,x11,x12,lsl#32
+	ldp	x10,x12,[x1,#16]
+	add	x13,x13,x14,lsl#32
+	add	x15,x15,x16,lsl#32
+	ldp	x14,x16,[x1,#32]
+	add	x17,x17,x19,lsl#32
+	add	x20,x20,x21,lsl#32
+	ldp	x19,x21,[x1,#48]
+	add	x1,x1,#64
+#ifdef	__ARMEB__
+	rev	x5,x5
+	rev	x7,x7
+	rev	x9,x9
+	rev	x11,x11
+	rev	x13,x13
+	rev	x15,x15
+	rev	x17,x17
+	rev	x20,x20
+#endif
+	eor	x5,x5,x6
+	eor	x7,x7,x8
+	eor	x9,x9,x10
+	eor	x11,x11,x12
+	eor	x13,x13,x14
+	eor	x15,x15,x16
+	eor	x17,x17,x19
+	eor	x20,x20,x21
+
+	stp	x5,x7,[x0,#0]		// store output
+	add	x28,x28,#1			// increment counter
+	mov	w5,w22			// unpack key block
+	lsr	x6,x22,#32
+	stp	x9,x11,[x0,#16]
+	mov	w7,w23
+	lsr	x8,x23,#32
+	stp	x13,x15,[x0,#32]
+	mov	w9,w24
+	lsr	x10,x24,#32
+	stp	x17,x20,[x0,#48]
+	add	x0,x0,#64
+	mov	w11,w25
+	lsr	x12,x25,#32
+	mov	w13,w26
+	lsr	x14,x26,#32
+	mov	w15,w27
+	lsr	x16,x27,#32
+	mov	w17,w28
+	lsr	x19,x28,#32
+	mov	w20,w30
+	lsr	x21,x30,#32
+
+	mov	x4,#5
+.Loop_lower_neon:
+	sub	x4,x4,#1
+	add	v0.4s,v0.4s,v1.4s
+	add	w5,w5,w9
+	add	v4.4s,v4.4s,v5.4s
+	add	w6,w6,w10
+	add	v8.4s,v8.4s,v9.4s
+	add	w7,w7,w11
+	add	v12.4s,v12.4s,v13.4s
+	add	w8,w8,w12
+	add	v16.4s,v16.4s,v17.4s
+	eor	w17,w17,w5
+	add	v20.4s,v20.4s,v21.4s
+	eor	w19,w19,w6
+	eor	v3.16b,v3.16b,v0.16b
+	eor	w20,w20,w7
+	eor	v7.16b,v7.16b,v4.16b
+	eor	w21,w21,w8
+	eor	v11.16b,v11.16b,v8.16b
+	ror	w17,w17,#16
+	eor	v15.16b,v15.16b,v12.16b
+	ror	w19,w19,#16
+	eor	v19.16b,v19.16b,v16.16b
+	ror	w20,w20,#16
+	eor	v23.16b,v23.16b,v20.16b
+	ror	w21,w21,#16
+	rev32	v3.8h,v3.8h
+	add	w13,w13,w17
+	rev32	v7.8h,v7.8h
+	add	w14,w14,w19
+	rev32	v11.8h,v11.8h
+	add	w15,w15,w20
+	rev32	v15.8h,v15.8h
+	add	w16,w16,w21
+	rev32	v19.8h,v19.8h
+	eor	w9,w9,w13
+	rev32	v23.8h,v23.8h
+	eor	w10,w10,w14
+	add	v2.4s,v2.4s,v3.4s
+	eor	w11,w11,w15
+	add	v6.4s,v6.4s,v7.4s
+	eor	w12,w12,w16
+	add	v10.4s,v10.4s,v11.4s
+	ror	w9,w9,#20
+	add	v14.4s,v14.4s,v15.4s
+	ror	w10,w10,#20
+	add	v18.4s,v18.4s,v19.4s
+	ror	w11,w11,#20
+	add	v22.4s,v22.4s,v23.4s
+	ror	w12,w12,#20
+	eor	v24.16b,v1.16b,v2.16b
+	add	w5,w5,w9
+	eor	v25.16b,v5.16b,v6.16b
+	add	w6,w6,w10
+	eor	v26.16b,v9.16b,v10.16b
+	add	w7,w7,w11
+	eor	v27.16b,v13.16b,v14.16b
+	add	w8,w8,w12
+	eor	v28.16b,v17.16b,v18.16b
+	eor	w17,w17,w5
+	eor	v29.16b,v21.16b,v22.16b
+	eor	w19,w19,w6
+	ushr	v1.4s,v24.4s,#20
+	eor	w20,w20,w7
+	ushr	v5.4s,v25.4s,#20
+	eor	w21,w21,w8
+	ushr	v9.4s,v26.4s,#20
+	ror	w17,w17,#24
+	ushr	v13.4s,v27.4s,#20
+	ror	w19,w19,#24
+	ushr	v17.4s,v28.4s,#20
+	ror	w20,w20,#24
+	ushr	v21.4s,v29.4s,#20
+	ror	w21,w21,#24
+	sli	v1.4s,v24.4s,#12
+	add	w13,w13,w17
+	sli	v5.4s,v25.4s,#12
+	add	w14,w14,w19
+	sli	v9.4s,v26.4s,#12
+	add	w15,w15,w20
+	sli	v13.4s,v27.4s,#12
+	add	w16,w16,w21
+	sli	v17.4s,v28.4s,#12
+	eor	w9,w9,w13
+	sli	v21.4s,v29.4s,#12
+	eor	w10,w10,w14
+	add	v0.4s,v0.4s,v1.4s
+	eor	w11,w11,w15
+	add	v4.4s,v4.4s,v5.4s
+	eor	w12,w12,w16
+	add	v8.4s,v8.4s,v9.4s
+	ror	w9,w9,#25
+	add	v12.4s,v12.4s,v13.4s
+	ror	w10,w10,#25
+	add	v16.4s,v16.4s,v17.4s
+	ror	w11,w11,#25
+	add	v20.4s,v20.4s,v21.4s
+	ror	w12,w12,#25
+	eor	v24.16b,v3.16b,v0.16b
+	add	w5,w5,w10
+	eor	v25.16b,v7.16b,v4.16b
+	add	w6,w6,w11
+	eor	v26.16b,v11.16b,v8.16b
+	add	w7,w7,w12
+	eor	v27.16b,v15.16b,v12.16b
+	add	w8,w8,w9
+	eor	v28.16b,v19.16b,v16.16b
+	eor	w21,w21,w5
+	eor	v29.16b,v23.16b,v20.16b
+	eor	w17,w17,w6
+	ushr	v3.4s,v24.4s,#24
+	eor	w19,w19,w7
+	ushr	v7.4s,v25.4s,#24
+	eor	w20,w20,w8
+	ushr	v11.4s,v26.4s,#24
+	ror	w21,w21,#16
+	ushr	v15.4s,v27.4s,#24
+	ror	w17,w17,#16
+	ushr	v19.4s,v28.4s,#24
+	ror	w19,w19,#16
+	ushr	v23.4s,v29.4s,#24
+	ror	w20,w20,#16
+	sli	v3.4s,v24.4s,#8
+	add	w15,w15,w21
+	sli	v7.4s,v25.4s,#8
+	add	w16,w16,w17
+	sli	v11.4s,v26.4s,#8
+	add	w13,w13,w19
+	sli	v15.4s,v27.4s,#8
+	add	w14,w14,w20
+	sli	v19.4s,v28.4s,#8
+	eor	w10,w10,w15
+	sli	v23.4s,v29.4s,#8
+	eor	w11,w11,w16
+	add	v2.4s,v2.4s,v3.4s
+	eor	w12,w12,w13
+	add	v6.4s,v6.4s,v7.4s
+	eor	w9,w9,w14
+	add	v10.4s,v10.4s,v11.4s
+	ror	w10,w10,#20
+	add	v14.4s,v14.4s,v15.4s
+	ror	w11,w11,#20
+	add	v18.4s,v18.4s,v19.4s
+	ror	w12,w12,#20
+	add	v22.4s,v22.4s,v23.4s
+	ror	w9,w9,#20
+	eor	v24.16b,v1.16b,v2.16b
+	add	w5,w5,w10
+	eor	v25.16b,v5.16b,v6.16b
+	add	w6,w6,w11
+	eor	v26.16b,v9.16b,v10.16b
+	add	w7,w7,w12
+	eor	v27.16b,v13.16b,v14.16b
+	add	w8,w8,w9
+	eor	v28.16b,v17.16b,v18.16b
+	eor	w21,w21,w5
+	eor	v29.16b,v21.16b,v22.16b
+	eor	w17,w17,w6
+	ushr	v1.4s,v24.4s,#25
+	eor	w19,w19,w7
+	ushr	v5.4s,v25.4s,#25
+	eor	w20,w20,w8
+	ushr	v9.4s,v26.4s,#25
+	ror	w21,w21,#24
+	ushr	v13.4s,v27.4s,#25
+	ror	w17,w17,#24
+	ushr	v17.4s,v28.4s,#25
+	ror	w19,w19,#24
+	ushr	v21.4s,v29.4s,#25
+	ror	w20,w20,#24
+	sli	v1.4s,v24.4s,#7
+	add	w15,w15,w21
+	sli	v5.4s,v25.4s,#7
+	add	w16,w16,w17
+	sli	v9.4s,v26.4s,#7
+	add	w13,w13,w19
+	sli	v13.4s,v27.4s,#7
+	add	w14,w14,w20
+	sli	v17.4s,v28.4s,#7
+	eor	w10,w10,w15
+	sli	v21.4s,v29.4s,#7
+	eor	w11,w11,w16
+	ext	v2.16b,v2.16b,v2.16b,#8
+	eor	w12,w12,w13
+	ext	v6.16b,v6.16b,v6.16b,#8
+	eor	w9,w9,w14
+	ext	v10.16b,v10.16b,v10.16b,#8
+	ror	w10,w10,#25
+	ext	v14.16b,v14.16b,v14.16b,#8
+	ror	w11,w11,#25
+	ext	v18.16b,v18.16b,v18.16b,#8
+	ror	w12,w12,#25
+	ext	v22.16b,v22.16b,v22.16b,#8
+	ror	w9,w9,#25
+	ext	v3.16b,v3.16b,v3.16b,#12
+	ext	v7.16b,v7.16b,v7.16b,#12
+	ext	v11.16b,v11.16b,v11.16b,#12
+	ext	v15.16b,v15.16b,v15.16b,#12
+	ext	v19.16b,v19.16b,v19.16b,#12
+	ext	v23.16b,v23.16b,v23.16b,#12
+	ext	v1.16b,v1.16b,v1.16b,#4
+	ext	v5.16b,v5.16b,v5.16b,#4
+	ext	v9.16b,v9.16b,v9.16b,#4
+	ext	v13.16b,v13.16b,v13.16b,#4
+	ext	v17.16b,v17.16b,v17.16b,#4
+	ext	v21.16b,v21.16b,v21.16b,#4
+	add	v0.4s,v0.4s,v1.4s
+	add	w5,w5,w9
+	add	v4.4s,v4.4s,v5.4s
+	add	w6,w6,w10
+	add	v8.4s,v8.4s,v9.4s
+	add	w7,w7,w11
+	add	v12.4s,v12.4s,v13.4s
+	add	w8,w8,w12
+	add	v16.4s,v16.4s,v17.4s
+	eor	w17,w17,w5
+	add	v20.4s,v20.4s,v21.4s
+	eor	w19,w19,w6
+	eor	v3.16b,v3.16b,v0.16b
+	eor	w20,w20,w7
+	eor	v7.16b,v7.16b,v4.16b
+	eor	w21,w21,w8
+	eor	v11.16b,v11.16b,v8.16b
+	ror	w17,w17,#16
+	eor	v15.16b,v15.16b,v12.16b
+	ror	w19,w19,#16
+	eor	v19.16b,v19.16b,v16.16b
+	ror	w20,w20,#16
+	eor	v23.16b,v23.16b,v20.16b
+	ror	w21,w21,#16
+	rev32	v3.8h,v3.8h
+	add	w13,w13,w17
+	rev32	v7.8h,v7.8h
+	add	w14,w14,w19
+	rev32	v11.8h,v11.8h
+	add	w15,w15,w20
+	rev32	v15.8h,v15.8h
+	add	w16,w16,w21
+	rev32	v19.8h,v19.8h
+	eor	w9,w9,w13
+	rev32	v23.8h,v23.8h
+	eor	w10,w10,w14
+	add	v2.4s,v2.4s,v3.4s
+	eor	w11,w11,w15
+	add	v6.4s,v6.4s,v7.4s
+	eor	w12,w12,w16
+	add	v10.4s,v10.4s,v11.4s
+	ror	w9,w9,#20
+	add	v14.4s,v14.4s,v15.4s
+	ror	w10,w10,#20
+	add	v18.4s,v18.4s,v19.4s
+	ror	w11,w11,#20
+	add	v22.4s,v22.4s,v23.4s
+	ror	w12,w12,#20
+	eor	v24.16b,v1.16b,v2.16b
+	add	w5,w5,w9
+	eor	v25.16b,v5.16b,v6.16b
+	add	w6,w6,w10
+	eor	v26.16b,v9.16b,v10.16b
+	add	w7,w7,w11
+	eor	v27.16b,v13.16b,v14.16b
+	add	w8,w8,w12
+	eor	v28.16b,v17.16b,v18.16b
+	eor	w17,w17,w5
+	eor	v29.16b,v21.16b,v22.16b
+	eor	w19,w19,w6
+	ushr	v1.4s,v24.4s,#20
+	eor	w20,w20,w7
+	ushr	v5.4s,v25.4s,#20
+	eor	w21,w21,w8
+	ushr	v9.4s,v26.4s,#20
+	ror	w17,w17,#24
+	ushr	v13.4s,v27.4s,#20
+	ror	w19,w19,#24
+	ushr	v17.4s,v28.4s,#20
+	ror	w20,w20,#24
+	ushr	v21.4s,v29.4s,#20
+	ror	w21,w21,#24
+	sli	v1.4s,v24.4s,#12
+	add	w13,w13,w17
+	sli	v5.4s,v25.4s,#12
+	add	w14,w14,w19
+	sli	v9.4s,v26.4s,#12
+	add	w15,w15,w20
+	sli	v13.4s,v27.4s,#12
+	add	w16,w16,w21
+	sli	v17.4s,v28.4s,#12
+	eor	w9,w9,w13
+	sli	v21.4s,v29.4s,#12
+	eor	w10,w10,w14
+	add	v0.4s,v0.4s,v1.4s
+	eor	w11,w11,w15
+	add	v4.4s,v4.4s,v5.4s
+	eor	w12,w12,w16
+	add	v8.4s,v8.4s,v9.4s
+	ror	w9,w9,#25
+	add	v12.4s,v12.4s,v13.4s
+	ror	w10,w10,#25
+	add	v16.4s,v16.4s,v17.4s
+	ror	w11,w11,#25
+	add	v20.4s,v20.4s,v21.4s
+	ror	w12,w12,#25
+	eor	v24.16b,v3.16b,v0.16b
+	add	w5,w5,w10
+	eor	v25.16b,v7.16b,v4.16b
+	add	w6,w6,w11
+	eor	v26.16b,v11.16b,v8.16b
+	add	w7,w7,w12
+	eor	v27.16b,v15.16b,v12.16b
+	add	w8,w8,w9
+	eor	v28.16b,v19.16b,v16.16b
+	eor	w21,w21,w5
+	eor	v29.16b,v23.16b,v20.16b
+	eor	w17,w17,w6
+	ushr	v3.4s,v24.4s,#24
+	eor	w19,w19,w7
+	ushr	v7.4s,v25.4s,#24
+	eor	w20,w20,w8
+	ushr	v11.4s,v26.4s,#24
+	ror	w21,w21,#16
+	ushr	v15.4s,v27.4s,#24
+	ror	w17,w17,#16
+	ushr	v19.4s,v28.4s,#24
+	ror	w19,w19,#16
+	ushr	v23.4s,v29.4s,#24
+	ror	w20,w20,#16
+	sli	v3.4s,v24.4s,#8
+	add	w15,w15,w21
+	sli	v7.4s,v25.4s,#8
+	add	w16,w16,w17
+	sli	v11.4s,v26.4s,#8
+	add	w13,w13,w19
+	sli	v15.4s,v27.4s,#8
+	add	w14,w14,w20
+	sli	v19.4s,v28.4s,#8
+	eor	w10,w10,w15
+	sli	v23.4s,v29.4s,#8
+	eor	w11,w11,w16
+	add	v2.4s,v2.4s,v3.4s
+	eor	w12,w12,w13
+	add	v6.4s,v6.4s,v7.4s
+	eor	w9,w9,w14
+	add	v10.4s,v10.4s,v11.4s
+	ror	w10,w10,#20
+	add	v14.4s,v14.4s,v15.4s
+	ror	w11,w11,#20
+	add	v18.4s,v18.4s,v19.4s
+	ror	w12,w12,#20
+	add	v22.4s,v22.4s,v23.4s
+	ror	w9,w9,#20
+	eor	v24.16b,v1.16b,v2.16b
+	add	w5,w5,w10
+	eor	v25.16b,v5.16b,v6.16b
+	add	w6,w6,w11
+	eor	v26.16b,v9.16b,v10.16b
+	add	w7,w7,w12
+	eor	v27.16b,v13.16b,v14.16b
+	add	w8,w8,w9
+	eor	v28.16b,v17.16b,v18.16b
+	eor	w21,w21,w5
+	eor	v29.16b,v21.16b,v22.16b
+	eor	w17,w17,w6
+	ushr	v1.4s,v24.4s,#25
+	eor	w19,w19,w7
+	ushr	v5.4s,v25.4s,#25
+	eor	w20,w20,w8
+	ushr	v9.4s,v26.4s,#25
+	ror	w21,w21,#24
+	ushr	v13.4s,v27.4s,#25
+	ror	w17,w17,#24
+	ushr	v17.4s,v28.4s,#25
+	ror	w19,w19,#24
+	ushr	v21.4s,v29.4s,#25
+	ror	w20,w20,#24
+	sli	v1.4s,v24.4s,#7
+	add	w15,w15,w21
+	sli	v5.4s,v25.4s,#7
+	add	w16,w16,w17
+	sli	v9.4s,v26.4s,#7
+	add	w13,w13,w19
+	sli	v13.4s,v27.4s,#7
+	add	w14,w14,w20
+	sli	v17.4s,v28.4s,#7
+	eor	w10,w10,w15
+	sli	v21.4s,v29.4s,#7
+	eor	w11,w11,w16
+	ext	v2.16b,v2.16b,v2.16b,#8
+	eor	w12,w12,w13
+	ext	v6.16b,v6.16b,v6.16b,#8
+	eor	w9,w9,w14
+	ext	v10.16b,v10.16b,v10.16b,#8
+	ror	w10,w10,#25
+	ext	v14.16b,v14.16b,v14.16b,#8
+	ror	w11,w11,#25
+	ext	v18.16b,v18.16b,v18.16b,#8
+	ror	w12,w12,#25
+	ext	v22.16b,v22.16b,v22.16b,#8
+	ror	w9,w9,#25
+	ext	v3.16b,v3.16b,v3.16b,#4
+	ext	v7.16b,v7.16b,v7.16b,#4
+	ext	v11.16b,v11.16b,v11.16b,#4
+	ext	v15.16b,v15.16b,v15.16b,#4
+	ext	v19.16b,v19.16b,v19.16b,#4
+	ext	v23.16b,v23.16b,v23.16b,#4
+	ext	v1.16b,v1.16b,v1.16b,#12
+	ext	v5.16b,v5.16b,v5.16b,#12
+	ext	v9.16b,v9.16b,v9.16b,#12
+	ext	v13.16b,v13.16b,v13.16b,#12
+	ext	v17.16b,v17.16b,v17.16b,#12
+	ext	v21.16b,v21.16b,v21.16b,#12
+	cbnz	x4,.Loop_lower_neon
+
+	add	w5,w5,w22		// accumulate key block
+	ldp	q24,q25,[sp,#0]
+	add	x6,x6,x22,lsr#32
+	ldp	q26,q27,[sp,#32]
+	add	w7,w7,w23
+	ldp	q28,q29,[sp,#64]
+	add	x8,x8,x23,lsr#32
+	add	v0.4s,v0.4s,v24.4s
+	add	w9,w9,w24
+	add	v4.4s,v4.4s,v24.4s
+	add	x10,x10,x24,lsr#32
+	add	v8.4s,v8.4s,v24.4s
+	add	w11,w11,w25
+	add	v12.4s,v12.4s,v24.4s
+	add	x12,x12,x25,lsr#32
+	add	v16.4s,v16.4s,v24.4s
+	add	w13,w13,w26
+	add	v20.4s,v20.4s,v24.4s
+	add	x14,x14,x26,lsr#32
+	add	v2.4s,v2.4s,v26.4s
+	add	w15,w15,w27
+	add	v6.4s,v6.4s,v26.4s
+	add	x16,x16,x27,lsr#32
+	add	v10.4s,v10.4s,v26.4s
+	add	w17,w17,w28
+	add	v14.4s,v14.4s,v26.4s
+	add	x19,x19,x28,lsr#32
+	add	v18.4s,v18.4s,v26.4s
+	add	w20,w20,w30
+	add	v22.4s,v22.4s,v26.4s
+	add	x21,x21,x30,lsr#32
+	add	v19.4s,v19.4s,v31.4s			// +4
+	add	x5,x5,x6,lsl#32	// pack
+	add	v23.4s,v23.4s,v31.4s			// +4
+	add	x7,x7,x8,lsl#32
+	add	v3.4s,v3.4s,v27.4s
+	ldp	x6,x8,[x1,#0]		// load input
+	add	v7.4s,v7.4s,v28.4s
+	add	x9,x9,x10,lsl#32
+	add	v11.4s,v11.4s,v29.4s
+	add	x11,x11,x12,lsl#32
+	add	v15.4s,v15.4s,v30.4s
+	ldp	x10,x12,[x1,#16]
+	add	v19.4s,v19.4s,v27.4s
+	add	x13,x13,x14,lsl#32
+	add	v23.4s,v23.4s,v28.4s
+	add	x15,x15,x16,lsl#32
+	add	v1.4s,v1.4s,v25.4s
+	ldp	x14,x16,[x1,#32]
+	add	v5.4s,v5.4s,v25.4s
+	add	x17,x17,x19,lsl#32
+	add	v9.4s,v9.4s,v25.4s
+	add	x20,x20,x21,lsl#32
+	add	v13.4s,v13.4s,v25.4s
+	ldp	x19,x21,[x1,#48]
+	add	v17.4s,v17.4s,v25.4s
+	add	x1,x1,#64
+	add	v21.4s,v21.4s,v25.4s
+
+#ifdef	__ARMEB__
+	rev	x5,x5
+	rev	x7,x7
+	rev	x9,x9
+	rev	x11,x11
+	rev	x13,x13
+	rev	x15,x15
+	rev	x17,x17
+	rev	x20,x20
+#endif
+	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
+	eor	x5,x5,x6
+	eor	x7,x7,x8
+	eor	x9,x9,x10
+	eor	x11,x11,x12
+	eor	x13,x13,x14
+	eor	v0.16b,v0.16b,v24.16b
+	eor	x15,x15,x16
+	eor	v1.16b,v1.16b,v25.16b
+	eor	x17,x17,x19
+	eor	v2.16b,v2.16b,v26.16b
+	eor	x20,x20,x21
+	eor	v3.16b,v3.16b,v27.16b
+	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
+
+	stp	x5,x7,[x0,#0]		// store output
+	add	x28,x28,#7			// increment counter
+	stp	x9,x11,[x0,#16]
+	stp	x13,x15,[x0,#32]
+	stp	x17,x20,[x0,#48]
+	add	x0,x0,#64
+	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
+
+	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
+	eor	v4.16b,v4.16b,v24.16b
+	eor	v5.16b,v5.16b,v25.16b
+	eor	v6.16b,v6.16b,v26.16b
+	eor	v7.16b,v7.16b,v27.16b
+	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
+
+	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
+	eor	v8.16b,v8.16b,v0.16b
+	ldp	q24,q25,[sp,#0]
+	eor	v9.16b,v9.16b,v1.16b
+	ldp	q26,q27,[sp,#32]
+	eor	v10.16b,v10.16b,v2.16b
+	eor	v11.16b,v11.16b,v3.16b
+	st1	{v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64
+
+	ld1	{v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64
+	eor	v12.16b,v12.16b,v4.16b
+	eor	v13.16b,v13.16b,v5.16b
+	eor	v14.16b,v14.16b,v6.16b
+	eor	v15.16b,v15.16b,v7.16b
+	st1	{v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64
+
+	ld1	{v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64
+	eor	v16.16b,v16.16b,v8.16b
+	eor	v17.16b,v17.16b,v9.16b
+	eor	v18.16b,v18.16b,v10.16b
+	eor	v19.16b,v19.16b,v11.16b
+	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
+
+	shl	v0.4s,v31.4s,#1			// 4 -> 8
+	eor	v20.16b,v20.16b,v12.16b
+	eor	v21.16b,v21.16b,v13.16b
+	eor	v22.16b,v22.16b,v14.16b
+	eor	v23.16b,v23.16b,v15.16b
+	st1	{v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
+
+	add	v27.4s,v27.4s,v0.4s			// += 8
+	add	v28.4s,v28.4s,v0.4s
+	add	v29.4s,v29.4s,v0.4s
+	add	v30.4s,v30.4s,v0.4s
+
+	b.hs	.Loop_outer_512_neon
+
+	adds	x2,x2,#512
+	ushr	v0.4s,v31.4s,#2			// 4 -> 1
+
+	ldp	d8,d9,[sp,#128+0]		// meet ABI requirements
+	ldp	d10,d11,[sp,#128+16]
+	ldp	d12,d13,[sp,#128+32]
+	ldp	d14,d15,[sp,#128+48]
+
+	stp	q24,q31,[sp,#0]		// wipe off-load area
+	stp	q24,q31,[sp,#32]
+	stp	q24,q31,[sp,#64]
+
+	b.eq	.Ldone_512_neon
+
+	cmp	x2,#192
+	sub	v27.4s,v27.4s,v0.4s			// -= 1
+	sub	v28.4s,v28.4s,v0.4s
+	sub	v29.4s,v29.4s,v0.4s
+	add	sp,sp,#128
+	b.hs	.Loop_outer_neon
+
+	eor	v25.16b,v25.16b,v25.16b
+	eor	v26.16b,v26.16b,v26.16b
+	eor	v27.16b,v27.16b,v27.16b
+	eor	v28.16b,v28.16b,v28.16b
+	eor	v29.16b,v29.16b,v29.16b
+	eor	v30.16b,v30.16b,v30.16b
+	b	.Loop_outer
+
+.Ldone_512_neon:
+	ldp	x19,x20,[x29,#16]
+	add	sp,sp,#128+64
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldp	x29,x30,[sp],#96
+.Labort_neon:
+	ret
+ENDPROC(chacha20_neon)
-- 
2.19.0

^ permalink raw reply related

* [PATCH net-next v4 03/20] zinc: ChaCha20 generic C implementation
From: Jason A. Donenfeld @ 2018-09-14 16:22 UTC (permalink / raw)
  To: linux-kernel, netdev, linux-crypto, davem, gregkh
  Cc: Jason A. Donenfeld, Samuel Neves, Andy Lutomirski,
	Jean-Philippe Aumasson
In-Reply-To: <20180914162240.7925-1-Jason@zx2c4.com>

This implements the ChaCha20 permutation as a single C statement, by way
of the comma operator, which the compiler is able to simplify
terrifically.

Information: https://cr.yp.to/chacha.html

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Cc: Samuel Neves <sneves@dei.uc.pt>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Jean-Philippe Aumasson <jeanphilippe.aumasson@gmail.com>
---
 include/zinc/chacha20.h      |  54 +++++++++++
 lib/zinc/Kconfig             |   5 ++
 lib/zinc/Makefile            |   4 +
 lib/zinc/chacha20/chacha20.c | 168 +++++++++++++++++++++++++++++++++++
 lib/zinc/main.c              |   5 ++
 5 files changed, 236 insertions(+)
 create mode 100644 include/zinc/chacha20.h
 create mode 100644 lib/zinc/chacha20/chacha20.c

diff --git a/include/zinc/chacha20.h b/include/zinc/chacha20.h
new file mode 100644
index 000000000000..3c2c2f72d88a
--- /dev/null
+++ b/include/zinc/chacha20.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#ifndef _ZINC_CHACHA20_H
+#define _ZINC_CHACHA20_H
+
+#include <asm/unaligned.h>
+#include <linux/simd.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+
+/*
+ * Size constants for ChaCha20 and HChaCha20.
+ *
+ * CHACHA20_KEY_SIZE, HCHACHA20_KEY_SIZE and HCHACHA20_NONCE_SIZE size the u8
+ * key/nonce arrays in the prototypes below, so they are byte counts.
+ * CHACHA20_BLOCK_SIZE is in bytes as well: CHACHA20_BLOCK_WORDS is derived
+ * from it by dividing by sizeof(u32).
+ * NOTE(review): CHACHA20_IV_SIZE is not used in this header; presumably it is
+ * a byte count too -- confirm against its users.
+ */
+enum {
+	CHACHA20_IV_SIZE = 16,
+	CHACHA20_KEY_SIZE = 32,
+	CHACHA20_BLOCK_SIZE = 64,
+	CHACHA20_BLOCK_WORDS = CHACHA20_BLOCK_SIZE / sizeof(u32),
+	HCHACHA20_KEY_SIZE = 32,
+	HCHACHA20_NONCE_SIZE = 16
+};
+
+/*
+ * ChaCha20 cipher state kept between calls.
+ *
+ * @key:     the 256-bit key as eight 32-bit words.
+ * @counter: the last four words of the cipher input block. chacha20_init()
+ *           zeroes counter[0] and counter[1] and stores the low and high
+ *           halves of the 64-bit nonce in counter[2] and counter[3];
+ *           chacha20() advances counter[0] by the number of blocks consumed.
+ */
+struct chacha20_ctx {
+	u32 key[8];
+	u32 counter[4];
+} __aligned(32);
+
+void chacha20_fpu_init(void);
+
+/*
+ * Prepare @state for a new stream.
+ *
+ * The 32-byte @key is read with get_unaligned_le32() as eight little-endian
+ * words. counter[0] and counter[1] are reset to zero, and the 64-bit @nonce
+ * is split across counter[2] (low 32 bits) and counter[3] (high 32 bits).
+ */
+static inline void chacha20_init(struct chacha20_ctx *state,
+				 const u8 key[CHACHA20_KEY_SIZE],
+				 const u64 nonce)
+{
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(state->key); ++i)
+		state->key[i] = get_unaligned_le32(key + i * sizeof(u32));
+
+	state->counter[0] = 0;
+	state->counter[1] = 0;
+	state->counter[2] = nonce & U32_MAX;
+	state->counter[3] = nonce >> 32;
+}
+void chacha20(struct chacha20_ctx *state, u8 *dst, const u8 *src, u32 len,
+	      simd_context_t simd_context);
+
+/* Derived key should be 32-bit aligned */
+void hchacha20(u8 derived_key[CHACHA20_KEY_SIZE],
+	       const u8 nonce[HCHACHA20_NONCE_SIZE],
+	       const u8 key[HCHACHA20_KEY_SIZE], simd_context_t simd_context);
+
+#endif /* _ZINC_CHACHA20_H */
diff --git a/lib/zinc/Kconfig b/lib/zinc/Kconfig
index 5980c411af0d..e7d396d61607 100644
--- a/lib/zinc/Kconfig
+++ b/lib/zinc/Kconfig
@@ -1,6 +1,11 @@
 config ZINC
 	tristate
 
+config ZINC_CHACHA20
+	bool
+	select ZINC
+	select CRYPTO_ALGAPI
+
 config ZINC_DEBUG
 	bool "Zinc cryptography library debugging and self-tests"
 	depends on ZINC
diff --git a/lib/zinc/Makefile b/lib/zinc/Makefile
index dad47573de42..0b5a964bfba6 100644
--- a/lib/zinc/Makefile
+++ b/lib/zinc/Makefile
@@ -3,6 +3,10 @@ ccflags-y += -Wframe-larger-than=8192
 ccflags-y += -D'pr_fmt(fmt)=KBUILD_MODNAME ": " fmt'
 ccflags-$(CONFIG_ZINC_DEBUG) += -DDEBUG
 
+ifeq ($(CONFIG_ZINC_CHACHA20),y)
+zinc-y += chacha20/chacha20.o
+endif
+
 zinc-y += main.o
 
 obj-$(CONFIG_ZINC) := zinc.o
diff --git a/lib/zinc/chacha20/chacha20.c b/lib/zinc/chacha20/chacha20.c
new file mode 100644
index 000000000000..1d9168e6c142
--- /dev/null
+++ b/lib/zinc/chacha20/chacha20.c
@@ -0,0 +1,168 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ *
+ * Implementation of the ChaCha20 stream cipher.
+ *
+ * Information: https://cr.yp.to/chacha.html
+ */
+
+#include <zinc/chacha20.h>
+
+#include <linux/kernel.h>
+#include <crypto/algapi.h>
+
+/*
+ * Fallbacks for builds in which nothing has defined
+ * HAVE_CHACHA20_ARCH_IMPLEMENTATION: there is no state to initialise, and
+ * the *_arch() helpers report "not handled" by returning false, which makes
+ * chacha20() and hchacha20() fall through to the generic C code below.
+ */
+#ifndef HAVE_CHACHA20_ARCH_IMPLEMENTATION
+void __init chacha20_fpu_init(void)
+{
+}
+static inline bool chacha20_arch(u8 *out, const u8 *in, const size_t len,
+				 const u32 key[8], const u32 counter[4],
+				 simd_context_t simd_context)
+{
+	return false;
+}
+static inline bool hchacha20_arch(u8 *derived_key, const u8 *nonce,
+				  const u8 *key, simd_context_t simd_context)
+{
+	return false;
+}
+#endif
+
+/*
+ * The first four words of the cipher input block: the ASCII string
+ * "expand 32-byte k" read as four little-endian 32-bit words.
+ */
+#define EXPAND_32_BYTE_K 0x61707865U, 0x3320646eU, 0x79622d32U, 0x6b206574U
+
+/*
+ * One quarter round on words x[a], x[b], x[c], x[d]: four add/xor/rotate
+ * steps with left-rotations of 16, 12, 8 and 7 bits. Written as a single
+ * comma expression so that the round macros below can be chained into one
+ * statement; the order of the sub-expressions is significant.
+ */
+#define QUARTER_ROUND(x, a, b, c, d) ( \
+	x[a] += x[b], \
+	x[d] = rol32((x[d] ^ x[a]), 16), \
+	x[c] += x[d], \
+	x[b] = rol32((x[b] ^ x[c]), 12), \
+	x[a] += x[b], \
+	x[d] = rol32((x[d] ^ x[a]), 8), \
+	x[c] += x[d], \
+	x[b] = rol32((x[b] ^ x[c]), 7) \
+)
+
+/*
+ * Flat index of row i, column j when the 16 state words are viewed as a
+ * 4x4 matrix. The arguments are not parenthesised, so only pass plain
+ * constants (as every use below does).
+ */
+#define C(i, j) (i * 4 + j)
+
+/*
+ * Two rounds: a quarter round on each of the four columns, followed by a
+ * quarter round on each of the four diagonals.
+ */
+#define DOUBLE_ROUND(x) ( \
+	/* Column Round */ \
+	QUARTER_ROUND(x, C(0, 0), C(1, 0), C(2, 0), C(3, 0)), \
+	QUARTER_ROUND(x, C(0, 1), C(1, 1), C(2, 1), C(3, 1)), \
+	QUARTER_ROUND(x, C(0, 2), C(1, 2), C(2, 2), C(3, 2)), \
+	QUARTER_ROUND(x, C(0, 3), C(1, 3), C(2, 3), C(3, 3)), \
+	/* Diagonal Round */ \
+	QUARTER_ROUND(x, C(0, 0), C(1, 1), C(2, 2), C(3, 3)), \
+	QUARTER_ROUND(x, C(0, 1), C(1, 2), C(2, 3), C(3, 0)), \
+	QUARTER_ROUND(x, C(0, 2), C(1, 3), C(2, 0), C(3, 1)), \
+	QUARTER_ROUND(x, C(0, 3), C(1, 0), C(2, 1), C(3, 2)) \
+)
+
+/* The full permutation: ten double rounds, applied to x in place. */
+#define TWENTY_ROUNDS(x) ( \
+	DOUBLE_ROUND(x), \
+	DOUBLE_ROUND(x), \
+	DOUBLE_ROUND(x), \
+	DOUBLE_ROUND(x), \
+	DOUBLE_ROUND(x), \
+	DOUBLE_ROUND(x), \
+	DOUBLE_ROUND(x), \
+	DOUBLE_ROUND(x), \
+	DOUBLE_ROUND(x), \
+	DOUBLE_ROUND(x) \
+)
+
+/*
+ * Produce one 64-byte keystream block.
+ *
+ * A scratch copy of the 16-word @state is run through the twenty rounds, the
+ * original state is added back in word by word, and the sums are written to
+ * @stream as little-endian words. Word 12 of @state is then incremented so
+ * that the next call yields the next block.
+ */
+static void chacha20_block_generic(__le32 *stream, u32 *state)
+{
+	u32 work[CHACHA20_BLOCK_WORDS];
+	unsigned int word;
+
+	memcpy(work, state, sizeof(work));
+
+	TWENTY_ROUNDS(work);
+
+	for (word = 0; word < CHACHA20_BLOCK_WORDS; word++)
+		stream[word] = cpu_to_le32(work[word] + state[word]);
+
+	state[12] += 1;
+}
+
+/*
+ * Portable C fallback: XOR @len bytes of @in with the keystream into @out.
+ *
+ * The input is first copied to @out (memmove(), so the buffers may overlap;
+ * skipped when operating in place) and the keystream is then XORed over @out
+ * one block at a time, the last block possibly being partial. The cipher
+ * input block is assembled in a local array, so @key and @counter are only
+ * read; advancing the caller's counter is left to chacha20().
+ */
+static void chacha20_generic(u8 *out, const u8 *in, u32 len, const u32 key[8],
+			     const u32 counter[4])
+{
+	__le32 keystream[CHACHA20_BLOCK_WORDS];
+	u32 block[] = {
+		EXPAND_32_BYTE_K,
+		key[0], key[1], key[2], key[3],
+		key[4], key[5], key[6], key[7],
+		counter[0], counter[1], counter[2], counter[3]
+	};
+	u32 chunk;
+
+	if (out != in)
+		memmove(out, in, len);
+
+	for (; len; len -= chunk, out += chunk) {
+		/* Full blocks first; the final chunk may be shorter. */
+		chunk = len < CHACHA20_BLOCK_SIZE ? len : CHACHA20_BLOCK_SIZE;
+		chacha20_block_generic(keystream, block);
+		crypto_xor(out, (u8 *)keystream, chunk);
+	}
+}
+
+/*
+ * Encrypt or decrypt @len bytes from @src into @dst with the key, nonce and
+ * block counter held in @state.
+ *
+ * The architecture-specific implementation is tried first; if it declines
+ * (returns false) the generic C code is used. Neither path modifies @state,
+ * so the block counter is advanced here by the number of 64-byte blocks
+ * consumed, a trailing partial block counting as a whole one.
+ */
+void chacha20(struct chacha20_ctx *state, u8 *dst, const u8 *src, u32 len,
+	      simd_context_t simd_context)
+{
+	if (!chacha20_arch(dst, src, len, state->key, state->counter,
+			   simd_context))
+		chacha20_generic(dst, src, len, state->key, state->counter);
+
+	/*
+	 * Round up to whole blocks without forming len + 63: that sum wraps
+	 * around in u32 arithmetic for len > U32_MAX - 63, which would leave
+	 * the counter unchanged and make the next call reuse keystream.
+	 */
+	state->counter[0] += len / CHACHA20_BLOCK_SIZE +
+			     !!(len % CHACHA20_BLOCK_SIZE);
+}
+EXPORT_SYMBOL(chacha20);
+
+/*
+ * Portable C implementation of HChaCha20.
+ *
+ * Builds the input block from the constants, the 32-byte @key and the
+ * 16-byte @nonce (both read as unaligned little-endian words), applies the
+ * twenty rounds, and writes words 0-3 and 12-15 of the result to
+ * @derived_key as little-endian words. Unlike the stream cipher's block
+ * function, the input block is not added back in afterwards.
+ */
+static void hchacha20_generic(u8 derived_key[CHACHA20_KEY_SIZE],
+			      const u8 nonce[HCHACHA20_NONCE_SIZE],
+			      const u8 key[HCHACHA20_KEY_SIZE])
+{
+	u32 x[] = { EXPAND_32_BYTE_K,
+		    get_unaligned_le32(key + 0),
+		    get_unaligned_le32(key + 4),
+		    get_unaligned_le32(key + 8),
+		    get_unaligned_le32(key + 12),
+		    get_unaligned_le32(key + 16),
+		    get_unaligned_le32(key + 20),
+		    get_unaligned_le32(key + 24),
+		    get_unaligned_le32(key + 28),
+		    get_unaligned_le32(nonce + 0),
+		    get_unaligned_le32(nonce + 4),
+		    get_unaligned_le32(nonce + 8),
+		    get_unaligned_le32(nonce + 12)
+	};
+
+	TWENTY_ROUNDS(x);
+
+	/*
+	 * Store with put_unaligned_le32() rather than through a cast to
+	 * __le32 *, so this path does not depend on the caller having aligned
+	 * the u8 output buffer.
+	 */
+	put_unaligned_le32(x[0], derived_key + 0);
+	put_unaligned_le32(x[1], derived_key + 4);
+	put_unaligned_le32(x[2], derived_key + 8);
+	put_unaligned_le32(x[3], derived_key + 12);
+	put_unaligned_le32(x[12], derived_key + 16);
+	put_unaligned_le32(x[13], derived_key + 20);
+	put_unaligned_le32(x[14], derived_key + 24);
+	put_unaligned_le32(x[15], derived_key + 28);
+}
+
+/*
+ * Derive a 32-byte key from @key and the 16-byte @nonce with HChaCha20,
+ * using the architecture-specific implementation when it accepts the
+ * request and the generic C code otherwise.
+ *
+ * Derived key should be 32-bit aligned.
+ */
+void hchacha20(u8 derived_key[CHACHA20_KEY_SIZE],
+	       const u8 nonce[HCHACHA20_NONCE_SIZE],
+	       const u8 key[HCHACHA20_KEY_SIZE], simd_context_t simd_context)
+{
+	if (hchacha20_arch(derived_key, nonce, key, simd_context))
+		return;
+
+	hchacha20_generic(derived_key, nonce, key);
+}
+/* Deliberately not EXPORT_SYMBOL'd, since there are few reasons why somebody
+ * should be using this directly, rather than via xchacha20. Revisit only in
+ * the unlikely event that somebody has a good reason to export this.
+ */
diff --git a/lib/zinc/main.c b/lib/zinc/main.c
index ceece33ff5a7..7e8e84b706b7 100644
--- a/lib/zinc/main.c
+++ b/lib/zinc/main.c
@@ -3,6 +3,8 @@
  * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
  */
 
+#include <zinc/chacha20.h>
+
 #include <linux/init.h>
 #include <linux/module.h>
 
@@ -17,6 +19,9 @@
 
 static int __init mod_init(void)
 {
+#ifdef CONFIG_ZINC_CHACHA20
+	chacha20_fpu_init();
+#endif
 	return 0;
 }
 
-- 
2.19.0

^ permalink raw reply related

* [PATCH net-next v4 02/20] zinc: introduce minimal cryptography library
From: Jason A. Donenfeld @ 2018-09-14 16:22 UTC (permalink / raw)
  To: linux-kernel, netdev, linux-crypto, davem, gregkh
  Cc: Jason A. Donenfeld, Samuel Neves, Andy Lutomirski,
	Jean-Philippe Aumasson
In-Reply-To: <20180914162240.7925-1-Jason@zx2c4.com>

Zinc stands for "Zinc Is Neat Crypto" or "Zinc as IN Crypto" or maybe
just "Zx2c4's INsane Cryptolib." It's also short, easy to type, and
plays nicely with the recent trend of naming crypto libraries after
elements. The guiding principle is "don't overdo it". It's less of a
library and more of a directory tree for organizing well-curated direct
implementations of cryptography primitives.

Zinc is a new cryptography API that is much more minimal and lower-level
than the current one. It intends to complement it and provide a basis
upon which the current crypto API might build, as the provider of
software implementations of cryptographic primitives. It is motivated by
three primary observations in crypto API design:

  * Highly composable "cipher modes" and related abstractions from
    90s cryptographers did not turn out to be as terrific an idea as
    hoped, leading to a host of API misuse problems.

  * Most programmers are afraid of crypto code, and so prefer to
    integrate it into libraries in a highly abstracted manner, so as to
    shield themselves from implementation details. Cryptographers, on
    the other hand, prefer simple direct implementations, which they're
    able to verify for high assurance and optimize in accordance with
    their expertise.

  * Overly abstracted and flexible cryptography APIs lead to a host of
    dangerous problems and performance issues. The kernel is usually in
    the business not of coming up with new uses of crypto, but rather of
    implementing various constructions, which means it essentially
    needs a library of primitives, not a highly abstracted enterprise-ready
    pluggable system, with a few particular exceptions.

This last observation has played out over and over again within the
kernel:

  * The perennial move of actual primitives away from crypto/ and into
    lib/, so that users can actually call these functions directly with
    no overhead and without lots of allocations, function pointers,
    string specifier parsing, and general clunkiness. For example:
    sha256, chacha20, siphash, sha1, and so forth live in lib/ rather
    than in crypto/. Zinc intends to stop the cluttering of lib/ and
    introduce these direct primitives into their proper place, lib/zinc/.

  * An abundance of misuse bugs with the present crypto API that have
    been very unpleasant to clean up.

  * A hesitance to even use cryptography, because of the overhead and
    headaches involved in accessing the routines.

Zinc goes in a rather different direction. Rather than providing a
thoroughly designed and abstracted API, Zinc gives you simple functions,
which implement some primitive, or some particular and specific
construction of primitives. It is not dynamic in the least, though one
could imagine implementing a complex dynamic dispatch mechanism (such as
the current crypto API) on top of these basic functions. After all,
dynamic dispatch is usually needed for applications with cipher agility,
such as IPsec, dm-crypt, AF_ALG, and so forth, and the existing crypto
API will continue to play that role. However, Zinc will provide a non-
haphazard way of directly utilizing crypto routines in applications
that have neither the need nor the desire for abstraction and dynamic
dispatch.

It also organizes the implementations in a simple, straight-forward,
and direct manner, making it enjoyable and intuitive to work on.
Rather than moving optimized assembly implementations into arch/, it
keeps them all together in lib/zinc/, making it simple and obvious to
compare and contrast what's happening. This is, notably, exactly what
the lib/raid6/ tree does, and that seems to work out rather well. It's
also the pattern of most successful crypto libraries. The architecture-
specific glue-code is made a part of each translation unit, rather than
being in a separate one, so that generic and architecture-optimized code
are combined at compile-time, and incompatibility branches compiled out by
the optimizer.

All implementations have been extensively tested and fuzzed, and are
selected for their quality, trustworthiness, and performance. Wherever
possible and performant, formally verified implementations are used,
such as those from HACL* [1] and Fiat-Crypto [2]. The routines also take
special care to zero out secrets using memzero_explicit (and future work
is planned to have gcc do this more reliably and performantly with
compiler plugins). The performance of the selected implementations is
state-of-the-art and unrivaled on a broad array of hardware, though of
course we will continue to fine tune these to the hardware demands
needed by kernel contributors. Each implementation also comes with
extensive self-tests and crafted test vectors, pulled from various
places such as Wycheproof [9].

Regularity of function signatures is important, so that users can easily
"guess" the name of the function they want. Though, individual
primitives are oftentimes not trivially interchangeable, having been
designed for different things and requiring different parameters and
semantics, and so the function signatures they provide will directly
reflect the realities of the primitives' usages, rather than hiding it
behind (inevitably leaky) abstractions. Also, in contrast to the current
crypto API, Zinc functions can work on stack buffers, and can be called
with different keys, without requiring allocations or locking.

SIMD is used automatically when available, though some routines may
benefit from either having their SIMD disabled for particular
invocations, or to have the SIMD initialization calls amortized over
several invocations of the function, and so Zinc utilizes function
signatures enabling that in conjunction with the recently introduced
simd_context_t.

More generally, Zinc provides function signatures that allow just what
is required by the various callers. This isn't to say that users of the
functions will be permitted to pollute the function semantics with weird
particular needs, but we are trying very hard not to overdo it, and that
means looking carefully at what's actually necessary, and doing just that,
and not much more than that. Remember: practicality and cleanliness rather
than over-zealous infrastructure.

Zinc also provides an opening for the best implementers in academia to
contribute their time and effort to the kernel, by being sufficiently
simple and inviting. In discussing this commit with some of the best and
brightest over the last few years, there are many who are eager to
devote rare talent and energy to this effort.

Following the merging of this, I expect the primitives that
currently exist in lib/ to work their way into lib/zinc/, after intense
scrutiny of each implementation, potentially replacing them with either
formally-verified implementations, or better studied and faster
state-of-the-art implementations.

Also following the merging of this, I expect the old crypto API
implementations to be ported over to use Zinc for their software-based
implementations.

As Zinc is simply library code, its config options are un-menued, with
the exception of CONFIG_ZINC_DEBUG, which enables various selftests and
BUG_ONs.

[1] https://github.com/project-everest/hacl-star
[2] https://github.com/mit-plv/fiat-crypto
[3] https://cr.yp.to/ecdh.html
[4] https://cr.yp.to/chacha.html
[5] https://cr.yp.to/snuffle/xsalsa-20081128.pdf
[6] https://cr.yp.to/mac.html
[7] https://blake2.net/
[8] https://tools.ietf.org/html/rfc8439
[9] https://github.com/google/wycheproof

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Cc: Samuel Neves <sneves@dei.uc.pt>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Jean-Philippe Aumasson <jeanphilippe.aumasson@gmail.com>
---
 MAINTAINERS       |  8 ++++++++
 lib/Kconfig       |  2 ++
 lib/Makefile      |  2 ++
 lib/zinc/Kconfig  | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 lib/zinc/Makefile |  8 ++++++++
 lib/zinc/main.c   | 31 +++++++++++++++++++++++++++++++
 6 files changed, 97 insertions(+)
 create mode 100644 lib/zinc/Kconfig
 create mode 100644 lib/zinc/Makefile
 create mode 100644 lib/zinc/main.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 2ef884b883c3..d2092e52320d 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -16160,6 +16160,14 @@ Q:	https://patchwork.linuxtv.org/project/linux-media/list/
 S:	Maintained
 F:	drivers/media/dvb-frontends/zd1301_demod*
 
+ZINC CRYPTOGRAPHY LIBRARY
+M:	Jason A. Donenfeld <Jason@zx2c4.com>
+M:	Samuel Neves <sneves@dei.uc.pt>
+S:	Maintained
+F:	lib/zinc/
+F:	include/zinc/
+L:	linux-crypto@vger.kernel.org
+
 ZPOOL COMPRESSED PAGE STORAGE API
 M:	Dan Streetman <ddstreet@ieee.org>
 L:	linux-mm@kvack.org
diff --git a/lib/Kconfig b/lib/Kconfig
index a3928d4438b5..3e6848269c66 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -485,6 +485,8 @@ config GLOB_SELFTEST
 	  module load) by a small amount, so you're welcome to play with
 	  it, but you probably don't need it.
 
+source "lib/zinc/Kconfig"
+
 #
 # Netlink attribute parsing support is select'ed if needed
 #
diff --git a/lib/Makefile b/lib/Makefile
index ca3f7ebb900d..3f16e35d2c11 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -214,6 +214,8 @@ obj-$(CONFIG_PERCPU_TEST) += percpu_test.o
 
 obj-$(CONFIG_ASN1) += asn1_decoder.o
 
+obj-$(CONFIG_ZINC) += zinc/
+
 obj-$(CONFIG_FONT_SUPPORT) += fonts/
 
 obj-$(CONFIG_PRIME_NUMBERS) += prime_numbers.o
diff --git a/lib/zinc/Kconfig b/lib/zinc/Kconfig
new file mode 100644
index 000000000000..5980c411af0d
--- /dev/null
+++ b/lib/zinc/Kconfig
@@ -0,0 +1,46 @@
+config ZINC
+	tristate
+
+config ZINC_DEBUG
+	bool "Zinc cryptography library debugging and self-tests"
+	depends on ZINC
+	help
+	  This builds a series of self-tests for the Zinc crypto library, which
+	  help diagnose any cryptographic algorithm implementation issues that
+	  might be at the root of potential bugs. It also adds various
+	  debugging traps.
+
+	  Unless you're developing and testing cryptographic routines, or are
+	  especially paranoid about correctness on your hardware, you may say
+	  N here.
+
+config ZINC_ARCH_ARM
+	def_bool y
+	depends on ARM
+	depends on ZINC
+	imply VFP
+	imply VFPv3 if CPU_V7
+	imply NEON if CPU_V7
+	imply KERNEL_MODE_NEON if CPU_V7
+
+config ZINC_ARCH_ARM64
+	def_bool y
+	depends on ARM64
+	depends on ZINC
+
+config ZINC_ARCH_X86_64
+	def_bool y
+	depends on X86_64
+	depends on !UML
+	depends on ZINC
+
+config ZINC_ARCH_MIPS
+	def_bool y
+	depends on MIPS
+	depends on ZINC
+
+config ZINC_ARCH_MIPS64
+	def_bool y
+	depends on MIPS
+	depends on 64BIT
+	depends on ZINC
diff --git a/lib/zinc/Makefile b/lib/zinc/Makefile
new file mode 100644
index 000000000000..dad47573de42
--- /dev/null
+++ b/lib/zinc/Makefile
@@ -0,0 +1,8 @@
+ccflags-y := -O3
+ccflags-y += -Wframe-larger-than=8192
+ccflags-y += -D'pr_fmt(fmt)=KBUILD_MODNAME ": " fmt'
+ccflags-$(CONFIG_ZINC_DEBUG) += -DDEBUG
+
+zinc-y += main.o
+
+obj-$(CONFIG_ZINC) := zinc.o
diff --git a/lib/zinc/main.c b/lib/zinc/main.c
new file mode 100644
index 000000000000..ceece33ff5a7
--- /dev/null
+++ b/lib/zinc/main.c
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+
+#ifdef DEBUG
+#define selftest(which) do { \
+	if (!which ## _selftest()) \
+		return -ENOTRECOVERABLE; \
+} while (0)
+#else
+#define selftest(which)
+#endif
+
+static int __init mod_init(void)
+{
+	return 0;
+}
+
+static void __exit mod_exit(void)
+{
+}
+
+module_init(mod_init);
+module_exit(mod_exit);
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("Zinc cryptography library");
+MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
-- 
2.19.0

^ permalink raw reply related

* [PATCH net-next v4 01/20] asm: simd context helper API
From: Jason A. Donenfeld @ 2018-09-14 16:22 UTC (permalink / raw)
  To: linux-kernel, netdev, linux-crypto, davem, gregkh
  Cc: Jason A. Donenfeld, Samuel Neves, Andy Lutomirski,
	Thomas Gleixner, linux-arch
In-Reply-To: <20180914162240.7925-1-Jason@zx2c4.com>

Sometimes it's useful to amortize calls to XSAVE/XRSTOR and the related
FPU/SIMD functions over a number of calls, because FPU restoration is
quite expensive. This adds a simple header for carrying out this pattern:

    simd_context_t simd_context = simd_get();
    while ((item = get_item_from_queue()) != NULL) {
        encrypt_item(item, simd_context);
        simd_context = simd_relax(simd_context);
    }
    simd_put(simd_context);

The relaxation step ensures that we don't trample over preemption, and
the get/put API should be a familiar paradigm in the kernel.

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Reviewed-by: Palmer Dabbelt <palmer@sifive.com>
Cc: Samuel Neves <sneves@dei.uc.pt>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: linux-arch@vger.kernel.org
---
 arch/alpha/include/asm/Kbuild      |  5 ++--
 arch/arc/include/asm/Kbuild        |  1 +
 arch/arm/include/asm/simd.h        | 42 ++++++++++++++++++++++++++++++
 arch/arm64/include/asm/simd.h      | 37 +++++++++++++++++++++-----
 arch/c6x/include/asm/Kbuild        |  3 ++-
 arch/h8300/include/asm/Kbuild      |  3 ++-
 arch/hexagon/include/asm/Kbuild    |  1 +
 arch/ia64/include/asm/Kbuild       |  1 +
 arch/m68k/include/asm/Kbuild       |  1 +
 arch/microblaze/include/asm/Kbuild |  1 +
 arch/mips/include/asm/Kbuild       |  1 +
 arch/nds32/include/asm/Kbuild      |  7 ++---
 arch/nios2/include/asm/Kbuild      |  1 +
 arch/openrisc/include/asm/Kbuild   |  7 ++---
 arch/parisc/include/asm/Kbuild     |  1 +
 arch/powerpc/include/asm/Kbuild    |  3 ++-
 arch/riscv/include/asm/Kbuild      |  3 ++-
 arch/s390/include/asm/Kbuild       |  3 ++-
 arch/sh/include/asm/Kbuild         |  1 +
 arch/sparc/include/asm/Kbuild      |  1 +
 arch/um/include/asm/Kbuild         |  3 ++-
 arch/unicore32/include/asm/Kbuild  |  1 +
 arch/x86/include/asm/simd.h        | 30 ++++++++++++++++++++-
 arch/xtensa/include/asm/Kbuild     |  1 +
 include/asm-generic/simd.h         | 15 +++++++++++
 include/linux/simd.h               | 28 ++++++++++++++++++++
 26 files changed, 180 insertions(+), 21 deletions(-)
 create mode 100644 arch/arm/include/asm/simd.h
 create mode 100644 include/linux/simd.h

diff --git a/arch/alpha/include/asm/Kbuild b/arch/alpha/include/asm/Kbuild
index 0580cb8c84b2..07b2c1025d34 100644
--- a/arch/alpha/include/asm/Kbuild
+++ b/arch/alpha/include/asm/Kbuild
@@ -2,14 +2,15 @@
 
 
 generic-y += compat.h
+generic-y += current.h
 generic-y += exec.h
 generic-y += export.h
 generic-y += fb.h
 generic-y += irq_work.h
+generic-y += kprobes.h
 generic-y += mcs_spinlock.h
 generic-y += mm-arch-hooks.h
 generic-y += preempt.h
 generic-y += sections.h
+generic-y += simd.h
 generic-y += trace_clock.h
-generic-y += current.h
-generic-y += kprobes.h
diff --git a/arch/arc/include/asm/Kbuild b/arch/arc/include/asm/Kbuild
index feed50ce89fa..a7f4255f1649 100644
--- a/arch/arc/include/asm/Kbuild
+++ b/arch/arc/include/asm/Kbuild
@@ -22,6 +22,7 @@ generic-y += parport.h
 generic-y += pci.h
 generic-y += percpu.h
 generic-y += preempt.h
+generic-y += simd.h
 generic-y += topology.h
 generic-y += trace_clock.h
 generic-y += user.h
diff --git a/arch/arm/include/asm/simd.h b/arch/arm/include/asm/simd.h
new file mode 100644
index 000000000000..bf468993bbef
--- /dev/null
+++ b/arch/arm/include/asm/simd.h
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <linux/simd.h>
+#ifndef _ASM_SIMD_H
+#define _ASM_SIMD_H
+
+static __must_check inline bool may_use_simd(void)
+{
+	return !in_interrupt();
+}
+
+#ifdef CONFIG_KERNEL_MODE_NEON
+#include <asm/neon.h>
+
+static inline simd_context_t simd_get(void)
+{
+	bool have_simd = may_use_simd();
+	if (have_simd)
+		kernel_neon_begin();
+	return have_simd ? HAVE_FULL_SIMD : HAVE_NO_SIMD;
+}
+
+static inline void simd_put(simd_context_t prior_context)
+{
+	if (prior_context != HAVE_NO_SIMD)
+		kernel_neon_end();
+}
+#else
+static inline simd_context_t simd_get(void)
+{
+	return HAVE_NO_SIMD;
+}
+
+static inline void simd_put(simd_context_t prior_context)
+{
+}
+#endif
+
+#endif /* _ASM_SIMD_H */
diff --git a/arch/arm64/include/asm/simd.h b/arch/arm64/include/asm/simd.h
index 6495cc51246f..058c336de38d 100644
--- a/arch/arm64/include/asm/simd.h
+++ b/arch/arm64/include/asm/simd.h
@@ -1,11 +1,10 @@
-/*
- * Copyright (C) 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
+/* SPDX-License-Identifier: GPL-2.0
  *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
+ * Copyright (C) 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
  */
 
+#include <linux/simd.h>
 #ifndef __ASM_SIMD_H
 #define __ASM_SIMD_H
 
@@ -16,6 +15,8 @@
 #include <linux/types.h>
 
 #ifdef CONFIG_KERNEL_MODE_NEON
+#include <asm/neon.h>
+#include <asm/simd.h>
 
 DECLARE_PER_CPU(bool, kernel_neon_busy);
 
@@ -40,12 +41,36 @@ static __must_check inline bool may_use_simd(void)
 		!this_cpu_read(kernel_neon_busy);
 }
 
+static inline simd_context_t simd_get(void)
+{
+	bool have_simd = may_use_simd();
+	if (have_simd)
+		kernel_neon_begin();
+	return have_simd ? HAVE_FULL_SIMD : HAVE_NO_SIMD;
+}
+
+static inline void simd_put(simd_context_t prior_context)
+{
+	if (prior_context != HAVE_NO_SIMD)
+		kernel_neon_end();
+}
+
 #else /* ! CONFIG_KERNEL_MODE_NEON */
 
-static __must_check inline bool may_use_simd(void) {
+static __must_check inline bool may_use_simd(void)
+{
 	return false;
 }
 
+static inline simd_context_t simd_get(void)
+{
+	return HAVE_NO_SIMD;
+}
+
+static inline void simd_put(simd_context_t prior_context)
+{
+}
+
 #endif /* ! CONFIG_KERNEL_MODE_NEON */
 
 #endif
diff --git a/arch/c6x/include/asm/Kbuild b/arch/c6x/include/asm/Kbuild
index 33a2c94fed0d..22f3d8333c74 100644
--- a/arch/c6x/include/asm/Kbuild
+++ b/arch/c6x/include/asm/Kbuild
@@ -5,8 +5,8 @@ generic-y += compat.h
 generic-y += current.h
 generic-y += device.h
 generic-y += div64.h
-generic-y += dma.h
 generic-y += dma-mapping.h
+generic-y += dma.h
 generic-y += emergency-restart.h
 generic-y += exec.h
 generic-y += extable.h
@@ -30,6 +30,7 @@ generic-y += pgalloc.h
 generic-y += preempt.h
 generic-y += segment.h
 generic-y += serial.h
+generic-y += simd.h
 generic-y += tlbflush.h
 generic-y += topology.h
 generic-y += trace_clock.h
diff --git a/arch/h8300/include/asm/Kbuild b/arch/h8300/include/asm/Kbuild
index a5d0b2991f47..f5c2f12d593e 100644
--- a/arch/h8300/include/asm/Kbuild
+++ b/arch/h8300/include/asm/Kbuild
@@ -8,8 +8,8 @@ generic-y += current.h
 generic-y += delay.h
 generic-y += device.h
 generic-y += div64.h
-generic-y += dma.h
 generic-y += dma-mapping.h
+generic-y += dma.h
 generic-y += emergency-restart.h
 generic-y += exec.h
 generic-y += extable.h
@@ -39,6 +39,7 @@ generic-y += preempt.h
 generic-y += scatterlist.h
 generic-y += sections.h
 generic-y += serial.h
+generic-y += simd.h
 generic-y += sizes.h
 generic-y += spinlock.h
 generic-y += timex.h
diff --git a/arch/hexagon/include/asm/Kbuild b/arch/hexagon/include/asm/Kbuild
index dd2fd9c0d292..217d4695fd8a 100644
--- a/arch/hexagon/include/asm/Kbuild
+++ b/arch/hexagon/include/asm/Kbuild
@@ -29,6 +29,7 @@ generic-y += rwsem.h
 generic-y += sections.h
 generic-y += segment.h
 generic-y += serial.h
+generic-y += simd.h
 generic-y += sizes.h
 generic-y += topology.h
 generic-y += trace_clock.h
diff --git a/arch/ia64/include/asm/Kbuild b/arch/ia64/include/asm/Kbuild
index 557bbc8ba9f5..41c5ebdf79e5 100644
--- a/arch/ia64/include/asm/Kbuild
+++ b/arch/ia64/include/asm/Kbuild
@@ -4,6 +4,7 @@ generic-y += irq_work.h
 generic-y += mcs_spinlock.h
 generic-y += mm-arch-hooks.h
 generic-y += preempt.h
+generic-y += simd.h
 generic-y += trace_clock.h
 generic-y += vtime.h
 generic-y += word-at-a-time.h
diff --git a/arch/m68k/include/asm/Kbuild b/arch/m68k/include/asm/Kbuild
index a4b8d3331a9e..73898dd1a4d0 100644
--- a/arch/m68k/include/asm/Kbuild
+++ b/arch/m68k/include/asm/Kbuild
@@ -19,6 +19,7 @@ generic-y += mm-arch-hooks.h
 generic-y += percpu.h
 generic-y += preempt.h
 generic-y += sections.h
+generic-y += simd.h
 generic-y += spinlock.h
 generic-y += topology.h
 generic-y += trace_clock.h
diff --git a/arch/microblaze/include/asm/Kbuild b/arch/microblaze/include/asm/Kbuild
index 569ba9e670c1..7a877eea99d3 100644
--- a/arch/microblaze/include/asm/Kbuild
+++ b/arch/microblaze/include/asm/Kbuild
@@ -25,6 +25,7 @@ generic-y += parport.h
 generic-y += percpu.h
 generic-y += preempt.h
 generic-y += serial.h
+generic-y += simd.h
 generic-y += syscalls.h
 generic-y += topology.h
 generic-y += trace_clock.h
diff --git a/arch/mips/include/asm/Kbuild b/arch/mips/include/asm/Kbuild
index 58351e48421e..e8868e0fb2c3 100644
--- a/arch/mips/include/asm/Kbuild
+++ b/arch/mips/include/asm/Kbuild
@@ -16,6 +16,7 @@ generic-y += qrwlock.h
 generic-y += qspinlock.h
 generic-y += sections.h
 generic-y += segment.h
+generic-y += simd.h
 generic-y += trace_clock.h
 generic-y += unaligned.h
 generic-y += user.h
diff --git a/arch/nds32/include/asm/Kbuild b/arch/nds32/include/asm/Kbuild
index dbc4e5422550..603c1d020620 100644
--- a/arch/nds32/include/asm/Kbuild
+++ b/arch/nds32/include/asm/Kbuild
@@ -7,14 +7,14 @@ generic-y += bug.h
 generic-y += bugs.h
 generic-y += checksum.h
 generic-y += clkdev.h
-generic-y += cmpxchg.h
 generic-y += cmpxchg-local.h
+generic-y += cmpxchg.h
 generic-y += compat.h
 generic-y += cputime.h
 generic-y += device.h
 generic-y += div64.h
-generic-y += dma.h
 generic-y += dma-mapping.h
+generic-y += dma.h
 generic-y += emergency-restart.h
 generic-y += errno.h
 generic-y += exec.h
@@ -46,14 +46,15 @@ generic-y += sections.h
 generic-y += segment.h
 generic-y += serial.h
 generic-y += shmbuf.h
+generic-y += simd.h
 generic-y += sizes.h
 generic-y += stat.h
 generic-y += switch_to.h
 generic-y += timex.h
 generic-y += topology.h
 generic-y += trace_clock.h
-generic-y += xor.h
 generic-y += unaligned.h
 generic-y += user.h
 generic-y += vga.h
 generic-y += word-at-a-time.h
+generic-y += xor.h
diff --git a/arch/nios2/include/asm/Kbuild b/arch/nios2/include/asm/Kbuild
index 8fde4fa2c34f..571a9d9ad107 100644
--- a/arch/nios2/include/asm/Kbuild
+++ b/arch/nios2/include/asm/Kbuild
@@ -33,6 +33,7 @@ generic-y += preempt.h
 generic-y += sections.h
 generic-y += segment.h
 generic-y += serial.h
+generic-y += simd.h
 generic-y += spinlock.h
 generic-y += topology.h
 generic-y += trace_clock.h
diff --git a/arch/openrisc/include/asm/Kbuild b/arch/openrisc/include/asm/Kbuild
index eb87cd8327c8..5e9f2f4c4d39 100644
--- a/arch/openrisc/include/asm/Kbuild
+++ b/arch/openrisc/include/asm/Kbuild
@@ -28,12 +28,13 @@ generic-y += module.h
 generic-y += pci.h
 generic-y += percpu.h
 generic-y += preempt.h
-generic-y += qspinlock_types.h
-generic-y += qspinlock.h
-generic-y += qrwlock_types.h
 generic-y += qrwlock.h
+generic-y += qrwlock_types.h
+generic-y += qspinlock.h
+generic-y += qspinlock_types.h
 generic-y += sections.h
 generic-y += segment.h
+generic-y += simd.h
 generic-y += string.h
 generic-y += switch_to.h
 generic-y += topology.h
diff --git a/arch/parisc/include/asm/Kbuild b/arch/parisc/include/asm/Kbuild
index 2013d639e735..97970b4d05ab 100644
--- a/arch/parisc/include/asm/Kbuild
+++ b/arch/parisc/include/asm/Kbuild
@@ -17,6 +17,7 @@ generic-y += percpu.h
 generic-y += preempt.h
 generic-y += seccomp.h
 generic-y += segment.h
+generic-y += simd.h
 generic-y += topology.h
 generic-y += trace_clock.h
 generic-y += user.h
diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild
index 3196d227e351..64290f48e733 100644
--- a/arch/powerpc/include/asm/Kbuild
+++ b/arch/powerpc/include/asm/Kbuild
@@ -4,7 +4,8 @@ generic-y += irq_regs.h
 generic-y += irq_work.h
 generic-y += local64.h
 generic-y += mcs_spinlock.h
+generic-y += msi.h
 generic-y += preempt.h
 generic-y += rwsem.h
+generic-y += simd.h
 generic-y += vtime.h
-generic-y += msi.h
diff --git a/arch/riscv/include/asm/Kbuild b/arch/riscv/include/asm/Kbuild
index efdbe311e936..6669b7374c0a 100644
--- a/arch/riscv/include/asm/Kbuild
+++ b/arch/riscv/include/asm/Kbuild
@@ -5,9 +5,9 @@ generic-y += compat.h
 generic-y += cputime.h
 generic-y += device.h
 generic-y += div64.h
-generic-y += dma.h
 generic-y += dma-contiguous.h
 generic-y += dma-mapping.h
+generic-y += dma.h
 generic-y += emergency-restart.h
 generic-y += errno.h
 generic-y += exec.h
@@ -46,6 +46,7 @@ generic-y += setup.h
 generic-y += shmbuf.h
 generic-y += shmparam.h
 generic-y += signal.h
+generic-y += simd.h
 generic-y += socket.h
 generic-y += sockios.h
 generic-y += stat.h
diff --git a/arch/s390/include/asm/Kbuild b/arch/s390/include/asm/Kbuild
index e3239772887a..7a26dc6ce815 100644
--- a/arch/s390/include/asm/Kbuild
+++ b/arch/s390/include/asm/Kbuild
@@ -7,9 +7,9 @@ generated-y += unistd_nr.h
 generic-y += asm-offsets.h
 generic-y += cacheflush.h
 generic-y += device.h
+generic-y += div64.h
 generic-y += dma-contiguous.h
 generic-y += dma-mapping.h
-generic-y += div64.h
 generic-y += emergency-restart.h
 generic-y += export.h
 generic-y += fb.h
@@ -22,6 +22,7 @@ generic-y += mcs_spinlock.h
 generic-y += mm-arch-hooks.h
 generic-y += preempt.h
 generic-y += rwsem.h
+generic-y += simd.h
 generic-y += trace_clock.h
 generic-y += unaligned.h
 generic-y += word-at-a-time.h
diff --git a/arch/sh/include/asm/Kbuild b/arch/sh/include/asm/Kbuild
index 6a5609a55965..8e64ff35a933 100644
--- a/arch/sh/include/asm/Kbuild
+++ b/arch/sh/include/asm/Kbuild
@@ -16,6 +16,7 @@ generic-y += percpu.h
 generic-y += preempt.h
 generic-y += rwsem.h
 generic-y += serial.h
+generic-y += simd.h
 generic-y += sizes.h
 generic-y += trace_clock.h
 generic-y += xor.h
diff --git a/arch/sparc/include/asm/Kbuild b/arch/sparc/include/asm/Kbuild
index 410b263ef5c8..72b9e08fb350 100644
--- a/arch/sparc/include/asm/Kbuild
+++ b/arch/sparc/include/asm/Kbuild
@@ -17,5 +17,6 @@ generic-y += msi.h
 generic-y += preempt.h
 generic-y += rwsem.h
 generic-y += serial.h
+generic-y += simd.h
 generic-y += trace_clock.h
 generic-y += word-at-a-time.h
diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild
index b10dde6cb793..d37288b08dd2 100644
--- a/arch/um/include/asm/Kbuild
+++ b/arch/um/include/asm/Kbuild
@@ -16,15 +16,16 @@ generic-y += io.h
 generic-y += irq_regs.h
 generic-y += irq_work.h
 generic-y += kdebug.h
+generic-y += kprobes.h
 generic-y += mcs_spinlock.h
 generic-y += mm-arch-hooks.h
 generic-y += param.h
 generic-y += pci.h
 generic-y += percpu.h
 generic-y += preempt.h
+generic-y += simd.h
 generic-y += switch_to.h
 generic-y += topology.h
 generic-y += trace_clock.h
 generic-y += word-at-a-time.h
 generic-y += xor.h
-generic-y += kprobes.h
diff --git a/arch/unicore32/include/asm/Kbuild b/arch/unicore32/include/asm/Kbuild
index bfc7abe77905..98a908720bbd 100644
--- a/arch/unicore32/include/asm/Kbuild
+++ b/arch/unicore32/include/asm/Kbuild
@@ -27,6 +27,7 @@ generic-y += preempt.h
 generic-y += sections.h
 generic-y += segment.h
 generic-y += serial.h
+generic-y += simd.h
 generic-y += sizes.h
 generic-y += syscalls.h
 generic-y += topology.h
diff --git a/arch/x86/include/asm/simd.h b/arch/x86/include/asm/simd.h
index a341c878e977..79411178988a 100644
--- a/arch/x86/include/asm/simd.h
+++ b/arch/x86/include/asm/simd.h
@@ -1,4 +1,11 @@
-/* SPDX-License-Identifier: GPL-2.0 */
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <linux/simd.h>
+#ifndef _ASM_SIMD_H
+#define _ASM_SIMD_H
 
 #include <asm/fpu/api.h>
 
@@ -10,3 +17,24 @@ static __must_check inline bool may_use_simd(void)
 {
 	return irq_fpu_usable();
 }
+
+static inline simd_context_t simd_get(void)
+{
+	bool have_simd = false;
+#if !defined(CONFIG_UML)
+	have_simd = may_use_simd();
+	if (have_simd)
+		kernel_fpu_begin();
+#endif
+	return have_simd ? HAVE_FULL_SIMD : HAVE_NO_SIMD;
+}
+
+static inline void simd_put(simd_context_t prior_context)
+{
+#if !defined(CONFIG_UML)
+	if (prior_context != HAVE_NO_SIMD)
+		kernel_fpu_end();
+#endif
+}
+
+#endif /* _ASM_SIMD_H */
diff --git a/arch/xtensa/include/asm/Kbuild b/arch/xtensa/include/asm/Kbuild
index 82c756431b49..7950f359649d 100644
--- a/arch/xtensa/include/asm/Kbuild
+++ b/arch/xtensa/include/asm/Kbuild
@@ -24,6 +24,7 @@ generic-y += percpu.h
 generic-y += preempt.h
 generic-y += rwsem.h
 generic-y += sections.h
+generic-y += simd.h
 generic-y += topology.h
 generic-y += trace_clock.h
 generic-y += word-at-a-time.h
diff --git a/include/asm-generic/simd.h b/include/asm-generic/simd.h
index d0343d58a74a..fad899a5a92d 100644
--- a/include/asm-generic/simd.h
+++ b/include/asm-generic/simd.h
@@ -1,5 +1,9 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 
+#include <linux/simd.h>
+#ifndef _ASM_SIMD_H
+#define _ASM_SIMD_H
+
 #include <linux/hardirq.h>
 
 /*
@@ -13,3 +17,14 @@ static __must_check inline bool may_use_simd(void)
 {
 	return !in_interrupt();
 }
+
+static inline simd_context_t simd_get(void)
+{
+	return HAVE_NO_SIMD;
+}
+
+static inline void simd_put(simd_context_t prior_context)
+{
+}
+
+#endif /* _ASM_SIMD_H */
diff --git a/include/linux/simd.h b/include/linux/simd.h
new file mode 100644
index 000000000000..f62d047188bf
--- /dev/null
+++ b/include/linux/simd.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#ifndef _SIMD_H
+#define _SIMD_H
+
+typedef enum {
+	HAVE_NO_SIMD,
+	HAVE_FULL_SIMD
+} simd_context_t;
+
+#include <linux/sched.h>
+#include <asm/simd.h>
+
+static inline simd_context_t simd_relax(simd_context_t prior_context)
+{
+#ifdef CONFIG_PREEMPT
+	if (prior_context != HAVE_NO_SIMD && need_resched()) {
+		simd_put(prior_context);
+		return simd_get();
+	}
+#endif
+	return prior_context;
+}
+
+#endif /* _SIMD_H */
-- 
2.19.0

^ permalink raw reply related

* [PATCH net-next v4 00/20] WireGuard: Secure Network Tunnel
From: Jason A. Donenfeld @ 2018-09-14 16:22 UTC (permalink / raw)
  To: linux-kernel, netdev, linux-crypto, davem, gregkh; +Cc: Jason A. Donenfeld

Changes v3->v4:
  - Remove mistaken double 07/17 patch.
  - Fix whitespace issues in blake2s assembly.
  - It's not possible to put compound literals into __initconst, so
    we now instead just use boring fixed size struct members.
  - Move away from makefile ifdef maze and instead prefer kconfig values,
    which also makes the design a bit more modular too, which could help
    in the future.
  - Port old crypto API implementations (ChaCha20 and Poly1305) to Zinc.
  - Port security/keys/big_key to Zinc as second example of a good usage of
    Zinc.
  - Document precisely what is different between the kernel code and
    CRYPTOGAMS code when the CRYPTOGAMS code is used.
  - Move changelog to top of 00/20 message so that people can
    actually find it.

-----------------------------------------------------------

This patchset is available on git.kernel.org in this branch, where it may be
pulled directly for inclusion into net-next:

  * https://git.kernel.org/pub/scm/linux/kernel/git/zx2c4/linux.git/log/?h=jd/wireguard

-----------------------------------------------------------

WireGuard is a secure network tunnel written especially for Linux, which
has faced around three years of serious development, deployment, and
scrutiny. It delivers excellent performance and is extremely easy to
use and configure. It has been designed with the primary goal of being
both easy to audit by virtue of being small and highly secure from a
cryptography and systems security perspective. WireGuard is used by some
massive companies pushing enormous amounts of traffic, and likely
already today you've consumed bytes that at some point transited through
a WireGuard tunnel. Even as an out-of-tree module, WireGuard has been
integrated into various userspace tools, Linux distributions, mobile
phones, and data centers. There are ports in several languages to
several operating systems, and even commercial hardware and services
sold integrating WireGuard. It is time, therefore, for WireGuard to be
properly integrated into Linux.

Ample information, including documentation, installation instructions,
and project details, is available at:

  * https://www.wireguard.com/
  * https://www.wireguard.com/papers/wireguard.pdf

As it is currently an out-of-tree module, it lives in its own git repo
and has its own mailing list, and every commit for the module is tested
against every stable kernel since 3.10 on a variety of architectures
using an extensive test suite:

  * https://git.zx2c4.com/WireGuard
    https://git.kernel.org/pub/scm/linux/kernel/git/zx2c4/WireGuard.git/
  * https://lists.zx2c4.com/mailman/listinfo/wireguard
  * https://www.wireguard.com/build-status/

The project has been broadly discussed at conferences, and was presented
to the Netdev developers in Seoul last November, where a paper was
released detailing some interesting aspects of the project. Dave asked
me after the talk if I would consider sending in a v1 "sooner rather
than later", hence this patchset. A decision is still pending from the
Linux Plumbers Conference, but an update on these topics may be presented
in Vancouver in a few months. Prior presentations:

  * https://www.wireguard.com/presentations/
  * https://www.wireguard.com/papers/wireguard-netdev22.pdf

The cryptography in the protocol itself has been formally verified by
several independent academic teams with positive results, and I know of
two additional efforts on their way to further corroborate those
findings. The version 1 protocol is "complete", and so the purpose of
this review is to assess the implementation of the protocol. However, it
still may be of interest to know that the thing you're reviewing uses a
protocol with various nice security properties:

  * https://www.wireguard.com/formal-verification/

This patchset is divided into four segments. The first introduces a very
simple helper for working with the FPU state for the purposes of amortizing
SIMD operations. The second segment is a small collection of cryptographic
primitives, split up into several commits by primitive and by hardware. The
third shows usage of Zinc within the existing crypto API and as a replacement
for the existing crypto API. The last is WireGuard itself, presented as an
unintrusive and self-contained virtual network driver.

It is intended that this entire patch series enter the kernel through
DaveM's net-next tree. Subsequently, WireGuard patches will go through
DaveM's net-next tree, while Zinc patches will go through Greg KH's tree.

Enjoy,
Jason

^ permalink raw reply

* [PATCH net-next v4 00/20] WireGuard: Secure Network Tunnel
From: Jason A. Donenfeld @ 2018-09-14 16:19 UTC (permalink / raw)
  To: linux-kernel, netdev, linux-crypto, davem, gregkh; +Cc: Jason A. Donenfeld

Changes v3->v4:
  - Remove mistaken double 07/17 patch.
  - Fix whitespace issues in blake2s assembly.
  - It's not possible to put compound literals into __initconst, so
    we now instead just use boring fixed size struct members.
  - Move away from makefile ifdef maze and instead prefer kconfig values,
    which also makes the design a bit more modular too, which could help
    in the future.
  - Port old crypto API implementations (ChaCha20 and Poly1305) to Zinc.
  - Port security/keys/big_key to Zinc as second example of a good usage of
    Zinc.
  - Document precisely what is different between the kernel code and
    CRYPTOGAMS code when the CRYPTOGAMS code is used.
  - Move changelog to top of 00/20 message so that people can
    actually find it.

-----------------------------------------------------------

This patchset is available on git.kernel.org in this branch, where it may be
pulled directly for inclusion into net-next:

  * https://git.kernel.org/pub/scm/linux/kernel/git/zx2c4/linux.git/log/?h=jd/wireguard

-----------------------------------------------------------

WireGuard is a secure network tunnel written especially for Linux, which
has faced around three years of serious development, deployment, and
scrutiny. It delivers excellent performance and is extremely easy to
use and configure. It has been designed with the primary goal of being
both easy to audit by virtue of being small and highly secure from a
cryptography and systems security perspective. WireGuard is used by some
massive companies pushing enormous amounts of traffic, and likely
already today you've consumed bytes that at some point transited through
a WireGuard tunnel. Even as an out-of-tree module, WireGuard has been
integrated into various userspace tools, Linux distributions, mobile
phones, and data centers. There are ports in several languages to
several operating systems, and even commercial hardware and services
sold integrating WireGuard. It is time, therefore, for WireGuard to be
properly integrated into Linux.

Ample information, including documentation, installation instructions,
and project details, is available at:

  * https://www.wireguard.com/
  * https://www.wireguard.com/papers/wireguard.pdf

As it is currently an out-of-tree module, it lives in its own git repo
and has its own mailing list, and every commit for the module is tested
against every stable kernel since 3.10 on a variety of architectures
using an extensive test suite:

  * https://git.zx2c4.com/WireGuard
    https://git.kernel.org/pub/scm/linux/kernel/git/zx2c4/WireGuard.git/
  * https://lists.zx2c4.com/mailman/listinfo/wireguard
  * https://www.wireguard.com/build-status/

The project has been broadly discussed at conferences, and was presented
to the Netdev developers in Seoul last November, where a paper was
released detailing some interesting aspects of the project. Dave asked
me after the talk if I would consider sending in a v1 "sooner rather
than later", hence this patchset. A decision from the Linux Plumbers
Conference is still pending, but an update on these topics may be presented
in Vancouver in a few months. Prior presentations:

  * https://www.wireguard.com/presentations/
  * https://www.wireguard.com/papers/wireguard-netdev22.pdf

The cryptography in the protocol itself has been formally verified by
several independent academic teams with positive results, and I know of
two additional efforts on their way to further corroborate those
findings. The version 1 protocol is "complete", and so the purpose of
this review is to assess the implementation of the protocol. However, it
still may be of interest to know that the thing you're reviewing uses a
protocol with various nice security properties:

  * https://www.wireguard.com/formal-verification/

This patchset is divided into four segments. The first introduces a very
simple helper for working with the FPU state for the purposes of amortizing
SIMD operations. The second segment is a small collection of cryptographic
primitives, split up into several commits by primitive and by hardware. The
third shows usage of Zinc within the existing crypto API and as a replacement
to the existing crypto API. The last is WireGuard itself, presented as an
unintrusive and self-contained virtual network driver.

It is intended that this entire patch series enter the kernel through
DaveM's net-next tree. Subsequently, WireGuard patches will go through
DaveM's net-next tree, while Zinc patches will go through Greg KH's tree.

Enjoy,
Jason

^ permalink raw reply

* Re: [PATCH net-next v2] net: sched: change tcf_del_walker() to take idrinfo->lock
From: Vlad Buslov @ 2018-09-14 10:46 UTC (permalink / raw)
  To: Cong Wang
  Cc: Linux Kernel Network Developers, Jamal Hadi Salim, Jiri Pirko,
	David Miller
In-Reply-To: <CAM_iQpUQeu0xeurA6e4pC8C7Ha04srZEGcZVCAwstj-NLg-Xmw@mail.gmail.com>


On Thu 13 Sep 2018 at 17:13, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> On Wed, Sep 12, 2018 at 1:51 AM Vlad Buslov <vladbu@mellanox.com> wrote:
>>
>>
>> On Fri 07 Sep 2018 at 19:12, Cong Wang <xiyou.wangcong@gmail.com> wrote:
>> > On Fri, Sep 7, 2018 at 6:52 AM Vlad Buslov <vladbu@mellanox.com> wrote:
>> >>
>> >> Action API was changed to work with actions and action_idr in concurrency
>> >> safe manner, however tcf_del_walker() still uses actions without taking a
>> >> reference or idrinfo->lock first, and deletes them directly, disregarding
>> >> possible concurrent delete.
>> >>
>> >> Add tc_action_wq workqueue to action API. Implement
>> >> tcf_idr_release_unsafe() that assumes external synchronization by caller
>> >> and delays blocking action cleanup part to tc_action_wq workqueue. Extend
>> >> tcf_action_cleanup() with 'async' argument to indicate that function should
>> >> free action asynchronously.
>> >
>> > Where exactly is blocking in tcf_action_cleanup()?
>> >
>> > From your code, it looks like free_tcf(), but from my observation,
>> > the only blocking function inside is tcf_action_goto_chain_fini()
>> > which calls __tcf_chain_put(). But, __tcf_chain_put() is blocking
>> > _ONLY_ when tc_chain_notify() is called, for tc action it is never
>> > called.
>> >
>> > So, what else is blocking?
>>
>> __tcf_chain_put() calls tc_chain_tmplt_del(), which calls
>> ops->tmplt_destroy(). This last function uses hw offload API, which is
>> blocking.
>
> Good to know.
>
> Can we just make ops->tmplt_destroy() to use workqueue?
> Making tc action to workqueue seems overkill, for me.

How about changing tcf_chain_put_by_act() to use tc_filter_wq, instead
of directly calling __tcf_chain_put()? IMO it is a better solution
because it benefits all classifiers, instead of requiring every
classifier with templates support to implement non-blocking
ops->tmplt_destroy().

^ permalink raw reply

* Re: [PATCH net-next 08/13] net: sched: rename tcf_block_get{_ext}() and tcf_block_put{_ext}()
From: Vlad Buslov @ 2018-09-14 10:38 UTC (permalink / raw)
  To: Cong Wang, Jiri Pirko
  Cc: Linux Kernel Network Developers, Jamal Hadi Salim, Jiri Pirko,
	David Miller, Stephen Hemminger, Kirill Tkhai, Paul E. McKenney,
	Nicolas Dichtel, Leon Romanovsky, Greg KH, mark.rutland,
	Florian Westphal, David Ahern, lucien xin, Jakub Kicinski,
	Christian Brauner, Jiri Benc
In-Reply-To: <CAM_iQpV5b4asaUOR1Ly7s_y7x1M-Y7q3Z2DDe6y_r1eKqx8ZDA@mail.gmail.com>


On Thu 13 Sep 2018 at 17:21, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> On Wed, Sep 12, 2018 at 1:24 AM Vlad Buslov <vladbu@mellanox.com> wrote:
>>
>>
>> On Fri 07 Sep 2018 at 20:09, Cong Wang <xiyou.wangcong@gmail.com> wrote:
>> > On Thu, Sep 6, 2018 at 12:59 AM Vlad Buslov <vladbu@mellanox.com> wrote:
>> >>
>> >> Functions tcf_block_get{_ext}() and tcf_block_put{_ext}() actually
>> >> attach/detach block to specific Qdisc besides just taking/putting
>> >> reference. Rename them according to their purpose.
>> >
>> > Where exactly does it attach to?
>> >
>> > Each qdisc provides a pointer to a pointer of a block, like
>> > &cl->block. It is where the result is saved to. It takes a parameter
>> > of Qdisc* merely for read-only purpose.
>>
>> tcf_block_attach_ext() passes qdisc parameter to tcf_block_owner_add()
>> which saves qdisc to new tcf_block_owner_item and adds the item to
>> block's owner list. I proposed several naming options for these
>> functions to Jiri on internal review and he suggested "attach" as better
>> option.
>
> But that is merely item->q = q, this is why I said it is read-only,
> hard to claim this is attaching.
>
>
>>
>> >
>> > So, renaming it to *attach() is even confusing, at least not
>> > any better. Please find other names or leave them as they are.
>>
>> What would you recommend?
>
> I don't know, perhaps "acquire"?
>
> Or, leaving tcf_block_get() as it is but rename your refcnt
> increment function to be something like tcf_block_refcnt_get()?

Cong, I'm okay with both options.

Jiri, which naming would you prefer?

^ permalink raw reply

* Re: [PATCH 0/2] net: ethernet: neterion: use linux/io-64-nonatomic-lo-hi.h
From: David Miller @ 2018-09-14 15:49 UTC (permalink / raw)
  To: clabbe; +Cc: jdmason, linux-kernel, netdev
In-Reply-To: <1536921190-38619-1-git-send-email-clabbe@baylibre.com>

From: Corentin Labbe <clabbe@baylibre.com>
Date: Fri, 14 Sep 2018 10:33:08 +0000

> This serie remove usage of custom writeq/readq in favor of ones
> defined in linux/io-64-nonatomic-lo-hi.h
> 
> This serie is only compile tested.

The vxge patch doesn't even apply cleanly to net-next.

^ permalink raw reply

* [PATCH] ARM: dts: at91: add new compatibility string for macb on sama5d3
From: Nicolas Ferre @ 2018-09-14 15:48 UTC (permalink / raw)
  To: netdev
  Cc: linux-kernel, linux-arm-kernel, Claudiu Beznea, Alexandre Belloni,
	Ludovic Desroches, devicetree, Nicolas Ferre
In-Reply-To: <20180914154811.12090-1-nicolas.ferre@microchip.com>

We need this new compatibility string as we experienced different behavior
for this 10/100Mbits/s macb interface on this particular SoC.
Backward compatibility is preserved as we keep the alternative strings.

Signed-off-by: Nicolas Ferre <nicolas.ferre@microchip.com>
---
 Documentation/devicetree/bindings/net/macb.txt | 1 +
 arch/arm/boot/dts/sama5d3_emac.dtsi            | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/Documentation/devicetree/bindings/net/macb.txt b/Documentation/devicetree/bindings/net/macb.txt
index 457d5ae16f23..3e17ac1d5d58 100644
--- a/Documentation/devicetree/bindings/net/macb.txt
+++ b/Documentation/devicetree/bindings/net/macb.txt
@@ -10,6 +10,7 @@ Required properties:
   Use "cdns,pc302-gem" for Picochip picoXcell pc302 and later devices based on
   the Cadence GEM, or the generic form: "cdns,gem".
   Use "atmel,sama5d2-gem" for the GEM IP (10/100) available on Atmel sama5d2 SoCs.
+  Use "atmel,sama5d3-macb" for the 10/100Mbit IP available on Atmel sama5d3 SoCs.
   Use "atmel,sama5d3-gem" for the Gigabit IP available on Atmel sama5d3 SoCs.
   Use "atmel,sama5d4-gem" for the GEM IP (10/100) available on Atmel sama5d4 SoCs.
   Use "cdns,zynq-gem" Xilinx Zynq-7xxx SoC.
diff --git a/arch/arm/boot/dts/sama5d3_emac.dtsi b/arch/arm/boot/dts/sama5d3_emac.dtsi
index 7cb235ef0fb6..6e9e1c2f9def 100644
--- a/arch/arm/boot/dts/sama5d3_emac.dtsi
+++ b/arch/arm/boot/dts/sama5d3_emac.dtsi
@@ -41,7 +41,7 @@
 			};
 
 			macb1: ethernet@f802c000 {
-				compatible = "cdns,at91sam9260-macb", "cdns,macb";
+				compatible = "atmel,sama5d3-macb", "cdns,at91sam9260-macb", "cdns,macb";
 				reg = <0xf802c000 0x100>;
 				interrupts = <35 IRQ_TYPE_LEVEL_HIGH 3>;
 				pinctrl-names = "default";
-- 
2.15.1

^ permalink raw reply related

* [PATCH] net: macb: disable scatter-gather for macb on sama5d3
From: Nicolas Ferre @ 2018-09-14 15:48 UTC (permalink / raw)
  To: netdev
  Cc: linux-kernel, linux-arm-kernel, Claudiu Beznea, Alexandre Belloni,
	Ludovic Desroches, devicetree, Nicolas Ferre

Create a new configuration for the new sama5d3-macb compatibility string.
This configuration disables scatter-gather because we experienced lockups
of the macb interface of this particular SoC under very high load.

Signed-off-by: Nicolas Ferre <nicolas.ferre@microchip.com>
---
 drivers/net/ethernet/cadence/macb_main.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c
index 16e4ef7d7185..f1a86b422617 100644
--- a/drivers/net/ethernet/cadence/macb_main.c
+++ b/drivers/net/ethernet/cadence/macb_main.c
@@ -3837,6 +3837,13 @@ static const struct macb_config at91sam9260_config = {
 	.init = macb_init,
 };
 
+static const struct macb_config sama5d3macb_config = {
+	.caps = MACB_CAPS_SG_DISABLED
+	      | MACB_CAPS_USRIO_HAS_CLKEN | MACB_CAPS_USRIO_DEFAULT_IS_MII_GMII,
+	.clk_init = macb_clk_init,
+	.init = macb_init,
+};
+
 static const struct macb_config pc302gem_config = {
 	.caps = MACB_CAPS_SG_DISABLED | MACB_CAPS_GIGABIT_MODE_AVAILABLE,
 	.dma_burst_length = 16,
@@ -3904,6 +3911,7 @@ static const struct of_device_id macb_dt_ids[] = {
 	{ .compatible = "cdns,gem", .data = &pc302gem_config },
 	{ .compatible = "atmel,sama5d2-gem", .data = &sama5d2_config },
 	{ .compatible = "atmel,sama5d3-gem", .data = &sama5d3_config },
+	{ .compatible = "atmel,sama5d3-macb", .data = &sama5d3macb_config },
 	{ .compatible = "atmel,sama5d4-gem", .data = &sama5d4_config },
 	{ .compatible = "cdns,at91rm9200-emac", .data = &emac_config },
 	{ .compatible = "cdns,emac", .data = &emac_config },
-- 
2.15.1

^ permalink raw reply related

* [PATCH 2/2] net: neterion: s2io: Use linux/io-64-nonatomic-lo-hi.h
From: Corentin Labbe @ 2018-09-14 10:33 UTC (permalink / raw)
  To: davem, jdmason; +Cc: linux-kernel, netdev, Corentin Labbe
In-Reply-To: <1536921190-38619-1-git-send-email-clabbe@baylibre.com>

This patch replaces the custom definitions of writeq/readq with the ones
defined in linux/io-64-nonatomic-lo-hi.h.

Signed-off-by: Corentin Labbe <clabbe@baylibre.com>
---
 drivers/net/ethernet/neterion/s2io.c |  1 +
 drivers/net/ethernet/neterion/s2io.h | 22 +---------------------
 2 files changed, 2 insertions(+), 21 deletions(-)

diff --git a/drivers/net/ethernet/neterion/s2io.c b/drivers/net/ethernet/neterion/s2io.c
index b8983e7..44acc63 100644
--- a/drivers/net/ethernet/neterion/s2io.c
+++ b/drivers/net/ethernet/neterion/s2io.c
@@ -75,6 +75,7 @@
 #include <linux/tcp.h>
 #include <linux/uaccess.h>
 #include <linux/io.h>
+#include <linux/io-64-nonatomic-lo-hi.h>
 #include <linux/slab.h>
 #include <linux/prefetch.h>
 #include <net/tcp.h>
diff --git a/drivers/net/ethernet/neterion/s2io.h b/drivers/net/ethernet/neterion/s2io.h
index 1a24a72..0a921f3 100644
--- a/drivers/net/ethernet/neterion/s2io.h
+++ b/drivers/net/ethernet/neterion/s2io.h
@@ -10,6 +10,7 @@
  * system is licensed under the GPL.
  * See the file COPYING in this distribution for more information.
  ************************************************************************/
+#include <linux/io-64-nonatomic-lo-hi.h>
 #ifndef _S2IO_H
 #define _S2IO_H
 
@@ -970,27 +971,6 @@ struct s2io_nic {
 #define RESET_ERROR 1
 #define CMD_ERROR   2
 
-/*  OS related system calls */
-#ifndef readq
-static inline u64 readq(void __iomem *addr)
-{
-	u64 ret = 0;
-	ret = readl(addr + 4);
-	ret <<= 32;
-	ret |= readl(addr);
-
-	return ret;
-}
-#endif
-
-#ifndef writeq
-static inline void writeq(u64 val, void __iomem *addr)
-{
-	writel((u32) (val), addr);
-	writel((u32) (val >> 32), (addr + 4));
-}
-#endif
-
 /*
  * Some registers have to be written in a particular order to
  * expect correct hardware operation. The macro SPECIAL_REG_WRITE
-- 
2.7.4

^ permalink raw reply related

* [PATCH 1/2] net: neterion: vxge: use linux/io-64-nonatomic-lo-hi.h
From: Corentin Labbe @ 2018-09-14 10:33 UTC (permalink / raw)
  To: davem, jdmason; +Cc: linux-kernel, netdev, Corentin Labbe
In-Reply-To: <1536921190-38619-1-git-send-email-clabbe@baylibre.com>

This patch replaces the custom definitions of writeq/readq with the ones
defined in linux/io-64-nonatomic-lo-hi.h.

Signed-off-by: Corentin Labbe <clabbe@baylibre.com>
---
 drivers/net/ethernet/neterion/vxge/vxge-config.c  |  1 +
 drivers/net/ethernet/neterion/vxge/vxge-config.h  | 20 --------------------
 drivers/net/ethernet/neterion/vxge/vxge-traffic.c |  1 +
 3 files changed, 2 insertions(+), 20 deletions(-)

diff --git a/drivers/net/ethernet/neterion/vxge/vxge-config.c b/drivers/net/ethernet/neterion/vxge/vxge-config.c
index 358ed61..2b422c5 100644
--- a/drivers/net/ethernet/neterion/vxge/vxge-config.c
+++ b/drivers/net/ethernet/neterion/vxge/vxge-config.c
@@ -13,6 +13,7 @@
  ******************************************************************************/
 #include <linux/vmalloc.h>
 #include <linux/etherdevice.h>
+#include <linux/io-64-nonatomic-lo-hi.h>
 #include <linux/pci.h>
 #include <linux/pci_hotplug.h>
 #include <linux/slab.h>
diff --git a/drivers/net/ethernet/neterion/vxge/vxge-config.h b/drivers/net/ethernet/neterion/vxge/vxge-config.h
index d743a37..e678ba3 100644
--- a/drivers/net/ethernet/neterion/vxge/vxge-config.h
+++ b/drivers/net/ethernet/neterion/vxge/vxge-config.h
@@ -2011,26 +2011,6 @@ enum vxge_hw_status vxge_hw_vpath_mtu_set(
 void
 vxge_hw_vpath_rx_doorbell_init(struct __vxge_hw_vpath_handle *vp);
 
-#ifndef readq
-static inline u64 readq(void __iomem *addr)
-{
-	u64 ret = 0;
-	ret = readl(addr + 4);
-	ret <<= 32;
-	ret |= readl(addr);
-
-	return ret;
-}
-#endif
-
-#ifndef writeq
-static inline void writeq(u64 val, void __iomem *addr)
-{
-	writel((u32) (val), addr);
-	writel((u32) (val >> 32), (addr + 4));
-}
-#endif
-
 static inline void __vxge_hw_pio_mem_write32_upper(u32 val, void __iomem *addr)
 {
 	writel(val, addr + 4);
diff --git a/drivers/net/ethernet/neterion/vxge/vxge-traffic.c b/drivers/net/ethernet/neterion/vxge/vxge-traffic.c
index 0c3b5de..30e5cdc 100644
--- a/drivers/net/ethernet/neterion/vxge/vxge-traffic.c
+++ b/drivers/net/ethernet/neterion/vxge/vxge-traffic.c
@@ -12,6 +12,7 @@
  * Copyright(c) 2002-2010 Exar Corp.
  ******************************************************************************/
 #include <linux/etherdevice.h>
+#include <linux/io-64-nonatomic-lo-hi.h>
 #include <linux/prefetch.h>
 
 #include "vxge-traffic.h"
-- 
2.7.4

^ permalink raw reply related

* Re: [PATCH net] net/sched: act_sample: fix NULL dereference in the data path
From: Jiri Pirko @ 2018-09-14 10:02 UTC (permalink / raw)
  To: Davide Caratti
  Cc: mcroce, Jamal Hadi Salim, Cong Wang, David S. Miller, netdev
In-Reply-To: <2fcef6665503c5e3b17676daf45c4166cf130cdb.1536919144.git.dcaratti@redhat.com>

Fri, Sep 14, 2018 at 12:03:18PM CEST, dcaratti@redhat.com wrote:
>Matteo reported the following splat, testing the datapath of TC 'sample':
>
> BUG: KASAN: null-ptr-deref in tcf_sample_act+0xc4/0x310
> Read of size 8 at addr 0000000000000000 by task nc/433
>
> CPU: 0 PID: 433 Comm: nc Not tainted 4.19.0-rc3-kvm #17
> Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS ?-20180531_142017-buildhw-08.phx2.fedoraproject.org-1.fc28 04/01/2014
> Call Trace:
>  kasan_report.cold.6+0x6c/0x2fa
>  tcf_sample_act+0xc4/0x310
>  ? dev_hard_start_xmit+0x117/0x180
>  tcf_action_exec+0xa3/0x160
>  tcf_classify+0xdd/0x1d0
>  htb_enqueue+0x18e/0x6b0
>  ? deref_stack_reg+0x7a/0xb0
>  ? htb_delete+0x4b0/0x4b0
>  ? unwind_next_frame+0x819/0x8f0
>  ? entry_SYSCALL_64_after_hwframe+0x44/0xa9
>  __dev_queue_xmit+0x722/0xca0
>  ? unwind_get_return_address_ptr+0x50/0x50
>  ? netdev_pick_tx+0xe0/0xe0
>  ? save_stack+0x8c/0xb0
>  ? kasan_kmalloc+0xbe/0xd0
>  ? __kmalloc_track_caller+0xe4/0x1c0
>  ? __kmalloc_reserve.isra.45+0x24/0x70
>  ? __alloc_skb+0xdd/0x2e0
>  ? sk_stream_alloc_skb+0x91/0x3b0
>  ? tcp_sendmsg_locked+0x71b/0x15a0
>  ? tcp_sendmsg+0x22/0x40
>  ? __sys_sendto+0x1b0/0x250
>  ? __x64_sys_sendto+0x6f/0x80
>  ? do_syscall_64+0x5d/0x150
>  ? entry_SYSCALL_64_after_hwframe+0x44/0xa9
>  ? __sys_sendto+0x1b0/0x250
>  ? __x64_sys_sendto+0x6f/0x80
>  ? do_syscall_64+0x5d/0x150
>  ? entry_SYSCALL_64_after_hwframe+0x44/0xa9
>  ip_finish_output2+0x495/0x590
>  ? ip_copy_metadata+0x2e0/0x2e0
>  ? skb_gso_validate_network_len+0x6f/0x110
>  ? ip_finish_output+0x174/0x280
>  __tcp_transmit_skb+0xb17/0x12b0
>  ? __tcp_select_window+0x380/0x380
>  tcp_write_xmit+0x913/0x1de0
>  ? __sk_mem_schedule+0x50/0x80
>  tcp_sendmsg_locked+0x49d/0x15a0
>  ? tcp_rcv_established+0x8da/0xa30
>  ? tcp_set_state+0x220/0x220
>  ? clear_user+0x1f/0x50
>  ? iov_iter_zero+0x1ae/0x590
>  ? __fget_light+0xa0/0xe0
>  tcp_sendmsg+0x22/0x40
>  __sys_sendto+0x1b0/0x250
>  ? __ia32_sys_getpeername+0x40/0x40
>  ? _copy_to_user+0x58/0x70
>  ? poll_select_copy_remaining+0x176/0x200
>  ? __pollwait+0x1c0/0x1c0
>  ? ktime_get_ts64+0x11f/0x140
>  ? kern_select+0x108/0x150
>  ? core_sys_select+0x360/0x360
>  ? vfs_read+0x127/0x150
>  ? kernel_write+0x90/0x90
>  __x64_sys_sendto+0x6f/0x80
>  do_syscall_64+0x5d/0x150
>  entry_SYSCALL_64_after_hwframe+0x44/0xa9
> RIP: 0033:0x7fefef2b129d
> Code: ff ff ff ff eb b6 0f 1f 80 00 00 00 00 48 8d 05 51 37 0c 00 41 89 ca 8b 00 85 c0 75 20 45 31 c9 45 31 c0 b8 2c 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 6b f3 c3 66 0f 1f 84 00 00 00 00 00 41 56 41
> RSP: 002b:00007fff2f5350c8 EFLAGS: 00000246 ORIG_RAX: 000000000000002c
> RAX: ffffffffffffffda RBX: 000056118d60c120 RCX: 00007fefef2b129d
> RDX: 0000000000002000 RSI: 000056118d629320 RDI: 0000000000000003
> RBP: 000056118d530370 R08: 0000000000000000 R09: 0000000000000000
> R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000002000
> R13: 000056118d5c2a10 R14: 000056118d5c2a10 R15: 000056118d5303b8
>
>tcf_sample_act() tried to update its per-cpu stats, but tcf_sample_init()
>forgot to allocate them, because tcf_idr_create() was called with a wrong
>value of 'cpustats'. Setting it to true proved to fix the reported crash.
>
>Reported-by: Matteo Croce <mcroce@redhat.com>
>Fixes: 65a206c01e8e ("net/sched: Change act_api and act_xxx modules to use IDR")
>Fixes: 5c5670fae430 ("net/sched: Introduce sample tc action")
>Tested-by: Matteo Croce <mcroce@redhat.com>
>Signed-off-by: Davide Caratti <dcaratti@redhat.com>

Acked-by: Jiri Pirko <jiri@mellanox.com>

^ permalink raw reply

* Re: [PATCH net-next 0/8] bnxt_en: devlink param updates
From: Jiri Pirko @ 2018-09-14 10:01 UTC (permalink / raw)
  To: Vasundhara Volam
  Cc: jakub.kicinski, David Miller, michael.chan@broadcom.com, Netdev,
	alexander.duyck
In-Reply-To: <CAACQVJp0O8Tb+d9kF9K773okqrCg7XG-JLSiMNu9GMLjtKEA6Q@mail.gmail.com>

Fri, Sep 14, 2018 at 06:17:07AM CEST, vasundhara-v.volam@broadcom.com wrote:
>On Wed, Sep 12, 2018 at 3:20 PM Jakub Kicinski
><jakub.kicinski@netronome.com> wrote:
>>
>> On Wed, 12 Sep 2018 12:09:37 +0530, Vasundhara Volam wrote:
>> > On Tue, Sep 11, 2018 at 5:04 PM Jakub Kicinski wrote:
>> > > On Tue, 11 Sep 2018 14:14:57 +0530, Vasundhara Volam wrote:
>> > > > This patchset adds support for 4 generic and 1 driver-specific devlink
>> > > > parameters.
>> > > >
>> > > > Also, this patchset adds support to return proper error code if
>> > > > HWRM_NVM_GET/SET_VARIABLE commands return error code
>> > > > HWRM_ERR_CODE_RESOURCE_ACCESS_DENIED.
>> > > >
>> > > > Vasundhara Volam (8):
>> > > >   devlink: Add generic parameter hw_tc_offload
>> > >
>> > > Much like Jiri, I can't help but wonder why do you need this?
>> >
>> > There is a request from our customer for a way to toggle tc_offload
>> > feature in our adapter.
>>
>> Vasundhara, again, we don't need to know who asked you to do this, but
>> _why_.  What problem are you solving?  What is the customer trying to
>> achieve?
>For Brand new big features like TC_offload, few customers are not willing
>to enable it by default in the adapter(Firmware). This was a subjective decision
>to disable TC_offload by default in the adapter.

Again, why? Why can't it be enabled in FW and just enabled/disabled by
an ethtool flag? Please don't say that "customers want it"...


>>
>> > > >   devlink: Add generic parameter ignore_ari
>> > > >   devlink: Add generic parameter msix_vec_per_pf_max
>> > > >   devlink: Add generic parameter msix_vec_per_pf_min
>> > >
>> > > IMHO more structured API would be preferable if possible.  The string
>> > > keys won't scale if you want to set the parameters per PF, and
>> > > creating more structured API for PCIe which is a relatively slow
>> > > moving HW spec seems tractable.
>> >
>> > Sorry, could you please suggest an example? We will try to adapt.
>>
>> My thinking was that the same way devlink device has ports, it should
>> have PCIe functions as objects which then have attributes.  Instead of
>> making everything a string-identified device attribute.  But I'm not
>> dead set on this if others don't think its a good idea.
>Actually this parameters are for the port but the value given to this param
>is applicable for individual PF. That's the reason I have added "per_pf" string.
>If you think this is not a good idea, I can move this params to driver-specific.

^ permalink raw reply

* [PATCH net] net/sched: act_sample: fix NULL dereference in the data path
From: Davide Caratti @ 2018-09-14 10:03 UTC (permalink / raw)
  To: mcroce, Jamal Hadi Salim, Cong Wang, Jiri Pirko, David S. Miller; +Cc: netdev

Matteo reported the following splat, testing the datapath of TC 'sample':

 BUG: KASAN: null-ptr-deref in tcf_sample_act+0xc4/0x310
 Read of size 8 at addr 0000000000000000 by task nc/433

 CPU: 0 PID: 433 Comm: nc Not tainted 4.19.0-rc3-kvm #17
 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS ?-20180531_142017-buildhw-08.phx2.fedoraproject.org-1.fc28 04/01/2014
 Call Trace:
  kasan_report.cold.6+0x6c/0x2fa
  tcf_sample_act+0xc4/0x310
  ? dev_hard_start_xmit+0x117/0x180
  tcf_action_exec+0xa3/0x160
  tcf_classify+0xdd/0x1d0
  htb_enqueue+0x18e/0x6b0
  ? deref_stack_reg+0x7a/0xb0
  ? htb_delete+0x4b0/0x4b0
  ? unwind_next_frame+0x819/0x8f0
  ? entry_SYSCALL_64_after_hwframe+0x44/0xa9
  __dev_queue_xmit+0x722/0xca0
  ? unwind_get_return_address_ptr+0x50/0x50
  ? netdev_pick_tx+0xe0/0xe0
  ? save_stack+0x8c/0xb0
  ? kasan_kmalloc+0xbe/0xd0
  ? __kmalloc_track_caller+0xe4/0x1c0
  ? __kmalloc_reserve.isra.45+0x24/0x70
  ? __alloc_skb+0xdd/0x2e0
  ? sk_stream_alloc_skb+0x91/0x3b0
  ? tcp_sendmsg_locked+0x71b/0x15a0
  ? tcp_sendmsg+0x22/0x40
  ? __sys_sendto+0x1b0/0x250
  ? __x64_sys_sendto+0x6f/0x80
  ? do_syscall_64+0x5d/0x150
  ? entry_SYSCALL_64_after_hwframe+0x44/0xa9
  ? __sys_sendto+0x1b0/0x250
  ? __x64_sys_sendto+0x6f/0x80
  ? do_syscall_64+0x5d/0x150
  ? entry_SYSCALL_64_after_hwframe+0x44/0xa9
  ip_finish_output2+0x495/0x590
  ? ip_copy_metadata+0x2e0/0x2e0
  ? skb_gso_validate_network_len+0x6f/0x110
  ? ip_finish_output+0x174/0x280
  __tcp_transmit_skb+0xb17/0x12b0
  ? __tcp_select_window+0x380/0x380
  tcp_write_xmit+0x913/0x1de0
  ? __sk_mem_schedule+0x50/0x80
  tcp_sendmsg_locked+0x49d/0x15a0
  ? tcp_rcv_established+0x8da/0xa30
  ? tcp_set_state+0x220/0x220
  ? clear_user+0x1f/0x50
  ? iov_iter_zero+0x1ae/0x590
  ? __fget_light+0xa0/0xe0
  tcp_sendmsg+0x22/0x40
  __sys_sendto+0x1b0/0x250
  ? __ia32_sys_getpeername+0x40/0x40
  ? _copy_to_user+0x58/0x70
  ? poll_select_copy_remaining+0x176/0x200
  ? __pollwait+0x1c0/0x1c0
  ? ktime_get_ts64+0x11f/0x140
  ? kern_select+0x108/0x150
  ? core_sys_select+0x360/0x360
  ? vfs_read+0x127/0x150
  ? kernel_write+0x90/0x90
  __x64_sys_sendto+0x6f/0x80
  do_syscall_64+0x5d/0x150
  entry_SYSCALL_64_after_hwframe+0x44/0xa9
 RIP: 0033:0x7fefef2b129d
 Code: ff ff ff ff eb b6 0f 1f 80 00 00 00 00 48 8d 05 51 37 0c 00 41 89 ca 8b 00 85 c0 75 20 45 31 c9 45 31 c0 b8 2c 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 6b f3 c3 66 0f 1f 84 00 00 00 00 00 41 56 41
 RSP: 002b:00007fff2f5350c8 EFLAGS: 00000246 ORIG_RAX: 000000000000002c
 RAX: ffffffffffffffda RBX: 000056118d60c120 RCX: 00007fefef2b129d
 RDX: 0000000000002000 RSI: 000056118d629320 RDI: 0000000000000003
 RBP: 000056118d530370 R08: 0000000000000000 R09: 0000000000000000
 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000002000
 R13: 000056118d5c2a10 R14: 000056118d5c2a10 R15: 000056118d5303b8

tcf_sample_act() tried to update its per-cpu stats, but tcf_sample_init()
forgot to allocate them, because tcf_idr_create() was called with a wrong
value of 'cpustats'. Setting it to true proved to fix the reported crash.

Reported-by: Matteo Croce <mcroce@redhat.com>
Fixes: 65a206c01e8e ("net/sched: Change act_api and act_xxx modules to use IDR")
Fixes: 5c5670fae430 ("net/sched: Introduce sample tc action")
Tested-by: Matteo Croce <mcroce@redhat.com>
Signed-off-by: Davide Caratti <dcaratti@redhat.com>
---
 net/sched/act_sample.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c
index 44e9c00657bc..6b67aa13d2dd 100644
--- a/net/sched/act_sample.c
+++ b/net/sched/act_sample.c
@@ -69,7 +69,7 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla,
 
 	if (!exists) {
 		ret = tcf_idr_create(tn, parm->index, est, a,
-				     &act_sample_ops, bind, false);
+				     &act_sample_ops, bind, true);
 		if (ret) {
 			tcf_idr_cleanup(tn, parm->index);
 			return ret;
-- 
2.17.1

^ permalink raw reply related

* Re: [net-next, RFC PATCH] net: sched: cls_range: Introduce Range classifier
From: Jiri Pirko @ 2018-09-14  9:58 UTC (permalink / raw)
  To: Amritha Nambiar
  Cc: netdev, davem, alexander.h.duyck, jakub.kicinski,
	sridhar.samudrala, jhs, jesse.brandeburg, xiyou.wangcong
In-Reply-To: <153687192654.43503.1433255216543560934.stgit@anamhost.jf.intel.com>

Thu, Sep 13, 2018 at 10:52:06PM CEST, amritha.nambiar@intel.com wrote:

[...]

>+static struct cls_range_filter *range_lookup(struct cls_range_head *head,
>+					     struct range_flow_key *key,
>+					     struct range_flow_key *mkey,
>+					     bool is_skb)
>+{
>+	struct cls_range_filter *filter, *next_filter;
>+	struct range_params range;
>+	int ret;
>+	size_t cmp_size;
>+
>+	list_for_each_entry_safe(filter, next_filter, &head->filters, flist) {

This really should be list_for_each_entry_rcu()

Also, as I wrote in the previous email, this should be done in
cls_flower. Look at fl_lookup(): it looks up the hashtable. You just need
to add linked-list traversal and range comparison to that function for a
hit in the hashtable.


>+		if (!is_skb) {
>+			/* Existing filter comparison */
>+			cmp_size = sizeof(filter->mkey);
>+		} else {
>+			/* skb classification */
>+			ret = range_compare_params(&range, filter, key,
>+						   RANGE_PORT_DST);
>+			if (ret < 0)
>+				continue;
>+
>+			ret = range_compare_params(&range, filter, key,
>+						   RANGE_PORT_SRC);
>+			if (ret < 0)
>+				continue;
>+
>+			/* skb does not have min and max values */
>+			cmp_size = RANGE_KEY_MEMBER_OFFSET(tp_min);
>+		}
>+		if (!memcmp(mkey, &filter->mkey, cmp_size))
>+			return filter;
>+	}
>+	return NULL;

[...]

^ permalink raw reply

* Re: [net-next,RFC PATCH] Introduce TC Range classifier
From: Jiri Pirko @ 2018-09-14  9:49 UTC (permalink / raw)
  To: Amritha Nambiar
  Cc: netdev, davem, alexander.h.duyck, jakub.kicinski,
	sridhar.samudrala, jhs, jesse.brandeburg, xiyou.wangcong
In-Reply-To: <153687160312.43503.11156697286063840163.stgit@anamhost.jf.intel.com>

Thu, Sep 13, 2018 at 10:52:01PM CEST, amritha.nambiar@intel.com wrote:
>This patch introduces a TC range classifier to support filtering based
>on ranges. Only port-range filters are supported currently. This can
>be combined with flower classifier to support filters that are a
>combination of port-ranges and other parameters based on existing
>fields supported by cls_flower. The 'goto chain' action can be used to
>combine the flower and range filter.
>The filter precedence is decided based on the 'prio' value.

For example, the Spectrum ASIC supports mask-based and range-based matching
in a single TCAM rule. No chains needed. Also, I don't really understand
why this is a separate cls. I believe that this functionality should be
added as an extension of the existing cls_flower.

^ permalink raw reply

* Re: [PATCH 7/7] MIPS: mscc: add PCB120 to the ocelot fitImage
From: Alexandre Belloni @ 2018-09-14 15:00 UTC (permalink / raw)
  To: Quentin Schulz
  Cc: ralf, paul.burton, jhogan, robh+dt, mark.rutland, davem, andrew,
	f.fainelli, allan.nielsen, linux-mips, devicetree, linux-kernel,
	netdev, thomas.petazzoni, antoine.tenart
In-Reply-To: <87ab2f80e3942dfca4eab896ba087e7b69bb7b12.1536916714.git-series.quentin.schulz@bootlin.com>

On 14/09/2018 11:44:28+0200, Quentin Schulz wrote:
> PCB120 and PCB123 are both development boards based on Microsemi Ocelot
> so let's use the same fitImage for both.
> 
> Signed-off-by: Quentin Schulz <quentin.schulz@bootlin.com>
Reviewed-by: Alexandre Belloni <alexandre.belloni@bootlin.com>

> ---
>  arch/mips/generic/Kconfig                   |  6 +--
>  arch/mips/generic/Platform                  |  2 +-
>  arch/mips/generic/board-ocelot.its.S        | 40 ++++++++++++++++++++++-
>  arch/mips/generic/board-ocelot_pcb123.its.S | 23 +-------------
>  4 files changed, 44 insertions(+), 27 deletions(-)
>  create mode 100644 arch/mips/generic/board-ocelot.its.S
>  delete mode 100644 arch/mips/generic/board-ocelot_pcb123.its.S
> 
> diff --git a/arch/mips/generic/Kconfig b/arch/mips/generic/Kconfig
> index 08e33c6..fd60198 100644
> --- a/arch/mips/generic/Kconfig
> +++ b/arch/mips/generic/Kconfig
> @@ -65,11 +65,11 @@ config FIT_IMAGE_FDT_XILFPGA
>  	  Enable this to include the FDT for the MIPSfpga platform
>  	  from Imagination Technologies in the FIT kernel image.
>  
> -config FIT_IMAGE_FDT_OCELOT_PCB123
> -	bool "Include FDT for Microsemi Ocelot PCB123"
> +config FIT_IMAGE_FDT_OCELOT
> +	bool "Include FDT for Microsemi Ocelot development platforms"
>  	select MSCC_OCELOT
>  	help
> -	  Enable this to include the FDT for the Ocelot PCB123 platform
> +	  Enable this to include the FDT for the Ocelot development platforms
>  	  from Microsemi in the FIT kernel image.
>  	  This requires u-boot on the platform.
>  
> diff --git a/arch/mips/generic/Platform b/arch/mips/generic/Platform
> index 879cb80..eaa19d1 100644
> --- a/arch/mips/generic/Platform
> +++ b/arch/mips/generic/Platform
> @@ -16,5 +16,5 @@ all-$(CONFIG_MIPS_GENERIC)	:= vmlinux.gz.itb
>  its-y					:= vmlinux.its.S
>  its-$(CONFIG_FIT_IMAGE_FDT_BOSTON)	+= board-boston.its.S
>  its-$(CONFIG_FIT_IMAGE_FDT_NI169445)	+= board-ni169445.its.S
> -its-$(CONFIG_FIT_IMAGE_FDT_OCELOT_PCB123) += board-ocelot_pcb123.its.S
> +its-$(CONFIG_FIT_IMAGE_FDT_OCELOT)	+= board-ocelot.its.S
>  its-$(CONFIG_FIT_IMAGE_FDT_XILFPGA)	+= board-xilfpga.its.S
> diff --git a/arch/mips/generic/board-ocelot.its.S b/arch/mips/generic/board-ocelot.its.S
> new file mode 100644
> index 0000000..3da2398
> --- /dev/null
> +++ b/arch/mips/generic/board-ocelot.its.S
> @@ -0,0 +1,40 @@
> +/* SPDX-License-Identifier: (GPL-2.0 OR MIT) */
> +/ {
> +	images {
> +		fdt@ocelot_pcb123 {
> +			description = "MSCC Ocelot PCB123 Device Tree";
> +			data = /incbin/("boot/dts/mscc/ocelot_pcb123.dtb");
> +			type = "flat_dt";
> +			arch = "mips";
> +			compression = "none";
> +			hash@0 {
> +				algo = "sha1";
> +			};
> +		};
> +
> +		fdt@ocelot_pcb120 {
> +			description = "MSCC Ocelot PCB120 Device Tree";
> +			data = /incbin/("boot/dts/mscc/ocelot_pcb120.dtb");
> +			type = "flat_dt";
> +			arch = "mips";
> +			compression = "none";
> +			hash@0 {
> +				algo = "sha1";
> +			};
> +		};
> +	};
> +
> +	configurations {
> +		conf@ocelot_pcb123 {
> +			description = "Ocelot Linux kernel";
> +			kernel = "kernel@0";
> +			fdt = "fdt@ocelot_pcb123";
> +		};
> +
> +		conf@ocelot_pcb120 {
> +			description = "Ocelot Linux kernel";
> +			kernel = "kernel@0";
> +			fdt = "fdt@ocelot_pcb120";
> +		};
> +	};
> +};
> diff --git a/arch/mips/generic/board-ocelot_pcb123.its.S b/arch/mips/generic/board-ocelot_pcb123.its.S
> deleted file mode 100644
> index 5a7d5e1..0000000
> --- a/arch/mips/generic/board-ocelot_pcb123.its.S
> +++ /dev/null
> @@ -1,23 +0,0 @@
> -/* SPDX-License-Identifier: (GPL-2.0 OR MIT) */
> -/ {
> -	images {
> -		fdt@ocelot_pcb123 {
> -			description = "MSCC Ocelot PCB123 Device Tree";
> -			data = /incbin/("boot/dts/mscc/ocelot_pcb123.dtb");
> -			type = "flat_dt";
> -			arch = "mips";
> -			compression = "none";
> -			hash@0 {
> -				algo = "sha1";
> -			};
> -		};
> -	};
> -
> -	configurations {
> -		conf@ocelot_pcb123 {
> -			description = "Ocelot Linux kernel";
> -			kernel = "kernel@0";
> -			fdt = "fdt@ocelot_pcb123";
> -		};
> -	};
> -};
> -- 
> git-series 0.9.1

-- 
Alexandre Belloni, Bootlin
Embedded Linux and Kernel engineering
https://bootlin.com

^ permalink raw reply

* Re: [PATCH 6/7] MIPS: mscc: add DT for Ocelot PCB120
From: Alexandre Belloni @ 2018-09-14 14:58 UTC (permalink / raw)
  To: Quentin Schulz
  Cc: ralf, paul.burton, jhogan, robh+dt, mark.rutland, davem, andrew,
	f.fainelli, allan.nielsen, linux-mips, devicetree, linux-kernel,
	netdev, thomas.petazzoni, antoine.tenart
In-Reply-To: <f2ef1137991cabde5e9529403982d84b5b0fe0a6.1536916714.git-series.quentin.schulz@bootlin.com>

On 14/09/2018 11:44:27+0200, Quentin Schulz wrote:
> The Ocelot PCB120 evaluation board is different from the PCB123 in that
> it has 4 external VSC8584 (or VSC8574) PHYs.
> 
> It uses the SoC's second MDIO bus for external PHYs which have a
> reversed address on the bus (i.e. PHY4 is on address 3, PHY5 is on
> address 2, PHY6 on 1 and PHY7 on 0).
> 
> Here is how the PHYs are connected to the switch ports:
> port 0: phy0 (internal)
> port 1: phy1 (internal)
> port 2: phy2 (internal)
> port 3: phy3 (internal)
> port 4: phy7
> port 5: phy4
> port 6: phy6
> port 9: phy5
> 
> Signed-off-by: Quentin Schulz <quentin.schulz@bootlin.com>
Reviewed-by: Alexandre Belloni <alexandre.belloni@bootlin.com>

> ---
>  arch/mips/boot/dts/mscc/Makefile          |   2 +-
>  arch/mips/boot/dts/mscc/ocelot_pcb120.dts | 100 +++++++++++++++++++++++-
>  2 files changed, 101 insertions(+), 1 deletion(-)
>  create mode 100644 arch/mips/boot/dts/mscc/ocelot_pcb120.dts
> 
> diff --git a/arch/mips/boot/dts/mscc/Makefile b/arch/mips/boot/dts/mscc/Makefile
> index 9a9bb7e..ec6f5b2 100644
> --- a/arch/mips/boot/dts/mscc/Makefile
> +++ b/arch/mips/boot/dts/mscc/Makefile
> @@ -1,3 +1,3 @@
> -dtb-$(CONFIG_MSCC_OCELOT)	+= ocelot_pcb123.dtb
> +dtb-$(CONFIG_MSCC_OCELOT)	+= ocelot_pcb123.dtb ocelot_pcb120.dtb
>  
>  obj-$(CONFIG_BUILTIN_DTB)	+= $(addsuffix .o, $(dtb-y))
> diff --git a/arch/mips/boot/dts/mscc/ocelot_pcb120.dts b/arch/mips/boot/dts/mscc/ocelot_pcb120.dts
> new file mode 100644
> index 0000000..8eb03a5
> --- /dev/null
> +++ b/arch/mips/boot/dts/mscc/ocelot_pcb120.dts
> @@ -0,0 +1,100 @@
> +// SPDX-License-Identifier: (GPL-2.0 OR MIT)
> +/* Copyright (c) 2017 Microsemi Corporation */
> +
> +/dts-v1/;
> +
> +#include <dt-bindings/interrupt-controller/irq.h>
> +#include <dt-bindings/phy/phy-ocelot-serdes.h>
> +#include "ocelot.dtsi"
> +
> +/ {
> +	compatible = "mscc,ocelot-pcb120", "mscc,ocelot";
> +
> +	chosen {
> +		stdout-path = "serial0:115200n8";
> +	};
> +
> +	memory@0 {
> +		device_type = "memory";
> +		reg = <0x0 0x0e000000>;
> +	};
> +};
> +
> +&mdio0 {
> +	status = "okay";
> +};
> +
> +&mdio1 {
> +	status = "okay";
> +	pinctrl-names = "default";
> +	pinctrl-0 = <&miim1>, <&gpio4>;
> +
> +	phy7: ethernet-phy@0 {
> +		reg = <0>;
> +		interrupts = <4 IRQ_TYPE_LEVEL_HIGH>;
> +		interrupt-parent = <&gpio>;
> +	};
> +	phy6: ethernet-phy@1 {
> +		reg = <1>;
> +		interrupts = <4 IRQ_TYPE_LEVEL_HIGH>;
> +		interrupt-parent = <&gpio>;
> +	};
> +	phy5: ethernet-phy@2 {
> +		reg = <2>;
> +		interrupts = <4 IRQ_TYPE_LEVEL_HIGH>;
> +		interrupt-parent = <&gpio>;
> +	};
> +	phy4: ethernet-phy@3 {
> +		reg = <3>;
> +		interrupts = <4 IRQ_TYPE_LEVEL_HIGH>;
> +		interrupt-parent = <&gpio>;
> +	};
> +};
> +
> +&port0 {
> +	phy-handle = <&phy0>;
> +};
> +
> +&port1 {
> +	phy-handle = <&phy1>;
> +};
> +
> +&port2 {
> +	phy-handle = <&phy2>;
> +};
> +
> +&port3 {
> +	phy-handle = <&phy3>;
> +};
> +
> +&port4 {
> +	phy-handle = <&phy7>;
> +	phy-mode = "sgmii";
> +	phys = <&serdes 4 SERDES1G_2>;
> +};
> +
> +&port5 {
> +	phy-handle = <&phy4>;
> +	phy-mode = "sgmii";
> +	phys = <&serdes 5 SERDES1G_5>;
> +};
> +
> +&port6 {
> +	phy-handle = <&phy6>;
> +	phy-mode = "sgmii";
> +	phys = <&serdes 6 SERDES1G_3>;
> +};
> +
> +&port9 {
> +	phy-handle = <&phy5>;
> +	phy-mode = "sgmii";
> +	phys = <&serdes 9 SERDES1G_4>;
> +};
> +
> +&uart0 {
> +	status = "okay";
> +};
> +
> +&uart2 {
> +	status = "okay";
> +};
> -- 
> git-series 0.9.1

-- 
Alexandre Belloni, Bootlin
Embedded Linux and Kernel engineering
https://bootlin.com

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox