Netdev List
 help / color / mirror / Atom feed
* [PATCH v2 07/17] zinc: Poly1305 generic C implementation and selftest
From: Jason A. Donenfeld @ 2018-08-24 21:38 UTC (permalink / raw)
  To: linux-kernel, netdev, davem
  Cc: Jason A. Donenfeld, Andy Lutomirski, Greg KH, Samuel Neves,
	Jean-Philippe Aumasson, Andy Polyakov, linux-crypto
In-Reply-To: <20180824213849.23647-1-Jason@zx2c4.com>

The C implementation is based on Andy Polyakov's implementation, heavily
modified by Samuel Neves.

Information: https://cr.yp.to/mac.html

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Samuel Neves <sneves@dei.uc.pt>
Cc: Jean-Philippe Aumasson <jeanphilippe.aumasson@gmail.com>
Cc: Andy Polyakov <appro@openssl.org>
Cc: linux-crypto@vger.kernel.org
---
 include/zinc/poly1305.h      |   35 +
 lib/zinc/Kconfig             |    4 +
 lib/zinc/Makefile            |    4 +
 lib/zinc/main.c              |    5 +
 lib/zinc/poly1305/poly1305.c |  257 ++++++
 lib/zinc/selftest/poly1305.h | 1566 ++++++++++++++++++++++++++++++++++
 6 files changed, 1871 insertions(+)
 create mode 100644 include/zinc/poly1305.h
 create mode 100644 lib/zinc/poly1305/poly1305.c
 create mode 100644 lib/zinc/selftest/poly1305.h

diff --git a/include/zinc/poly1305.h b/include/zinc/poly1305.h
new file mode 100644
index 000000000000..9fcae37a41ee
--- /dev/null
+++ b/include/zinc/poly1305.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#ifndef _ZINC_POLY1305_H
+#define _ZINC_POLY1305_H
+
+#include <linux/simd.h>
+#include <linux/types.h>
+
+enum poly1305_lengths {
+	POLY1305_BLOCK_SIZE = 16,
+	POLY1305_KEY_SIZE = 32,
+	POLY1305_MAC_SIZE = 16
+};
+
+struct poly1305_ctx {
+	u8 opaque[24 * sizeof(u64)];
+	u32 nonce[4];
+	u8 data[POLY1305_BLOCK_SIZE];
+	size_t num;
+} __aligned(8);
+
+void poly1305_fpu_init(void);
+
+void poly1305_init(struct poly1305_ctx *ctx, const u8 key[POLY1305_KEY_SIZE], simd_context_t simd_context);
+void poly1305_update(struct poly1305_ctx *ctx, const u8 *inp, const size_t len, simd_context_t simd_context);
+void poly1305_finish(struct poly1305_ctx *ctx, u8 mac[POLY1305_MAC_SIZE], simd_context_t simd_context);
+
+#ifdef CONFIG_ZINC_DEBUG
+bool poly1305_selftest(void);
+#endif
+
+#endif /* _ZINC_POLY1305_H */
diff --git a/lib/zinc/Kconfig b/lib/zinc/Kconfig
index 5311a0d6ba8b..86936739a05f 100644
--- a/lib/zinc/Kconfig
+++ b/lib/zinc/Kconfig
@@ -10,6 +10,10 @@ config ZINC_CHACHA20
 	bool
 	select ZINC
 
+config ZINC_POLY1305
+	bool
+	select ZINC
+
 config ZINC_DEBUG
 	bool "Zinc cryptography library debugging and self-tests"
 	depends on ZINC
diff --git a/lib/zinc/Makefile b/lib/zinc/Makefile
index 3ba830b51695..bf7b3a75f4c2 100644
--- a/lib/zinc/Makefile
+++ b/lib/zinc/Makefile
@@ -22,6 +22,10 @@ CFLAGS_chacha20.o += -include $(srctree)/$(src)/chacha20/chacha20-mips-glue.h
 endif
 endif
 
+ifeq ($(CONFIG_ZINC_POLY1305),y)
+zinc-y += poly1305/poly1305.o
+endif
+
 zinc-y += main.o
 
 obj-$(CONFIG_ZINC) := zinc.o
diff --git a/lib/zinc/main.c b/lib/zinc/main.c
index 79b85045f22f..80a2e9f963f7 100644
--- a/lib/zinc/main.c
+++ b/lib/zinc/main.c
@@ -4,6 +4,7 @@
  */
 
 #include <zinc/chacha20.h>
+#include <zinc/poly1305.h>
 
 #include <linux/init.h>
 #include <linux/module.h>
@@ -21,6 +22,10 @@ static int __init mod_init(void)
 {
 #ifdef CONFIG_ZINC_CHACHA20
 	chacha20_fpu_init();
+#endif
+#ifdef CONFIG_ZINC_POLY1305
+	poly1305_fpu_init();
+	selftest(poly1305);
 #endif
 	return 0;
 }
diff --git a/lib/zinc/poly1305/poly1305.c b/lib/zinc/poly1305/poly1305.c
new file mode 100644
index 000000000000..fd4ad5bbb76c
--- /dev/null
+++ b/lib/zinc/poly1305/poly1305.c
@@ -0,0 +1,257 @@
+/* SPDX-License-Identifier: OpenSSL OR (BSD-3-Clause OR GPL-2.0)
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ * Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
+ */
+
+#include <zinc/poly1305.h>
+
+#include <asm/unaligned.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+
+#ifndef HAVE_POLY1305_ARCH_IMPLEMENTATION
+static inline bool poly1305_init_arch(void *ctx, const u8 key[POLY1305_KEY_SIZE], simd_context_t simd_context) { return false; }
+static inline bool poly1305_blocks_arch(void *ctx, const u8 *inp, const size_t len, const u32 padbit, simd_context_t simd_context) { return false; }
+static inline bool poly1305_emit_arch(void *ctx, u8 mac[POLY1305_MAC_SIZE], const u32 nonce[4], simd_context_t simd_context) { return false; }
+void __init poly1305_fpu_init(void) { }
+#endif
+
+struct poly1305_internal {
+	u32 h[5];
+	u32 r[4];
+};
+
+static void poly1305_init_generic(void *ctx, const u8 key[16])
+{
+	struct poly1305_internal *st = (struct poly1305_internal *)ctx;
+
+	/* h = 0 */
+	st->h[0] = 0;
+	st->h[1] = 0;
+	st->h[2] = 0;
+	st->h[3] = 0;
+	st->h[4] = 0;
+
+	/* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
+	st->r[0] = get_unaligned_le32(&key[ 0]) & 0x0fffffff;
+	st->r[1] = get_unaligned_le32(&key[ 4]) & 0x0ffffffc;
+	st->r[2] = get_unaligned_le32(&key[ 8]) & 0x0ffffffc;
+	st->r[3] = get_unaligned_le32(&key[12]) & 0x0ffffffc;
+}
+
+static void poly1305_blocks_generic(void *ctx, const u8 *inp, size_t len, const u32 padbit)
+{
+#define CONSTANT_TIME_CARRY(a,b) ((a ^ ((a ^ b) | ((a - b) ^ b))) >> (sizeof(a) * 8 - 1))
+	struct poly1305_internal *st = (struct poly1305_internal *)ctx;
+	u32 r0, r1, r2, r3;
+	u32 s1, s2, s3;
+	u32 h0, h1, h2, h3, h4, c;
+	u64 d0, d1, d2, d3;
+
+	r0 = st->r[0];
+	r1 = st->r[1];
+	r2 = st->r[2];
+	r3 = st->r[3];
+
+	s1 = r1 + (r1 >> 2);
+	s2 = r2 + (r2 >> 2);
+	s3 = r3 + (r3 >> 2);
+
+	h0 = st->h[0];
+	h1 = st->h[1];
+	h2 = st->h[2];
+	h3 = st->h[3];
+	h4 = st->h[4];
+
+	while (len >= POLY1305_BLOCK_SIZE) {
+		/* h += m[i] */
+		h0 = (u32)(d0 = (u64)h0 + (0       ) + get_unaligned_le32(&inp[ 0]));
+		h1 = (u32)(d1 = (u64)h1 + (d0 >> 32) + get_unaligned_le32(&inp[ 4]));
+		h2 = (u32)(d2 = (u64)h2 + (d1 >> 32) + get_unaligned_le32(&inp[ 8]));
+		h3 = (u32)(d3 = (u64)h3 + (d2 >> 32) + get_unaligned_le32(&inp[12]));
+		h4 += (u32)(d3 >> 32) + padbit;
+
+		/* h *= r "%" p, where "%" stands for "partial remainder" */
+		d0 = ((u64)h0 * r0) +
+		     ((u64)h1 * s3) +
+		     ((u64)h2 * s2) +
+		     ((u64)h3 * s1);
+		d1 = ((u64)h0 * r1) +
+		     ((u64)h1 * r0) +
+		     ((u64)h2 * s3) +
+		     ((u64)h3 * s2) +
+		     (h4 * s1);
+		d2 = ((u64)h0 * r2) +
+		     ((u64)h1 * r1) +
+		     ((u64)h2 * r0) +
+		     ((u64)h3 * s3) +
+		     (h4 * s2);
+		d3 = ((u64)h0 * r3) +
+		     ((u64)h1 * r2) +
+		     ((u64)h2 * r1) +
+		     ((u64)h3 * r0) +
+		     (h4 * s3);
+		h4 = (h4 * r0);
+
+		/* last reduction step: */
+		/* a) h4:h0 = h4<<128 + d3<<96 + d2<<64 + d1<<32 + d0 */
+		h0 = (u32)d0;
+		h1 = (u32)(d1 += d0 >> 32);
+		h2 = (u32)(d2 += d1 >> 32);
+		h3 = (u32)(d3 += d2 >> 32);
+		h4 += (u32)(d3 >> 32);
+		/* b) (h4:h0 += (h4:h0>>130) * 5) %= 2^130 */
+		c = (h4 >> 2) + (h4 & ~3U);
+		h4 &= 3;
+		h0 += c;
+		h1 += (c = CONSTANT_TIME_CARRY(h0, c));
+		h2 += (c = CONSTANT_TIME_CARRY(h1, c));
+		h3 += (c = CONSTANT_TIME_CARRY(h2, c));
+		h4 += CONSTANT_TIME_CARRY(h3, c);
+		/*
+		 * Occasional overflows to 3rd bit of h4 are taken care of
+		 * "naturally". If after this point we end up at the top of
+		 * this loop, then the overflow bit will be accounted for
+		 * in next iteration. If we end up in poly1305_emit, then
+		 * comparison to modulus below will still count as "carry
+		 * into 131st bit", so that properly reduced value will be
+		 * picked in conditional move.
+		 */
+
+		inp += POLY1305_BLOCK_SIZE;
+		len -= POLY1305_BLOCK_SIZE;
+	}
+
+	st->h[0] = h0;
+	st->h[1] = h1;
+	st->h[2] = h2;
+	st->h[3] = h3;
+	st->h[4] = h4;
+#undef CONSTANT_TIME_CARRY
+}
+
+static void poly1305_emit_generic(void *ctx, u8 mac[16], const u32 nonce[4])
+{
+	struct poly1305_internal *st = (struct poly1305_internal *)ctx;
+	u32 h0, h1, h2, h3, h4;
+	u32 g0, g1, g2, g3, g4;
+	u64 t;
+	u32 mask;
+
+	h0 = st->h[0];
+	h1 = st->h[1];
+	h2 = st->h[2];
+	h3 = st->h[3];
+	h4 = st->h[4];
+
+	/* compare to modulus by computing h + -p */
+	g0 = (u32)(t = (u64)h0 + 5);
+	g1 = (u32)(t = (u64)h1 + (t >> 32));
+	g2 = (u32)(t = (u64)h2 + (t >> 32));
+	g3 = (u32)(t = (u64)h3 + (t >> 32));
+	g4 = h4 + (u32)(t >> 32);
+
+	/* if there was carry into 131st bit, h3:h0 = g3:g0 */
+	mask = 0 - (g4 >> 2);
+	g0 &= mask;
+	g1 &= mask;
+	g2 &= mask;
+	g3 &= mask;
+	mask = ~mask;
+	h0 = (h0 & mask) | g0;
+	h1 = (h1 & mask) | g1;
+	h2 = (h2 & mask) | g2;
+	h3 = (h3 & mask) | g3;
+
+	/* mac = (h + nonce) % (2^128) */
+	h0 = (u32)(t = (u64)h0 + nonce[0]);
+	h1 = (u32)(t = (u64)h1 + (t >> 32) + nonce[1]);
+	h2 = (u32)(t = (u64)h2 + (t >> 32) + nonce[2]);
+	h3 = (u32)(t = (u64)h3 + (t >> 32) + nonce[3]);
+
+	put_unaligned_le32(h0, &mac[ 0]);
+	put_unaligned_le32(h1, &mac[ 4]);
+	put_unaligned_le32(h2, &mac[ 8]);
+	put_unaligned_le32(h3, &mac[12]);
+}
+
+void poly1305_init(struct poly1305_ctx *ctx, const u8 key[POLY1305_KEY_SIZE], simd_context_t simd_context)
+{
+	ctx->nonce[0] = get_unaligned_le32(&key[16]);
+	ctx->nonce[1] = get_unaligned_le32(&key[20]);
+	ctx->nonce[2] = get_unaligned_le32(&key[24]);
+	ctx->nonce[3] = get_unaligned_le32(&key[28]);
+
+	if (!poly1305_init_arch(ctx->opaque, key, simd_context))
+		poly1305_init_generic(ctx->opaque, key);
+	ctx->num = 0;
+}
+EXPORT_SYMBOL(poly1305_init);
+
+static inline void poly1305_blocks(void *ctx, const u8 *inp, const size_t len, const u32 padbit, simd_context_t simd_context)
+{
+	if (!poly1305_blocks_arch(ctx, inp, len, padbit, simd_context))
+		poly1305_blocks_generic(ctx, inp, len, padbit);
+}
+
+static inline void poly1305_emit(void *ctx, u8 mac[POLY1305_KEY_SIZE], const u32 nonce[4], simd_context_t simd_context)
+{
+	if (!poly1305_emit_arch(ctx, mac, nonce, simd_context))
+		poly1305_emit_generic(ctx, mac, nonce);
+}
+
+void poly1305_update(struct poly1305_ctx *ctx, const u8 *inp, size_t len, simd_context_t simd_context)
+{
+	const size_t num = ctx->num % POLY1305_BLOCK_SIZE;
+	size_t rem;
+
+	if (num) {
+		rem = POLY1305_BLOCK_SIZE - num;
+		if (len >= rem) {
+			memcpy(ctx->data + num, inp, rem);
+			poly1305_blocks(ctx->opaque, ctx->data, POLY1305_BLOCK_SIZE, 1, simd_context);
+			inp += rem;
+			len -= rem;
+		} else {
+			/* Still not enough data to process a block. */
+			memcpy(ctx->data + num, inp, len);
+			ctx->num = num + len;
+			return;
+		}
+	}
+
+	rem = len % POLY1305_BLOCK_SIZE;
+	len -= rem;
+
+	if (len >= POLY1305_BLOCK_SIZE) {
+		poly1305_blocks(ctx->opaque, inp, len, 1, simd_context);
+		inp += len;
+	}
+
+	if (rem)
+		memcpy(ctx->data, inp, rem);
+
+	ctx->num = rem;
+}
+EXPORT_SYMBOL(poly1305_update);
+
+void poly1305_finish(struct poly1305_ctx *ctx, u8 mac[POLY1305_MAC_SIZE], simd_context_t simd_context)
+{
+	size_t num = ctx->num % POLY1305_BLOCK_SIZE;
+
+	if (num) {
+		ctx->data[num++] = 1;   /* pad bit */
+		while (num < POLY1305_BLOCK_SIZE)
+			ctx->data[num++] = 0;
+		poly1305_blocks(ctx->opaque, ctx->data, POLY1305_BLOCK_SIZE, 0, simd_context);
+	}
+
+	poly1305_emit(ctx->opaque, mac, ctx->nonce, simd_context);
+
+	/* zero out the state */
+	memzero_explicit(ctx, sizeof(*ctx));
+}
+EXPORT_SYMBOL(poly1305_finish);
+
+#include "../selftest/poly1305.h"
diff --git a/lib/zinc/selftest/poly1305.h b/lib/zinc/selftest/poly1305.h
new file mode 100644
index 000000000000..f9fbe41b75e4
--- /dev/null
+++ b/lib/zinc/selftest/poly1305.h
@@ -0,0 +1,1566 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ * Copyright (C) 2016-2017 The OpenSSL Project Authors. All Rights Reserved.
+ */
+
+#ifdef CONFIG_ZINC_DEBUG
+
+struct poly1305_testdata {
+	size_t size;
+	const u8 data[1024];
+};
+
+struct poly1305_testvec {
+	struct poly1305_testdata input, key, expected;
+};
+
+static const struct poly1305_testvec poly1305_testvecs[] = {
+	/*
+	 * RFC7539
+	 */
+	{
+		{
+			34,
+			{
+				0x43, 0x72, 0x79, 0x70, 0x74, 0x6f, 0x67, 0x72,
+				0x61, 0x70, 0x68, 0x69, 0x63, 0x20, 0x46, 0x6f,
+				0x72, 0x75, 0x6d, 0x20, 0x52, 0x65, 0x73, 0x65,
+				0x61, 0x72, 0x63, 0x68, 0x20, 0x47, 0x72, 0x6f,
+
+				0x75, 0x70
+			}
+		},
+		{
+			32,
+			{
+				0x85, 0xd6, 0xbe, 0x78, 0x57, 0x55, 0x6d, 0x33,
+				0x7f, 0x44, 0x52, 0xfe, 0x42, 0xd5, 0x06, 0xa8,
+				0x01, 0x03, 0x80, 0x8a, 0xfb, 0x0d, 0xb2, 0xfd,
+				0x4a, 0xbf, 0xf6, 0xaf, 0x41, 0x49, 0xf5, 0x1b
+			}
+		},
+		{
+			16,
+			{
+				0xa8, 0x06, 0x1d, 0xc1, 0x30, 0x51, 0x36, 0xc6,
+				0xc2, 0x2b, 0x8b, 0xaf, 0x0c, 0x01, 0x27, 0xa9
+			}
+		}
+	},
+	/*
+	 * test vectors from "The Poly1305-AES message-authentication code"
+	 */
+	{
+		{
+			2,
+			{
+				0xf3, 0xf6
+			}
+		},
+		{
+			32,
+			{
+				0x85, 0x1f, 0xc4, 0x0c, 0x34, 0x67, 0xac, 0x0b,
+				0xe0, 0x5c, 0xc2, 0x04, 0x04, 0xf3, 0xf7, 0x00,
+				0x58, 0x0b, 0x3b, 0x0f, 0x94, 0x47, 0xbb, 0x1e,
+				0x69, 0xd0, 0x95, 0xb5, 0x92, 0x8b, 0x6d, 0xbc
+			}
+		},
+		{
+			16,
+			{
+				0xf4, 0xc6, 0x33, 0xc3, 0x04, 0x4f, 0xc1, 0x45,
+				0xf8, 0x4f, 0x33, 0x5c, 0xb8, 0x19, 0x53, 0xde
+			}
+		}
+	},
+	{
+		{
+			0,
+			{
+				0
+			}
+		},
+		{
+			32,
+			{
+				0xa0, 0xf3, 0x08, 0x00, 0x00, 0xf4, 0x64, 0x00,
+				0xd0, 0xc7, 0xe9, 0x07, 0x6c, 0x83, 0x44, 0x03,
+				0xdd, 0x3f, 0xab, 0x22, 0x51, 0xf1, 0x1a, 0xc7,
+				0x59, 0xf0, 0x88, 0x71, 0x29, 0xcc, 0x2e, 0xe7
+			}
+		},
+		{
+			16,
+			{
+				0xdd, 0x3f, 0xab, 0x22, 0x51, 0xf1, 0x1a, 0xc7,
+				0x59, 0xf0, 0x88, 0x71, 0x29, 0xcc, 0x2e, 0xe7
+			}
+		}
+	},
+	{
+		{
+			32,
+			{
+				0x66, 0x3c, 0xea, 0x19, 0x0f, 0xfb, 0x83, 0xd8,
+				0x95, 0x93, 0xf3, 0xf4, 0x76, 0xb6, 0xbc, 0x24,
+				0xd7, 0xe6, 0x79, 0x10, 0x7e, 0xa2, 0x6a, 0xdb,
+				0x8c, 0xaf, 0x66, 0x52, 0xd0, 0x65, 0x61, 0x36
+			}
+		},
+		{
+			32,
+			{
+				0x48, 0x44, 0x3d, 0x0b, 0xb0, 0xd2, 0x11, 0x09,
+				0xc8, 0x9a, 0x10, 0x0b, 0x5c, 0xe2, 0xc2, 0x08,
+				0x83, 0x14, 0x9c, 0x69, 0xb5, 0x61, 0xdd, 0x88,
+				0x29, 0x8a, 0x17, 0x98, 0xb1, 0x07, 0x16, 0xef
+			}
+		},
+		{
+			16,
+			{
+				0x0e, 0xe1, 0xc1, 0x6b, 0xb7, 0x3f, 0x0f, 0x4f,
+				0xd1, 0x98, 0x81, 0x75, 0x3c, 0x01, 0xcd, 0xbe
+			}
+		}
+	},
+	{
+		{
+			63,
+			{
+				0xab, 0x08, 0x12, 0x72, 0x4a, 0x7f, 0x1e, 0x34,
+				0x27, 0x42, 0xcb, 0xed, 0x37, 0x4d, 0x94, 0xd1,
+				0x36, 0xc6, 0xb8, 0x79, 0x5d, 0x45, 0xb3, 0x81,
+				0x98, 0x30, 0xf2, 0xc0, 0x44, 0x91, 0xfa, 0xf0,
+
+				0x99, 0x0c, 0x62, 0xe4, 0x8b, 0x80, 0x18, 0xb2,
+				0xc3, 0xe4, 0xa0, 0xfa, 0x31, 0x34, 0xcb, 0x67,
+				0xfa, 0x83, 0xe1, 0x58, 0xc9, 0x94, 0xd9, 0x61,
+				0xc4, 0xcb, 0x21, 0x09, 0x5c, 0x1b, 0xf9
+			}
+		},
+		{
+			32,
+			{
+				0x12, 0x97, 0x6a, 0x08, 0xc4, 0x42, 0x6d, 0x0c,
+				0xe8, 0xa8, 0x24, 0x07, 0xc4, 0xf4, 0x82, 0x07,
+				0x80, 0xf8, 0xc2, 0x0a, 0xa7, 0x12, 0x02, 0xd1,
+				0xe2, 0x91, 0x79, 0xcb, 0xcb, 0x55, 0x5a, 0x57
+			}
+		},
+		{
+			16,
+			{
+				0x51, 0x54, 0xad, 0x0d, 0x2c, 0xb2, 0x6e, 0x01,
+				0x27, 0x4f, 0xc5, 0x11, 0x48, 0x49, 0x1f, 0x1b
+			}
+		},
+	},
+	/*
+	 * self-generated vectors exercise "significant" lengths, such that
+	 * are handled by different code paths
+	 */
+	{
+		{
+			64,
+			{
+				0xab, 0x08, 0x12, 0x72, 0x4a, 0x7f, 0x1e, 0x34,
+				0x27, 0x42, 0xcb, 0xed, 0x37, 0x4d, 0x94, 0xd1,
+				0x36, 0xc6, 0xb8, 0x79, 0x5d, 0x45, 0xb3, 0x81,
+				0x98, 0x30, 0xf2, 0xc0, 0x44, 0x91, 0xfa, 0xf0,
+
+				0x99, 0x0c, 0x62, 0xe4, 0x8b, 0x80, 0x18, 0xb2,
+				0xc3, 0xe4, 0xa0, 0xfa, 0x31, 0x34, 0xcb, 0x67,
+				0xfa, 0x83, 0xe1, 0x58, 0xc9, 0x94, 0xd9, 0x61,
+				0xc4, 0xcb, 0x21, 0x09, 0x5c, 0x1b, 0xf9, 0xaf
+			}
+		},
+		{
+			32,
+			{
+				0x12, 0x97, 0x6a, 0x08, 0xc4, 0x42, 0x6d, 0x0c,
+				0xe8, 0xa8, 0x24, 0x07, 0xc4, 0xf4, 0x82, 0x07,
+				0x80, 0xf8, 0xc2, 0x0a, 0xa7, 0x12, 0x02, 0xd1,
+				0xe2, 0x91, 0x79, 0xcb, 0xcb, 0x55, 0x5a, 0x57
+			}
+		},
+		{
+			16,
+			{
+				0x81, 0x20, 0x59, 0xa5, 0xda, 0x19, 0x86, 0x37,
+				0xca, 0xc7, 0xc4, 0xa6, 0x31, 0xbe, 0xe4, 0x66
+			}
+		},
+	},
+	{
+		{
+			48,
+			{
+				0xab, 0x08, 0x12, 0x72, 0x4a, 0x7f, 0x1e, 0x34,
+				0x27, 0x42, 0xcb, 0xed, 0x37, 0x4d, 0x94, 0xd1,
+				0x36, 0xc6, 0xb8, 0x79, 0x5d, 0x45, 0xb3, 0x81,
+				0x98, 0x30, 0xf2, 0xc0, 0x44, 0x91, 0xfa, 0xf0,
+
+				0x99, 0x0c, 0x62, 0xe4, 0x8b, 0x80, 0x18, 0xb2,
+				0xc3, 0xe4, 0xa0, 0xfa, 0x31, 0x34, 0xcb, 0x67
+			}
+		},
+		{
+			32,
+			{
+				0x12, 0x97, 0x6a, 0x08, 0xc4, 0x42, 0x6d, 0x0c,
+				0xe8, 0xa8, 0x24, 0x07, 0xc4, 0xf4, 0x82, 0x07,
+				0x80, 0xf8, 0xc2, 0x0a, 0xa7, 0x12, 0x02, 0xd1,
+				0xe2, 0x91, 0x79, 0xcb, 0xcb, 0x55, 0x5a, 0x57
+
+			}
+		},
+		{
+			16,
+			{
+				0x5b, 0x88, 0xd7, 0xf6, 0x22, 0x8b, 0x11, 0xe2,
+				0xe2, 0x85, 0x79, 0xa5, 0xc0, 0xc1, 0xf7, 0x61
+			}
+		},
+	},
+	{
+		{
+			96,
+			{
+				0xab, 0x08, 0x12, 0x72, 0x4a, 0x7f, 0x1e, 0x34,
+				0x27, 0x42, 0xcb, 0xed, 0x37, 0x4d, 0x94, 0xd1,
+				0x36, 0xc6, 0xb8, 0x79, 0x5d, 0x45, 0xb3, 0x81,
+				0x98, 0x30, 0xf2, 0xc0, 0x44, 0x91, 0xfa, 0xf0,
+
+				0x99, 0x0c, 0x62, 0xe4, 0x8b, 0x80, 0x18, 0xb2,
+				0xc3, 0xe4, 0xa0, 0xfa, 0x31, 0x34, 0xcb, 0x67,
+				0xfa, 0x83, 0xe1, 0x58, 0xc9, 0x94, 0xd9, 0x61,
+				0xc4, 0xcb, 0x21, 0x09, 0x5c, 0x1b, 0xf9, 0xaf,
+
+				0x66, 0x3c, 0xea, 0x19, 0x0f, 0xfb, 0x83, 0xd8,
+				0x95, 0x93, 0xf3, 0xf4, 0x76, 0xb6, 0xbc, 0x24,
+				0xd7, 0xe6, 0x79, 0x10, 0x7e, 0xa2, 0x6a, 0xdb,
+				0x8c, 0xaf, 0x66, 0x52, 0xd0, 0x65, 0x61, 0x36
+			}
+		},
+		{
+			32,
+			{
+				0x12, 0x97, 0x6a, 0x08, 0xc4, 0x42, 0x6d, 0x0c,
+				0xe8, 0xa8, 0x24, 0x07, 0xc4, 0xf4, 0x82, 0x07,
+				0x80, 0xf8, 0xc2, 0x0a, 0xa7, 0x12, 0x02, 0xd1,
+				0xe2, 0x91, 0x79, 0xcb, 0xcb, 0x55, 0x5a, 0x57
+			}
+		},
+		{
+			16,
+			{
+				0xbb, 0xb6, 0x13, 0xb2, 0xb6, 0xd7, 0x53, 0xba,
+				0x07, 0x39, 0x5b, 0x91, 0x6a, 0xae, 0xce, 0x15
+			}
+		},
+	},
+	{
+		{
+			112,
+			{
+				0xab, 0x08, 0x12, 0x72, 0x4a, 0x7f, 0x1e, 0x34,
+				0x27, 0x42, 0xcb, 0xed, 0x37, 0x4d, 0x94, 0xd1,
+				0x36, 0xc6, 0xb8, 0x79, 0x5d, 0x45, 0xb3, 0x81,
+				0x98, 0x30, 0xf2, 0xc0, 0x44, 0x91, 0xfa, 0xf0,
+
+				0x99, 0x0c, 0x62, 0xe4, 0x8b, 0x80, 0x18, 0xb2,
+				0xc3, 0xe4, 0xa0, 0xfa, 0x31, 0x34, 0xcb, 0x67,
+				0xfa, 0x83, 0xe1, 0x58, 0xc9, 0x94, 0xd9, 0x61,
+				0xc4, 0xcb, 0x21, 0x09, 0x5c, 0x1b, 0xf9, 0xaf,
+
+				0x48, 0x44, 0x3d, 0x0b, 0xb0, 0xd2, 0x11, 0x09,
+				0xc8, 0x9a, 0x10, 0x0b, 0x5c, 0xe2, 0xc2, 0x08,
+				0x83, 0x14, 0x9c, 0x69, 0xb5, 0x61, 0xdd, 0x88,
+				0x29, 0x8a, 0x17, 0x98, 0xb1, 0x07, 0x16, 0xef,
+
+				0x66, 0x3c, 0xea, 0x19, 0x0f, 0xfb, 0x83, 0xd8,
+				0x95, 0x93, 0xf3, 0xf4, 0x76, 0xb6, 0xbc, 0x24
+			}
+		},
+		{
+			32,
+			{
+				0x12, 0x97, 0x6a, 0x08, 0xc4, 0x42, 0x6d, 0x0c,
+				0xe8, 0xa8, 0x24, 0x07, 0xc4, 0xf4, 0x82, 0x07,
+				0x80, 0xf8, 0xc2, 0x0a, 0xa7, 0x12, 0x02, 0xd1,
+				0xe2, 0x91, 0x79, 0xcb, 0xcb, 0x55, 0x5a, 0x57
+			}
+		},
+		{
+			16,
+			{
+				0xc7, 0x94, 0xd7, 0x05, 0x7d, 0x17, 0x78, 0xc4,
+				0xbb, 0xee, 0x0a, 0x39, 0xb3, 0xd9, 0x73, 0x42
+			}
+		},
+	},
+	{
+		{
+			128,
+			{
+				0xab, 0x08, 0x12, 0x72, 0x4a, 0x7f, 0x1e, 0x34,
+				0x27, 0x42, 0xcb, 0xed, 0x37, 0x4d, 0x94, 0xd1,
+				0x36, 0xc6, 0xb8, 0x79, 0x5d, 0x45, 0xb3, 0x81,
+				0x98, 0x30, 0xf2, 0xc0, 0x44, 0x91, 0xfa, 0xf0,
+
+				0x99, 0x0c, 0x62, 0xe4, 0x8b, 0x80, 0x18, 0xb2,
+				0xc3, 0xe4, 0xa0, 0xfa, 0x31, 0x34, 0xcb, 0x67,
+				0xfa, 0x83, 0xe1, 0x58, 0xc9, 0x94, 0xd9, 0x61,
+				0xc4, 0xcb, 0x21, 0x09, 0x5c, 0x1b, 0xf9, 0xaf,
+
+				0x48, 0x44, 0x3d, 0x0b, 0xb0, 0xd2, 0x11, 0x09,
+				0xc8, 0x9a, 0x10, 0x0b, 0x5c, 0xe2, 0xc2, 0x08,
+				0x83, 0x14, 0x9c, 0x69, 0xb5, 0x61, 0xdd, 0x88,
+				0x29, 0x8a, 0x17, 0x98, 0xb1, 0x07, 0x16, 0xef,
+
+				0x66, 0x3c, 0xea, 0x19, 0x0f, 0xfb, 0x83, 0xd8,
+				0x95, 0x93, 0xf3, 0xf4, 0x76, 0xb6, 0xbc, 0x24,
+				0xd7, 0xe6, 0x79, 0x10, 0x7e, 0xa2, 0x6a, 0xdb,
+				0x8c, 0xaf, 0x66, 0x52, 0xd0, 0x65, 0x61, 0x36
+			}
+		},
+		{
+			32,
+			{
+				0x12, 0x97, 0x6a, 0x08, 0xc4, 0x42, 0x6d, 0x0c,
+				0xe8, 0xa8, 0x24, 0x07, 0xc4, 0xf4, 0x82, 0x07,
+				0x80, 0xf8, 0xc2, 0x0a, 0xa7, 0x12, 0x02, 0xd1,
+				0xe2, 0x91, 0x79, 0xcb, 0xcb, 0x55, 0x5a, 0x57
+			}
+		},
+		{
+			16,
+			{
+				0xff, 0xbc, 0xb9, 0xb3, 0x71, 0x42, 0x31, 0x52,
+				0xd7, 0xfc, 0xa5, 0xad, 0x04, 0x2f, 0xba, 0xa9
+			}
+		},
+	},
+	{
+		{
+			144,
+			{
+				0xab, 0x08, 0x12, 0x72, 0x4a, 0x7f, 0x1e, 0x34,
+				0x27, 0x42, 0xcb, 0xed, 0x37, 0x4d, 0x94, 0xd1,
+				0x36, 0xc6, 0xb8, 0x79, 0x5d, 0x45, 0xb3, 0x81,
+				0x98, 0x30, 0xf2, 0xc0, 0x44, 0x91, 0xfa, 0xf0,
+
+				0x99, 0x0c, 0x62, 0xe4, 0x8b, 0x80, 0x18, 0xb2,
+				0xc3, 0xe4, 0xa0, 0xfa, 0x31, 0x34, 0xcb, 0x67,
+				0xfa, 0x83, 0xe1, 0x58, 0xc9, 0x94, 0xd9, 0x61,
+				0xc4, 0xcb, 0x21, 0x09, 0x5c, 0x1b, 0xf9, 0xaf,
+
+				0x48, 0x44, 0x3d, 0x0b, 0xb0, 0xd2, 0x11, 0x09,
+				0xc8, 0x9a, 0x10, 0x0b, 0x5c, 0xe2, 0xc2, 0x08,
+				0x83, 0x14, 0x9c, 0x69, 0xb5, 0x61, 0xdd, 0x88,
+				0x29, 0x8a, 0x17, 0x98, 0xb1, 0x07, 0x16, 0xef,
+
+				0x66, 0x3c, 0xea, 0x19, 0x0f, 0xfb, 0x83, 0xd8,
+				0x95, 0x93, 0xf3, 0xf4, 0x76, 0xb6, 0xbc, 0x24,
+				0xd7, 0xe6, 0x79, 0x10, 0x7e, 0xa2, 0x6a, 0xdb,
+				0x8c, 0xaf, 0x66, 0x52, 0xd0, 0x65, 0x61, 0x36,
+
+				0x81, 0x20, 0x59, 0xa5, 0xda, 0x19, 0x86, 0x37,
+				0xca, 0xc7, 0xc4, 0xa6, 0x31, 0xbe, 0xe4, 0x66
+			}
+		},
+		{
+			32,
+			{
+				0x12, 0x97, 0x6a, 0x08, 0xc4, 0x42, 0x6d, 0x0c,
+				0xe8, 0xa8, 0x24, 0x07, 0xc4, 0xf4, 0x82, 0x07,
+				0x80, 0xf8, 0xc2, 0x0a, 0xa7, 0x12, 0x02, 0xd1,
+				0xe2, 0x91, 0x79, 0xcb, 0xcb, 0x55, 0x5a, 0x57
+			}
+		},
+		{
+			16,
+			{
+				0x06, 0x9e, 0xd6, 0xb8, 0xef, 0x0f, 0x20, 0x7b,
+				0x3e, 0x24, 0x3b, 0xb1, 0x01, 0x9f, 0xe6, 0x32
+			}
+		},
+	},
+	{
+		{
+			160,
+			{
+				0xab, 0x08, 0x12, 0x72, 0x4a, 0x7f, 0x1e, 0x34,
+				0x27, 0x42, 0xcb, 0xed, 0x37, 0x4d, 0x94, 0xd1,
+				0x36, 0xc6, 0xb8, 0x79, 0x5d, 0x45, 0xb3, 0x81,
+				0x98, 0x30, 0xf2, 0xc0, 0x44, 0x91, 0xfa, 0xf0,
+
+				0x99, 0x0c, 0x62, 0xe4, 0x8b, 0x80, 0x18, 0xb2,
+				0xc3, 0xe4, 0xa0, 0xfa, 0x31, 0x34, 0xcb, 0x67,
+				0xfa, 0x83, 0xe1, 0x58, 0xc9, 0x94, 0xd9, 0x61,
+				0xc4, 0xcb, 0x21, 0x09, 0x5c, 0x1b, 0xf9, 0xaf,
+
+				0x48, 0x44, 0x3d, 0x0b, 0xb0, 0xd2, 0x11, 0x09,
+				0xc8, 0x9a, 0x10, 0x0b, 0x5c, 0xe2, 0xc2, 0x08,
+				0x83, 0x14, 0x9c, 0x69, 0xb5, 0x61, 0xdd, 0x88,
+				0x29, 0x8a, 0x17, 0x98, 0xb1, 0x07, 0x16, 0xef,
+
+				0x66, 0x3c, 0xea, 0x19, 0x0f, 0xfb, 0x83, 0xd8,
+				0x95, 0x93, 0xf3, 0xf4, 0x76, 0xb6, 0xbc, 0x24,
+				0xd7, 0xe6, 0x79, 0x10, 0x7e, 0xa2, 0x6a, 0xdb,
+				0x8c, 0xaf, 0x66, 0x52, 0xd0, 0x65, 0x61, 0x36,
+
+				0x81, 0x20, 0x59, 0xa5, 0xda, 0x19, 0x86, 0x37,
+				0xca, 0xc7, 0xc4, 0xa6, 0x31, 0xbe, 0xe4, 0x66,
+				0x5b, 0x88, 0xd7, 0xf6, 0x22, 0x8b, 0x11, 0xe2,
+				0xe2, 0x85, 0x79, 0xa5, 0xc0, 0xc1, 0xf7, 0x61
+			}
+		},
+		{
+			32,
+			{
+				0x12, 0x97, 0x6a, 0x08, 0xc4, 0x42, 0x6d, 0x0c,
+				0xe8, 0xa8, 0x24, 0x07, 0xc4, 0xf4, 0x82, 0x07,
+				0x80, 0xf8, 0xc2, 0x0a, 0xa7, 0x12, 0x02, 0xd1,
+				0xe2, 0x91, 0x79, 0xcb, 0xcb, 0x55, 0x5a, 0x57
+			}
+		},
+		{
+			16,
+			{
+				0xcc, 0xa3, 0x39, 0xd9, 0xa4, 0x5f, 0xa2, 0x36,
+				0x8c, 0x2c, 0x68, 0xb3, 0xa4, 0x17, 0x91, 0x33
+			}
+		},
+	},
+	{
+		{
+			288,
+			{
+				0xab, 0x08, 0x12, 0x72, 0x4a, 0x7f, 0x1e, 0x34,
+				0x27, 0x42, 0xcb, 0xed, 0x37, 0x4d, 0x94, 0xd1,
+				0x36, 0xc6, 0xb8, 0x79, 0x5d, 0x45, 0xb3, 0x81,
+				0x98, 0x30, 0xf2, 0xc0, 0x44, 0x91, 0xfa, 0xf0,
+
+				0x99, 0x0c, 0x62, 0xe4, 0x8b, 0x80, 0x18, 0xb2,
+				0xc3, 0xe4, 0xa0, 0xfa, 0x31, 0x34, 0xcb, 0x67,
+				0xfa, 0x83, 0xe1, 0x58, 0xc9, 0x94, 0xd9, 0x61,
+				0xc4, 0xcb, 0x21, 0x09, 0x5c, 0x1b, 0xf9, 0xaf,
+
+				0x48, 0x44, 0x3d, 0x0b, 0xb0, 0xd2, 0x11, 0x09,
+				0xc8, 0x9a, 0x10, 0x0b, 0x5c, 0xe2, 0xc2, 0x08,
+				0x83, 0x14, 0x9c, 0x69, 0xb5, 0x61, 0xdd, 0x88,
+				0x29, 0x8a, 0x17, 0x98, 0xb1, 0x07, 0x16, 0xef,
+
+				0x66, 0x3c, 0xea, 0x19, 0x0f, 0xfb, 0x83, 0xd8,
+				0x95, 0x93, 0xf3, 0xf4, 0x76, 0xb6, 0xbc, 0x24,
+				0xd7, 0xe6, 0x79, 0x10, 0x7e, 0xa2, 0x6a, 0xdb,
+				0x8c, 0xaf, 0x66, 0x52, 0xd0, 0x65, 0x61, 0x36,
+
+				0x81, 0x20, 0x59, 0xa5, 0xda, 0x19, 0x86, 0x37,
+				0xca, 0xc7, 0xc4, 0xa6, 0x31, 0xbe, 0xe4, 0x66,
+				0x5b, 0x88, 0xd7, 0xf6, 0x22, 0x8b, 0x11, 0xe2,
+				0xe2, 0x85, 0x79, 0xa5, 0xc0, 0xc1, 0xf7, 0x61,
+
+				0xab, 0x08, 0x12, 0x72, 0x4a, 0x7f, 0x1e, 0x34,
+				0x27, 0x42, 0xcb, 0xed, 0x37, 0x4d, 0x94, 0xd1,
+				0x36, 0xc6, 0xb8, 0x79, 0x5d, 0x45, 0xb3, 0x81,
+				0x98, 0x30, 0xf2, 0xc0, 0x44, 0x91, 0xfa, 0xf0,
+
+				0x99, 0x0c, 0x62, 0xe4, 0x8b, 0x80, 0x18, 0xb2,
+				0xc3, 0xe4, 0xa0, 0xfa, 0x31, 0x34, 0xcb, 0x67,
+				0xfa, 0x83, 0xe1, 0x58, 0xc9, 0x94, 0xd9, 0x61,
+				0xc4, 0xcb, 0x21, 0x09, 0x5c, 0x1b, 0xf9, 0xaf,
+
+				0x48, 0x44, 0x3d, 0x0b, 0xb0, 0xd2, 0x11, 0x09,
+				0xc8, 0x9a, 0x10, 0x0b, 0x5c, 0xe2, 0xc2, 0x08,
+				0x83, 0x14, 0x9c, 0x69, 0xb5, 0x61, 0xdd, 0x88,
+				0x29, 0x8a, 0x17, 0x98, 0xb1, 0x07, 0x16, 0xef,
+
+				0x66, 0x3c, 0xea, 0x19, 0x0f, 0xfb, 0x83, 0xd8,
+				0x95, 0x93, 0xf3, 0xf4, 0x76, 0xb6, 0xbc, 0x24,
+				0xd7, 0xe6, 0x79, 0x10, 0x7e, 0xa2, 0x6a, 0xdb,
+				0x8c, 0xaf, 0x66, 0x52, 0xd0, 0x65, 0x61, 0x36
+			}
+		},
+		{
+			32,
+			{
+				0x12, 0x97, 0x6a, 0x08, 0xc4, 0x42, 0x6d, 0x0c,
+				0xe8, 0xa8, 0x24, 0x07, 0xc4, 0xf4, 0x82, 0x07,
+				0x80, 0xf8, 0xc2, 0x0a, 0xa7, 0x12, 0x02, 0xd1,
+				0xe2, 0x91, 0x79, 0xcb, 0xcb, 0x55, 0x5a, 0x57
+			}
+		},
+		{
+			16,
+			{
+				0x53, 0xf6, 0xe8, 0x28, 0xa2, 0xf0, 0xfe, 0x0e,
+				0xe8, 0x15, 0xbf, 0x0b, 0xd5, 0x84, 0x1a, 0x34
+			}
+		},
+	},
+	{
+		{
+			320,
+			{
+				0xab, 0x08, 0x12, 0x72, 0x4a, 0x7f, 0x1e, 0x34,
+				0x27, 0x42, 0xcb, 0xed, 0x37, 0x4d, 0x94, 0xd1,
+				0x36, 0xc6, 0xb8, 0x79, 0x5d, 0x45, 0xb3, 0x81,
+				0x98, 0x30, 0xf2, 0xc0, 0x44, 0x91, 0xfa, 0xf0,
+
+				0x99, 0x0c, 0x62, 0xe4, 0x8b, 0x80, 0x18, 0xb2,
+				0xc3, 0xe4, 0xa0, 0xfa, 0x31, 0x34, 0xcb, 0x67,
+				0xfa, 0x83, 0xe1, 0x58, 0xc9, 0x94, 0xd9, 0x61,
+				0xc4, 0xcb, 0x21, 0x09, 0x5c, 0x1b, 0xf9, 0xaf,
+
+				0x48, 0x44, 0x3d, 0x0b, 0xb0, 0xd2, 0x11, 0x09,
+				0xc8, 0x9a, 0x10, 0x0b, 0x5c, 0xe2, 0xc2, 0x08,
+				0x83, 0x14, 0x9c, 0x69, 0xb5, 0x61, 0xdd, 0x88,
+				0x29, 0x8a, 0x17, 0x98, 0xb1, 0x07, 0x16, 0xef,
+
+				0x66, 0x3c, 0xea, 0x19, 0x0f, 0xfb, 0x83, 0xd8,
+				0x95, 0x93, 0xf3, 0xf4, 0x76, 0xb6, 0xbc, 0x24,
+				0xd7, 0xe6, 0x79, 0x10, 0x7e, 0xa2, 0x6a, 0xdb,
+				0x8c, 0xaf, 0x66, 0x52, 0xd0, 0x65, 0x61, 0x36,
+
+				0x81, 0x20, 0x59, 0xa5, 0xda, 0x19, 0x86, 0x37,
+				0xca, 0xc7, 0xc4, 0xa6, 0x31, 0xbe, 0xe4, 0x66,
+				0x5b, 0x88, 0xd7, 0xf6, 0x22, 0x8b, 0x11, 0xe2,
+				0xe2, 0x85, 0x79, 0xa5, 0xc0, 0xc1, 0xf7, 0x61,
+
+				0xab, 0x08, 0x12, 0x72, 0x4a, 0x7f, 0x1e, 0x34,
+				0x27, 0x42, 0xcb, 0xed, 0x37, 0x4d, 0x94, 0xd1,
+				0x36, 0xc6, 0xb8, 0x79, 0x5d, 0x45, 0xb3, 0x81,
+				0x98, 0x30, 0xf2, 0xc0, 0x44, 0x91, 0xfa, 0xf0,
+
+				0x99, 0x0c, 0x62, 0xe4, 0x8b, 0x80, 0x18, 0xb2,
+				0xc3, 0xe4, 0xa0, 0xfa, 0x31, 0x34, 0xcb, 0x67,
+				0xfa, 0x83, 0xe1, 0x58, 0xc9, 0x94, 0xd9, 0x61,
+				0xc4, 0xcb, 0x21, 0x09, 0x5c, 0x1b, 0xf9, 0xaf,
+
+				0x48, 0x44, 0x3d, 0x0b, 0xb0, 0xd2, 0x11, 0x09,
+				0xc8, 0x9a, 0x10, 0x0b, 0x5c, 0xe2, 0xc2, 0x08,
+				0x83, 0x14, 0x9c, 0x69, 0xb5, 0x61, 0xdd, 0x88,
+				0x29, 0x8a, 0x17, 0x98, 0xb1, 0x07, 0x16, 0xef,
+
+				0x66, 0x3c, 0xea, 0x19, 0x0f, 0xfb, 0x83, 0xd8,
+				0x95, 0x93, 0xf3, 0xf4, 0x76, 0xb6, 0xbc, 0x24,
+				0xd7, 0xe6, 0x79, 0x10, 0x7e, 0xa2, 0x6a, 0xdb,
+				0x8c, 0xaf, 0x66, 0x52, 0xd0, 0x65, 0x61, 0x36,
+
+				0x81, 0x20, 0x59, 0xa5, 0xda, 0x19, 0x86, 0x37,
+				0xca, 0xc7, 0xc4, 0xa6, 0x31, 0xbe, 0xe4, 0x66,
+				0x5b, 0x88, 0xd7, 0xf6, 0x22, 0x8b, 0x11, 0xe2,
+				0xe2, 0x85, 0x79, 0xa5, 0xc0, 0xc1, 0xf7, 0x61
+			}
+		},
+		{
+			32,
+			{
+				0x12, 0x97, 0x6a, 0x08, 0xc4, 0x42, 0x6d, 0x0c,
+				0xe8, 0xa8, 0x24, 0x07, 0xc4, 0xf4, 0x82, 0x07,
+				0x80, 0xf8, 0xc2, 0x0a, 0xa7, 0x12, 0x02, 0xd1,
+				0xe2, 0x91, 0x79, 0xcb, 0xcb, 0x55, 0x5a, 0x57
+			}
+		},
+		{
+			16,
+			{
+				0xb8, 0x46, 0xd4, 0x4e, 0x9b, 0xbd, 0x53, 0xce,
+				0xdf, 0xfb, 0xfb, 0xb6, 0xb7, 0xfa, 0x49, 0x33
+			}
+		},
+	},
+	/*
+	 * 4th power of the key spills to 131th bit in SIMD key setup
+	 */
+	{
+		{
+			256,
+			{
+				0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+				0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+				0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+				0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+
+				0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+				0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+				0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+				0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+
+				0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+				0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+				0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+				0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+
+				0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+				0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+				0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+				0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+
+				0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+				0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+				0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+				0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+
+				0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+				0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+				0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+				0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+
+				0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+				0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+				0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+				0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+
+				0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+				0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+				0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+				0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+			}
+		},
+		{
+			32,
+			{
+				0xad, 0x62, 0x81, 0x07, 0xe8, 0x35, 0x1d, 0x0f,
+				0x2c, 0x23, 0x1a, 0x05, 0xdc, 0x4a, 0x41, 0x06,
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+			}
+		},
+		{
+			16,
+			{
+				0x07, 0x14, 0x5a, 0x4c, 0x02, 0xfe, 0x5f, 0xa3,
+				0x20, 0x36, 0xde, 0x68, 0xfa, 0xbe, 0x90, 0x66
+			}
+		},
+	},
+	/*
+	 * OpenSSL's poly1305_ieee754.c failed this in final stage
+	 */
+	{
+		{
+			252,
+			{
+				0x84, 0x23, 0x64, 0xe1, 0x56, 0x33, 0x6c, 0x09,
+				0x98, 0xb9, 0x33, 0xa6, 0x23, 0x77, 0x26, 0x18,
+				0x0d, 0x9e, 0x3f, 0xdc, 0xbd, 0xe4, 0xcd, 0x5d,
+				0x17, 0x08, 0x0f, 0xc3, 0xbe, 0xb4, 0x96, 0x14,
+
+				0xd7, 0x12, 0x2c, 0x03, 0x74, 0x63, 0xff, 0x10,
+				0x4d, 0x73, 0xf1, 0x9c, 0x12, 0x70, 0x46, 0x28,
+				0xd4, 0x17, 0xc4, 0xc5, 0x4a, 0x3f, 0xe3, 0x0d,
+				0x3c, 0x3d, 0x77, 0x14, 0x38, 0x2d, 0x43, 0xb0,
+
+				0x38, 0x2a, 0x50, 0xa5, 0xde, 0xe5, 0x4b, 0xe8,
+				0x44, 0xb0, 0x76, 0xe8, 0xdf, 0x88, 0x20, 0x1a,
+				0x1c, 0xd4, 0x3b, 0x90, 0xeb, 0x21, 0x64, 0x3f,
+				0xa9, 0x6f, 0x39, 0xb5, 0x18, 0xaa, 0x83, 0x40,
+
+				0xc9, 0x42, 0xff, 0x3c, 0x31, 0xba, 0xf7, 0xc9,
+				0xbd, 0xbf, 0x0f, 0x31, 0xae, 0x3f, 0xa0, 0x96,
+				0xbf, 0x8c, 0x63, 0x03, 0x06, 0x09, 0x82, 0x9f,
+				0xe7, 0x2e, 0x17, 0x98, 0x24, 0x89, 0x0b, 0xc8,
+
+				0xe0, 0x8c, 0x31, 0x5c, 0x1c, 0xce, 0x2a, 0x83,
+				0x14, 0x4d, 0xbb, 0xff, 0x09, 0xf7, 0x4e, 0x3e,
+				0xfc, 0x77, 0x0b, 0x54, 0xd0, 0x98, 0x4a, 0x8f,
+				0x19, 0xb1, 0x47, 0x19, 0xe6, 0x36, 0x35, 0x64,
+
+				0x1d, 0x6b, 0x1e, 0xed, 0xf6, 0x3e, 0xfb, 0xf0,
+				0x80, 0xe1, 0x78, 0x3d, 0x32, 0x44, 0x54, 0x12,
+				0x11, 0x4c, 0x20, 0xde, 0x0b, 0x83, 0x7a, 0x0d,
+				0xfa, 0x33, 0xd6, 0xb8, 0x28, 0x25, 0xff, 0xf4,
+
+				0x4c, 0x9a, 0x70, 0xea, 0x54, 0xce, 0x47, 0xf0,
+				0x7d, 0xf6, 0x98, 0xe6, 0xb0, 0x33, 0x23, 0xb5,
+				0x30, 0x79, 0x36, 0x4a, 0x5f, 0xc3, 0xe9, 0xdd,
+				0x03, 0x43, 0x92, 0xbd, 0xde, 0x86, 0xdc, 0xcd,
+
+				0xda, 0x94, 0x32, 0x1c, 0x5e, 0x44, 0x06, 0x04,
+				0x89, 0x33, 0x6c, 0xb6, 0x5b, 0xf3, 0x98, 0x9c,
+				0x36, 0xf7, 0x28, 0x2c, 0x2f, 0x5d, 0x2b, 0x88,
+				0x2c, 0x17, 0x1e, 0x74
+			}
+		},
+		{
+			32,
+			{
+				0x95, 0xd5, 0xc0, 0x05, 0x50, 0x3e, 0x51, 0x0d,
+				0x8c, 0xd0, 0xaa, 0x07, 0x2c, 0x4a, 0x4d, 0x06,
+				0x6e, 0xab, 0xc5, 0x2d, 0x11, 0x65, 0x3d, 0xf4,
+				0x7f, 0xbf, 0x63, 0xab, 0x19, 0x8b, 0xcc, 0x26
+			}
+		},
+		{
+			16,
+			{
+				0xf2, 0x48, 0x31, 0x2e, 0x57, 0x8d, 0x9d, 0x58,
+				0xf8, 0xb7, 0xbb, 0x4d, 0x19, 0x10, 0x54, 0x31
+			}
+		},
+	},
+	/*
+	 * AVX2 in OpenSSL's poly1305-x86.pl failed this with 176+32 split
+	 */
+	{
+		{
+			208,
+			{
+				0x24, 0x8a, 0xc3, 0x10, 0x85, 0xb6, 0xc2, 0xad,
+				0xaa, 0xa3, 0x82, 0x59, 0xa0, 0xd7, 0x19, 0x2c,
+				0x5c, 0x35, 0xd1, 0xbb, 0x4e, 0xf3, 0x9a, 0xd9,
+				0x4c, 0x38, 0xd1, 0xc8, 0x24, 0x79, 0xe2, 0xdd,
+
+				0x21, 0x59, 0xa0, 0x77, 0x02, 0x4b, 0x05, 0x89,
+				0xbc, 0x8a, 0x20, 0x10, 0x1b, 0x50, 0x6f, 0x0a,
+				0x1a, 0xd0, 0xbb, 0xab, 0x76, 0xe8, 0x3a, 0x83,
+				0xf1, 0xb9, 0x4b, 0xe6, 0xbe, 0xae, 0x74, 0xe8,
+
+				0x74, 0xca, 0xb6, 0x92, 0xc5, 0x96, 0x3a, 0x75,
+				0x43, 0x6b, 0x77, 0x61, 0x21, 0xec, 0x9f, 0x62,
+				0x39, 0x9a, 0x3e, 0x66, 0xb2, 0xd2, 0x27, 0x07,
+				0xda, 0xe8, 0x19, 0x33, 0xb6, 0x27, 0x7f, 0x3c,
+
+				0x85, 0x16, 0xbc, 0xbe, 0x26, 0xdb, 0xbd, 0x86,
+				0xf3, 0x73, 0x10, 0x3d, 0x7c, 0xf4, 0xca, 0xd1,
+				0x88, 0x8c, 0x95, 0x21, 0x18, 0xfb, 0xfb, 0xd0,
+				0xd7, 0xb4, 0xbe, 0xdc, 0x4a, 0xe4, 0x93, 0x6a,
+
+				0xff, 0x91, 0x15, 0x7e, 0x7a, 0xa4, 0x7c, 0x54,
+				0x44, 0x2e, 0xa7, 0x8d, 0x6a, 0xc2, 0x51, 0xd3,
+				0x24, 0xa0, 0xfb, 0xe4, 0x9d, 0x89, 0xcc, 0x35,
+				0x21, 0xb6, 0x6d, 0x16, 0xe9, 0xc6, 0x6a, 0x37,
+
+				0x09, 0x89, 0x4e, 0x4e, 0xb0, 0xa4, 0xee, 0xdc,
+				0x4a, 0xe1, 0x94, 0x68, 0xe6, 0x6b, 0x81, 0xf2,
+
+				0x71, 0x35, 0x1b, 0x1d, 0x92, 0x1e, 0xa5, 0x51,
+				0x04, 0x7a, 0xbc, 0xc6, 0xb8, 0x7a, 0x90, 0x1f,
+				0xde, 0x7d, 0xb7, 0x9f, 0xa1, 0x81, 0x8c, 0x11,
+				0x33, 0x6d, 0xbc, 0x07, 0x24, 0x4a, 0x40, 0xeb
+			}
+		},
+		{
+			32,
+			{
+				0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+				0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+			}
+		},
+		{
+			16,
+			{
+				0xbc, 0x93, 0x9b, 0xc5, 0x28, 0x14, 0x80, 0xfa,
+				0x99, 0xc6, 0xd6, 0x8c, 0x25, 0x8e, 0xc4, 0x2f
+			}
+		},
+	},
+	/*
+	 * test vectors from Google
+	 */
+	{
+		{
+			0,
+			{
+				0x00,
+			}
+		},
+		{
+			32,
+			{
+				0xc8, 0xaf, 0xaa, 0xc3, 0x31, 0xee, 0x37, 0x2c,
+				0xd6, 0x08, 0x2d, 0xe1, 0x34, 0x94, 0x3b, 0x17,
+				0x47, 0x10, 0x13, 0x0e, 0x9f, 0x6f, 0xea, 0x8d,
+				0x72, 0x29, 0x38, 0x50, 0xa6, 0x67, 0xd8, 0x6c
+			}
+		},
+		{
+			16,
+			{
+				0x47, 0x10, 0x13, 0x0e, 0x9f, 0x6f, 0xea, 0x8d,
+				0x72, 0x29, 0x38, 0x50, 0xa6, 0x67, 0xd8, 0x6c
+			}
+		},
+	},
+	{
+		{
+			12,
+			{
+				0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f,
+				0x72, 0x6c, 0x64, 0x21
+			}
+		},
+		{
+			32,
+			{
+				0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20,
+				0x33, 0x32, 0x2d, 0x62, 0x79, 0x74, 0x65, 0x20,
+				0x6b, 0x65, 0x79, 0x20, 0x66, 0x6f, 0x72, 0x20,
+				0x50, 0x6f, 0x6c, 0x79, 0x31, 0x33, 0x30, 0x35
+			}
+		},
+		{
+			16,
+			{
+				0xa6, 0xf7, 0x45, 0x00, 0x8f, 0x81, 0xc9, 0x16,
+				0xa2, 0x0d, 0xcc, 0x74, 0xee, 0xf2, 0xb2, 0xf0
+			}
+		},
+	},
+	{
+		{
+			32,
+			{
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+			}
+		},
+		{
+			32,
+			{
+				0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20,
+				0x33, 0x32, 0x2d, 0x62, 0x79, 0x74, 0x65, 0x20,
+				0x6b, 0x65, 0x79, 0x20, 0x66, 0x6f, 0x72, 0x20,
+				0x50, 0x6f, 0x6c, 0x79, 0x31, 0x33, 0x30, 0x35
+			}
+		},
+		{
+			16,
+			{
+				0x49, 0xec, 0x78, 0x09, 0x0e, 0x48, 0x1e, 0xc6,
+				0xc2, 0x6b, 0x33, 0xb9, 0x1c, 0xcc, 0x03, 0x07
+			}
+		},
+	},
+	{
+		{
+			128,
+			{
+				0x89, 0xda, 0xb8, 0x0b, 0x77, 0x17, 0xc1, 0xdb,
+				0x5d, 0xb4, 0x37, 0x86, 0x0a, 0x3f, 0x70, 0x21,
+				0x8e, 0x93, 0xe1, 0xb8, 0xf4, 0x61, 0xfb, 0x67,
+				0x7f, 0x16, 0xf3, 0x5f, 0x6f, 0x87, 0xe2, 0xa9,
+
+				0x1c, 0x99, 0xbc, 0x3a, 0x47, 0xac, 0xe4, 0x76,
+				0x40, 0xcc, 0x95, 0xc3, 0x45, 0xbe, 0x5e, 0xcc,
+				0xa5, 0xa3, 0x52, 0x3c, 0x35, 0xcc, 0x01, 0x89,
+				0x3a, 0xf0, 0xb6, 0x4a, 0x62, 0x03, 0x34, 0x27,
+
+				0x03, 0x72, 0xec, 0x12, 0x48, 0x2d, 0x1b, 0x1e,
+				0x36, 0x35, 0x61, 0x69, 0x8a, 0x57, 0x8b, 0x35,
+				0x98, 0x03, 0x49, 0x5b, 0xb4, 0xe2, 0xef, 0x19,
+				0x30, 0xb1, 0x7a, 0x51, 0x90, 0xb5, 0x80, 0xf1,
+
+				0x41, 0x30, 0x0d, 0xf3, 0x0a, 0xdb, 0xec, 0xa2,
+				0x8f, 0x64, 0x27, 0xa8, 0xbc, 0x1a, 0x99, 0x9f,
+				0xd5, 0x1c, 0x55, 0x4a, 0x01, 0x7d, 0x09, 0x5d,
+				0x8c, 0x3e, 0x31, 0x27, 0xda, 0xf9, 0xf5, 0x95
+			}
+		},
+		{
+			32,
+			{
+				0x2d, 0x77, 0x3b, 0xe3, 0x7a, 0xdb, 0x1e, 0x4d,
+				0x68, 0x3b, 0xf0, 0x07, 0x5e, 0x79, 0xc4, 0xee,
+				0x03, 0x79, 0x18, 0x53, 0x5a, 0x7f, 0x99, 0xcc,
+				0xb7, 0x04, 0x0f, 0xb5, 0xf5, 0xf4, 0x3a, 0xea
+			}
+		},
+		{
+			16,
+			{
+				0xc8, 0x5d, 0x15, 0xed, 0x44, 0xc3, 0x78, 0xd6,
+				0xb0, 0x0e, 0x23, 0x06, 0x4c, 0x7b, 0xcd, 0x51
+			}
+		},
+	},
+	{
+		{
+			528,
+			{
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0b,
+				0x17, 0x03, 0x03, 0x02, 0x00, 0x00, 0x00, 0x00,
+
+				0x06, 0xdb, 0x1f, 0x1f, 0x36, 0x8d, 0x69, 0x6a,
+				0x81, 0x0a, 0x34, 0x9c, 0x0c, 0x71, 0x4c, 0x9a,
+				0x5e, 0x78, 0x50, 0xc2, 0x40, 0x7d, 0x72, 0x1a,
+				0xcd, 0xed, 0x95, 0xe0, 0x18, 0xd7, 0xa8, 0x52,
+
+				0x66, 0xa6, 0xe1, 0x28, 0x9c, 0xdb, 0x4a, 0xeb,
+				0x18, 0xda, 0x5a, 0xc8, 0xa2, 0xb0, 0x02, 0x6d,
+				0x24, 0xa5, 0x9a, 0xd4, 0x85, 0x22, 0x7f, 0x3e,
+				0xae, 0xdb, 0xb2, 0xe7, 0xe3, 0x5e, 0x1c, 0x66,
+
+				0xcd, 0x60, 0xf9, 0xab, 0xf7, 0x16, 0xdc, 0xc9,
+				0xac, 0x42, 0x68, 0x2d, 0xd7, 0xda, 0xb2, 0x87,
+				0xa7, 0x02, 0x4c, 0x4e, 0xef, 0xc3, 0x21, 0xcc,
+				0x05, 0x74, 0xe1, 0x67, 0x93, 0xe3, 0x7c, 0xec,
+
+				0x03, 0xc5, 0xbd, 0xa4, 0x2b, 0x54, 0xc1, 0x14,
+				0xa8, 0x0b, 0x57, 0xaf, 0x26, 0x41, 0x6c, 0x7b,
+				0xe7, 0x42, 0x00, 0x5e, 0x20, 0x85, 0x5c, 0x73,
+				0xe2, 0x1d, 0xc8, 0xe2, 0xed, 0xc9, 0xd4, 0x35,
+
+				0xcb, 0x6f, 0x60, 0x59, 0x28, 0x00, 0x11, 0xc2,
+				0x70, 0xb7, 0x15, 0x70, 0x05, 0x1c, 0x1c, 0x9b,
+				0x30, 0x52, 0x12, 0x66, 0x20, 0xbc, 0x1e, 0x27,
+				0x30, 0xfa, 0x06, 0x6c, 0x7a, 0x50, 0x9d, 0x53,
+
+				0xc6, 0x0e, 0x5a, 0xe1, 0xb4, 0x0a, 0xa6, 0xe3,
+				0x9e, 0x49, 0x66, 0x92, 0x28, 0xc9, 0x0e, 0xec,
+				0xb4, 0xa5, 0x0d, 0xb3, 0x2a, 0x50, 0xbc, 0x49,
+				0xe9, 0x0b, 0x4f, 0x4b, 0x35, 0x9a, 0x1d, 0xfd,
+
+				0x11, 0x74, 0x9c, 0xd3, 0x86, 0x7f, 0xcf, 0x2f,
+				0xb7, 0xbb, 0x6c, 0xd4, 0x73, 0x8f, 0x6a, 0x4a,
+				0xd6, 0xf7, 0xca, 0x50, 0x58, 0xf7, 0x61, 0x88,
+				0x45, 0xaf, 0x9f, 0x02, 0x0f, 0x6c, 0x3b, 0x96,
+
+				0x7b, 0x8f, 0x4c, 0xd4, 0xa9, 0x1e, 0x28, 0x13,
+				0xb5, 0x07, 0xae, 0x66, 0xf2, 0xd3, 0x5c, 0x18,
+				0x28, 0x4f, 0x72, 0x92, 0x18, 0x60, 0x62, 0xe1,
+				0x0f, 0xd5, 0x51, 0x0d, 0x18, 0x77, 0x53, 0x51,
+
+				0xef, 0x33, 0x4e, 0x76, 0x34, 0xab, 0x47, 0x43,
+				0xf5, 0xb6, 0x8f, 0x49, 0xad, 0xca, 0xb3, 0x84,
+				0xd3, 0xfd, 0x75, 0xf7, 0x39, 0x0f, 0x40, 0x06,
+				0xef, 0x2a, 0x29, 0x5c, 0x8c, 0x7a, 0x07, 0x6a,
+
+				0xd5, 0x45, 0x46, 0xcd, 0x25, 0xd2, 0x10, 0x7f,
+				0xbe, 0x14, 0x36, 0xc8, 0x40, 0x92, 0x4a, 0xae,
+				0xbe, 0x5b, 0x37, 0x08, 0x93, 0xcd, 0x63, 0xd1,
+				0x32, 0x5b, 0x86, 0x16, 0xfc, 0x48, 0x10, 0x88,
+
+				0x6b, 0xc1, 0x52, 0xc5, 0x32, 0x21, 0xb6, 0xdf,
+				0x37, 0x31, 0x19, 0x39, 0x32, 0x55, 0xee, 0x72,
+				0xbc, 0xaa, 0x88, 0x01, 0x74, 0xf1, 0x71, 0x7f,
+				0x91, 0x84, 0xfa, 0x91, 0x64, 0x6f, 0x17, 0xa2,
+
+				0x4a, 0xc5, 0x5d, 0x16, 0xbf, 0xdd, 0xca, 0x95,
+				0x81, 0xa9, 0x2e, 0xda, 0x47, 0x92, 0x01, 0xf0,
+				0xed, 0xbf, 0x63, 0x36, 0x00, 0xd6, 0x06, 0x6d,
+				0x1a, 0xb3, 0x6d, 0x5d, 0x24, 0x15, 0xd7, 0x13,
+
+				0x51, 0xbb, 0xcd, 0x60, 0x8a, 0x25, 0x10, 0x8d,
+				0x25, 0x64, 0x19, 0x92, 0xc1, 0xf2, 0x6c, 0x53,
+				0x1c, 0xf9, 0xf9, 0x02, 0x03, 0xbc, 0x4c, 0xc1,
+				0x9f, 0x59, 0x27, 0xd8, 0x34, 0xb0, 0xa4, 0x71,
+
+				0x16, 0xd3, 0x88, 0x4b, 0xbb, 0x16, 0x4b, 0x8e,
+				0xc8, 0x83, 0xd1, 0xac, 0x83, 0x2e, 0x56, 0xb3,
+				0x91, 0x8a, 0x98, 0x60, 0x1a, 0x08, 0xd1, 0x71,
+				0x88, 0x15, 0x41, 0xd5, 0x94, 0xdb, 0x39, 0x9c,
+
+				0x6a, 0xe6, 0x15, 0x12, 0x21, 0x74, 0x5a, 0xec,
+				0x81, 0x4c, 0x45, 0xb0, 0xb0, 0x5b, 0x56, 0x54,
+				0x36, 0xfd, 0x6f, 0x13, 0x7a, 0xa1, 0x0a, 0x0c,
+				0x0b, 0x64, 0x37, 0x61, 0xdb, 0xd6, 0xf9, 0xa9,
+
+				0xdc, 0xb9, 0x9b, 0x1a, 0x6e, 0x69, 0x08, 0x54,
+				0xce, 0x07, 0x69, 0xcd, 0xe3, 0x97, 0x61, 0xd8,
+				0x2f, 0xcd, 0xec, 0x15, 0xf0, 0xd9, 0x2d, 0x7d,
+				0x8e, 0x94, 0xad, 0xe8, 0xeb, 0x83, 0xfb, 0xe0
+			}
+		},
+		{
+			32,
+			{
+				0x99, 0xe5, 0x82, 0x2d, 0xd4, 0x17, 0x3c, 0x99,
+				0x5e, 0x3d, 0xae, 0x0d, 0xde, 0xfb, 0x97, 0x74,
+				0x3f, 0xde, 0x3b, 0x08, 0x01, 0x34, 0xb3, 0x9f,
+				0x76, 0xe9, 0xbf, 0x8d, 0x0e, 0x88, 0xd5, 0x46
+			}
+		},
+		{
+			16,
+			{
+				0x26, 0x37, 0x40, 0x8f, 0xe1, 0x30, 0x86, 0xea,
+				0x73, 0xf9, 0x71, 0xe3, 0x42, 0x5e, 0x28, 0x20
+			}
+		},
+	},
+	/*
+	 * test vectors from Hanno Böck
+	 */
+	{
+		{
+			257,
+			{
+				0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc,
+				0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc,
+				0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc,
+				0xcc, 0x80, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc,
+
+				0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc,
+				0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc,
+				0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc,
+				0xcc, 0xcc, 0xcc, 0xcc, 0xce, 0xcc, 0xcc, 0xcc,
+
+				0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc,
+				0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xc5,
+				0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc,
+				0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc,
+
+				0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xe3, 0xcc, 0xcc,
+				0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc,
+				0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc,
+				0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc,
+
+				0xcc, 0xcc, 0xcc, 0xcc, 0xac, 0xcc, 0xcc, 0xcc,
+				0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xe6,
+				0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0x00, 0x00, 0x00,
+				0xaf, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc,
+
+				0xcc, 0xcc, 0xff, 0xff, 0xff, 0xf5, 0x00, 0x00,
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+
+				0x00, 0xff, 0xff, 0xff, 0xe7, 0x00, 0x00, 0x00,
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0x00, 0x00, 0x71, 0x92, 0x05, 0xa8, 0x52, 0x1d,
+
+				0xfc
+			}
+		},
+		{
+			32,
+			{
+				0x7f, 0x1b, 0x02, 0x64, 0x00, 0x00, 0x00, 0x00,
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc
+			}
+		},
+		{
+			16,
+			{
+				0x85, 0x59, 0xb8, 0x76, 0xec, 0xee, 0xd6, 0x6e,
+				0xb3, 0x77, 0x98, 0xc0, 0x45, 0x7b, 0xaf, 0xf9
+			}
+		},
+	},
+	{
+		{
+			39,
+			{
+				0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa,
+				0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa,
+				0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa,
+				0xaa, 0xaa, 0xaa, 0x00, 0x00, 0x00, 0x00, 0x00,
+
+				0x00, 0x00, 0x00, 0x00, 0x80, 0x02, 0x64
+			}
+		},
+		{
+			32,
+			{
+				0xe0, 0x00, 0x16, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0x00, 0x00, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa,
+				0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa
+			}
+		},
+		{
+			16,
+			{
+				0x00, 0xbd, 0x12, 0x58, 0x97, 0x8e, 0x20, 0x54,
+				0x44, 0xc9, 0xaa, 0xaa, 0x82, 0x00, 0x6f, 0xed
+			}
+		},
+	},
+	{
+		{
+			2,
+			{
+				0x02, 0xfc
+			}
+		},
+		{
+			32,
+			{
+				0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c,
+				0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c,
+				0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c,
+				0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c
+			}
+		},
+		{
+			16,
+			{
+				0x06, 0x12, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c,
+				0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c
+			}
+		},
+	},
+	{
+		{
+			415,
+			{
+				0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+				0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+				0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+				0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+
+				0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7a, 0x7b,
+				0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+				0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+				0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+
+				0x7b, 0x7b, 0x5c, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+				0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+				0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+				0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+
+				0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+				0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+				0x7b, 0x7b, 0x7b, 0x7b, 0x6e, 0x7b, 0x00, 0x7b,
+				0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+
+				0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+				0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+				0x7b, 0x7b, 0x7b, 0x7a, 0x7b, 0x7b, 0x7b, 0x7b,
+				0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+
+				0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+				0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x5c,
+				0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+				0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+
+				0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+				0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+				0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+				0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+
+				0x7b, 0x6e, 0x7b, 0x00, 0x13, 0x00, 0x00, 0x00,
+				0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+
+				0xf2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0x00, 0x00, 0x00, 0x20, 0x00, 0xef, 0xff, 0x00,
+				0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00,
+				0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x64, 0x00,
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x00,
+
+				0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf2,
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0x00, 0x00, 0x20, 0x00, 0xef, 0xff, 0x00, 0x09,
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0x00, 0x7a, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00,
+
+				0x00, 0x09, 0x00, 0x00, 0x00, 0x64, 0x00, 0x00,
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfc
+			}
+		},
+		{
+			32,
+			{
+				0x00, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00,
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7b, 0x7b
+			}
+		},
+		{
+			16,
+			{
+				0x33, 0x20, 0x5b, 0xbf, 0x9e, 0x9f, 0x8f, 0x72,
+				0x12, 0xab, 0x9e, 0x2a, 0xb9, 0xb7, 0xe4, 0xa5
+			}
+		},
+	},
+	{
+		{
+			118,
+			{
+				0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77,
+				0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77,
+				0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77,
+				0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77,
+
+				0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77,
+				0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77,
+				0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77,
+				0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77,
+
+				0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77,
+				0x77, 0x77, 0x77, 0x77, 0xff, 0xff, 0xff, 0xe9,
+				0xe9, 0xac, 0xac, 0xac, 0xac, 0xac, 0xac, 0xac,
+				0xac, 0xac, 0xac, 0xac, 0x00, 0x00, 0xac, 0xac,
+
+				0xec, 0x01, 0x00, 0xac, 0xac, 0xac, 0x2c, 0xac,
+				0xa2, 0xac, 0xac, 0xac, 0xac, 0xac, 0xac, 0xac,
+				0xac, 0xac, 0xac, 0xac, 0x64, 0xf2
+			}
+		},
+		{
+			32,
+			{
+				0x00, 0x00, 0x00, 0x7f, 0x00, 0x00, 0x00, 0x7f,
+				0x01, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x00,
+				0x00, 0x00, 0xcf, 0x77, 0x77, 0x77, 0x77, 0x77,
+				0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77
+			}
+		},
+		{
+			16,
+			{
+				0x02, 0xee, 0x7c, 0x8c, 0x54, 0x6d, 0xde, 0xb1,
+				0xa4, 0x67, 0xe4, 0xc3, 0x98, 0x11, 0x58, 0xb9
+			}
+		},
+	},
+	/*
+	 * test vectors from Andrew Moon
+	 */
+	{ /* nacl */
+		{
+			131,
+			{
+				0x8e, 0x99, 0x3b, 0x9f, 0x48, 0x68, 0x12, 0x73,
+				0xc2, 0x96, 0x50, 0xba, 0x32, 0xfc, 0x76, 0xce,
+				0x48, 0x33, 0x2e, 0xa7, 0x16, 0x4d, 0x96, 0xa4,
+				0x47, 0x6f, 0xb8, 0xc5, 0x31, 0xa1, 0x18, 0x6a,
+
+				0xc0, 0xdf, 0xc1, 0x7c, 0x98, 0xdc, 0xe8, 0x7b,
+				0x4d, 0xa7, 0xf0, 0x11, 0xec, 0x48, 0xc9, 0x72,
+				0x71, 0xd2, 0xc2, 0x0f, 0x9b, 0x92, 0x8f, 0xe2,
+				0x27, 0x0d, 0x6f, 0xb8, 0x63, 0xd5, 0x17, 0x38,
+
+				0xb4, 0x8e, 0xee, 0xe3, 0x14, 0xa7, 0xcc, 0x8a,
+				0xb9, 0x32, 0x16, 0x45, 0x48, 0xe5, 0x26, 0xae,
+				0x90, 0x22, 0x43, 0x68, 0x51, 0x7a, 0xcf, 0xea,
+				0xbd, 0x6b, 0xb3, 0x73, 0x2b, 0xc0, 0xe9, 0xda,
+
+				0x99, 0x83, 0x2b, 0x61, 0xca, 0x01, 0xb6, 0xde,
+				0x56, 0x24, 0x4a, 0x9e, 0x88, 0xd5, 0xf9, 0xb3,
+				0x79, 0x73, 0xf6, 0x22, 0xa4, 0x3d, 0x14, 0xa6,
+				0x59, 0x9b, 0x1f, 0x65, 0x4c, 0xb4, 0x5a, 0x74,
+
+				0xe3, 0x55, 0xa5
+			}
+		},
+		{
+			32,
+			{
+				0xee, 0xa6, 0xa7, 0x25, 0x1c, 0x1e, 0x72, 0x91,
+				0x6d, 0x11, 0xc2, 0xcb, 0x21, 0x4d, 0x3c, 0x25,
+				0x25, 0x39, 0x12, 0x1d, 0x8e, 0x23, 0x4e, 0x65,
+				0x2d, 0x65, 0x1f, 0xa4, 0xc8, 0xcf, 0xf8, 0x80
+			}
+		},
+		{
+			16,
+			{
+				0xf3, 0xff, 0xc7, 0x70, 0x3f, 0x94, 0x00, 0xe5,
+				0x2a, 0x7d, 0xfb, 0x4b, 0x3d, 0x33, 0x05, 0xd9
+			}
+		},
+	},
+	{ /* wrap 2^130-5 */
+		{
+			16,
+			{
+				0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+				0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+			}
+		},
+		{
+			32,
+			{
+				0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+			}
+		},
+		{
+			16,
+			{
+				0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+			}
+		},
+	},
+	{ /* wrap 2^128 */
+		{
+			16,
+			{
+				0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+			}
+		},
+		{
+			32,
+			{
+				0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+				0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+			}
+		},
+		{
+			16,
+			{
+				0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+			}
+		},
+	},
+	{ /* limb carry */
+		{
+			48,
+			{
+				0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+				0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+				0xf0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+				0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+
+				0x11, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+			}
+		},
+		{
+			32,
+			{
+				0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+			}
+		},
+		{
+			16,
+			{
+				0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+			}
+		},
+	},
+	{ /* 2^130-5 */
+		{
+			48,
+			{
+				0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+				0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+				0xfb, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe,
+				0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe,
+
+				0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+				0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01
+			}
+		},
+		{
+			32,
+			{
+				0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+			}
+		},
+		{
+			16,
+			{
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+
+			}
+		},
+	},
+	{ /* 2^130-6 */
+		{
+			16,
+			{
+				0xfd, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+				0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+			}
+		},
+		{
+			32,
+			{
+				0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+			}
+		},
+		{
+			16,
+			{
+				0xfa, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+				0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+			}
+		},
+	},
+	{ /* 5*H+L reduction intermediate */
+		{
+			64,
+			{
+				0xe3, 0x35, 0x94, 0xd7, 0x50, 0x5e, 0x43, 0xb9,
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0x33, 0x94, 0xd7, 0x50, 0x5e, 0x43, 0x79, 0xcd,
+				0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+			}
+		},
+		{
+			32,
+			{
+				0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+			}
+		},
+		{
+			16,
+			{
+				0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0x55, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+			}
+		},
+	},
+	{ /* 5*H+L reduction final */
+		{
+			48,
+			{
+				0xe3, 0x35, 0x94, 0xd7, 0x50, 0x5e, 0x43, 0xb9,
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0x33, 0x94, 0xd7, 0x50, 0x5e, 0x43, 0x79, 0xcd,
+				0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+
+			}
+		},
+		{
+			32,
+			{
+				0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+			}
+		},
+		{
+			16,
+			{
+				0x13, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+			}
+		}
+	}
+};
+
+bool __init poly1305_selftest(void)
+{
+	simd_context_t simd_context = simd_get();
+	bool success = true;
+	size_t i;
+
+	for (i = 0; i < ARRAY_SIZE(poly1305_testvecs); ++i) {
+		struct poly1305_ctx poly1305;
+		const u8 *in = poly1305_testvecs[i].input.data;
+		size_t inlen = poly1305_testvecs[i].input.size;
+		const u8 *key = poly1305_testvecs[i].key.data;
+		const u8 *expected = poly1305_testvecs[i].expected.data;
+		size_t expectedlen = poly1305_testvecs[i].expected.size;
+		u8 out[POLY1305_MAC_SIZE];
+
+		if (expectedlen != sizeof(out)) {
+			pr_info("poly1305 self-test %zu logic: FAIL\n", i + 1);
+			success = false;
+		}
+
+		memset(out, 0, sizeof(out));
+		memset(&poly1305, 0, sizeof(poly1305));
+		poly1305_init(&poly1305, key, simd_context);
+		poly1305_update(&poly1305, in, inlen, simd_context);
+		poly1305_finish(&poly1305, out, simd_context);
+		if (memcmp(out, expected, expectedlen)) {
+			pr_info("poly1305 self-test %zu: FAIL\n", i + 1);
+			success = false;
+		}
+
+		if (inlen > 16) {
+			memset(out, 0, sizeof(out));
+			memset(&poly1305, 0, sizeof(poly1305));
+			poly1305_init(&poly1305, key, simd_context);
+			poly1305_update(&poly1305, in, 1, simd_context);
+			poly1305_update(&poly1305, in + 1, inlen - 1, simd_context);
+			poly1305_finish(&poly1305, out, simd_context);
+			if (memcmp(out, expected, expectedlen)) {
+				pr_info("poly1305 self-test %zu/1+(N-1): FAIL\n", i + 1);
+				success = false;
+			}
+		}
+
+		if (inlen > 32) {
+			size_t half = inlen / 2;
+
+			memset(out, 0, sizeof(out));
+			memset(&poly1305, 0, sizeof(poly1305));
+			poly1305_init(&poly1305, key, simd_context);
+			poly1305_update(&poly1305, in, half, simd_context);
+			poly1305_update(&poly1305, in + half, inlen - half, simd_context);
+			poly1305_finish(&poly1305, out, simd_context);
+			if (memcmp(out, expected, expectedlen)) {
+				pr_info("poly1305 self-test %zu/2: FAIL\n", i + 1);
+				success = false;
+			}
+
+			for (half = 16; half < inlen; half += 16) {
+				memset(out, 0, sizeof(out));
+				memset(&poly1305, 0, sizeof(poly1305));
+				poly1305_init(&poly1305, key, simd_context);
+				poly1305_update(&poly1305, in, half, simd_context);
+				poly1305_update(&poly1305, in + half, inlen - half, simd_context);
+				poly1305_finish(&poly1305, out, simd_context);
+				if (memcmp(out, expected, expectedlen)) {
+					pr_info("poly1305 self-test %zu/%zu+%zu: FAIL\n", i + 1, half, inlen - half);
+					success = false;
+				}
+			}
+		}
+	}
+	simd_put(simd_context);
+
+	if (success)
+		pr_info("poly1305 self-tests: pass\n");
+
+	return success;
+}
+#endif
-- 
2.18.0

^ permalink raw reply related

* [PATCH v2 06/17] zinc: ChaCha20 MIPS32r2 implementation
From: Jason A. Donenfeld @ 2018-08-24 21:38 UTC (permalink / raw)
  To: linux-kernel, netdev, davem
  Cc: Jason A. Donenfeld, Andy Lutomirski, Greg KH, Samuel Neves,
	Jean-Philippe Aumasson, René van Dorst, Ralf Baechle,
	Paul Burton, James Hogan, linux-mips, linux-crypto
In-Reply-To: <20180824213849.23647-1-Jason@zx2c4.com>

This MIPS32r2 implementation comes from René van Dorst and me and
results in a nice speedup on the usual OpenWRT targets.

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Samuel Neves <sneves@dei.uc.pt>
Cc: Jean-Philippe Aumasson <jeanphilippe.aumasson@gmail.com>
Cc: René van Dorst <opensource@vdorst.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Paul Burton <paul.burton@mips.com>
Cc: James Hogan <jhogan@kernel.org>
Cc: linux-mips@linux-mips.org
Cc: linux-crypto@vger.kernel.org
---
 lib/zinc/Makefile                      |   4 +
 lib/zinc/chacha20/chacha20-mips-glue.h |  19 +
 lib/zinc/chacha20/chacha20-mips.S      | 474 +++++++++++++++++++++++++
 3 files changed, 497 insertions(+)
 create mode 100644 lib/zinc/chacha20/chacha20-mips-glue.h
 create mode 100644 lib/zinc/chacha20/chacha20-mips.S

diff --git a/lib/zinc/Makefile b/lib/zinc/Makefile
index fc6ca1c84cd1..3ba830b51695 100644
--- a/lib/zinc/Makefile
+++ b/lib/zinc/Makefile
@@ -16,6 +16,10 @@ ifeq ($(CONFIG_ARM64),y)
 zinc-y += chacha20/chacha20-arm64.o
 CFLAGS_chacha20.o += -include $(srctree)/$(src)/chacha20/chacha20-arm-glue.h
 endif
+ifeq ($(CONFIG_MIPS)$(CONFIG_CPU_MIPS32_R2),yy)
+zinc-y += chacha20/chacha20-mips.o
+CFLAGS_chacha20.o += -include $(srctree)/$(src)/chacha20/chacha20-mips-glue.h
+endif
 endif
 
 zinc-y += main.o
diff --git a/lib/zinc/chacha20/chacha20-mips-glue.h b/lib/zinc/chacha20/chacha20-mips-glue.h
new file mode 100644
index 000000000000..54c71a0b5bd0
--- /dev/null
+++ b/lib/zinc/chacha20/chacha20-mips-glue.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <zinc/chacha20.h>
+
+asmlinkage void chacha20_mips(u8 *out, const u8 *in, const size_t len, const u32 key[8], const u32 counter[4]);
+void __init chacha20_fpu_init(void) { }
+
+static inline bool chacha20_arch(u8 *dst, const u8 *src, const size_t len, const u32 key[8], const u32 counter[4], simd_context_t simd_context)
+{
+	chacha20_mips(dst, src, len, key, counter);
+	return true;
+}
+
+static inline bool hchacha20_arch(u8 *derived_key, const u8 *nonce, const u8 *key, simd_context_t simd_context) { return false; }
+
+#define HAVE_CHACHA20_ARCH_IMPLEMENTATION
diff --git a/lib/zinc/chacha20/chacha20-mips.S b/lib/zinc/chacha20/chacha20-mips.S
new file mode 100644
index 000000000000..77da2c2fb240
--- /dev/null
+++ b/lib/zinc/chacha20/chacha20-mips.S
@@ -0,0 +1,474 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2016-2018 René van Dorst <opensource@vdorst.com>. All Rights Reserved.
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#define MASK_U32	0x3c
+#define MASK_BYTES	0x03
+#define CHACHA20_BLOCK_SIZE 64
+#define STACK_SIZE	4*16
+
+#define X0  $t0
+#define X1  $t1
+#define X2  $t2
+#define X3  $t3
+#define X4  $t4
+#define X5  $t5
+#define X6  $t6
+#define X7  $t7
+#define X8  $v1
+#define X9  $fp
+#define X10 $s7
+#define X11 $s6
+#define X12 $s5
+#define X13 $s4
+#define X14 $s3
+#define X15 $s2
+/* Use regs which are overwritten on exit for Tx so we don't leak clear data. */
+#define T0  $s1
+#define T1  $s0
+#define T(n) T ## n
+#define X(n) X ## n
+
+/* Input arguments */
+#define OUT		$a0
+#define IN		$a1
+#define BYTES		$a2
+/* KEY and NONCE argument must be u32 aligned */
+#define KEY		$a3
+/* NONCE pointer is given via stack */
+#define NONCE		$t9
+
+/* Output argument */
+/* NONCE[0] is kept in a register and not in memory.
+ * We don't want to touch original value in memory.
+ * Must be incremented every loop iteration.
+ */
+#define NONCE_0		$v0
+
+/* SAVED_X and SAVED_CA are set in the jump table.
+ * Use regs which are overwritten on exit else we don't leak clear data.
+ * They are used to handling the last bytes which are not multiple of 4.
+ */
+#define SAVED_X		X15
+#define SAVED_CA	$ra
+
+#define PTR_LAST_ROUND	$t8
+
+/* ChaCha20 constants and stack location */
+#define CONSTANT_OFS_SP	48
+#define UNALIGNED_OFS_SP 40
+
+#define CONSTANT_1	0x61707865
+#define CONSTANT_2	0x3320646e
+#define CONSTANT_3	0x79622d32
+#define CONSTANT_4	0x6b206574
+
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+#define MSB 0
+#define LSB 3
+#define ROTx rotl
+#define ROTR(n) rotr n, 24
+#define	CPU_TO_LE32(n) \
+	wsbh	n; \
+	rotr	n, 16;
+#else
+#define MSB 3
+#define LSB 0
+#define ROTx rotr
+#define CPU_TO_LE32(n)
+#define ROTR(n)
+#endif
+
+#define STORE_UNALIGNED(x, a, s, o) \
+.Lchacha20_mips_xor_unaligned_ ## x ## _b: ; \
+	.if ((s != NONCE) || (o != 0)); \
+		lw	T0, o(s); \
+	.endif; \
+	lwl	T1, x-4+MSB ## (IN); \
+	lwr	T1, x-4+LSB ## (IN); \
+	.if ((s == NONCE) && (o == 0)); \
+		addu	X ## a, NONCE_0; \
+	.else; \
+		addu	X ## a, T0; \
+	.endif; \
+	CPU_TO_LE32(X ## a); \
+	xor	X ## a, T1; \
+	swl	X ## a, x-4+MSB ## (OUT); \
+	swr	X ## a, x-4+LSB ## (OUT);
+
+#define STORE_ALIGNED(x, a, s, o) \
+.Lchacha20_mips_xor_aligned_ ## x ## _b: ; \
+	.if ((s != NONCE) || (o != 0)); \
+		lw	T0, o(s); \
+	.endif; \
+	lw	T1, x-4 ## (IN); \
+	.if ((s == NONCE) && (o == 0)); \
+		addu	X ## a, NONCE_0; \
+	.else; \
+		addu	X ## a, T0; \
+	.endif; \
+	CPU_TO_LE32(X ## a); \
+	xor	X ## a, T1; \
+	sw	X ## a, x-4 ## (OUT);
+
+/* Jump table macro.
+ * Used for setup and handling the last bytes, which are not multiple of 4.
+ * X15 is free to store Xn
+ * Every jumptable entry must be equal in size.
+ */
+#define JMPTBL_ALIGNED(x, a, s, o) \
+.Lchacha20_mips_jmptbl_aligned_ ## a: ; \
+	.if ((s == NONCE) && (o == 0)); \
+		move	SAVED_CA, NONCE_0; \
+	.else; \
+		lw	SAVED_CA, o(s);\
+	.endif; \
+	b	.Lchacha20_mips_xor_aligned_ ## x ## _b; \
+	move	SAVED_X, X ## a;
+
+#define JMPTBL_UNALIGNED(x, a, s, o) \
+.Lchacha20_mips_jmptbl_unaligned_ ## a: ; \
+	.if ((s == NONCE) && (o == 0)); \
+		move	SAVED_CA, NONCE_0; \
+	.else; \
+		lw	SAVED_CA, o(s);\
+	.endif; \
+	b	.Lchacha20_mips_xor_unaligned_ ## x ## _b; \
+	move	SAVED_X, X ## a;
+
+#define AXR(A, B, C, D,  K, L, M, N,  V, W, Y, Z,  S) \
+	addu	X(A), X(K); \
+	addu	X(B), X(L); \
+	addu	X(C), X(M); \
+	addu	X(D), X(N); \
+	xor	X(V), X(A); \
+	xor	X(W), X(B); \
+	xor	X(Y), X(C); \
+	xor	X(Z), X(D); \
+	rotl	X(V), S;    \
+	rotl	X(W), S;    \
+	rotl	X(Y), S;    \
+	rotl	X(Z), S;
+
+.text
+.set reorder
+.set noat
+.globl chacha20_mips
+.ent   chacha20_mips
+chacha20_mips:
+	.frame $sp, STACK_SIZE, $ra
+	/* This is in the fifth argument */
+	lw	NONCE, 16($sp)
+
+	/* Return bytes = 0. */
+	.set noreorder
+	beqz	BYTES, .Lchacha20_mips_end
+	addiu	$sp, -STACK_SIZE
+	.set reorder
+
+	/* Calculate PTR_LAST_ROUND */
+	addiu	PTR_LAST_ROUND, BYTES, -1
+	ins	PTR_LAST_ROUND, $zero, 0, 6
+	addu	PTR_LAST_ROUND, OUT
+
+	/* Save s0-s7, fp, ra. */
+	sw	$ra,  0($sp)
+	sw	$fp,  4($sp)
+	sw	$s0,  8($sp)
+	sw	$s1, 12($sp)
+	sw	$s2, 16($sp)
+	sw	$s3, 20($sp)
+	sw	$s4, 24($sp)
+	sw	$s5, 28($sp)
+	sw	$s6, 32($sp)
+	sw	$s7, 36($sp)
+
+	lw	NONCE_0, 0(NONCE)
+	/* Test IN or OUT is unaligned.
+	 * UNALIGNED (T1) = ( IN | OUT ) & 0x00000003
+	 */
+	or	T1, IN, OUT
+	andi	T1, 0x3
+
+	/* Load constant */
+	lui	X0, %hi(CONSTANT_1)
+	lui	X1, %hi(CONSTANT_2)
+	lui	X2, %hi(CONSTANT_3)
+	lui	X3, %hi(CONSTANT_4)
+	ori	X0, %lo(CONSTANT_1)
+	ori	X1, %lo(CONSTANT_2)
+	ori	X2, %lo(CONSTANT_3)
+	ori	X3, %lo(CONSTANT_4)
+
+	/* Store constant on stack. */
+	sw	X0,  0+CONSTANT_OFS_SP($sp)
+	sw	X1,  4+CONSTANT_OFS_SP($sp)
+	sw	X2,  8+CONSTANT_OFS_SP($sp)
+	sw	X3, 12+CONSTANT_OFS_SP($sp)
+
+	sw	T1, UNALIGNED_OFS_SP($sp)
+
+	.set	noreorder
+	b	.Lchacha20_rounds_start
+	andi	BYTES, (CHACHA20_BLOCK_SIZE-1)
+	.set	reorder
+
+.align 4
+.Loop_chacha20_rounds:
+	addiu	IN,  CHACHA20_BLOCK_SIZE
+	addiu	OUT, CHACHA20_BLOCK_SIZE
+	addiu	NONCE_0, 1
+
+	lw	X0,  0+CONSTANT_OFS_SP($sp)
+	lw	X1,  4+CONSTANT_OFS_SP($sp)
+	lw	X2,  8+CONSTANT_OFS_SP($sp)
+	lw	X3, 12+CONSTANT_OFS_SP($sp)
+	lw	T1,   UNALIGNED_OFS_SP($sp)
+
+.Lchacha20_rounds_start:
+	lw	X4,   0(KEY)
+	lw	X5,   4(KEY)
+	lw	X6,   8(KEY)
+	lw	X7,  12(KEY)
+	lw	X8,  16(KEY)
+	lw	X9,  20(KEY)
+	lw	X10, 24(KEY)
+	lw	X11, 28(KEY)
+
+	move	X12, NONCE_0
+	lw	X13,  4(NONCE)
+	lw	X14,  8(NONCE)
+	lw	X15, 12(NONCE)
+
+	li	$at, 9
+.Loop_chacha20_xor_rounds:
+	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15, 16);
+	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7, 12);
+	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15,  8);
+	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7,  7);
+	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14, 16);
+	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4, 12);
+	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14,  8);
+	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4,  7);
+	.set noreorder
+	bnez	$at, .Loop_chacha20_xor_rounds
+	addiu	$at, -1
+
+	/* Unaligned? Jump */
+	bnez	T1, .Loop_chacha20_unaligned
+	andi	$at, BYTES, MASK_U32
+
+	/* Last round? No jump */
+	bne	OUT, PTR_LAST_ROUND, .Lchacha20_mips_xor_aligned_64_b
+	/* Load upper half of jump table addr */
+	lui	T0, %hi(.Lchacha20_mips_jmptbl_aligned_0)
+
+	/* Full block? Jump */
+	beqz	BYTES, .Lchacha20_mips_xor_aligned_64_b
+	/* Calculate lower half jump table addr and offset */
+	ins	T0, $at, 2, 6
+
+	subu	T0, $at
+	addiu	T0, %lo(.Lchacha20_mips_jmptbl_aligned_0)
+
+	jr	T0
+	/* Delay slot */
+	nop
+
+	.set	reorder
+
+.Loop_chacha20_unaligned:
+	.set noreorder
+
+	/* Last round? no jump */
+	bne	OUT, PTR_LAST_ROUND, .Lchacha20_mips_xor_unaligned_64_b
+	/* Load upper half of jump table addr */
+	lui	T0, %hi(.Lchacha20_mips_jmptbl_unaligned_0)
+
+	/* Full block? Jump */
+	beqz	BYTES, .Lchacha20_mips_xor_unaligned_64_b
+
+	/* Calculate lower half jump table addr and offset */
+	ins     T0, $at, 2, 6
+	subu	T0, $at
+	addiu	T0, %lo(.Lchacha20_mips_jmptbl_unaligned_0)
+
+	jr	T0
+	/* Delay slot */
+	nop
+
+	.set	reorder
+
+/* Aligned code path
+ */
+.align 4
+	STORE_ALIGNED(64, 15, NONCE,12)
+	STORE_ALIGNED(60, 14, NONCE, 8)
+	STORE_ALIGNED(56, 13, NONCE, 4)
+	STORE_ALIGNED(52, 12, NONCE, 0)
+	STORE_ALIGNED(48, 11, KEY, 28)
+	STORE_ALIGNED(44, 10, KEY, 24)
+	STORE_ALIGNED(40,  9, KEY, 20)
+	STORE_ALIGNED(36,  8, KEY, 16)
+	STORE_ALIGNED(32,  7, KEY, 12)
+	STORE_ALIGNED(28,  6, KEY,  8)
+	STORE_ALIGNED(24,  5, KEY,  4)
+	STORE_ALIGNED(20,  4, KEY,  0)
+	STORE_ALIGNED(16,  3, $sp, 12+CONSTANT_OFS_SP)
+	STORE_ALIGNED(12,  2, $sp,  8+CONSTANT_OFS_SP)
+	STORE_ALIGNED( 8,  1, $sp,  4+CONSTANT_OFS_SP)
+.Lchacha20_mips_xor_aligned_4_b:
+	/* STORE_ALIGNED( 4,  0, $sp, 0+CONSTANT_OFS_SP) */
+	lw	T0, 0+CONSTANT_OFS_SP($sp)
+	lw	T1, 0(IN)
+	addu	X0, T0
+	CPU_TO_LE32(X0)
+	xor	X0, T1
+	.set noreorder
+	bne	OUT, PTR_LAST_ROUND, .Loop_chacha20_rounds
+	sw	X0, 0(OUT)
+	.set reorder
+
+	.set noreorder
+	bne	$at, BYTES, .Lchacha20_mips_xor_bytes
+	/* Empty delayslot, Increase NONCE_0, return NONCE_0 value */
+	addiu	NONCE_0, 1
+	.set noreorder
+
+.Lchacha20_mips_xor_done:
+	/* Restore used registers */
+	lw	$ra,  0($sp)
+	lw	$fp,  4($sp)
+	lw	$s0,  8($sp)
+	lw	$s1, 12($sp)
+	lw	$s2, 16($sp)
+	lw	$s3, 20($sp)
+	lw	$s4, 24($sp)
+	lw	$s5, 28($sp)
+	lw	$s6, 32($sp)
+	lw	$s7, 36($sp)
+.Lchacha20_mips_end:
+	.set noreorder
+	jr	$ra
+	addiu	$sp, STACK_SIZE
+	.set reorder
+
+	.set noreorder
+	/* Start jump table */
+	JMPTBL_ALIGNED( 0,  0, $sp,  0+CONSTANT_OFS_SP)
+	JMPTBL_ALIGNED( 4,  1, $sp,  4+CONSTANT_OFS_SP)
+	JMPTBL_ALIGNED( 8,  2, $sp,  8+CONSTANT_OFS_SP)
+	JMPTBL_ALIGNED(12,  3, $sp, 12+CONSTANT_OFS_SP)
+	JMPTBL_ALIGNED(16,  4, KEY,  0)
+	JMPTBL_ALIGNED(20,  5, KEY,  4)
+	JMPTBL_ALIGNED(24,  6, KEY,  8)
+	JMPTBL_ALIGNED(28,  7, KEY, 12)
+	JMPTBL_ALIGNED(32,  8, KEY, 16)
+	JMPTBL_ALIGNED(36,  9, KEY, 20)
+	JMPTBL_ALIGNED(40, 10, KEY, 24)
+	JMPTBL_ALIGNED(44, 11, KEY, 28)
+	JMPTBL_ALIGNED(48, 12, NONCE, 0)
+	JMPTBL_ALIGNED(52, 13, NONCE, 4)
+	JMPTBL_ALIGNED(56, 14, NONCE, 8)
+	JMPTBL_ALIGNED(60, 15, NONCE,12)
+	/* End jump table */
+	.set reorder
+
+/* Unaligned code path
+ */
+	STORE_UNALIGNED(64, 15, NONCE,12)
+	STORE_UNALIGNED(60, 14, NONCE, 8)
+	STORE_UNALIGNED(56, 13, NONCE, 4)
+	STORE_UNALIGNED(52, 12, NONCE, 0)
+	STORE_UNALIGNED(48, 11, KEY, 28)
+	STORE_UNALIGNED(44, 10, KEY, 24)
+	STORE_UNALIGNED(40,  9, KEY, 20)
+	STORE_UNALIGNED(36,  8, KEY, 16)
+	STORE_UNALIGNED(32,  7, KEY, 12)
+	STORE_UNALIGNED(28,  6, KEY,  8)
+	STORE_UNALIGNED(24,  5, KEY,  4)
+	STORE_UNALIGNED(20,  4, KEY,  0)
+	STORE_UNALIGNED(16,  3, $sp, 12+CONSTANT_OFS_SP)
+	STORE_UNALIGNED(12,  2, $sp,  8+CONSTANT_OFS_SP)
+	STORE_UNALIGNED( 8,  1, $sp,  4+CONSTANT_OFS_SP)
+.Lchacha20_mips_xor_unaligned_4_b:
+	/* STORE_UNALIGNED( 4,  0, $sp, 0+CONSTANT_OFS_SP) */
+	lw	T0, 0+CONSTANT_OFS_SP($sp)
+	lwl	T1, 0+MSB(IN)
+	lwr	T1, 0+LSB(IN)
+	addu	X0, T0
+	CPU_TO_LE32(X0)
+	xor	X0, T1
+	swl	X0, 0+MSB(OUT)
+	.set noreorder
+	bne	OUT, PTR_LAST_ROUND, .Loop_chacha20_rounds
+	swr	X0, 0+LSB(OUT)
+	.set reorder
+
+	/* Fall through to byte handling */
+	.set noreorder
+	beq	$at, BYTES, .Lchacha20_mips_xor_done
+	/* Empty delayslot, increase NONCE_0, return NONCE_0 value */
+.Lchacha20_mips_xor_unaligned_0_b:
+.Lchacha20_mips_xor_aligned_0_b:
+	addiu	NONCE_0, 1
+	.set reorder
+
+.Lchacha20_mips_xor_bytes:
+	addu	OUT, $at
+	addu	IN, $at
+	addu	SAVED_X, SAVED_CA
+	/* First byte */
+	lbu	T1, 0(IN)
+	andi	$at, BYTES, 2
+	CPU_TO_LE32(SAVED_X)
+	ROTR(SAVED_X)
+	xor	T1, SAVED_X
+	.set noreorder
+	beqz	$at, .Lchacha20_mips_xor_done
+	sb	T1, 0(OUT)
+	.set reorder
+	/* Second byte */
+	lbu	T1, 1(IN)
+	andi	$at, BYTES, 1
+	ROTx	SAVED_X, 8
+	xor	T1, SAVED_X
+	.set noreorder
+	beqz	$at, .Lchacha20_mips_xor_done
+	sb	T1, 1(OUT)
+	.set reorder
+	/* Third byte */
+	lbu	T1, 2(IN)
+	ROTx	SAVED_X, 8
+	xor	T1, SAVED_X
+	.set noreorder
+	b	.Lchacha20_mips_xor_done
+	sb	T1, 2(OUT)
+	.set reorder
+.set noreorder
+
+.Lchacha20_mips_jmptbl_unaligned:
+	/* Start jump table */
+	JMPTBL_UNALIGNED( 0,  0, $sp,  0+CONSTANT_OFS_SP)
+	JMPTBL_UNALIGNED( 4,  1, $sp,  4+CONSTANT_OFS_SP)
+	JMPTBL_UNALIGNED( 8,  2, $sp,  8+CONSTANT_OFS_SP)
+	JMPTBL_UNALIGNED(12,  3, $sp, 12+CONSTANT_OFS_SP)
+	JMPTBL_UNALIGNED(16,  4, KEY,  0)
+	JMPTBL_UNALIGNED(20,  5, KEY,  4)
+	JMPTBL_UNALIGNED(24,  6, KEY,  8)
+	JMPTBL_UNALIGNED(28,  7, KEY, 12)
+	JMPTBL_UNALIGNED(32,  8, KEY, 16)
+	JMPTBL_UNALIGNED(36,  9, KEY, 20)
+	JMPTBL_UNALIGNED(40, 10, KEY, 24)
+	JMPTBL_UNALIGNED(44, 11, KEY, 28)
+	JMPTBL_UNALIGNED(48, 12, NONCE, 0)
+	JMPTBL_UNALIGNED(52, 13, NONCE, 4)
+	JMPTBL_UNALIGNED(56, 14, NONCE, 8)
+	JMPTBL_UNALIGNED(60, 15, NONCE,12)
+	/* End jump table */
+.set reorder
+
+.end chacha20_mips
+.set at
-- 
2.18.0

^ permalink raw reply related

* [PATCH v2 05/17] zinc: ChaCha20 x86_64 implementation
From: Jason A. Donenfeld @ 2018-08-24 21:38 UTC (permalink / raw)
  To: linux-kernel, netdev, davem
  Cc: Jason A. Donenfeld, Andy Lutomirski, Greg KH, Samuel Neves,
	Jean-Philippe Aumasson, Andy Polyakov, Thomas Gleixner,
	Ingo Molnar, x86, linux-crypto
In-Reply-To: <20180824213849.23647-1-Jason@zx2c4.com>

This provides SSSE3, AVX-2, AVX-512F, and AVX-512VL implementations for
ChaCha20. The AVX-512F implementation is disabled on Skylake, due to
throttling, and the VL ymm implementation is used instead. These come
from Andy Polyakov's implementation, with some heavy modifications from
Samuel Neves and me.

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Samuel Neves <sneves@dei.uc.pt>
Cc: Jean-Philippe Aumasson <jeanphilippe.aumasson@gmail.com>
Cc: Andy Polyakov <appro@openssl.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: x86@kernel.org
Cc: linux-crypto@vger.kernel.org
---
 lib/zinc/Makefile                        |    4 +
 lib/zinc/chacha20/chacha20-x86_64-glue.h |   84 +
 lib/zinc/chacha20/chacha20-x86_64.S      | 2630 ++++++++++++++++++++++
 3 files changed, 2718 insertions(+)
 create mode 100644 lib/zinc/chacha20/chacha20-x86_64-glue.h
 create mode 100644 lib/zinc/chacha20/chacha20-x86_64.S

diff --git a/lib/zinc/Makefile b/lib/zinc/Makefile
index 87be627f3354..fc6ca1c84cd1 100644
--- a/lib/zinc/Makefile
+++ b/lib/zinc/Makefile
@@ -4,6 +4,10 @@ ccflags-y += -D'pr_fmt(fmt)=KBUILD_MODNAME ": " fmt'
 
 ifeq ($(CONFIG_ZINC_CHACHA20),y)
 zinc-y += chacha20/chacha20.o
+ifeq ($(CONFIG_X86_64),y)
+zinc-y += chacha20/chacha20-x86_64.o
+CFLAGS_chacha20.o += -include $(srctree)/$(src)/chacha20/chacha20-x86_64-glue.h
+endif
 ifeq ($(CONFIG_ARM),y)
 zinc-y += chacha20/chacha20-arm.o
 CFLAGS_chacha20.o += -include $(srctree)/$(src)/chacha20/chacha20-arm-glue.h
diff --git a/lib/zinc/chacha20/chacha20-x86_64-glue.h b/lib/zinc/chacha20/chacha20-x86_64-glue.h
new file mode 100644
index 000000000000..ece8279e93e0
--- /dev/null
+++ b/lib/zinc/chacha20/chacha20-x86_64-glue.h
@@ -0,0 +1,84 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <zinc/chacha20.h>
+#include <asm/fpu/api.h>
+#include <asm/cpufeature.h>
+#include <asm/processor.h>
+#include <asm/intel-family.h>
+
+#ifdef CONFIG_AS_SSSE3
+asmlinkage void hchacha20_ssse3(u8 *derived_key, const u8 *nonce, const u8 *key);
+asmlinkage void chacha20_ssse3(u8 *out, const u8 *in, const size_t len, const u32 key[8], const u32 counter[4]);
+#endif
+#ifdef CONFIG_AS_AVX2
+asmlinkage void chacha20_avx2(u8 *out, const u8 *in, const size_t len, const u32 key[8], const u32 counter[4]);
+#endif
+#ifdef CONFIG_AS_AVX512
+asmlinkage void chacha20_avx512(u8 *out, const u8 *in, const size_t len, const u32 key[8], const u32 counter[4]);
+asmlinkage void chacha20_avx512vl(u8 *out, const u8 *in, const size_t len, const u32 key[8], const u32 counter[4]);
+#endif
+
+static bool chacha20_use_ssse3 __ro_after_init;
+static bool chacha20_use_avx2 __ro_after_init;
+static bool chacha20_use_avx512 __ro_after_init;
+static bool chacha20_use_avx512vl __ro_after_init;
+
+void __init chacha20_fpu_init(void)
+{
+#ifndef CONFIG_UML
+	chacha20_use_ssse3 = boot_cpu_has(X86_FEATURE_SSSE3);
+	chacha20_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) &&
+			    cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
+	chacha20_use_avx512 = boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX512F) &&
+			      cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_AVX512, NULL) &&
+			      boot_cpu_data.x86_model != INTEL_FAM6_SKYLAKE_X;
+	chacha20_use_avx512vl = boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX512F) && boot_cpu_has(X86_FEATURE_AVX512VL) &&
+				cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_AVX512, NULL);
+#endif
+}
+
+static inline bool chacha20_arch(u8 *dst, const u8 *src, const size_t len, const u32 key[8], const u32 counter[4], simd_context_t simd_context)
+{
+	if (simd_context != HAVE_FULL_SIMD)
+		return false;
+
+#ifdef CONFIG_AS_AVX512
+	if (chacha20_use_avx512) {
+		chacha20_avx512(dst, src, len, key, counter);
+		return true;
+	}
+	if (chacha20_use_avx512vl) {
+		chacha20_avx512vl(dst, src, len, key, counter);
+		return true;
+	}
+#endif
+#ifdef CONFIG_AS_AVX2
+	if (chacha20_use_avx2) {
+		chacha20_avx2(dst, src, len, key, counter);
+		return true;
+	}
+#endif
+#ifdef CONFIG_AS_SSSE3
+	if (chacha20_use_ssse3) {
+		chacha20_ssse3(dst, src, len, key, counter);
+		return true;
+	}
+#endif
+	return false;
+}
+
+static inline bool hchacha20_arch(u8 *derived_key, const u8 *nonce, const u8 *key, simd_context_t simd_context)
+{
+#if defined(CONFIG_AS_SSSE3)
+	if (simd_context == HAVE_FULL_SIMD && chacha20_use_ssse3) {
+		hchacha20_ssse3(derived_key, nonce, key);
+		return true;
+	}
+#endif
+	return false;
+}
+
+#define HAVE_CHACHA20_ARCH_IMPLEMENTATION
diff --git a/lib/zinc/chacha20/chacha20-x86_64.S b/lib/zinc/chacha20/chacha20-x86_64.S
new file mode 100644
index 000000000000..39883f3718b6
--- /dev/null
+++ b/lib/zinc/chacha20/chacha20-x86_64.S
@@ -0,0 +1,2630 @@
+/* SPDX-License-Identifier: OpenSSL OR (BSD-3-Clause OR GPL-2.0)
+ *
+ * Copyright (C) 2017 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ * Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
+ */
+
+#include <linux/linkage.h>
+
+.section .rodata.cst16.Lzero, "aM", @progbits, 16
+.align	16
+.Lzero:
+.long	0,0,0,0
+.section .rodata.cst16.Lone, "aM", @progbits, 16
+.align	16
+.Lone:
+.long	1,0,0,0
+.section .rodata.cst16.Linc, "aM", @progbits, 16
+.align	16
+.Linc:
+.long	0,1,2,3
+.section .rodata.cst16.Lfour, "aM", @progbits, 16
+.align	16
+.Lfour:
+.long	4,4,4,4
+.section .rodata.cst32.Lincy, "aM", @progbits, 32
+.align	32
+.Lincy:
+.long	0,2,4,6,1,3,5,7
+.section .rodata.cst32.Leight, "aM", @progbits, 32
+.align	32
+.Leight:
+.long	8,8,8,8,8,8,8,8
+.section .rodata.cst16.Lrot16, "aM", @progbits, 16
+.align	16
+.Lrot16:
+.byte	0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
+.section .rodata.cst16.Lrot24, "aM", @progbits, 16
+.align	16
+.Lrot24:
+.byte	0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
+.section .rodata.cst16.Lsigma, "aM", @progbits, 16
+.align	16
+.Lsigma:
+.byte	101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0
+.section .rodata.cst64.Lzeroz, "aM", @progbits, 64
+.align	64
+.Lzeroz:
+.long	0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
+.section .rodata.cst64.Lfourz, "aM", @progbits, 64
+.align	64
+.Lfourz:
+.long	4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
+.section .rodata.cst64.Lincz, "aM", @progbits, 64
+.align	64
+.Lincz:
+.long	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
+.section .rodata.cst64.Lsixteen, "aM", @progbits, 64
+.align	64
+.Lsixteen:
+.long	16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
+.section .rodata.cst32.Ltwoy, "aM", @progbits, 32
+.align	64
+.Ltwoy:
+.long	2,0,0,0, 2,0,0,0
+
+.text
+
+#ifdef CONFIG_AS_SSSE3
+.align	32
+ENTRY(hchacha20_ssse3)
+	movdqa	.Lsigma(%rip),%xmm0
+	movdqu	(%rdx),%xmm1
+	movdqu	16(%rdx),%xmm2
+	movdqu	(%rsi),%xmm3
+	movdqa	.Lrot16(%rip),%xmm6
+	movdqa	.Lrot24(%rip),%xmm7
+	movq	$10,%r8
+	.align	32
+.Loop_hssse3:
+	paddd	%xmm1,%xmm0
+	pxor	%xmm0,%xmm3
+	pshufb	%xmm6,%xmm3
+	paddd	%xmm3,%xmm2
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm1,%xmm4
+	psrld	$20,%xmm1
+	pslld	$12,%xmm4
+	por	%xmm4,%xmm1
+	paddd	%xmm1,%xmm0
+	pxor	%xmm0,%xmm3
+	pshufb	%xmm7,%xmm3
+	paddd	%xmm3,%xmm2
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm1,%xmm4
+	psrld	$25,%xmm1
+	pslld	$7,%xmm4
+	por	%xmm4,%xmm1
+	pshufd	$78,%xmm2,%xmm2
+	pshufd	$57,%xmm1,%xmm1
+	pshufd	$147,%xmm3,%xmm3
+	nop
+	paddd	%xmm1,%xmm0
+	pxor	%xmm0,%xmm3
+	pshufb	%xmm6,%xmm3
+	paddd	%xmm3,%xmm2
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm1,%xmm4
+	psrld	$20,%xmm1
+	pslld	$12,%xmm4
+	por	%xmm4,%xmm1
+	paddd	%xmm1,%xmm0
+	pxor	%xmm0,%xmm3
+	pshufb	%xmm7,%xmm3
+	paddd	%xmm3,%xmm2
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm1,%xmm4
+	psrld	$25,%xmm1
+	pslld	$7,%xmm4
+	por	%xmm4,%xmm1
+	pshufd	$78,%xmm2,%xmm2
+	pshufd	$147,%xmm1,%xmm1
+	pshufd	$57,%xmm3,%xmm3
+	decq	%r8
+	jnz	.Loop_hssse3
+	movdqu	%xmm0,0(%rdi)
+	movdqu	%xmm3,16(%rdi)
+	ret
+ENDPROC(hchacha20_ssse3)
+
+.align	32
+ENTRY(chacha20_ssse3)
+.Lchacha20_ssse3:
+	cmpq	$0,%rdx
+	je	.Lssse3_epilogue
+	leaq	8(%rsp),%r10
+
+	cmpq	$128,%rdx
+	ja	.Lchacha20_4x
+
+.Ldo_sse3_after_all:
+	subq	$64+8,%rsp
+	andq	$-32,%rsp
+	movdqa	.Lsigma(%rip),%xmm0
+	movdqu	(%rcx),%xmm1
+	movdqu	16(%rcx),%xmm2
+	movdqu	(%r8),%xmm3
+	movdqa	.Lrot16(%rip),%xmm6
+	movdqa	.Lrot24(%rip),%xmm7
+
+	movdqa	%xmm0,0(%rsp)
+	movdqa	%xmm1,16(%rsp)
+	movdqa	%xmm2,32(%rsp)
+	movdqa	%xmm3,48(%rsp)
+	movq	$10,%r8
+	jmp	.Loop_ssse3
+
+.align	32
+.Loop_outer_ssse3:
+	movdqa	.Lone(%rip),%xmm3
+	movdqa	0(%rsp),%xmm0
+	movdqa	16(%rsp),%xmm1
+	movdqa	32(%rsp),%xmm2
+	paddd	48(%rsp),%xmm3
+	movq	$10,%r8
+	movdqa	%xmm3,48(%rsp)
+	jmp	.Loop_ssse3
+
+.align	32
+.Loop_ssse3:
+	paddd	%xmm1,%xmm0
+	pxor	%xmm0,%xmm3
+	pshufb	%xmm6,%xmm3
+	paddd	%xmm3,%xmm2
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm1,%xmm4
+	psrld	$20,%xmm1
+	pslld	$12,%xmm4
+	por	%xmm4,%xmm1
+	paddd	%xmm1,%xmm0
+	pxor	%xmm0,%xmm3
+	pshufb	%xmm7,%xmm3
+	paddd	%xmm3,%xmm2
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm1,%xmm4
+	psrld	$25,%xmm1
+	pslld	$7,%xmm4
+	por	%xmm4,%xmm1
+	pshufd	$78,%xmm2,%xmm2
+	pshufd	$57,%xmm1,%xmm1
+	pshufd	$147,%xmm3,%xmm3
+	nop
+	paddd	%xmm1,%xmm0
+	pxor	%xmm0,%xmm3
+	pshufb	%xmm6,%xmm3
+	paddd	%xmm3,%xmm2
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm1,%xmm4
+	psrld	$20,%xmm1
+	pslld	$12,%xmm4
+	por	%xmm4,%xmm1
+	paddd	%xmm1,%xmm0
+	pxor	%xmm0,%xmm3
+	pshufb	%xmm7,%xmm3
+	paddd	%xmm3,%xmm2
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm1,%xmm4
+	psrld	$25,%xmm1
+	pslld	$7,%xmm4
+	por	%xmm4,%xmm1
+	pshufd	$78,%xmm2,%xmm2
+	pshufd	$147,%xmm1,%xmm1
+	pshufd	$57,%xmm3,%xmm3
+	decq	%r8
+	jnz	.Loop_ssse3
+	paddd	0(%rsp),%xmm0
+	paddd	16(%rsp),%xmm1
+	paddd	32(%rsp),%xmm2
+	paddd	48(%rsp),%xmm3
+
+	cmpq	$64,%rdx
+	jb	.Ltail_ssse3
+
+	movdqu	0(%rsi),%xmm4
+	movdqu	16(%rsi),%xmm5
+	pxor	%xmm4,%xmm0
+	movdqu	32(%rsi),%xmm4
+	pxor	%xmm5,%xmm1
+	movdqu	48(%rsi),%xmm5
+	leaq	64(%rsi),%rsi
+	pxor	%xmm4,%xmm2
+	pxor	%xmm5,%xmm3
+
+	movdqu	%xmm0,0(%rdi)
+	movdqu	%xmm1,16(%rdi)
+	movdqu	%xmm2,32(%rdi)
+	movdqu	%xmm3,48(%rdi)
+	leaq	64(%rdi),%rdi
+
+	subq	$64,%rdx
+	jnz	.Loop_outer_ssse3
+
+	jmp	.Ldone_ssse3
+
+.align	16
+.Ltail_ssse3:
+	movdqa	%xmm0,0(%rsp)
+	movdqa	%xmm1,16(%rsp)
+	movdqa	%xmm2,32(%rsp)
+	movdqa	%xmm3,48(%rsp)
+	xorq	%r8,%r8
+
+.Loop_tail_ssse3:
+	movzbl	(%rsi,%r8,1),%eax
+	movzbl	(%rsp,%r8,1),%ecx
+	leaq	1(%r8),%r8
+	xorl	%ecx,%eax
+	movb	%al,-1(%rdi,%r8,1)
+	decq	%rdx
+	jnz	.Loop_tail_ssse3
+
+.Ldone_ssse3:
+	leaq	-8(%r10),%rsp
+
+.Lssse3_epilogue:
+	ret
+
+.align	32
+.Lchacha20_4x:
+	leaq	8(%rsp),%r10
+
+.Lproceed4x:
+	subq	$0x140+8,%rsp
+	andq	$-32,%rsp
+	movdqa	.Lsigma(%rip),%xmm11
+	movdqu	(%rcx),%xmm15
+	movdqu	16(%rcx),%xmm7
+	movdqu	(%r8),%xmm3
+	leaq	256(%rsp),%rcx
+	leaq	.Lrot16(%rip),%r9
+	leaq	.Lrot24(%rip),%r11
+
+	pshufd	$0x00,%xmm11,%xmm8
+	pshufd	$0x55,%xmm11,%xmm9
+	movdqa	%xmm8,64(%rsp)
+	pshufd	$0xaa,%xmm11,%xmm10
+	movdqa	%xmm9,80(%rsp)
+	pshufd	$0xff,%xmm11,%xmm11
+	movdqa	%xmm10,96(%rsp)
+	movdqa	%xmm11,112(%rsp)
+
+	pshufd	$0x00,%xmm15,%xmm12
+	pshufd	$0x55,%xmm15,%xmm13
+	movdqa	%xmm12,128-256(%rcx)
+	pshufd	$0xaa,%xmm15,%xmm14
+	movdqa	%xmm13,144-256(%rcx)
+	pshufd	$0xff,%xmm15,%xmm15
+	movdqa	%xmm14,160-256(%rcx)
+	movdqa	%xmm15,176-256(%rcx)
+
+	pshufd	$0x00,%xmm7,%xmm4
+	pshufd	$0x55,%xmm7,%xmm5
+	movdqa	%xmm4,192-256(%rcx)
+	pshufd	$0xaa,%xmm7,%xmm6
+	movdqa	%xmm5,208-256(%rcx)
+	pshufd	$0xff,%xmm7,%xmm7
+	movdqa	%xmm6,224-256(%rcx)
+	movdqa	%xmm7,240-256(%rcx)
+
+	pshufd	$0x00,%xmm3,%xmm0
+	pshufd	$0x55,%xmm3,%xmm1
+	paddd	.Linc(%rip),%xmm0
+	pshufd	$0xaa,%xmm3,%xmm2
+	movdqa	%xmm1,272-256(%rcx)
+	pshufd	$0xff,%xmm3,%xmm3
+	movdqa	%xmm2,288-256(%rcx)
+	movdqa	%xmm3,304-256(%rcx)
+
+	jmp	.Loop_enter4x
+
+.align	32
+.Loop_outer4x:
+	movdqa	64(%rsp),%xmm8
+	movdqa	80(%rsp),%xmm9
+	movdqa	96(%rsp),%xmm10
+	movdqa	112(%rsp),%xmm11
+	movdqa	128-256(%rcx),%xmm12
+	movdqa	144-256(%rcx),%xmm13
+	movdqa	160-256(%rcx),%xmm14
+	movdqa	176-256(%rcx),%xmm15
+	movdqa	192-256(%rcx),%xmm4
+	movdqa	208-256(%rcx),%xmm5
+	movdqa	224-256(%rcx),%xmm6
+	movdqa	240-256(%rcx),%xmm7
+	movdqa	256-256(%rcx),%xmm0
+	movdqa	272-256(%rcx),%xmm1
+	movdqa	288-256(%rcx),%xmm2
+	movdqa	304-256(%rcx),%xmm3
+	paddd	.Lfour(%rip),%xmm0
+
+.Loop_enter4x:
+	movdqa	%xmm6,32(%rsp)
+	movdqa	%xmm7,48(%rsp)
+	movdqa	(%r9),%xmm7
+	movl	$10,%eax
+	movdqa	%xmm0,256-256(%rcx)
+	jmp	.Loop4x
+
+.align	32
+.Loop4x:
+	paddd	%xmm12,%xmm8
+	paddd	%xmm13,%xmm9
+	pxor	%xmm8,%xmm0
+	pxor	%xmm9,%xmm1
+	pshufb	%xmm7,%xmm0
+	pshufb	%xmm7,%xmm1
+	paddd	%xmm0,%xmm4
+	paddd	%xmm1,%xmm5
+	pxor	%xmm4,%xmm12
+	pxor	%xmm5,%xmm13
+	movdqa	%xmm12,%xmm6
+	pslld	$12,%xmm12
+	psrld	$20,%xmm6
+	movdqa	%xmm13,%xmm7
+	pslld	$12,%xmm13
+	por	%xmm6,%xmm12
+	psrld	$20,%xmm7
+	movdqa	(%r11),%xmm6
+	por	%xmm7,%xmm13
+	paddd	%xmm12,%xmm8
+	paddd	%xmm13,%xmm9
+	pxor	%xmm8,%xmm0
+	pxor	%xmm9,%xmm1
+	pshufb	%xmm6,%xmm0
+	pshufb	%xmm6,%xmm1
+	paddd	%xmm0,%xmm4
+	paddd	%xmm1,%xmm5
+	pxor	%xmm4,%xmm12
+	pxor	%xmm5,%xmm13
+	movdqa	%xmm12,%xmm7
+	pslld	$7,%xmm12
+	psrld	$25,%xmm7
+	movdqa	%xmm13,%xmm6
+	pslld	$7,%xmm13
+	por	%xmm7,%xmm12
+	psrld	$25,%xmm6
+	movdqa	(%r9),%xmm7
+	por	%xmm6,%xmm13
+	movdqa	%xmm4,0(%rsp)
+	movdqa	%xmm5,16(%rsp)
+	movdqa	32(%rsp),%xmm4
+	movdqa	48(%rsp),%xmm5
+	paddd	%xmm14,%xmm10
+	paddd	%xmm15,%xmm11
+	pxor	%xmm10,%xmm2
+	pxor	%xmm11,%xmm3
+	pshufb	%xmm7,%xmm2
+	pshufb	%xmm7,%xmm3
+	paddd	%xmm2,%xmm4
+	paddd	%xmm3,%xmm5
+	pxor	%xmm4,%xmm14
+	pxor	%xmm5,%xmm15
+	movdqa	%xmm14,%xmm6
+	pslld	$12,%xmm14
+	psrld	$20,%xmm6
+	movdqa	%xmm15,%xmm7
+	pslld	$12,%xmm15
+	por	%xmm6,%xmm14
+	psrld	$20,%xmm7
+	movdqa	(%r11),%xmm6
+	por	%xmm7,%xmm15
+	paddd	%xmm14,%xmm10
+	paddd	%xmm15,%xmm11
+	pxor	%xmm10,%xmm2
+	pxor	%xmm11,%xmm3
+	pshufb	%xmm6,%xmm2
+	pshufb	%xmm6,%xmm3
+	paddd	%xmm2,%xmm4
+	paddd	%xmm3,%xmm5
+	pxor	%xmm4,%xmm14
+	pxor	%xmm5,%xmm15
+	movdqa	%xmm14,%xmm7
+	pslld	$7,%xmm14
+	psrld	$25,%xmm7
+	movdqa	%xmm15,%xmm6
+	pslld	$7,%xmm15
+	por	%xmm7,%xmm14
+	psrld	$25,%xmm6
+	movdqa	(%r9),%xmm7
+	por	%xmm6,%xmm15
+	paddd	%xmm13,%xmm8
+	paddd	%xmm14,%xmm9
+	pxor	%xmm8,%xmm3
+	pxor	%xmm9,%xmm0
+	pshufb	%xmm7,%xmm3
+	pshufb	%xmm7,%xmm0
+	paddd	%xmm3,%xmm4
+	paddd	%xmm0,%xmm5
+	pxor	%xmm4,%xmm13
+	pxor	%xmm5,%xmm14
+	movdqa	%xmm13,%xmm6
+	pslld	$12,%xmm13
+	psrld	$20,%xmm6
+	movdqa	%xmm14,%xmm7
+	pslld	$12,%xmm14
+	por	%xmm6,%xmm13
+	psrld	$20,%xmm7
+	movdqa	(%r11),%xmm6
+	por	%xmm7,%xmm14
+	paddd	%xmm13,%xmm8
+	paddd	%xmm14,%xmm9
+	pxor	%xmm8,%xmm3
+	pxor	%xmm9,%xmm0
+	pshufb	%xmm6,%xmm3
+	pshufb	%xmm6,%xmm0
+	paddd	%xmm3,%xmm4
+	paddd	%xmm0,%xmm5
+	pxor	%xmm4,%xmm13
+	pxor	%xmm5,%xmm14
+	movdqa	%xmm13,%xmm7
+	pslld	$7,%xmm13
+	psrld	$25,%xmm7
+	movdqa	%xmm14,%xmm6
+	pslld	$7,%xmm14
+	por	%xmm7,%xmm13
+	psrld	$25,%xmm6
+	movdqa	(%r9),%xmm7
+	por	%xmm6,%xmm14
+	movdqa	%xmm4,32(%rsp)
+	movdqa	%xmm5,48(%rsp)
+	movdqa	0(%rsp),%xmm4
+	movdqa	16(%rsp),%xmm5
+	paddd	%xmm15,%xmm10
+	paddd	%xmm12,%xmm11
+	pxor	%xmm10,%xmm1
+	pxor	%xmm11,%xmm2
+	pshufb	%xmm7,%xmm1
+	pshufb	%xmm7,%xmm2
+	paddd	%xmm1,%xmm4
+	paddd	%xmm2,%xmm5
+	pxor	%xmm4,%xmm15
+	pxor	%xmm5,%xmm12
+	movdqa	%xmm15,%xmm6
+	pslld	$12,%xmm15
+	psrld	$20,%xmm6
+	movdqa	%xmm12,%xmm7
+	pslld	$12,%xmm12
+	por	%xmm6,%xmm15
+	psrld	$20,%xmm7
+	movdqa	(%r11),%xmm6
+	por	%xmm7,%xmm12
+	paddd	%xmm15,%xmm10
+	paddd	%xmm12,%xmm11
+	pxor	%xmm10,%xmm1
+	pxor	%xmm11,%xmm2
+	pshufb	%xmm6,%xmm1
+	pshufb	%xmm6,%xmm2
+	paddd	%xmm1,%xmm4
+	paddd	%xmm2,%xmm5
+	pxor	%xmm4,%xmm15
+	pxor	%xmm5,%xmm12
+	movdqa	%xmm15,%xmm7
+	pslld	$7,%xmm15
+	psrld	$25,%xmm7
+	movdqa	%xmm12,%xmm6
+	pslld	$7,%xmm12
+	por	%xmm7,%xmm15
+	psrld	$25,%xmm6
+	movdqa	(%r9),%xmm7
+	por	%xmm6,%xmm12
+	decl	%eax
+	jnz	.Loop4x
+
+	paddd	64(%rsp),%xmm8
+	paddd	80(%rsp),%xmm9
+	paddd	96(%rsp),%xmm10
+	paddd	112(%rsp),%xmm11
+
+	movdqa	%xmm8,%xmm6
+	punpckldq	%xmm9,%xmm8
+	movdqa	%xmm10,%xmm7
+	punpckldq	%xmm11,%xmm10
+	punpckhdq	%xmm9,%xmm6
+	punpckhdq	%xmm11,%xmm7
+	movdqa	%xmm8,%xmm9
+	punpcklqdq	%xmm10,%xmm8
+	movdqa	%xmm6,%xmm11
+	punpcklqdq	%xmm7,%xmm6
+	punpckhqdq	%xmm10,%xmm9
+	punpckhqdq	%xmm7,%xmm11
+	paddd	128-256(%rcx),%xmm12
+	paddd	144-256(%rcx),%xmm13
+	paddd	160-256(%rcx),%xmm14
+	paddd	176-256(%rcx),%xmm15
+
+	movdqa	%xmm8,0(%rsp)
+	movdqa	%xmm9,16(%rsp)
+	movdqa	32(%rsp),%xmm8
+	movdqa	48(%rsp),%xmm9
+
+	movdqa	%xmm12,%xmm10
+	punpckldq	%xmm13,%xmm12
+	movdqa	%xmm14,%xmm7
+	punpckldq	%xmm15,%xmm14
+	punpckhdq	%xmm13,%xmm10
+	punpckhdq	%xmm15,%xmm7
+	movdqa	%xmm12,%xmm13
+	punpcklqdq	%xmm14,%xmm12
+	movdqa	%xmm10,%xmm15
+	punpcklqdq	%xmm7,%xmm10
+	punpckhqdq	%xmm14,%xmm13
+	punpckhqdq	%xmm7,%xmm15
+	paddd	192-256(%rcx),%xmm4
+	paddd	208-256(%rcx),%xmm5
+	paddd	224-256(%rcx),%xmm8
+	paddd	240-256(%rcx),%xmm9
+
+	movdqa	%xmm6,32(%rsp)
+	movdqa	%xmm11,48(%rsp)
+
+	movdqa	%xmm4,%xmm14
+	punpckldq	%xmm5,%xmm4
+	movdqa	%xmm8,%xmm7
+	punpckldq	%xmm9,%xmm8
+	punpckhdq	%xmm5,%xmm14
+	punpckhdq	%xmm9,%xmm7
+	movdqa	%xmm4,%xmm5
+	punpcklqdq	%xmm8,%xmm4
+	movdqa	%xmm14,%xmm9
+	punpcklqdq	%xmm7,%xmm14
+	punpckhqdq	%xmm8,%xmm5
+	punpckhqdq	%xmm7,%xmm9
+	paddd	256-256(%rcx),%xmm0
+	paddd	272-256(%rcx),%xmm1
+	paddd	288-256(%rcx),%xmm2
+	paddd	304-256(%rcx),%xmm3
+
+	movdqa	%xmm0,%xmm8
+	punpckldq	%xmm1,%xmm0
+	movdqa	%xmm2,%xmm7
+	punpckldq	%xmm3,%xmm2
+	punpckhdq	%xmm1,%xmm8
+	punpckhdq	%xmm3,%xmm7
+	movdqa	%xmm0,%xmm1
+	punpcklqdq	%xmm2,%xmm0
+	movdqa	%xmm8,%xmm3
+	punpcklqdq	%xmm7,%xmm8
+	punpckhqdq	%xmm2,%xmm1
+	punpckhqdq	%xmm7,%xmm3
+	cmpq	$256,%rdx
+	jb	.Ltail4x
+
+	movdqu	0(%rsi),%xmm6
+	movdqu	16(%rsi),%xmm11
+	movdqu	32(%rsi),%xmm2
+	movdqu	48(%rsi),%xmm7
+	pxor	0(%rsp),%xmm6
+	pxor	%xmm12,%xmm11
+	pxor	%xmm4,%xmm2
+	pxor	%xmm0,%xmm7
+
+	movdqu	%xmm6,0(%rdi)
+	movdqu	64(%rsi),%xmm6
+	movdqu	%xmm11,16(%rdi)
+	movdqu	80(%rsi),%xmm11
+	movdqu	%xmm2,32(%rdi)
+	movdqu	96(%rsi),%xmm2
+	movdqu	%xmm7,48(%rdi)
+	movdqu	112(%rsi),%xmm7
+	leaq	128(%rsi),%rsi
+	pxor	16(%rsp),%xmm6
+	pxor	%xmm13,%xmm11
+	pxor	%xmm5,%xmm2
+	pxor	%xmm1,%xmm7
+
+	movdqu	%xmm6,64(%rdi)
+	movdqu	0(%rsi),%xmm6
+	movdqu	%xmm11,80(%rdi)
+	movdqu	16(%rsi),%xmm11
+	movdqu	%xmm2,96(%rdi)
+	movdqu	32(%rsi),%xmm2
+	movdqu	%xmm7,112(%rdi)
+	leaq	128(%rdi),%rdi
+	movdqu	48(%rsi),%xmm7
+	pxor	32(%rsp),%xmm6
+	pxor	%xmm10,%xmm11
+	pxor	%xmm14,%xmm2
+	pxor	%xmm8,%xmm7
+
+	movdqu	%xmm6,0(%rdi)
+	movdqu	64(%rsi),%xmm6
+	movdqu	%xmm11,16(%rdi)
+	movdqu	80(%rsi),%xmm11
+	movdqu	%xmm2,32(%rdi)
+	movdqu	96(%rsi),%xmm2
+	movdqu	%xmm7,48(%rdi)
+	movdqu	112(%rsi),%xmm7
+	leaq	128(%rsi),%rsi
+	pxor	48(%rsp),%xmm6
+	pxor	%xmm15,%xmm11
+	pxor	%xmm9,%xmm2
+	pxor	%xmm3,%xmm7
+	movdqu	%xmm6,64(%rdi)
+	movdqu	%xmm11,80(%rdi)
+	movdqu	%xmm2,96(%rdi)
+	movdqu	%xmm7,112(%rdi)
+	leaq	128(%rdi),%rdi
+
+	subq	$256,%rdx
+	jnz	.Loop_outer4x
+
+	jmp	.Ldone4x
+
+.Ltail4x:
+	cmpq	$192,%rdx
+	jae	.L192_or_more4x
+	cmpq	$128,%rdx
+	jae	.L128_or_more4x
+	cmpq	$64,%rdx
+	jae	.L64_or_more4x
+
+
+	xorq	%r9,%r9
+
+	movdqa	%xmm12,16(%rsp)
+	movdqa	%xmm4,32(%rsp)
+	movdqa	%xmm0,48(%rsp)
+	jmp	.Loop_tail4x
+
+.align	32
+.L64_or_more4x:
+	movdqu	0(%rsi),%xmm6
+	movdqu	16(%rsi),%xmm11
+	movdqu	32(%rsi),%xmm2
+	movdqu	48(%rsi),%xmm7
+	pxor	0(%rsp),%xmm6
+	pxor	%xmm12,%xmm11
+	pxor	%xmm4,%xmm2
+	pxor	%xmm0,%xmm7
+	movdqu	%xmm6,0(%rdi)
+	movdqu	%xmm11,16(%rdi)
+	movdqu	%xmm2,32(%rdi)
+	movdqu	%xmm7,48(%rdi)
+	je	.Ldone4x
+
+	movdqa	16(%rsp),%xmm6
+	leaq	64(%rsi),%rsi
+	xorq	%r9,%r9
+	movdqa	%xmm6,0(%rsp)
+	movdqa	%xmm13,16(%rsp)
+	leaq	64(%rdi),%rdi
+	movdqa	%xmm5,32(%rsp)
+	subq	$64,%rdx
+	movdqa	%xmm1,48(%rsp)
+	jmp	.Loop_tail4x
+
+.align	32
+.L128_or_more4x:
+	movdqu	0(%rsi),%xmm6
+	movdqu	16(%rsi),%xmm11
+	movdqu	32(%rsi),%xmm2
+	movdqu	48(%rsi),%xmm7
+	pxor	0(%rsp),%xmm6
+	pxor	%xmm12,%xmm11
+	pxor	%xmm4,%xmm2
+	pxor	%xmm0,%xmm7
+
+	movdqu	%xmm6,0(%rdi)
+	movdqu	64(%rsi),%xmm6
+	movdqu	%xmm11,16(%rdi)
+	movdqu	80(%rsi),%xmm11
+	movdqu	%xmm2,32(%rdi)
+	movdqu	96(%rsi),%xmm2
+	movdqu	%xmm7,48(%rdi)
+	movdqu	112(%rsi),%xmm7
+	pxor	16(%rsp),%xmm6
+	pxor	%xmm13,%xmm11
+	pxor	%xmm5,%xmm2
+	pxor	%xmm1,%xmm7
+	movdqu	%xmm6,64(%rdi)
+	movdqu	%xmm11,80(%rdi)
+	movdqu	%xmm2,96(%rdi)
+	movdqu	%xmm7,112(%rdi)
+	je	.Ldone4x
+
+	movdqa	32(%rsp),%xmm6
+	leaq	128(%rsi),%rsi
+	xorq	%r9,%r9
+	movdqa	%xmm6,0(%rsp)
+	movdqa	%xmm10,16(%rsp)
+	leaq	128(%rdi),%rdi
+	movdqa	%xmm14,32(%rsp)
+	subq	$128,%rdx
+	movdqa	%xmm8,48(%rsp)
+	jmp	.Loop_tail4x
+
+.align	32
+.L192_or_more4x:
+	movdqu	0(%rsi),%xmm6
+	movdqu	16(%rsi),%xmm11
+	movdqu	32(%rsi),%xmm2
+	movdqu	48(%rsi),%xmm7
+	pxor	0(%rsp),%xmm6
+	pxor	%xmm12,%xmm11
+	pxor	%xmm4,%xmm2
+	pxor	%xmm0,%xmm7
+
+	movdqu	%xmm6,0(%rdi)
+	movdqu	64(%rsi),%xmm6
+	movdqu	%xmm11,16(%rdi)
+	movdqu	80(%rsi),%xmm11
+	movdqu	%xmm2,32(%rdi)
+	movdqu	96(%rsi),%xmm2
+	movdqu	%xmm7,48(%rdi)
+	movdqu	112(%rsi),%xmm7
+	leaq	128(%rsi),%rsi
+	pxor	16(%rsp),%xmm6
+	pxor	%xmm13,%xmm11
+	pxor	%xmm5,%xmm2
+	pxor	%xmm1,%xmm7
+
+	movdqu	%xmm6,64(%rdi)
+	movdqu	0(%rsi),%xmm6
+	movdqu	%xmm11,80(%rdi)
+	movdqu	16(%rsi),%xmm11
+	movdqu	%xmm2,96(%rdi)
+	movdqu	32(%rsi),%xmm2
+	movdqu	%xmm7,112(%rdi)
+	leaq	128(%rdi),%rdi
+	movdqu	48(%rsi),%xmm7
+	pxor	32(%rsp),%xmm6
+	pxor	%xmm10,%xmm11
+	pxor	%xmm14,%xmm2
+	pxor	%xmm8,%xmm7
+	movdqu	%xmm6,0(%rdi)
+	movdqu	%xmm11,16(%rdi)
+	movdqu	%xmm2,32(%rdi)
+	movdqu	%xmm7,48(%rdi)
+	je	.Ldone4x
+
+	movdqa	48(%rsp),%xmm6
+	leaq	64(%rsi),%rsi
+	xorq	%r9,%r9
+	movdqa	%xmm6,0(%rsp)
+	movdqa	%xmm15,16(%rsp)
+	leaq	64(%rdi),%rdi
+	movdqa	%xmm9,32(%rsp)
+	subq	$192,%rdx
+	movdqa	%xmm3,48(%rsp)
+
+.Loop_tail4x:
+	movzbl	(%rsi,%r9,1),%eax
+	movzbl	(%rsp,%r9,1),%ecx
+	leaq	1(%r9),%r9
+	xorl	%ecx,%eax
+	movb	%al,-1(%rdi,%r9,1)
+	decq	%rdx
+	jnz	.Loop_tail4x
+
+.Ldone4x:
+	leaq	-8(%r10),%rsp
+
+.L4x_epilogue:
+	ret
+ENDPROC(chacha20_ssse3)
+#endif /* CONFIG_AS_SSSE3 */
+
+#ifdef CONFIG_AS_AVX2
+.align	32
+ENTRY(chacha20_avx2)
+.Lchacha20_avx2:
+	cmpq	$0,%rdx
+	je	.L8x_epilogue
+	leaq	8(%rsp),%r10
+
+	subq	$0x280+8,%rsp
+	andq	$-32,%rsp
+	vzeroupper
+
+	vbroadcasti128	.Lsigma(%rip),%ymm11
+	vbroadcasti128	(%rcx),%ymm3
+	vbroadcasti128	16(%rcx),%ymm15
+	vbroadcasti128	(%r8),%ymm7
+	leaq	256(%rsp),%rcx
+	leaq	512(%rsp),%rax
+	leaq	.Lrot16(%rip),%r9
+	leaq	.Lrot24(%rip),%r11
+
+	vpshufd	$0x00,%ymm11,%ymm8
+	vpshufd	$0x55,%ymm11,%ymm9
+	vmovdqa	%ymm8,128-256(%rcx)
+	vpshufd	$0xaa,%ymm11,%ymm10
+	vmovdqa	%ymm9,160-256(%rcx)
+	vpshufd	$0xff,%ymm11,%ymm11
+	vmovdqa	%ymm10,192-256(%rcx)
+	vmovdqa	%ymm11,224-256(%rcx)
+
+	vpshufd	$0x00,%ymm3,%ymm0
+	vpshufd	$0x55,%ymm3,%ymm1
+	vmovdqa	%ymm0,256-256(%rcx)
+	vpshufd	$0xaa,%ymm3,%ymm2
+	vmovdqa	%ymm1,288-256(%rcx)
+	vpshufd	$0xff,%ymm3,%ymm3
+	vmovdqa	%ymm2,320-256(%rcx)
+	vmovdqa	%ymm3,352-256(%rcx)
+
+	vpshufd	$0x00,%ymm15,%ymm12
+	vpshufd	$0x55,%ymm15,%ymm13
+	vmovdqa	%ymm12,384-512(%rax)
+	vpshufd	$0xaa,%ymm15,%ymm14
+	vmovdqa	%ymm13,416-512(%rax)
+	vpshufd	$0xff,%ymm15,%ymm15
+	vmovdqa	%ymm14,448-512(%rax)
+	vmovdqa	%ymm15,480-512(%rax)
+
+	vpshufd	$0x00,%ymm7,%ymm4
+	vpshufd	$0x55,%ymm7,%ymm5
+	vpaddd	.Lincy(%rip),%ymm4,%ymm4
+	vpshufd	$0xaa,%ymm7,%ymm6
+	vmovdqa	%ymm5,544-512(%rax)
+	vpshufd	$0xff,%ymm7,%ymm7
+	vmovdqa	%ymm6,576-512(%rax)
+	vmovdqa	%ymm7,608-512(%rax)
+
+	jmp	.Loop_enter8x
+
+.align	32
+.Loop_outer8x:
+	vmovdqa	128-256(%rcx),%ymm8
+	vmovdqa	160-256(%rcx),%ymm9
+	vmovdqa	192-256(%rcx),%ymm10
+	vmovdqa	224-256(%rcx),%ymm11
+	vmovdqa	256-256(%rcx),%ymm0
+	vmovdqa	288-256(%rcx),%ymm1
+	vmovdqa	320-256(%rcx),%ymm2
+	vmovdqa	352-256(%rcx),%ymm3
+	vmovdqa	384-512(%rax),%ymm12
+	vmovdqa	416-512(%rax),%ymm13
+	vmovdqa	448-512(%rax),%ymm14
+	vmovdqa	480-512(%rax),%ymm15
+	vmovdqa	512-512(%rax),%ymm4
+	vmovdqa	544-512(%rax),%ymm5
+	vmovdqa	576-512(%rax),%ymm6
+	vmovdqa	608-512(%rax),%ymm7
+	vpaddd	.Leight(%rip),%ymm4,%ymm4
+
+.Loop_enter8x:
+	vmovdqa	%ymm14,64(%rsp)
+	vmovdqa	%ymm15,96(%rsp)
+	vbroadcasti128	(%r9),%ymm15
+	vmovdqa	%ymm4,512-512(%rax)
+	movl	$10,%eax
+	jmp	.Loop8x
+
+.align	32
+.Loop8x:
+	vpaddd	%ymm0,%ymm8,%ymm8
+	vpxor	%ymm4,%ymm8,%ymm4
+	vpshufb	%ymm15,%ymm4,%ymm4
+	vpaddd	%ymm1,%ymm9,%ymm9
+	vpxor	%ymm5,%ymm9,%ymm5
+	vpshufb	%ymm15,%ymm5,%ymm5
+	vpaddd	%ymm4,%ymm12,%ymm12
+	vpxor	%ymm0,%ymm12,%ymm0
+	vpslld	$12,%ymm0,%ymm14
+	vpsrld	$20,%ymm0,%ymm0
+	vpor	%ymm0,%ymm14,%ymm0
+	vbroadcasti128	(%r11),%ymm14
+	vpaddd	%ymm5,%ymm13,%ymm13
+	vpxor	%ymm1,%ymm13,%ymm1
+	vpslld	$12,%ymm1,%ymm15
+	vpsrld	$20,%ymm1,%ymm1
+	vpor	%ymm1,%ymm15,%ymm1
+	vpaddd	%ymm0,%ymm8,%ymm8
+	vpxor	%ymm4,%ymm8,%ymm4
+	vpshufb	%ymm14,%ymm4,%ymm4
+	vpaddd	%ymm1,%ymm9,%ymm9
+	vpxor	%ymm5,%ymm9,%ymm5
+	vpshufb	%ymm14,%ymm5,%ymm5
+	vpaddd	%ymm4,%ymm12,%ymm12
+	vpxor	%ymm0,%ymm12,%ymm0
+	vpslld	$7,%ymm0,%ymm15
+	vpsrld	$25,%ymm0,%ymm0
+	vpor	%ymm0,%ymm15,%ymm0
+	vbroadcasti128	(%r9),%ymm15
+	vpaddd	%ymm5,%ymm13,%ymm13
+	vpxor	%ymm1,%ymm13,%ymm1
+	vpslld	$7,%ymm1,%ymm14
+	vpsrld	$25,%ymm1,%ymm1
+	vpor	%ymm1,%ymm14,%ymm1
+	vmovdqa	%ymm12,0(%rsp)
+	vmovdqa	%ymm13,32(%rsp)
+	vmovdqa	64(%rsp),%ymm12
+	vmovdqa	96(%rsp),%ymm13
+	vpaddd	%ymm2,%ymm10,%ymm10
+	vpxor	%ymm6,%ymm10,%ymm6
+	vpshufb	%ymm15,%ymm6,%ymm6
+	vpaddd	%ymm3,%ymm11,%ymm11
+	vpxor	%ymm7,%ymm11,%ymm7
+	vpshufb	%ymm15,%ymm7,%ymm7
+	vpaddd	%ymm6,%ymm12,%ymm12
+	vpxor	%ymm2,%ymm12,%ymm2
+	vpslld	$12,%ymm2,%ymm14
+	vpsrld	$20,%ymm2,%ymm2
+	vpor	%ymm2,%ymm14,%ymm2
+	vbroadcasti128	(%r11),%ymm14
+	vpaddd	%ymm7,%ymm13,%ymm13
+	vpxor	%ymm3,%ymm13,%ymm3
+	vpslld	$12,%ymm3,%ymm15
+	vpsrld	$20,%ymm3,%ymm3
+	vpor	%ymm3,%ymm15,%ymm3
+	vpaddd	%ymm2,%ymm10,%ymm10
+	vpxor	%ymm6,%ymm10,%ymm6
+	vpshufb	%ymm14,%ymm6,%ymm6
+	vpaddd	%ymm3,%ymm11,%ymm11
+	vpxor	%ymm7,%ymm11,%ymm7
+	vpshufb	%ymm14,%ymm7,%ymm7
+	vpaddd	%ymm6,%ymm12,%ymm12
+	vpxor	%ymm2,%ymm12,%ymm2
+	vpslld	$7,%ymm2,%ymm15
+	vpsrld	$25,%ymm2,%ymm2
+	vpor	%ymm2,%ymm15,%ymm2
+	vbroadcasti128	(%r9),%ymm15
+	vpaddd	%ymm7,%ymm13,%ymm13
+	vpxor	%ymm3,%ymm13,%ymm3
+	vpslld	$7,%ymm3,%ymm14
+	vpsrld	$25,%ymm3,%ymm3
+	vpor	%ymm3,%ymm14,%ymm3
+	vpaddd	%ymm1,%ymm8,%ymm8
+	vpxor	%ymm7,%ymm8,%ymm7
+	vpshufb	%ymm15,%ymm7,%ymm7
+	vpaddd	%ymm2,%ymm9,%ymm9
+	vpxor	%ymm4,%ymm9,%ymm4
+	vpshufb	%ymm15,%ymm4,%ymm4
+	vpaddd	%ymm7,%ymm12,%ymm12
+	vpxor	%ymm1,%ymm12,%ymm1
+	vpslld	$12,%ymm1,%ymm14
+	vpsrld	$20,%ymm1,%ymm1
+	vpor	%ymm1,%ymm14,%ymm1
+	vbroadcasti128	(%r11),%ymm14
+	vpaddd	%ymm4,%ymm13,%ymm13
+	vpxor	%ymm2,%ymm13,%ymm2
+	vpslld	$12,%ymm2,%ymm15
+	vpsrld	$20,%ymm2,%ymm2
+	vpor	%ymm2,%ymm15,%ymm2
+	vpaddd	%ymm1,%ymm8,%ymm8
+	vpxor	%ymm7,%ymm8,%ymm7
+	vpshufb	%ymm14,%ymm7,%ymm7
+	vpaddd	%ymm2,%ymm9,%ymm9
+	vpxor	%ymm4,%ymm9,%ymm4
+	vpshufb	%ymm14,%ymm4,%ymm4
+	vpaddd	%ymm7,%ymm12,%ymm12
+	vpxor	%ymm1,%ymm12,%ymm1
+	vpslld	$7,%ymm1,%ymm15
+	vpsrld	$25,%ymm1,%ymm1
+	vpor	%ymm1,%ymm15,%ymm1
+	vbroadcasti128	(%r9),%ymm15
+	vpaddd	%ymm4,%ymm13,%ymm13
+	vpxor	%ymm2,%ymm13,%ymm2
+	vpslld	$7,%ymm2,%ymm14
+	vpsrld	$25,%ymm2,%ymm2
+	vpor	%ymm2,%ymm14,%ymm2
+	vmovdqa	%ymm12,64(%rsp)
+	vmovdqa	%ymm13,96(%rsp)
+	vmovdqa	0(%rsp),%ymm12
+	vmovdqa	32(%rsp),%ymm13
+	vpaddd	%ymm3,%ymm10,%ymm10
+	vpxor	%ymm5,%ymm10,%ymm5
+	vpshufb	%ymm15,%ymm5,%ymm5
+	vpaddd	%ymm0,%ymm11,%ymm11
+	vpxor	%ymm6,%ymm11,%ymm6
+	vpshufb	%ymm15,%ymm6,%ymm6
+	vpaddd	%ymm5,%ymm12,%ymm12
+	vpxor	%ymm3,%ymm12,%ymm3
+	vpslld	$12,%ymm3,%ymm14
+	vpsrld	$20,%ymm3,%ymm3
+	vpor	%ymm3,%ymm14,%ymm3
+	vbroadcasti128	(%r11),%ymm14
+	vpaddd	%ymm6,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm13,%ymm0
+	vpslld	$12,%ymm0,%ymm15
+	vpsrld	$20,%ymm0,%ymm0
+	vpor	%ymm0,%ymm15,%ymm0
+	vpaddd	%ymm3,%ymm10,%ymm10
+	vpxor	%ymm5,%ymm10,%ymm5
+	vpshufb	%ymm14,%ymm5,%ymm5
+	vpaddd	%ymm0,%ymm11,%ymm11
+	vpxor	%ymm6,%ymm11,%ymm6
+	vpshufb	%ymm14,%ymm6,%ymm6
+	vpaddd	%ymm5,%ymm12,%ymm12
+	vpxor	%ymm3,%ymm12,%ymm3
+	vpslld	$7,%ymm3,%ymm15
+	vpsrld	$25,%ymm3,%ymm3
+	vpor	%ymm3,%ymm15,%ymm3
+	vbroadcasti128	(%r9),%ymm15
+	vpaddd	%ymm6,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm13,%ymm0
+	vpslld	$7,%ymm0,%ymm14
+	vpsrld	$25,%ymm0,%ymm0
+	vpor	%ymm0,%ymm14,%ymm0
+	decl	%eax
+	jnz	.Loop8x
+
+	leaq	512(%rsp),%rax
+	vpaddd	128-256(%rcx),%ymm8,%ymm8
+	vpaddd	160-256(%rcx),%ymm9,%ymm9
+	vpaddd	192-256(%rcx),%ymm10,%ymm10
+	vpaddd	224-256(%rcx),%ymm11,%ymm11
+
+	vpunpckldq	%ymm9,%ymm8,%ymm14
+	vpunpckldq	%ymm11,%ymm10,%ymm15
+	vpunpckhdq	%ymm9,%ymm8,%ymm8
+	vpunpckhdq	%ymm11,%ymm10,%ymm10
+	vpunpcklqdq	%ymm15,%ymm14,%ymm9
+	vpunpckhqdq	%ymm15,%ymm14,%ymm14
+	vpunpcklqdq	%ymm10,%ymm8,%ymm11
+	vpunpckhqdq	%ymm10,%ymm8,%ymm8
+	vpaddd	256-256(%rcx),%ymm0,%ymm0
+	vpaddd	288-256(%rcx),%ymm1,%ymm1
+	vpaddd	320-256(%rcx),%ymm2,%ymm2
+	vpaddd	352-256(%rcx),%ymm3,%ymm3
+
+	vpunpckldq	%ymm1,%ymm0,%ymm10
+	vpunpckldq	%ymm3,%ymm2,%ymm15
+	vpunpckhdq	%ymm1,%ymm0,%ymm0
+	vpunpckhdq	%ymm3,%ymm2,%ymm2
+	vpunpcklqdq	%ymm15,%ymm10,%ymm1
+	vpunpckhqdq	%ymm15,%ymm10,%ymm10
+	vpunpcklqdq	%ymm2,%ymm0,%ymm3
+	vpunpckhqdq	%ymm2,%ymm0,%ymm0
+	vperm2i128	$0x20,%ymm1,%ymm9,%ymm15
+	vperm2i128	$0x31,%ymm1,%ymm9,%ymm1
+	vperm2i128	$0x20,%ymm10,%ymm14,%ymm9
+	vperm2i128	$0x31,%ymm10,%ymm14,%ymm10
+	vperm2i128	$0x20,%ymm3,%ymm11,%ymm14
+	vperm2i128	$0x31,%ymm3,%ymm11,%ymm3
+	vperm2i128	$0x20,%ymm0,%ymm8,%ymm11
+	vperm2i128	$0x31,%ymm0,%ymm8,%ymm0
+	vmovdqa	%ymm15,0(%rsp)
+	vmovdqa	%ymm9,32(%rsp)
+	vmovdqa	64(%rsp),%ymm15
+	vmovdqa	96(%rsp),%ymm9
+
+	vpaddd	384-512(%rax),%ymm12,%ymm12
+	vpaddd	416-512(%rax),%ymm13,%ymm13
+	vpaddd	448-512(%rax),%ymm15,%ymm15
+	vpaddd	480-512(%rax),%ymm9,%ymm9
+
+	vpunpckldq	%ymm13,%ymm12,%ymm2
+	vpunpckldq	%ymm9,%ymm15,%ymm8
+	vpunpckhdq	%ymm13,%ymm12,%ymm12
+	vpunpckhdq	%ymm9,%ymm15,%ymm15
+	vpunpcklqdq	%ymm8,%ymm2,%ymm13
+	vpunpckhqdq	%ymm8,%ymm2,%ymm2
+	vpunpcklqdq	%ymm15,%ymm12,%ymm9
+	vpunpckhqdq	%ymm15,%ymm12,%ymm12
+	vpaddd	512-512(%rax),%ymm4,%ymm4
+	vpaddd	544-512(%rax),%ymm5,%ymm5
+	vpaddd	576-512(%rax),%ymm6,%ymm6
+	vpaddd	608-512(%rax),%ymm7,%ymm7
+
+	vpunpckldq	%ymm5,%ymm4,%ymm15
+	vpunpckldq	%ymm7,%ymm6,%ymm8
+	vpunpckhdq	%ymm5,%ymm4,%ymm4
+	vpunpckhdq	%ymm7,%ymm6,%ymm6
+	vpunpcklqdq	%ymm8,%ymm15,%ymm5
+	vpunpckhqdq	%ymm8,%ymm15,%ymm15
+	vpunpcklqdq	%ymm6,%ymm4,%ymm7
+	vpunpckhqdq	%ymm6,%ymm4,%ymm4
+	vperm2i128	$0x20,%ymm5,%ymm13,%ymm8
+	vperm2i128	$0x31,%ymm5,%ymm13,%ymm5
+	vperm2i128	$0x20,%ymm15,%ymm2,%ymm13
+	vperm2i128	$0x31,%ymm15,%ymm2,%ymm15
+	vperm2i128	$0x20,%ymm7,%ymm9,%ymm2
+	vperm2i128	$0x31,%ymm7,%ymm9,%ymm7
+	vperm2i128	$0x20,%ymm4,%ymm12,%ymm9
+	vperm2i128	$0x31,%ymm4,%ymm12,%ymm4
+	vmovdqa	0(%rsp),%ymm6
+	vmovdqa	32(%rsp),%ymm12
+
+	cmpq	$512,%rdx
+	jb	.Ltail8x
+
+	vpxor	0(%rsi),%ymm6,%ymm6
+	vpxor	32(%rsi),%ymm8,%ymm8
+	vpxor	64(%rsi),%ymm1,%ymm1
+	vpxor	96(%rsi),%ymm5,%ymm5
+	leaq	128(%rsi),%rsi
+	vmovdqu	%ymm6,0(%rdi)
+	vmovdqu	%ymm8,32(%rdi)
+	vmovdqu	%ymm1,64(%rdi)
+	vmovdqu	%ymm5,96(%rdi)
+	leaq	128(%rdi),%rdi
+
+	vpxor	0(%rsi),%ymm12,%ymm12
+	vpxor	32(%rsi),%ymm13,%ymm13
+	vpxor	64(%rsi),%ymm10,%ymm10
+	vpxor	96(%rsi),%ymm15,%ymm15
+	leaq	128(%rsi),%rsi
+	vmovdqu	%ymm12,0(%rdi)
+	vmovdqu	%ymm13,32(%rdi)
+	vmovdqu	%ymm10,64(%rdi)
+	vmovdqu	%ymm15,96(%rdi)
+	leaq	128(%rdi),%rdi
+
+	vpxor	0(%rsi),%ymm14,%ymm14
+	vpxor	32(%rsi),%ymm2,%ymm2
+	vpxor	64(%rsi),%ymm3,%ymm3
+	vpxor	96(%rsi),%ymm7,%ymm7
+	leaq	128(%rsi),%rsi
+	vmovdqu	%ymm14,0(%rdi)
+	vmovdqu	%ymm2,32(%rdi)
+	vmovdqu	%ymm3,64(%rdi)
+	vmovdqu	%ymm7,96(%rdi)
+	leaq	128(%rdi),%rdi
+
+	vpxor	0(%rsi),%ymm11,%ymm11
+	vpxor	32(%rsi),%ymm9,%ymm9
+	vpxor	64(%rsi),%ymm0,%ymm0
+	vpxor	96(%rsi),%ymm4,%ymm4
+	leaq	128(%rsi),%rsi
+	vmovdqu	%ymm11,0(%rdi)
+	vmovdqu	%ymm9,32(%rdi)
+	vmovdqu	%ymm0,64(%rdi)
+	vmovdqu	%ymm4,96(%rdi)
+	leaq	128(%rdi),%rdi
+
+	subq	$512,%rdx
+	jnz	.Loop_outer8x
+
+	jmp	.Ldone8x
+
+.Ltail8x:
+	cmpq	$448,%rdx
+	jae	.L448_or_more8x
+	cmpq	$384,%rdx
+	jae	.L384_or_more8x
+	cmpq	$320,%rdx
+	jae	.L320_or_more8x
+	cmpq	$256,%rdx
+	jae	.L256_or_more8x
+	cmpq	$192,%rdx
+	jae	.L192_or_more8x
+	cmpq	$128,%rdx
+	jae	.L128_or_more8x
+	cmpq	$64,%rdx
+	jae	.L64_or_more8x
+
+	xorq	%r9,%r9
+	vmovdqa	%ymm6,0(%rsp)
+	vmovdqa	%ymm8,32(%rsp)
+	jmp	.Loop_tail8x
+
+.align	32
+.L64_or_more8x:
+	vpxor	0(%rsi),%ymm6,%ymm6
+	vpxor	32(%rsi),%ymm8,%ymm8
+	vmovdqu	%ymm6,0(%rdi)
+	vmovdqu	%ymm8,32(%rdi)
+	je	.Ldone8x
+
+	leaq	64(%rsi),%rsi
+	xorq	%r9,%r9
+	vmovdqa	%ymm1,0(%rsp)
+	leaq	64(%rdi),%rdi
+	subq	$64,%rdx
+	vmovdqa	%ymm5,32(%rsp)
+	jmp	.Loop_tail8x
+
+.align	32
+.L128_or_more8x:
+	vpxor	0(%rsi),%ymm6,%ymm6
+	vpxor	32(%rsi),%ymm8,%ymm8
+	vpxor	64(%rsi),%ymm1,%ymm1
+	vpxor	96(%rsi),%ymm5,%ymm5
+	vmovdqu	%ymm6,0(%rdi)
+	vmovdqu	%ymm8,32(%rdi)
+	vmovdqu	%ymm1,64(%rdi)
+	vmovdqu	%ymm5,96(%rdi)
+	je	.Ldone8x
+
+	leaq	128(%rsi),%rsi
+	xorq	%r9,%r9
+	vmovdqa	%ymm12,0(%rsp)
+	leaq	128(%rdi),%rdi
+	subq	$128,%rdx
+	vmovdqa	%ymm13,32(%rsp)
+	jmp	.Loop_tail8x
+
+.align	32
+.L192_or_more8x:
+	vpxor	0(%rsi),%ymm6,%ymm6
+	vpxor	32(%rsi),%ymm8,%ymm8
+	vpxor	64(%rsi),%ymm1,%ymm1
+	vpxor	96(%rsi),%ymm5,%ymm5
+	vpxor	128(%rsi),%ymm12,%ymm12
+	vpxor	160(%rsi),%ymm13,%ymm13
+	vmovdqu	%ymm6,0(%rdi)
+	vmovdqu	%ymm8,32(%rdi)
+	vmovdqu	%ymm1,64(%rdi)
+	vmovdqu	%ymm5,96(%rdi)
+	vmovdqu	%ymm12,128(%rdi)
+	vmovdqu	%ymm13,160(%rdi)
+	je	.Ldone8x
+
+	leaq	192(%rsi),%rsi
+	xorq	%r9,%r9
+	vmovdqa	%ymm10,0(%rsp)
+	leaq	192(%rdi),%rdi
+	subq	$192,%rdx
+	vmovdqa	%ymm15,32(%rsp)
+	jmp	.Loop_tail8x
+
+.align	32
+.L256_or_more8x:
+	vpxor	0(%rsi),%ymm6,%ymm6
+	vpxor	32(%rsi),%ymm8,%ymm8
+	vpxor	64(%rsi),%ymm1,%ymm1
+	vpxor	96(%rsi),%ymm5,%ymm5
+	vpxor	128(%rsi),%ymm12,%ymm12
+	vpxor	160(%rsi),%ymm13,%ymm13
+	vpxor	192(%rsi),%ymm10,%ymm10
+	vpxor	224(%rsi),%ymm15,%ymm15
+	vmovdqu	%ymm6,0(%rdi)
+	vmovdqu	%ymm8,32(%rdi)
+	vmovdqu	%ymm1,64(%rdi)
+	vmovdqu	%ymm5,96(%rdi)
+	vmovdqu	%ymm12,128(%rdi)
+	vmovdqu	%ymm13,160(%rdi)
+	vmovdqu	%ymm10,192(%rdi)
+	vmovdqu	%ymm15,224(%rdi)
+	je	.Ldone8x
+
+	leaq	256(%rsi),%rsi
+	xorq	%r9,%r9
+	vmovdqa	%ymm14,0(%rsp)
+	leaq	256(%rdi),%rdi
+	subq	$256,%rdx
+	vmovdqa	%ymm2,32(%rsp)
+	jmp	.Loop_tail8x
+
+.align	32
+.L320_or_more8x:
+	vpxor	0(%rsi),%ymm6,%ymm6
+	vpxor	32(%rsi),%ymm8,%ymm8
+	vpxor	64(%rsi),%ymm1,%ymm1
+	vpxor	96(%rsi),%ymm5,%ymm5
+	vpxor	128(%rsi),%ymm12,%ymm12
+	vpxor	160(%rsi),%ymm13,%ymm13
+	vpxor	192(%rsi),%ymm10,%ymm10
+	vpxor	224(%rsi),%ymm15,%ymm15
+	vpxor	256(%rsi),%ymm14,%ymm14
+	vpxor	288(%rsi),%ymm2,%ymm2
+	vmovdqu	%ymm6,0(%rdi)
+	vmovdqu	%ymm8,32(%rdi)
+	vmovdqu	%ymm1,64(%rdi)
+	vmovdqu	%ymm5,96(%rdi)
+	vmovdqu	%ymm12,128(%rdi)
+	vmovdqu	%ymm13,160(%rdi)
+	vmovdqu	%ymm10,192(%rdi)
+	vmovdqu	%ymm15,224(%rdi)
+	vmovdqu	%ymm14,256(%rdi)
+	vmovdqu	%ymm2,288(%rdi)
+	je	.Ldone8x
+
+	leaq	320(%rsi),%rsi
+	xorq	%r9,%r9
+	vmovdqa	%ymm3,0(%rsp)
+	leaq	320(%rdi),%rdi
+	subq	$320,%rdx
+	vmovdqa	%ymm7,32(%rsp)
+	jmp	.Loop_tail8x
+
+.align	32
+.L384_or_more8x:
+	vpxor	0(%rsi),%ymm6,%ymm6
+	vpxor	32(%rsi),%ymm8,%ymm8
+	vpxor	64(%rsi),%ymm1,%ymm1
+	vpxor	96(%rsi),%ymm5,%ymm5
+	vpxor	128(%rsi),%ymm12,%ymm12
+	vpxor	160(%rsi),%ymm13,%ymm13
+	vpxor	192(%rsi),%ymm10,%ymm10
+	vpxor	224(%rsi),%ymm15,%ymm15
+	vpxor	256(%rsi),%ymm14,%ymm14
+	vpxor	288(%rsi),%ymm2,%ymm2
+	vpxor	320(%rsi),%ymm3,%ymm3
+	vpxor	352(%rsi),%ymm7,%ymm7
+	vmovdqu	%ymm6,0(%rdi)
+	vmovdqu	%ymm8,32(%rdi)
+	vmovdqu	%ymm1,64(%rdi)
+	vmovdqu	%ymm5,96(%rdi)
+	vmovdqu	%ymm12,128(%rdi)
+	vmovdqu	%ymm13,160(%rdi)
+	vmovdqu	%ymm10,192(%rdi)
+	vmovdqu	%ymm15,224(%rdi)
+	vmovdqu	%ymm14,256(%rdi)
+	vmovdqu	%ymm2,288(%rdi)
+	vmovdqu	%ymm3,320(%rdi)
+	vmovdqu	%ymm7,352(%rdi)
+	je	.Ldone8x
+
+	leaq	384(%rsi),%rsi
+	xorq	%r9,%r9
+	vmovdqa	%ymm11,0(%rsp)
+	leaq	384(%rdi),%rdi
+	subq	$384,%rdx
+	vmovdqa	%ymm9,32(%rsp)
+	jmp	.Loop_tail8x
+
+.align	32
+.L448_or_more8x:
+	vpxor	0(%rsi),%ymm6,%ymm6
+	vpxor	32(%rsi),%ymm8,%ymm8
+	vpxor	64(%rsi),%ymm1,%ymm1
+	vpxor	96(%rsi),%ymm5,%ymm5
+	vpxor	128(%rsi),%ymm12,%ymm12
+	vpxor	160(%rsi),%ymm13,%ymm13
+	vpxor	192(%rsi),%ymm10,%ymm10
+	vpxor	224(%rsi),%ymm15,%ymm15
+	vpxor	256(%rsi),%ymm14,%ymm14
+	vpxor	288(%rsi),%ymm2,%ymm2
+	vpxor	320(%rsi),%ymm3,%ymm3
+	vpxor	352(%rsi),%ymm7,%ymm7
+	vpxor	384(%rsi),%ymm11,%ymm11
+	vpxor	416(%rsi),%ymm9,%ymm9
+	vmovdqu	%ymm6,0(%rdi)
+	vmovdqu	%ymm8,32(%rdi)
+	vmovdqu	%ymm1,64(%rdi)
+	vmovdqu	%ymm5,96(%rdi)
+	vmovdqu	%ymm12,128(%rdi)
+	vmovdqu	%ymm13,160(%rdi)
+	vmovdqu	%ymm10,192(%rdi)
+	vmovdqu	%ymm15,224(%rdi)
+	vmovdqu	%ymm14,256(%rdi)
+	vmovdqu	%ymm2,288(%rdi)
+	vmovdqu	%ymm3,320(%rdi)
+	vmovdqu	%ymm7,352(%rdi)
+	vmovdqu	%ymm11,384(%rdi)
+	vmovdqu	%ymm9,416(%rdi)
+	je	.Ldone8x
+
+	leaq	448(%rsi),%rsi
+	xorq	%r9,%r9
+	vmovdqa	%ymm0,0(%rsp)
+	leaq	448(%rdi),%rdi
+	subq	$448,%rdx
+	vmovdqa	%ymm4,32(%rsp)
+
+.Loop_tail8x:
+	movzbl	(%rsi,%r9,1),%eax
+	movzbl	(%rsp,%r9,1),%ecx
+	leaq	1(%r9),%r9
+	xorl	%ecx,%eax
+	movb	%al,-1(%rdi,%r9,1)
+	decq	%rdx
+	jnz	.Loop_tail8x
+
+.Ldone8x:
+	vzeroall
+	leaq	-8(%r10),%rsp
+
+.L8x_epilogue:
+	ret
+ENDPROC(chacha20_avx2)
+#endif /* CONFIG_AS_AVX2 */
+
+#ifdef CONFIG_AS_AVX512
+.align	32
+ENTRY(chacha20_avx512)
+.Lchacha20_avx512:
+	cmpq	$0,%rdx
+	je	.Lavx512_epilogue
+	leaq	8(%rsp),%r10
+
+	cmpq	$512,%rdx
+	ja	.Lchacha20_16x
+
+	subq	$64+8,%rsp
+	andq	$-64,%rsp
+	vbroadcasti32x4	.Lsigma(%rip),%zmm0
+	vbroadcasti32x4	(%rcx),%zmm1
+	vbroadcasti32x4	16(%rcx),%zmm2
+	vbroadcasti32x4	(%r8),%zmm3
+
+	vmovdqa32	%zmm0,%zmm16
+	vmovdqa32	%zmm1,%zmm17
+	vmovdqa32	%zmm2,%zmm18
+	vpaddd	.Lzeroz(%rip),%zmm3,%zmm3
+	vmovdqa32	.Lfourz(%rip),%zmm20
+	movq	$10,%r8
+	vmovdqa32	%zmm3,%zmm19
+	jmp	.Loop_avx512
+
+.align	16
+.Loop_outer_avx512:
+	vmovdqa32	%zmm16,%zmm0
+	vmovdqa32	%zmm17,%zmm1
+	vmovdqa32	%zmm18,%zmm2
+	vpaddd	%zmm20,%zmm19,%zmm3
+	movq	$10,%r8
+	vmovdqa32	%zmm3,%zmm19
+	jmp	.Loop_avx512
+
+.align	32
+.Loop_avx512:
+	vpaddd	%zmm1,%zmm0,%zmm0
+	vpxord	%zmm0,%zmm3,%zmm3
+	vprold	$16,%zmm3,%zmm3
+	vpaddd	%zmm3,%zmm2,%zmm2
+	vpxord	%zmm2,%zmm1,%zmm1
+	vprold	$12,%zmm1,%zmm1
+	vpaddd	%zmm1,%zmm0,%zmm0
+	vpxord	%zmm0,%zmm3,%zmm3
+	vprold	$8,%zmm3,%zmm3
+	vpaddd	%zmm3,%zmm2,%zmm2
+	vpxord	%zmm2,%zmm1,%zmm1
+	vprold	$7,%zmm1,%zmm1
+	vpshufd	$78,%zmm2,%zmm2
+	vpshufd	$57,%zmm1,%zmm1
+	vpshufd	$147,%zmm3,%zmm3
+	vpaddd	%zmm1,%zmm0,%zmm0
+	vpxord	%zmm0,%zmm3,%zmm3
+	vprold	$16,%zmm3,%zmm3
+	vpaddd	%zmm3,%zmm2,%zmm2
+	vpxord	%zmm2,%zmm1,%zmm1
+	vprold	$12,%zmm1,%zmm1
+	vpaddd	%zmm1,%zmm0,%zmm0
+	vpxord	%zmm0,%zmm3,%zmm3
+	vprold	$8,%zmm3,%zmm3
+	vpaddd	%zmm3,%zmm2,%zmm2
+	vpxord	%zmm2,%zmm1,%zmm1
+	vprold	$7,%zmm1,%zmm1
+	vpshufd	$78,%zmm2,%zmm2
+	vpshufd	$147,%zmm1,%zmm1
+	vpshufd	$57,%zmm3,%zmm3
+	decq	%r8
+	jnz	.Loop_avx512
+	vpaddd	%zmm16,%zmm0,%zmm0
+	vpaddd	%zmm17,%zmm1,%zmm1
+	vpaddd	%zmm18,%zmm2,%zmm2
+	vpaddd	%zmm19,%zmm3,%zmm3
+
+	subq	$64,%rdx
+	jb	.Ltail64_avx512
+
+	vpxor	0(%rsi),%xmm0,%xmm4
+	vpxor	16(%rsi),%xmm1,%xmm5
+	vpxor	32(%rsi),%xmm2,%xmm6
+	vpxor	48(%rsi),%xmm3,%xmm7
+	leaq	64(%rsi),%rsi
+
+	vmovdqu	%xmm4,0(%rdi)
+	vmovdqu	%xmm5,16(%rdi)
+	vmovdqu	%xmm6,32(%rdi)
+	vmovdqu	%xmm7,48(%rdi)
+	leaq	64(%rdi),%rdi
+
+	jz	.Ldone_avx512
+
+	vextracti32x4	$1,%zmm0,%xmm4
+	vextracti32x4	$1,%zmm1,%xmm5
+	vextracti32x4	$1,%zmm2,%xmm6
+	vextracti32x4	$1,%zmm3,%xmm7
+
+	subq	$64,%rdx
+	jb	.Ltail_avx512
+
+	vpxor	0(%rsi),%xmm4,%xmm4
+	vpxor	16(%rsi),%xmm5,%xmm5
+	vpxor	32(%rsi),%xmm6,%xmm6
+	vpxor	48(%rsi),%xmm7,%xmm7
+	leaq	64(%rsi),%rsi
+
+	vmovdqu	%xmm4,0(%rdi)
+	vmovdqu	%xmm5,16(%rdi)
+	vmovdqu	%xmm6,32(%rdi)
+	vmovdqu	%xmm7,48(%rdi)
+	leaq	64(%rdi),%rdi
+
+	jz	.Ldone_avx512
+
+	vextracti32x4	$2,%zmm0,%xmm4
+	vextracti32x4	$2,%zmm1,%xmm5
+	vextracti32x4	$2,%zmm2,%xmm6
+	vextracti32x4	$2,%zmm3,%xmm7
+
+	subq	$64,%rdx
+	jb	.Ltail_avx512
+
+	vpxor	0(%rsi),%xmm4,%xmm4
+	vpxor	16(%rsi),%xmm5,%xmm5
+	vpxor	32(%rsi),%xmm6,%xmm6
+	vpxor	48(%rsi),%xmm7,%xmm7
+	leaq	64(%rsi),%rsi
+
+	vmovdqu	%xmm4,0(%rdi)
+	vmovdqu	%xmm5,16(%rdi)
+	vmovdqu	%xmm6,32(%rdi)
+	vmovdqu	%xmm7,48(%rdi)
+	leaq	64(%rdi),%rdi
+
+	jz	.Ldone_avx512
+
+	vextracti32x4	$3,%zmm0,%xmm4
+	vextracti32x4	$3,%zmm1,%xmm5
+	vextracti32x4	$3,%zmm2,%xmm6
+	vextracti32x4	$3,%zmm3,%xmm7
+
+	subq	$64,%rdx
+	jb	.Ltail_avx512
+
+	vpxor	0(%rsi),%xmm4,%xmm4
+	vpxor	16(%rsi),%xmm5,%xmm5
+	vpxor	32(%rsi),%xmm6,%xmm6
+	vpxor	48(%rsi),%xmm7,%xmm7
+	leaq	64(%rsi),%rsi
+
+	vmovdqu	%xmm4,0(%rdi)
+	vmovdqu	%xmm5,16(%rdi)
+	vmovdqu	%xmm6,32(%rdi)
+	vmovdqu	%xmm7,48(%rdi)
+	leaq	64(%rdi),%rdi
+
+	jnz	.Loop_outer_avx512
+
+	jmp	.Ldone_avx512
+
+.align	16
+.Ltail64_avx512:
+	vmovdqa	%xmm0,0(%rsp)
+	vmovdqa	%xmm1,16(%rsp)
+	vmovdqa	%xmm2,32(%rsp)
+	vmovdqa	%xmm3,48(%rsp)
+	addq	$64,%rdx
+	jmp	.Loop_tail_avx512
+
+.align	16
+.Ltail_avx512:
+	vmovdqa	%xmm4,0(%rsp)
+	vmovdqa	%xmm5,16(%rsp)
+	vmovdqa	%xmm6,32(%rsp)
+	vmovdqa	%xmm7,48(%rsp)
+	addq	$64,%rdx
+
+.Loop_tail_avx512:
+	movzbl	(%rsi,%r8,1),%eax
+	movzbl	(%rsp,%r8,1),%ecx
+	leaq	1(%r8),%r8
+	xorl	%ecx,%eax
+	movb	%al,-1(%rdi,%r8,1)
+	decq	%rdx
+	jnz	.Loop_tail_avx512
+
+	vmovdqa32	%zmm16,0(%rsp)
+
+.Ldone_avx512:
+	vzeroall
+	leaq	-8(%r10),%rsp
+
+.Lavx512_epilogue:
+	ret
+
+.align	32
+.Lchacha20_16x:
+	leaq	8(%rsp),%r10
+
+	subq	$64+8,%rsp
+	andq	$-64,%rsp
+	vzeroupper
+
+	leaq	.Lsigma(%rip),%r9
+	vbroadcasti32x4	(%r9),%zmm3
+	vbroadcasti32x4	(%rcx),%zmm7
+	vbroadcasti32x4	16(%rcx),%zmm11
+	vbroadcasti32x4	(%r8),%zmm15
+
+	vpshufd	$0x00,%zmm3,%zmm0
+	vpshufd	$0x55,%zmm3,%zmm1
+	vpshufd	$0xaa,%zmm3,%zmm2
+	vpshufd	$0xff,%zmm3,%zmm3
+	vmovdqa64	%zmm0,%zmm16
+	vmovdqa64	%zmm1,%zmm17
+	vmovdqa64	%zmm2,%zmm18
+	vmovdqa64	%zmm3,%zmm19
+
+	vpshufd	$0x00,%zmm7,%zmm4
+	vpshufd	$0x55,%zmm7,%zmm5
+	vpshufd	$0xaa,%zmm7,%zmm6
+	vpshufd	$0xff,%zmm7,%zmm7
+	vmovdqa64	%zmm4,%zmm20
+	vmovdqa64	%zmm5,%zmm21
+	vmovdqa64	%zmm6,%zmm22
+	vmovdqa64	%zmm7,%zmm23
+
+	vpshufd	$0x00,%zmm11,%zmm8
+	vpshufd	$0x55,%zmm11,%zmm9
+	vpshufd	$0xaa,%zmm11,%zmm10
+	vpshufd	$0xff,%zmm11,%zmm11
+	vmovdqa64	%zmm8,%zmm24
+	vmovdqa64	%zmm9,%zmm25
+	vmovdqa64	%zmm10,%zmm26
+	vmovdqa64	%zmm11,%zmm27
+
+	vpshufd	$0x00,%zmm15,%zmm12
+	vpshufd	$0x55,%zmm15,%zmm13
+	vpshufd	$0xaa,%zmm15,%zmm14
+	vpshufd	$0xff,%zmm15,%zmm15
+	vpaddd	.Lincz(%rip),%zmm12,%zmm12
+	vmovdqa64	%zmm12,%zmm28
+	vmovdqa64	%zmm13,%zmm29
+	vmovdqa64	%zmm14,%zmm30
+	vmovdqa64	%zmm15,%zmm31
+
+	movl	$10,%eax
+	jmp	.Loop16x
+
+.align	32
+.Loop_outer16x:
+	vpbroadcastd	0(%r9),%zmm0
+	vpbroadcastd	4(%r9),%zmm1
+	vpbroadcastd	8(%r9),%zmm2
+	vpbroadcastd	12(%r9),%zmm3
+	vpaddd	.Lsixteen(%rip),%zmm28,%zmm28
+	vmovdqa64	%zmm20,%zmm4
+	vmovdqa64	%zmm21,%zmm5
+	vmovdqa64	%zmm22,%zmm6
+	vmovdqa64	%zmm23,%zmm7
+	vmovdqa64	%zmm24,%zmm8
+	vmovdqa64	%zmm25,%zmm9
+	vmovdqa64	%zmm26,%zmm10
+	vmovdqa64	%zmm27,%zmm11
+	vmovdqa64	%zmm28,%zmm12
+	vmovdqa64	%zmm29,%zmm13
+	vmovdqa64	%zmm30,%zmm14
+	vmovdqa64	%zmm31,%zmm15
+
+	vmovdqa64	%zmm0,%zmm16
+	vmovdqa64	%zmm1,%zmm17
+	vmovdqa64	%zmm2,%zmm18
+	vmovdqa64	%zmm3,%zmm19
+
+	movl	$10,%eax
+	jmp	.Loop16x
+
+.align	32
+.Loop16x:
+	vpaddd	%zmm4,%zmm0,%zmm0
+	vpaddd	%zmm5,%zmm1,%zmm1
+	vpaddd	%zmm6,%zmm2,%zmm2
+	vpaddd	%zmm7,%zmm3,%zmm3
+	vpxord	%zmm0,%zmm12,%zmm12
+	vpxord	%zmm1,%zmm13,%zmm13
+	vpxord	%zmm2,%zmm14,%zmm14
+	vpxord	%zmm3,%zmm15,%zmm15
+	vprold	$16,%zmm12,%zmm12
+	vprold	$16,%zmm13,%zmm13
+	vprold	$16,%zmm14,%zmm14
+	vprold	$16,%zmm15,%zmm15
+	vpaddd	%zmm12,%zmm8,%zmm8
+	vpaddd	%zmm13,%zmm9,%zmm9
+	vpaddd	%zmm14,%zmm10,%zmm10
+	vpaddd	%zmm15,%zmm11,%zmm11
+	vpxord	%zmm8,%zmm4,%zmm4
+	vpxord	%zmm9,%zmm5,%zmm5
+	vpxord	%zmm10,%zmm6,%zmm6
+	vpxord	%zmm11,%zmm7,%zmm7
+	vprold	$12,%zmm4,%zmm4
+	vprold	$12,%zmm5,%zmm5
+	vprold	$12,%zmm6,%zmm6
+	vprold	$12,%zmm7,%zmm7
+	vpaddd	%zmm4,%zmm0,%zmm0
+	vpaddd	%zmm5,%zmm1,%zmm1
+	vpaddd	%zmm6,%zmm2,%zmm2
+	vpaddd	%zmm7,%zmm3,%zmm3
+	vpxord	%zmm0,%zmm12,%zmm12
+	vpxord	%zmm1,%zmm13,%zmm13
+	vpxord	%zmm2,%zmm14,%zmm14
+	vpxord	%zmm3,%zmm15,%zmm15
+	vprold	$8,%zmm12,%zmm12
+	vprold	$8,%zmm13,%zmm13
+	vprold	$8,%zmm14,%zmm14
+	vprold	$8,%zmm15,%zmm15
+	vpaddd	%zmm12,%zmm8,%zmm8
+	vpaddd	%zmm13,%zmm9,%zmm9
+	vpaddd	%zmm14,%zmm10,%zmm10
+	vpaddd	%zmm15,%zmm11,%zmm11
+	vpxord	%zmm8,%zmm4,%zmm4
+	vpxord	%zmm9,%zmm5,%zmm5
+	vpxord	%zmm10,%zmm6,%zmm6
+	vpxord	%zmm11,%zmm7,%zmm7
+	vprold	$7,%zmm4,%zmm4
+	vprold	$7,%zmm5,%zmm5
+	vprold	$7,%zmm6,%zmm6
+	vprold	$7,%zmm7,%zmm7
+	vpaddd	%zmm5,%zmm0,%zmm0
+	vpaddd	%zmm6,%zmm1,%zmm1
+	vpaddd	%zmm7,%zmm2,%zmm2
+	vpaddd	%zmm4,%zmm3,%zmm3
+	vpxord	%zmm0,%zmm15,%zmm15
+	vpxord	%zmm1,%zmm12,%zmm12
+	vpxord	%zmm2,%zmm13,%zmm13
+	vpxord	%zmm3,%zmm14,%zmm14
+	vprold	$16,%zmm15,%zmm15
+	vprold	$16,%zmm12,%zmm12
+	vprold	$16,%zmm13,%zmm13
+	vprold	$16,%zmm14,%zmm14
+	vpaddd	%zmm15,%zmm10,%zmm10
+	vpaddd	%zmm12,%zmm11,%zmm11
+	vpaddd	%zmm13,%zmm8,%zmm8
+	vpaddd	%zmm14,%zmm9,%zmm9
+	vpxord	%zmm10,%zmm5,%zmm5
+	vpxord	%zmm11,%zmm6,%zmm6
+	vpxord	%zmm8,%zmm7,%zmm7
+	vpxord	%zmm9,%zmm4,%zmm4
+	vprold	$12,%zmm5,%zmm5
+	vprold	$12,%zmm6,%zmm6
+	vprold	$12,%zmm7,%zmm7
+	vprold	$12,%zmm4,%zmm4
+	vpaddd	%zmm5,%zmm0,%zmm0
+	vpaddd	%zmm6,%zmm1,%zmm1
+	vpaddd	%zmm7,%zmm2,%zmm2
+	vpaddd	%zmm4,%zmm3,%zmm3
+	vpxord	%zmm0,%zmm15,%zmm15
+	vpxord	%zmm1,%zmm12,%zmm12
+	vpxord	%zmm2,%zmm13,%zmm13
+	vpxord	%zmm3,%zmm14,%zmm14
+	vprold	$8,%zmm15,%zmm15
+	vprold	$8,%zmm12,%zmm12
+	vprold	$8,%zmm13,%zmm13
+	vprold	$8,%zmm14,%zmm14
+	vpaddd	%zmm15,%zmm10,%zmm10
+	vpaddd	%zmm12,%zmm11,%zmm11
+	vpaddd	%zmm13,%zmm8,%zmm8
+	vpaddd	%zmm14,%zmm9,%zmm9
+	vpxord	%zmm10,%zmm5,%zmm5
+	vpxord	%zmm11,%zmm6,%zmm6
+	vpxord	%zmm8,%zmm7,%zmm7
+	vpxord	%zmm9,%zmm4,%zmm4
+	vprold	$7,%zmm5,%zmm5
+	vprold	$7,%zmm6,%zmm6
+	vprold	$7,%zmm7,%zmm7
+	vprold	$7,%zmm4,%zmm4
+	decl	%eax
+	jnz	.Loop16x
+
+	vpaddd	%zmm16,%zmm0,%zmm0
+	vpaddd	%zmm17,%zmm1,%zmm1
+	vpaddd	%zmm18,%zmm2,%zmm2
+	vpaddd	%zmm19,%zmm3,%zmm3
+
+	vpunpckldq	%zmm1,%zmm0,%zmm18
+	vpunpckldq	%zmm3,%zmm2,%zmm19
+	vpunpckhdq	%zmm1,%zmm0,%zmm0
+	vpunpckhdq	%zmm3,%zmm2,%zmm2
+	vpunpcklqdq	%zmm19,%zmm18,%zmm1
+	vpunpckhqdq	%zmm19,%zmm18,%zmm18
+	vpunpcklqdq	%zmm2,%zmm0,%zmm3
+	vpunpckhqdq	%zmm2,%zmm0,%zmm0
+	vpaddd	%zmm20,%zmm4,%zmm4
+	vpaddd	%zmm21,%zmm5,%zmm5
+	vpaddd	%zmm22,%zmm6,%zmm6
+	vpaddd	%zmm23,%zmm7,%zmm7
+
+	vpunpckldq	%zmm5,%zmm4,%zmm2
+	vpunpckldq	%zmm7,%zmm6,%zmm19
+	vpunpckhdq	%zmm5,%zmm4,%zmm4
+	vpunpckhdq	%zmm7,%zmm6,%zmm6
+	vpunpcklqdq	%zmm19,%zmm2,%zmm5
+	vpunpckhqdq	%zmm19,%zmm2,%zmm2
+	vpunpcklqdq	%zmm6,%zmm4,%zmm7
+	vpunpckhqdq	%zmm6,%zmm4,%zmm4
+	vshufi32x4	$0x44,%zmm5,%zmm1,%zmm19
+	vshufi32x4	$0xee,%zmm5,%zmm1,%zmm5
+	vshufi32x4	$0x44,%zmm2,%zmm18,%zmm1
+	vshufi32x4	$0xee,%zmm2,%zmm18,%zmm2
+	vshufi32x4	$0x44,%zmm7,%zmm3,%zmm18
+	vshufi32x4	$0xee,%zmm7,%zmm3,%zmm7
+	vshufi32x4	$0x44,%zmm4,%zmm0,%zmm3
+	vshufi32x4	$0xee,%zmm4,%zmm0,%zmm4
+	vpaddd	%zmm24,%zmm8,%zmm8
+	vpaddd	%zmm25,%zmm9,%zmm9
+	vpaddd	%zmm26,%zmm10,%zmm10
+	vpaddd	%zmm27,%zmm11,%zmm11
+
+	vpunpckldq	%zmm9,%zmm8,%zmm6
+	vpunpckldq	%zmm11,%zmm10,%zmm0
+	vpunpckhdq	%zmm9,%zmm8,%zmm8
+	vpunpckhdq	%zmm11,%zmm10,%zmm10
+	vpunpcklqdq	%zmm0,%zmm6,%zmm9
+	vpunpckhqdq	%zmm0,%zmm6,%zmm6
+	vpunpcklqdq	%zmm10,%zmm8,%zmm11
+	vpunpckhqdq	%zmm10,%zmm8,%zmm8
+	vpaddd	%zmm28,%zmm12,%zmm12
+	vpaddd	%zmm29,%zmm13,%zmm13
+	vpaddd	%zmm30,%zmm14,%zmm14
+	vpaddd	%zmm31,%zmm15,%zmm15
+
+	vpunpckldq	%zmm13,%zmm12,%zmm10
+	vpunpckldq	%zmm15,%zmm14,%zmm0
+	vpunpckhdq	%zmm13,%zmm12,%zmm12
+	vpunpckhdq	%zmm15,%zmm14,%zmm14
+	vpunpcklqdq	%zmm0,%zmm10,%zmm13
+	vpunpckhqdq	%zmm0,%zmm10,%zmm10
+	vpunpcklqdq	%zmm14,%zmm12,%zmm15
+	vpunpckhqdq	%zmm14,%zmm12,%zmm12
+	vshufi32x4	$0x44,%zmm13,%zmm9,%zmm0
+	vshufi32x4	$0xee,%zmm13,%zmm9,%zmm13
+	vshufi32x4	$0x44,%zmm10,%zmm6,%zmm9
+	vshufi32x4	$0xee,%zmm10,%zmm6,%zmm10
+	vshufi32x4	$0x44,%zmm15,%zmm11,%zmm6
+	vshufi32x4	$0xee,%zmm15,%zmm11,%zmm15
+	vshufi32x4	$0x44,%zmm12,%zmm8,%zmm11
+	vshufi32x4	$0xee,%zmm12,%zmm8,%zmm12
+	vshufi32x4	$0x88,%zmm0,%zmm19,%zmm16
+	vshufi32x4	$0xdd,%zmm0,%zmm19,%zmm19
+	vshufi32x4	$0x88,%zmm13,%zmm5,%zmm0
+	vshufi32x4	$0xdd,%zmm13,%zmm5,%zmm13
+	vshufi32x4	$0x88,%zmm9,%zmm1,%zmm17
+	vshufi32x4	$0xdd,%zmm9,%zmm1,%zmm1
+	vshufi32x4	$0x88,%zmm10,%zmm2,%zmm9
+	vshufi32x4	$0xdd,%zmm10,%zmm2,%zmm10
+	vshufi32x4	$0x88,%zmm6,%zmm18,%zmm14
+	vshufi32x4	$0xdd,%zmm6,%zmm18,%zmm18
+	vshufi32x4	$0x88,%zmm15,%zmm7,%zmm6
+	vshufi32x4	$0xdd,%zmm15,%zmm7,%zmm15
+	vshufi32x4	$0x88,%zmm11,%zmm3,%zmm8
+	vshufi32x4	$0xdd,%zmm11,%zmm3,%zmm3
+	vshufi32x4	$0x88,%zmm12,%zmm4,%zmm11
+	vshufi32x4	$0xdd,%zmm12,%zmm4,%zmm12
+	cmpq	$1024,%rdx
+	jb	.Ltail16x
+
+	vpxord	0(%rsi),%zmm16,%zmm16
+	vpxord	64(%rsi),%zmm17,%zmm17
+	vpxord	128(%rsi),%zmm14,%zmm14
+	vpxord	192(%rsi),%zmm8,%zmm8
+	vmovdqu32	%zmm16,0(%rdi)
+	vmovdqu32	%zmm17,64(%rdi)
+	vmovdqu32	%zmm14,128(%rdi)
+	vmovdqu32	%zmm8,192(%rdi)
+
+	vpxord	256(%rsi),%zmm19,%zmm19
+	vpxord	320(%rsi),%zmm1,%zmm1
+	vpxord	384(%rsi),%zmm18,%zmm18
+	vpxord	448(%rsi),%zmm3,%zmm3
+	vmovdqu32	%zmm19,256(%rdi)
+	vmovdqu32	%zmm1,320(%rdi)
+	vmovdqu32	%zmm18,384(%rdi)
+	vmovdqu32	%zmm3,448(%rdi)
+
+	vpxord	512(%rsi),%zmm0,%zmm0
+	vpxord	576(%rsi),%zmm9,%zmm9
+	vpxord	640(%rsi),%zmm6,%zmm6
+	vpxord	704(%rsi),%zmm11,%zmm11
+	vmovdqu32	%zmm0,512(%rdi)
+	vmovdqu32	%zmm9,576(%rdi)
+	vmovdqu32	%zmm6,640(%rdi)
+	vmovdqu32	%zmm11,704(%rdi)
+
+	vpxord	768(%rsi),%zmm13,%zmm13
+	vpxord	832(%rsi),%zmm10,%zmm10
+	vpxord	896(%rsi),%zmm15,%zmm15
+	vpxord	960(%rsi),%zmm12,%zmm12
+	leaq	1024(%rsi),%rsi
+	vmovdqu32	%zmm13,768(%rdi)
+	vmovdqu32	%zmm10,832(%rdi)
+	vmovdqu32	%zmm15,896(%rdi)
+	vmovdqu32	%zmm12,960(%rdi)
+	leaq	1024(%rdi),%rdi
+
+	subq	$1024,%rdx
+	jnz	.Loop_outer16x
+
+	jmp	.Ldone16x
+
+.align	32
+.Ltail16x:
+	xorq	%r9,%r9
+	subq	%rsi,%rdi
+	cmpq	$64,%rdx
+	jb	.Less_than_64_16x
+	vpxord	(%rsi),%zmm16,%zmm16
+	vmovdqu32	%zmm16,(%rdi,%rsi,1)
+	je	.Ldone16x
+	vmovdqa32	%zmm17,%zmm16
+	leaq	64(%rsi),%rsi
+
+	cmpq	$128,%rdx
+	jb	.Less_than_64_16x
+	vpxord	(%rsi),%zmm17,%zmm17
+	vmovdqu32	%zmm17,(%rdi,%rsi,1)
+	je	.Ldone16x
+	vmovdqa32	%zmm14,%zmm16
+	leaq	64(%rsi),%rsi
+
+	cmpq	$192,%rdx
+	jb	.Less_than_64_16x
+	vpxord	(%rsi),%zmm14,%zmm14
+	vmovdqu32	%zmm14,(%rdi,%rsi,1)
+	je	.Ldone16x
+	vmovdqa32	%zmm8,%zmm16
+	leaq	64(%rsi),%rsi
+
+	cmpq	$256,%rdx
+	jb	.Less_than_64_16x
+	vpxord	(%rsi),%zmm8,%zmm8
+	vmovdqu32	%zmm8,(%rdi,%rsi,1)
+	je	.Ldone16x
+	vmovdqa32	%zmm19,%zmm16
+	leaq	64(%rsi),%rsi
+
+	cmpq	$320,%rdx
+	jb	.Less_than_64_16x
+	vpxord	(%rsi),%zmm19,%zmm19
+	vmovdqu32	%zmm19,(%rdi,%rsi,1)
+	je	.Ldone16x
+	vmovdqa32	%zmm1,%zmm16
+	leaq	64(%rsi),%rsi
+
+	cmpq	$384,%rdx
+	jb	.Less_than_64_16x
+	vpxord	(%rsi),%zmm1,%zmm1
+	vmovdqu32	%zmm1,(%rdi,%rsi,1)
+	je	.Ldone16x
+	vmovdqa32	%zmm18,%zmm16
+	leaq	64(%rsi),%rsi
+
+	cmpq	$448,%rdx
+	jb	.Less_than_64_16x
+	vpxord	(%rsi),%zmm18,%zmm18
+	vmovdqu32	%zmm18,(%rdi,%rsi,1)
+	je	.Ldone16x
+	vmovdqa32	%zmm3,%zmm16
+	leaq	64(%rsi),%rsi
+
+	cmpq	$512,%rdx
+	jb	.Less_than_64_16x
+	vpxord	(%rsi),%zmm3,%zmm3
+	vmovdqu32	%zmm3,(%rdi,%rsi,1)
+	je	.Ldone16x
+	vmovdqa32	%zmm0,%zmm16
+	leaq	64(%rsi),%rsi
+
+	cmpq	$576,%rdx
+	jb	.Less_than_64_16x
+	vpxord	(%rsi),%zmm0,%zmm0
+	vmovdqu32	%zmm0,(%rdi,%rsi,1)
+	je	.Ldone16x
+	vmovdqa32	%zmm9,%zmm16
+	leaq	64(%rsi),%rsi
+
+	cmpq	$640,%rdx
+	jb	.Less_than_64_16x
+	vpxord	(%rsi),%zmm9,%zmm9
+	vmovdqu32	%zmm9,(%rdi,%rsi,1)
+	je	.Ldone16x
+	vmovdqa32	%zmm6,%zmm16
+	leaq	64(%rsi),%rsi
+
+	cmpq	$704,%rdx
+	jb	.Less_than_64_16x
+	vpxord	(%rsi),%zmm6,%zmm6
+	vmovdqu32	%zmm6,(%rdi,%rsi,1)
+	je	.Ldone16x
+	vmovdqa32	%zmm11,%zmm16
+	leaq	64(%rsi),%rsi
+
+	cmpq	$768,%rdx
+	jb	.Less_than_64_16x
+	vpxord	(%rsi),%zmm11,%zmm11
+	vmovdqu32	%zmm11,(%rdi,%rsi,1)
+	je	.Ldone16x
+	vmovdqa32	%zmm13,%zmm16
+	leaq	64(%rsi),%rsi
+
+	cmpq	$832,%rdx
+	jb	.Less_than_64_16x
+	vpxord	(%rsi),%zmm13,%zmm13
+	vmovdqu32	%zmm13,(%rdi,%rsi,1)
+	je	.Ldone16x
+	vmovdqa32	%zmm10,%zmm16
+	leaq	64(%rsi),%rsi
+
+	cmpq	$896,%rdx
+	jb	.Less_than_64_16x
+	vpxord	(%rsi),%zmm10,%zmm10
+	vmovdqu32	%zmm10,(%rdi,%rsi,1)
+	je	.Ldone16x
+	vmovdqa32	%zmm15,%zmm16
+	leaq	64(%rsi),%rsi
+
+	cmpq	$960,%rdx
+	jb	.Less_than_64_16x
+	vpxord	(%rsi),%zmm15,%zmm15
+	vmovdqu32	%zmm15,(%rdi,%rsi,1)
+	je	.Ldone16x
+	vmovdqa32	%zmm12,%zmm16
+	leaq	64(%rsi),%rsi
+
+.Less_than_64_16x:
+	vmovdqa32	%zmm16,0(%rsp)
+	leaq	(%rdi,%rsi,1),%rdi
+	andq	$63,%rdx
+
+.Loop_tail16x:
+	movzbl	(%rsi,%r9,1),%eax
+	movzbl	(%rsp,%r9,1),%ecx
+	leaq	1(%r9),%r9
+	xorl	%ecx,%eax
+	movb	%al,-1(%rdi,%r9,1)
+	decq	%rdx
+	jnz	.Loop_tail16x
+
+	vpxord	%zmm16,%zmm16,%zmm16
+	vmovdqa32	%zmm16,0(%rsp)
+
+.Ldone16x:
+	vzeroall
+	leaq	-8(%r10),%rsp
+
+.L16x_epilogue:
+	ret
+ENDPROC(chacha20_avx512)
+
+.align	32
+ENTRY(chacha20_avx512vl)
+	cmpq	$0,%rdx
+	je	.Lavx512vl_epilogue
+
+	leaq	8(%rsp),%r10
+
+	cmpq	$128,%rdx
+	ja	.Lchacha20_8xvl
+
+	subq	$64+8,%rsp
+	andq	$-64,%rsp
+	vbroadcasti128	.Lsigma(%rip),%ymm0
+	vbroadcasti128	(%rcx),%ymm1
+	vbroadcasti128	16(%rcx),%ymm2
+	vbroadcasti128	(%r8),%ymm3
+
+	vmovdqa32	%ymm0,%ymm16
+	vmovdqa32	%ymm1,%ymm17
+	vmovdqa32	%ymm2,%ymm18
+	vpaddd	.Lzeroz(%rip),%ymm3,%ymm3
+	vmovdqa32	.Ltwoy(%rip),%ymm20
+	movq	$10,%r8
+	vmovdqa32	%ymm3,%ymm19
+	jmp	.Loop_avx512vl
+
+.align	16
+.Loop_outer_avx512vl:
+	vmovdqa32	%ymm18,%ymm2
+	vpaddd	%ymm20,%ymm19,%ymm3
+	movq	$10,%r8
+	vmovdqa32	%ymm3,%ymm19
+	jmp	.Loop_avx512vl
+
+.align	32
+.Loop_avx512vl:
+	vpaddd	%ymm1,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm3,%ymm3
+	vprold	$16,%ymm3,%ymm3
+	vpaddd	%ymm3,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm1,%ymm1
+	vprold	$12,%ymm1,%ymm1
+	vpaddd	%ymm1,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm3,%ymm3
+	vprold	$8,%ymm3,%ymm3
+	vpaddd	%ymm3,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm1,%ymm1
+	vprold	$7,%ymm1,%ymm1
+	vpshufd	$78,%ymm2,%ymm2
+	vpshufd	$57,%ymm1,%ymm1
+	vpshufd	$147,%ymm3,%ymm3
+	vpaddd	%ymm1,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm3,%ymm3
+	vprold	$16,%ymm3,%ymm3
+	vpaddd	%ymm3,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm1,%ymm1
+	vprold	$12,%ymm1,%ymm1
+	vpaddd	%ymm1,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm3,%ymm3
+	vprold	$8,%ymm3,%ymm3
+	vpaddd	%ymm3,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm1,%ymm1
+	vprold	$7,%ymm1,%ymm1
+	vpshufd	$78,%ymm2,%ymm2
+	vpshufd	$147,%ymm1,%ymm1
+	vpshufd	$57,%ymm3,%ymm3
+	decq	%r8
+	jnz	.Loop_avx512vl
+	vpaddd	%ymm16,%ymm0,%ymm0
+	vpaddd	%ymm17,%ymm1,%ymm1
+	vpaddd	%ymm18,%ymm2,%ymm2
+	vpaddd	%ymm19,%ymm3,%ymm3
+
+	subq	$64,%rdx
+	jb	.Ltail64_avx512vl
+
+	vpxor	0(%rsi),%xmm0,%xmm4
+	vpxor	16(%rsi),%xmm1,%xmm5
+	vpxor	32(%rsi),%xmm2,%xmm6
+	vpxor	48(%rsi),%xmm3,%xmm7
+	leaq	64(%rsi),%rsi
+
+	vmovdqu	%xmm4,0(%rdi)
+	vmovdqu	%xmm5,16(%rdi)
+	vmovdqu	%xmm6,32(%rdi)
+	vmovdqu	%xmm7,48(%rdi)
+	leaq	64(%rdi),%rdi
+
+	jz	.Ldone_avx512vl
+
+	vextracti128	$1,%ymm0,%xmm4
+	vextracti128	$1,%ymm1,%xmm5
+	vextracti128	$1,%ymm2,%xmm6
+	vextracti128	$1,%ymm3,%xmm7
+
+	subq	$64,%rdx
+	jb	.Ltail_avx512vl
+
+	vpxor	0(%rsi),%xmm4,%xmm4
+	vpxor	16(%rsi),%xmm5,%xmm5
+	vpxor	32(%rsi),%xmm6,%xmm6
+	vpxor	48(%rsi),%xmm7,%xmm7
+	leaq	64(%rsi),%rsi
+
+	vmovdqu	%xmm4,0(%rdi)
+	vmovdqu	%xmm5,16(%rdi)
+	vmovdqu	%xmm6,32(%rdi)
+	vmovdqu	%xmm7,48(%rdi)
+	leaq	64(%rdi),%rdi
+
+	vmovdqa32	%ymm16,%ymm0
+	vmovdqa32	%ymm17,%ymm1
+	jnz	.Loop_outer_avx512vl
+
+	jmp	.Ldone_avx512vl
+
+.align	16
+.Ltail64_avx512vl:
+	vmovdqa	%xmm0,0(%rsp)
+	vmovdqa	%xmm1,16(%rsp)
+	vmovdqa	%xmm2,32(%rsp)
+	vmovdqa	%xmm3,48(%rsp)
+	addq	$64,%rdx
+	jmp	.Loop_tail_avx512vl
+
+.align	16
+.Ltail_avx512vl:
+	vmovdqa	%xmm4,0(%rsp)
+	vmovdqa	%xmm5,16(%rsp)
+	vmovdqa	%xmm6,32(%rsp)
+	vmovdqa	%xmm7,48(%rsp)
+	addq	$64,%rdx
+
+.Loop_tail_avx512vl:
+	movzbl	(%rsi,%r8,1),%eax
+	movzbl	(%rsp,%r8,1),%ecx
+	leaq	1(%r8),%r8
+	xorl	%ecx,%eax
+	movb	%al,-1(%rdi,%r8,1)
+	decq	%rdx
+	jnz	.Loop_tail_avx512vl
+
+	vmovdqa32	%ymm16,0(%rsp)
+	vmovdqa32	%ymm16,32(%rsp)
+
+.Ldone_avx512vl:
+	vzeroall
+	leaq	-8(%r10),%rsp
+.Lavx512vl_epilogue:
+	ret
+
+.align	32
+.Lchacha20_8xvl:
+	leaq	8(%rsp),%r10
+	subq	$64+8,%rsp
+	andq	$-64,%rsp
+	vzeroupper
+
+	leaq	.Lsigma(%rip),%r9
+	vbroadcasti128	(%r9),%ymm3
+	vbroadcasti128	(%rcx),%ymm7
+	vbroadcasti128	16(%rcx),%ymm11
+	vbroadcasti128	(%r8),%ymm15
+
+	vpshufd	$0x00,%ymm3,%ymm0
+	vpshufd	$0x55,%ymm3,%ymm1
+	vpshufd	$0xaa,%ymm3,%ymm2
+	vpshufd	$0xff,%ymm3,%ymm3
+	vmovdqa64	%ymm0,%ymm16
+	vmovdqa64	%ymm1,%ymm17
+	vmovdqa64	%ymm2,%ymm18
+	vmovdqa64	%ymm3,%ymm19
+
+	vpshufd	$0x00,%ymm7,%ymm4
+	vpshufd	$0x55,%ymm7,%ymm5
+	vpshufd	$0xaa,%ymm7,%ymm6
+	vpshufd	$0xff,%ymm7,%ymm7
+	vmovdqa64	%ymm4,%ymm20
+	vmovdqa64	%ymm5,%ymm21
+	vmovdqa64	%ymm6,%ymm22
+	vmovdqa64	%ymm7,%ymm23
+
+	vpshufd	$0x00,%ymm11,%ymm8
+	vpshufd	$0x55,%ymm11,%ymm9
+	vpshufd	$0xaa,%ymm11,%ymm10
+	vpshufd	$0xff,%ymm11,%ymm11
+	vmovdqa64	%ymm8,%ymm24
+	vmovdqa64	%ymm9,%ymm25
+	vmovdqa64	%ymm10,%ymm26
+	vmovdqa64	%ymm11,%ymm27
+
+	vpshufd	$0x00,%ymm15,%ymm12
+	vpshufd	$0x55,%ymm15,%ymm13
+	vpshufd	$0xaa,%ymm15,%ymm14
+	vpshufd	$0xff,%ymm15,%ymm15
+	vpaddd	.Lincy(%rip),%ymm12,%ymm12
+	vmovdqa64	%ymm12,%ymm28
+	vmovdqa64	%ymm13,%ymm29
+	vmovdqa64	%ymm14,%ymm30
+	vmovdqa64	%ymm15,%ymm31
+
+	movl	$10,%eax
+	jmp	.Loop8xvl
+
+.align	32
+.Loop_outer8xvl:
+
+
+	vpbroadcastd	8(%r9),%ymm2
+	vpbroadcastd	12(%r9),%ymm3
+	vpaddd	.Leight(%rip),%ymm28,%ymm28
+	vmovdqa64	%ymm20,%ymm4
+	vmovdqa64	%ymm21,%ymm5
+	vmovdqa64	%ymm22,%ymm6
+	vmovdqa64	%ymm23,%ymm7
+	vmovdqa64	%ymm24,%ymm8
+	vmovdqa64	%ymm25,%ymm9
+	vmovdqa64	%ymm26,%ymm10
+	vmovdqa64	%ymm27,%ymm11
+	vmovdqa64	%ymm28,%ymm12
+	vmovdqa64	%ymm29,%ymm13
+	vmovdqa64	%ymm30,%ymm14
+	vmovdqa64	%ymm31,%ymm15
+
+	vmovdqa64	%ymm0,%ymm16
+	vmovdqa64	%ymm1,%ymm17
+	vmovdqa64	%ymm2,%ymm18
+	vmovdqa64	%ymm3,%ymm19
+
+	movl	$10,%eax
+	jmp	.Loop8xvl
+
+.align	32
+.Loop8xvl:
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm3,%ymm15,%ymm15
+	vprold	$16,%ymm12,%ymm12
+	vprold	$16,%ymm13,%ymm13
+	vprold	$16,%ymm14,%ymm14
+	vprold	$16,%ymm15,%ymm15
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm11,%ymm7,%ymm7
+	vprold	$12,%ymm4,%ymm4
+	vprold	$12,%ymm5,%ymm5
+	vprold	$12,%ymm6,%ymm6
+	vprold	$12,%ymm7,%ymm7
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm3,%ymm15,%ymm15
+	vprold	$8,%ymm12,%ymm12
+	vprold	$8,%ymm13,%ymm13
+	vprold	$8,%ymm14,%ymm14
+	vprold	$8,%ymm15,%ymm15
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm11,%ymm7,%ymm7
+	vprold	$7,%ymm4,%ymm4
+	vprold	$7,%ymm5,%ymm5
+	vprold	$7,%ymm6,%ymm6
+	vprold	$7,%ymm7,%ymm7
+	vpaddd	%ymm5,%ymm0,%ymm0
+	vpaddd	%ymm6,%ymm1,%ymm1
+	vpaddd	%ymm7,%ymm2,%ymm2
+	vpaddd	%ymm4,%ymm3,%ymm3
+	vpxor	%ymm0,%ymm15,%ymm15
+	vpxor	%ymm1,%ymm12,%ymm12
+	vpxor	%ymm2,%ymm13,%ymm13
+	vpxor	%ymm3,%ymm14,%ymm14
+	vprold	$16,%ymm15,%ymm15
+	vprold	$16,%ymm12,%ymm12
+	vprold	$16,%ymm13,%ymm13
+	vprold	$16,%ymm14,%ymm14
+	vpaddd	%ymm15,%ymm10,%ymm10
+	vpaddd	%ymm12,%ymm11,%ymm11
+	vpaddd	%ymm13,%ymm8,%ymm8
+	vpaddd	%ymm14,%ymm9,%ymm9
+	vpxor	%ymm10,%ymm5,%ymm5
+	vpxor	%ymm11,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpxor	%ymm9,%ymm4,%ymm4
+	vprold	$12,%ymm5,%ymm5
+	vprold	$12,%ymm6,%ymm6
+	vprold	$12,%ymm7,%ymm7
+	vprold	$12,%ymm4,%ymm4
+	vpaddd	%ymm5,%ymm0,%ymm0
+	vpaddd	%ymm6,%ymm1,%ymm1
+	vpaddd	%ymm7,%ymm2,%ymm2
+	vpaddd	%ymm4,%ymm3,%ymm3
+	vpxor	%ymm0,%ymm15,%ymm15
+	vpxor	%ymm1,%ymm12,%ymm12
+	vpxor	%ymm2,%ymm13,%ymm13
+	vpxor	%ymm3,%ymm14,%ymm14
+	vprold	$8,%ymm15,%ymm15
+	vprold	$8,%ymm12,%ymm12
+	vprold	$8,%ymm13,%ymm13
+	vprold	$8,%ymm14,%ymm14
+	vpaddd	%ymm15,%ymm10,%ymm10
+	vpaddd	%ymm12,%ymm11,%ymm11
+	vpaddd	%ymm13,%ymm8,%ymm8
+	vpaddd	%ymm14,%ymm9,%ymm9
+	vpxor	%ymm10,%ymm5,%ymm5
+	vpxor	%ymm11,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpxor	%ymm9,%ymm4,%ymm4
+	vprold	$7,%ymm5,%ymm5
+	vprold	$7,%ymm6,%ymm6
+	vprold	$7,%ymm7,%ymm7
+	vprold	$7,%ymm4,%ymm4
+	decl	%eax
+	jnz	.Loop8xvl
+
+	vpaddd	%ymm16,%ymm0,%ymm0
+	vpaddd	%ymm17,%ymm1,%ymm1
+	vpaddd	%ymm18,%ymm2,%ymm2
+	vpaddd	%ymm19,%ymm3,%ymm3
+
+	vpunpckldq	%ymm1,%ymm0,%ymm18
+	vpunpckldq	%ymm3,%ymm2,%ymm19
+	vpunpckhdq	%ymm1,%ymm0,%ymm0
+	vpunpckhdq	%ymm3,%ymm2,%ymm2
+	vpunpcklqdq	%ymm19,%ymm18,%ymm1
+	vpunpckhqdq	%ymm19,%ymm18,%ymm18
+	vpunpcklqdq	%ymm2,%ymm0,%ymm3
+	vpunpckhqdq	%ymm2,%ymm0,%ymm0
+	vpaddd	%ymm20,%ymm4,%ymm4
+	vpaddd	%ymm21,%ymm5,%ymm5
+	vpaddd	%ymm22,%ymm6,%ymm6
+	vpaddd	%ymm23,%ymm7,%ymm7
+
+	vpunpckldq	%ymm5,%ymm4,%ymm2
+	vpunpckldq	%ymm7,%ymm6,%ymm19
+	vpunpckhdq	%ymm5,%ymm4,%ymm4
+	vpunpckhdq	%ymm7,%ymm6,%ymm6
+	vpunpcklqdq	%ymm19,%ymm2,%ymm5
+	vpunpckhqdq	%ymm19,%ymm2,%ymm2
+	vpunpcklqdq	%ymm6,%ymm4,%ymm7
+	vpunpckhqdq	%ymm6,%ymm4,%ymm4
+	vshufi32x4	$0,%ymm5,%ymm1,%ymm19
+	vshufi32x4	$3,%ymm5,%ymm1,%ymm5
+	vshufi32x4	$0,%ymm2,%ymm18,%ymm1
+	vshufi32x4	$3,%ymm2,%ymm18,%ymm2
+	vshufi32x4	$0,%ymm7,%ymm3,%ymm18
+	vshufi32x4	$3,%ymm7,%ymm3,%ymm7
+	vshufi32x4	$0,%ymm4,%ymm0,%ymm3
+	vshufi32x4	$3,%ymm4,%ymm0,%ymm4
+	vpaddd	%ymm24,%ymm8,%ymm8
+	vpaddd	%ymm25,%ymm9,%ymm9
+	vpaddd	%ymm26,%ymm10,%ymm10
+	vpaddd	%ymm27,%ymm11,%ymm11
+
+	vpunpckldq	%ymm9,%ymm8,%ymm6
+	vpunpckldq	%ymm11,%ymm10,%ymm0
+	vpunpckhdq	%ymm9,%ymm8,%ymm8
+	vpunpckhdq	%ymm11,%ymm10,%ymm10
+	vpunpcklqdq	%ymm0,%ymm6,%ymm9
+	vpunpckhqdq	%ymm0,%ymm6,%ymm6
+	vpunpcklqdq	%ymm10,%ymm8,%ymm11
+	vpunpckhqdq	%ymm10,%ymm8,%ymm8
+	vpaddd	%ymm28,%ymm12,%ymm12
+	vpaddd	%ymm29,%ymm13,%ymm13
+	vpaddd	%ymm30,%ymm14,%ymm14
+	vpaddd	%ymm31,%ymm15,%ymm15
+
+	vpunpckldq	%ymm13,%ymm12,%ymm10
+	vpunpckldq	%ymm15,%ymm14,%ymm0
+	vpunpckhdq	%ymm13,%ymm12,%ymm12
+	vpunpckhdq	%ymm15,%ymm14,%ymm14
+	vpunpcklqdq	%ymm0,%ymm10,%ymm13
+	vpunpckhqdq	%ymm0,%ymm10,%ymm10
+	vpunpcklqdq	%ymm14,%ymm12,%ymm15
+	vpunpckhqdq	%ymm14,%ymm12,%ymm12
+	vperm2i128	$0x20,%ymm13,%ymm9,%ymm0
+	vperm2i128	$0x31,%ymm13,%ymm9,%ymm13
+	vperm2i128	$0x20,%ymm10,%ymm6,%ymm9
+	vperm2i128	$0x31,%ymm10,%ymm6,%ymm10
+	vperm2i128	$0x20,%ymm15,%ymm11,%ymm6
+	vperm2i128	$0x31,%ymm15,%ymm11,%ymm15
+	vperm2i128	$0x20,%ymm12,%ymm8,%ymm11
+	vperm2i128	$0x31,%ymm12,%ymm8,%ymm12
+	cmpq	$512,%rdx
+	jb	.Ltail8xvl
+
+	movl	$0x80,%eax
+	vpxord	0(%rsi),%ymm19,%ymm19
+	vpxor	32(%rsi),%ymm0,%ymm0
+	vpxor	64(%rsi),%ymm5,%ymm5
+	vpxor	96(%rsi),%ymm13,%ymm13
+	leaq	(%rsi,%rax,1),%rsi
+	vmovdqu32	%ymm19,0(%rdi)
+	vmovdqu	%ymm0,32(%rdi)
+	vmovdqu	%ymm5,64(%rdi)
+	vmovdqu	%ymm13,96(%rdi)
+	leaq	(%rdi,%rax,1),%rdi
+
+	vpxor	0(%rsi),%ymm1,%ymm1
+	vpxor	32(%rsi),%ymm9,%ymm9
+	vpxor	64(%rsi),%ymm2,%ymm2
+	vpxor	96(%rsi),%ymm10,%ymm10
+	leaq	(%rsi,%rax,1),%rsi
+	vmovdqu	%ymm1,0(%rdi)
+	vmovdqu	%ymm9,32(%rdi)
+	vmovdqu	%ymm2,64(%rdi)
+	vmovdqu	%ymm10,96(%rdi)
+	leaq	(%rdi,%rax,1),%rdi
+
+	vpxord	0(%rsi),%ymm18,%ymm18
+	vpxor	32(%rsi),%ymm6,%ymm6
+	vpxor	64(%rsi),%ymm7,%ymm7
+	vpxor	96(%rsi),%ymm15,%ymm15
+	leaq	(%rsi,%rax,1),%rsi
+	vmovdqu32	%ymm18,0(%rdi)
+	vmovdqu	%ymm6,32(%rdi)
+	vmovdqu	%ymm7,64(%rdi)
+	vmovdqu	%ymm15,96(%rdi)
+	leaq	(%rdi,%rax,1),%rdi
+
+	vpxor	0(%rsi),%ymm3,%ymm3
+	vpxor	32(%rsi),%ymm11,%ymm11
+	vpxor	64(%rsi),%ymm4,%ymm4
+	vpxor	96(%rsi),%ymm12,%ymm12
+	leaq	(%rsi,%rax,1),%rsi
+	vmovdqu	%ymm3,0(%rdi)
+	vmovdqu	%ymm11,32(%rdi)
+	vmovdqu	%ymm4,64(%rdi)
+	vmovdqu	%ymm12,96(%rdi)
+	leaq	(%rdi,%rax,1),%rdi
+
+	vpbroadcastd	0(%r9),%ymm0
+	vpbroadcastd	4(%r9),%ymm1
+
+	subq	$512,%rdx
+	jnz	.Loop_outer8xvl
+
+	jmp	.Ldone8xvl
+
+.align	32
+.Ltail8xvl:
+	vmovdqa64	%ymm19,%ymm8
+	xorq	%r9,%r9
+	subq	%rsi,%rdi
+	cmpq	$64,%rdx
+	jb	.Less_than_64_8xvl
+	vpxor	0(%rsi),%ymm8,%ymm8
+	vpxor	32(%rsi),%ymm0,%ymm0
+	vmovdqu	%ymm8,0(%rdi,%rsi,1)
+	vmovdqu	%ymm0,32(%rdi,%rsi,1)
+	je	.Ldone8xvl
+	vmovdqa	%ymm5,%ymm8
+	vmovdqa	%ymm13,%ymm0
+	leaq	64(%rsi),%rsi
+
+	cmpq	$128,%rdx
+	jb	.Less_than_64_8xvl
+	vpxor	0(%rsi),%ymm5,%ymm5
+	vpxor	32(%rsi),%ymm13,%ymm13
+	vmovdqu	%ymm5,0(%rdi,%rsi,1)
+	vmovdqu	%ymm13,32(%rdi,%rsi,1)
+	je	.Ldone8xvl
+	vmovdqa	%ymm1,%ymm8
+	vmovdqa	%ymm9,%ymm0
+	leaq	64(%rsi),%rsi
+
+	cmpq	$192,%rdx
+	jb	.Less_than_64_8xvl
+	vpxor	0(%rsi),%ymm1,%ymm1
+	vpxor	32(%rsi),%ymm9,%ymm9
+	vmovdqu	%ymm1,0(%rdi,%rsi,1)
+	vmovdqu	%ymm9,32(%rdi,%rsi,1)
+	je	.Ldone8xvl
+	vmovdqa	%ymm2,%ymm8
+	vmovdqa	%ymm10,%ymm0
+	leaq	64(%rsi),%rsi
+
+	cmpq	$256,%rdx
+	jb	.Less_than_64_8xvl
+	vpxor	0(%rsi),%ymm2,%ymm2
+	vpxor	32(%rsi),%ymm10,%ymm10
+	vmovdqu	%ymm2,0(%rdi,%rsi,1)
+	vmovdqu	%ymm10,32(%rdi,%rsi,1)
+	je	.Ldone8xvl
+	vmovdqa32	%ymm18,%ymm8
+	vmovdqa	%ymm6,%ymm0
+	leaq	64(%rsi),%rsi
+
+	cmpq	$320,%rdx
+	jb	.Less_than_64_8xvl
+	vpxord	0(%rsi),%ymm18,%ymm18
+	vpxor	32(%rsi),%ymm6,%ymm6
+	vmovdqu32	%ymm18,0(%rdi,%rsi,1)
+	vmovdqu	%ymm6,32(%rdi,%rsi,1)
+	je	.Ldone8xvl
+	vmovdqa	%ymm7,%ymm8
+	vmovdqa	%ymm15,%ymm0
+	leaq	64(%rsi),%rsi
+
+	cmpq	$384,%rdx
+	jb	.Less_than_64_8xvl
+	vpxor	0(%rsi),%ymm7,%ymm7
+	vpxor	32(%rsi),%ymm15,%ymm15
+	vmovdqu	%ymm7,0(%rdi,%rsi,1)
+	vmovdqu	%ymm15,32(%rdi,%rsi,1)
+	je	.Ldone8xvl
+	vmovdqa	%ymm3,%ymm8
+	vmovdqa	%ymm11,%ymm0
+	leaq	64(%rsi),%rsi
+
+	cmpq	$448,%rdx
+	jb	.Less_than_64_8xvl
+	vpxor	0(%rsi),%ymm3,%ymm3
+	vpxor	32(%rsi),%ymm11,%ymm11
+	vmovdqu	%ymm3,0(%rdi,%rsi,1)
+	vmovdqu	%ymm11,32(%rdi,%rsi,1)
+	je	.Ldone8xvl
+	vmovdqa	%ymm4,%ymm8
+	vmovdqa	%ymm12,%ymm0
+	leaq	64(%rsi),%rsi
+
+.Less_than_64_8xvl:
+	vmovdqa	%ymm8,0(%rsp)
+	vmovdqa	%ymm0,32(%rsp)
+	leaq	(%rdi,%rsi,1),%rdi
+	andq	$63,%rdx
+
+.Loop_tail8xvl:
+	movzbl	(%rsi,%r9,1),%eax
+	movzbl	(%rsp,%r9,1),%ecx
+	leaq	1(%r9),%r9
+	xorl	%ecx,%eax
+	movb	%al,-1(%rdi,%r9,1)
+	decq	%rdx
+	jnz	.Loop_tail8xvl
+
+	vpxor	%ymm8,%ymm8,%ymm8
+	vmovdqa	%ymm8,0(%rsp)
+	vmovdqa	%ymm8,32(%rsp)
+
+.Ldone8xvl:
+	vzeroall
+	leaq	-8(%r10),%rsp
+.L8xvl_epilogue:
+	ret
+ENDPROC(chacha20_avx512vl)
+
+#endif /* CONFIG_AS_AVX512 */
-- 
2.18.0

^ permalink raw reply related

* [PATCH v2 04/17] zinc: ChaCha20 ARM and ARM64 implementations
From: Jason A. Donenfeld @ 2018-08-24 21:38 UTC (permalink / raw)
  To: linux-kernel, netdev, davem
  Cc: Jason A. Donenfeld, Andy Lutomirski, Greg KH, Samuel Neves,
	Jean-Philippe Aumasson, Andy Polyakov, Russell King,
	linux-arm-kernel, linux-crypto
In-Reply-To: <20180824213849.23647-1-Jason@zx2c4.com>

These NEON and non-NEON implementations come from Andy Polyakov's
implementation, with several modifications for being friendly to kernel
space.

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Samuel Neves <sneves@dei.uc.pt>
Cc: Jean-Philippe Aumasson <jeanphilippe.aumasson@gmail.com>
Cc: Andy Polyakov <appro@openssl.org>
Cc: Russell King <linux@armlinux.org.uk>
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-crypto@vger.kernel.org
---
 lib/zinc/Makefile                     |    8 +
 lib/zinc/chacha20/chacha20-arm-glue.h |   42 +
 lib/zinc/chacha20/chacha20-arm.S      | 1471 +++++++++++++++++++
 lib/zinc/chacha20/chacha20-arm64.S    | 1940 +++++++++++++++++++++++++
 4 files changed, 3461 insertions(+)
 create mode 100644 lib/zinc/chacha20/chacha20-arm-glue.h
 create mode 100644 lib/zinc/chacha20/chacha20-arm.S
 create mode 100644 lib/zinc/chacha20/chacha20-arm64.S

diff --git a/lib/zinc/Makefile b/lib/zinc/Makefile
index 6ec77feb4ab7..87be627f3354 100644
--- a/lib/zinc/Makefile
+++ b/lib/zinc/Makefile
@@ -4,6 +4,14 @@ ccflags-y += -D'pr_fmt(fmt)=KBUILD_MODNAME ": " fmt'
 
 ifeq ($(CONFIG_ZINC_CHACHA20),y)
 zinc-y += chacha20/chacha20.o
+ifeq ($(CONFIG_ARM),y)
+zinc-y += chacha20/chacha20-arm.o
+CFLAGS_chacha20.o += -include $(srctree)/$(src)/chacha20/chacha20-arm-glue.h
+endif
+ifeq ($(CONFIG_ARM64),y)
+zinc-y += chacha20/chacha20-arm64.o
+CFLAGS_chacha20.o += -include $(srctree)/$(src)/chacha20/chacha20-arm-glue.h
+endif
 endif
 
 zinc-y += main.o
diff --git a/lib/zinc/chacha20/chacha20-arm-glue.h b/lib/zinc/chacha20/chacha20-arm-glue.h
new file mode 100644
index 000000000000..1b072eb1e50c
--- /dev/null
+++ b/lib/zinc/chacha20/chacha20-arm-glue.h
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <zinc/chacha20.h>
+
+asmlinkage void chacha20_arm(u8 *out, const u8 *in, const size_t len, const u32 key[8], const u32 counter[4]);
+#if IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (!defined(__LINUX_ARM_ARCH__) || __LINUX_ARM_ARCH__ >= 7)
+#define ARM_USE_NEON
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+asmlinkage void chacha20_neon(u8 *out, const u8 *in, const size_t len, const u32 key[8], const u32 counter[4]);
+#endif
+
+static bool chacha20_use_neon __ro_after_init;
+
+void __init chacha20_fpu_init(void)
+{
+#if defined(CONFIG_ARM64)
+	chacha20_use_neon = elf_hwcap & HWCAP_ASIMD;
+#elif defined(CONFIG_ARM)
+	chacha20_use_neon = elf_hwcap & HWCAP_NEON;
+#endif
+}
+
+static inline bool chacha20_arch(u8 *dst, const u8 *src, const size_t len, const u32 key[8], const u32 counter[4], simd_context_t simd_context)
+{
+	if (simd_context != HAVE_FULL_SIMD
+#if defined(ARM_USE_NEON)
+		|| !chacha20_use_neon
+#endif
+	)
+		chacha20_arm(dst, src, len, key, counter);
+	else
+		chacha20_neon(dst, src, len, key, counter);
+	return true;
+}
+
+static inline bool hchacha20_arch(u8 *derived_key, const u8 *nonce, const u8 *key, simd_context_t simd_context) { return false; }
+
+#define HAVE_CHACHA20_ARCH_IMPLEMENTATION
diff --git a/lib/zinc/chacha20/chacha20-arm.S b/lib/zinc/chacha20/chacha20-arm.S
new file mode 100644
index 000000000000..601b4e34283d
--- /dev/null
+++ b/lib/zinc/chacha20/chacha20-arm.S
@@ -0,0 +1,1471 @@
+/* SPDX-License-Identifier: OpenSSL OR (BSD-3-Clause OR GPL-2.0)
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ * Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
+ */
+
+#include <linux/linkage.h>
+
+.text
+#if defined(__thumb2__) || defined(__clang__)
+.syntax	unified
+#endif
+#if defined(__thumb2__)
+.thumb
+#else
+.code	32
+#endif
+
+#if defined(__thumb2__) || defined(__clang__)
+#define ldrhsb	ldrbhs
+#endif
+
+.align	5
+.Lsigma:
+.long	0x61707865,0x3320646e,0x79622d32,0x6b206574	@ endian-neutral
+.Lone:
+.long	1,0,0,0
+.word -1
+
+#if __LINUX_ARM_ARCH__ >= 7 && IS_ENABLED(CONFIG_KERNEL_MODE_NEON)
+.arch	armv7-a
+.fpu	neon
+
+.align	5
+ENTRY(chacha20_neon)
+	ldr		r12,[sp,#0]		@ pull pointer to counter and nonce
+	stmdb		sp!,{r0-r2,r4-r11,lr}
+	cmp		r2,#0			@ len==0?
+#ifdef	__thumb2__
+	itt		eq
+#endif
+	addeq		sp,sp,#4*3
+	beq		.Lno_data_neon
+	cmp		r2,#192			@ test len
+	bls		.Lshort
+.Lchacha20_neon_begin:
+	adr		r14,.Lsigma
+	vstmdb		sp!,{d8-d15}		@ ABI spec says so
+	stmdb		sp!,{r0-r3}
+
+	vld1.32		{q1-q2},[r3]		@ load key
+	ldmia		r3,{r4-r11}		@ load key
+
+	sub		sp,sp,#4*(16+16)
+	vld1.32		{q3},[r12]		@ load counter and nonce
+	add		r12,sp,#4*8
+	ldmia		r14,{r0-r3}		@ load sigma
+	vld1.32		{q0},[r14]!		@ load sigma
+	vld1.32		{q12},[r14]		@ one
+	vst1.32		{q2-q3},[r12]		@ copy 1/2key|counter|nonce
+	vst1.32		{q0-q1},[sp]		@ copy sigma|1/2key
+
+	str		r10,[sp,#4*(16+10)]	@ off-load "rx"
+	str		r11,[sp,#4*(16+11)]	@ off-load "rx"
+	vshl.i32	d26,d24,#1	@ two
+	vstr		d24,[sp,#4*(16+0)]
+	vshl.i32	d28,d24,#2	@ four
+	vstr		d26,[sp,#4*(16+2)]
+	vmov		q4,q0
+	vstr		d28,[sp,#4*(16+4)]
+	vmov		q8,q0
+	vmov		q5,q1
+	vmov		q9,q1
+	b		.Loop_neon_enter
+
+.align	4
+.Loop_neon_outer:
+	ldmia		sp,{r0-r9}		@ load key material
+	cmp		r11,#64*2		@ if len<=64*2
+	bls		.Lbreak_neon		@ switch to integer-only
+	vmov		q4,q0
+	str		r11,[sp,#4*(32+2)]	@ save len
+	vmov		q8,q0
+	str		r12,  [sp,#4*(32+1)]	@ save inp
+	vmov		q5,q1
+	str		r14,  [sp,#4*(32+0)]	@ save out
+	vmov		q9,q1
+.Loop_neon_enter:
+	ldr		r11, [sp,#4*(15)]
+	vadd.i32	q7,q3,q12		@ counter+1
+	ldr		r12,[sp,#4*(12)]	@ modulo-scheduled load
+	vmov		q6,q2
+	ldr		r10, [sp,#4*(13)]
+	vmov		q10,q2
+	ldr		r14,[sp,#4*(14)]
+	vadd.i32	q11,q7,q12		@ counter+2
+	str		r11, [sp,#4*(16+15)]
+	mov		r11,#10
+	add		r12,r12,#3	@ counter+3
+	b		.Loop_neon
+
+.align	4
+.Loop_neon:
+	subs		r11,r11,#1
+	vadd.i32	q0,q0,q1
+	add	r0,r0,r4
+	vadd.i32	q4,q4,q5
+	mov	r12,r12,ror#16
+	vadd.i32	q8,q8,q9
+	add	r1,r1,r5
+	veor	q3,q3,q0
+	mov	r10,r10,ror#16
+	veor	q7,q7,q4
+	eor	r12,r12,r0,ror#16
+	veor	q11,q11,q8
+	eor	r10,r10,r1,ror#16
+	vrev32.16	q3,q3
+	add	r8,r8,r12
+	vrev32.16	q7,q7
+	mov	r4,r4,ror#20
+	vrev32.16	q11,q11
+	add	r9,r9,r10
+	vadd.i32	q2,q2,q3
+	mov	r5,r5,ror#20
+	vadd.i32	q6,q6,q7
+	eor	r4,r4,r8,ror#20
+	vadd.i32	q10,q10,q11
+	eor	r5,r5,r9,ror#20
+	veor	q12,q1,q2
+	add	r0,r0,r4
+	veor	q13,q5,q6
+	mov	r12,r12,ror#24
+	veor	q14,q9,q10
+	add	r1,r1,r5
+	vshr.u32	q1,q12,#20
+	mov	r10,r10,ror#24
+	vshr.u32	q5,q13,#20
+	eor	r12,r12,r0,ror#24
+	vshr.u32	q9,q14,#20
+	eor	r10,r10,r1,ror#24
+	vsli.32	q1,q12,#12
+	add	r8,r8,r12
+	vsli.32	q5,q13,#12
+	mov	r4,r4,ror#25
+	vsli.32	q9,q14,#12
+	add	r9,r9,r10
+	vadd.i32	q0,q0,q1
+	mov	r5,r5,ror#25
+	vadd.i32	q4,q4,q5
+	str	r10,[sp,#4*(16+13)]
+	vadd.i32	q8,q8,q9
+	ldr	r10,[sp,#4*(16+15)]
+	veor	q12,q3,q0
+	eor	r4,r4,r8,ror#25
+	veor	q13,q7,q4
+	eor	r5,r5,r9,ror#25
+	veor	q14,q11,q8
+	str	r8,[sp,#4*(16+8)]
+	vshr.u32	q3,q12,#24
+	ldr	r8,[sp,#4*(16+10)]
+	vshr.u32	q7,q13,#24
+	add	r2,r2,r6
+	vshr.u32	q11,q14,#24
+	mov	r14,r14,ror#16
+	vsli.32	q3,q12,#8
+	str	r9,[sp,#4*(16+9)]
+	vsli.32	q7,q13,#8
+	ldr	r9,[sp,#4*(16+11)]
+	vsli.32	q11,q14,#8
+	add	r3,r3,r7
+	vadd.i32	q2,q2,q3
+	mov	r10,r10,ror#16
+	vadd.i32	q6,q6,q7
+	eor	r14,r14,r2,ror#16
+	vadd.i32	q10,q10,q11
+	eor	r10,r10,r3,ror#16
+	veor	q12,q1,q2
+	add	r8,r8,r14
+	veor	q13,q5,q6
+	mov	r6,r6,ror#20
+	veor	q14,q9,q10
+	add	r9,r9,r10
+	vshr.u32	q1,q12,#25
+	mov	r7,r7,ror#20
+	vshr.u32	q5,q13,#25
+	eor	r6,r6,r8,ror#20
+	vshr.u32	q9,q14,#25
+	eor	r7,r7,r9,ror#20
+	vsli.32	q1,q12,#7
+	add	r2,r2,r6
+	vsli.32	q5,q13,#7
+	mov	r14,r14,ror#24
+	vsli.32	q9,q14,#7
+	add	r3,r3,r7
+	vext.8	q2,q2,q2,#8
+	mov	r10,r10,ror#24
+	vext.8	q6,q6,q6,#8
+	eor	r14,r14,r2,ror#24
+	vext.8	q10,q10,q10,#8
+	eor	r10,r10,r3,ror#24
+	vext.8	q1,q1,q1,#4
+	add	r8,r8,r14
+	vext.8	q5,q5,q5,#4
+	mov	r6,r6,ror#25
+	vext.8	q9,q9,q9,#4
+	add	r9,r9,r10
+	vext.8	q3,q3,q3,#12
+	mov	r7,r7,ror#25
+	vext.8	q7,q7,q7,#12
+	eor	r6,r6,r8,ror#25
+	vext.8	q11,q11,q11,#12
+	eor	r7,r7,r9,ror#25
+	vadd.i32	q0,q0,q1
+	add	r0,r0,r5
+	vadd.i32	q4,q4,q5
+	mov	r10,r10,ror#16
+	vadd.i32	q8,q8,q9
+	add	r1,r1,r6
+	veor	q3,q3,q0
+	mov	r12,r12,ror#16
+	veor	q7,q7,q4
+	eor	r10,r10,r0,ror#16
+	veor	q11,q11,q8
+	eor	r12,r12,r1,ror#16
+	vrev32.16	q3,q3
+	add	r8,r8,r10
+	vrev32.16	q7,q7
+	mov	r5,r5,ror#20
+	vrev32.16	q11,q11
+	add	r9,r9,r12
+	vadd.i32	q2,q2,q3
+	mov	r6,r6,ror#20
+	vadd.i32	q6,q6,q7
+	eor	r5,r5,r8,ror#20
+	vadd.i32	q10,q10,q11
+	eor	r6,r6,r9,ror#20
+	veor	q12,q1,q2
+	add	r0,r0,r5
+	veor	q13,q5,q6
+	mov	r10,r10,ror#24
+	veor	q14,q9,q10
+	add	r1,r1,r6
+	vshr.u32	q1,q12,#20
+	mov	r12,r12,ror#24
+	vshr.u32	q5,q13,#20
+	eor	r10,r10,r0,ror#24
+	vshr.u32	q9,q14,#20
+	eor	r12,r12,r1,ror#24
+	vsli.32	q1,q12,#12
+	add	r8,r8,r10
+	vsli.32	q5,q13,#12
+	mov	r5,r5,ror#25
+	vsli.32	q9,q14,#12
+	str	r10,[sp,#4*(16+15)]
+	vadd.i32	q0,q0,q1
+	ldr	r10,[sp,#4*(16+13)]
+	vadd.i32	q4,q4,q5
+	add	r9,r9,r12
+	vadd.i32	q8,q8,q9
+	mov	r6,r6,ror#25
+	veor	q12,q3,q0
+	eor	r5,r5,r8,ror#25
+	veor	q13,q7,q4
+	eor	r6,r6,r9,ror#25
+	veor	q14,q11,q8
+	str	r8,[sp,#4*(16+10)]
+	vshr.u32	q3,q12,#24
+	ldr	r8,[sp,#4*(16+8)]
+	vshr.u32	q7,q13,#24
+	add	r2,r2,r7
+	vshr.u32	q11,q14,#24
+	mov	r10,r10,ror#16
+	vsli.32	q3,q12,#8
+	str	r9,[sp,#4*(16+11)]
+	vsli.32	q7,q13,#8
+	ldr	r9,[sp,#4*(16+9)]
+	vsli.32	q11,q14,#8
+	add	r3,r3,r4
+	vadd.i32	q2,q2,q3
+	mov	r14,r14,ror#16
+	vadd.i32	q6,q6,q7
+	eor	r10,r10,r2,ror#16
+	vadd.i32	q10,q10,q11
+	eor	r14,r14,r3,ror#16
+	veor	q12,q1,q2
+	add	r8,r8,r10
+	veor	q13,q5,q6
+	mov	r7,r7,ror#20
+	veor	q14,q9,q10
+	add	r9,r9,r14
+	vshr.u32	q1,q12,#25
+	mov	r4,r4,ror#20
+	vshr.u32	q5,q13,#25
+	eor	r7,r7,r8,ror#20
+	vshr.u32	q9,q14,#25
+	eor	r4,r4,r9,ror#20
+	vsli.32	q1,q12,#7
+	add	r2,r2,r7
+	vsli.32	q5,q13,#7
+	mov	r10,r10,ror#24
+	vsli.32	q9,q14,#7
+	add	r3,r3,r4
+	vext.8	q2,q2,q2,#8
+	mov	r14,r14,ror#24
+	vext.8	q6,q6,q6,#8
+	eor	r10,r10,r2,ror#24
+	vext.8	q10,q10,q10,#8
+	eor	r14,r14,r3,ror#24
+	vext.8	q1,q1,q1,#12
+	add	r8,r8,r10
+	vext.8	q5,q5,q5,#12
+	mov	r7,r7,ror#25
+	vext.8	q9,q9,q9,#12
+	add	r9,r9,r14
+	vext.8	q3,q3,q3,#4
+	mov	r4,r4,ror#25
+	vext.8	q7,q7,q7,#4
+	eor	r7,r7,r8,ror#25
+	vext.8	q11,q11,q11,#4
+	eor	r4,r4,r9,ror#25
+	bne		.Loop_neon
+
+	add		r11,sp,#32
+	vld1.32		{q12-q13},[sp]		@ load key material
+	vld1.32		{q14-q15},[r11]
+
+	ldr		r11,[sp,#4*(32+2)]	@ load len
+
+	str		r8, [sp,#4*(16+8)]	@ modulo-scheduled store
+	str		r9, [sp,#4*(16+9)]
+	str		r12,[sp,#4*(16+12)]
+	str		r10, [sp,#4*(16+13)]
+	str		r14,[sp,#4*(16+14)]
+
+	@ at this point we have first half of 512-bit result in
+	@ rx and second half at sp+4*(16+8)
+
+	ldr		r12,[sp,#4*(32+1)]	@ load inp
+	ldr		r14,[sp,#4*(32+0)]	@ load out
+
+	vadd.i32	q0,q0,q12		@ accumulate key material
+	vadd.i32	q4,q4,q12
+	vadd.i32	q8,q8,q12
+	vldr		d24,[sp,#4*(16+0)]	@ one
+
+	vadd.i32	q1,q1,q13
+	vadd.i32	q5,q5,q13
+	vadd.i32	q9,q9,q13
+	vldr		d26,[sp,#4*(16+2)]	@ two
+
+	vadd.i32	q2,q2,q14
+	vadd.i32	q6,q6,q14
+	vadd.i32	q10,q10,q14
+	vadd.i32	d14,d14,d24	@ counter+1
+	vadd.i32	d22,d22,d26	@ counter+2
+
+	vadd.i32	q3,q3,q15
+	vadd.i32	q7,q7,q15
+	vadd.i32	q11,q11,q15
+
+	cmp		r11,#64*4
+	blo		.Ltail_neon
+
+	vld1.8		{q12-q13},[r12]!	@ load input
+	 mov		r11,sp
+	vld1.8		{q14-q15},[r12]!
+	veor		q0,q0,q12		@ xor with input
+	veor		q1,q1,q13
+	vld1.8		{q12-q13},[r12]!
+	veor		q2,q2,q14
+	veor		q3,q3,q15
+	vld1.8		{q14-q15},[r12]!
+
+	veor		q4,q4,q12
+	 vst1.8		{q0-q1},[r14]!	@ store output
+	veor		q5,q5,q13
+	vld1.8		{q12-q13},[r12]!
+	veor		q6,q6,q14
+	 vst1.8		{q2-q3},[r14]!
+	veor		q7,q7,q15
+	vld1.8		{q14-q15},[r12]!
+
+	veor		q8,q8,q12
+	 vld1.32	{q0-q1},[r11]!	@ load for next iteration
+	 veor		d25,d25,d25
+	 vldr		d24,[sp,#4*(16+4)]	@ four
+	veor		q9,q9,q13
+	 vld1.32	{q2-q3},[r11]
+	veor		q10,q10,q14
+	 vst1.8		{q4-q5},[r14]!
+	veor		q11,q11,q15
+	 vst1.8		{q6-q7},[r14]!
+
+	vadd.i32	d6,d6,d24	@ next counter value
+	vldr		d24,[sp,#4*(16+0)]	@ one
+
+	ldmia		sp,{r8-r11}	@ load key material
+	add		r0,r0,r8	@ accumulate key material
+	ldr		r8,[r12],#16		@ load input
+	 vst1.8		{q8-q9},[r14]!
+	add		r1,r1,r9
+	ldr		r9,[r12,#-12]
+	 vst1.8		{q10-q11},[r14]!
+	add		r2,r2,r10
+	ldr		r10,[r12,#-8]
+	add		r3,r3,r11
+	ldr		r11,[r12,#-4]
+#ifdef	__ARMEB__
+	rev		r0,r0
+	rev		r1,r1
+	rev		r2,r2
+	rev		r3,r3
+#endif
+	eor		r0,r0,r8	@ xor with input
+	 add		r8,sp,#4*(4)
+	eor		r1,r1,r9
+	str		r0,[r14],#16		@ store output
+	eor		r2,r2,r10
+	str		r1,[r14,#-12]
+	eor		r3,r3,r11
+	 ldmia		r8,{r8-r11}	@ load key material
+	str		r2,[r14,#-8]
+	str		r3,[r14,#-4]
+
+	add		r4,r4,r8	@ accumulate key material
+	ldr		r8,[r12],#16		@ load input
+	add		r5,r5,r9
+	ldr		r9,[r12,#-12]
+	add		r6,r6,r10
+	ldr		r10,[r12,#-8]
+	add		r7,r7,r11
+	ldr		r11,[r12,#-4]
+#ifdef	__ARMEB__
+	rev		r4,r4
+	rev		r5,r5
+	rev		r6,r6
+	rev		r7,r7
+#endif
+	eor		r4,r4,r8
+	 add		r8,sp,#4*(8)
+	eor		r5,r5,r9
+	str		r4,[r14],#16		@ store output
+	eor		r6,r6,r10
+	str		r5,[r14,#-12]
+	eor		r7,r7,r11
+	 ldmia		r8,{r8-r11}	@ load key material
+	str		r6,[r14,#-8]
+	 add		r0,sp,#4*(16+8)
+	str		r7,[r14,#-4]
+
+	ldmia		r0,{r0-r7}	@ load second half
+
+	add		r0,r0,r8	@ accumulate key material
+	ldr		r8,[r12],#16		@ load input
+	add		r1,r1,r9
+	ldr		r9,[r12,#-12]
+#ifdef	__thumb2__
+	it	hi
+#endif
+	 strhi		r10,[sp,#4*(16+10)]	@ copy "rx" while at it
+	add		r2,r2,r10
+	ldr		r10,[r12,#-8]
+#ifdef	__thumb2__
+	it	hi
+#endif
+	 strhi		r11,[sp,#4*(16+11)]	@ copy "rx" while at it
+	add		r3,r3,r11
+	ldr		r11,[r12,#-4]
+#ifdef	__ARMEB__
+	rev		r0,r0
+	rev		r1,r1
+	rev		r2,r2
+	rev		r3,r3
+#endif
+	eor		r0,r0,r8
+	 add		r8,sp,#4*(12)
+	eor		r1,r1,r9
+	str		r0,[r14],#16		@ store output
+	eor		r2,r2,r10
+	str		r1,[r14,#-12]
+	eor		r3,r3,r11
+	 ldmia		r8,{r8-r11}	@ load key material
+	str		r2,[r14,#-8]
+	str		r3,[r14,#-4]
+
+	add		r4,r4,r8	@ accumulate key material
+	 add		r8,r8,#4		@ next counter value
+	add		r5,r5,r9
+	 str		r8,[sp,#4*(12)]	@ save next counter value
+	ldr		r8,[r12],#16		@ load input
+	add		r6,r6,r10
+	 add		r4,r4,#3		@ counter+3
+	ldr		r9,[r12,#-12]
+	add		r7,r7,r11
+	ldr		r10,[r12,#-8]
+	ldr		r11,[r12,#-4]
+#ifdef	__ARMEB__
+	rev		r4,r4
+	rev		r5,r5
+	rev		r6,r6
+	rev		r7,r7
+#endif
+	eor		r4,r4,r8
+#ifdef	__thumb2__
+	it	hi
+#endif
+	 ldrhi		r8,[sp,#4*(32+2)]	@ re-load len
+	eor		r5,r5,r9
+	eor		r6,r6,r10
+	str		r4,[r14],#16		@ store output
+	eor		r7,r7,r11
+	str		r5,[r14,#-12]
+	 sub		r11,r8,#64*4	@ len-=64*4
+	str		r6,[r14,#-8]
+	str		r7,[r14,#-4]
+	bhi		.Loop_neon_outer
+
+	b		.Ldone_neon
+
+.align	4
+.Lbreak_neon:
+	@ harmonize NEON and integer-only stack frames: load data
+	@ from NEON frame, but save to integer-only one; distance
+	@ between the two is 4*(32+4+16-32)=4*(20).
+
+	str		r11, [sp,#4*(20+32+2)]	@ save len
+	 add		r11,sp,#4*(32+4)
+	str		r12,   [sp,#4*(20+32+1)]	@ save inp
+	str		r14,   [sp,#4*(20+32+0)]	@ save out
+
+	ldr		r12,[sp,#4*(16+10)]
+	ldr		r14,[sp,#4*(16+11)]
+	 vldmia		r11,{d8-d15}			@ fulfill ABI requirement
+	str		r12,[sp,#4*(20+16+10)]	@ copy "rx"
+	str		r14,[sp,#4*(20+16+11)]	@ copy "rx"
+
+	ldr		r11, [sp,#4*(15)]
+	ldr		r12,[sp,#4*(12)]		@ modulo-scheduled load
+	ldr		r10, [sp,#4*(13)]
+	ldr		r14,[sp,#4*(14)]
+	str		r11, [sp,#4*(20+16+15)]
+	add		r11,sp,#4*(20)
+	vst1.32		{q0-q1},[r11]!		@ copy key
+	add		sp,sp,#4*(20)			@ switch frame
+	vst1.32		{q2-q3},[r11]
+	mov		r11,#10
+	b		.Loop				@ go integer-only
+
+.align	4
+.Ltail_neon:
+	cmp		r11,#64*3
+	bhs		.L192_or_more_neon
+	cmp		r11,#64*2
+	bhs		.L128_or_more_neon
+	cmp		r11,#64*1
+	bhs		.L64_or_more_neon
+
+	add		r8,sp,#4*(8)
+	vst1.8		{q0-q1},[sp]
+	add		r10,sp,#4*(0)
+	vst1.8		{q2-q3},[r8]
+	b		.Loop_tail_neon
+
+.align	4
+.L64_or_more_neon:
+	vld1.8		{q12-q13},[r12]!
+	vld1.8		{q14-q15},[r12]!
+	veor		q0,q0,q12
+	veor		q1,q1,q13
+	veor		q2,q2,q14
+	veor		q3,q3,q15
+	vst1.8		{q0-q1},[r14]!
+	vst1.8		{q2-q3},[r14]!
+
+	beq		.Ldone_neon
+
+	add		r8,sp,#4*(8)
+	vst1.8		{q4-q5},[sp]
+	add		r10,sp,#4*(0)
+	vst1.8		{q6-q7},[r8]
+	sub		r11,r11,#64*1	@ len-=64*1
+	b		.Loop_tail_neon
+
+.align	4
+.L128_or_more_neon:
+	vld1.8		{q12-q13},[r12]!
+	vld1.8		{q14-q15},[r12]!
+	veor		q0,q0,q12
+	veor		q1,q1,q13
+	vld1.8		{q12-q13},[r12]!
+	veor		q2,q2,q14
+	veor		q3,q3,q15
+	vld1.8		{q14-q15},[r12]!
+
+	veor		q4,q4,q12
+	veor		q5,q5,q13
+	 vst1.8		{q0-q1},[r14]!
+	veor		q6,q6,q14
+	 vst1.8		{q2-q3},[r14]!
+	veor		q7,q7,q15
+	vst1.8		{q4-q5},[r14]!
+	vst1.8		{q6-q7},[r14]!
+
+	beq		.Ldone_neon
+
+	add		r8,sp,#4*(8)
+	vst1.8		{q8-q9},[sp]
+	add		r10,sp,#4*(0)
+	vst1.8		{q10-q11},[r8]
+	sub		r11,r11,#64*2	@ len-=64*2
+	b		.Loop_tail_neon
+
+.align	4
+.L192_or_more_neon:
+	vld1.8		{q12-q13},[r12]!
+	vld1.8		{q14-q15},[r12]!
+	veor		q0,q0,q12
+	veor		q1,q1,q13
+	vld1.8		{q12-q13},[r12]!
+	veor		q2,q2,q14
+	veor		q3,q3,q15
+	vld1.8		{q14-q15},[r12]!
+
+	veor		q4,q4,q12
+	veor		q5,q5,q13
+	vld1.8		{q12-q13},[r12]!
+	veor		q6,q6,q14
+	 vst1.8		{q0-q1},[r14]!
+	veor		q7,q7,q15
+	vld1.8		{q14-q15},[r12]!
+
+	veor		q8,q8,q12
+	 vst1.8		{q2-q3},[r14]!
+	veor		q9,q9,q13
+	 vst1.8		{q4-q5},[r14]!
+	veor		q10,q10,q14
+	 vst1.8		{q6-q7},[r14]!
+	veor		q11,q11,q15
+	vst1.8		{q8-q9},[r14]!
+	vst1.8		{q10-q11},[r14]!
+
+	beq		.Ldone_neon
+
+	ldmia		sp,{r8-r11}	@ load key material
+	add		r0,r0,r8	@ accumulate key material
+	 add		r8,sp,#4*(4)
+	add		r1,r1,r9
+	add		r2,r2,r10
+	add		r3,r3,r11
+	 ldmia		r8,{r8-r11}	@ load key material
+
+	add		r4,r4,r8	@ accumulate key material
+	 add		r8,sp,#4*(8)
+	add		r5,r5,r9
+	add		r6,r6,r10
+	add		r7,r7,r11
+	 ldmia		r8,{r8-r11}	@ load key material
+#ifdef	__ARMEB__
+	rev		r0,r0
+	rev		r1,r1
+	rev		r2,r2
+	rev		r3,r3
+	rev		r4,r4
+	rev		r5,r5
+	rev		r6,r6
+	rev		r7,r7
+#endif
+	stmia		sp,{r0-r7}
+	 add		r0,sp,#4*(16+8)
+
+	ldmia		r0,{r0-r7}	@ load second half
+
+	add		r0,r0,r8	@ accumulate key material
+	 add		r8,sp,#4*(12)
+	add		r1,r1,r9
+	add		r2,r2,r10
+	add		r3,r3,r11
+	 ldmia		r8,{r8-r11}	@ load key material
+
+	add		r4,r4,r8	@ accumulate key material
+	 add		r8,sp,#4*(8)
+	add		r5,r5,r9
+	 add		r4,r4,#3		@ counter+3
+	add		r6,r6,r10
+	add		r7,r7,r11
+	 ldr		r11,[sp,#4*(32+2)]	@ re-load len
+#ifdef	__ARMEB__
+	rev		r0,r0
+	rev		r1,r1
+	rev		r2,r2
+	rev		r3,r3
+	rev		r4,r4
+	rev		r5,r5
+	rev		r6,r6
+	rev		r7,r7
+#endif
+	stmia		r8,{r0-r7}
+	 add		r10,sp,#4*(0)
+	 sub		r11,r11,#64*3	@ len-=64*3
+
+.Loop_tail_neon:
+	ldrb		r8,[r10],#1	@ read buffer on stack
+	ldrb		r9,[r12],#1		@ read input
+	subs		r11,r11,#1
+	eor		r8,r8,r9
+	strb		r8,[r14],#1		@ store output
+	bne		.Loop_tail_neon
+
+.Ldone_neon:
+	add		sp,sp,#4*(32+4)
+	vldmia		sp,{d8-d15}
+	add		sp,sp,#4*(16+3)
+.Lno_data_neon:
+	ldmia		sp!,{r4-r11,pc}
+ENDPROC(chacha20_neon)
+#endif
+
+.align	5
+.Lsigma2:
+.long	0x61707865,0x3320646e,0x79622d32,0x6b206574	@ endian-neutral
+.Lone2:
+.long	1,0,0,0
+.word -1
+
+.align	5
+ENTRY(chacha20_arm)
+	ldr	r12,[sp,#0]		@ pull pointer to counter and nonce
+	stmdb	sp!,{r0-r2,r4-r11,lr}
+	cmp	r2,#0			@ len==0?
+#ifdef	__thumb2__
+	itt	eq
+#endif
+	addeq	sp,sp,#4*3
+	beq	.Lno_data_arm
+.Lshort:
+	ldmia	r12,{r4-r7}		@ load counter and nonce
+	sub	sp,sp,#4*(16)		@ off-load area
+#if __LINUX_ARM_ARCH__ < 7 && !defined(__thumb2__)
+	sub	r14,pc,#100		@ .Lsigma2
+#else
+	adr	r14,.Lsigma2		@ .Lsigma2
+#endif
+	stmdb	sp!,{r4-r7}		@ copy counter and nonce
+	ldmia	r3,{r4-r11}		@ load key
+	ldmia	r14,{r0-r3}		@ load sigma
+	stmdb	sp!,{r4-r11}		@ copy key
+	stmdb	sp!,{r0-r3}		@ copy sigma
+	str	r10,[sp,#4*(16+10)]	@ off-load "rx"
+	str	r11,[sp,#4*(16+11)]	@ off-load "rx"
+	b	.Loop_outer_enter
+
+.align	4
+.Loop_outer:
+	ldmia	sp,{r0-r9}		@ load key material
+	str	r11,[sp,#4*(32+2)]	@ save len
+	str	r12,  [sp,#4*(32+1)]	@ save inp
+	str	r14,  [sp,#4*(32+0)]	@ save out
+.Loop_outer_enter:
+	ldr	r11, [sp,#4*(15)]
+	ldr	r12,[sp,#4*(12)]	@ modulo-scheduled load
+	ldr	r10, [sp,#4*(13)]
+	ldr	r14,[sp,#4*(14)]
+	str	r11, [sp,#4*(16+15)]
+	mov	r11,#10
+	b	.Loop
+
+.align	4
+.Loop:
+	subs	r11,r11,#1
+	add	r0,r0,r4
+	mov	r12,r12,ror#16
+	add	r1,r1,r5
+	mov	r10,r10,ror#16
+	eor	r12,r12,r0,ror#16
+	eor	r10,r10,r1,ror#16
+	add	r8,r8,r12
+	mov	r4,r4,ror#20
+	add	r9,r9,r10
+	mov	r5,r5,ror#20
+	eor	r4,r4,r8,ror#20
+	eor	r5,r5,r9,ror#20
+	add	r0,r0,r4
+	mov	r12,r12,ror#24
+	add	r1,r1,r5
+	mov	r10,r10,ror#24
+	eor	r12,r12,r0,ror#24
+	eor	r10,r10,r1,ror#24
+	add	r8,r8,r12
+	mov	r4,r4,ror#25
+	add	r9,r9,r10
+	mov	r5,r5,ror#25
+	str	r10,[sp,#4*(16+13)]
+	ldr	r10,[sp,#4*(16+15)]
+	eor	r4,r4,r8,ror#25
+	eor	r5,r5,r9,ror#25
+	str	r8,[sp,#4*(16+8)]
+	ldr	r8,[sp,#4*(16+10)]
+	add	r2,r2,r6
+	mov	r14,r14,ror#16
+	str	r9,[sp,#4*(16+9)]
+	ldr	r9,[sp,#4*(16+11)]
+	add	r3,r3,r7
+	mov	r10,r10,ror#16
+	eor	r14,r14,r2,ror#16
+	eor	r10,r10,r3,ror#16
+	add	r8,r8,r14
+	mov	r6,r6,ror#20
+	add	r9,r9,r10
+	mov	r7,r7,ror#20
+	eor	r6,r6,r8,ror#20
+	eor	r7,r7,r9,ror#20
+	add	r2,r2,r6
+	mov	r14,r14,ror#24
+	add	r3,r3,r7
+	mov	r10,r10,ror#24
+	eor	r14,r14,r2,ror#24
+	eor	r10,r10,r3,ror#24
+	add	r8,r8,r14
+	mov	r6,r6,ror#25
+	add	r9,r9,r10
+	mov	r7,r7,ror#25
+	eor	r6,r6,r8,ror#25
+	eor	r7,r7,r9,ror#25
+	add	r0,r0,r5
+	mov	r10,r10,ror#16
+	add	r1,r1,r6
+	mov	r12,r12,ror#16
+	eor	r10,r10,r0,ror#16
+	eor	r12,r12,r1,ror#16
+	add	r8,r8,r10
+	mov	r5,r5,ror#20
+	add	r9,r9,r12
+	mov	r6,r6,ror#20
+	eor	r5,r5,r8,ror#20
+	eor	r6,r6,r9,ror#20
+	add	r0,r0,r5
+	mov	r10,r10,ror#24
+	add	r1,r1,r6
+	mov	r12,r12,ror#24
+	eor	r10,r10,r0,ror#24
+	eor	r12,r12,r1,ror#24
+	add	r8,r8,r10
+	mov	r5,r5,ror#25
+	str	r10,[sp,#4*(16+15)]
+	ldr	r10,[sp,#4*(16+13)]
+	add	r9,r9,r12
+	mov	r6,r6,ror#25
+	eor	r5,r5,r8,ror#25
+	eor	r6,r6,r9,ror#25
+	str	r8,[sp,#4*(16+10)]
+	ldr	r8,[sp,#4*(16+8)]
+	add	r2,r2,r7
+	mov	r10,r10,ror#16
+	str	r9,[sp,#4*(16+11)]
+	ldr	r9,[sp,#4*(16+9)]
+	add	r3,r3,r4
+	mov	r14,r14,ror#16
+	eor	r10,r10,r2,ror#16
+	eor	r14,r14,r3,ror#16
+	add	r8,r8,r10
+	mov	r7,r7,ror#20
+	add	r9,r9,r14
+	mov	r4,r4,ror#20
+	eor	r7,r7,r8,ror#20
+	eor	r4,r4,r9,ror#20
+	add	r2,r2,r7
+	mov	r10,r10,ror#24
+	add	r3,r3,r4
+	mov	r14,r14,ror#24
+	eor	r10,r10,r2,ror#24
+	eor	r14,r14,r3,ror#24
+	add	r8,r8,r10
+	mov	r7,r7,ror#25
+	add	r9,r9,r14
+	mov	r4,r4,ror#25
+	eor	r7,r7,r8,ror#25
+	eor	r4,r4,r9,ror#25
+	bne	.Loop
+
+	ldr	r11,[sp,#4*(32+2)]	@ load len
+
+	str	r8, [sp,#4*(16+8)]	@ modulo-scheduled store
+	str	r9, [sp,#4*(16+9)]
+	str	r12,[sp,#4*(16+12)]
+	str	r10, [sp,#4*(16+13)]
+	str	r14,[sp,#4*(16+14)]
+
+	@ at this point we have first half of 512-bit result in
+	@ rx and second half at sp+4*(16+8)
+
+	cmp	r11,#64		@ done yet?
+#ifdef	__thumb2__
+	itete	lo
+#endif
+	addlo	r12,sp,#4*(0)		@ shortcut or ...
+	ldrhs	r12,[sp,#4*(32+1)]	@ ... load inp
+	addlo	r14,sp,#4*(0)		@ shortcut or ...
+	ldrhs	r14,[sp,#4*(32+0)]	@ ... load out
+
+	ldr	r8,[sp,#4*(0)]	@ load key material
+	ldr	r9,[sp,#4*(1)]
+
+#if __LINUX_ARM_ARCH__ >= 6 || !defined(__ARMEB__)
+#if __LINUX_ARM_ARCH__ < 7
+	orr	r10,r12,r14
+	tst	r10,#3		@ are input and output aligned?
+	ldr	r10,[sp,#4*(2)]
+	bne	.Lunaligned
+	cmp	r11,#64		@ restore flags
+#else
+	ldr	r10,[sp,#4*(2)]
+#endif
+	ldr	r11,[sp,#4*(3)]
+
+	add	r0,r0,r8	@ accumulate key material
+	add	r1,r1,r9
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	ldrhs	r8,[r12],#16		@ load input
+	ldrhs	r9,[r12,#-12]
+
+	add	r2,r2,r10
+	add	r3,r3,r11
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	ldrhs	r10,[r12,#-8]
+	ldrhs	r11,[r12,#-4]
+#if __LINUX_ARM_ARCH__ >= 6 && defined(__ARMEB__)
+	rev	r0,r0
+	rev	r1,r1
+	rev	r2,r2
+	rev	r3,r3
+#endif
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	eorhs	r0,r0,r8	@ xor with input
+	eorhs	r1,r1,r9
+	 add	r8,sp,#4*(4)
+	str	r0,[r14],#16		@ store output
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	eorhs	r2,r2,r10
+	eorhs	r3,r3,r11
+	 ldmia	r8,{r8-r11}	@ load key material
+	str	r1,[r14,#-12]
+	str	r2,[r14,#-8]
+	str	r3,[r14,#-4]
+
+	add	r4,r4,r8	@ accumulate key material
+	add	r5,r5,r9
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	ldrhs	r8,[r12],#16		@ load input
+	ldrhs	r9,[r12,#-12]
+	add	r6,r6,r10
+	add	r7,r7,r11
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	ldrhs	r10,[r12,#-8]
+	ldrhs	r11,[r12,#-4]
+#if __LINUX_ARM_ARCH__ >= 6 && defined(__ARMEB__)
+	rev	r4,r4
+	rev	r5,r5
+	rev	r6,r6
+	rev	r7,r7
+#endif
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	eorhs	r4,r4,r8
+	eorhs	r5,r5,r9
+	 add	r8,sp,#4*(8)
+	str	r4,[r14],#16		@ store output
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	eorhs	r6,r6,r10
+	eorhs	r7,r7,r11
+	str	r5,[r14,#-12]
+	 ldmia	r8,{r8-r11}	@ load key material
+	str	r6,[r14,#-8]
+	 add	r0,sp,#4*(16+8)
+	str	r7,[r14,#-4]
+
+	ldmia	r0,{r0-r7}	@ load second half
+
+	add	r0,r0,r8	@ accumulate key material
+	add	r1,r1,r9
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	ldrhs	r8,[r12],#16		@ load input
+	ldrhs	r9,[r12,#-12]
+#ifdef	__thumb2__
+	itt	hi
+#endif
+	 strhi	r10,[sp,#4*(16+10)]	@ copy "rx" while at it
+	 strhi	r11,[sp,#4*(16+11)]	@ copy "rx" while at it
+	add	r2,r2,r10
+	add	r3,r3,r11
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	ldrhs	r10,[r12,#-8]
+	ldrhs	r11,[r12,#-4]
+#if __LINUX_ARM_ARCH__ >= 6 && defined(__ARMEB__)
+	rev	r0,r0
+	rev	r1,r1
+	rev	r2,r2
+	rev	r3,r3
+#endif
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	eorhs	r0,r0,r8
+	eorhs	r1,r1,r9
+	 add	r8,sp,#4*(12)
+	str	r0,[r14],#16		@ store output
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	eorhs	r2,r2,r10
+	eorhs	r3,r3,r11
+	str	r1,[r14,#-12]
+	 ldmia	r8,{r8-r11}	@ load key material
+	str	r2,[r14,#-8]
+	str	r3,[r14,#-4]
+
+	add	r4,r4,r8	@ accumulate key material
+	add	r5,r5,r9
+#ifdef	__thumb2__
+	itt	hi
+#endif
+	 addhi	r8,r8,#1		@ next counter value
+	 strhi	r8,[sp,#4*(12)]	@ save next counter value
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	ldrhs	r8,[r12],#16		@ load input
+	ldrhs	r9,[r12,#-12]
+	add	r6,r6,r10
+	add	r7,r7,r11
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	ldrhs	r10,[r12,#-8]
+	ldrhs	r11,[r12,#-4]
+#if __LINUX_ARM_ARCH__ >= 6 && defined(__ARMEB__)
+	rev	r4,r4
+	rev	r5,r5
+	rev	r6,r6
+	rev	r7,r7
+#endif
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	eorhs	r4,r4,r8
+	eorhs	r5,r5,r9
+#ifdef	__thumb2__
+	 it	ne
+#endif
+	 ldrne	r8,[sp,#4*(32+2)]	@ re-load len
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	eorhs	r6,r6,r10
+	eorhs	r7,r7,r11
+	str	r4,[r14],#16		@ store output
+	str	r5,[r14,#-12]
+#ifdef	__thumb2__
+	it	hs
+#endif
+	 subhs	r11,r8,#64		@ len-=64
+	str	r6,[r14,#-8]
+	str	r7,[r14,#-4]
+	bhi	.Loop_outer
+
+	beq	.Ldone
+#if __LINUX_ARM_ARCH__ < 7
+	b	.Ltail
+
+.align	4
+.Lunaligned:				@ unaligned endian-neutral path
+	cmp	r11,#64		@ restore flags
+#endif
+#endif
+#if __LINUX_ARM_ARCH__ < 7
+	ldr	r11,[sp,#4*(3)]
+	add	r0,r0,r8		@ accumulate key material
+	add	r1,r1,r9
+	add	r2,r2,r10
+#ifdef	__thumb2__
+	itete	lo
+#endif
+	eorlo	r8,r8,r8		@ zero or ...
+	ldrhsb	r8,[r12],#16			@ ... load input
+	eorlo	r9,r9,r9
+	ldrhsb	r9,[r12,#-12]
+
+	add	r3,r3,r11
+#ifdef	__thumb2__
+	itete	lo
+#endif
+	eorlo	r10,r10,r10
+	ldrhsb	r10,[r12,#-8]
+	eorlo	r11,r11,r11
+	ldrhsb	r11,[r12,#-4]
+
+	eor	r0,r8,r0		@ xor with input (or zero)
+	eor	r1,r9,r1
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	ldrhsb	r8,[r12,#-15]		@ load more input
+	ldrhsb	r9,[r12,#-11]
+	eor	r2,r10,r2
+	 strb	r0,[r14],#16		@ store output
+	eor	r3,r11,r3
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	ldrhsb	r10,[r12,#-7]
+	ldrhsb	r11,[r12,#-3]
+	 strb	r1,[r14,#-12]
+	eor	r0,r8,r0,lsr#8
+	 strb	r2,[r14,#-8]
+	eor	r1,r9,r1,lsr#8
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	ldrhsb	r8,[r12,#-14]		@ load more input
+	ldrhsb	r9,[r12,#-10]
+	 strb	r3,[r14,#-4]
+	eor	r2,r10,r2,lsr#8
+	 strb	r0,[r14,#-15]
+	eor	r3,r11,r3,lsr#8
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	ldrhsb	r10,[r12,#-6]
+	ldrhsb	r11,[r12,#-2]
+	 strb	r1,[r14,#-11]
+	eor	r0,r8,r0,lsr#8
+	 strb	r2,[r14,#-7]
+	eor	r1,r9,r1,lsr#8
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	ldrhsb	r8,[r12,#-13]		@ load more input
+	ldrhsb	r9,[r12,#-9]
+	 strb	r3,[r14,#-3]
+	eor	r2,r10,r2,lsr#8
+	 strb	r0,[r14,#-14]
+	eor	r3,r11,r3,lsr#8
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	ldrhsb	r10,[r12,#-5]
+	ldrhsb	r11,[r12,#-1]
+	 strb	r1,[r14,#-10]
+	 strb	r2,[r14,#-6]
+	eor	r0,r8,r0,lsr#8
+	 strb	r3,[r14,#-2]
+	eor	r1,r9,r1,lsr#8
+	 strb	r0,[r14,#-13]
+	eor	r2,r10,r2,lsr#8
+	 strb	r1,[r14,#-9]
+	eor	r3,r11,r3,lsr#8
+	 strb	r2,[r14,#-5]
+	 strb	r3,[r14,#-1]
+	add	r8,sp,#4*(4+0)
+	ldmia	r8,{r8-r11}		@ load key material
+	add	r0,sp,#4*(16+8)
+	add	r4,r4,r8		@ accumulate key material
+	add	r5,r5,r9
+	add	r6,r6,r10
+#ifdef	__thumb2__
+	itete	lo
+#endif
+	eorlo	r8,r8,r8		@ zero or ...
+	ldrhsb	r8,[r12],#16			@ ... load input
+	eorlo	r9,r9,r9
+	ldrhsb	r9,[r12,#-12]
+
+	add	r7,r7,r11
+#ifdef	__thumb2__
+	itete	lo
+#endif
+	eorlo	r10,r10,r10
+	ldrhsb	r10,[r12,#-8]
+	eorlo	r11,r11,r11
+	ldrhsb	r11,[r12,#-4]
+
+	eor	r4,r8,r4		@ xor with input (or zero)
+	eor	r5,r9,r5
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	ldrhsb	r8,[r12,#-15]		@ load more input
+	ldrhsb	r9,[r12,#-11]
+	eor	r6,r10,r6
+	 strb	r4,[r14],#16		@ store output
+	eor	r7,r11,r7
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	ldrhsb	r10,[r12,#-7]
+	ldrhsb	r11,[r12,#-3]
+	 strb	r5,[r14,#-12]
+	eor	r4,r8,r4,lsr#8
+	 strb	r6,[r14,#-8]
+	eor	r5,r9,r5,lsr#8
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	ldrhsb	r8,[r12,#-14]		@ load more input
+	ldrhsb	r9,[r12,#-10]
+	 strb	r7,[r14,#-4]
+	eor	r6,r10,r6,lsr#8
+	 strb	r4,[r14,#-15]
+	eor	r7,r11,r7,lsr#8
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	ldrhsb	r10,[r12,#-6]
+	ldrhsb	r11,[r12,#-2]
+	 strb	r5,[r14,#-11]
+	eor	r4,r8,r4,lsr#8
+	 strb	r6,[r14,#-7]
+	eor	r5,r9,r5,lsr#8
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	ldrhsb	r8,[r12,#-13]		@ load more input
+	ldrhsb	r9,[r12,#-9]
+	 strb	r7,[r14,#-3]
+	eor	r6,r10,r6,lsr#8
+	 strb	r4,[r14,#-14]
+	eor	r7,r11,r7,lsr#8
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	ldrhsb	r10,[r12,#-5]
+	ldrhsb	r11,[r12,#-1]
+	 strb	r5,[r14,#-10]
+	 strb	r6,[r14,#-6]
+	eor	r4,r8,r4,lsr#8
+	 strb	r7,[r14,#-2]
+	eor	r5,r9,r5,lsr#8
+	 strb	r4,[r14,#-13]
+	eor	r6,r10,r6,lsr#8
+	 strb	r5,[r14,#-9]
+	eor	r7,r11,r7,lsr#8
+	 strb	r6,[r14,#-5]
+	 strb	r7,[r14,#-1]
+	add	r8,sp,#4*(4+4)
+	ldmia	r8,{r8-r11}		@ load key material
+	ldmia	r0,{r0-r7}		@ load second half
+#ifdef	__thumb2__
+	itt	hi
+#endif
+	strhi	r10,[sp,#4*(16+10)]		@ copy "rx"
+	strhi	r11,[sp,#4*(16+11)]		@ copy "rx"
+	add	r0,r0,r8		@ accumulate key material
+	add	r1,r1,r9
+	add	r2,r2,r10
+#ifdef	__thumb2__
+	itete	lo
+#endif
+	eorlo	r8,r8,r8		@ zero or ...
+	ldrhsb	r8,[r12],#16			@ ... load input
+	eorlo	r9,r9,r9
+	ldrhsb	r9,[r12,#-12]
+
+	add	r3,r3,r11
+#ifdef	__thumb2__
+	itete	lo
+#endif
+	eorlo	r10,r10,r10
+	ldrhsb	r10,[r12,#-8]
+	eorlo	r11,r11,r11
+	ldrhsb	r11,[r12,#-4]
+
+	eor	r0,r8,r0		@ xor with input (or zero)
+	eor	r1,r9,r1
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	ldrhsb	r8,[r12,#-15]		@ load more input
+	ldrhsb	r9,[r12,#-11]
+	eor	r2,r10,r2
+	 strb	r0,[r14],#16		@ store output
+	eor	r3,r11,r3
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	ldrhsb	r10,[r12,#-7]
+	ldrhsb	r11,[r12,#-3]
+	 strb	r1,[r14,#-12]
+	eor	r0,r8,r0,lsr#8
+	 strb	r2,[r14,#-8]
+	eor	r1,r9,r1,lsr#8
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	ldrhsb	r8,[r12,#-14]		@ load more input
+	ldrhsb	r9,[r12,#-10]
+	 strb	r3,[r14,#-4]
+	eor	r2,r10,r2,lsr#8
+	 strb	r0,[r14,#-15]
+	eor	r3,r11,r3,lsr#8
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	ldrhsb	r10,[r12,#-6]
+	ldrhsb	r11,[r12,#-2]
+	 strb	r1,[r14,#-11]
+	eor	r0,r8,r0,lsr#8
+	 strb	r2,[r14,#-7]
+	eor	r1,r9,r1,lsr#8
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	ldrhsb	r8,[r12,#-13]		@ load more input
+	ldrhsb	r9,[r12,#-9]
+	 strb	r3,[r14,#-3]
+	eor	r2,r10,r2,lsr#8
+	 strb	r0,[r14,#-14]
+	eor	r3,r11,r3,lsr#8
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	ldrhsb	r10,[r12,#-5]
+	ldrhsb	r11,[r12,#-1]
+	 strb	r1,[r14,#-10]
+	 strb	r2,[r14,#-6]
+	eor	r0,r8,r0,lsr#8
+	 strb	r3,[r14,#-2]
+	eor	r1,r9,r1,lsr#8
+	 strb	r0,[r14,#-13]
+	eor	r2,r10,r2,lsr#8
+	 strb	r1,[r14,#-9]
+	eor	r3,r11,r3,lsr#8
+	 strb	r2,[r14,#-5]
+	 strb	r3,[r14,#-1]
+	add	r8,sp,#4*(4+8)
+	ldmia	r8,{r8-r11}		@ load key material
+	add	r4,r4,r8		@ accumulate key material
+#ifdef	__thumb2__
+	itt	hi
+#endif
+	addhi	r8,r8,#1			@ next counter value
+	strhi	r8,[sp,#4*(12)]		@ save next counter value
+	add	r5,r5,r9
+	add	r6,r6,r10
+#ifdef	__thumb2__
+	itete	lo
+#endif
+	eorlo	r8,r8,r8		@ zero or ...
+	ldrhsb	r8,[r12],#16			@ ... load input
+	eorlo	r9,r9,r9
+	ldrhsb	r9,[r12,#-12]
+
+	add	r7,r7,r11
+#ifdef	__thumb2__
+	itete	lo
+#endif
+	eorlo	r10,r10,r10
+	ldrhsb	r10,[r12,#-8]
+	eorlo	r11,r11,r11
+	ldrhsb	r11,[r12,#-4]
+
+	eor	r4,r8,r4		@ xor with input (or zero)
+	eor	r5,r9,r5
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	ldrhsb	r8,[r12,#-15]		@ load more input
+	ldrhsb	r9,[r12,#-11]
+	eor	r6,r10,r6
+	 strb	r4,[r14],#16		@ store output
+	eor	r7,r11,r7
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	ldrhsb	r10,[r12,#-7]
+	ldrhsb	r11,[r12,#-3]
+	 strb	r5,[r14,#-12]
+	eor	r4,r8,r4,lsr#8
+	 strb	r6,[r14,#-8]
+	eor	r5,r9,r5,lsr#8
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	ldrhsb	r8,[r12,#-14]		@ load more input
+	ldrhsb	r9,[r12,#-10]
+	 strb	r7,[r14,#-4]
+	eor	r6,r10,r6,lsr#8
+	 strb	r4,[r14,#-15]
+	eor	r7,r11,r7,lsr#8
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	ldrhsb	r10,[r12,#-6]
+	ldrhsb	r11,[r12,#-2]
+	 strb	r5,[r14,#-11]
+	eor	r4,r8,r4,lsr#8
+	 strb	r6,[r14,#-7]
+	eor	r5,r9,r5,lsr#8
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	ldrhsb	r8,[r12,#-13]		@ load more input
+	ldrhsb	r9,[r12,#-9]
+	 strb	r7,[r14,#-3]
+	eor	r6,r10,r6,lsr#8
+	 strb	r4,[r14,#-14]
+	eor	r7,r11,r7,lsr#8
+#ifdef	__thumb2__
+	itt	hs
+#endif
+	ldrhsb	r10,[r12,#-5]
+	ldrhsb	r11,[r12,#-1]
+	 strb	r5,[r14,#-10]
+	 strb	r6,[r14,#-6]
+	eor	r4,r8,r4,lsr#8
+	 strb	r7,[r14,#-2]
+	eor	r5,r9,r5,lsr#8
+	 strb	r4,[r14,#-13]
+	eor	r6,r10,r6,lsr#8
+	 strb	r5,[r14,#-9]
+	eor	r7,r11,r7,lsr#8
+	 strb	r6,[r14,#-5]
+	 strb	r7,[r14,#-1]
+#ifdef	__thumb2__
+	it	ne
+#endif
+	ldrne	r8,[sp,#4*(32+2)]		@ re-load len
+#ifdef	__thumb2__
+	it	hs
+#endif
+	subhs	r11,r8,#64			@ len-=64
+	bhi	.Loop_outer
+
+	beq	.Ldone
+#endif
+
+.Ltail:
+	ldr	r12,[sp,#4*(32+1)]	@ load inp
+	add	r9,sp,#4*(0)
+	ldr	r14,[sp,#4*(32+0)]	@ load out
+
+.Loop_tail:
+	ldrb	r10,[r9],#1	@ read buffer on stack
+	ldrb	r11,[r12],#1		@ read input
+	subs	r8,r8,#1
+	eor	r11,r11,r10
+	strb	r11,[r14],#1		@ store output
+	bne	.Loop_tail
+
+.Ldone:
+	add	sp,sp,#4*(32+3)
+.Lno_data_arm:
+	ldmia	sp!,{r4-r11,pc}
+ENDPROC(chacha20_arm)
diff --git a/lib/zinc/chacha20/chacha20-arm64.S b/lib/zinc/chacha20/chacha20-arm64.S
new file mode 100644
index 000000000000..c3d1243161a8
--- /dev/null
+++ b/lib/zinc/chacha20/chacha20-arm64.S
@@ -0,0 +1,1940 @@
+/* SPDX-License-Identifier: OpenSSL OR (BSD-3-Clause OR GPL-2.0)
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ * Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
+ */
+
+#include <linux/linkage.h>
+
+.text
+.align	5
+.Lsigma:
+.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
+.Lone:
+.long	1,0,0,0
+
+.align	5
+ENTRY(chacha20_arm)
+	cbz	x2,.Labort
+.Lshort:
+	stp	x29,x30,[sp,#-96]!
+	add	x29,sp,#0
+
+	adr	x5,.Lsigma
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+	sub	sp,sp,#64
+
+	ldp	x22,x23,[x5]		// load sigma
+	ldp	x24,x25,[x3]		// load key
+	ldp	x26,x27,[x3,#16]
+	ldp	x28,x30,[x4]		// load counter
+#ifdef	__ARMEB__
+	ror	x24,x24,#32
+	ror	x25,x25,#32
+	ror	x26,x26,#32
+	ror	x27,x27,#32
+	ror	x28,x28,#32
+	ror	x30,x30,#32
+#endif
+
+.Loop_outer:
+	mov	w5,w22			// unpack key block
+	lsr	x6,x22,#32
+	mov	w7,w23
+	lsr	x8,x23,#32
+	mov	w9,w24
+	lsr	x10,x24,#32
+	mov	w11,w25
+	lsr	x12,x25,#32
+	mov	w13,w26
+	lsr	x14,x26,#32
+	mov	w15,w27
+	lsr	x16,x27,#32
+	mov	w17,w28
+	lsr	x19,x28,#32
+	mov	w20,w30
+	lsr	x21,x30,#32
+
+	mov	x4,#10
+	subs	x2,x2,#64
+.Loop:
+	sub	x4,x4,#1
+	add	w5,w5,w9
+	add	w6,w6,w10
+	add	w7,w7,w11
+	add	w8,w8,w12
+	eor	w17,w17,w5
+	eor	w19,w19,w6
+	eor	w20,w20,w7
+	eor	w21,w21,w8
+	ror	w17,w17,#16
+	ror	w19,w19,#16
+	ror	w20,w20,#16
+	ror	w21,w21,#16
+	add	w13,w13,w17
+	add	w14,w14,w19
+	add	w15,w15,w20
+	add	w16,w16,w21
+	eor	w9,w9,w13
+	eor	w10,w10,w14
+	eor	w11,w11,w15
+	eor	w12,w12,w16
+	ror	w9,w9,#20
+	ror	w10,w10,#20
+	ror	w11,w11,#20
+	ror	w12,w12,#20
+	add	w5,w5,w9
+	add	w6,w6,w10
+	add	w7,w7,w11
+	add	w8,w8,w12
+	eor	w17,w17,w5
+	eor	w19,w19,w6
+	eor	w20,w20,w7
+	eor	w21,w21,w8
+	ror	w17,w17,#24
+	ror	w19,w19,#24
+	ror	w20,w20,#24
+	ror	w21,w21,#24
+	add	w13,w13,w17
+	add	w14,w14,w19
+	add	w15,w15,w20
+	add	w16,w16,w21
+	eor	w9,w9,w13
+	eor	w10,w10,w14
+	eor	w11,w11,w15
+	eor	w12,w12,w16
+	ror	w9,w9,#25
+	ror	w10,w10,#25
+	ror	w11,w11,#25
+	ror	w12,w12,#25
+	add	w5,w5,w10
+	add	w6,w6,w11
+	add	w7,w7,w12
+	add	w8,w8,w9
+	eor	w21,w21,w5
+	eor	w17,w17,w6
+	eor	w19,w19,w7
+	eor	w20,w20,w8
+	ror	w21,w21,#16
+	ror	w17,w17,#16
+	ror	w19,w19,#16
+	ror	w20,w20,#16
+	add	w15,w15,w21
+	add	w16,w16,w17
+	add	w13,w13,w19
+	add	w14,w14,w20
+	eor	w10,w10,w15
+	eor	w11,w11,w16
+	eor	w12,w12,w13
+	eor	w9,w9,w14
+	ror	w10,w10,#20
+	ror	w11,w11,#20
+	ror	w12,w12,#20
+	ror	w9,w9,#20
+	add	w5,w5,w10
+	add	w6,w6,w11
+	add	w7,w7,w12
+	add	w8,w8,w9
+	eor	w21,w21,w5
+	eor	w17,w17,w6
+	eor	w19,w19,w7
+	eor	w20,w20,w8
+	ror	w21,w21,#24
+	ror	w17,w17,#24
+	ror	w19,w19,#24
+	ror	w20,w20,#24
+	add	w15,w15,w21
+	add	w16,w16,w17
+	add	w13,w13,w19
+	add	w14,w14,w20
+	eor	w10,w10,w15
+	eor	w11,w11,w16
+	eor	w12,w12,w13
+	eor	w9,w9,w14
+	ror	w10,w10,#25
+	ror	w11,w11,#25
+	ror	w12,w12,#25
+	ror	w9,w9,#25
+	cbnz	x4,.Loop
+
+	add	w5,w5,w22		// accumulate key block
+	add	x6,x6,x22,lsr#32
+	add	w7,w7,w23
+	add	x8,x8,x23,lsr#32
+	add	w9,w9,w24
+	add	x10,x10,x24,lsr#32
+	add	w11,w11,w25
+	add	x12,x12,x25,lsr#32
+	add	w13,w13,w26
+	add	x14,x14,x26,lsr#32
+	add	w15,w15,w27
+	add	x16,x16,x27,lsr#32
+	add	w17,w17,w28
+	add	x19,x19,x28,lsr#32
+	add	w20,w20,w30
+	add	x21,x21,x30,lsr#32
+
+	b.lo	.Ltail
+
+	add	x5,x5,x6,lsl#32	// pack
+	add	x7,x7,x8,lsl#32
+	ldp	x6,x8,[x1,#0]		// load input
+	add	x9,x9,x10,lsl#32
+	add	x11,x11,x12,lsl#32
+	ldp	x10,x12,[x1,#16]
+	add	x13,x13,x14,lsl#32
+	add	x15,x15,x16,lsl#32
+	ldp	x14,x16,[x1,#32]
+	add	x17,x17,x19,lsl#32
+	add	x20,x20,x21,lsl#32
+	ldp	x19,x21,[x1,#48]
+	add	x1,x1,#64
+#ifdef	__ARMEB__
+	rev	x5,x5
+	rev	x7,x7
+	rev	x9,x9
+	rev	x11,x11
+	rev	x13,x13
+	rev	x15,x15
+	rev	x17,x17
+	rev	x20,x20
+#endif
+	eor	x5,x5,x6
+	eor	x7,x7,x8
+	eor	x9,x9,x10
+	eor	x11,x11,x12
+	eor	x13,x13,x14
+	eor	x15,x15,x16
+	eor	x17,x17,x19
+	eor	x20,x20,x21
+
+	stp	x5,x7,[x0,#0]		// store output
+	add	x28,x28,#1			// increment counter
+	stp	x9,x11,[x0,#16]
+	stp	x13,x15,[x0,#32]
+	stp	x17,x20,[x0,#48]
+	add	x0,x0,#64
+
+	b.hi	.Loop_outer
+
+	ldp	x19,x20,[x29,#16]
+	add	sp,sp,#64
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldp	x29,x30,[sp],#96
+.Labort:
+	ret
+
+.align	4
+.Ltail:
+	add	x2,x2,#64
+.Less_than_64:
+	sub	x0,x0,#1
+	add	x1,x1,x2
+	add	x0,x0,x2
+	add	x4,sp,x2
+	neg	x2,x2
+
+	add	x5,x5,x6,lsl#32	// pack
+	add	x7,x7,x8,lsl#32
+	add	x9,x9,x10,lsl#32
+	add	x11,x11,x12,lsl#32
+	add	x13,x13,x14,lsl#32
+	add	x15,x15,x16,lsl#32
+	add	x17,x17,x19,lsl#32
+	add	x20,x20,x21,lsl#32
+#ifdef	__ARMEB__
+	rev	x5,x5
+	rev	x7,x7
+	rev	x9,x9
+	rev	x11,x11
+	rev	x13,x13
+	rev	x15,x15
+	rev	x17,x17
+	rev	x20,x20
+#endif
+	stp	x5,x7,[sp,#0]
+	stp	x9,x11,[sp,#16]
+	stp	x13,x15,[sp,#32]
+	stp	x17,x20,[sp,#48]
+
+.Loop_tail:
+	ldrb	w10,[x1,x2]
+	ldrb	w11,[x4,x2]
+	add	x2,x2,#1
+	eor	w10,w10,w11
+	strb	w10,[x0,x2]
+	cbnz	x2,.Loop_tail
+
+	stp	xzr,xzr,[sp,#0]
+	stp	xzr,xzr,[sp,#16]
+	stp	xzr,xzr,[sp,#32]
+	stp	xzr,xzr,[sp,#48]
+
+	ldp	x19,x20,[x29,#16]
+	add	sp,sp,#64
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldp	x29,x30,[sp],#96
+	ret
+ENDPROC(chacha20_arm)
+
+.align	5
+ENTRY(chacha20_neon)
+	cbz	x2,.Labort_neon
+	cmp	x2,#192
+	b.lo	.Lshort
+
+	stp	x29,x30,[sp,#-96]!
+	add	x29,sp,#0
+
+	adr	x5,.Lsigma
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+	cmp	x2,#512
+	b.hs	.L512_or_more_neon
+
+	sub	sp,sp,#64
+
+	ldp	x22,x23,[x5]		// load sigma
+	ld1	{v24.4s},[x5],#16
+	ldp	x24,x25,[x3]		// load key
+	ldp	x26,x27,[x3,#16]
+	ld1	{v25.4s,v26.4s},[x3]
+	ldp	x28,x30,[x4]		// load counter
+	ld1	{v27.4s},[x4]
+	ld1	{v31.4s},[x5]
+#ifdef	__ARMEB__
+	rev64	v24.4s,v24.4s
+	ror	x24,x24,#32
+	ror	x25,x25,#32
+	ror	x26,x26,#32
+	ror	x27,x27,#32
+	ror	x28,x28,#32
+	ror	x30,x30,#32
+#endif
+	add	v27.4s,v27.4s,v31.4s		// += 1
+	add	v28.4s,v27.4s,v31.4s
+	add	v29.4s,v28.4s,v31.4s
+	shl	v31.4s,v31.4s,#2			// 1 -> 4
+
+.Loop_outer_neon:
+	mov	w5,w22			// unpack key block
+	lsr	x6,x22,#32
+	mov	v0.16b,v24.16b
+	mov	w7,w23
+	lsr	x8,x23,#32
+	mov	v4.16b,v24.16b
+	mov	w9,w24
+	lsr	x10,x24,#32
+	mov	v16.16b,v24.16b
+	mov	w11,w25
+	mov	v1.16b,v25.16b
+	lsr	x12,x25,#32
+	mov	v5.16b,v25.16b
+	mov	w13,w26
+	mov	v17.16b,v25.16b
+	lsr	x14,x26,#32
+	mov	v3.16b,v27.16b
+	mov	w15,w27
+	mov	v7.16b,v28.16b
+	lsr	x16,x27,#32
+	mov	v19.16b,v29.16b
+	mov	w17,w28
+	mov	v2.16b,v26.16b
+	lsr	x19,x28,#32
+	mov	v6.16b,v26.16b
+	mov	w20,w30
+	mov	v18.16b,v26.16b
+	lsr	x21,x30,#32
+
+	mov	x4,#10
+	subs	x2,x2,#256
+.Loop_neon:
+	sub	x4,x4,#1
+	add	v0.4s,v0.4s,v1.4s
+	add	w5,w5,w9
+	add	v4.4s,v4.4s,v5.4s
+	add	w6,w6,w10
+	add	v16.4s,v16.4s,v17.4s
+	add	w7,w7,w11
+	eor	v3.16b,v3.16b,v0.16b
+	add	w8,w8,w12
+	eor	v7.16b,v7.16b,v4.16b
+	eor	w17,w17,w5
+	eor	v19.16b,v19.16b,v16.16b
+	eor	w19,w19,w6
+	rev32	v3.8h,v3.8h
+	eor	w20,w20,w7
+	rev32	v7.8h,v7.8h
+	eor	w21,w21,w8
+	rev32	v19.8h,v19.8h
+	ror	w17,w17,#16
+	add	v2.4s,v2.4s,v3.4s
+	ror	w19,w19,#16
+	add	v6.4s,v6.4s,v7.4s
+	ror	w20,w20,#16
+	add	v18.4s,v18.4s,v19.4s
+	ror	w21,w21,#16
+	eor	v20.16b,v1.16b,v2.16b
+	add	w13,w13,w17
+	eor	v21.16b,v5.16b,v6.16b
+	add	w14,w14,w19
+	eor	v22.16b,v17.16b,v18.16b
+	add	w15,w15,w20
+	ushr	v1.4s,v20.4s,#20
+	add	w16,w16,w21
+	ushr	v5.4s,v21.4s,#20
+	eor	w9,w9,w13
+	ushr	v17.4s,v22.4s,#20
+	eor	w10,w10,w14
+	sli	v1.4s,v20.4s,#12
+	eor	w11,w11,w15
+	sli	v5.4s,v21.4s,#12
+	eor	w12,w12,w16
+	sli	v17.4s,v22.4s,#12
+	ror	w9,w9,#20
+	add	v0.4s,v0.4s,v1.4s
+	ror	w10,w10,#20
+	add	v4.4s,v4.4s,v5.4s
+	ror	w11,w11,#20
+	add	v16.4s,v16.4s,v17.4s
+	ror	w12,w12,#20
+	eor	v20.16b,v3.16b,v0.16b
+	add	w5,w5,w9
+	eor	v21.16b,v7.16b,v4.16b
+	add	w6,w6,w10
+	eor	v22.16b,v19.16b,v16.16b
+	add	w7,w7,w11
+	ushr	v3.4s,v20.4s,#24
+	add	w8,w8,w12
+	ushr	v7.4s,v21.4s,#24
+	eor	w17,w17,w5
+	ushr	v19.4s,v22.4s,#24
+	eor	w19,w19,w6
+	sli	v3.4s,v20.4s,#8
+	eor	w20,w20,w7
+	sli	v7.4s,v21.4s,#8
+	eor	w21,w21,w8
+	sli	v19.4s,v22.4s,#8
+	ror	w17,w17,#24
+	add	v2.4s,v2.4s,v3.4s
+	ror	w19,w19,#24
+	add	v6.4s,v6.4s,v7.4s
+	ror	w20,w20,#24
+	add	v18.4s,v18.4s,v19.4s
+	ror	w21,w21,#24
+	eor	v20.16b,v1.16b,v2.16b
+	add	w13,w13,w17
+	eor	v21.16b,v5.16b,v6.16b
+	add	w14,w14,w19
+	eor	v22.16b,v17.16b,v18.16b
+	add	w15,w15,w20
+	ushr	v1.4s,v20.4s,#25
+	add	w16,w16,w21
+	ushr	v5.4s,v21.4s,#25
+	eor	w9,w9,w13
+	ushr	v17.4s,v22.4s,#25
+	eor	w10,w10,w14
+	sli	v1.4s,v20.4s,#7
+	eor	w11,w11,w15
+	sli	v5.4s,v21.4s,#7
+	eor	w12,w12,w16
+	sli	v17.4s,v22.4s,#7
+	ror	w9,w9,#25
+	ext	v2.16b,v2.16b,v2.16b,#8
+	ror	w10,w10,#25
+	ext	v6.16b,v6.16b,v6.16b,#8
+	ror	w11,w11,#25
+	ext	v18.16b,v18.16b,v18.16b,#8
+	ror	w12,w12,#25
+	ext	v3.16b,v3.16b,v3.16b,#12
+	ext	v7.16b,v7.16b,v7.16b,#12
+	ext	v19.16b,v19.16b,v19.16b,#12
+	ext	v1.16b,v1.16b,v1.16b,#4
+	ext	v5.16b,v5.16b,v5.16b,#4
+	ext	v17.16b,v17.16b,v17.16b,#4
+	add	v0.4s,v0.4s,v1.4s
+	add	w5,w5,w10
+	add	v4.4s,v4.4s,v5.4s
+	add	w6,w6,w11
+	add	v16.4s,v16.4s,v17.4s
+	add	w7,w7,w12
+	eor	v3.16b,v3.16b,v0.16b
+	add	w8,w8,w9
+	eor	v7.16b,v7.16b,v4.16b
+	eor	w21,w21,w5
+	eor	v19.16b,v19.16b,v16.16b
+	eor	w17,w17,w6
+	rev32	v3.8h,v3.8h
+	eor	w19,w19,w7
+	rev32	v7.8h,v7.8h
+	eor	w20,w20,w8
+	rev32	v19.8h,v19.8h
+	ror	w21,w21,#16
+	add	v2.4s,v2.4s,v3.4s
+	ror	w17,w17,#16
+	add	v6.4s,v6.4s,v7.4s
+	ror	w19,w19,#16
+	add	v18.4s,v18.4s,v19.4s
+	ror	w20,w20,#16
+	eor	v20.16b,v1.16b,v2.16b
+	add	w15,w15,w21
+	eor	v21.16b,v5.16b,v6.16b
+	add	w16,w16,w17
+	eor	v22.16b,v17.16b,v18.16b
+	add	w13,w13,w19
+	ushr	v1.4s,v20.4s,#20
+	add	w14,w14,w20
+	ushr	v5.4s,v21.4s,#20
+	eor	w10,w10,w15
+	ushr	v17.4s,v22.4s,#20
+	eor	w11,w11,w16
+	sli	v1.4s,v20.4s,#12
+	eor	w12,w12,w13
+	sli	v5.4s,v21.4s,#12
+	eor	w9,w9,w14
+	sli	v17.4s,v22.4s,#12
+	ror	w10,w10,#20
+	add	v0.4s,v0.4s,v1.4s
+	ror	w11,w11,#20
+	add	v4.4s,v4.4s,v5.4s
+	ror	w12,w12,#20
+	add	v16.4s,v16.4s,v17.4s
+	ror	w9,w9,#20
+	eor	v20.16b,v3.16b,v0.16b
+	add	w5,w5,w10
+	eor	v21.16b,v7.16b,v4.16b
+	add	w6,w6,w11
+	eor	v22.16b,v19.16b,v16.16b
+	add	w7,w7,w12
+	ushr	v3.4s,v20.4s,#24
+	add	w8,w8,w9
+	ushr	v7.4s,v21.4s,#24
+	eor	w21,w21,w5
+	ushr	v19.4s,v22.4s,#24
+	eor	w17,w17,w6
+	sli	v3.4s,v20.4s,#8
+	eor	w19,w19,w7
+	sli	v7.4s,v21.4s,#8
+	eor	w20,w20,w8
+	sli	v19.4s,v22.4s,#8
+	ror	w21,w21,#24
+	add	v2.4s,v2.4s,v3.4s
+	ror	w17,w17,#24
+	add	v6.4s,v6.4s,v7.4s
+	ror	w19,w19,#24
+	add	v18.4s,v18.4s,v19.4s
+	ror	w20,w20,#24
+	eor	v20.16b,v1.16b,v2.16b
+	add	w15,w15,w21
+	eor	v21.16b,v5.16b,v6.16b
+	add	w16,w16,w17
+	eor	v22.16b,v17.16b,v18.16b
+	add	w13,w13,w19
+	ushr	v1.4s,v20.4s,#25
+	add	w14,w14,w20
+	ushr	v5.4s,v21.4s,#25
+	eor	w10,w10,w15
+	ushr	v17.4s,v22.4s,#25
+	eor	w11,w11,w16
+	sli	v1.4s,v20.4s,#7
+	eor	w12,w12,w13
+	sli	v5.4s,v21.4s,#7
+	eor	w9,w9,w14
+	sli	v17.4s,v22.4s,#7
+	ror	w10,w10,#25
+	ext	v2.16b,v2.16b,v2.16b,#8
+	ror	w11,w11,#25
+	ext	v6.16b,v6.16b,v6.16b,#8
+	ror	w12,w12,#25
+	ext	v18.16b,v18.16b,v18.16b,#8
+	ror	w9,w9,#25
+	ext	v3.16b,v3.16b,v3.16b,#4
+	ext	v7.16b,v7.16b,v7.16b,#4
+	ext	v19.16b,v19.16b,v19.16b,#4
+	ext	v1.16b,v1.16b,v1.16b,#12
+	ext	v5.16b,v5.16b,v5.16b,#12
+	ext	v17.16b,v17.16b,v17.16b,#12
+	cbnz	x4,.Loop_neon
+
+	add	w5,w5,w22		// accumulate key block
+	add	v0.4s,v0.4s,v24.4s
+	add	x6,x6,x22,lsr#32
+	add	v4.4s,v4.4s,v24.4s
+	add	w7,w7,w23
+	add	v16.4s,v16.4s,v24.4s
+	add	x8,x8,x23,lsr#32
+	add	v2.4s,v2.4s,v26.4s
+	add	w9,w9,w24
+	add	v6.4s,v6.4s,v26.4s
+	add	x10,x10,x24,lsr#32
+	add	v18.4s,v18.4s,v26.4s
+	add	w11,w11,w25
+	add	v3.4s,v3.4s,v27.4s
+	add	x12,x12,x25,lsr#32
+	add	w13,w13,w26
+	add	v7.4s,v7.4s,v28.4s
+	add	x14,x14,x26,lsr#32
+	add	w15,w15,w27
+	add	v19.4s,v19.4s,v29.4s
+	add	x16,x16,x27,lsr#32
+	add	w17,w17,w28
+	add	v1.4s,v1.4s,v25.4s
+	add	x19,x19,x28,lsr#32
+	add	w20,w20,w30
+	add	v5.4s,v5.4s,v25.4s
+	add	x21,x21,x30,lsr#32
+	add	v17.4s,v17.4s,v25.4s
+
+	b.lo	.Ltail_neon
+
+	add	x5,x5,x6,lsl#32	// pack
+	add	x7,x7,x8,lsl#32
+	ldp	x6,x8,[x1,#0]		// load input
+	add	x9,x9,x10,lsl#32
+	add	x11,x11,x12,lsl#32
+	ldp	x10,x12,[x1,#16]
+	add	x13,x13,x14,lsl#32
+	add	x15,x15,x16,lsl#32
+	ldp	x14,x16,[x1,#32]
+	add	x17,x17,x19,lsl#32
+	add	x20,x20,x21,lsl#32
+	ldp	x19,x21,[x1,#48]
+	add	x1,x1,#64
+#ifdef	__ARMEB__
+	rev	x5,x5
+	rev	x7,x7
+	rev	x9,x9
+	rev	x11,x11
+	rev	x13,x13
+	rev	x15,x15
+	rev	x17,x17
+	rev	x20,x20
+#endif
+	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+	eor	x5,x5,x6
+	eor	x7,x7,x8
+	eor	x9,x9,x10
+	eor	x11,x11,x12
+	eor	x13,x13,x14
+	eor	v0.16b,v0.16b,v20.16b
+	eor	x15,x15,x16
+	eor	v1.16b,v1.16b,v21.16b
+	eor	x17,x17,x19
+	eor	v2.16b,v2.16b,v22.16b
+	eor	x20,x20,x21
+	eor	v3.16b,v3.16b,v23.16b
+	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+
+	stp	x5,x7,[x0,#0]		// store output
+	add	x28,x28,#4			// increment counter
+	stp	x9,x11,[x0,#16]
+	add	v27.4s,v27.4s,v31.4s		// += 4
+	stp	x13,x15,[x0,#32]
+	add	v28.4s,v28.4s,v31.4s
+	stp	x17,x20,[x0,#48]
+	add	v29.4s,v29.4s,v31.4s
+	add	x0,x0,#64
+
+	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
+	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
+
+	eor	v4.16b,v4.16b,v20.16b
+	eor	v5.16b,v5.16b,v21.16b
+	eor	v6.16b,v6.16b,v22.16b
+	eor	v7.16b,v7.16b,v23.16b
+	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
+
+	eor	v16.16b,v16.16b,v0.16b
+	eor	v17.16b,v17.16b,v1.16b
+	eor	v18.16b,v18.16b,v2.16b
+	eor	v19.16b,v19.16b,v3.16b
+	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
+
+	b.hi	.Loop_outer_neon
+
+	ldp	x19,x20,[x29,#16]
+	add	sp,sp,#64
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldp	x29,x30,[sp],#96
+	ret
+
+.Ltail_neon:
+	add	x2,x2,#256
+	cmp	x2,#64
+	b.lo	.Less_than_64
+
+	add	x5,x5,x6,lsl#32	// pack
+	add	x7,x7,x8,lsl#32
+	ldp	x6,x8,[x1,#0]		// load input
+	add	x9,x9,x10,lsl#32
+	add	x11,x11,x12,lsl#32
+	ldp	x10,x12,[x1,#16]
+	add	x13,x13,x14,lsl#32
+	add	x15,x15,x16,lsl#32
+	ldp	x14,x16,[x1,#32]
+	add	x17,x17,x19,lsl#32
+	add	x20,x20,x21,lsl#32
+	ldp	x19,x21,[x1,#48]
+	add	x1,x1,#64
+#ifdef	__ARMEB__
+	rev	x5,x5
+	rev	x7,x7
+	rev	x9,x9
+	rev	x11,x11
+	rev	x13,x13
+	rev	x15,x15
+	rev	x17,x17
+	rev	x20,x20
+#endif
+	eor	x5,x5,x6
+	eor	x7,x7,x8
+	eor	x9,x9,x10
+	eor	x11,x11,x12
+	eor	x13,x13,x14
+	eor	x15,x15,x16
+	eor	x17,x17,x19
+	eor	x20,x20,x21
+
+	stp	x5,x7,[x0,#0]		// store output
+	add	x28,x28,#4			// increment counter
+	stp	x9,x11,[x0,#16]
+	stp	x13,x15,[x0,#32]
+	stp	x17,x20,[x0,#48]
+	add	x0,x0,#64
+	b.eq	.Ldone_neon
+	sub	x2,x2,#64
+	cmp	x2,#64
+	b.lo	.Less_than_128
+
+	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+	eor	v0.16b,v0.16b,v20.16b
+	eor	v1.16b,v1.16b,v21.16b
+	eor	v2.16b,v2.16b,v22.16b
+	eor	v3.16b,v3.16b,v23.16b
+	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
+	b.eq	.Ldone_neon
+	sub	x2,x2,#64
+	cmp	x2,#64
+	b.lo	.Less_than_192
+
+	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+	eor	v4.16b,v4.16b,v20.16b
+	eor	v5.16b,v5.16b,v21.16b
+	eor	v6.16b,v6.16b,v22.16b
+	eor	v7.16b,v7.16b,v23.16b
+	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
+	b.eq	.Ldone_neon
+	sub	x2,x2,#64
+
+	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[sp]
+	b	.Last_neon
+
+.Less_than_128:
+	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[sp]
+	b	.Last_neon
+.Less_than_192:
+	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[sp]
+	b	.Last_neon
+
+.align	4
+.Last_neon:
+	sub	x0,x0,#1
+	add	x1,x1,x2
+	add	x0,x0,x2
+	add	x4,sp,x2
+	neg	x2,x2
+
+.Loop_tail_neon:
+	ldrb	w10,[x1,x2]
+	ldrb	w11,[x4,x2]
+	add	x2,x2,#1
+	eor	w10,w10,w11
+	strb	w10,[x0,x2]
+	cbnz	x2,.Loop_tail_neon
+
+	stp	xzr,xzr,[sp,#0]
+	stp	xzr,xzr,[sp,#16]
+	stp	xzr,xzr,[sp,#32]
+	stp	xzr,xzr,[sp,#48]
+
+.Ldone_neon:
+	ldp	x19,x20,[x29,#16]
+	add	sp,sp,#64
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldp	x29,x30,[sp],#96
+	ret
+
+.L512_or_more_neon:
+	sub	sp,sp,#128+64
+
+	ldp	x22,x23,[x5]		// load sigma
+	ld1	{v24.4s},[x5],#16
+	ldp	x24,x25,[x3]		// load key
+	ldp	x26,x27,[x3,#16]
+	ld1	{v25.4s,v26.4s},[x3]
+	ldp	x28,x30,[x4]		// load counter
+	ld1	{v27.4s},[x4]
+	ld1	{v31.4s},[x5]
+#ifdef	__ARMEB__
+	rev64	v24.4s,v24.4s
+	ror	x24,x24,#32
+	ror	x25,x25,#32
+	ror	x26,x26,#32
+	ror	x27,x27,#32
+	ror	x28,x28,#32
+	ror	x30,x30,#32
+#endif
+	add	v27.4s,v27.4s,v31.4s		// += 1
+	stp	q24,q25,[sp,#0]		// off-load key block, invariant part
+	add	v27.4s,v27.4s,v31.4s		// not typo
+	str	q26,[sp,#32]
+	add	v28.4s,v27.4s,v31.4s
+	add	v29.4s,v28.4s,v31.4s
+	add	v30.4s,v29.4s,v31.4s
+	shl	v31.4s,v31.4s,#2			// 1 -> 4
+
+	stp	d8,d9,[sp,#128+0]		// meet ABI requirements
+	stp	d10,d11,[sp,#128+16]
+	stp	d12,d13,[sp,#128+32]
+	stp	d14,d15,[sp,#128+48]
+
+	sub	x2,x2,#512			// not typo
+
+.Loop_outer_512_neon:
+	mov	v0.16b,v24.16b
+	mov	v4.16b,v24.16b
+	mov	v8.16b,v24.16b
+	mov	v12.16b,v24.16b
+	mov	v16.16b,v24.16b
+	mov	v20.16b,v24.16b
+	mov	v1.16b,v25.16b
+	mov	w5,w22			// unpack key block
+	mov	v5.16b,v25.16b
+	lsr	x6,x22,#32
+	mov	v9.16b,v25.16b
+	mov	w7,w23
+	mov	v13.16b,v25.16b
+	lsr	x8,x23,#32
+	mov	v17.16b,v25.16b
+	mov	w9,w24
+	mov	v21.16b,v25.16b
+	lsr	x10,x24,#32
+	mov	v3.16b,v27.16b
+	mov	w11,w25
+	mov	v7.16b,v28.16b
+	lsr	x12,x25,#32
+	mov	v11.16b,v29.16b
+	mov	w13,w26
+	mov	v15.16b,v30.16b
+	lsr	x14,x26,#32
+	mov	v2.16b,v26.16b
+	mov	w15,w27
+	mov	v6.16b,v26.16b
+	lsr	x16,x27,#32
+	add	v19.4s,v3.4s,v31.4s			// +4
+	mov	w17,w28
+	add	v23.4s,v7.4s,v31.4s			// +4
+	lsr	x19,x28,#32
+	mov	v10.16b,v26.16b
+	mov	w20,w30
+	mov	v14.16b,v26.16b
+	lsr	x21,x30,#32
+	mov	v18.16b,v26.16b
+	stp	q27,q28,[sp,#48]		// off-load key block, variable part
+	mov	v22.16b,v26.16b
+	str	q29,[sp,#80]
+
+	mov	x4,#5
+	subs	x2,x2,#512
+.Loop_upper_neon:
+	sub	x4,x4,#1
+	add	v0.4s,v0.4s,v1.4s
+	add	w5,w5,w9
+	add	v4.4s,v4.4s,v5.4s
+	add	w6,w6,w10
+	add	v8.4s,v8.4s,v9.4s
+	add	w7,w7,w11
+	add	v12.4s,v12.4s,v13.4s
+	add	w8,w8,w12
+	add	v16.4s,v16.4s,v17.4s
+	eor	w17,w17,w5
+	add	v20.4s,v20.4s,v21.4s
+	eor	w19,w19,w6
+	eor	v3.16b,v3.16b,v0.16b
+	eor	w20,w20,w7
+	eor	v7.16b,v7.16b,v4.16b
+	eor	w21,w21,w8
+	eor	v11.16b,v11.16b,v8.16b
+	ror	w17,w17,#16
+	eor	v15.16b,v15.16b,v12.16b
+	ror	w19,w19,#16
+	eor	v19.16b,v19.16b,v16.16b
+	ror	w20,w20,#16
+	eor	v23.16b,v23.16b,v20.16b
+	ror	w21,w21,#16
+	rev32	v3.8h,v3.8h
+	add	w13,w13,w17
+	rev32	v7.8h,v7.8h
+	add	w14,w14,w19
+	rev32	v11.8h,v11.8h
+	add	w15,w15,w20
+	rev32	v15.8h,v15.8h
+	add	w16,w16,w21
+	rev32	v19.8h,v19.8h
+	eor	w9,w9,w13
+	rev32	v23.8h,v23.8h
+	eor	w10,w10,w14
+	add	v2.4s,v2.4s,v3.4s
+	eor	w11,w11,w15
+	add	v6.4s,v6.4s,v7.4s
+	eor	w12,w12,w16
+	add	v10.4s,v10.4s,v11.4s
+	ror	w9,w9,#20
+	add	v14.4s,v14.4s,v15.4s
+	ror	w10,w10,#20
+	add	v18.4s,v18.4s,v19.4s
+	ror	w11,w11,#20
+	add	v22.4s,v22.4s,v23.4s
+	ror	w12,w12,#20
+	eor	v24.16b,v1.16b,v2.16b
+	add	w5,w5,w9
+	eor	v25.16b,v5.16b,v6.16b
+	add	w6,w6,w10
+	eor	v26.16b,v9.16b,v10.16b
+	add	w7,w7,w11
+	eor	v27.16b,v13.16b,v14.16b
+	add	w8,w8,w12
+	eor	v28.16b,v17.16b,v18.16b
+	eor	w17,w17,w5
+	eor	v29.16b,v21.16b,v22.16b
+	eor	w19,w19,w6
+	ushr	v1.4s,v24.4s,#20
+	eor	w20,w20,w7
+	ushr	v5.4s,v25.4s,#20
+	eor	w21,w21,w8
+	ushr	v9.4s,v26.4s,#20
+	ror	w17,w17,#24
+	ushr	v13.4s,v27.4s,#20
+	ror	w19,w19,#24
+	ushr	v17.4s,v28.4s,#20
+	ror	w20,w20,#24
+	ushr	v21.4s,v29.4s,#20
+	ror	w21,w21,#24
+	sli	v1.4s,v24.4s,#12
+	add	w13,w13,w17
+	sli	v5.4s,v25.4s,#12
+	add	w14,w14,w19
+	sli	v9.4s,v26.4s,#12
+	add	w15,w15,w20
+	sli	v13.4s,v27.4s,#12
+	add	w16,w16,w21
+	sli	v17.4s,v28.4s,#12
+	eor	w9,w9,w13
+	sli	v21.4s,v29.4s,#12
+	eor	w10,w10,w14
+	add	v0.4s,v0.4s,v1.4s
+	eor	w11,w11,w15
+	add	v4.4s,v4.4s,v5.4s
+	eor	w12,w12,w16
+	add	v8.4s,v8.4s,v9.4s
+	ror	w9,w9,#25
+	add	v12.4s,v12.4s,v13.4s
+	ror	w10,w10,#25
+	add	v16.4s,v16.4s,v17.4s
+	ror	w11,w11,#25
+	add	v20.4s,v20.4s,v21.4s
+	ror	w12,w12,#25
+	eor	v24.16b,v3.16b,v0.16b
+	add	w5,w5,w10
+	eor	v25.16b,v7.16b,v4.16b
+	add	w6,w6,w11
+	eor	v26.16b,v11.16b,v8.16b
+	add	w7,w7,w12
+	eor	v27.16b,v15.16b,v12.16b
+	add	w8,w8,w9
+	eor	v28.16b,v19.16b,v16.16b
+	eor	w21,w21,w5
+	eor	v29.16b,v23.16b,v20.16b
+	eor	w17,w17,w6
+	ushr	v3.4s,v24.4s,#24
+	eor	w19,w19,w7
+	ushr	v7.4s,v25.4s,#24
+	eor	w20,w20,w8
+	ushr	v11.4s,v26.4s,#24
+	ror	w21,w21,#16
+	ushr	v15.4s,v27.4s,#24
+	ror	w17,w17,#16
+	ushr	v19.4s,v28.4s,#24
+	ror	w19,w19,#16
+	ushr	v23.4s,v29.4s,#24
+	ror	w20,w20,#16
+	sli	v3.4s,v24.4s,#8
+	add	w15,w15,w21
+	sli	v7.4s,v25.4s,#8
+	add	w16,w16,w17
+	sli	v11.4s,v26.4s,#8
+	add	w13,w13,w19
+	sli	v15.4s,v27.4s,#8
+	add	w14,w14,w20
+	sli	v19.4s,v28.4s,#8
+	eor	w10,w10,w15
+	sli	v23.4s,v29.4s,#8
+	eor	w11,w11,w16
+	add	v2.4s,v2.4s,v3.4s
+	eor	w12,w12,w13
+	add	v6.4s,v6.4s,v7.4s
+	eor	w9,w9,w14
+	add	v10.4s,v10.4s,v11.4s
+	ror	w10,w10,#20
+	add	v14.4s,v14.4s,v15.4s
+	ror	w11,w11,#20
+	add	v18.4s,v18.4s,v19.4s
+	ror	w12,w12,#20
+	add	v22.4s,v22.4s,v23.4s
+	ror	w9,w9,#20
+	eor	v24.16b,v1.16b,v2.16b
+	add	w5,w5,w10
+	eor	v25.16b,v5.16b,v6.16b
+	add	w6,w6,w11
+	eor	v26.16b,v9.16b,v10.16b
+	add	w7,w7,w12
+	eor	v27.16b,v13.16b,v14.16b
+	add	w8,w8,w9
+	eor	v28.16b,v17.16b,v18.16b
+	eor	w21,w21,w5
+	eor	v29.16b,v21.16b,v22.16b
+	eor	w17,w17,w6
+	ushr	v1.4s,v24.4s,#25
+	eor	w19,w19,w7
+	ushr	v5.4s,v25.4s,#25
+	eor	w20,w20,w8
+	ushr	v9.4s,v26.4s,#25
+	ror	w21,w21,#24
+	ushr	v13.4s,v27.4s,#25
+	ror	w17,w17,#24
+	ushr	v17.4s,v28.4s,#25
+	ror	w19,w19,#24
+	ushr	v21.4s,v29.4s,#25
+	ror	w20,w20,#24
+	sli	v1.4s,v24.4s,#7
+	add	w15,w15,w21
+	sli	v5.4s,v25.4s,#7
+	add	w16,w16,w17
+	sli	v9.4s,v26.4s,#7
+	add	w13,w13,w19
+	sli	v13.4s,v27.4s,#7
+	add	w14,w14,w20
+	sli	v17.4s,v28.4s,#7
+	eor	w10,w10,w15
+	sli	v21.4s,v29.4s,#7
+	eor	w11,w11,w16
+	ext	v2.16b,v2.16b,v2.16b,#8
+	eor	w12,w12,w13
+	ext	v6.16b,v6.16b,v6.16b,#8
+	eor	w9,w9,w14
+	ext	v10.16b,v10.16b,v10.16b,#8
+	ror	w10,w10,#25
+	ext	v14.16b,v14.16b,v14.16b,#8
+	ror	w11,w11,#25
+	ext	v18.16b,v18.16b,v18.16b,#8
+	ror	w12,w12,#25
+	ext	v22.16b,v22.16b,v22.16b,#8
+	ror	w9,w9,#25
+	ext	v3.16b,v3.16b,v3.16b,#12
+	ext	v7.16b,v7.16b,v7.16b,#12
+	ext	v11.16b,v11.16b,v11.16b,#12
+	ext	v15.16b,v15.16b,v15.16b,#12
+	ext	v19.16b,v19.16b,v19.16b,#12
+	ext	v23.16b,v23.16b,v23.16b,#12
+	ext	v1.16b,v1.16b,v1.16b,#4
+	ext	v5.16b,v5.16b,v5.16b,#4
+	ext	v9.16b,v9.16b,v9.16b,#4
+	ext	v13.16b,v13.16b,v13.16b,#4
+	ext	v17.16b,v17.16b,v17.16b,#4
+	ext	v21.16b,v21.16b,v21.16b,#4
+	add	v0.4s,v0.4s,v1.4s
+	add	w5,w5,w9
+	add	v4.4s,v4.4s,v5.4s
+	add	w6,w6,w10
+	add	v8.4s,v8.4s,v9.4s
+	add	w7,w7,w11
+	add	v12.4s,v12.4s,v13.4s
+	add	w8,w8,w12
+	add	v16.4s,v16.4s,v17.4s
+	eor	w17,w17,w5
+	add	v20.4s,v20.4s,v21.4s
+	eor	w19,w19,w6
+	eor	v3.16b,v3.16b,v0.16b
+	eor	w20,w20,w7
+	eor	v7.16b,v7.16b,v4.16b
+	eor	w21,w21,w8
+	eor	v11.16b,v11.16b,v8.16b
+	ror	w17,w17,#16
+	eor	v15.16b,v15.16b,v12.16b
+	ror	w19,w19,#16
+	eor	v19.16b,v19.16b,v16.16b
+	ror	w20,w20,#16
+	eor	v23.16b,v23.16b,v20.16b
+	ror	w21,w21,#16
+	rev32	v3.8h,v3.8h
+	add	w13,w13,w17
+	rev32	v7.8h,v7.8h
+	add	w14,w14,w19
+	rev32	v11.8h,v11.8h
+	add	w15,w15,w20
+	rev32	v15.8h,v15.8h
+	add	w16,w16,w21
+	rev32	v19.8h,v19.8h
+	eor	w9,w9,w13
+	rev32	v23.8h,v23.8h
+	eor	w10,w10,w14
+	add	v2.4s,v2.4s,v3.4s
+	eor	w11,w11,w15
+	add	v6.4s,v6.4s,v7.4s
+	eor	w12,w12,w16
+	add	v10.4s,v10.4s,v11.4s
+	ror	w9,w9,#20
+	add	v14.4s,v14.4s,v15.4s
+	ror	w10,w10,#20
+	add	v18.4s,v18.4s,v19.4s
+	ror	w11,w11,#20
+	add	v22.4s,v22.4s,v23.4s
+	ror	w12,w12,#20
+	eor	v24.16b,v1.16b,v2.16b
+	add	w5,w5,w9
+	eor	v25.16b,v5.16b,v6.16b
+	add	w6,w6,w10
+	eor	v26.16b,v9.16b,v10.16b
+	add	w7,w7,w11
+	eor	v27.16b,v13.16b,v14.16b
+	add	w8,w8,w12
+	eor	v28.16b,v17.16b,v18.16b
+	eor	w17,w17,w5
+	eor	v29.16b,v21.16b,v22.16b
+	eor	w19,w19,w6
+	ushr	v1.4s,v24.4s,#20
+	eor	w20,w20,w7
+	ushr	v5.4s,v25.4s,#20
+	eor	w21,w21,w8
+	ushr	v9.4s,v26.4s,#20
+	ror	w17,w17,#24
+	ushr	v13.4s,v27.4s,#20
+	ror	w19,w19,#24
+	ushr	v17.4s,v28.4s,#20
+	ror	w20,w20,#24
+	ushr	v21.4s,v29.4s,#20
+	ror	w21,w21,#24
+	sli	v1.4s,v24.4s,#12
+	add	w13,w13,w17
+	sli	v5.4s,v25.4s,#12
+	add	w14,w14,w19
+	sli	v9.4s,v26.4s,#12
+	add	w15,w15,w20
+	sli	v13.4s,v27.4s,#12
+	add	w16,w16,w21
+	sli	v17.4s,v28.4s,#12
+	eor	w9,w9,w13
+	sli	v21.4s,v29.4s,#12
+	eor	w10,w10,w14
+	add	v0.4s,v0.4s,v1.4s
+	eor	w11,w11,w15
+	add	v4.4s,v4.4s,v5.4s
+	eor	w12,w12,w16
+	add	v8.4s,v8.4s,v9.4s
+	ror	w9,w9,#25
+	add	v12.4s,v12.4s,v13.4s
+	ror	w10,w10,#25
+	add	v16.4s,v16.4s,v17.4s
+	ror	w11,w11,#25
+	add	v20.4s,v20.4s,v21.4s
+	ror	w12,w12,#25
+	eor	v24.16b,v3.16b,v0.16b
+	add	w5,w5,w10
+	eor	v25.16b,v7.16b,v4.16b
+	add	w6,w6,w11
+	eor	v26.16b,v11.16b,v8.16b
+	add	w7,w7,w12
+	eor	v27.16b,v15.16b,v12.16b
+	add	w8,w8,w9
+	eor	v28.16b,v19.16b,v16.16b
+	eor	w21,w21,w5
+	eor	v29.16b,v23.16b,v20.16b
+	eor	w17,w17,w6
+	ushr	v3.4s,v24.4s,#24
+	eor	w19,w19,w7
+	ushr	v7.4s,v25.4s,#24
+	eor	w20,w20,w8
+	ushr	v11.4s,v26.4s,#24
+	ror	w21,w21,#16
+	ushr	v15.4s,v27.4s,#24
+	ror	w17,w17,#16
+	ushr	v19.4s,v28.4s,#24
+	ror	w19,w19,#16
+	ushr	v23.4s,v29.4s,#24
+	ror	w20,w20,#16
+	sli	v3.4s,v24.4s,#8
+	add	w15,w15,w21
+	sli	v7.4s,v25.4s,#8
+	add	w16,w16,w17
+	sli	v11.4s,v26.4s,#8
+	add	w13,w13,w19
+	sli	v15.4s,v27.4s,#8
+	add	w14,w14,w20
+	sli	v19.4s,v28.4s,#8
+	eor	w10,w10,w15
+	sli	v23.4s,v29.4s,#8
+	eor	w11,w11,w16
+	add	v2.4s,v2.4s,v3.4s
+	eor	w12,w12,w13
+	add	v6.4s,v6.4s,v7.4s
+	eor	w9,w9,w14
+	add	v10.4s,v10.4s,v11.4s
+	ror	w10,w10,#20
+	add	v14.4s,v14.4s,v15.4s
+	ror	w11,w11,#20
+	add	v18.4s,v18.4s,v19.4s
+	ror	w12,w12,#20
+	add	v22.4s,v22.4s,v23.4s
+	ror	w9,w9,#20
+	eor	v24.16b,v1.16b,v2.16b
+	add	w5,w5,w10
+	eor	v25.16b,v5.16b,v6.16b
+	add	w6,w6,w11
+	eor	v26.16b,v9.16b,v10.16b
+	add	w7,w7,w12
+	eor	v27.16b,v13.16b,v14.16b
+	add	w8,w8,w9
+	eor	v28.16b,v17.16b,v18.16b
+	eor	w21,w21,w5
+	eor	v29.16b,v21.16b,v22.16b
+	eor	w17,w17,w6
+	ushr	v1.4s,v24.4s,#25
+	eor	w19,w19,w7
+	ushr	v5.4s,v25.4s,#25
+	eor	w20,w20,w8
+	ushr	v9.4s,v26.4s,#25
+	ror	w21,w21,#24
+	ushr	v13.4s,v27.4s,#25
+	ror	w17,w17,#24
+	ushr	v17.4s,v28.4s,#25
+	ror	w19,w19,#24
+	ushr	v21.4s,v29.4s,#25
+	ror	w20,w20,#24
+	sli	v1.4s,v24.4s,#7
+	add	w15,w15,w21
+	sli	v5.4s,v25.4s,#7
+	add	w16,w16,w17
+	sli	v9.4s,v26.4s,#7
+	add	w13,w13,w19
+	sli	v13.4s,v27.4s,#7
+	add	w14,w14,w20
+	sli	v17.4s,v28.4s,#7
+	eor	w10,w10,w15
+	sli	v21.4s,v29.4s,#7
+	eor	w11,w11,w16
+	ext	v2.16b,v2.16b,v2.16b,#8
+	eor	w12,w12,w13
+	ext	v6.16b,v6.16b,v6.16b,#8
+	eor	w9,w9,w14
+	ext	v10.16b,v10.16b,v10.16b,#8
+	ror	w10,w10,#25
+	ext	v14.16b,v14.16b,v14.16b,#8
+	ror	w11,w11,#25
+	ext	v18.16b,v18.16b,v18.16b,#8
+	ror	w12,w12,#25
+	ext	v22.16b,v22.16b,v22.16b,#8
+	ror	w9,w9,#25
+	ext	v3.16b,v3.16b,v3.16b,#4
+	ext	v7.16b,v7.16b,v7.16b,#4
+	ext	v11.16b,v11.16b,v11.16b,#4
+	ext	v15.16b,v15.16b,v15.16b,#4
+	ext	v19.16b,v19.16b,v19.16b,#4
+	ext	v23.16b,v23.16b,v23.16b,#4
+	ext	v1.16b,v1.16b,v1.16b,#12
+	ext	v5.16b,v5.16b,v5.16b,#12
+	ext	v9.16b,v9.16b,v9.16b,#12
+	ext	v13.16b,v13.16b,v13.16b,#12
+	ext	v17.16b,v17.16b,v17.16b,#12
+	ext	v21.16b,v21.16b,v21.16b,#12
+	cbnz	x4,.Loop_upper_neon
+
+	add	w5,w5,w22		// accumulate key block
+	add	x6,x6,x22,lsr#32
+	add	w7,w7,w23
+	add	x8,x8,x23,lsr#32
+	add	w9,w9,w24
+	add	x10,x10,x24,lsr#32
+	add	w11,w11,w25
+	add	x12,x12,x25,lsr#32
+	add	w13,w13,w26
+	add	x14,x14,x26,lsr#32
+	add	w15,w15,w27
+	add	x16,x16,x27,lsr#32
+	add	w17,w17,w28
+	add	x19,x19,x28,lsr#32
+	add	w20,w20,w30
+	add	x21,x21,x30,lsr#32
+
+	add	x5,x5,x6,lsl#32	// pack
+	add	x7,x7,x8,lsl#32
+	ldp	x6,x8,[x1,#0]		// load input
+	add	x9,x9,x10,lsl#32
+	add	x11,x11,x12,lsl#32
+	ldp	x10,x12,[x1,#16]
+	add	x13,x13,x14,lsl#32
+	add	x15,x15,x16,lsl#32
+	ldp	x14,x16,[x1,#32]
+	add	x17,x17,x19,lsl#32
+	add	x20,x20,x21,lsl#32
+	ldp	x19,x21,[x1,#48]
+	add	x1,x1,#64
+#ifdef	__ARMEB__
+	rev	x5,x5
+	rev	x7,x7
+	rev	x9,x9
+	rev	x11,x11
+	rev	x13,x13
+	rev	x15,x15
+	rev	x17,x17
+	rev	x20,x20
+#endif
+	eor	x5,x5,x6
+	eor	x7,x7,x8
+	eor	x9,x9,x10
+	eor	x11,x11,x12
+	eor	x13,x13,x14
+	eor	x15,x15,x16
+	eor	x17,x17,x19
+	eor	x20,x20,x21
+
+	stp	x5,x7,[x0,#0]		// store output
+	add	x28,x28,#1			// increment counter
+	mov	w5,w22			// unpack key block
+	lsr	x6,x22,#32
+	stp	x9,x11,[x0,#16]
+	mov	w7,w23
+	lsr	x8,x23,#32
+	stp	x13,x15,[x0,#32]
+	mov	w9,w24
+	lsr	x10,x24,#32
+	stp	x17,x20,[x0,#48]
+	add	x0,x0,#64
+	mov	w11,w25
+	lsr	x12,x25,#32
+	mov	w13,w26
+	lsr	x14,x26,#32
+	mov	w15,w27
+	lsr	x16,x27,#32
+	mov	w17,w28
+	lsr	x19,x28,#32
+	mov	w20,w30
+	lsr	x21,x30,#32
+
+	mov	x4,#5
+.Loop_lower_neon:
+	sub	x4,x4,#1
+	add	v0.4s,v0.4s,v1.4s
+	add	w5,w5,w9
+	add	v4.4s,v4.4s,v5.4s
+	add	w6,w6,w10
+	add	v8.4s,v8.4s,v9.4s
+	add	w7,w7,w11
+	add	v12.4s,v12.4s,v13.4s
+	add	w8,w8,w12
+	add	v16.4s,v16.4s,v17.4s
+	eor	w17,w17,w5
+	add	v20.4s,v20.4s,v21.4s
+	eor	w19,w19,w6
+	eor	v3.16b,v3.16b,v0.16b
+	eor	w20,w20,w7
+	eor	v7.16b,v7.16b,v4.16b
+	eor	w21,w21,w8
+	eor	v11.16b,v11.16b,v8.16b
+	ror	w17,w17,#16
+	eor	v15.16b,v15.16b,v12.16b
+	ror	w19,w19,#16
+	eor	v19.16b,v19.16b,v16.16b
+	ror	w20,w20,#16
+	eor	v23.16b,v23.16b,v20.16b
+	ror	w21,w21,#16
+	rev32	v3.8h,v3.8h
+	add	w13,w13,w17
+	rev32	v7.8h,v7.8h
+	add	w14,w14,w19
+	rev32	v11.8h,v11.8h
+	add	w15,w15,w20
+	rev32	v15.8h,v15.8h
+	add	w16,w16,w21
+	rev32	v19.8h,v19.8h
+	eor	w9,w9,w13
+	rev32	v23.8h,v23.8h
+	eor	w10,w10,w14
+	add	v2.4s,v2.4s,v3.4s
+	eor	w11,w11,w15
+	add	v6.4s,v6.4s,v7.4s
+	eor	w12,w12,w16
+	add	v10.4s,v10.4s,v11.4s
+	ror	w9,w9,#20
+	add	v14.4s,v14.4s,v15.4s
+	ror	w10,w10,#20
+	add	v18.4s,v18.4s,v19.4s
+	ror	w11,w11,#20
+	add	v22.4s,v22.4s,v23.4s
+	ror	w12,w12,#20
+	eor	v24.16b,v1.16b,v2.16b
+	add	w5,w5,w9
+	eor	v25.16b,v5.16b,v6.16b
+	add	w6,w6,w10
+	eor	v26.16b,v9.16b,v10.16b
+	add	w7,w7,w11
+	eor	v27.16b,v13.16b,v14.16b
+	add	w8,w8,w12
+	eor	v28.16b,v17.16b,v18.16b
+	eor	w17,w17,w5
+	eor	v29.16b,v21.16b,v22.16b
+	eor	w19,w19,w6
+	ushr	v1.4s,v24.4s,#20
+	eor	w20,w20,w7
+	ushr	v5.4s,v25.4s,#20
+	eor	w21,w21,w8
+	ushr	v9.4s,v26.4s,#20
+	ror	w17,w17,#24
+	ushr	v13.4s,v27.4s,#20
+	ror	w19,w19,#24
+	ushr	v17.4s,v28.4s,#20
+	ror	w20,w20,#24
+	ushr	v21.4s,v29.4s,#20
+	ror	w21,w21,#24
+	sli	v1.4s,v24.4s,#12
+	add	w13,w13,w17
+	sli	v5.4s,v25.4s,#12
+	add	w14,w14,w19
+	sli	v9.4s,v26.4s,#12
+	add	w15,w15,w20
+	sli	v13.4s,v27.4s,#12
+	add	w16,w16,w21
+	sli	v17.4s,v28.4s,#12
+	eor	w9,w9,w13
+	sli	v21.4s,v29.4s,#12
+	eor	w10,w10,w14
+	add	v0.4s,v0.4s,v1.4s
+	eor	w11,w11,w15
+	add	v4.4s,v4.4s,v5.4s
+	eor	w12,w12,w16
+	add	v8.4s,v8.4s,v9.4s
+	ror	w9,w9,#25
+	add	v12.4s,v12.4s,v13.4s
+	ror	w10,w10,#25
+	add	v16.4s,v16.4s,v17.4s
+	ror	w11,w11,#25
+	add	v20.4s,v20.4s,v21.4s
+	ror	w12,w12,#25
+	eor	v24.16b,v3.16b,v0.16b
+	add	w5,w5,w10
+	eor	v25.16b,v7.16b,v4.16b
+	add	w6,w6,w11
+	eor	v26.16b,v11.16b,v8.16b
+	add	w7,w7,w12
+	eor	v27.16b,v15.16b,v12.16b
+	add	w8,w8,w9
+	eor	v28.16b,v19.16b,v16.16b
+	eor	w21,w21,w5
+	eor	v29.16b,v23.16b,v20.16b
+	eor	w17,w17,w6
+	ushr	v3.4s,v24.4s,#24
+	eor	w19,w19,w7
+	ushr	v7.4s,v25.4s,#24
+	eor	w20,w20,w8
+	ushr	v11.4s,v26.4s,#24
+	ror	w21,w21,#16
+	ushr	v15.4s,v27.4s,#24
+	ror	w17,w17,#16
+	ushr	v19.4s,v28.4s,#24
+	ror	w19,w19,#16
+	ushr	v23.4s,v29.4s,#24
+	ror	w20,w20,#16
+	sli	v3.4s,v24.4s,#8
+	add	w15,w15,w21
+	sli	v7.4s,v25.4s,#8
+	add	w16,w16,w17
+	sli	v11.4s,v26.4s,#8
+	add	w13,w13,w19
+	sli	v15.4s,v27.4s,#8
+	add	w14,w14,w20
+	sli	v19.4s,v28.4s,#8
+	eor	w10,w10,w15
+	sli	v23.4s,v29.4s,#8
+	eor	w11,w11,w16
+	add	v2.4s,v2.4s,v3.4s
+	eor	w12,w12,w13
+	add	v6.4s,v6.4s,v7.4s
+	eor	w9,w9,w14
+	add	v10.4s,v10.4s,v11.4s
+	ror	w10,w10,#20
+	add	v14.4s,v14.4s,v15.4s
+	ror	w11,w11,#20
+	add	v18.4s,v18.4s,v19.4s
+	ror	w12,w12,#20
+	add	v22.4s,v22.4s,v23.4s
+	ror	w9,w9,#20
+	eor	v24.16b,v1.16b,v2.16b
+	add	w5,w5,w10
+	eor	v25.16b,v5.16b,v6.16b
+	add	w6,w6,w11
+	eor	v26.16b,v9.16b,v10.16b
+	add	w7,w7,w12
+	eor	v27.16b,v13.16b,v14.16b
+	add	w8,w8,w9
+	eor	v28.16b,v17.16b,v18.16b
+	eor	w21,w21,w5
+	eor	v29.16b,v21.16b,v22.16b
+	eor	w17,w17,w6
+	ushr	v1.4s,v24.4s,#25
+	eor	w19,w19,w7
+	ushr	v5.4s,v25.4s,#25
+	eor	w20,w20,w8
+	ushr	v9.4s,v26.4s,#25
+	ror	w21,w21,#24
+	ushr	v13.4s,v27.4s,#25
+	ror	w17,w17,#24
+	ushr	v17.4s,v28.4s,#25
+	ror	w19,w19,#24
+	ushr	v21.4s,v29.4s,#25
+	ror	w20,w20,#24
+	sli	v1.4s,v24.4s,#7
+	add	w15,w15,w21
+	sli	v5.4s,v25.4s,#7
+	add	w16,w16,w17
+	sli	v9.4s,v26.4s,#7
+	add	w13,w13,w19
+	sli	v13.4s,v27.4s,#7
+	add	w14,w14,w20
+	sli	v17.4s,v28.4s,#7
+	eor	w10,w10,w15
+	sli	v21.4s,v29.4s,#7
+	eor	w11,w11,w16
+	ext	v2.16b,v2.16b,v2.16b,#8
+	eor	w12,w12,w13
+	ext	v6.16b,v6.16b,v6.16b,#8
+	eor	w9,w9,w14
+	ext	v10.16b,v10.16b,v10.16b,#8
+	ror	w10,w10,#25
+	ext	v14.16b,v14.16b,v14.16b,#8
+	ror	w11,w11,#25
+	ext	v18.16b,v18.16b,v18.16b,#8
+	ror	w12,w12,#25
+	ext	v22.16b,v22.16b,v22.16b,#8
+	ror	w9,w9,#25
+	ext	v3.16b,v3.16b,v3.16b,#12
+	ext	v7.16b,v7.16b,v7.16b,#12
+	ext	v11.16b,v11.16b,v11.16b,#12
+	ext	v15.16b,v15.16b,v15.16b,#12
+	ext	v19.16b,v19.16b,v19.16b,#12
+	ext	v23.16b,v23.16b,v23.16b,#12
+	ext	v1.16b,v1.16b,v1.16b,#4
+	ext	v5.16b,v5.16b,v5.16b,#4
+	ext	v9.16b,v9.16b,v9.16b,#4
+	ext	v13.16b,v13.16b,v13.16b,#4
+	ext	v17.16b,v17.16b,v17.16b,#4
+	ext	v21.16b,v21.16b,v21.16b,#4
+	add	v0.4s,v0.4s,v1.4s
+	add	w5,w5,w9
+	add	v4.4s,v4.4s,v5.4s
+	add	w6,w6,w10
+	add	v8.4s,v8.4s,v9.4s
+	add	w7,w7,w11
+	add	v12.4s,v12.4s,v13.4s
+	add	w8,w8,w12
+	add	v16.4s,v16.4s,v17.4s
+	eor	w17,w17,w5
+	add	v20.4s,v20.4s,v21.4s
+	eor	w19,w19,w6
+	eor	v3.16b,v3.16b,v0.16b
+	eor	w20,w20,w7
+	eor	v7.16b,v7.16b,v4.16b
+	eor	w21,w21,w8
+	eor	v11.16b,v11.16b,v8.16b
+	ror	w17,w17,#16
+	eor	v15.16b,v15.16b,v12.16b
+	ror	w19,w19,#16
+	eor	v19.16b,v19.16b,v16.16b
+	ror	w20,w20,#16
+	eor	v23.16b,v23.16b,v20.16b
+	ror	w21,w21,#16
+	rev32	v3.8h,v3.8h
+	add	w13,w13,w17
+	rev32	v7.8h,v7.8h
+	add	w14,w14,w19
+	rev32	v11.8h,v11.8h
+	add	w15,w15,w20
+	rev32	v15.8h,v15.8h
+	add	w16,w16,w21
+	rev32	v19.8h,v19.8h
+	eor	w9,w9,w13
+	rev32	v23.8h,v23.8h
+	eor	w10,w10,w14
+	add	v2.4s,v2.4s,v3.4s
+	eor	w11,w11,w15
+	add	v6.4s,v6.4s,v7.4s
+	eor	w12,w12,w16
+	add	v10.4s,v10.4s,v11.4s
+	ror	w9,w9,#20
+	add	v14.4s,v14.4s,v15.4s
+	ror	w10,w10,#20
+	add	v18.4s,v18.4s,v19.4s
+	ror	w11,w11,#20
+	add	v22.4s,v22.4s,v23.4s
+	ror	w12,w12,#20
+	eor	v24.16b,v1.16b,v2.16b
+	add	w5,w5,w9
+	eor	v25.16b,v5.16b,v6.16b
+	add	w6,w6,w10
+	eor	v26.16b,v9.16b,v10.16b
+	add	w7,w7,w11
+	eor	v27.16b,v13.16b,v14.16b
+	add	w8,w8,w12
+	eor	v28.16b,v17.16b,v18.16b
+	eor	w17,w17,w5
+	eor	v29.16b,v21.16b,v22.16b
+	eor	w19,w19,w6
+	ushr	v1.4s,v24.4s,#20
+	eor	w20,w20,w7
+	ushr	v5.4s,v25.4s,#20
+	eor	w21,w21,w8
+	ushr	v9.4s,v26.4s,#20
+	ror	w17,w17,#24
+	ushr	v13.4s,v27.4s,#20
+	ror	w19,w19,#24
+	ushr	v17.4s,v28.4s,#20
+	ror	w20,w20,#24
+	ushr	v21.4s,v29.4s,#20
+	ror	w21,w21,#24
+	sli	v1.4s,v24.4s,#12
+	add	w13,w13,w17
+	sli	v5.4s,v25.4s,#12
+	add	w14,w14,w19
+	sli	v9.4s,v26.4s,#12
+	add	w15,w15,w20
+	sli	v13.4s,v27.4s,#12
+	add	w16,w16,w21
+	sli	v17.4s,v28.4s,#12
+	eor	w9,w9,w13
+	sli	v21.4s,v29.4s,#12
+	eor	w10,w10,w14
+	add	v0.4s,v0.4s,v1.4s
+	eor	w11,w11,w15
+	add	v4.4s,v4.4s,v5.4s
+	eor	w12,w12,w16
+	add	v8.4s,v8.4s,v9.4s
+	ror	w9,w9,#25
+	add	v12.4s,v12.4s,v13.4s
+	ror	w10,w10,#25
+	add	v16.4s,v16.4s,v17.4s
+	ror	w11,w11,#25
+	add	v20.4s,v20.4s,v21.4s
+	ror	w12,w12,#25
+	eor	v24.16b,v3.16b,v0.16b
+	add	w5,w5,w10
+	eor	v25.16b,v7.16b,v4.16b
+	add	w6,w6,w11
+	eor	v26.16b,v11.16b,v8.16b
+	add	w7,w7,w12
+	eor	v27.16b,v15.16b,v12.16b
+	add	w8,w8,w9
+	eor	v28.16b,v19.16b,v16.16b
+	eor	w21,w21,w5
+	eor	v29.16b,v23.16b,v20.16b
+	eor	w17,w17,w6
+	ushr	v3.4s,v24.4s,#24
+	eor	w19,w19,w7
+	ushr	v7.4s,v25.4s,#24
+	eor	w20,w20,w8
+	ushr	v11.4s,v26.4s,#24
+	ror	w21,w21,#16
+	ushr	v15.4s,v27.4s,#24
+	ror	w17,w17,#16
+	ushr	v19.4s,v28.4s,#24
+	ror	w19,w19,#16
+	ushr	v23.4s,v29.4s,#24
+	ror	w20,w20,#16
+	sli	v3.4s,v24.4s,#8
+	add	w15,w15,w21
+	sli	v7.4s,v25.4s,#8
+	add	w16,w16,w17
+	sli	v11.4s,v26.4s,#8
+	add	w13,w13,w19
+	sli	v15.4s,v27.4s,#8
+	add	w14,w14,w20
+	sli	v19.4s,v28.4s,#8
+	eor	w10,w10,w15
+	sli	v23.4s,v29.4s,#8
+	eor	w11,w11,w16
+	add	v2.4s,v2.4s,v3.4s
+	eor	w12,w12,w13
+	add	v6.4s,v6.4s,v7.4s
+	eor	w9,w9,w14
+	add	v10.4s,v10.4s,v11.4s
+	ror	w10,w10,#20
+	add	v14.4s,v14.4s,v15.4s
+	ror	w11,w11,#20
+	add	v18.4s,v18.4s,v19.4s
+	ror	w12,w12,#20
+	add	v22.4s,v22.4s,v23.4s
+	ror	w9,w9,#20
+	eor	v24.16b,v1.16b,v2.16b
+	add	w5,w5,w10
+	eor	v25.16b,v5.16b,v6.16b
+	add	w6,w6,w11
+	eor	v26.16b,v9.16b,v10.16b
+	add	w7,w7,w12
+	eor	v27.16b,v13.16b,v14.16b
+	add	w8,w8,w9
+	eor	v28.16b,v17.16b,v18.16b
+	eor	w21,w21,w5
+	eor	v29.16b,v21.16b,v22.16b
+	eor	w17,w17,w6
+	ushr	v1.4s,v24.4s,#25
+	eor	w19,w19,w7
+	ushr	v5.4s,v25.4s,#25
+	eor	w20,w20,w8
+	ushr	v9.4s,v26.4s,#25
+	ror	w21,w21,#24
+	ushr	v13.4s,v27.4s,#25
+	ror	w17,w17,#24
+	ushr	v17.4s,v28.4s,#25
+	ror	w19,w19,#24
+	ushr	v21.4s,v29.4s,#25
+	ror	w20,w20,#24
+	sli	v1.4s,v24.4s,#7
+	add	w15,w15,w21
+	sli	v5.4s,v25.4s,#7
+	add	w16,w16,w17
+	sli	v9.4s,v26.4s,#7
+	add	w13,w13,w19
+	sli	v13.4s,v27.4s,#7
+	add	w14,w14,w20
+	sli	v17.4s,v28.4s,#7
+	eor	w10,w10,w15
+	sli	v21.4s,v29.4s,#7
+	eor	w11,w11,w16
+	ext	v2.16b,v2.16b,v2.16b,#8
+	eor	w12,w12,w13
+	ext	v6.16b,v6.16b,v6.16b,#8
+	eor	w9,w9,w14
+	ext	v10.16b,v10.16b,v10.16b,#8
+	ror	w10,w10,#25
+	ext	v14.16b,v14.16b,v14.16b,#8
+	ror	w11,w11,#25
+	ext	v18.16b,v18.16b,v18.16b,#8
+	ror	w12,w12,#25
+	ext	v22.16b,v22.16b,v22.16b,#8
+	ror	w9,w9,#25
+	ext	v3.16b,v3.16b,v3.16b,#4
+	ext	v7.16b,v7.16b,v7.16b,#4
+	ext	v11.16b,v11.16b,v11.16b,#4
+	ext	v15.16b,v15.16b,v15.16b,#4
+	ext	v19.16b,v19.16b,v19.16b,#4
+	ext	v23.16b,v23.16b,v23.16b,#4
+	ext	v1.16b,v1.16b,v1.16b,#12
+	ext	v5.16b,v5.16b,v5.16b,#12
+	ext	v9.16b,v9.16b,v9.16b,#12
+	ext	v13.16b,v13.16b,v13.16b,#12
+	ext	v17.16b,v17.16b,v17.16b,#12
+	ext	v21.16b,v21.16b,v21.16b,#12
+	cbnz	x4,.Loop_lower_neon
+
+	add	w5,w5,w22		// accumulate key block
+	ldp	q24,q25,[sp,#0]
+	add	x6,x6,x22,lsr#32
+	ldp	q26,q27,[sp,#32]
+	add	w7,w7,w23
+	ldp	q28,q29,[sp,#64]
+	add	x8,x8,x23,lsr#32
+	add	v0.4s,v0.4s,v24.4s
+	add	w9,w9,w24
+	add	v4.4s,v4.4s,v24.4s
+	add	x10,x10,x24,lsr#32
+	add	v8.4s,v8.4s,v24.4s
+	add	w11,w11,w25
+	add	v12.4s,v12.4s,v24.4s
+	add	x12,x12,x25,lsr#32
+	add	v16.4s,v16.4s,v24.4s
+	add	w13,w13,w26
+	add	v20.4s,v20.4s,v24.4s
+	add	x14,x14,x26,lsr#32
+	add	v2.4s,v2.4s,v26.4s
+	add	w15,w15,w27
+	add	v6.4s,v6.4s,v26.4s
+	add	x16,x16,x27,lsr#32
+	add	v10.4s,v10.4s,v26.4s
+	add	w17,w17,w28
+	add	v14.4s,v14.4s,v26.4s
+	add	x19,x19,x28,lsr#32
+	add	v18.4s,v18.4s,v26.4s
+	add	w20,w20,w30
+	add	v22.4s,v22.4s,v26.4s
+	add	x21,x21,x30,lsr#32
+	add	v19.4s,v19.4s,v31.4s			// +4
+	add	x5,x5,x6,lsl#32	// pack
+	add	v23.4s,v23.4s,v31.4s			// +4
+	add	x7,x7,x8,lsl#32
+	add	v3.4s,v3.4s,v27.4s
+	ldp	x6,x8,[x1,#0]		// load input
+	add	v7.4s,v7.4s,v28.4s
+	add	x9,x9,x10,lsl#32
+	add	v11.4s,v11.4s,v29.4s
+	add	x11,x11,x12,lsl#32
+	add	v15.4s,v15.4s,v30.4s
+	ldp	x10,x12,[x1,#16]
+	add	v19.4s,v19.4s,v27.4s
+	add	x13,x13,x14,lsl#32
+	add	v23.4s,v23.4s,v28.4s
+	add	x15,x15,x16,lsl#32
+	add	v1.4s,v1.4s,v25.4s
+	ldp	x14,x16,[x1,#32]
+	add	v5.4s,v5.4s,v25.4s
+	add	x17,x17,x19,lsl#32
+	add	v9.4s,v9.4s,v25.4s
+	add	x20,x20,x21,lsl#32
+	add	v13.4s,v13.4s,v25.4s
+	ldp	x19,x21,[x1,#48]
+	add	v17.4s,v17.4s,v25.4s
+	add	x1,x1,#64
+	add	v21.4s,v21.4s,v25.4s
+
+#ifdef	__ARMEB__
+	rev	x5,x5
+	rev	x7,x7
+	rev	x9,x9
+	rev	x11,x11
+	rev	x13,x13
+	rev	x15,x15
+	rev	x17,x17
+	rev	x20,x20
+#endif
+	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
+	eor	x5,x5,x6
+	eor	x7,x7,x8
+	eor	x9,x9,x10
+	eor	x11,x11,x12
+	eor	x13,x13,x14
+	eor	v0.16b,v0.16b,v24.16b
+	eor	x15,x15,x16
+	eor	v1.16b,v1.16b,v25.16b
+	eor	x17,x17,x19
+	eor	v2.16b,v2.16b,v26.16b
+	eor	x20,x20,x21
+	eor	v3.16b,v3.16b,v27.16b
+	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
+
+	stp	x5,x7,[x0,#0]		// store output
+	add	x28,x28,#7			// increment counter
+	stp	x9,x11,[x0,#16]
+	stp	x13,x15,[x0,#32]
+	stp	x17,x20,[x0,#48]
+	add	x0,x0,#64
+	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
+
+	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
+	eor	v4.16b,v4.16b,v24.16b
+	eor	v5.16b,v5.16b,v25.16b
+	eor	v6.16b,v6.16b,v26.16b
+	eor	v7.16b,v7.16b,v27.16b
+	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
+
+	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
+	eor	v8.16b,v8.16b,v0.16b
+	ldp	q24,q25,[sp,#0]
+	eor	v9.16b,v9.16b,v1.16b
+	ldp	q26,q27,[sp,#32]
+	eor	v10.16b,v10.16b,v2.16b
+	eor	v11.16b,v11.16b,v3.16b
+	st1	{v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64
+
+	ld1	{v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64
+	eor	v12.16b,v12.16b,v4.16b
+	eor	v13.16b,v13.16b,v5.16b
+	eor	v14.16b,v14.16b,v6.16b
+	eor	v15.16b,v15.16b,v7.16b
+	st1	{v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64
+
+	ld1	{v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64
+	eor	v16.16b,v16.16b,v8.16b
+	eor	v17.16b,v17.16b,v9.16b
+	eor	v18.16b,v18.16b,v10.16b
+	eor	v19.16b,v19.16b,v11.16b
+	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
+
+	shl	v0.4s,v31.4s,#1			// 4 -> 8
+	eor	v20.16b,v20.16b,v12.16b
+	eor	v21.16b,v21.16b,v13.16b
+	eor	v22.16b,v22.16b,v14.16b
+	eor	v23.16b,v23.16b,v15.16b
+	st1	{v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
+
+	add	v27.4s,v27.4s,v0.4s			// += 8
+	add	v28.4s,v28.4s,v0.4s
+	add	v29.4s,v29.4s,v0.4s
+	add	v30.4s,v30.4s,v0.4s
+
+	b.hs	.Loop_outer_512_neon
+
+	adds	x2,x2,#512
+	ushr	v0.4s,v31.4s,#2			// 4 -> 1
+
+	ldp	d8,d9,[sp,#128+0]		// meet ABI requirements
+	ldp	d10,d11,[sp,#128+16]
+	ldp	d12,d13,[sp,#128+32]
+	ldp	d14,d15,[sp,#128+48]
+
+	stp	q24,q31,[sp,#0]		// wipe off-load area
+	stp	q24,q31,[sp,#32]
+	stp	q24,q31,[sp,#64]
+
+	b.eq	.Ldone_512_neon
+
+	cmp	x2,#192
+	sub	v27.4s,v27.4s,v0.4s			// -= 1
+	sub	v28.4s,v28.4s,v0.4s
+	sub	v29.4s,v29.4s,v0.4s
+	add	sp,sp,#128
+	b.hs	.Loop_outer_neon
+
+	eor	v25.16b,v25.16b,v25.16b
+	eor	v26.16b,v26.16b,v26.16b
+	eor	v27.16b,v27.16b,v27.16b
+	eor	v28.16b,v28.16b,v28.16b
+	eor	v29.16b,v29.16b,v29.16b
+	eor	v30.16b,v30.16b,v30.16b
+	b	.Loop_outer
+
+.Ldone_512_neon:
+	ldp	x19,x20,[x29,#16]
+	add	sp,sp,#128+64
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldp	x29,x30,[sp],#96
+.Labort_neon:
+	ret
+ENDPROC(chacha20_neon)
-- 
2.18.0

^ permalink raw reply related

* [PATCH v2 02/17] zinc: introduce minimal cryptography library
From: Jason A. Donenfeld @ 2018-08-24 21:38 UTC (permalink / raw)
  To: linux-kernel, netdev, davem
  Cc: Jason A. Donenfeld, Andy Lutomirski, Greg KH, Samuel Neves,
	Jean-Philippe Aumasson, linux-crypto
In-Reply-To: <20180824213849.23647-1-Jason@zx2c4.com>

Zinc stands for "Zinc Is Neat Crypto" or "Zinc as IN Crypto" or maybe
just "Zx2c4's INsane Cryptolib." It's also short, easy to type, and
plays nicely with the recent trend of naming crypto libraries after
elements. The guiding principle is "don't overdo it". It's less of a
library and more of a directory tree for organizing well-curated direct
implementations of cryptography primitives.

Zinc is a new cryptography API that is much more minimal and lower-level
than the current one. It intends to complement it and provide a basis
upon which the current crypto API might build, as the provider of
software implementations of cryptographic primitives. It is motivated by
three primary observations in crypto API design:

  * Highly composable "cipher modes" and related abstractions from
    90s cryptographers did not turn out to be as terrific an idea as
    hoped, leading to a host of API misuse problems.

  * Most programmers are afraid of crypto code, and so prefer to
    integrate it into libraries in a highly abstracted manner, so as to
    shield themselves from implementation details. Cryptographers, on
    the other hand, prefer simple direct implementations, which they're
    able to verify for high assurance and optimize in accordance with
    their expertise.

  * Overly abstracted and flexible cryptography APIs lead to a host of
    dangerous problems and performance issues. The kernel is in the
    business usually not of coming up with new uses of crypto, but
    rather implementing various constructions, which means it essentially
    needs a library of primitives, not a highly abstracted enterprise-ready
    pluggable system, with a few particular exceptions.

This last observation has seen itself play out several times over and
over again within the kernel:

  * The perennial move of actual primitives away from crypto/ and into
    lib/, so that users can actually call these functions directly with
    no overhead and without lots of allocations, function pointers,
    string specifier parsing, and general clunkiness. For example:
    sha256, chacha20, siphash, sha1, and so forth live in lib/ rather
    than in crypto/. Zinc intends to stop the cluttering of lib/ and
    introduce these direct primitives into their proper place, lib/zinc/.

  * An abundance of misuse bugs with the present crypto API that have
    been very unpleasant to clean up.

  * A hesitance to even use cryptography, because of the overhead and
    headaches involved in accessing the routines.

Zinc goes in a rather different direction. Rather than providing a
thoroughly designed and abstracted API, Zinc gives you simple functions,
which implement some primitive, or some particular and specific
construction of primitives. It is not dynamic in the least, though one
could imagine implementing a complex dynamic dispatch mechanism (such as
the current crypto API) on top of these basic functions. After all,
dynamic dispatch is usually needed for applications with cipher agility,
such as IPsec, dm-crypt, AF_ALG, and so forth, and the existing crypto
API will continue to play that role. However, Zinc will provide a non-
haphazard way of directly utilizing crypto routines in applications
that do have neither the need nor desire for abstraction and dynamic
dispatch.

It also organizes the implementations in a simple, straight-forward,
and direct manner, making it enjoyable and intuitive to work on.
Rather than moving optimized assembly implementations into arch/, it
keeps them all together in lib/zinc/, making it simple and obvious to
compare and contrast what's happening. This is, notably, exactly what
the lib/raid6/ tree does, and that seems to work out rather well. It's
also the pattern of most successful crypto libraries. The architecture-
specific glue-code is made a part of each translation unit, rather than
being in a separate one, so that generic and architecture-optimized code
are combined at compile-time, and incompatibility branches compiled out by
the optimizer.

All implementations have been extensively tested and fuzzed, and are
selected for their quality, trustworthiness, and performance. Wherever
possible and performant, formally verified implementations are used,
such as those from HACL* [1] and Fiat-Crypto [2]. The routines also take
special care to zero out secrets using memzero_explicit (and future work
is planned to have gcc do this more reliably and performantly with
compiler plugins). The performance of the selected implementations is
state-of-the-art and unrivaled on a broad array of hardware, though of
course we will continue to fine tune these to the hardware demands
needed by kernel contributors. Each implementation also comes with
extensive self-tests and crafted test vectors, pulled from various
places such as Wycheproof [9].

Regularity of function signatures is important, so that users can easily
"guess" the name of the function they want. Though, individual
primitives are oftentimes not trivially interchangeable, having been
designed for different things and requiring different parameters and
semantics, and so the function signatures they provide will directly
reflect the realities of the primitives' usages, rather than hiding it
behind (inevitably leaky) abstractions. Also, in contrast to the current
crypto API, Zinc functions can work on stack buffers, and can be called
with different keys, without requiring allocations or locking.

SIMD is used automatically when available, though some routines may
benefit from either having their SIMD disabled for particular
invocations, or to have the SIMD initialization calls amortized over
several invocations of the function, and so Zinc utilizes function
signatures enabling that in conjunction with the recently introduced
simd_context_t.

More generally, Zinc provides function signatures that allow just what
is required by the various callers. This isn't to say that users of the
functions will be permitted to pollute the function semantics with weird
particular needs, but we are trying very hard not to overdo it, and that
means looking carefully at what's actually necessary, and doing just that,
and not much more than that. Remember: practicality and cleanliness rather
than over-zealous infrastructure.

Zinc provides also an opening for the best implementers in academia to
contribute their time and effort to the kernel, by being sufficiently
simple and inviting. In discussing this commit with some of the best and
brightest over the last few years, there are many who are eager to
devote rare talent and energy to this effort.

Following the merging of this, I expect for the primitives that
currently exist in lib/ to work their way into lib/zinc/, after intense
scrutiny of each implementation, potentially replacing them with either
formally-verified implementations, or better studied and faster
state-of-the-art implementations.

Also following the merging of this, I expect for the old crypto API
implementations to be ported over to use Zinc for their software-based
implementations.

As Zinc is simply library code, its config options are un-menued, with
the exception of CONFIG_ZINC_DEBUG, which enables various selftests and
BUG_ONs.

[1] https://github.com/project-everest/hacl-star
[2] https://github.com/mit-plv/fiat-crypto
[3] https://cr.yp.to/ecdh.html
[4] https://cr.yp.to/chacha.html
[5] https://cr.yp.to/snuffle/xsalsa-20081128.pdf
[6] https://cr.yp.to/mac.html
[7] https://blake2.net/
[8] https://tools.ietf.org/html/rfc8439
[9] https://github.com/google/wycheproof

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Samuel Neves <sneves@dei.uc.pt>
Cc: Jean-Philippe Aumasson <jeanphilippe.aumasson@gmail.com>
Cc: linux-crypto@vger.kernel.org
---
 MAINTAINERS       |  8 ++++++++
 lib/Kconfig       |  2 ++
 lib/Makefile      |  2 ++
 lib/zinc/Kconfig  | 20 ++++++++++++++++++++
 lib/zinc/Makefile |  7 +++++++
 lib/zinc/main.c   | 31 +++++++++++++++++++++++++++++++
 6 files changed, 70 insertions(+)
 create mode 100644 lib/zinc/Kconfig
 create mode 100644 lib/zinc/Makefile
 create mode 100644 lib/zinc/main.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 955463f8d518..f194eda86011 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -16103,6 +16103,14 @@ Q:	https://patchwork.linuxtv.org/project/linux-media/list/
 S:	Maintained
 F:	drivers/media/dvb-frontends/zd1301_demod*
 
+ZINC CRYPTOGRAPHY LIBRARY
+M:	Jason A. Donenfeld <Jason@zx2c4.com>
+M:	Samuel Neves <sneves@dei.uc.pt>
+S:	Maintained
+F:	lib/zinc/
+F:	include/zinc/
+L:	linux-crypto@vger.kernel.org
+
 ZPOOL COMPRESSED PAGE STORAGE API
 M:	Dan Streetman <ddstreet@ieee.org>
 L:	linux-mm@kvack.org
diff --git a/lib/Kconfig b/lib/Kconfig
index 706836ec314d..7ea5437e9f7d 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -478,6 +478,8 @@ config GLOB_SELFTEST
 	  module load) by a small amount, so you're welcome to play with
 	  it, but you probably don't need it.
 
+source "lib/zinc/Kconfig"
+
 #
 # Netlink attribute parsing support is select'ed if needed
 #
diff --git a/lib/Makefile b/lib/Makefile
index d95bb2525101..ee41151cba7a 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -212,6 +212,8 @@ obj-$(CONFIG_PERCPU_TEST) += percpu_test.o
 
 obj-$(CONFIG_ASN1) += asn1_decoder.o
 
+obj-$(CONFIG_ZINC) += zinc/
+
 obj-$(CONFIG_FONT_SUPPORT) += fonts/
 
 obj-$(CONFIG_PRIME_NUMBERS) += prime_numbers.o
diff --git a/lib/zinc/Kconfig b/lib/zinc/Kconfig
new file mode 100644
index 000000000000..aa4f8d449d6b
--- /dev/null
+++ b/lib/zinc/Kconfig
@@ -0,0 +1,20 @@
+config ZINC
+	tristate
+	select CRYPTO_BLKCIPHER
+	select VFP
+	select VFPv3
+	select NEON
+	select KERNEL_MODE_NEON
+
+config ZINC_DEBUG
+	bool "Zinc cryptography library debugging and self-tests"
+	depends on ZINC
+	help
+	  This builds a series of self-tests for the Zinc crypto library, which
+	  help diagnose any cryptographic algorithm implementation issues that
+	  might be at the root cause of potential bugs. It also adds various
+	  debugging traps.
+
+	  Unless you're developing and testing cryptographic routines, or are
+	  especially paranoid about correctness on your hardware, you may say
+	  N here.
diff --git a/lib/zinc/Makefile b/lib/zinc/Makefile
new file mode 100644
index 000000000000..8e30115217db
--- /dev/null
+++ b/lib/zinc/Makefile
@@ -0,0 +1,7 @@
+ccflags-y := -O3
+ccflags-y += -Wframe-larger-than=8192
+ccflags-y += -D'pr_fmt(fmt)=KBUILD_MODNAME ": " fmt'
+
+zinc-y += main.o
+
+obj-$(CONFIG_ZINC) := zinc.o
diff --git a/lib/zinc/main.c b/lib/zinc/main.c
new file mode 100644
index 000000000000..590872563955
--- /dev/null
+++ b/lib/zinc/main.c
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+
+#ifdef CONFIG_ZINC_DEBUG
+#define selftest(which) do { \
+	if (!which ## _selftest()) \
+		return -ENOTRECOVERABLE; \
+} while (0)
+#else
+#define selftest(which)
+#endif
+
+static int __init mod_init(void)
+{
+	return 0;
+}
+
+static void __exit mod_exit(void)
+{
+}
+
+module_init(mod_init);
+module_exit(mod_exit);
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("Zinc cryptography library");
+MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
-- 
2.18.0

^ permalink raw reply related

* [PATCH v2 01/17] asm: simd context helper API
From: Jason A. Donenfeld @ 2018-08-24 21:38 UTC (permalink / raw)
  To: linux-kernel, netdev, davem
  Cc: Jason A. Donenfeld, Andy Lutomirski, Greg KH, Samuel Neves,
	linux-arch
In-Reply-To: <20180824213849.23647-1-Jason@zx2c4.com>

Sometimes it's useful to amortize calls to XSAVE/XRSTOR and the related
FPU/SIMD functions over a number of calls, because FPU restoration is
quite expensive. This adds a simple header for carrying out this pattern:

    simd_context_t simd_context = simd_get();
    while ((item = get_item_from_queue()) != NULL) {
        encrypt_item(item, simd_context);
        simd_context = simd_relax(simd_context);
    }
    simd_put(simd_context);

The relaxation step ensures that we don't trample over preemption, and
the get/put API should be a familiar paradigm in the kernel.

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Samuel Neves <sneves@dei.uc.pt>
Cc: linux-arch@vger.kernel.org
---
 arch/alpha/include/asm/Kbuild      |  5 ++--
 arch/arc/include/asm/Kbuild        |  1 +
 arch/arm/include/asm/simd.h        | 42 ++++++++++++++++++++++++++++++
 arch/arm64/include/asm/simd.h      | 37 +++++++++++++++++++++-----
 arch/c6x/include/asm/Kbuild        |  3 ++-
 arch/h8300/include/asm/Kbuild      |  3 ++-
 arch/hexagon/include/asm/Kbuild    |  1 +
 arch/ia64/include/asm/Kbuild       |  1 +
 arch/m68k/include/asm/Kbuild       |  1 +
 arch/microblaze/include/asm/Kbuild |  1 +
 arch/mips/include/asm/Kbuild       |  1 +
 arch/nds32/include/asm/Kbuild      |  7 ++---
 arch/nios2/include/asm/Kbuild      |  1 +
 arch/openrisc/include/asm/Kbuild   |  7 ++---
 arch/parisc/include/asm/Kbuild     |  1 +
 arch/powerpc/include/asm/Kbuild    |  3 ++-
 arch/riscv/include/asm/Kbuild      |  3 ++-
 arch/s390/include/asm/Kbuild       |  3 ++-
 arch/sh/include/asm/Kbuild         |  1 +
 arch/sparc/include/asm/Kbuild      |  1 +
 arch/um/include/asm/Kbuild         |  3 ++-
 arch/unicore32/include/asm/Kbuild  |  1 +
 arch/x86/include/asm/simd.h        | 30 ++++++++++++++++++++-
 arch/xtensa/include/asm/Kbuild     |  1 +
 include/asm-generic/simd.h         | 15 +++++++++++
 include/linux/simd.h               | 28 ++++++++++++++++++++
 26 files changed, 180 insertions(+), 21 deletions(-)
 create mode 100644 arch/arm/include/asm/simd.h
 create mode 100644 include/linux/simd.h

diff --git a/arch/alpha/include/asm/Kbuild b/arch/alpha/include/asm/Kbuild
index 0580cb8c84b2..07b2c1025d34 100644
--- a/arch/alpha/include/asm/Kbuild
+++ b/arch/alpha/include/asm/Kbuild
@@ -2,14 +2,15 @@
 
 
 generic-y += compat.h
+generic-y += current.h
 generic-y += exec.h
 generic-y += export.h
 generic-y += fb.h
 generic-y += irq_work.h
+generic-y += kprobes.h
 generic-y += mcs_spinlock.h
 generic-y += mm-arch-hooks.h
 generic-y += preempt.h
 generic-y += sections.h
+generic-y += simd.h
 generic-y += trace_clock.h
-generic-y += current.h
-generic-y += kprobes.h
diff --git a/arch/arc/include/asm/Kbuild b/arch/arc/include/asm/Kbuild
index feed50ce89fa..a7f4255f1649 100644
--- a/arch/arc/include/asm/Kbuild
+++ b/arch/arc/include/asm/Kbuild
@@ -22,6 +22,7 @@ generic-y += parport.h
 generic-y += pci.h
 generic-y += percpu.h
 generic-y += preempt.h
+generic-y += simd.h
 generic-y += topology.h
 generic-y += trace_clock.h
 generic-y += user.h
diff --git a/arch/arm/include/asm/simd.h b/arch/arm/include/asm/simd.h
new file mode 100644
index 000000000000..bf468993bbef
--- /dev/null
+++ b/arch/arm/include/asm/simd.h
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <linux/simd.h>
+#ifndef _ASM_SIMD_H
+#define _ASM_SIMD_H
+
+static __must_check inline bool may_use_simd(void)
+{
+	return !in_interrupt();
+}
+
+#ifdef CONFIG_KERNEL_MODE_NEON
+#include <asm/neon.h>
+
+static inline simd_context_t simd_get(void)
+{
+	bool have_simd = may_use_simd();
+	if (have_simd)
+		kernel_neon_begin();
+	return have_simd ? HAVE_FULL_SIMD : HAVE_NO_SIMD;
+}
+
+static inline void simd_put(simd_context_t prior_context)
+{
+	if (prior_context != HAVE_NO_SIMD)
+		kernel_neon_end();
+}
+#else
+static inline simd_context_t simd_get(void)
+{
+	return HAVE_NO_SIMD;
+}
+
+static inline void simd_put(simd_context_t prior_context)
+{
+}
+#endif
+
+#endif /* _ASM_SIMD_H */
diff --git a/arch/arm64/include/asm/simd.h b/arch/arm64/include/asm/simd.h
index 6495cc51246f..058c336de38d 100644
--- a/arch/arm64/include/asm/simd.h
+++ b/arch/arm64/include/asm/simd.h
@@ -1,11 +1,10 @@
-/*
- * Copyright (C) 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
+/* SPDX-License-Identifier: GPL-2.0
  *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
+ * Copyright (C) 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
  */
 
+#include <linux/simd.h>
 #ifndef __ASM_SIMD_H
 #define __ASM_SIMD_H
 
@@ -16,6 +15,8 @@
 #include <linux/types.h>
 
 #ifdef CONFIG_KERNEL_MODE_NEON
+#include <asm/neon.h>
+#include <asm/simd.h>
 
 DECLARE_PER_CPU(bool, kernel_neon_busy);
 
@@ -40,12 +41,36 @@ static __must_check inline bool may_use_simd(void)
 		!this_cpu_read(kernel_neon_busy);
 }
 
+static inline simd_context_t simd_get(void)
+{
+	bool have_simd = may_use_simd();
+	if (have_simd)
+		kernel_neon_begin();
+	return have_simd ? HAVE_FULL_SIMD : HAVE_NO_SIMD;
+}
+
+static inline void simd_put(simd_context_t prior_context)
+{
+	if (prior_context != HAVE_NO_SIMD)
+		kernel_neon_end();
+}
+
 #else /* ! CONFIG_KERNEL_MODE_NEON */
 
-static __must_check inline bool may_use_simd(void) {
+static __must_check inline bool may_use_simd(void)
+{
 	return false;
 }
 
+static inline simd_context_t simd_get(void)
+{
+	return HAVE_NO_SIMD;
+}
+
+static inline void simd_put(simd_context_t prior_context)
+{
+}
+
 #endif /* ! CONFIG_KERNEL_MODE_NEON */
 
 #endif
diff --git a/arch/c6x/include/asm/Kbuild b/arch/c6x/include/asm/Kbuild
index 33a2c94fed0d..22f3d8333c74 100644
--- a/arch/c6x/include/asm/Kbuild
+++ b/arch/c6x/include/asm/Kbuild
@@ -5,8 +5,8 @@ generic-y += compat.h
 generic-y += current.h
 generic-y += device.h
 generic-y += div64.h
-generic-y += dma.h
 generic-y += dma-mapping.h
+generic-y += dma.h
 generic-y += emergency-restart.h
 generic-y += exec.h
 generic-y += extable.h
@@ -30,6 +30,7 @@ generic-y += pgalloc.h
 generic-y += preempt.h
 generic-y += segment.h
 generic-y += serial.h
+generic-y += simd.h
 generic-y += tlbflush.h
 generic-y += topology.h
 generic-y += trace_clock.h
diff --git a/arch/h8300/include/asm/Kbuild b/arch/h8300/include/asm/Kbuild
index a5d0b2991f47..f5c2f12d593e 100644
--- a/arch/h8300/include/asm/Kbuild
+++ b/arch/h8300/include/asm/Kbuild
@@ -8,8 +8,8 @@ generic-y += current.h
 generic-y += delay.h
 generic-y += device.h
 generic-y += div64.h
-generic-y += dma.h
 generic-y += dma-mapping.h
+generic-y += dma.h
 generic-y += emergency-restart.h
 generic-y += exec.h
 generic-y += extable.h
@@ -39,6 +39,7 @@ generic-y += preempt.h
 generic-y += scatterlist.h
 generic-y += sections.h
 generic-y += serial.h
+generic-y += simd.h
 generic-y += sizes.h
 generic-y += spinlock.h
 generic-y += timex.h
diff --git a/arch/hexagon/include/asm/Kbuild b/arch/hexagon/include/asm/Kbuild
index dd2fd9c0d292..217d4695fd8a 100644
--- a/arch/hexagon/include/asm/Kbuild
+++ b/arch/hexagon/include/asm/Kbuild
@@ -29,6 +29,7 @@ generic-y += rwsem.h
 generic-y += sections.h
 generic-y += segment.h
 generic-y += serial.h
+generic-y += simd.h
 generic-y += sizes.h
 generic-y += topology.h
 generic-y += trace_clock.h
diff --git a/arch/ia64/include/asm/Kbuild b/arch/ia64/include/asm/Kbuild
index 557bbc8ba9f5..41c5ebdf79e5 100644
--- a/arch/ia64/include/asm/Kbuild
+++ b/arch/ia64/include/asm/Kbuild
@@ -4,6 +4,7 @@ generic-y += irq_work.h
 generic-y += mcs_spinlock.h
 generic-y += mm-arch-hooks.h
 generic-y += preempt.h
+generic-y += simd.h
 generic-y += trace_clock.h
 generic-y += vtime.h
 generic-y += word-at-a-time.h
diff --git a/arch/m68k/include/asm/Kbuild b/arch/m68k/include/asm/Kbuild
index a4b8d3331a9e..73898dd1a4d0 100644
--- a/arch/m68k/include/asm/Kbuild
+++ b/arch/m68k/include/asm/Kbuild
@@ -19,6 +19,7 @@ generic-y += mm-arch-hooks.h
 generic-y += percpu.h
 generic-y += preempt.h
 generic-y += sections.h
+generic-y += simd.h
 generic-y += spinlock.h
 generic-y += topology.h
 generic-y += trace_clock.h
diff --git a/arch/microblaze/include/asm/Kbuild b/arch/microblaze/include/asm/Kbuild
index fe6a6c6e5003..9002fb24888c 100644
--- a/arch/microblaze/include/asm/Kbuild
+++ b/arch/microblaze/include/asm/Kbuild
@@ -24,6 +24,7 @@ generic-y += parport.h
 generic-y += percpu.h
 generic-y += preempt.h
 generic-y += serial.h
+generic-y += simd.h
 generic-y += syscalls.h
 generic-y += topology.h
 generic-y += trace_clock.h
diff --git a/arch/mips/include/asm/Kbuild b/arch/mips/include/asm/Kbuild
index 58351e48421e..e8868e0fb2c3 100644
--- a/arch/mips/include/asm/Kbuild
+++ b/arch/mips/include/asm/Kbuild
@@ -16,6 +16,7 @@ generic-y += qrwlock.h
 generic-y += qspinlock.h
 generic-y += sections.h
 generic-y += segment.h
+generic-y += simd.h
 generic-y += trace_clock.h
 generic-y += unaligned.h
 generic-y += user.h
diff --git a/arch/nds32/include/asm/Kbuild b/arch/nds32/include/asm/Kbuild
index dbc4e5422550..603c1d020620 100644
--- a/arch/nds32/include/asm/Kbuild
+++ b/arch/nds32/include/asm/Kbuild
@@ -7,14 +7,14 @@ generic-y += bug.h
 generic-y += bugs.h
 generic-y += checksum.h
 generic-y += clkdev.h
-generic-y += cmpxchg.h
 generic-y += cmpxchg-local.h
+generic-y += cmpxchg.h
 generic-y += compat.h
 generic-y += cputime.h
 generic-y += device.h
 generic-y += div64.h
-generic-y += dma.h
 generic-y += dma-mapping.h
+generic-y += dma.h
 generic-y += emergency-restart.h
 generic-y += errno.h
 generic-y += exec.h
@@ -46,14 +46,15 @@ generic-y += sections.h
 generic-y += segment.h
 generic-y += serial.h
 generic-y += shmbuf.h
+generic-y += simd.h
 generic-y += sizes.h
 generic-y += stat.h
 generic-y += switch_to.h
 generic-y += timex.h
 generic-y += topology.h
 generic-y += trace_clock.h
-generic-y += xor.h
 generic-y += unaligned.h
 generic-y += user.h
 generic-y += vga.h
 generic-y += word-at-a-time.h
+generic-y += xor.h
diff --git a/arch/nios2/include/asm/Kbuild b/arch/nios2/include/asm/Kbuild
index 8fde4fa2c34f..571a9d9ad107 100644
--- a/arch/nios2/include/asm/Kbuild
+++ b/arch/nios2/include/asm/Kbuild
@@ -33,6 +33,7 @@ generic-y += preempt.h
 generic-y += sections.h
 generic-y += segment.h
 generic-y += serial.h
+generic-y += simd.h
 generic-y += spinlock.h
 generic-y += topology.h
 generic-y += trace_clock.h
diff --git a/arch/openrisc/include/asm/Kbuild b/arch/openrisc/include/asm/Kbuild
index 65964d390b10..81a39e274f6f 100644
--- a/arch/openrisc/include/asm/Kbuild
+++ b/arch/openrisc/include/asm/Kbuild
@@ -27,12 +27,13 @@ generic-y += module.h
 generic-y += pci.h
 generic-y += percpu.h
 generic-y += preempt.h
-generic-y += qspinlock_types.h
-generic-y += qspinlock.h
-generic-y += qrwlock_types.h
 generic-y += qrwlock.h
+generic-y += qrwlock_types.h
+generic-y += qspinlock.h
+generic-y += qspinlock_types.h
 generic-y += sections.h
 generic-y += segment.h
+generic-y += simd.h
 generic-y += string.h
 generic-y += switch_to.h
 generic-y += topology.h
diff --git a/arch/parisc/include/asm/Kbuild b/arch/parisc/include/asm/Kbuild
index 2013d639e735..97970b4d05ab 100644
--- a/arch/parisc/include/asm/Kbuild
+++ b/arch/parisc/include/asm/Kbuild
@@ -17,6 +17,7 @@ generic-y += percpu.h
 generic-y += preempt.h
 generic-y += seccomp.h
 generic-y += segment.h
+generic-y += simd.h
 generic-y += topology.h
 generic-y += trace_clock.h
 generic-y += user.h
diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild
index 3196d227e351..64290f48e733 100644
--- a/arch/powerpc/include/asm/Kbuild
+++ b/arch/powerpc/include/asm/Kbuild
@@ -4,7 +4,8 @@ generic-y += irq_regs.h
 generic-y += irq_work.h
 generic-y += local64.h
 generic-y += mcs_spinlock.h
+generic-y += msi.h
 generic-y += preempt.h
 generic-y += rwsem.h
+generic-y += simd.h
 generic-y += vtime.h
-generic-y += msi.h
diff --git a/arch/riscv/include/asm/Kbuild b/arch/riscv/include/asm/Kbuild
index 576ffdca06ba..8d3e7aef3234 100644
--- a/arch/riscv/include/asm/Kbuild
+++ b/arch/riscv/include/asm/Kbuild
@@ -4,9 +4,9 @@ generic-y += checksum.h
 generic-y += cputime.h
 generic-y += device.h
 generic-y += div64.h
-generic-y += dma.h
 generic-y += dma-contiguous.h
 generic-y += dma-mapping.h
+generic-y += dma.h
 generic-y += emergency-restart.h
 generic-y += errno.h
 generic-y += exec.h
@@ -45,6 +45,7 @@ generic-y += setup.h
 generic-y += shmbuf.h
 generic-y += shmparam.h
 generic-y += signal.h
+generic-y += simd.h
 generic-y += socket.h
 generic-y += sockios.h
 generic-y += stat.h
diff --git a/arch/s390/include/asm/Kbuild b/arch/s390/include/asm/Kbuild
index e3239772887a..7a26dc6ce815 100644
--- a/arch/s390/include/asm/Kbuild
+++ b/arch/s390/include/asm/Kbuild
@@ -7,9 +7,9 @@ generated-y += unistd_nr.h
 generic-y += asm-offsets.h
 generic-y += cacheflush.h
 generic-y += device.h
+generic-y += div64.h
 generic-y += dma-contiguous.h
 generic-y += dma-mapping.h
-generic-y += div64.h
 generic-y += emergency-restart.h
 generic-y += export.h
 generic-y += fb.h
@@ -22,6 +22,7 @@ generic-y += mcs_spinlock.h
 generic-y += mm-arch-hooks.h
 generic-y += preempt.h
 generic-y += rwsem.h
+generic-y += simd.h
 generic-y += trace_clock.h
 generic-y += unaligned.h
 generic-y += word-at-a-time.h
diff --git a/arch/sh/include/asm/Kbuild b/arch/sh/include/asm/Kbuild
index 6a5609a55965..8e64ff35a933 100644
--- a/arch/sh/include/asm/Kbuild
+++ b/arch/sh/include/asm/Kbuild
@@ -16,6 +16,7 @@ generic-y += percpu.h
 generic-y += preempt.h
 generic-y += rwsem.h
 generic-y += serial.h
+generic-y += simd.h
 generic-y += sizes.h
 generic-y += trace_clock.h
 generic-y += xor.h
diff --git a/arch/sparc/include/asm/Kbuild b/arch/sparc/include/asm/Kbuild
index 410b263ef5c8..72b9e08fb350 100644
--- a/arch/sparc/include/asm/Kbuild
+++ b/arch/sparc/include/asm/Kbuild
@@ -17,5 +17,6 @@ generic-y += msi.h
 generic-y += preempt.h
 generic-y += rwsem.h
 generic-y += serial.h
+generic-y += simd.h
 generic-y += trace_clock.h
 generic-y += word-at-a-time.h
diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild
index b10dde6cb793..d37288b08dd2 100644
--- a/arch/um/include/asm/Kbuild
+++ b/arch/um/include/asm/Kbuild
@@ -16,15 +16,16 @@ generic-y += io.h
 generic-y += irq_regs.h
 generic-y += irq_work.h
 generic-y += kdebug.h
+generic-y += kprobes.h
 generic-y += mcs_spinlock.h
 generic-y += mm-arch-hooks.h
 generic-y += param.h
 generic-y += pci.h
 generic-y += percpu.h
 generic-y += preempt.h
+generic-y += simd.h
 generic-y += switch_to.h
 generic-y += topology.h
 generic-y += trace_clock.h
 generic-y += word-at-a-time.h
 generic-y += xor.h
-generic-y += kprobes.h
diff --git a/arch/unicore32/include/asm/Kbuild b/arch/unicore32/include/asm/Kbuild
index bfc7abe77905..98a908720bbd 100644
--- a/arch/unicore32/include/asm/Kbuild
+++ b/arch/unicore32/include/asm/Kbuild
@@ -27,6 +27,7 @@ generic-y += preempt.h
 generic-y += sections.h
 generic-y += segment.h
 generic-y += serial.h
+generic-y += simd.h
 generic-y += sizes.h
 generic-y += syscalls.h
 generic-y += topology.h
diff --git a/arch/x86/include/asm/simd.h b/arch/x86/include/asm/simd.h
index a341c878e977..79411178988a 100644
--- a/arch/x86/include/asm/simd.h
+++ b/arch/x86/include/asm/simd.h
@@ -1,4 +1,11 @@
-/* SPDX-License-Identifier: GPL-2.0 */
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <linux/simd.h>
+#ifndef _ASM_SIMD_H
+#define _ASM_SIMD_H
 
 #include <asm/fpu/api.h>
 
@@ -10,3 +17,24 @@ static __must_check inline bool may_use_simd(void)
 {
 	return irq_fpu_usable();
 }
+
+static inline simd_context_t simd_get(void)
+{
+	bool have_simd = false;
+#if !defined(CONFIG_UML)
+	have_simd = may_use_simd();
+	if (have_simd)
+		kernel_fpu_begin();
+#endif
+	return have_simd ? HAVE_FULL_SIMD : HAVE_NO_SIMD;
+}
+
+static inline void simd_put(simd_context_t prior_context)
+{
+#if !defined(CONFIG_UML)
+	if (prior_context != HAVE_NO_SIMD)
+		kernel_fpu_end();
+#endif
+}
+
+#endif /* _ASM_SIMD_H */
diff --git a/arch/xtensa/include/asm/Kbuild b/arch/xtensa/include/asm/Kbuild
index e5e1e61c538c..e3b194a187f9 100644
--- a/arch/xtensa/include/asm/Kbuild
+++ b/arch/xtensa/include/asm/Kbuild
@@ -23,6 +23,7 @@ generic-y += percpu.h
 generic-y += preempt.h
 generic-y += rwsem.h
 generic-y += sections.h
+generic-y += simd.h
 generic-y += topology.h
 generic-y += trace_clock.h
 generic-y += word-at-a-time.h
diff --git a/include/asm-generic/simd.h b/include/asm-generic/simd.h
index d0343d58a74a..fad899a5a92d 100644
--- a/include/asm-generic/simd.h
+++ b/include/asm-generic/simd.h
@@ -1,5 +1,9 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 
+#include <linux/simd.h>
+#ifndef _ASM_SIMD_H
+#define _ASM_SIMD_H
+
 #include <linux/hardirq.h>
 
 /*
@@ -13,3 +17,14 @@ static __must_check inline bool may_use_simd(void)
 {
 	return !in_interrupt();
 }
+
+static inline simd_context_t simd_get(void)
+{
+	return HAVE_NO_SIMD;
+}
+
+static inline void simd_put(simd_context_t prior_context)
+{
+}
+
+#endif /* _ASM_SIMD_H */
diff --git a/include/linux/simd.h b/include/linux/simd.h
new file mode 100644
index 000000000000..f62d047188bf
--- /dev/null
+++ b/include/linux/simd.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#ifndef _SIMD_H
+#define _SIMD_H
+
+typedef enum {
+	HAVE_NO_SIMD,
+	HAVE_FULL_SIMD
+} simd_context_t;
+
+#include <linux/sched.h>
+#include <asm/simd.h>
+
+static inline simd_context_t simd_relax(simd_context_t prior_context)
+{
+#ifdef CONFIG_PREEMPT
+	if (prior_context != HAVE_NO_SIMD && need_resched()) {
+		simd_put(prior_context);
+		return simd_get();
+	}
+#endif
+	return prior_context;
+}
+
+#endif /* _SIMD_H */
-- 
2.18.0

^ permalink raw reply related

* Re: broken behaviour of TC filter delete
From: Roman Mashak @ 2018-08-24 16:18 UTC (permalink / raw)
  To: Jiri Pirko; +Cc: netdev, jiri, Jamal Hadi Salim
In-Reply-To: <20180824081751.GA2931@nanopsycho>

Jiri Pirko <jiri@resnulli.us> writes:

> Thu, Aug 23, 2018 at 11:39:22PM CEST, mrv@mojatatu.com wrote:
>>
>>
>>It appears that the following commit changed the behaviour of scenario where a
>>filter is deleted twice:
>>
>>commit f71e0ca4db187af7c44987e9d21e9042c3046070
>>Author: Jiri Pirko <jiri@mellanox.com>
>>Date:   Mon Jul 23 09:23:05 2018 +0200
>>
>>    net: sched: Avoid implicit chain 0 creation
>>
>>
>>Steps to reproduce :
>>
>>1) create dummy device
>>   $ ip link add dev dummy0 type dummy
>>
>>2) create qdisc
>>   $ tc qdisc add dev dummy0 ingress
>>
>>3) create simple u32 filter with action attached
>>   $ tc filter add dev dummy0 parent ffff: protocol ip prio 1 u32 match ip src 10.10.10.1/32 action ok
>>
>>4) list the filter
>>   $ tc filter ls dev dummy0 parent ffff:
>>
>>5) delete the filter with the given protocol and priority
>>   $ tc filter del dev dummy0 parent ffff: protocol ip prio 1
>>
>>6) repeat step 5, this will return -ENOENT ("Error: Filter with specified priority/protocol not found.")
>>However, before the change at step 6 we would get -EINVAL (Error: Cannot find specified filter chain.)
>>and that makes sense.
>
> Wait, this now returns:
> Error: Cannot find specified filter chain.
> So you want it to return -EINVAL (Error: Cannot find specified filter chain.) ?
> How about for other chains?
>

I must've mixed up the return codes and messages while typing the
message.

So _before_ commit f71e0ca4db187af7c44987e9d21e9042c3046070 step 6 would
return -ENOENT with "Error: Filter with specified priority/protocol not
found." and _after_ the commit it returns -EINVAL (Error: Cannot find
specified filter chain.)

ENOENT seems to be more logical to return when there's no more filter to delete.

^ permalink raw reply

* Re: [PATCH] sh_eth: Add R7S9210 support
From: Sergei Shtylyov @ 2018-08-24 16:07 UTC (permalink / raw)
  To: Chris Brandt, David S . Miller, Rob Herring, Mark Rutland
  Cc: netdev, devicetree, linux-renesas-soc, Simon Horman
In-Reply-To: <20180822205705.20835-1-chris.brandt@renesas.com>

On 08/22/2018 11:57 PM, Chris Brandt wrote:

> Add support for the R7S9210 which is part of the RZ/A2 series.

   Is the manual publicly available? I tried to find it on Renesas' website
but it appears royally broken...

> Signed-off-by: Chris Brandt <chris.brandt@renesas.com>
> ---
>  Documentation/devicetree/bindings/net/sh_eth.txt |   1 +
>  drivers/net/ethernet/renesas/sh_eth.c            | 107 +++++++++++++++++++++++
>  drivers/net/ethernet/renesas/sh_eth.h            |   1 +
>  3 files changed, 109 insertions(+)
> 
[...]
> diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c
> index 5573199c4536..f3a575e56ae4 100644
> --- a/drivers/net/ethernet/renesas/sh_eth.c
> +++ b/drivers/net/ethernet/renesas/sh_eth.c
> @@ -216,6 +216,59 @@ static const u16 sh_eth_offset_fast_rz[SH_ETH_MAX_REGISTER_OFFSET] = {
>  	[RXALCR0]	= 0x008C,
>  };
>  
> +static const u16 sh_eth_offset_fast_rza2[SH_ETH_MAX_REGISTER_OFFSET] = {
> +	SH_ETH_OFFSET_DEFAULTS,
> +
> +	[EDMR]		= 0x0000,
> +	[EDTRR]		= 0x0008,
> +	[EDRRR]		= 0x0010,
> +	[EESR]		= 0x0028,
> +	[EESIPR]	= 0x0030,
> +	[TDLAR]		= 0x0018,
> +	[TBRAR]		= 0x00d4,
> +	[TDFAR]		= 0x00d8,
> +	[RDLAR]		= 0x0020,
> +	[RBWAR]		= 0x00c8,
> +	[RDFAR]		= 0x00cc,
> +	[TRSCER]	= 0x0038,
> +	[RMFCR]		= 0x0040,
> +	[TFTR]		= 0x0048,
> +	[FDR]		= 0x0050,
> +	[RMCR]		= 0x0058,
> +	[TFUCR]		= 0x0064,
> +	[RFOCR]		= 0x0068,
> +	[RPADIR]	= 0x0078,
> +	[TRIMD]		= 0x007c,
> +	[FCFTR]		= 0x0070,
> +
> +	[ECMR]		= 0x0100,
> +	[RFLR]		= 0x0108,
> +	[ECSR]		= 0x0110,
> +	[ECSIPR]	= 0x0118,
> +	[PIR]		= 0x0120,
> +	[PSR]		= 0x0128,
> +	[RDMLR]		= 0x0140,
> +	[IPGR]		= 0x0150,
> +	[APR]		= 0x0154,
> +	[MPR]		= 0x0158,
> +	[RFCF]		= 0x0160,
> +	[TPAUSER]	= 0x0164,
> +	[TPAUSECR]	= 0x0168,
> +	[BCFRR]		= 0x016c,
> +	[MAHR]		= 0x01c0,
> +	[MALR]		= 0x01c8,
> +	[TROCR]		= 0x01d0,
> +	[CDCR]		= 0x01d4,
> +	[LCCR]		= 0x01d8,
> +	[CNDCR]		= 0x01dc,
> +	[CEFCR]		= 0x01e4,
> +	[FRECR]		= 0x01e8,
> +	[TSFRCR]	= 0x01ec,
> +	[TLFRCR]	= 0x01f0,
> +	[RFCR]		= 0x01f4,
> +	[MAFCR]		= 0x01f8,
> +};
> +

   This array is absolutely the same as sh_eth_offset_fast_sh4[], except the latter
is (more or less) sorted.

>  static const u16 sh_eth_offset_fast_rcar[SH_ETH_MAX_REGISTER_OFFSET] = {
>  	SH_ETH_OFFSET_DEFAULTS,
>  
> @@ -635,6 +688,56 @@ static struct sh_eth_cpu_data r7s72100_data = {
>  	.no_tx_cntrs	= 1,
>  };
>  
> +/* R7S9210 */
> +static void sh_eth_set_rate_r7s9210(struct net_device *ndev)
> +{
> +	struct sh_eth_private *mdp = netdev_priv(ndev);
> +	unsigned long RTM = 0x00000004;

  Use #define, please.

> +
> +	switch (mdp->speed) {
> +	case 10: /* 10BASE */
> +		sh_eth_modify(ndev, ECMR, RTM, 0);
> +		break;
> +	case 100:/* 100BASE */
> +		sh_eth_modify(ndev, ECMR, RTM, RTM);
> +		break;
> +	}
> +}
> +

  Seems the same as sh_eth_set_rate_rcar(), except for the bit name...

> +static struct sh_eth_cpu_data r7s9210_data = {
> +	.soft_reset	= sh_eth_soft_reset,
> +
> +	.set_duplex	= sh_eth_set_duplex,
> +	.set_rate	= sh_eth_set_rate_r7s9210,
> +
> +	.register_type	= SH_ETH_REG_FAST_RZA2,
> +
> +	.edtrr_trns	= EDTRR_TRNS_ETHER,
> +	.ecsr_value	= ECSR_ICD,
> +	.ecsipr_value	= ECSIPR_ICDIP,
> +	.eesipr_value	= EESIPR_TWBIP | EESIPR_TABTIP | EESIPR_RABTIP |
> +			  EESIPR_RFCOFIP | EESIPR_ECIIP | EESIPR_FTCIP |
> +			  EESIPR_TDEIP | EESIPR_TFUFIP | EESIPR_FRIP |
> +			  EESIPR_RDEIP | EESIPR_RFOFIP | EESIPR_CNDIP |
> +			  EESIPR_DLCIP | EESIPR_CDIP | EESIPR_TROIP |
> +			  EESIPR_RMAFIP | EESIPR_RRFIP | EESIPR_RTLFIP |
> +			  EESIPR_RTSFIP | EESIPR_PREIP | EESIPR_CERFIP,
> +
> +	.tx_check	= EESR_FTC | EESR_CND | EESR_DLC | EESR_CD | EESR_TRO,
> +	.eesr_err_check	= EESR_TWB | EESR_TABT | EESR_RABT | EESR_RFE |
> +			  EESR_RDE | EESR_RFRMER | EESR_TFE | EESR_TDE,
> +
> +	.fdr_value	= 0x0000070f,
> +
> +	.apr		= 1,
> +	.mpr		= 1,
> +	.tpauser	= 1,
> +	.hw_swap	= 1,
> +	.rpadir		= 1,
> +	.no_ade		= 1,
> +	.xdfar_rw	= 1,
> +};
> +

   Can't check these w/o the manual...

>  static void sh_eth_chip_reset_r8a7740(struct net_device *ndev)
>  {
>  	sh_eth_chip_reset(ndev);
[...]

MNR, Sergei

^ permalink raw reply

* Re: [RFC 1/3 net] lorawan: Add LoRaWAN class module
From: Jian-Hong Pan @ 2018-08-24 15:58 UTC (permalink / raw)
  To: rdunlap
  Cc: Andreas Färber, netdev,
	<linux-arm-kernel@lists.infradead.org\,
	linux-kernel@vger.kernel.org>,, Jiri Pirko, Marcel Holtmann,
	David S. Miller, Matthias Brugger, Janus Piwek,
	Michael Röder, Dollar Chen, Ken Yu, Konstantin Böhm,
	Jan Jongboom, Jon Ortego, linux-kernel@vger.kernel.org>,,
	Ben Whitten
In-Reply-To: <30b75248-7589-fc98-819f-2b68844522f1@infradead.org>

Randy Dunlap <rdunlap@infradead.org> 於 2018年8月24日 週五 上午1:43寫道:
>
> Hi,
>
> On 08/23/2018 10:15 AM, Jian-Hong Pan wrote:
> > diff --git a/net/maclorawan/Kconfig b/net/maclorawan/Kconfig
> > new file mode 100644
> > index 000000000000..741992b059c6
> > --- /dev/null
> > +++ b/net/maclorawan/Kconfig
> > @@ -0,0 +1,14 @@
> > +config LORAWAN
> > +     tristate "MAC layer of LoRaWAN Network support"
> > +     select CRYPTO
> > +     select CRYPTO_CMAC
> > +     select CRYPTO_CBC
> > +     select CRYPTO_AES
> > +     ---help---
> > +       LoRaWAN defines a low data rate, low power and long range wireless
> > +       wide area networks. It was designed to organise networks of sensors,
> > +       switches and actuators, etc automation devices.  It could operate as
> > +       multiple kilometers wide.
>
> There are several things that I would change.  How about this instead?
>
>
>           LoRaWAN defines low data rate, low power and long range wireless
>           wide area networks. It was designed to organise networks of automation
>           devices, such as sensors, switches and actuators. It can operate
>           multiple kilometers wide.

That becomes better!  Thank you

> > +
> > +       Say Y here to compile LoRaWAN support into the kernel or say M to
> > +       compile it as modules.
>
>
> --
> ~Randy

^ permalink raw reply

* Re: [net 07/13] ice: Use order_base_2 to calculate higher power of 2
From: Sergei Shtylyov @ 2018-08-24 15:53 UTC (permalink / raw)
  To: Keller, Jacob E, Kirsher, Jeffrey T, davem@davemloft.net
  Cc: netdev@vger.kernel.org, nhorman@redhat.com, sassmann@redhat.com,
	jogreene@redhat.com, Venkataramanan, Anirudh
In-Reply-To: <02874ECE860811409154E81DA85FBB5884C6E3BF@ORSMSX115.amr.corp.intel.com>

On 08/24/2018 06:39 PM, Keller, Jacob E wrote:

>>> From: Jacob Keller <jacob.e.keller@intel.com>
>>>
>>> Currently, we use a combination of ilog2 and is_power_of_2() to
>>> calculate the next power of 2 for the qcount. This appears to be causing
>>> a warning on some combinations of GCC and the Linux kernel:
>>>
>>> MODPOST 1 modules
>>> WARNING: "____ilog2_NaN" [ice.ko] undefined!
>>>
>>> This appears to because because GCC realizes that qcount could be zero
>>
>>     One "because" is enough. :-)
> 
> Woops! Thanks for catching it.

   You probably meant to type "happen" instead of the 1st because...

> Regards,
> Jake

MBR, Sergei

^ permalink raw reply

* RE: [net 07/13] ice: Use order_base_2 to calculate higher power of 2
From: Keller, Jacob E @ 2018-08-24 15:39 UTC (permalink / raw)
  To: Sergei Shtylyov, Kirsher, Jeffrey T, davem@davemloft.net
  Cc: netdev@vger.kernel.org, nhorman@redhat.com, sassmann@redhat.com,
	jogreene@redhat.com, Venkataramanan, Anirudh
In-Reply-To: <9c66a3d0-9c2a-c9dc-b3e1-d7e9ea12e402@cogentembedded.com>



> -----Original Message-----
> From: netdev-owner@vger.kernel.org [mailto:netdev-owner@vger.kernel.org]
> On Behalf Of Sergei Shtylyov
> Sent: Friday, August 24, 2018 2:03 AM
> To: Kirsher, Jeffrey T <jeffrey.t.kirsher@intel.com>; davem@davemloft.net
> Cc: Keller, Jacob E <jacob.e.keller@intel.com>; netdev@vger.kernel.org;
> nhorman@redhat.com; sassmann@redhat.com; jogreene@redhat.com;
> Venkataramanan, Anirudh <anirudh.venkataramanan@intel.com>
> Subject: Re: [net 07/13] ice: Use order_base_2 to calculate higher power of 2
> 
> Hello!
> 
> On 8/23/2018 10:14 PM, Jeff Kirsher wrote:
> 
> > From: Jacob Keller <jacob.e.keller@intel.com>
> >
> > Currently, we use a combination of ilog2 and is_power_of_2() to
> > calculate the next power of 2 for the qcount. This appears to be causing
> > a warning on some combinations of GCC and the Linux kernel:
> >
> > MODPOST 1 modules
> > WARNING: "____ilog2_NaN" [ice.ko] undefined!
> >
> > This appears to because because GCC realizes that qcount could be zero
> 
>     One "because" is enough. :-)

Woops! Thanks for catching it.

Regards,
Jake



^ permalink raw reply

* Re: [PATCH v2 01/29] nvmem: add support for cell lookups
From: Boris Brezillon @ 2018-08-24 15:08 UTC (permalink / raw)
  To: Bartosz Golaszewski
  Cc: Jonathan Corbet, Sekhar Nori, Kevin Hilman, Russell King,
	Arnd Bergmann, Greg Kroah-Hartman, David Woodhouse, Brian Norris,
	Marek Vasut, Richard Weinberger, Grygorii Strashko,
	David S . Miller, Srinivas Kandagatla, Naren,
	Mauro Carvalho Chehab, Andrew Morton, Lukas Wunner, Dan Carpenter,
	Florian Fainelli
In-Reply-To: <20180810080526.27207-2-brgl@bgdev.pl>

Hi Bartosz,

On Fri, 10 Aug 2018 10:04:58 +0200
Bartosz Golaszewski <brgl@bgdev.pl> wrote:

> +struct nvmem_cell_lookup {
> +	struct nvmem_cell_info	info;
> +	struct list_head	list;
> +	const char		*nvmem_name;
> +};

Hm, maybe I don't get it right, but this looks suspicious. Usually the
consumer lookup table is here to attach device specific names to
external resources.

So what I'd expect here is:

struct nvmem_cell_lookup {
	/* The nvmem device name. */
	const char *nvmem_name;

	/* The nvmem cell name */
	const char *nvmem_cell_name;

	/*
	 * The local resource name. Basically what you have in the
	 * nvmem-cell-names prop.
	 */
	const char *conid;
};

struct nvmem_cell_lookup_table {
	struct list_head list;

	/* ID of the consumer device. */
	const char *devid;

	/* Array of cell lookup entries. */
	unsigned int ncells;
	const struct nvmem_cell_lookup *cells;
};

Looks like your nvmem_cell_lookup is more something used to attach cells
to an nvmem device, which is NVMEM provider's responsibility not the
consumer one.

Regards,

Boris

^ permalink raw reply

* Re: [PATCH v2] net: macb: do not disable MDIO bus at open/close time
From: Claudiu Beznea @ 2018-08-24 14:47 UTC (permalink / raw)
  To: Anssi Hannula, netdev, Nicolas Ferre; +Cc: David S. Miller, Andrew Lunn
In-Reply-To: <20180823074522.5663-1-anssi.hannula@bitwise.fi>



On 23.08.2018 10:45, Anssi Hannula wrote:
> macb_reset_hw() is called from macb_close() and indirectly from
> macb_open(). macb_reset_hw() zeroes the NCR register, including the MPE
> (Management Port Enable) bit.
> 
> This will prevent accessing any other PHYs for other Ethernet MACs on
> the MDIO bus, which remains registered at macb_reset_hw() time, until
> macb_init_hw() is called from macb_open() which sets the MPE bit again.
> 
> I.e. currently the MDIO bus has a short disruption at open time and is
> disabled at close time until the interface is opened again.
> 
> Fix that by only touching the RE and TE bits when enabling and disabling
> RX/TX.
> 
> v2: Make macb_init_hw() NCR write a single statement.

This usually goes after the tree '-' below.

> 
> Fixes: 6c36a7074436 ("macb: Use generic PHY layer")
> Signed-off-by: Anssi Hannula <anssi.hannula@bitwise.fi>
> ---

Here:

v2: Make macb_init_hw() NCR write a single statement.

Other than this: 

Reviewed-by: Claudiu Beznea <claudiu.beznea@microchip.com>

Checked that link goes up and ping is working on a SAMA5D2 Xplained:

Tested-by: Claudiu Beznea <claudiu.beznea@microchip.com>

>  drivers/net/ethernet/cadence/macb_main.c | 9 ++++++---
>  1 file changed, 6 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c
> index dc09f9a8a49b..225a7c8bad2d 100644
> --- a/drivers/net/ethernet/cadence/macb_main.c
> +++ b/drivers/net/ethernet/cadence/macb_main.c
> @@ -2028,14 +2028,17 @@ static void macb_reset_hw(struct macb *bp)
>  {
>  	struct macb_queue *queue;
>  	unsigned int q;
> +	u32 ctrl = macb_readl(bp, NCR);
>  
>  	/* Disable RX and TX (XXX: Should we halt the transmission
>  	 * more gracefully?)
>  	 */
> -	macb_writel(bp, NCR, 0);
> +	ctrl &= ~(MACB_BIT(RE) | MACB_BIT(TE));
>  
>  	/* Clear the stats registers (XXX: Update stats first?) */
> -	macb_writel(bp, NCR, MACB_BIT(CLRSTAT));
> +	ctrl |= MACB_BIT(CLRSTAT);
> +
> +	macb_writel(bp, NCR, ctrl);
>  
>  	/* Clear all status flags */
>  	macb_writel(bp, TSR, -1);
> @@ -2223,7 +2226,7 @@ static void macb_init_hw(struct macb *bp)
>  	}
>  
>  	/* Enable TX and RX */
> -	macb_writel(bp, NCR, MACB_BIT(RE) | MACB_BIT(TE) | MACB_BIT(MPE));
> +	macb_writel(bp, NCR, macb_readl(bp, NCR) | MACB_BIT(RE) | MACB_BIT(TE));
>  }
>  
>  /* The hash address register is 64 bits long and takes up two
> 

^ permalink raw reply

* Re: [PATCH] ixgb: replace dma_alloc_coherent + memset with dma_zalloc_coherent
From: Jeff Kirsher @ 2018-08-24 18:21 UTC (permalink / raw)
  To: Colin King, David S . Miller, intel-wired-lan, netdev
  Cc: kernel-janitors, linux-kernel
In-Reply-To: <20180824165232.21585-1-colin.king@canonical.com>

[-- Attachment #1: Type: text/plain, Size: 547 bytes --]

On Fri, 2018-08-24 at 17:52 +0100, Colin King wrote:
> From: Colin Ian King <colin.king@canonical.com>
> 
> Use dma_zalloc_coherent rather than dam_alloc_coherent and memset.
> 
> Signed-off-by: Colin Ian King <colin.king@canonical.com>
> ---
>  drivers/net/ethernet/intel/ixgb/ixgb_main.c | 6 ++----
>  1 file changed, 2 insertions(+), 4 deletions(-)

Thanks Colin, I already have this change queued up from another community
member and I am just about to send a pull request to Dave for this change,
as well as several other fixes.

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* Re: ixgbe hangs when XDP_TX is enabled
From: Jesper Dangaard Brouer @ 2018-08-24 14:25 UTC (permalink / raw)
  To: Jeff Kirsher
  Cc: brouer, Alexander Duyck, tehnerd, Netdev, tytus.a.wasilewski,
	Tymoteusz Kielan, John Fastabend, Daniel Borkmann,
	Alexei Starovoitov
In-Reply-To: <e9762fc08e0c62c52ee5169e43e512f5d65087a2.camel@intel.com>


On Wed, 22 Aug 2018 09:22:58 -0700 Jeff Kirsher <jeffrey.t.kirsher@intel.com> wrote:
> On Tue, 2018-08-21 at 11:13 -0700, Alexander Duyck wrote:
> > On Tue, Aug 21, 2018 at 9:59 AM Nikita V. Shirokov <tehnerd@tehnerd.com> wrote:
> > > 
> > > On Tue, Aug 21, 2018 at 08:58:15AM -0700, Alexander Duyck wrote:  
> > > > On Mon, Aug 20, 2018 at 12:32 PM Nikita V. Shirokov <tehnerd@tehnerd.com> wrote:
> > > > > 
> > > > > we are getting such errors:
> > > > > 
> > > > > [  408.737313] ixgbe 0000:03:00.0 eth0: Detected Tx Unit Hang (XDP)
> > > > >                   Tx Queue             <46>
> > > > >                   TDH, TDT             <0>, <2>
> > > > >                   next_to_use          <2>
> > > > >                   next_to_clean        <0>
> > > > >                 tx_buffer_info[next_to_clean]
> > > > >                   time_stamp           <0>
> > > > >                   jiffies              <1000197c0>
> > > > > [  408.804438] ixgbe 0000:03:00.0 eth0: tx hang 1 detected on queue 46, resetting adapter
> > > > > [  408.804440] ixgbe 0000:03:00.0 eth0: initiating reset due to tx timeout
> > > > > [  408.817679] ixgbe 0000:03:00.0 eth0: Reset adapter
> > > > > [  408.866091] ixgbe 0000:03:00.0 eth0: TXDCTL.ENABLE for one or more queues not cleared within the polling period
> > > > > [  409.345289] ixgbe 0000:03:00.0 eth0: detected SFP+: 3
> > > > > [  409.497232] ixgbe 0000:03:00.0 eth0: NIC Link is Up 10 Gbps, Flow Control: RX/TX
> > > > > 
> > > > > while running XDP prog on ixgbe nic.
> > > > > right now i'm seing this on bpfnext kernel
> > > > > (latest commit from Wed Aug 15 15:04:25 2018 -0700 ;
> > > > > 9a76aba02a37718242d7cdc294f0a3901928aa57)
> > > > > 
> > > > > looks like this is the same issue as reported by Brenden in
> > > > > https://www.spinics.net/lists/netdev/msg439438.html
> > > > > 
> > > > [...] The total number of CPUs on the system
> > > > would be useful to know as well.
[...]
> > > # nproc
> > > 48
> > > 
[...]
> > > ethtool -l eth0
> > > Channel parameters for eth0:
> > > Pre-set maximums:
> > > RX:             0
> > > TX:             0
> > > Other:          1
> > > Combined:       63
> > > Current hardware settings:
> > > RX:             0
> > > TX:             0
> > > Other:          1
> > > Combined:       48
[...]

> > > 
> > > workaround for now is to do the same, as Brenden did in his
> > > original
> > > finding: make sure that combined + xdp queues < max_tx_queues
> > > (e.g. w/ combined == 14 the issue goes away).
> > >   
> > > > the issue with one of the sample XDP programs provided with the
> > > > kernel such as the xdp2 which I believe uses the XDP_TX
> > > > function. We need to try and create a similar setup in our own
> > > > environment for reproduction and debugging.  
> > > 
> > > will try but this could take a while, because i'm not sure that we
> > > have ixgbe in our test lab (and it would be hard to run such test
> > > in prod)

Notice to reproduce you need a system with 48 cores. (I predict: for
less than 33 cores it will not show, and above 48 cores the XDP prog
should be rejected loading).


> > 
> > So I have been reading the datasheet
> > (
> > https://www.intel.com/content/dam/www/public/us/en/documents/datasheets/82599-10-gbe-controller-datasheet.pdf
> > )
> > and it looks like the assumption that Brenden came to in the earlier
> > referenced link is probably correct. From what I can tell there is a
> > limit of 64 queues in the base RSS mode of the device, so while it
> > supports more than 64 queues you can only make use of 64 as per table
> > 7-25.
> > 

As far as I can remember, the driver code assumes up-to 96 queue are
avail.  It sounds like the driver XDP code that allocates 'one XDP
TX-queue per core' in the system are causing this.

I have previously complained that the ixgbe driver will be able to
enable XDP on machines with many CPU cores, due to the 'one XDP TX-queue
per core' design requirement.


> > For now I think the workaround you are using is probably the only
> > viable solution. I myself don't have time to work on resolving this,
> > but I am sure on of the maintainers for ixgbe will be responding
> > shortly.  
> 
> I have notified the 10GbE maintainers, and we are working to reproduce
> the issue currently.

For reproducers, notice the correlation with the number of cores the
system have.

 
> > One possible solution we may want to look at would be to make use of
> > the 32 pool/VF mode in the MTQC register. That should enable us to
> > make use of all 128 queues but I am sure there would be other side
> > effects such as having to set the bits in the PFVFTE register in
> > order to enable the extra Tx queues.  
 
Getting access to more queue is of-cause good, as it move the bar for
how many cores a system can have before XDP will no-longer work with
the ixgbe driver.

An alternative solution is also possible, but there will be a
performance trade off.  After merge commit 10f678683e4 ("Merge branch
'xdp_xmit-bulking'") the ndo_xdp_xmit() gets bulks of 16 frames (limit
within devmap).  Thus, on systems that cannot allocate a NIC HW queue
foreach CPU core, can alternatively use locked XDP TX queue(s) (which
will be amortized due to bulking).  This mode will be slower, thus the
question is how do we "warn" the user, that this will be operating in a
slightly less optimal XDP-TX mode? (will a simple pr_info be enough,
like when there is insufficient PCIe BW).

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply

* Re: [PATCH] wcn36xx: replace dma_alloc_coherent + memset with dma_zalloc_coherent
From: Kalle Valo @ 2018-08-24 17:35 UTC (permalink / raw)
  To: Colin King
  Cc: David S . Miller, wcn36xx, linux-wireless, netdev,
	kernel-janitors, linux-kernel
In-Reply-To: <20180824171410.22539-1-colin.king@canonical.com>

Colin King <colin.king@canonical.com> writes:

> From: Colin Ian King <colin.king@canonical.com>
>
> It is preferable to use dma_zalloc_coherent rather than dam_alloc_coherent
> and memset, so use it.
>
> Signed-off-by: Colin Ian King <colin.king@canonical.com>
> ---
>  drivers/net/wireless/ath/wcn36xx/dxe.c | 6 ++----
>  1 file changed, 2 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/net/wireless/ath/wcn36xx/dxe.c b/drivers/net/wireless/ath/wcn36xx/dxe.c
> index 06cfe8d311f3..e66ddaa39523 100644
> --- a/drivers/net/wireless/ath/wcn36xx/dxe.c
> +++ b/drivers/net/wireless/ath/wcn36xx/dxe.c
> @@ -174,13 +174,11 @@ static int wcn36xx_dxe_init_descs(struct device *dev, struct wcn36xx_dxe_ch *wcn
>  	int i;
>  
>  	size = wcn_ch->desc_num * sizeof(struct wcn36xx_dxe_desc);
> -	wcn_ch->cpu_addr = dma_alloc_coherent(dev, size, &wcn_ch->dma_addr,
> -					      GFP_KERNEL);
> +	wcn_ch->cpu_addr = dma_zalloc_coherent(dev, size, &wcn_ch->dma_addr,
> +					       GFP_KERNEL);
>  	if (!wcn_ch->cpu_addr)
>  		return -ENOMEM;
>  
> -	memset(wcn_ch->cpu_addr, 0, size);
> -
>  	cur_dxe = (struct wcn36xx_dxe_desc *)wcn_ch->cpu_addr;
>  	cur_ctl = wcn_ch->head_blk_ctl;

Hehe, I think you are the third person submitting a similar patch :)
Anyway, I just applied this few hours ago:

https://git.kernel.org/pub/scm/linux/kernel/git/kvalo/ath.git/commit/?h=ath-next&id=d410e28f3ae476e1572b8893c646ef44fae7bbbd

-- 
Kalle Valo

^ permalink raw reply

* Re: [PATCH] i40e: report correct statistics when XDP is enabled
From: Jesper Dangaard Brouer @ 2018-08-24 14:00 UTC (permalink / raw)
  To: Björn Töpel
  Cc: jeffrey.t.kirsher, intel-wired-lan, magnus.karlsson,
	magnus.karlsson, jacob.e.keller, bjorn.topel, brouer,
	netdev@vger.kernel.org
In-Reply-To: <20180824112159.32298-1-bjorn.topel@intel.com>

On Fri, 24 Aug 2018 13:21:59 +0200
Björn Töpel <bjorn.topel@intel.com> wrote:

> When XDP is enabled, the driver will report incorrect
> statistics. Received frames will reported as transmitted frames.
> 
> This commits fixes the i40e implementation of ndo_get_stats64 (struct
> net_device_ops), so that iproute2 will report correct statistics
> (e.g. when running "ip -stats link show dev eth0") even when XDP is
> enabled.
> 
> Reported-by: Jesper Dangaard Brouer <brouer@redhat.com>
> Fixes: 74608d17fe29 ("i40e: add support for XDP_TX action")

Stable candidate:
 $ git describe --contains 74608d17fe29
 v4.13-rc1~157^2~128^2~13

> Signed-off-by: Björn Töpel <bjorn.topel@intel.com>

It works for me:

Tested-by: Jesper Dangaard Brouer <brouer@redhat.com>

I'm explicitly _not_ ACK'ing the patch, as I think the your code changes
below makes it harder to follow whether a TX or RX ring is getting
updated. But it is 100% up to the driver maintainers to say if this is
acceptable from a maintenance PoV.

> ---
>  drivers/net/ethernet/intel/i40e/i40e_main.c | 24 +++++++++++----------
>  1 file changed, 13 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
> index e40c023cc7b6..7c122dd3faa1 100644
> --- a/drivers/net/ethernet/intel/i40e/i40e_main.c
> +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
> @@ -425,9 +425,9 @@ static void i40e_get_netdev_stats_struct(struct net_device *netdev,
>  				  struct rtnl_link_stats64 *stats)
>  {
>  	struct i40e_netdev_priv *np = netdev_priv(netdev);
> -	struct i40e_ring *tx_ring, *rx_ring;
>  	struct i40e_vsi *vsi = np->vsi;
>  	struct rtnl_link_stats64 *vsi_stats = i40e_get_vsi_stats_struct(vsi);
> +	struct i40e_ring *ring;
>  	int i;
>  
>  	if (test_bit(__I40E_VSI_DOWN, vsi->state))
> @@ -441,24 +441,26 @@ static void i40e_get_netdev_stats_struct(struct net_device *netdev,
>  		u64 bytes, packets;
>  		unsigned int start;
>  
> -		tx_ring = READ_ONCE(vsi->tx_rings[i]);
> -		if (!tx_ring)
> +		ring = READ_ONCE(vsi->tx_rings[i]);
> +		if (!ring)
>  			continue;
> -		i40e_get_netdev_stats_struct_tx(tx_ring, stats);
> +		i40e_get_netdev_stats_struct_tx(ring, stats);
>  
> -		rx_ring = &tx_ring[1];
> +		if (i40e_enabled_xdp_vsi(vsi)) {
> +			ring++;
> +			i40e_get_netdev_stats_struct_tx(ring, stats);
> +		}
>  
> +		ring++;
>  		do {
> -			start = u64_stats_fetch_begin_irq(&rx_ring->syncp);
> -			packets = rx_ring->stats.packets;
> -			bytes   = rx_ring->stats.bytes;
> -		} while (u64_stats_fetch_retry_irq(&rx_ring->syncp, start));
> +			start   = u64_stats_fetch_begin_irq(&ring->syncp);
> +			packets = ring->stats.packets;
> +			bytes   = ring->stats.bytes;
> +		} while (u64_stats_fetch_retry_irq(&ring->syncp, start));
>  
>  		stats->rx_packets += packets;
>  		stats->rx_bytes   += bytes;
>  
> -		if (i40e_enabled_xdp_vsi(vsi))
> -			i40e_get_netdev_stats_struct_tx(&rx_ring[1], stats);
>  	}
>  	rcu_read_unlock();
>  



-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply

* Re: should __LINK_STATE_* enums really be restricted to generic queueing layer?
From: Andrew Lunn @ 2018-08-24 13:49 UTC (permalink / raw)
  To: rpjday; +Cc: netdev
In-Reply-To: <20180824071738.Horde.kWZLgt1409E2fICBH1FiM5U@crashcourse.ca>

On Fri, Aug 24, 2018 at 07:17:38AM -0400, rpjday@crashcourse.ca wrote:
>   more curious pedantry ... in netdevice.h, one reads:
> 
>   /* These flag bits are private to the generic network queueing
>    * layer; they may not be explicitly referenced by any other
>    * code.
>    */
> 
>   enum netdev_state_t {
>         __LINK_STATE_START,
>         __LINK_STATE_PRESENT,
>         __LINK_STATE_NOCARRIER,
>         __LINK_STATE_LINKWATCH_PENDING,
>         __LINK_STATE_DORMANT,
>   };
> 
> ok, but there are definitely a few examples of what look like
> non-generic network queueing code referencing those enums.
> 
>   one example, drivers/net/ethernet/amd/sun3lance.c, which openly
> (and amusingly) admits that it shouldn't be doing this:
> 
>         dev->netdev_ops = &lance_netdev_ops;
>   //      KLUDGE -- REMOVE ME
>         set_bit(__LINK_STATE_PRESENT, &dev->state);
>         return 1;
>   }
> 
> there are a few more places (admittedly, not many) -- is this
> acceptable behaviour?

Not in a new driver.

The sun3lance is a very old driver. I have misty memories of a Sun
3/60. There is nothing in this code about a driving a PHY. Maybe using
netif_carrier_on() would work, but without having access to the
hardware to test, i would just leave it alone.

       Andrew

^ permalink raw reply

* [PATCH] wcn36xx: replace dma_alloc_coherent + memset with dma_zalloc_coherent
From: Colin King @ 2018-08-24 17:14 UTC (permalink / raw)
  To: Kalle Valo, David S . Miller, wcn36xx, linux-wireless, netdev
  Cc: kernel-janitors, linux-kernel

From: Colin Ian King <colin.king@canonical.com>

It is preferable to use dma_zalloc_coherent rather than dam_alloc_coherent
and memset, so use it.

Signed-off-by: Colin Ian King <colin.king@canonical.com>
---
 drivers/net/wireless/ath/wcn36xx/dxe.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/drivers/net/wireless/ath/wcn36xx/dxe.c b/drivers/net/wireless/ath/wcn36xx/dxe.c
index 06cfe8d311f3..e66ddaa39523 100644
--- a/drivers/net/wireless/ath/wcn36xx/dxe.c
+++ b/drivers/net/wireless/ath/wcn36xx/dxe.c
@@ -174,13 +174,11 @@ static int wcn36xx_dxe_init_descs(struct device *dev, struct wcn36xx_dxe_ch *wcn
 	int i;
 
 	size = wcn_ch->desc_num * sizeof(struct wcn36xx_dxe_desc);
-	wcn_ch->cpu_addr = dma_alloc_coherent(dev, size, &wcn_ch->dma_addr,
-					      GFP_KERNEL);
+	wcn_ch->cpu_addr = dma_zalloc_coherent(dev, size, &wcn_ch->dma_addr,
+					       GFP_KERNEL);
 	if (!wcn_ch->cpu_addr)
 		return -ENOMEM;
 
-	memset(wcn_ch->cpu_addr, 0, size);
-
 	cur_dxe = (struct wcn36xx_dxe_desc *)wcn_ch->cpu_addr;
 	cur_ctl = wcn_ch->head_blk_ctl;
 
-- 
2.17.1

^ permalink raw reply related

* Re: [PATCH NET 3/3] net: hns: add configuration constraints for 1000M half
From: Andrew Lunn @ 2018-08-24 13:28 UTC (permalink / raw)
  To: lipeng (Y)
  Cc: davem, netdev, linux-kernel, linuxarm, yisen.zhuang, salil.mehta
In-Reply-To: <194ac663-94d8-55e2-0231-a6be53e35f04@huawei.com>

On Fri, Aug 24, 2018 at 02:39:36PM +0800, lipeng (Y) wrote:
> 
> 
> On 2018/8/24 11:41, Andrew Lunn wrote:
> >On Fri, Aug 24, 2018 at 11:42:23AM +0800, Peng Li wrote:
> >>Hisilicon hip05 and hip06 board network card do not support
> >>1000M half configuration. Driver can not config gmac as
> >>1000M half.
> >>
> >>Signed-off-by: Peng Li <lipeng321@huawei.com>
> >Hi Peng
> >
> >Does the driver remove SUPPORTED_1000baseT_Half from
> >phydev->supported?  If you do that, the PHY should never negotiate
> >this speed.
> >
> >    Andrew
> Hi, Andrew,
> 
> The driver has removed SUPPORTED_1000baseT_Half from
> 
> phydev->supported.
> 
> the code is :
> #define MAC_GMAC_SUPPORTED \
> 	(SUPPORTED_10baseT_Half \
> 	| SUPPORTED_10baseT_Full \
> 	| SUPPORTED_100baseT_Half \
> 	| SUPPORTED_100baseT_Full \
> 	| SUPPORTED_Autoneg)
>  h->if_support = MAC_GMAC_SUPPORTED;
>  h->if_support |= SUPPORTED_1000baseT_Full;
> phydev->supported &= h->if_support;
> 
> As gmac do not support 1000M half, we add this patch to
> make sure that no users can set 1000M half in any case.

Well, not quite. What this patch does is protect the hardware when it
is asked to change to an unsupported mode. This patch has nothing to
do with user APIs.

The user API for setting link modes is hns_nic_set_link_ksettings().

What you do have is 

                if (speed == SPEED_1000 && cmd->base.duplex == DUPLEX_HALF)
                        return -EINVAL;

which should stop somebody setting up a fixed speed link at 1000Half.

But you don't do anything with cmd->link_modes.advertising. However,
when you call phy_ethtool_ksettings_set(), it gets AND'ed with
phydev->supported, which you have already removed 1000Half from.

So is this a patch for a theoretical problem? Or have you seen it
happen? If it did happen, how did the user configure it to cause this
problem? That user API needs to prevent it.

	 Andrew

^ permalink raw reply

* [PATCH] ixgb: replace dma_alloc_coherent + memset with dma_zalloc_coherent
From: Colin King @ 2018-08-24 16:52 UTC (permalink / raw)
  To: Jeff Kirsher, David S . Miller, intel-wired-lan, netdev
  Cc: kernel-janitors, linux-kernel

From: Colin Ian King <colin.king@canonical.com>

Use dma_zalloc_coherent rather than dam_alloc_coherent and memset.

Signed-off-by: Colin Ian King <colin.king@canonical.com>
---
 drivers/net/ethernet/intel/ixgb/ixgb_main.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgb/ixgb_main.c b/drivers/net/ethernet/intel/ixgb/ixgb_main.c
index 43664adf7a3c..c71bc0237108 100644
--- a/drivers/net/ethernet/intel/ixgb/ixgb_main.c
+++ b/drivers/net/ethernet/intel/ixgb/ixgb_main.c
@@ -771,14 +771,12 @@ ixgb_setup_rx_resources(struct ixgb_adapter *adapter)
 	rxdr->size = rxdr->count * sizeof(struct ixgb_rx_desc);
 	rxdr->size = ALIGN(rxdr->size, 4096);
 
-	rxdr->desc = dma_alloc_coherent(&pdev->dev, rxdr->size, &rxdr->dma,
-					GFP_KERNEL);
-
+	rxdr->desc = dma_zalloc_coherent(&pdev->dev, rxdr->size, &rxdr->dma,
+					 GFP_KERNEL);
 	if (!rxdr->desc) {
 		vfree(rxdr->buffer_info);
 		return -ENOMEM;
 	}
-	memset(rxdr->desc, 0, rxdr->size);
 
 	rxdr->next_to_clean = 0;
 	rxdr->next_to_use = 0;
-- 
2.17.1

^ permalink raw reply related

* Re: [PATCH 4.4 18/31] r8152: napi hangup fix after disconnect
From: Ben Hutchings @ 2018-08-24 16:38 UTC (permalink / raw)
  To: Jiri Slaby, linux-usb, netdev, David S. Miller
  Cc: stable, Greg Kroah-Hartman, LKML
In-Reply-To: <20180720121340.812996969@linuxfoundation.org>

On Fri, 2018-07-20 at 14:13 +0200, Greg Kroah-Hartman wrote:
> 4.4-stable review patch.  If anyone has any objections, please let me know.
> 
> ------------------
> 
> From: Jiri Slaby <jslaby@suse.cz>
> 
> [ Upstream commit 0ee1f4734967af8321ecebaf9c74221ace34f2d5 ]
[...]
> --- a/drivers/net/usb/r8152.c
> +++ b/drivers/net/usb/r8152.c
> @@ -3139,7 +3139,8 @@ static int rtl8152_close(struct net_devi
>  #ifdef CONFIG_PM_SLEEP
>  	unregister_pm_notifier(&tp->pm_notifier);
>  #endif
> -	napi_disable(&tp->napi);
> +	if (!test_bit(RTL8152_UNPLUG, &tp->flags))
> +		napi_disable(&tp->napi);
>  	clear_bit(WORK_ENABLE, &tp->flags);
>  	usb_kill_urb(tp->intr_urb);
>  	cancel_delayed_work_sync(&tp->schedule);

This flag appears to be set only if the USB device is actually
disconnected.  In case the driver is unbound for some other reason
(like the module is removed), the same problem will occur.

What I think might work is to do:

	if (!list_empty(&tp->napi.dev_list)
		napi_disable(&tp->napi);

Ben.

-- 
Ben Hutchings, Software Developer                         Codethink Ltd
https://www.codethink.co.uk/                 Dale House, 35 Dale Street
                                     Manchester, M1 2HF, United Kingdom

^ permalink raw reply

* [PATCH net] mlxsw: spectrum_switchdev: Do not leak RIFs when removing bridge
From: Ido Schimmel @ 2018-08-24 12:41 UTC (permalink / raw)
  To: netdev; +Cc: davem, jiri, petrm, alexpe, mlxsw, Ido Schimmel

When a bridge device is removed, the VLANs are flushed from each
configured port. This causes the ports to decrement the reference count
on the associated FIDs (filtering identifier). If the reference count of
a FID is 1 and it has a RIF (router interface), then this RIF is
destroyed.

However, if no port is member in the VLAN for which a RIF exists, then
the RIF will continue to exist after the removal of the bridge. To
reproduce:

# ip link add name br0 type bridge vlan_filtering 1
# ip link set dev swp1 master br0
# ip link add link br0 name br0.10 type vlan id 10
# ip address add 192.0.2.0/24 dev br0.10
# ip link del dev br0

The RIF associated with br0.10 continues to exist.

Fix this by iterating over all the bridge device uppers when it is
destroyed and take care of destroying their RIFs.

Fixes: 99f44bb3527b ("mlxsw: spectrum: Enable L3 interfaces on top of bridge devices")
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Reviewed-by: Petr Machata <petrm@mellanox.com>
---
 .../net/ethernet/mellanox/mlxsw/spectrum.h    |  2 ++
 .../ethernet/mellanox/mlxsw/spectrum_router.c | 11 ++++++++++
 .../mellanox/mlxsw/spectrum_switchdev.c       | 20 +++++++++++++++++++
 3 files changed, 33 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
index 3ae930196741..3cdb7aca90b7 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
@@ -414,6 +414,8 @@ mlxsw_sp_netdevice_ipip_ul_event(struct mlxsw_sp *mlxsw_sp,
 void
 mlxsw_sp_port_vlan_router_leave(struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan);
 void mlxsw_sp_rif_destroy(struct mlxsw_sp_rif *rif);
+void mlxsw_sp_rif_destroy_by_dev(struct mlxsw_sp *mlxsw_sp,
+				 struct net_device *dev);
 
 /* spectrum_kvdl.c */
 enum mlxsw_sp_kvdl_entry_type {
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
index 3a96307f51b0..2ab9cf25a08a 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
@@ -6234,6 +6234,17 @@ void mlxsw_sp_rif_destroy(struct mlxsw_sp_rif *rif)
 	mlxsw_sp_vr_put(mlxsw_sp, vr);
 }
 
+void mlxsw_sp_rif_destroy_by_dev(struct mlxsw_sp *mlxsw_sp,
+				 struct net_device *dev)
+{
+	struct mlxsw_sp_rif *rif;
+
+	rif = mlxsw_sp_rif_find_by_dev(mlxsw_sp, dev);
+	if (!rif)
+		return;
+	mlxsw_sp_rif_destroy(rif);
+}
+
 static void
 mlxsw_sp_rif_subport_params_init(struct mlxsw_sp_rif_params *params,
 				 struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan)
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
index 0d8444aaba01..db715da7bab7 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
@@ -127,6 +127,24 @@ bool mlxsw_sp_bridge_device_is_offloaded(const struct mlxsw_sp *mlxsw_sp,
 	return !!mlxsw_sp_bridge_device_find(mlxsw_sp->bridge, br_dev);
 }
 
+static int mlxsw_sp_bridge_device_upper_rif_destroy(struct net_device *dev,
+						    void *data)
+{
+	struct mlxsw_sp *mlxsw_sp = data;
+
+	mlxsw_sp_rif_destroy_by_dev(mlxsw_sp, dev);
+	return 0;
+}
+
+static void mlxsw_sp_bridge_device_rifs_destroy(struct mlxsw_sp *mlxsw_sp,
+						struct net_device *dev)
+{
+	mlxsw_sp_rif_destroy_by_dev(mlxsw_sp, dev);
+	netdev_walk_all_upper_dev_rcu(dev,
+				      mlxsw_sp_bridge_device_upper_rif_destroy,
+				      mlxsw_sp);
+}
+
 static struct mlxsw_sp_bridge_device *
 mlxsw_sp_bridge_device_create(struct mlxsw_sp_bridge *bridge,
 			      struct net_device *br_dev)
@@ -165,6 +183,8 @@ static void
 mlxsw_sp_bridge_device_destroy(struct mlxsw_sp_bridge *bridge,
 			       struct mlxsw_sp_bridge_device *bridge_device)
 {
+	mlxsw_sp_bridge_device_rifs_destroy(bridge->mlxsw_sp,
+					    bridge_device->dev);
 	list_del(&bridge_device->list);
 	if (bridge_device->vlan_enabled)
 		bridge->vlan_enabled_exists = false;
-- 
2.17.1

^ permalink raw reply related

* Re: [PATCH ghak90 (was ghak32) V4 03/10] audit: log container info of syscalls
From: Steve Grubb @ 2018-08-24 16:01 UTC (permalink / raw)
  To: linux-audit
  Cc: simo, Richard Guy Briggs, linux-api, containers, LKML, dhowells,
	carlos, netfilter-devel, ebiederm, luto, netdev, linux-fsdevel,
	eparis, serge, viro
In-Reply-To: <34017c395d03a213d6b0d49b9964429bd32b283d.1533065887.git.rgb@redhat.com>

On Tuesday, July 31, 2018 4:07:38 PM EDT Richard Guy Briggs wrote:
> Create a new audit record AUDIT_CONTAINER to document the audit
> container identifier of a process if it is present.
> 
> Called from audit_log_exit(), syscalls are covered.
> 
> A sample raw event:
> type=SYSCALL msg=audit(1519924845.499:257): arch=c000003e syscall=257
> success=yes exit=3 a0=ffffff9c a1=56374e1cef30 a2=241 a3=1b6 items=2
> ppid=606 pid=635 auid=0 uid=0 gid=0 euid=0 suid=0 fsuid=0 egid=0 sgid=0
> fsgid=0 tty=pts0 ses=3 comm="bash" exe="/usr/bin/bash"
> subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023
> key="tmpcontainerid" type=CWD msg=audit(1519924845.499:257): cwd="/root"
> type=PATH msg=audit(1519924845.499:257): item=0 name="/tmp/" inode=13863
> dev=00:27 mode=041777 ouid=0 ogid=0 rdev=00:00
> obj=system_u:object_r:tmp_t:s0 nametype= PARENT cap_fp=0000000000000000
> cap_fi=0000000000000000 cap_fe=0 cap_fver=0 type=PATH
> msg=audit(1519924845.499:257): item=1 name="/tmp/tmpcontainerid"
> inode=17729 dev=00:27 mode=0100644 ouid=0 ogid=0 rdev=00:00
> obj=unconfined_u:object_r:user_tmp_t:s0 nametype=CREATE
> cap_fp=0000000000000000 cap_fi=0000000000000000 cap_fe=0 cap_fver=0
> type=PROCTITLE msg=audit(1519924845.499:257):
> proctitle=62617368002D6300736C65657020313B206563686F2074657374203E202F746D
> 702F746D70636F6E7461696E65726964 type=CONTAINER
> msg=audit(1519924845.499:257): op=task contid=123458

Ack for the event format.

-Steve

> See: https://github.com/linux-audit/audit-kernel/issues/90
> See: https://github.com/linux-audit/audit-userspace/issues/51
> See: https://github.com/linux-audit/audit-testsuite/issues/64
> See:
> https://github.com/linux-audit/audit-kernel/wiki/RFE-Audit-Container-ID
> Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
> Acked-by: Serge Hallyn <serge@hallyn.com>
> Acked-by: Steve Grubb <sgrubb@redhat.com>
> ---
>  include/linux/audit.h      |  7 +++++++
>  include/uapi/linux/audit.h |  1 +
>  kernel/audit.c             | 24 ++++++++++++++++++++++++
>  kernel/auditsc.c           |  3 +++
>  4 files changed, 35 insertions(+)
> 
> diff --git a/include/linux/audit.h b/include/linux/audit.h
> index 71a6fc6..d5a48dc 100644
> --- a/include/linux/audit.h
> +++ b/include/linux/audit.h
> @@ -155,6 +155,9 @@ extern void		    audit_log_key(struct audit_buffer 
*ab,
> extern int audit_log_task_context(struct audit_buffer *ab);
>  extern void audit_log_task_info(struct audit_buffer *ab,
>  				struct task_struct *tsk);
> +extern int audit_log_contid(struct task_struct *tsk,
> +				    struct audit_context *context,
> +				    char *op);
> 
>  extern int		    audit_update_lsm_rules(void);
> 
> @@ -205,6 +208,10 @@ static inline int audit_log_task_context(struct
> audit_buffer *ab) static inline void audit_log_task_info(struct
> audit_buffer *ab,
>  				       struct task_struct *tsk)
>  { }
> +static inline int audit_log_contid(struct task_struct *tsk,
> +					   struct audit_context *context,
> +					   char *op)
> +{ }
>  #define audit_enabled AUDIT_OFF
>  #endif /* CONFIG_AUDIT */
> 
> diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h
> index 3474f57..dc259c7 100644
> --- a/include/uapi/linux/audit.h
> +++ b/include/uapi/linux/audit.h
> @@ -115,6 +115,7 @@
>  #define AUDIT_REPLACE		1329	/* Replace auditd if this packet unanswerd 
*/
>  #define AUDIT_KERN_MODULE	1330	/* Kernel Module events */
>  #define AUDIT_FANOTIFY		1331	/* Fanotify access decision */
> +#define AUDIT_CONTAINER		1332	/* Container ID */
> 
>  #define AUDIT_AVC		1400	/* SE Linux avc denial or grant */
>  #define AUDIT_SELINUX_ERR	1401	/* Internal SE Linux Errors */
> diff --git a/kernel/audit.c b/kernel/audit.c
> index 2a80587..15f54c7 100644
> --- a/kernel/audit.c
> +++ b/kernel/audit.c
> @@ -2045,6 +2045,30 @@ void audit_log_session_info(struct audit_buffer *ab)
> audit_log_format(ab, " auid=%u ses=%u", auid, sessionid);
>  }
> 
> +/*
> + * audit_log_contid - report container info
> + * @tsk: task to be recorded
> + * @context: task or local context for record
> + * @op: contid string description
> + */
> +int audit_log_contid(struct task_struct *tsk,
> +			     struct audit_context *context, char *op)
> +{
> +	struct audit_buffer *ab;
> +
> +	if (!audit_contid_set(tsk))
> +		return 0;
> +	/* Generate AUDIT_CONTAINER record with container ID */
> +	ab = audit_log_start(context, GFP_KERNEL, AUDIT_CONTAINER);
> +	if (!ab)
> +		return -ENOMEM;
> +	audit_log_format(ab, "op=%s contid=%llu",
> +			 op, audit_get_contid(tsk));
> +	audit_log_end(ab);
> +	return 0;
> +}
> +EXPORT_SYMBOL(audit_log_contid);
> +
>  void audit_log_key(struct audit_buffer *ab, char *key)
>  {
>  	audit_log_format(ab, " key=");
> diff --git a/kernel/auditsc.c b/kernel/auditsc.c
> index 6125cef..39e5633 100644
> --- a/kernel/auditsc.c
> +++ b/kernel/auditsc.c
> @@ -1488,10 +1488,13 @@ static void audit_log_exit(struct audit_context
> *context, struct task_struct *ts
> 
>  	audit_log_proctitle(tsk, context);
> 
> +	audit_log_contid(tsk, context, "task");
> +
>  	/* Send end of event record to help user space know we are finished */
>  	ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE);
>  	if (ab)
>  		audit_log_end(ab);
> +
>  	if (call_panic)
>  		audit_panic("error converting sid to string");
>  }

^ permalink raw reply

* Re: [PATCH ghak90 (was ghak32) V4 02/10] audit: add container id
From: Steve Grubb @ 2018-08-24 16:01 UTC (permalink / raw)
  To: linux-audit
  Cc: Richard Guy Briggs, containers, linux-api, linux-fsdevel, LKML,
	netdev, netfilter-devel, ebiederm, luto, carlos, dhowells, viro,
	simo, eparis, serge
In-Reply-To: <12396e378a78ee8dd38c75f7730d67d8fbb08e02.1533065887.git.rgb@redhat.com>

On Tuesday, July 31, 2018 4:07:37 PM EDT Richard Guy Briggs wrote:
> Implement the proc fs write to set the audit container identifier of a
> process, emitting an AUDIT_CONTAINER_OP record to document the event.
> 
> This is a write from the container orchestrator task to a proc entry of
> the form /proc/PID/audit_containerid where PID is the process ID of the
> newly created task that is to become the first task in a container, or
> an additional task added to a container.
> 
> The write expects up to a u64 value (unset: 18446744073709551615).
> 
> The writer must have capability CAP_AUDIT_CONTROL.
> 
> This will produce a record such as this:
>   type=CONTAINER_ID msg=audit(2018-06-06 12:39:29.636:26949) : op=set
> opid=2209 old-contid=18446744073709551615 contid=123456 pid=628 auid=root
> uid=root tty=ttyS0 ses=1
> subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 comm=bash
> exe=/usr/bin/bash res=yes
> 
> The "op" field indicates an initial set. 

The op field seems to be entirely unnecessary. There is no unset event (nor 
should there be) so op=set is implied by the record type. Otherwise the event 
format looks fine.

-Steve

> The "pid" to "ses" fields are
> the orchestrator while the "opid" field is the object's PID, the process
> being "contained".  Old and new audit container identifier values are
> given in the "contid" fields, while res indicates its success.
> 
> It is not permitted to unset the audit container identifier.
> A child inherits its parent's audit container identifier.
> 
> See: https://github.com/linux-audit/audit-kernel/issues/90
> See: https://github.com/linux-audit/audit-userspace/issues/51
> See: https://github.com/linux-audit/audit-testsuite/issues/64
> See:
> https://github.com/linux-audit/audit-kernel/wiki/RFE-Audit-Container-ID
> 
> Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
> Acked-by: Serge Hallyn <serge@hallyn.com>
> Acked-by: Steve Grubb <sgrubb@redhat.com>
> ---
>  fs/proc/base.c             | 37 +++++++++++++++++++++++++
>  include/linux/audit.h      | 24 ++++++++++++++++
>  include/uapi/linux/audit.h |  2 ++
>  kernel/auditsc.c           | 68
> ++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 131
> insertions(+)
> 
> diff --git a/fs/proc/base.c b/fs/proc/base.c
> index b657294..1b3cda1 100644
> --- a/fs/proc/base.c
> +++ b/fs/proc/base.c
> @@ -1260,6 +1260,41 @@ static ssize_t proc_sessionid_read(struct file *
> file, char __user * buf, .read		= proc_sessionid_read,
>  	.llseek		= generic_file_llseek,
>  };
> +
> +static ssize_t proc_contid_write(struct file *file, const char __user
> *buf, +				   size_t count, loff_t *ppos)
> +{
> +	struct inode *inode = file_inode(file);
> +	u64 contid;
> +	int rv;
> +	struct task_struct *task = get_proc_task(inode);
> +
> +	if (!task)
> +		return -ESRCH;
> +	if (*ppos != 0) {
> +		/* No partial writes. */
> +		put_task_struct(task);
> +		return -EINVAL;
> +	}
> +
> +	rv = kstrtou64_from_user(buf, count, 10, &contid);
> +	if (rv < 0) {
> +		put_task_struct(task);
> +		return rv;
> +	}
> +
> +	rv = audit_set_contid(task, contid);
> +	put_task_struct(task);
> +	if (rv < 0)
> +		return rv;
> +	return count;
> +}
> +
> +static const struct file_operations proc_contid_operations = {
> +	.write		= proc_contid_write,
> +	.llseek		= generic_file_llseek,
> +};
> +
>  #endif
> 
>  #ifdef CONFIG_FAULT_INJECTION
> @@ -2952,6 +2987,7 @@ static int proc_pid_patch_state(struct seq_file *m,
> struct pid_namespace *ns, #ifdef CONFIG_AUDITSYSCALL
>  	REG("loginuid",   S_IWUSR|S_IRUGO, proc_loginuid_operations),
>  	REG("sessionid",  S_IRUGO, proc_sessionid_operations),
> +	REG("audit_containerid", S_IWUSR, proc_contid_operations),
>  #endif
>  #ifdef CONFIG_FAULT_INJECTION
>  	REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
> @@ -3337,6 +3373,7 @@ static int proc_tid_comm_permission(struct inode
> *inode, int mask) #ifdef CONFIG_AUDITSYSCALL
>  	REG("loginuid",  S_IWUSR|S_IRUGO, proc_loginuid_operations),
>  	REG("sessionid",  S_IRUGO, proc_sessionid_operations),
> +	REG("audit_containerid", S_IWUSR, proc_contid_operations),
>  #endif
>  #ifdef CONFIG_FAULT_INJECTION
>  	REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
> diff --git a/include/linux/audit.h b/include/linux/audit.h
> index 8964332..71a6fc6 100644
> --- a/include/linux/audit.h
> +++ b/include/linux/audit.h
> @@ -222,6 +222,7 @@ static inline void audit_log_task_info(struct
> audit_buffer *ab, struct audit_task_info {
>  	kuid_t			loginuid;
>  	unsigned int		sessionid;
> +	u64			contid;
>  	struct audit_context	*ctx;
>  };
>  extern struct audit_task_info init_struct_audit;
> @@ -334,6 +335,7 @@ static inline void audit_ptrace(struct task_struct *t)
>  extern int auditsc_get_stamp(struct audit_context *ctx,
>  			      struct timespec64 *t, unsigned int *serial);
>  extern int audit_set_loginuid(kuid_t loginuid);
> +extern int audit_set_contid(struct task_struct *tsk, u64 contid);
> 
>  static inline kuid_t audit_get_loginuid(struct task_struct *tsk)
>  {
> @@ -351,6 +353,14 @@ static inline unsigned int audit_get_sessionid(struct
> task_struct *tsk) return AUDIT_SID_UNSET;
>  }
> 
> +static inline u64 audit_get_contid(struct task_struct *tsk)
> +{
> +	if (!tsk->audit)
> +		return AUDIT_CID_UNSET;
> +	else
> +		return tsk->audit->contid;
> +}
> +
>  extern void __audit_ipc_obj(struct kern_ipc_perm *ipcp);
>  extern void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t
> gid, umode_t mode); extern void __audit_bprm(struct linux_binprm *bprm);
> @@ -545,6 +555,10 @@ static inline unsigned int audit_get_sessionid(struct
> task_struct *tsk) {
>  	return AUDIT_SID_UNSET;
>  }
> +static inline kuid_t audit_get_contid(struct task_struct *tsk)
> +{
> +	return AUDIT_CID_UNSET;
> +}
>  static inline void audit_ipc_obj(struct kern_ipc_perm *ipcp)
>  { }
>  static inline void audit_ipc_set_perm(unsigned long qbytes, uid_t uid,
> @@ -609,6 +623,16 @@ static inline bool audit_loginuid_set(struct
> task_struct *tsk) return uid_valid(audit_get_loginuid(tsk));
>  }
> 
> +static inline bool audit_contid_valid(u64 contid)
> +{
> +	return contid != AUDIT_CID_UNSET;
> +}
> +
> +static inline bool audit_contid_set(struct task_struct *tsk)
> +{
> +	return audit_contid_valid(audit_get_contid(tsk));
> +}
> +
>  static inline void audit_log_string(struct audit_buffer *ab, const char
> *buf) {
>  	audit_log_n_string(ab, buf, strlen(buf));
> diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h
> index 4e3eaba..3474f57 100644
> --- a/include/uapi/linux/audit.h
> +++ b/include/uapi/linux/audit.h
> @@ -71,6 +71,7 @@
>  #define AUDIT_TTY_SET		1017	/* Set TTY auditing status */
>  #define AUDIT_SET_FEATURE	1018	/* Turn an audit feature on or off */
>  #define AUDIT_GET_FEATURE	1019	/* Get which features are enabled */
> +#define AUDIT_CONTAINER_OP	1020	/* Define the container id and information
> */
> 
>  #define AUDIT_FIRST_USER_MSG	1100	/* Userspace messages mostly
> uninteresting to kernel */ #define AUDIT_USER_AVC		1107	/* We filter this
> differently */
> @@ -468,6 +469,7 @@ struct audit_tty_status {
> 
>  #define AUDIT_UID_UNSET (unsigned int)-1
>  #define AUDIT_SID_UNSET ((unsigned int)-1)
> +#define AUDIT_CID_UNSET ((u64)-1)
> 
>  /* audit_rule_data supports filter rules with both integer and string
>   * fields.  It corresponds with AUDIT_ADD_RULE, AUDIT_DEL_RULE and
> diff --git a/kernel/auditsc.c b/kernel/auditsc.c
> index 88779a7..6125cef 100644
> --- a/kernel/auditsc.c
> +++ b/kernel/auditsc.c
> @@ -956,6 +956,7 @@ int audit_alloc(struct task_struct *tsk)
>  		return -ENOMEM;
>  	info->loginuid = audit_get_loginuid(current);
>  	info->sessionid = audit_get_sessionid(current);
> +	info->contid = audit_get_contid(current);
>  	tsk->audit = info;
> 
>  	if (likely(!audit_ever_enabled))
> @@ -985,6 +986,7 @@ int audit_alloc(struct task_struct *tsk)
>  struct audit_task_info init_struct_audit = {
>  	.loginuid = INVALID_UID,
>  	.sessionid = AUDIT_SID_UNSET,
> +	.contid = AUDIT_CID_UNSET,
>  	.ctx = NULL,
>  };
> 
> @@ -2112,6 +2114,72 @@ int audit_set_loginuid(kuid_t loginuid)
>  }
> 
>  /**
> + * audit_set_contid - set current task's audit_context contid
> + * @contid: contid value
> + *
> + * Returns 0 on success, -EPERM on permission failure.
> + *
> + * Called (set) from fs/proc/base.c::proc_contid_write().
> + */
> +int audit_set_contid(struct task_struct *task, u64 contid)
> +{
> +	u64 oldcontid;
> +	int rc = 0;
> +	struct audit_buffer *ab;
> +	uid_t uid;
> +	struct tty_struct *tty;
> +	char comm[sizeof(current->comm)];
> +
> +	task_lock(task);
> +	/* Can't set if audit disabled */
> +	if (!task->audit) {
> +		task_unlock(task);
> +		return -ENOPROTOOPT;
> +	}
> +	oldcontid = audit_get_contid(task);
> +	read_lock(&tasklist_lock);
> +	/* Don't allow the audit containerid to be unset */
> +	if (!audit_contid_valid(contid))
> +		rc = -EINVAL;
> +	/* if we don't have caps, reject */
> +	else if (!capable(CAP_AUDIT_CONTROL))
> +		rc = -EPERM;
> +	/* if task has children or is not single-threaded, deny */
> +	else if (!list_empty(&task->children))
> +		rc = -EBUSY;
> +	else if (!(thread_group_leader(task) && thread_group_empty(task)))
> +		rc = -EALREADY;
> +	read_unlock(&tasklist_lock);
> +	if (!rc)
> +		task->audit->contid = contid;
> +	task_unlock(task);
> +
> +	if (!audit_enabled)
> +		return rc;
> +
> +	ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_CONTAINER_OP);
> +	if (!ab)
> +		return rc;
> +
> +	uid = from_kuid(&init_user_ns, task_uid(current));
> +	tty = audit_get_tty(current);
> +	audit_log_format(ab, "op=set opid=%d old-contid=%llu contid=%llu pid=%d
> uid=%u auid=%u tty=%s ses=%u", +			 task_tgid_nr(task), oldcontid, 
contid,
> +			 task_tgid_nr(current), uid,
> +			 from_kuid(&init_user_ns, audit_get_loginuid(current)),
> +			 tty ? tty_name(tty) : "(none)",
> +			 audit_get_sessionid(current));
> +	audit_put_tty(tty);
> +	audit_log_task_context(ab);
> +	audit_log_format(ab, " comm=");
> +	audit_log_untrustedstring(ab, get_task_comm(comm, current));
> +	audit_log_d_path_exe(ab, current->mm);
> +	audit_log_format(ab, " res=%d", !rc);
> +	audit_log_end(ab);
> +	return rc;
> +}
> +
> +/**
>   * __audit_mq_open - record audit data for a POSIX MQ open
>   * @oflag: open flag
>   * @mode: mode bits

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox