linux-arm-kernel.lists.infradead.org archive mirror
* [PATCH resend 10/15] arm64: pull in <asm/simd.h> from asm-generic
@ 2014-05-01 15:51 Ard Biesheuvel
  2014-05-01 15:51 ` [PATCH resend 11/15] arm64/crypto: AES-ECB/CBC/CTR/XTS using ARMv8 NEON and Crypto Extensions Ard Biesheuvel
                   ` (4 more replies)
  0 siblings, 5 replies; 8+ messages in thread
From: Ard Biesheuvel @ 2014-05-01 15:51 UTC (permalink / raw)
  To: linux-arm-kernel

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
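[ Note: the asm-generic <asm/simd.h> pulled in here essentially consists of a
  single helper, used by generic code such as crypto/ablk_helper.c to decide
  whether the SIMD/NEON register file may be used in the current context.
  Reproduced from memory below as an illustrative sketch, not as the
  authoritative header contents:

	#include <linux/hardirq.h>

	/* may SIMD instructions/registers be used in the current context? */
	static __must_check inline bool may_use_simd(void)
	{
		return !in_interrupt();
	}
]
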
 arch/arm64/include/asm/Kbuild | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/arm64/include/asm/Kbuild b/arch/arm64/include/asm/Kbuild
index 83f71b3004a8..42c7eecd2bb6 100644
--- a/arch/arm64/include/asm/Kbuild
+++ b/arch/arm64/include/asm/Kbuild
@@ -40,6 +40,7 @@ generic-y += segment.h
 generic-y += sembuf.h
 generic-y += serial.h
 generic-y += shmbuf.h
+generic-y += simd.h
 generic-y += sizes.h
 generic-y += socket.h
 generic-y += sockios.h
-- 
1.8.3.2

* [PATCH resend 11/15] arm64/crypto: AES-ECB/CBC/CTR/XTS using ARMv8 NEON and Crypto Extensions
  2014-05-01 15:51 [PATCH resend 10/15] arm64: pull in <asm/simd.h> from asm-generic Ard Biesheuvel
@ 2014-05-01 15:51 ` Ard Biesheuvel
  2014-05-01 15:51 ` [PATCH resend 12/15] arm64/crypto: add shared macro to test for NEED_RESCHED Ard Biesheuvel
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 8+ messages in thread
From: Ard Biesheuvel @ 2014-05-01 15:51 UTC (permalink / raw)
  To: linux-arm-kernel

This adds ARMv8 implementations of AES in ECB, CBC, CTR and XTS modes,
both for ARMv8 with Crypto Extensions and for plain ARMv8 NEON.

The Crypto Extensions version can only run on ARMv8 implementations that
have support for these optional extensions.

The plain NEON version is a table-based yet time-invariant implementation.
All S-box substitutions are performed in parallel, leveraging the wide-range
table lookups offered by ARMv8's tbl/tbx instructions and the large NEON
register file, which comfortably holds the entire S-box and still leaves
room to spare for the actual computations.
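
For illustration, the per-byte effect of that lookup sequence can be modelled
in C roughly as below (a minimal sketch, not part of the patch; the actual
sub_bytes code does this for 16 bytes at once, with the S-box held entirely
in registers v16-v31 so that no data-dependent memory accesses occur):

	#include <stdint.h>

	static uint8_t sbox_lookup(const uint8_t sbox[256], uint8_t x)
	{
		uint8_t out = 0;
		int q;

		/* four tbl/tbx passes, each covering 64 S-box bytes */
		for (q = 0; q < 4; q++) {
			uint8_t idx = x - q * 0x40;	/* the 'sub ..., #0x40' steps */

			if (idx < 0x40)			/* index in range: substitute */
				out = sbox[q * 64 + idx];
			/* out of range: tbx leaves the destination byte alone */
		}
		return out;
	}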

The key expansion routines were borrowed from aes_generic.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
 arch/arm64/crypto/Kconfig     |  14 ++
 arch/arm64/crypto/Makefile    |  14 ++
 arch/arm64/crypto/aes-ce.S    | 147 +++++++++++
 arch/arm64/crypto/aes-glue.c  | 446 ++++++++++++++++++++++++++++++++++
 arch/arm64/crypto/aes-modes.S | 548 ++++++++++++++++++++++++++++++++++++++++++
 arch/arm64/crypto/aes-neon.S  | 382 +++++++++++++++++++++++++++++
 6 files changed, 1551 insertions(+)
 create mode 100644 arch/arm64/crypto/aes-ce.S
 create mode 100644 arch/arm64/crypto/aes-glue.c
 create mode 100644 arch/arm64/crypto/aes-modes.S
 create mode 100644 arch/arm64/crypto/aes-neon.S

diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
index 8fffd5af65ef..5562652c5316 100644
--- a/arch/arm64/crypto/Kconfig
+++ b/arch/arm64/crypto/Kconfig
@@ -36,4 +36,18 @@ config CRYPTO_AES_ARM64_CE_CCM
 	select CRYPTO_AES
 	select CRYPTO_AEAD
 
+config CRYPTO_AES_ARM64_CE_BLK
+	tristate "AES in ECB/CBC/CTR/XTS modes using ARMv8 Crypto Extensions"
+	depends on ARM64 && KERNEL_MODE_NEON
+	select CRYPTO_BLKCIPHER
+	select CRYPTO_AES
+	select CRYPTO_ABLK_HELPER
+
+config CRYPTO_AES_ARM64_NEON_BLK
+	tristate "AES in ECB/CBC/CTR/XTS modes using NEON instructions"
+	depends on ARM64 && KERNEL_MODE_NEON
+	select CRYPTO_BLKCIPHER
+	select CRYPTO_AES
+	select CRYPTO_ABLK_HELPER
+
 endif
diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile
index 311287d68078..070ff7f6fc66 100644
--- a/arch/arm64/crypto/Makefile
+++ b/arch/arm64/crypto/Makefile
@@ -22,3 +22,17 @@ CFLAGS_aes-ce-cipher.o += -march=armv8-a+crypto
 
 obj-$(CONFIG_CRYPTO_AES_ARM64_CE_CCM) += aes-ce-ccm.o
 aes-ce-ccm-y := aes-ce-ccm-glue.o aes-ce-ccm-core.o
+
+obj-$(CONFIG_CRYPTO_AES_ARM64_CE_BLK) += aes-ce-blk.o
+aes-ce-blk-y := aes-glue-ce.o aes-ce.o
+
+obj-$(CONFIG_CRYPTO_AES_ARM64_NEON_BLK) += aes-neon-blk.o
+aes-neon-blk-y := aes-glue-neon.o aes-neon.o
+
+AFLAGS_aes-ce.o		:= -DINTERLEAVE=8  # -DINTERLEAVE_INLINE
+AFLAGS_aes-neon.o	:= -DINTERLEAVE=4
+
+CFLAGS_aes-glue-ce.o	:= -DUSE_V8_CRYPTO_EXTENSIONS
+
+$(obj)/aes-glue-%.o: $(src)/aes-glue.c FORCE
+	$(call if_changed_dep,cc_o_c)
diff --git a/arch/arm64/crypto/aes-ce.S b/arch/arm64/crypto/aes-ce.S
new file mode 100644
index 000000000000..08a8416561e2
--- /dev/null
+++ b/arch/arm64/crypto/aes-ce.S
@@ -0,0 +1,147 @@
+/*
+ * linux/arch/arm64/crypto/aes-ce.S - AES cipher for ARMv8 with
+ *                                    Crypto Extensions
+ *
+ * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+
+#define AES_ENTRY(func)		ENTRY(ce_ ## func)
+#define AES_ENDPROC(func)	ENDPROC(ce_ ## func)
+
+	.arch		armv8-a+crypto
+
+	/* preload all round keys */
+	.macro		load_round_keys, rounds, rk
+	cmp		\rounds, #12
+	blo		2222f		/* 128 bits */
+	beq		1111f		/* 192 bits */
+	ld1		{v17.16b-v18.16b}, [\rk], #32
+1111:	ld1		{v19.16b-v20.16b}, [\rk], #32
+2222:	ld1		{v21.16b-v24.16b}, [\rk], #64
+	ld1		{v25.16b-v28.16b}, [\rk], #64
+	ld1		{v29.16b-v31.16b}, [\rk]
+	.endm
+
+	/* prepare for encryption with key in rk[] */
+	.macro		enc_prepare, rounds, rk, ignore
+	load_round_keys	\rounds, \rk
+	.endm
+
+	/* prepare for encryption (again) but with new key in rk[] */
+	.macro		enc_switch_key, rounds, rk, ignore
+	load_round_keys	\rounds, \rk
+	.endm
+
+	/* prepare for decryption with key in rk[] */
+	.macro		dec_prepare, rounds, rk, ignore
+	load_round_keys	\rounds, \rk
+	.endm
+
+	.macro		do_enc_Nx, de, mc, k, i0, i1, i2, i3, i4, i5, i6, i7
+	aes\de		\i0\().16b, \k\().16b
+	aes\mc		\i0\().16b, \i0\().16b
+	.ifnb		\i1
+	aes\de		\i1\().16b, \k\().16b
+	aes\mc		\i1\().16b, \i1\().16b
+	.ifnb		\i3
+	aes\de		\i2\().16b, \k\().16b
+	aes\mc		\i2\().16b, \i2\().16b
+	aes\de		\i3\().16b, \k\().16b
+	aes\mc		\i3\().16b, \i3\().16b
+	.ifnb		\i7
+	.irp		reg, \i4\().16b, \i5\().16b, \i6\().16b, \i7\().16b
+	aes\de		\reg, \k\().16b
+	aes\mc		\reg, \reg
+	.endr
+	.endif
+	.endif
+	.endif
+	.endm
+
+	/* up to 8 interleaved encryption rounds with the same round key */
+	.macro		round_Nx, enc, k, i0, i1, i2, i3, i4, i5, i6, i7
+	.ifc		\enc, e
+	do_enc_Nx	e, mc, \k, \i0, \i1, \i2, \i3, \i4, \i5, \i6, \i7
+	.else
+	do_enc_Nx	d, imc, \k, \i0, \i1, \i2, \i3, \i4, \i5, \i6, \i7
+	.endif
+	.endm
+
+	/* up to 8 interleaved final rounds */
+	.macro		fin_round_Nx, de, k, k2, i0, i1, i2, i3, i4, i5, i6, i7
+	aes\de		\i0\().16b, \k\().16b
+	eor		\i0\().16b, \i0\().16b, \k2\().16b
+	.ifnb		\i1
+	aes\de		\i1\().16b, \k\().16b
+	eor		\i1\().16b, \i1\().16b, \k2\().16b
+	.ifnb		\i3
+	aes\de		\i2\().16b, \k\().16b
+	eor		\i2\().16b, \i2\().16b, \k2\().16b
+	aes\de		\i3\().16b, \k\().16b
+	eor		\i3\().16b, \i3\().16b, \k2\().16b
+	.ifnb		\i7
+	.irp		reg, \i4\().16b,\i5\().16b,\i6\().16b,\i7\().16b
+	aes\de		\reg, \k\().16b
+	eor		\reg, \reg, \k2\().16b
+	.endr
+	.endif
+	.endif
+	.endif
+	.endm
+
+	/* up to 8 interleaved blocks */
+	.macro		do_block_Nx, enc, rounds, i0, i1, i2, i3, i4, i5, i6, i7
+	cmp		\rounds, #12
+	blo		2222f		/* 128 bits */
+	beq		1111f		/* 192 bits */
+	round_Nx	\enc, v17, \i0, \i1, \i2, \i3, \i4, \i5, \i6, \i7
+	round_Nx	\enc, v18, \i0, \i1, \i2, \i3, \i4, \i5, \i6, \i7
+1111:	round_Nx	\enc, v19, \i0, \i1, \i2, \i3, \i4, \i5, \i6, \i7
+	round_Nx	\enc, v20, \i0, \i1, \i2, \i3, \i4, \i5, \i6, \i7
+2222:	.irp		key, v21, v22, v23, v24, v25, v26, v27, v28, v29
+	round_Nx	\enc, \key, \i0, \i1, \i2, \i3, \i4, \i5, \i6, \i7
+	.endr
+	fin_round_Nx	\enc, v30, v31, \i0, \i1, \i2, \i3, \i4, \i5, \i6, \i7
+	.endm
+
+	.macro		encrypt_block, in, rounds, t0, t1, t2
+	do_block_Nx	e, \rounds, \in
+	.endm
+
+	.macro		encrypt_block2x, i0, i1, rounds, t0, t1, t2
+	do_block_Nx	e, \rounds, \i0, \i1
+	.endm
+
+	.macro		encrypt_block4x, i0, i1, i2, i3, rounds, t0, t1, t2
+	do_block_Nx	e, \rounds, \i0, \i1, \i2, \i3
+	.endm
+
+	.macro		encrypt_block8x, i0, i1, i2, i3, i4, i5, i6, i7, \
+							rounds, t0, t1, t2
+	do_block_Nx	e, \rounds, \i0, \i1, \i2, \i3, \i4, \i5, \i6, \i7
+	.endm
+
+	.macro		decrypt_block, in, rounds, t0, t1, t2
+	do_block_Nx	d, \rounds, \in
+	.endm
+
+	.macro		decrypt_block2x, i0, i1, rounds, t0, t1, t2
+	do_block_Nx	d, \rounds, \i0, \i1
+	.endm
+
+	.macro		decrypt_block4x, i0, i1, i2, i3, rounds, t0, t1, t2
+	do_block_Nx	d, \rounds, \i0, \i1, \i2, \i3
+	.endm
+
+	.macro		decrypt_block8x, i0, i1, i2, i3, i4, i5, i6, i7, \
+							rounds, t0, t1, t2
+	do_block_Nx	d, \rounds, \i0, \i1, \i2, \i3, \i4, \i5, \i6, \i7
+	.endm
+
+#include "aes-modes.S"
diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c
new file mode 100644
index 000000000000..60f2f4c12256
--- /dev/null
+++ b/arch/arm64/crypto/aes-glue.c
@@ -0,0 +1,446 @@
+/*
+ * linux/arch/arm64/crypto/aes-glue.c - wrapper code for ARMv8 AES
+ *
+ * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <asm/neon.h>
+#include <asm/hwcap.h>
+#include <crypto/aes.h>
+#include <crypto/ablk_helper.h>
+#include <crypto/algapi.h>
+#include <linux/module.h>
+#include <linux/cpufeature.h>
+
+#ifdef USE_V8_CRYPTO_EXTENSIONS
+#define MODE			"ce"
+#define PRIO			300
+#define aes_ecb_encrypt		ce_aes_ecb_encrypt
+#define aes_ecb_decrypt		ce_aes_ecb_decrypt
+#define aes_cbc_encrypt		ce_aes_cbc_encrypt
+#define aes_cbc_decrypt		ce_aes_cbc_decrypt
+#define aes_ctr_encrypt		ce_aes_ctr_encrypt
+#define aes_xts_encrypt		ce_aes_xts_encrypt
+#define aes_xts_decrypt		ce_aes_xts_decrypt
+MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 Crypto Extensions");
+#else
+#define MODE			"neon"
+#define PRIO			200
+#define aes_ecb_encrypt		neon_aes_ecb_encrypt
+#define aes_ecb_decrypt		neon_aes_ecb_decrypt
+#define aes_cbc_encrypt		neon_aes_cbc_encrypt
+#define aes_cbc_decrypt		neon_aes_cbc_decrypt
+#define aes_ctr_encrypt		neon_aes_ctr_encrypt
+#define aes_xts_encrypt		neon_aes_xts_encrypt
+#define aes_xts_decrypt		neon_aes_xts_decrypt
+MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 NEON");
+MODULE_ALIAS("ecb(aes)");
+MODULE_ALIAS("cbc(aes)");
+MODULE_ALIAS("ctr(aes)");
+MODULE_ALIAS("xts(aes)");
+#endif
+
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
+
+/* defined in aes-modes.S */
+asmlinkage void aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[],
+				int rounds, int blocks, int first);
+asmlinkage void aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[],
+				int rounds, int blocks, int first);
+
+asmlinkage void aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[],
+				int rounds, int blocks, u8 iv[], int first);
+asmlinkage void aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[],
+				int rounds, int blocks, u8 iv[], int first);
+
+asmlinkage void aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
+				int rounds, int blocks, u8 ctr[], int first);
+
+asmlinkage void aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[],
+				int rounds, int blocks, u8 const rk2[], u8 iv[],
+				int first);
+asmlinkage void aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[],
+				int rounds, int blocks, u8 const rk2[], u8 iv[],
+				int first);
+
+struct crypto_aes_xts_ctx {
+	struct crypto_aes_ctx key1;
+	struct crypto_aes_ctx __aligned(8) key2;
+};
+
+static int xts_set_key(struct crypto_tfm *tfm, const u8 *in_key,
+		       unsigned int key_len)
+{
+	struct crypto_aes_xts_ctx *ctx = crypto_tfm_ctx(tfm);
+	int ret;
+
+	ret = crypto_aes_expand_key(&ctx->key1, in_key, key_len / 2);
+	if (!ret)
+		ret = crypto_aes_expand_key(&ctx->key2, &in_key[key_len / 2],
+					    key_len / 2);
+	if (!ret)
+		return 0;
+
+	tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+	return -EINVAL;
+}
+
+static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	int err, first, rounds = 6 + ctx->key_length / 4;
+	struct blkcipher_walk walk;
+	unsigned int blocks;
+
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt(desc, &walk);
+
+	kernel_neon_begin();
+	for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
+		aes_ecb_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
+				(u8 *)ctx->key_enc, rounds, blocks, first);
+		err = blkcipher_walk_done(desc, &walk, 0);
+	}
+	kernel_neon_end();
+	return err;
+}
+
+static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	int err, first, rounds = 6 + ctx->key_length / 4;
+	struct blkcipher_walk walk;
+	unsigned int blocks;
+
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt(desc, &walk);
+
+	kernel_neon_begin();
+	for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
+		aes_ecb_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
+				(u8 *)ctx->key_dec, rounds, blocks, first);
+		err = blkcipher_walk_done(desc, &walk, 0);
+	}
+	kernel_neon_end();
+	return err;
+}
+
+static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	int err, first, rounds = 6 + ctx->key_length / 4;
+	struct blkcipher_walk walk;
+	unsigned int blocks;
+
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt(desc, &walk);
+
+	kernel_neon_begin();
+	for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
+		aes_cbc_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
+				(u8 *)ctx->key_enc, rounds, blocks, walk.iv,
+				first);
+		err = blkcipher_walk_done(desc, &walk, 0);
+	}
+	kernel_neon_end();
+	return err;
+}
+
+static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	int err, first, rounds = 6 + ctx->key_length / 4;
+	struct blkcipher_walk walk;
+	unsigned int blocks;
+
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt(desc, &walk);
+
+	kernel_neon_begin();
+	for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
+		aes_cbc_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
+				(u8 *)ctx->key_dec, rounds, blocks, walk.iv,
+				first);
+		err = blkcipher_walk_done(desc, &walk, 0);
+	}
+	kernel_neon_end();
+	return err;
+}
+
+static int ctr_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	int err, first, rounds = 6 + ctx->key_length / 4;
+	struct blkcipher_walk walk;
+	int blocks;
+
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt_block(desc, &walk, AES_BLOCK_SIZE);
+
+	first = 1;
+	kernel_neon_begin();
+	while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) {
+		aes_ctr_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
+				(u8 *)ctx->key_enc, rounds, blocks, walk.iv,
+				first);
+		first = 0;
+		nbytes -= blocks * AES_BLOCK_SIZE;
+		if (nbytes && nbytes == walk.nbytes % AES_BLOCK_SIZE)
+			break;
+		err = blkcipher_walk_done(desc, &walk,
+					  walk.nbytes % AES_BLOCK_SIZE);
+	}
+	if (nbytes) {
+		u8 *tdst = walk.dst.virt.addr + blocks * AES_BLOCK_SIZE;
+		u8 *tsrc = walk.src.virt.addr + blocks * AES_BLOCK_SIZE;
+		u8 __aligned(8) tail[AES_BLOCK_SIZE];
+
+		/*
+		 * Minimum alignment is 8 bytes, so if nbytes is <= 8, we need
+		 * to tell aes_ctr_encrypt() to only read half a block.
+		 */
+		blocks = (nbytes <= 8) ? -1 : 1;
+
+		aes_ctr_encrypt(tail, tsrc, (u8 *)ctx->key_enc, rounds,
+				blocks, walk.iv, first);
+		memcpy(tdst, tail, nbytes);
+		err = blkcipher_walk_done(desc, &walk, 0);
+	}
+	kernel_neon_end();
+
+	return err;
+}
+
+static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct crypto_aes_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	int err, first, rounds = 6 + ctx->key1.key_length / 4;
+	struct blkcipher_walk walk;
+	unsigned int blocks;
+
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt(desc, &walk);
+
+	kernel_neon_begin();
+	for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
+		aes_xts_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
+				(u8 *)ctx->key1.key_enc, rounds, blocks,
+				(u8 *)ctx->key2.key_enc, walk.iv, first);
+		err = blkcipher_walk_done(desc, &walk, 0);
+	}
+	kernel_neon_end();
+
+	return err;
+}
+
+static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct crypto_aes_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	int err, first, rounds = 6 + ctx->key1.key_length / 4;
+	struct blkcipher_walk walk;
+	unsigned int blocks;
+
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt(desc, &walk);
+
+	kernel_neon_begin();
+	for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
+		aes_xts_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
+				(u8 *)ctx->key1.key_dec, rounds, blocks,
+				(u8 *)ctx->key2.key_enc, walk.iv, first);
+		err = blkcipher_walk_done(desc, &walk, 0);
+	}
+	kernel_neon_end();
+
+	return err;
+}
+
+static struct crypto_alg aes_algs[] = { {
+	.cra_name		= "__ecb-aes-" MODE,
+	.cra_driver_name	= "__driver-ecb-aes-" MODE,
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= AES_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct crypto_aes_ctx),
+	.cra_alignmask		= 7,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_blkcipher = {
+		.min_keysize	= AES_MIN_KEY_SIZE,
+		.max_keysize	= AES_MAX_KEY_SIZE,
+		.ivsize		= AES_BLOCK_SIZE,
+		.setkey		= crypto_aes_set_key,
+		.encrypt	= ecb_encrypt,
+		.decrypt	= ecb_decrypt,
+	},
+}, {
+	.cra_name		= "__cbc-aes-" MODE,
+	.cra_driver_name	= "__driver-cbc-aes-" MODE,
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= AES_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct crypto_aes_ctx),
+	.cra_alignmask		= 7,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_blkcipher = {
+		.min_keysize	= AES_MIN_KEY_SIZE,
+		.max_keysize	= AES_MAX_KEY_SIZE,
+		.ivsize		= AES_BLOCK_SIZE,
+		.setkey		= crypto_aes_set_key,
+		.encrypt	= cbc_encrypt,
+		.decrypt	= cbc_decrypt,
+	},
+}, {
+	.cra_name		= "__ctr-aes-" MODE,
+	.cra_driver_name	= "__driver-ctr-aes-" MODE,
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= 1,
+	.cra_ctxsize		= sizeof(struct crypto_aes_ctx),
+	.cra_alignmask		= 7,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_blkcipher = {
+		.min_keysize	= AES_MIN_KEY_SIZE,
+		.max_keysize	= AES_MAX_KEY_SIZE,
+		.ivsize		= AES_BLOCK_SIZE,
+		.setkey		= crypto_aes_set_key,
+		.encrypt	= ctr_encrypt,
+		.decrypt	= ctr_encrypt,
+	},
+}, {
+	.cra_name		= "__xts-aes-" MODE,
+	.cra_driver_name	= "__driver-xts-aes-" MODE,
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= AES_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct crypto_aes_xts_ctx),
+	.cra_alignmask		= 7,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_blkcipher = {
+		.min_keysize	= 2 * AES_MIN_KEY_SIZE,
+		.max_keysize	= 2 * AES_MAX_KEY_SIZE,
+		.ivsize		= AES_BLOCK_SIZE,
+		.setkey		= xts_set_key,
+		.encrypt	= xts_encrypt,
+		.decrypt	= xts_decrypt,
+	},
+}, {
+	.cra_name		= "ecb(aes)",
+	.cra_driver_name	= "ecb-aes-" MODE,
+	.cra_priority		= PRIO,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= AES_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 7,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_ablkcipher = {
+		.min_keysize	= AES_MIN_KEY_SIZE,
+		.max_keysize	= AES_MAX_KEY_SIZE,
+		.ivsize		= AES_BLOCK_SIZE,
+		.setkey		= ablk_set_key,
+		.encrypt	= ablk_encrypt,
+		.decrypt	= ablk_decrypt,
+	}
+}, {
+	.cra_name		= "cbc(aes)",
+	.cra_driver_name	= "cbc-aes-" MODE,
+	.cra_priority		= PRIO,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= AES_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 7,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_ablkcipher = {
+		.min_keysize	= AES_MIN_KEY_SIZE,
+		.max_keysize	= AES_MAX_KEY_SIZE,
+		.ivsize		= AES_BLOCK_SIZE,
+		.setkey		= ablk_set_key,
+		.encrypt	= ablk_encrypt,
+		.decrypt	= ablk_decrypt,
+	}
+}, {
+	.cra_name		= "ctr(aes)",
+	.cra_driver_name	= "ctr-aes-" MODE,
+	.cra_priority		= PRIO,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= 1,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 7,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_ablkcipher = {
+		.min_keysize	= AES_MIN_KEY_SIZE,
+		.max_keysize	= AES_MAX_KEY_SIZE,
+		.ivsize		= AES_BLOCK_SIZE,
+		.setkey		= ablk_set_key,
+		.encrypt	= ablk_encrypt,
+		.decrypt	= ablk_decrypt,
+	}
+}, {
+	.cra_name		= "xts(aes)",
+	.cra_driver_name	= "xts-aes-" MODE,
+	.cra_priority		= PRIO,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= AES_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 7,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_ablkcipher = {
+		.min_keysize	= 2 * AES_MIN_KEY_SIZE,
+		.max_keysize	= 2 * AES_MAX_KEY_SIZE,
+		.ivsize		= AES_BLOCK_SIZE,
+		.setkey		= ablk_set_key,
+		.encrypt	= ablk_encrypt,
+		.decrypt	= ablk_decrypt,
+	}
+} };
+
+static int __init aes_init(void)
+{
+	return crypto_register_algs(aes_algs, ARRAY_SIZE(aes_algs));
+}
+
+static void __exit aes_exit(void)
+{
+	crypto_unregister_algs(aes_algs, ARRAY_SIZE(aes_algs));
+}
+
+#ifdef USE_V8_CRYPTO_EXTENSIONS
+module_cpu_feature_match(AES, aes_init);
+#else
+module_init(aes_init);
+#endif
+module_exit(aes_exit);
diff --git a/arch/arm64/crypto/aes-modes.S b/arch/arm64/crypto/aes-modes.S
new file mode 100644
index 000000000000..c990e1e379a2
--- /dev/null
+++ b/arch/arm64/crypto/aes-modes.S
@@ -0,0 +1,548 @@
+/*
+ * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES
+ *
+ * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* included by aes-ce.S and aes-neon.S */
+
+	.text
+	.align		4
+
+/*
+ * There are several ways to instantiate this code:
+ * - no interleave, all inline
+ * - 4-way interleave, 4x calls out of line (-DINTERLEAVE=4)
+ * - 4-way interleave, all inline (-DINTERLEAVE=4 -DINTERLEAVE_INLINE)
+ * - 8-way interleave, 8x calls out of line (-DINTERLEAVE=8)
+ * - 8-way interleave, all inline (-DINTERLEAVE=8 -DINTERLEAVE_INLINE)
+ *
+ * Macros imported by this code:
+ * - enc_prepare	- setup NEON registers for encryption
+ * - dec_prepare	- setup NEON registers for decryption
+ * - enc_switch_key	- change to new key after having prepared for encryption
+ * - encrypt_block	- encrypt a single block
+ * - decrypt_block	- decrypt a single block
+ * - encrypt_block4x	- encrypt 4 blocks in parallel (if INTERLEAVE == 4)
+ * - decrypt_block4x	- decrypt 4 blocks in parallel (if INTERLEAVE == 4)
+ * - encrypt_block8x	- encrypt 8 blocks in parallel (if INTERLEAVE == 8)
+ * - decrypt_block8x	- decrypt 8 blocks in parallel (if INTERLEAVE == 8)
+ */
+
+#if defined(INTERLEAVE) && !defined(INTERLEAVE_INLINE)
+#define FRAME_PUSH	stp x29, x30, [sp,#-16]! ; mov x29, sp
+#define FRAME_POP	ldp x29, x30, [sp],#16
+
+#if INTERLEAVE == 4
+
+aes_encrypt_block4x:
+	encrypt_block4x	v0, v1, v2, v3, w3, x2, x6, w7
+	ret
+ENDPROC(aes_encrypt_block4x)
+
+aes_decrypt_block4x:
+	decrypt_block4x	v0, v1, v2, v3, w3, x2, x6, w7
+	ret
+ENDPROC(aes_decrypt_block4x)
+
+#elif INTERLEAVE == 8
+
+aes_encrypt_block8x:
+	encrypt_block8x	v0, v1, v2, v3, v8, v9, v10, v11, w3, x2, x6, w7
+	ret
+ENDPROC(aes_encrypt_block8x)
+
+aes_decrypt_block8x:
+	decrypt_block8x	v0, v1, v2, v3, v8, v9, v10, v11, w3, x2, x6, w7
+	ret
+ENDPROC(aes_decrypt_block8x)
+
+#endif
+	.macro		do_encrypt_block4x
+	bl		aes_encrypt_block4x
+	.endm
+
+	.macro		do_decrypt_block4x
+	bl		aes_decrypt_block4x
+	.endm
+
+	.macro		do_encrypt_block8x
+	bl		aes_encrypt_block8x
+	.endm
+
+	.macro		do_decrypt_block8x
+	bl		aes_decrypt_block8x
+	.endm
+#else
+#define FRAME_PUSH
+#define FRAME_POP
+
+	.macro		do_encrypt_block4x
+	encrypt_block4x	v0, v1, v2, v3, w3, x2, x6, w7
+	.endm
+
+	.macro		do_decrypt_block4x
+	decrypt_block4x	v0, v1, v2, v3, w3, x2, x6, w7
+	.endm
+
+	.macro		do_encrypt_block8x
+	encrypt_block8x	v0, v1, v2, v3, v8, v9, v10, v11, w3, x2, x6, w7
+	.endm
+
+	.macro		do_decrypt_block8x
+	decrypt_block8x	v0, v1, v2, v3, v8, v9, v10, v11, w3, x2, x6, w7
+	.endm
+#endif
+
+	/*
+	 * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+	 *		   int blocks, int first)
+	 * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+	 *		   int blocks, int first)
+	 */
+
+AES_ENTRY(aes_ecb_encrypt)
+	FRAME_PUSH
+	cbz		w5, .LecbencloopNx
+
+	enc_prepare	w3, x2, x5
+
+.LecbencloopNx:
+#if INTERLEAVE >= 4
+	subs		w4, w4, #INTERLEAVE
+	bmi		.Lecbenc1x
+	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
+#if INTERLEAVE == 8
+	ld1		{v8.16b-v11.16b}, [x1], #64	/* get 4 pt blocks */
+	do_encrypt_block8x
+	st1		{v0.16b-v3.16b}, [x0], #64
+	st1		{v8.16b-v11.16b}, [x0], #64
+#else
+	do_encrypt_block4x
+	st1		{v0.16b-v3.16b}, [x0], #64
+#endif
+	b		.LecbencloopNx
+.Lecbenc1x:
+	adds		w4, w4, #INTERLEAVE
+	beq		.Lecbencout
+#endif
+.Lecbencloop:
+	ld1		{v0.16b}, [x1], #16		/* get next pt block */
+	encrypt_block	v0, w3, x2, x5, w6
+	st1		{v0.16b}, [x0], #16
+	subs		w4, w4, #1
+	bne		.Lecbencloop
+.Lecbencout:
+	FRAME_POP
+	ret
+AES_ENDPROC(aes_ecb_encrypt)
+
+
+AES_ENTRY(aes_ecb_decrypt)
+	FRAME_PUSH
+	cbz		w5, .LecbdecloopNx
+
+	dec_prepare	w3, x2, x5
+
+.LecbdecloopNx:
+#if INTERLEAVE >= 4
+	subs		w4, w4, #INTERLEAVE
+	bmi		.Lecbdec1x
+	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
+#if INTERLEAVE == 8
+	ld1		{v8.16b-v11.16b}, [x1], #64	/* get 4 ct blocks */
+	do_decrypt_block8x
+	st1		{v0.16b-v3.16b}, [x0], #64
+	st1		{v8.16b-v11.16b}, [x0], #64
+#else
+	do_decrypt_block4x
+	st1		{v0.16b-v3.16b}, [x0], #64
+#endif
+	b		.LecbdecloopNx
+.Lecbdec1x:
+	adds		w4, w4, #INTERLEAVE
+	beq		.Lecbdecout
+#endif
+.Lecbdecloop:
+	ld1		{v0.16b}, [x1], #16		/* get next ct block */
+	decrypt_block	v0, w3, x2, x5, w6
+	st1		{v0.16b}, [x0], #16
+	subs		w4, w4, #1
+	bne		.Lecbdecloop
+.Lecbdecout:
+	FRAME_POP
+	ret
+AES_ENDPROC(aes_ecb_decrypt)
+
+
+	/*
+	 * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+	 *		   int blocks, u8 iv[], int first)
+	 * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+	 *		   int blocks, u8 iv[], int first)
+	 */
+
+AES_ENTRY(aes_cbc_encrypt)
+	cbz		w6, .Lcbcencloop
+
+	ld1		{v0.16b}, [x5]			/* get iv */
+	enc_prepare	w3, x2, x5
+
+.Lcbcencloop:
+	ld1		{v1.16b}, [x1], #16		/* get next pt block */
+	eor		v0.16b, v0.16b, v1.16b		/* ..and xor with iv */
+	encrypt_block	v0, w3, x2, x5, w6
+	st1		{v0.16b}, [x0], #16
+	subs		w4, w4, #1
+	bne		.Lcbcencloop
+	ret
+AES_ENDPROC(aes_cbc_encrypt)
+
+
+AES_ENTRY(aes_cbc_decrypt)
+	FRAME_PUSH
+	cbz		w6, .LcbcdecloopNx
+
+	ld1		{v7.16b}, [x5]			/* get iv */
+	dec_prepare	w3, x2, x5
+
+.LcbcdecloopNx:
+#if INTERLEAVE >= 4
+	subs		w4, w4, #INTERLEAVE
+	bmi		.Lcbcdec1x
+	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
+	mov		v4.16b, v0.16b
+	mov		v5.16b, v1.16b
+	mov		v6.16b, v2.16b
+#if INTERLEAVE == 8
+	ld1		{v8.16b-v11.16b}, [x1], #64	/* get 4 ct blocks */
+	mov		v12.16b, v3.16b
+	mov		v13.16b, v8.16b
+	mov		v14.16b, v9.16b
+	mov		v15.16b, v10.16b
+	do_decrypt_block8x
+#else
+	do_decrypt_block4x
+#endif
+	sub		x1, x1, #16
+	eor		v0.16b, v0.16b, v7.16b
+	eor		v1.16b, v1.16b, v4.16b
+	ld1		{v7.16b}, [x1], #16		/* reload 1 ct block */
+	eor		v2.16b, v2.16b, v5.16b
+	eor		v3.16b, v3.16b, v6.16b
+	st1		{v0.16b-v3.16b}, [x0], #64
+#if INTERLEAVE == 8
+	eor		v8.16b, v8.16b, v12.16b
+	eor		v9.16b, v9.16b, v13.16b
+	eor		v10.16b, v10.16b, v14.16b
+	eor		v11.16b, v11.16b, v15.16b
+	st1		{v8.16b-v11.16b}, [x0], #64
+#endif
+	b		.LcbcdecloopNx
+.Lcbcdec1x:
+	adds		w4, w4, #INTERLEAVE
+	beq		.Lcbcdecout
+#endif
+.Lcbcdecloop:
+	ld1		{v1.16b}, [x1], #16		/* get next ct block */
+	mov		v0.16b, v1.16b			/* ...and copy to v0 */
+	decrypt_block	v0, w3, x2, x5, w6
+	eor		v0.16b, v0.16b, v7.16b		/* xor with iv => pt */
+	mov		v7.16b, v1.16b			/* ct is next iv */
+	st1		{v0.16b}, [x0], #16
+	subs		w4, w4, #1
+	bne		.Lcbcdecloop
+.Lcbcdecout:
+	FRAME_POP
+	ret
+AES_ENDPROC(aes_cbc_decrypt)
+
+
+	/*
+	 * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+	 *		   int blocks, u8 ctr[], int first)
+	 */
+
+AES_ENTRY(aes_ctr_encrypt)
+	FRAME_PUSH
+	cbnz		w6, .Lctrfirst		/* 1st time around? */
+	umov		x5, v4.d[1]		/* keep swabbed ctr in reg */
+	rev		x5, x5
+#if INTERLEAVE >= 4
+	cmn		w5, w4			/* 32 bit overflow? */
+	bcs		.Lctrinc
+	add		x5, x5, #1		/* increment BE ctr */
+	b		.LctrincNx
+#else
+	b		.Lctrinc
+#endif
+.Lctrfirst:
+	enc_prepare	w3, x2, x6
+	ld1		{v4.16b}, [x5]
+	umov		x5, v4.d[1]		/* keep swabbed ctr in reg */
+	rev		x5, x5
+#if INTERLEAVE >= 4
+	cmn		w5, w4			/* 32 bit overflow? */
+	bcs		.Lctrloop
+.LctrloopNx:
+	subs		w4, w4, #INTERLEAVE
+	bmi		.Lctr1x
+	ldr		q8, =0x30000000200000001	/* addends 1,2,3[,0] */
+	dup		v7.4s, w5
+	mov		v0.16b, v4.16b
+	add		v7.4s, v7.4s, v8.4s
+	mov		v1.16b, v4.16b
+	rev32		v8.16b, v7.16b
+	mov		v2.16b, v4.16b
+	mov		v3.16b, v4.16b
+	mov		v1.4s[3], v8.s[0]
+	mov		v2.4s[3], v8.s[1]
+	mov		v3.4s[3], v8.s[2]
+#if INTERLEAVE == 8
+	movi		v8.4s, #4			/* addends 4,4,4,4 */
+	add		x6, x1, #64
+	add		v7.4s, v7.4s, v8.4s
+	mov		v8.16b, v4.16b
+	mov		v9.16b, v4.16b
+	ld1		{v12.16b-v15.16b}, [x6]		/* get 4 input blocks */
+	rev32		v7.16b, v7.16b
+	mov		v10.16b, v4.16b
+	mov		v11.16b, v4.16b
+	mov		v8.4s[3], v7.s[3]
+	mov		v9.4s[3], v7.s[0]
+	mov		v10.4s[3], v7.s[1]
+	mov		v11.4s[3], v7.s[2]
+	ld1		{v5.16b-v7.16b}, [x1], #48	/* get 3 input blocks */
+	do_encrypt_block8x
+#else
+	ld1		{v5.16b-v7.16b}, [x1], #48	/* get 3 input blocks */
+	do_encrypt_block4x
+#endif
+	eor		v0.16b, v5.16b, v0.16b
+	ld1		{v5.16b}, [x1], #16		/* get 1 input block  */
+	eor		v1.16b, v6.16b, v1.16b
+	eor		v2.16b, v7.16b, v2.16b
+	eor		v3.16b, v5.16b, v3.16b
+	st1		{v0.16b-v3.16b}, [x0], #64
+#if INTERLEAVE == 8
+	add		x1, x1, #64
+	eor		v12.16b, v8.16b, v12.16b
+	eor		v13.16b, v9.16b, v13.16b
+	eor		v14.16b, v10.16b, v14.16b
+	eor		v15.16b, v11.16b, v15.16b
+	st1		{v12.16b-v15.16b}, [x0], #64
+#endif
+	add		x5, x5, #INTERLEAVE
+.LctrincNx:
+	rev		x7, x5
+	ins		v4.d[1], x7
+	b		.LctrloopNx
+.Lctr1x:
+	adds		w4, w4, #INTERLEAVE
+	beq		.Lctrout
+#endif
+.Lctrloop:
+	mov		v0.16b, v4.16b
+	encrypt_block	v0, w3, x2, x6, w7
+	subs		w4, w4, #1
+	bmi		.Lctrhalfblock		/* blocks < 0 means 1/2 block */
+	ld1		{v3.16b}, [x1], #16
+	eor		v3.16b, v0.16b, v3.16b
+	st1		{v3.16b}, [x0], #16
+	beq		.Lctrout
+.Lctrinc:
+	adds		x5, x5, #1		/* increment BE ctr */
+	rev		x7, x5
+	ins		v4.d[1], x7
+	bcc		.Lctrloop		/* no overflow? */
+	umov		x7, v4.d[0]		/* load upper word of ctr  */
+	rev		x7, x7			/* ... to handle the carry */
+	add		x7, x7, #1
+	rev		x7, x7
+	ins		v4.d[0], x7
+	b		.Lctrloop
+.Lctrhalfblock:
+	ld1		{v3.8b}, [x1]
+	eor		v3.8b, v0.8b, v3.8b
+	st1		{v3.8b}, [x0]
+.Lctrout:
+	FRAME_POP
+	ret
+AES_ENDPROC(aes_ctr_encrypt)
+	.ltorg
+
+
+	/*
+	 * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
+	 *		   int blocks, u8 const rk2[], u8 iv[], int first)
+	 * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
+	 *		   int blocks, u8 const rk2[], u8 iv[], int first)
+	 */
+
+	.macro		next_tweak, out, in, const, tmp
+	sshr		\tmp\().2d,  \in\().2d,   #63
+	and		\tmp\().16b, \tmp\().16b, \const\().16b
+	add		\out\().2d,  \in\().2d,   \in\().2d
+	ext		\tmp\().16b, \tmp\().16b, \tmp\().16b, #8
+	eor		\out\().16b, \out\().16b, \tmp\().16b
+	.endm
+
+.Lxts_mul_x:
+	.word		1, 0, 0x87, 0
+
+AES_ENTRY(aes_xts_encrypt)
+	FRAME_PUSH
+	cbz		w7, .LxtsencloopNx
+
+	ld1		{v4.16b}, [x6]
+	enc_prepare	w3, x5, x6
+	encrypt_block	v4, w3, x5, x6, w7		/* first tweak */
+	enc_switch_key	w3, x2, x6
+	ldr		q7, .Lxts_mul_x
+	b		.LxtsencNx
+
+.LxtsencloopNx:
+	ldr		q7, .Lxts_mul_x
+	next_tweak	v4, v4, v7, v8
+.LxtsencNx:
+#if INTERLEAVE >= 4
+	subs		w4, w4, #INTERLEAVE
+	bmi		.Lxtsenc1x
+	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
+	next_tweak	v5, v4, v7, v8
+	eor		v0.16b, v0.16b, v4.16b
+	next_tweak	v6, v5, v7, v8
+	eor		v1.16b, v1.16b, v5.16b
+	eor		v2.16b, v2.16b, v6.16b
+#if INTERLEAVE == 8
+	next_tweak	v12, v6, v7, v16
+	eor		v3.16b, v3.16b, v12.16b
+	ld1		{v8.16b-v11.16b}, [x1], #64	/* get 4 ct blocks */
+	next_tweak	v13, v12, v7, v16
+	next_tweak	v14, v13, v7, v16
+	eor		v8.16b, v8.16b, v13.16b
+	next_tweak	v15, v14, v7, v16
+	eor		v9.16b, v9.16b, v14.16b
+	next_tweak	v7, v15, v7, v16
+	eor		v10.16b, v10.16b, v15.16b
+	eor		v11.16b, v11.16b, v7.16b
+	do_encrypt_block8x
+	eor		v3.16b, v3.16b, v12.16b
+#else
+	next_tweak	v7, v6, v7, v8
+	eor		v3.16b, v3.16b, v7.16b
+	do_encrypt_block4x
+	eor		v3.16b, v3.16b, v7.16b
+#endif
+	eor		v0.16b, v0.16b, v4.16b
+	eor		v1.16b, v1.16b, v5.16b
+	eor		v2.16b, v2.16b, v6.16b
+	st1		{v0.16b-v3.16b}, [x0], #64
+#if INTERLEAVE == 8
+	eor		v8.16b, v8.16b, v13.16b
+	eor		v9.16b, v9.16b, v14.16b
+	eor		v10.16b, v10.16b, v15.16b
+	eor		v11.16b, v11.16b, v7.16b
+	st1		{v8.16b-v11.16b}, [x0], #64
+#endif
+	mov		v4.16b, v7.16b
+	b		.LxtsencloopNx
+.Lxtsenc1x:
+	adds		w4, w4, #INTERLEAVE
+	beq		.Lxtsencout
+#endif
+.Lxtsencloop:
+	ld1		{v1.16b}, [x1], #16
+	eor		v0.16b, v1.16b, v4.16b
+	encrypt_block	v0, w3, x2, x6, w7
+	eor		v0.16b, v0.16b, v4.16b
+	st1		{v0.16b}, [x0], #16
+	subs		w4, w4, #1
+	beq		.Lxtsencout
+	next_tweak	v4, v4, v7, v8
+	b		.Lxtsencloop
+.Lxtsencout:
+	FRAME_POP
+	ret
+AES_ENDPROC(aes_xts_encrypt)
+
+
+AES_ENTRY(aes_xts_decrypt)
+	FRAME_PUSH
+	cbz		w7, .LxtsdecloopNx
+
+	ld1		{v4.16b}, [x6]
+	enc_prepare	w3, x5, x6
+	encrypt_block	v4, w3, x5, x6, w7		/* first tweak */
+	dec_prepare	w3, x2, x6
+	ldr		q7, .Lxts_mul_x
+	b		.LxtsdecNx
+
+.LxtsdecloopNx:
+	ldr		q7, .Lxts_mul_x
+	next_tweak	v4, v4, v7, v8
+.LxtsdecNx:
+#if INTERLEAVE >= 4
+	subs		w4, w4, #INTERLEAVE
+	bmi		.Lxtsdec1x
+	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
+	next_tweak	v5, v4, v7, v8
+	eor		v0.16b, v0.16b, v4.16b
+	next_tweak	v6, v5, v7, v8
+	eor		v1.16b, v1.16b, v5.16b
+	eor		v2.16b, v2.16b, v6.16b
+#if INTERLEAVE == 8
+	next_tweak	v12, v6, v7, v16
+	eor		v3.16b, v3.16b, v12.16b
+	ld1		{v8.16b-v11.16b}, [x1], #64	/* get 4 ct blocks */
+	next_tweak	v13, v12, v7, v16
+	next_tweak	v14, v13, v7, v16
+	eor		v8.16b, v8.16b, v13.16b
+	next_tweak	v15, v14, v7, v16
+	eor		v9.16b, v9.16b, v14.16b
+	next_tweak	v7, v15, v7, v16
+	eor		v10.16b, v10.16b, v15.16b
+	eor		v11.16b, v11.16b, v7.16b
+	do_decrypt_block8x
+	eor		v3.16b, v3.16b, v12.16b
+#else
+	next_tweak	v7, v6, v7, v8
+	eor		v3.16b, v3.16b, v7.16b
+	do_decrypt_block4x
+	eor		v3.16b, v3.16b, v7.16b
+#endif
+	eor		v0.16b, v0.16b, v4.16b
+	eor		v1.16b, v1.16b, v5.16b
+	eor		v2.16b, v2.16b, v6.16b
+	st1		{v0.16b-v3.16b}, [x0], #64
+#if INTERLEAVE == 8
+	eor		v8.16b, v8.16b, v13.16b
+	eor		v9.16b, v9.16b, v14.16b
+	eor		v10.16b, v10.16b, v15.16b
+	eor		v11.16b, v11.16b, v7.16b
+	st1		{v8.16b-v11.16b}, [x0], #64
+#endif
+	mov		v4.16b, v7.16b
+	b		.LxtsdecloopNx
+.Lxtsdec1x:
+	adds		w4, w4, #INTERLEAVE
+	beq		.Lxtsdecout
+#endif
+.Lxtsdecloop:
+	ld1		{v1.16b}, [x1], #16
+	eor		v0.16b, v1.16b, v4.16b
+	decrypt_block	v0, w3, x2, x6, w7
+	eor		v0.16b, v0.16b, v4.16b
+	st1		{v0.16b}, [x0], #16
+	subs		w4, w4, #1
+	beq		.Lxtsdecout
+	next_tweak	v4, v4, v7, v8
+	b		.Lxtsdecloop
+.Lxtsdecout:
+	FRAME_POP
+	ret
+AES_ENDPROC(aes_xts_decrypt)
diff --git a/arch/arm64/crypto/aes-neon.S b/arch/arm64/crypto/aes-neon.S
new file mode 100644
index 000000000000..a2aa6dab1f9e
--- /dev/null
+++ b/arch/arm64/crypto/aes-neon.S
@@ -0,0 +1,382 @@
+/*
+ * linux/arch/arm64/crypto/aes-neon.S - AES cipher for ARMv8 NEON
+ *
+ * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+
+#define AES_ENTRY(func)		ENTRY(neon_ ## func)
+#define AES_ENDPROC(func)	ENDPROC(neon_ ## func)
+
+	/* multiply by polynomial 'x' in GF(2^8) */
+	.macro		mul_by_x, out, in, temp, const
+	sshr		\temp, \in, #7
+	add		\out, \in, \in
+	and		\temp, \temp, \const
+	eor		\out, \out, \temp
+	.endm
+
+	/* preload the entire Sbox */
+	.macro		prepare, sbox, shiftrows, temp
+	adr		\temp, \sbox
+	movi		v12.16b, #0x40
+	ldr		q13, \shiftrows
+	movi		v14.16b, #0x1b
+	ld1		{v16.16b-v19.16b}, [\temp], #64
+	ld1		{v20.16b-v23.16b}, [\temp], #64
+	ld1		{v24.16b-v27.16b}, [\temp], #64
+	ld1		{v28.16b-v31.16b}, [\temp]
+	.endm
+
+	/* do preload for encryption */
+	.macro		enc_prepare, ignore0, ignore1, temp
+	prepare		.LForward_Sbox, .LForward_ShiftRows, \temp
+	.endm
+
+	.macro		enc_switch_key, ignore0, ignore1, temp
+	/* do nothing */
+	.endm
+
+	/* do preload for decryption */
+	.macro		dec_prepare, ignore0, ignore1, temp
+	prepare		.LReverse_Sbox, .LReverse_ShiftRows, \temp
+	.endm
+
+	/* apply SubBytes transformation using the preloaded Sbox */
+	.macro		sub_bytes, in
+	sub		v9.16b, \in\().16b, v12.16b
+	tbl		\in\().16b, {v16.16b-v19.16b}, \in\().16b
+	sub		v10.16b, v9.16b, v12.16b
+	tbx		\in\().16b, {v20.16b-v23.16b}, v9.16b
+	sub		v11.16b, v10.16b, v12.16b
+	tbx		\in\().16b, {v24.16b-v27.16b}, v10.16b
+	tbx		\in\().16b, {v28.16b-v31.16b}, v11.16b
+	.endm
+
+	/* apply MixColumns transformation */
+	.macro		mix_columns, in
+	mul_by_x	v10.16b, \in\().16b, v9.16b, v14.16b
+	rev32		v8.8h, \in\().8h
+	eor		\in\().16b, v10.16b, \in\().16b
+	shl		v9.4s, v8.4s, #24
+	sri		v9.4s, v8.4s, #8
+	shl		v11.4s, \in\().4s, #24
+	sri		v11.4s, \in\().4s, #8
+	eor		v9.16b, v9.16b, v8.16b
+	eor		v10.16b, v10.16b, v9.16b
+	eor		\in\().16b, v10.16b, v11.16b
+	.endm
+
+	/* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */
+	.macro		inv_mix_columns, in
+	mul_by_x	v11.16b, \in\().16b, v10.16b, v14.16b
+	mul_by_x	v11.16b, v11.16b, v10.16b, v14.16b
+	eor		\in\().16b, \in\().16b, v11.16b
+	rev32		v11.8h, v11.8h
+	eor		\in\().16b, \in\().16b, v11.16b
+	mix_columns	\in
+	.endm
+
+	.macro		do_block, enc, in, rounds, rk, rkp, i
+	ld1		{v15.16b}, [\rk]
+	add		\rkp, \rk, #16
+	mov		\i, \rounds
+1111:	eor		\in\().16b, \in\().16b, v15.16b		/* ^round key */
+	tbl		\in\().16b, {\in\().16b}, v13.16b	/* ShiftRows */
+	sub_bytes	\in
+	ld1		{v15.16b}, [\rkp], #16
+	subs		\i, \i, #1
+	beq		2222f
+	.if		\enc == 1
+	mix_columns	\in
+	.else
+	inv_mix_columns	\in
+	.endif
+	b		1111b
+2222:	eor		\in\().16b, \in\().16b, v15.16b		/* ^round key */
+	.endm
+
+	.macro		encrypt_block, in, rounds, rk, rkp, i
+	do_block	1, \in, \rounds, \rk, \rkp, \i
+	.endm
+
+	.macro		decrypt_block, in, rounds, rk, rkp, i
+	do_block	0, \in, \rounds, \rk, \rkp, \i
+	.endm
+
+	/*
+	 * Interleaved versions: functionally equivalent to the
+	 * ones above, but applied to 2 or 4 AES states in parallel.
+	 */
+
+	.macro		sub_bytes_2x, in0, in1
+	sub		v8.16b, \in0\().16b, v12.16b
+	sub		v9.16b, \in1\().16b, v12.16b
+	tbl		\in0\().16b, {v16.16b-v19.16b}, \in0\().16b
+	tbl		\in1\().16b, {v16.16b-v19.16b}, \in1\().16b
+	sub		v10.16b, v8.16b, v12.16b
+	sub		v11.16b, v9.16b, v12.16b
+	tbx		\in0\().16b, {v20.16b-v23.16b}, v8.16b
+	tbx		\in1\().16b, {v20.16b-v23.16b}, v9.16b
+	sub		v8.16b, v10.16b, v12.16b
+	sub		v9.16b, v11.16b, v12.16b
+	tbx		\in0\().16b, {v24.16b-v27.16b}, v10.16b
+	tbx		\in1\().16b, {v24.16b-v27.16b}, v11.16b
+	tbx		\in0\().16b, {v28.16b-v31.16b}, v8.16b
+	tbx		\in1\().16b, {v28.16b-v31.16b}, v9.16b
+	.endm
+
+	.macro		sub_bytes_4x, in0, in1, in2, in3
+	sub		v8.16b, \in0\().16b, v12.16b
+	tbl		\in0\().16b, {v16.16b-v19.16b}, \in0\().16b
+	sub		v9.16b, \in1\().16b, v12.16b
+	tbl		\in1\().16b, {v16.16b-v19.16b}, \in1\().16b
+	sub		v10.16b, \in2\().16b, v12.16b
+	tbl		\in2\().16b, {v16.16b-v19.16b}, \in2\().16b
+	sub		v11.16b, \in3\().16b, v12.16b
+	tbl		\in3\().16b, {v16.16b-v19.16b}, \in3\().16b
+	tbx		\in0\().16b, {v20.16b-v23.16b}, v8.16b
+	tbx		\in1\().16b, {v20.16b-v23.16b}, v9.16b
+	sub		v8.16b, v8.16b, v12.16b
+	tbx		\in2\().16b, {v20.16b-v23.16b}, v10.16b
+	sub		v9.16b, v9.16b, v12.16b
+	tbx		\in3\().16b, {v20.16b-v23.16b}, v11.16b
+	sub		v10.16b, v10.16b, v12.16b
+	tbx		\in0\().16b, {v24.16b-v27.16b}, v8.16b
+	sub		v11.16b, v11.16b, v12.16b
+	tbx		\in1\().16b, {v24.16b-v27.16b}, v9.16b
+	sub		v8.16b, v8.16b, v12.16b
+	tbx		\in2\().16b, {v24.16b-v27.16b}, v10.16b
+	sub		v9.16b, v9.16b, v12.16b
+	tbx		\in3\().16b, {v24.16b-v27.16b}, v11.16b
+	sub		v10.16b, v10.16b, v12.16b
+	tbx		\in0\().16b, {v28.16b-v31.16b}, v8.16b
+	sub		v11.16b, v11.16b, v12.16b
+	tbx		\in1\().16b, {v28.16b-v31.16b}, v9.16b
+	tbx		\in2\().16b, {v28.16b-v31.16b}, v10.16b
+	tbx		\in3\().16b, {v28.16b-v31.16b}, v11.16b
+	.endm
+
+	.macro		mul_by_x_2x, out0, out1, in0, in1, tmp0, tmp1, const
+	sshr		\tmp0\().16b, \in0\().16b,  #7
+	add		\out0\().16b, \in0\().16b,  \in0\().16b
+	sshr		\tmp1\().16b, \in1\().16b,  #7
+	and		\tmp0\().16b, \tmp0\().16b, \const\().16b
+	add		\out1\().16b, \in1\().16b,  \in1\().16b
+	and		\tmp1\().16b, \tmp1\().16b, \const\().16b
+	eor		\out0\().16b, \out0\().16b, \tmp0\().16b
+	eor		\out1\().16b, \out1\().16b, \tmp1\().16b
+	.endm
+
+	.macro		mix_columns_2x, in0, in1
+	mul_by_x_2x	v8, v9, \in0, \in1, v10, v11, v14
+	rev32		v10.8h, \in0\().8h
+	rev32		v11.8h, \in1\().8h
+	eor		\in0\().16b, v8.16b, \in0\().16b
+	eor		\in1\().16b, v9.16b, \in1\().16b
+	shl		v12.4s, v10.4s, #24
+	shl		v13.4s, v11.4s, #24
+	eor		v8.16b, v8.16b, v10.16b
+	sri		v12.4s, v10.4s, #8
+	shl		v10.4s, \in0\().4s, #24
+	eor		v9.16b, v9.16b, v11.16b
+	sri		v13.4s, v11.4s, #8
+	shl		v11.4s, \in1\().4s, #24
+	sri		v10.4s, \in0\().4s, #8
+	eor		\in0\().16b, v8.16b, v12.16b
+	sri		v11.4s, \in1\().4s, #8
+	eor		\in1\().16b, v9.16b, v13.16b
+	eor		\in0\().16b, v10.16b, \in0\().16b
+	eor		\in1\().16b, v11.16b, \in1\().16b
+	.endm
+
+	.macro		inv_mix_cols_2x, in0, in1
+	mul_by_x_2x	v8, v9, \in0, \in1, v10, v11, v14
+	mul_by_x_2x	v8, v9, v8, v9, v10, v11, v14
+	eor		\in0\().16b, \in0\().16b, v8.16b
+	eor		\in1\().16b, \in1\().16b, v9.16b
+	rev32		v8.8h, v8.8h
+	rev32		v9.8h, v9.8h
+	eor		\in0\().16b, \in0\().16b, v8.16b
+	eor		\in1\().16b, \in1\().16b, v9.16b
+	mix_columns_2x	\in0, \in1
+	.endm
+
+	.macro		inv_mix_cols_4x, in0, in1, in2, in3
+	mul_by_x_2x	v8, v9, \in0, \in1, v10, v11, v14
+	mul_by_x_2x	v10, v11, \in2, \in3, v12, v13, v14
+	mul_by_x_2x	v8, v9, v8, v9, v12, v13, v14
+	mul_by_x_2x	v10, v11, v10, v11, v12, v13, v14
+	eor		\in0\().16b, \in0\().16b, v8.16b
+	eor		\in1\().16b, \in1\().16b, v9.16b
+	eor		\in2\().16b, \in2\().16b, v10.16b
+	eor		\in3\().16b, \in3\().16b, v11.16b
+	rev32		v8.8h, v8.8h
+	rev32		v9.8h, v9.8h
+	rev32		v10.8h, v10.8h
+	rev32		v11.8h, v11.8h
+	eor		\in0\().16b, \in0\().16b, v8.16b
+	eor		\in1\().16b, \in1\().16b, v9.16b
+	eor		\in2\().16b, \in2\().16b, v10.16b
+	eor		\in3\().16b, \in3\().16b, v11.16b
+	mix_columns_2x	\in0, \in1
+	mix_columns_2x	\in2, \in3
+	.endm
+
+	.macro		do_block_2x, enc, in0, in1, rounds, rk, rkp, i
+	ld1		{v15.16b}, [\rk]
+	add		\rkp, \rk, #16
+	mov		\i, \rounds
+1111:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
+	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
+	sub_bytes_2x	\in0, \in1
+	tbl		\in0\().16b, {\in0\().16b}, v13.16b	/* ShiftRows */
+	tbl		\in1\().16b, {\in1\().16b}, v13.16b	/* ShiftRows */
+	ld1		{v15.16b}, [\rkp], #16
+	subs		\i, \i, #1
+	beq		2222f
+	.if		\enc == 1
+	mix_columns_2x	\in0, \in1
+	ldr		q13, .LForward_ShiftRows
+	.else
+	inv_mix_cols_2x	\in0, \in1
+	ldr		q13, .LReverse_ShiftRows
+	.endif
+	movi		v12.16b, #0x40
+	b		1111b
+2222:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
+	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
+	.endm
+
+	.macro		do_block_4x, enc, in0, in1, in2, in3, rounds, rk, rkp, i
+	ld1		{v15.16b}, [\rk]
+	add		\rkp, \rk, #16
+	mov		\i, \rounds
+1111:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
+	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
+	eor		\in2\().16b, \in2\().16b, v15.16b	/* ^round key */
+	eor		\in3\().16b, \in3\().16b, v15.16b	/* ^round key */
+	sub_bytes_4x	\in0, \in1, \in2, \in3
+	tbl		\in0\().16b, {\in0\().16b}, v13.16b	/* ShiftRows */
+	tbl		\in1\().16b, {\in1\().16b}, v13.16b	/* ShiftRows */
+	tbl		\in2\().16b, {\in2\().16b}, v13.16b	/* ShiftRows */
+	tbl		\in3\().16b, {\in3\().16b}, v13.16b	/* ShiftRows */
+	ld1		{v15.16b}, [\rkp], #16
+	subs		\i, \i, #1
+	beq		2222f
+	.if		\enc == 1
+	mix_columns_2x	\in0, \in1
+	mix_columns_2x	\in2, \in3
+	ldr		q13, .LForward_ShiftRows
+	.else
+	inv_mix_cols_4x	\in0, \in1, \in2, \in3
+	ldr		q13, .LReverse_ShiftRows
+	.endif
+	movi		v12.16b, #0x40
+	b		1111b
+2222:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
+	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
+	eor		\in2\().16b, \in2\().16b, v15.16b	/* ^round key */
+	eor		\in3\().16b, \in3\().16b, v15.16b	/* ^round key */
+	.endm
+
+	.macro		encrypt_block2x, in0, in1, rounds, rk, rkp, i
+	do_block_2x	1, \in0, \in1, \rounds, \rk, \rkp, \i
+	.endm
+
+	.macro		decrypt_block2x, in0, in1, rounds, rk, rkp, i
+	do_block_2x	0, \in0, \in1, \rounds, \rk, \rkp, \i
+	.endm
+
+	.macro		encrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i
+	do_block_4x	1, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i
+	.endm
+
+	.macro		decrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i
+	do_block_4x	0, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i
+	.endm
+
+#include "aes-modes.S"
+
+	.text
+	.align		4
+.LForward_ShiftRows:
+	.byte		0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3
+	.byte		0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb
+
+.LReverse_ShiftRows:
+	.byte		0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb
+	.byte		0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3
+
+.LForward_Sbox:
+	.byte		0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
+	.byte		0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
+	.byte		0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
+	.byte		0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
+	.byte		0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
+	.byte		0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
+	.byte		0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
+	.byte		0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
+	.byte		0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
+	.byte		0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
+	.byte		0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
+	.byte		0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
+	.byte		0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
+	.byte		0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
+	.byte		0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
+	.byte		0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
+	.byte		0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
+	.byte		0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
+	.byte		0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
+	.byte		0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
+	.byte		0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
+	.byte		0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
+	.byte		0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
+	.byte		0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
+	.byte		0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
+	.byte		0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
+	.byte		0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
+	.byte		0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
+	.byte		0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
+	.byte		0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
+	.byte		0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
+	.byte		0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
+
+.LReverse_Sbox:
+	.byte		0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
+	.byte		0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
+	.byte		0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
+	.byte		0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
+	.byte		0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
+	.byte		0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
+	.byte		0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
+	.byte		0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
+	.byte		0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
+	.byte		0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
+	.byte		0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
+	.byte		0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
+	.byte		0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
+	.byte		0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
+	.byte		0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
+	.byte		0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
+	.byte		0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
+	.byte		0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
+	.byte		0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
+	.byte		0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
+	.byte		0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
+	.byte		0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
+	.byte		0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
+	.byte		0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
+	.byte		0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
+	.byte		0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
+	.byte		0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
+	.byte		0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
+	.byte		0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
+	.byte		0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
+	.byte		0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
+	.byte		0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
-- 
1.8.3.2

* [PATCH resend 12/15] arm64/crypto: add shared macro to test for NEED_RESCHED
  2014-05-01 15:51 [PATCH resend 10/15] arm64: pull in <asm/simd.h> from asm-generic Ard Biesheuvel
  2014-05-01 15:51 ` [PATCH resend 11/15] arm64/crypto: AES-ECB/CBC/CTR/XTS using ARMv8 NEON and Crypto Extensions Ard Biesheuvel
@ 2014-05-01 15:51 ` Ard Biesheuvel
  2014-05-01 15:51 ` [PATCH resend 13/15] arm64/crypto: add voluntary preemption to Crypto Extensions SHA1 Ard Biesheuvel
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 8+ messages in thread
From: Ard Biesheuvel @ 2014-05-01 15:51 UTC (permalink / raw)
  To: linux-arm-kernel

This adds the asm macro 'b_if_no_resched', which branches to the given label
unless the calling task has been tagged for preemption: it only falls through
when a thread_info pointer was supplied and its TIF_NEED_RESCHED flag is set.
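
A typical use would look like this (hypothetical labels and registers,
mirroring the SHA-1 core patch later in this series: x5 is assumed to hold
the thread_info pointer passed in by the C glue code, or NULL in interrupt
context, x8 is a scratch register and w0 counts the blocks left to process):

	0:	/* ... process one block and decrement the block count in w0 ... */
		cbz		w0, 9f			/* all blocks done?         */
		b_if_no_resched	x5, x8, 0b		/* keep going if no resched */
	9:	ret					/* w0 = blocks left over    */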

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
 arch/arm64/include/asm/assembler.h | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h
index fd3e3924041b..296105fd3021 100644
--- a/arch/arm64/include/asm/assembler.h
+++ b/arch/arm64/include/asm/assembler.h
@@ -22,6 +22,11 @@
 
 #include <asm/ptrace.h>
 
+#if defined(CONFIG_PREEMPT) || defined(CONFIG_PREEMPT_VOLUNTARY)
+#include <asm/asm-offsets.h>
+#include <asm/thread_info.h>
+#endif
+
 /*
  * Stack pushing/popping (register pairs only). Equivalent to store decrement
  * before, load increment after.
@@ -146,3 +151,19 @@ lr	.req	x30		// link register
 #endif
 	orr	\rd, \lbits, \hbits, lsl #32
 	.endm
+
+/*
+ * Branch to 'lb' but only if we have not been tagged for preemption.
+ *
+ * Expects current->thread_info in ti, or NULL if running in interrupt
+ * context. reg is a scratch x register.
+ */
+	.macro		b_if_no_resched, ti, reg, lb
+#if defined(CONFIG_PREEMPT) || defined(CONFIG_PREEMPT_VOLUNTARY)
+	cbz		\ti, \lb			/* have thread_info? */
+	ldr		\reg, [\ti, #TI_FLAGS]		/* get flags */
+	tbz		\reg, #TIF_NEED_RESCHED, \lb	/* need rescheduling? */
+#else
+	b		\lb
+#endif
+	.endm
-- 
1.8.3.2

* [PATCH resend 13/15] arm64/crypto: add voluntary preemption to Crypto Extensions SHA1
  2014-05-01 15:51 [PATCH resend 10/15] arm64: pull in <asm/simd.h> from asm-generic Ard Biesheuvel
  2014-05-01 15:51 ` [PATCH resend 11/15] arm64/crypto: AES-ECB/CBC/CTR/XTS using ARMv8 NEON and Crypto Extensions Ard Biesheuvel
  2014-05-01 15:51 ` [PATCH resend 12/15] arm64/crypto: add shared macro to test for NEED_RESCHED Ard Biesheuvel
@ 2014-05-01 15:51 ` Ard Biesheuvel
  2014-05-13 18:58   ` Jussi Kivilinna
  2014-05-14  1:36   ` Herbert Xu
  2014-05-01 15:51 ` [PATCH resend 14/15] arm64/crypto: add voluntary preemption to Crypto Extensions SHA2 Ard Biesheuvel
  2014-05-01 15:51 ` [PATCH resend 15/15] arm64/crypto: add voluntary preemption to Crypto Extensions GHASH Ard Biesheuvel
  4 siblings, 2 replies; 8+ messages in thread
From: Ard Biesheuvel @ 2014-05-01 15:51 UTC (permalink / raw)
  To: linux-arm-kernel

The Crypto Extensions based SHA1 implementation uses the NEON register file,
and hence runs with preemption disabled. This patch adds a TIF_NEED_RESCHED
check to its inner loop so we at least give up the CPU voluntarily when we
are running in process context and have been tagged for preemption by the
scheduler.
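
In condensed form, the glue-side pattern introduced by the hunks below is
(identifiers as in the patch; this is only a distilled sketch of the
sha1_finup() change, not additional code):

	struct thread_info *ti = NULL;

	if ((IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY) ||
	     IS_ENABLED(CONFIG_PREEMPT)) && !in_interrupt())
		ti = current_thread_info();

	do {
		int rem;

		kernel_neon_begin_partial(16);
		rem = sha1_ce_transform(blocks, data, sctx->state, NULL, len, ti);
		kernel_neon_end();			/* preemption re-enabled here */

		data += (blocks - rem) * SHA1_BLOCK_SIZE;
		blocks = rem;				/* blocks not yet processed   */
	} while (ti && blocks > 0);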

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
 arch/arm64/crypto/sha1-ce-core.S | 19 ++++++++--------
 arch/arm64/crypto/sha1-ce-glue.c | 49 +++++++++++++++++++++++++++++++---------
 2 files changed, 48 insertions(+), 20 deletions(-)

diff --git a/arch/arm64/crypto/sha1-ce-core.S b/arch/arm64/crypto/sha1-ce-core.S
index bd4af29f2722..831e332f9331 100644
--- a/arch/arm64/crypto/sha1-ce-core.S
+++ b/arch/arm64/crypto/sha1-ce-core.S
@@ -66,8 +66,8 @@
 	.word		0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6
 
 	/*
-	 * void sha1_ce_transform(int blocks, u8 const *src, u32 *state,
-	 * 			  u8 *head, long bytes)
+	 * int sha1_ce_transform(int blocks, u8 const *src, u32 *state,
+	 * 			 u8 *head, long bytes, struct thread_info *ti)
 	 */
 ENTRY(sha1_ce_transform)
 	/* load round constants */
@@ -127,7 +127,13 @@ CPU_LE(	rev32		v11.16b, v11.16b	)
 	add		dgbv.2s, dgbv.2s, dg1v.2s
 	add		dgav.4s, dgav.4s, dg0v.4s
 
-	cbnz		w0, 0b
+	cbz		w0, 4f
+	b_if_no_resched	x5, x8, 0b
+
+	/* store new state */
+3:	str		dga, [x2]
+	str		dgb, [x2, #16]
+	ret
 
 	/*
 	 * Final block: add padding and total bit count.
@@ -135,7 +141,7 @@ CPU_LE(	rev32		v11.16b, v11.16b	)
 	 * size was not a round multiple of the block size, and the padding is
 	 * handled by the C code.
 	 */
-	cbz		x4, 3f
+4:	cbz		x4, 3b
 	movi		v9.2d, #0
 	mov		x8, #0x80000000
 	movi		v10.2d, #0
@@ -145,9 +151,4 @@ CPU_LE(	rev32		v11.16b, v11.16b	)
 	mov		v11.d[0], xzr
 	mov		v11.d[1], x7
 	b		2b
-
-	/* store new state */
-3:	str		dga, [x2]
-	str		dgb, [x2, #16]
-	ret
 ENDPROC(sha1_ce_transform)
diff --git a/arch/arm64/crypto/sha1-ce-glue.c b/arch/arm64/crypto/sha1-ce-glue.c
index 6fe83f37a750..69850a163668 100644
--- a/arch/arm64/crypto/sha1-ce-glue.c
+++ b/arch/arm64/crypto/sha1-ce-glue.c
@@ -20,8 +20,8 @@ MODULE_DESCRIPTION("SHA1 secure hash using ARMv8 Crypto Extensions");
 MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
 MODULE_LICENSE("GPL v2");
 
-asmlinkage void sha1_ce_transform(int blocks, u8 const *src, u32 *state,
-				  u8 *head, long bytes);
+asmlinkage int sha1_ce_transform(int blocks, u8 const *src, u32 *state,
+				 u8 *head, long bytes, struct thread_info *ti);
 
 static int sha1_init(struct shash_desc *desc)
 {
@@ -42,6 +42,7 @@ static int sha1_update(struct shash_desc *desc, const u8 *data,
 	sctx->count += len;
 
 	if ((partial + len) >= SHA1_BLOCK_SIZE) {
+		struct thread_info *ti = NULL;
 		int blocks;
 
 		if (partial) {
@@ -52,16 +53,30 @@ static int sha1_update(struct shash_desc *desc, const u8 *data,
 			len -= p;
 		}
 
+		/*
+		 * Pass current's thread info pointer to sha1_ce_transform()
+		 * below if we want it to play nice under preemption.
+		 */
+		if ((IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY) ||
+		     IS_ENABLED(CONFIG_PREEMPT)) && !in_interrupt())
+			ti = current_thread_info();
+
 		blocks = len / SHA1_BLOCK_SIZE;
 		len %= SHA1_BLOCK_SIZE;
 
-		kernel_neon_begin_partial(16);
-		sha1_ce_transform(blocks, data, sctx->state,
-				  partial ? sctx->buffer : NULL, 0);
-		kernel_neon_end();
+		do {
+			int rem;
+
+			kernel_neon_begin_partial(16);
+			rem = sha1_ce_transform(blocks, data, sctx->state,
+						partial ? sctx->buffer : NULL,
+						0, ti);
+			kernel_neon_end();
 
-		data += blocks * SHA1_BLOCK_SIZE;
-		partial = 0;
+			data += (blocks - rem) * SHA1_BLOCK_SIZE;
+			blocks = rem;
+			partial = 0;
+		} while (unlikely(ti && blocks > 0));
 	}
 	if (len)
 		memcpy(sctx->buffer + partial, data, len);
@@ -94,6 +109,7 @@ static int sha1_finup(struct shash_desc *desc, const u8 *data,
 		      unsigned int len, u8 *out)
 {
 	struct sha1_state *sctx = shash_desc_ctx(desc);
+	struct thread_info *ti = NULL;
 	__be32 *dst = (__be32 *)out;
 	int blocks;
 	int i;
@@ -111,9 +127,20 @@ static int sha1_finup(struct shash_desc *desc, const u8 *data,
 	 */
 	blocks = len / SHA1_BLOCK_SIZE;
 
-	kernel_neon_begin_partial(16);
-	sha1_ce_transform(blocks, data, sctx->state, NULL, len);
-	kernel_neon_end();
+	if ((IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY) ||
+	     IS_ENABLED(CONFIG_PREEMPT)) && !in_interrupt())
+		ti = current_thread_info();
+
+	do {
+		int rem;
+
+		kernel_neon_begin_partial(16);
+		rem = sha1_ce_transform(blocks, data, sctx->state,
+					NULL, len, ti);
+		kernel_neon_end();
+		data += (blocks - rem) * SHA1_BLOCK_SIZE;
+		blocks = rem;
+	} while (unlikely(ti && blocks > 0));
 
 	for (i = 0; i < SHA1_DIGEST_SIZE / sizeof(__be32); i++)
 		put_unaligned_be32(sctx->state[i], dst++);
-- 
1.8.3.2

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH resend 14/15] arm64/crypto: add voluntary preemption to Crypto Extensions SHA2
  2014-05-01 15:51 [PATCH resend 10/15] arm64: pull in <asm/simd.h> from asm-generic Ard Biesheuvel
                   ` (2 preceding siblings ...)
  2014-05-01 15:51 ` [PATCH resend 13/15] arm64/crypto: add voluntary preemption to Crypto Extensions SHA1 Ard Biesheuvel
@ 2014-05-01 15:51 ` Ard Biesheuvel
  2014-05-01 15:51 ` [PATCH resend 15/15] arm64/crypto: add voluntary preemption to Crypto Extensions GHASH Ard Biesheuvel
  4 siblings, 0 replies; 8+ messages in thread
From: Ard Biesheuvel @ 2014-05-01 15:51 UTC (permalink / raw)
  To: linux-arm-kernel

The Crypto Extensions based SHA2 implementation uses the NEON register file,
and hence runs with preemption disabled. This patch adds a TIF_NEED_RESCHED
check to its inner loop so we at least give up the CPU voluntarily when we
are running in process context and have been tagged for preemption by the
scheduler.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
 arch/arm64/crypto/sha2-ce-core.S | 19 ++++++++-------
 arch/arm64/crypto/sha2-ce-glue.c | 51 ++++++++++++++++++++++++++++++----------
 2 files changed, 50 insertions(+), 20 deletions(-)

diff --git a/arch/arm64/crypto/sha2-ce-core.S b/arch/arm64/crypto/sha2-ce-core.S
index 53e750614169..46b669d91c29 100644
--- a/arch/arm64/crypto/sha2-ce-core.S
+++ b/arch/arm64/crypto/sha2-ce-core.S
@@ -73,8 +73,8 @@
 	.word		0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
 
 	/*
-	 * void sha2_ce_transform(int blocks, u8 const *src, u32 *state,
-	 *                        u8 *head, long bytes)
+	 * int sha2_ce_transform(int blocks, u8 const *src, u32 *state,
+	 *                       u8 *head, long bytes, struct thread_info *ti)
 	 */
 ENTRY(sha2_ce_transform)
 	/* load round constants */
@@ -131,7 +131,14 @@ CPU_LE(	rev32		v19.16b, v19.16b	)
 	add		dgbv.4s, dgbv.4s, dg1v.4s
 
 	/* handled all input blocks? */
-	cbnz		w0, 0b
+	cbz		w0, 4f
+
+	/* should we exit early? */
+	b_if_no_resched	x5, x8, 0b
+
+	/* store new state */
+3:	stp		dga, dgb, [x2]
+	ret
 
 	/*
 	 * Final block: add padding and total bit count.
@@ -139,7 +146,7 @@ CPU_LE(	rev32		v19.16b, v19.16b	)
 	 * size was not a round multiple of the block size, and the padding is
 	 * handled by the C code.
 	 */
-	cbz		x4, 3f
+4:	cbz		x4, 3b
 	movi		v17.2d, #0
 	mov		x8, #0x80000000
 	movi		v18.2d, #0
@@ -149,8 +156,4 @@ CPU_LE(	rev32		v19.16b, v19.16b	)
 	mov		v19.d[0], xzr
 	mov		v19.d[1], x7
 	b		2b
-
-	/* store new state */
-3:	stp		dga, dgb, [x2]
-	ret
 ENDPROC(sha2_ce_transform)
diff --git a/arch/arm64/crypto/sha2-ce-glue.c b/arch/arm64/crypto/sha2-ce-glue.c
index 81617262b3df..6566ad3fdf82 100644
--- a/arch/arm64/crypto/sha2-ce-glue.c
+++ b/arch/arm64/crypto/sha2-ce-glue.c
@@ -1,4 +1,4 @@
-/*
+/*
  * sha2-ce-glue.c - SHA-224/SHA-256 using ARMv8 Crypto Extensions
  *
  * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
@@ -20,8 +20,8 @@ MODULE_DESCRIPTION("SHA-224/SHA-256 secure hash using ARMv8 Crypto Extensions");
 MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
 MODULE_LICENSE("GPL v2");
 
-asmlinkage void sha2_ce_transform(int blocks, u8 const *src, u32 *state,
-				 u8 *head, long bytes);
+asmlinkage int sha2_ce_transform(int blocks, u8 const *src, u32 *state,
+				 u8 *head, long bytes, struct thread_info *ti);
 
 static int sha224_init(struct shash_desc *desc)
 {
@@ -58,6 +58,7 @@ static int sha2_update(struct shash_desc *desc, const u8 *data,
 	sctx->count += len;
 
 	if ((partial + len) >= SHA256_BLOCK_SIZE) {
+		struct thread_info *ti = NULL;
 		int blocks;
 
 		if (partial) {
@@ -68,16 +69,30 @@ static int sha2_update(struct shash_desc *desc, const u8 *data,
 			len -= p;
 		}
 
+		/*
+		 * Pass current's thread info pointer to sha2_ce_transform()
+		 * below if we want it to play nice under preemption.
+		 */
+		if ((IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY) ||
+		     IS_ENABLED(CONFIG_PREEMPT)) && !in_interrupt())
+			ti = current_thread_info();
+
 		blocks = len / SHA256_BLOCK_SIZE;
 		len %= SHA256_BLOCK_SIZE;
 
-		kernel_neon_begin_partial(28);
-		sha2_ce_transform(blocks, data, sctx->state,
-				  partial ? sctx->buf : NULL, 0);
-		kernel_neon_end();
+		do {
+			int rem;
+
+			kernel_neon_begin_partial(28);
+			rem = sha2_ce_transform(blocks, data, sctx->state,
+						partial ? sctx->buf : NULL,
+						0, ti);
+			kernel_neon_end();
 
-		data += blocks * SHA256_BLOCK_SIZE;
-		partial = 0;
+			data += (blocks - rem) * SHA256_BLOCK_SIZE;
+			blocks = rem;
+			partial = 0;
+		} while (unlikely(ti && blocks > 0));
 	}
 	if (len)
 		memcpy(sctx->buf + partial, data, len);
@@ -131,6 +146,7 @@ static void sha2_finup(struct shash_desc *desc, const u8 *data,
 		       unsigned int len)
 {
 	struct sha256_state *sctx = shash_desc_ctx(desc);
+	struct thread_info *ti = NULL;
 	int blocks;
 
 	if (sctx->count || !len || (len % SHA256_BLOCK_SIZE)) {
@@ -147,9 +163,20 @@ static void sha2_finup(struct shash_desc *desc, const u8 *data,
 	 */
 	blocks = len / SHA256_BLOCK_SIZE;
 
-	kernel_neon_begin_partial(28);
-	sha2_ce_transform(blocks, data, sctx->state, NULL, len);
-	kernel_neon_end();
+	if ((IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY) ||
+	     IS_ENABLED(CONFIG_PREEMPT)) && !in_interrupt())
+		ti = current_thread_info();
+
+	do {
+		int rem;
+
+		kernel_neon_begin_partial(28);
+		rem = sha2_ce_transform(blocks, data, sctx->state,
+					NULL, len, ti);
+		kernel_neon_end();
+		data += (blocks - rem) * SHA256_BLOCK_SIZE;
+		blocks = rem;
+	} while (unlikely(ti && blocks > 0));
 }
 
 static int sha224_finup(struct shash_desc *desc, const u8 *data,
-- 
1.8.3.2

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH resend 15/15] arm64/crypto: add voluntary preemption to Crypto Extensions GHASH
  2014-05-01 15:51 [PATCH resend 10/15] arm64: pull in <asm/simd.h> from asm-generic Ard Biesheuvel
                   ` (3 preceding siblings ...)
  2014-05-01 15:51 ` [PATCH resend 14/15] arm64/crypto: add voluntary preemption to Crypto Extensions SHA2 Ard Biesheuvel
@ 2014-05-01 15:51 ` Ard Biesheuvel
  4 siblings, 0 replies; 8+ messages in thread
From: Ard Biesheuvel @ 2014-05-01 15:51 UTC (permalink / raw)
  To: linux-arm-kernel

The Crypto Extensions based GHASH implementation uses the NEON register file,
and hence runs with preemption disabled. This patch adds a TIF_NEED_RESCHED
check to its inner loop so we at least give up the CPU voluntarily when we
are running in process context and have been tagged for preemption by the
scheduler.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
 arch/arm64/crypto/ghash-ce-core.S | 10 ++++++----
 arch/arm64/crypto/ghash-ce-glue.c | 33 +++++++++++++++++++++++++--------
 2 files changed, 31 insertions(+), 12 deletions(-)

diff --git a/arch/arm64/crypto/ghash-ce-core.S b/arch/arm64/crypto/ghash-ce-core.S
index b9e6eaf41c9b..523432f24ed2 100644
--- a/arch/arm64/crypto/ghash-ce-core.S
+++ b/arch/arm64/crypto/ghash-ce-core.S
@@ -31,8 +31,9 @@
 	.arch		armv8-a+crypto
 
 	/*
-	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
-	 *			   struct ghash_key const *k, const char *head)
+	 * int pmull_ghash_update(int blocks, u64 dg[], const char *src,
+	 *			  struct ghash_key const *k, const char *head,
+	 * 			  struct thread_info *ti)
 	 */
 ENTRY(pmull_ghash_update)
 	ld1		{DATA.16b}, [x1]
@@ -88,8 +89,9 @@ CPU_LE(	rev64		IN1.16b, IN1.16b	)
 	eor		T1.16b, T1.16b, T2.16b
 	eor		DATA.16b, DATA.16b, T1.16b
 
-	cbnz		w0, 0b
+	cbz		w0, 2f
+	b_if_no_resched	x5, x7, 0b
 
-	st1		{DATA.16b}, [x1]
+2:	st1		{DATA.16b}, [x1]
 	ret
 ENDPROC(pmull_ghash_update)
diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c
index b92baf3f68c7..4df64832617d 100644
--- a/arch/arm64/crypto/ghash-ce-glue.c
+++ b/arch/arm64/crypto/ghash-ce-glue.c
@@ -33,8 +33,9 @@ struct ghash_desc_ctx {
 	u32 count;
 };
 
-asmlinkage void pmull_ghash_update(int blocks, u64 dg[], const char *src,
-				   struct ghash_key const *k, const char *head);
+asmlinkage int pmull_ghash_update(int blocks, u64 dg[], const char *src,
+				  struct ghash_key const *k, const char *head,
+				  struct thread_info *ti);
 
 static int ghash_init(struct shash_desc *desc)
 {
@@ -54,6 +55,7 @@ static int ghash_update(struct shash_desc *desc, const u8 *src,
 
 	if ((partial + len) >= GHASH_BLOCK_SIZE) {
 		struct ghash_key *key = crypto_shash_ctx(desc->tfm);
+		struct thread_info *ti = NULL;
 		int blocks;
 
 		if (partial) {
@@ -64,14 +66,29 @@ static int ghash_update(struct shash_desc *desc, const u8 *src,
 			len -= p;
 		}
 
+		/*
+		 * Pass current's thread info pointer to pmull_ghash_update()
+		 * below if we want it to play nice under preemption.
+		 */
+		if ((IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY) ||
+		     IS_ENABLED(CONFIG_PREEMPT)) && !in_interrupt())
+			ti = current_thread_info();
+
 		blocks = len / GHASH_BLOCK_SIZE;
 		len %= GHASH_BLOCK_SIZE;
 
-		kernel_neon_begin_partial(6);
-		pmull_ghash_update(blocks, ctx->digest, src, key,
-				   partial ? ctx->buf : NULL);
-		kernel_neon_end();
-		src += blocks * GHASH_BLOCK_SIZE;
+		do {
+			int rem;
+
+			kernel_neon_begin_partial(6);
+			rem = pmull_ghash_update(blocks, ctx->digest, src, key,
+						 partial ? ctx->buf : NULL, ti);
+			kernel_neon_end();
+
+			src += (blocks - rem) * GHASH_BLOCK_SIZE;
+			blocks = rem;
+			partial = 0;
+		} while (unlikely(ti && blocks > 0));
 	}
 	if (len)
 		memcpy(ctx->buf + partial, src, len);
@@ -89,7 +106,7 @@ static int ghash_final(struct shash_desc *desc, u8 *dst)
 		memset(ctx->buf + partial, 0, GHASH_BLOCK_SIZE - partial);
 
 		kernel_neon_begin_partial(6);
-		pmull_ghash_update(1, ctx->digest, ctx->buf, key, NULL);
+		pmull_ghash_update(1, ctx->digest, ctx->buf, key, NULL, NULL);
 		kernel_neon_end();
 	}
 	put_unaligned_be64(ctx->digest[1], dst);
-- 
1.8.3.2

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH resend 13/15] arm64/crypto: add voluntary preemption to Crypto Extensions SHA1
  2014-05-01 15:51 ` [PATCH resend 13/15] arm64/crypto: add voluntary preemption to Crypto Extensions SHA1 Ard Biesheuvel
@ 2014-05-13 18:58   ` Jussi Kivilinna
  2014-05-14  1:36   ` Herbert Xu
  1 sibling, 0 replies; 8+ messages in thread
From: Jussi Kivilinna @ 2014-05-13 18:58 UTC (permalink / raw)
  To: linux-arm-kernel

On 01.05.2014 18:51, Ard Biesheuvel wrote:
> The Crypto Extensions based SHA1 implementation uses the NEON register file,
> and hence runs with preemption disabled. This patch adds a TIF_NEED_RESCHED
> check to its inner loop so we at least give up the CPU voluntarily when we
> are running in process context and have been tagged for preemption by the
> scheduler.
> 
> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
> ---
<snip>
> @@ -42,6 +42,7 @@ static int sha1_update(struct shash_desc *desc, const u8 *data,
>  	sctx->count += len;
>  
>  	if ((partial + len) >= SHA1_BLOCK_SIZE) {
> +		struct thread_info *ti = NULL;
>  		int blocks;
>  
>  		if (partial) {
> @@ -52,16 +53,30 @@ static int sha1_update(struct shash_desc *desc, const u8 *data,
>  			len -= p;
>  		}
>  
> +		/*
> +		 * Pass current's thread info pointer to sha1_ce_transform()
> +		 * below if we want it to play nice under preemption.
> +		 */
> +		if ((IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY) ||
> +		     IS_ENABLED(CONFIG_PREEMPT)) && !in_interrupt())
> +			ti = current_thread_info();
> +
>  		blocks = len / SHA1_BLOCK_SIZE;
>  		len %= SHA1_BLOCK_SIZE;
>  
> -		kernel_neon_begin_partial(16);
> -		sha1_ce_transform(blocks, data, sctx->state,
> -				  partial ? sctx->buffer : NULL, 0);
> -		kernel_neon_end();
> +		do {
> +			int rem;
> +
> +			kernel_neon_begin_partial(16);
> +			rem = sha1_ce_transform(blocks, data, sctx->state,
> +						partial ? sctx->buffer : NULL,
> +						0, ti);
> +			kernel_neon_end();
>  
> -		data += blocks * SHA1_BLOCK_SIZE;
> -		partial = 0;
> +			data += (blocks - rem) * SHA1_BLOCK_SIZE;
> +			blocks = rem;
> +			partial = 0;
> +		} while (unlikely(ti && blocks > 0));
>  	}
>  	if (len)
>  		memcpy(sctx->buffer + partial, data, len);
> @@ -94,6 +109,7 @@ static int sha1_finup(struct shash_desc *desc, const u8 *data,
>  		      unsigned int len, u8 *out)
>  {
>  	struct sha1_state *sctx = shash_desc_ctx(desc);
> +	struct thread_info *ti = NULL;
>  	__be32 *dst = (__be32 *)out;
>  	int blocks;
>  	int i;
> @@ -111,9 +127,20 @@ static int sha1_finup(struct shash_desc *desc, const u8 *data,
>  	 */
>  	blocks = len / SHA1_BLOCK_SIZE;
>  
> -	kernel_neon_begin_partial(16);
> -	sha1_ce_transform(blocks, data, sctx->state, NULL, len);
> -	kernel_neon_end();
> +	if ((IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY) ||
> +	     IS_ENABLED(CONFIG_PREEMPT)) && !in_interrupt())
> +		ti = current_thread_info();
> +
> +	do {
> +		int rem;
> +
> +		kernel_neon_begin_partial(16);
> +		rem = sha1_ce_transform(blocks, data, sctx->state,
> +					NULL, len, ti);
> +		kernel_neon_end();
> +		data += (blocks - rem) * SHA1_BLOCK_SIZE;
> +		blocks = rem;
> +	} while (unlikely(ti && blocks > 0));
>  

These two loops seem to be similar; how about renaming the assembly function
to __sha1_ce_transform and moving this loop into a new sha1_ce_transform?
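
A minimal sketch of what that could look like, keeping the six-argument
prototype from this patch (illustrative only, not taken from any later
revision of the series):

	asmlinkage int __sha1_ce_transform(int blocks, u8 const *src, u32 *state,
					   u8 *head, long bytes,
					   struct thread_info *ti);

	static void sha1_ce_transform(int blocks, u8 const *src, u32 *state,
				      u8 *head, long bytes,
				      struct thread_info *ti)
	{
		/* retry loop shared by sha1_update() and sha1_finup() */
		do {
			int rem;

			kernel_neon_begin_partial(16);
			rem = __sha1_ce_transform(blocks, src, state, head,
						  bytes, ti);
			kernel_neon_end();

			src += (blocks - rem) * SHA1_BLOCK_SIZE;
			blocks = rem;
			head = NULL;	/* head block is consumed on the first pass */
		} while (ti && blocks > 0);
	}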

Otherwise, the patches look good.

-Jussi

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH resend 13/15] arm64/crypto: add voluntary preemption to Crypto Extensions SHA1
  2014-05-01 15:51 ` [PATCH resend 13/15] arm64/crypto: add voluntary preemption to Crypto Extensions SHA1 Ard Biesheuvel
  2014-05-13 18:58   ` Jussi Kivilinna
@ 2014-05-14  1:36   ` Herbert Xu
  1 sibling, 0 replies; 8+ messages in thread
From: Herbert Xu @ 2014-05-14  1:36 UTC (permalink / raw)
  To: linux-arm-kernel

Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote:
>
> +               /*
> +                * Pass current's thread info pointer to sha1_ce_transform()
> +                * below if we want it to play nice under preemption.
> +                */
> +               if ((IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY) ||
> +                    IS_ENABLED(CONFIG_PREEMPT)) && !in_interrupt())
> +                       ti = current_thread_info();

Instead of in_interrupt() you should test CRYPTO_TFM_REQ_MAY_SLEEP.
It's not a matter of correctness, but it may avoid unnecessary
breaks in the loop.
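
Applied to the quoted hunk, that might look like the following sketch (it
assumes the shash_desc of this kernel version still carries the request
flags in desc->flags):

	/* sketch: let the caller state whether sleeping is acceptable */
	if ((IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY) ||
	     IS_ENABLED(CONFIG_PREEMPT)) &&
	    (desc->flags & CRYPTO_TFM_REQ_MAY_SLEEP))
		ti = current_thread_info();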

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2014-05-14  1:36 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-05-01 15:51 [PATCH resend 10/15] arm64: pull in <asm/simd.h> from asm-generic Ard Biesheuvel
2014-05-01 15:51 ` [PATCH resend 11/15] arm64/crypto: AES-ECB/CBC/CTR/XTS using ARMv8 NEON and Crypto Extensions Ard Biesheuvel
2014-05-01 15:51 ` [PATCH resend 12/15] arm64/crypto: add shared macro to test for NEED_RESCHED Ard Biesheuvel
2014-05-01 15:51 ` [PATCH resend 13/15] arm64/crypto: add voluntary preemption to Crypto Extensions SHA1 Ard Biesheuvel
2014-05-13 18:58   ` Jussi Kivilinna
2014-05-14  1:36   ` Herbert Xu
2014-05-01 15:51 ` [PATCH resend 14/15] arm64/crypto: add voluntary preemption to Crypto Extensions SHA2 Ard Biesheuvel
2014-05-01 15:51 ` [PATCH resend 15/15] arm64/crypto: add voluntary preemption to Crypto Extensions GHASH Ard Biesheuvel
