Linux cryptographic layer development

Linux cryptographic layer development
 help / color / mirror / Atom feed

* [PATCH v3 5/6] crypto: arm64/crc32 - accelerated support based on x86 SSE implementation
From: Ard Biesheuvel @ 2016-12-05 18:42 UTC (permalink / raw)
  To: linux-crypto, herbert; +Cc: linux-arm-kernel, Ard Biesheuvel
In-Reply-To: <1480963348-24203-1-git-send-email-ard.biesheuvel@linaro.org>

This is a combination of the the Intel algorithm implemented using SSE
and PCLMULQDQ instructions from arch/x86/crypto/crc32-pclmul_asm.S, and
the new CRC32 extensions introduced for both 32-bit and 64-bit ARM in
version 8 of the architecture. Two versions of the above combo are
provided, one for CRC32 and one for CRC32C.

The PMULL/NEON algorithm is faster, but operates on blocks of at least
64 bytes, and on multiples of 16 bytes only. For the remaining input,
or for all input on systems that lack the PMULL 64x64->128 instructions,
the CRC32 instructions will be used.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
 arch/arm64/crypto/Kconfig         |   6 +
 arch/arm64/crypto/Makefile        |   3 +
 arch/arm64/crypto/crc32-ce-core.S | 266 ++++++++++++++++++++
 arch/arm64/crypto/crc32-ce-glue.c | 212 ++++++++++++++++
 4 files changed, 487 insertions(+)

diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
index d773c0659202..21835deb1ab9 100644
--- a/arch/arm64/crypto/Kconfig
+++ b/arch/arm64/crypto/Kconfig
@@ -28,6 +28,11 @@ config CRYPTO_CRCT10DIF_ARM64_CE
 	depends on KERNEL_MODE_NEON && CRC_T10DIF
 	select CRYPTO_HASH
 
+config CRYPTO_CRC32_ARM64_CE
+	tristate "CRC32 and CRC32C digest algorithms using PMULL instructions"
+	depends on KERNEL_MODE_NEON && CRC32
+	select CRYPTO_HASH
+
 config CRYPTO_AES_ARM64_CE
 	tristate "AES core cipher using ARMv8 Crypto Extensions"
 	depends on ARM64 && KERNEL_MODE_NEON
@@ -58,4 +63,5 @@ config CRYPTO_CRC32_ARM64
 	tristate "CRC32 and CRC32C using optional ARMv8 instructions"
 	depends on ARM64
 	select CRYPTO_HASH
+
 endif
diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile
index 36fd3eb4201b..144387805a46 100644
--- a/arch/arm64/crypto/Makefile
+++ b/arch/arm64/crypto/Makefile
@@ -20,6 +20,9 @@ ghash-ce-y := ghash-ce-glue.o ghash-ce-core.o
 obj-$(CONFIG_CRYPTO_CRCT10DIF_ARM64_CE) += crct10dif-ce.o
 crct10dif-ce-y := crct10dif-ce-core.o crct10dif-ce-glue.o
 
+obj-$(CONFIG_CRYPTO_CRC32_ARM64_CE) += crc32-ce.o
+crc32-ce-y:= crc32-ce-core.o crc32-ce-glue.o
+
 obj-$(CONFIG_CRYPTO_AES_ARM64_CE) += aes-ce-cipher.o
 CFLAGS_aes-ce-cipher.o += -march=armv8-a+crypto
 
diff --git a/arch/arm64/crypto/crc32-ce-core.S b/arch/arm64/crypto/crc32-ce-core.S
new file mode 100644
index 000000000000..18f5a8442276
--- /dev/null
+++ b/arch/arm64/crypto/crc32-ce-core.S
@@ -0,0 +1,266 @@
+/*
+ * Accelerated CRC32(C) using arm64 CRC, NEON and Crypto Extensions instructions
+ *
+ * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see http://www.gnu.org/licenses
+ *
+ * Please  visit http://www.xyratex.com/contact if you need additional
+ * information or have any questions.
+ *
+ * GPL HEADER END
+ */
+
+/*
+ * Copyright 2012 Xyratex Technology Limited
+ *
+ * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32
+ * calculation.
+ * CRC32 polynomial:0x04c11db7(BE)/0xEDB88320(LE)
+ * PCLMULQDQ is a new instruction in Intel SSE4.2, the reference can be found
+ * at:
+ * http://www.intel.com/products/processor/manuals/
+ * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
+ * Volume 2B: Instruction Set Reference, N-Z
+ *
+ * Authors:   Gregory Prestas <Gregory_Prestas@us.xyratex.com>
+ *	      Alexander Boyko <Alexander_Boyko@xyratex.com>
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+	.text
+	.align		6
+	.cpu		generic+crypto+crc
+
+.Lcrc32_constants:
+	/*
+	 * [x4*128+32 mod P(x) << 32)]'  << 1   = 0x154442bd4
+	 * #define CONSTANT_R1  0x154442bd4LL
+	 *
+	 * [(x4*128-32 mod P(x) << 32)]' << 1   = 0x1c6e41596
+	 * #define CONSTANT_R2  0x1c6e41596LL
+	 */
+	.octa		0x00000001c6e415960000000154442bd4
+
+	/*
+	 * [(x128+32 mod P(x) << 32)]'   << 1   = 0x1751997d0
+	 * #define CONSTANT_R3  0x1751997d0LL
+	 *
+	 * [(x128-32 mod P(x) << 32)]'   << 1   = 0x0ccaa009e
+	 * #define CONSTANT_R4  0x0ccaa009eLL
+	 */
+	.octa		0x00000000ccaa009e00000001751997d0
+
+	/*
+	 * [(x64 mod P(x) << 32)]'       << 1   = 0x163cd6124
+	 * #define CONSTANT_R5  0x163cd6124LL
+	 */
+	.quad		0x0000000163cd6124
+	.quad		0x00000000FFFFFFFF
+
+	/*
+	 * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL
+	 *
+	 * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))`
+	 *                                                      = 0x1F7011641LL
+	 * #define CONSTANT_RU  0x1F7011641LL
+	 */
+	.octa		0x00000001F701164100000001DB710641
+
+.Lcrc32c_constants:
+	.octa		0x000000009e4addf800000000740eef02
+	.octa		0x000000014cd00bd600000000f20c0dfe
+	.quad		0x00000000dd45aab8
+	.quad		0x00000000FFFFFFFF
+	.octa		0x00000000dea713f10000000105ec76f0
+
+	vCONSTANT	.req	v0
+	dCONSTANT	.req	d0
+	qCONSTANT	.req	q0
+
+	BUF		.req	x0
+	LEN		.req	x1
+	CRC		.req	x2
+
+	vzr		.req	v9
+
+	/**
+	 * Calculate crc32
+	 * BUF - buffer
+	 * LEN - sizeof buffer (multiple of 16 bytes), LEN should be > 63
+	 * CRC - initial crc32
+	 * return %eax crc32
+	 * uint crc32_pmull_le(unsigned char const *buffer,
+	 *                     size_t len, uint crc32)
+	 */
+ENTRY(crc32_pmull_le)
+	adr		x3, .Lcrc32_constants
+	b		0f
+
+ENTRY(crc32c_pmull_le)
+	adr		x3, .Lcrc32c_constants
+
+0:	bic		LEN, LEN, #15
+	ld1		{v1.16b-v4.16b}, [BUF], #0x40
+	movi		vzr.16b, #0
+	fmov		dCONSTANT, CRC
+	eor		v1.16b, v1.16b, vCONSTANT.16b
+	sub		LEN, LEN, #0x40
+	cmp		LEN, #0x40
+	b.lt		less_64
+
+	ldr		qCONSTANT, [x3]
+
+loop_64:		/* 64 bytes Full cache line folding */
+	sub		LEN, LEN, #0x40
+
+	pmull2		v5.1q, v1.2d, vCONSTANT.2d
+	pmull2		v6.1q, v2.2d, vCONSTANT.2d
+	pmull2		v7.1q, v3.2d, vCONSTANT.2d
+	pmull2		v8.1q, v4.2d, vCONSTANT.2d
+
+	pmull		v1.1q, v1.1d, vCONSTANT.1d
+	pmull		v2.1q, v2.1d, vCONSTANT.1d
+	pmull		v3.1q, v3.1d, vCONSTANT.1d
+	pmull		v4.1q, v4.1d, vCONSTANT.1d
+
+	eor		v1.16b, v1.16b, v5.16b
+	ld1		{v5.16b}, [BUF], #0x10
+	eor		v2.16b, v2.16b, v6.16b
+	ld1		{v6.16b}, [BUF], #0x10
+	eor		v3.16b, v3.16b, v7.16b
+	ld1		{v7.16b}, [BUF], #0x10
+	eor		v4.16b, v4.16b, v8.16b
+	ld1		{v8.16b}, [BUF], #0x10
+
+	eor		v1.16b, v1.16b, v5.16b
+	eor		v2.16b, v2.16b, v6.16b
+	eor		v3.16b, v3.16b, v7.16b
+	eor		v4.16b, v4.16b, v8.16b
+
+	cmp		LEN, #0x40
+	b.ge		loop_64
+
+less_64:		/* Folding cache line into 128bit */
+	ldr		qCONSTANT, [x3, #16]
+
+	pmull2		v5.1q, v1.2d, vCONSTANT.2d
+	pmull		v1.1q, v1.1d, vCONSTANT.1d
+	eor		v1.16b, v1.16b, v5.16b
+	eor		v1.16b, v1.16b, v2.16b
+
+	pmull2		v5.1q, v1.2d, vCONSTANT.2d
+	pmull		v1.1q, v1.1d, vCONSTANT.1d
+	eor		v1.16b, v1.16b, v5.16b
+	eor		v1.16b, v1.16b, v3.16b
+
+	pmull2		v5.1q, v1.2d, vCONSTANT.2d
+	pmull		v1.1q, v1.1d, vCONSTANT.1d
+	eor		v1.16b, v1.16b, v5.16b
+	eor		v1.16b, v1.16b, v4.16b
+
+	cbz		LEN, fold_64
+
+loop_16:		/* Folding rest buffer into 128bit */
+	subs		LEN, LEN, #0x10
+
+	ld1		{v2.16b}, [BUF], #0x10
+	pmull2		v5.1q, v1.2d, vCONSTANT.2d
+	pmull		v1.1q, v1.1d, vCONSTANT.1d
+	eor		v1.16b, v1.16b, v5.16b
+	eor		v1.16b, v1.16b, v2.16b
+
+	b.ne		loop_16
+
+fold_64:
+	/* perform the last 64 bit fold, also adds 32 zeroes
+	 * to the input stream */
+	ext		v2.16b, v1.16b, v1.16b, #8
+	pmull2		v2.1q, v2.2d, vCONSTANT.2d
+	ext		v1.16b, v1.16b, vzr.16b, #8
+	eor		v1.16b, v1.16b, v2.16b
+
+	/* final 32-bit fold */
+	ldr		dCONSTANT, [x3, #32]
+	ldr		d3, [x3, #40]
+
+	ext		v2.16b, v1.16b, vzr.16b, #4
+	and		v1.16b, v1.16b, v3.16b
+	pmull		v1.1q, v1.1d, vCONSTANT.1d
+	eor		v1.16b, v1.16b, v2.16b
+
+	/* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */
+	ldr		qCONSTANT, [x3, #48]
+
+	and		v2.16b, v1.16b, v3.16b
+	ext		v2.16b, vzr.16b, v2.16b, #8
+	pmull2		v2.1q, v2.2d, vCONSTANT.2d
+	and		v2.16b, v2.16b, v3.16b
+	pmull		v2.1q, v2.1d, vCONSTANT.1d
+	eor		v1.16b, v1.16b, v2.16b
+	mov		w0, v1.s[1]
+
+	ret
+ENDPROC(crc32_pmull_le)
+ENDPROC(crc32c_pmull_le)
+
+	.macro		__crc32, c
+0:	subs		x2, x2, #16
+	b.mi		8f
+	ldp		x3, x4, [x1], #16
+CPU_BE(	rev		x3, x3		)
+CPU_BE(	rev		x4, x4		)
+	crc32\c\()x	w0, w0, x3
+	crc32\c\()x	w0, w0, x4
+	b.ne		0b
+	ret
+
+8:	tbz		x2, #3, 4f
+	ldr		x3, [x1], #8
+CPU_BE(	rev		x3, x3		)
+	crc32\c\()x	w0, w0, x3
+4:	tbz		x2, #2, 2f
+	ldr		w3, [x1], #4
+CPU_BE(	rev		w3, w3		)
+	crc32\c\()w	w0, w0, w3
+2:	tbz		x2, #1, 1f
+	ldrh		w3, [x1], #2
+CPU_BE(	rev16		w3, w3		)
+	crc32\c\()h	w0, w0, w3
+1:	tbz		x2, #0, 0f
+	ldrb		w3, [x1]
+	crc32\c\()b	w0, w0, w3
+0:	ret
+	.endm
+
+	.align		5
+ENTRY(crc32_armv8_le)
+	__crc32
+ENDPROC(crc32_armv8_le)
+
+	.align		5
+ENTRY(crc32c_armv8_le)
+	__crc32		c
+ENDPROC(crc32c_armv8_le)
diff --git a/arch/arm64/crypto/crc32-ce-glue.c b/arch/arm64/crypto/crc32-ce-glue.c
new file mode 100644
index 000000000000..8594127d5e01
--- /dev/null
+++ b/arch/arm64/crypto/crc32-ce-glue.c
@@ -0,0 +1,212 @@
+/*
+ * Accelerated CRC32(C) using arm64 NEON and Crypto Extensions instructions
+ *
+ * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/cpufeature.h>
+#include <linux/crc32.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
+
+#include <crypto/internal/hash.h>
+
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+#include <asm/unaligned.h>
+
+#define PMULL_MIN_LEN		64L	/* minimum size of buffer
+					 * for crc32_pmull_le_16 */
+#define SCALE_F			16L	/* size of NEON register */
+
+asmlinkage u32 crc32_pmull_le(const u8 buf[], u64 len, u32 init_crc);
+asmlinkage u32 crc32_armv8_le(u32 init_crc, const u8 buf[], size_t len);
+
+asmlinkage u32 crc32c_pmull_le(const u8 buf[], u64 len, u32 init_crc);
+asmlinkage u32 crc32c_armv8_le(u32 init_crc, const u8 buf[], size_t len);
+
+static u32 (*fallback_crc32)(u32 init_crc, const u8 buf[], size_t len);
+static u32 (*fallback_crc32c)(u32 init_crc, const u8 buf[], size_t len);
+
+static int crc32_pmull_cra_init(struct crypto_tfm *tfm)
+{
+	u32 *key = crypto_tfm_ctx(tfm);
+
+	*key = 0;
+	return 0;
+}
+
+static int crc32c_pmull_cra_init(struct crypto_tfm *tfm)
+{
+	u32 *key = crypto_tfm_ctx(tfm);
+
+	*key = ~0;
+	return 0;
+}
+
+static int crc32_pmull_setkey(struct crypto_shash *hash, const u8 *key,
+			      unsigned int keylen)
+{
+	u32 *mctx = crypto_shash_ctx(hash);
+
+	if (keylen != sizeof(u32)) {
+		crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN);
+		return -EINVAL;
+	}
+	*mctx = le32_to_cpup((__le32 *)key);
+	return 0;
+}
+
+static int crc32_pmull_init(struct shash_desc *desc)
+{
+	u32 *mctx = crypto_shash_ctx(desc->tfm);
+	u32 *crc = shash_desc_ctx(desc);
+
+	*crc = *mctx;
+	return 0;
+}
+
+static int crc32_pmull_update(struct shash_desc *desc, const u8 *data,
+			 unsigned int length)
+{
+	u32 *crc = shash_desc_ctx(desc);
+	unsigned int l;
+
+	if ((u64)data % SCALE_F) {
+		l = min_t(u32, length, SCALE_F - ((u64)data % SCALE_F));
+
+		*crc = fallback_crc32(*crc, data, l);
+
+		data += l;
+		length -= l;
+	}
+
+	if (length >= PMULL_MIN_LEN) {
+		l = round_down(length, SCALE_F);
+
+		kernel_neon_begin_partial(10);
+		*crc = crc32_pmull_le(data, l, *crc);
+		kernel_neon_end();
+
+		data += l;
+		length -= l;
+	}
+
+	if (length > 0)
+		*crc = fallback_crc32(*crc, data, length);
+
+	return 0;
+}
+
+static int crc32c_pmull_update(struct shash_desc *desc, const u8 *data,
+			 unsigned int length)
+{
+	u32 *crc = shash_desc_ctx(desc);
+	unsigned int l;
+
+	if ((u64)data % SCALE_F) {
+		l = min_t(u32, length, SCALE_F - ((u64)data % SCALE_F));
+
+		*crc = fallback_crc32c(*crc, data, l);
+
+		data += l;
+		length -= l;
+	}
+
+	if (length >= PMULL_MIN_LEN) {
+		l = round_down(length, SCALE_F);
+
+		kernel_neon_begin_partial(10);
+		*crc = crc32c_pmull_le(data, l, *crc);
+		kernel_neon_end();
+
+		data += l;
+		length -= l;
+	}
+
+	if (length > 0) {
+		*crc = fallback_crc32c(*crc, data, length);
+	}
+
+	return 0;
+}
+
+static int crc32_pmull_final(struct shash_desc *desc, u8 *out)
+{
+	u32 *crc = shash_desc_ctx(desc);
+
+	put_unaligned_le32(*crc, out);
+	return 0;
+}
+
+static int crc32c_pmull_final(struct shash_desc *desc, u8 *out)
+{
+	u32 *crc = shash_desc_ctx(desc);
+
+	put_unaligned_le32(~*crc, out);
+	return 0;
+}
+
+static struct shash_alg crc32_pmull_algs[] = { {
+	.setkey			= crc32_pmull_setkey,
+	.init			= crc32_pmull_init,
+	.update			= crc32_pmull_update,
+	.final			= crc32_pmull_final,
+	.descsize		= sizeof(u32),
+	.digestsize		= sizeof(u32),
+
+	.base.cra_ctxsize	= sizeof(u32),
+	.base.cra_init		= crc32_pmull_cra_init,
+	.base.cra_name		= "crc32",
+	.base.cra_driver_name	= "crc32-arm64-ce",
+	.base.cra_priority	= 200,
+	.base.cra_blocksize	= 1,
+	.base.cra_module	= THIS_MODULE,
+}, {
+	.setkey			= crc32_pmull_setkey,
+	.init			= crc32_pmull_init,
+	.update			= crc32c_pmull_update,
+	.final			= crc32c_pmull_final,
+	.descsize		= sizeof(u32),
+	.digestsize		= sizeof(u32),
+
+	.base.cra_ctxsize	= sizeof(u32),
+	.base.cra_init		= crc32c_pmull_cra_init,
+	.base.cra_name		= "crc32c",
+	.base.cra_driver_name	= "crc32c-arm64-ce",
+	.base.cra_priority	= 200,
+	.base.cra_blocksize	= 1,
+	.base.cra_module	= THIS_MODULE,
+} };
+
+static int __init crc32_pmull_mod_init(void)
+{
+	if (elf_hwcap & HWCAP_CRC32) {
+		fallback_crc32 = crc32_armv8_le;
+		fallback_crc32c = crc32c_armv8_le;
+	} else {
+		fallback_crc32 = crc32_le;
+		fallback_crc32c = __crc32c_le;
+	}
+
+	return crypto_register_shashes(crc32_pmull_algs,
+				       ARRAY_SIZE(crc32_pmull_algs));
+}
+
+static void __exit crc32_pmull_mod_exit(void)
+{
+	crypto_unregister_shashes(crc32_pmull_algs,
+				  ARRAY_SIZE(crc32_pmull_algs));
+}
+
+module_cpu_feature_match(PMULL, crc32_pmull_mod_init);
+module_exit(crc32_pmull_mod_exit);
+
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
-- 
2.7.4

^ permalink raw reply related

* [PATCH v3 4/6] crypto: arm/crct10dif - port x86 SSE implementation to ARM
From: Ard Biesheuvel @ 2016-12-05 18:42 UTC (permalink / raw)
  To: linux-crypto, herbert; +Cc: linux-arm-kernel, Ard Biesheuvel
In-Reply-To: <1480963348-24203-1-git-send-email-ard.biesheuvel@linaro.org>

This is a transliteration of the Intel algorithm implemented
using SSE and PCLMULQDQ instructions that resides in the file
arch/x86/crypto/crct10dif-pcl-asm_64.S, but simplified to only
operate on buffers that are 16 byte aligned (but of any size)

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
 arch/arm/crypto/Kconfig             |   5 +
 arch/arm/crypto/Makefile            |   2 +
 arch/arm/crypto/crct10dif-ce-core.S | 427 ++++++++++++++++++++
 arch/arm/crypto/crct10dif-ce-glue.c | 101 +++++
 4 files changed, 535 insertions(+)

diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig
index 27ed1b1cd1d7..fce801fa52a1 100644
--- a/arch/arm/crypto/Kconfig
+++ b/arch/arm/crypto/Kconfig
@@ -120,4 +120,9 @@ config CRYPTO_GHASH_ARM_CE
 	  that uses the 64x64 to 128 bit polynomial multiplication (vmull.p64)
 	  that is part of the ARMv8 Crypto Extensions
 
+config CRYPTO_CRCT10DIF_ARM_CE
+	tristate "CRCT10DIF digest algorithm using PMULL instructions"
+	depends on KERNEL_MODE_NEON && CRC_T10DIF
+	select CRYPTO_HASH
+
 endif
diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
index fc5150702b64..fc77265014b7 100644
--- a/arch/arm/crypto/Makefile
+++ b/arch/arm/crypto/Makefile
@@ -13,6 +13,7 @@ ce-obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
 ce-obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o
 ce-obj-$(CONFIG_CRYPTO_SHA2_ARM_CE) += sha2-arm-ce.o
 ce-obj-$(CONFIG_CRYPTO_GHASH_ARM_CE) += ghash-arm-ce.o
+ce-obj-$(CONFIG_CRYPTO_CRCT10DIF_ARM_CE) += crct10dif-arm-ce.o
 
 ifneq ($(ce-obj-y)$(ce-obj-m),)
 ifeq ($(call as-instr,.fpu crypto-neon-fp-armv8,y,n),y)
@@ -36,6 +37,7 @@ sha1-arm-ce-y	:= sha1-ce-core.o sha1-ce-glue.o
 sha2-arm-ce-y	:= sha2-ce-core.o sha2-ce-glue.o
 aes-arm-ce-y	:= aes-ce-core.o aes-ce-glue.o
 ghash-arm-ce-y	:= ghash-ce-core.o ghash-ce-glue.o
+crct10dif-arm-ce-y	:= crct10dif-ce-core.o crct10dif-ce-glue.o
 
 quiet_cmd_perl = PERL    $@
       cmd_perl = $(PERL) $(<) > $(@)
diff --git a/arch/arm/crypto/crct10dif-ce-core.S b/arch/arm/crypto/crct10dif-ce-core.S
new file mode 100644
index 000000000000..ce45ba0c0687
--- /dev/null
+++ b/arch/arm/crypto/crct10dif-ce-core.S
@@ -0,0 +1,427 @@
+//
+// Accelerated CRC-T10DIF using ARM NEON and Crypto Extensions instructions
+//
+// Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License version 2 as
+// published by the Free Software Foundation.
+//
+
+//
+// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
+//
+// Copyright (c) 2013, Intel Corporation
+//
+// Authors:
+//     Erdinc Ozturk <erdinc.ozturk@intel.com>
+//     Vinodh Gopal <vinodh.gopal@intel.com>
+//     James Guilford <james.guilford@intel.com>
+//     Tim Chen <tim.c.chen@linux.intel.com>
+//
+// This software is available to you under a choice of one of two
+// licenses.  You may choose to be licensed under the terms of the GNU
+// General Public License (GPL) Version 2, available from the file
+// COPYING in the main directory of this source tree, or the
+// OpenIB.org BSD license below:
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the
+//   distribution.
+//
+// * Neither the name of the Intel Corporation nor the names of its
+//   contributors may be used to endorse or promote products derived from
+//   this software without specific prior written permission.
+//
+//
+// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+//       Function API:
+//       UINT16 crc_t10dif_pcl(
+//               UINT16 init_crc, //initial CRC value, 16 bits
+//               const unsigned char *buf, //buffer pointer to calculate CRC on
+//               UINT64 len //buffer length in bytes (64-bit data)
+//       );
+//
+//       Reference paper titled "Fast CRC Computation for Generic
+//	Polynomials Using PCLMULQDQ Instruction"
+//       URL: http://www.intel.com/content/dam/www/public/us/en/documents
+//  /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
+//
+//
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+#ifdef CONFIG_CPU_ENDIAN_BE8
+#define CPU_LE(code...)
+#else
+#define CPU_LE(code...)		code
+#endif
+
+	.text
+	.fpu		crypto-neon-fp-armv8
+
+	arg1_low32	.req	r0
+	arg2		.req	r1
+	arg3		.req	r2
+
+	qzr		.req	q13
+
+	q0l		.req	d0
+	q0h		.req	d1
+	q1l		.req	d2
+	q1h		.req	d3
+	q2l		.req	d4
+	q2h		.req	d5
+	q3l		.req	d6
+	q3h		.req	d7
+	q4l		.req	d8
+	q4h		.req	d9
+	q5l		.req	d10
+	q5h		.req	d11
+	q6l		.req	d12
+	q6h		.req	d13
+	q7l		.req	d14
+	q7h		.req	d15
+
+ENTRY(crc_t10dif_pmull)
+	vmov.i8		qzr, #0			// init zero register
+
+	// adjust the 16-bit initial_crc value, scale it to 32 bits
+	lsl		arg1_low32, arg1_low32, #16
+
+	// check if smaller than 256
+	cmp		arg3, #256
+
+	// for sizes less than 128, we can't fold 64B at a time...
+	blt		_less_than_128
+
+	// load the initial crc value
+	// crc value does not need to be byte-reflected, but it needs
+	// to be moved to the high part of the register.
+	// because data will be byte-reflected and will align with
+	// initial crc at correct place.
+	vmov		s0, arg1_low32		// initial crc
+	vext.8		q10, qzr, q0, #4
+
+	// receive the initial 64B data, xor the initial crc value
+	vld1.64		{q0-q1}, [arg2, :128]!
+	vld1.64		{q2-q3}, [arg2, :128]!
+	vld1.64		{q4-q5}, [arg2, :128]!
+	vld1.64		{q6-q7}, [arg2, :128]!
+CPU_LE(	vrev64.8	q0, q0			)
+CPU_LE(	vrev64.8	q1, q1			)
+CPU_LE(	vrev64.8	q2, q2			)
+CPU_LE(	vrev64.8	q3, q3			)
+CPU_LE(	vrev64.8	q4, q4			)
+CPU_LE(	vrev64.8	q5, q5			)
+CPU_LE(	vrev64.8	q6, q6			)
+CPU_LE(	vrev64.8	q7, q7			)
+
+	vswp		d0, d1
+	vswp		d2, d3
+	vswp		d4, d5
+	vswp		d6, d7
+	vswp		d8, d9
+	vswp		d10, d11
+	vswp		d12, d13
+	vswp		d14, d15
+
+	// XOR the initial_crc value
+	veor.8		q0, q0, q10
+
+	adr		ip, rk3
+	vld1.64		{q10}, [ip, :128]	// xmm10 has rk3 and rk4
+
+	//
+	// we subtract 256 instead of 128 to save one instruction from the loop
+	//
+	sub		arg3, arg3, #256
+
+	// at this section of the code, there is 64*x+y (0<=y<64) bytes of
+	// buffer. The _fold_64_B_loop will fold 64B at a time
+	// until we have 64+y Bytes of buffer
+
+
+	// fold 64B at a time. This section of the code folds 4 vector
+	// registers in parallel
+_fold_64_B_loop:
+
+	.macro		fold64, reg1, reg2
+	vld1.64		{q11-q12}, [arg2, :128]!
+
+	vmull.p64	q8, \reg1\()h, d21
+	vmull.p64	\reg1, \reg1\()l, d20
+	vmull.p64	q9, \reg2\()h, d21
+	vmull.p64	\reg2, \reg2\()l, d20
+
+CPU_LE(	vrev64.8	q11, q11		)
+CPU_LE(	vrev64.8	q12, q12		)
+	vswp		d22, d23
+	vswp		d24, d25
+
+	veor.8		\reg1, \reg1, q8
+	veor.8		\reg2, \reg2, q9
+	veor.8		\reg1, \reg1, q11
+	veor.8		\reg2, \reg2, q12
+	.endm
+
+	fold64		q0, q1
+	fold64		q2, q3
+	fold64		q4, q5
+	fold64		q6, q7
+
+	subs		arg3, arg3, #128
+
+	// check if there is another 64B in the buffer to be able to fold
+	bge		_fold_64_B_loop
+
+	// at this point, the buffer pointer is pointing at the last y Bytes
+	// of the buffer the 64B of folded data is in 4 of the vector
+	// registers: v0, v1, v2, v3
+
+	// fold the 8 vector registers to 1 vector register with different
+	// constants
+
+	adr		ip, rk9
+	vld1.64		{q10}, [ip, :128]!
+
+	.macro		fold16, reg, rk
+	vmull.p64	q8, \reg\()l, d20
+	vmull.p64	\reg, \reg\()h, d21
+	.ifnb		\rk
+	vld1.64		{q10}, [ip, :128]!
+	.endif
+	veor.8		q7, q7, q8
+	veor.8		q7, q7, \reg
+	.endm
+
+	fold16		q0, rk11
+	fold16		q1, rk13
+	fold16		q2, rk15
+	fold16		q3, rk17
+	fold16		q4, rk19
+	fold16		q5, rk1
+	fold16		q6
+
+	// instead of 64, we add 48 to the loop counter to save 1 instruction
+	// from the loop instead of a cmp instruction, we use the negative
+	// flag with the jl instruction
+	adds		arg3, arg3, #(128-16)
+	blt		_final_reduction_for_128
+
+	// now we have 16+y bytes left to reduce. 16 Bytes is in register v7
+	// and the rest is in memory. We can fold 16 bytes at a time if y>=16
+	// continue folding 16B at a time
+
+_16B_reduction_loop:
+	vmull.p64	q8, d14, d20
+	vmull.p64	q7, d15, d21
+	veor.8		q7, q7, q8
+
+	vld1.64		{q0}, [arg2, :128]!
+CPU_LE(	vrev64.8	q0, q0		)
+	vswp		d0, d1
+	veor.8		q7, q7, q0
+	subs		arg3, arg3, #16
+
+	// instead of a cmp instruction, we utilize the flags with the
+	// jge instruction equivalent of: cmp arg3, 16-16
+	// check if there is any more 16B in the buffer to be able to fold
+	bge		_16B_reduction_loop
+
+	// now we have 16+z bytes left to reduce, where 0<= z < 16.
+	// first, we reduce the data in the xmm7 register
+
+_final_reduction_for_128:
+	// check if any more data to fold. If not, compute the CRC of
+	// the final 128 bits
+	adds		arg3, arg3, #16
+	beq		_128_done
+
+	// here we are getting data that is less than 16 bytes.
+	// since we know that there was data before the pointer, we can
+	// offset the input pointer before the actual point, to receive
+	// exactly 16 bytes. after that the registers need to be adjusted.
+_get_last_two_regs:
+	add		arg2, arg2, arg3
+	sub		arg2, arg2, #16
+	vld1.64		{q1}, [arg2]
+CPU_LE(	vrev64.8	q1, q1			)
+	vswp		d2, d3
+
+	// get rid of the extra data that was loaded before
+	// load the shift constant
+	adr		ip, tbl_shf_table + 16
+	sub		ip, ip, arg3
+	vld1.8		{q0}, [ip]
+
+	// shift v2 to the left by arg3 bytes
+	vtbl.8		d4, {d14-d15}, d0
+	vtbl.8		d5, {d14-d15}, d1
+
+	// shift v7 to the right by 16-arg3 bytes
+	vmov.i8		q9, #0x80
+	veor.8		q0, q0, q9
+	vtbl.8		d18, {d14-d15}, d0
+	vtbl.8		d19, {d14-d15}, d1
+
+	// blend
+	vshr.s8		q0, q0, #7		// convert to 8-bit mask
+	vbsl.8		q0, q2, q1
+
+	// fold 16 Bytes
+	vmull.p64	q8, d18, d20
+	vmull.p64	q7, d19, d21
+	veor.8		q7, q7, q8
+	veor.8		q7, q7, q0
+
+_128_done:
+	// compute crc of a 128-bit value
+	vldr		d20, rk5
+	vldr		d21, rk6		// rk5 and rk6 in xmm10
+
+	// 64b fold
+	vext.8		q0, qzr, q7, #8
+	vmull.p64	q7, d15, d20
+	veor.8		q7, q7, q0
+
+	// 32b fold
+	vext.8		q0, q7, qzr, #12
+	vmov		s31, s3
+	vmull.p64	q0, d0, d21
+	veor.8		q7, q0, q7
+
+	// barrett reduction
+_barrett:
+	vldr		d20, rk7
+	vldr		d21, rk8
+
+	vmull.p64	q0, d15, d20
+	vext.8		q0, qzr, q0, #12
+	vmull.p64	q0, d1, d21
+	vext.8		q0, qzr, q0, #12
+	veor.8		q7, q7, q0
+	vmov		r0, s29
+
+_cleanup:
+	// scale the result back to 16 bits
+	lsr		r0, r0, #16
+	bx		lr
+
+_less_than_128:
+	teq		arg3, #0
+	beq		_cleanup
+
+	vmov.i8		q0, #0
+	vmov		s3, arg1_low32		// get the initial crc value
+
+	vld1.64		{q7}, [arg2, :128]!
+CPU_LE(	vrev64.8	q7, q7		)
+	vswp		d14, d15
+	veor.8		q7, q7, q0
+
+	cmp		arg3, #16
+	beq		_128_done		// exactly 16 left
+	blt		_less_than_16_left
+
+	// now if there is, load the constants
+	vldr		d20, rk1
+	vldr		d21, rk2		// rk1 and rk2 in xmm10
+
+	// check if there is enough buffer to be able to fold 16B at a time
+	subs		arg3, arg3, #32
+	addlt		arg3, arg3, #16
+	blt		_get_last_two_regs
+	b		_16B_reduction_loop
+
+_less_than_16_left:
+	// shl r9, 4
+	adr		ip, tbl_shf_table + 16
+	sub		ip, ip, arg3
+	vld1.8		{q0}, [ip]
+	vmov.i8		q9, #0x80
+	veor.8		q0, q0, q9
+	vtbl.8		d18, {d14-d15}, d0
+	vtbl.8		d15, {d14-d15}, d1
+	vmov		d14, d18
+	b		_128_done
+ENDPROC(crc_t10dif_pmull)
+
+// precomputed constants
+// these constants are precomputed from the poly:
+// 0x8bb70000 (0x8bb7 scaled to 32 bits)
+	.align		4
+// Q = 0x18BB70000
+// rk1 = 2^(32*3) mod Q << 32
+// rk2 = 2^(32*5) mod Q << 32
+// rk3 = 2^(32*15) mod Q << 32
+// rk4 = 2^(32*17) mod Q << 32
+// rk5 = 2^(32*3) mod Q << 32
+// rk6 = 2^(32*2) mod Q << 32
+// rk7 = floor(2^64/Q)
+// rk8 = Q
+
+rk3:	.quad		0x9d9d000000000000
+rk4:	.quad		0x7cf5000000000000
+rk5:	.quad		0x2d56000000000000
+rk6:	.quad		0x1368000000000000
+rk7:	.quad		0x00000001f65a57f8
+rk8:	.quad		0x000000018bb70000
+rk9:	.quad		0xceae000000000000
+rk10:	.quad		0xbfd6000000000000
+rk11:	.quad		0x1e16000000000000
+rk12:	.quad		0x713c000000000000
+rk13:	.quad		0xf7f9000000000000
+rk14:	.quad		0x80a6000000000000
+rk15:	.quad		0x044c000000000000
+rk16:	.quad		0xe658000000000000
+rk17:	.quad		0xad18000000000000
+rk18:	.quad		0xa497000000000000
+rk19:	.quad		0x6ee3000000000000
+rk20:	.quad		0xe7b5000000000000
+rk1:	.quad		0x2d56000000000000
+rk2:	.quad		0x06df000000000000
+
+tbl_shf_table:
+// use these values for shift constants for the tbl/tbx instruction
+// different alignments result in values as shown:
+//	DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1
+//	DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-3) / shr2
+//	DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-4) / shr3
+//	DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4
+//	DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5
+//	DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6
+//	DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9  (16-7) / shr7
+//	DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8  (16-8) / shr8
+//	DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7  (16-9) / shr9
+//	DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6  (16-10) / shr10
+//	DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5  (16-11) / shr11
+//	DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4  (16-12) / shr12
+//	DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3  (16-13) / shr13
+//	DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2  (16-14) / shr14
+//	DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1  (16-15) / shr15
+
+	.byte		 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
+	.byte		0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
+	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
+	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe , 0x0
diff --git a/arch/arm/crypto/crct10dif-ce-glue.c b/arch/arm/crypto/crct10dif-ce-glue.c
new file mode 100644
index 000000000000..d428355cf38d
--- /dev/null
+++ b/arch/arm/crypto/crct10dif-ce-glue.c
@@ -0,0 +1,101 @@
+/*
+ * Accelerated CRC-T10DIF using ARM NEON and Crypto Extensions instructions
+ *
+ * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/crc-t10dif.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
+
+#include <crypto/internal/hash.h>
+
+#include <asm/neon.h>
+#include <asm/simd.h>
+
+#define CRC_T10DIF_PMULL_CHUNK_SIZE	16U
+
+asmlinkage u16 crc_t10dif_pmull(u16 init_crc, const u8 buf[], u32 len);
+
+static int crct10dif_init(struct shash_desc *desc)
+{
+	u16 *crc = shash_desc_ctx(desc);
+
+	*crc = 0;
+	return 0;
+}
+
+static int crct10dif_update(struct shash_desc *desc, const u8 *data,
+			    unsigned int length)
+{
+	u16 *crc = shash_desc_ctx(desc);
+	unsigned int l;
+
+	if (!may_use_simd()) {
+		*crc = crc_t10dif_generic(*crc, data, length);
+	} else {
+		if (unlikely((u32)data % CRC_T10DIF_PMULL_CHUNK_SIZE)) {
+			l = min_t(u32, length, CRC_T10DIF_PMULL_CHUNK_SIZE -
+				  ((u32)data % CRC_T10DIF_PMULL_CHUNK_SIZE));
+
+			*crc = crc_t10dif_generic(*crc, data, l);
+
+			length -= l;
+			data += l;
+		}
+		if (length > 0) {
+			kernel_neon_begin();
+			*crc = crc_t10dif_pmull(*crc, data, length);
+			kernel_neon_end();
+		}
+	}
+	return 0;
+}
+
+static int crct10dif_final(struct shash_desc *desc, u8 *out)
+{
+	u16 *crc = shash_desc_ctx(desc);
+
+	*(u16 *)out = *crc;
+	return 0;
+}
+
+static struct shash_alg crc_t10dif_alg = {
+	.digestsize		= CRC_T10DIF_DIGEST_SIZE,
+	.init			= crct10dif_init,
+	.update			= crct10dif_update,
+	.final			= crct10dif_final,
+	.descsize		= CRC_T10DIF_DIGEST_SIZE,
+
+	.base.cra_name		= "crct10dif",
+	.base.cra_driver_name	= "crct10dif-arm-ce",
+	.base.cra_priority	= 200,
+	.base.cra_blocksize	= CRC_T10DIF_BLOCK_SIZE,
+	.base.cra_module	= THIS_MODULE,
+};
+
+static int __init crc_t10dif_mod_init(void)
+{
+	if (!(elf_hwcap2 & HWCAP2_PMULL))
+		return -ENODEV;
+
+	return crypto_register_shash(&crc_t10dif_alg);
+}
+
+static void __exit crc_t10dif_mod_exit(void)
+{
+	crypto_unregister_shash(&crc_t10dif_alg);
+}
+
+module_init(crc_t10dif_mod_init);
+module_exit(crc_t10dif_mod_exit);
+
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_CRYPTO("crct10dif");
-- 
2.7.4

^ permalink raw reply related

* [PATCH v3 3/6] crypto: arm64/crct10dif - port x86 SSE implementation to arm64
From: Ard Biesheuvel @ 2016-12-05 18:42 UTC (permalink / raw)
  To: linux-crypto, herbert; +Cc: linux-arm-kernel, Ard Biesheuvel
In-Reply-To: <1480963348-24203-1-git-send-email-ard.biesheuvel@linaro.org>

This is a transliteration of the Intel algorithm implemented
using SSE and PCLMULQDQ instructions that resides in the file
arch/x86/crypto/crct10dif-pcl-asm_64.S, but simplified to only
operate on buffers that are 16 byte aligned (but of any size)

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
 arch/arm64/crypto/Kconfig             |   5 +
 arch/arm64/crypto/Makefile            |   3 +
 arch/arm64/crypto/crct10dif-ce-core.S | 392 ++++++++++++++++++++
 arch/arm64/crypto/crct10dif-ce-glue.c |  95 +++++
 4 files changed, 495 insertions(+)

diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
index 2cf32e9887e1..d773c0659202 100644
--- a/arch/arm64/crypto/Kconfig
+++ b/arch/arm64/crypto/Kconfig
@@ -23,6 +23,11 @@ config CRYPTO_GHASH_ARM64_CE
 	depends on ARM64 && KERNEL_MODE_NEON
 	select CRYPTO_HASH
 
+config CRYPTO_CRCT10DIF_ARM64_CE
+	tristate "CRCT10DIF digest algorithm using PMULL instructions"
+	depends on KERNEL_MODE_NEON && CRC_T10DIF
+	select CRYPTO_HASH
+
 config CRYPTO_AES_ARM64_CE
 	tristate "AES core cipher using ARMv8 Crypto Extensions"
 	depends on ARM64 && KERNEL_MODE_NEON
diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile
index abb79b3cfcfe..36fd3eb4201b 100644
--- a/arch/arm64/crypto/Makefile
+++ b/arch/arm64/crypto/Makefile
@@ -17,6 +17,9 @@ sha2-ce-y := sha2-ce-glue.o sha2-ce-core.o
 obj-$(CONFIG_CRYPTO_GHASH_ARM64_CE) += ghash-ce.o
 ghash-ce-y := ghash-ce-glue.o ghash-ce-core.o
 
+obj-$(CONFIG_CRYPTO_CRCT10DIF_ARM64_CE) += crct10dif-ce.o
+crct10dif-ce-y := crct10dif-ce-core.o crct10dif-ce-glue.o
+
 obj-$(CONFIG_CRYPTO_AES_ARM64_CE) += aes-ce-cipher.o
 CFLAGS_aes-ce-cipher.o += -march=armv8-a+crypto
 
diff --git a/arch/arm64/crypto/crct10dif-ce-core.S b/arch/arm64/crypto/crct10dif-ce-core.S
new file mode 100644
index 000000000000..d5b5a8c038c8
--- /dev/null
+++ b/arch/arm64/crypto/crct10dif-ce-core.S
@@ -0,0 +1,392 @@
+//
+// Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
+//
+// Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License version 2 as
+// published by the Free Software Foundation.
+//
+
+//
+// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
+//
+// Copyright (c) 2013, Intel Corporation
+//
+// Authors:
+//     Erdinc Ozturk <erdinc.ozturk@intel.com>
+//     Vinodh Gopal <vinodh.gopal@intel.com>
+//     James Guilford <james.guilford@intel.com>
+//     Tim Chen <tim.c.chen@linux.intel.com>
+//
+// This software is available to you under a choice of one of two
+// licenses.  You may choose to be licensed under the terms of the GNU
+// General Public License (GPL) Version 2, available from the file
+// COPYING in the main directory of this source tree, or the
+// OpenIB.org BSD license below:
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the
+//   distribution.
+//
+// * Neither the name of the Intel Corporation nor the names of its
+//   contributors may be used to endorse or promote products derived from
+//   this software without specific prior written permission.
+//
+//
+// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+//       Function API:
+//       UINT16 crc_t10dif_pcl(
+//               UINT16 init_crc, //initial CRC value, 16 bits
+//               const unsigned char *buf, //buffer pointer to calculate CRC on
+//               UINT64 len //buffer length in bytes (64-bit data)
+//       );
+//
+//       Reference paper titled "Fast CRC Computation for Generic
+//	Polynomials Using PCLMULQDQ Instruction"
+//       URL: http://www.intel.com/content/dam/www/public/us/en/documents
+//  /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
+//
+//
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+	.text
+	.cpu		generic+crypto
+
+	arg1_low32	.req	w0
+	arg2		.req	x1
+	arg3		.req	x2
+
+	vzr		.req	v13
+
+ENTRY(crc_t10dif_pmull)
+	movi		vzr.16b, #0		// init zero register
+
+	// adjust the 16-bit initial_crc value, scale it to 32 bits
+	lsl		arg1_low32, arg1_low32, #16
+
+	// check if smaller than 256
+	cmp		arg3, #256
+
+	// for sizes less than 128, we can't fold 64B at a time...
+	b.lt		_less_than_128
+
+	// load the initial crc value
+	// crc value does not need to be byte-reflected, but it needs
+	// to be moved to the high part of the register.
+	// because data will be byte-reflected and will align with
+	// initial crc at correct place.
+	movi		v10.16b, #0
+	mov		v10.s[3], arg1_low32		// initial crc
+
+	// receive the initial 64B data, xor the initial crc value
+	ldp		q0, q1, [arg2]
+	ldp		q2, q3, [arg2, #0x20]
+	ldp		q4, q5, [arg2, #0x40]
+	ldp		q6, q7, [arg2, #0x60]
+	add		arg2, arg2, #0x80
+
+CPU_LE(	rev64		v0.16b, v0.16b			)
+CPU_LE(	rev64		v1.16b, v1.16b			)
+CPU_LE(	rev64		v2.16b, v2.16b			)
+CPU_LE(	rev64		v3.16b, v3.16b			)
+CPU_LE(	rev64		v4.16b, v4.16b			)
+CPU_LE(	rev64		v5.16b, v5.16b			)
+CPU_LE(	rev64		v6.16b, v6.16b			)
+CPU_LE(	rev64		v7.16b, v7.16b			)
+
+CPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)
+CPU_LE(	ext		v1.16b, v1.16b, v1.16b, #8	)
+CPU_LE(	ext		v2.16b, v2.16b, v2.16b, #8	)
+CPU_LE(	ext		v3.16b, v3.16b, v3.16b, #8	)
+CPU_LE(	ext		v4.16b, v4.16b, v4.16b, #8	)
+CPU_LE(	ext		v5.16b, v5.16b, v5.16b, #8	)
+CPU_LE(	ext		v6.16b, v6.16b, v6.16b, #8	)
+CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)
+
+	// XOR the initial_crc value
+	eor		v0.16b, v0.16b, v10.16b
+
+	ldr		q10, rk3	// xmm10 has rk3 and rk4
+					// type of pmull instruction
+					// will determine which constant to use
+
+	//
+	// we subtract 256 instead of 128 to save one instruction from the loop
+	//
+	sub		arg3, arg3, #256
+
+	// at this section of the code, there is 64*x+y (0<=y<64) bytes of
+	// buffer. The _fold_64_B_loop will fold 64B at a time
+	// until we have 64+y Bytes of buffer
+
+
+	// fold 64B at a time. This section of the code folds 4 vector
+	// registers in parallel
+_fold_64_B_loop:
+
+	.macro		fold64, reg1, reg2
+	ldp		q11, q12, [arg2], #0x20
+
+	pmull2		v8.1q, \reg1\().2d, v10.2d
+	pmull		\reg1\().1q, \reg1\().1d, v10.1d
+
+CPU_LE(	rev64		v11.16b, v11.16b		)
+CPU_LE(	rev64		v12.16b, v12.16b		)
+
+	pmull2		v9.1q, \reg2\().2d, v10.2d
+	pmull		\reg2\().1q, \reg2\().1d, v10.1d
+
+CPU_LE(	ext		v11.16b, v11.16b, v11.16b, #8	)
+CPU_LE(	ext		v12.16b, v12.16b, v12.16b, #8	)
+
+	eor		\reg1\().16b, \reg1\().16b, v8.16b
+	eor		\reg2\().16b, \reg2\().16b, v9.16b
+	eor		\reg1\().16b, \reg1\().16b, v11.16b
+	eor		\reg2\().16b, \reg2\().16b, v12.16b
+	.endm
+
+	fold64		v0, v1
+	fold64		v2, v3
+	fold64		v4, v5
+	fold64		v6, v7
+
+	subs		arg3, arg3, #128
+
+	// check if there is another 64B in the buffer to be able to fold
+	b.ge		_fold_64_B_loop
+
+	// at this point, the buffer pointer is pointing at the last y Bytes
+	// of the buffer the 64B of folded data is in 4 of the vector
+	// registers: v0, v1, v2, v3
+
+	// fold the 8 vector registers to 1 vector register with different
+	// constants
+
+	ldr		q10, rk9
+
+	.macro		fold16, reg, rk
+	pmull		v8.1q, \reg\().1d, v10.1d
+	pmull2		\reg\().1q, \reg\().2d, v10.2d
+	.ifnb		\rk
+	ldr		q10, \rk
+	.endif
+	eor		v7.16b, v7.16b, v8.16b
+	eor		v7.16b, v7.16b, \reg\().16b
+	.endm
+
+	fold16		v0, rk11
+	fold16		v1, rk13
+	fold16		v2, rk15
+	fold16		v3, rk17
+	fold16		v4, rk19
+	fold16		v5, rk1
+	fold16		v6
+
+	// instead of 64, we add 48 to the loop counter to save 1 instruction
+	// from the loop instead of a cmp instruction, we use the negative
+	// flag with the jl instruction
+	adds		arg3, arg3, #(128-16)
+	b.lt		_final_reduction_for_128
+
+	// now we have 16+y bytes left to reduce. 16 Bytes is in register v7
+	// and the rest is in memory. We can fold 16 bytes at a time if y>=16
+	// continue folding 16B at a time
+
+_16B_reduction_loop:
+	pmull		v8.1q, v7.1d, v10.1d
+	pmull2		v7.1q, v7.2d, v10.2d
+	eor		v7.16b, v7.16b, v8.16b
+
+	ldr		q0, [arg2], #16
+CPU_LE(	rev64		v0.16b, v0.16b			)
+CPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)
+	eor		v7.16b, v7.16b, v0.16b
+	subs		arg3, arg3, #16
+
+	// instead of a cmp instruction, we utilize the flags with the
+	// jge instruction equivalent of: cmp arg3, 16-16
+	// check if there is any more 16B in the buffer to be able to fold
+	b.ge		_16B_reduction_loop
+
+	// now we have 16+z bytes left to reduce, where 0<= z < 16.
+	// first, we reduce the data in the xmm7 register
+
+_final_reduction_for_128:
+	// check if any more data to fold. If not, compute the CRC of
+	// the final 128 bits
+	adds		arg3, arg3, #16
+	b.eq		_128_done
+
+	// here we are getting data that is less than 16 bytes.
+	// since we know that there was data before the pointer, we can
+	// offset the input pointer before the actual point, to receive
+	// exactly 16 bytes. after that the registers need to be adjusted.
+_get_last_two_regs:
+	add		arg2, arg2, arg3
+	ldr		q1, [arg2, #-16]
+CPU_LE(	rev64		v1.16b, v1.16b			)
+CPU_LE(	ext		v1.16b, v1.16b, v1.16b, #8	)
+
+	// get rid of the extra data that was loaded before
+	// load the shift constant
+	adr		x4, tbl_shf_table + 16
+	sub		x4, x4, arg3
+	ld1		{v0.16b}, [x4]
+
+	// shift v2 to the left by arg3 bytes
+	tbl		v2.16b, {v7.16b}, v0.16b
+
+	// shift v7 to the right by 16-arg3 bytes
+	movi		v9.16b, #0x80
+	eor		v0.16b, v0.16b, v9.16b
+	tbl		v7.16b, {v7.16b}, v0.16b
+
+	// blend
+	sshr		v0.16b, v0.16b, #7	// convert to 8-bit mask
+	bsl		v0.16b, v2.16b, v1.16b
+
+	// fold 16 Bytes
+	pmull		v8.1q, v7.1d, v10.1d
+	pmull2		v7.1q, v7.2d, v10.2d
+	eor		v7.16b, v7.16b, v8.16b
+	eor		v7.16b, v7.16b, v0.16b
+
+_128_done:
+	// compute crc of a 128-bit value
+	ldr		q10, rk5		// rk5 and rk6 in xmm10
+
+	// 64b fold
+	ext		v0.16b, vzr.16b, v7.16b, #8
+	mov		v7.d[0], v7.d[1]
+	pmull		v7.1q, v7.1d, v10.1d
+	eor		v7.16b, v7.16b, v0.16b
+
+	// 32b fold
+	ext		v0.16b, v7.16b, vzr.16b, #4
+	mov		v7.s[3], vzr.s[0]
+	pmull2		v0.1q, v0.2d, v10.2d
+	eor		v7.16b, v7.16b, v0.16b
+
+	// barrett reduction
+_barrett:
+	ldr		q10, rk7
+	mov		v0.d[0], v7.d[1]
+
+	pmull		v0.1q, v0.1d, v10.1d
+	ext		v0.16b, vzr.16b, v0.16b, #12
+	pmull2		v0.1q, v0.2d, v10.2d
+	ext		v0.16b, vzr.16b, v0.16b, #12
+	eor		v7.16b, v7.16b, v0.16b
+	mov		w0, v7.s[1]
+
+_cleanup:
+	// scale the result back to 16 bits
+	lsr		x0, x0, #16
+	ret
+
+_less_than_128:
+	cbz		arg3, _cleanup
+
+	movi		v0.16b, #0
+	mov		v0.s[3], arg1_low32	// get the initial crc value
+
+	ldr		q7, [arg2], #0x10
+CPU_LE(	rev64		v7.16b, v7.16b			)
+CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)
+	eor		v7.16b, v7.16b, v0.16b	// xor the initial crc value
+
+	cmp		arg3, #16
+	b.eq		_128_done		// exactly 16 left
+	b.lt		_less_than_16_left
+
+	ldr		q10, rk1		// rk1 and rk2 in xmm10
+
+	// update the counter. subtract 32 instead of 16 to save one
+	// instruction from the loop
+	subs		arg3, arg3, #32
+	b.ge		_16B_reduction_loop
+
+	add		arg3, arg3, #16
+	b		_get_last_two_regs
+
+_less_than_16_left:
+	// shl r9, 4
+	adr		x0, tbl_shf_table + 16
+	sub		x0, x0, arg3
+	ld1		{v0.16b}, [x0]
+	movi		v9.16b, #0x80
+	eor		v0.16b, v0.16b, v9.16b
+	tbl		v7.16b, {v7.16b}, v0.16b
+	b		_128_done
+ENDPROC(crc_t10dif_pmull)
+
+// precomputed constants
+// these constants are precomputed from the poly:
+// 0x8bb70000 (0x8bb7 scaled to 32 bits)
+	.align		4
+// Q = 0x18BB70000
+// rk1 = 2^(32*3) mod Q << 32
+// rk2 = 2^(32*5) mod Q << 32
+// rk3 = 2^(32*15) mod Q << 32
+// rk4 = 2^(32*17) mod Q << 32
+// rk5 = 2^(32*3) mod Q << 32
+// rk6 = 2^(32*2) mod Q << 32
+// rk7 = floor(2^64/Q)
+// rk8 = Q
+
+rk1:	.octa		0x06df0000000000002d56000000000000
+rk3:	.octa		0x7cf50000000000009d9d000000000000
+rk5:	.octa		0x13680000000000002d56000000000000
+rk7:	.octa		0x000000018bb7000000000001f65a57f8
+rk9:	.octa		0xbfd6000000000000ceae000000000000
+rk11:	.octa		0x713c0000000000001e16000000000000
+rk13:	.octa		0x80a6000000000000f7f9000000000000
+rk15:	.octa		0xe658000000000000044c000000000000
+rk17:	.octa		0xa497000000000000ad18000000000000
+rk19:	.octa		0xe7b50000000000006ee3000000000000
+
+tbl_shf_table:
+// use these values for shift constants for the tbl/tbx instruction
+// different alignments result in values as shown:
+//	DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1
+//	DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-3) / shr2
+//	DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-4) / shr3
+//	DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4
+//	DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5
+//	DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6
+//	DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9  (16-7) / shr7
+//	DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8  (16-8) / shr8
+//	DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7  (16-9) / shr9
+//	DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6  (16-10) / shr10
+//	DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5  (16-11) / shr11
+//	DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4  (16-12) / shr12
+//	DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3  (16-13) / shr13
+//	DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2  (16-14) / shr14
+//	DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1  (16-15) / shr15
+
+	.byte		 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
+	.byte		0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
+	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
+	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe , 0x0
diff --git a/arch/arm64/crypto/crct10dif-ce-glue.c b/arch/arm64/crypto/crct10dif-ce-glue.c
new file mode 100644
index 000000000000..60cb590c2590
--- /dev/null
+++ b/arch/arm64/crypto/crct10dif-ce-glue.c
@@ -0,0 +1,95 @@
+/*
+ * Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
+ *
+ * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/cpufeature.h>
+#include <linux/crc-t10dif.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
+
+#include <crypto/internal/hash.h>
+
+#include <asm/neon.h>
+
+#define CRC_T10DIF_PMULL_CHUNK_SIZE	16U
+
+asmlinkage u16 crc_t10dif_pmull(u16 init_crc, const u8 buf[], u64 len);
+
+static int crct10dif_init(struct shash_desc *desc)
+{
+	u16 *crc = shash_desc_ctx(desc);
+
+	*crc = 0;
+	return 0;
+}
+
+static int crct10dif_update(struct shash_desc *desc, const u8 *data,
+			    unsigned int length)
+{
+	u16 *crc = shash_desc_ctx(desc);
+	unsigned int l;
+
+	if (unlikely((u64)data % CRC_T10DIF_PMULL_CHUNK_SIZE)) {
+		l = min_t(u32, length, CRC_T10DIF_PMULL_CHUNK_SIZE -
+			  ((u64)data % CRC_T10DIF_PMULL_CHUNK_SIZE));
+
+		*crc = crc_t10dif_generic(*crc, data, l);
+
+		length -= l;
+		data += l;
+	}
+
+	if (length > 0) {
+		kernel_neon_begin_partial(14);
+		*crc = crc_t10dif_pmull(*crc, data, length);
+		kernel_neon_end();
+	}
+
+	return 0;
+}
+
+static int crct10dif_final(struct shash_desc *desc, u8 *out)
+{
+	u16 *crc = shash_desc_ctx(desc);
+
+	*(u16 *)out = *crc;
+	return 0;
+}
+
+static struct shash_alg crc_t10dif_alg = {
+	.digestsize		= CRC_T10DIF_DIGEST_SIZE,
+	.init			= crct10dif_init,
+	.update			= crct10dif_update,
+	.final			= crct10dif_final,
+	.descsize		= CRC_T10DIF_DIGEST_SIZE,
+
+	.base.cra_name		= "crct10dif",
+	.base.cra_driver_name	= "crct10dif-arm64-ce",
+	.base.cra_priority	= 200,
+	.base.cra_blocksize	= CRC_T10DIF_BLOCK_SIZE,
+	.base.cra_module	= THIS_MODULE,
+};
+
+static int __init crc_t10dif_mod_init(void)
+{
+	return crypto_register_shash(&crc_t10dif_alg);
+}
+
+static void __exit crc_t10dif_mod_exit(void)
+{
+	crypto_unregister_shash(&crc_t10dif_alg);
+}
+
+module_cpu_feature_match(PMULL, crc_t10dif_mod_init);
+module_exit(crc_t10dif_mod_exit);
+
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
-- 
2.7.4

^ permalink raw reply related

* [PATCH v3 2/6] crypto: testmgr - add/enhance test cases for CRC-T10DIF
From: Ard Biesheuvel @ 2016-12-05 18:42 UTC (permalink / raw)
  To: linux-crypto, herbert; +Cc: linux-arm-kernel, Ard Biesheuvel
In-Reply-To: <1480963348-24203-1-git-send-email-ard.biesheuvel@linaro.org>

The existing test cases only exercise a small slice of the various
possible code paths through the x86 SSE/PCLMULQDQ implementation,
and the upcoming ports of it for arm64. So add one that exceeds 256
bytes in size, and convert another to a chunked test.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
 crypto/testmgr.h | 70 ++++++++++++--------
 1 file changed, 42 insertions(+), 28 deletions(-)

diff --git a/crypto/testmgr.h b/crypto/testmgr.h
index e64a4ef9d8ca..9b656be7f52f 100644
--- a/crypto/testmgr.h
+++ b/crypto/testmgr.h
@@ -1334,36 +1334,50 @@ static struct hash_testvec rmd320_tv_template[] = {
 	}
 };
 
-#define CRCT10DIF_TEST_VECTORS	3
+#define CRCT10DIF_TEST_VECTORS	ARRAY_SIZE(crct10dif_tv_template)
 static struct hash_testvec crct10dif_tv_template[] = {
 	{
-		.plaintext = "abc",
-		.psize  = 3,
-#ifdef __LITTLE_ENDIAN
-		.digest = "\x3b\x44",
-#else
-		.digest = "\x44\x3b",
-#endif
-	}, {
-		.plaintext = "1234567890123456789012345678901234567890"
-			     "123456789012345678901234567890123456789",
-		.psize	= 79,
-#ifdef __LITTLE_ENDIAN
-		.digest	= "\x70\x4b",
-#else
-		.digest	= "\x4b\x70",
-#endif
-	}, {
-		.plaintext =
-		"abcddddddddddddddddddddddddddddddddddddddddddddddddddddd",
-		.psize  = 56,
-#ifdef __LITTLE_ENDIAN
-		.digest = "\xe3\x9c",
-#else
-		.digest = "\x9c\xe3",
-#endif
-		.np     = 2,
-		.tap    = { 28, 28 }
+		.plaintext	= "abc",
+		.psize		= 3,
+		.digest		= (u8 *)(u16 []){ 0x443b },
+	}, {
+		.plaintext 	= "1234567890123456789012345678901234567890"
+				  "123456789012345678901234567890123456789",
+		.psize		= 79,
+		.digest 	= (u8 *)(u16 []){ 0x4b70 },
+		.np		= 2,
+		.tap		= { 63, 16 },
+	}, {
+		.plaintext	= "abcdddddddddddddddddddddddddddddddddddddddd"
+				  "ddddddddddddd",
+		.psize		= 56,
+		.digest		= (u8 *)(u16 []){ 0x9ce3 },
+		.np		= 8,
+		.tap		= { 1, 2, 28, 7, 6, 5, 4, 3 },
+	}, {
+		.plaintext 	= "1234567890123456789012345678901234567890"
+				  "1234567890123456789012345678901234567890"
+				  "1234567890123456789012345678901234567890"
+				  "1234567890123456789012345678901234567890"
+				  "1234567890123456789012345678901234567890"
+				  "1234567890123456789012345678901234567890"
+				  "1234567890123456789012345678901234567890"
+				  "123456789012345678901234567890123456789",
+		.psize		= 319,
+		.digest		= (u8 *)(u16 []){ 0x44c6 },
+	}, {
+		.plaintext 	= "1234567890123456789012345678901234567890"
+				  "1234567890123456789012345678901234567890"
+				  "1234567890123456789012345678901234567890"
+				  "1234567890123456789012345678901234567890"
+				  "1234567890123456789012345678901234567890"
+				  "1234567890123456789012345678901234567890"
+				  "1234567890123456789012345678901234567890"
+				  "123456789012345678901234567890123456789",
+		.psize		= 319,
+		.digest		= (u8 *)(u16 []){ 0x44c6 },
+		.np		= 4,
+		.tap		= { 1, 255, 57, 6 },
 	}
 };
 
-- 
2.7.4

^ permalink raw reply related

* [PATCH v3 1/6] crypto: testmgr - avoid overlap in chunked tests
From: Ard Biesheuvel @ 2016-12-05 18:42 UTC (permalink / raw)
  To: linux-crypto, herbert; +Cc: linux-arm-kernel, Ard Biesheuvel
In-Reply-To: <1480963348-24203-1-git-send-email-ard.biesheuvel@linaro.org>

The IDXn offsets are chosen such that tap values (which may go up to
255) end up overlapping in the xbuf allocation. In particular, IDX1
and IDX3 are too close together, so update IDX3 to avoid this issue.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
 crypto/testmgr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index ded50b67c757..670893bcf361 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -63,7 +63,7 @@ int alg_test(const char *driver, const char *alg, u32 type, u32 mask)
  */
 #define IDX1		32
 #define IDX2		32400
-#define IDX3		1
+#define IDX3		511
 #define IDX4		8193
 #define IDX5		22222
 #define IDX6		17101
-- 
2.7.4

^ permalink raw reply related

* [PATCH v3 0/6] crypto: ARM/arm64 CRC-T10DIF/CRC32/CRC32C roundup
From: Ard Biesheuvel @ 2016-12-05 18:42 UTC (permalink / raw)
  To: linux-crypto, herbert; +Cc: linux-arm-kernel, Ard Biesheuvel

This v3 combines the CRC-T10DIF and CRC32 implementations for both ARM and
arm64 that I sent out a couple of weeks ago, and adds support to the latter
for CRC32C.

Changes since v2:
- fix a couple of big-endian bugs in CRC32/CRC32C
- add back handling to the CRC-T10DIF routines of buffers that are not a
  multiple of 16 bytes (but they still must be 16 byte aligned)

Ard Biesheuvel (6):
  crypto: testmgr - avoid overlap in chunked tests
  crypto: testmgr - add/enhance test cases for CRC-T10DIF
  crypto: arm64/crct10dif - port x86 SSE implementation to arm64
  crypto: arm/crct10dif - port x86 SSE implementation to ARM
  crypto: arm64/crc32 - accelerated support based on x86 SSE
    implementation
  crypto: arm/crc32 - accelerated support based on x86 SSE
    implementation

 arch/arm/crypto/Kconfig               |  10 +
 arch/arm/crypto/Makefile              |   4 +
 arch/arm/crypto/crc32-ce-core.S       | 306 ++++++++++++++
 arch/arm/crypto/crc32-ce-glue.c       | 242 +++++++++++
 arch/arm/crypto/crct10dif-ce-core.S   | 427 ++++++++++++++++++++
 arch/arm/crypto/crct10dif-ce-glue.c   | 101 +++++
 arch/arm64/crypto/Kconfig             |  11 +
 arch/arm64/crypto/Makefile            |   6 +
 arch/arm64/crypto/crc32-ce-core.S     | 266 ++++++++++++
 arch/arm64/crypto/crc32-ce-glue.c     | 212 ++++++++++
 arch/arm64/crypto/crct10dif-ce-core.S | 392 ++++++++++++++++++
 arch/arm64/crypto/crct10dif-ce-glue.c |  95 +++++
 crypto/testmgr.c                      |   2 +-
 crypto/testmgr.h                      |  70 ++--
 14 files changed, 2115 insertions(+), 29 deletions(-)
 create mode 100644 arch/arm/crypto/crc32-ce-core.S
 create mode 100644 arch/arm/crypto/crc32-ce-glue.c
 create mode 100644 arch/arm/crypto/crct10dif-ce-core.S
 create mode 100644 arch/arm/crypto/crct10dif-ce-glue.c
 create mode 100644 arch/arm64/crypto/crc32-ce-core.S
 create mode 100644 arch/arm64/crypto/crc32-ce-glue.c
 create mode 100644 arch/arm64/crypto/crct10dif-ce-core.S
 create mode 100644 arch/arm64/crypto/crct10dif-ce-glue.c

-- 
2.7.4

^ permalink raw reply

* Re: [PATCH] crypto: rsa - fix a potential race condition in build
From: Yang Shi @ 2016-12-05 17:49 UTC (permalink / raw)
  To: Herbert Xu; +Cc: davem, linux-crypto, linux-kernel
In-Reply-To: <20161205064800.GA9496@gondor.apana.org.au>

On 12/4/2016 10:48 PM, Herbert Xu wrote:
> On Fri, Dec 02, 2016 at 03:41:04PM -0800, Yang Shi wrote:
>> When building kernel with RSA enabled with multithreaded, the below
>> compile failure might be caught:
>>
>> | /buildarea/kernel-source/crypto/rsa_helper.c:18:28: fatal error: rsapubkey-asn1.h: No such file or directory
>> | #include "rsapubkey-asn1.h"
>> | ^
>> | compilation terminated.
>> | CC crypto/rsa-pkcs1pad.o
>> | CC crypto/algboss.o
>> | CC crypto/testmgr.o
>> | make[3]: *** [/buildarea/kernel-source/scripts/Makefile.build:289: crypto/rsa_helper.o] Error 1
>> | make[3]: *** Waiting for unfinished jobs....
>> | make[2]: *** [/buildarea/kernel-source/Makefile:969: crypto] Error 2
>> | make[1]: *** [Makefile:150: sub-make] Error 2
>> | make: *** [Makefile:24: __sub-make] Error 2
>>
>> The header file is not generated before rsa_helper is compiled, so
>> adding dependency to avoid such issue.
>>
>> Signed-off-by: Yang Shi <yang.shi@windriver.com>
> This should already be fixed in the latest crypto tree.  Could
> you please double-check?

Thanks, Herbert, I just found the commit. Please ignore my patch, sorry 
for the inconvenience.

I will backport that commit to our kernel tree.

Yang

>
> Thanks,

^ permalink raw reply

* Re: [PATCH] crypto/mcryptd: Check mcryptd algorithm compatability
From: Tim Chen @ 2016-12-05 16:50 UTC (permalink / raw)
  To: Herbert Xu
  Cc: Mikulas Patocka, David S. Miller, megha.dey, linux-crypto,
	dm-devel, Milan Broz, Eric Biggers, stable
In-Reply-To: <20161205123403.GB10635@gondor.apana.org.au>

On Mon, 2016-12-05 at 20:34 +0800, Herbert Xu wrote:
> On Fri, Dec 02, 2016 at 04:15:21PM -0800, Tim Chen wrote:
> > 
> > Algorithms not compatible with mcryptd could be spawned by mcryptd
> > with a direct crypto_alloc_tfm invocation using a "mcryptd(alg)"
> > name construct.  This causes mcryptd to crash the kernel if
> > "alg" is incompatible and not intended to be used with mcryptd.
> > 
> > A flag CRYPTO_ALG_MCRYPT is being added to mcryptd compatible
> > algorithms' cra_flags. The compatability is checked when mcryptd spawn
> > off an algorithm.
> > 
> > Link: http://marc.info/?l=linux-crypto-vger&m=148063683310477&w=2
> > Cc: stable@vger.kernel.org
> > Reported-by: Mikulas Patocka <mpatocka@redhat.com>
> > Tested-by: Megha Dey <megha.dey@linux.intel.com>
> > Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
> Tim, I think we should instead make mcryptd refuse to generate
> a non-internal algorithm.  This way the user would not be able
> to access it at all since they can only request for non-internal
> algorithms.
> 
> Basically you want to check at the start of mcryptd_create_hash
> that the INTERNAL bit is set on both the type and mask as returned
> by crypto_get_attr_type.

I have thought about that.  However, not all internal algorithms
are compatible with mcryptd. We can still have trouble if some
random internal algorithm is paired with mcryptd.

Tim

^ permalink raw reply

* [PATCH v4] crypto: AF_ALG - fix AEAD tag memory handling
From: Stephan Mueller @ 2016-12-05 14:26 UTC (permalink / raw)
  To: herbert; +Cc: linux-crypto

Hi Herbert,

Changes v4: restore the old behavior -- if the caller does not provide sufficient
output buffer size, return an error.

---8<---

For encryption, the AEAD ciphers require AAD || PT as input and generate
AAD || CT || Tag as output and vice versa for decryption. Prior to this
patch, the AF_ALG interface for AEAD ciphers requires the buffer to be
present as input for encryption. Similarly, the output buffer for
decryption required the presence of the tag buffer too. This implies
that the kernel reads / writes data buffers from/to kernel space
even though this operation is not required.

This patch changes the AF_ALG AEAD interface to be consistent with the
in-kernel AEAD cipher requirements.

Due to this handling, he changes are transparent to user space with one
exception: the return code of recv indicates the mount of output buffer.
That output buffer has a different size compared to before the patch
which implies that the return code of recv will also be different.
For example, a decryption operation uses 16 bytes AAD, 16 bytes CT and
16 bytes tag, the AF_ALG AEAD interface before showed a recv return
code of 48 (bytes) whereas after this patch, the return code is 32
since the tag is not returned any more.

Reported-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: Stephan Mueller <smueller@chronox.de>
---
 crypto/algif_aead.c | 57 +++++++++++++++++++++++++++++++++--------------------
 1 file changed, 36 insertions(+), 21 deletions(-)

diff --git a/crypto/algif_aead.c b/crypto/algif_aead.c
index 6e95137..4359c9b 100644
--- a/crypto/algif_aead.c
+++ b/crypto/algif_aead.c
@@ -81,7 +81,11 @@ static inline bool aead_sufficient_data(struct aead_ctx *ctx)
 {
 	unsigned as = crypto_aead_authsize(crypto_aead_reqtfm(&ctx->aead_req));
 
-	return ctx->used >= ctx->aead_assoclen + as;
+	/*
+	 * The minimum amount of memory needed for an AEAD cipher is
+	 * the AAD and in case of decryption the tag.
+	 */
+	return ctx->used >= ctx->aead_assoclen + (ctx->enc ? 0 : as);
 }
 
 static void aead_reset_ctx(struct aead_ctx *ctx)
@@ -426,12 +430,15 @@ static int aead_recvmsg_async(struct socket *sock, struct msghdr *msg,
 			goto unlock;
 	}
 
-	used = ctx->used;
-	outlen = used;
-
 	if (!aead_sufficient_data(ctx))
 		goto unlock;
 
+	used = ctx->used;
+	if (ctx->enc)
+		outlen = used + as;
+	else
+		outlen = used - as;
+
 	req = sock_kmalloc(sk, reqlen, GFP_KERNEL);
 	if (unlikely(!req))
 		goto unlock;
@@ -445,7 +452,7 @@ static int aead_recvmsg_async(struct socket *sock, struct msghdr *msg,
 	aead_request_set_ad(req, ctx->aead_assoclen);
 	aead_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
 				  aead_async_cb, sk);
-	used -= ctx->aead_assoclen + (ctx->enc ? as : 0);
+	used -= ctx->aead_assoclen;
 
 	/* take over all tx sgls from ctx */
 	areq->tsgl = sock_kmalloc(sk,
@@ -462,7 +469,7 @@ static int aead_recvmsg_async(struct socket *sock, struct msghdr *msg,
 	areq->tsgls = sgl->cur;
 
 	/* create rx sgls */
-	while (iov_iter_count(&msg->msg_iter)) {
+	while (outlen > usedpages && iov_iter_count(&msg->msg_iter)) {
 		size_t seglen = min_t(size_t, iov_iter_count(&msg->msg_iter),
 				      (outlen - usedpages));
 
@@ -492,16 +499,14 @@ static int aead_recvmsg_async(struct socket *sock, struct msghdr *msg,
 
 		last_rsgl = rsgl;
 
-		/* we do not need more iovecs as we have sufficient memory */
-		if (outlen <= usedpages)
-			break;
-
 		iov_iter_advance(&msg->msg_iter, err);
 	}
-	err = -EINVAL;
+
 	/* ensure output buffer is sufficiently large */
-	if (usedpages < outlen)
-		goto free;
+	if (usedpages < outlen) {
+		err = -EINVAL;
+		goto unlock;
+	}
 
 	aead_request_set_crypt(req, areq->tsgl, areq->first_rsgl.sgl.sg, used,
 			       areq->iv);
@@ -572,6 +577,7 @@ static int aead_recvmsg_sync(struct socket *sock, struct msghdr *msg, int flags)
 			goto unlock;
 	}
 
+	/* data length provided by caller via sendmsg/sendpage */
 	used = ctx->used;
 
 	/*
@@ -586,16 +592,27 @@ static int aead_recvmsg_sync(struct socket *sock, struct msghdr *msg, int flags)
 	if (!aead_sufficient_data(ctx))
 		goto unlock;
 
-	outlen = used;
+	/*
+	 * Calculate the minimum output buffer size holding the result of the
+	 * cipher operation. When encrypting data, the receiving buffer is
+	 * larger by the tag length compared to the input buffer as the
+	 * encryption operation generates the tag. For decryption, the input
+	 * buffer provides the tag which is consumed resulting in only the
+	 * plaintext without a buffer for the tag returned to the caller.
+	 */
+	if (ctx->enc)
+		outlen = used + as;
+	else
+		outlen = used - as;
 
 	/*
 	 * The cipher operation input data is reduced by the associated data
 	 * length as this data is processed separately later on.
 	 */
-	used -= ctx->aead_assoclen + (ctx->enc ? as : 0);
+	used -= ctx->aead_assoclen;
 
 	/* convert iovecs of output buffers into scatterlists */
-	while (iov_iter_count(&msg->msg_iter)) {
+	while (outlen > usedpages && iov_iter_count(&msg->msg_iter)) {
 		size_t seglen = min_t(size_t, iov_iter_count(&msg->msg_iter),
 				      (outlen - usedpages));
 
@@ -622,16 +639,14 @@ static int aead_recvmsg_sync(struct socket *sock, struct msghdr *msg, int flags)
 
 		last_rsgl = rsgl;
 
-		/* we do not need more iovecs as we have sufficient memory */
-		if (outlen <= usedpages)
-			break;
 		iov_iter_advance(&msg->msg_iter, err);
 	}
 
-	err = -EINVAL;
 	/* ensure output buffer is sufficiently large */
-	if (usedpages < outlen)
+	if (usedpages < outlen) {
+		err = -EINVAL;
 		goto unlock;
+	}
 
 	sg_mark_end(sgl->sg + sgl->cur - 1);
 	aead_request_set_crypt(&ctx->aead_req, sgl->sg, ctx->first_rsgl.sgl.sg,
-- 
2.9.3

^ permalink raw reply related

* Loan Offer
From: Quick Loan @ 2016-12-05 10:51 UTC (permalink / raw)
  To: Recipients

We can help you with a genuine loan to meet your needs.
Do you need a personal or business loan without stress and
quick approval? Do you need an urgent loan today? No Credit Checks

* LOAN APPROVAL IN 60MINS !!
* GUARANTEED SAME DAY TRANSFER !!
* 100% APPROVAL RATE !!
* LOW INTEREST RATE !!

Contact US for more information about loan offer and we will solve your
financial problem. contact us via email: quickloanss@hotmail.com

^ permalink raw reply

* Re: [PATCH v2] crypto: sun4i-ss: support the Security System PRNG
From: Corentin Labbe @ 2016-12-05 12:57 UTC (permalink / raw)
  To: Herbert Xu
  Cc: davem, maxime.ripard, wens, linux-kernel, linux-crypto,
	linux-arm-kernel
In-Reply-To: <20161205123705.GA10732@gondor.apana.org.au>

On Mon, Dec 05, 2016 at 08:37:05PM +0800, Herbert Xu wrote:
> On Mon, Dec 05, 2016 at 11:48:42AM +0100, Corentin Labbe wrote:
> > From: LABBE Corentin <clabbe.montjoie@gmail.com>
> > 
> > The Security System have a PRNG.
> > This patch add support for it as an hwrng.
> > 
> > Signed-off-by: Corentin Labbe <clabbe.montjoie@gmail.com>
> 
> Please don't add PRNGs to hwrng.  If we have existing PRNGs in
> there please let me know so that we can remove them.
> 

So how to expose PRNG to user space ? or more generally how to "use" a PRNG ?

I found hisi-rng, crypto4xx_ and exynos-rng which seems to be PRNG used as hwrng.

Regards

^ permalink raw reply

* Re: [PATCH v2] crypto: sun4i-ss: support the Security System PRNG
From: Herbert Xu @ 2016-12-05 12:37 UTC (permalink / raw)
  To: Corentin Labbe
  Cc: davem, maxime.ripard, wens, linux-kernel, linux-crypto,
	linux-arm-kernel
In-Reply-To: <1480934922-20732-1-git-send-email-clabbe.montjoie@gmail.com>

On Mon, Dec 05, 2016 at 11:48:42AM +0100, Corentin Labbe wrote:
> From: LABBE Corentin <clabbe.montjoie@gmail.com>
> 
> The Security System have a PRNG.
> This patch add support for it as an hwrng.
> 
> Signed-off-by: Corentin Labbe <clabbe.montjoie@gmail.com>

Please don't add PRNGs to hwrng.  If we have existing PRNGs in
there please let me know so that we can remove them.

Thanks,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* Re: [PATCH] crypto/mcryptd: Check mcryptd algorithm compatability
From: Herbert Xu @ 2016-12-05 12:34 UTC (permalink / raw)
  To: Tim Chen
  Cc: Mikulas Patocka, David S. Miller, megha.dey, linux-crypto,
	dm-devel, Milan Broz, Eric Biggers, stable
In-Reply-To: <deb5f529a0ff9b454fcc62b849ac3aff96c3e684.1480724043.git.tim.c.chen@linux.intel.com>

On Fri, Dec 02, 2016 at 04:15:21PM -0800, Tim Chen wrote:
> Algorithms not compatible with mcryptd could be spawned by mcryptd
> with a direct crypto_alloc_tfm invocation using a "mcryptd(alg)"
> name construct.  This causes mcryptd to crash the kernel if
> "alg" is incompatible and not intended to be used with mcryptd.
> 
> A flag CRYPTO_ALG_MCRYPT is being added to mcryptd compatible
> algorithms' cra_flags. The compatability is checked when mcryptd spawn
> off an algorithm.
> 
> Link: http://marc.info/?l=linux-crypto-vger&m=148063683310477&w=2
> Cc: stable@vger.kernel.org
> Reported-by: Mikulas Patocka <mpatocka@redhat.com>
> Tested-by: Megha Dey <megha.dey@linux.intel.com>
> Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>

Tim, I think we should instead make mcryptd refuse to generate
a non-internal algorithm.  This way the user would not be able
to access it at all since they can only request for non-internal
algorithms.

Basically you want to check at the start of mcryptd_create_hash
that the INTERNAL bit is set on both the type and mask as returned
by crypto_get_attr_type.

Thanks,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* Re: [PATCH v3] crypto: AF_ALG - fix AEAD tag memory handling
From: Herbert Xu @ 2016-12-05 12:26 UTC (permalink / raw)
  To: Stephan Mueller; +Cc: mathew.j.martineau, linux-crypto
In-Reply-To: <14103787.a1H06iWQgk@positron.chronox.de>

On Fri, Dec 02, 2016 at 03:16:26PM +0100, Stephan Mueller wrote:
>
> In addition, the code now handles the situation where the provided
> output buffer is too small by reducing the size of the processed
> input buffer accordingly. Due to this handling, he changes are

I think that's dangerous.  The AEAD interface doesn't do chaining
so it's all or nothing.  If the input doesn't fit within the output
buffer then you should not truncate the input as that would either
generate a bogus tag on encryption or accept a bogus tag on
decryption.

I suggest that you instead return EINVAL in such a case.

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* [PATCH v2] crypto: sun4i-ss: support the Security System PRNG
From: Corentin Labbe @ 2016-12-05 10:48 UTC (permalink / raw)
  To: herbert, davem, maxime.ripard, wens
  Cc: LABBE Corentin, linux-kernel, linux-arm-kernel, linux-crypto

From: LABBE Corentin <clabbe.montjoie@gmail.com>

The Security System have a PRNG.
This patch add support for it as an hwrng.

Signed-off-by: Corentin Labbe <clabbe.montjoie@gmail.com>
---
Changes since v1:
 - Replaced all spin_lock_bh by simple spin_lock
 - Removed handling of size not modulo 4 which will never happen
 - Added add_random_ready_callback()

 drivers/crypto/Kconfig                   |  8 +++
 drivers/crypto/sunxi-ss/Makefile         |  1 +
 drivers/crypto/sunxi-ss/sun4i-ss-core.c  | 14 +++++
 drivers/crypto/sunxi-ss/sun4i-ss-hwrng.c | 99 ++++++++++++++++++++++++++++++++
 drivers/crypto/sunxi-ss/sun4i-ss.h       |  9 +++
 5 files changed, 131 insertions(+)
 create mode 100644 drivers/crypto/sunxi-ss/sun4i-ss-hwrng.c

diff --git a/drivers/crypto/Kconfig b/drivers/crypto/Kconfig
index 4d2b81f..38f7aca 100644
--- a/drivers/crypto/Kconfig
+++ b/drivers/crypto/Kconfig
@@ -538,6 +538,14 @@ config CRYPTO_DEV_SUN4I_SS
 	  To compile this driver as a module, choose M here: the module
 	  will be called sun4i-ss.
 
+config CRYPTO_DEV_SUN4I_SS_PRNG
+	bool "Support for Allwinner Security System PRNG"
+	depends on CRYPTO_DEV_SUN4I_SS
+	select HW_RANDOM
+	help
+	  This driver provides kernel-side support for the Pseudo-Random
+	  Number Generator found in the Security System.
+
 config CRYPTO_DEV_ROCKCHIP
 	tristate "Rockchip's Cryptographic Engine driver"
 	depends on OF && ARCH_ROCKCHIP
diff --git a/drivers/crypto/sunxi-ss/Makefile b/drivers/crypto/sunxi-ss/Makefile
index 8f4c7a2..ca049ee 100644
--- a/drivers/crypto/sunxi-ss/Makefile
+++ b/drivers/crypto/sunxi-ss/Makefile
@@ -1,2 +1,3 @@
 obj-$(CONFIG_CRYPTO_DEV_SUN4I_SS) += sun4i-ss.o
 sun4i-ss-y += sun4i-ss-core.o sun4i-ss-hash.o sun4i-ss-cipher.o
+sun4i-ss-$(CONFIG_CRYPTO_DEV_SUN4I_SS_PRNG) += sun4i-ss-hwrng.o
diff --git a/drivers/crypto/sunxi-ss/sun4i-ss-core.c b/drivers/crypto/sunxi-ss/sun4i-ss-core.c
index 3ac6c6c..fa739de 100644
--- a/drivers/crypto/sunxi-ss/sun4i-ss-core.c
+++ b/drivers/crypto/sunxi-ss/sun4i-ss-core.c
@@ -359,6 +359,16 @@ static int sun4i_ss_probe(struct platform_device *pdev)
 		}
 	}
 	platform_set_drvdata(pdev, ss);
+
+#ifdef CONFIG_CRYPTO_DEV_SUN4I_SS_PRNG
+	/* Voluntarily made the PRNG optional */
+	err = sun4i_ss_hwrng_register(&ss->hwrng);
+	if (!err)
+		dev_info(ss->dev, "sun4i-ss PRNG loaded");
+	else
+		dev_err(ss->dev, "sun4i-ss PRNG failed");
+#endif
+
 	return 0;
 error_alg:
 	i--;
@@ -386,6 +396,10 @@ static int sun4i_ss_remove(struct platform_device *pdev)
 	int i;
 	struct sun4i_ss_ctx *ss = platform_get_drvdata(pdev);
 
+#ifdef CONFIG_CRYPTO_DEV_SUN4I_SS_PRNG
+	sun4i_ss_hwrng_remove(&ss->hwrng);
+#endif
+
 	for (i = 0; i < ARRAY_SIZE(ss_algs); i++) {
 		switch (ss_algs[i].type) {
 		case CRYPTO_ALG_TYPE_ABLKCIPHER:
diff --git a/drivers/crypto/sunxi-ss/sun4i-ss-hwrng.c b/drivers/crypto/sunxi-ss/sun4i-ss-hwrng.c
new file mode 100644
index 0000000..8319cae
--- /dev/null
+++ b/drivers/crypto/sunxi-ss/sun4i-ss-hwrng.c
@@ -0,0 +1,99 @@
+#include "sun4i-ss.h"
+
+static void sun4i_ss_seed(struct random_ready_callback *rdy)
+{
+	struct sun4i_ss_ctx *ss;
+
+	ss = container_of(rdy, struct sun4i_ss_ctx, random_ready);
+	get_random_bytes(ss->seed, SS_SEED_LEN);
+	ss->random_ready.func = NULL;
+}
+
+static int sun4i_ss_hwrng_init(struct hwrng *hwrng)
+{
+	struct sun4i_ss_ctx *ss;
+	int ret;
+
+	ss = container_of(hwrng, struct sun4i_ss_ctx, hwrng);
+
+	ss->random_ready.owner = THIS_MODULE;
+	ss->random_ready.func = sun4i_ss_seed;
+
+	ret = add_random_ready_callback(&ss->random_ready);
+	switch (ret) {
+	case 0:
+		break;
+	case -EALREADY:
+		get_random_bytes(ss->seed, SS_SEED_LEN);
+		ss->random_ready.func = NULL;
+		ret = 0;
+		break;
+	default:
+		ss->random_ready.func = NULL;
+	}
+
+	return ret;
+}
+
+static int sun4i_ss_hwrng_read(struct hwrng *hwrng, void *buf,
+			       size_t max, bool wait)
+{
+	int i;
+	u32 v;
+	u32 *data = buf;
+	const u32 mode = SS_OP_PRNG | SS_PRNG_CONTINUE | SS_ENABLED;
+	size_t len;
+	struct sun4i_ss_ctx *ss;
+
+	ss = container_of(hwrng, struct sun4i_ss_ctx, hwrng);
+	len = min_t(size_t, SS_DATA_LEN, max);
+
+	/* If the PRNG is not seeded */
+	if (ss->random_ready.func)
+		return -EAGAIN;
+
+	spin_lock(&ss->slock);
+
+	writel(mode, ss->base + SS_CTL);
+
+	/* write the seed */
+	for (i = 0; i < SS_SEED_LEN / 4; i++)
+		writel(ss->seed[i], ss->base + SS_KEY0 + i * 4);
+	writel(mode | SS_PRNG_START, ss->base + SS_CTL);
+
+	/* Read the random data */
+	readsl(ss->base + SS_TXFIFO, data, len / 4);
+
+	/* Update the seed */
+	for (i = 0; i < SS_SEED_LEN / 4; i++) {
+		v = readl(ss->base + SS_KEY0 + i * 4);
+		ss->seed[i] = v;
+	}
+
+	writel(0, ss->base + SS_CTL);
+	spin_unlock(&ss->slock);
+	return len;
+}
+
+int sun4i_ss_hwrng_register(struct hwrng *hwrng)
+{
+	hwrng->name = "sun4i Security System PRNG";
+	hwrng->init = sun4i_ss_hwrng_init;
+	hwrng->read = sun4i_ss_hwrng_read;
+	hwrng->quality = 1000;
+
+	/* Cannot use devm_hwrng_register() since we need to hwrng_unregister
+	 * before stopping clocks/regulator
+	 */
+	return hwrng_register(hwrng);
+}
+
+void sun4i_ss_hwrng_remove(struct hwrng *hwrng)
+{
+	struct sun4i_ss_ctx *ss;
+
+	ss = container_of(hwrng, struct sun4i_ss_ctx, hwrng);
+	if (ss->random_ready.func)
+		del_random_ready_callback(&ss->random_ready);
+	hwrng_unregister(hwrng);
+}
diff --git a/drivers/crypto/sunxi-ss/sun4i-ss.h b/drivers/crypto/sunxi-ss/sun4i-ss.h
index f04c0f8..85772d7 100644
--- a/drivers/crypto/sunxi-ss/sun4i-ss.h
+++ b/drivers/crypto/sunxi-ss/sun4i-ss.h
@@ -23,6 +23,7 @@
 #include <linux/scatterlist.h>
 #include <linux/interrupt.h>
 #include <linux/delay.h>
+#include <linux/hw_random.h>
 #include <crypto/md5.h>
 #include <crypto/sha.h>
 #include <crypto/hash.h>
@@ -125,6 +126,9 @@
 #define SS_RXFIFO_EMP_INT_ENABLE	(1 << 2)
 #define SS_TXFIFO_AVA_INT_ENABLE	(1 << 0)
 
+#define SS_SEED_LEN (192 / 8)
+#define SS_DATA_LEN (160 / 8)
+
 struct sun4i_ss_ctx {
 	void __iomem *base;
 	int irq;
@@ -134,6 +138,9 @@ struct sun4i_ss_ctx {
 	struct device *dev;
 	struct resource *res;
 	spinlock_t slock; /* control the use of the device */
+	struct random_ready_callback random_ready;
+	struct hwrng hwrng;
+	u32 seed[SS_SEED_LEN / 4];
 };
 
 struct sun4i_ss_alg_template {
@@ -199,3 +206,5 @@ int sun4i_ss_des_setkey(struct crypto_ablkcipher *tfm, const u8 *key,
 			unsigned int keylen);
 int sun4i_ss_des3_setkey(struct crypto_ablkcipher *tfm, const u8 *key,
 			 unsigned int keylen);
+int sun4i_ss_hwrng_register(struct hwrng *hwrng);
+void sun4i_ss_hwrng_remove(struct hwrng *hwrng);
-- 
2.7.3

^ permalink raw reply related

* Re: [PATCH v1 2/2] crypto: mediatek - add DT bindings documentation
From: Matthias Brugger @ 2016-12-05 10:18 UTC (permalink / raw)
  To: Ryder Lee, Herbert Xu, David S. Miller
  Cc: devicetree, linux-mediatek, linux-kernel, linux-crypto,
	linux-arm-kernel, Sean Wang, Roy Luo
In-Reply-To: <1480921284-45827-3-git-send-email-ryder.lee@mediatek.com>



On 05/12/16 08:01, Ryder Lee wrote:
> Add DT bindings documentation for the crypto driver
>
> Signed-off-by: Ryder Lee <ryder.lee@mediatek.com>
> ---
>  .../devicetree/bindings/crypto/mediatek-crypto.txt | 32 ++++++++++++++++++++++
>  1 file changed, 32 insertions(+)
>  create mode 100644 Documentation/devicetree/bindings/crypto/mediatek-crypto.txt
>
> diff --git a/Documentation/devicetree/bindings/crypto/mediatek-crypto.txt b/Documentation/devicetree/bindings/crypto/mediatek-crypto.txt
> new file mode 100644
> index 0000000..8b1db08
> --- /dev/null
> +++ b/Documentation/devicetree/bindings/crypto/mediatek-crypto.txt
> @@ -0,0 +1,32 @@
> +MediaTek cryptographic accelerators
> +
> +Required properties:
> +- compatible: Should be "mediatek,mt7623-crypto"

Do you know how big the difference is between the crypto engine for 
mt7623/mt2701/mt8521p in comparison, let's say mt8173 or mt6797?
Do this SoCs have a crypot engine? If so and they are quite similar, we 
might think of adding a mtk-crypto binding and add soc specific bindings.

Regards,
Matthias

> +- reg: Address and length of the register set for the device
> +- interrupts: Should contain the five crypto engines interrupts in numeric
> +	order. These are global system and four descriptor rings.
> +- clocks: the clock used by the core
> +- clock-names: the names of the clock listed in the clocks property. These are
> +	"ethif", "cryp"
> +- power-domains: Must contain a reference to the PM domain.
> +
> +
> +Optional properties:
> +- interrupt-parent: Should be the phandle for the interrupt controller
> +  that services interrupts for this device
> +
> +
> +Example:
> +	crypto: crypto@1b240000 {
> +		compatible = "mediatek,mt7623-crypto";
> +		reg = <0 0x1b240000 0 0x20000>;
> +		interrupts = <GIC_SPI 82 IRQ_TYPE_LEVEL_LOW>,
> +			     <GIC_SPI 83 IRQ_TYPE_LEVEL_LOW>,
> +			     <GIC_SPI 84 IRQ_TYPE_LEVEL_LOW>,
> +			     <GIC_SPI 91 IRQ_TYPE_LEVEL_LOW>,
> +			     <GIC_SPI 97 IRQ_TYPE_LEVEL_LOW>;
> +		clocks = <&topckgen CLK_TOP_ETHIF_SEL>,
> +			 <&ethsys CLK_ETHSYS_CRYPTO>;
> +		clock-names = "ethif","cryp";
> +		power-domains = <&scpsys MT2701_POWER_DOMAIN_ETH>;
> +	};
>

^ permalink raw reply

* Re: [PATCH v2 0/6] crypto: ARM/arm64 CRC-T10DIF/CRC32/CRC32C roundup
From: Ard Biesheuvel @ 2016-12-05  9:20 UTC (permalink / raw)
  To: linux-crypto@vger.kernel.org, Herbert Xu; +Cc: Ard Biesheuvel
In-Reply-To: <1480852447-25082-1-git-send-email-ard.biesheuvel@linaro.org>

On 4 December 2016 at 11:54, Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote:
> This v2 combines the CRC-T10DIF and CRC32 implementations for both ARM and
> arm64 that I sent out a couple of weeks ago, and adds support to the latter
> for CRC32C.
>

Please don't apply yet. There is an issue in the 32-bit ARM code I
only spotted just now.


> Ard Biesheuvel (6):
>   crypto: testmgr - avoid overlap in chunked tests
>   crypto: testmgr - add/enhance test cases for CRC-T10DIF
>   crypto: arm64/crct10dif - port x86 SSE implementation to arm64
>   crypto: arm/crct10dif - port x86 SSE implementation to ARM
>   crypto: arm64/crc32 - accelerated support based on x86 SSE
>     implementation
>   crypto: arm/crc32 - accelerated support based on x86 SSE
>     implementation
>
>  arch/arm/crypto/Kconfig               |  10 +
>  arch/arm/crypto/Makefile              |   4 +
>  arch/arm/crypto/crc32-ce-core.S       | 306 +++++++++++++++++
>  arch/arm/crypto/crc32-ce-glue.c       | 195 +++++++++++
>  arch/arm/crypto/crct10dif-ce-core.S   | 349 ++++++++++++++++++++
>  arch/arm/crypto/crct10dif-ce-glue.c   |  95 ++++++
>  arch/arm64/crypto/Kconfig             |  11 +
>  arch/arm64/crypto/Makefile            |   6 +
>  arch/arm64/crypto/crc32-ce-core.S     | 266 +++++++++++++++
>  arch/arm64/crypto/crc32-ce-glue.c     | 188 +++++++++++
>  arch/arm64/crypto/crct10dif-ce-core.S | 317 ++++++++++++++++++
>  arch/arm64/crypto/crct10dif-ce-glue.c |  91 +++++
>  crypto/testmgr.c                      |   2 +-
>  crypto/testmgr.h                      |  70 ++--
>  14 files changed, 1881 insertions(+), 29 deletions(-)
>  create mode 100644 arch/arm/crypto/crc32-ce-core.S
>  create mode 100644 arch/arm/crypto/crc32-ce-glue.c
>  create mode 100644 arch/arm/crypto/crct10dif-ce-core.S
>  create mode 100644 arch/arm/crypto/crct10dif-ce-glue.c
>  create mode 100644 arch/arm64/crypto/crc32-ce-core.S
>  create mode 100644 arch/arm64/crypto/crc32-ce-glue.c
>  create mode 100644 arch/arm64/crypto/crct10dif-ce-core.S
>  create mode 100644 arch/arm64/crypto/crct10dif-ce-glue.c
>
> --
> 2.7.4
>

^ permalink raw reply

* [PATCH] crypto: caam - fix pointer size for AArch64 boot loader, AArch32 kernel
From: Horia Geantă @ 2016-12-05  9:06 UTC (permalink / raw)
  To: Herbert Xu; +Cc: David S. Miller, linux-crypto, Dan Douglass, Alison Wang

Start with a clean slate before dealing with bit 16 (pointer size)
of Master Configuration Register.
This fixes the case of AArch64 boot loader + AArch32 kernel, when
the boot loader might set MCFGR[PS] and kernel would fail to clear it.

Cc: <stable@vger.kernel.org>
Reported-by: Alison Wang <alison.wang@nxp.com>
Signed-off-by: Horia Geantă <horia.geanta@nxp.com>
---
 drivers/crypto/caam/ctrl.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/crypto/caam/ctrl.c b/drivers/crypto/caam/ctrl.c
index be62a7f482ac..0a6ca3919270 100644
--- a/drivers/crypto/caam/ctrl.c
+++ b/drivers/crypto/caam/ctrl.c
@@ -556,8 +556,9 @@ static int caam_probe(struct platform_device *pdev)
 	 * Enable DECO watchdogs and, if this is a PHYS_ADDR_T_64BIT kernel,
 	 * long pointers in master configuration register
 	 */
-	clrsetbits_32(&ctrl->mcr, MCFGR_AWCACHE_MASK, MCFGR_AWCACHE_CACH |
-		      MCFGR_AWCACHE_BUFF | MCFGR_WDENABLE | MCFGR_LARGE_BURST |
+	clrsetbits_32(&ctrl->mcr, MCFGR_AWCACHE_MASK | MCFGR_LONG_PTR,
+		      MCFGR_AWCACHE_CACH | MCFGR_AWCACHE_BUFF |
+		      MCFGR_WDENABLE | MCFGR_LARGE_BURST |
 		      (sizeof(dma_addr_t) == sizeof(u64) ? MCFGR_LONG_PTR : 0));

 	/*
-- 
2.4.4

^ permalink raw reply related

* [PATCH v2 0/2] CESA: Fixes for STD ahash requests
From: Romain Perier @ 2016-12-05  8:56 UTC (permalink / raw)
  To: Boris Brezillon, Arnaud Ebalard
  Cc: linux-crypto, Jason Cooper, Andrew Lunn, Sebastian Hesselbarth,
	Gregory Clement, Thomas Petazzoni, stable, Nadav Haklai,
	Ofer Heifetz

This set of patches fixes two issues for STD ahash requests. The first
one is that the operation template is copied twice to the SRAM from the
step function, it is not needed. The second one is also contained in the
step function which copies creq->state to the engine for all type of
requests, even if this one is a fragment of the initial req and is
re-launched. This might corrupt the context of the request in some cases.

Romain Perier (2):
  crypto: marvell - Don't copy hash operation twice into the SRAM
  crypto: marvell - Don't corrupt state of an STD req for re-stepped
    ahash

 drivers/crypto/marvell/hash.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

-- 

v1: https://www.mail-archive.com/linux-crypto@vger.kernel.org/msg22113.html

Changes in v2:
 - Rephrased commit message for PATCH 2/2
 - Added Cc: stable for both commits
 - Fixed coding style issue in PATCH 2/2
 - Rephased the cover letter

2.9.3

^ permalink raw reply

* [PATCH v2 2/2] crypto: marvell - Don't corrupt state of an STD req for re-stepped ahash
From: Romain Perier @ 2016-12-05  8:56 UTC (permalink / raw)
  To: Boris Brezillon, Arnaud Ebalard
  Cc: linux-crypto, Jason Cooper, Andrew Lunn, Sebastian Hesselbarth,
	Gregory Clement, Thomas Petazzoni, stable, Nadav Haklai,
	Ofer Heifetz
In-Reply-To: <20161205085639.21034-1-romain.perier@free-electrons.com>

mv_cesa_hash_std_step() copies the creq->state into the SRAM at each
step, but this is only required on the first one. By doing that, we
overwrite the engine state, and get erroneous results when the crypto
request is split in several chunks to fit in the internal SRAM.

This commit changes the function to copy the state only on the first
step.

Fixes: commit 2786cee8e50b ("crypto: marvell - Move SRAM I/O op...")
Signed-off-by: Romain Perier <romain.perier@free-electrons.com>
Cc: <stable@vger.kernel.org>
---
 drivers/crypto/marvell/hash.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/crypto/marvell/hash.c b/drivers/crypto/marvell/hash.c
index fbbcbf8..317cf02 100644
--- a/drivers/crypto/marvell/hash.c
+++ b/drivers/crypto/marvell/hash.c
@@ -168,9 +168,11 @@ static void mv_cesa_ahash_std_step(struct ahash_request *req)
 	mv_cesa_adjust_op(engine, &creq->op_tmpl);
 	memcpy_toio(engine->sram, &creq->op_tmpl, sizeof(creq->op_tmpl));
 
-	digsize = crypto_ahash_digestsize(crypto_ahash_reqtfm(req));
-	for (i = 0; i < digsize / 4; i++)
-		writel_relaxed(creq->state[i], engine->regs + CESA_IVDIG(i));
+	if (!sreq->offset) {
+		digsize = crypto_ahash_digestsize(crypto_ahash_reqtfm(req));
+		for (i = 0; i < digsize / 4; i++)
+			writel_relaxed(creq->state[i], engine->regs + CESA_IVDIG(i));
+	}
 
 	if (creq->cache_ptr)
 		memcpy_toio(engine->sram + CESA_SA_DATA_SRAM_OFFSET,
-- 
2.9.3

^ permalink raw reply related

* [PATCH v2 1/2] crypto: marvell - Don't copy hash operation twice into the SRAM
From: Romain Perier @ 2016-12-05  8:56 UTC (permalink / raw)
  To: Boris Brezillon, Arnaud Ebalard
  Cc: linux-crypto, Jason Cooper, Andrew Lunn, Sebastian Hesselbarth,
	Gregory Clement, Thomas Petazzoni, stable, Nadav Haklai,
	Ofer Heifetz
In-Reply-To: <20161205085639.21034-1-romain.perier@free-electrons.com>

No need to copy the template of an hash operation twice into the SRAM
from the step function.

Fixes: commit 85030c5168f1 ("crypto: marvell - Add support for chai...")
Signed-off-by: Romain Perier <romain.perier@free-electrons.com>
Cc: <stable@vger.kernel.org>
---
 drivers/crypto/marvell/hash.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/crypto/marvell/hash.c b/drivers/crypto/marvell/hash.c
index 2a92605..fbbcbf8 100644
--- a/drivers/crypto/marvell/hash.c
+++ b/drivers/crypto/marvell/hash.c
@@ -172,9 +172,6 @@ static void mv_cesa_ahash_std_step(struct ahash_request *req)
 	for (i = 0; i < digsize / 4; i++)
 		writel_relaxed(creq->state[i], engine->regs + CESA_IVDIG(i));
 
-	mv_cesa_adjust_op(engine, &creq->op_tmpl);
-	memcpy_toio(engine->sram, &creq->op_tmpl, sizeof(creq->op_tmpl));
-
 	if (creq->cache_ptr)
 		memcpy_toio(engine->sram + CESA_SA_DATA_SRAM_OFFSET,
 			    creq->cache, creq->cache_ptr);
-- 
2.9.3

^ permalink raw reply related

* Re: [PATCH v1 1/2] Add crypto driver support for some MediaTek chips
From: Corentin Labbe @ 2016-12-05  8:52 UTC (permalink / raw)
  To: Ryder Lee
  Cc: devicetree-u79uwXL29TY76Z2rM5mHXA, Herbert Xu, Sean Wang,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Roy Luo,
	linux-mediatek-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r,
	linux-crypto-u79uwXL29TY76Z2rM5mHXA, Matthias Brugger,
	David S. Miller,
	linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r
In-Reply-To: <1480921284-45827-2-git-send-email-ryder.lee-NuS5LvNUpcJWk0Htik3J/w@public.gmane.org>

Hello

I have two minor comment.

On Mon, Dec 05, 2016 at 03:01:23PM +0800, Ryder Lee wrote:
> This adds support for the MediaTek hardware accelerator on
> mt7623/mt2701/mt8521p SoC.
> 
> This driver currently implement:
> - SHA1 and SHA2 family(HMAC) hash alogrithms.

There is a typo for algorithms.

[...]
> +/**
> + * struct mtk_desc - DMA descriptor
> + * @hdr:	the descriptor control header
> + * @buf:	DMA address of input buffer segment
> + * @ct:		DMA address of command token that control operation flow
> + * @ct_hdr:	the command token control header
> + * @tag:	the user-defined field
> + * @tfm:	DMA address of transform state
> + * @bound:	align descriptors offset boundary
> + *
> + * Structure passed to the crypto engine to describe where source
> + * data needs to be fetched and how it needs to be processed.
> + */
> +struct mtk_desc {
> +	u32 hdr;
> +	u32 buf;
> +	u32 ct;
> +	u32 ct_hdr;
> +	u32 tag;
> +	u32 tfm;
> +	u32 bound[2];
> +};

Do you have tested this descriptor with BE/LE kernel ?

Regards
Corentin Labbe

^ permalink raw reply

* [PATCH v1 2/2] crypto: mediatek - add DT bindings documentation
From: Ryder Lee @ 2016-12-05  7:01 UTC (permalink / raw)
  To: Herbert Xu, David S. Miller, Matthias Brugger
  Cc: devicetree, linux-mediatek, linux-kernel, linux-crypto,
	linux-arm-kernel, Sean Wang, Roy Luo, Ryder Lee
In-Reply-To: <1480921284-45827-1-git-send-email-ryder.lee@mediatek.com>

Add DT bindings documentation for the crypto driver

Signed-off-by: Ryder Lee <ryder.lee@mediatek.com>
---
 .../devicetree/bindings/crypto/mediatek-crypto.txt | 32 ++++++++++++++++++++++
 1 file changed, 32 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/crypto/mediatek-crypto.txt

diff --git a/Documentation/devicetree/bindings/crypto/mediatek-crypto.txt b/Documentation/devicetree/bindings/crypto/mediatek-crypto.txt
new file mode 100644
index 0000000..8b1db08
--- /dev/null
+++ b/Documentation/devicetree/bindings/crypto/mediatek-crypto.txt
@@ -0,0 +1,32 @@
+MediaTek cryptographic accelerators
+
+Required properties:
+- compatible: Should be "mediatek,mt7623-crypto"
+- reg: Address and length of the register set for the device
+- interrupts: Should contain the five crypto engines interrupts in numeric
+	order. These are global system and four descriptor rings.
+- clocks: the clock used by the core
+- clock-names: the names of the clock listed in the clocks property. These are
+	"ethif", "cryp"
+- power-domains: Must contain a reference to the PM domain.
+
+
+Optional properties:
+- interrupt-parent: Should be the phandle for the interrupt controller
+  that services interrupts for this device
+
+
+Example:
+	crypto: crypto@1b240000 {
+		compatible = "mediatek,mt7623-crypto";
+		reg = <0 0x1b240000 0 0x20000>;
+		interrupts = <GIC_SPI 82 IRQ_TYPE_LEVEL_LOW>,
+			     <GIC_SPI 83 IRQ_TYPE_LEVEL_LOW>,
+			     <GIC_SPI 84 IRQ_TYPE_LEVEL_LOW>,
+			     <GIC_SPI 91 IRQ_TYPE_LEVEL_LOW>,
+			     <GIC_SPI 97 IRQ_TYPE_LEVEL_LOW>;
+		clocks = <&topckgen CLK_TOP_ETHIF_SEL>,
+			 <&ethsys CLK_ETHSYS_CRYPTO>;
+		clock-names = "ethif","cryp";
+		power-domains = <&scpsys MT2701_POWER_DOMAIN_ETH>;
+	};
-- 
1.9.1

^ permalink raw reply related

* [PATCH v1 1/2] Add crypto driver support for some MediaTek chips
From: Ryder Lee @ 2016-12-05  7:01 UTC (permalink / raw)
  To: Herbert Xu, David S. Miller, Matthias Brugger
  Cc: devicetree, linux-mediatek, linux-kernel, linux-crypto,
	linux-arm-kernel, Sean Wang, Roy Luo, Ryder Lee
In-Reply-To: <1480921284-45827-1-git-send-email-ryder.lee@mediatek.com>

This adds support for the MediaTek hardware accelerator on
mt7623/mt2701/mt8521p SoC.

This driver currently implement:
- SHA1 and SHA2 family(HMAC) hash alogrithms.
- AES block cipher in CBC/ECB mode with 128/196/256 bits keys.

Signed-off-by: Ryder Lee <ryder.lee@mediatek.com>
---
 drivers/crypto/Kconfig                 |   17 +
 drivers/crypto/Makefile                |    1 +
 drivers/crypto/mediatek/Makefile       |    2 +
 drivers/crypto/mediatek/mtk-aes.c      |  763 +++++++++++++++++
 drivers/crypto/mediatek/mtk-platform.c |  580 +++++++++++++
 drivers/crypto/mediatek/mtk-platform.h |  235 ++++++
 drivers/crypto/mediatek/mtk-regs.h     |  194 +++++
 drivers/crypto/mediatek/mtk-sha.c      | 1423 ++++++++++++++++++++++++++++++++
 8 files changed, 3215 insertions(+)
 create mode 100644 drivers/crypto/mediatek/Makefile
 create mode 100644 drivers/crypto/mediatek/mtk-aes.c
 create mode 100644 drivers/crypto/mediatek/mtk-platform.c
 create mode 100644 drivers/crypto/mediatek/mtk-platform.h
 create mode 100644 drivers/crypto/mediatek/mtk-regs.h
 create mode 100644 drivers/crypto/mediatek/mtk-sha.c

diff --git a/drivers/crypto/Kconfig b/drivers/crypto/Kconfig
index 4d2b81f..ad0a00b 100644
--- a/drivers/crypto/Kconfig
+++ b/drivers/crypto/Kconfig
@@ -553,6 +553,23 @@ config CRYPTO_DEV_ROCKCHIP
 	  This driver interfaces with the hardware crypto accelerator.
 	  Supporting cbc/ecb chainmode, and aes/des/des3_ede cipher mode.
 
+config CRYPTO_DEV_MEDIATEK
+	tristate "MediaTek's Cryptographic Engine driver"
+	depends on ARM && (ARCH_MEDIATEK || COMPILE_TEST)
+	select NEON
+	select KERNEL_MODE_NEON
+	select ARM_CRYPTO
+	select CRYPTO_AES
+	select CRYPTO_BLKCIPHER
+	select CRYPTO_SHA1_ARM_NEON
+	select CRYPTO_SHA256_ARM
+	select CRYPTO_SHA512_ARM
+	select CRYPTO_HMAC
+	help
+	  This driver allows you to utilize the hardware crypto accelerator
+	  which can be found on the MT7623 MT2701, MT8521p, etc ....
+	  Select this if you want to use it for AES/SHA1/SHA2 algorithms.
+
 source "drivers/crypto/chelsio/Kconfig"
 
 endif # CRYPTO_HW
diff --git a/drivers/crypto/Makefile b/drivers/crypto/Makefile
index ad7250f..272b51a 100644
--- a/drivers/crypto/Makefile
+++ b/drivers/crypto/Makefile
@@ -10,6 +10,7 @@ obj-$(CONFIG_CRYPTO_DEV_IMGTEC_HASH) += img-hash.o
 obj-$(CONFIG_CRYPTO_DEV_IXP4XX) += ixp4xx_crypto.o
 obj-$(CONFIG_CRYPTO_DEV_MV_CESA) += mv_cesa.o
 obj-$(CONFIG_CRYPTO_DEV_MARVELL_CESA) += marvell/
+obj-$(CONFIG_CRYPTO_DEV_MEDIATEK) += mediatek/
 obj-$(CONFIG_CRYPTO_DEV_MXS_DCP) += mxs-dcp.o
 obj-$(CONFIG_CRYPTO_DEV_NIAGARA2) += n2_crypto.o
 n2_crypto-y := n2_core.o n2_asm.o
diff --git a/drivers/crypto/mediatek/Makefile b/drivers/crypto/mediatek/Makefile
new file mode 100644
index 0000000..187be79
--- /dev/null
+++ b/drivers/crypto/mediatek/Makefile
@@ -0,0 +1,2 @@
+obj-$(CONFIG_CRYPTO_DEV_MEDIATEK) += mtk-crypto.o
+mtk-crypto-objs:= mtk-platform.o mtk-aes.o mtk-sha.o
diff --git a/drivers/crypto/mediatek/mtk-aes.c b/drivers/crypto/mediatek/mtk-aes.c
new file mode 100644
index 0000000..0208981
--- /dev/null
+++ b/drivers/crypto/mediatek/mtk-aes.c
@@ -0,0 +1,763 @@
+/*
+ * Cryptographic API.
+ *
+ * Support for MediaTek AES hardware accelerator.
+ *
+ * Copyright (c) 2016 MediaTek Inc.
+ * Author: Ryder Lee <ryder.lee@mediatek.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Some ideas are from atmel-aes.c drivers.
+ */
+
+#include <crypto/aes.h>
+#include <crypto/algapi.h>
+#include <crypto/scatterwalk.h>
+#include <linux/dma-mapping.h>
+#include <linux/scatterlist.h>
+#include "mtk-platform.h"
+#include "mtk-regs.h"
+
+#define AES_QUEUE_LENGTH	512
+#define AES_BUFFER_ORDER	2
+#define AES_BUFFER_SIZE		((PAGE_SIZE << AES_BUFFER_ORDER) \
+				& ~(AES_BLOCK_SIZE - 1))
+
+/* AES command token */
+#define AES_CT_SIZE_ECB		2
+#define AES_CT_SIZE_CBC		3
+#define AES_CT_CTRL_HDR		0x00220000
+#define AES_COMMAND0		0x05000000
+#define AES_COMMAND1		0x2d060000
+#define AES_COMMAND2		0xe4a63806
+
+/* AES transform information */
+#define AES_TFM_ECB		(0x0 << 0)
+#define AES_TFM_CBC		(0x1 << 0)
+#define AES_TFM_DECRYPT		(0x5 << 0)
+#define AES_TFM_ENCRYPT		(0x4 << 0)
+#define AES_TFM_SIZE(x)		((x) << 8)
+#define AES_TFM_128BITS		(0xb << 16)
+#define AES_TFM_192BITS		(0xd << 16)
+#define AES_TFM_256BITS		(0xf << 16)
+#define AES_TFM_FULL_IV		(0xf << 5)
+
+/* AES flags */
+#define AES_FLAGS_MODE_MSK	GENMASK(2, 0)
+#define AES_FLAGS_ECB		BIT(0)
+#define AES_FLAGS_CBC		BIT(1)
+#define AES_FLAGS_ENCRYPT	BIT(2)
+#define AES_FLAGS_BUSY		BIT(3)
+
+/**
+ * AES command token(CT) is a set of hardware instructions that
+ * are used to control crypto engine AES processing flow.
+ */
+struct mtk_aes_ct {
+	u32 ct_ctrl0;
+	u32 ct_ctrl1;
+	u32 ct_ctrl2;
+};
+
+/**
+ * AES transform state(tfm) is use to define AES transform state
+ * and contains all keys and initial vectors.
+ */
+struct mtk_aes_tfm {
+	u32 tfm_ctrl0;
+	u32 tfm_ctrl1;
+	/* store keys and IVs */
+	u8 state[AES_KEYSIZE_256 + AES_BLOCK_SIZE] __aligned(sizeof(u32));
+};
+
+/**
+ * mtk_aes_info consists of command token and transform state of AES,
+ * which should be encapsulated in command and result descriptors.
+ * The packet processing engine requires these information to do:
+ *
+ * - Commands decoding and control of the crypto engine’s data path.
+ * - Coordinating hardware data fetch and store operations.
+ * - Result token construction and output.
+ */
+struct mtk_aes_info {
+	struct mtk_aes_ct ct;
+	struct mtk_aes_tfm tfm;
+};
+
+struct mtk_aes_reqctx {
+	u64 mode;
+};
+
+struct mtk_aes_ctx {
+	struct mtk_cryp *cryp;
+	struct mtk_aes_info info;
+	u32 keylen;
+
+	unsigned long flags;
+};
+
+struct mtk_aes_drv {
+	struct list_head dev_list;
+	/* device list lock */
+	spinlock_t lock;
+};
+
+static struct mtk_aes_drv mtk_aes = {
+	.dev_list = LIST_HEAD_INIT(mtk_aes.dev_list),
+	.lock = __SPIN_LOCK_UNLOCKED(mtk_aes.lock),
+};
+
+static inline u32 mtk_aes_read(struct mtk_cryp *cryp, u32 offset)
+{
+	return readl_relaxed(cryp->base + offset);
+}
+
+static inline void mtk_aes_write(struct mtk_cryp *cryp,
+				 u32 offset, u32 value)
+{
+	writel_relaxed(value, cryp->base + offset);
+}
+
+static struct mtk_cryp *mtk_aes_find_dev(struct mtk_aes_ctx *ctx)
+{
+	struct mtk_cryp *cryp = NULL;
+	struct mtk_cryp *tmp;
+
+	spin_lock_bh(&mtk_aes.lock);
+	if (!ctx->cryp) {
+		list_for_each_entry(tmp, &mtk_aes.dev_list, aes_list) {
+			cryp = tmp;
+			break;
+		}
+		ctx->cryp = cryp;
+	} else {
+		cryp = ctx->cryp;
+	}
+	spin_unlock_bh(&mtk_aes.lock);
+
+	return cryp;
+}
+
+static inline size_t mtk_aes_padlen(size_t len)
+{
+	len &= AES_BLOCK_SIZE - 1;
+	return len ? AES_BLOCK_SIZE - len : 0;
+}
+
+static bool mtk_aes_check_aligned(struct scatterlist *sg,
+				  size_t len, struct mtk_aes_dma *dma)
+{
+	int nents;
+
+	if (!IS_ALIGNED(len, AES_BLOCK_SIZE))
+		return false;
+
+	for (nents = 0; sg; sg = sg_next(sg), ++nents) {
+		if (!IS_ALIGNED(sg->offset, sizeof(u32)))
+			return false;
+
+		if (len <= sg->length) {
+			if (!IS_ALIGNED(len, AES_BLOCK_SIZE))
+				return false;
+
+			dma->nents = nents + 1;
+			dma->remainder = sg->length - len;
+			sg->length = len;
+			return true;
+		}
+
+		if (!IS_ALIGNED(sg->length, AES_BLOCK_SIZE))
+			return false;
+
+		len -= sg->length;
+	}
+
+	return false;
+}
+
+/* Initialize and map transform information of AES */
+static int mtk_aes_info_map(struct mtk_cryp *cryp,
+			    struct mtk_aes *aes, size_t len)
+{
+	struct mtk_aes_ctx *ctx = crypto_ablkcipher_ctx(
+			crypto_ablkcipher_reqtfm(aes->req));
+	struct mtk_aes_info *info = aes->info;
+	struct mtk_aes_ct *ct = &info->ct;
+	struct mtk_aes_tfm *tfm = &info->tfm;
+	u32 keylen = ctx->keylen;
+
+	aes->ct_hdr = AES_CT_CTRL_HDR | len;
+	ct->ct_ctrl0 = AES_COMMAND0 | len;
+	ct->ct_ctrl1 = AES_COMMAND1;
+
+	if (aes->flags & AES_FLAGS_ENCRYPT)
+		tfm->tfm_ctrl0 = AES_TFM_ENCRYPT;
+	else
+		tfm->tfm_ctrl0 = AES_TFM_DECRYPT;
+
+	if (aes->flags & AES_FLAGS_CBC) {
+		aes->ct_size = AES_CT_SIZE_CBC;
+		ct->ct_ctrl2 = AES_COMMAND2;
+
+		tfm->tfm_ctrl0 |=
+			AES_TFM_SIZE(SIZE_IN_WORDS(keylen + AES_BLOCK_SIZE));
+		tfm->tfm_ctrl1 = AES_TFM_CBC;
+		tfm->tfm_ctrl1 |= AES_TFM_FULL_IV;
+
+		memcpy(tfm->state + keylen, aes->req->info, AES_BLOCK_SIZE);
+	} else if (aes->flags & AES_FLAGS_ECB) {
+		aes->ct_size = AES_CT_SIZE_ECB;
+		tfm->tfm_ctrl0 |= AES_TFM_SIZE(SIZE_IN_WORDS(keylen));
+		tfm->tfm_ctrl1 = AES_TFM_ECB;
+	}
+
+	if (keylen == AES_KEYSIZE_128)
+		tfm->tfm_ctrl0 |= AES_TFM_128BITS;
+	else if (keylen == AES_KEYSIZE_256)
+		tfm->tfm_ctrl0 |= AES_TFM_256BITS;
+	else if (keylen == AES_KEYSIZE_192)
+		tfm->tfm_ctrl0 |= AES_TFM_192BITS;
+
+	aes->ct_dma = dma_map_single(cryp->dev, info, sizeof(*info),
+					DMA_TO_DEVICE);
+	if (unlikely(dma_mapping_error(cryp->dev, aes->ct_dma))) {
+		dev_err(cryp->dev, "dma %d bytes error\n", sizeof(*info));
+		return -EINVAL;
+	}
+	aes->tfm_dma = aes->ct_dma + sizeof(*ct);
+
+	return 0;
+}
+
+static int mtk_aes_xmit(struct mtk_cryp *cryp, struct mtk_aes *aes)
+{
+	struct mtk_ring *ring = cryp->ring[aes->id];
+	struct mtk_desc *cmd = NULL, *res = NULL;
+	struct scatterlist *ssg, *dsg;
+	u32 len = aes->src.sg_len;
+	int nents;
+
+	/* Fill command and result descriptors */
+	for (nents = 0; nents < len; ++nents) {
+		ssg = &aes->src.sg[nents];
+		dsg = &aes->dst.sg[nents];
+
+		cmd = ring->cmd_base + ring->pos;
+		res = ring->res_base + ring->pos;
+
+		res->hdr = MTK_DESC_BUF_LEN(dsg->length);
+		res->buf = sg_dma_address(dsg);
+
+		cmd->hdr = MTK_DESC_BUF_LEN(ssg->length);
+		cmd->buf = sg_dma_address(ssg);
+
+		if (nents == 0) {
+			res->hdr |= MTK_DESC_FIRST;
+			cmd->hdr |= MTK_DESC_FIRST;
+			cmd->hdr |= MTK_DESC_CT_LEN(aes->ct_size);
+			cmd->ct = aes->ct_dma;
+			cmd->ct_hdr = aes->ct_hdr;
+			cmd->tfm = aes->tfm_dma;
+		}
+
+		if (++ring->pos == MTK_MAX_DESC_NUM)
+			ring->pos = 0;
+	}
+
+	cmd->hdr |= MTK_DESC_LAST;
+	res->hdr |= MTK_DESC_LAST;
+
+	/*
+	 * make sure that all changes to the dma ring are done before we
+	 * start engine.
+	 */
+	wmb();
+	/* Start DMA transfer */
+	mtk_aes_write(cryp, RDR_PREP_COUNT(aes->id), MTK_DESC_CNT(len));
+	mtk_aes_write(cryp, CDR_PREP_COUNT(aes->id), MTK_DESC_CNT(len));
+
+	return -EINPROGRESS;
+}
+
+static inline void mtk_aes_restore_sg(const struct mtk_aes_dma *dma)
+{
+	struct scatterlist *sg = dma->sg;
+	int nents = dma->nents;
+
+	if (!dma->remainder)
+		return;
+
+	while (--nents > 0 && sg)
+		sg = sg_next(sg);
+
+	if (!sg)
+		return;
+
+	sg->length += dma->remainder;
+}
+
+static int mtk_aes_map(struct mtk_cryp *cryp, struct mtk_aes *aes)
+{
+	struct scatterlist *src = aes->req->src;
+	struct scatterlist *dst = aes->req->dst;
+	size_t len = aes->req->nbytes;
+	size_t padlen = 0;
+	bool src_aligned, dst_aligned;
+
+	aes->total = len;
+	aes->src.sg = src;
+	aes->dst.sg = dst;
+	aes->real_dst = dst;
+
+	src_aligned = mtk_aes_check_aligned(src, len, &aes->src);
+	if (src == dst)
+		dst_aligned = src_aligned;
+	else
+		dst_aligned = mtk_aes_check_aligned(dst, len, &aes->dst);
+
+	if (!src_aligned || !dst_aligned) {
+		padlen = mtk_aes_padlen(len);
+
+		if (len + padlen > AES_BUFFER_SIZE)
+			return -ENOMEM;
+
+		if (!src_aligned) {
+			sg_copy_to_buffer(src, sg_nents(src), aes->buf, len);
+			aes->src.sg = &aes->aligned_sg;
+			aes->src.nents = 1;
+			aes->src.remainder = 0;
+		}
+
+		if (!dst_aligned) {
+			aes->dst.sg = &aes->aligned_sg;
+			aes->dst.nents = 1;
+			aes->dst.remainder = 0;
+		}
+
+		sg_init_table(&aes->aligned_sg, 1);
+		sg_set_buf(&aes->aligned_sg, aes->buf, len + padlen);
+	}
+
+	if (aes->src.sg == aes->dst.sg) {
+		aes->src.sg_len = dma_map_sg(cryp->dev, aes->src.sg,
+				aes->src.nents, DMA_BIDIRECTIONAL);
+		aes->dst.sg_len = aes->src.sg_len;
+		if (unlikely(!aes->src.sg_len))
+			return -EFAULT;
+	} else {
+		aes->src.sg_len = dma_map_sg(cryp->dev, aes->src.sg,
+				aes->src.nents, DMA_TO_DEVICE);
+		if (unlikely(!aes->src.sg_len))
+			return -EFAULT;
+
+		aes->dst.sg_len = dma_map_sg(cryp->dev, aes->dst.sg,
+				aes->dst.nents, DMA_FROM_DEVICE);
+		if (unlikely(!aes->dst.sg_len)) {
+			dma_unmap_sg(cryp->dev, aes->src.sg,
+				     aes->src.nents, DMA_TO_DEVICE);
+			return -EFAULT;
+		}
+	}
+
+	return mtk_aes_info_map(cryp, aes, len + padlen);
+}
+
+static int mtk_aes_handle_queue(struct mtk_cryp *cryp, u8 id,
+				struct ablkcipher_request *req)
+{
+	struct mtk_aes *aes = cryp->aes[id];
+	struct crypto_async_request *areq, *backlog;
+	struct mtk_aes_reqctx *rctx;
+	struct mtk_aes_ctx *ctx;
+	unsigned long flags;
+	int err, ret = 0;
+
+	spin_lock_irqsave(&aes->lock, flags);
+	if (req)
+		ret = ablkcipher_enqueue_request(&aes->queue, req);
+	if (aes->flags & AES_FLAGS_BUSY) {
+		spin_unlock_irqrestore(&aes->lock, flags);
+		return ret;
+	}
+	backlog = crypto_get_backlog(&aes->queue);
+	areq = crypto_dequeue_request(&aes->queue);
+	if (areq)
+		aes->flags |= AES_FLAGS_BUSY;
+	spin_unlock_irqrestore(&aes->lock, flags);
+
+	if (!areq)
+		return ret;
+
+	if (backlog)
+		backlog->complete(backlog, -EINPROGRESS);
+
+	req = ablkcipher_request_cast(areq);
+	ctx = crypto_ablkcipher_ctx(crypto_ablkcipher_reqtfm(req));
+	rctx = ablkcipher_request_ctx(req);
+	rctx->mode &= AES_FLAGS_MODE_MSK;
+	/* assign new request to device */
+	aes->req = req;
+	aes->info = &ctx->info;
+	aes->flags = (aes->flags & ~AES_FLAGS_MODE_MSK) | rctx->mode;
+
+	err = mtk_aes_map(cryp, aes);
+	if (err)
+		return err;
+
+	return mtk_aes_xmit(cryp, aes);
+}
+
+static void mtk_aes_unmap(struct mtk_cryp *cryp, struct mtk_aes *aes)
+{
+	dma_unmap_single(cryp->dev, aes->ct_dma,
+			 sizeof(struct mtk_aes_info), DMA_TO_DEVICE);
+
+	if (aes->src.sg == aes->dst.sg) {
+		dma_unmap_sg(cryp->dev, aes->src.sg,
+			     aes->src.nents, DMA_BIDIRECTIONAL);
+
+		if (aes->src.sg != &aes->aligned_sg)
+			mtk_aes_restore_sg(&aes->src);
+	} else {
+		dma_unmap_sg(cryp->dev, aes->dst.sg,
+			     aes->dst.nents, DMA_FROM_DEVICE);
+
+		if (aes->dst.sg != &aes->aligned_sg)
+			mtk_aes_restore_sg(&aes->dst);
+
+		dma_unmap_sg(cryp->dev, aes->src.sg,
+			     aes->src.nents, DMA_TO_DEVICE);
+
+		if (aes->src.sg != &aes->aligned_sg)
+			mtk_aes_restore_sg(&aes->src);
+	}
+
+	if (aes->dst.sg == &aes->aligned_sg)
+		sg_copy_from_buffer(aes->real_dst,
+				    sg_nents(aes->real_dst),
+				    aes->buf, aes->total);
+}
+
+static inline void mtk_aes_complete(struct mtk_cryp *cryp,
+				    struct mtk_aes *aes)
+{
+	aes->flags &= ~AES_FLAGS_BUSY;
+
+	aes->req->base.complete(&aes->req->base, 0);
+
+	/* handle new request */
+	mtk_aes_handle_queue(cryp, aes->id, NULL);
+}
+
+/* Check and set the AES key to transform state's buffer */
+static int mtk_aes_setkey(struct crypto_ablkcipher *tfm,
+			  const u8 *key, u32 keylen)
+{
+	struct mtk_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm);
+	u8 *state = ctx->info.tfm.state;
+
+	if (keylen != AES_KEYSIZE_128 &&
+	    keylen != AES_KEYSIZE_192 &&
+	    keylen != AES_KEYSIZE_256) {
+		crypto_ablkcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
+		return -EINVAL;
+	}
+
+	ctx->keylen = keylen;
+	memcpy(state, key, keylen);
+
+	return 0;
+}
+
+static int mtk_aes_crypt(struct ablkcipher_request *req, u64 mode)
+{
+	struct mtk_aes_ctx *ctx = crypto_ablkcipher_ctx(
+			crypto_ablkcipher_reqtfm(req));
+	struct mtk_aes_reqctx *rctx = ablkcipher_request_ctx(req);
+
+	rctx->mode = mode;
+
+	return mtk_aes_handle_queue(ctx->cryp,
+			!(mode & AES_FLAGS_ENCRYPT), req);
+}
+
+static int mtk_ecb_encrypt(struct ablkcipher_request *req)
+{
+	return mtk_aes_crypt(req, AES_FLAGS_ENCRYPT | AES_FLAGS_ECB);
+}
+
+static int mtk_ecb_decrypt(struct ablkcipher_request *req)
+{
+	return mtk_aes_crypt(req, AES_FLAGS_ECB);
+}
+
+static int mtk_cbc_encrypt(struct ablkcipher_request *req)
+{
+	return mtk_aes_crypt(req, AES_FLAGS_ENCRYPT | AES_FLAGS_CBC);
+}
+
+static int mtk_cbc_decrypt(struct ablkcipher_request *req)
+{
+	return mtk_aes_crypt(req, AES_FLAGS_CBC);
+}
+
+static int mtk_aes_cra_init(struct crypto_tfm *tfm)
+{
+	struct mtk_aes_ctx *ctx = crypto_tfm_ctx(tfm);
+	struct mtk_cryp *cryp = NULL;
+
+	tfm->crt_ablkcipher.reqsize = sizeof(struct mtk_aes_reqctx);
+
+	cryp = mtk_aes_find_dev(ctx);
+	if (!cryp) {
+		pr_err("can't find crypto device\n");
+		return -ENODEV;
+	}
+
+	return 0;
+}
+
+static struct crypto_alg aes_algs[] = {
+{
+	.cra_name		=	"cbc(aes)",
+	.cra_driver_name	=	"cbc-aes-mtk",
+	.cra_priority		=	400,
+	.cra_flags		=	CRYPTO_ALG_TYPE_ABLKCIPHER |
+						CRYPTO_ALG_ASYNC,
+	.cra_init		=	mtk_aes_cra_init,
+	.cra_blocksize		=	AES_BLOCK_SIZE,
+	.cra_ctxsize		=	sizeof(struct mtk_aes_ctx),
+	.cra_alignmask		=	15,
+	.cra_type		=	&crypto_ablkcipher_type,
+	.cra_module		=	THIS_MODULE,
+	.cra_u.ablkcipher	=	{
+		.min_keysize	=	AES_MIN_KEY_SIZE,
+		.max_keysize	=	AES_MAX_KEY_SIZE,
+		.setkey		=	mtk_aes_setkey,
+		.encrypt	=	mtk_cbc_encrypt,
+		.decrypt	=	mtk_cbc_decrypt,
+		.ivsize		=	AES_BLOCK_SIZE,
+	}
+},
+{
+	.cra_name		=	"ecb(aes)",
+	.cra_driver_name	=	"ecb-aes-mtk",
+	.cra_priority		=	400,
+	.cra_flags		=	CRYPTO_ALG_TYPE_ABLKCIPHER |
+						CRYPTO_ALG_ASYNC,
+	.cra_init		=	mtk_aes_cra_init,
+	.cra_blocksize		=	AES_BLOCK_SIZE,
+	.cra_ctxsize		=	sizeof(struct mtk_aes_ctx),
+	.cra_alignmask		=	15,
+	.cra_type		=	&crypto_ablkcipher_type,
+	.cra_module		=	THIS_MODULE,
+	.cra_u.ablkcipher	=	{
+		.min_keysize	=	AES_MIN_KEY_SIZE,
+		.max_keysize	=	AES_MAX_KEY_SIZE,
+		.setkey		=	mtk_aes_setkey,
+		.encrypt	=	mtk_ecb_encrypt,
+		.decrypt	=	mtk_ecb_decrypt,
+	}
+},
+};
+
+static void mtk_aes_enc_task(unsigned long data)
+{
+	struct mtk_cryp *cryp = (struct mtk_cryp *)data;
+	struct mtk_aes *aes = cryp->aes[0];
+
+	mtk_aes_unmap(cryp, aes);
+	mtk_aes_complete(cryp, aes);
+}
+
+static void mtk_aes_dec_task(unsigned long data)
+{
+	struct mtk_cryp *cryp = (struct mtk_cryp *)data;
+	struct mtk_aes *aes = cryp->aes[1];
+
+	mtk_aes_unmap(cryp, aes);
+	mtk_aes_complete(cryp, aes);
+}
+
+static irqreturn_t mtk_aes_enc_irq(int irq, void *dev_id)
+{
+	struct mtk_cryp *cryp = (struct mtk_cryp *)dev_id;
+	struct mtk_aes *aes = cryp->aes[0];
+	u32 val = mtk_aes_read(cryp, RDR_STAT(RING0));
+
+	mtk_aes_write(cryp, RDR_STAT(RING0), val);
+
+	if (likely(AES_FLAGS_BUSY & aes->flags)) {
+		mtk_aes_write(cryp, RDR_PROC_COUNT(RING0), MTK_DESC_CNT_CLR);
+		mtk_aes_write(cryp, RDR_THRESH(RING0), MTK_RDR_THRESH_DEF);
+
+		tasklet_schedule(&aes->task);
+	} else {
+		dev_warn(cryp->dev, "AES interrupt when no active requests.\n");
+	}
+	return IRQ_HANDLED;
+}
+
+static irqreturn_t mtk_aes_dec_irq(int irq, void *dev_id)
+{
+	struct mtk_cryp *cryp = (struct mtk_cryp *)dev_id;
+	struct mtk_aes *aes = cryp->aes[1];
+	u32 val = mtk_aes_read(cryp, RDR_STAT(RING1));
+
+	mtk_aes_write(cryp, RDR_STAT(RING1), val);
+
+	if (likely(AES_FLAGS_BUSY & aes->flags)) {
+		mtk_aes_write(cryp, RDR_PROC_COUNT(RING1), MTK_DESC_CNT_CLR);
+		mtk_aes_write(cryp, RDR_THRESH(RING1), MTK_RDR_THRESH_DEF);
+
+		tasklet_schedule(&aes->task);
+	} else {
+		dev_warn(cryp->dev, "AES interrupt when no active requests.\n");
+	}
+	return IRQ_HANDLED;
+}
+
+/*
+ * The purpose of creating encryption and decryption records is
+ * to process outbound/inbound data in parallel, it can improve
+ * performance in most use cases, such as IPSec VPN, especially
+ * under heavy network traffic.
+ */
+static int mtk_aes_record_init(struct mtk_cryp *cryp)
+{
+	struct mtk_aes **aes = cryp->aes;
+	int i, err = -ENOMEM;
+
+	for (i = 0; i < RECORD_NUM; i++) {
+		aes[i] = kzalloc(sizeof(**aes), GFP_KERNEL);
+		if (!aes[i])
+			goto err_cleanup;
+
+		aes[i]->buf = (void *)__get_free_pages(GFP_KERNEL,
+						AES_BUFFER_ORDER);
+		if (!aes[i]->buf)
+			goto err_cleanup;
+
+		aes[i]->id = i;
+
+		spin_lock_init(&aes[i]->lock);
+		crypto_init_queue(&aes[i]->queue, AES_QUEUE_LENGTH);
+	}
+
+	tasklet_init(&aes[0]->task, mtk_aes_enc_task, (unsigned long)cryp);
+	tasklet_init(&aes[1]->task, mtk_aes_dec_task, (unsigned long)cryp);
+
+	return 0;
+
+err_cleanup:
+	for (; i--; ) {
+		free_page((unsigned long)aes[i]->buf);
+		kfree(aes[i]);
+	}
+
+	return err;
+}
+
+static void mtk_aes_record_free(struct mtk_cryp *cryp)
+{
+	int i;
+
+	for (i = 0; i < RECORD_NUM; i++) {
+		tasklet_kill(&cryp->aes[i]->task);
+		free_page((unsigned long)cryp->aes[i]->buf);
+		kfree(cryp->aes[i]);
+	}
+}
+
+static void mtk_aes_unregister_algs(void)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(aes_algs); i++)
+		crypto_unregister_alg(&aes_algs[i]);
+}
+
+static int mtk_aes_register_algs(void)
+{
+	int err, i, j;
+
+	for (i = 0; i < ARRAY_SIZE(aes_algs); i++) {
+		err = crypto_register_alg(&aes_algs[i]);
+		if (err)
+			goto err_aes_algs;
+	}
+
+	return 0;
+
+err_aes_algs:
+	for (j = 0; j < i; j++)
+		crypto_unregister_alg(&aes_algs[j]);
+
+	return err;
+}
+
+int mtk_cipher_alg_register(struct mtk_cryp *cryp)
+{
+	int ret;
+
+	INIT_LIST_HEAD(&cryp->aes_list);
+
+	/* Initialize two cipher records */
+	ret = mtk_aes_record_init(cryp);
+	if (ret)
+		goto err_record;
+
+	/* Ring0 irq is use by encryption record */
+	ret = devm_request_irq(cryp->dev, cryp->irq[RING0], mtk_aes_enc_irq,
+			       IRQF_TRIGGER_LOW, "mtk-aes", cryp);
+	if (ret) {
+		dev_err(cryp->dev, "unable to request AES encryption irq.\n");
+		goto err_res;
+	}
+
+	/* Ring1 irq is use by decryption record */
+	ret = devm_request_irq(cryp->dev, cryp->irq[RING1], mtk_aes_dec_irq,
+			       IRQF_TRIGGER_LOW, "mtk-aes", cryp);
+	if (ret) {
+		dev_err(cryp->dev, "unable to request AES decryption irq.\n");
+		goto err_res;
+	}
+
+	/* Enable ring0 and ring1 interrupt */
+	mtk_aes_write(cryp, AIC_ENABLE_SET(RING0), MTK_IRQ_RDR0);
+	mtk_aes_write(cryp, AIC_ENABLE_SET(RING1), MTK_IRQ_RDR1);
+
+	spin_lock(&mtk_aes.lock);
+	list_add_tail(&cryp->aes_list, &mtk_aes.dev_list);
+	spin_unlock(&mtk_aes.lock);
+
+	ret = mtk_aes_register_algs();
+	if (ret)
+		goto err_algs;
+
+	return 0;
+
+err_algs:
+	spin_lock(&mtk_aes.lock);
+	list_del(&cryp->aes_list);
+	spin_unlock(&mtk_aes.lock);
+err_res:
+	mtk_aes_record_free(cryp);
+err_record:
+
+	dev_err(cryp->dev, "mtk-aes initialization failed.\n");
+	return ret;
+}
+
+void mtk_cipher_alg_release(struct mtk_cryp *cryp)
+{
+	spin_lock(&mtk_aes.lock);
+	list_del(&cryp->aes_list);
+	spin_unlock(&mtk_aes.lock);
+
+	mtk_aes_unregister_algs();
+	mtk_aes_record_free(cryp);
+}
diff --git a/drivers/crypto/mediatek/mtk-platform.c b/drivers/crypto/mediatek/mtk-platform.c
new file mode 100644
index 0000000..25025fe
--- /dev/null
+++ b/drivers/crypto/mediatek/mtk-platform.c
@@ -0,0 +1,580 @@
+/*
+ * Support for MediaTek cryptographic accelerator.
+ *
+ * Copyright (c) 2016 MediaTek Inc.
+ * Author: Ryder Lee <ryder.lee@mediatek.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License.
+ *
+ */
+
+#include <linux/clk.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/mfd/syscon.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/pm_runtime.h>
+#include "mtk-platform.h"
+#include "mtk-regs.h"
+
+#define MTK_BURST_SIZE(x, y)		(((x) & ~0xf0) | ((y) << 4))
+#define MTK_DESC_SIZE_SET(x)		((x) << 0)
+#define MTK_DESC_OFFSET_SET(x)		((x) << 16)
+#define MTK_DFSE_RING_ID(x)		(((x) >> 12) & 0xf)
+#define MTK_DSE_MIN_DATA(x)		((x) << 0)
+#define MTK_DSE_MAX_DATA(x)		((x) << 8)
+#define MTK_DFE_MIN_DATA(x)		((x) << 0)
+#define MTK_DFE_MAX_DATA(x)		((x) << 8)
+#define MTK_DFE_MIN_CTRL(x)		((x) << 16)
+#define MTK_DFE_MAX_CTRL(x)		((x) << 24)
+#define MTK_FETCH_SIZE_SET(x)		((x) << 0)
+#define MTK_FETCH_THRESH_SET(x)		((x) << 16)
+#define MTK_IN_BUF_MIN_THRESH(x)	((x) << 8)
+#define MTK_IN_BUF_MAX_THRESH(x)	((x) << 12)
+#define MTK_OUT_BUF_MIN_THRESH(x)	((x) << 0)
+#define MTK_OUT_BUF_MAX_THRESH(x)	((x) << 4)
+#define MTK_CMD_FIFO_SIZE(x)		(((x) >> 8) & 0xf)
+#define MTK_RES_FIFO_SIZE(x)		(((x) >> 12) & 0xf)
+#define MTK_HIA_DATA_WIDTH(x)		(((x) >> 25) & 0x3)
+#define MTK_HIA_DMA_LENGTH(x)		(((x) >> 20) & 0x1f)
+#define MTK_IN_TBUF_SIZE(x)		(((x) >> 4) & 0xf)
+#define MTK_IN_DBUF_SIZE(x)		(((x) >> 8) & 0xf)
+#define MTK_OUT_DBUF_SIZE(x)		(((x) >> 16) & 0xf)
+#define MTK_AIC_INT_NUM(x)		((x) & 0x3f)
+#define MTK_AIC_VER_GET(x)		((x) & 0x0ff0ffff)
+#define MTK_PE_TOKEN_CTRL_DEF		0x00014004
+#define MTK_PE_INT_CTRL_DEF		0xc00f400f
+#define MTK_PRNG_CTRL_EN		BIT(0)
+#define MTK_PRNG_CTRL_AUTO		BIT(1)
+#define MTK_TOKEN_TIMEOUT_EN		BIT(22)
+#define MTK_OVL_IRQ_EN			BIT(25)
+#define MTK_ATP_PRESENT			BIT(30)
+#define MTK_DFSE_THR_CTRL_EN		BIT(30)
+#define MTK_DFSE_THR_CTRL_RESET		BIT(31)
+#define MTK_HIA_SIGNATURE		((u16)0x35ca)
+#define MTK_CDR_STAT_CLR		0x1f
+#define MTK_RDR_STAT_CLR		0xff
+#define MTK_AIC_VER11			0x011036C9
+#define MTK_AIC_VER12			0x012036C9
+#define MTK_AIC_GLOBAL_CLR		0x7FF00000
+#define MTK_DFSE_IDLE			0xf
+
+/**
+ * This engine is an integrated security subsystem to accelerate
+ * cryptographic functions and protocols to off-load the host processor.
+ *
+ * Hardware modules are briefly introduced below:
+ *
+ * Host Interface Adapter(HIA) - the main interface between the host
+ * system and the hardware subsystem. It is responsible for attaching
+ * processing engine to the specific host bus interface and provides a
+ * standardized software view for off loading tasks to the engine.
+ *
+ * Command Descriptor Ring Manager(CDR Manager) - keeps track of how many
+ * CD the host has prepared in the CDR. It monitors the fill level of its
+ * CD-FIFO and if there's sufficient space for the next block of descriptors,
+ * then it fires off a DMA request to fetch a block of CDs.
+ *
+ * Data fetch engine(DFE) - It is responsible for parsing the CD and
+ * setting up the required control and packet data DMA transfers from
+ * system memory to the processing engine.
+ *
+ * Result Descriptor Ring Manager(RDR Manager) - same as CDR Manager,
+ * but target is result descriptors, Moreover, it also handles the RD
+ * updates under control of the DSE. For each packet data segment
+ * processed, the DSE triggers the RDR Manager to write the updated RD.
+ * If triggered to update, the RDR Manager sets up a DMA operation to
+ * copy the RD from the DSE to the correct location in the RDR.
+ *
+ * Data Store Engine(DSE) - It is responsible for parsing the prepared RD
+ * and setting up the required control and packet data DMA transfers from
+ * the processing engine to system memory.
+ *
+ * Advanced Interrupt Controllers(AICs) - receive interrupt request signals
+ * from various sources and combine them into one interrupt output. The AICs
+ * are use by:
+ * - One for the HIA global and processing engine interrupts.
+ * - The others for the descriptor ring interrupts.
+ */
+
+/* Cryptographic engine capabilities */
+struct mtk_sys_cap {
+	/* host interface adapter */
+	u32 hia_ver;
+	u32 hia_opt;
+	/* packet engine */
+	u32 pkt_eng_opt;
+	/* global hardware */
+	u32 hw_opt;
+};
+
+static void mtk_desc_ring_link(struct mtk_cryp *cryp, u32 mask)
+{
+	/* Assign rings to DFE/DSE thread and enable it */
+	writel(MTK_DFSE_THR_CTRL_EN | mask, cryp->base + DFE_THR_CTRL);
+	writel(MTK_DFSE_THR_CTRL_EN | mask, cryp->base + DSE_THR_CTRL);
+}
+
+static void mtk_dfe_dse_buf_setup(struct mtk_cryp *cryp,
+				  struct mtk_sys_cap *cap)
+{
+	u32 width = MTK_HIA_DATA_WIDTH(cap->hia_opt) + 2;
+	u32 len = MTK_HIA_DMA_LENGTH(cap->hia_opt) - 1;
+	u32 ipbuf = min(MTK_IN_DBUF_SIZE(cap->hw_opt) + width, len);
+	u32 opbuf = min(MTK_OUT_DBUF_SIZE(cap->hw_opt) + width, len);
+	u32 itbuf = min(MTK_IN_TBUF_SIZE(cap->hw_opt) + width, len);
+	u32 val;
+
+	val = MTK_DFE_MIN_DATA(ipbuf - 1) | MTK_DFE_MAX_DATA(ipbuf) |
+		MTK_DFE_MIN_CTRL(itbuf - 1) | MTK_DFE_MAX_CTRL(itbuf);
+	writel(val, cryp->base + DFE_CFG);
+
+	val = MTK_DFE_MIN_DATA(opbuf - 1) | MTK_DFE_MAX_DATA(opbuf);
+	writel(val, cryp->base + DSE_CFG);
+
+	val = MTK_IN_BUF_MIN_THRESH(ipbuf - 1) | MTK_IN_BUF_MAX_THRESH(ipbuf);
+	writel(val, cryp->base + PE_IN_DBUF_THRESH);
+
+	val = MTK_IN_BUF_MIN_THRESH(itbuf - 1) | MTK_IN_BUF_MAX_THRESH(itbuf);
+	writel(val, cryp->base + PE_IN_TBUF_THRESH);
+
+	val = MTK_OUT_BUF_MIN_THRESH(opbuf - 1) | MTK_OUT_BUF_MAX_THRESH(opbuf);
+	writel(val, cryp->base + PE_OUT_DBUF_THRESH);
+
+	writel(0, cryp->base + PE_OUT_TBUF_THRESH);
+	writel(0, cryp->base + PE_OUT_BUF_CTRL);
+}
+
+static int mtk_dfe_dse_state_check(struct mtk_cryp *cryp)
+{
+	int ret = -EINVAL;
+	u32 val;
+
+	/* Check for completion of all DMA transfers */
+	val = readl(cryp->base + DFE_THR_STAT);
+	if (MTK_DFSE_RING_ID(val) == MTK_DFSE_IDLE) {
+		val = readl(cryp->base + DSE_THR_STAT);
+		if (MTK_DFSE_RING_ID(val) == MTK_DFSE_IDLE)
+			ret = 0;
+	}
+
+	if (!ret) {
+		/* Take DFE/DSE thread out of reset */
+		writel(0, cryp->base + DFE_THR_CTRL);
+		writel(0, cryp->base + DSE_THR_CTRL);
+	} else {
+		return -EBUSY;
+	}
+
+	return 0;
+}
+
+static int mtk_dfe_dse_reset(struct mtk_cryp *cryp)
+{
+	int err;
+
+	/* Reset DSE/DFE and correct system priorities for all rings. */
+	writel(MTK_DFSE_THR_CTRL_RESET, cryp->base + DFE_THR_CTRL);
+	writel(0, cryp->base + DFE_PRIO_0);
+	writel(0, cryp->base + DFE_PRIO_1);
+	writel(0, cryp->base + DFE_PRIO_2);
+	writel(0, cryp->base + DFE_PRIO_3);
+
+	writel(MTK_DFSE_THR_CTRL_RESET, cryp->base + DSE_THR_CTRL);
+	writel(0, cryp->base + DSE_PRIO_0);
+	writel(0, cryp->base + DSE_PRIO_1);
+	writel(0, cryp->base + DSE_PRIO_2);
+	writel(0, cryp->base + DSE_PRIO_3);
+
+	err = mtk_dfe_dse_state_check(cryp);
+	if (err)
+		return err;
+
+	return 0;
+}
+
+static void mtk_cmd_desc_ring_setup(struct mtk_cryp *cryp,
+				    int i, struct mtk_sys_cap *cap)
+{
+	/* Full descriptor that fits FIFO minus one */
+	u32 count =
+		((1 << MTK_CMD_FIFO_SIZE(cap->hia_opt)) / MTK_DESC_SIZE) - 1;
+	u32 size = count * MTK_DESC_OFFSET;
+	u32 thresh = count * MTK_DESC_SIZE;
+	u32 val;
+
+	/* Temporarily disable external triggering */
+	writel(0, cryp->base + CDR_CFG(i));
+
+	/* Clear CDR count */
+	writel(MTK_DESC_CNT_CLR, cryp->base + CDR_PREP_COUNT(i));
+	writel(MTK_DESC_CNT_CLR, cryp->base + CDR_PROC_COUNT(i));
+
+	writel(0, cryp->base + CDR_PREP_PNTR(i));
+	writel(0, cryp->base + CDR_PROC_PNTR(i));
+	writel(0, cryp->base + CDR_DMA_CFG(i));
+
+	/* Configure command ring host address space */
+	writel(0, cryp->base + CDR_BASE_ADDR_HI(i));
+	writel(cryp->ring[i]->cmd_dma, cryp->base + CDR_BASE_ADDR_LO(i));
+
+	writel(MTK_MAX_RING_SIZE, cryp->base + CDR_RING_SIZE(i));
+
+	/* Clear and disable all CDR interrupts */
+	writel(MTK_CDR_STAT_CLR, cryp->base + CDR_STAT(i));
+
+	/*
+	 * Set command descriptor offset and enable additional
+	 * token present in descriptor.
+	 */
+	val = MTK_DESC_SIZE_SET(MTK_DESC_SIZE) |
+		MTK_DESC_OFFSET_SET(MTK_DESC_OFFSET) |
+		MTK_ATP_PRESENT;
+	writel(val, cryp->base + CDR_DESC_SIZE(i));
+
+	val = MTK_FETCH_SIZE_SET(size) | MTK_FETCH_THRESH_SET(thresh);
+	writel(val, cryp->base + CDR_CFG(i));
+}
+
+static void mtk_res_desc_ring_setup(struct mtk_cryp *cryp,
+				    int i, struct mtk_sys_cap *cap)
+{
+	u32 rndup = 2;
+	u32 count = ((1 << MTK_RES_FIFO_SIZE(cap->hia_opt)) / rndup) - 1;
+	u32 size = count * MTK_DESC_OFFSET;
+	u32 thresh = count * rndup;
+	u32 val;
+
+	writel(0, cryp->base + RDR_CFG(i));
+
+	writel(MTK_DESC_CNT_CLR, cryp->base + RDR_PREP_COUNT(i));
+	writel(MTK_DESC_CNT_CLR, cryp->base + RDR_PROC_COUNT(i));
+
+	writel(0, cryp->base + RDR_PREP_PNTR(i));
+	writel(0, cryp->base + RDR_PROC_PNTR(i));
+	writel(0, cryp->base + RDR_DMA_CFG(i));
+
+	writel(0, cryp->base + RDR_BASE_ADDR_HI(i));
+	writel(cryp->ring[i]->res_dma, cryp->base + RDR_BASE_ADDR_LO(i));
+
+	writel(MTK_MAX_RING_SIZE, cryp->base + RDR_RING_SIZE(i));
+	writel(MTK_RDR_STAT_CLR, cryp->base + RDR_STAT(i));
+
+	/*
+	 * RDR manager generates update interrupts on a per-completed-packet,
+	 * and the rd_proc_thresh_irq interrupt is fired when proc_pkt_count
+	 * for the RDR exceeds the number of packets.
+	 */
+	writel(MTK_RDR_THRESH_DEF, cryp->base + RDR_THRESH(i));
+
+	/*
+	 * Configure a threshold and time-out value for the processed
+	 * result descriptors (or complete packets) that are written to
+	 * the RDR.
+	 */
+	val = MTK_DESC_SIZE_SET(MTK_DESC_SIZE) |
+		MTK_DESC_OFFSET_SET(MTK_DESC_OFFSET);
+	writel(val, cryp->base + RDR_DESC_SIZE(i));
+
+	/*
+	 * Configure HIA fetch size and fetch threshold that are used to
+	 * fetch blocks of multiple descriptors.
+	 */
+	val = MTK_FETCH_SIZE_SET(size) |
+		MTK_FETCH_THRESH_SET(thresh) |
+		MTK_OVL_IRQ_EN;
+	writel(val, cryp->base + RDR_CFG(i));
+}
+
+static int mtk_packet_engine_setup(struct mtk_cryp *cryp)
+{
+	struct mtk_sys_cap cap;
+	int i, err;
+	u32 val;
+
+	cap.hia_ver = readl(cryp->base + HIA_VERSION);
+	cap.hia_opt = readl(cryp->base + HIA_OPTIONS);
+	cap.hw_opt = readl(cryp->base + EIP97_OPTIONS);
+
+	if (!(((u16)cap.hia_ver) == MTK_HIA_SIGNATURE))
+		return -EINVAL;
+
+	/* Configure endianness conversion method for master (DMA) interface */
+	writel(0, cryp->base + EIP97_MST_CTRL);
+
+	/* Set HIA burst size */
+	val = readl(cryp->base + HIA_MST_CTRL);
+	writel(MTK_BURST_SIZE(val, 5), cryp->base + HIA_MST_CTRL);
+
+	err = mtk_dfe_dse_reset(cryp);
+	if (err) {
+		dev_err(cryp->dev, "Failed to reset DFE and DSE.\n");
+		return err;
+	}
+
+	mtk_dfe_dse_buf_setup(cryp, &cap);
+
+	/* Enable the 4 rings for the packet engines. */
+	mtk_desc_ring_link(cryp, 0xf);
+
+	for (i = 0; i < RING_MAX; i++) {
+		mtk_cmd_desc_ring_setup(cryp, i, &cap);
+		mtk_res_desc_ring_setup(cryp, i, &cap);
+	}
+
+	val = MTK_PE_TOKEN_CTRL_DEF | MTK_TOKEN_TIMEOUT_EN;
+	writel(val, cryp->base + PE_TOKEN_CTRL_STAT);
+
+	/* Clear all pending interrupts */
+	writel(MTK_AIC_GLOBAL_CLR, cryp->base + AIC_G_ACK);
+	writel(MTK_PE_INT_CTRL_DEF, cryp->base + PE_INTERRUPT_CTRL_STAT);
+
+	return 0;
+}
+
+static int mtk_aic_cap_check(struct mtk_cryp *cryp, int hw)
+{
+	u32 val;
+
+	if (hw == RING_MAX)
+		val = readl(cryp->base + AIC_G_VERSION);
+	else
+		val = readl(cryp->base + AIC_VERSION(hw));
+
+	val = MTK_AIC_VER_GET(val);
+	if (val != MTK_AIC_VER11 && val != MTK_AIC_VER12)
+		return -ENXIO;
+
+	if (hw == RING_MAX)
+		val = readl(cryp->base + AIC_G_OPTIONS);
+	else
+		val = readl(cryp->base + AIC_OPTIONS(hw));
+
+	val = MTK_AIC_INT_NUM(val);
+	if (!val || val > 32)
+		return -ENXIO;
+
+	return 0;
+}
+
+static int mtk_aic_init(struct mtk_cryp *cryp, int hw)
+{
+	int err;
+
+	err = mtk_aic_cap_check(cryp, hw);
+	if (err)
+		return err;
+
+	/* Disable all interrupts and set initial configuration */
+	if (hw == RING_MAX) {
+		writel(0, cryp->base + AIC_G_ENABLE_CTRL);
+		writel(0, cryp->base + AIC_G_POL_CTRL);
+		writel(0, cryp->base + AIC_G_TYPE_CTRL);
+		writel(0, cryp->base + AIC_G_ENABLE_SET);
+	} else {
+		writel(0, cryp->base + AIC_ENABLE_CTRL(hw));
+		writel(0, cryp->base + AIC_POL_CTRL(hw));
+		writel(0, cryp->base + AIC_TYPE_CTRL(hw));
+		writel(0, cryp->base + AIC_ENABLE_SET(hw));
+	}
+
+	return 0;
+}
+
+static int mtk_accelerator_init(struct mtk_cryp *cryp)
+{
+	int i, err;
+
+	/* Initialize advanced interrupt controller(AIC) */
+	for (i = 0; i < IRQ_NUM; i++) {
+		err = mtk_aic_init(cryp, i);
+		if (err) {
+			dev_err(cryp->dev, "Failed to initialize AIC.\n");
+			return err;
+		}
+	}
+
+	/* Initialize packet engine */
+	err = mtk_packet_engine_setup(cryp);
+	if (err) {
+		dev_err(cryp->dev, "Failed to configure packet engine.\n");
+		return err;
+	}
+
+	return 0;
+}
+
+static void mtk_desc_dma_free(struct mtk_cryp *cryp)
+{
+	int i;
+
+	for (i = 0; i < RING_MAX; i++) {
+		dma_free_coherent(cryp->dev, MTK_MAX_RING_SIZE,
+				  cryp->ring[i]->res_base,
+				  cryp->ring[i]->res_dma);
+		dma_free_coherent(cryp->dev, MTK_MAX_RING_SIZE,
+				  cryp->ring[i]->cmd_base,
+				  cryp->ring[i]->cmd_dma);
+		kfree(cryp->ring[i]);
+	}
+}
+
+static int mtk_desc_ring_alloc(struct mtk_cryp *cryp)
+{
+	struct mtk_ring **ring = cryp->ring;
+	int i, err = ENOMEM;
+
+	for (i = 0; i < RING_MAX; i++) {
+		ring[i] = kzalloc(sizeof(**ring), GFP_KERNEL);
+		if (!ring[i])
+			goto err_cleanup;
+
+		ring[i]->cmd_base = dma_zalloc_coherent(cryp->dev,
+					   MTK_MAX_RING_SIZE, &ring[i]->cmd_dma,
+					   GFP_KERNEL);
+		if (!ring[i]->cmd_base)
+			goto err_cleanup;
+
+		ring[i]->res_base = dma_zalloc_coherent(cryp->dev,
+					   MTK_MAX_RING_SIZE, &ring[i]->res_dma,
+					   GFP_KERNEL);
+		if (!ring[i]->res_base)
+			goto err_cleanup;
+	}
+	return 0;
+
+err_cleanup:
+	for (; i--; ) {
+		dma_free_coherent(cryp->dev, MTK_MAX_RING_SIZE,
+				  ring[i]->res_base, ring[i]->res_dma);
+		dma_free_coherent(cryp->dev, MTK_MAX_RING_SIZE,
+				  ring[i]->cmd_base, ring[i]->cmd_dma);
+		kfree(ring[i]);
+	}
+	return err;
+}
+
+static int mtk_crypto_probe(struct platform_device *pdev)
+{
+	struct resource *res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	struct mtk_cryp *cryp;
+	int i, err;
+
+	cryp = devm_kzalloc(&pdev->dev, sizeof(*cryp), GFP_KERNEL);
+	if (!cryp)
+		return -ENOMEM;
+
+	cryp->base = devm_ioremap_resource(&pdev->dev, res);
+	if (IS_ERR(cryp->base))
+		return PTR_ERR(cryp->base);
+
+	for (i = 0; i < IRQ_NUM; i++) {
+		cryp->irq[i] = platform_get_irq(pdev, i);
+		if (cryp->irq[i] < 0) {
+			dev_err(cryp->dev, "no IRQ:%d resource info\n", i);
+			return -ENXIO;
+		}
+	}
+
+	cryp->clk_ethif = devm_clk_get(&pdev->dev, "ethif");
+	cryp->clk_cryp = devm_clk_get(&pdev->dev, "cryp");
+	if (IS_ERR(cryp->clk_ethif) || IS_ERR(cryp->clk_cryp))
+		return -EPROBE_DEFER;
+
+	cryp->dev = &pdev->dev;
+	pm_runtime_enable(cryp->dev);
+	pm_runtime_get_sync(cryp->dev);
+
+	err = clk_prepare_enable(cryp->clk_ethif);
+	if (err)
+		goto err_clk_ethif;
+
+	err = clk_prepare_enable(cryp->clk_cryp);
+	if (err)
+		goto err_clk_cryp;
+
+	err = mtk_desc_ring_alloc(cryp);
+	if (err) {
+		dev_err(cryp->dev, "Unable to allocate descriptor rings.\n");
+		goto err_resource;
+	}
+
+	err = mtk_accelerator_init(cryp);
+	if (err) {
+		dev_err(cryp->dev, "Failed to initialize cryptographic engine.\n");
+		goto err_engine;
+	}
+
+	err = mtk_cipher_alg_register(cryp);
+	if (err) {
+		dev_err(cryp->dev, "Unable to register MTK-AES.\n");
+		goto err_cipher;
+	}
+
+	err = mtk_hash_alg_register(cryp);
+	if (err) {
+		dev_err(cryp->dev, "Unable to register MTK-SHA.\n");
+		goto err_hash;
+	}
+
+	platform_set_drvdata(pdev, cryp);
+	return 0;
+
+err_hash:
+	mtk_cipher_alg_release(cryp);
+err_cipher:
+	mtk_dfe_dse_reset(cryp);
+err_engine:
+	mtk_desc_dma_free(cryp);
+err_resource:
+	clk_disable_unprepare(cryp->clk_cryp);
+err_clk_cryp:
+	clk_disable_unprepare(cryp->clk_ethif);
+err_clk_ethif:
+	pm_runtime_put_sync(cryp->dev);
+	pm_runtime_disable(cryp->dev);
+
+	return err;
+}
+
+static int mtk_crypto_remove(struct platform_device *pdev)
+{
+	struct mtk_cryp *cryp = platform_get_drvdata(pdev);
+
+	mtk_hash_alg_release(cryp);
+	mtk_cipher_alg_release(cryp);
+	mtk_desc_dma_free(cryp);
+
+	clk_disable_unprepare(cryp->clk_cryp);
+	clk_disable_unprepare(cryp->clk_ethif);
+
+	pm_runtime_put_sync(cryp->dev);
+	pm_runtime_disable(cryp->dev);
+	platform_set_drvdata(pdev, NULL);
+
+	return 0;
+}
+
+const struct of_device_id of_crypto_id[] = {
+	{ .compatible = "mediatek,mt7623-crypto" },
+	{},
+};
+MODULE_DEVICE_TABLE(of, of_crypto_id);
+
+static struct platform_driver mtk_crypto_driver = {
+	.probe = mtk_crypto_probe,
+	.remove = mtk_crypto_remove,
+	.driver = {
+		   .name = "mtk-crypto",
+		   .owner = THIS_MODULE,
+		   .of_match_table = of_crypto_id,
+	},
+};
+module_platform_driver(mtk_crypto_driver);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Ryder Lee <ryder.lee@mediatek.com>");
+MODULE_DESCRIPTION("Cryptographic accelerator driver for MediaTek SoC");
diff --git a/drivers/crypto/mediatek/mtk-platform.h b/drivers/crypto/mediatek/mtk-platform.h
new file mode 100644
index 0000000..e9651f1
--- /dev/null
+++ b/drivers/crypto/mediatek/mtk-platform.h
@@ -0,0 +1,235 @@
+/*
+ * Support for MediaTek cryptographic accelerator.
+ *
+ * Copyright (c) 2016 MediaTek Inc.
+ * Author: Ryder Lee <ryder.lee@mediatek.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License.
+ *
+ */
+
+#ifndef __MTK_PLATFORM_H_
+#define __MTK_PLATFORM_H_
+
+#include <crypto/internal/hash.h>
+#include <linux/crypto.h>
+#include <linux/interrupt.h>
+
+#define MTK_RDR_THRESH_DEF	0x800001
+
+#define MTK_IRQ_RDR0		BIT(1)
+#define MTK_IRQ_RDR1		BIT(3)
+#define MTK_IRQ_RDR2		BIT(5)
+#define MTK_IRQ_RDR3		BIT(7)
+
+#define MTK_DESC_CNT_CLR	BIT(31)
+#define MTK_DESC_LAST		BIT(22)
+#define MTK_DESC_FIRST		BIT(23)
+#define MTK_DESC_BUF_LEN(x)	((x) & 0x1ffff)
+#define MTK_DESC_CT_LEN(x)	(((x) & 0xff) << 24)
+
+#define SIZE_IN_WORDS(x)	((x) >> 2)
+
+/**
+ * Ring 0/1 are used by AES encrypt and decrypt.
+ * Ring 2/3 are used by SHA.
+ */
+enum {
+	RING0 = 0,
+	RING1,
+	RING2,
+	RING3,
+	RING_MAX,
+};
+
+#define RECORD_NUM		(RING_MAX / 2)
+#define IRQ_NUM			5
+
+/**
+ * struct mtk_desc - DMA descriptor
+ * @hdr:	the descriptor control header
+ * @buf:	DMA address of input buffer segment
+ * @ct:		DMA address of command token that control operation flow
+ * @ct_hdr:	the command token control header
+ * @tag:	the user-defined field
+ * @tfm:	DMA address of transform state
+ * @bound:	align descriptors offset boundary
+ *
+ * Structure passed to the crypto engine to describe where source
+ * data needs to be fetched and how it needs to be processed.
+ */
+struct mtk_desc {
+	u32 hdr;
+	u32 buf;
+	u32 ct;
+	u32 ct_hdr;
+	u32 tag;
+	u32 tfm;
+	u32 bound[2];
+};
+
+/**
+ * struct mtk_ring - Descriptor ring
+ * @cmd_base:	pointer to command descriptor ring base
+ * @cmd_dma:	DMA address of command descriptor ring
+ * @res_base:	pointer to result descriptor ring base
+ * @res_dma:	DMA address of result descriptor ring
+ * @pos:	current position in the ring
+ *
+ * A descriptor ring is a circular buffer that is used to manage
+ * one or more descriptors. There are two type of descriptor rings;
+ * the command descriptor ring and result descriptor ring.
+ */
+struct mtk_ring {
+	struct mtk_desc *cmd_base;
+	dma_addr_t cmd_dma;
+	struct mtk_desc *res_base;
+	dma_addr_t res_dma;
+	u32 pos;
+};
+
+#define MTK_MAX_DESC_NUM	512
+#define MTK_DESC_OFFSET		SIZE_IN_WORDS(sizeof(struct mtk_desc))
+#define MTK_DESC_SIZE		(MTK_DESC_OFFSET - 2)
+#define MTK_MAX_RING_SIZE	((sizeof(struct mtk_desc) * MTK_MAX_DESC_NUM))
+#define MTK_DESC_CNT(x)		((MTK_DESC_OFFSET * (x)) << 2)
+
+/**
+ * struct mtk_aes_dma - Structure that holds sg list info
+ * @sg:		pointer to scatter-gather list
+ * @nents:	number of entries in the sg list
+ * @remainder:	remainder of sg list
+ * @sg_len:	number of entries in the sg mapped list
+ */
+struct mtk_aes_dma {
+	struct scatterlist *sg;
+	int nents;
+	u32 remainder;
+	u32 sg_len;
+};
+
+/**
+ * struct mtk_aes - AES operation record
+ * @queue:	crypto request queue
+ * @req:	pointer to ablkcipher request
+ * @task:	the tasklet is use in AES interrupt
+ * @src:	the structure that holds source sg list info
+ * @dst:	the structure that holds destination sg list info
+ * @aligned_sg:	the scatter list is use to alignment
+ * @real_dst:	pointer to the destination sg list
+ * @total:	request buffer length
+ * @buf:	pointer to page buffer
+ * @info:	pointer to AES transform state and command token
+ * @ct_hdr:	AES command token control field
+ * @ct_size:	size of AES command token
+ * @ct_dma:	DMA address of AES command token
+ * @tfm_dma:	DMA address of AES transform state
+ * @id:		record identification
+ * @flags:	it's describing AES operation state
+ * @lock:	the ablkcipher queue lock
+ *
+ * Structure used to record AES execution state.
+ */
+struct mtk_aes {
+	struct crypto_queue queue;
+	struct ablkcipher_request *req;
+	struct tasklet_struct task;
+	struct mtk_aes_dma src;
+	struct mtk_aes_dma dst;
+
+	struct scatterlist aligned_sg;
+	struct scatterlist *real_dst;
+
+	size_t total;
+	void *buf;
+
+	void *info;
+	u32 ct_hdr;
+	u32 ct_size;
+	dma_addr_t ct_dma;
+	dma_addr_t tfm_dma;
+
+	u8 id;
+	unsigned long flags;
+	/* queue lock */
+	spinlock_t lock;
+};
+
+/**
+ * struct mtk_sha - SHA operation record
+ * @queue:	crypto request queue
+ * @req:	pointer to ahash request
+ * @task:	the tasklet is use in SHA interrupt
+ * @info:	pointer to SHA transform state and command token
+ * @ct_hdr:	SHA command token control field
+ * @ct_size:	size of SHA command token
+ * @ct_dma:	DMA address of SHA command token
+ * @tfm_dma:	DMA address of SHA transform state
+ * @id:		record identification
+ * @flags:	it's describing SHA operation state
+ * @lock:	the ablkcipher queue lock
+ *
+ * Structure used to record SHA execution state.
+ */
+struct mtk_sha {
+	struct crypto_queue queue;
+	struct ahash_request *req;
+	struct tasklet_struct task;
+
+	void *info;
+	u32 ct_hdr;
+	u32 ct_size;
+	dma_addr_t ct_dma;
+	dma_addr_t tfm_dma;
+
+	u8 id;
+	unsigned long flags;
+	/* queue lock */
+	spinlock_t lock;
+};
+
+/**
+ * struct mtk_cryp - Cryptographic device
+ * @base:	pointer to mapped register I/O base
+ * @dev:	pointer to device
+ * @clk_ethif:	pointer to ethif clock
+ * @clk_cryp:	pointer to crypto clock
+ * @irq:	global system and rings IRQ
+ * @ring:	pointer to execution state of AES
+ * @aes:	pointer to execution state of SHA
+ * @sha:	each execution record map to a ring
+ * @aes_list:	device list of AES
+ * @sha_list:	device list of SHA
+ * @tmp:	pointer to temporary buffer for internal use
+ * @tmp_dma:	DMA address of temporary buffer
+ * @rec:	it's used to select SHA record for tfm
+ *
+ * Structure storing cryptographic device information.
+ */
+struct mtk_cryp {
+	void __iomem *base;
+	struct device *dev;
+	struct clk *clk_ethif;
+	struct clk *clk_cryp;
+	int irq[IRQ_NUM];
+
+	struct mtk_ring *ring[RING_MAX];
+	struct mtk_aes *aes[RECORD_NUM];
+	struct mtk_sha *sha[RECORD_NUM];
+
+	struct list_head aes_list;
+	struct list_head sha_list;
+
+	void *tmp;
+	dma_addr_t tmp_dma;
+	bool rec;
+};
+
+int mtk_cipher_alg_register(struct mtk_cryp *cryp);
+void mtk_cipher_alg_release(struct mtk_cryp *cryp);
+int mtk_hash_alg_register(struct mtk_cryp *cryp);
+void mtk_hash_alg_release(struct mtk_cryp *cryp);
+
+#endif /* __MTK_PLATFORM_H_ */
diff --git a/drivers/crypto/mediatek/mtk-regs.h b/drivers/crypto/mediatek/mtk-regs.h
new file mode 100644
index 0000000..94f4eb8
--- /dev/null
+++ b/drivers/crypto/mediatek/mtk-regs.h
@@ -0,0 +1,194 @@
+/*
+ * Support for MediaTek cryptographic accelerator.
+ *
+ * Copyright (c) 2016 MediaTek Inc.
+ * Author: Ryder Lee <ryder.lee@mediatek.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License.
+ *
+ */
+
+#ifndef __MTK_REGS_H__
+#define __MTK_REGS_H__
+
+/* HIA, Command Descriptor Ring Manager */
+#define CDR_BASE_ADDR_LO(x)		(0x0 + ((x) << 12))
+#define CDR_BASE_ADDR_HI(x)		(0x4 + ((x) << 12))
+#define CDR_DATA_BASE_ADDR_LO(x)	(0x8 + ((x) << 12))
+#define CDR_DATA_BASE_ADDR_HI(x)	(0xC + ((x) << 12))
+#define CDR_ACD_BASE_ADDR_LO(x)		(0x10 + ((x) << 12))
+#define CDR_ACD_BASE_ADDR_HI(x)		(0x14 + ((x) << 12))
+#define CDR_RING_SIZE(x)		(0x18 + ((x) << 12))
+#define CDR_DESC_SIZE(x)		(0x1C + ((x) << 12))
+#define CDR_CFG(x)			(0x20 + ((x) << 12))
+#define CDR_DMA_CFG(x)			(0x24 + ((x) << 12))
+#define CDR_THRESH(x)			(0x28 + ((x) << 12))
+#define CDR_PREP_COUNT(x)		(0x2C + ((x) << 12))
+#define CDR_PROC_COUNT(x)		(0x30 + ((x) << 12))
+#define CDR_PREP_PNTR(x)		(0x34 + ((x) << 12))
+#define CDR_PROC_PNTR(x)		(0x38 + ((x) << 12))
+#define CDR_STAT(x)			(0x3C + ((x) << 12))
+
+/* HIA, Result Descriptor Ring Manager */
+#define RDR_BASE_ADDR_LO(x)		(0x800 + ((x) << 12))
+#define RDR_BASE_ADDR_HI(x)		(0x804 + ((x) << 12))
+#define RDR_DATA_BASE_ADDR_LO(x)	(0x808 + ((x) << 12))
+#define RDR_DATA_BASE_ADDR_HI(x)	(0x80C + ((x) << 12))
+#define RDR_ACD_BASE_ADDR_LO(x)		(0x810 + ((x) << 12))
+#define RDR_ACD_BASE_ADDR_HI(x)		(0x814 + ((x) << 12))
+#define RDR_RING_SIZE(x)		(0x818 + ((x) << 12))
+#define RDR_DESC_SIZE(x)		(0x81C + ((x) << 12))
+#define RDR_CFG(x)			(0x820 + ((x) << 12))
+#define RDR_DMA_CFG(x)			(0x824 + ((x) << 12))
+#define RDR_THRESH(x)			(0x828 + ((x) << 12))
+#define RDR_PREP_COUNT(x)		(0x82C + ((x) << 12))
+#define RDR_PROC_COUNT(x)		(0x830 + ((x) << 12))
+#define RDR_PREP_PNTR(x)		(0x834 + ((x) << 12))
+#define RDR_PROC_PNTR(x)		(0x838 + ((x) << 12))
+#define RDR_STAT(x)			(0x83C + ((x) << 12))
+
+/* HIA, Ring AIC */
+#define AIC_POL_CTRL(x)			(0xE000 - ((x) << 12))
+#define	AIC_TYPE_CTRL(x)		(0xE004 - ((x) << 12))
+#define	AIC_ENABLE_CTRL(x)		(0xE008 - ((x) << 12))
+#define	AIC_RAW_STAL(x)			(0xE00C - ((x) << 12))
+#define	AIC_ENABLE_SET(x)		(0xE00C - ((x) << 12))
+#define	AIC_ENABLED_STAT(x)		(0xE010 - ((x) << 12))
+#define	AIC_ACK(x)			(0xE010 - ((x) << 12))
+#define	AIC_ENABLE_CLR(x)		(0xE014 - ((x) << 12))
+#define	AIC_OPTIONS(x)			(0xE018 - ((x) << 12))
+#define	AIC_VERSION(x)			(0xE01C - ((x) << 12))
+
+/* HIA, Global AIC */
+#define AIC_G_POL_CTRL			0xF800
+#define AIC_G_TYPE_CTRL			0xF804
+#define AIC_G_ENABLE_CTRL		0xF808
+#define AIC_G_RAW_STAT			0xF80C
+#define AIC_G_ENABLE_SET		0xF80C
+#define AIC_G_ENABLED_STAT		0xF810
+#define AIC_G_ACK			0xF810
+#define AIC_G_ENABLE_CLR		0xF814
+#define AIC_G_OPTIONS			0xF818
+#define AIC_G_VERSION			0xF81C
+
+/* HIA, Data Fetch Engine */
+#define DFE_CFG				0xF000
+#define DFE_PRIO_0			0xF010
+#define DFE_PRIO_1			0xF014
+#define DFE_PRIO_2			0xF018
+#define DFE_PRIO_3			0xF01C
+
+/* HIA, Data Fetch Engine access monitoring for CDR */
+#define DFE_RING_REGION_LO(x)		(0xF080 + ((x) << 3))
+#define DFE_RING_REGION_HI(x)		(0xF084 + ((x) << 3))
+
+/* HIA, Data Fetch Engine thread control and status for thread */
+#define DFE_THR_CTRL			0xF200
+#define DFE_THR_STAT			0xF204
+#define DFE_THR_DESC_CTRL		0xF208
+#define DFE_THR_DESC_DPTR_LO		0xF210
+#define DFE_THR_DESC_DPTR_HI		0xF214
+#define DFE_THR_DESC_ACDPTR_LO		0xF218
+#define DFE_THR_DESC_ACDPTR_HI		0xF21C
+
+/* HIA, Data Store Engine */
+#define DSE_CFG				0xF400
+#define DSE_PRIO_0			0xF410
+#define DSE_PRIO_1			0xF414
+#define DSE_PRIO_2			0xF418
+#define DSE_PRIO_3			0xF41C
+
+/* HIA, Data Store Engine access monitoring for RDR */
+#define DSE_RING_REGION_LO(x)		(0xF480 + ((x) << 3))
+#define DSE_RING_REGION_HI(x)		(0xF484 + ((x) << 3))
+
+/* HIA, Data Store Engine thread control and status for thread */
+#define DSE_THR_CTRL			0xF600
+#define DSE_THR_STAT			0xF604
+#define DSE_THR_DESC_CTRL		0xF608
+#define DSE_THR_DESC_DPTR_LO		0xF610
+#define DSE_THR_DESC_DPTR_HI		0xF614
+#define DSE_THR_DESC_S_DPTR_LO		0xF618
+#define DSE_THR_DESC_S_DPTR_HI		0xF61C
+#define DSE_THR_ERROR_STAT		0xF620
+
+/* HIA Global */
+#define HIA_MST_CTRL			0xFFF4
+#define HIA_OPTIONS			0xFFF8
+#define HIA_VERSION			0xFFFC
+
+/* Processing Engine Input Side, Processing Engine */
+#define PE_IN_DBUF_THRESH		0x10000
+#define PE_IN_TBUF_THRESH		0x10100
+
+/* Packet Engine Configuration / Status Registers */
+#define PE_TOKEN_CTRL_STAT		0x11000
+#define PE_FUNCTION_EN			0x11004
+#define PE_CONTEXT_CTRL			0x11008
+#define PE_INTERRUPT_CTRL_STAT		0x11010
+#define PE_CONTEXT_STAT			0x1100C
+#define PE_OUT_TRANS_CTRL_STAT		0x11018
+#define PE_OUT_BUF_CTRL			0x1101C
+
+/* Packet Engine PRNG Registers */
+#define PE_PRNG_STAT			0x11040
+#define PE_PRNG_CTRL			0x11044
+#define PE_PRNG_SEED_L			0x11048
+#define PE_PRNG_SEED_H			0x1104C
+#define PE_PRNG_KEY_0_L			0x11050
+#define PE_PRNG_KEY_0_H			0x11054
+#define PE_PRNG_KEY_1_L			0x11058
+#define PE_PRNG_KEY_1_H			0x1105C
+#define PE_PRNG_RES_0			0x11060
+#define PE_PRNG_RES_1			0x11064
+#define PE_PRNG_RES_2			0x11068
+#define PE_PRNG_RES_3			0x1106C
+#define PE_PRNG_LFSR_L			0x11070
+#define PE_PRNG_LFSR_H			0x11074
+
+/* Packet Engine AIC */
+#define PE_EIP96_AIC_POL_CTRL		0x113C0
+#define PE_EIP96_AIC_TYPE_CTRL		0x113C4
+#define PE_EIP96_AIC_ENABLE_CTRL	0x113C8
+#define PE_EIP96_AIC_RAW_STAT		0x113CC
+#define PE_EIP96_AIC_ENABLE_SET		0x113CC
+#define PE_EIP96_AIC_ENABLED_STAT	0x113D0
+#define PE_EIP96_AIC_ACK		0x113D0
+#define PE_EIP96_AIC_ENABLE_CLR		0x113D4
+#define PE_EIP96_AIC_OPTIONS		0x113D8
+#define PE_EIP96_AIC_VERSION		0x113DC
+
+/* Packet Engine Options & Version Registers */
+#define PE_EIP96_OPTIONS		0x113F8
+#define PE_EIP96_VERSION		0x113FC
+
+/* Processing Engine Output Side */
+#define PE_OUT_DBUF_THRESH		0x11C00
+#define PE_OUT_TBUF_THRESH		0x11D00
+
+/* Processing Engine Local AIC */
+#define PE_AIC_POL_CTRL			0x11F00
+#define PE_AIC_TYPE_CTRL		0x11F04
+#define PE_AIC_ENABLE_CTRL		0x11F08
+#define PE_AIC_RAW_STAT			0x11F0C
+#define PE_AIC_ENABLE_SET		0x11F0C
+#define PE_AIC_ENABLED_STAT		0x11F10
+#define PE_AIC_ENABLE_CLR		0x11F14
+#define PE_AIC_OPTIONS			0x11F18
+#define PE_AIC_VERSION			0x11F1C
+
+/* Processing Engine General Configuration and Version */
+#define PE_IN_FLIGHT			0x11FF0
+#define PE_OPTIONS			0x11FF8
+#define PE_VERSION			0x11FFC
+
+/* EIP-97 - Global */
+#define EIP97_CLOCK_STATE		0x1FFE4
+#define EIP97_FORCE_CLOCK_ON		0x1FFE8
+#define EIP97_FORCE_CLOCK_OFF		0x1FFEC
+#define EIP97_MST_CTRL			0x1FFF4
+#define EIP97_OPTIONS			0x1FFF8
+#define EIP97_VERSION			0x1FFFC
+#endif /* __MTK_REGS_H__ */
diff --git a/drivers/crypto/mediatek/mtk-sha.c b/drivers/crypto/mediatek/mtk-sha.c
new file mode 100644
index 0000000..191dee2
--- /dev/null
+++ b/drivers/crypto/mediatek/mtk-sha.c
@@ -0,0 +1,1423 @@
+/*
+ * Cryptographic API.
+ *
+ * Support for MediaTek SHA1/SHA2 hardware accelerator.
+ *
+ * Copyright (c) 2016 MediaTek Inc.
+ * Author: Ryder Lee <ryder.lee@mediatek.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Some ideas are from atmel-sha.c and omap-sham.c drivers.
+ */
+
+#include <crypto/algapi.h>
+#include <crypto/internal/hash.h>
+#include <crypto/scatterwalk.h>
+#include <crypto/sha.h>
+#include <linux/crypto.h>
+#include <linux/dma-mapping.h>
+#include <linux/scatterlist.h>
+#include "mtk-platform.h"
+#include "mtk-regs.h"
+
+#define SHA_ALIGN_MSK		(sizeof(u32) - 1)
+#define SHA_QUEUE_SIZE		512
+#define SHA_TMP_STATE_SIZE	512
+
+#define SHA_DATA_LEN_MSK	GENMASK(16, 0)
+#define SHA_BUFFER_LEN		((u32)PAGE_SIZE)
+
+#define SHA_OP_UPDATE		1
+#define SHA_OP_FINAL		2
+
+/* SHA command token */
+#define SHA_CT_SIZE		5
+#define SHA_CT_CTRL_HDR		0x02220000
+#define SHA_COMMAND0		0x03020000
+#define SHA_COMMAND1		0x21060000
+#define SHA_COMMAND2		0xe0e63802
+
+/* SHA transform information */
+#define SHA_TFM_HASH		(0x2 << 0)
+#define SHA_TFM_DIG_TYPE	(0x1 << 21)
+#define SHA_TFM_SIZE(x)		((x) << 8)
+#define SHA_TFM_START		(0x1 << 4)
+#define SHA_TFM_CONTINUE		(0x1 << 5)
+#define SHA_TFM_HASH_STORE	(0x1 << 19)
+#define SHA_TFM_SHA1		(0x2 << 23)
+#define SHA_TFM_SHA256		(0x3 << 23)
+#define SHA_TFM_SHA224		(0x4 << 23)
+#define SHA_TFM_SHA512		(0x5 << 23)
+#define SHA_TFM_SHA384		(0x6 << 23)
+#define SHA_TFM_DIGEST(x)	(((x) & 0xf) << 24)
+
+/* SHA flags */
+#define SHA_FLAGS_BUSY		BIT(0)
+#define	SHA_FLAGS_FINAL		BIT(1)
+#define SHA_FLAGS_FINUP		BIT(2)
+#define SHA_FLAGS_SG		BIT(3)
+#define SHA_FLAGS_ALGO_MASK	GENMASK(8, 4)
+#define SHA_FLAGS_SHA1		BIT(4)
+#define SHA_FLAGS_SHA224	BIT(5)
+#define SHA_FLAGS_SHA256	BIT(6)
+#define SHA_FLAGS_SHA384	BIT(7)
+#define SHA_FLAGS_SHA512	BIT(8)
+#define SHA_FLAGS_HMAC		BIT(9)
+#define SHA_FLAGS_PAD		BIT(10)
+
+/**
+ * SHA command token(CT) is a set of hardware instructions that
+ * are used to control engine's processing flow of sha, and it
+ * contains the first two words of transform state.
+ */
+struct mtk_sha_ct {
+	u32 tfm_ctrl0;
+	u32 tfm_ctrl1;
+	u32 ct_ctrl0;
+	u32 ct_ctrl1;
+	u32 ct_ctrl2;
+};
+
+/**
+ * SHA transform state(tfm) is used to define SHA transform state
+ * and store the result digest that produce by crypto engine.
+ */
+struct mtk_sha_tfm {
+	u32 tfm_ctrl0;
+	u32 tfm_ctrl1;
+	/* store result digests */
+	u8 digest[SHA512_DIGEST_SIZE] __aligned(sizeof(u32));
+};
+
+/**
+ * mtk_sha_info consists of command token and transform state
+ * of SHA, its role is similar to mtk_aes_info.
+ */
+struct mtk_sha_info {
+	struct mtk_sha_ct ct;
+	struct mtk_sha_tfm tfm;
+};
+
+struct mtk_sha_reqctx {
+	struct mtk_sha_info info;
+	unsigned long flags;
+	unsigned long op;
+
+	u64 digcnt;
+	bool start;
+	size_t bufcnt;
+	dma_addr_t dma_addr;
+
+	/* walk state */
+	struct scatterlist *sg;
+	u32 offset;	/* offset in current sg */
+	u32 total;	/* total request */
+	size_t ds;
+	size_t bs;
+
+	u8 *buffer;
+};
+
+struct mtk_sha_hmac_ctx {
+	struct crypto_shash	*shash;
+	u8 ipad[SHA512_BLOCK_SIZE] __aligned(sizeof(u32));
+	u8 opad[SHA512_BLOCK_SIZE] __aligned(sizeof(u32));
+};
+
+struct mtk_sha_ctx {
+	struct mtk_cryp *cryp;
+	unsigned long flags;
+	u8 id;
+	u8 buf[SHA_BUFFER_LEN] __aligned(sizeof(u32));
+
+	struct mtk_sha_hmac_ctx	base[0];
+};
+
+struct mtk_sha_drv {
+	struct list_head dev_list;
+	/* device list lock */
+	spinlock_t lock;
+};
+
+static struct mtk_sha_drv mtk_sha = {
+	.dev_list = LIST_HEAD_INIT(mtk_sha.dev_list),
+	.lock = __SPIN_LOCK_UNLOCKED(mtk_sha.lock),
+};
+
+static int mtk_sha_handle_queue(struct mtk_cryp *cryp, u8 id,
+				struct ahash_request *req);
+
+static inline u32 mtk_sha_read(struct mtk_cryp *cryp, u32 offset)
+{
+	return readl_relaxed(cryp->base + offset);
+}
+
+static inline void mtk_sha_write(struct mtk_cryp *cryp,
+				 u32 offset, u32 value)
+{
+	writel_relaxed(value, cryp->base + offset);
+}
+
+static struct mtk_cryp *mtk_sha_find_dev(struct mtk_sha_ctx *tctx)
+{
+	struct mtk_cryp *cryp = NULL;
+	struct mtk_cryp *tmp;
+
+	spin_lock_bh(&mtk_sha.lock);
+	if (!tctx->cryp) {
+		list_for_each_entry(tmp, &mtk_sha.dev_list, sha_list) {
+			cryp = tmp;
+			break;
+		}
+		tctx->cryp = cryp;
+	} else {
+		cryp = tctx->cryp;
+	}
+
+	/*
+	 * Assign record id to tfm in round-robin fashion, and this
+	 * will help tfm to bind  to corresponding descriptor rings.
+	 */
+	tctx->id = cryp->rec;
+	cryp->rec = !cryp->rec;
+
+	spin_unlock_bh(&mtk_sha.lock);
+
+	return cryp;
+}
+
+static int mtk_sha_append_sg(struct mtk_sha_reqctx *ctx)
+{
+	size_t count;
+
+	while ((ctx->bufcnt < SHA_BUFFER_LEN) && ctx->total) {
+		count = min(ctx->sg->length - ctx->offset, ctx->total);
+		count = min(count, SHA_BUFFER_LEN - ctx->bufcnt);
+
+		if (count <= 0) {
+			/*
+			 * Check if count <= 0 because the buffer is full or
+			 * because the sg length is 0. In the latest case,
+			 * check if there is another sg in the list, a 0 length
+			 * sg doesn't necessarily mean the end of the sg list.
+			 */
+			if ((ctx->sg->length == 0) && !sg_is_last(ctx->sg)) {
+				ctx->sg = sg_next(ctx->sg);
+				continue;
+			} else {
+				break;
+			}
+		}
+
+		scatterwalk_map_and_copy(ctx->buffer + ctx->bufcnt, ctx->sg,
+					 ctx->offset, count, 0);
+
+		ctx->bufcnt += count;
+		ctx->offset += count;
+		ctx->total -= count;
+
+		if (ctx->offset == ctx->sg->length) {
+			ctx->sg = sg_next(ctx->sg);
+			if (ctx->sg)
+				ctx->offset = 0;
+			else
+				ctx->total = 0;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * The purpose of this padding is to ensure that the padded message is a
+ * multiple of 512 bits (SHA1/SHA224/SHA256) or 1024 bits (SHA384/SHA512).
+ * The bit "1" is appended at the end of the message followed by
+ * "padlen-1" zero bits. Then a 64 bits block (SHA1/SHA224/SHA256) or
+ * 128 bits block (SHA384/SHA512) equals to the message length in bits
+ * is appended.
+ *
+ * For SHA1/SHA224/SHA256, padlen is calculated as followed:
+ *  - if message length < 56 bytes then padlen = 56 - message length
+ *  - else padlen = 64 + 56 - message length
+ *
+ * For SHA384/SHA512, padlen is calculated as followed:
+ *  - if message length < 112 bytes then padlen = 112 - message length
+ *  - else padlen = 128 + 112 - message length
+ */
+static void mtk_sha_fill_padding(struct mtk_sha_reqctx *ctx, u32 len)
+{
+	u32 index, padlen;
+	u64 bits[2];
+	u64 size = ctx->digcnt;
+
+	size += ctx->bufcnt;
+	size += len;
+
+	bits[1] = cpu_to_be64(size << 3);
+	bits[0] = cpu_to_be64(size >> 61);
+
+	if (ctx->flags & (SHA_FLAGS_SHA384 | SHA_FLAGS_SHA512)) {
+		index = ctx->bufcnt & 0x7f;
+		padlen = (index < 112) ? (112 - index) : ((128 + 112) - index);
+		*(ctx->buffer + ctx->bufcnt) = 0x80;
+		memset(ctx->buffer + ctx->bufcnt + 1, 0, padlen - 1);
+		memcpy(ctx->buffer + ctx->bufcnt + padlen, bits, 16);
+		ctx->bufcnt += padlen + 16;
+		ctx->flags |= SHA_FLAGS_PAD;
+	} else {
+		index = ctx->bufcnt & 0x3f;
+		padlen = (index < 56) ? (56 - index) : ((64 + 56) - index);
+		*(ctx->buffer + ctx->bufcnt) = 0x80;
+		memset(ctx->buffer + ctx->bufcnt + 1, 0, padlen - 1);
+		memcpy(ctx->buffer + ctx->bufcnt + padlen, &bits[1], 8);
+		ctx->bufcnt += padlen + 8;
+		ctx->flags |= SHA_FLAGS_PAD;
+	}
+}
+
+/* Initialize basic transform information of SHA */
+static void mtk_sha_info_init(struct mtk_sha *sha,
+			      struct mtk_sha_reqctx *ctx)
+{
+	struct mtk_sha_info *info = sha->info;
+	struct mtk_sha_ct *ct = &info->ct;
+	struct mtk_sha_tfm *tfm = &info->tfm;
+
+	sha->ct_hdr = SHA_CT_CTRL_HDR;
+	sha->ct_size = SHA_CT_SIZE;
+
+	tfm->tfm_ctrl0 = SHA_TFM_HASH | SHA_TFM_DIG_TYPE |
+			 SHA_TFM_SIZE(SIZE_IN_WORDS(ctx->ds));
+
+	switch (ctx->flags & SHA_FLAGS_ALGO_MASK) {
+	case SHA_FLAGS_SHA1:
+		tfm->tfm_ctrl0 |= SHA_TFM_SHA1;
+		break;
+	case SHA_FLAGS_SHA224:
+		tfm->tfm_ctrl0 |= SHA_TFM_SHA224;
+		break;
+	case SHA_FLAGS_SHA256:
+		tfm->tfm_ctrl0 |= SHA_TFM_SHA256;
+		break;
+	case SHA_FLAGS_SHA384:
+		tfm->tfm_ctrl0 |= SHA_TFM_SHA384;
+		break;
+	case SHA_FLAGS_SHA512:
+		tfm->tfm_ctrl0 |= SHA_TFM_SHA512;
+		break;
+
+	default:
+		/* Should not happen... */
+		return;
+	}
+
+	tfm->tfm_ctrl1 = SHA_TFM_HASH_STORE;
+	ct->tfm_ctrl0 = tfm->tfm_ctrl0 | SHA_TFM_CONTINUE | SHA_TFM_START;
+	ct->tfm_ctrl1 = tfm->tfm_ctrl1;
+
+	ct->ct_ctrl0 = SHA_COMMAND0;
+	ct->ct_ctrl1 = SHA_COMMAND1;
+	ct->ct_ctrl2 = SHA_COMMAND2 | SHA_TFM_DIGEST(SIZE_IN_WORDS(ctx->ds));
+}
+
+/* Update input data length of transform information and map it. */
+static int mtk_sha_info_map(struct mtk_cryp *cryp,
+			    struct mtk_sha *sha, size_t len)
+{
+	struct mtk_sha_reqctx *ctx = ahash_request_ctx(sha->req);
+	struct mtk_sha_info *info = sha->info;
+	struct mtk_sha_ct *ct = &info->ct;
+
+	if (ctx->start)
+		ctx->start = false;
+	else
+		ct->tfm_ctrl0 &= ~SHA_TFM_START;
+
+	sha->ct_hdr = (sha->ct_hdr & ~SHA_DATA_LEN_MSK) | len;
+	ct->ct_ctrl0 = (ct->ct_ctrl0 & ~SHA_DATA_LEN_MSK) | len;
+
+	ctx->digcnt += len;
+
+	sha->ct_dma = dma_map_single(cryp->dev, info, sizeof(*info),
+				      DMA_BIDIRECTIONAL);
+	if (unlikely(dma_mapping_error(cryp->dev, sha->ct_dma))) {
+		dev_err(cryp->dev, "dma %d bytes error\n", sizeof(*info));
+		return -EINVAL;
+	}
+	sha->tfm_dma = sha->ct_dma + sizeof(*ct);
+
+	return 0;
+}
+
+/*
+ * Because of hardware limitation, we must pre-calculate the inner
+ * and outer digest that need to be processed firstly by engine, then
+ * apply the result digest to the input message. These complex hashing
+ * procedures limits HMAC performance, so we use fallback SW encoding.
+ */
+static int mtk_sha_finish_hmac(struct ahash_request *req)
+{
+	struct mtk_sha_ctx *tctx = crypto_tfm_ctx(req->base.tfm);
+	struct mtk_sha_hmac_ctx *bctx = tctx->base;
+	struct mtk_sha_reqctx *ctx = ahash_request_ctx(req);
+
+	SHASH_DESC_ON_STACK(shash, bctx->shash);
+
+	shash->tfm = bctx->shash;
+	shash->flags = 0; /* not CRYPTO_TFM_REQ_MAY_SLEEP */
+
+	return crypto_shash_init(shash) ?:
+	       crypto_shash_update(shash, bctx->opad, ctx->bs) ?:
+	       crypto_shash_finup(shash, req->result, ctx->ds, req->result);
+}
+
+/* Initialize request context */
+static int mtk_sha_init(struct ahash_request *req)
+{
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct mtk_sha_ctx *tctx = crypto_ahash_ctx(tfm);
+	struct mtk_sha_reqctx *ctx = ahash_request_ctx(req);
+
+	ctx->flags = 0;
+	ctx->ds = crypto_ahash_digestsize(tfm);
+
+	switch (ctx->ds) {
+	case SHA1_DIGEST_SIZE:
+		ctx->flags |= SHA_FLAGS_SHA1;
+		ctx->bs = SHA1_BLOCK_SIZE;
+		break;
+	case SHA224_DIGEST_SIZE:
+		ctx->flags |= SHA_FLAGS_SHA224;
+		ctx->bs = SHA224_BLOCK_SIZE;
+		break;
+	case SHA256_DIGEST_SIZE:
+		ctx->flags |= SHA_FLAGS_SHA256;
+		ctx->bs = SHA256_BLOCK_SIZE;
+		break;
+	case SHA384_DIGEST_SIZE:
+		ctx->flags |= SHA_FLAGS_SHA384;
+		ctx->bs = SHA384_BLOCK_SIZE;
+		break;
+	case SHA512_DIGEST_SIZE:
+		ctx->flags |= SHA_FLAGS_SHA512;
+		ctx->bs = SHA512_BLOCK_SIZE;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	ctx->bufcnt = 0;
+	ctx->digcnt = 0;
+	ctx->buffer = tctx->buf;
+	ctx->start = true;
+
+	if (tctx->flags & SHA_FLAGS_HMAC) {
+		struct mtk_sha_hmac_ctx *bctx = tctx->base;
+
+		memcpy(ctx->buffer, bctx->ipad, ctx->bs);
+		ctx->bufcnt = ctx->bs;
+		ctx->flags |= SHA_FLAGS_HMAC;
+	}
+
+	return 0;
+}
+
+static int mtk_sha_xmit(struct mtk_cryp *cryp, struct mtk_sha *sha,
+			dma_addr_t addr, size_t len)
+{
+	struct mtk_ring *ring = cryp->ring[sha->id];
+	struct mtk_desc *cmd = ring->cmd_base + ring->pos;
+	struct mtk_desc *res = ring->res_base + ring->pos;
+	int err;
+
+	err = mtk_sha_info_map(cryp, sha, len);
+	if (err)
+		return err;
+
+	/* Fill command and result descriptors */
+	res->hdr = MTK_DESC_FIRST | MTK_DESC_LAST |
+		    MTK_DESC_BUF_LEN(len);
+
+	res->buf = cryp->tmp_dma;
+
+	cmd->hdr = MTK_DESC_FIRST | MTK_DESC_LAST |
+		    MTK_DESC_BUF_LEN(len) |
+		    MTK_DESC_CT_LEN(sha->ct_size);
+
+	cmd->buf = addr;
+	cmd->ct = sha->ct_dma;
+	cmd->ct_hdr = sha->ct_hdr;
+	cmd->tfm = sha->tfm_dma;
+
+	if (++ring->pos == MTK_MAX_DESC_NUM)
+		ring->pos = 0;
+
+	/*
+	 * make sure that all changes to the dma ring are done before we
+	 * start engine.
+	 */
+	wmb();
+	/* Start DMA transfer */
+	mtk_sha_write(cryp, RDR_PREP_COUNT(sha->id), MTK_DESC_CNT(1));
+	mtk_sha_write(cryp, CDR_PREP_COUNT(sha->id), MTK_DESC_CNT(1));
+
+	return -EINPROGRESS;
+}
+
+static int mtk_sha_xmit2(struct mtk_cryp *cryp, struct mtk_sha *sha,
+			 struct mtk_sha_reqctx *ctx, size_t len1, size_t len2)
+{
+	struct mtk_ring *ring = cryp->ring[sha->id];
+	struct mtk_desc *cmd = ring->cmd_base + ring->pos;
+	struct mtk_desc *res = ring->res_base + ring->pos;
+	int err;
+
+	err = mtk_sha_info_map(cryp, sha, len1 + len2);
+	if (err)
+		return err;
+
+	/* Fill command and result descriptors */
+	res->hdr = MTK_DESC_BUF_LEN(len1) | MTK_DESC_FIRST;
+	res->buf = cryp->tmp_dma;
+
+	cmd->hdr = MTK_DESC_BUF_LEN(len1) | MTK_DESC_FIRST |
+					MTK_DESC_CT_LEN(sha->ct_size);
+	cmd->buf = sg_dma_address(ctx->sg);
+	cmd->ct = sha->ct_dma;
+	cmd->ct_hdr = sha->ct_hdr;
+	cmd->tfm = sha->tfm_dma;
+
+	if (++ring->pos == MTK_MAX_DESC_NUM)
+		ring->pos = 0;
+
+	cmd = ring->cmd_base + ring->pos;
+	res = ring->res_base + ring->pos;
+
+	res->hdr = MTK_DESC_BUF_LEN(len2) | MTK_DESC_LAST;
+	res->buf = cryp->tmp_dma;
+
+	cmd->hdr = MTK_DESC_BUF_LEN(len2) | MTK_DESC_LAST;
+	cmd->buf = ctx->dma_addr;
+
+	if (++ring->pos == MTK_MAX_DESC_NUM)
+		ring->pos = 0;
+
+	/*
+	 * make sure that all changes to the dma ring are done before we
+	 * start engine.
+	 */
+	wmb();
+	/* Start DMA transfer */
+	mtk_sha_write(cryp, RDR_PREP_COUNT(sha->id), MTK_DESC_CNT(2));
+	mtk_sha_write(cryp, CDR_PREP_COUNT(sha->id), MTK_DESC_CNT(2));
+
+	return -EINPROGRESS;
+}
+
+static int mtk_sha_dma_map(struct mtk_cryp *cryp, struct mtk_sha *sha,
+			   struct mtk_sha_reqctx *ctx, size_t count)
+{
+	ctx->dma_addr = dma_map_single(cryp->dev, ctx->buffer,
+				SHA_BUFFER_LEN, DMA_TO_DEVICE);
+	if (unlikely(dma_mapping_error(cryp->dev, ctx->dma_addr))) {
+		dev_err(cryp->dev, "dma map error\n");
+		return -EINVAL;
+	}
+
+	ctx->flags &= ~SHA_FLAGS_SG;
+
+	return mtk_sha_xmit(cryp, sha, ctx->dma_addr, count);
+}
+
+static int mtk_sha_update_slow(struct mtk_cryp *cryp, struct mtk_sha *sha)
+{
+	struct mtk_sha_reqctx *ctx = ahash_request_ctx(sha->req);
+	size_t count;
+	u32 final;
+
+	mtk_sha_append_sg(ctx);
+
+	final = (ctx->flags & SHA_FLAGS_FINUP) && !ctx->total;
+
+	dev_dbg(cryp->dev, "slow: bufcnt: %u\n", ctx->bufcnt);
+
+	if (final) {
+		sha->flags |= SHA_FLAGS_FINAL;
+		mtk_sha_fill_padding(ctx, 0);
+	}
+
+	if (final || (ctx->bufcnt == SHA_BUFFER_LEN && ctx->total)) {
+		count = ctx->bufcnt;
+		ctx->bufcnt = 0;
+
+		return mtk_sha_dma_map(cryp, sha, ctx, count);
+	}
+	return 0;
+}
+
+static int mtk_sha_update_start(struct mtk_cryp *cryp, struct mtk_sha *sha)
+{
+	struct mtk_sha_reqctx *ctx = ahash_request_ctx(sha->req);
+	u32 len, final, tail;
+	struct scatterlist *sg;
+
+	if (!ctx->total)
+		return 0;
+
+	if (ctx->bufcnt || ctx->offset)
+		return mtk_sha_update_slow(cryp, sha);
+
+	sg = ctx->sg;
+
+	if (!IS_ALIGNED(sg->offset, sizeof(u32)))
+		return mtk_sha_update_slow(cryp, sha);
+
+	if (!sg_is_last(sg) && !IS_ALIGNED(sg->length, ctx->bs))
+		/* size is not ctx->bs aligned */
+		return mtk_sha_update_slow(cryp, sha);
+
+	len = min(ctx->total, sg->length);
+
+	if (sg_is_last(sg)) {
+		if (!(ctx->flags & SHA_FLAGS_FINUP)) {
+			/* not last sg must be ctx->bs aligned */
+			tail = len & (ctx->bs - 1);
+			len -= tail;
+		}
+	}
+
+	ctx->total -= len;
+	ctx->offset = len; /* offset where to start slow */
+
+	final = (ctx->flags & SHA_FLAGS_FINUP) && !ctx->total;
+
+	/* Add padding */
+	if (final) {
+		size_t count;
+
+		tail = len & (ctx->bs - 1);
+		len -= tail;
+		ctx->total += tail;
+		ctx->offset = len; /* offset where to start slow */
+
+		sg = ctx->sg;
+		mtk_sha_append_sg(ctx);
+		mtk_sha_fill_padding(ctx, len);
+
+		ctx->dma_addr = dma_map_single(cryp->dev, ctx->buffer,
+			SHA_BUFFER_LEN, DMA_TO_DEVICE);
+		if (unlikely(dma_mapping_error(cryp->dev, ctx->dma_addr))) {
+			dev_err(cryp->dev, "dma map bytes error\n");
+			return -EINVAL;
+		}
+
+		sha->flags |= SHA_FLAGS_FINAL;
+		count = ctx->bufcnt;
+		ctx->bufcnt = 0;
+
+		if (len == 0) {
+			ctx->flags &= ~SHA_FLAGS_SG;
+			return mtk_sha_xmit(cryp, sha, ctx->dma_addr, count);
+
+		} else {
+			ctx->sg = sg;
+			if (!dma_map_sg(cryp->dev, ctx->sg, 1, DMA_TO_DEVICE)) {
+				dev_err(cryp->dev, "dma_map_sg error\n");
+				return -EINVAL;
+			}
+
+			ctx->flags |= SHA_FLAGS_SG;
+			return mtk_sha_xmit2(cryp, sha, ctx, len, count);
+		}
+	}
+
+	if (!dma_map_sg(cryp->dev, ctx->sg, 1, DMA_TO_DEVICE)) {
+		dev_err(cryp->dev, "dma_map_sg  error\n");
+		return -EINVAL;
+	}
+
+	ctx->flags |= SHA_FLAGS_SG;
+
+	return mtk_sha_xmit(cryp, sha, sg_dma_address(ctx->sg), len);
+}
+
+static int mtk_sha_final_req(struct mtk_cryp *cryp, struct mtk_sha *sha)
+{
+	struct ahash_request *req = sha->req;
+	struct mtk_sha_reqctx *ctx = ahash_request_ctx(req);
+	size_t count;
+
+	mtk_sha_fill_padding(ctx, 0);
+
+	sha->flags |= SHA_FLAGS_FINAL;
+	count = ctx->bufcnt;
+	ctx->bufcnt = 0;
+
+	return mtk_sha_dma_map(cryp, sha, ctx, count);
+}
+
+/* copy ready hash (+ finalize hmac) */
+static int mtk_sha_finish(struct ahash_request *req)
+{
+	struct mtk_sha_reqctx *ctx = ahash_request_ctx(req);
+	u8 *digest = ctx->info.tfm.digest;
+
+	memcpy(req->result, digest, ctx->ds);
+
+	if (ctx->flags & SHA_FLAGS_HMAC)
+		return mtk_sha_finish_hmac(req);
+
+	return 0;
+}
+
+static void mtk_sha_finish_req(struct mtk_cryp *cryp,
+			       struct mtk_sha *sha, int err)
+{
+	if (likely(!err && (SHA_FLAGS_FINAL & sha->flags)))
+		err = mtk_sha_finish(sha->req);
+
+	sha->flags &= ~(SHA_FLAGS_BUSY | SHA_FLAGS_FINAL);
+
+	sha->req->base.complete(&sha->req->base, err);
+
+	/* handle new request */
+	mtk_sha_handle_queue(cryp, sha->id - RING2, NULL);
+}
+
+static int mtk_sha_handle_queue(struct mtk_cryp *cryp, u8 id,
+				struct ahash_request *req)
+{
+	struct mtk_sha *sha = cryp->sha[id];
+	struct crypto_async_request *async_req, *backlog;
+	struct mtk_sha_reqctx *ctx;
+	unsigned long flags;
+	int err = 0, ret = 0;
+
+	spin_lock_irqsave(&sha->lock, flags);
+	if (req)
+		ret = ahash_enqueue_request(&sha->queue, req);
+
+	if (SHA_FLAGS_BUSY & sha->flags) {
+		spin_unlock_irqrestore(&sha->lock, flags);
+		return ret;
+	}
+
+	backlog = crypto_get_backlog(&sha->queue);
+	async_req = crypto_dequeue_request(&sha->queue);
+	if (async_req)
+		sha->flags |= SHA_FLAGS_BUSY;
+	spin_unlock_irqrestore(&sha->lock, flags);
+
+	if (!async_req)
+		return ret;
+
+	if (backlog)
+		backlog->complete(backlog, -EINPROGRESS);
+
+	req = ahash_request_cast(async_req);
+	ctx = ahash_request_ctx(req);
+
+	sha->req = req;
+	sha->info = &ctx->info;
+
+	mtk_sha_info_init(sha, ctx);
+
+	if (ctx->op == SHA_OP_UPDATE) {
+		err = mtk_sha_update_start(cryp, sha);
+		if (err != -EINPROGRESS && (ctx->flags & SHA_FLAGS_FINUP))
+			/* no final() after finup() */
+			err = mtk_sha_final_req(cryp, sha);
+	} else if (ctx->op == SHA_OP_FINAL) {
+		err = mtk_sha_final_req(cryp, sha);
+	}
+
+	if (unlikely(err != -EINPROGRESS))
+		/* task will not finish it, so do it here */
+		mtk_sha_finish_req(cryp, sha, err);
+
+	return ret;
+}
+
+static int mtk_sha_enqueue(struct ahash_request *req, u32 op)
+{
+	struct mtk_sha_reqctx *ctx = ahash_request_ctx(req);
+	struct mtk_sha_ctx *tctx = crypto_tfm_ctx(req->base.tfm);
+
+	ctx->op = op;
+
+	return mtk_sha_handle_queue(tctx->cryp, tctx->id, req);
+}
+
+static void mtk_sha_unmap(struct mtk_cryp *cryp, struct mtk_sha *sha)
+{
+	struct mtk_sha_reqctx *ctx = ahash_request_ctx(sha->req);
+
+	dma_unmap_single(cryp->dev, sha->ct_dma,
+			 sizeof(struct mtk_sha_info), DMA_BIDIRECTIONAL);
+
+	if (ctx->flags & SHA_FLAGS_SG) {
+		dma_unmap_sg(cryp->dev, ctx->sg, 1, DMA_TO_DEVICE);
+		if (ctx->sg->length == ctx->offset) {
+			ctx->sg = sg_next(ctx->sg);
+			if (ctx->sg)
+				ctx->offset = 0;
+		}
+		if (ctx->flags & SHA_FLAGS_PAD) {
+			dma_unmap_single(cryp->dev, ctx->dma_addr,
+					 SHA_BUFFER_LEN, DMA_TO_DEVICE);
+		}
+	} else
+		dma_unmap_single(cryp->dev, ctx->dma_addr,
+				 SHA_BUFFER_LEN, DMA_TO_DEVICE);
+}
+
+static void mtk_sha_complete(struct mtk_cryp *cryp, struct mtk_sha *sha)
+{
+	int err = 0;
+
+	err = mtk_sha_update_start(cryp, sha);
+	if (err != -EINPROGRESS)
+		mtk_sha_finish_req(cryp, sha, err);
+}
+
+static int mtk_sha_update(struct ahash_request *req)
+{
+	struct mtk_sha_reqctx *ctx = ahash_request_ctx(req);
+
+	ctx->total = req->nbytes;
+	ctx->sg = req->src;
+	ctx->offset = 0;
+
+	if ((ctx->bufcnt + ctx->total < SHA_BUFFER_LEN) &&
+	    !(ctx->flags & SHA_FLAGS_FINUP))
+		return mtk_sha_append_sg(ctx);
+
+	return mtk_sha_enqueue(req, SHA_OP_UPDATE);
+}
+
+static int mtk_sha_final(struct ahash_request *req)
+{
+	struct mtk_sha_reqctx *ctx = ahash_request_ctx(req);
+
+	ctx->flags |= SHA_FLAGS_FINUP;
+
+	if (ctx->flags & SHA_FLAGS_PAD)
+		return mtk_sha_finish(req);
+
+	return mtk_sha_enqueue(req, SHA_OP_FINAL);
+}
+
+static int mtk_sha_finup(struct ahash_request *req)
+{
+	struct mtk_sha_reqctx *ctx = ahash_request_ctx(req);
+	int err1, err2;
+
+	ctx->flags |= SHA_FLAGS_FINUP;
+
+	err1 = mtk_sha_update(req);
+	if (err1 == -EINPROGRESS || err1 == -EBUSY)
+		return err1;
+	/*
+	 * final() has to be always called to cleanup resources
+	 * even if update() failed
+	 */
+	err2 = mtk_sha_final(req);
+
+	return err1 ?: err2;
+}
+
+static int mtk_sha_digest(struct ahash_request *req)
+{
+	return mtk_sha_init(req) ?: mtk_sha_finup(req);
+}
+
+static int mtk_sha_setkey(struct crypto_ahash *tfm,
+			  const unsigned char *key, u32 keylen)
+{
+	struct mtk_sha_ctx *tctx = crypto_ahash_ctx(tfm);
+	struct mtk_sha_hmac_ctx *bctx = tctx->base;
+	size_t bs = crypto_shash_blocksize(bctx->shash);
+	size_t ds = crypto_shash_digestsize(bctx->shash);
+	int err, i;
+
+	SHASH_DESC_ON_STACK(shash, bctx->shash);
+
+	shash->tfm = bctx->shash;
+	shash->flags = crypto_shash_get_flags(bctx->shash) &
+			CRYPTO_TFM_REQ_MAY_SLEEP;
+
+	if (keylen > bs) {
+		err = crypto_shash_digest(shash, key, keylen, bctx->ipad);
+		if (err)
+			return err;
+		keylen = ds;
+	} else {
+		memcpy(bctx->ipad, key, keylen);
+	}
+
+	memset(bctx->ipad + keylen, 0, bs - keylen);
+	memcpy(bctx->opad, bctx->ipad, bs);
+
+	for (i = 0; i < bs; i++) {
+		bctx->ipad[i] ^= 0x36;
+		bctx->opad[i] ^= 0x5c;
+	}
+
+	return err;
+}
+
+static int mtk_sha_export(struct ahash_request *req, void *out)
+{
+	const struct mtk_sha_reqctx *ctx = ahash_request_ctx(req);
+
+	memcpy(out, ctx, sizeof(*ctx));
+	return 0;
+}
+
+static int mtk_sha_import(struct ahash_request *req, const void *in)
+{
+	struct mtk_sha_reqctx *ctx = ahash_request_ctx(req);
+
+	memcpy(ctx, in, sizeof(*ctx));
+	return 0;
+}
+
+static int mtk_sha_cra_init_alg(struct crypto_tfm *tfm,
+				const char *alg_base)
+{
+	struct mtk_sha_ctx *tctx = crypto_tfm_ctx(tfm);
+	struct mtk_cryp *cryp = NULL;
+
+	cryp = mtk_sha_find_dev(tctx);
+	if (!cryp)
+		return -ENODEV;
+
+	crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm),
+				 sizeof(struct mtk_sha_reqctx));
+
+	if (alg_base) {
+		struct mtk_sha_hmac_ctx *bctx = tctx->base;
+
+		tctx->flags |= SHA_FLAGS_HMAC;
+		bctx->shash = crypto_alloc_shash(alg_base, 0,
+					CRYPTO_ALG_NEED_FALLBACK);
+		if (IS_ERR(bctx->shash)) {
+			pr_err("base driver %s could not be loaded.\n",
+			       alg_base);
+
+			return PTR_ERR(bctx->shash);
+		}
+	}
+	return 0;
+}
+
+static int mtk_sha_cra_init(struct crypto_tfm *tfm)
+{
+	return mtk_sha_cra_init_alg(tfm, NULL);
+}
+
+static int mtk_sha_cra_sha1_init(struct crypto_tfm *tfm)
+{
+	return mtk_sha_cra_init_alg(tfm, "sha1");
+}
+
+static int mtk_sha_cra_sha224_init(struct crypto_tfm *tfm)
+{
+	return mtk_sha_cra_init_alg(tfm, "sha224");
+}
+
+static int mtk_sha_cra_sha256_init(struct crypto_tfm *tfm)
+{
+	return mtk_sha_cra_init_alg(tfm, "sha256");
+}
+
+static int mtk_sha_cra_sha384_init(struct crypto_tfm *tfm)
+{
+	return mtk_sha_cra_init_alg(tfm, "sha384");
+}
+
+static int mtk_sha_cra_sha512_init(struct crypto_tfm *tfm)
+{
+	return mtk_sha_cra_init_alg(tfm, "sha512");
+}
+
+static void mtk_sha_cra_exit(struct crypto_tfm *tfm)
+{
+	struct mtk_sha_ctx *tctx = crypto_tfm_ctx(tfm);
+
+	if (tctx->flags & SHA_FLAGS_HMAC) {
+		struct mtk_sha_hmac_ctx *bctx = tctx->base;
+
+		crypto_free_shash(bctx->shash);
+	}
+}
+
+static struct ahash_alg algs_sha1_sha224_sha256[] = {
+{
+	.init		= mtk_sha_init,
+	.update		= mtk_sha_update,
+	.final		= mtk_sha_final,
+	.finup		= mtk_sha_finup,
+	.digest		= mtk_sha_digest,
+	.export		= mtk_sha_export,
+	.import		= mtk_sha_import,
+	.halg.digestsize	= SHA1_DIGEST_SIZE,
+	.halg.statesize = sizeof(struct mtk_sha_reqctx),
+	.halg.base	= {
+		.cra_name		= "sha1",
+		.cra_driver_name	= "mtk-sha1",
+		.cra_priority		= 400,
+		.cra_flags		= CRYPTO_ALG_ASYNC,
+		.cra_blocksize		= SHA1_BLOCK_SIZE,
+		.cra_ctxsize		= sizeof(struct mtk_sha_ctx),
+		.cra_alignmask		= SHA_ALIGN_MSK,
+		.cra_module		= THIS_MODULE,
+		.cra_init		= mtk_sha_cra_init,
+		.cra_exit		= mtk_sha_cra_exit,
+	}
+},
+{
+	.init		= mtk_sha_init,
+	.update		= mtk_sha_update,
+	.final		= mtk_sha_final,
+	.finup		= mtk_sha_finup,
+	.digest		= mtk_sha_digest,
+	.export		= mtk_sha_export,
+	.import		= mtk_sha_import,
+	.halg.digestsize	= SHA224_DIGEST_SIZE,
+	.halg.statesize = sizeof(struct mtk_sha_reqctx),
+	.halg.base	= {
+		.cra_name		= "sha224",
+		.cra_driver_name	= "mtk-sha224",
+		.cra_priority		= 400,
+		.cra_flags		= CRYPTO_ALG_ASYNC,
+		.cra_blocksize		= SHA224_BLOCK_SIZE,
+		.cra_ctxsize		= sizeof(struct mtk_sha_ctx),
+		.cra_alignmask		= SHA_ALIGN_MSK,
+		.cra_module		= THIS_MODULE,
+		.cra_init		= mtk_sha_cra_init,
+		.cra_exit		= mtk_sha_cra_exit,
+	}
+},
+{
+	.init		= mtk_sha_init,
+	.update		= mtk_sha_update,
+	.final		= mtk_sha_final,
+	.finup		= mtk_sha_finup,
+	.digest		= mtk_sha_digest,
+	.export		= mtk_sha_export,
+	.import		= mtk_sha_import,
+	.halg.digestsize	= SHA256_DIGEST_SIZE,
+	.halg.statesize = sizeof(struct mtk_sha_reqctx),
+	.halg.base	= {
+		.cra_name		= "sha256",
+		.cra_driver_name	= "mtk-sha256",
+		.cra_priority		= 400,
+		.cra_flags		= CRYPTO_ALG_ASYNC,
+		.cra_blocksize		= SHA256_BLOCK_SIZE,
+		.cra_ctxsize		= sizeof(struct mtk_sha_ctx),
+		.cra_alignmask		= SHA_ALIGN_MSK,
+		.cra_module		= THIS_MODULE,
+		.cra_init		= mtk_sha_cra_init,
+		.cra_exit		= mtk_sha_cra_exit,
+	}
+},
+{
+	.init		= mtk_sha_init,
+	.update		= mtk_sha_update,
+	.final		= mtk_sha_final,
+	.finup		= mtk_sha_finup,
+	.digest		= mtk_sha_digest,
+	.export		= mtk_sha_export,
+	.import		= mtk_sha_import,
+	.setkey		= mtk_sha_setkey,
+	.halg.digestsize	= SHA1_DIGEST_SIZE,
+	.halg.statesize = sizeof(struct mtk_sha_reqctx),
+	.halg.base	= {
+		.cra_name		= "hmac(sha1)",
+		.cra_driver_name	= "mtk-hmac-sha1",
+		.cra_priority		= 400,
+		.cra_flags		= CRYPTO_ALG_ASYNC |
+					  CRYPTO_ALG_NEED_FALLBACK,
+		.cra_blocksize		= SHA1_BLOCK_SIZE,
+		.cra_ctxsize		= sizeof(struct mtk_sha_ctx) +
+					sizeof(struct mtk_sha_hmac_ctx),
+		.cra_alignmask		= SHA_ALIGN_MSK,
+		.cra_module		= THIS_MODULE,
+		.cra_init		= mtk_sha_cra_sha1_init,
+		.cra_exit		= mtk_sha_cra_exit,
+	}
+},
+{
+	.init		= mtk_sha_init,
+	.update		= mtk_sha_update,
+	.final		= mtk_sha_final,
+	.finup		= mtk_sha_finup,
+	.digest		= mtk_sha_digest,
+	.export		= mtk_sha_export,
+	.import		= mtk_sha_import,
+	.setkey		= mtk_sha_setkey,
+	.halg.digestsize	= SHA224_DIGEST_SIZE,
+	.halg.statesize = sizeof(struct mtk_sha_reqctx),
+	.halg.base	= {
+		.cra_name		= "hmac(sha224)",
+		.cra_driver_name	= "mtk-hmac-sha224",
+		.cra_priority		= 400,
+		.cra_flags		= CRYPTO_ALG_ASYNC |
+					  CRYPTO_ALG_NEED_FALLBACK,
+		.cra_blocksize		= SHA224_BLOCK_SIZE,
+		.cra_ctxsize		= sizeof(struct mtk_sha_ctx) +
+					sizeof(struct mtk_sha_hmac_ctx),
+		.cra_alignmask		= SHA_ALIGN_MSK,
+		.cra_module		= THIS_MODULE,
+		.cra_init		= mtk_sha_cra_sha224_init,
+		.cra_exit		= mtk_sha_cra_exit,
+	}
+},
+{
+	.init		= mtk_sha_init,
+	.update		= mtk_sha_update,
+	.final		= mtk_sha_final,
+	.finup		= mtk_sha_finup,
+	.digest		= mtk_sha_digest,
+	.export		= mtk_sha_export,
+	.import		= mtk_sha_import,
+	.setkey		= mtk_sha_setkey,
+	.halg.digestsize	= SHA256_DIGEST_SIZE,
+	.halg.statesize = sizeof(struct mtk_sha_reqctx),
+	.halg.base	= {
+		.cra_name		= "hmac(sha256)",
+		.cra_driver_name	= "mtk-hmac-sha256",
+		.cra_priority		= 400,
+		.cra_flags		= CRYPTO_ALG_ASYNC |
+					  CRYPTO_ALG_NEED_FALLBACK,
+		.cra_blocksize		= SHA256_BLOCK_SIZE,
+		.cra_ctxsize		= sizeof(struct mtk_sha_ctx) +
+					sizeof(struct mtk_sha_hmac_ctx),
+		.cra_alignmask		= SHA_ALIGN_MSK,
+		.cra_module		= THIS_MODULE,
+		.cra_init		= mtk_sha_cra_sha256_init,
+		.cra_exit		= mtk_sha_cra_exit,
+	}
+},
+};
+
+static struct ahash_alg algs_sha384_sha512[] = {
+{
+	.init		= mtk_sha_init,
+	.update		= mtk_sha_update,
+	.final		= mtk_sha_final,
+	.finup		= mtk_sha_finup,
+	.digest		= mtk_sha_digest,
+	.export		= mtk_sha_export,
+	.import		= mtk_sha_import,
+	.halg.digestsize	= SHA384_DIGEST_SIZE,
+	.halg.statesize = sizeof(struct mtk_sha_reqctx),
+	.halg.base	= {
+		.cra_name		= "sha384",
+		.cra_driver_name	= "mtk-sha384",
+		.cra_priority		= 400,
+		.cra_flags		= CRYPTO_ALG_ASYNC,
+		.cra_blocksize		= SHA384_BLOCK_SIZE,
+		.cra_ctxsize		= sizeof(struct mtk_sha_ctx),
+		.cra_alignmask		= SHA_ALIGN_MSK,
+		.cra_module		= THIS_MODULE,
+		.cra_init		= mtk_sha_cra_init,
+		.cra_exit		= mtk_sha_cra_exit,
+	}
+},
+{
+	.init		= mtk_sha_init,
+	.update		= mtk_sha_update,
+	.final		= mtk_sha_final,
+	.finup		= mtk_sha_finup,
+	.digest		= mtk_sha_digest,
+	.export		= mtk_sha_export,
+	.import		= mtk_sha_import,
+	.halg.digestsize	= SHA512_DIGEST_SIZE,
+	.halg.statesize = sizeof(struct mtk_sha_reqctx),
+	.halg.base	= {
+		.cra_name		= "sha512",
+		.cra_driver_name	= "mtk-sha512",
+		.cra_priority		= 400,
+		.cra_flags		= CRYPTO_ALG_ASYNC,
+		.cra_blocksize		= SHA512_BLOCK_SIZE,
+		.cra_ctxsize		= sizeof(struct mtk_sha_ctx),
+		.cra_alignmask		= SHA_ALIGN_MSK,
+		.cra_module		= THIS_MODULE,
+		.cra_init		= mtk_sha_cra_init,
+		.cra_exit		= mtk_sha_cra_exit,
+	}
+},
+{
+	.init		= mtk_sha_init,
+	.update		= mtk_sha_update,
+	.final		= mtk_sha_final,
+	.finup		= mtk_sha_finup,
+	.digest		= mtk_sha_digest,
+	.export		= mtk_sha_export,
+	.import		= mtk_sha_import,
+	.setkey		= mtk_sha_setkey,
+	.halg.digestsize	= SHA384_DIGEST_SIZE,
+	.halg.statesize = sizeof(struct mtk_sha_reqctx),
+	.halg.base	= {
+		.cra_name		= "hmac(sha384)",
+		.cra_driver_name	= "mtk-hmac-sha384",
+		.cra_priority		= 400,
+		.cra_flags		= CRYPTO_ALG_ASYNC |
+					  CRYPTO_ALG_NEED_FALLBACK,
+		.cra_blocksize		= SHA384_BLOCK_SIZE,
+		.cra_ctxsize		= sizeof(struct mtk_sha_ctx) +
+					sizeof(struct mtk_sha_hmac_ctx),
+		.cra_alignmask		= SHA_ALIGN_MSK,
+		.cra_module		= THIS_MODULE,
+		.cra_init		= mtk_sha_cra_sha384_init,
+		.cra_exit		= mtk_sha_cra_exit,
+	}
+},
+{
+	.init		= mtk_sha_init,
+	.update		= mtk_sha_update,
+	.final		= mtk_sha_final,
+	.finup		= mtk_sha_finup,
+	.digest		= mtk_sha_digest,
+	.export		= mtk_sha_export,
+	.import		= mtk_sha_import,
+	.setkey		= mtk_sha_setkey,
+	.halg.digestsize	= SHA512_DIGEST_SIZE,
+	.halg.statesize = sizeof(struct mtk_sha_reqctx),
+	.halg.base	= {
+		.cra_name		= "hmac(sha512)",
+		.cra_driver_name	= "mtk-hmac-sha512",
+		.cra_priority		= 400,
+		.cra_flags		= CRYPTO_ALG_ASYNC |
+					  CRYPTO_ALG_NEED_FALLBACK,
+		.cra_blocksize		= SHA512_BLOCK_SIZE,
+		.cra_ctxsize		= sizeof(struct mtk_sha_ctx) +
+					sizeof(struct mtk_sha_hmac_ctx),
+		.cra_alignmask		= SHA_ALIGN_MSK,
+		.cra_module		= THIS_MODULE,
+		.cra_init		= mtk_sha_cra_sha512_init,
+		.cra_exit		= mtk_sha_cra_exit,
+	}
+},
+};
+
+static void mtk_sha_task0(unsigned long data)
+{
+	struct mtk_cryp *cryp = (struct mtk_cryp *)data;
+	struct mtk_sha *sha = cryp->sha[0];
+
+	mtk_sha_unmap(cryp, sha);
+	mtk_sha_complete(cryp, sha);
+}
+
+static void mtk_sha_task1(unsigned long data)
+{
+	struct mtk_cryp *cryp = (struct mtk_cryp *)data;
+	struct mtk_sha *sha = cryp->sha[1];
+
+	mtk_sha_unmap(cryp, sha);
+	mtk_sha_complete(cryp, sha);
+}
+
+static irqreturn_t mtk_sha_ring2_irq(int irq, void *dev_id)
+{
+	struct mtk_cryp *cryp = (struct mtk_cryp *)dev_id;
+	struct mtk_sha *sha = cryp->sha[0];
+	u32 val = mtk_sha_read(cryp, RDR_STAT(RING2));
+
+	mtk_sha_write(cryp, RDR_STAT(RING2), val);
+
+	if (likely((SHA_FLAGS_BUSY & sha->flags))) {
+		mtk_sha_write(cryp, RDR_PROC_COUNT(RING2), MTK_DESC_CNT_CLR);
+		mtk_sha_write(cryp, RDR_THRESH(RING2), MTK_RDR_THRESH_DEF);
+
+		tasklet_schedule(&sha->task);
+	} else {
+		dev_warn(cryp->dev, "AES interrupt when no active requests.\n");
+	}
+	return IRQ_HANDLED;
+}
+
+static irqreturn_t mtk_sha_ring3_irq(int irq, void *dev_id)
+{
+	struct mtk_cryp *cryp = (struct mtk_cryp *)dev_id;
+	struct mtk_sha *sha = cryp->sha[1];
+	u32 val = mtk_sha_read(cryp, RDR_STAT(RING3));
+
+	mtk_sha_write(cryp, RDR_STAT(RING3), val);
+
+	if (likely((SHA_FLAGS_BUSY & sha->flags))) {
+		mtk_sha_write(cryp, RDR_PROC_COUNT(RING3), MTK_DESC_CNT_CLR);
+		mtk_sha_write(cryp, RDR_THRESH(RING3), MTK_RDR_THRESH_DEF);
+
+		tasklet_schedule(&sha->task);
+	} else {
+		dev_warn(cryp->dev, "AES interrupt when no active requests.\n");
+	}
+	return IRQ_HANDLED;
+}
+
+/*
+ * The purpose of two SHA records is used to get extra performance.
+ * It is similar to mtk_aes_record_init().
+ */
+static int mtk_sha_record_init(struct mtk_cryp *cryp)
+{
+	struct mtk_sha **sha = cryp->sha;
+	int i, err = -ENOMEM;
+
+	for (i = 0; i < RECORD_NUM; i++) {
+		sha[i] = kzalloc(sizeof(**sha), GFP_KERNEL);
+		if (!sha[i])
+			goto err_cleanup;
+
+		sha[i]->id = i + RING2;
+
+		spin_lock_init(&sha[i]->lock);
+		crypto_init_queue(&sha[i]->queue, SHA_QUEUE_SIZE);
+	}
+
+	tasklet_init(&sha[0]->task, mtk_sha_task0, (unsigned long)cryp);
+	tasklet_init(&sha[1]->task, mtk_sha_task1, (unsigned long)cryp);
+
+	cryp->rec = 1;
+
+	return 0;
+
+err_cleanup:
+	for (; i--; )
+		kfree(sha[i]);
+	return err;
+}
+
+static void mtk_sha_record_free(struct mtk_cryp *cryp)
+{
+	int i;
+
+	for (i = 0; i < RECORD_NUM; i++) {
+		tasklet_kill(&cryp->sha[i]->task);
+		kfree(cryp->sha[i]);
+	}
+}
+
+static void mtk_sha_unregister_algs(void)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(algs_sha1_sha224_sha256); i++)
+		crypto_unregister_ahash(&algs_sha1_sha224_sha256[i]);
+
+	for (i = 0; i < ARRAY_SIZE(algs_sha384_sha512); i++)
+		crypto_unregister_ahash(&algs_sha384_sha512[i]);
+}
+
+static int mtk_sha_register_algs(void)
+{
+	int err, i;
+
+	for (i = 0; i < ARRAY_SIZE(algs_sha1_sha224_sha256); i++) {
+		err = crypto_register_ahash(&algs_sha1_sha224_sha256[i]);
+		if (err)
+			goto err_sha_224_256_algs;
+	}
+
+	for (i = 0; i < ARRAY_SIZE(algs_sha384_sha512); i++) {
+		err = crypto_register_ahash(&algs_sha384_sha512[i]);
+		if (err)
+			goto err_sha_384_512_algs;
+	}
+
+	return 0;
+
+err_sha_384_512_algs:
+	for (; i--; )
+		crypto_unregister_ahash(&algs_sha384_sha512[i]);
+	i = ARRAY_SIZE(algs_sha1_sha224_sha256);
+err_sha_224_256_algs:
+	for (; i--; )
+		crypto_unregister_ahash(&algs_sha1_sha224_sha256[i]);
+
+	return err;
+}
+
+int mtk_hash_alg_register(struct mtk_cryp *cryp)
+{
+	int err;
+
+	INIT_LIST_HEAD(&cryp->sha_list);
+
+	/* Initialize two hash records */
+	err = mtk_sha_record_init(cryp);
+	if (err)
+		goto err_record;
+
+	/* Ring2 irq is use by SHA record0 */
+	err = devm_request_irq(cryp->dev, cryp->irq[RING2],
+			       mtk_sha_ring2_irq, IRQF_TRIGGER_LOW,
+			       "mtk-sha", cryp);
+	if (err) {
+		dev_err(cryp->dev, "unable to request sha irq0.\n");
+		goto err_res;
+	}
+
+	/* Ring3 irq is use by SHA record1 */
+	err = devm_request_irq(cryp->dev, cryp->irq[RING3],
+			       mtk_sha_ring3_irq, IRQF_TRIGGER_LOW,
+			       "mtk-sha", cryp);
+	if (err) {
+		dev_err(cryp->dev, "unable to request sha irq1.\n");
+		goto err_res;
+	}
+
+	/* enable ring2 and ring3 interrupt for hash */
+	mtk_sha_write(cryp, AIC_ENABLE_SET(RING2), MTK_IRQ_RDR2);
+	mtk_sha_write(cryp, AIC_ENABLE_SET(RING3), MTK_IRQ_RDR3);
+
+	cryp->tmp = dma_alloc_coherent(cryp->dev, SHA_TMP_STATE_SIZE,
+					&cryp->tmp_dma, GFP_KERNEL);
+	if (!cryp->tmp) {
+		dev_err(cryp->dev, "unable to allocate tmp buffer.\n");
+		err = -EINVAL;
+		goto err_res;
+	}
+
+	spin_lock(&mtk_sha.lock);
+	list_add_tail(&cryp->sha_list, &mtk_sha.dev_list);
+	spin_unlock(&mtk_sha.lock);
+
+	err = mtk_sha_register_algs();
+	if (err)
+		goto err_algs;
+
+	return 0;
+
+err_algs:
+	spin_lock(&mtk_sha.lock);
+	list_del(&cryp->sha_list);
+	spin_unlock(&mtk_sha.lock);
+	dma_free_coherent(cryp->dev, SHA_TMP_STATE_SIZE,
+			  cryp->tmp, cryp->tmp_dma);
+err_res:
+	mtk_sha_record_free(cryp);
+err_record:
+
+	dev_err(cryp->dev, "mtk-sha initialization failed.\n");
+	return err;
+}
+
+void mtk_hash_alg_release(struct mtk_cryp *cryp)
+{
+	spin_lock(&mtk_sha.lock);
+	list_del(&cryp->sha_list);
+	spin_unlock(&mtk_sha.lock);
+
+	mtk_sha_unregister_algs();
+	dma_free_coherent(cryp->dev, SHA_TMP_STATE_SIZE,
+			  cryp->tmp, cryp->tmp_dma);
+	mtk_sha_record_free(cryp);
+}
-- 
1.9.1

^ permalink raw reply related

* [PATCH v1 0/2] Add MediaTek crypto acclelrator driver
From: Ryder Lee @ 2016-12-05  7:01 UTC (permalink / raw)
  To: Herbert Xu, David S. Miller, Matthias Brugger
  Cc: devicetree, linux-mediatek, linux-kernel, linux-crypto,
	linux-arm-kernel, Sean Wang, Roy Luo, Ryder Lee

Hello,

This adds support for the MediaTek hardware accelerator on 
mt7623 SoC.

This driver currently implement: 
- SHA1 and SHA2 family(HMAC) hash alogrithms.
- AES block cipher in CBC/ECB mode with 128/196/256 bits keys.

Changes since v1:
- remove EXPORT_SYMBOL
- remove unused PRNG setting
- sort headers in alphabetical order
- add a definition for IRQ unmber
- replace ambiguous definition
- add more annotation and function comment
- add COMPILE_TEST in Kconfig


Ryder Lee (2):
  Add crypto driver support for some MediaTek chips
  crypto: mediatek - add DT bindings documentation

 .../devicetree/bindings/crypto/mediatek-crypto.txt |   32 +
 drivers/crypto/Kconfig                             |   17 +
 drivers/crypto/Makefile                            |    1 +
 drivers/crypto/mediatek/Makefile                   |    2 +
 drivers/crypto/mediatek/mtk-aes.c                  |  763 +++++++++++
 drivers/crypto/mediatek/mtk-platform.c             |  580 ++++++++
 drivers/crypto/mediatek/mtk-platform.h             |  235 ++++
 drivers/crypto/mediatek/mtk-regs.h                 |  194 +++
 drivers/crypto/mediatek/mtk-sha.c                  | 1423 ++++++++++++++++++++
 9 files changed, 3247 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/crypto/mediatek-crypto.txt
 create mode 100644 drivers/crypto/mediatek/Makefile
 create mode 100644 drivers/crypto/mediatek/mtk-aes.c
 create mode 100644 drivers/crypto/mediatek/mtk-platform.c
 create mode 100644 drivers/crypto/mediatek/mtk-platform.h
 create mode 100644 drivers/crypto/mediatek/mtk-regs.h
 create mode 100644 drivers/crypto/mediatek/mtk-sha.c

-- 
1.9.1

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox