Linux cryptographic layer development
 help / color / mirror / Atom feed
* [PATCH 1/4] crypto: testmgr - avoid overlap in chunked tests
From: Ard Biesheuvel @ 2016-11-24 15:43 UTC (permalink / raw)
  To: linux-crypto, herbert, linux-arm-kernel, catalin.marinas,
	will.deacon, linux
  Cc: steve.capper, dingtianhong, yangshengkai, yuehaibing, hanjun.guo,
	Ard Biesheuvel
In-Reply-To: <1480002201-1427-1-git-send-email-ard.biesheuvel@linaro.org>

The IDXn offsets are chosen such that tap values (which may go up to
255) end up overlapping in the xbuf allocation. In particular, IDX1
and IDX3 are too close together, so update IDX3 to avoid this issue.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
 crypto/testmgr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index 62dffa0028ac..15650597dcc9 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -62,7 +62,7 @@ int alg_test(const char *driver, const char *alg, u32 type, u32 mask)
  */
 #define IDX1		32
 #define IDX2		32400
-#define IDX3		1
+#define IDX3		511
 #define IDX4		8193
 #define IDX5		22222
 #define IDX6		17101
-- 
2.7.4

^ permalink raw reply related

* [PATCH 0/4] crypto: CRCT10DIF support for ARM and arm64
From: Ard Biesheuvel @ 2016-11-24 15:43 UTC (permalink / raw)
  To: linux-crypto, herbert, linux-arm-kernel, catalin.marinas,
	will.deacon, linux
  Cc: steve.capper, dingtianhong, yangshengkai, yuehaibing, hanjun.guo,
	Ard Biesheuvel

First of all, apologies to Yue Haibing for stealing his thunder, to some
extent. But after reviewing (and replying to) his patch, I noticed that his
code is not original code, but simply a transliteration of the existing Intel
code that resides in arch/x86/crypto/crct10dif-pcl-asm_64.S, but with the
license and copyright statement removed. 

So, if we are going to transliterate code, let's credit the original authors,
even if the resulting code does not look like the code you started out with.

Then, I noticed that we could stay *much* closer to the original, and that
there is no need for jump tables or computed gotos at all. So I got a bit
carried away, and ended up reimplementing the whole thing, for both arm and64
and ARM.

Patch #1 fixes an issue in testmgr that results in spurious false negatives
in the chunking tests if the third chunk exceeds 31 bytes.

Patch #2 expands the existing CRCT10DIF test cases, to ensure that all
code paths are actually covered.

Patch #3 is a straight transliteration of the Intel code to arm64.

Patch #4 is a straight transliteration of the Intel code to ARM. This patch
is against patch #3 (using --find-copies-harder) so that it is easy to
see how the ARM code deviates from the arm64 code.

NOTE: this code uses the 64x64->128 bit polynomial multiply instruction,
which is only available on cores that implement the v8 Crypto Extensions.

Ard Biesheuvel (4):
  crypto: testmgr - avoid overlap in chunked tests
  crypto: testmgr - add/enhance test cases for CRC-T10DIF
  crypto: arm64/crct10dif - port x86 SSE implementation to arm64
  crypto: arm/crct10dif - port x86 SSE implementation to ARM

 arch/arm/crypto/Kconfig               |   5 +
 arch/arm/crypto/Makefile              |   2 +
 arch/arm/crypto/crct10dif-ce-core.S   | 569 ++++++++++++++++++++
 arch/arm/crypto/crct10dif-ce-glue.c   |  89 +++
 arch/arm64/crypto/Kconfig             |   5 +
 arch/arm64/crypto/Makefile            |   3 +
 arch/arm64/crypto/crct10dif-ce-core.S | 518 ++++++++++++++++++
 arch/arm64/crypto/crct10dif-ce-glue.c |  80 +++
 crypto/testmgr.c                      |   2 +-
 crypto/testmgr.h                      |  70 ++-
 10 files changed, 1314 insertions(+), 29 deletions(-)
 create mode 100644 arch/arm/crypto/crct10dif-ce-core.S
 create mode 100644 arch/arm/crypto/crct10dif-ce-glue.c
 create mode 100644 arch/arm64/crypto/crct10dif-ce-core.S
 create mode 100644 arch/arm64/crypto/crct10dif-ce-glue.c

-- 
2.7.4

^ permalink raw reply

* [PATCH 4/4] crypto: arm/crct10dif - port x86 SSE implementation to ARM
From: Ard Biesheuvel @ 2016-11-24 15:43 UTC (permalink / raw)
  To: linux-crypto, herbert, linux-arm-kernel, catalin.marinas,
	will.deacon, linux
  Cc: steve.capper, Ard Biesheuvel, yuehaibing, hanjun.guo,
	dingtianhong, yangshengkai
In-Reply-To: <1480002201-1427-1-git-send-email-ard.biesheuvel@linaro.org>

This is a straight transliteration of the Intel algorithm implemented
using SSE and PCLMULQDQ instructions that resides under in the file
arch/x86/crypto/crct10dif-pcl-asm_64.S.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
 arch/arm/crypto/Kconfig                        |   5 +
 arch/arm/crypto/Makefile                       |   2 +
 arch/{arm64 => arm}/crypto/crct10dif-ce-core.S | 457 +++++++++++---------
 arch/{arm64 => arm}/crypto/crct10dif-ce-glue.c |  23 +-
 4 files changed, 277 insertions(+), 210 deletions(-)

diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig
index 27ed1b1cd1d7..fce801fa52a1 100644
--- a/arch/arm/crypto/Kconfig
+++ b/arch/arm/crypto/Kconfig
@@ -120,4 +120,9 @@ config CRYPTO_GHASH_ARM_CE
 	  that uses the 64x64 to 128 bit polynomial multiplication (vmull.p64)
 	  that is part of the ARMv8 Crypto Extensions
 
+config CRYPTO_CRCT10DIF_ARM_CE
+	tristate "CRCT10DIF digest algorithm using PMULL instructions"
+	depends on KERNEL_MODE_NEON && CRC_T10DIF
+	select CRYPTO_HASH
+
 endif
diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
index fc5150702b64..fc77265014b7 100644
--- a/arch/arm/crypto/Makefile
+++ b/arch/arm/crypto/Makefile
@@ -13,6 +13,7 @@ ce-obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
 ce-obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o
 ce-obj-$(CONFIG_CRYPTO_SHA2_ARM_CE) += sha2-arm-ce.o
 ce-obj-$(CONFIG_CRYPTO_GHASH_ARM_CE) += ghash-arm-ce.o
+ce-obj-$(CONFIG_CRYPTO_CRCT10DIF_ARM_CE) += crct10dif-arm-ce.o
 
 ifneq ($(ce-obj-y)$(ce-obj-m),)
 ifeq ($(call as-instr,.fpu crypto-neon-fp-armv8,y,n),y)
@@ -36,6 +37,7 @@ sha1-arm-ce-y	:= sha1-ce-core.o sha1-ce-glue.o
 sha2-arm-ce-y	:= sha2-ce-core.o sha2-ce-glue.o
 aes-arm-ce-y	:= aes-ce-core.o aes-ce-glue.o
 ghash-arm-ce-y	:= ghash-ce-core.o ghash-ce-glue.o
+crct10dif-arm-ce-y	:= crct10dif-ce-core.o crct10dif-ce-glue.o
 
 quiet_cmd_perl = PERL    $@
       cmd_perl = $(PERL) $(<) > $(@)
diff --git a/arch/arm64/crypto/crct10dif-ce-core.S b/arch/arm/crypto/crct10dif-ce-core.S
similarity index 60%
copy from arch/arm64/crypto/crct10dif-ce-core.S
copy to arch/arm/crypto/crct10dif-ce-core.S
index 9148ebd3470a..30168b0f8581 100644
--- a/arch/arm64/crypto/crct10dif-ce-core.S
+++ b/arch/arm/crypto/crct10dif-ce-core.S
@@ -1,5 +1,5 @@
 //
-// Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
+// Accelerated CRC-T10DIF using ARM NEON and Crypto Extensions instructions
 //
 // Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
 //
@@ -71,20 +71,43 @@
 #include <linux/linkage.h>
 #include <asm/assembler.h>
 
-	.text
-	.cpu		generic+crypto
-
-	arg1_low32	.req	w0
-	arg2		.req	x1
-	arg3		.req	x2
+#ifdef CONFIG_CPU_ENDIAN_BE8
+#define CPU_LE(code...)
+#else
+#define CPU_LE(code...)		code
+#endif
 
-	vzr		.req	v13
+	.text
+	.fpu		crypto-neon-fp-armv8
+
+	arg1_low32	.req	r0
+	arg2		.req	r1
+	arg3		.req	r2
+
+	qzr		.req	q13
+
+	q0l		.req	d0
+	q0h		.req	d1
+	q1l		.req	d2
+	q1h		.req	d3
+	q2l		.req	d4
+	q2h		.req	d5
+	q3l		.req	d6
+	q3h		.req	d7
+	q4l		.req	d8
+	q4h		.req	d9
+	q5l		.req	d10
+	q5h		.req	d11
+	q6l		.req	d12
+	q6h		.req	d13
+	q7l		.req	d14
+	q7h		.req	d15
 
 ENTRY(crc_t10dif_pmull)
-	stp		x29, x30, [sp, #-32]!
-	mov		x29, sp
+	push		{r4, lr}
+	sub		sp, sp, #0x10
 
-	movi		vzr.16b, #0		// init zero register
+	vmov.i8		qzr, #0			// init zero register
 
 	// adjust the 16-bit initial_crc value, scale it to 32 bits
 	lsl		arg1_low32, arg1_low32, #16
@@ -93,41 +116,44 @@ ENTRY(crc_t10dif_pmull)
 	cmp		arg3, #256
 
 	// for sizes less than 128, we can't fold 64B at a time...
-	b.lt		_less_than_128
+	blt		_less_than_128
 
 	// load the initial crc value
 	// crc value does not need to be byte-reflected, but it needs
 	// to be moved to the high part of the register.
 	// because data will be byte-reflected and will align with
 	// initial crc at correct place.
-	movi		v10.16b, #0
-	mov		v10.s[3], arg1_low32		// initial crc
+	vmov		s0, arg1_low32		// initial crc
+	vext.8		q10, qzr, q0, #4
 
 	// receive the initial 64B data, xor the initial crc value
-	ld1		{v0.2d-v3.2d}, [arg2], #0x40
-	ld1		{v4.2d-v7.2d}, [arg2], #0x40
-CPU_LE(	rev64		v0.16b, v0.16b		)
-CPU_LE(	rev64		v1.16b, v1.16b		)
-CPU_LE(	rev64		v2.16b, v2.16b		)
-CPU_LE(	rev64		v3.16b, v3.16b		)
-CPU_LE(	rev64		v4.16b, v4.16b		)
-CPU_LE(	rev64		v5.16b, v5.16b		)
-CPU_LE(	rev64		v6.16b, v6.16b		)
-CPU_LE(	rev64		v7.16b, v7.16b		)
-
-	ext		v0.16b, v0.16b, v0.16b, #8
-	ext		v1.16b, v1.16b, v1.16b, #8
-	ext		v2.16b, v2.16b, v2.16b, #8
-	ext		v3.16b, v3.16b, v3.16b, #8
-	ext		v4.16b, v4.16b, v4.16b, #8
-	ext		v5.16b, v5.16b, v5.16b, #8
-	ext		v6.16b, v6.16b, v6.16b, #8
-	ext		v7.16b, v7.16b, v7.16b, #8
+	vld1.64		{q0-q1}, [arg2]!
+	vld1.64		{q2-q3}, [arg2]!
+	vld1.64		{q4-q5}, [arg2]!
+	vld1.64		{q6-q7}, [arg2]!
+CPU_LE(	vrev64.8	q0, q0			)
+CPU_LE(	vrev64.8	q1, q1			)
+CPU_LE(	vrev64.8	q2, q2			)
+CPU_LE(	vrev64.8	q3, q3			)
+CPU_LE(	vrev64.8	q4, q4			)
+CPU_LE(	vrev64.8	q5, q5			)
+CPU_LE(	vrev64.8	q6, q6			)
+CPU_LE(	vrev64.8	q7, q7			)
+
+	vext.8		q0, q0, q0, #8
+	vext.8		q1, q1, q1, #8
+	vext.8		q2, q2, q2, #8
+	vext.8		q3, q3, q3, #8
+	vext.8		q4, q4, q4, #8
+	vext.8		q5, q5, q5, #8
+	vext.8		q6, q6, q6, #8
+	vext.8		q7, q7, q7, #8
 
 	// XOR the initial_crc value
-	eor		v0.16b, v0.16b, v10.16b
+	veor.8		q0, q0, q10
 
-	ldr		q10, rk3	// xmm10 has rk3 and rk4
+	adrl		ip, rk3
+	vld1.64		{q10}, [ip]	// xmm10 has rk3 and rk4
 					// type of pmull instruction
 					// will determine which constant to use
 
@@ -146,32 +172,32 @@ CPU_LE(	rev64		v7.16b, v7.16b		)
 _fold_64_B_loop:
 
 	.macro		fold64, reg1, reg2
-	ld1		{v11.2d-v12.2d}, [arg2], #0x20
-CPU_LE(	rev64		v11.16b, v11.16b	)
-CPU_LE(	rev64		v12.16b, v12.16b	)
-	ext		v11.16b, v11.16b, v11.16b, #8
-	ext		v12.16b, v12.16b, v12.16b, #8
-
-	pmull2		v8.1q, \reg1\().2d, v10.2d
-	pmull		\reg1\().1q, \reg1\().1d, v10.1d
-	pmull2		v9.1q, \reg2\().2d, v10.2d
-	pmull		\reg2\().1q, \reg2\().1d, v10.1d
-
-	eor		\reg1\().16b, \reg1\().16b, v11.16b
-	eor		\reg2\().16b, \reg2\().16b, v12.16b
-	eor		\reg1\().16b, \reg1\().16b, v8.16b
-	eor		\reg2\().16b, \reg2\().16b, v9.16b
+	vld1.64		{q11-q12}, [arg2]!
+CPU_LE(	vrev64.8	q11, q11		)
+CPU_LE(	vrev64.8	q12, q12		)
+	vext.8		q11, q11, q11, #8
+	vext.8		q12, q12, q12, #8
+
+	vmull.p64	q8, \reg1\()h, d21
+	vmull.p64	\reg1\(), \reg1\()l, d20
+	vmull.p64	q9, \reg2\()h, d21
+	vmull.p64	\reg2\(), \reg2\()l, d20
+
+	veor.8		\reg1, \reg1, q11
+	veor.8		\reg2, \reg2, q12
+	veor.8		\reg1, \reg1, q8
+	veor.8		\reg2, \reg2, q9
 	.endm
 
-	fold64		v0, v1
-	fold64		v2, v3
-	fold64		v4, v5
-	fold64		v6, v7
+	fold64		q0, q1
+	fold64		q2, q3
+	fold64		q4, q5
+	fold64		q6, q7
 
 	subs		arg3, arg3, #128
 
 	// check if there is another 64B in the buffer to be able to fold
-	b.ge		_fold_64_B_loop
+	bge		_fold_64_B_loop
 
 	// at this point, the buffer pointer is pointing at the last y Bytes
 	// of the buffer the 64B of folded data is in 4 of the vector
@@ -181,46 +207,47 @@ CPU_LE(	rev64		v12.16b, v12.16b	)
 	// constants
 
 	.macro		fold16, rk, reg
-	ldr		q10, \rk
-	pmull		v8.1q, \reg\().1d, v10.1d
-	pmull2		\reg\().1q, \reg\().2d, v10.2d
-	eor		v7.16b, v7.16b, v8.16b
-	eor		v7.16b, v7.16b, \reg\().16b
+	vldr		d20, \rk
+	vldr		d21, \rk + 8
+	vmull.p64	q8, \reg\()l, d20
+	vmull.p64	\reg\(), \reg\()h, d21
+	veor.8		q7, q7, q8
+	veor.8		q7, q7, \reg
 	.endm
 
-	fold16		rk9, v0
-	fold16		rk11, v1
-	fold16		rk13, v2
-	fold16		rk15, v3
-	fold16		rk17, v4
-	fold16		rk19, v5
-	fold16		rk1, v6
+	fold16		rk9, q0
+	fold16		rk11, q1
+	fold16		rk13, q2
+	fold16		rk15, q3
+	fold16		rk17, q4
+	fold16		rk19, q5
+	fold16		rk1, q6
 
 	// instead of 64, we add 48 to the loop counter to save 1 instruction
 	// from the loop instead of a cmp instruction, we use the negative
 	// flag with the jl instruction
 	adds		arg3, arg3, #(128-16)
-	b.lt		_final_reduction_for_128
+	blt		_final_reduction_for_128
 
 	// now we have 16+y bytes left to reduce. 16 Bytes is in register v7
 	// and the rest is in memory. We can fold 16 bytes at a time if y>=16
 	// continue folding 16B at a time
 
 _16B_reduction_loop:
-	pmull		v8.1q, v7.1d, v10.1d
-	pmull2		v7.1q, v7.2d, v10.2d
-	eor		v7.16b, v7.16b, v8.16b
-
-	ld1		{v0.2d}, [arg2], #16
-CPU_LE(	rev64		v0.16b, v0.16b		)
-	ext		v0.16b, v0.16b, v0.16b, #8
-	eor		v7.16b, v7.16b, v0.16b
+	vmull.p64	q8, d14, d20
+	vmull.p64	q7, d15, d21
+	veor.8		q7, q7, q8
+
+	vld1.64		{q0}, [arg2]!
+CPU_LE(	vrev64.8	q0, q0		)
+	vext.8		q0, q0, q0, #8
+	veor.8		q7, q7, q0
 	subs		arg3, arg3, #16
 
 	// instead of a cmp instruction, we utilize the flags with the
 	// jge instruction equivalent of: cmp arg3, 16-16
 	// check if there is any more 16B in the buffer to be able to fold
-	b.ge		_16B_reduction_loop
+	bge		_16B_reduction_loop
 
 	// now we have 16+z bytes left to reduce, where 0<= z < 16.
 	// first, we reduce the data in the xmm7 register
@@ -229,99 +256,104 @@ _final_reduction_for_128:
 	// check if any more data to fold. If not, compute the CRC of
 	// the final 128 bits
 	adds		arg3, arg3, #16
-	b.eq		_128_done
+	beq		_128_done
 
 	// here we are getting data that is less than 16 bytes.
 	// since we know that there was data before the pointer, we can
 	// offset the input pointer before the actual point, to receive
 	// exactly 16 bytes. after that the registers need to be adjusted.
 _get_last_two_regs:
-	mov		v2.16b, v7.16b
+	vmov		q2, q7
 
 	add		arg2, arg2, arg3
 	sub		arg2, arg2, #16
-	ld1		{v1.2d}, [arg2]
-CPU_LE(	rev64		v1.16b, v1.16b		)
-	ext		v1.16b, v1.16b, v1.16b, #8
+	vld1.64		{q1}, [arg2]
+CPU_LE(	vrev64.8	q1, q1			)
+	vext.8		q1, q1, q1, #8
 
 	// get rid of the extra data that was loaded before
 	// load the shift constant
-	adr		x4, tbl_shf_table + 16
-	sub		x4, x4, arg3
-	ld1		{v0.16b}, [x4]
+	adr		lr, tbl_shf_table + 16
+	sub		lr, lr, arg3
+	vld1.8		{q0}, [lr]
 
 	// shift v2 to the left by arg3 bytes
-	tbl		v2.16b, {v2.16b}, v0.16b
+	vmov		q9, q2
+	vtbl.8		d4, {d18-d19}, d0
+	vtbl.8		d5, {d18-d19}, d1
 
 	// shift v7 to the right by 16-arg3 bytes
-	movi		v9.16b, #0x80
-	eor		v0.16b, v0.16b, v9.16b
-	tbl		v7.16b, {v7.16b}, v0.16b
+	vmov.i8		q9, #0x80
+	veor.8		q0, q0, q9
+	vmov		q9, q7
+	vtbl.8		d14, {d18-d19}, d0
+	vtbl.8		d15, {d18-d19}, d1
 
 	// blend
-	sshr		v0.16b, v0.16b, #7	// convert to 8-bit mask
-	bsl		v0.16b, v2.16b, v1.16b
+	vshr.s8		q0, q0, #7		// convert to 8-bit mask
+	vbsl.8		q0, q2, q1
 
 	// fold 16 Bytes
-	pmull		v8.1q, v7.1d, v10.1d
-	pmull2		v7.1q, v7.2d, v10.2d
-	eor		v7.16b, v7.16b, v8.16b
-	eor		v7.16b, v7.16b, v0.16b
+	vmull.p64	q8, d14, d20
+	vmull.p64	q7, d15, d21
+	veor.8		q7, q7, q8
+	veor.8		q7, q7, q0
 
 _128_done:
 	// compute crc of a 128-bit value
-	ldr		q10, rk5		// rk5 and rk6 in xmm10
+	vldr		d20, rk5
+	vldr		d21, rk6		// rk5 and rk6 in xmm10
 
 	// 64b fold
-	mov		v0.16b, v7.16b
-	ext		v7.16b, v7.16b, v7.16b, #8
-	pmull		v7.1q, v7.1d, v10.1d
-	ext		v0.16b, vzr.16b, v0.16b, #8
-	eor		v7.16b, v7.16b, v0.16b
+	vmov		q0, q7
+	vmull.p64	q7, d15, d20
+	vext.8		q0, qzr, q0, #8
+	veor.8		q7, q7, q0
 
 	// 32b fold
-	mov		v0.16b, v7.16b
-	mov		v0.s[3], vzr.s[0]
-	ext		v7.16b, v7.16b, vzr.16b, #12
-	ext		v9.16b, v10.16b, v10.16b, #8
-	pmull		v7.1q, v7.1d, v9.1d
-	eor		v7.16b, v7.16b, v0.16b
+	veor.8		d1, d1, d1
+	vmov		d0, d14
+	vmov		s2, s30
+	vext.8		q7, q7, qzr, #12
+	vmull.p64	q7, d14, d21
+	veor.8		q7, q7, q0
 
 	// barrett reduction
 _barrett:
-	ldr		q10, rk7
-	mov		v0.16b, v7.16b
-	ext		v7.16b, v7.16b, v7.16b, #8
+	vldr		d20, rk7
+	vldr		d21, rk8
+	vmov.8		q0, q7
 
-	pmull		v7.1q, v7.1d, v10.1d
-	ext		v7.16b, vzr.16b, v7.16b, #12
-	pmull2		v7.1q, v7.2d, v10.2d
-	ext		v7.16b, vzr.16b, v7.16b, #12
-	eor		v7.16b, v7.16b, v0.16b
-	mov		w0, v7.s[1]
+	vmull.p64	q7, d15, d20
+	vext.8		q7, qzr, q7, #12
+	vmull.p64	q7, d15, d21
+	vext.8		q7, qzr, q7, #12
+	veor.8		q7, q7, q0
+	vmov		r0, s29
 
 _cleanup:
 	// scale the result back to 16 bits
-	lsr		x0, x0, #16
-	ldp		x29, x30, [sp], #32
-	ret
+	lsr		r0, r0, #16
+	add		sp, sp, #0x10
+	pop		{r4, pc}
 
 	.align		4
 _less_than_128:
 
 	// check if there is enough buffer to be able to fold 16B at a time
 	cmp		arg3, #32
-	b.lt		_less_than_32
+	blt		_less_than_32
 
 	// now if there is, load the constants
-	ldr		q10, rk1		// rk1 and rk2 in xmm10
+	vldr		d20, rk1
+	vldr		d21, rk2		// rk1 and rk2 in xmm10
 
-	movi		v0.16b, #0
-	mov		v0.s[3], arg1_low32	// get the initial crc value
-	ld1		{v7.2d}, [arg2], #0x10
-CPU_LE(	rev64		v7.16b, v7.16b		)
-	ext		v7.16b, v7.16b, v7.16b, #8
-	eor		v7.16b, v7.16b, v0.16b
+	vmov.i8		q0, #0
+	vmov		s3, arg1_low32		// get the initial crc value
+	vld1.64		{q7}, [arg2]!
+CPU_LE(	vrev64.8	q7, q7		)
+	vext.8		q7, q7, q7, #8
+	veor.8		q7, q7, q0
 
 	// update the counter. subtract 32 instead of 16 to save one
 	// instruction from the loop
@@ -331,21 +363,23 @@ CPU_LE(	rev64		v7.16b, v7.16b		)
 
 	.align		4
 _less_than_32:
-	cbz		arg3, _cleanup
+	teq		arg3, #0
+	beq		_cleanup
 
-	movi		v0.16b, #0
-	mov		v0.s[3], arg1_low32	// get the initial crc value
+	vmov.i8		q0, #0
+	vmov		s3, arg1_low32		// get the initial crc value
 
 	cmp		arg3, #16
-	b.eq		_exact_16_left
-	b.lt		_less_than_16_left
+	beq		_exact_16_left
+	blt		_less_than_16_left
 
-	ld1		{v7.2d}, [arg2], #0x10
-CPU_LE(	rev64		v7.16b, v7.16b		)
-	ext		v7.16b, v7.16b, v7.16b, #8
-	eor		v7.16b, v7.16b, v0.16b
+	vld1.64		{q7}, [arg2]!
+CPU_LE(	vrev64.8	q7, q7		)
+	vext.8		q7, q7, q7, #8
+	veor.8		q7, q7, q0
 	sub		arg3, arg3, #16
-	ldr		q10, rk1		// rk1 and rk2 in xmm10
+	vldr		d20, rk1
+	vldr		d21, rk2		// rk1 and rk2 in xmm10
 	b		_get_last_two_regs
 
 	.align		4
@@ -353,117 +387,124 @@ _less_than_16_left:
 	// use stack space to load data less than 16 bytes, zero-out
 	// the 16B in memory first.
 
-	add		x11, sp, #0x10
-	stp		xzr, xzr, [x11]
+	vst1.8		{qzr}, [sp]
+	mov		ip, sp
 
 	cmp		arg3, #4
-	b.lt		_only_less_than_4
+	blt		_only_less_than_4
 
 	// backup the counter value
-	mov		x9, arg3
-	tbz		arg3, #3, _less_than_8_left
+	mov		lr, arg3
+	cmp		arg3, #8
+	blt		_less_than_8_left
 
 	// load 8 Bytes
-	ldr		x0, [arg2], #8
-	str		x0, [x11], #8
+	ldr		r0, [arg2], #4
+	ldr		r3, [arg2], #4
+	str		r0, [ip], #4
+	str		r3, [ip], #4
 	sub		arg3, arg3, #8
 
 _less_than_8_left:
-	tbz		arg3, #2, _less_than_4_left
+	cmp		arg3, #4
+	blt		_less_than_4_left
 
 	// load 4 Bytes
-	ldr		w0, [arg2], #4
-	str		w0, [x11], #4
+	ldr		r0, [arg2], #4
+	str		r0, [ip], #4
 	sub		arg3, arg3, #4
 
 _less_than_4_left:
-	tbz		arg3, #1, _less_than_2_left
+	cmp		arg3, #2
+	blt		_less_than_2_left
 
 	// load 2 Bytes
-	ldrh		w0, [arg2], #2
-	strh		w0, [x11], #2
+	ldrh		r0, [arg2], #2
+	strh		r0, [ip], #2
 	sub		arg3, arg3, #2
 
 _less_than_2_left:
-	cbz		arg3, _zero_left
+	cmp		arg3, #1
+	blt		_zero_left
 
 	// load 1 Byte
-	ldrb		w0, [arg2]
-	strb		w0, [x11]
+	ldrb		r0, [arg2]
+	strb		r0, [ip]
 
 _zero_left:
-	add		x11, sp, #0x10
-	ld1		{v7.2d}, [x11]
-CPU_LE(	rev64		v7.16b, v7.16b		)
-	ext		v7.16b, v7.16b, v7.16b, #8
-	eor		v7.16b, v7.16b, v0.16b
+	vld1.64		{q7}, [sp]
+CPU_LE(	vrev64.8	q7, q7		)
+	vext.8		q7, q7, q7, #8
+	veor.8		q7, q7, q0
 
 	// shl r9, 4
-	adr		x0, tbl_shf_table + 16
-	sub		x0, x0, x9
-	ld1		{v0.16b}, [x0]
-	movi		v9.16b, #0x80
-	eor		v0.16b, v0.16b, v9.16b
-	tbl		v7.16b, {v7.16b}, v0.16b
+	adr		ip, tbl_shf_table + 16
+	sub		ip, ip, lr
+	vld1.8		{q0}, [ip]
+	vmov.i8		q9, #0x80
+	veor.8		q0, q0, q9
+	vmov		q9, q7
+	vtbl.8		d14, {d18-d19}, d0
+	vtbl.8		d15, {d18-d19}, d1
 
 	b		_128_done
 
 	.align		4
 _exact_16_left:
-	ld1		{v7.2d}, [arg2]
-CPU_LE(	rev64		v7.16b, v7.16b		)
-	ext		v7.16b, v7.16b, v7.16b, #8
-	eor		v7.16b, v7.16b, v0.16b	// xor the initial crc value
+	vld1.64		{q7}, [arg2]
+CPU_LE(	vrev64.8	q7, q7			)
+	vext.8		q7, q7, q7, #8
+	veor.8		q7, q7, q0		// xor the initial crc value
 
 	b		_128_done
 
 _only_less_than_4:
 	cmp		arg3, #3
-	b.lt		_only_less_than_3
+	blt		_only_less_than_3
 
 	// load 3 Bytes
-	ldrh		w0, [arg2]
-	strh		w0, [x11]
+	ldrh		r0, [arg2]
+	strh		r0, [ip]
 
-	ldrb		w0, [arg2, #2]
-	strb		w0, [x11, #2]
+	ldrb		r0, [arg2, #2]
+	strb		r0, [ip, #2]
 
-	ld1		{v7.2d}, [x11]
-CPU_LE(	rev64		v7.16b, v7.16b		)
-	ext		v7.16b, v7.16b, v7.16b, #8
-	eor		v7.16b, v7.16b, v0.16b
+	vld1.64		{q7}, [ip]
+CPU_LE(	vrev64.8	q7, q7			)
+	vext.8		q7, q7, q7, #8
+	veor.8		q7, q7, q0
 
-	ext		v7.16b, v7.16b, vzr.16b, #5
+	vext.8		q7, q7, qzr, #5
 	b		_barrett
 
 _only_less_than_3:
 	cmp		arg3, #2
-	b.lt		_only_less_than_2
+	blt		_only_less_than_2
 
 	// load 2 Bytes
-	ldrh		w0, [arg2]
-	strh		w0, [x11]
+	ldrh		r0, [arg2]
+	strh		r0, [ip]
 
-	ld1		{v7.2d}, [x11]
-CPU_LE(	rev64		v7.16b, v7.16b		)
-	ext		v7.16b, v7.16b, v7.16b, #8
-	eor		v7.16b, v7.16b, v0.16b
+	vld1.64		{q7}, [ip]
+CPU_LE(	vrev64.8	q7, q7			)
+	vext.8		q7, q7, q7, #8
+	veor.8		q7, q7, q0
 
-	ext		v7.16b, v7.16b, vzr.16b, #6
+	vext.8		q7, q7, qzr, #6
 	b		_barrett
 
 _only_less_than_2:
 
 	// load 1 Byte
-	ldrb		w0, [arg2]
-	strb		w0, [x11]
+	ldrb		r0, [arg2]
+	strb		r0, [ip]
 
-	ld1		{v7.2d}, [x11]
-CPU_LE(	rev64		v7.16b, v7.16b		)
-	ext		v7.16b, v7.16b, v7.16b, #8
-	eor		v7.16b, v7.16b, v0.16b
+	vld1.64		{q7}, [ip]
+CPU_LE(	vrev64.8	q7, q7			)
+	vext.8		q7, q7, q7, #8
+	veor.8		q7, q7, q0
 
-	ext		v7.16b, v7.16b, vzr.16b, #7
+	vext.8		q7, q7, qzr, #7
 	b		_barrett
 
 ENDPROC(crc_t10dif_pmull)
@@ -482,16 +523,26 @@ ENDPROC(crc_t10dif_pmull)
 // rk7 = floor(2^64/Q)
 // rk8 = Q
 
-rk1:	.octa		0x06df0000000000002d56000000000000
-rk3:	.octa		0x7cf50000000000009d9d000000000000
-rk5:	.octa		0x13680000000000002d56000000000000
-rk7:	.octa		0x000000018bb7000000000001f65a57f8
-rk9:	.octa		0xbfd6000000000000ceae000000000000
-rk11:	.octa		0x713c0000000000001e16000000000000
-rk13:	.octa		0x80a6000000000000f7f9000000000000
-rk15:	.octa		0xe658000000000000044c000000000000
-rk17:	.octa		0xa497000000000000ad18000000000000
-rk19:	.octa		0xe7b50000000000006ee3000000000000
+rk1:	.quad		0x2d56000000000000
+rk2:	.quad		0x06df000000000000
+rk3:	.quad		0x9d9d000000000000
+rk4:	.quad		0x7cf5000000000000
+rk5:	.quad		0x2d56000000000000
+rk6:	.quad		0x1368000000000000
+rk7:	.quad		0x00000001f65a57f8
+rk8:	.quad		0x000000018bb70000
+rk9:	.quad		0xceae000000000000
+rk10:	.quad		0xbfd6000000000000
+rk11:	.quad		0x1e16000000000000
+rk12:	.quad		0x713c000000000000
+rk13:	.quad		0xf7f9000000000000
+rk14:	.quad		0x80a6000000000000
+rk15:	.quad		0x044c000000000000
+rk16:	.quad		0xe658000000000000
+rk17:	.quad		0xad18000000000000
+rk18:	.quad		0xa497000000000000
+rk19:	.quad		0x6ee3000000000000
+rk20:	.quad		0xe7b5000000000000
 
 tbl_shf_table:
 // use these values for shift constants for the tbl/tbx instruction
diff --git a/arch/arm64/crypto/crct10dif-ce-glue.c b/arch/arm/crypto/crct10dif-ce-glue.c
similarity index 76%
copy from arch/arm64/crypto/crct10dif-ce-glue.c
copy to arch/arm/crypto/crct10dif-ce-glue.c
index d11f33dae79c..e717538d902c 100644
--- a/arch/arm64/crypto/crct10dif-ce-glue.c
+++ b/arch/arm/crypto/crct10dif-ce-glue.c
@@ -1,5 +1,5 @@
 /*
- * Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
+ * Accelerated CRC-T10DIF using ARM NEON and Crypto Extensions instructions
  *
  * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
  *
@@ -8,7 +8,6 @@
  * published by the Free Software Foundation.
  */
 
-#include <linux/cpufeature.h>
 #include <linux/crc-t10dif.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
@@ -18,6 +17,7 @@
 #include <crypto/internal/hash.h>
 
 #include <asm/neon.h>
+#include <asm/simd.h>
 
 asmlinkage u16 crc_t10dif_pmull(u16 init_crc, const u8 buf[], u64 len);
 
@@ -34,9 +34,13 @@ static int crct10dif_update(struct shash_desc *desc, const u8 *data,
 {
 	u16 *crc = shash_desc_ctx(desc);
 
-	kernel_neon_begin_partial(14);
-	*crc = crc_t10dif_pmull(*crc, data, length);
-	kernel_neon_end();
+	if (may_use_simd()) {
+		kernel_neon_begin();
+		*crc = crc_t10dif_pmull(*crc, data, length);
+		kernel_neon_end();
+	} else {
+		*crc = crc_t10dif_generic(*crc, data, length);
+	}
 
 	return 0;
 }
@@ -57,7 +61,7 @@ static struct shash_alg crc_t10dif_alg = {
 
 	.descsize		= CRC_T10DIF_DIGEST_SIZE,
 	.base.cra_name		= "crct10dif",
-	.base.cra_driver_name	= "crct10dif-arm64-ce",
+	.base.cra_driver_name	= "crct10dif-arm-ce",
 	.base.cra_priority	= 200,
 	.base.cra_blocksize	= CRC_T10DIF_BLOCK_SIZE,
 	.base.cra_module	= THIS_MODULE,
@@ -65,6 +69,9 @@ static struct shash_alg crc_t10dif_alg = {
 
 static int __init crc_t10dif_mod_init(void)
 {
+	if (!(elf_hwcap2 & HWCAP2_PMULL))
+		return -ENODEV;
+
 	return crypto_register_shash(&crc_t10dif_alg);
 }
 
@@ -73,8 +80,10 @@ static void __exit crc_t10dif_mod_exit(void)
 	crypto_unregister_shash(&crc_t10dif_alg);
 }
 
-module_cpu_feature_match(PMULL, crc_t10dif_mod_init);
+module_init(crc_t10dif_mod_init);
 module_exit(crc_t10dif_mod_exit);
 
 MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
 MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_CRYPTO("crct10dif");
+MODULE_ALIAS_CRYPTO("crct10dif-arm-ce");
-- 
2.7.4

^ permalink raw reply related

* [PATCH 2/2] mpi: Fix NULL ptr dereference in mpi_powm() [ver #3]
From: David Howells @ 2016-11-24 13:23 UTC (permalink / raw)
  To: jmorris
  Cc: Dmitry Kasatkin, linux-kernel, stable, dhowells,
	linux-security-module, keyrings, linux-crypto, linux-ima-devel,
	Andrey Ryabinin
In-Reply-To: <147999377574.9697.16315343355948647181.stgit@warthog.procyon.org.uk>

From: Andrey Ryabinin <aryabinin@virtuozzo.com>

This fixes CVE-2016-8650.

If mpi_powm() is given a zero exponent, it wants to immediately return
either 1 or 0, depending on the modulus.  However, if the result was
initalised with zero limb space, no limbs space is allocated and a
NULL-pointer exception ensues.

Fix this by allocating a minimal amount of limb space for the result when
the 0-exponent case when the result is 1 and not touching the limb space
when the result is 0.

This affects the use of RSA keys and X.509 certificates that carry them.

BUG: unable to handle kernel NULL pointer dereference at           (null)
IP: [<ffffffff8138ce5d>] mpi_powm+0x32/0x7e6
PGD 0
Oops: 0002 [#1] SMP
Modules linked in:
CPU: 3 PID: 3014 Comm: keyctl Not tainted 4.9.0-rc6-fscache+ #278
Hardware name: ASUS All Series/H97-PLUS, BIOS 2306 10/09/2014
task: ffff8804011944c0 task.stack: ffff880401294000
RIP: 0010:[<ffffffff8138ce5d>]  [<ffffffff8138ce5d>] mpi_powm+0x32/0x7e6
RSP: 0018:ffff880401297ad8  EFLAGS: 00010212
RAX: 0000000000000000 RBX: ffff88040868bec0 RCX: ffff88040868bba0
RDX: ffff88040868b260 RSI: ffff88040868bec0 RDI: ffff88040868bee0
RBP: ffff880401297ba8 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000047 R11: ffffffff8183b210 R12: 0000000000000000
R13: ffff8804087c7600 R14: 000000000000001f R15: ffff880401297c50
FS:  00007f7a7918c700(0000) GS:ffff88041fb80000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000000 CR3: 0000000401250000 CR4: 00000000001406e0
Stack:
 ffff88040868bec0 0000000000000020 ffff880401297b00 ffffffff81376cd4
 0000000000000100 ffff880401297b10 ffffffff81376d12 ffff880401297b30
 ffffffff81376f37 0000000000000100 0000000000000000 ffff880401297ba8
Call Trace:
 [<ffffffff81376cd4>] ? __sg_page_iter_next+0x43/0x66
 [<ffffffff81376d12>] ? sg_miter_get_next_page+0x1b/0x5d
 [<ffffffff81376f37>] ? sg_miter_next+0x17/0xbd
 [<ffffffff8138ba3a>] ? mpi_read_raw_from_sgl+0xf2/0x146
 [<ffffffff8132a95c>] rsa_verify+0x9d/0xee
 [<ffffffff8132acca>] ? pkcs1pad_sg_set_buf+0x2e/0xbb
 [<ffffffff8132af40>] pkcs1pad_verify+0xc0/0xe1
 [<ffffffff8133cb5e>] public_key_verify_signature+0x1b0/0x228
 [<ffffffff8133d974>] x509_check_for_self_signed+0xa1/0xc4
 [<ffffffff8133cdde>] x509_cert_parse+0x167/0x1a1
 [<ffffffff8133d609>] x509_key_preparse+0x21/0x1a1
 [<ffffffff8133c3d7>] asymmetric_key_preparse+0x34/0x61
 [<ffffffff812fc9f3>] key_create_or_update+0x145/0x399
 [<ffffffff812fe227>] SyS_add_key+0x154/0x19e
 [<ffffffff81001c2b>] do_syscall_64+0x80/0x191
 [<ffffffff816825e4>] entry_SYSCALL64_slow_path+0x25/0x25
Code: 56 41 55 41 54 53 48 81 ec a8 00 00 00 44 8b 71 04 8b 42 04 4c 8b 67 18 45 85 f6 89 45 80 0f 84 b4 06 00 00 85 c0 75 2f 41 ff ce <49> c7 04 24 01 00 00 00 b0 01 75 0b 48 8b 41 18 48 83 38 01 0f
RIP  [<ffffffff8138ce5d>] mpi_powm+0x32/0x7e6
 RSP <ffff880401297ad8>
CR2: 0000000000000000
---[ end trace d82015255d4a5d8d ]---

Basically, this is a backport of a libgcrypt patch:

	http://git.gnupg.org/cgi-bin/gitweb.cgi?p=libgcrypt.git;a=patch;h=6e1adb05d290aeeb1c230c763970695f4a538526

Fixes: cdec9cb5167a ("crypto: GnuPG based MPI lib - source files (part 1)")
Signed-off-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Dmitry Kasatkin <dmitry.kasatkin@gmail.com>
cc: linux-ima-devel@lists.sourceforge.net
cc: stable@vger.kernel.org
---

 lib/mpi/mpi-pow.c |    7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/lib/mpi/mpi-pow.c b/lib/mpi/mpi-pow.c
index 5464c8744ea9..e24388a863a7 100644
--- a/lib/mpi/mpi-pow.c
+++ b/lib/mpi/mpi-pow.c
@@ -64,8 +64,13 @@ int mpi_powm(MPI res, MPI base, MPI exp, MPI mod)
 	if (!esize) {
 		/* Exponent is zero, result is 1 mod MOD, i.e., 1 or 0
 		 * depending on if MOD equals 1.  */
-		rp[0] = 1;
 		res->nlimbs = (msize == 1 && mod->d[0] == 1) ? 0 : 1;
+		if (res->nlimbs) {
+			if (mpi_resize(res, 1) < 0)
+				goto enomem;
+			rp = res->d;
+			rp[0] = 1;
+		}
 		res->sign = 0;
 		goto leave;
 	}


^ permalink raw reply related

* [PATCH 1/2] X.509: Fix double free in x509_cert_parse() [ver #3]
From: David Howells @ 2016-11-24 13:23 UTC (permalink / raw)
  To: jmorris
  Cc: linux-kernel, stable, dhowells, linux-security-module, keyrings,
	linux-crypto, Andrey Ryabinin
In-Reply-To: <147999377574.9697.16315343355948647181.stgit@warthog.procyon.org.uk>

From: Andrey Ryabinin <aryabinin@virtuozzo.com>

We shouldn't free cert->pub->key in x509_cert_parse() because
x509_free_certificate() also does this:
	BUG: Double free or freeing an invalid pointer
	...
	Call Trace:
	 [<ffffffff81896c20>] dump_stack+0x63/0x83
	 [<ffffffff81356571>] kasan_object_err+0x21/0x70
	 [<ffffffff81356ed9>] kasan_report_double_free+0x49/0x60
	 [<ffffffff813561ad>] kasan_slab_free+0x9d/0xc0
	 [<ffffffff81350b7a>] kfree+0x8a/0x1a0
	 [<ffffffff81844fbf>] public_key_free+0x1f/0x30
	 [<ffffffff818455d4>] x509_free_certificate+0x24/0x90
	 [<ffffffff818460bc>] x509_cert_parse+0x2bc/0x300
	 [<ffffffff81846cae>] x509_key_preparse+0x3e/0x330
	 [<ffffffff818444cf>] asymmetric_key_preparse+0x6f/0x100
	 [<ffffffff8178bec0>] key_create_or_update+0x260/0x5f0
	 [<ffffffff8178e6d9>] SyS_add_key+0x199/0x2a0
	 [<ffffffff821d823b>] entry_SYSCALL_64_fastpath+0x1e/0xad
	Object at ffff880110bd1900, in cache kmalloc-512 size: 512
	....
	Freed:
	PID = 2579
	[<ffffffff8104283b>] save_stack_trace+0x1b/0x20
	[<ffffffff813558f6>] save_stack+0x46/0xd0
	[<ffffffff81356183>] kasan_slab_free+0x73/0xc0
	[<ffffffff81350b7a>] kfree+0x8a/0x1a0
	[<ffffffff818460a3>] x509_cert_parse+0x2a3/0x300
	[<ffffffff81846cae>] x509_key_preparse+0x3e/0x330
	[<ffffffff818444cf>] asymmetric_key_preparse+0x6f/0x100
	[<ffffffff8178bec0>] key_create_or_update+0x260/0x5f0
	[<ffffffff8178e6d9>] SyS_add_key+0x199/0x2a0
	[<ffffffff821d823b>] entry_SYSCALL_64_fastpath+0x1e/0xad

Fixes: db6c43bd2132 ("crypto: KEYS: convert public key and digsig asym to the akcipher api")
Signed-off-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: David Howells <dhowells@redhat.com>
---

 crypto/asymmetric_keys/x509_cert_parser.c |    1 -
 1 file changed, 1 deletion(-)

diff --git a/crypto/asymmetric_keys/x509_cert_parser.c b/crypto/asymmetric_keys/x509_cert_parser.c
index 865f46ea724f..c80765b211cf 100644
--- a/crypto/asymmetric_keys/x509_cert_parser.c
+++ b/crypto/asymmetric_keys/x509_cert_parser.c
@@ -133,7 +133,6 @@ struct x509_certificate *x509_cert_parse(const void *data, size_t datalen)
 	return cert;
 
 error_decode:
-	kfree(cert->pub->key);
 	kfree(ctx);
 error_no_ctx:
 	x509_free_certificate(cert);

^ permalink raw reply related

* [PATCH 0/2] KEYS: Fixes [ver #3]
From: David Howells @ 2016-11-24 13:22 UTC (permalink / raw)
  To: jmorris
  Cc: dhowells, linux-security-module, keyrings, linux-kernel,
	linux-crypto


Hi James,

Can you pull these patches please and pass them on to Linus?  They include
the following:

 (1) Fix mpi_powm()'s handling of a number with a zero exponent [CVE-2016-8650].

 (2) Fix double free in X.509 error handling.

Ver #3:

 - Integrate my and Andrey's patches for mpi_powm() and use mpi_resize()
   instead of RESIZE_IF_NEEDED() - the latter adds a duplicate check into
   the execution path of a trivial case we don't normally expect to be
   taken.

Ver #2:

 - Use RESIZE_IF_NEEDED() to conditionally resize the result rather than
   manually doing this.

The patches can be found here also:

	http://git.kernel.org/cgit/linux/kernel/git/dhowells/linux-fs.git/log/?h=keys-fixes

Tagged thusly:

	git://git.kernel.org/pub/scm/linux/kernel/git/dhowells/linux-fs.git
	keys-fixes-20161124-3

David
---
Andrey Ryabinin (2):
      X.509: Fix double free in x509_cert_parse()
      mpi: Fix NULL ptr dereference in mpi_powm()


 crypto/asymmetric_keys/x509_cert_parser.c |    1 -
 lib/mpi/mpi-pow.c                         |    7 ++++++-
 2 files changed, 6 insertions(+), 2 deletions(-)


^ permalink raw reply

* 53486 linux-crypto
From: e.camilla.johansson @ 2016-11-24  6:32 UTC (permalink / raw)
  To: linux-crypto

[-- Attachment #1: INFO_93352944885680_linux-crypto.zip --]
[-- Type: application/zip, Size: 2549 bytes --]

^ permalink raw reply

* [PATCH] crypto: acomp - don't use stack buffer in test_acomp()
From: Eric Biggers @ 2016-11-23 18:24 UTC (permalink / raw)
  To: linux-crypto
  Cc: Giovanni Cabiddu, Herbert Xu, David S. Miller, Andy Lutomirski,
	Eric Biggers

With virtually-mapped stacks (CONFIG_VMAP_STACK=y), using the
scatterlist crypto API with stack buffers is not allowed, and with
appropriate debugging options will cause the
'BUG_ON(!virt_addr_valid(buf));' in sg_set_buf() to be triggered.
Use a heap buffer instead.

Fixes: d7db7a882deb ("crypto: acomp - update testmgr with support for acomp")
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 crypto/testmgr.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index ded50b6..aca1b7b 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -1448,17 +1448,21 @@ static int test_acomp(struct crypto_acomp *tfm, struct comp_testvec *ctemplate,
 {
 	const char *algo = crypto_tfm_alg_driver_name(crypto_acomp_tfm(tfm));
 	unsigned int i;
-	char output[COMP_BUF_SIZE];
+	char *output;
 	int ret;
 	struct scatterlist src, dst;
 	struct acomp_req *req;
 	struct tcrypt_result result;
 
+	output = kmalloc(COMP_BUF_SIZE, GFP_KERNEL);
+	if (!output)
+		return -ENOMEM;
+
 	for (i = 0; i < ctcount; i++) {
 		unsigned int dlen = COMP_BUF_SIZE;
 		int ilen = ctemplate[i].inlen;
 
-		memset(output, 0, sizeof(output));
+		memset(output, 0, dlen);
 		init_completion(&result.completion);
 		sg_init_one(&src, ctemplate[i].input, ilen);
 		sg_init_one(&dst, output, dlen);
@@ -1507,7 +1511,7 @@ static int test_acomp(struct crypto_acomp *tfm, struct comp_testvec *ctemplate,
 		unsigned int dlen = COMP_BUF_SIZE;
 		int ilen = dtemplate[i].inlen;
 
-		memset(output, 0, sizeof(output));
+		memset(output, 0, dlen);
 		init_completion(&result.completion);
 		sg_init_one(&src, dtemplate[i].input, ilen);
 		sg_init_one(&dst, output, dlen);
@@ -1555,6 +1559,7 @@ static int test_acomp(struct crypto_acomp *tfm, struct comp_testvec *ctemplate,
 	ret = 0;
 
 out:
+	kfree(output);
 	return ret;
 }
 
-- 
2.8.0.rc3.226.g39d4020

^ permalink raw reply related

* [PATCH] mpi: Fix NULL ptr dereference in mpi_powm()
From: Andrey Ryabinin @ 2016-11-23 16:28 UTC (permalink / raw)
  To: Herbert Xu
  Cc: David S . Miller, Nicolai Stange, linux-crypto, linux-kernel,
	Andrey Ryabinin, stable

Parsing certain certificates (see [1]) triggers NULL-ptr
dereference in mpi_powm():

	BUG: unable to handle kernel NULL pointer dereference at (null)
	IP: [<ffffffff818eb118>] mpi_powm+0xf8/0x10b0
	...
	Call Trace:
	 [<ffffffff817e06a6>] _rsa_dec.isra.2+0x66/0x80
	 [<ffffffff817e07c3>] rsa_verify+0x103/0x1c0
	 [<ffffffff817e1683>] pkcs1pad_verify+0x1c3/0x220
	 [<ffffffff8184549a>] public_key_verify_signature+0x3fa/0x4d0
	 [<ffffffff818473c7>] x509_check_for_self_signed+0x167/0x1e0
	 [<ffffffff8184607e>] x509_cert_parse+0x27e/0x300
	 [<ffffffff81846cae>] x509_key_preparse+0x3e/0x330
	 [<ffffffff818444cf>] asymmetric_key_preparse+0x6f/0x100
	 [<ffffffff8178bec0>] key_create_or_update+0x260/0x5f0
	 [<ffffffff8178e6d9>] SyS_add_key+0x199/0x2a0
	 [<ffffffff821d823b>] entry_SYSCALL_64_fastpath+0x1e/0xad

This happens because mpi_alloc(0) doesn't allocate the limb space.
Fix this by allocating the result if needed.

Basically, this is a backport of libgcrypt patch [2].

[1] http://seclists.org/fulldisclosure/2016/Nov/76
[2] http://git.gnupg.org/cgi-bin/gitweb.cgi?p=libgcrypt.git;a=patch;h=6e1adb05d290aeeb1c230c763970695f4a538526

Fixes: cdec9cb5167a ("crypto: GnuPG based MPI lib - source files (part 1)")
Signed-off-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: <stable@vger.kernel.org>
---
 lib/mpi/mpi-pow.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/lib/mpi/mpi-pow.c b/lib/mpi/mpi-pow.c
index 5464c87..9e28d12 100644
--- a/lib/mpi/mpi-pow.c
+++ b/lib/mpi/mpi-pow.c
@@ -64,8 +64,13 @@ int mpi_powm(MPI res, MPI base, MPI exp, MPI mod)
 	if (!esize) {
 		/* Exponent is zero, result is 1 mod MOD, i.e., 1 or 0
 		 * depending on if MOD equals 1.  */
-		rp[0] = 1;
 		res->nlimbs = (msize == 1 && mod->d[0] == 1) ? 0 : 1;
+		if (res->nlimbs) {
+			if (RESIZE_IF_NEEDED(res, 1) < 0)
+				goto enomem;
+			rp = res->d;
+			rp[0] = 1;
+		}
 		res->sign = 0;
 		goto leave;
 	}
-- 
2.7.3

^ permalink raw reply related

* [PATCH] X.509: Fix double free in x509_cert_parse()
From: Andrey Ryabinin @ 2016-11-23 16:16 UTC (permalink / raw)
  To: David Howells
  Cc: Tadeusz Struk, Herbert Xu, David S. Miller, keyrings,
	linux-crypto, linux-kernel, Andrey Ryabinin, stable

We shouldn't free cert->pub->key in x509_cert_parse() because
x509_free_certificate() also does this:
	BUG: Double free or freeing an invalid pointer
	...
	Call Trace:
	 [<ffffffff81896c20>] dump_stack+0x63/0x83
	 [<ffffffff81356571>] kasan_object_err+0x21/0x70
	 [<ffffffff81356ed9>] kasan_report_double_free+0x49/0x60
	 [<ffffffff813561ad>] kasan_slab_free+0x9d/0xc0
	 [<ffffffff81350b7a>] kfree+0x8a/0x1a0
	 [<ffffffff81844fbf>] public_key_free+0x1f/0x30
	 [<ffffffff818455d4>] x509_free_certificate+0x24/0x90
	 [<ffffffff818460bc>] x509_cert_parse+0x2bc/0x300
	 [<ffffffff81846cae>] x509_key_preparse+0x3e/0x330
	 [<ffffffff818444cf>] asymmetric_key_preparse+0x6f/0x100
	 [<ffffffff8178bec0>] key_create_or_update+0x260/0x5f0
	 [<ffffffff8178e6d9>] SyS_add_key+0x199/0x2a0
	 [<ffffffff821d823b>] entry_SYSCALL_64_fastpath+0x1e/0xad
	Object at ffff880110bd1900, in cache kmalloc-512 size: 512
	....
	Freed:
	PID = 2579
	[<ffffffff8104283b>] save_stack_trace+0x1b/0x20
	[<ffffffff813558f6>] save_stack+0x46/0xd0
	[<ffffffff81356183>] kasan_slab_free+0x73/0xc0
	[<ffffffff81350b7a>] kfree+0x8a/0x1a0
	[<ffffffff818460a3>] x509_cert_parse+0x2a3/0x300
	[<ffffffff81846cae>] x509_key_preparse+0x3e/0x330
	[<ffffffff818444cf>] asymmetric_key_preparse+0x6f/0x100
	[<ffffffff8178bec0>] key_create_or_update+0x260/0x5f0
	[<ffffffff8178e6d9>] SyS_add_key+0x199/0x2a0
	[<ffffffff821d823b>] entry_SYSCALL_64_fastpath+0x1e/0xad

Fixes: db6c43bd2132 ("crypto: KEYS: convert public key and digsig asym to the akcipher api")
Signed-off-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: <stable@vger.kernel.org>
---
 crypto/asymmetric_keys/x509_cert_parser.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/crypto/asymmetric_keys/x509_cert_parser.c b/crypto/asymmetric_keys/x509_cert_parser.c
index 865f46e..c80765b 100644
--- a/crypto/asymmetric_keys/x509_cert_parser.c
+++ b/crypto/asymmetric_keys/x509_cert_parser.c
@@ -133,7 +133,6 @@ struct x509_certificate *x509_cert_parse(const void *data, size_t datalen)
 	return cert;
 
 error_decode:
-	kfree(cert->pub->key);
 	kfree(ctx);
 error_no_ctx:
 	x509_free_certificate(cert);
-- 
2.7.3

^ permalink raw reply related

* Crypto Fixes for 4.9
From: Herbert Xu @ 2016-11-23  5:36 UTC (permalink / raw)
  To: Linus Torvalds, David S. Miller, Linux Kernel Mailing List,
	Linux Crypto Mailing List
In-Reply-To: <20161119102748.GA4277@gondor.apana.org.au>

Hi Linus:

The last push broke algif_hash for all shash implementations,
so this is a follow-up to fix that.  It also fixes a problem
in the crypto scatterwalk that triggers a BUG_ON with certain
debugging options due to the new vmalloced-stack code.


Please pull from

git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6.git linus


Herbert Xu (2):
      crypto: algif_hash - Fix result clobbering in recvmsg
      crypto: scatterwalk - Remove unnecessary aliasing check in map_and_copy

 crypto/algif_hash.c  |    2 +-
 crypto/scatterwalk.c |    4 ----
 2 files changed, 1 insertion(+), 5 deletions(-)

Thanks,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* Re: [PATCH] crypto: powerpc - Rename CRYPT_CRC32C_VPMSUM option
From: Anton Blanchard @ 2016-11-22 22:23 UTC (permalink / raw)
  To: Jean Delvare
  Cc: linux-crypto, Herbert Xu, Benjamin Herrenschmidt, Paul Mackerras,
	Michael Ellerman, David S. Miller
In-Reply-To: <20161122103244.38d435cd@endymion>

Hi Jean,

> For consistency with the other 246 kernel configuration options,
> rename CRYPT_CRC32C_VPMSUM to CRYPTO_CRC32C_VPMSUM.

Thanks! Not sure how I missed that.

Acked-by: Anton Blanchard <anton@samba.org>

Anton

^ permalink raw reply

* [PATCH 06/10] crypto: caam - remove unneded dependencies on CRYPTO_DEV_FSL_CAAM
From: Horia Geantă @ 2016-11-22 13:44 UTC (permalink / raw)
  To: Herbert Xu
  Cc: David S. Miller, linux-crypto, Dan Douglass, Tudor Ambarus,
	Alexandru Porosanu
In-Reply-To: <1479822252-23833-1-git-send-email-horia.geanta@nxp.com>

Remove dependency on CRYPTO_DEV_FSL_CAAM where superfluous:
depends on CRYPTO_DEV_FSL_CAAM && CRYPTO_DEV_FSL_CAAM_JR
is equivalent to
depends on CRYPTO_DEV_FSL_CAAM_JR
since CRYPTO_DEV_FSL_CAAM_JR depends on CRYPTO_DEV_FSL_CAAM.

Signed-off-by: Horia Geantă <horia.geanta@nxp.com>
---
 drivers/crypto/caam/Kconfig | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/crypto/caam/Kconfig b/drivers/crypto/caam/Kconfig
index 64bf3024b680..ebeada75ab2d 100644
--- a/drivers/crypto/caam/Kconfig
+++ b/drivers/crypto/caam/Kconfig
@@ -74,7 +74,7 @@ config CRYPTO_DEV_FSL_CAAM_INTC_TIME_THLD
 
 config CRYPTO_DEV_FSL_CAAM_CRYPTO_API
 	tristate "Register algorithm implementations with the Crypto API"
-	depends on CRYPTO_DEV_FSL_CAAM && CRYPTO_DEV_FSL_CAAM_JR
+	depends on CRYPTO_DEV_FSL_CAAM_JR
 	default y
 	select CRYPTO_AEAD
 	select CRYPTO_AUTHENC
@@ -89,7 +89,7 @@ config CRYPTO_DEV_FSL_CAAM_CRYPTO_API
 
 config CRYPTO_DEV_FSL_CAAM_AHASH_API
 	tristate "Register hash algorithm implementations with Crypto API"
-	depends on CRYPTO_DEV_FSL_CAAM && CRYPTO_DEV_FSL_CAAM_JR
+	depends on CRYPTO_DEV_FSL_CAAM_JR
 	default y
 	select CRYPTO_HASH
 	help
@@ -101,7 +101,7 @@ config CRYPTO_DEV_FSL_CAAM_AHASH_API
 
 config CRYPTO_DEV_FSL_CAAM_PKC_API
         tristate "Register public key cryptography implementations with Crypto API"
-        depends on CRYPTO_DEV_FSL_CAAM && CRYPTO_DEV_FSL_CAAM_JR
+        depends on CRYPTO_DEV_FSL_CAAM_JR
         default y
         select CRYPTO_RSA
         help
@@ -113,7 +113,7 @@ config CRYPTO_DEV_FSL_CAAM_PKC_API
 
 config CRYPTO_DEV_FSL_CAAM_RNG_API
 	tristate "Register caam device for hwrng API"
-	depends on CRYPTO_DEV_FSL_CAAM && CRYPTO_DEV_FSL_CAAM_JR
+	depends on CRYPTO_DEV_FSL_CAAM_JR
 	default y
 	select CRYPTO_RNG
 	select HW_RANDOM
-- 
2.4.4

^ permalink raw reply related

* [PATCH 08/10] crypto: caam - consolidate split key length computation
From: Horia Geantă @ 2016-11-22 13:44 UTC (permalink / raw)
  To: Herbert Xu
  Cc: David S. Miller, linux-crypto, Dan Douglass, Tudor Ambarus,
	Alexandru Porosanu
In-Reply-To: <1479822252-23833-1-git-send-email-horia.geanta@nxp.com>

Move split key length and padded length computation from caamalg.c
and caamhash.c to key_gen.c.

Signed-off-by: Horia Geantă <horia.geanta@nxp.com>
---
 drivers/crypto/caam/caamalg.c  | 24 +++------------------
 drivers/crypto/caam/caamhash.c | 24 ++-------------------
 drivers/crypto/caam/key_gen.c  | 47 +++++++++++++++++++++++++++++++++++++++++-
 drivers/crypto/caam/key_gen.h  |  3 ++-
 4 files changed, 53 insertions(+), 45 deletions(-)

diff --git a/drivers/crypto/caam/caamalg.c b/drivers/crypto/caam/caamalg.c
index 78b0b7c17205..767ffaea9649 100644
--- a/drivers/crypto/caam/caamalg.c
+++ b/drivers/crypto/caam/caamalg.c
@@ -586,18 +586,9 @@ static int rfc4543_setauthsize(struct crypto_aead *authenc,
 	return 0;
 }
 
-static u32 gen_split_aead_key(struct caam_ctx *ctx, const u8 *key_in,
-			      u32 authkeylen)
-{
-	return gen_split_key(ctx->jrdev, ctx->key, &ctx->adata, key_in,
-			     authkeylen);
-}
-
 static int aead_setkey(struct crypto_aead *aead,
 			       const u8 *key, unsigned int keylen)
 {
-	/* Sizes for MDHA pads (*not* keys): MD5, SHA1, 224, 256, 384, 512 */
-	static const u8 mdpadlen[] = { 16, 20, 32, 32, 64, 64 };
 	struct caam_ctx *ctx = crypto_aead_ctx(aead);
 	struct device *jrdev = ctx->jrdev;
 	struct crypto_authenc_keys keys;
@@ -606,26 +597,17 @@ static int aead_setkey(struct crypto_aead *aead,
 	if (crypto_authenc_extractkeys(&keys, key, keylen) != 0)
 		goto badkey;
 
-	/* Pick class 2 key length from algorithm submask */
-	ctx->adata.keylen = mdpadlen[(ctx->adata.algtype &
-				      OP_ALG_ALGSEL_SUBMASK) >>
-				     OP_ALG_ALGSEL_SHIFT] * 2;
-	ctx->adata.keylen_pad = ALIGN(ctx->adata.keylen, 16);
-
-	if (ctx->adata.keylen_pad + keys.enckeylen > CAAM_MAX_KEY_SIZE)
-		goto badkey;
-
 #ifdef DEBUG
 	printk(KERN_ERR "keylen %d enckeylen %d authkeylen %d\n",
 	       keys.authkeylen + keys.enckeylen, keys.enckeylen,
 	       keys.authkeylen);
-	printk(KERN_ERR "split_key_len %d split_key_pad_len %d\n",
-	       ctx->adata.keylen, ctx->adata.keylen_pad);
 	print_hex_dump(KERN_ERR, "key in @"__stringify(__LINE__)": ",
 		       DUMP_PREFIX_ADDRESS, 16, 4, key, keylen, 1);
 #endif
 
-	ret = gen_split_aead_key(ctx, keys.authkey, keys.authkeylen);
+	ret = gen_split_key(ctx->jrdev, ctx->key, &ctx->adata, keys.authkey,
+			    keys.authkeylen, CAAM_MAX_KEY_SIZE -
+			    keys.enckeylen);
 	if (ret) {
 		goto badkey;
 	}
diff --git a/drivers/crypto/caam/caamhash.c b/drivers/crypto/caam/caamhash.c
index d3f0ae16a73b..505007d0277c 100644
--- a/drivers/crypto/caam/caamhash.c
+++ b/drivers/crypto/caam/caamhash.c
@@ -398,12 +398,6 @@ static int ahash_set_sh_desc(struct crypto_ahash *ahash)
 	return 0;
 }
 
-static int gen_split_hash_key(struct caam_hash_ctx *ctx, const u8 *key_in,
-			      u32 keylen)
-{
-	return gen_split_key(ctx->jrdev, ctx->key, &ctx->adata, key_in, keylen);
-}
-
 /* Digest hash size if it is too large */
 static int hash_digest_key(struct caam_hash_ctx *ctx, const u8 *key_in,
 			   u32 *keylen, u8 *key_out, u32 digestsize)
@@ -483,8 +477,6 @@ static int hash_digest_key(struct caam_hash_ctx *ctx, const u8 *key_in,
 static int ahash_setkey(struct crypto_ahash *ahash,
 			const u8 *key, unsigned int keylen)
 {
-	/* Sizes for MDHA pads (*not* keys): MD5, SHA1, 224, 256, 384, 512 */
-	static const u8 mdpadlen[] = { 16, 20, 32, 32, 64, 64 };
 	struct caam_hash_ctx *ctx = crypto_ahash_ctx(ahash);
 	struct device *jrdev = ctx->jrdev;
 	int blocksize = crypto_tfm_alg_blocksize(&ahash->base);
@@ -509,20 +501,8 @@ static int ahash_setkey(struct crypto_ahash *ahash,
 		key = hashed_key;
 	}
 
-	/* Pick class 2 key length from algorithm submask */
-	ctx->adata.keylen = mdpadlen[(ctx->adata.algtype &
-				      OP_ALG_ALGSEL_SUBMASK) >>
-				     OP_ALG_ALGSEL_SHIFT] * 2;
-	ctx->adata.keylen_pad = ALIGN(ctx->adata.keylen, 16);
-
-#ifdef DEBUG
-	printk(KERN_ERR "split_key_len %d split_key_pad_len %d\n",
-	       ctx->adata.keylen, ctx->adata.keylen_pad);
-	print_hex_dump(KERN_ERR, "key in @"__stringify(__LINE__)": ",
-		       DUMP_PREFIX_ADDRESS, 16, 4, key, keylen, 1);
-#endif
-
-	ret = gen_split_hash_key(ctx, key, keylen);
+	ret = gen_split_key(ctx->jrdev, ctx->key, &ctx->adata, key, keylen,
+			    CAAM_MAX_HASH_KEY_SIZE);
 	if (ret)
 		goto bad_free_key;
 
diff --git a/drivers/crypto/caam/key_gen.c b/drivers/crypto/caam/key_gen.c
index 621199a02f2e..1bb2816a9b4d 100644
--- a/drivers/crypto/caam/key_gen.c
+++ b/drivers/crypto/caam/key_gen.c
@@ -10,6 +10,36 @@
 #include "desc_constr.h"
 #include "key_gen.h"
 
+/**
+ * split_key_len - Compute MDHA split key length for a given algorithm
+ * @hash: Hashing algorithm selection, one of OP_ALG_ALGSEL_* - MD5, SHA1,
+ *        SHA224, SHA384, SHA512.
+ *
+ * Return: MDHA split key length
+ */
+static inline u32 split_key_len(u32 hash)
+{
+	/* Sizes for MDHA pads (*not* keys): MD5, SHA1, 224, 256, 384, 512 */
+	static const u8 mdpadlen[] = { 16, 20, 32, 32, 64, 64 };
+	u32 idx;
+
+	idx = (hash & OP_ALG_ALGSEL_SUBMASK) >> OP_ALG_ALGSEL_SHIFT;
+
+	return (u32)(mdpadlen[idx] * 2);
+}
+
+/**
+ * split_key_pad_len - Compute MDHA split key pad length for a given algorithm
+ * @hash: Hashing algorithm selection, one of OP_ALG_ALGSEL_* - MD5, SHA1,
+ *        SHA224, SHA384, SHA512.
+ *
+ * Return: MDHA split key pad length
+ */
+static inline u32 split_key_pad_len(u32 hash)
+{
+	return ALIGN(split_key_len(hash), 16);
+}
+
 void split_key_done(struct device *dev, u32 *desc, u32 err,
 			   void *context)
 {
@@ -42,13 +72,28 @@ Split key generation-----------------------------------------------
 			@0xffe04000
 */
 int gen_split_key(struct device *jrdev, u8 *key_out,
-		  struct alginfo * const adata, const u8 *key_in, u32 keylen)
+		  struct alginfo * const adata, const u8 *key_in, u32 keylen,
+		  int max_keylen)
 {
 	u32 *desc;
 	struct split_key_result result;
 	dma_addr_t dma_addr_in, dma_addr_out;
 	int ret = -ENOMEM;
 
+	adata->keylen = split_key_len(adata->algtype & OP_ALG_ALGSEL_MASK);
+	adata->keylen_pad = split_key_pad_len(adata->algtype &
+					      OP_ALG_ALGSEL_MASK);
+
+#ifdef DEBUG
+	dev_err(jrdev, "split keylen %d split keylen padded %d\n",
+		adata->keylen, adata->keylen_pad);
+	print_hex_dump(KERN_ERR, "ctx.key@" __stringify(__LINE__)": ",
+		       DUMP_PREFIX_ADDRESS, 16, 4, key_in, keylen, 1);
+#endif
+
+	if (adata->keylen_pad > max_keylen)
+		return -EINVAL;
+
 	desc = kmalloc(CAAM_CMD_SZ * 6 + CAAM_PTR_SZ * 2, GFP_KERNEL | GFP_DMA);
 	if (!desc) {
 		dev_err(jrdev, "unable to allocate key input memory\n");
diff --git a/drivers/crypto/caam/key_gen.h b/drivers/crypto/caam/key_gen.h
index e87483c6057b..4628f389eb64 100644
--- a/drivers/crypto/caam/key_gen.h
+++ b/drivers/crypto/caam/key_gen.h
@@ -13,4 +13,5 @@ struct split_key_result {
 void split_key_done(struct device *dev, u32 *desc, u32 err, void *context);
 
 int gen_split_key(struct device *jrdev, u8 *key_out,
-		  struct alginfo * const adata, const u8 *key_in, u32 keylen);
+		  struct alginfo * const adata, const u8 *key_in, u32 keylen,
+		  int max_keylen);
-- 
2.4.4

^ permalink raw reply related

* [PATCH 03/10] crypto: caam - remove superfluous alg_op algorithm param
From: Horia Geantă @ 2016-11-22 13:44 UTC (permalink / raw)
  To: Herbert Xu
  Cc: David S. Miller, linux-crypto, Dan Douglass, Tudor Ambarus,
	Alexandru Porosanu
In-Reply-To: <1479822252-23833-1-git-send-email-horia.geanta@nxp.com>

Information carried by alg_op can be deduced from adata->algtype
plus some fixed flags.

Signed-off-by: Horia Geantă <horia.geanta@nxp.com>
---
 drivers/crypto/caam/caamalg.c  | 64 ++----------------------------------------
 drivers/crypto/caam/caamhash.c | 20 ++++---------
 drivers/crypto/caam/key_gen.c  |  7 +++--
 drivers/crypto/caam/key_gen.h  |  3 +-
 4 files changed, 13 insertions(+), 81 deletions(-)

diff --git a/drivers/crypto/caam/caamalg.c b/drivers/crypto/caam/caamalg.c
index 4141143cce7d..48fc000d86bf 100644
--- a/drivers/crypto/caam/caamalg.c
+++ b/drivers/crypto/caam/caamalg.c
@@ -149,7 +149,6 @@ static struct list_head alg_list;
 struct caam_alg_entry {
 	int class1_alg_type;
 	int class2_alg_type;
-	int alg_op;
 	bool rfc3686;
 	bool geniv;
 };
@@ -217,7 +216,6 @@ struct caam_ctx {
 	dma_addr_t sh_desc_enc_dma;
 	dma_addr_t sh_desc_dec_dma;
 	dma_addr_t sh_desc_givenc_dma;
-	u32 alg_op;
 	u8 key[CAAM_MAX_KEY_SIZE];
 	dma_addr_t key_dma;
 	struct alginfo adata;
@@ -1366,7 +1364,7 @@ static u32 gen_split_aead_key(struct caam_ctx *ctx, const u8 *key_in,
 			      u32 authkeylen)
 {
 	return gen_split_key(ctx->jrdev, ctx->key, &ctx->adata, key_in,
-			     authkeylen, ctx->alg_op);
+			     authkeylen);
 }
 
 static int aead_setkey(struct crypto_aead *aead,
@@ -1383,7 +1381,8 @@ static int aead_setkey(struct crypto_aead *aead,
 		goto badkey;
 
 	/* Pick class 2 key length from algorithm submask */
-	ctx->adata.keylen = mdpadlen[(ctx->alg_op & OP_ALG_ALGSEL_SUBMASK) >>
+	ctx->adata.keylen = mdpadlen[(ctx->adata.algtype &
+				      OP_ALG_ALGSEL_SUBMASK) >>
 				     OP_ALG_ALGSEL_SHIFT] * 2;
 	ctx->adata.keylen_pad = ALIGN(ctx->adata.keylen, 16);
 
@@ -2988,7 +2987,6 @@ struct caam_alg_template {
 	} template_u;
 	u32 class1_alg_type;
 	u32 class2_alg_type;
-	u32 alg_op;
 };
 
 static struct caam_alg_template driver_algs[] = {
@@ -3173,7 +3171,6 @@ static struct caam_aead_alg driver_aeads[] = {
 		.caam = {
 			.class2_alg_type = OP_ALG_ALGSEL_MD5 |
 					   OP_ALG_AAI_HMAC_PRECOMP,
-			.alg_op = OP_ALG_ALGSEL_MD5 | OP_ALG_AAI_HMAC,
 		},
 	},
 	{
@@ -3195,7 +3192,6 @@ static struct caam_aead_alg driver_aeads[] = {
 		.caam = {
 			.class2_alg_type = OP_ALG_ALGSEL_SHA1 |
 					   OP_ALG_AAI_HMAC_PRECOMP,
-			.alg_op = OP_ALG_ALGSEL_SHA1 | OP_ALG_AAI_HMAC,
 		},
 	},
 	{
@@ -3217,7 +3213,6 @@ static struct caam_aead_alg driver_aeads[] = {
 		.caam = {
 			.class2_alg_type = OP_ALG_ALGSEL_SHA224 |
 					   OP_ALG_AAI_HMAC_PRECOMP,
-			.alg_op = OP_ALG_ALGSEL_SHA224 | OP_ALG_AAI_HMAC,
 		},
 	},
 	{
@@ -3239,7 +3234,6 @@ static struct caam_aead_alg driver_aeads[] = {
 		.caam = {
 			.class2_alg_type = OP_ALG_ALGSEL_SHA256 |
 					   OP_ALG_AAI_HMAC_PRECOMP,
-			.alg_op = OP_ALG_ALGSEL_SHA256 | OP_ALG_AAI_HMAC,
 		},
 	},
 	{
@@ -3261,7 +3255,6 @@ static struct caam_aead_alg driver_aeads[] = {
 		.caam = {
 			.class2_alg_type = OP_ALG_ALGSEL_SHA384 |
 					   OP_ALG_AAI_HMAC_PRECOMP,
-			.alg_op = OP_ALG_ALGSEL_SHA384 | OP_ALG_AAI_HMAC,
 		},
 	},
 	{
@@ -3283,7 +3276,6 @@ static struct caam_aead_alg driver_aeads[] = {
 		.caam = {
 			.class2_alg_type = OP_ALG_ALGSEL_SHA512 |
 					   OP_ALG_AAI_HMAC_PRECOMP,
-			.alg_op = OP_ALG_ALGSEL_SHA512 | OP_ALG_AAI_HMAC,
 		},
 	},
 	{
@@ -3305,7 +3297,6 @@ static struct caam_aead_alg driver_aeads[] = {
 			.class1_alg_type = OP_ALG_ALGSEL_AES | OP_ALG_AAI_CBC,
 			.class2_alg_type = OP_ALG_ALGSEL_MD5 |
 					   OP_ALG_AAI_HMAC_PRECOMP,
-			.alg_op = OP_ALG_ALGSEL_MD5 | OP_ALG_AAI_HMAC,
 		},
 	},
 	{
@@ -3328,7 +3319,6 @@ static struct caam_aead_alg driver_aeads[] = {
 			.class1_alg_type = OP_ALG_ALGSEL_AES | OP_ALG_AAI_CBC,
 			.class2_alg_type = OP_ALG_ALGSEL_MD5 |
 					   OP_ALG_AAI_HMAC_PRECOMP,
-			.alg_op = OP_ALG_ALGSEL_MD5 | OP_ALG_AAI_HMAC,
 			.geniv = true,
 		},
 	},
@@ -3351,7 +3341,6 @@ static struct caam_aead_alg driver_aeads[] = {
 			.class1_alg_type = OP_ALG_ALGSEL_AES | OP_ALG_AAI_CBC,
 			.class2_alg_type = OP_ALG_ALGSEL_SHA1 |
 					   OP_ALG_AAI_HMAC_PRECOMP,
-			.alg_op = OP_ALG_ALGSEL_SHA1 | OP_ALG_AAI_HMAC,
 		},
 	},
 	{
@@ -3374,7 +3363,6 @@ static struct caam_aead_alg driver_aeads[] = {
 			.class1_alg_type = OP_ALG_ALGSEL_AES | OP_ALG_AAI_CBC,
 			.class2_alg_type = OP_ALG_ALGSEL_SHA1 |
 					   OP_ALG_AAI_HMAC_PRECOMP,
-			.alg_op = OP_ALG_ALGSEL_SHA1 | OP_ALG_AAI_HMAC,
 			.geniv = true,
 		},
 	},
@@ -3397,7 +3385,6 @@ static struct caam_aead_alg driver_aeads[] = {
 			.class1_alg_type = OP_ALG_ALGSEL_AES | OP_ALG_AAI_CBC,
 			.class2_alg_type = OP_ALG_ALGSEL_SHA224 |
 					   OP_ALG_AAI_HMAC_PRECOMP,
-			.alg_op = OP_ALG_ALGSEL_SHA224 | OP_ALG_AAI_HMAC,
 		},
 	},
 	{
@@ -3420,7 +3407,6 @@ static struct caam_aead_alg driver_aeads[] = {
 			.class1_alg_type = OP_ALG_ALGSEL_AES | OP_ALG_AAI_CBC,
 			.class2_alg_type = OP_ALG_ALGSEL_SHA224 |
 					   OP_ALG_AAI_HMAC_PRECOMP,
-			.alg_op = OP_ALG_ALGSEL_SHA224 | OP_ALG_AAI_HMAC,
 			.geniv = true,
 		},
 	},
@@ -3443,7 +3429,6 @@ static struct caam_aead_alg driver_aeads[] = {
 			.class1_alg_type = OP_ALG_ALGSEL_AES | OP_ALG_AAI_CBC,
 			.class2_alg_type = OP_ALG_ALGSEL_SHA256 |
 					   OP_ALG_AAI_HMAC_PRECOMP,
-			.alg_op = OP_ALG_ALGSEL_SHA256 | OP_ALG_AAI_HMAC,
 		},
 	},
 	{
@@ -3466,7 +3451,6 @@ static struct caam_aead_alg driver_aeads[] = {
 			.class1_alg_type = OP_ALG_ALGSEL_AES | OP_ALG_AAI_CBC,
 			.class2_alg_type = OP_ALG_ALGSEL_SHA256 |
 					   OP_ALG_AAI_HMAC_PRECOMP,
-			.alg_op = OP_ALG_ALGSEL_SHA256 | OP_ALG_AAI_HMAC,
 			.geniv = true,
 		},
 	},
@@ -3489,7 +3473,6 @@ static struct caam_aead_alg driver_aeads[] = {
 			.class1_alg_type = OP_ALG_ALGSEL_AES | OP_ALG_AAI_CBC,
 			.class2_alg_type = OP_ALG_ALGSEL_SHA384 |
 					   OP_ALG_AAI_HMAC_PRECOMP,
-			.alg_op = OP_ALG_ALGSEL_SHA384 | OP_ALG_AAI_HMAC,
 		},
 	},
 	{
@@ -3512,7 +3495,6 @@ static struct caam_aead_alg driver_aeads[] = {
 			.class1_alg_type = OP_ALG_ALGSEL_AES | OP_ALG_AAI_CBC,
 			.class2_alg_type = OP_ALG_ALGSEL_SHA384 |
 					   OP_ALG_AAI_HMAC_PRECOMP,
-			.alg_op = OP_ALG_ALGSEL_SHA384 | OP_ALG_AAI_HMAC,
 			.geniv = true,
 		},
 	},
@@ -3535,7 +3517,6 @@ static struct caam_aead_alg driver_aeads[] = {
 			.class1_alg_type = OP_ALG_ALGSEL_AES | OP_ALG_AAI_CBC,
 			.class2_alg_type = OP_ALG_ALGSEL_SHA512 |
 					   OP_ALG_AAI_HMAC_PRECOMP,
-			.alg_op = OP_ALG_ALGSEL_SHA512 | OP_ALG_AAI_HMAC,
 		},
 	},
 	{
@@ -3558,7 +3539,6 @@ static struct caam_aead_alg driver_aeads[] = {
 			.class1_alg_type = OP_ALG_ALGSEL_AES | OP_ALG_AAI_CBC,
 			.class2_alg_type = OP_ALG_ALGSEL_SHA512 |
 					   OP_ALG_AAI_HMAC_PRECOMP,
-			.alg_op = OP_ALG_ALGSEL_SHA512 | OP_ALG_AAI_HMAC,
 			.geniv = true,
 		},
 	},
@@ -3581,7 +3561,6 @@ static struct caam_aead_alg driver_aeads[] = {
 			.class1_alg_type = OP_ALG_ALGSEL_3DES | OP_ALG_AAI_CBC,
 			.class2_alg_type = OP_ALG_ALGSEL_MD5 |
 					   OP_ALG_AAI_HMAC_PRECOMP,
-			.alg_op = OP_ALG_ALGSEL_MD5 | OP_ALG_AAI_HMAC,
 		}
 	},
 	{
@@ -3604,7 +3583,6 @@ static struct caam_aead_alg driver_aeads[] = {
 			.class1_alg_type = OP_ALG_ALGSEL_3DES | OP_ALG_AAI_CBC,
 			.class2_alg_type = OP_ALG_ALGSEL_MD5 |
 					   OP_ALG_AAI_HMAC_PRECOMP,
-			.alg_op = OP_ALG_ALGSEL_MD5 | OP_ALG_AAI_HMAC,
 			.geniv = true,
 		}
 	},
@@ -3628,7 +3606,6 @@ static struct caam_aead_alg driver_aeads[] = {
 			.class1_alg_type = OP_ALG_ALGSEL_3DES | OP_ALG_AAI_CBC,
 			.class2_alg_type = OP_ALG_ALGSEL_SHA1 |
 					   OP_ALG_AAI_HMAC_PRECOMP,
-			.alg_op = OP_ALG_ALGSEL_SHA1 | OP_ALG_AAI_HMAC,
 		},
 	},
 	{
@@ -3652,7 +3629,6 @@ static struct caam_aead_alg driver_aeads[] = {
 			.class1_alg_type = OP_ALG_ALGSEL_3DES | OP_ALG_AAI_CBC,
 			.class2_alg_type = OP_ALG_ALGSEL_SHA1 |
 					   OP_ALG_AAI_HMAC_PRECOMP,
-			.alg_op = OP_ALG_ALGSEL_SHA1 | OP_ALG_AAI_HMAC,
 			.geniv = true,
 		},
 	},
@@ -3676,7 +3652,6 @@ static struct caam_aead_alg driver_aeads[] = {
 			.class1_alg_type = OP_ALG_ALGSEL_3DES | OP_ALG_AAI_CBC,
 			.class2_alg_type = OP_ALG_ALGSEL_SHA224 |
 					   OP_ALG_AAI_HMAC_PRECOMP,
-			.alg_op = OP_ALG_ALGSEL_SHA224 | OP_ALG_AAI_HMAC,
 		},
 	},
 	{
@@ -3700,7 +3675,6 @@ static struct caam_aead_alg driver_aeads[] = {
 			.class1_alg_type = OP_ALG_ALGSEL_3DES | OP_ALG_AAI_CBC,
 			.class2_alg_type = OP_ALG_ALGSEL_SHA224 |
 					   OP_ALG_AAI_HMAC_PRECOMP,
-			.alg_op = OP_ALG_ALGSEL_SHA224 | OP_ALG_AAI_HMAC,
 			.geniv = true,
 		},
 	},
@@ -3724,7 +3698,6 @@ static struct caam_aead_alg driver_aeads[] = {
 			.class1_alg_type = OP_ALG_ALGSEL_3DES | OP_ALG_AAI_CBC,
 			.class2_alg_type = OP_ALG_ALGSEL_SHA256 |
 					   OP_ALG_AAI_HMAC_PRECOMP,
-			.alg_op = OP_ALG_ALGSEL_SHA256 | OP_ALG_AAI_HMAC,
 		},
 	},
 	{
@@ -3748,7 +3721,6 @@ static struct caam_aead_alg driver_aeads[] = {
 			.class1_alg_type = OP_ALG_ALGSEL_3DES | OP_ALG_AAI_CBC,
 			.class2_alg_type = OP_ALG_ALGSEL_SHA256 |
 					   OP_ALG_AAI_HMAC_PRECOMP,
-			.alg_op = OP_ALG_ALGSEL_SHA256 | OP_ALG_AAI_HMAC,
 			.geniv = true,
 		},
 	},
@@ -3772,7 +3744,6 @@ static struct caam_aead_alg driver_aeads[] = {
 			.class1_alg_type = OP_ALG_ALGSEL_3DES | OP_ALG_AAI_CBC,
 			.class2_alg_type = OP_ALG_ALGSEL_SHA384 |
 					   OP_ALG_AAI_HMAC_PRECOMP,
-			.alg_op = OP_ALG_ALGSEL_SHA384 | OP_ALG_AAI_HMAC,
 		},
 	},
 	{
@@ -3796,7 +3767,6 @@ static struct caam_aead_alg driver_aeads[] = {
 			.class1_alg_type = OP_ALG_ALGSEL_3DES | OP_ALG_AAI_CBC,
 			.class2_alg_type = OP_ALG_ALGSEL_SHA384 |
 					   OP_ALG_AAI_HMAC_PRECOMP,
-			.alg_op = OP_ALG_ALGSEL_SHA384 | OP_ALG_AAI_HMAC,
 			.geniv = true,
 		},
 	},
@@ -3820,7 +3790,6 @@ static struct caam_aead_alg driver_aeads[] = {
 			.class1_alg_type = OP_ALG_ALGSEL_3DES | OP_ALG_AAI_CBC,
 			.class2_alg_type = OP_ALG_ALGSEL_SHA512 |
 					   OP_ALG_AAI_HMAC_PRECOMP,
-			.alg_op = OP_ALG_ALGSEL_SHA512 | OP_ALG_AAI_HMAC,
 		},
 	},
 	{
@@ -3844,7 +3813,6 @@ static struct caam_aead_alg driver_aeads[] = {
 			.class1_alg_type = OP_ALG_ALGSEL_3DES | OP_ALG_AAI_CBC,
 			.class2_alg_type = OP_ALG_ALGSEL_SHA512 |
 					   OP_ALG_AAI_HMAC_PRECOMP,
-			.alg_op = OP_ALG_ALGSEL_SHA512 | OP_ALG_AAI_HMAC,
 			.geniv = true,
 		},
 	},
@@ -3867,7 +3835,6 @@ static struct caam_aead_alg driver_aeads[] = {
 			.class1_alg_type = OP_ALG_ALGSEL_DES | OP_ALG_AAI_CBC,
 			.class2_alg_type = OP_ALG_ALGSEL_MD5 |
 					   OP_ALG_AAI_HMAC_PRECOMP,
-			.alg_op = OP_ALG_ALGSEL_MD5 | OP_ALG_AAI_HMAC,
 		},
 	},
 	{
@@ -3890,7 +3857,6 @@ static struct caam_aead_alg driver_aeads[] = {
 			.class1_alg_type = OP_ALG_ALGSEL_DES | OP_ALG_AAI_CBC,
 			.class2_alg_type = OP_ALG_ALGSEL_MD5 |
 					   OP_ALG_AAI_HMAC_PRECOMP,
-			.alg_op = OP_ALG_ALGSEL_MD5 | OP_ALG_AAI_HMAC,
 			.geniv = true,
 		},
 	},
@@ -3913,7 +3879,6 @@ static struct caam_aead_alg driver_aeads[] = {
 			.class1_alg_type = OP_ALG_ALGSEL_DES | OP_ALG_AAI_CBC,
 			.class2_alg_type = OP_ALG_ALGSEL_SHA1 |
 					   OP_ALG_AAI_HMAC_PRECOMP,
-			.alg_op = OP_ALG_ALGSEL_SHA1 | OP_ALG_AAI_HMAC,
 		},
 	},
 	{
@@ -3936,7 +3901,6 @@ static struct caam_aead_alg driver_aeads[] = {
 			.class1_alg_type = OP_ALG_ALGSEL_DES | OP_ALG_AAI_CBC,
 			.class2_alg_type = OP_ALG_ALGSEL_SHA1 |
 					   OP_ALG_AAI_HMAC_PRECOMP,
-			.alg_op = OP_ALG_ALGSEL_SHA1 | OP_ALG_AAI_HMAC,
 			.geniv = true,
 		},
 	},
@@ -3959,7 +3923,6 @@ static struct caam_aead_alg driver_aeads[] = {
 			.class1_alg_type = OP_ALG_ALGSEL_DES | OP_ALG_AAI_CBC,
 			.class2_alg_type = OP_ALG_ALGSEL_SHA224 |
 					   OP_ALG_AAI_HMAC_PRECOMP,
-			.alg_op = OP_ALG_ALGSEL_SHA224 | OP_ALG_AAI_HMAC,
 		},
 	},
 	{
@@ -3982,7 +3945,6 @@ static struct caam_aead_alg driver_aeads[] = {
 			.class1_alg_type = OP_ALG_ALGSEL_DES | OP_ALG_AAI_CBC,
 			.class2_alg_type = OP_ALG_ALGSEL_SHA224 |
 					   OP_ALG_AAI_HMAC_PRECOMP,
-			.alg_op = OP_ALG_ALGSEL_SHA224 | OP_ALG_AAI_HMAC,
 			.geniv = true,
 		},
 	},
@@ -4005,7 +3967,6 @@ static struct caam_aead_alg driver_aeads[] = {
 			.class1_alg_type = OP_ALG_ALGSEL_DES | OP_ALG_AAI_CBC,
 			.class2_alg_type = OP_ALG_ALGSEL_SHA256 |
 					   OP_ALG_AAI_HMAC_PRECOMP,
-			.alg_op = OP_ALG_ALGSEL_SHA256 | OP_ALG_AAI_HMAC,
 		},
 	},
 	{
@@ -4028,7 +3989,6 @@ static struct caam_aead_alg driver_aeads[] = {
 			.class1_alg_type = OP_ALG_ALGSEL_DES | OP_ALG_AAI_CBC,
 			.class2_alg_type = OP_ALG_ALGSEL_SHA256 |
 					   OP_ALG_AAI_HMAC_PRECOMP,
-			.alg_op = OP_ALG_ALGSEL_SHA256 | OP_ALG_AAI_HMAC,
 			.geniv = true,
 		},
 	},
@@ -4051,7 +4011,6 @@ static struct caam_aead_alg driver_aeads[] = {
 			.class1_alg_type = OP_ALG_ALGSEL_DES | OP_ALG_AAI_CBC,
 			.class2_alg_type = OP_ALG_ALGSEL_SHA384 |
 					   OP_ALG_AAI_HMAC_PRECOMP,
-			.alg_op = OP_ALG_ALGSEL_SHA384 | OP_ALG_AAI_HMAC,
 		},
 	},
 	{
@@ -4074,7 +4033,6 @@ static struct caam_aead_alg driver_aeads[] = {
 			.class1_alg_type = OP_ALG_ALGSEL_DES | OP_ALG_AAI_CBC,
 			.class2_alg_type = OP_ALG_ALGSEL_SHA384 |
 					   OP_ALG_AAI_HMAC_PRECOMP,
-			.alg_op = OP_ALG_ALGSEL_SHA384 | OP_ALG_AAI_HMAC,
 			.geniv = true,
 		},
 	},
@@ -4097,7 +4055,6 @@ static struct caam_aead_alg driver_aeads[] = {
 			.class1_alg_type = OP_ALG_ALGSEL_DES | OP_ALG_AAI_CBC,
 			.class2_alg_type = OP_ALG_ALGSEL_SHA512 |
 					   OP_ALG_AAI_HMAC_PRECOMP,
-			.alg_op = OP_ALG_ALGSEL_SHA512 | OP_ALG_AAI_HMAC,
 		},
 	},
 	{
@@ -4120,7 +4077,6 @@ static struct caam_aead_alg driver_aeads[] = {
 			.class1_alg_type = OP_ALG_ALGSEL_DES | OP_ALG_AAI_CBC,
 			.class2_alg_type = OP_ALG_ALGSEL_SHA512 |
 					   OP_ALG_AAI_HMAC_PRECOMP,
-			.alg_op = OP_ALG_ALGSEL_SHA512 | OP_ALG_AAI_HMAC,
 			.geniv = true,
 		},
 	},
@@ -4145,7 +4101,6 @@ static struct caam_aead_alg driver_aeads[] = {
 					   OP_ALG_AAI_CTR_MOD128,
 			.class2_alg_type = OP_ALG_ALGSEL_MD5 |
 					   OP_ALG_AAI_HMAC_PRECOMP,
-			.alg_op = OP_ALG_ALGSEL_MD5 | OP_ALG_AAI_HMAC,
 			.rfc3686 = true,
 		},
 	},
@@ -4170,7 +4125,6 @@ static struct caam_aead_alg driver_aeads[] = {
 					   OP_ALG_AAI_CTR_MOD128,
 			.class2_alg_type = OP_ALG_ALGSEL_MD5 |
 					   OP_ALG_AAI_HMAC_PRECOMP,
-			.alg_op = OP_ALG_ALGSEL_MD5 | OP_ALG_AAI_HMAC,
 			.rfc3686 = true,
 			.geniv = true,
 		},
@@ -4196,7 +4150,6 @@ static struct caam_aead_alg driver_aeads[] = {
 					   OP_ALG_AAI_CTR_MOD128,
 			.class2_alg_type = OP_ALG_ALGSEL_SHA1 |
 					   OP_ALG_AAI_HMAC_PRECOMP,
-			.alg_op = OP_ALG_ALGSEL_SHA1 | OP_ALG_AAI_HMAC,
 			.rfc3686 = true,
 		},
 	},
@@ -4221,7 +4174,6 @@ static struct caam_aead_alg driver_aeads[] = {
 					   OP_ALG_AAI_CTR_MOD128,
 			.class2_alg_type = OP_ALG_ALGSEL_SHA1 |
 					   OP_ALG_AAI_HMAC_PRECOMP,
-			.alg_op = OP_ALG_ALGSEL_SHA1 | OP_ALG_AAI_HMAC,
 			.rfc3686 = true,
 			.geniv = true,
 		},
@@ -4247,7 +4199,6 @@ static struct caam_aead_alg driver_aeads[] = {
 					   OP_ALG_AAI_CTR_MOD128,
 			.class2_alg_type = OP_ALG_ALGSEL_SHA224 |
 					   OP_ALG_AAI_HMAC_PRECOMP,
-			.alg_op = OP_ALG_ALGSEL_SHA224 | OP_ALG_AAI_HMAC,
 			.rfc3686 = true,
 		},
 	},
@@ -4272,7 +4223,6 @@ static struct caam_aead_alg driver_aeads[] = {
 					   OP_ALG_AAI_CTR_MOD128,
 			.class2_alg_type = OP_ALG_ALGSEL_SHA224 |
 					   OP_ALG_AAI_HMAC_PRECOMP,
-			.alg_op = OP_ALG_ALGSEL_SHA224 | OP_ALG_AAI_HMAC,
 			.rfc3686 = true,
 			.geniv = true,
 		},
@@ -4298,7 +4248,6 @@ static struct caam_aead_alg driver_aeads[] = {
 					   OP_ALG_AAI_CTR_MOD128,
 			.class2_alg_type = OP_ALG_ALGSEL_SHA256 |
 					   OP_ALG_AAI_HMAC_PRECOMP,
-			.alg_op = OP_ALG_ALGSEL_SHA256 | OP_ALG_AAI_HMAC,
 			.rfc3686 = true,
 		},
 	},
@@ -4323,7 +4272,6 @@ static struct caam_aead_alg driver_aeads[] = {
 					   OP_ALG_AAI_CTR_MOD128,
 			.class2_alg_type = OP_ALG_ALGSEL_SHA256 |
 					   OP_ALG_AAI_HMAC_PRECOMP,
-			.alg_op = OP_ALG_ALGSEL_SHA256 | OP_ALG_AAI_HMAC,
 			.rfc3686 = true,
 			.geniv = true,
 		},
@@ -4349,7 +4297,6 @@ static struct caam_aead_alg driver_aeads[] = {
 					   OP_ALG_AAI_CTR_MOD128,
 			.class2_alg_type = OP_ALG_ALGSEL_SHA384 |
 					   OP_ALG_AAI_HMAC_PRECOMP,
-			.alg_op = OP_ALG_ALGSEL_SHA384 | OP_ALG_AAI_HMAC,
 			.rfc3686 = true,
 		},
 	},
@@ -4374,7 +4321,6 @@ static struct caam_aead_alg driver_aeads[] = {
 					   OP_ALG_AAI_CTR_MOD128,
 			.class2_alg_type = OP_ALG_ALGSEL_SHA384 |
 					   OP_ALG_AAI_HMAC_PRECOMP,
-			.alg_op = OP_ALG_ALGSEL_SHA384 | OP_ALG_AAI_HMAC,
 			.rfc3686 = true,
 			.geniv = true,
 		},
@@ -4400,7 +4346,6 @@ static struct caam_aead_alg driver_aeads[] = {
 					   OP_ALG_AAI_CTR_MOD128,
 			.class2_alg_type = OP_ALG_ALGSEL_SHA512 |
 					   OP_ALG_AAI_HMAC_PRECOMP,
-			.alg_op = OP_ALG_ALGSEL_SHA512 | OP_ALG_AAI_HMAC,
 			.rfc3686 = true,
 		},
 	},
@@ -4425,7 +4370,6 @@ static struct caam_aead_alg driver_aeads[] = {
 					   OP_ALG_AAI_CTR_MOD128,
 			.class2_alg_type = OP_ALG_ALGSEL_SHA512 |
 					   OP_ALG_AAI_HMAC_PRECOMP,
-			.alg_op = OP_ALG_ALGSEL_SHA512 | OP_ALG_AAI_HMAC,
 			.rfc3686 = true,
 			.geniv = true,
 		},
@@ -4449,7 +4393,6 @@ static int caam_init_common(struct caam_ctx *ctx, struct caam_alg_entry *caam)
 	/* copy descriptor header template value */
 	ctx->cdata.algtype = OP_TYPE_CLASS1_ALG | caam->class1_alg_type;
 	ctx->adata.algtype = OP_TYPE_CLASS2_ALG | caam->class2_alg_type;
-	ctx->alg_op = OP_TYPE_CLASS2_ALG | caam->alg_op;
 
 	return 0;
 }
@@ -4570,7 +4513,6 @@ static struct caam_crypto_alg *caam_alg_alloc(struct caam_alg_template
 
 	t_alg->caam.class1_alg_type = template->class1_alg_type;
 	t_alg->caam.class2_alg_type = template->class2_alg_type;
-	t_alg->caam.alg_op = template->alg_op;
 
 	return t_alg;
 }
diff --git a/drivers/crypto/caam/caamhash.c b/drivers/crypto/caam/caamhash.c
index 5e569ff06b4b..8e4530d68208 100644
--- a/drivers/crypto/caam/caamhash.c
+++ b/drivers/crypto/caam/caamhash.c
@@ -108,7 +108,6 @@ struct caam_hash_ctx {
 	dma_addr_t sh_desc_fin_dma;
 	dma_addr_t sh_desc_digest_dma;
 	struct device *jrdev;
-	u32 alg_op;
 	u8 key[CAAM_MAX_HASH_KEY_SIZE];
 	dma_addr_t key_dma;
 	int ctx_len;
@@ -402,8 +401,7 @@ static int ahash_set_sh_desc(struct crypto_ahash *ahash)
 static int gen_split_hash_key(struct caam_hash_ctx *ctx, const u8 *key_in,
 			      u32 keylen)
 {
-	return gen_split_key(ctx->jrdev, ctx->key, &ctx->adata, key_in, keylen,
-			     ctx->alg_op);
+	return gen_split_key(ctx->jrdev, ctx->key, &ctx->adata, key_in, keylen);
 }
 
 /* Digest hash size if it is too large */
@@ -512,7 +510,8 @@ static int ahash_setkey(struct crypto_ahash *ahash,
 	}
 
 	/* Pick class 2 key length from algorithm submask */
-	ctx->adata.keylen = mdpadlen[(ctx->alg_op & OP_ALG_ALGSEL_SUBMASK) >>
+	ctx->adata.keylen = mdpadlen[(ctx->adata.algtype &
+				      OP_ALG_ALGSEL_SUBMASK) >>
 				     OP_ALG_ALGSEL_SHIFT] * 2;
 	ctx->adata.keylen_pad = ALIGN(ctx->adata.keylen, 16);
 
@@ -1654,7 +1653,6 @@ struct caam_hash_template {
 	unsigned int blocksize;
 	struct ahash_alg template_ahash;
 	u32 alg_type;
-	u32 alg_op;
 };
 
 /* ahash descriptors */
@@ -1680,7 +1678,6 @@ static struct caam_hash_template driver_hash[] = {
 			},
 		},
 		.alg_type = OP_ALG_ALGSEL_SHA1,
-		.alg_op = OP_ALG_ALGSEL_SHA1 | OP_ALG_AAI_HMAC,
 	}, {
 		.name = "sha224",
 		.driver_name = "sha224-caam",
@@ -1702,7 +1699,6 @@ static struct caam_hash_template driver_hash[] = {
 			},
 		},
 		.alg_type = OP_ALG_ALGSEL_SHA224,
-		.alg_op = OP_ALG_ALGSEL_SHA224 | OP_ALG_AAI_HMAC,
 	}, {
 		.name = "sha256",
 		.driver_name = "sha256-caam",
@@ -1724,7 +1720,6 @@ static struct caam_hash_template driver_hash[] = {
 			},
 		},
 		.alg_type = OP_ALG_ALGSEL_SHA256,
-		.alg_op = OP_ALG_ALGSEL_SHA256 | OP_ALG_AAI_HMAC,
 	}, {
 		.name = "sha384",
 		.driver_name = "sha384-caam",
@@ -1746,7 +1741,6 @@ static struct caam_hash_template driver_hash[] = {
 			},
 		},
 		.alg_type = OP_ALG_ALGSEL_SHA384,
-		.alg_op = OP_ALG_ALGSEL_SHA384 | OP_ALG_AAI_HMAC,
 	}, {
 		.name = "sha512",
 		.driver_name = "sha512-caam",
@@ -1768,7 +1762,6 @@ static struct caam_hash_template driver_hash[] = {
 			},
 		},
 		.alg_type = OP_ALG_ALGSEL_SHA512,
-		.alg_op = OP_ALG_ALGSEL_SHA512 | OP_ALG_AAI_HMAC,
 	}, {
 		.name = "md5",
 		.driver_name = "md5-caam",
@@ -1790,14 +1783,12 @@ static struct caam_hash_template driver_hash[] = {
 			},
 		},
 		.alg_type = OP_ALG_ALGSEL_MD5,
-		.alg_op = OP_ALG_ALGSEL_MD5 | OP_ALG_AAI_HMAC,
 	},
 };
 
 struct caam_hash_alg {
 	struct list_head entry;
 	int alg_type;
-	int alg_op;
 	struct ahash_alg ahash_alg;
 };
 
@@ -1831,9 +1822,9 @@ static int caam_hash_cra_init(struct crypto_tfm *tfm)
 	}
 	/* copy descriptor header template value */
 	ctx->adata.algtype = OP_TYPE_CLASS2_ALG | caam_hash->alg_type;
-	ctx->alg_op = OP_TYPE_CLASS2_ALG | caam_hash->alg_op;
 
-	ctx->ctx_len = runninglen[(ctx->alg_op & OP_ALG_ALGSEL_SUBMASK) >>
+	ctx->ctx_len = runninglen[(ctx->adata.algtype &
+				   OP_ALG_ALGSEL_SUBMASK) >>
 				  OP_ALG_ALGSEL_SHIFT];
 
 	crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm),
@@ -1923,7 +1914,6 @@ caam_hash_alloc(struct caam_hash_template *template,
 	alg->cra_type = &crypto_ahash_type;
 
 	t_alg->alg_type = template->alg_type;
-	t_alg->alg_op = template->alg_op;
 
 	return t_alg;
 }
diff --git a/drivers/crypto/caam/key_gen.c b/drivers/crypto/caam/key_gen.c
index df287e751df1..621199a02f2e 100644
--- a/drivers/crypto/caam/key_gen.c
+++ b/drivers/crypto/caam/key_gen.c
@@ -42,8 +42,7 @@ Split key generation-----------------------------------------------
 			@0xffe04000
 */
 int gen_split_key(struct device *jrdev, u8 *key_out,
-		  struct alginfo * const adata, const u8 *key_in, u32 keylen,
-		  u32 alg_op)
+		  struct alginfo * const adata, const u8 *key_in, u32 keylen)
 {
 	u32 *desc;
 	struct split_key_result result;
@@ -74,7 +73,9 @@ int gen_split_key(struct device *jrdev, u8 *key_out,
 	append_key(desc, dma_addr_in, keylen, CLASS_2 | KEY_DEST_CLASS_REG);
 
 	/* Sets MDHA up into an HMAC-INIT */
-	append_operation(desc, alg_op | OP_ALG_DECRYPT | OP_ALG_AS_INIT);
+	append_operation(desc, (adata->algtype & OP_ALG_ALGSEL_MASK) |
+			 OP_ALG_AAI_HMAC | OP_TYPE_CLASS2_ALG | OP_ALG_DECRYPT |
+			 OP_ALG_AS_INIT);
 
 	/*
 	 * do a FIFO_LOAD of zero, this will trigger the internal key expansion
diff --git a/drivers/crypto/caam/key_gen.h b/drivers/crypto/caam/key_gen.h
index 511882af0596..e87483c6057b 100644
--- a/drivers/crypto/caam/key_gen.h
+++ b/drivers/crypto/caam/key_gen.h
@@ -13,5 +13,4 @@ struct split_key_result {
 void split_key_done(struct device *dev, u32 *desc, u32 err, void *context);
 
 int gen_split_key(struct device *jrdev, u8 *key_out,
-		  struct alginfo * const adata, const u8 *key_in, u32 keylen,
-		  u32 alg_op);
+		  struct alginfo * const adata, const u8 *key_in, u32 keylen);
-- 
2.4.4

^ permalink raw reply related

* [PATCH 02/10] crypto: caam - group algorithm related params
From: Horia Geantă @ 2016-11-22 13:44 UTC (permalink / raw)
  To: Herbert Xu
  Cc: David S. Miller, linux-crypto, Dan Douglass, Tudor Ambarus,
	Alexandru Porosanu
In-Reply-To: <1479822252-23833-1-git-send-email-horia.geanta@nxp.com>

In preparation of factoring out the shared descriptors,
struct alginfo is introduced to group the algorithm related
parameters.

Signed-off-by: Horia Geantă <horia.geanta@nxp.com>
---
 drivers/crypto/caam/caamalg.c     | 394 +++++++++++++++++++++-----------------
 drivers/crypto/caam/caamhash.c    |  48 +++--
 drivers/crypto/caam/desc_constr.h |  19 ++
 drivers/crypto/caam/key_gen.c     |  12 +-
 drivers/crypto/caam/key_gen.h     |   6 +-
 5 files changed, 274 insertions(+), 205 deletions(-)

diff --git a/drivers/crypto/caam/caamalg.c b/drivers/crypto/caam/caamalg.c
index 567e234fb49b..4141143cce7d 100644
--- a/drivers/crypto/caam/caamalg.c
+++ b/drivers/crypto/caam/caamalg.c
@@ -217,14 +217,11 @@ struct caam_ctx {
 	dma_addr_t sh_desc_enc_dma;
 	dma_addr_t sh_desc_dec_dma;
 	dma_addr_t sh_desc_givenc_dma;
-	u32 class1_alg_type;
-	u32 class2_alg_type;
 	u32 alg_op;
 	u8 key[CAAM_MAX_KEY_SIZE];
 	dma_addr_t key_dma;
-	unsigned int enckeylen;
-	unsigned int split_key_len;
-	unsigned int split_key_pad_len;
+	struct alginfo adata;
+	struct alginfo cdata;
 	unsigned int authsize;
 };
 
@@ -232,7 +229,7 @@ static void init_sh_desc_key_aead(u32 *desc, struct caam_ctx *ctx,
 				  int keys_fit_inline, bool is_rfc3686)
 {
 	u32 *key_jump_cmd;
-	unsigned int enckeylen = ctx->enckeylen;
+	unsigned int enckeylen = ctx->cdata.keylen;
 
 	/* Note: Context registers are saved. */
 	init_sh_desc(desc, HDR_SHARE_SERIAL | HDR_SAVECTX);
@@ -250,24 +247,23 @@ static void init_sh_desc_key_aead(u32 *desc, struct caam_ctx *ctx,
 		enckeylen -= CTR_RFC3686_NONCE_SIZE;
 
 	if (keys_fit_inline) {
-		append_key_as_imm(desc, ctx->key, ctx->split_key_pad_len,
-				  ctx->split_key_len, CLASS_2 |
-				  KEY_DEST_MDHA_SPLIT | KEY_ENC);
-		append_key_as_imm(desc, (void *)ctx->key +
-				  ctx->split_key_pad_len, enckeylen,
+		append_key_as_imm(desc, (void *)ctx->adata.key,
+				  ctx->adata.keylen_pad, ctx->adata.keylen,
+				  CLASS_2 | KEY_DEST_MDHA_SPLIT | KEY_ENC);
+		append_key_as_imm(desc, (void *)ctx->cdata.key, enckeylen,
 				  enckeylen, CLASS_1 | KEY_DEST_CLASS_REG);
 	} else {
-		append_key(desc, ctx->key_dma, ctx->split_key_len, CLASS_2 |
+		append_key(desc, ctx->adata.key, ctx->adata.keylen, CLASS_2 |
 			   KEY_DEST_MDHA_SPLIT | KEY_ENC);
-		append_key(desc, ctx->key_dma + ctx->split_key_pad_len,
-			   enckeylen, CLASS_1 | KEY_DEST_CLASS_REG);
+		append_key(desc, ctx->cdata.key, enckeylen, CLASS_1 |
+			   KEY_DEST_CLASS_REG);
 	}
 
 	/* Load Counter into CONTEXT1 reg */
 	if (is_rfc3686) {
 		u32 *nonce;
 
-		nonce = (u32 *)((void *)ctx->key + ctx->split_key_pad_len +
+		nonce = (u32 *)((void *)ctx->key + ctx->adata.keylen_pad +
 			       enckeylen);
 		append_load_as_imm(desc, nonce, CTR_RFC3686_NONCE_SIZE,
 				   LDST_CLASS_IND_CCB |
@@ -286,7 +282,6 @@ static int aead_null_set_sh_desc(struct crypto_aead *aead)
 {
 	struct caam_ctx *ctx = crypto_aead_ctx(aead);
 	struct device *jrdev = ctx->jrdev;
-	bool keys_fit_inline = false;
 	u32 *key_jump_cmd, *jump_cmd, *read_move_cmd, *write_move_cmd;
 	u32 *desc;
 
@@ -295,8 +290,13 @@ static int aead_null_set_sh_desc(struct crypto_aead *aead)
 	 * must all fit into the 64-word Descriptor h/w Buffer
 	 */
 	if (DESC_AEAD_NULL_ENC_LEN + AEAD_DESC_JOB_IO_LEN +
-	    ctx->split_key_pad_len <= CAAM_DESC_BYTES_MAX)
-		keys_fit_inline = true;
+	    ctx->adata.keylen_pad <= CAAM_DESC_BYTES_MAX) {
+		ctx->adata.key_inline = true;
+		ctx->adata.key = (uintptr_t)ctx->key;
+	} else {
+		ctx->adata.key_inline = false;
+		ctx->adata.key = ctx->key_dma;
+	}
 
 	/* aead_encrypt shared descriptor */
 	desc = ctx->sh_desc_enc;
@@ -306,12 +306,12 @@ static int aead_null_set_sh_desc(struct crypto_aead *aead)
 	/* Skip if already shared */
 	key_jump_cmd = append_jump(desc, JUMP_JSL | JUMP_TEST_ALL |
 				   JUMP_COND_SHRD);
-	if (keys_fit_inline)
-		append_key_as_imm(desc, ctx->key, ctx->split_key_pad_len,
-				  ctx->split_key_len, CLASS_2 |
-				  KEY_DEST_MDHA_SPLIT | KEY_ENC);
+	if (ctx->adata.key_inline)
+		append_key_as_imm(desc, (void *)ctx->adata.key,
+				  ctx->adata.keylen_pad, ctx->adata.keylen,
+				  CLASS_2 | KEY_DEST_MDHA_SPLIT | KEY_ENC);
 	else
-		append_key(desc, ctx->key_dma, ctx->split_key_len, CLASS_2 |
+		append_key(desc, ctx->adata.key, ctx->adata.keylen, CLASS_2 |
 			   KEY_DEST_MDHA_SPLIT | KEY_ENC);
 	set_jump_tgt_here(desc, key_jump_cmd);
 
@@ -336,8 +336,8 @@ static int aead_null_set_sh_desc(struct crypto_aead *aead)
 				     (0x8 << MOVE_LEN_SHIFT));
 
 	/* Class 2 operation */
-	append_operation(desc, ctx->class2_alg_type |
-			 OP_ALG_AS_INITFINAL | OP_ALG_ENCRYPT);
+	append_operation(desc, ctx->adata.algtype | OP_ALG_AS_INITFINAL |
+			 OP_ALG_ENCRYPT);
 
 	/* Read and write cryptlen bytes */
 	aead_append_src_dst(desc, FIFOLD_TYPE_MSG | FIFOLD_TYPE_FLUSH1);
@@ -370,10 +370,14 @@ static int aead_null_set_sh_desc(struct crypto_aead *aead)
 	 * Job Descriptor and Shared Descriptors
 	 * must all fit into the 64-word Descriptor h/w Buffer
 	 */
-	keys_fit_inline = false;
 	if (DESC_AEAD_NULL_DEC_LEN + DESC_JOB_IO_LEN +
-	    ctx->split_key_pad_len <= CAAM_DESC_BYTES_MAX)
-		keys_fit_inline = true;
+	    ctx->adata.keylen_pad <= CAAM_DESC_BYTES_MAX) {
+		ctx->adata.key_inline = true;
+		ctx->adata.key = (uintptr_t)ctx->key;
+	} else {
+		ctx->adata.key_inline = false;
+		ctx->adata.key = ctx->key_dma;
+	}
 
 	desc = ctx->sh_desc_dec;
 
@@ -383,18 +387,18 @@ static int aead_null_set_sh_desc(struct crypto_aead *aead)
 	/* Skip if already shared */
 	key_jump_cmd = append_jump(desc, JUMP_JSL | JUMP_TEST_ALL |
 				   JUMP_COND_SHRD);
-	if (keys_fit_inline)
-		append_key_as_imm(desc, ctx->key, ctx->split_key_pad_len,
-				  ctx->split_key_len, CLASS_2 |
-				  KEY_DEST_MDHA_SPLIT | KEY_ENC);
+	if (ctx->adata.key_inline)
+		append_key_as_imm(desc, (void *)ctx->adata.key,
+				  ctx->adata.keylen_pad, ctx->adata.keylen,
+				  CLASS_2 | KEY_DEST_MDHA_SPLIT | KEY_ENC);
 	else
-		append_key(desc, ctx->key_dma, ctx->split_key_len, CLASS_2 |
+		append_key(desc, ctx->adata.key, ctx->adata.keylen, CLASS_2 |
 			   KEY_DEST_MDHA_SPLIT | KEY_ENC);
 	set_jump_tgt_here(desc, key_jump_cmd);
 
 	/* Class 2 operation */
-	append_operation(desc, ctx->class2_alg_type |
-			 OP_ALG_AS_INITFINAL | OP_ALG_DECRYPT | OP_ALG_ICV_ON);
+	append_operation(desc, ctx->adata.algtype | OP_ALG_AS_INITFINAL |
+			 OP_ALG_DECRYPT | OP_ALG_ICV_ON);
 
 	/* assoclen + cryptlen = seqoutlen */
 	append_math_sub(desc, REG2, SEQOUTLEN, REG0, CAAM_CMD_SZ);
@@ -465,7 +469,7 @@ static int aead_set_sh_desc(struct crypto_aead *aead)
 	u32 geniv, moveiv;
 	u32 ctx1_iv_off = 0;
 	u32 *desc;
-	const bool ctr_mode = ((ctx->class1_alg_type & OP_ALG_AAI_MASK) ==
+	const bool ctr_mode = ((ctx->cdata.algtype & OP_ALG_AAI_MASK) ==
 			       OP_ALG_AAI_CTR_MOD128);
 	const bool is_rfc3686 = alg->caam.rfc3686;
 
@@ -473,7 +477,7 @@ static int aead_set_sh_desc(struct crypto_aead *aead)
 		return 0;
 
 	/* NULL encryption / decryption */
-	if (!ctx->enckeylen)
+	if (!ctx->cdata.keylen)
 		return aead_null_set_sh_desc(aead);
 
 	/*
@@ -498,12 +502,18 @@ static int aead_set_sh_desc(struct crypto_aead *aead)
 	 * Job Descriptor and Shared Descriptors
 	 * must all fit into the 64-word Descriptor h/w Buffer
 	 */
-	keys_fit_inline = false;
 	if (DESC_AEAD_ENC_LEN + AUTHENC_DESC_JOB_IO_LEN +
-	    ctx->split_key_pad_len + ctx->enckeylen +
+	    ctx->adata.keylen_pad + ctx->cdata.keylen +
 	    (is_rfc3686 ? DESC_AEAD_CTR_RFC3686_LEN : 0) <=
-	    CAAM_DESC_BYTES_MAX)
+	    CAAM_DESC_BYTES_MAX) {
 		keys_fit_inline = true;
+		ctx->adata.key = (uintptr_t)ctx->key;
+		ctx->cdata.key = (uintptr_t)(ctx->key + ctx->adata.keylen_pad);
+	} else {
+		keys_fit_inline = false;
+		ctx->adata.key = ctx->key_dma;
+		ctx->cdata.key = ctx->key_dma + ctx->adata.keylen_pad;
+	}
 
 	/* aead_encrypt shared descriptor */
 	desc = ctx->sh_desc_enc;
@@ -512,8 +522,8 @@ static int aead_set_sh_desc(struct crypto_aead *aead)
 	init_sh_desc_key_aead(desc, ctx, keys_fit_inline, is_rfc3686);
 
 	/* Class 2 operation */
-	append_operation(desc, ctx->class2_alg_type |
-			 OP_ALG_AS_INITFINAL | OP_ALG_ENCRYPT);
+	append_operation(desc, ctx->adata.algtype | OP_ALG_AS_INITFINAL |
+			 OP_ALG_ENCRYPT);
 
 	/* Read and write assoclen bytes */
 	append_math_add(desc, VARSEQINLEN, ZERO, REG3, CAAM_CMD_SZ);
@@ -534,8 +544,8 @@ static int aead_set_sh_desc(struct crypto_aead *aead)
 				      LDST_OFFSET_SHIFT));
 
 	/* Class 1 operation */
-	append_operation(desc, ctx->class1_alg_type |
-			 OP_ALG_AS_INITFINAL | OP_ALG_ENCRYPT);
+	append_operation(desc, ctx->cdata.algtype | OP_ALG_AS_INITFINAL |
+			 OP_ALG_ENCRYPT);
 
 	/* Read and write cryptlen bytes */
 	append_math_add(desc, VARSEQINLEN, SEQINLEN, REG0, CAAM_CMD_SZ);
@@ -564,12 +574,18 @@ static int aead_set_sh_desc(struct crypto_aead *aead)
 	 * Job Descriptor and Shared Descriptors
 	 * must all fit into the 64-word Descriptor h/w Buffer
 	 */
-	keys_fit_inline = false;
 	if (DESC_AEAD_DEC_LEN + AUTHENC_DESC_JOB_IO_LEN +
-	    ctx->split_key_pad_len + ctx->enckeylen +
+	    ctx->adata.keylen_pad + ctx->cdata.keylen +
 	    (is_rfc3686 ? DESC_AEAD_CTR_RFC3686_LEN : 0) <=
-	    CAAM_DESC_BYTES_MAX)
+	    CAAM_DESC_BYTES_MAX) {
 		keys_fit_inline = true;
+		ctx->adata.key = (uintptr_t)ctx->key;
+		ctx->cdata.key = (uintptr_t)(ctx->key + ctx->adata.keylen_pad);
+	} else {
+		keys_fit_inline = false;
+		ctx->adata.key = ctx->key_dma;
+		ctx->cdata.key = ctx->key_dma + ctx->adata.keylen_pad;
+	}
 
 	/* aead_decrypt shared descriptor */
 	desc = ctx->sh_desc_dec;
@@ -578,8 +594,8 @@ static int aead_set_sh_desc(struct crypto_aead *aead)
 	init_sh_desc_key_aead(desc, ctx, keys_fit_inline, is_rfc3686);
 
 	/* Class 2 operation */
-	append_operation(desc, ctx->class2_alg_type |
-			 OP_ALG_AS_INITFINAL | OP_ALG_DECRYPT | OP_ALG_ICV_ON);
+	append_operation(desc, ctx->adata.algtype | OP_ALG_AS_INITFINAL |
+			 OP_ALG_DECRYPT | OP_ALG_ICV_ON);
 
 	/* Read and write assoclen bytes */
 	append_math_add(desc, VARSEQINLEN, ZERO, REG3, CAAM_CMD_SZ);
@@ -612,10 +628,10 @@ static int aead_set_sh_desc(struct crypto_aead *aead)
 
 	/* Choose operation */
 	if (ctr_mode)
-		append_operation(desc, ctx->class1_alg_type |
+		append_operation(desc, ctx->cdata.algtype |
 				 OP_ALG_AS_INITFINAL | OP_ALG_DECRYPT);
 	else
-		append_dec_op1(desc, ctx->class1_alg_type);
+		append_dec_op1(desc, ctx->cdata.algtype);
 
 	/* Read and write cryptlen bytes */
 	append_math_add(desc, VARSEQINLEN, SEQOUTLEN, REG0, CAAM_CMD_SZ);
@@ -646,12 +662,18 @@ static int aead_set_sh_desc(struct crypto_aead *aead)
 	 * Job Descriptor and Shared Descriptors
 	 * must all fit into the 64-word Descriptor h/w Buffer
 	 */
-	keys_fit_inline = false;
 	if (DESC_AEAD_GIVENC_LEN + AUTHENC_DESC_JOB_IO_LEN +
-	    ctx->split_key_pad_len + ctx->enckeylen +
+	    ctx->adata.keylen_pad + ctx->cdata.keylen +
 	    (is_rfc3686 ? DESC_AEAD_CTR_RFC3686_LEN : 0) <=
-	    CAAM_DESC_BYTES_MAX)
+	    CAAM_DESC_BYTES_MAX) {
 		keys_fit_inline = true;
+		ctx->adata.key = (uintptr_t)ctx->key;
+		ctx->cdata.key = (uintptr_t)(ctx->key + ctx->adata.keylen_pad);
+	} else {
+		keys_fit_inline = false;
+		ctx->adata.key = ctx->key_dma;
+		ctx->cdata.key = ctx->key_dma + ctx->adata.keylen_pad;
+	}
 
 	/* aead_givencrypt shared descriptor */
 	desc = ctx->sh_desc_enc;
@@ -682,8 +704,8 @@ static int aead_set_sh_desc(struct crypto_aead *aead)
 		    (ivsize << MOVE_LEN_SHIFT));
 
 	/* Return to encryption */
-	append_operation(desc, ctx->class2_alg_type |
-			 OP_ALG_AS_INITFINAL | OP_ALG_ENCRYPT);
+	append_operation(desc, ctx->adata.algtype | OP_ALG_AS_INITFINAL |
+			 OP_ALG_ENCRYPT);
 
 	/* Read and write assoclen bytes */
 	append_math_add(desc, VARSEQINLEN, ZERO, REG3, CAAM_CMD_SZ);
@@ -712,8 +734,8 @@ static int aead_set_sh_desc(struct crypto_aead *aead)
 				      LDST_OFFSET_SHIFT));
 
 	/* Class 1 operation */
-	append_operation(desc, ctx->class1_alg_type |
-			 OP_ALG_AS_INITFINAL | OP_ALG_ENCRYPT);
+	append_operation(desc, ctx->cdata.algtype | OP_ALG_AS_INITFINAL |
+			 OP_ALG_ENCRYPT);
 
 	/* Will write ivsize + cryptlen */
 	append_math_add(desc, VARSEQOUTLEN, SEQINLEN, REG0, CAAM_CMD_SZ);
@@ -764,12 +786,11 @@ static int gcm_set_sh_desc(struct crypto_aead *aead)
 {
 	struct caam_ctx *ctx = crypto_aead_ctx(aead);
 	struct device *jrdev = ctx->jrdev;
-	bool keys_fit_inline = false;
 	u32 *key_jump_cmd, *zero_payload_jump_cmd,
 	    *zero_assoc_jump_cmd1, *zero_assoc_jump_cmd2;
 	u32 *desc;
 
-	if (!ctx->enckeylen || !ctx->authsize)
+	if (!ctx->cdata.keylen || !ctx->authsize)
 		return 0;
 
 	/*
@@ -778,8 +799,13 @@ static int gcm_set_sh_desc(struct crypto_aead *aead)
 	 * must fit into the 64-word Descriptor h/w Buffer
 	 */
 	if (DESC_GCM_ENC_LEN + GCM_DESC_JOB_IO_LEN +
-	    ctx->enckeylen <= CAAM_DESC_BYTES_MAX)
-		keys_fit_inline = true;
+	    ctx->cdata.keylen <= CAAM_DESC_BYTES_MAX) {
+		ctx->cdata.key_inline = true;
+		ctx->cdata.key = (uintptr_t)ctx->key;
+	} else {
+		ctx->cdata.key_inline = false;
+		ctx->cdata.key = ctx->key_dma;
+	}
 
 	desc = ctx->sh_desc_enc;
 
@@ -788,17 +814,18 @@ static int gcm_set_sh_desc(struct crypto_aead *aead)
 	/* skip key loading if they are loaded due to sharing */
 	key_jump_cmd = append_jump(desc, JUMP_JSL | JUMP_TEST_ALL |
 				   JUMP_COND_SHRD | JUMP_COND_SELF);
-	if (keys_fit_inline)
-		append_key_as_imm(desc, (void *)ctx->key, ctx->enckeylen,
-				  ctx->enckeylen, CLASS_1 | KEY_DEST_CLASS_REG);
+	if (ctx->cdata.key_inline)
+		append_key_as_imm(desc, (void *)ctx->cdata.key,
+				  ctx->cdata.keylen, ctx->cdata.keylen,
+				  CLASS_1 | KEY_DEST_CLASS_REG);
 	else
-		append_key(desc, ctx->key_dma, ctx->enckeylen,
-			   CLASS_1 | KEY_DEST_CLASS_REG);
+		append_key(desc, ctx->cdata.key, ctx->cdata.keylen, CLASS_1 |
+			   KEY_DEST_CLASS_REG);
 	set_jump_tgt_here(desc, key_jump_cmd);
 
 	/* class 1 operation */
-	append_operation(desc, ctx->class1_alg_type |
-			 OP_ALG_AS_INITFINAL | OP_ALG_ENCRYPT);
+	append_operation(desc, ctx->cdata.algtype | OP_ALG_AS_INITFINAL |
+			 OP_ALG_ENCRYPT);
 
 	/* if assoclen + cryptlen is ZERO, skip to ICV write */
 	append_math_sub(desc, VARSEQOUTLEN, SEQINLEN, REG0, CAAM_CMD_SZ);
@@ -870,10 +897,14 @@ static int gcm_set_sh_desc(struct crypto_aead *aead)
 	 * Job Descriptor and Shared Descriptors
 	 * must all fit into the 64-word Descriptor h/w Buffer
 	 */
-	keys_fit_inline = false;
 	if (DESC_GCM_DEC_LEN + GCM_DESC_JOB_IO_LEN +
-	    ctx->enckeylen <= CAAM_DESC_BYTES_MAX)
-		keys_fit_inline = true;
+	    ctx->cdata.keylen <= CAAM_DESC_BYTES_MAX) {
+		ctx->cdata.key_inline = true;
+		ctx->cdata.key = (uintptr_t)ctx->key;
+	} else {
+		ctx->cdata.key_inline = false;
+		ctx->cdata.key = ctx->key_dma;
+	}
 
 	desc = ctx->sh_desc_dec;
 
@@ -883,17 +914,18 @@ static int gcm_set_sh_desc(struct crypto_aead *aead)
 	key_jump_cmd = append_jump(desc, JUMP_JSL |
 				   JUMP_TEST_ALL | JUMP_COND_SHRD |
 				   JUMP_COND_SELF);
-	if (keys_fit_inline)
-		append_key_as_imm(desc, (void *)ctx->key, ctx->enckeylen,
-				  ctx->enckeylen, CLASS_1 | KEY_DEST_CLASS_REG);
+	if (ctx->cdata.key_inline)
+		append_key_as_imm(desc, (void *)ctx->cdata.key,
+				  ctx->cdata.keylen, ctx->cdata.keylen,
+				  CLASS_1 | KEY_DEST_CLASS_REG);
 	else
-		append_key(desc, ctx->key_dma, ctx->enckeylen,
-			   CLASS_1 | KEY_DEST_CLASS_REG);
+		append_key(desc, ctx->cdata.key, ctx->cdata.keylen, CLASS_1 |
+			   KEY_DEST_CLASS_REG);
 	set_jump_tgt_here(desc, key_jump_cmd);
 
 	/* class 1 operation */
-	append_operation(desc, ctx->class1_alg_type |
-			 OP_ALG_AS_INITFINAL | OP_ALG_DECRYPT | OP_ALG_ICV_ON);
+	append_operation(desc, ctx->cdata.algtype | OP_ALG_AS_INITFINAL |
+			 OP_ALG_DECRYPT | OP_ALG_ICV_ON);
 
 	/* if assoclen is ZERO, skip reading the assoc data */
 	append_math_add(desc, VARSEQINLEN, ZERO, REG3, CAAM_CMD_SZ);
@@ -964,11 +996,10 @@ static int rfc4106_set_sh_desc(struct crypto_aead *aead)
 {
 	struct caam_ctx *ctx = crypto_aead_ctx(aead);
 	struct device *jrdev = ctx->jrdev;
-	bool keys_fit_inline = false;
 	u32 *key_jump_cmd;
 	u32 *desc;
 
-	if (!ctx->enckeylen || !ctx->authsize)
+	if (!ctx->cdata.keylen || !ctx->authsize)
 		return 0;
 
 	/*
@@ -977,8 +1008,13 @@ static int rfc4106_set_sh_desc(struct crypto_aead *aead)
 	 * must fit into the 64-word Descriptor h/w Buffer
 	 */
 	if (DESC_RFC4106_ENC_LEN + GCM_DESC_JOB_IO_LEN +
-	    ctx->enckeylen <= CAAM_DESC_BYTES_MAX)
-		keys_fit_inline = true;
+	    ctx->cdata.keylen <= CAAM_DESC_BYTES_MAX) {
+		ctx->cdata.key_inline = true;
+		ctx->cdata.key = (uintptr_t)ctx->key;
+	} else {
+		ctx->cdata.key_inline = false;
+		ctx->cdata.key = ctx->key_dma;
+	}
 
 	desc = ctx->sh_desc_enc;
 
@@ -987,17 +1023,18 @@ static int rfc4106_set_sh_desc(struct crypto_aead *aead)
 	/* Skip key loading if it is loaded due to sharing */
 	key_jump_cmd = append_jump(desc, JUMP_JSL | JUMP_TEST_ALL |
 				   JUMP_COND_SHRD);
-	if (keys_fit_inline)
-		append_key_as_imm(desc, (void *)ctx->key, ctx->enckeylen,
-				  ctx->enckeylen, CLASS_1 | KEY_DEST_CLASS_REG);
+	if (ctx->cdata.key_inline)
+		append_key_as_imm(desc, (void *)ctx->cdata.key,
+				  ctx->cdata.keylen, ctx->cdata.keylen,
+				  CLASS_1 | KEY_DEST_CLASS_REG);
 	else
-		append_key(desc, ctx->key_dma, ctx->enckeylen,
-			   CLASS_1 | KEY_DEST_CLASS_REG);
+		append_key(desc, ctx->cdata.key, ctx->cdata.keylen, CLASS_1 |
+			   KEY_DEST_CLASS_REG);
 	set_jump_tgt_here(desc, key_jump_cmd);
 
 	/* Class 1 operation */
-	append_operation(desc, ctx->class1_alg_type |
-			 OP_ALG_AS_INITFINAL | OP_ALG_ENCRYPT);
+	append_operation(desc, ctx->cdata.algtype | OP_ALG_AS_INITFINAL |
+			 OP_ALG_ENCRYPT);
 
 	append_math_sub_imm_u32(desc, VARSEQINLEN, REG3, IMM, 8);
 	append_math_add(desc, VARSEQOUTLEN, ZERO, REG3, CAAM_CMD_SZ);
@@ -1049,10 +1086,14 @@ static int rfc4106_set_sh_desc(struct crypto_aead *aead)
 	 * Job Descriptor and Shared Descriptors
 	 * must all fit into the 64-word Descriptor h/w Buffer
 	 */
-	keys_fit_inline = false;
 	if (DESC_RFC4106_DEC_LEN + DESC_JOB_IO_LEN +
-	    ctx->enckeylen <= CAAM_DESC_BYTES_MAX)
-		keys_fit_inline = true;
+	    ctx->cdata.keylen <= CAAM_DESC_BYTES_MAX) {
+		ctx->cdata.key_inline = true;
+		ctx->cdata.key = (uintptr_t)ctx->key;
+	} else {
+		ctx->cdata.key_inline = false;
+		ctx->cdata.key = ctx->key_dma;
+	}
 
 	desc = ctx->sh_desc_dec;
 
@@ -1061,17 +1102,18 @@ static int rfc4106_set_sh_desc(struct crypto_aead *aead)
 	/* Skip key loading if it is loaded due to sharing */
 	key_jump_cmd = append_jump(desc, JUMP_JSL |
 				   JUMP_TEST_ALL | JUMP_COND_SHRD);
-	if (keys_fit_inline)
-		append_key_as_imm(desc, (void *)ctx->key, ctx->enckeylen,
-				  ctx->enckeylen, CLASS_1 | KEY_DEST_CLASS_REG);
+	if (ctx->cdata.key_inline)
+		append_key_as_imm(desc, (void *)ctx->cdata.key,
+				  ctx->cdata.keylen, ctx->cdata.keylen,
+				  CLASS_1 | KEY_DEST_CLASS_REG);
 	else
-		append_key(desc, ctx->key_dma, ctx->enckeylen,
-			   CLASS_1 | KEY_DEST_CLASS_REG);
+		append_key(desc, ctx->cdata.key, ctx->cdata.keylen, CLASS_1 |
+			   KEY_DEST_CLASS_REG);
 	set_jump_tgt_here(desc, key_jump_cmd);
 
 	/* Class 1 operation */
-	append_operation(desc, ctx->class1_alg_type |
-			 OP_ALG_AS_INITFINAL | OP_ALG_DECRYPT | OP_ALG_ICV_ON);
+	append_operation(desc, ctx->cdata.algtype | OP_ALG_AS_INITFINAL |
+			 OP_ALG_DECRYPT | OP_ALG_ICV_ON);
 
 	append_math_sub_imm_u32(desc, VARSEQINLEN, REG3, IMM, 8);
 	append_math_add(desc, VARSEQOUTLEN, ZERO, REG3, CAAM_CMD_SZ);
@@ -1137,12 +1179,11 @@ static int rfc4543_set_sh_desc(struct crypto_aead *aead)
 {
 	struct caam_ctx *ctx = crypto_aead_ctx(aead);
 	struct device *jrdev = ctx->jrdev;
-	bool keys_fit_inline = false;
 	u32 *key_jump_cmd;
 	u32 *read_move_cmd, *write_move_cmd;
 	u32 *desc;
 
-	if (!ctx->enckeylen || !ctx->authsize)
+	if (!ctx->cdata.keylen || !ctx->authsize)
 		return 0;
 
 	/*
@@ -1151,8 +1192,13 @@ static int rfc4543_set_sh_desc(struct crypto_aead *aead)
 	 * must fit into the 64-word Descriptor h/w Buffer
 	 */
 	if (DESC_RFC4543_ENC_LEN + GCM_DESC_JOB_IO_LEN +
-	    ctx->enckeylen <= CAAM_DESC_BYTES_MAX)
-		keys_fit_inline = true;
+	    ctx->cdata.keylen <= CAAM_DESC_BYTES_MAX) {
+		ctx->cdata.key_inline = true;
+		ctx->cdata.key = (uintptr_t)ctx->key;
+	} else {
+		ctx->cdata.key_inline = false;
+		ctx->cdata.key = ctx->key_dma;
+	}
 
 	desc = ctx->sh_desc_enc;
 
@@ -1161,17 +1207,18 @@ static int rfc4543_set_sh_desc(struct crypto_aead *aead)
 	/* Skip key loading if it is loaded due to sharing */
 	key_jump_cmd = append_jump(desc, JUMP_JSL | JUMP_TEST_ALL |
 				   JUMP_COND_SHRD);
-	if (keys_fit_inline)
-		append_key_as_imm(desc, (void *)ctx->key, ctx->enckeylen,
-				  ctx->enckeylen, CLASS_1 | KEY_DEST_CLASS_REG);
+	if (ctx->cdata.key_inline)
+		append_key_as_imm(desc, (void *)ctx->cdata.key,
+				  ctx->cdata.keylen, ctx->cdata.keylen,
+				  CLASS_1 | KEY_DEST_CLASS_REG);
 	else
-		append_key(desc, ctx->key_dma, ctx->enckeylen,
-			   CLASS_1 | KEY_DEST_CLASS_REG);
+		append_key(desc, ctx->cdata.key, ctx->cdata.keylen, CLASS_1 |
+			   KEY_DEST_CLASS_REG);
 	set_jump_tgt_here(desc, key_jump_cmd);
 
 	/* Class 1 operation */
-	append_operation(desc, ctx->class1_alg_type |
-			 OP_ALG_AS_INITFINAL | OP_ALG_ENCRYPT);
+	append_operation(desc, ctx->cdata.algtype | OP_ALG_AS_INITFINAL |
+			 OP_ALG_ENCRYPT);
 
 	/* assoclen + cryptlen = seqinlen */
 	append_math_sub(desc, REG3, SEQINLEN, REG0, CAAM_CMD_SZ);
@@ -1222,10 +1269,14 @@ static int rfc4543_set_sh_desc(struct crypto_aead *aead)
 	 * Job Descriptor and Shared Descriptors
 	 * must all fit into the 64-word Descriptor h/w Buffer
 	 */
-	keys_fit_inline = false;
 	if (DESC_RFC4543_DEC_LEN + GCM_DESC_JOB_IO_LEN +
-	    ctx->enckeylen <= CAAM_DESC_BYTES_MAX)
-		keys_fit_inline = true;
+	    ctx->cdata.keylen <= CAAM_DESC_BYTES_MAX) {
+		ctx->cdata.key_inline = true;
+		ctx->cdata.key = (uintptr_t)ctx->key;
+	} else {
+		ctx->cdata.key_inline = false;
+		ctx->cdata.key = ctx->key_dma;
+	}
 
 	desc = ctx->sh_desc_dec;
 
@@ -1234,17 +1285,18 @@ static int rfc4543_set_sh_desc(struct crypto_aead *aead)
 	/* Skip key loading if it is loaded due to sharing */
 	key_jump_cmd = append_jump(desc, JUMP_JSL |
 				   JUMP_TEST_ALL | JUMP_COND_SHRD);
-	if (keys_fit_inline)
-		append_key_as_imm(desc, (void *)ctx->key, ctx->enckeylen,
-				  ctx->enckeylen, CLASS_1 | KEY_DEST_CLASS_REG);
+	if (ctx->cdata.key_inline)
+		append_key_as_imm(desc, (void *)ctx->cdata.key,
+				  ctx->cdata.keylen, ctx->cdata.keylen,
+				  CLASS_1 | KEY_DEST_CLASS_REG);
 	else
-		append_key(desc, ctx->key_dma, ctx->enckeylen,
-			   CLASS_1 | KEY_DEST_CLASS_REG);
+		append_key(desc, ctx->cdata.key, ctx->cdata.keylen, CLASS_1 |
+			   KEY_DEST_CLASS_REG);
 	set_jump_tgt_here(desc, key_jump_cmd);
 
 	/* Class 1 operation */
-	append_operation(desc, ctx->class1_alg_type |
-			 OP_ALG_AS_INITFINAL | OP_ALG_DECRYPT | OP_ALG_ICV_ON);
+	append_operation(desc, ctx->cdata.algtype | OP_ALG_AS_INITFINAL |
+			 OP_ALG_DECRYPT | OP_ALG_ICV_ON);
 
 	/* assoclen + cryptlen = seqoutlen */
 	append_math_sub(desc, REG3, SEQOUTLEN, REG0, CAAM_CMD_SZ);
@@ -1313,9 +1365,8 @@ static int rfc4543_setauthsize(struct crypto_aead *authenc,
 static u32 gen_split_aead_key(struct caam_ctx *ctx, const u8 *key_in,
 			      u32 authkeylen)
 {
-	return gen_split_key(ctx->jrdev, ctx->key, ctx->split_key_len,
-			       ctx->split_key_pad_len, key_in, authkeylen,
-			       ctx->alg_op);
+	return gen_split_key(ctx->jrdev, ctx->key, &ctx->adata, key_in,
+			     authkeylen, ctx->alg_op);
 }
 
 static int aead_setkey(struct crypto_aead *aead,
@@ -1332,11 +1383,11 @@ static int aead_setkey(struct crypto_aead *aead,
 		goto badkey;
 
 	/* Pick class 2 key length from algorithm submask */
-	ctx->split_key_len = mdpadlen[(ctx->alg_op & OP_ALG_ALGSEL_SUBMASK) >>
-				      OP_ALG_ALGSEL_SHIFT] * 2;
-	ctx->split_key_pad_len = ALIGN(ctx->split_key_len, 16);
+	ctx->adata.keylen = mdpadlen[(ctx->alg_op & OP_ALG_ALGSEL_SUBMASK) >>
+				     OP_ALG_ALGSEL_SHIFT] * 2;
+	ctx->adata.keylen_pad = ALIGN(ctx->adata.keylen, 16);
 
-	if (ctx->split_key_pad_len + keys.enckeylen > CAAM_MAX_KEY_SIZE)
+	if (ctx->adata.keylen_pad + keys.enckeylen > CAAM_MAX_KEY_SIZE)
 		goto badkey;
 
 #ifdef DEBUG
@@ -1344,7 +1395,7 @@ static int aead_setkey(struct crypto_aead *aead,
 	       keys.authkeylen + keys.enckeylen, keys.enckeylen,
 	       keys.authkeylen);
 	printk(KERN_ERR "split_key_len %d split_key_pad_len %d\n",
-	       ctx->split_key_len, ctx->split_key_pad_len);
+	       ctx->adata.keylen, ctx->adata.keylen_pad);
 	print_hex_dump(KERN_ERR, "key in @"__stringify(__LINE__)": ",
 		       DUMP_PREFIX_ADDRESS, 16, 4, key, keylen, 1);
 #endif
@@ -1355,9 +1406,9 @@ static int aead_setkey(struct crypto_aead *aead,
 	}
 
 	/* postpend encryption key to auth split key */
-	memcpy(ctx->key + ctx->split_key_pad_len, keys.enckey, keys.enckeylen);
+	memcpy(ctx->key + ctx->adata.keylen_pad, keys.enckey, keys.enckeylen);
 
-	ctx->key_dma = dma_map_single(jrdev, ctx->key, ctx->split_key_pad_len +
+	ctx->key_dma = dma_map_single(jrdev, ctx->key, ctx->adata.keylen_pad +
 				      keys.enckeylen, DMA_TO_DEVICE);
 	if (dma_mapping_error(jrdev, ctx->key_dma)) {
 		dev_err(jrdev, "unable to map key i/o memory\n");
@@ -1366,14 +1417,14 @@ static int aead_setkey(struct crypto_aead *aead,
 #ifdef DEBUG
 	print_hex_dump(KERN_ERR, "ctx.key@"__stringify(__LINE__)": ",
 		       DUMP_PREFIX_ADDRESS, 16, 4, ctx->key,
-		       ctx->split_key_pad_len + keys.enckeylen, 1);
+		       ctx->adata.keylen_pad + keys.enckeylen, 1);
 #endif
 
-	ctx->enckeylen = keys.enckeylen;
+	ctx->cdata.keylen = keys.enckeylen;
 
 	ret = aead_set_sh_desc(aead);
 	if (ret) {
-		dma_unmap_single(jrdev, ctx->key_dma, ctx->split_key_pad_len +
+		dma_unmap_single(jrdev, ctx->key_dma, ctx->adata.keylen_pad +
 				 keys.enckeylen, DMA_TO_DEVICE);
 	}
 
@@ -1402,11 +1453,11 @@ static int gcm_setkey(struct crypto_aead *aead,
 		dev_err(jrdev, "unable to map key i/o memory\n");
 		return -ENOMEM;
 	}
-	ctx->enckeylen = keylen;
+	ctx->cdata.keylen = keylen;
 
 	ret = gcm_set_sh_desc(aead);
 	if (ret) {
-		dma_unmap_single(jrdev, ctx->key_dma, ctx->enckeylen,
+		dma_unmap_single(jrdev, ctx->key_dma, ctx->cdata.keylen,
 				 DMA_TO_DEVICE);
 	}
 
@@ -1434,9 +1485,9 @@ static int rfc4106_setkey(struct crypto_aead *aead,
 	 * The last four bytes of the key material are used as the salt value
 	 * in the nonce. Update the AES key length.
 	 */
-	ctx->enckeylen = keylen - 4;
+	ctx->cdata.keylen = keylen - 4;
 
-	ctx->key_dma = dma_map_single(jrdev, ctx->key, ctx->enckeylen,
+	ctx->key_dma = dma_map_single(jrdev, ctx->key, ctx->cdata.keylen,
 				      DMA_TO_DEVICE);
 	if (dma_mapping_error(jrdev, ctx->key_dma)) {
 		dev_err(jrdev, "unable to map key i/o memory\n");
@@ -1445,7 +1496,7 @@ static int rfc4106_setkey(struct crypto_aead *aead,
 
 	ret = rfc4106_set_sh_desc(aead);
 	if (ret) {
-		dma_unmap_single(jrdev, ctx->key_dma, ctx->enckeylen,
+		dma_unmap_single(jrdev, ctx->key_dma, ctx->cdata.keylen,
 				 DMA_TO_DEVICE);
 	}
 
@@ -1473,9 +1524,9 @@ static int rfc4543_setkey(struct crypto_aead *aead,
 	 * The last four bytes of the key material are used as the salt value
 	 * in the nonce. Update the AES key length.
 	 */
-	ctx->enckeylen = keylen - 4;
+	ctx->cdata.keylen = keylen - 4;
 
-	ctx->key_dma = dma_map_single(jrdev, ctx->key, ctx->enckeylen,
+	ctx->key_dma = dma_map_single(jrdev, ctx->key, ctx->cdata.keylen,
 				      DMA_TO_DEVICE);
 	if (dma_mapping_error(jrdev, ctx->key_dma)) {
 		dev_err(jrdev, "unable to map key i/o memory\n");
@@ -1484,7 +1535,7 @@ static int rfc4543_setkey(struct crypto_aead *aead,
 
 	ret = rfc4543_set_sh_desc(aead);
 	if (ret) {
-		dma_unmap_single(jrdev, ctx->key_dma, ctx->enckeylen,
+		dma_unmap_single(jrdev, ctx->key_dma, ctx->cdata.keylen,
 				 DMA_TO_DEVICE);
 	}
 
@@ -1505,7 +1556,7 @@ static int ablkcipher_setkey(struct crypto_ablkcipher *ablkcipher,
 	u8 *nonce;
 	u32 geniv;
 	u32 ctx1_iv_off = 0;
-	const bool ctr_mode = ((ctx->class1_alg_type & OP_ALG_AAI_MASK) ==
+	const bool ctr_mode = ((ctx->cdata.algtype & OP_ALG_AAI_MASK) ==
 			       OP_ALG_AAI_CTR_MOD128);
 	const bool is_rfc3686 = (ctr_mode &&
 				 (strstr(alg_name, "rfc3686") != NULL));
@@ -1539,7 +1590,9 @@ static int ablkcipher_setkey(struct crypto_ablkcipher *ablkcipher,
 		dev_err(jrdev, "unable to map key i/o memory\n");
 		return -ENOMEM;
 	}
-	ctx->enckeylen = keylen;
+	ctx->cdata.keylen = keylen;
+	ctx->cdata.key = (uintptr_t)ctx->key;
+	ctx->cdata.key_inline = true;
 
 	/* ablkcipher_encrypt shared descriptor */
 	desc = ctx->sh_desc_enc;
@@ -1549,9 +1602,8 @@ static int ablkcipher_setkey(struct crypto_ablkcipher *ablkcipher,
 				   JUMP_COND_SHRD);
 
 	/* Load class1 key only */
-	append_key_as_imm(desc, (void *)ctx->key, ctx->enckeylen,
-			  ctx->enckeylen, CLASS_1 |
-			  KEY_DEST_CLASS_REG);
+	append_key_as_imm(desc, (void *)ctx->cdata.key, ctx->cdata.keylen,
+			  ctx->cdata.keylen, CLASS_1 | KEY_DEST_CLASS_REG);
 
 	/* Load nonce into CONTEXT1 reg */
 	if (is_rfc3686) {
@@ -1580,8 +1632,8 @@ static int ablkcipher_setkey(struct crypto_ablkcipher *ablkcipher,
 				      LDST_OFFSET_SHIFT));
 
 	/* Load operation */
-	append_operation(desc, ctx->class1_alg_type |
-			 OP_ALG_AS_INITFINAL | OP_ALG_ENCRYPT);
+	append_operation(desc, ctx->cdata.algtype | OP_ALG_AS_INITFINAL |
+			 OP_ALG_ENCRYPT);
 
 	/* Perform operation */
 	ablkcipher_append_src_dst(desc);
@@ -1608,9 +1660,8 @@ static int ablkcipher_setkey(struct crypto_ablkcipher *ablkcipher,
 				   JUMP_COND_SHRD);
 
 	/* Load class1 key only */
-	append_key_as_imm(desc, (void *)ctx->key, ctx->enckeylen,
-			  ctx->enckeylen, CLASS_1 |
-			  KEY_DEST_CLASS_REG);
+	append_key_as_imm(desc, (void *)ctx->cdata.key, ctx->cdata.keylen,
+			  ctx->cdata.keylen, CLASS_1 | KEY_DEST_CLASS_REG);
 
 	/* Load nonce into CONTEXT1 reg */
 	if (is_rfc3686) {
@@ -1640,10 +1691,10 @@ static int ablkcipher_setkey(struct crypto_ablkcipher *ablkcipher,
 
 	/* Choose operation */
 	if (ctr_mode)
-		append_operation(desc, ctx->class1_alg_type |
+		append_operation(desc, ctx->cdata.algtype |
 				 OP_ALG_AS_INITFINAL | OP_ALG_DECRYPT);
 	else
-		append_dec_op1(desc, ctx->class1_alg_type);
+		append_dec_op1(desc, ctx->cdata.algtype);
 
 	/* Perform operation */
 	ablkcipher_append_src_dst(desc);
@@ -1671,9 +1722,8 @@ static int ablkcipher_setkey(struct crypto_ablkcipher *ablkcipher,
 				   JUMP_COND_SHRD);
 
 	/* Load class1 key only */
-	append_key_as_imm(desc, (void *)ctx->key, ctx->enckeylen,
-			  ctx->enckeylen, CLASS_1 |
-			  KEY_DEST_CLASS_REG);
+	append_key_as_imm(desc, (void *)ctx->cdata.key, ctx->cdata.keylen,
+			  ctx->cdata.keylen, CLASS_1 | KEY_DEST_CLASS_REG);
 
 	/* Load Nonce into CONTEXT1 reg */
 	if (is_rfc3686) {
@@ -1720,8 +1770,8 @@ static int ablkcipher_setkey(struct crypto_ablkcipher *ablkcipher,
 			    (1 << JUMP_OFFSET_SHIFT));
 
 	/* Load operation */
-	append_operation(desc, ctx->class1_alg_type |
-			 OP_ALG_AS_INITFINAL | OP_ALG_ENCRYPT);
+	append_operation(desc, ctx->cdata.algtype | OP_ALG_AS_INITFINAL |
+			 OP_ALG_ENCRYPT);
 
 	/* Perform operation */
 	ablkcipher_append_src_dst(desc);
@@ -1764,7 +1814,9 @@ static int xts_ablkcipher_setkey(struct crypto_ablkcipher *ablkcipher,
 		dev_err(jrdev, "unable to map key i/o memory\n");
 		return -ENOMEM;
 	}
-	ctx->enckeylen = keylen;
+	ctx->cdata.keylen = keylen;
+	ctx->cdata.key = (uintptr_t)ctx->key;
+	ctx->cdata.key_inline = true;
 
 	/* xts_ablkcipher_encrypt shared descriptor */
 	desc = ctx->sh_desc_enc;
@@ -1774,8 +1826,8 @@ static int xts_ablkcipher_setkey(struct crypto_ablkcipher *ablkcipher,
 				   JUMP_COND_SHRD);
 
 	/* Load class1 keys only */
-	append_key_as_imm(desc, (void *)ctx->key, ctx->enckeylen,
-			  ctx->enckeylen, CLASS_1 | KEY_DEST_CLASS_REG);
+	append_key_as_imm(desc, (void *)ctx->cdata.key, ctx->cdata.keylen,
+			  ctx->cdata.keylen, CLASS_1 | KEY_DEST_CLASS_REG);
 
 	/* Load sector size with index 40 bytes (0x28) */
 	append_cmd(desc, CMD_LOAD | IMMEDIATE | LDST_SRCDST_BYTE_CONTEXT |
@@ -1794,7 +1846,7 @@ static int xts_ablkcipher_setkey(struct crypto_ablkcipher *ablkcipher,
 	append_seq_fifo_load(desc, 8, FIFOLD_CLASS_SKIP);
 
 	/* Load operation */
-	append_operation(desc, ctx->class1_alg_type | OP_ALG_AS_INITFINAL |
+	append_operation(desc, ctx->cdata.algtype | OP_ALG_AS_INITFINAL |
 			 OP_ALG_ENCRYPT);
 
 	/* Perform operation */
@@ -1821,8 +1873,8 @@ static int xts_ablkcipher_setkey(struct crypto_ablkcipher *ablkcipher,
 				   JUMP_COND_SHRD);
 
 	/* Load class1 key only */
-	append_key_as_imm(desc, (void *)ctx->key, ctx->enckeylen,
-			  ctx->enckeylen, CLASS_1 | KEY_DEST_CLASS_REG);
+	append_key_as_imm(desc, (void *)ctx->cdata.key, ctx->cdata.keylen,
+			  ctx->cdata.keylen, CLASS_1 | KEY_DEST_CLASS_REG);
 
 	/* Load sector size with index 40 bytes (0x28) */
 	append_cmd(desc, CMD_LOAD | IMMEDIATE | LDST_SRCDST_BYTE_CONTEXT |
@@ -1841,7 +1893,7 @@ static int xts_ablkcipher_setkey(struct crypto_ablkcipher *ablkcipher,
 	append_seq_fifo_load(desc, 8, FIFOLD_CLASS_SKIP);
 
 	/* Load operation */
-	append_dec_op1(desc, ctx->class1_alg_type);
+	append_dec_op1(desc, ctx->cdata.algtype);
 
 	/* Perform operation */
 	ablkcipher_append_src_dst(desc);
@@ -2141,7 +2193,7 @@ static void init_gcm_job(struct aead_request *req,
 			 FIFOLD_TYPE_IV | FIFOLD_TYPE_FLUSH1 | 12 | last);
 	/* Append Salt */
 	if (!generic_gcm)
-		append_data(desc, ctx->key + ctx->enckeylen, 4);
+		append_data(desc, ctx->key + ctx->cdata.keylen, 4);
 	/* Append IV */
 	append_data(desc, req->iv, ivsize);
 	/* End of blank commands */
@@ -2156,7 +2208,7 @@ static void init_authenc_job(struct aead_request *req,
 						 struct caam_aead_alg, aead);
 	unsigned int ivsize = crypto_aead_ivsize(aead);
 	struct caam_ctx *ctx = crypto_aead_ctx(aead);
-	const bool ctr_mode = ((ctx->class1_alg_type & OP_ALG_AAI_MASK) ==
+	const bool ctr_mode = ((ctx->cdata.algtype & OP_ALG_AAI_MASK) ==
 			       OP_ALG_AAI_CTR_MOD128);
 	const bool is_rfc3686 = alg->caam.rfc3686;
 	u32 *desc = edesc->hw_desc;
@@ -4395,8 +4447,8 @@ static int caam_init_common(struct caam_ctx *ctx, struct caam_alg_entry *caam)
 	}
 
 	/* copy descriptor header template value */
-	ctx->class1_alg_type = OP_TYPE_CLASS1_ALG | caam->class1_alg_type;
-	ctx->class2_alg_type = OP_TYPE_CLASS2_ALG | caam->class2_alg_type;
+	ctx->cdata.algtype = OP_TYPE_CLASS1_ALG | caam->class1_alg_type;
+	ctx->adata.algtype = OP_TYPE_CLASS2_ALG | caam->class2_alg_type;
 	ctx->alg_op = OP_TYPE_CLASS2_ALG | caam->alg_op;
 
 	return 0;
@@ -4440,7 +4492,7 @@ static void caam_exit_common(struct caam_ctx *ctx)
 	if (ctx->key_dma &&
 	    !dma_mapping_error(ctx->jrdev, ctx->key_dma))
 		dma_unmap_single(ctx->jrdev, ctx->key_dma,
-				 ctx->enckeylen + ctx->split_key_pad_len,
+				 ctx->cdata.keylen + ctx->adata.keylen_pad,
 				 DMA_TO_DEVICE);
 
 	caam_jr_free(ctx->jrdev);
diff --git a/drivers/crypto/caam/caamhash.c b/drivers/crypto/caam/caamhash.c
index 86f360853502..5e569ff06b4b 100644
--- a/drivers/crypto/caam/caamhash.c
+++ b/drivers/crypto/caam/caamhash.c
@@ -108,13 +108,11 @@ struct caam_hash_ctx {
 	dma_addr_t sh_desc_fin_dma;
 	dma_addr_t sh_desc_digest_dma;
 	struct device *jrdev;
-	u32 alg_type;
 	u32 alg_op;
 	u8 key[CAAM_MAX_HASH_KEY_SIZE];
 	dma_addr_t key_dma;
 	int ctx_len;
-	unsigned int split_key_len;
-	unsigned int split_key_pad_len;
+	struct alginfo adata;
 };
 
 /* ahash state */
@@ -223,9 +221,9 @@ static inline int ctx_map_to_sec4_sg(u32 *desc, struct device *jrdev,
 /* Common shared descriptor commands */
 static inline void append_key_ahash(u32 *desc, struct caam_hash_ctx *ctx)
 {
-	append_key_as_imm(desc, ctx->key, ctx->split_key_pad_len,
-			  ctx->split_key_len, CLASS_2 |
-			  KEY_DEST_MDHA_SPLIT | KEY_ENC);
+	append_key_as_imm(desc, ctx->key, ctx->adata.keylen_pad,
+			  ctx->adata.keylen, CLASS_2 | KEY_DEST_MDHA_SPLIT |
+			  KEY_ENC);
 }
 
 /* Append key if it has been set */
@@ -235,7 +233,7 @@ static inline void init_sh_desc_key_ahash(u32 *desc, struct caam_hash_ctx *ctx)
 
 	init_sh_desc(desc, HDR_SHARE_SERIAL);
 
-	if (ctx->split_key_len) {
+	if (ctx->adata.keylen) {
 		/* Skip if already shared */
 		key_jump_cmd = append_jump(desc, JUMP_JSL | JUMP_TEST_ALL |
 					   JUMP_COND_SHRD);
@@ -310,7 +308,7 @@ static int ahash_set_sh_desc(struct crypto_ahash *ahash)
 	u32 have_key = 0;
 	u32 *desc;
 
-	if (ctx->split_key_len)
+	if (ctx->adata.keylen)
 		have_key = OP_ALG_AAI_HMAC_PRECOMP;
 
 	/* ahash_update shared descriptor */
@@ -323,7 +321,7 @@ static int ahash_set_sh_desc(struct crypto_ahash *ahash)
 		   LDST_CLASS_2_CCB | ctx->ctx_len);
 
 	/* Class 2 operation */
-	append_operation(desc, ctx->alg_type | OP_ALG_AS_UPDATE |
+	append_operation(desc, ctx->adata.algtype | OP_ALG_AS_UPDATE |
 			 OP_ALG_ENCRYPT);
 
 	/* Load data and write to result or context */
@@ -344,7 +342,7 @@ static int ahash_set_sh_desc(struct crypto_ahash *ahash)
 	/* ahash_update_first shared descriptor */
 	desc = ctx->sh_desc_update_first;
 
-	ahash_data_to_out(desc, have_key | ctx->alg_type, OP_ALG_AS_INIT,
+	ahash_data_to_out(desc, have_key | ctx->adata.algtype, OP_ALG_AS_INIT,
 			  ctx->ctx_len, ctx);
 
 	ctx->sh_desc_update_first_dma = dma_map_single(jrdev, desc,
@@ -363,7 +361,7 @@ static int ahash_set_sh_desc(struct crypto_ahash *ahash)
 	/* ahash_final shared descriptor */
 	desc = ctx->sh_desc_fin;
 
-	ahash_ctx_data_to_out(desc, have_key | ctx->alg_type,
+	ahash_ctx_data_to_out(desc, have_key | ctx->adata.algtype,
 			      OP_ALG_AS_FINALIZE, digestsize, ctx);
 
 	ctx->sh_desc_fin_dma = dma_map_single(jrdev, desc, desc_bytes(desc),
@@ -381,8 +379,8 @@ static int ahash_set_sh_desc(struct crypto_ahash *ahash)
 	/* ahash_digest shared descriptor */
 	desc = ctx->sh_desc_digest;
 
-	ahash_data_to_out(desc, have_key | ctx->alg_type, OP_ALG_AS_INITFINAL,
-			  digestsize, ctx);
+	ahash_data_to_out(desc, have_key | ctx->adata.algtype,
+			  OP_ALG_AS_INITFINAL, digestsize, ctx);
 
 	ctx->sh_desc_digest_dma = dma_map_single(jrdev, desc,
 						 desc_bytes(desc),
@@ -404,9 +402,8 @@ static int ahash_set_sh_desc(struct crypto_ahash *ahash)
 static int gen_split_hash_key(struct caam_hash_ctx *ctx, const u8 *key_in,
 			      u32 keylen)
 {
-	return gen_split_key(ctx->jrdev, ctx->key, ctx->split_key_len,
-			       ctx->split_key_pad_len, key_in, keylen,
-			       ctx->alg_op);
+	return gen_split_key(ctx->jrdev, ctx->key, &ctx->adata, key_in, keylen,
+			     ctx->alg_op);
 }
 
 /* Digest hash size if it is too large */
@@ -444,7 +441,7 @@ static int hash_digest_key(struct caam_hash_ctx *ctx, const u8 *key_in,
 	}
 
 	/* Job descriptor to perform unkeyed hash on key_in */
-	append_operation(desc, ctx->alg_type | OP_ALG_ENCRYPT |
+	append_operation(desc, ctx->adata.algtype | OP_ALG_ENCRYPT |
 			 OP_ALG_AS_INITFINAL);
 	append_seq_in_ptr(desc, src_dma, *keylen, 0);
 	append_seq_fifo_load(desc, *keylen, FIFOLD_CLASS_CLASS2 |
@@ -515,13 +512,13 @@ static int ahash_setkey(struct crypto_ahash *ahash,
 	}
 
 	/* Pick class 2 key length from algorithm submask */
-	ctx->split_key_len = mdpadlen[(ctx->alg_op & OP_ALG_ALGSEL_SUBMASK) >>
-				      OP_ALG_ALGSEL_SHIFT] * 2;
-	ctx->split_key_pad_len = ALIGN(ctx->split_key_len, 16);
+	ctx->adata.keylen = mdpadlen[(ctx->alg_op & OP_ALG_ALGSEL_SUBMASK) >>
+				     OP_ALG_ALGSEL_SHIFT] * 2;
+	ctx->adata.keylen_pad = ALIGN(ctx->adata.keylen, 16);
 
 #ifdef DEBUG
 	printk(KERN_ERR "split_key_len %d split_key_pad_len %d\n",
-	       ctx->split_key_len, ctx->split_key_pad_len);
+	       ctx->adata.keylen, ctx->adata.keylen_pad);
 	print_hex_dump(KERN_ERR, "key in @"__stringify(__LINE__)": ",
 		       DUMP_PREFIX_ADDRESS, 16, 4, key, keylen, 1);
 #endif
@@ -530,7 +527,7 @@ static int ahash_setkey(struct crypto_ahash *ahash,
 	if (ret)
 		goto bad_free_key;
 
-	ctx->key_dma = dma_map_single(jrdev, ctx->key, ctx->split_key_pad_len,
+	ctx->key_dma = dma_map_single(jrdev, ctx->key, ctx->adata.keylen_pad,
 				      DMA_TO_DEVICE);
 	if (dma_mapping_error(jrdev, ctx->key_dma)) {
 		dev_err(jrdev, "unable to map key i/o memory\n");
@@ -540,14 +537,15 @@ static int ahash_setkey(struct crypto_ahash *ahash,
 #ifdef DEBUG
 	print_hex_dump(KERN_ERR, "ctx.key@"__stringify(__LINE__)": ",
 		       DUMP_PREFIX_ADDRESS, 16, 4, ctx->key,
-		       ctx->split_key_pad_len, 1);
+		       ctx->adata.keylen_pad, 1);
 #endif
 
 	ret = ahash_set_sh_desc(ahash);
 	if (ret) {
-		dma_unmap_single(jrdev, ctx->key_dma, ctx->split_key_pad_len,
+		dma_unmap_single(jrdev, ctx->key_dma, ctx->adata.keylen_pad,
 				 DMA_TO_DEVICE);
 	}
+
  error_free_key:
 	kfree(hashed_key);
 	return ret;
@@ -1832,7 +1830,7 @@ static int caam_hash_cra_init(struct crypto_tfm *tfm)
 		return PTR_ERR(ctx->jrdev);
 	}
 	/* copy descriptor header template value */
-	ctx->alg_type = OP_TYPE_CLASS2_ALG | caam_hash->alg_type;
+	ctx->adata.algtype = OP_TYPE_CLASS2_ALG | caam_hash->alg_type;
 	ctx->alg_op = OP_TYPE_CLASS2_ALG | caam_hash->alg_op;
 
 	ctx->ctx_len = runninglen[(ctx->alg_op & OP_ALG_ALGSEL_SUBMASK) >>
diff --git a/drivers/crypto/caam/desc_constr.h b/drivers/crypto/caam/desc_constr.h
index 354da735af62..bfef7952dfb7 100644
--- a/drivers/crypto/caam/desc_constr.h
+++ b/drivers/crypto/caam/desc_constr.h
@@ -430,3 +430,22 @@ do { \
 	APPEND_MATH_IMM_u64(LSHIFT, desc, dest, src0, src1, data)
 #define append_math_rshift_imm_u64(desc, dest, src0, src1, data) \
 	APPEND_MATH_IMM_u64(RSHIFT, desc, dest, src0, src1, data)
+
+/**
+ * struct alginfo - Container for algorithm details
+ * @algtype: algorithm selector; for valid values, see documentation of the
+ *           functions where it is used.
+ * @keylen: length of the provided algorithm key, in bytes
+ * @keylen_pad: padded length of the provided algorithm key, in bytes
+ * @key: address where algorithm key resides; virtual address if key_inline
+ *       is true, dma (bus) address if key_inline is false.
+ * @key_inline: true - key can be inlined in the descriptor; false - key is
+ *              referenced by the descriptor
+ */
+struct alginfo {
+	u32 algtype;
+	unsigned int keylen;
+	unsigned int keylen_pad;
+	u64 key;
+	bool key_inline;
+};
diff --git a/drivers/crypto/caam/key_gen.c b/drivers/crypto/caam/key_gen.c
index e1eaf4ff9762..df287e751df1 100644
--- a/drivers/crypto/caam/key_gen.c
+++ b/drivers/crypto/caam/key_gen.c
@@ -41,8 +41,8 @@ Split key generation-----------------------------------------------
 [06] 0x64260028    fifostr: class2 mdsplit-jdk len=40
 			@0xffe04000
 */
-int gen_split_key(struct device *jrdev, u8 *key_out, int split_key_len,
-		  int split_key_pad_len, const u8 *key_in, u32 keylen,
+int gen_split_key(struct device *jrdev, u8 *key_out,
+		  struct alginfo * const adata, const u8 *key_in, u32 keylen,
 		  u32 alg_op)
 {
 	u32 *desc;
@@ -63,7 +63,7 @@ int gen_split_key(struct device *jrdev, u8 *key_out, int split_key_len,
 		goto out_free;
 	}
 
-	dma_addr_out = dma_map_single(jrdev, key_out, split_key_pad_len,
+	dma_addr_out = dma_map_single(jrdev, key_out, adata->keylen_pad,
 				      DMA_FROM_DEVICE);
 	if (dma_mapping_error(jrdev, dma_addr_out)) {
 		dev_err(jrdev, "unable to map key output memory\n");
@@ -87,7 +87,7 @@ int gen_split_key(struct device *jrdev, u8 *key_out, int split_key_len,
 	 * FIFO_STORE with the explicit split-key content store
 	 * (0x26 output type)
 	 */
-	append_fifo_store(desc, dma_addr_out, split_key_len,
+	append_fifo_store(desc, dma_addr_out, adata->keylen,
 			  LDST_CLASS_2_CCB | FIFOST_TYPE_SPLIT_KEK);
 
 #ifdef DEBUG
@@ -108,11 +108,11 @@ int gen_split_key(struct device *jrdev, u8 *key_out, int split_key_len,
 #ifdef DEBUG
 		print_hex_dump(KERN_ERR, "ctx.key@"__stringify(__LINE__)": ",
 			       DUMP_PREFIX_ADDRESS, 16, 4, key_out,
-			       split_key_pad_len, 1);
+			       adata->keylen_pad, 1);
 #endif
 	}
 
-	dma_unmap_single(jrdev, dma_addr_out, split_key_pad_len,
+	dma_unmap_single(jrdev, dma_addr_out, adata->keylen_pad,
 			 DMA_FROM_DEVICE);
 out_unmap_in:
 	dma_unmap_single(jrdev, dma_addr_in, keylen, DMA_TO_DEVICE);
diff --git a/drivers/crypto/caam/key_gen.h b/drivers/crypto/caam/key_gen.h
index c5588f6d8109..511882af0596 100644
--- a/drivers/crypto/caam/key_gen.h
+++ b/drivers/crypto/caam/key_gen.h
@@ -12,6 +12,6 @@ struct split_key_result {
 
 void split_key_done(struct device *dev, u32 *desc, u32 err, void *context);
 
-int gen_split_key(struct device *jrdev, u8 *key_out, int split_key_len,
-		    int split_key_pad_len, const u8 *key_in, u32 keylen,
-		    u32 alg_op);
+int gen_split_key(struct device *jrdev, u8 *key_out,
+		  struct alginfo * const adata, const u8 *key_in, u32 keylen,
+		  u32 alg_op);
-- 
2.4.4

^ permalink raw reply related

* [PATCH 10/10] MAINTAINERS: add maintainers for caam crypto driver
From: Horia Geantă @ 2016-11-22 13:44 UTC (permalink / raw)
  To: Herbert Xu
  Cc: David S. Miller, linux-crypto, Dan Douglass, Tudor Ambarus,
	Alexandru Porosanu
In-Reply-To: <1479822252-23833-1-git-send-email-horia.geanta@nxp.com>

Add myself and Dan as maintainers of the caam crypto driver.

Cc: Dan Douglass <dan.douglass@nxp.com>
Signed-off-by: Horia Geantă <horia.geanta@nxp.com>
---
 MAINTAINERS | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 1cd38a7e0064..e28aab4d525a 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4972,6 +4972,14 @@ F:	include/linux/fb.h
 F:	include/uapi/video/
 F:	include/uapi/linux/fb.h
 
+FREESCALE CAAM (Cryptographic Acceleration and Assurance Module) DRIVER
+M:	Horia Geantă <horia.geanta@nxp.com>
+M:	Dan Douglass <dan.douglass@nxp.com>
+L:	linux-crypto@vger.kernel.org
+S:	Maintained
+F:	drivers/crypto/caam/
+F:	Documentation/devicetree/bindings/crypto/fsl-sec4.txt
+
 FREESCALE DIU FRAMEBUFFER DRIVER
 M:	Timur Tabi <timur@tabi.org>
 L:	linux-fbdev@vger.kernel.org
-- 
2.4.4

^ permalink raw reply related

* [PATCH 01/10] crypto: caam - move append_key_aead() into init_sh_desc_key_aead()
From: Horia Geantă @ 2016-11-22 13:44 UTC (permalink / raw)
  To: Herbert Xu
  Cc: David S. Miller, linux-crypto, Dan Douglass, Tudor Ambarus,
	Alexandru Porosanu
In-Reply-To: <1479822252-23833-1-git-send-email-horia.geanta@nxp.com>

append_key_aead() is used in only one place, thus inline it.

Signed-off-by: Horia Geantă <horia.geanta@nxp.com>
---
 drivers/crypto/caam/caamalg.c | 30 ++++++++++++------------------
 1 file changed, 12 insertions(+), 18 deletions(-)

diff --git a/drivers/crypto/caam/caamalg.c b/drivers/crypto/caam/caamalg.c
index 8db54b090d39..567e234fb49b 100644
--- a/drivers/crypto/caam/caamalg.c
+++ b/drivers/crypto/caam/caamalg.c
@@ -228,12 +228,19 @@ struct caam_ctx {
 	unsigned int authsize;
 };
 
-static void append_key_aead(u32 *desc, struct caam_ctx *ctx,
-			    int keys_fit_inline, bool is_rfc3686)
+static void init_sh_desc_key_aead(u32 *desc, struct caam_ctx *ctx,
+				  int keys_fit_inline, bool is_rfc3686)
 {
-	u32 *nonce;
+	u32 *key_jump_cmd;
 	unsigned int enckeylen = ctx->enckeylen;
 
+	/* Note: Context registers are saved. */
+	init_sh_desc(desc, HDR_SHARE_SERIAL | HDR_SAVECTX);
+
+	/* Skip if already shared */
+	key_jump_cmd = append_jump(desc, JUMP_JSL | JUMP_TEST_ALL |
+				   JUMP_COND_SHRD);
+
 	/*
 	 * RFC3686 specific:
 	 *	| ctx->key = {AUTH_KEY, ENC_KEY, NONCE}
@@ -258,6 +265,8 @@ static void append_key_aead(u32 *desc, struct caam_ctx *ctx,
 
 	/* Load Counter into CONTEXT1 reg */
 	if (is_rfc3686) {
+		u32 *nonce;
+
 		nonce = (u32 *)((void *)ctx->key + ctx->split_key_pad_len +
 			       enckeylen);
 		append_load_as_imm(desc, nonce, CTR_RFC3686_NONCE_SIZE,
@@ -269,21 +278,6 @@ static void append_key_aead(u32 *desc, struct caam_ctx *ctx,
 			    (16 << MOVE_OFFSET_SHIFT) |
 			    (CTR_RFC3686_NONCE_SIZE << MOVE_LEN_SHIFT));
 	}
-}
-
-static void init_sh_desc_key_aead(u32 *desc, struct caam_ctx *ctx,
-				  int keys_fit_inline, bool is_rfc3686)
-{
-	u32 *key_jump_cmd;
-
-	/* Note: Context registers are saved. */
-	init_sh_desc(desc, HDR_SHARE_SERIAL | HDR_SAVECTX);
-
-	/* Skip if already shared */
-	key_jump_cmd = append_jump(desc, JUMP_JSL | JUMP_TEST_ALL |
-				   JUMP_COND_SHRD);
-
-	append_key_aead(desc, ctx, keys_fit_inline, is_rfc3686);
 
 	set_jump_tgt_here(desc, key_jump_cmd);
 }
-- 
2.4.4

^ permalink raw reply related

* [PATCH 00/10] crypto: caam - shared descriptor generation refactoring
From: Horia Geantă @ 2016-11-22 13:44 UTC (permalink / raw)
  To: Herbert Xu
  Cc: David S. Miller, linux-crypto, Dan Douglass, Tudor Ambarus,
	Alexandru Porosanu

In preparation for the caam/qi (Queue Interface) driver, the generation
of the shared descriptors is abstracted and exported.
This way the existing caam/jr (Job Ring) and caam/qi drivers will have
a common base of descriptors.

Thanks,
Horia

Horia Geantă (10):
  crypto: caam - move append_key_aead() into init_sh_desc_key_aead()
  crypto: caam - group algorithm related params
  crypto: caam - remove superfluous alg_op algorithm param
  crypto: caam - improve key inlining
  crypto: caam - rewrite some generic inline append cmds
  crypto: caam - remove unneded dependencies on CRYPTO_DEV_FSL_CAAM
  crypto: caam - refactor encryption descriptors generation
  crypto: caam - consolidate split key length computation
  crypto: caam - refactor ahash shared descriptor generation
  MAINTAINERS: add maintainers for caam crypto driver

 MAINTAINERS                        |    8 +
 drivers/crypto/caam/Kconfig        |   11 +-
 drivers/crypto/caam/Makefile       |    1 +
 drivers/crypto/caam/caamalg.c      | 1421 +++++-------------------------------
 drivers/crypto/caam/caamalg_desc.c | 1302 +++++++++++++++++++++++++++++++++
 drivers/crypto/caam/caamalg_desc.h |   97 +++
 drivers/crypto/caam/caamhash.c     |  184 ++---
 drivers/crypto/caam/desc_constr.h  |   58 ++
 drivers/crypto/caam/key_gen.c      |   62 +-
 drivers/crypto/caam/key_gen.h      |    6 +-
 10 files changed, 1767 insertions(+), 1383 deletions(-)
 create mode 100644 drivers/crypto/caam/caamalg_desc.c
 create mode 100644 drivers/crypto/caam/caamalg_desc.h

-- 
2.4.4

^ permalink raw reply

* [PATCH 09/10] crypto: caam - refactor ahash shared descriptor generation
From: Horia Geantă @ 2016-11-22 13:44 UTC (permalink / raw)
  To: Herbert Xu
  Cc: David S. Miller, linux-crypto, Dan Douglass, Tudor Ambarus,
	Alexandru Porosanu
In-Reply-To: <1479822252-23833-1-git-send-email-horia.geanta@nxp.com>

Move ahash shared descriptor generation into a single function.
Currently there is no plan to support ahash on any other interface
besides the Job Ring, thus for now the functionality is not exported.

Signed-off-by: Horia Geantă <horia.geanta@nxp.com>
---
 drivers/crypto/caam/caamhash.c | 130 ++++++++++++-----------------------------
 1 file changed, 36 insertions(+), 94 deletions(-)

diff --git a/drivers/crypto/caam/caamhash.c b/drivers/crypto/caam/caamhash.c
index 505007d0277c..e58639ea53b1 100644
--- a/drivers/crypto/caam/caamhash.c
+++ b/drivers/crypto/caam/caamhash.c
@@ -217,86 +217,54 @@ static inline int ctx_map_to_sec4_sg(u32 *desc, struct device *jrdev,
 	return 0;
 }
 
-/* Common shared descriptor commands */
-static inline void append_key_ahash(u32 *desc, struct caam_hash_ctx *ctx)
-{
-	append_key_as_imm(desc, ctx->key, ctx->adata.keylen_pad,
-			  ctx->adata.keylen, CLASS_2 | KEY_DEST_MDHA_SPLIT |
-			  KEY_ENC);
-}
-
-/* Append key if it has been set */
-static inline void init_sh_desc_key_ahash(u32 *desc, struct caam_hash_ctx *ctx)
-{
-	u32 *key_jump_cmd;
-
-	init_sh_desc(desc, HDR_SHARE_SERIAL);
-
-	if (ctx->adata.keylen) {
-		/* Skip if already shared */
-		key_jump_cmd = append_jump(desc, JUMP_JSL | JUMP_TEST_ALL |
-					   JUMP_COND_SHRD);
-
-		append_key_ahash(desc, ctx);
-
-		set_jump_tgt_here(desc, key_jump_cmd);
-	}
-}
-
 /*
- * For ahash read data from seqin following state->caam_ctx,
- * and write resulting class2 context to seqout, which may be state->caam_ctx
- * or req->result
+ * For ahash update, final and finup (import_ctx = true)
+ *     import context, read and write to seqout
+ * For ahash firsts and digest (import_ctx = false)
+ *     read and write to seqout
  */
-static inline void ahash_append_load_str(u32 *desc, int digestsize)
+static inline void ahash_gen_sh_desc(u32 *desc, u32 state, int digestsize,
+				     struct caam_hash_ctx *ctx, bool import_ctx)
 {
-	/* Calculate remaining bytes to read */
-	append_math_add(desc, VARSEQINLEN, SEQINLEN, REG0, CAAM_CMD_SZ);
+	u32 op = ctx->adata.algtype;
+	u32 *skip_key_load;
 
-	/* Read remaining bytes */
-	append_seq_fifo_load(desc, 0, FIFOLD_CLASS_CLASS2 | FIFOLD_TYPE_LAST2 |
-			     FIFOLD_TYPE_MSG | KEY_VLF);
-
-	/* Store class2 context bytes */
-	append_seq_store(desc, digestsize, LDST_CLASS_2_CCB |
-			 LDST_SRCDST_BYTE_CONTEXT);
-}
+	init_sh_desc(desc, HDR_SHARE_SERIAL);
 
-/*
- * For ahash update, final and finup, import context, read and write to seqout
- */
-static inline void ahash_ctx_data_to_out(u32 *desc, u32 op, u32 state,
-					 int digestsize,
-					 struct caam_hash_ctx *ctx)
-{
-	init_sh_desc_key_ahash(desc, ctx);
+	/* Append key if it has been set; ahash update excluded */
+	if ((state != OP_ALG_AS_UPDATE) && (ctx->adata.keylen)) {
+		/* Skip key loading if already shared */
+		skip_key_load = append_jump(desc, JUMP_JSL | JUMP_TEST_ALL |
+					    JUMP_COND_SHRD);
 
-	/* Import context from software */
-	append_seq_load(desc, ctx->ctx_len, LDST_CLASS_2_CCB |
-			LDST_SRCDST_BYTE_CONTEXT);
+		append_key_as_imm(desc, ctx->key, ctx->adata.keylen_pad,
+				  ctx->adata.keylen, CLASS_2 |
+				  KEY_DEST_MDHA_SPLIT | KEY_ENC);
 
-	/* Class 2 operation */
-	append_operation(desc, op | state | OP_ALG_ENCRYPT);
+		set_jump_tgt_here(desc, skip_key_load);
 
-	/*
-	 * Load from buf and/or src and write to req->result or state->context
-	 */
-	ahash_append_load_str(desc, digestsize);
-}
+		op |= OP_ALG_AAI_HMAC_PRECOMP;
+	}
 
-/* For ahash firsts and digest, read and write to seqout */
-static inline void ahash_data_to_out(u32 *desc, u32 op, u32 state,
-				     int digestsize, struct caam_hash_ctx *ctx)
-{
-	init_sh_desc_key_ahash(desc, ctx);
+	/* If needed, import context from software */
+	if (import_ctx)
+		append_seq_load(desc, ctx->ctx_len, LDST_CLASS_2_CCB |
+				LDST_SRCDST_BYTE_CONTEXT);
 
 	/* Class 2 operation */
 	append_operation(desc, op | state | OP_ALG_ENCRYPT);
 
 	/*
 	 * Load from buf and/or src and write to req->result or state->context
+	 * Calculate remaining bytes to read
 	 */
-	ahash_append_load_str(desc, digestsize);
+	append_math_add(desc, VARSEQINLEN, SEQINLEN, REG0, CAAM_CMD_SZ);
+	/* Read remaining bytes */
+	append_seq_fifo_load(desc, 0, FIFOLD_CLASS_CLASS2 | FIFOLD_TYPE_LAST2 |
+			     FIFOLD_TYPE_MSG | KEY_VLF);
+	/* Store class2 context bytes */
+	append_seq_store(desc, digestsize, LDST_CLASS_2_CCB |
+			 LDST_SRCDST_BYTE_CONTEXT);
 }
 
 static int ahash_set_sh_desc(struct crypto_ahash *ahash)
@@ -304,28 +272,11 @@ static int ahash_set_sh_desc(struct crypto_ahash *ahash)
 	struct caam_hash_ctx *ctx = crypto_ahash_ctx(ahash);
 	int digestsize = crypto_ahash_digestsize(ahash);
 	struct device *jrdev = ctx->jrdev;
-	u32 have_key = 0;
 	u32 *desc;
 
-	if (ctx->adata.keylen)
-		have_key = OP_ALG_AAI_HMAC_PRECOMP;
-
 	/* ahash_update shared descriptor */
 	desc = ctx->sh_desc_update;
-
-	init_sh_desc(desc, HDR_SHARE_SERIAL);
-
-	/* Import context from software */
-	append_seq_load(desc, ctx->ctx_len, LDST_CLASS_2_CCB |
-			LDST_SRCDST_BYTE_CONTEXT);
-
-	/* Class 2 operation */
-	append_operation(desc, ctx->adata.algtype | OP_ALG_AS_UPDATE |
-			 OP_ALG_ENCRYPT);
-
-	/* Load data and write to result or context */
-	ahash_append_load_str(desc, ctx->ctx_len);
-
+	ahash_gen_sh_desc(desc, OP_ALG_AS_UPDATE, ctx->ctx_len, ctx, true);
 	ctx->sh_desc_update_dma = dma_map_single(jrdev, desc, desc_bytes(desc),
 						 DMA_TO_DEVICE);
 	if (dma_mapping_error(jrdev, ctx->sh_desc_update_dma)) {
@@ -340,10 +291,7 @@ static int ahash_set_sh_desc(struct crypto_ahash *ahash)
 
 	/* ahash_update_first shared descriptor */
 	desc = ctx->sh_desc_update_first;
-
-	ahash_data_to_out(desc, have_key | ctx->adata.algtype, OP_ALG_AS_INIT,
-			  ctx->ctx_len, ctx);
-
+	ahash_gen_sh_desc(desc, OP_ALG_AS_INIT, ctx->ctx_len, ctx, false);
 	ctx->sh_desc_update_first_dma = dma_map_single(jrdev, desc,
 						       desc_bytes(desc),
 						       DMA_TO_DEVICE);
@@ -359,10 +307,7 @@ static int ahash_set_sh_desc(struct crypto_ahash *ahash)
 
 	/* ahash_final shared descriptor */
 	desc = ctx->sh_desc_fin;
-
-	ahash_ctx_data_to_out(desc, have_key | ctx->adata.algtype,
-			      OP_ALG_AS_FINALIZE, digestsize, ctx);
-
+	ahash_gen_sh_desc(desc, OP_ALG_AS_FINALIZE, digestsize, ctx, true);
 	ctx->sh_desc_fin_dma = dma_map_single(jrdev, desc, desc_bytes(desc),
 					      DMA_TO_DEVICE);
 	if (dma_mapping_error(jrdev, ctx->sh_desc_fin_dma)) {
@@ -377,10 +322,7 @@ static int ahash_set_sh_desc(struct crypto_ahash *ahash)
 
 	/* ahash_digest shared descriptor */
 	desc = ctx->sh_desc_digest;
-
-	ahash_data_to_out(desc, have_key | ctx->adata.algtype,
-			  OP_ALG_AS_INITFINAL, digestsize, ctx);
-
+	ahash_gen_sh_desc(desc, OP_ALG_AS_INITFINAL, digestsize, ctx, false);
 	ctx->sh_desc_digest_dma = dma_map_single(jrdev, desc,
 						 desc_bytes(desc),
 						 DMA_TO_DEVICE);
-- 
2.4.4

^ permalink raw reply related

* [PATCH 05/10] crypto: caam - rewrite some generic inline append cmds
From: Horia Geantă @ 2016-11-22 13:44 UTC (permalink / raw)
  To: Herbert Xu
  Cc: David S. Miller, linux-crypto, Dan Douglass, Tudor Ambarus,
	Alexandru Porosanu
In-Reply-To: <1479822252-23833-1-git-send-email-horia.geanta@nxp.com>

A few descriptor commands are generated using generic
inline append "append_cmd" function.
Rewrite them using specific inline append functions.

Signed-off-by: Horia Geantă <horia.geanta@nxp.com>
---
 drivers/crypto/caam/caamalg.c  | 20 ++++++++++----------
 drivers/crypto/caam/caamhash.c |  8 ++++----
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/drivers/crypto/caam/caamalg.c b/drivers/crypto/caam/caamalg.c
index 5f332df1a8e6..9cb95f5b2eb3 100644
--- a/drivers/crypto/caam/caamalg.c
+++ b/drivers/crypto/caam/caamalg.c
@@ -1853,9 +1853,9 @@ static int xts_ablkcipher_setkey(struct crypto_ablkcipher *ablkcipher,
 			  ctx->cdata.keylen, CLASS_1 | KEY_DEST_CLASS_REG);
 
 	/* Load sector size with index 40 bytes (0x28) */
-	append_cmd(desc, CMD_LOAD | IMMEDIATE | LDST_SRCDST_BYTE_CONTEXT |
-		   LDST_CLASS_1_CCB | (0x28 << LDST_OFFSET_SHIFT) | 8);
-	append_data(desc, (void *)&sector_size, 8);
+	append_load_as_imm(desc, (void *)&sector_size, 8, LDST_CLASS_1_CCB |
+			   LDST_SRCDST_BYTE_CONTEXT |
+			   (0x28 << LDST_OFFSET_SHIFT));
 
 	set_jump_tgt_here(desc, key_jump_cmd);
 
@@ -1864,8 +1864,8 @@ static int xts_ablkcipher_setkey(struct crypto_ablkcipher *ablkcipher,
 	 * Upper 8B of IV - will be used as sector index
 	 * Lower 8B of IV - will be discarded
 	 */
-	append_cmd(desc, CMD_SEQ_LOAD | LDST_SRCDST_BYTE_CONTEXT |
-		   LDST_CLASS_1_CCB | (0x20 << LDST_OFFSET_SHIFT) | 8);
+	append_seq_load(desc, 8, LDST_SRCDST_BYTE_CONTEXT | LDST_CLASS_1_CCB |
+			(0x20 << LDST_OFFSET_SHIFT));
 	append_seq_fifo_load(desc, 8, FIFOLD_CLASS_SKIP);
 
 	/* Load operation */
@@ -1900,9 +1900,9 @@ static int xts_ablkcipher_setkey(struct crypto_ablkcipher *ablkcipher,
 			  ctx->cdata.keylen, CLASS_1 | KEY_DEST_CLASS_REG);
 
 	/* Load sector size with index 40 bytes (0x28) */
-	append_cmd(desc, CMD_LOAD | IMMEDIATE | LDST_SRCDST_BYTE_CONTEXT |
-		   LDST_CLASS_1_CCB | (0x28 << LDST_OFFSET_SHIFT) | 8);
-	append_data(desc, (void *)&sector_size, 8);
+	append_load_as_imm(desc, (void *)&sector_size, 8, LDST_CLASS_1_CCB |
+			   LDST_SRCDST_BYTE_CONTEXT |
+			   (0x28 << LDST_OFFSET_SHIFT));
 
 	set_jump_tgt_here(desc, key_jump_cmd);
 
@@ -1911,8 +1911,8 @@ static int xts_ablkcipher_setkey(struct crypto_ablkcipher *ablkcipher,
 	 * Upper 8B of IV - will be used as sector index
 	 * Lower 8B of IV - will be discarded
 	 */
-	append_cmd(desc, CMD_SEQ_LOAD | LDST_SRCDST_BYTE_CONTEXT |
-		   LDST_CLASS_1_CCB | (0x20 << LDST_OFFSET_SHIFT) | 8);
+	append_seq_load(desc, 8, LDST_SRCDST_BYTE_CONTEXT | LDST_CLASS_1_CCB |
+			(0x20 << LDST_OFFSET_SHIFT));
 	append_seq_fifo_load(desc, 8, FIFOLD_CLASS_SKIP);
 
 	/* Load operation */
diff --git a/drivers/crypto/caam/caamhash.c b/drivers/crypto/caam/caamhash.c
index 8e4530d68208..d3f0ae16a73b 100644
--- a/drivers/crypto/caam/caamhash.c
+++ b/drivers/crypto/caam/caamhash.c
@@ -272,8 +272,8 @@ static inline void ahash_ctx_data_to_out(u32 *desc, u32 op, u32 state,
 	init_sh_desc_key_ahash(desc, ctx);
 
 	/* Import context from software */
-	append_cmd(desc, CMD_SEQ_LOAD | LDST_SRCDST_BYTE_CONTEXT |
-		   LDST_CLASS_2_CCB | ctx->ctx_len);
+	append_seq_load(desc, ctx->ctx_len, LDST_CLASS_2_CCB |
+			LDST_SRCDST_BYTE_CONTEXT);
 
 	/* Class 2 operation */
 	append_operation(desc, op | state | OP_ALG_ENCRYPT);
@@ -316,8 +316,8 @@ static int ahash_set_sh_desc(struct crypto_ahash *ahash)
 	init_sh_desc(desc, HDR_SHARE_SERIAL);
 
 	/* Import context from software */
-	append_cmd(desc, CMD_SEQ_LOAD | LDST_SRCDST_BYTE_CONTEXT |
-		   LDST_CLASS_2_CCB | ctx->ctx_len);
+	append_seq_load(desc, ctx->ctx_len, LDST_CLASS_2_CCB |
+			LDST_SRCDST_BYTE_CONTEXT);
 
 	/* Class 2 operation */
 	append_operation(desc, ctx->adata.algtype | OP_ALG_AS_UPDATE |
-- 
2.4.4

^ permalink raw reply related

* [PATCH 04/10] crypto: caam - improve key inlining
From: Horia Geantă @ 2016-11-22 13:44 UTC (permalink / raw)
  To: Herbert Xu
  Cc: David S. Miller, linux-crypto, Dan Douglass, Tudor Ambarus,
	Alexandru Porosanu
In-Reply-To: <1479822252-23833-1-git-send-email-horia.geanta@nxp.com>

For authenc / stitched AEAD algorithms, check independently
each of the two (authentication, encryption) keys whether inlining
is possible.
Prioritize the inlining of the authentication key, since the length
of the (split) key is bigger than that of the encryption key.

For the other algorithms, compute only once per tfm the remaining
available bytes and decide whether key inlining is possible
based on this.

Signed-off-by: Horia Geantă <horia.geanta@nxp.com>
---
 drivers/crypto/caam/caamalg.c     | 130 ++++++++++++++++++++++----------------
 drivers/crypto/caam/desc_constr.h |  39 ++++++++++++
 2 files changed, 116 insertions(+), 53 deletions(-)

diff --git a/drivers/crypto/caam/caamalg.c b/drivers/crypto/caam/caamalg.c
index 48fc000d86bf..5f332df1a8e6 100644
--- a/drivers/crypto/caam/caamalg.c
+++ b/drivers/crypto/caam/caamalg.c
@@ -224,7 +224,7 @@ struct caam_ctx {
 };
 
 static void init_sh_desc_key_aead(u32 *desc, struct caam_ctx *ctx,
-				  int keys_fit_inline, bool is_rfc3686)
+				  bool is_rfc3686)
 {
 	u32 *key_jump_cmd;
 	unsigned int enckeylen = ctx->cdata.keylen;
@@ -244,18 +244,20 @@ static void init_sh_desc_key_aead(u32 *desc, struct caam_ctx *ctx,
 	if (is_rfc3686)
 		enckeylen -= CTR_RFC3686_NONCE_SIZE;
 
-	if (keys_fit_inline) {
+	if (ctx->adata.key_inline)
 		append_key_as_imm(desc, (void *)ctx->adata.key,
 				  ctx->adata.keylen_pad, ctx->adata.keylen,
 				  CLASS_2 | KEY_DEST_MDHA_SPLIT | KEY_ENC);
-		append_key_as_imm(desc, (void *)ctx->cdata.key, enckeylen,
-				  enckeylen, CLASS_1 | KEY_DEST_CLASS_REG);
-	} else {
+	else
 		append_key(desc, ctx->adata.key, ctx->adata.keylen, CLASS_2 |
 			   KEY_DEST_MDHA_SPLIT | KEY_ENC);
+
+	if (ctx->cdata.key_inline)
+		append_key_as_imm(desc, (void *)ctx->cdata.key, enckeylen,
+				  enckeylen, CLASS_1 | KEY_DEST_CLASS_REG);
+	else
 		append_key(desc, ctx->cdata.key, enckeylen, CLASS_1 |
 			   KEY_DEST_CLASS_REG);
-	}
 
 	/* Load Counter into CONTEXT1 reg */
 	if (is_rfc3686) {
@@ -282,13 +284,14 @@ static int aead_null_set_sh_desc(struct crypto_aead *aead)
 	struct device *jrdev = ctx->jrdev;
 	u32 *key_jump_cmd, *jump_cmd, *read_move_cmd, *write_move_cmd;
 	u32 *desc;
+	int rem_bytes = CAAM_DESC_BYTES_MAX - AEAD_DESC_JOB_IO_LEN -
+			ctx->adata.keylen_pad;
 
 	/*
 	 * Job Descriptor and Shared Descriptors
 	 * must all fit into the 64-word Descriptor h/w Buffer
 	 */
-	if (DESC_AEAD_NULL_ENC_LEN + AEAD_DESC_JOB_IO_LEN +
-	    ctx->adata.keylen_pad <= CAAM_DESC_BYTES_MAX) {
+	if (rem_bytes >= DESC_AEAD_NULL_ENC_LEN) {
 		ctx->adata.key_inline = true;
 		ctx->adata.key = (uintptr_t)ctx->key;
 	} else {
@@ -368,8 +371,7 @@ static int aead_null_set_sh_desc(struct crypto_aead *aead)
 	 * Job Descriptor and Shared Descriptors
 	 * must all fit into the 64-word Descriptor h/w Buffer
 	 */
-	if (DESC_AEAD_NULL_DEC_LEN + DESC_JOB_IO_LEN +
-	    ctx->adata.keylen_pad <= CAAM_DESC_BYTES_MAX) {
+	if (rem_bytes >= DESC_AEAD_NULL_DEC_LEN) {
 		ctx->adata.key_inline = true;
 		ctx->adata.key = (uintptr_t)ctx->key;
 	} else {
@@ -463,10 +465,11 @@ static int aead_set_sh_desc(struct crypto_aead *aead)
 	unsigned int ivsize = crypto_aead_ivsize(aead);
 	struct caam_ctx *ctx = crypto_aead_ctx(aead);
 	struct device *jrdev = ctx->jrdev;
-	bool keys_fit_inline;
 	u32 geniv, moveiv;
 	u32 ctx1_iv_off = 0;
 	u32 *desc;
+	u32 inl_mask;
+	unsigned int data_len[2];
 	const bool ctr_mode = ((ctx->cdata.algtype & OP_ALG_AAI_MASK) ==
 			       OP_ALG_AAI_CTR_MOD128);
 	const bool is_rfc3686 = alg->caam.rfc3686;
@@ -493,6 +496,9 @@ static int aead_set_sh_desc(struct crypto_aead *aead)
 	if (is_rfc3686)
 		ctx1_iv_off = 16 + CTR_RFC3686_NONCE_SIZE;
 
+	data_len[0] = ctx->adata.keylen_pad;
+	data_len[1] = ctx->cdata.keylen;
+
 	if (alg->caam.geniv)
 		goto skip_enc;
 
@@ -500,24 +506,30 @@ static int aead_set_sh_desc(struct crypto_aead *aead)
 	 * Job Descriptor and Shared Descriptors
 	 * must all fit into the 64-word Descriptor h/w Buffer
 	 */
-	if (DESC_AEAD_ENC_LEN + AUTHENC_DESC_JOB_IO_LEN +
-	    ctx->adata.keylen_pad + ctx->cdata.keylen +
-	    (is_rfc3686 ? DESC_AEAD_CTR_RFC3686_LEN : 0) <=
-	    CAAM_DESC_BYTES_MAX) {
-		keys_fit_inline = true;
+	if (desc_inline_query(DESC_AEAD_ENC_LEN +
+			      (is_rfc3686 ? DESC_AEAD_CTR_RFC3686_LEN : 0),
+			      AUTHENC_DESC_JOB_IO_LEN, data_len, &inl_mask,
+			      ARRAY_SIZE(data_len)) < 0)
+		return -EINVAL;
+
+	if (inl_mask & 1)
 		ctx->adata.key = (uintptr_t)ctx->key;
-		ctx->cdata.key = (uintptr_t)(ctx->key + ctx->adata.keylen_pad);
-	} else {
-		keys_fit_inline = false;
+	else
 		ctx->adata.key = ctx->key_dma;
+
+	if (inl_mask & 2)
+		ctx->cdata.key = (uintptr_t)(ctx->key + ctx->adata.keylen_pad);
+	else
 		ctx->cdata.key = ctx->key_dma + ctx->adata.keylen_pad;
-	}
+
+	ctx->adata.key_inline = !!(inl_mask & 1);
+	ctx->cdata.key_inline = !!(inl_mask & 2);
 
 	/* aead_encrypt shared descriptor */
 	desc = ctx->sh_desc_enc;
 
 	/* Note: Context registers are saved. */
-	init_sh_desc_key_aead(desc, ctx, keys_fit_inline, is_rfc3686);
+	init_sh_desc_key_aead(desc, ctx, is_rfc3686);
 
 	/* Class 2 operation */
 	append_operation(desc, ctx->adata.algtype | OP_ALG_AS_INITFINAL |
@@ -572,24 +584,30 @@ static int aead_set_sh_desc(struct crypto_aead *aead)
 	 * Job Descriptor and Shared Descriptors
 	 * must all fit into the 64-word Descriptor h/w Buffer
 	 */
-	if (DESC_AEAD_DEC_LEN + AUTHENC_DESC_JOB_IO_LEN +
-	    ctx->adata.keylen_pad + ctx->cdata.keylen +
-	    (is_rfc3686 ? DESC_AEAD_CTR_RFC3686_LEN : 0) <=
-	    CAAM_DESC_BYTES_MAX) {
-		keys_fit_inline = true;
+	if (desc_inline_query(DESC_AEAD_DEC_LEN +
+			      (is_rfc3686 ? DESC_AEAD_CTR_RFC3686_LEN : 0),
+			      AUTHENC_DESC_JOB_IO_LEN, data_len, &inl_mask,
+			      ARRAY_SIZE(data_len)) < 0)
+		return -EINVAL;
+
+	if (inl_mask & 1)
 		ctx->adata.key = (uintptr_t)ctx->key;
-		ctx->cdata.key = (uintptr_t)(ctx->key + ctx->adata.keylen_pad);
-	} else {
-		keys_fit_inline = false;
+	else
 		ctx->adata.key = ctx->key_dma;
+
+	if (inl_mask & 2)
+		ctx->cdata.key = (uintptr_t)(ctx->key + ctx->adata.keylen_pad);
+	else
 		ctx->cdata.key = ctx->key_dma + ctx->adata.keylen_pad;
-	}
+
+	ctx->adata.key_inline = !!(inl_mask & 1);
+	ctx->cdata.key_inline = !!(inl_mask & 2);
 
 	/* aead_decrypt shared descriptor */
 	desc = ctx->sh_desc_dec;
 
 	/* Note: Context registers are saved. */
-	init_sh_desc_key_aead(desc, ctx, keys_fit_inline, is_rfc3686);
+	init_sh_desc_key_aead(desc, ctx, is_rfc3686);
 
 	/* Class 2 operation */
 	append_operation(desc, ctx->adata.algtype | OP_ALG_AS_INITFINAL |
@@ -660,24 +678,30 @@ static int aead_set_sh_desc(struct crypto_aead *aead)
 	 * Job Descriptor and Shared Descriptors
 	 * must all fit into the 64-word Descriptor h/w Buffer
 	 */
-	if (DESC_AEAD_GIVENC_LEN + AUTHENC_DESC_JOB_IO_LEN +
-	    ctx->adata.keylen_pad + ctx->cdata.keylen +
-	    (is_rfc3686 ? DESC_AEAD_CTR_RFC3686_LEN : 0) <=
-	    CAAM_DESC_BYTES_MAX) {
-		keys_fit_inline = true;
+	if (desc_inline_query(DESC_AEAD_GIVENC_LEN +
+			      (is_rfc3686 ? DESC_AEAD_CTR_RFC3686_LEN : 0),
+			      AUTHENC_DESC_JOB_IO_LEN, data_len, &inl_mask,
+			      ARRAY_SIZE(data_len)) < 0)
+		return -EINVAL;
+
+	if (inl_mask & 1)
 		ctx->adata.key = (uintptr_t)ctx->key;
-		ctx->cdata.key = (uintptr_t)(ctx->key + ctx->adata.keylen_pad);
-	} else {
-		keys_fit_inline = false;
+	else
 		ctx->adata.key = ctx->key_dma;
+
+	if (inl_mask & 2)
+		ctx->cdata.key = (uintptr_t)(ctx->key + ctx->adata.keylen_pad);
+	else
 		ctx->cdata.key = ctx->key_dma + ctx->adata.keylen_pad;
-	}
+
+	ctx->adata.key_inline = !!(inl_mask & 1);
+	ctx->cdata.key_inline = !!(inl_mask & 2);
 
 	/* aead_givencrypt shared descriptor */
 	desc = ctx->sh_desc_enc;
 
 	/* Note: Context registers are saved. */
-	init_sh_desc_key_aead(desc, ctx, keys_fit_inline, is_rfc3686);
+	init_sh_desc_key_aead(desc, ctx, is_rfc3686);
 
 	if (is_rfc3686)
 		goto copy_iv;
@@ -787,6 +811,8 @@ static int gcm_set_sh_desc(struct crypto_aead *aead)
 	u32 *key_jump_cmd, *zero_payload_jump_cmd,
 	    *zero_assoc_jump_cmd1, *zero_assoc_jump_cmd2;
 	u32 *desc;
+	int rem_bytes = CAAM_DESC_BYTES_MAX - GCM_DESC_JOB_IO_LEN -
+			ctx->cdata.keylen;
 
 	if (!ctx->cdata.keylen || !ctx->authsize)
 		return 0;
@@ -796,8 +822,7 @@ static int gcm_set_sh_desc(struct crypto_aead *aead)
 	 * Job Descriptor and Shared Descriptor
 	 * must fit into the 64-word Descriptor h/w Buffer
 	 */
-	if (DESC_GCM_ENC_LEN + GCM_DESC_JOB_IO_LEN +
-	    ctx->cdata.keylen <= CAAM_DESC_BYTES_MAX) {
+	if (rem_bytes >= DESC_GCM_ENC_LEN) {
 		ctx->cdata.key_inline = true;
 		ctx->cdata.key = (uintptr_t)ctx->key;
 	} else {
@@ -895,8 +920,7 @@ static int gcm_set_sh_desc(struct crypto_aead *aead)
 	 * Job Descriptor and Shared Descriptors
 	 * must all fit into the 64-word Descriptor h/w Buffer
 	 */
-	if (DESC_GCM_DEC_LEN + GCM_DESC_JOB_IO_LEN +
-	    ctx->cdata.keylen <= CAAM_DESC_BYTES_MAX) {
+	if (rem_bytes >= DESC_GCM_DEC_LEN) {
 		ctx->cdata.key_inline = true;
 		ctx->cdata.key = (uintptr_t)ctx->key;
 	} else {
@@ -996,6 +1020,8 @@ static int rfc4106_set_sh_desc(struct crypto_aead *aead)
 	struct device *jrdev = ctx->jrdev;
 	u32 *key_jump_cmd;
 	u32 *desc;
+	int rem_bytes = CAAM_DESC_BYTES_MAX - GCM_DESC_JOB_IO_LEN -
+			ctx->cdata.keylen;
 
 	if (!ctx->cdata.keylen || !ctx->authsize)
 		return 0;
@@ -1005,8 +1031,7 @@ static int rfc4106_set_sh_desc(struct crypto_aead *aead)
 	 * Job Descriptor and Shared Descriptor
 	 * must fit into the 64-word Descriptor h/w Buffer
 	 */
-	if (DESC_RFC4106_ENC_LEN + GCM_DESC_JOB_IO_LEN +
-	    ctx->cdata.keylen <= CAAM_DESC_BYTES_MAX) {
+	if (rem_bytes >= DESC_RFC4106_ENC_LEN) {
 		ctx->cdata.key_inline = true;
 		ctx->cdata.key = (uintptr_t)ctx->key;
 	} else {
@@ -1084,8 +1109,7 @@ static int rfc4106_set_sh_desc(struct crypto_aead *aead)
 	 * Job Descriptor and Shared Descriptors
 	 * must all fit into the 64-word Descriptor h/w Buffer
 	 */
-	if (DESC_RFC4106_DEC_LEN + DESC_JOB_IO_LEN +
-	    ctx->cdata.keylen <= CAAM_DESC_BYTES_MAX) {
+	if (rem_bytes >= DESC_RFC4106_DEC_LEN) {
 		ctx->cdata.key_inline = true;
 		ctx->cdata.key = (uintptr_t)ctx->key;
 	} else {
@@ -1180,6 +1204,8 @@ static int rfc4543_set_sh_desc(struct crypto_aead *aead)
 	u32 *key_jump_cmd;
 	u32 *read_move_cmd, *write_move_cmd;
 	u32 *desc;
+	int rem_bytes = CAAM_DESC_BYTES_MAX - GCM_DESC_JOB_IO_LEN -
+			ctx->cdata.keylen;
 
 	if (!ctx->cdata.keylen || !ctx->authsize)
 		return 0;
@@ -1189,8 +1215,7 @@ static int rfc4543_set_sh_desc(struct crypto_aead *aead)
 	 * Job Descriptor and Shared Descriptor
 	 * must fit into the 64-word Descriptor h/w Buffer
 	 */
-	if (DESC_RFC4543_ENC_LEN + GCM_DESC_JOB_IO_LEN +
-	    ctx->cdata.keylen <= CAAM_DESC_BYTES_MAX) {
+	if (rem_bytes >= DESC_RFC4543_ENC_LEN) {
 		ctx->cdata.key_inline = true;
 		ctx->cdata.key = (uintptr_t)ctx->key;
 	} else {
@@ -1267,8 +1292,7 @@ static int rfc4543_set_sh_desc(struct crypto_aead *aead)
 	 * Job Descriptor and Shared Descriptors
 	 * must all fit into the 64-word Descriptor h/w Buffer
 	 */
-	if (DESC_RFC4543_DEC_LEN + GCM_DESC_JOB_IO_LEN +
-	    ctx->cdata.keylen <= CAAM_DESC_BYTES_MAX) {
+	if (rem_bytes >= DESC_RFC4543_DEC_LEN) {
 		ctx->cdata.key_inline = true;
 		ctx->cdata.key = (uintptr_t)ctx->key;
 	} else {
diff --git a/drivers/crypto/caam/desc_constr.h b/drivers/crypto/caam/desc_constr.h
index bfef7952dfb7..fa70c0d79c40 100644
--- a/drivers/crypto/caam/desc_constr.h
+++ b/drivers/crypto/caam/desc_constr.h
@@ -449,3 +449,42 @@ struct alginfo {
 	u64 key;
 	bool key_inline;
 };
+
+/**
+ * desc_inline_query() - Provide indications on which data items can be inlined
+ *                       and which shall be referenced in a shared descriptor.
+ * @sd_base_len: Shared descriptor base length - bytes consumed by the commands,
+ *               excluding the data items to be inlined (or corresponding
+ *               pointer if an item is not inlined). Each cnstr_* function that
+ *               generates descriptors should have a define mentioning
+ *               corresponding length.
+ * @jd_len: Maximum length of the job descriptor(s) that will be used
+ *          together with the shared descriptor.
+ * @data_len: Array of lengths of the data items trying to be inlined
+ * @inl_mask: 32bit mask with bit x = 1 if data item x can be inlined, 0
+ *            otherwise.
+ * @count: Number of data items (size of @data_len array); must be <= 32
+ *
+ * Return: 0 if data can be inlined / referenced, negative value if not. If 0,
+ *         check @inl_mask for details.
+ */
+static inline int desc_inline_query(unsigned int sd_base_len,
+				    unsigned int jd_len, unsigned int *data_len,
+				    u32 *inl_mask, unsigned int count)
+{
+	int rem_bytes = (int)(CAAM_DESC_BYTES_MAX - sd_base_len - jd_len);
+	unsigned int i;
+
+	*inl_mask = 0;
+	for (i = 0; (i < count) && (rem_bytes > 0); i++) {
+		if (rem_bytes - (int)(data_len[i] +
+			(count - i - 1) * CAAM_PTR_SZ) >= 0) {
+			rem_bytes -= data_len[i];
+			*inl_mask |= (1 << i);
+		} else {
+			rem_bytes -= CAAM_PTR_SZ;
+		}
+	}
+
+	return (rem_bytes >= 0) ? 0 : -1;
+}
-- 
2.4.4

^ permalink raw reply related

* Re: [PATCH v3] arm64/crypto: Accelerated CRC T10 DIF computation
From: Ard Biesheuvel @ 2016-11-22 13:38 UTC (permalink / raw)
  To: YueHaibing
  Cc: Herbert Xu, David S. Miller, Catalin Marinas, Will Deacon,
	linux-kernel@vger.kernel.org, Hanjun Guo, dingtinahong,
	yangshengkai, linux-arm-kernel@lists.infradead.org,
	linux-crypto@vger.kernel.org
In-Reply-To: <CAKv+Gu9x+Z6Wx_jXQvFBuD4nQFF7rhz7M0KZmPWqUgfsfVz-OA@mail.gmail.com>

On 22 November 2016 at 12:53, Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote:
> On 22 November 2016 at 10:14, YueHaibing <yuehaibing@huawei.com> wrote:
>> This is the ARM64 CRC T10 DIF transform accelerated with the ARMv8
>> NEON instruction.The config CRYPTO_CRCT10DIF_NEON should be turned
>> on to enable the feature.The crc_t10dif crypto library function will
>> use this faster algorithm when crct10dif_neon module is loaded.
>>
>
> What is this algorithm commonly used for? In other words, why is it a
> good idea to add support for this algorithm to the kernel?
>
>> Tcrypt benchmark results:
>>
>> HIP06  (mode=320 sec=2)
>>
>> The ratio of bytes/sec crct10dif-neon Vs. crct10dif-generic:
>>
>>                         TEST                              neon          generic         ratio
>>   16 byte blocks,   16 bytes per update,   1 updates    214506112       171095400       1.25
>>   64 byte blocks,   16 bytes per update,   4 updates    139385312       119036352       1.17
>>   64 byte blocks,   64 bytes per update,   1 updates    671523712       198945344       3.38
>>  256 byte blocks,   16 bytes per update,  16 updates    157674880       125146752       1.26
>>  256 byte blocks,   64 bytes per update,   4 updates    491888128       175764096       2.80
>>  256 byte blocks,  256 bytes per update,   1 updates    2123298176      206995200       10.26
>> 1024 byte blocks,   16 bytes per update,  64 updates    161243136       126460416       1.28
>> 1024 byte blocks,  256 bytes per update,   4 updates    1643020800      200027136       8.21
>> 1024 byte blocks, 1024 bytes per update,   1 updates    4238239232      209106432       20.27
>> 2048 byte blocks,   16 bytes per update, 128 updates    162079744       126953472       1.28
>> 2048 byte blocks,  256 bytes per update,   8 updates    1693587456      200867840       8.43
>> 2048 byte blocks, 1024 bytes per update,   2 updates    3424323584      206330880       16.60
>> 2048 byte blocks, 2048 bytes per update,   1 updates    5228207104      208620544       25.06
>> 4096 byte blocks,   16 bytes per update, 256 updates    162304000       126894080       1.28
>> 4096 byte blocks,  256 bytes per update,  16 updates    1731862528      201197568       8.61
>> 4096 byte blocks, 1024 bytes per update,   4 updates    3668625408      207003648       17.72
>> 4096 byte blocks, 4096 bytes per update,   1 updates    5551239168      209127424       26.54
>> 8192 byte blocks,   16 bytes per update, 512 updates    162779136       126984192       1.28
>> 8192 byte blocks,  256 bytes per update,  32 updates    1753702400      201420800       8.71
>> 8192 byte blocks, 1024 bytes per update,   8 updates    3760918528      207351808       18.14
>> 8192 byte blocks, 4096 bytes per update,   2 updates    5483655168      208928768       26.25
>> 8192 byte blocks, 8192 bytes per update,   1 updates    5623377920      209108992       26.89
>>
>> Signed-off-by: YueHaibing <yuehaibing@huawei.com>
>> Signed-off-by: YangShengkai <yangshengkai@huawei.com>
>> Signed-off-by: Ding Tianhong <dingtianhong@huawei.com>
>> Signed-off-by: Hanjun Guo <hanjun.guo@linaro.org>
>>
>> ---
>>  arch/arm64/crypto/Kconfig                 |   5 +
>>  arch/arm64/crypto/Makefile                |   4 +
>>  arch/arm64/crypto/crct10dif-neon-asm_64.S | 751 ++++++++++++++++++++++++++++++
>>  arch/arm64/crypto/crct10dif-neon_glue.c   | 115 +++++
>>  4 files changed, 875 insertions(+)
>>  create mode 100644 arch/arm64/crypto/crct10dif-neon-asm_64.S
>>  create mode 100644 arch/arm64/crypto/crct10dif-neon_glue.c
>>
>> diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
>> index 2cf32e9..2e450bf 100644
>> --- a/arch/arm64/crypto/Kconfig
>> +++ b/arch/arm64/crypto/Kconfig
>> @@ -23,6 +23,11 @@ config CRYPTO_GHASH_ARM64_CE
>>         depends on ARM64 && KERNEL_MODE_NEON
>>         select CRYPTO_HASH
>>
>> +config CRYPTO_CRCT10DIF_NEON
>> +       tristate "CRCT10DIF hardware acceleration using NEON instructions"
>> +       depends on ARM64 && KERNEL_MODE_NEON
>> +       select CRYPTO_HASH
>> +
>
> Could you please follow the existing pattern:
>
> config CRYPTO_CRCT10DIF_ARM64_NEON
>
>
>>  config CRYPTO_AES_ARM64_CE
>>         tristate "AES core cipher using ARMv8 Crypto Extensions"
>>         depends on ARM64 && KERNEL_MODE_NEON
>> diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile
>> index abb79b3..6c9ff2c 100644
>> --- a/arch/arm64/crypto/Makefile
>> +++ b/arch/arm64/crypto/Makefile
>> @@ -29,6 +29,10 @@ aes-ce-blk-y := aes-glue-ce.o aes-ce.o
>>  obj-$(CONFIG_CRYPTO_AES_ARM64_NEON_BLK) += aes-neon-blk.o
>>  aes-neon-blk-y := aes-glue-neon.o aes-neon.o
>>
>> +obj-$(CONFIG_CRYPTO_CRCT10DIF_NEON) += crct10dif-neon.o
>> +crct10dif-neon-y := crct10dif-neon-asm_64.o crct10dif-neon_glue.o
>> +AFLAGS_crct10dif-neon-asm_64.o := -march=armv8-a+crypto
>> +
>
> Please drop this line, and add
>
> .cpu generic+crypto
>
> to the .S file
>
>>  AFLAGS_aes-ce.o                := -DINTERLEAVE=4
>>  AFLAGS_aes-neon.o      := -DINTERLEAVE=4
>>
>> diff --git a/arch/arm64/crypto/crct10dif-neon-asm_64.S b/arch/arm64/crypto/crct10dif-neon-asm_64.S
>> new file mode 100644
>> index 0000000..2ae3033
>> --- /dev/null
>> +++ b/arch/arm64/crypto/crct10dif-neon-asm_64.S
>> @@ -0,0 +1,751 @@
>> +/*
>> + * Copyright (c) 2016-2017 Hisilicon Limited.
>> + *
>
> Please drop the 2017 here.
>
>> + * This program is free software; you can redistribute it and/or modify
>> + * it under the terms of the GNU General Public License as published by
>> + * the Free Software Foundation; either version 2 of the License, or
>> + * (at your option) any later version.
>> + */
>> +
>> +#include <linux/linkage.h>
>> +#include <asm/assembler.h>
>> +
>> +.global crc_t10dif_neon
>
> Please drop this .global, and use ENTRY() below
>
>> +.text
>> +
>> +/* X0 is initial CRC value
>> + * X1 is data buffer
>> + * X2 is the length of buffer
>> + * X3 is the backup buffer(for extend)
>> + * X4 for other extend parameter(for extend)
>> + * Q0, Q1, Q2, Q3 maybe as parameter for other functions,
>> + * the value of Q0, Q1, Q2, Q3 maybe modified.
>> + *
>> + * suggestion:
>> + * 1. dont use general purpose register for calculation
>> + * 2. set data endianness outside of the kernel
>> + * 3. use ext as shifting around
>> + * 4. dont use LD3/LD4, ST3/ST4
>> + */
>> +
>
>
> Whose suggestions are these, and why? I do appreciate comments like
> this, but only if I can learn something from it
>
>> +crc_t10dif_neon:
>
> ENTRY()
>
>> +       /* push the register to stack that CRC16 will use */
>> +       STP             X5, X6, [sp, #-0x10]!
>
> Please use an ordinary stack frame, i.e.,
>
> stp   x29, x30, [sp, #-xxx]!
> mov x29, sp
>
> where xxx is the entire allocation you need for stacking callee save registers
>
>> +       STP             X7, X8, [sp, #-0x10]!
>> +       STP             X9, X10, [sp, #-0x10]!
>> +       STP             X11, X12, [sp, #-0x10]!
>> +       STP             X13, X14, [sp, #-0x10]!
>
> These are not callee save, so no need to stack them
>
>> +       STP             Q10, Q11, [sp, #-0x20]!
>> +       STP             Q12, Q13, [sp, #-0x20]!
>> +       STP             Q4, Q5, [sp, #-0x20]!
>> +       STP             Q6, Q7, [sp, #-0x20]!
>> +       STP             Q8, Q9, [sp, #-0x20]!
>> +       STP             Q14, Q15, [sp, #-0x20]!
>> +       STP             Q16, Q17, [sp, #-0x20]!
>> +       STP             Q18, Q19, [sp, #-0x20]!
>> +
>
> What is the point of stacking the NEON registers? Also, as a general
> note, could you switch to lower case throughout the file?
>
>
>> +       SUB             sp,sp,#0x20
>> +
>
> Please account for locals in the allocation above. Only outgoing
> arguments should be allocated below the frame pointer
>
>
>> +       MOV             X11, #0         // PUSH STACK FLAG
>> +
>
> What does this comment mean?
>
>> +       CMP             X2, #0x80
>> +       B.LT            2f              // _less_than_128, <128
>> +
>
> Redundant comment
>
>> +       /* V10/V11/V12/V13      is 128bit.
>> +        * we get data 512bit( by cacheline ) each time
>> +        */
>> +       LDP             Q10, Q11, [X1], #0x20
>> +       LDP             Q12, Q13, [X1], #0x20
>> +
>> +       /* move the initial value to V6 register */
>> +       LSL             X0, X0, #48
>> +       EOR             V6.16B, V6.16B, V6.16B
>> +       MOV             V6.D[1], X0
>> +
>> +       /* big-little end change. because the data in memory is little-end,
>> +        * we deal the data for bigend
>> +        */
>> +
>
> What if I am using a big endian kernel? Hint: you probably need to
> wrap these in CPU_LE()
>
>> +       REV64           V10.16B, V10.16B
>> +       REV64           V11.16B, V11.16B
>> +       REV64           V12.16B, V12.16B
>> +       REV64           V13.16B, V13.16B
>> +       EXT             V10.16B, V10.16B, V10.16B, #8
>> +       EXT             V11.16B, V11.16B, V11.16B, #8
>> +       EXT             V12.16B, V12.16B, V12.16B, #8
>> +       EXT             V13.16B, V13.16B, V13.16B, #8
>> +
>> +       EOR             V10.16B, V10.16B, V6.16B
>> +
>> +       SUB             X2, X2, #0x80
>> +       ADD             X5, X1, #0x20
>> +
>> +       /* deal data when the size of buffer bigger than 128 bytes */
>> +       /* _fold_64_B_loop */
>> +       LDR             Q6,=0xe658000000000000044c000000000000
>
> Could you move all these non-trivial constants to a separate location
> (after the end of the function), and name them?
>
>> +1:
>> +
>> +       LDP             Q16, Q17, [X1] ,#0x40
>> +       LDP             Q18, Q19, [X5], #0x40
>> +
>> +       /* carry-less multiply.
>> +        * V10 high-64bits carry-less multiply
>> +        * V6 high-64bits(PMULL2)
>> +        * V11 low-64bits carry-less multiply V6 low-64bits(PMULL)
>> +        */
>> +
>> +       PMULL2          V4.1Q, V10.2D, V6.2D
>> +       PMULL           V10.1Q, V10.1D, V6.1D
>> +       PMULL2          V5.1Q, V11.2D, V6.2D
>> +       PMULL           V11.1Q, V11.1D, V6.1D
>> +
>
> These instructions are only available if you have the PMULL extension,
> so this algorithm is not plain NEON.
>
>> +       REV64           V16.16B, V16.16B
>> +       REV64           V17.16B, V17.16B
>> +       REV64           V18.16B, V18.16B
>> +       REV64           V19.16B, V19.16B
>> +
>
> Endian swap on LE only?
>
>> +       PMULL2          V14.1Q, V12.2D, V6.2D
>> +       PMULL           V12.1Q, V12.1D, V6.1D
>> +       PMULL2          V15.1Q, V13.2D, V6.2D
>> +       PMULL           V13.1Q, V13.1D, V6.1D
>> +
>> +       EXT             V16.16B, V16.16B, V16.16B, #8
>> +       EOR             V10.16B, V10.16B, V4.16B
>> +
>> +       EXT             V17.16B, V17.16B, V17.16B, #8
>> +       EOR             V11.16B, V11.16B, V5.16B
>> +
>> +       EXT             V18.16B, V18.16B, V18.16B, #8
>> +       EOR             V12.16B, V12.16B, V14.16B
>> +
>> +       EXT             V19.16B, V19.16B, V19.16B, #8
>> +       EOR             V13.16B, V13.16B, V15.16B
>> +
>> +       SUB             X2, X2, #0x40
>> +
>> +
>> +       EOR             V10.16B, V10.16B, V16.16B
>> +       EOR             V11.16B, V11.16B, V17.16B
>> +
>> +       EOR             V12.16B, V12.16B, V18.16B
>> +       EOR             V13.16B, V13.16B, V19.16B
>> +
>> +       CMP             X2, #0x0
>> +       B.GE            1b                      // >=0
>> +
>> +       LDR             Q6, =0x06df0000000000002d56000000000000
>> +       MOV             V4.16B, V10.16B
>> +       /* V10 carry-less 0x06df000000000000([127:64]*[127:64]) */
>> +       PMULL           V4.1Q, V4.1D, V6.1D     //switch PMULL & PMULL2 order
>> +       PMULL2          V10.1Q, V10.2D, V6.2D
>> +       EOR             V11.16B, V11.16B, V4.16B
>> +       EOR             V11.16B, V11.16B, V10.16B
>> +
>> +       MOV             V4.16B, V11.16B
>> +       PMULL           V4.1Q, V4.1D, V6.1D     //switch PMULL & PMULL2 order
>> +       PMULL2          V11.1Q, V11.2D, V6.2D
>> +       EOR             V12.16B, V12.16B, V4.16B
>> +       EOR             V12.16B, V12.16B, V11.16B
>> +
>> +       MOV             V4.16B, V12.16B
>> +       PMULL           V4.1Q, V4.1D, V6.1D     //switch PMULL & PMULL2 order
>> +       PMULL2          V12.1Q, V12.2D, V6.2D
>> +       EOR             V13.16B, V13.16B, V4.16B
>> +       EOR             V13.16B, V13.16B, V12.16B
>> +
>> +       ADD             X2, X2, #48
>> +       CMP             X2, #0x0
>> +       B.LT            3f                      // _final_reduction_for_128, <0
>> +
>> +       /* _16B_reduction_loop */
>> +4:
>> +       /* unrelated load as early as possible*/
>> +       LDR             Q10, [X1], #0x10
>> +
>> +       MOV             V4.16B, V13.16B
>> +       PMULL2          V13.1Q, V13.2D, V6.2D
>> +       PMULL           V4.1Q, V4.1D, V6.1D
>> +       EOR             V13.16B, V13.16B, V4.16B
>> +
>> +       REV64           V10.16B, V10.16B
>> +       EXT             V10.16B, V10.16B, V10.16B, #8
>> +
>> +       EOR             V13.16B, V13.16B, V10.16B
>> +
>> +       SUB             X2, X2, #0x10
>> +       CMP             X2, #0x0
>> +       B.GE            4b                      // _16B_reduction_loop, >=0
>> +
>> +       /*  _final_reduction_for_128 */
>> +3:     ADD             X2, X2, #0x10
>> +       CMP             X2, #0x0
>> +       B.EQ            5f                      // _128_done, ==0
>> +
>> +       /* _get_last_two_xmms */
>
> Bogus comment. I guess you ported this code from x86, are you sure you
> don't need to credit the original author?
>
>> +6:     MOV             V12.16B, V13.16B
>> +       SUB             X1, X1, #0x10
>> +       ADD             X1, X1, X2
>> +       LDR             Q11, [X1], #0x10
>> +       REV64           V11.16B, V11.16B
>> +       EXT             V11.16B, V11.16B, V11.16B, #8
>> +
>> +       CMP             X2, #8
>> +       B.EQ            50f
>> +       B.LT            51f
>> +       B.GT            52f
>> +
>> +50:
>> +       /* dont use X register as temp one */
>> +       FMOV            D14, D12
>> +       MOVI            D12, #0
>> +       MOV             V12.D[1],V14.D[0]
>> +       B               53f
>> +51:
>> +       MOV             X9, #64
>> +       LSL             X13, X2, #3             // <<3 equal x8
>> +       SUB             X9, X9, X13
>> +       MOV             X5, V12.D[0]            // low 64-bit
>> +       MOV             X6, V12.D[1]            // high 64-bit
>> +       LSR             X10, X5, X9             // high bit of low 64-bit
>> +       LSL             X7, X5, X13
>> +       LSL             X8, X6, X13
>> +       ORR             X8, X8, X10             // combination of high 64-bit
>> +       MOV             V12.D[1], X8
>> +       MOV             V12.D[0], X7
>> +
>> +       B               53f
>> +52:
>> +       LSL             X13, X2, #3             // <<3 equal x8
>> +       SUB             X13, X13, #64
>> +
>> +       DUP             V18.2D, X13
>> +       FMOV            D16, D12
>> +       USHL            D16, D16, D18
>> +       EXT             V12.16B, V16.16B, V16.16B, #8
>> +
>> +53:
>> +       MOVI            D14, #0                 //add one zero constant
>> +
>> +       CMP             X2, #0
>> +       B.EQ    30f
>> +       CMP             X2, #1
>> +       B.EQ    31f
>> +       CMP             X2, #2
>> +       B.EQ    32f
>> +       CMP             X2, #3
>> +       B.EQ    33f
>> +       CMP             X2, #4
>> +       B.EQ    34f
>> +       CMP             X2, #5
>> +       B.EQ    35f
>> +       CMP             X2, #6
>> +       B.EQ    36f
>> +       CMP             X2, #7
>> +       B.EQ    37f
>> +       CMP             X2, #8
>> +       B.EQ    38f
>> +       CMP             X2, #9
>> +       B.EQ    39f
>> +       CMP             X2, #10
>> +       B.EQ    40f
>> +       CMP             X2, #11
>> +       B.EQ    41f
>> +       CMP             X2, #12
>> +       B.EQ    42f
>> +       CMP             X2, #13
>> +       B.EQ    43f
>> +       CMP             X2, #14
>> +       B.EQ    44f
>> +       CMP             X2, #15
>> +       B.EQ    45f
>> +
>
> This looks awful. If you make the snippets below a fixed size, you
> could use a computed goto instead
>
>> +       // >> 128bit
>> +30:
>> +       EOR             V13.16B, V13.16B, V13.16B
>> +       EOR             V8.16B, V8.16B, V8.16B
>> +       LDR             Q9,=0xffffffffffffffffffffffffffffffff
>
> Shouldn't you initialize q8 here as well.

Never mind, I just spotted the EOR which intializes it to 0x0

> And in general, couldn't you
> use some kind of shift to generate these constants (in all cases
> below)?
>> +       B               46f
>> +
>> +       // >> 120bit
>> +31:
>> +       USHR            V13.2D, V13.2D, #56
>> +       EXT             V13.16B, V13.16B, V14.16B, #8
>> +       LDR             Q8,=0xff
>> +       LDR             Q9,=0xffffffffffffffffffffffffffffff00
>> +       B               46f
>> +
>> +       // >> 112bit
>> +32:
>> +       USHR             V13.2D, V13.2D, #48
>> +       EXT             V13.16B, V13.16B, V14.16B, #8
>> +       LDR             Q8,=0xffff
>> +       LDR             Q9,=0xffffffffffffffffffffffffffff0000
>> +       B               46f
>> +
>> +       // >> 104bit
>> +33:
>> +       USHR            V13.2D, V13.2D, #40
>> +       EXT             V13.16B, V13.16B, V14.16B, #8
>> +       LDR             Q8,=0xffffff
>> +       LDR             Q9,=0xffffffffffffffffffffffffff000000
>> +       B               46f
>> +
>> +       // >> 96bit
>> +34:
>> +       USHR            V13.2D, V13.2D, #32
>> +       EXT             V13.16B, V13.16B, V14.16B, #8
>> +       LDR             Q8,=0xffffffff
>> +       LDR             Q9,=0xffffffffffffffffffffffff00000000
>> +       B               46f
>> +
>> +       // >> 88bit
>> +35:
>> +       USHR            V13.2D, V13.2D, #24
>> +       EXT             V13.16B, V13.16B, V14.16B, #8
>> +       LDR             Q8,=0xffffffffff
>> +       LDR             Q9,=0xffffffffffffffffffffff0000000000
>> +       B               46f
>> +
>> +       // >> 80bit
>> +36:
>> +       USHR            V13.2D, V13.2D, #16
>> +       EXT             V13.16B, V13.16B, V14.16B, #8
>> +       LDR             Q8,=0xffffffffffff
>> +       LDR             Q9,=0xffffffffffffffffffff000000000000
>> +       B               46f
>> +
>> +       // >> 72bit
>> +37:
>> +       USHR            V13.2D, V13.2D, #8
>> +       EXT             V13.16B, V13.16B, V14.16B, #8
>> +       LDR             Q8,=0xffffffffffffff
>> +       LDR             Q9,=0xffffffffffffffffff00000000000000
>> +       B               46f
>> +
>> +       // >> 64bit
>> +38:
>> +       EXT              V13.16B, V13.16B, V14.16B, #8
>> +       LDR             Q8,=0xffffffffffffffff
>> +       LDR             Q9,=0xffffffffffffffff0000000000000000
>> +       B               46f
>> +
>> +       // >> 56bit
>> +39:
>> +       EXT             V13.16B, V13.16B, V13.16B, #7
>> +       MOV             V13.S[3], V14.S[0]
>> +       MOV             V13.H[5], V14.H[0]
>> +       MOV             V13.B[9], V14.B[0]
>> +
>> +       LDR             Q8,=0xffffffffffffffffff
>> +       LDR             Q9,=0xffffffffffffff000000000000000000
>> +       B               46f
>> +
>> +       // >> 48bit
>> +40:
>> +       EXT             V13.16B, V13.16B, V13.16B, #6
>> +       MOV             V13.S[3], V14.S[0]
>> +       MOV             V13.H[5], V14.H[0]
>> +
>> +       LDR             Q8,=0xffffffffffffffffffff
>> +       LDR             Q9,=0xffffffffffff00000000000000000000
>> +       B               46f
>> +
>> +       // >> 40bit
>> +41:
>> +       EXT             V13.16B, V13.16B, V13.16B, #5
>> +       MOV             V13.S[3], V14.S[0]
>> +       MOV             V13.B[11], V14.B[0]
>> +
>> +       LDR             Q8,=0xffffffffffffffffffffff
>> +       LDR             Q9,=0xffffffffff0000000000000000000000
>> +       B               46f
>> +
>> +       // >> 32bit
>> +42:
>> +       EXT             V13.16B, V13.16B, V13.16B, #4
>> +       MOV             V13.S[3], V14.S[0]
>> +
>> +       LDR             Q8,=0xffffffffffffffffffffffff
>> +       LDR             Q9,=0xffffffff000000000000000000000000
>> +       B               46f
>> +
>> +       // >> 24bit
>> +43:
>> +       EXT             V13.16B, V13.16B, V13.16B, #3
>> +       MOV             V13.H[7], V14.H[0]
>> +       MOV             V13.B[13], V14.B[0]
>> +
>> +       LDR             Q8,=0xffffffffffffffffffffffffff
>> +       LDR             Q9,=0xffffff00000000000000000000000000
>> +       B               46f
>> +
>> +       // >> 16bit
>> +44:
>> +       EXT             V13.16B, V13.16B, V13.16B, #2
>> +       MOV             V13.H[7], V14.H[0]
>> +
>> +       LDR             Q8,=0xffffffffffffffffffffffffffff
>> +       LDR             Q9,=0xffff0000000000000000000000000000
>> +       B               46f
>> +
>> +       // >> 8bit
>> +45:
>> +       EXT              V13.16B, V13.16B, V13.16B, #1
>> +       MOV             V13.B[15], V14.B[0]
>> +
>> +       LDR             Q8,=0xffffffffffffffffffffffffffffff
>> +       LDR             Q9,=0xff000000000000000000000000000000
>> +
>> +       // backup V12 first
>> +       // pblendvb     xmm1, xmm2
>
> Another remnant of the x86 version, please remove
>
>> +46:
>> +       AND             V12.16B, V12.16B, V9.16B
>> +       AND             V11.16B, V11.16B, V8.16B
>> +       ORR             V11.16B, V11.16B, V12.16B
>> +
>> +       MOV             V12.16B, V11.16B
>> +       MOV             V4.16B, V13.16B
>> +       PMULL2          V13.1Q, V13.2D, V6.2D
>> +       PMULL           V4.1Q, V4.1D, V6.1D
>> +       EOR             V13.16B, V13.16B, V4.16B
>> +       EOR             V13.16B, V13.16B, V12.16B
>> +
>> +       /* _128_done. we change the Q6 D[0] and D[1] */
>> +5:     LDR             Q6, =0x2d560000000000001368000000000000
>> +       MOVI            D14, #0
>> +       MOV             V10.16B, V13.16B
>> +       PMULL2          V13.1Q, V13.2D, V6.2D
>> +
>> +       MOV             V10.D[1], V10.D[0]
>> +       MOV             V10.D[0], V14.D[0]    //set zero
>> +
>> +       EOR             V13.16B, V13.16B, V10.16B
>> +
>> +       MOV             V10.16B, V13.16B
>> +       LDR             Q7, =0x00000000FFFFFFFFFFFFFFFFFFFFFFFF
>> +       AND             V10.16B, V10.16B, V7.16B
>> +
>> +       MOV             S13, V13.S[3]
>> +
>> +       PMULL           V13.1Q, V13.1D, V6.1D
>> +       EOR             V13.16B, V13.16B, V10.16B
>> +
>> +       /* _barrett */
>
> What does '_barrett' mean?
>
>> +7:     LDR             Q6, =0x00000001f65a57f8000000018bb70000
>> +       MOVI            D14, #0
>> +       MOV             V10.16B, V13.16B
>> +       PMULL2          V13.1Q, V13.2D, V6.2D
>> +
>> +       EXT             V13.16B, V13.16B, V13.16B, #12
>> +       MOV             V13.S[0], V14.S[0]
>> +
>> +       EXT             V6.16B, V6.16B, V6.16B, #8
>> +       PMULL2          V13.1Q, V13.2D, V6.2D
>> +
>> +       EXT             V13.16B, V13.16B, V13.16B, #12
>> +       MOV             V13.S[0], V14.S[0]
>> +
>> +       EOR             V13.16B, V13.16B, V10.16B
>> +       MOV             X0, V13.D[0]
>> +
>> +       /* _cleanup */
>> +8:     MOV             X14, #48
>> +       LSR             X0, X0, X14
>
> Why the temp x14?
>
>> +99:
>> +       ADD             sp, sp, #0x20
>> +
>> +       LDP             Q18, Q19, [sp], #0x20
>> +       LDP             Q16, Q17, [sp], #0x20
>> +       LDP             Q14, Q15, [sp], #0x20
>> +
>> +       LDP             Q8, Q9, [sp], #0x20
>> +       LDP             Q6, Q7, [sp], #0x20
>> +       LDP             Q4, Q5, [sp], #0x20
>> +       LDP             Q12, Q13, [sp], #0x20
>> +       LDP             Q10, Q11, [sp], #0x20
>> +       LDP             X13, X14, [sp], #0x10
>> +       LDP             X11, X12, [sp], #0x10
>> +       LDP             X9, X10, [sp], #0x10
>> +       LDP             X7, X8, [sp], #0x10
>> +       LDP             X5, X6, [sp], #0x10
>> +
>
> None of these registers need to be restored. The only thing you need
> (to mirror the prologue)
>
> ldp x29, x30, [sp], #xxx
> ret
>
> where xxx is the same value you used at the beginning.
>
>
>> +       RET
>> +
>> +       /* _less_than_128 */
>> +2:     CMP             X2, #32
>> +       B.LT            9f                              // _less_than_32
>> +       LDR             Q6, =0x06df0000000000002d56000000000000
>> +
>> +       LSL             X0, X0, #48
>> +       LDR             Q10, =0x0
>
> Please use movi here
>
>> +       MOV             V10.D[1], X0
>> +       LDR             Q13, [X1], #0x10
>> +       REV64           V13.16B, V13.16B
>> +       EXT             V13.16B, V13.16B, V13.16B, #8
>> +
>> +       EOR             V13.16B, V13.16B, V10.16B
>> +
>> +       SUB             X2, X2, #32
>> +       B               4b
>> +
>> +       /* _less_than_32 */
>> +9:     CMP             X2, #0
>> +       B.EQ            99b                     // _cleanup
>
> You can use CBZ here
>
>> +       LSL             X0, X0, #48
>> +       LDR             Q10,=0x0
>
> Please use movi here
>
>> +       MOV             V10.D[1], X0
>> +
>> +       CMP             X2, #16
>> +       B.EQ            10f                     // _exact_16_left
>> +       B.LE            11f                     // _less_than_16_left
>> +       LDR             Q13, [X1], #0x10
>> +
>> +       REV64           V13.16B, V13.16B
>> +       EXT             V13.16B, V13.16B, V13.16B, #8
>> +
>> +       EOR             V13.16B, V13.16B, V10.16B
>> +       SUB             X2, X2, #16
>> +       LDR             Q6, =0x06df0000000000002d56000000000000
>> +       B               6b                      // _get_last_two_xmms
>
> Another bogus comment
>
>> +
>> +       /*  _less_than_16_left */
>> +11:    CMP             X2, #4
>> +       B.LT            13f                     // _only_less_than_4
>> +
>> +       /* backup the length of data, we used in _less_than_2_left */
>> +       MOV             X8, X2
>> +       CMP             X2, #8
>> +       B.LT            14f                     // _less_than_8_left
>> +
>> +       LDR             X14, [X1], #8
>> +       /* push the data to stack, we backup the data to V10 */
>> +       STR             X14, [sp, #0]
>> +       SUB             X2, X2, #8
>> +       ADD             X11, X11, #8
>> +
>> +       /* _less_than_8_left */
>> +14:    CMP             X2, #4
>> +       B.LT            15f                     // _less_than_4_left
>> +
>> +       /* get 32bit data */
>> +       LDR             W5, [X1], #4
>> +
>> +       /* push the data to stack */
>> +       STR             W5, [sp, X11]
>> +       SUB             X2, X2, #4
>> +       ADD             X11, X11, #4
>> +
>> +       /* _less_than_4_left */
>> +15:    CMP             X2, #2
>> +       B.LT            16f                     // _less_than_2_left
>> +
>> +       /* get 16bits data */
>> +       LDRH            W6, [X1], #2
>> +
>> +       /* push the data to stack */
>> +       STRH            W6, [sp, X11]
>> +       SUB             X2, X2, #2
>> +       ADD             X11, X11, #2
>> +
>> +       /* _less_than_2_left */
>> +16:
>> +       /* get 8bits data */
>> +       LDRB            W7, [X1], #1
>> +       STRB            W7, [sp, X11]
>> +       ADD             X11, X11, #1
>> +
>> +       /* POP data from stack, store to V13 */
>> +       LDR             Q13, [sp]
>> +       MOVI            D14, #0
>> +       REV64           V13.16B, V13.16B
>> +       MOV             V8.16B, V13.16B
>> +       MOV             V13.D[1], V8.D[0]
>> +       MOV             V13.D[0], V8.D[1]
>> +
>> +       EOR             V13.16B, V13.16B, V10.16B
>> +       CMP             X8, #15
>> +       B.EQ    80f
>> +       CMP             X8, #14
>> +       B.EQ    81f
>> +       CMP             X8, #13
>> +       B.EQ    82f
>> +       CMP             X8, #12
>> +       B.EQ    83f
>> +       CMP             X8, #11
>> +       B.EQ    84f
>> +       CMP             X8, #10
>> +       B.EQ    85f
>> +       CMP             X8, #9
>> +       B.EQ    86f
>> +       CMP             X8, #8
>> +       B.EQ    87f
>> +       CMP             X8, #7
>> +       B.EQ    88f
>> +       CMP             X8, #6
>> +       B.EQ    89f
>> +       CMP             X8, #5
>> +       B.EQ    90f
>> +       CMP             X8, #4
>> +       B.EQ    91f
>> +       CMP             X8, #3
>> +       B.EQ    92f
>> +       CMP             X8, #2
>> +       B.EQ    93f
>> +       CMP             X8, #1
>> +       B.EQ    94f
>> +       CMP             X8, #0
>> +       B.EQ    95f
>> +
>
> Again, please use a computed goto instead
>
>> +80:
>> +       EXT             V13.16B, V13.16B, V13.16B, #1
>> +       MOV             V13.B[15], V14.B[0]
>> +       B               5b
>> +
>> +81:
>> +       EXT             V13.16B, V13.16B, V13.16B, #2
>> +       MOV              V13.H[7], V14.H[0]
>> +       B               5b
>> +
>> +82:
>> +       EXT             V13.16B, V13.16B, V13.16B, #3
>> +       MOV             V13.H[7], V14.H[0]
>> +       MOV             V13.B[13], V14.B[0]
>> +       B               5b
>> +83:
>> +
>> +       EXT             V13.16B, V13.16B, V13.16B, #4
>> +       MOV             V13.S[3], V14.S[0]
>> +       B               5b
>> +
>> +84:
>> +       EXT             V13.16B, V13.16B, V13.16B, #5
>> +       MOV             V13.S[3], V14.S[0]
>> +       MOV             V13.B[11], V14.B[0]
>> +       B               5b
>> +
>> +85:
>> +       EXT             V13.16B, V13.16B, V13.16B, #6
>> +       MOV             V13.S[3], V14.S[0]
>> +       MOV             V13.H[5], V14.H[0]
>> +       B               5b
>> +
>> +86:
>> +       EXT             V13.16B, V13.16B, V13.16B, #7
>> +       MOV             V13.S[3], V14.S[0]
>> +       MOV             V13.H[5], V14.H[0]
>> +       MOV             V13.B[9], V14.B[0]
>> +       B               5b
>> +
>> +87:
>> +       MOV             V13.D[0], V13.D[1]
>> +       MOV             V13.D[1], V14.D[0]
>> +       B               5b
>> +
>> +88:
>> +       EXT             V13.16B, V13.16B, V13.16B, #9
>> +       MOV             V13.D[1], V14.D[0]
>> +       MOV             V13.B[7], V14.B[0]
>> +       B               5b
>> +
>> +89:
>> +       EXT             V13.16B, V13.16B, V13.16B, #10
>> +       MOV             V13.D[1], V14.D[0]
>> +       MOV              V13.H[3], V14.H[0]
>> +       B               5b
>> +
>> +90:
>> +       EXT             V13.16B, V13.16B, V13.16B, #11
>> +       MOV             V13.D[1], V14.D[0]
>> +       MOV             V13.H[3], V14.H[0]
>> +       MOV             V13.B[5], V14.B[0]
>> +       B               5b
>> +
>> +91:
>> +       MOV             V13.S[0], V13.S[3]
>> +       MOV             V13.D[1], V14.D[0]
>> +       MOV             V13.S[1], V14.S[0]
>> +       B               5b
>> +
>> +92:
>> +       EXT             V13.16B, V13.16B, V13.16B, #13
>> +       MOV             V13.D[1], V14.D[0]
>> +       MOV             V13.S[1], V14.S[0]
>> +       MOV             V13.B[3], V14.B[0]
>> +       B               5b
>> +
>> +93:
>> +       MOV             V15.H[0], V13.H[7]
>> +       MOV             V13.16B, V14.16B
>> +       MOV             V13.H[0], V15.H[0]
>> +       B               5b
>> +
>> +94:
>> +       MOV             V15.B[0], V13.B[15]
>> +       MOV             V13.16B, V14.16B
>> +       MOV             V13.B[0], V15.B[0]
>> +       B               5b
>> +
>> +95:
>> +       LDR             Q13,=0x0
>
> movi
>
>> +       B               5b                              // _128_done
>> +
>> +       /* _exact_16_left */
>> +10:
>> +       LD1             { V13.2D }, [X1], #0x10
>> +
>> +       REV64           V13.16B, V13.16B
>> +       EXT             V13.16B, V13.16B, V13.16B, #8
>> +       EOR             V13.16B, V13.16B, V10.16B
>> +       B               5b                              // _128_done
>> +
>> +       /* _only_less_than_4 */
>> +13:    CMP             X2, #3
>> +       MOVI            D14, #0
>> +       B.LT            17f                             //_only_less_than_3
>> +
>> +       LDR             S13, [X1], #4
>> +       MOV              V13.B[15], V13.B[0]
>> +       MOV             V13.B[14], V13.B[1]
>> +       MOV             V13.B[13], V13.B[2]
>> +       MOV             V13.S[0], V13.S[1]
>> +
>> +       EOR             V13.16B, V13.16B, V10.16B
>> +
>> +       EXT             V13.16B, V13.16B, V13.16B, #5
>> +
>> +       MOV             V13.S[3], V14.S[0]
>> +       MOV             V13.B[11], V14.B[0]
>> +
>> +       B               7b                              // _barrett
>> +       /* _only_less_than_3 */
>> +17:
>> +       CMP             X2, #2
>> +       B.LT            18f                             // _only_less_than_2
>> +
>> +       LDR             H13, [X1], #2
>> +       MOV             V13.B[15], V13.B[0]
>> +       MOV             V13.B[14], V13.B[1]
>> +       MOV             V13.H[0], V13.H[1]
>> +
>> +       EOR             V13.16B, V13.16B, V10.16B
>> +
>> +       EXT             V13.16B, V13.16B, V13.16B, #6
>> +       MOV             V13.S[3], V14.S[0]
>> +       MOV             V13.H[5], V14.H[0]
>> +
>> +       B               7b                              // _barrett
>> +
>> +       /* _only_less_than_2 */
>> +18:
>> +       LDRB            W7, [X1], #1
>> +       LDR             Q13, = 0x0
>> +       MOV             V13.B[15], W7
>> +
>> +       EOR             V13.16B, V13.16B, V10.16B
>> +
>> +       EXT             V13.16B, V13.16B, V13.16B, #7
>> +       MOV             V13.S[3], V14.S[0]
>> +       MOV             V13.H[5], V14.H[0]
>> +       MOV             V13.B[9], V14.B[0]
>> +
>> +       B               7b                              // _barrett
>
> Please end with ENDPROC()
>
>> diff --git a/arch/arm64/crypto/crct10dif-neon_glue.c b/arch/arm64/crypto/crct10dif-neon_glue.c
>> new file mode 100644
>> index 0000000..a6d8c5c
>> --- /dev/null
>> +++ b/arch/arm64/crypto/crct10dif-neon_glue.c
>> @@ -0,0 +1,115 @@
>> +/*
>> + * Copyright (c) 2016-2017 Hisilicon Limited.
>> + *
>> + * This program is free software; you can redistribute it and/or modify
>> + * it under the terms of the GNU General Public License as published by
>> + * the Free Software Foundation; either version 2 of the License, or
>> + * (at your option) any later version.
>> + */
>> +
>> +
>> +#include <linux/types.h>
>> +#include <linux/module.h>
>> +#include <linux/crc-t10dif.h>
>> +#include <crypto/internal/hash.h>
>> +#include <linux/init.h>
>> +#include <linux/string.h>
>> +#include <linux/kernel.h>
>> +
>> +asmlinkage __u16 crc_t10dif_neon(__u16 crc, const unsigned char *buf,
>> +                               size_t len);
>> +
>> +struct chksum_desc_ctx {
>> +       __u16 crc;
>> +};
>> +
>> +/*
>> + * Steps through buffer one byte at at time, calculates reflected
>> + * crc using table.
>> + */
>> +
>
> Where is the table?
>
>> +static int chksum_init(struct shash_desc *desc)
>> +{
>> +       struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
>> +
>> +       ctx->crc = 0;
>> +
>> +       return 0;
>> +}
>> +
>> +static int chksum_update(struct shash_desc *desc, const u8 *data,
>> +                        unsigned int length)
>> +{
>> +       struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
>> +
>> +       ctx->crc = crc_t10dif_neon(ctx->crc, data, length);
>
> You need kernel_neon_begin/kernel_neon_end here
>
>> +       return 0;
>> +}
>> +
>> +static int chksum_final(struct shash_desc *desc, u8 *out)
>> +{
>> +       struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
>> +
>> +       *(__u16 *)out = ctx->crc;
>> +       return 0;
>> +}
>> +
>> +static int __chksum_finup(__u16 *crcp, const u8 *data, unsigned int len,
>> +                       u8 *out)
>> +{
>> +       *(__u16 *)out = crc_t10dif_neon(*crcp, data, len);
>
> ... and here
>
>> +       return 0;
>> +}
>> +
>> +static int chksum_finup(struct shash_desc *desc, const u8 *data,
>> +                       unsigned int len, u8 *out)
>> +{
>> +       struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
>> +
>> +       return __chksum_finup(&ctx->crc, data, len, out);
>> +}
>> +
>> +static int chksum_digest(struct shash_desc *desc, const u8 *data,
>> +                        unsigned int length, u8 *out)
>> +{
>> +       struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
>> +
>> +       return __chksum_finup(&ctx->crc, data, length, out);
>> +}
>> +
>> +static struct shash_alg alg = {
>> +       .digestsize             =       CRC_T10DIF_DIGEST_SIZE,
>> +       .init           =       chksum_init,
>> +       .update         =       chksum_update,
>> +       .final          =       chksum_final,
>> +       .finup          =       chksum_finup,
>> +       .digest         =       chksum_digest,
>> +       .descsize               =       sizeof(struct chksum_desc_ctx),
>> +       .base                   =       {
>> +               .cra_name               =       "crct10dif",
>> +               .cra_driver_name        =       "crct10dif-neon",
>> +               .cra_priority           =       200,
>> +               .cra_blocksize          =       CRC_T10DIF_BLOCK_SIZE,
>> +               .cra_module             =       THIS_MODULE,
>
>
> Please align the = characters here, and add only a single space after
>
> Note that you can do
>
> .base.cra_name = xxx,
> .base.xxx
>
> as well.
>
>> +       }
>> +};
>> +
>> +static int __init crct10dif_arm64_mod_init(void)
>> +{
>> +       return crypto_register_shash(&alg);
>
> You need to check here if your CPU has support for the 64x64->128
> PMULL instruction.
>
>> +}
>> +
>> +static void __exit crct10dif_arm64_mod_fini(void)
>> +{
>> +       crypto_unregister_shash(&alg);
>> +}
>> +
>> +module_init(crct10dif_arm64_mod_init);
>> +module_exit(crct10dif_arm64_mod_fini);
>> +
>> +MODULE_AUTHOR("YueHaibing <yuehaibing@huawei.com>");
>> +MODULE_DESCRIPTION("T10 DIF CRC calculation accelerated with ARM64 NEON instruction.");
>> +MODULE_LICENSE("GPL");
>> +
>> +MODULE_ALIAS_CRYPTO("crct10dif");
>> +MODULE_ALIAS_CRYPTO("crct10dif-neon");
>> --
>> 2.7.0
>>
>>
>>
>> _______________________________________________
>> linux-arm-kernel mailing list
>> linux-arm-kernel@lists.infradead.org
>> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply

* Re: [PATCH v3] arm64/crypto: Accelerated CRC T10 DIF computation
From: Ard Biesheuvel @ 2016-11-22 12:53 UTC (permalink / raw)
  To: YueHaibing
  Cc: Herbert Xu, David S. Miller, Catalin Marinas, Will Deacon,
	linux-kernel@vger.kernel.org, Hanjun Guo, dingtinahong,
	yangshengkai, linux-arm-kernel@lists.infradead.org,
	linux-crypto@vger.kernel.org
In-Reply-To: <20161122101455.5312-1-yuehaibing@huawei.com>

On 22 November 2016 at 10:14, YueHaibing <yuehaibing@huawei.com> wrote:
> This is the ARM64 CRC T10 DIF transform accelerated with the ARMv8
> NEON instruction.The config CRYPTO_CRCT10DIF_NEON should be turned
> on to enable the feature.The crc_t10dif crypto library function will
> use this faster algorithm when crct10dif_neon module is loaded.
>

What is this algorithm commonly used for? In other words, why is it a
good idea to add support for this algorithm to the kernel?

> Tcrypt benchmark results:
>
> HIP06  (mode=320 sec=2)
>
> The ratio of bytes/sec crct10dif-neon Vs. crct10dif-generic:
>
>                         TEST                              neon          generic         ratio
>   16 byte blocks,   16 bytes per update,   1 updates    214506112       171095400       1.25
>   64 byte blocks,   16 bytes per update,   4 updates    139385312       119036352       1.17
>   64 byte blocks,   64 bytes per update,   1 updates    671523712       198945344       3.38
>  256 byte blocks,   16 bytes per update,  16 updates    157674880       125146752       1.26
>  256 byte blocks,   64 bytes per update,   4 updates    491888128       175764096       2.80
>  256 byte blocks,  256 bytes per update,   1 updates    2123298176      206995200       10.26
> 1024 byte blocks,   16 bytes per update,  64 updates    161243136       126460416       1.28
> 1024 byte blocks,  256 bytes per update,   4 updates    1643020800      200027136       8.21
> 1024 byte blocks, 1024 bytes per update,   1 updates    4238239232      209106432       20.27
> 2048 byte blocks,   16 bytes per update, 128 updates    162079744       126953472       1.28
> 2048 byte blocks,  256 bytes per update,   8 updates    1693587456      200867840       8.43
> 2048 byte blocks, 1024 bytes per update,   2 updates    3424323584      206330880       16.60
> 2048 byte blocks, 2048 bytes per update,   1 updates    5228207104      208620544       25.06
> 4096 byte blocks,   16 bytes per update, 256 updates    162304000       126894080       1.28
> 4096 byte blocks,  256 bytes per update,  16 updates    1731862528      201197568       8.61
> 4096 byte blocks, 1024 bytes per update,   4 updates    3668625408      207003648       17.72
> 4096 byte blocks, 4096 bytes per update,   1 updates    5551239168      209127424       26.54
> 8192 byte blocks,   16 bytes per update, 512 updates    162779136       126984192       1.28
> 8192 byte blocks,  256 bytes per update,  32 updates    1753702400      201420800       8.71
> 8192 byte blocks, 1024 bytes per update,   8 updates    3760918528      207351808       18.14
> 8192 byte blocks, 4096 bytes per update,   2 updates    5483655168      208928768       26.25
> 8192 byte blocks, 8192 bytes per update,   1 updates    5623377920      209108992       26.89
>
> Signed-off-by: YueHaibing <yuehaibing@huawei.com>
> Signed-off-by: YangShengkai <yangshengkai@huawei.com>
> Signed-off-by: Ding Tianhong <dingtianhong@huawei.com>
> Signed-off-by: Hanjun Guo <hanjun.guo@linaro.org>
>
> ---
>  arch/arm64/crypto/Kconfig                 |   5 +
>  arch/arm64/crypto/Makefile                |   4 +
>  arch/arm64/crypto/crct10dif-neon-asm_64.S | 751 ++++++++++++++++++++++++++++++
>  arch/arm64/crypto/crct10dif-neon_glue.c   | 115 +++++
>  4 files changed, 875 insertions(+)
>  create mode 100644 arch/arm64/crypto/crct10dif-neon-asm_64.S
>  create mode 100644 arch/arm64/crypto/crct10dif-neon_glue.c
>
> diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
> index 2cf32e9..2e450bf 100644
> --- a/arch/arm64/crypto/Kconfig
> +++ b/arch/arm64/crypto/Kconfig
> @@ -23,6 +23,11 @@ config CRYPTO_GHASH_ARM64_CE
>         depends on ARM64 && KERNEL_MODE_NEON
>         select CRYPTO_HASH
>
> +config CRYPTO_CRCT10DIF_NEON
> +       tristate "CRCT10DIF hardware acceleration using NEON instructions"
> +       depends on ARM64 && KERNEL_MODE_NEON
> +       select CRYPTO_HASH
> +

Could you please follow the existing pattern:

config CRYPTO_CRCT10DIF_ARM64_NEON


>  config CRYPTO_AES_ARM64_CE
>         tristate "AES core cipher using ARMv8 Crypto Extensions"
>         depends on ARM64 && KERNEL_MODE_NEON
> diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile
> index abb79b3..6c9ff2c 100644
> --- a/arch/arm64/crypto/Makefile
> +++ b/arch/arm64/crypto/Makefile
> @@ -29,6 +29,10 @@ aes-ce-blk-y := aes-glue-ce.o aes-ce.o
>  obj-$(CONFIG_CRYPTO_AES_ARM64_NEON_BLK) += aes-neon-blk.o
>  aes-neon-blk-y := aes-glue-neon.o aes-neon.o
>
> +obj-$(CONFIG_CRYPTO_CRCT10DIF_NEON) += crct10dif-neon.o
> +crct10dif-neon-y := crct10dif-neon-asm_64.o crct10dif-neon_glue.o
> +AFLAGS_crct10dif-neon-asm_64.o := -march=armv8-a+crypto
> +

Please drop this line, and add

.cpu generic+crypto

to the .S file

>  AFLAGS_aes-ce.o                := -DINTERLEAVE=4
>  AFLAGS_aes-neon.o      := -DINTERLEAVE=4
>
> diff --git a/arch/arm64/crypto/crct10dif-neon-asm_64.S b/arch/arm64/crypto/crct10dif-neon-asm_64.S
> new file mode 100644
> index 0000000..2ae3033
> --- /dev/null
> +++ b/arch/arm64/crypto/crct10dif-neon-asm_64.S
> @@ -0,0 +1,751 @@
> +/*
> + * Copyright (c) 2016-2017 Hisilicon Limited.
> + *

Please drop the 2017 here.

> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + */
> +
> +#include <linux/linkage.h>
> +#include <asm/assembler.h>
> +
> +.global crc_t10dif_neon

Please drop this .global, and use ENTRY() below

> +.text
> +
> +/* X0 is initial CRC value
> + * X1 is data buffer
> + * X2 is the length of buffer
> + * X3 is the backup buffer(for extend)
> + * X4 for other extend parameter(for extend)
> + * Q0, Q1, Q2, Q3 maybe as parameter for other functions,
> + * the value of Q0, Q1, Q2, Q3 maybe modified.
> + *
> + * suggestion:
> + * 1. dont use general purpose register for calculation
> + * 2. set data endianness outside of the kernel
> + * 3. use ext as shifting around
> + * 4. dont use LD3/LD4, ST3/ST4
> + */
> +


Whose suggestions are these, and why? I do appreciate comments like
this, but only if I can learn something from it

> +crc_t10dif_neon:

ENTRY()

> +       /* push the register to stack that CRC16 will use */
> +       STP             X5, X6, [sp, #-0x10]!

Please use an ordinary stack frame, i.e.,

stp   x29, x30, [sp, #-xxx]!
mov x29, sp

where xxx is the entire allocation you need for stacking callee save registers

> +       STP             X7, X8, [sp, #-0x10]!
> +       STP             X9, X10, [sp, #-0x10]!
> +       STP             X11, X12, [sp, #-0x10]!
> +       STP             X13, X14, [sp, #-0x10]!

These are not callee save, so no need to stack them

> +       STP             Q10, Q11, [sp, #-0x20]!
> +       STP             Q12, Q13, [sp, #-0x20]!
> +       STP             Q4, Q5, [sp, #-0x20]!
> +       STP             Q6, Q7, [sp, #-0x20]!
> +       STP             Q8, Q9, [sp, #-0x20]!
> +       STP             Q14, Q15, [sp, #-0x20]!
> +       STP             Q16, Q17, [sp, #-0x20]!
> +       STP             Q18, Q19, [sp, #-0x20]!
> +

What is the point of stacking the NEON registers? Also, as a general
note, could you switch to lower case throughout the file?


> +       SUB             sp,sp,#0x20
> +

Please account for locals in the allocation above. Only outgoing
arguments should be allocated below the frame pointer


> +       MOV             X11, #0         // PUSH STACK FLAG
> +

What does this comment mean?

> +       CMP             X2, #0x80
> +       B.LT            2f              // _less_than_128, <128
> +

Redundant comment

> +       /* V10/V11/V12/V13      is 128bit.
> +        * we get data 512bit( by cacheline ) each time
> +        */
> +       LDP             Q10, Q11, [X1], #0x20
> +       LDP             Q12, Q13, [X1], #0x20
> +
> +       /* move the initial value to V6 register */
> +       LSL             X0, X0, #48
> +       EOR             V6.16B, V6.16B, V6.16B
> +       MOV             V6.D[1], X0
> +
> +       /* big-little end change. because the data in memory is little-end,
> +        * we deal the data for bigend
> +        */
> +

What if I am using a big endian kernel? Hint: you probably need to
wrap these in CPU_LE()

> +       REV64           V10.16B, V10.16B
> +       REV64           V11.16B, V11.16B
> +       REV64           V12.16B, V12.16B
> +       REV64           V13.16B, V13.16B
> +       EXT             V10.16B, V10.16B, V10.16B, #8
> +       EXT             V11.16B, V11.16B, V11.16B, #8
> +       EXT             V12.16B, V12.16B, V12.16B, #8
> +       EXT             V13.16B, V13.16B, V13.16B, #8
> +
> +       EOR             V10.16B, V10.16B, V6.16B
> +
> +       SUB             X2, X2, #0x80
> +       ADD             X5, X1, #0x20
> +
> +       /* deal data when the size of buffer bigger than 128 bytes */
> +       /* _fold_64_B_loop */
> +       LDR             Q6,=0xe658000000000000044c000000000000

Could you move all these non-trivial constants to a separate location
(after the end of the function), and name them?

> +1:
> +
> +       LDP             Q16, Q17, [X1] ,#0x40
> +       LDP             Q18, Q19, [X5], #0x40
> +
> +       /* carry-less multiply.
> +        * V10 high-64bits carry-less multiply
> +        * V6 high-64bits(PMULL2)
> +        * V11 low-64bits carry-less multiply V6 low-64bits(PMULL)
> +        */
> +
> +       PMULL2          V4.1Q, V10.2D, V6.2D
> +       PMULL           V10.1Q, V10.1D, V6.1D
> +       PMULL2          V5.1Q, V11.2D, V6.2D
> +       PMULL           V11.1Q, V11.1D, V6.1D
> +

These instructions are only available if you have the PMULL extension,
so this algorithm is not plain NEON.

> +       REV64           V16.16B, V16.16B
> +       REV64           V17.16B, V17.16B
> +       REV64           V18.16B, V18.16B
> +       REV64           V19.16B, V19.16B
> +

Endian swap on LE only?

> +       PMULL2          V14.1Q, V12.2D, V6.2D
> +       PMULL           V12.1Q, V12.1D, V6.1D
> +       PMULL2          V15.1Q, V13.2D, V6.2D
> +       PMULL           V13.1Q, V13.1D, V6.1D
> +
> +       EXT             V16.16B, V16.16B, V16.16B, #8
> +       EOR             V10.16B, V10.16B, V4.16B
> +
> +       EXT             V17.16B, V17.16B, V17.16B, #8
> +       EOR             V11.16B, V11.16B, V5.16B
> +
> +       EXT             V18.16B, V18.16B, V18.16B, #8
> +       EOR             V12.16B, V12.16B, V14.16B
> +
> +       EXT             V19.16B, V19.16B, V19.16B, #8
> +       EOR             V13.16B, V13.16B, V15.16B
> +
> +       SUB             X2, X2, #0x40
> +
> +
> +       EOR             V10.16B, V10.16B, V16.16B
> +       EOR             V11.16B, V11.16B, V17.16B
> +
> +       EOR             V12.16B, V12.16B, V18.16B
> +       EOR             V13.16B, V13.16B, V19.16B
> +
> +       CMP             X2, #0x0
> +       B.GE            1b                      // >=0
> +
> +       LDR             Q6, =0x06df0000000000002d56000000000000
> +       MOV             V4.16B, V10.16B
> +       /* V10 carry-less 0x06df000000000000([127:64]*[127:64]) */
> +       PMULL           V4.1Q, V4.1D, V6.1D     //switch PMULL & PMULL2 order
> +       PMULL2          V10.1Q, V10.2D, V6.2D
> +       EOR             V11.16B, V11.16B, V4.16B
> +       EOR             V11.16B, V11.16B, V10.16B
> +
> +       MOV             V4.16B, V11.16B
> +       PMULL           V4.1Q, V4.1D, V6.1D     //switch PMULL & PMULL2 order
> +       PMULL2          V11.1Q, V11.2D, V6.2D
> +       EOR             V12.16B, V12.16B, V4.16B
> +       EOR             V12.16B, V12.16B, V11.16B
> +
> +       MOV             V4.16B, V12.16B
> +       PMULL           V4.1Q, V4.1D, V6.1D     //switch PMULL & PMULL2 order
> +       PMULL2          V12.1Q, V12.2D, V6.2D
> +       EOR             V13.16B, V13.16B, V4.16B
> +       EOR             V13.16B, V13.16B, V12.16B
> +
> +       ADD             X2, X2, #48
> +       CMP             X2, #0x0
> +       B.LT            3f                      // _final_reduction_for_128, <0
> +
> +       /* _16B_reduction_loop */
> +4:
> +       /* unrelated load as early as possible*/
> +       LDR             Q10, [X1], #0x10
> +
> +       MOV             V4.16B, V13.16B
> +       PMULL2          V13.1Q, V13.2D, V6.2D
> +       PMULL           V4.1Q, V4.1D, V6.1D
> +       EOR             V13.16B, V13.16B, V4.16B
> +
> +       REV64           V10.16B, V10.16B
> +       EXT             V10.16B, V10.16B, V10.16B, #8
> +
> +       EOR             V13.16B, V13.16B, V10.16B
> +
> +       SUB             X2, X2, #0x10
> +       CMP             X2, #0x0
> +       B.GE            4b                      // _16B_reduction_loop, >=0
> +
> +       /*  _final_reduction_for_128 */
> +3:     ADD             X2, X2, #0x10
> +       CMP             X2, #0x0
> +       B.EQ            5f                      // _128_done, ==0
> +
> +       /* _get_last_two_xmms */

Bogus comment. I guess you ported this code from x86, are you sure you
don't need to credit the original author?

> +6:     MOV             V12.16B, V13.16B
> +       SUB             X1, X1, #0x10
> +       ADD             X1, X1, X2
> +       LDR             Q11, [X1], #0x10
> +       REV64           V11.16B, V11.16B
> +       EXT             V11.16B, V11.16B, V11.16B, #8
> +
> +       CMP             X2, #8
> +       B.EQ            50f
> +       B.LT            51f
> +       B.GT            52f
> +
> +50:
> +       /* dont use X register as temp one */
> +       FMOV            D14, D12
> +       MOVI            D12, #0
> +       MOV             V12.D[1],V14.D[0]
> +       B               53f
> +51:
> +       MOV             X9, #64
> +       LSL             X13, X2, #3             // <<3 equal x8
> +       SUB             X9, X9, X13
> +       MOV             X5, V12.D[0]            // low 64-bit
> +       MOV             X6, V12.D[1]            // high 64-bit
> +       LSR             X10, X5, X9             // high bit of low 64-bit
> +       LSL             X7, X5, X13
> +       LSL             X8, X6, X13
> +       ORR             X8, X8, X10             // combination of high 64-bit
> +       MOV             V12.D[1], X8
> +       MOV             V12.D[0], X7
> +
> +       B               53f
> +52:
> +       LSL             X13, X2, #3             // <<3 equal x8
> +       SUB             X13, X13, #64
> +
> +       DUP             V18.2D, X13
> +       FMOV            D16, D12
> +       USHL            D16, D16, D18
> +       EXT             V12.16B, V16.16B, V16.16B, #8
> +
> +53:
> +       MOVI            D14, #0                 //add one zero constant
> +
> +       CMP             X2, #0
> +       B.EQ    30f
> +       CMP             X2, #1
> +       B.EQ    31f
> +       CMP             X2, #2
> +       B.EQ    32f
> +       CMP             X2, #3
> +       B.EQ    33f
> +       CMP             X2, #4
> +       B.EQ    34f
> +       CMP             X2, #5
> +       B.EQ    35f
> +       CMP             X2, #6
> +       B.EQ    36f
> +       CMP             X2, #7
> +       B.EQ    37f
> +       CMP             X2, #8
> +       B.EQ    38f
> +       CMP             X2, #9
> +       B.EQ    39f
> +       CMP             X2, #10
> +       B.EQ    40f
> +       CMP             X2, #11
> +       B.EQ    41f
> +       CMP             X2, #12
> +       B.EQ    42f
> +       CMP             X2, #13
> +       B.EQ    43f
> +       CMP             X2, #14
> +       B.EQ    44f
> +       CMP             X2, #15
> +       B.EQ    45f
> +

This looks awful. If you make the snippets below a fixed size, you
could use a computed goto instead

> +       // >> 128bit
> +30:
> +       EOR             V13.16B, V13.16B, V13.16B
> +       EOR             V8.16B, V8.16B, V8.16B
> +       LDR             Q9,=0xffffffffffffffffffffffffffffffff

Shouldn't you initialize q8 here as well. And in general, couldn't you
use some kind of shift to generate these constants (in all cases
below)?
> +       B               46f
> +
> +       // >> 120bit
> +31:
> +       USHR            V13.2D, V13.2D, #56
> +       EXT             V13.16B, V13.16B, V14.16B, #8
> +       LDR             Q8,=0xff
> +       LDR             Q9,=0xffffffffffffffffffffffffffffff00
> +       B               46f
> +
> +       // >> 112bit
> +32:
> +       USHR             V13.2D, V13.2D, #48
> +       EXT             V13.16B, V13.16B, V14.16B, #8
> +       LDR             Q8,=0xffff
> +       LDR             Q9,=0xffffffffffffffffffffffffffff0000
> +       B               46f
> +
> +       // >> 104bit
> +33:
> +       USHR            V13.2D, V13.2D, #40
> +       EXT             V13.16B, V13.16B, V14.16B, #8
> +       LDR             Q8,=0xffffff
> +       LDR             Q9,=0xffffffffffffffffffffffffff000000
> +       B               46f
> +
> +       // >> 96bit
> +34:
> +       USHR            V13.2D, V13.2D, #32
> +       EXT             V13.16B, V13.16B, V14.16B, #8
> +       LDR             Q8,=0xffffffff
> +       LDR             Q9,=0xffffffffffffffffffffffff00000000
> +       B               46f
> +
> +       // >> 88bit
> +35:
> +       USHR            V13.2D, V13.2D, #24
> +       EXT             V13.16B, V13.16B, V14.16B, #8
> +       LDR             Q8,=0xffffffffff
> +       LDR             Q9,=0xffffffffffffffffffffff0000000000
> +       B               46f
> +
> +       // >> 80bit
> +36:
> +       USHR            V13.2D, V13.2D, #16
> +       EXT             V13.16B, V13.16B, V14.16B, #8
> +       LDR             Q8,=0xffffffffffff
> +       LDR             Q9,=0xffffffffffffffffffff000000000000
> +       B               46f
> +
> +       // >> 72bit
> +37:
> +       USHR            V13.2D, V13.2D, #8
> +       EXT             V13.16B, V13.16B, V14.16B, #8
> +       LDR             Q8,=0xffffffffffffff
> +       LDR             Q9,=0xffffffffffffffffff00000000000000
> +       B               46f
> +
> +       // >> 64bit
> +38:
> +       EXT              V13.16B, V13.16B, V14.16B, #8
> +       LDR             Q8,=0xffffffffffffffff
> +       LDR             Q9,=0xffffffffffffffff0000000000000000
> +       B               46f
> +
> +       // >> 56bit
> +39:
> +       EXT             V13.16B, V13.16B, V13.16B, #7
> +       MOV             V13.S[3], V14.S[0]
> +       MOV             V13.H[5], V14.H[0]
> +       MOV             V13.B[9], V14.B[0]
> +
> +       LDR             Q8,=0xffffffffffffffffff
> +       LDR             Q9,=0xffffffffffffff000000000000000000
> +       B               46f
> +
> +       // >> 48bit
> +40:
> +       EXT             V13.16B, V13.16B, V13.16B, #6
> +       MOV             V13.S[3], V14.S[0]
> +       MOV             V13.H[5], V14.H[0]
> +
> +       LDR             Q8,=0xffffffffffffffffffff
> +       LDR             Q9,=0xffffffffffff00000000000000000000
> +       B               46f
> +
> +       // >> 40bit
> +41:
> +       EXT             V13.16B, V13.16B, V13.16B, #5
> +       MOV             V13.S[3], V14.S[0]
> +       MOV             V13.B[11], V14.B[0]
> +
> +       LDR             Q8,=0xffffffffffffffffffffff
> +       LDR             Q9,=0xffffffffff0000000000000000000000
> +       B               46f
> +
> +       // >> 32bit
> +42:
> +       EXT             V13.16B, V13.16B, V13.16B, #4
> +       MOV             V13.S[3], V14.S[0]
> +
> +       LDR             Q8,=0xffffffffffffffffffffffff
> +       LDR             Q9,=0xffffffff000000000000000000000000
> +       B               46f
> +
> +       // >> 24bit
> +43:
> +       EXT             V13.16B, V13.16B, V13.16B, #3
> +       MOV             V13.H[7], V14.H[0]
> +       MOV             V13.B[13], V14.B[0]
> +
> +       LDR             Q8,=0xffffffffffffffffffffffffff
> +       LDR             Q9,=0xffffff00000000000000000000000000
> +       B               46f
> +
> +       // >> 16bit
> +44:
> +       EXT             V13.16B, V13.16B, V13.16B, #2
> +       MOV             V13.H[7], V14.H[0]
> +
> +       LDR             Q8,=0xffffffffffffffffffffffffffff
> +       LDR             Q9,=0xffff0000000000000000000000000000
> +       B               46f
> +
> +       // >> 8bit
> +45:
> +       EXT              V13.16B, V13.16B, V13.16B, #1
> +       MOV             V13.B[15], V14.B[0]
> +
> +       LDR             Q8,=0xffffffffffffffffffffffffffffff
> +       LDR             Q9,=0xff000000000000000000000000000000
> +
> +       // backup V12 first
> +       // pblendvb     xmm1, xmm2

Another remnant of the x86 version, please remove

> +46:
> +       AND             V12.16B, V12.16B, V9.16B
> +       AND             V11.16B, V11.16B, V8.16B
> +       ORR             V11.16B, V11.16B, V12.16B
> +
> +       MOV             V12.16B, V11.16B
> +       MOV             V4.16B, V13.16B
> +       PMULL2          V13.1Q, V13.2D, V6.2D
> +       PMULL           V4.1Q, V4.1D, V6.1D
> +       EOR             V13.16B, V13.16B, V4.16B
> +       EOR             V13.16B, V13.16B, V12.16B
> +
> +       /* _128_done. we change the Q6 D[0] and D[1] */
> +5:     LDR             Q6, =0x2d560000000000001368000000000000
> +       MOVI            D14, #0
> +       MOV             V10.16B, V13.16B
> +       PMULL2          V13.1Q, V13.2D, V6.2D
> +
> +       MOV             V10.D[1], V10.D[0]
> +       MOV             V10.D[0], V14.D[0]    //set zero
> +
> +       EOR             V13.16B, V13.16B, V10.16B
> +
> +       MOV             V10.16B, V13.16B
> +       LDR             Q7, =0x00000000FFFFFFFFFFFFFFFFFFFFFFFF
> +       AND             V10.16B, V10.16B, V7.16B
> +
> +       MOV             S13, V13.S[3]
> +
> +       PMULL           V13.1Q, V13.1D, V6.1D
> +       EOR             V13.16B, V13.16B, V10.16B
> +
> +       /* _barrett */

What does '_barrett' mean?

> +7:     LDR             Q6, =0x00000001f65a57f8000000018bb70000
> +       MOVI            D14, #0
> +       MOV             V10.16B, V13.16B
> +       PMULL2          V13.1Q, V13.2D, V6.2D
> +
> +       EXT             V13.16B, V13.16B, V13.16B, #12
> +       MOV             V13.S[0], V14.S[0]
> +
> +       EXT             V6.16B, V6.16B, V6.16B, #8
> +       PMULL2          V13.1Q, V13.2D, V6.2D
> +
> +       EXT             V13.16B, V13.16B, V13.16B, #12
> +       MOV             V13.S[0], V14.S[0]
> +
> +       EOR             V13.16B, V13.16B, V10.16B
> +       MOV             X0, V13.D[0]
> +
> +       /* _cleanup */
> +8:     MOV             X14, #48
> +       LSR             X0, X0, X14

Why the temp x14?

> +99:
> +       ADD             sp, sp, #0x20
> +
> +       LDP             Q18, Q19, [sp], #0x20
> +       LDP             Q16, Q17, [sp], #0x20
> +       LDP             Q14, Q15, [sp], #0x20
> +
> +       LDP             Q8, Q9, [sp], #0x20
> +       LDP             Q6, Q7, [sp], #0x20
> +       LDP             Q4, Q5, [sp], #0x20
> +       LDP             Q12, Q13, [sp], #0x20
> +       LDP             Q10, Q11, [sp], #0x20
> +       LDP             X13, X14, [sp], #0x10
> +       LDP             X11, X12, [sp], #0x10
> +       LDP             X9, X10, [sp], #0x10
> +       LDP             X7, X8, [sp], #0x10
> +       LDP             X5, X6, [sp], #0x10
> +

None of these registers need to be restored. The only thing you need
(to mirror the prologue)

ldp x29, x30, [sp], #xxx
ret

where xxx is the same value you used at the beginning.


> +       RET
> +
> +       /* _less_than_128 */
> +2:     CMP             X2, #32
> +       B.LT            9f                              // _less_than_32
> +       LDR             Q6, =0x06df0000000000002d56000000000000
> +
> +       LSL             X0, X0, #48
> +       LDR             Q10, =0x0

Please use movi here

> +       MOV             V10.D[1], X0
> +       LDR             Q13, [X1], #0x10
> +       REV64           V13.16B, V13.16B
> +       EXT             V13.16B, V13.16B, V13.16B, #8
> +
> +       EOR             V13.16B, V13.16B, V10.16B
> +
> +       SUB             X2, X2, #32
> +       B               4b
> +
> +       /* _less_than_32 */
> +9:     CMP             X2, #0
> +       B.EQ            99b                     // _cleanup

You can use CBZ here

> +       LSL             X0, X0, #48
> +       LDR             Q10,=0x0

Please use movi here

> +       MOV             V10.D[1], X0
> +
> +       CMP             X2, #16
> +       B.EQ            10f                     // _exact_16_left
> +       B.LE            11f                     // _less_than_16_left
> +       LDR             Q13, [X1], #0x10
> +
> +       REV64           V13.16B, V13.16B
> +       EXT             V13.16B, V13.16B, V13.16B, #8
> +
> +       EOR             V13.16B, V13.16B, V10.16B
> +       SUB             X2, X2, #16
> +       LDR             Q6, =0x06df0000000000002d56000000000000
> +       B               6b                      // _get_last_two_xmms

Another bogus comment

> +
> +       /*  _less_than_16_left */
> +11:    CMP             X2, #4
> +       B.LT            13f                     // _only_less_than_4
> +
> +       /* backup the length of data, we used in _less_than_2_left */
> +       MOV             X8, X2
> +       CMP             X2, #8
> +       B.LT            14f                     // _less_than_8_left
> +
> +       LDR             X14, [X1], #8
> +       /* push the data to stack, we backup the data to V10 */
> +       STR             X14, [sp, #0]
> +       SUB             X2, X2, #8
> +       ADD             X11, X11, #8
> +
> +       /* _less_than_8_left */
> +14:    CMP             X2, #4
> +       B.LT            15f                     // _less_than_4_left
> +
> +       /* get 32bit data */
> +       LDR             W5, [X1], #4
> +
> +       /* push the data to stack */
> +       STR             W5, [sp, X11]
> +       SUB             X2, X2, #4
> +       ADD             X11, X11, #4
> +
> +       /* _less_than_4_left */
> +15:    CMP             X2, #2
> +       B.LT            16f                     // _less_than_2_left
> +
> +       /* get 16bits data */
> +       LDRH            W6, [X1], #2
> +
> +       /* push the data to stack */
> +       STRH            W6, [sp, X11]
> +       SUB             X2, X2, #2
> +       ADD             X11, X11, #2
> +
> +       /* _less_than_2_left */
> +16:
> +       /* get 8bits data */
> +       LDRB            W7, [X1], #1
> +       STRB            W7, [sp, X11]
> +       ADD             X11, X11, #1
> +
> +       /* POP data from stack, store to V13 */
> +       LDR             Q13, [sp]
> +       MOVI            D14, #0
> +       REV64           V13.16B, V13.16B
> +       MOV             V8.16B, V13.16B
> +       MOV             V13.D[1], V8.D[0]
> +       MOV             V13.D[0], V8.D[1]
> +
> +       EOR             V13.16B, V13.16B, V10.16B
> +       CMP             X8, #15
> +       B.EQ    80f
> +       CMP             X8, #14
> +       B.EQ    81f
> +       CMP             X8, #13
> +       B.EQ    82f
> +       CMP             X8, #12
> +       B.EQ    83f
> +       CMP             X8, #11
> +       B.EQ    84f
> +       CMP             X8, #10
> +       B.EQ    85f
> +       CMP             X8, #9
> +       B.EQ    86f
> +       CMP             X8, #8
> +       B.EQ    87f
> +       CMP             X8, #7
> +       B.EQ    88f
> +       CMP             X8, #6
> +       B.EQ    89f
> +       CMP             X8, #5
> +       B.EQ    90f
> +       CMP             X8, #4
> +       B.EQ    91f
> +       CMP             X8, #3
> +       B.EQ    92f
> +       CMP             X8, #2
> +       B.EQ    93f
> +       CMP             X8, #1
> +       B.EQ    94f
> +       CMP             X8, #0
> +       B.EQ    95f
> +

Again, please use a computed goto instead

> +80:
> +       EXT             V13.16B, V13.16B, V13.16B, #1
> +       MOV             V13.B[15], V14.B[0]
> +       B               5b
> +
> +81:
> +       EXT             V13.16B, V13.16B, V13.16B, #2
> +       MOV              V13.H[7], V14.H[0]
> +       B               5b
> +
> +82:
> +       EXT             V13.16B, V13.16B, V13.16B, #3
> +       MOV             V13.H[7], V14.H[0]
> +       MOV             V13.B[13], V14.B[0]
> +       B               5b
> +83:
> +
> +       EXT             V13.16B, V13.16B, V13.16B, #4
> +       MOV             V13.S[3], V14.S[0]
> +       B               5b
> +
> +84:
> +       EXT             V13.16B, V13.16B, V13.16B, #5
> +       MOV             V13.S[3], V14.S[0]
> +       MOV             V13.B[11], V14.B[0]
> +       B               5b
> +
> +85:
> +       EXT             V13.16B, V13.16B, V13.16B, #6
> +       MOV             V13.S[3], V14.S[0]
> +       MOV             V13.H[5], V14.H[0]
> +       B               5b
> +
> +86:
> +       EXT             V13.16B, V13.16B, V13.16B, #7
> +       MOV             V13.S[3], V14.S[0]
> +       MOV             V13.H[5], V14.H[0]
> +       MOV             V13.B[9], V14.B[0]
> +       B               5b
> +
> +87:
> +       MOV             V13.D[0], V13.D[1]
> +       MOV             V13.D[1], V14.D[0]
> +       B               5b
> +
> +88:
> +       EXT             V13.16B, V13.16B, V13.16B, #9
> +       MOV             V13.D[1], V14.D[0]
> +       MOV             V13.B[7], V14.B[0]
> +       B               5b
> +
> +89:
> +       EXT             V13.16B, V13.16B, V13.16B, #10
> +       MOV             V13.D[1], V14.D[0]
> +       MOV              V13.H[3], V14.H[0]
> +       B               5b
> +
> +90:
> +       EXT             V13.16B, V13.16B, V13.16B, #11
> +       MOV             V13.D[1], V14.D[0]
> +       MOV             V13.H[3], V14.H[0]
> +       MOV             V13.B[5], V14.B[0]
> +       B               5b
> +
> +91:
> +       MOV             V13.S[0], V13.S[3]
> +       MOV             V13.D[1], V14.D[0]
> +       MOV             V13.S[1], V14.S[0]
> +       B               5b
> +
> +92:
> +       EXT             V13.16B, V13.16B, V13.16B, #13
> +       MOV             V13.D[1], V14.D[0]
> +       MOV             V13.S[1], V14.S[0]
> +       MOV             V13.B[3], V14.B[0]
> +       B               5b
> +
> +93:
> +       MOV             V15.H[0], V13.H[7]
> +       MOV             V13.16B, V14.16B
> +       MOV             V13.H[0], V15.H[0]
> +       B               5b
> +
> +94:
> +       MOV             V15.B[0], V13.B[15]
> +       MOV             V13.16B, V14.16B
> +       MOV             V13.B[0], V15.B[0]
> +       B               5b
> +
> +95:
> +       LDR             Q13,=0x0

movi

> +       B               5b                              // _128_done
> +
> +       /* _exact_16_left */
> +10:
> +       LD1             { V13.2D }, [X1], #0x10
> +
> +       REV64           V13.16B, V13.16B
> +       EXT             V13.16B, V13.16B, V13.16B, #8
> +       EOR             V13.16B, V13.16B, V10.16B
> +       B               5b                              // _128_done
> +
> +       /* _only_less_than_4 */
> +13:    CMP             X2, #3
> +       MOVI            D14, #0
> +       B.LT            17f                             //_only_less_than_3
> +
> +       LDR             S13, [X1], #4
> +       MOV              V13.B[15], V13.B[0]
> +       MOV             V13.B[14], V13.B[1]
> +       MOV             V13.B[13], V13.B[2]
> +       MOV             V13.S[0], V13.S[1]
> +
> +       EOR             V13.16B, V13.16B, V10.16B
> +
> +       EXT             V13.16B, V13.16B, V13.16B, #5
> +
> +       MOV             V13.S[3], V14.S[0]
> +       MOV             V13.B[11], V14.B[0]
> +
> +       B               7b                              // _barrett
> +       /* _only_less_than_3 */
> +17:
> +       CMP             X2, #2
> +       B.LT            18f                             // _only_less_than_2
> +
> +       LDR             H13, [X1], #2
> +       MOV             V13.B[15], V13.B[0]
> +       MOV             V13.B[14], V13.B[1]
> +       MOV             V13.H[0], V13.H[1]
> +
> +       EOR             V13.16B, V13.16B, V10.16B
> +
> +       EXT             V13.16B, V13.16B, V13.16B, #6
> +       MOV             V13.S[3], V14.S[0]
> +       MOV             V13.H[5], V14.H[0]
> +
> +       B               7b                              // _barrett
> +
> +       /* _only_less_than_2 */
> +18:
> +       LDRB            W7, [X1], #1
> +       LDR             Q13, = 0x0
> +       MOV             V13.B[15], W7
> +
> +       EOR             V13.16B, V13.16B, V10.16B
> +
> +       EXT             V13.16B, V13.16B, V13.16B, #7
> +       MOV             V13.S[3], V14.S[0]
> +       MOV             V13.H[5], V14.H[0]
> +       MOV             V13.B[9], V14.B[0]
> +
> +       B               7b                              // _barrett

Please end with ENDPROC()

> diff --git a/arch/arm64/crypto/crct10dif-neon_glue.c b/arch/arm64/crypto/crct10dif-neon_glue.c
> new file mode 100644
> index 0000000..a6d8c5c
> --- /dev/null
> +++ b/arch/arm64/crypto/crct10dif-neon_glue.c
> @@ -0,0 +1,115 @@
> +/*
> + * Copyright (c) 2016-2017 Hisilicon Limited.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + */
> +
> +
> +#include <linux/types.h>
> +#include <linux/module.h>
> +#include <linux/crc-t10dif.h>
> +#include <crypto/internal/hash.h>
> +#include <linux/init.h>
> +#include <linux/string.h>
> +#include <linux/kernel.h>
> +
> +asmlinkage __u16 crc_t10dif_neon(__u16 crc, const unsigned char *buf,
> +                               size_t len);
> +
> +struct chksum_desc_ctx {
> +       __u16 crc;
> +};
> +
> +/*
> + * Steps through buffer one byte at at time, calculates reflected
> + * crc using table.
> + */
> +

Where is the table?

> +static int chksum_init(struct shash_desc *desc)
> +{
> +       struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
> +
> +       ctx->crc = 0;
> +
> +       return 0;
> +}
> +
> +static int chksum_update(struct shash_desc *desc, const u8 *data,
> +                        unsigned int length)
> +{
> +       struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
> +
> +       ctx->crc = crc_t10dif_neon(ctx->crc, data, length);

You need kernel_neon_begin/kernel_neon_end here

> +       return 0;
> +}
> +
> +static int chksum_final(struct shash_desc *desc, u8 *out)
> +{
> +       struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
> +
> +       *(__u16 *)out = ctx->crc;
> +       return 0;
> +}
> +
> +static int __chksum_finup(__u16 *crcp, const u8 *data, unsigned int len,
> +                       u8 *out)
> +{
> +       *(__u16 *)out = crc_t10dif_neon(*crcp, data, len);

... and here

> +       return 0;
> +}
> +
> +static int chksum_finup(struct shash_desc *desc, const u8 *data,
> +                       unsigned int len, u8 *out)
> +{
> +       struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
> +
> +       return __chksum_finup(&ctx->crc, data, len, out);
> +}
> +
> +static int chksum_digest(struct shash_desc *desc, const u8 *data,
> +                        unsigned int length, u8 *out)
> +{
> +       struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
> +
> +       return __chksum_finup(&ctx->crc, data, length, out);
> +}
> +
> +static struct shash_alg alg = {
> +       .digestsize             =       CRC_T10DIF_DIGEST_SIZE,
> +       .init           =       chksum_init,
> +       .update         =       chksum_update,
> +       .final          =       chksum_final,
> +       .finup          =       chksum_finup,
> +       .digest         =       chksum_digest,
> +       .descsize               =       sizeof(struct chksum_desc_ctx),
> +       .base                   =       {
> +               .cra_name               =       "crct10dif",
> +               .cra_driver_name        =       "crct10dif-neon",
> +               .cra_priority           =       200,
> +               .cra_blocksize          =       CRC_T10DIF_BLOCK_SIZE,
> +               .cra_module             =       THIS_MODULE,


Please align the = characters here, and add only a single space after

Note that you can do

.base.cra_name = xxx,
.base.xxx

as well.

> +       }
> +};
> +
> +static int __init crct10dif_arm64_mod_init(void)
> +{
> +       return crypto_register_shash(&alg);

You need to check here if your CPU has support for the 64x64->128
PMULL instruction.

> +}
> +
> +static void __exit crct10dif_arm64_mod_fini(void)
> +{
> +       crypto_unregister_shash(&alg);
> +}
> +
> +module_init(crct10dif_arm64_mod_init);
> +module_exit(crct10dif_arm64_mod_fini);
> +
> +MODULE_AUTHOR("YueHaibing <yuehaibing@huawei.com>");
> +MODULE_DESCRIPTION("T10 DIF CRC calculation accelerated with ARM64 NEON instruction.");
> +MODULE_LICENSE("GPL");
> +
> +MODULE_ALIAS_CRYPTO("crct10dif");
> +MODULE_ALIAS_CRYPTO("crct10dif-neon");
> --
> 2.7.0
>
>
>
> _______________________________________________
> linux-arm-kernel mailing list
> linux-arm-kernel@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply

* [v3 PATCH 16/16] crypto: aesbs - Convert to skcipher
From: Herbert Xu @ 2016-11-22 12:08 UTC (permalink / raw)
  To: Linux Crypto Mailing List
In-Reply-To: <20161122120703.GA11911@gondor.apana.org.au>

This patch converts aesbs over to the skcipher interface.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---

 arch/arm/crypto/aesbs-glue.c |  380 +++++++++++++++++--------------------------
 1 file changed, 152 insertions(+), 228 deletions(-)

diff --git a/arch/arm/crypto/aesbs-glue.c b/arch/arm/crypto/aesbs-glue.c
index 0511a6c..f5eafce 100644
--- a/arch/arm/crypto/aesbs-glue.c
+++ b/arch/arm/crypto/aesbs-glue.c
@@ -10,8 +10,9 @@
 
 #include <asm/neon.h>
 #include <crypto/aes.h>
-#include <crypto/ablk_helper.h>
-#include <crypto/algapi.h>
+#include <crypto/cbc.h>
+#include <crypto/internal/simd.h>
+#include <crypto/internal/skcipher.h>
 #include <linux/module.h>
 #include <crypto/xts.h>
 
@@ -55,14 +56,14 @@ struct aesbs_xts_ctx {
 	struct AES_KEY	twkey;
 };
 
-static int aesbs_cbc_set_key(struct crypto_tfm *tfm, const u8 *in_key,
+static int aesbs_cbc_set_key(struct crypto_skcipher *tfm, const u8 *in_key,
 			     unsigned int key_len)
 {
-	struct aesbs_cbc_ctx *ctx = crypto_tfm_ctx(tfm);
+	struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
 	int bits = key_len * 8;
 
 	if (private_AES_set_encrypt_key(in_key, bits, &ctx->enc)) {
-		tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+		crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
 		return -EINVAL;
 	}
 	ctx->dec.rk = ctx->enc;
@@ -71,33 +72,33 @@ static int aesbs_cbc_set_key(struct crypto_tfm *tfm, const u8 *in_key,
 	return 0;
 }
 
-static int aesbs_ctr_set_key(struct crypto_tfm *tfm, const u8 *in_key,
+static int aesbs_ctr_set_key(struct crypto_skcipher *tfm, const u8 *in_key,
 			     unsigned int key_len)
 {
-	struct aesbs_ctr_ctx *ctx = crypto_tfm_ctx(tfm);
+	struct aesbs_ctr_ctx *ctx = crypto_skcipher_ctx(tfm);
 	int bits = key_len * 8;
 
 	if (private_AES_set_encrypt_key(in_key, bits, &ctx->enc.rk)) {
-		tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+		crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
 		return -EINVAL;
 	}
 	ctx->enc.converted = 0;
 	return 0;
 }
 
-static int aesbs_xts_set_key(struct crypto_tfm *tfm, const u8 *in_key,
+static int aesbs_xts_set_key(struct crypto_skcipher *tfm, const u8 *in_key,
 			     unsigned int key_len)
 {
-	struct aesbs_xts_ctx *ctx = crypto_tfm_ctx(tfm);
+	struct aesbs_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
 	int bits = key_len * 4;
 	int err;
 
-	err = xts_check_key(tfm, in_key, key_len);
+	err = xts_verify_key(tfm, in_key, key_len);
 	if (err)
 		return err;
 
 	if (private_AES_set_encrypt_key(in_key, bits, &ctx->enc.rk)) {
-		tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+		crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
 		return -EINVAL;
 	}
 	ctx->dec.rk = ctx->enc.rk;
@@ -107,88 +108,52 @@ static int aesbs_xts_set_key(struct crypto_tfm *tfm, const u8 *in_key,
 	return 0;
 }
 
-static int aesbs_cbc_encrypt(struct blkcipher_desc *desc,
-			     struct scatterlist *dst,
-			     struct scatterlist *src, unsigned int nbytes)
+static inline void aesbs_encrypt_one(struct crypto_skcipher *tfm,
+				     const u8 *src, u8 *dst)
 {
-	struct aesbs_cbc_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	struct blkcipher_walk walk;
-	int err;
+	struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
 
-	blkcipher_walk_init(&walk, dst, src, nbytes);
-	err = blkcipher_walk_virt(desc, &walk);
+	AES_encrypt(src, dst, &ctx->dec.rk);
+}
 
-	while (walk.nbytes) {
-		u32 blocks = walk.nbytes / AES_BLOCK_SIZE;
-		u8 *src = walk.src.virt.addr;
+static int aesbs_cbc_encrypt(struct skcipher_request *req)
+{
+	return crypto_cbc_encrypt_walk(req, aesbs_encrypt_one);
+}
 
-		if (walk.dst.virt.addr == walk.src.virt.addr) {
-			u8 *iv = walk.iv;
-
-			do {
-				crypto_xor(src, iv, AES_BLOCK_SIZE);
-				AES_encrypt(src, src, &ctx->enc);
-				iv = src;
-				src += AES_BLOCK_SIZE;
-			} while (--blocks);
-			memcpy(walk.iv, iv, AES_BLOCK_SIZE);
-		} else {
-			u8 *dst = walk.dst.virt.addr;
-
-			do {
-				crypto_xor(walk.iv, src, AES_BLOCK_SIZE);
-				AES_encrypt(walk.iv, dst, &ctx->enc);
-				memcpy(walk.iv, dst, AES_BLOCK_SIZE);
-				src += AES_BLOCK_SIZE;
-				dst += AES_BLOCK_SIZE;
-			} while (--blocks);
-		}
-		err = blkcipher_walk_done(desc, &walk, walk.nbytes % AES_BLOCK_SIZE);
-	}
-	return err;
+static inline void aesbs_decrypt_one(struct crypto_skcipher *tfm,
+				     const u8 *src, u8 *dst)
+{
+	struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+	AES_decrypt(src, dst, &ctx->dec.rk);
 }
 
-static int aesbs_cbc_decrypt(struct blkcipher_desc *desc,
-			     struct scatterlist *dst,
-			     struct scatterlist *src, unsigned int nbytes)
+static int aesbs_cbc_decrypt(struct skcipher_request *req)
 {
-	struct aesbs_cbc_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	struct blkcipher_walk walk;
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct skcipher_walk walk;
+	unsigned int nbytes;
 	int err;
 
-	blkcipher_walk_init(&walk, dst, src, nbytes);
-	err = blkcipher_walk_virt_block(desc, &walk, 8 * AES_BLOCK_SIZE);
-
-	while ((walk.nbytes / AES_BLOCK_SIZE) >= 8) {
-		kernel_neon_begin();
-		bsaes_cbc_encrypt(walk.src.virt.addr, walk.dst.virt.addr,
-				  walk.nbytes, &ctx->dec, walk.iv);
-		kernel_neon_end();
-		err = blkcipher_walk_done(desc, &walk, walk.nbytes % AES_BLOCK_SIZE);
-	}
-	while (walk.nbytes) {
-		u32 blocks = walk.nbytes / AES_BLOCK_SIZE;
+	for (err = skcipher_walk_virt(&walk, req, false);
+	     (nbytes = walk.nbytes); err = skcipher_walk_done(&walk, nbytes)) {
+		u32 blocks = nbytes / AES_BLOCK_SIZE;
 		u8 *dst = walk.dst.virt.addr;
 		u8 *src = walk.src.virt.addr;
-		u8 bk[2][AES_BLOCK_SIZE];
 		u8 *iv = walk.iv;
 
-		do {
-			if (walk.dst.virt.addr == walk.src.virt.addr)
-				memcpy(bk[blocks & 1], src, AES_BLOCK_SIZE);
-
-			AES_decrypt(src, dst, &ctx->dec.rk);
-			crypto_xor(dst, iv, AES_BLOCK_SIZE);
-
-			if (walk.dst.virt.addr == walk.src.virt.addr)
-				iv = bk[blocks & 1];
-			else
-				iv = src;
+		if (blocks >= 8) {
+			kernel_neon_begin();
+			bsaes_cbc_encrypt(src, dst, nbytes, &ctx->dec, iv);
+			kernel_neon_end();
+			nbytes %= AES_BLOCK_SIZE;
+			continue;
+		}
 
-			dst += AES_BLOCK_SIZE;
-			src += AES_BLOCK_SIZE;
-		} while (--blocks);
-		err = blkcipher_walk_done(desc, &walk, walk.nbytes % AES_BLOCK_SIZE);
+		nbytes = crypto_cbc_decrypt_blocks(&walk, tfm,
+						   aesbs_decrypt_one);
 	}
 	return err;
 }
@@ -206,17 +171,15 @@ static void inc_be128_ctr(__be32 ctr[], u32 addend)
 	}
 }
 
-static int aesbs_ctr_encrypt(struct blkcipher_desc *desc,
-			     struct scatterlist *dst, struct scatterlist *src,
-			     unsigned int nbytes)
+static int aesbs_ctr_encrypt(struct skcipher_request *req)
 {
-	struct aesbs_ctr_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	struct blkcipher_walk walk;
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct aesbs_ctr_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct skcipher_walk walk;
 	u32 blocks;
 	int err;
 
-	blkcipher_walk_init(&walk, dst, src, nbytes);
-	err = blkcipher_walk_virt_block(desc, &walk, 8 * AES_BLOCK_SIZE);
+	err = skcipher_walk_virt(&walk, req, false);
 
 	while ((blocks = walk.nbytes / AES_BLOCK_SIZE)) {
 		u32 tail = walk.nbytes % AES_BLOCK_SIZE;
@@ -235,11 +198,7 @@ static int aesbs_ctr_encrypt(struct blkcipher_desc *desc,
 		kernel_neon_end();
 		inc_be128_ctr(ctr, blocks);
 
-		nbytes -= blocks * AES_BLOCK_SIZE;
-		if (nbytes && nbytes == tail && nbytes <= AES_BLOCK_SIZE)
-			break;
-
-		err = blkcipher_walk_done(desc, &walk, tail);
+		err = skcipher_walk_done(&walk, tail);
 	}
 	if (walk.nbytes) {
 		u8 *tdst = walk.dst.virt.addr + blocks * AES_BLOCK_SIZE;
@@ -248,23 +207,21 @@ static int aesbs_ctr_encrypt(struct blkcipher_desc *desc,
 
 		AES_encrypt(walk.iv, ks, &ctx->enc.rk);
 		if (tdst != tsrc)
-			memcpy(tdst, tsrc, nbytes);
-		crypto_xor(tdst, ks, nbytes);
-		err = blkcipher_walk_done(desc, &walk, 0);
+			memcpy(tdst, tsrc, walk.nbytes);
+		crypto_xor(tdst, ks, walk.nbytes);
+		err = skcipher_walk_done(&walk, 0);
 	}
 	return err;
 }
 
-static int aesbs_xts_encrypt(struct blkcipher_desc *desc,
-			     struct scatterlist *dst,
-			     struct scatterlist *src, unsigned int nbytes)
+static int aesbs_xts_encrypt(struct skcipher_request *req)
 {
-	struct aesbs_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	struct blkcipher_walk walk;
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct aesbs_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct skcipher_walk walk;
 	int err;
 
-	blkcipher_walk_init(&walk, dst, src, nbytes);
-	err = blkcipher_walk_virt_block(desc, &walk, 8 * AES_BLOCK_SIZE);
+	err = skcipher_walk_virt(&walk, req, false);
 
 	/* generate the initial tweak */
 	AES_encrypt(walk.iv, walk.iv, &ctx->twkey);
@@ -274,21 +231,19 @@ static int aesbs_xts_encrypt(struct blkcipher_desc *desc,
 		bsaes_xts_encrypt(walk.src.virt.addr, walk.dst.virt.addr,
 				  walk.nbytes, &ctx->enc, walk.iv);
 		kernel_neon_end();
-		err = blkcipher_walk_done(desc, &walk, walk.nbytes % AES_BLOCK_SIZE);
+		err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
 	}
 	return err;
 }
 
-static int aesbs_xts_decrypt(struct blkcipher_desc *desc,
-			     struct scatterlist *dst,
-			     struct scatterlist *src, unsigned int nbytes)
+static int aesbs_xts_decrypt(struct skcipher_request *req)
 {
-	struct aesbs_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	struct blkcipher_walk walk;
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct aesbs_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct skcipher_walk walk;
 	int err;
 
-	blkcipher_walk_init(&walk, dst, src, nbytes);
-	err = blkcipher_walk_virt_block(desc, &walk, 8 * AES_BLOCK_SIZE);
+	err = skcipher_walk_virt(&walk, req, false);
 
 	/* generate the initial tweak */
 	AES_encrypt(walk.iv, walk.iv, &ctx->twkey);
@@ -298,141 +253,110 @@ static int aesbs_xts_decrypt(struct blkcipher_desc *desc,
 		bsaes_xts_decrypt(walk.src.virt.addr, walk.dst.virt.addr,
 				  walk.nbytes, &ctx->dec, walk.iv);
 		kernel_neon_end();
-		err = blkcipher_walk_done(desc, &walk, walk.nbytes % AES_BLOCK_SIZE);
+		err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
 	}
 	return err;
 }
 
-static struct crypto_alg aesbs_algs[] = { {
-	.cra_name		= "__cbc-aes-neonbs",
-	.cra_driver_name	= "__driver-cbc-aes-neonbs",
-	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
-				  CRYPTO_ALG_INTERNAL,
-	.cra_blocksize		= AES_BLOCK_SIZE,
-	.cra_ctxsize		= sizeof(struct aesbs_cbc_ctx),
-	.cra_alignmask		= 7,
-	.cra_type		= &crypto_blkcipher_type,
-	.cra_module		= THIS_MODULE,
-	.cra_blkcipher = {
-		.min_keysize	= AES_MIN_KEY_SIZE,
-		.max_keysize	= AES_MAX_KEY_SIZE,
-		.ivsize		= AES_BLOCK_SIZE,
-		.setkey		= aesbs_cbc_set_key,
-		.encrypt	= aesbs_cbc_encrypt,
-		.decrypt	= aesbs_cbc_decrypt,
+static struct skcipher_alg aesbs_algs[] = { {
+	.base = {
+		.cra_name		= "__cbc(aes)",
+		.cra_driver_name	= "__cbc-aes-neonbs",
+		.cra_priority		= 300,
+		.cra_flags		= CRYPTO_ALG_INTERNAL,
+		.cra_blocksize		= AES_BLOCK_SIZE,
+		.cra_ctxsize		= sizeof(struct aesbs_cbc_ctx),
+		.cra_alignmask		= 7,
+		.cra_module		= THIS_MODULE,
 	},
+	.min_keysize	= AES_MIN_KEY_SIZE,
+	.max_keysize	= AES_MAX_KEY_SIZE,
+	.ivsize		= AES_BLOCK_SIZE,
+	.setkey		= aesbs_cbc_set_key,
+	.encrypt	= aesbs_cbc_encrypt,
+	.decrypt	= aesbs_cbc_decrypt,
 }, {
-	.cra_name		= "__ctr-aes-neonbs",
-	.cra_driver_name	= "__driver-ctr-aes-neonbs",
-	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
-				  CRYPTO_ALG_INTERNAL,
-	.cra_blocksize		= 1,
-	.cra_ctxsize		= sizeof(struct aesbs_ctr_ctx),
-	.cra_alignmask		= 7,
-	.cra_type		= &crypto_blkcipher_type,
-	.cra_module		= THIS_MODULE,
-	.cra_blkcipher = {
-		.min_keysize	= AES_MIN_KEY_SIZE,
-		.max_keysize	= AES_MAX_KEY_SIZE,
-		.ivsize		= AES_BLOCK_SIZE,
-		.setkey		= aesbs_ctr_set_key,
-		.encrypt	= aesbs_ctr_encrypt,
-		.decrypt	= aesbs_ctr_encrypt,
+	.base = {
+		.cra_name		= "__ctr(aes)",
+		.cra_driver_name	= "__ctr-aes-neonbs",
+		.cra_priority		= 300,
+		.cra_flags		= CRYPTO_ALG_INTERNAL,
+		.cra_blocksize		= 1,
+		.cra_ctxsize		= sizeof(struct aesbs_ctr_ctx),
+		.cra_alignmask		= 7,
+		.cra_module		= THIS_MODULE,
 	},
+	.min_keysize	= AES_MIN_KEY_SIZE,
+	.max_keysize	= AES_MAX_KEY_SIZE,
+	.ivsize		= AES_BLOCK_SIZE,
+	.chunksize	= AES_BLOCK_SIZE,
+	.setkey		= aesbs_ctr_set_key,
+	.encrypt	= aesbs_ctr_encrypt,
+	.decrypt	= aesbs_ctr_encrypt,
 }, {
-	.cra_name		= "__xts-aes-neonbs",
-	.cra_driver_name	= "__driver-xts-aes-neonbs",
-	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
-				  CRYPTO_ALG_INTERNAL,
-	.cra_blocksize		= AES_BLOCK_SIZE,
-	.cra_ctxsize		= sizeof(struct aesbs_xts_ctx),
-	.cra_alignmask		= 7,
-	.cra_type		= &crypto_blkcipher_type,
-	.cra_module		= THIS_MODULE,
-	.cra_blkcipher = {
-		.min_keysize	= 2 * AES_MIN_KEY_SIZE,
-		.max_keysize	= 2 * AES_MAX_KEY_SIZE,
-		.ivsize		= AES_BLOCK_SIZE,
-		.setkey		= aesbs_xts_set_key,
-		.encrypt	= aesbs_xts_encrypt,
-		.decrypt	= aesbs_xts_decrypt,
+	.base = {
+		.cra_name		= "__xts(aes)",
+		.cra_driver_name	= "__xts-aes-neonbs",
+		.cra_priority		= 300,
+		.cra_flags		= CRYPTO_ALG_INTERNAL,
+		.cra_blocksize		= AES_BLOCK_SIZE,
+		.cra_ctxsize		= sizeof(struct aesbs_xts_ctx),
+		.cra_alignmask		= 7,
+		.cra_module		= THIS_MODULE,
 	},
-}, {
-	.cra_name		= "cbc(aes)",
-	.cra_driver_name	= "cbc-aes-neonbs",
-	.cra_priority		= 300,
-	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
-	.cra_blocksize		= AES_BLOCK_SIZE,
-	.cra_ctxsize		= sizeof(struct async_helper_ctx),
-	.cra_alignmask		= 7,
-	.cra_type		= &crypto_ablkcipher_type,
-	.cra_module		= THIS_MODULE,
-	.cra_init		= ablk_init,
-	.cra_exit		= ablk_exit,
-	.cra_ablkcipher = {
-		.min_keysize	= AES_MIN_KEY_SIZE,
-		.max_keysize	= AES_MAX_KEY_SIZE,
-		.ivsize		= AES_BLOCK_SIZE,
-		.setkey		= ablk_set_key,
-		.encrypt	= __ablk_encrypt,
-		.decrypt	= ablk_decrypt,
-	}
-}, {
-	.cra_name		= "ctr(aes)",
-	.cra_driver_name	= "ctr-aes-neonbs",
-	.cra_priority		= 300,
-	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
-	.cra_blocksize		= 1,
-	.cra_ctxsize		= sizeof(struct async_helper_ctx),
-	.cra_alignmask		= 7,
-	.cra_type		= &crypto_ablkcipher_type,
-	.cra_module		= THIS_MODULE,
-	.cra_init		= ablk_init,
-	.cra_exit		= ablk_exit,
-	.cra_ablkcipher = {
-		.min_keysize	= AES_MIN_KEY_SIZE,
-		.max_keysize	= AES_MAX_KEY_SIZE,
-		.ivsize		= AES_BLOCK_SIZE,
-		.setkey		= ablk_set_key,
-		.encrypt	= ablk_encrypt,
-		.decrypt	= ablk_decrypt,
-	}
-}, {
-	.cra_name		= "xts(aes)",
-	.cra_driver_name	= "xts-aes-neonbs",
-	.cra_priority		= 300,
-	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
-	.cra_blocksize		= AES_BLOCK_SIZE,
-	.cra_ctxsize		= sizeof(struct async_helper_ctx),
-	.cra_alignmask		= 7,
-	.cra_type		= &crypto_ablkcipher_type,
-	.cra_module		= THIS_MODULE,
-	.cra_init		= ablk_init,
-	.cra_exit		= ablk_exit,
-	.cra_ablkcipher = {
-		.min_keysize	= 2 * AES_MIN_KEY_SIZE,
-		.max_keysize	= 2 * AES_MAX_KEY_SIZE,
-		.ivsize		= AES_BLOCK_SIZE,
-		.setkey		= ablk_set_key,
-		.encrypt	= ablk_encrypt,
-		.decrypt	= ablk_decrypt,
-	}
+	.min_keysize	= 2 * AES_MIN_KEY_SIZE,
+	.max_keysize	= 2 * AES_MAX_KEY_SIZE,
+	.ivsize		= AES_BLOCK_SIZE,
+	.setkey		= aesbs_xts_set_key,
+	.encrypt	= aesbs_xts_encrypt,
+	.decrypt	= aesbs_xts_decrypt,
 } };
 
+struct simd_skcipher_alg *aesbs_simd_algs[ARRAY_SIZE(aesbs_algs)];
+
+static void aesbs_mod_exit(void)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(aesbs_simd_algs) && aesbs_simd_algs[i]; i++)
+		simd_skcipher_free(aesbs_simd_algs[i]);
+
+	crypto_unregister_skciphers(aesbs_algs, ARRAY_SIZE(aesbs_algs));
+}
+
 static int __init aesbs_mod_init(void)
 {
+	struct simd_skcipher_alg *simd;
+	const char *basename;
+	const char *algname;
+	const char *drvname;
+	int err;
+	int i;
+
 	if (!cpu_has_neon())
 		return -ENODEV;
 
-	return crypto_register_algs(aesbs_algs, ARRAY_SIZE(aesbs_algs));
-}
+	err = crypto_register_skciphers(aesbs_algs, ARRAY_SIZE(aesbs_algs));
+	if (err)
+		return err;
 
-static void __exit aesbs_mod_exit(void)
-{
-	crypto_unregister_algs(aesbs_algs, ARRAY_SIZE(aesbs_algs));
+	for (i = 0; i < ARRAY_SIZE(aesbs_algs); i++) {
+		algname = aesbs_algs[i].base.cra_name + 2;
+		drvname = aesbs_algs[i].base.cra_driver_name + 2;
+		basename = aesbs_algs[i].base.cra_driver_name;
+		simd = simd_skcipher_create_compat(algname, drvname, basename);
+		err = PTR_ERR(simd);
+		if (IS_ERR(simd))
+			goto unregister_simds;
+
+		aesbs_simd_algs[i] = simd;
+	}
+
+	return 0;
+
+unregister_simds:
+	aesbs_mod_exit();
+	return err;
 }
 
 module_init(aesbs_mod_init);

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox