[PATCH v2 0/6] crypto: ARM/arm64 CRC-T10DIF/CRC32/CRC32C roundup

linux-arm-kernel.lists.infradead.org archive mirror
 help / color / mirror / Atom feed

* [PATCH v2 0/6] crypto: ARM/arm64 CRC-T10DIF/CRC32/CRC32C roundup
@ 2016-12-04 11:54 Ard Biesheuvel
  2016-12-04 11:54 ` [PATCH v2 1/6] crypto: testmgr - avoid overlap in chunked tests Ard Biesheuvel
                   ` (5 more replies)
  0 siblings, 6 replies; 7+ messages in thread
From: Ard Biesheuvel @ 2016-12-04 11:54 UTC (permalink / raw)
  To: linux-arm-kernel

This v2 combines the CRC-T10DIF and CRC32 implementations for both ARM and
arm64 that I sent out a couple of weeks ago, and adds support to the latter
for CRC32C.

Ard Biesheuvel (6):
  crypto: testmgr - avoid overlap in chunked tests
  crypto: testmgr - add/enhance test cases for CRC-T10DIF
  crypto: arm64/crct10dif - port x86 SSE implementation to arm64
  crypto: arm/crct10dif - port x86 SSE implementation to ARM
  crypto: arm64/crc32 - accelerated support based on x86 SSE
    implementation
  crypto: arm/crc32 - accelerated support based on x86 SSE
    implementation

 arch/arm/crypto/Kconfig               |  10 +
 arch/arm/crypto/Makefile              |   4 +
 arch/arm/crypto/crc32-ce-core.S       | 306 +++++++++++++++++
 arch/arm/crypto/crc32-ce-glue.c       | 195 +++++++++++
 arch/arm/crypto/crct10dif-ce-core.S   | 349 ++++++++++++++++++++
 arch/arm/crypto/crct10dif-ce-glue.c   |  95 ++++++
 arch/arm64/crypto/Kconfig             |  11 +
 arch/arm64/crypto/Makefile            |   6 +
 arch/arm64/crypto/crc32-ce-core.S     | 266 +++++++++++++++
 arch/arm64/crypto/crc32-ce-glue.c     | 188 +++++++++++
 arch/arm64/crypto/crct10dif-ce-core.S | 317 ++++++++++++++++++
 arch/arm64/crypto/crct10dif-ce-glue.c |  91 +++++
 crypto/testmgr.c                      |   2 +-
 crypto/testmgr.h                      |  70 ++--
 14 files changed, 1881 insertions(+), 29 deletions(-)
 create mode 100644 arch/arm/crypto/crc32-ce-core.S
 create mode 100644 arch/arm/crypto/crc32-ce-glue.c
 create mode 100644 arch/arm/crypto/crct10dif-ce-core.S
 create mode 100644 arch/arm/crypto/crct10dif-ce-glue.c
 create mode 100644 arch/arm64/crypto/crc32-ce-core.S
 create mode 100644 arch/arm64/crypto/crc32-ce-glue.c
 create mode 100644 arch/arm64/crypto/crct10dif-ce-core.S
 create mode 100644 arch/arm64/crypto/crct10dif-ce-glue.c

-- 
2.7.4

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH v2 1/6] crypto: testmgr - avoid overlap in chunked tests
  2016-12-04 11:54 [PATCH v2 0/6] crypto: ARM/arm64 CRC-T10DIF/CRC32/CRC32C roundup Ard Biesheuvel
@ 2016-12-04 11:54 ` Ard Biesheuvel
  2016-12-04 11:54 ` [PATCH v2 2/6] crypto: testmgr - add/enhance test cases for CRC-T10DIF Ard Biesheuvel
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 7+ messages in thread
From: Ard Biesheuvel @ 2016-12-04 11:54 UTC (permalink / raw)
  To: linux-arm-kernel

The IDXn offsets are chosen such that tap values (which may go up to
255) end up overlapping in the xbuf allocation. In particular, IDX1
and IDX3 are too close together, so update IDX3 to avoid this issue.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
 crypto/testmgr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index ded50b67c757..670893bcf361 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -63,7 +63,7 @@ int alg_test(const char *driver, const char *alg, u32 type, u32 mask)
  */
 #define IDX1		32
 #define IDX2		32400
-#define IDX3		1
+#define IDX3		511
 #define IDX4		8193
 #define IDX5		22222
 #define IDX6		17101
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH v2 2/6] crypto: testmgr - add/enhance test cases for CRC-T10DIF
  2016-12-04 11:54 [PATCH v2 0/6] crypto: ARM/arm64 CRC-T10DIF/CRC32/CRC32C roundup Ard Biesheuvel
  2016-12-04 11:54 ` [PATCH v2 1/6] crypto: testmgr - avoid overlap in chunked tests Ard Biesheuvel
@ 2016-12-04 11:54 ` Ard Biesheuvel
  2016-12-04 11:54 ` [PATCH v2 3/6] crypto: arm64/crct10dif - port x86 SSE implementation to arm64 Ard Biesheuvel
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 7+ messages in thread
From: Ard Biesheuvel @ 2016-12-04 11:54 UTC (permalink / raw)
  To: linux-arm-kernel

The existing test cases only exercise a small slice of the various
possible code paths through the x86 SSE/PCLMULQDQ implementation,
and the upcoming ports of it for arm64. So add one that exceeds 256
bytes in size, and convert another to a chunked test.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
 crypto/testmgr.h | 70 ++++++++++++--------
 1 file changed, 42 insertions(+), 28 deletions(-)

diff --git a/crypto/testmgr.h b/crypto/testmgr.h
index e64a4ef9d8ca..b7cd41b25a2a 100644
--- a/crypto/testmgr.h
+++ b/crypto/testmgr.h
@@ -1334,36 +1334,50 @@ static struct hash_testvec rmd320_tv_template[] = {
 	}
 };
 
-#define CRCT10DIF_TEST_VECTORS	3
+#define CRCT10DIF_TEST_VECTORS	ARRAY_SIZE(crct10dif_tv_template)
 static struct hash_testvec crct10dif_tv_template[] = {
 	{
-		.plaintext = "abc",
-		.psize  = 3,
-#ifdef __LITTLE_ENDIAN
-		.digest = "\x3b\x44",
-#else
-		.digest = "\x44\x3b",
-#endif
-	}, {
-		.plaintext = "1234567890123456789012345678901234567890"
-			     "123456789012345678901234567890123456789",
-		.psize	= 79,
-#ifdef __LITTLE_ENDIAN
-		.digest	= "\x70\x4b",
-#else
-		.digest	= "\x4b\x70",
-#endif
-	}, {
-		.plaintext =
-		"abcddddddddddddddddddddddddddddddddddddddddddddddddddddd",
-		.psize  = 56,
-#ifdef __LITTLE_ENDIAN
-		.digest = "\xe3\x9c",
-#else
-		.digest = "\x9c\xe3",
-#endif
-		.np     = 2,
-		.tap    = { 28, 28 }
+		.plaintext	= "abc",
+		.psize		= 3,
+		.digest		= (u8 *)(u16 []){ 0x443b },
+	}, {
+		.plaintext 	= "1234567890123456789012345678901234567890"
+				  "123456789012345678901234567890123456789",
+		.psize		= 79,
+		.digest 	= (u8 *)(u16 []){ 0x4b70 },
+		.np		= 2,
+		.tap		= { 63, 16 },
+	}, {
+		.plaintext	= "abcdddddddddddddddddddddddddddddddddddddddd"
+				  "ddddddddddddd",
+		.psize		= 56,
+		.digest		= (u8 *)(u16 []){ 0x9ce3 },
+		.np		= 8,
+		.tap		= { 1, 2, 28, 7, 6, 5, 4, 3 },
+	}, {
+		.plaintext 	= "1234567890123456789012345678901234567890"
+				  "1234567890123456789012345678901234567890"
+				  "1234567890123456789012345678901234567890"
+				  "1234567890123456789012345678901234567890"
+				  "1234567890123456789012345678901234567890"
+				  "1234567890123456789012345678901234567890"
+				  "1234567890123456789012345678901234567890"
+				  "123456789012345678901234567890123456789",
+		.psize		= 319,
+		.digest		= (u8 *)(u16 []){ 0x44c6 },
+	}, {
+		.plaintext 	= "1234567890123456789012345678901234567890"
+				  "1234567890123456789012345678901234567890"
+				  "1234567890123456789012345678901234567890"
+				  "1234567890123456789012345678901234567890"
+				  "1234567890123456789012345678901234567890"
+				  "1234567890123456789012345678901234567890"
+				  "1234567890123456789012345678901234567890"
+				  "123456789012345678901234567890123456789",
+		.psize		= 319,
+		.digest		= (u8 *)(u16 []){ 0x44c6 },
+		.np		= 4,
+		.tap		= { 1, 255, 57, 6 },
 	}
 };
 
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH v2 3/6] crypto: arm64/crct10dif - port x86 SSE implementation to arm64
  2016-12-04 11:54 [PATCH v2 0/6] crypto: ARM/arm64 CRC-T10DIF/CRC32/CRC32C roundup Ard Biesheuvel
  2016-12-04 11:54 ` [PATCH v2 1/6] crypto: testmgr - avoid overlap in chunked tests Ard Biesheuvel
  2016-12-04 11:54 ` [PATCH v2 2/6] crypto: testmgr - add/enhance test cases for CRC-T10DIF Ard Biesheuvel
@ 2016-12-04 11:54 ` Ard Biesheuvel
  2016-12-04 11:54 ` [PATCH v2 4/6] crypto: arm/crct10dif - port x86 SSE implementation to ARM Ard Biesheuvel
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 7+ messages in thread
From: Ard Biesheuvel @ 2016-12-04 11:54 UTC (permalink / raw)
  To: linux-arm-kernel

This is a transliteration of the Intel algorithm implemented
using SSE and PCLMULQDQ instructions that resides in the file
arch/x86/crypto/crct10dif-pcl-asm_64.S, but simplified to only
operate on multiples of 16 bytes. The residual data is handled
by the generic C implementation.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
 arch/arm64/crypto/Kconfig             |   5 +
 arch/arm64/crypto/Makefile            |   3 +
 arch/arm64/crypto/crct10dif-ce-core.S | 317 ++++++++++++++++++++
 arch/arm64/crypto/crct10dif-ce-glue.c |  91 ++++++
 4 files changed, 416 insertions(+)

diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
index 2cf32e9887e1..d773c0659202 100644
--- a/arch/arm64/crypto/Kconfig
+++ b/arch/arm64/crypto/Kconfig
@@ -23,6 +23,11 @@ config CRYPTO_GHASH_ARM64_CE
 	depends on ARM64 && KERNEL_MODE_NEON
 	select CRYPTO_HASH
 
+config CRYPTO_CRCT10DIF_ARM64_CE
+	tristate "CRCT10DIF digest algorithm using PMULL instructions"
+	depends on KERNEL_MODE_NEON && CRC_T10DIF
+	select CRYPTO_HASH
+
 config CRYPTO_AES_ARM64_CE
 	tristate "AES core cipher using ARMv8 Crypto Extensions"
 	depends on ARM64 && KERNEL_MODE_NEON
diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile
index abb79b3cfcfe..36fd3eb4201b 100644
--- a/arch/arm64/crypto/Makefile
+++ b/arch/arm64/crypto/Makefile
@@ -17,6 +17,9 @@ sha2-ce-y := sha2-ce-glue.o sha2-ce-core.o
 obj-$(CONFIG_CRYPTO_GHASH_ARM64_CE) += ghash-ce.o
 ghash-ce-y := ghash-ce-glue.o ghash-ce-core.o
 
+obj-$(CONFIG_CRYPTO_CRCT10DIF_ARM64_CE) += crct10dif-ce.o
+crct10dif-ce-y := crct10dif-ce-core.o crct10dif-ce-glue.o
+
 obj-$(CONFIG_CRYPTO_AES_ARM64_CE) += aes-ce-cipher.o
 CFLAGS_aes-ce-cipher.o += -march=armv8-a+crypto
 
diff --git a/arch/arm64/crypto/crct10dif-ce-core.S b/arch/arm64/crypto/crct10dif-ce-core.S
new file mode 100644
index 000000000000..641685effebd
--- /dev/null
+++ b/arch/arm64/crypto/crct10dif-ce-core.S
@@ -0,0 +1,317 @@
+//
+// Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
+//
+// Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License version 2 as
+// published by the Free Software Foundation.
+//
+
+//
+// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
+//
+// Copyright (c) 2013, Intel Corporation
+//
+// Authors:
+//     Erdinc Ozturk <erdinc.ozturk@intel.com>
+//     Vinodh Gopal <vinodh.gopal@intel.com>
+//     James Guilford <james.guilford@intel.com>
+//     Tim Chen <tim.c.chen@linux.intel.com>
+//
+// This software is available to you under a choice of one of two
+// licenses.  You may choose to be licensed under the terms of the GNU
+// General Public License (GPL) Version 2, available from the file
+// COPYING in the main directory of this source tree, or the
+// OpenIB.org BSD license below:
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the
+//   distribution.
+//
+// * Neither the name of the Intel Corporation nor the names of its
+//   contributors may be used to endorse or promote products derived from
+//   this software without specific prior written permission.
+//
+//
+// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+//       Function API:
+//       UINT16 crc_t10dif_pcl(
+//               UINT16 init_crc, //initial CRC value, 16 bits
+//               const unsigned char *buf, //buffer pointer to calculate CRC on
+//               UINT64 len //buffer length in bytes (64-bit data)
+//       );
+//
+//       Reference paper titled "Fast CRC Computation for Generic
+//	Polynomials Using PCLMULQDQ Instruction"
+//       URL: http://www.intel.com/content/dam/www/public/us/en/documents
+//  /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
+//
+//
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+	.text
+	.cpu		generic+crypto
+
+	arg1_low32	.req	w0
+	arg2		.req	x1
+	arg3		.req	x2
+
+	vzr		.req	v13
+
+ENTRY(crc_t10dif_pmull)
+	stp		x29, x30, [sp, #-16]!
+	mov		x29, sp
+
+	movi		vzr.16b, #0		// init zero register
+
+	// adjust the 16-bit initial_crc value, scale it to 32 bits
+	lsl		arg1_low32, arg1_low32, #16
+
+	// check if smaller than 256
+	cmp		arg3, #256
+
+	// for sizes less than 128, we can't fold 64B at a time...
+	b.lt		_less_than_128
+
+	// load the initial crc value
+	// crc value does not need to be byte-reflected, but it needs
+	// to be moved to the high part of the register.
+	// because data will be byte-reflected and will align with
+	// initial crc at correct place.
+	movi		v10.16b, #0
+	mov		v10.s[3], arg1_low32		// initial crc
+
+	// receive the initial 64B data, xor the initial crc value
+	ldp		q0, q1, [arg2]
+	ldp		q2, q3, [arg2, #0x20]
+	ldp		q4, q5, [arg2, #0x40]
+	ldp		q6, q7, [arg2, #0x60]
+	add		arg2, arg2, #0x80
+
+CPU_LE(	rev64		v0.16b, v0.16b			)
+CPU_LE(	rev64		v1.16b, v1.16b			)
+CPU_LE(	rev64		v2.16b, v2.16b			)
+CPU_LE(	rev64		v3.16b, v3.16b			)
+CPU_LE(	rev64		v4.16b, v4.16b			)
+CPU_LE(	rev64		v5.16b, v5.16b			)
+CPU_LE(	rev64		v6.16b, v6.16b			)
+CPU_LE(	rev64		v7.16b, v7.16b			)
+
+CPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)
+CPU_LE(	ext		v1.16b, v1.16b, v1.16b, #8	)
+CPU_LE(	ext		v2.16b, v2.16b, v2.16b, #8	)
+CPU_LE(	ext		v3.16b, v3.16b, v3.16b, #8	)
+CPU_LE(	ext		v4.16b, v4.16b, v4.16b, #8	)
+CPU_LE(	ext		v5.16b, v5.16b, v5.16b, #8	)
+CPU_LE(	ext		v6.16b, v6.16b, v6.16b, #8	)
+CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)
+
+	// XOR the initial_crc value
+	eor		v0.16b, v0.16b, v10.16b
+
+	ldr		q10, rk3	// xmm10 has rk3 and rk4
+					// type of pmull instruction
+					// will determine which constant to use
+
+	//
+	// we subtract 256 instead of 128 to save one instruction from the loop
+	//
+	sub		arg3, arg3, #256
+
+	// at this section of the code, there is 64*x+y (0<=y<64) bytes of
+	// buffer. The _fold_64_B_loop will fold 64B at a time
+	// until we have 64+y Bytes of buffer
+
+
+	// fold 64B at a time. This section of the code folds 4 vector
+	// registers in parallel
+_fold_64_B_loop:
+
+	.macro		fold64, reg1, reg2
+	ldp		q11, q12, [arg2], #0x20
+
+	pmull2		v8.1q, \reg1\().2d, v10.2d
+	pmull		\reg1\().1q, \reg1\().1d, v10.1d
+
+CPU_LE(	rev64		v11.16b, v11.16b		)
+CPU_LE(	rev64		v12.16b, v12.16b		)
+
+	pmull2		v9.1q, \reg2\().2d, v10.2d
+	pmull		\reg2\().1q, \reg2\().1d, v10.1d
+
+CPU_LE(	ext		v11.16b, v11.16b, v11.16b, #8	)
+CPU_LE(	ext		v12.16b, v12.16b, v12.16b, #8	)
+
+	eor		\reg1\().16b, \reg1\().16b, v8.16b
+	eor		\reg2\().16b, \reg2\().16b, v9.16b
+	eor		\reg1\().16b, \reg1\().16b, v11.16b
+	eor		\reg2\().16b, \reg2\().16b, v12.16b
+	.endm
+
+	fold64		v0, v1
+	fold64		v2, v3
+	fold64		v4, v5
+	fold64		v6, v7
+
+	subs		arg3, arg3, #128
+
+	// check if there is another 64B in the buffer to be able to fold
+	b.ge		_fold_64_B_loop
+
+	// at this point, the buffer pointer is pointing at the last y Bytes
+	// of the buffer the 64B of folded data is in 4 of the vector
+	// registers: v0, v1, v2, v3
+
+	// fold the 8 vector registers to 1 vector register with different
+	// constants
+
+	ldr		q10, rk9
+
+	.macro		fold16, reg, rk
+	pmull		v8.1q, \reg\().1d, v10.1d
+	pmull2		\reg\().1q, \reg\().2d, v10.2d
+	.ifnb		\rk
+	ldr		q10, \rk
+	.endif
+	eor		v7.16b, v7.16b, v8.16b
+	eor		v7.16b, v7.16b, \reg\().16b
+	.endm
+
+	fold16		v0, rk11
+	fold16		v1, rk13
+	fold16		v2, rk15
+	fold16		v3, rk17
+	fold16		v4, rk19
+	fold16		v5, rk1
+	fold16		v6
+
+	// instead of 64, we add 48 to the loop counter to save 1 instruction
+	// from the loop instead of a cmp instruction, we use the negative
+	// flag with the jl instruction
+	adds		arg3, arg3, #(128-16)
+	b.lt		_final_reduction_for_128
+
+	// now we have 16+y bytes left to reduce. 16 Bytes is in register v7
+	// and the rest is in memory. We can fold 16 bytes@a time if y>=16
+	// continue folding 16B at a time
+
+_16B_reduction_loop:
+	pmull		v8.1q, v7.1d, v10.1d
+	pmull2		v7.1q, v7.2d, v10.2d
+	eor		v7.16b, v7.16b, v8.16b
+
+	ldr		q0, [arg2], #16
+CPU_LE(	rev64		v0.16b, v0.16b			)
+CPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)
+	eor		v7.16b, v7.16b, v0.16b
+	subs		arg3, arg3, #16
+
+	// instead of a cmp instruction, we utilize the flags with the
+	// jge instruction equivalent of: cmp arg3, 16-16
+	// check if there is any more 16B in the buffer to be able to fold
+	b.ge		_16B_reduction_loop
+
+_final_reduction_for_128:
+	// compute crc of a 128-bit value
+	ldr		q10, rk5		// rk5 and rk6 in xmm10
+
+	// 64b fold
+	ext		v0.16b, vzr.16b, v7.16b, #8
+	mov		v7.d[0], v7.d[1]
+	pmull		v7.1q, v7.1d, v10.1d
+	eor		v7.16b, v7.16b, v0.16b
+
+	// 32b fold
+	ext		v0.16b, v7.16b, vzr.16b, #4
+	mov		v7.s[3], vzr.s[0]
+	pmull2		v0.1q, v0.2d, v10.2d
+	eor		v7.16b, v7.16b, v0.16b
+
+	// barrett reduction
+_barrett:
+	ldr		q10, rk7
+	mov		v0.d[0], v7.d[1]
+
+	pmull		v0.1q, v0.1d, v10.1d
+	ext		v0.16b, vzr.16b, v0.16b, #12
+	pmull2		v0.1q, v0.2d, v10.2d
+	ext		v0.16b, vzr.16b, v0.16b, #12
+	eor		v7.16b, v7.16b, v0.16b
+	mov		w0, v7.s[1]
+
+_cleanup:
+	// scale the result back to 16 bits
+	lsr		x0, x0, #16
+	ldp		x29, x30, [sp], #16
+	ret
+
+_less_than_128:
+	cbz		arg3, _cleanup
+
+	movi		v0.16b, #0
+	mov		v0.s[3], arg1_low32	// get the initial crc value
+
+	ldr		q7, [arg2], #0x10
+CPU_LE(	rev64		v7.16b, v7.16b			)
+CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)
+	eor		v7.16b, v7.16b, v0.16b	// xor the initial crc value
+
+	// check if there is enough buffer to be able to fold 16B at a time
+	cmp		arg3, #32
+	b.lt		_final_reduction_for_128
+
+	// now if there is, load the constants
+	ldr		q10, rk1		// rk1 and rk2 in xmm10
+
+	// update the counter. subtract 32 instead of 16 to save one
+	// instruction from the loop
+	sub		arg3, arg3, #32
+	b		_16B_reduction_loop
+ENDPROC(crc_t10dif_pmull)
+
+// precomputed constants
+// these constants are precomputed from the poly:
+// 0x8bb70000 (0x8bb7 scaled to 32 bits)
+	.align		4
+// Q = 0x18BB70000
+// rk1 = 2^(32*3) mod Q << 32
+// rk2 = 2^(32*5) mod Q << 32
+// rk3 = 2^(32*15) mod Q << 32
+// rk4 = 2^(32*17) mod Q << 32
+// rk5 = 2^(32*3) mod Q << 32
+// rk6 = 2^(32*2) mod Q << 32
+// rk7 = floor(2^64/Q)
+// rk8 = Q
+
+rk1:	.octa		0x06df0000000000002d56000000000000
+rk3:	.octa		0x7cf50000000000009d9d000000000000
+rk5:	.octa		0x13680000000000002d56000000000000
+rk7:	.octa		0x000000018bb7000000000001f65a57f8
+rk9:	.octa		0xbfd6000000000000ceae000000000000
+rk11:	.octa		0x713c0000000000001e16000000000000
+rk13:	.octa		0x80a6000000000000f7f9000000000000
+rk15:	.octa		0xe658000000000000044c000000000000
+rk17:	.octa		0xa497000000000000ad18000000000000
+rk19:	.octa		0xe7b50000000000006ee3000000000000
diff --git a/arch/arm64/crypto/crct10dif-ce-glue.c b/arch/arm64/crypto/crct10dif-ce-glue.c
new file mode 100644
index 000000000000..735678884194
--- /dev/null
+++ b/arch/arm64/crypto/crct10dif-ce-glue.c
@@ -0,0 +1,91 @@
+/*
+ * Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
+ *
+ * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/cpufeature.h>
+#include <linux/crc-t10dif.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
+
+#include <crypto/internal/hash.h>
+
+#include <asm/neon.h>
+
+#define CRC_T10DIF_PMULL_CHUNK_SIZE	16U
+
+asmlinkage u16 crc_t10dif_pmull(u16 init_crc, const u8 buf[], u64 len);
+
+static int crct10dif_init(struct shash_desc *desc)
+{
+	u16 *crc = shash_desc_ctx(desc);
+
+	*crc = 0;
+	return 0;
+}
+
+static int crct10dif_update(struct shash_desc *desc, const u8 *data,
+			 unsigned int length)
+{
+	u16 *crc = shash_desc_ctx(desc);
+
+	if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE) {
+		unsigned int l = length & ~(CRC_T10DIF_PMULL_CHUNK_SIZE - 1);
+
+		kernel_neon_begin_partial(14);
+		*crc = crc_t10dif_pmull(*crc, data, l);
+		kernel_neon_end();
+
+		data += l;
+	}
+	if (length % CRC_T10DIF_PMULL_CHUNK_SIZE)
+		*crc = crc_t10dif_generic(*crc, data,
+					  length % CRC_T10DIF_PMULL_CHUNK_SIZE);
+
+	return 0;
+}
+
+static int crct10dif_final(struct shash_desc *desc, u8 *out)
+{
+	u16 *crc = shash_desc_ctx(desc);
+
+	*(u16 *)out = *crc;
+	return 0;
+}
+
+static struct shash_alg crc_t10dif_alg = {
+	.digestsize		= CRC_T10DIF_DIGEST_SIZE,
+	.init			= crct10dif_init,
+	.update			= crct10dif_update,
+	.final			= crct10dif_final,
+	.descsize		= CRC_T10DIF_DIGEST_SIZE,
+
+	.base.cra_name		= "crct10dif",
+	.base.cra_driver_name	= "crct10dif-arm64-ce",
+	.base.cra_priority	= 200,
+	.base.cra_blocksize	= CRC_T10DIF_BLOCK_SIZE,
+	.base.cra_module	= THIS_MODULE,
+};
+
+static int __init crc_t10dif_mod_init(void)
+{
+	return crypto_register_shash(&crc_t10dif_alg);
+}
+
+static void __exit crc_t10dif_mod_exit(void)
+{
+	crypto_unregister_shash(&crc_t10dif_alg);
+}
+
+module_cpu_feature_match(PMULL, crc_t10dif_mod_init);
+module_exit(crc_t10dif_mod_exit);
+
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH v2 4/6] crypto: arm/crct10dif - port x86 SSE implementation to ARM
  2016-12-04 11:54 [PATCH v2 0/6] crypto: ARM/arm64 CRC-T10DIF/CRC32/CRC32C roundup Ard Biesheuvel
                   ` (2 preceding siblings ...)
  2016-12-04 11:54 ` [PATCH v2 3/6] crypto: arm64/crct10dif - port x86 SSE implementation to arm64 Ard Biesheuvel
@ 2016-12-04 11:54 ` Ard Biesheuvel
  2016-12-04 11:54 ` [PATCH v2 5/6] crypto: arm64/crc32 - accelerated support based on x86 SSE implementation Ard Biesheuvel
  2016-12-04 11:54 ` [PATCH v2 6/6] crypto: arm/crc32 " Ard Biesheuvel
  5 siblings, 0 replies; 7+ messages in thread
From: Ard Biesheuvel @ 2016-12-04 11:54 UTC (permalink / raw)
  To: linux-arm-kernel

This is a transliteration of the Intel algorithm implemented
using SSE and PCLMULQDQ instructions that resides in the file
arch/x86/crypto/crct10dif-pcl-asm_64.S, but simplified to only
operate on multiples of 16 bytes. The residual data is handled
by the generic C implementation.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
 arch/arm/crypto/Kconfig             |   5 +
 arch/arm/crypto/Makefile            |   2 +
 arch/arm/crypto/crct10dif-ce-core.S | 349 ++++++++++++++++++++
 arch/arm/crypto/crct10dif-ce-glue.c |  95 ++++++
 4 files changed, 451 insertions(+)

diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig
index 27ed1b1cd1d7..fce801fa52a1 100644
--- a/arch/arm/crypto/Kconfig
+++ b/arch/arm/crypto/Kconfig
@@ -120,4 +120,9 @@ config CRYPTO_GHASH_ARM_CE
 	  that uses the 64x64 to 128 bit polynomial multiplication (vmull.p64)
 	  that is part of the ARMv8 Crypto Extensions
 
+config CRYPTO_CRCT10DIF_ARM_CE
+	tristate "CRCT10DIF digest algorithm using PMULL instructions"
+	depends on KERNEL_MODE_NEON && CRC_T10DIF
+	select CRYPTO_HASH
+
 endif
diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
index fc5150702b64..fc77265014b7 100644
--- a/arch/arm/crypto/Makefile
+++ b/arch/arm/crypto/Makefile
@@ -13,6 +13,7 @@ ce-obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
 ce-obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o
 ce-obj-$(CONFIG_CRYPTO_SHA2_ARM_CE) += sha2-arm-ce.o
 ce-obj-$(CONFIG_CRYPTO_GHASH_ARM_CE) += ghash-arm-ce.o
+ce-obj-$(CONFIG_CRYPTO_CRCT10DIF_ARM_CE) += crct10dif-arm-ce.o
 
 ifneq ($(ce-obj-y)$(ce-obj-m),)
 ifeq ($(call as-instr,.fpu crypto-neon-fp-armv8,y,n),y)
@@ -36,6 +37,7 @@ sha1-arm-ce-y	:= sha1-ce-core.o sha1-ce-glue.o
 sha2-arm-ce-y	:= sha2-ce-core.o sha2-ce-glue.o
 aes-arm-ce-y	:= aes-ce-core.o aes-ce-glue.o
 ghash-arm-ce-y	:= ghash-ce-core.o ghash-ce-glue.o
+crct10dif-arm-ce-y	:= crct10dif-ce-core.o crct10dif-ce-glue.o
 
 quiet_cmd_perl = PERL    $@
       cmd_perl = $(PERL) $(<) > $(@)
diff --git a/arch/arm/crypto/crct10dif-ce-core.S b/arch/arm/crypto/crct10dif-ce-core.S
new file mode 100644
index 000000000000..ae2adb54e905
--- /dev/null
+++ b/arch/arm/crypto/crct10dif-ce-core.S
@@ -0,0 +1,349 @@
+//
+// Accelerated CRC-T10DIF using ARM NEON and Crypto Extensions instructions
+//
+// Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License version 2 as
+// published by the Free Software Foundation.
+//
+
+//
+// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
+//
+// Copyright (c) 2013, Intel Corporation
+//
+// Authors:
+//     Erdinc Ozturk <erdinc.ozturk@intel.com>
+//     Vinodh Gopal <vinodh.gopal@intel.com>
+//     James Guilford <james.guilford@intel.com>
+//     Tim Chen <tim.c.chen@linux.intel.com>
+//
+// This software is available to you under a choice of one of two
+// licenses.  You may choose to be licensed under the terms of the GNU
+// General Public License (GPL) Version 2, available from the file
+// COPYING in the main directory of this source tree, or the
+// OpenIB.org BSD license below:
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the
+//   distribution.
+//
+// * Neither the name of the Intel Corporation nor the names of its
+//   contributors may be used to endorse or promote products derived from
+//   this software without specific prior written permission.
+//
+//
+// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+//       Function API:
+//       UINT16 crc_t10dif_pcl(
+//               UINT16 init_crc, //initial CRC value, 16 bits
+//               const unsigned char *buf, //buffer pointer to calculate CRC on
+//               UINT64 len //buffer length in bytes (64-bit data)
+//       );
+//
+//       Reference paper titled "Fast CRC Computation for Generic
+//	Polynomials Using PCLMULQDQ Instruction"
+//       URL: http://www.intel.com/content/dam/www/public/us/en/documents
+//  /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
+//
+//
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+#ifdef CONFIG_CPU_ENDIAN_BE8
+#define CPU_LE(code...)
+#else
+#define CPU_LE(code...)		code
+#endif
+
+	.text
+	.fpu		crypto-neon-fp-armv8
+
+	arg1_low32	.req	r0
+	arg2		.req	r1
+	arg3		.req	r2
+
+	qzr		.req	q13
+
+	q0l		.req	d0
+	q0h		.req	d1
+	q1l		.req	d2
+	q1h		.req	d3
+	q2l		.req	d4
+	q2h		.req	d5
+	q3l		.req	d6
+	q3h		.req	d7
+	q4l		.req	d8
+	q4h		.req	d9
+	q5l		.req	d10
+	q5h		.req	d11
+	q6l		.req	d12
+	q6h		.req	d13
+	q7l		.req	d14
+	q7h		.req	d15
+
+ENTRY(crc_t10dif_pmull)
+	push		{r4, lr}
+
+	vmov.i8		qzr, #0			// init zero register
+
+	// adjust the 16-bit initial_crc value, scale it to 32 bits
+	lsl		arg1_low32, arg1_low32, #16
+
+	// check if smaller than 256
+	cmp		arg3, #256
+
+	// for sizes less than 128, we can't fold 64B at a time...
+	blt		_less_than_128
+
+	// load the initial crc value
+	// crc value does not need to be byte-reflected, but it needs
+	// to be moved to the high part of the register.
+	// because data will be byte-reflected and will align with
+	// initial crc at correct place.
+	vmov		s0, arg1_low32		// initial crc
+	vext.8		q10, qzr, q0, #4
+
+	// receive the initial 64B data, xor the initial crc value
+	vld1.64		{q0-q1}, [arg2]!
+	vld1.64		{q2-q3}, [arg2]!
+	vld1.64		{q4-q5}, [arg2]!
+	vld1.64		{q6-q7}, [arg2]!
+CPU_LE(	vrev64.8	q0, q0			)
+CPU_LE(	vrev64.8	q1, q1			)
+CPU_LE(	vrev64.8	q2, q2			)
+CPU_LE(	vrev64.8	q3, q3			)
+CPU_LE(	vrev64.8	q4, q4			)
+CPU_LE(	vrev64.8	q5, q5			)
+CPU_LE(	vrev64.8	q6, q6			)
+CPU_LE(	vrev64.8	q7, q7			)
+
+	vswp		d0, d1
+	vswp		d2, d3
+	vswp		d4, d5
+	vswp		d6, d7
+	vswp		d8, d9
+	vswp		d10, d11
+	vswp		d12, d13
+	vswp		d14, d15
+
+	// XOR the initial_crc value
+	veor.8		q0, q0, q10
+
+	adr		ip, rk3
+	vld1.64		{q10}, [ip]	// xmm10 has rk3 and rk4
+					// type of pmull instruction
+					// will determine which constant to use
+
+	//
+	// we subtract 256 instead of 128 to save one instruction from the loop
+	//
+	sub		arg3, arg3, #256
+
+	// at this section of the code, there is 64*x+y (0<=y<64) bytes of
+	// buffer. The _fold_64_B_loop will fold 64B at a time
+	// until we have 64+y Bytes of buffer
+
+
+	// fold 64B at a time. This section of the code folds 4 vector
+	// registers in parallel
+_fold_64_B_loop:
+
+	.macro		fold64, reg1, reg2
+	vld1.64		{q11-q12}, [arg2]!
+
+	vmull.p64	q8, \reg1\()h, d21
+	vmull.p64	\reg1, \reg1\()l, d20
+	vmull.p64	q9, \reg2\()h, d21
+	vmull.p64	\reg2, \reg2\()l, d20
+
+CPU_LE(	vrev64.8	q11, q11		)
+CPU_LE(	vrev64.8	q12, q12		)
+	vswp		d22, d23
+	vswp		d24, d25
+
+	veor.8		\reg1, \reg1, q8
+	veor.8		\reg2, \reg2, q9
+	veor.8		\reg1, \reg1, q11
+	veor.8		\reg2, \reg2, q12
+	.endm
+
+	fold64		q0, q1
+	fold64		q2, q3
+	fold64		q4, q5
+	fold64		q6, q7
+
+	subs		arg3, arg3, #128
+
+	// check if there is another 64B in the buffer to be able to fold
+	bge		_fold_64_B_loop
+
+	// at this point, the buffer pointer is pointing at the last y Bytes
+	// of the buffer the 64B of folded data is in 4 of the vector
+	// registers: v0, v1, v2, v3
+
+	// fold the 8 vector registers to 1 vector register with different
+	// constants
+
+	adr		ip, rk9
+	vld1.64		{q10}, [ip]!
+
+	.macro		fold16, reg, rk
+	vmull.p64	q8, \reg\()l, d20
+	vmull.p64	\reg, \reg\()h, d21
+	.ifnb		\rk
+	vld1.64		{q10}, [ip]!
+	.endif
+	veor.8		q7, q7, q8
+	veor.8		q7, q7, \reg
+	.endm
+
+	fold16		q0, rk11
+	fold16		q1, rk13
+	fold16		q2, rk15
+	fold16		q3, rk17
+	fold16		q4, rk19
+	fold16		q5, rk1
+	fold16		q6
+
+	// instead of 64, we add 48 to the loop counter to save 1 instruction
+	// from the loop instead of a cmp instruction, we use the negative
+	// flag with the jl instruction
+	adds		arg3, arg3, #(128-16)
+	blt		_final_reduction_for_128
+
+	// now we have 16+y bytes left to reduce. 16 Bytes is in register v7
+	// and the rest is in memory. We can fold 16 bytes@a time if y>=16
+	// continue folding 16B at a time
+
+_16B_reduction_loop:
+	vmull.p64	q8, d14, d20
+	vmull.p64	q7, d15, d21
+	veor.8		q7, q7, q8
+
+	vld1.64		{q0}, [arg2]!
+CPU_LE(	vrev64.8	q0, q0		)
+	vswp		d0, d1
+	veor.8		q7, q7, q0
+	subs		arg3, arg3, #16
+
+	// instead of a cmp instruction, we utilize the flags with the
+	// jge instruction equivalent of: cmp arg3, 16-16
+	// check if there is any more 16B in the buffer to be able to fold
+	bge		_16B_reduction_loop
+
+_final_reduction_for_128:
+	// compute crc of a 128-bit value
+	vldr		d20, rk5
+	vldr		d21, rk6		// rk5 and rk6 in xmm10
+
+	// 64b fold
+	vext.8		q0, qzr, q7, #8
+	vmull.p64	q7, d15, d20
+	veor.8		q7, q7, q0
+
+	// 32b fold
+	vext.8		q0, q7, qzr, #12
+	vmov		s31, s3
+	vmull.p64	q0, d0, d21
+	veor.8		q7, q0, q7
+
+	// barrett reduction
+_barrett:
+	vldr		d20, rk7
+	vldr		d21, rk8
+
+	vmull.p64	q0, d15, d20
+	vext.8		q0, qzr, q0, #12
+	vmull.p64	q0, d1, d21
+	vext.8		q0, qzr, q0, #12
+	veor.8		q7, q7, q0
+	vmov		r0, s29
+
+_cleanup:
+	// scale the result back to 16 bits
+	lsr		r0, r0, #16
+	pop		{r4, pc}
+
+_less_than_128:
+	teq		arg3, #0
+	beq		_cleanup
+
+	vmov.i8		q0, #0
+	vmov		s3, arg1_low32		// get the initial crc value
+
+	vld1.64		{q7}, [arg2]!
+CPU_LE(	vrev64.8	q7, q7		)
+	vswp		d14, d15
+	veor.8		q7, q7, q0
+
+	// check if there is enough buffer to be able to fold 16B at a time
+	cmp		arg3, #32
+	blt		_final_reduction_for_128
+
+	// now if there is, load the constants
+	vldr		d20, rk1
+	vldr		d21, rk2		// rk1 and rk2 in xmm10
+
+	// update the counter. subtract 32 instead of 16 to save one
+	// instruction from the loop
+	sub		arg3, arg3, #32
+
+	b		_16B_reduction_loop
+ENDPROC(crc_t10dif_pmull)
+
+// precomputed constants
+// these constants are precomputed from the poly:
+// 0x8bb70000 (0x8bb7 scaled to 32 bits)
+	.align		4
+// Q = 0x18BB70000
+// rk1 = 2^(32*3) mod Q << 32
+// rk2 = 2^(32*5) mod Q << 32
+// rk3 = 2^(32*15) mod Q << 32
+// rk4 = 2^(32*17) mod Q << 32
+// rk5 = 2^(32*3) mod Q << 32
+// rk6 = 2^(32*2) mod Q << 32
+// rk7 = floor(2^64/Q)
+// rk8 = Q
+
+rk3:	.quad		0x9d9d000000000000
+rk4:	.quad		0x7cf5000000000000
+rk5:	.quad		0x2d56000000000000
+rk6:	.quad		0x1368000000000000
+rk7:	.quad		0x00000001f65a57f8
+rk8:	.quad		0x000000018bb70000
+rk9:	.quad		0xceae000000000000
+rk10:	.quad		0xbfd6000000000000
+rk11:	.quad		0x1e16000000000000
+rk12:	.quad		0x713c000000000000
+rk13:	.quad		0xf7f9000000000000
+rk14:	.quad		0x80a6000000000000
+rk15:	.quad		0x044c000000000000
+rk16:	.quad		0xe658000000000000
+rk17:	.quad		0xad18000000000000
+rk18:	.quad		0xa497000000000000
+rk19:	.quad		0x6ee3000000000000
+rk20:	.quad		0xe7b5000000000000
+rk1:	.quad		0x2d56000000000000
+rk2:	.quad		0x06df000000000000
diff --git a/arch/arm/crypto/crct10dif-ce-glue.c b/arch/arm/crypto/crct10dif-ce-glue.c
new file mode 100644
index 000000000000..8225422c34a7
--- /dev/null
+++ b/arch/arm/crypto/crct10dif-ce-glue.c
@@ -0,0 +1,95 @@
+/*
+ * Accelerated CRC-T10DIF using ARM NEON and Crypto Extensions instructions
+ *
+ * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/crc-t10dif.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
+
+#include <crypto/internal/hash.h>
+
+#include <asm/neon.h>
+#include <asm/simd.h>
+
+#define CRC_T10DIF_PMULL_CHUNK_SIZE	16U
+
+asmlinkage u16 crc_t10dif_pmull(u16 init_crc, const u8 buf[], u64 len);
+
+static int crct10dif_init(struct shash_desc *desc)
+{
+	u16 *crc = shash_desc_ctx(desc);
+
+	*crc = 0;
+	return 0;
+}
+
+static int crct10dif_update(struct shash_desc *desc, const u8 *data,
+			 unsigned int length)
+{
+	u16 *crc = shash_desc_ctx(desc);
+
+	if (may_use_simd() && length >= CRC_T10DIF_PMULL_CHUNK_SIZE) {
+		unsigned int l = length & ~(CRC_T10DIF_PMULL_CHUNK_SIZE - 1);
+
+		kernel_neon_begin();
+		*crc = crc_t10dif_pmull(*crc, data, l);
+		kernel_neon_end();
+
+		length -= l;
+		data += l;
+	}
+	if (length > 0)
+		*crc = crc_t10dif_generic(*crc, data, length);
+
+	return 0;
+}
+
+static int crct10dif_final(struct shash_desc *desc, u8 *out)
+{
+	u16 *crc = shash_desc_ctx(desc);
+
+	*(u16 *)out = *crc;
+	return 0;
+}
+
+static struct shash_alg crc_t10dif_alg = {
+	.digestsize		= CRC_T10DIF_DIGEST_SIZE,
+	.init			= crct10dif_init,
+	.update			= crct10dif_update,
+	.final			= crct10dif_final,
+	.descsize		= CRC_T10DIF_DIGEST_SIZE,
+
+	.base.cra_name		= "crct10dif",
+	.base.cra_driver_name	= "crct10dif-arm-ce",
+	.base.cra_priority	= 200,
+	.base.cra_blocksize	= CRC_T10DIF_BLOCK_SIZE,
+	.base.cra_module	= THIS_MODULE,
+};
+
+static int __init crc_t10dif_mod_init(void)
+{
+	if (!(elf_hwcap2 & HWCAP2_PMULL))
+		return -ENODEV;
+
+	return crypto_register_shash(&crc_t10dif_alg);
+}
+
+static void __exit crc_t10dif_mod_exit(void)
+{
+	crypto_unregister_shash(&crc_t10dif_alg);
+}
+
+module_init(crc_t10dif_mod_init);
+module_exit(crc_t10dif_mod_exit);
+
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_CRYPTO("crct10dif");
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH v2 5/6] crypto: arm64/crc32 - accelerated support based on x86 SSE implementation
  2016-12-04 11:54 [PATCH v2 0/6] crypto: ARM/arm64 CRC-T10DIF/CRC32/CRC32C roundup Ard Biesheuvel
                   ` (3 preceding siblings ...)
  2016-12-04 11:54 ` [PATCH v2 4/6] crypto: arm/crct10dif - port x86 SSE implementation to ARM Ard Biesheuvel
@ 2016-12-04 11:54 ` Ard Biesheuvel
  2016-12-04 11:54 ` [PATCH v2 6/6] crypto: arm/crc32 " Ard Biesheuvel
  5 siblings, 0 replies; 7+ messages in thread
From: Ard Biesheuvel @ 2016-12-04 11:54 UTC (permalink / raw)
  To: linux-arm-kernel

This is a combination of the the Intel algorithm implemented using SSE
and PCLMULQDQ instructions from arch/x86/crypto/crc32-pclmul_asm.S, and
the new CRC32 extensions introduced for both 32-bit and 64-bit ARM in
version 8 of the architecture. Two versions of the above combo are
provided, one for CRC32 and one for CRC32C.

The PMULL/NEON algorithm is faster, but operates on blocks of at least
64 bytes, and on multiples of 16 bytes only. For the remaining input,
or for all input on systems that lack the PMULL 64x64->128 instructions,
the CRC32 instructions will be used.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
 arch/arm64/crypto/Kconfig         |   6 +
 arch/arm64/crypto/Makefile        |   3 +
 arch/arm64/crypto/crc32-ce-core.S | 266 ++++++++++++++++++++
 arch/arm64/crypto/crc32-ce-glue.c | 188 ++++++++++++++
 4 files changed, 463 insertions(+)

diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
index d773c0659202..21835deb1ab9 100644
--- a/arch/arm64/crypto/Kconfig
+++ b/arch/arm64/crypto/Kconfig
@@ -28,6 +28,11 @@ config CRYPTO_CRCT10DIF_ARM64_CE
 	depends on KERNEL_MODE_NEON && CRC_T10DIF
 	select CRYPTO_HASH
 
+config CRYPTO_CRC32_ARM64_CE
+	tristate "CRC32 and CRC32C digest algorithms using PMULL instructions"
+	depends on KERNEL_MODE_NEON && CRC32
+	select CRYPTO_HASH
+
 config CRYPTO_AES_ARM64_CE
 	tristate "AES core cipher using ARMv8 Crypto Extensions"
 	depends on ARM64 && KERNEL_MODE_NEON
@@ -58,4 +63,5 @@ config CRYPTO_CRC32_ARM64
 	tristate "CRC32 and CRC32C using optional ARMv8 instructions"
 	depends on ARM64
 	select CRYPTO_HASH
+
 endif
diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile
index 36fd3eb4201b..144387805a46 100644
--- a/arch/arm64/crypto/Makefile
+++ b/arch/arm64/crypto/Makefile
@@ -20,6 +20,9 @@ ghash-ce-y := ghash-ce-glue.o ghash-ce-core.o
 obj-$(CONFIG_CRYPTO_CRCT10DIF_ARM64_CE) += crct10dif-ce.o
 crct10dif-ce-y := crct10dif-ce-core.o crct10dif-ce-glue.o
 
+obj-$(CONFIG_CRYPTO_CRC32_ARM64_CE) += crc32-ce.o
+crc32-ce-y:= crc32-ce-core.o crc32-ce-glue.o
+
 obj-$(CONFIG_CRYPTO_AES_ARM64_CE) += aes-ce-cipher.o
 CFLAGS_aes-ce-cipher.o += -march=armv8-a+crypto
 
diff --git a/arch/arm64/crypto/crc32-ce-core.S b/arch/arm64/crypto/crc32-ce-core.S
new file mode 100644
index 000000000000..18f5a8442276
--- /dev/null
+++ b/arch/arm64/crypto/crc32-ce-core.S
@@ -0,0 +1,266 @@
+/*
+ * Accelerated CRC32(C) using arm64 CRC, NEON and Crypto Extensions instructions
+ *
+ * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see http://www.gnu.org/licenses
+ *
+ * Please  visit http://www.xyratex.com/contact if you need additional
+ * information or have any questions.
+ *
+ * GPL HEADER END
+ */
+
+/*
+ * Copyright 2012 Xyratex Technology Limited
+ *
+ * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32
+ * calculation.
+ * CRC32 polynomial:0x04c11db7(BE)/0xEDB88320(LE)
+ * PCLMULQDQ is a new instruction in Intel SSE4.2, the reference can be found
+ * at:
+ * http://www.intel.com/products/processor/manuals/
+ * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
+ * Volume 2B: Instruction Set Reference, N-Z
+ *
+ * Authors:   Gregory Prestas <Gregory_Prestas@us.xyratex.com>
+ *	      Alexander Boyko <Alexander_Boyko@xyratex.com>
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+	.text
+	.align		6
+	.cpu		generic+crypto+crc
+
+.Lcrc32_constants:
+	/*
+	 * [x4*128+32 mod P(x) << 32)]'  << 1   = 0x154442bd4
+	 * #define CONSTANT_R1  0x154442bd4LL
+	 *
+	 * [(x4*128-32 mod P(x) << 32)]' << 1   = 0x1c6e41596
+	 * #define CONSTANT_R2  0x1c6e41596LL
+	 */
+	.octa		0x00000001c6e415960000000154442bd4
+
+	/*
+	 * [(x128+32 mod P(x) << 32)]'   << 1   = 0x1751997d0
+	 * #define CONSTANT_R3  0x1751997d0LL
+	 *
+	 * [(x128-32 mod P(x) << 32)]'   << 1   = 0x0ccaa009e
+	 * #define CONSTANT_R4  0x0ccaa009eLL
+	 */
+	.octa		0x00000000ccaa009e00000001751997d0
+
+	/*
+	 * [(x64 mod P(x) << 32)]'       << 1   = 0x163cd6124
+	 * #define CONSTANT_R5  0x163cd6124LL
+	 */
+	.quad		0x0000000163cd6124
+	.quad		0x00000000FFFFFFFF
+
+	/*
+	 * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL
+	 *
+	 * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))`
+	 *                                                      = 0x1F7011641LL
+	 * #define CONSTANT_RU  0x1F7011641LL
+	 */
+	.octa		0x00000001F701164100000001DB710641
+
+.Lcrc32c_constants:
+	.octa		0x000000009e4addf800000000740eef02
+	.octa		0x000000014cd00bd600000000f20c0dfe
+	.quad		0x00000000dd45aab8
+	.quad		0x00000000FFFFFFFF
+	.octa		0x00000000dea713f10000000105ec76f0
+
+	vCONSTANT	.req	v0
+	dCONSTANT	.req	d0
+	qCONSTANT	.req	q0
+
+	BUF		.req	x0
+	LEN		.req	x1
+	CRC		.req	x2
+
+	vzr		.req	v9
+
+	/**
+	 * Calculate crc32
+	 * BUF - buffer
+	 * LEN - sizeof buffer (multiple of 16 bytes), LEN should be > 63
+	 * CRC - initial crc32
+	 * return %eax crc32
+	 * uint crc32_pmull_le(unsigned char const *buffer,
+	 *                     size_t len, uint crc32)
+	 */
+ENTRY(crc32_pmull_le)
+	adr		x3, .Lcrc32_constants
+	b		0f
+
+ENTRY(crc32c_pmull_le)
+	adr		x3, .Lcrc32c_constants
+
+0:	bic		LEN, LEN, #15
+	ld1		{v1.16b-v4.16b}, [BUF], #0x40
+	movi		vzr.16b, #0
+	fmov		dCONSTANT, CRC
+	eor		v1.16b, v1.16b, vCONSTANT.16b
+	sub		LEN, LEN, #0x40
+	cmp		LEN, #0x40
+	b.lt		less_64
+
+	ldr		qCONSTANT, [x3]
+
+loop_64:		/* 64 bytes Full cache line folding */
+	sub		LEN, LEN, #0x40
+
+	pmull2		v5.1q, v1.2d, vCONSTANT.2d
+	pmull2		v6.1q, v2.2d, vCONSTANT.2d
+	pmull2		v7.1q, v3.2d, vCONSTANT.2d
+	pmull2		v8.1q, v4.2d, vCONSTANT.2d
+
+	pmull		v1.1q, v1.1d, vCONSTANT.1d
+	pmull		v2.1q, v2.1d, vCONSTANT.1d
+	pmull		v3.1q, v3.1d, vCONSTANT.1d
+	pmull		v4.1q, v4.1d, vCONSTANT.1d
+
+	eor		v1.16b, v1.16b, v5.16b
+	ld1		{v5.16b}, [BUF], #0x10
+	eor		v2.16b, v2.16b, v6.16b
+	ld1		{v6.16b}, [BUF], #0x10
+	eor		v3.16b, v3.16b, v7.16b
+	ld1		{v7.16b}, [BUF], #0x10
+	eor		v4.16b, v4.16b, v8.16b
+	ld1		{v8.16b}, [BUF], #0x10
+
+	eor		v1.16b, v1.16b, v5.16b
+	eor		v2.16b, v2.16b, v6.16b
+	eor		v3.16b, v3.16b, v7.16b
+	eor		v4.16b, v4.16b, v8.16b
+
+	cmp		LEN, #0x40
+	b.ge		loop_64
+
+less_64:		/* Folding cache line into 128bit */
+	ldr		qCONSTANT, [x3, #16]
+
+	pmull2		v5.1q, v1.2d, vCONSTANT.2d
+	pmull		v1.1q, v1.1d, vCONSTANT.1d
+	eor		v1.16b, v1.16b, v5.16b
+	eor		v1.16b, v1.16b, v2.16b
+
+	pmull2		v5.1q, v1.2d, vCONSTANT.2d
+	pmull		v1.1q, v1.1d, vCONSTANT.1d
+	eor		v1.16b, v1.16b, v5.16b
+	eor		v1.16b, v1.16b, v3.16b
+
+	pmull2		v5.1q, v1.2d, vCONSTANT.2d
+	pmull		v1.1q, v1.1d, vCONSTANT.1d
+	eor		v1.16b, v1.16b, v5.16b
+	eor		v1.16b, v1.16b, v4.16b
+
+	cbz		LEN, fold_64
+
+loop_16:		/* Folding rest buffer into 128bit */
+	subs		LEN, LEN, #0x10
+
+	ld1		{v2.16b}, [BUF], #0x10
+	pmull2		v5.1q, v1.2d, vCONSTANT.2d
+	pmull		v1.1q, v1.1d, vCONSTANT.1d
+	eor		v1.16b, v1.16b, v5.16b
+	eor		v1.16b, v1.16b, v2.16b
+
+	b.ne		loop_16
+
+fold_64:
+	/* perform the last 64 bit fold, also adds 32 zeroes
+	 * to the input stream */
+	ext		v2.16b, v1.16b, v1.16b, #8
+	pmull2		v2.1q, v2.2d, vCONSTANT.2d
+	ext		v1.16b, v1.16b, vzr.16b, #8
+	eor		v1.16b, v1.16b, v2.16b
+
+	/* final 32-bit fold */
+	ldr		dCONSTANT, [x3, #32]
+	ldr		d3, [x3, #40]
+
+	ext		v2.16b, v1.16b, vzr.16b, #4
+	and		v1.16b, v1.16b, v3.16b
+	pmull		v1.1q, v1.1d, vCONSTANT.1d
+	eor		v1.16b, v1.16b, v2.16b
+
+	/* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */
+	ldr		qCONSTANT, [x3, #48]
+
+	and		v2.16b, v1.16b, v3.16b
+	ext		v2.16b, vzr.16b, v2.16b, #8
+	pmull2		v2.1q, v2.2d, vCONSTANT.2d
+	and		v2.16b, v2.16b, v3.16b
+	pmull		v2.1q, v2.1d, vCONSTANT.1d
+	eor		v1.16b, v1.16b, v2.16b
+	mov		w0, v1.s[1]
+
+	ret
+ENDPROC(crc32_pmull_le)
+ENDPROC(crc32c_pmull_le)
+
+	.macro		__crc32, c
+0:	subs		x2, x2, #16
+	b.mi		8f
+	ldp		x3, x4, [x1], #16
+CPU_BE(	rev		x3, x3		)
+CPU_BE(	rev		x4, x4		)
+	crc32\c\()x	w0, w0, x3
+	crc32\c\()x	w0, w0, x4
+	b.ne		0b
+	ret
+
+8:	tbz		x2, #3, 4f
+	ldr		x3, [x1], #8
+CPU_BE(	rev		x3, x3		)
+	crc32\c\()x	w0, w0, x3
+4:	tbz		x2, #2, 2f
+	ldr		w3, [x1], #4
+CPU_BE(	rev		w3, w3		)
+	crc32\c\()w	w0, w0, w3
+2:	tbz		x2, #1, 1f
+	ldrh		w3, [x1], #2
+CPU_BE(	rev16		w3, w3		)
+	crc32\c\()h	w0, w0, w3
+1:	tbz		x2, #0, 0f
+	ldrb		w3, [x1]
+	crc32\c\()b	w0, w0, w3
+0:	ret
+	.endm
+
+	.align		5
+ENTRY(crc32_armv8_le)
+	__crc32
+ENDPROC(crc32_armv8_le)
+
+	.align		5
+ENTRY(crc32c_armv8_le)
+	__crc32		c
+ENDPROC(crc32c_armv8_le)
diff --git a/arch/arm64/crypto/crc32-ce-glue.c b/arch/arm64/crypto/crc32-ce-glue.c
new file mode 100644
index 000000000000..c3c7e5848e2a
--- /dev/null
+++ b/arch/arm64/crypto/crc32-ce-glue.c
@@ -0,0 +1,188 @@
+/*
+ * Accelerated CRC32(C) using arm64 NEON and Crypto Extensions instructions
+ *
+ * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/cpufeature.h>
+#include <linux/crc32.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
+
+#include <crypto/internal/hash.h>
+
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+#include <asm/unaligned.h>
+
+#define PMULL_MIN_LEN		64L	/* minimum size of buffer
+					 * for crc32_pmull_le_16 */
+#define SCALE_F			16L	/* size of NEON register */
+
+asmlinkage u32 crc32_pmull_le(const u8 buf[], u64 len, u32 init_crc);
+asmlinkage u32 crc32_armv8_le(u32 init_crc, const u8 buf[], u64 len);
+
+asmlinkage u32 crc32c_pmull_le(const u8 buf[], u64 len, u32 init_crc);
+asmlinkage u32 crc32c_armv8_le(u32 init_crc, const u8 buf[], u64 len);
+
+static int crc32_pmull_cra_init(struct crypto_tfm *tfm)
+{
+	u32 *key = crypto_tfm_ctx(tfm);
+
+	*key = 0;
+	return 0;
+}
+
+static int crc32c_pmull_cra_init(struct crypto_tfm *tfm)
+{
+	u32 *key = crypto_tfm_ctx(tfm);
+
+	*key = ~0;
+	return 0;
+}
+
+static int crc32_pmull_setkey(struct crypto_shash *hash, const u8 *key,
+			      unsigned int keylen)
+{
+	u32 *mctx = crypto_shash_ctx(hash);
+
+	if (keylen != sizeof(u32)) {
+		crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN);
+		return -EINVAL;
+	}
+	*mctx = le32_to_cpup((__le32 *)key);
+	return 0;
+}
+
+static int crc32_pmull_init(struct shash_desc *desc)
+{
+	u32 *mctx = crypto_shash_ctx(desc->tfm);
+	u32 *crc = shash_desc_ctx(desc);
+
+	*crc = *mctx;
+	return 0;
+}
+
+static int crc32_pmull_update(struct shash_desc *desc, const u8 *data,
+			 unsigned int length)
+{
+	u32 *crc = shash_desc_ctx(desc);
+
+	if (length >= PMULL_MIN_LEN) {
+		unsigned int l = round_down(length, SCALE_F);
+
+		kernel_neon_begin_partial(10);
+		*crc = crc32_pmull_le(data, l, *crc);
+		kernel_neon_end();
+
+		data += l;
+		length -= l;
+	}
+
+	if (length > 0) {
+		if (elf_hwcap & HWCAP_CRC32)
+			*crc = crc32_armv8_le(*crc, data, length);
+		else
+			*crc = crc32_le(*crc, data, length);
+	}
+
+	return 0;
+}
+
+static int crc32c_pmull_update(struct shash_desc *desc, const u8 *data,
+			 unsigned int length)
+{
+	u32 *crc = shash_desc_ctx(desc);
+
+	if (length >= PMULL_MIN_LEN) {
+		unsigned int l = round_down(length, SCALE_F);
+
+		kernel_neon_begin_partial(10);
+		*crc = crc32c_pmull_le(data, l, *crc);
+		kernel_neon_end();
+
+		data += l;
+		length -= l;
+	}
+
+	if (length > 0) {
+		if (elf_hwcap & HWCAP_CRC32)
+			*crc = crc32c_armv8_le(*crc, data, length);
+		else
+			*crc = __crc32c_le(*crc, data, length);
+	}
+
+	return 0;
+}
+
+static int crc32_pmull_final(struct shash_desc *desc, u8 *out)
+{
+	u32 *crc = shash_desc_ctx(desc);
+
+	put_unaligned_le32(*crc, out);
+	return 0;
+}
+
+static int crc32c_pmull_final(struct shash_desc *desc, u8 *out)
+{
+	u32 *crc = shash_desc_ctx(desc);
+
+	put_unaligned_le32(~*crc, out);
+	return 0;
+}
+
+static struct shash_alg crc32_pmull_algs[] = { {
+	.setkey			= crc32_pmull_setkey,
+	.init			= crc32_pmull_init,
+	.update			= crc32_pmull_update,
+	.final			= crc32_pmull_final,
+	.descsize		= sizeof(u32),
+	.digestsize		= sizeof(u32),
+
+	.base.cra_ctxsize	= sizeof(u32),
+	.base.cra_init		= crc32_pmull_cra_init,
+	.base.cra_name		= "crc32",
+	.base.cra_driver_name	= "crc32-arm64-ce",
+	.base.cra_priority	= 200,
+	.base.cra_blocksize	= 1,
+	.base.cra_module	= THIS_MODULE,
+}, {
+	.setkey			= crc32_pmull_setkey,
+	.init			= crc32_pmull_init,
+	.update			= crc32c_pmull_update,
+	.final			= crc32c_pmull_final,
+	.descsize		= sizeof(u32),
+	.digestsize		= sizeof(u32),
+
+	.base.cra_ctxsize	= sizeof(u32),
+	.base.cra_init		= crc32c_pmull_cra_init,
+	.base.cra_name		= "crc32c",
+	.base.cra_driver_name	= "crc32c-arm64-ce",
+	.base.cra_priority	= 200,
+	.base.cra_blocksize	= 1,
+	.base.cra_module	= THIS_MODULE,
+} };
+
+static int __init crc32_pmull_mod_init(void)
+{
+	return crypto_register_shashes(crc32_pmull_algs,
+				       ARRAY_SIZE(crc32_pmull_algs));
+}
+
+static void __exit crc32_pmull_mod_exit(void)
+{
+	crypto_unregister_shashes(crc32_pmull_algs,
+				  ARRAY_SIZE(crc32_pmull_algs));
+}
+
+module_cpu_feature_match(PMULL, crc32_pmull_mod_init);
+module_exit(crc32_pmull_mod_exit);
+
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH v2 6/6] crypto: arm/crc32 - accelerated support based on x86 SSE implementation
  2016-12-04 11:54 [PATCH v2 0/6] crypto: ARM/arm64 CRC-T10DIF/CRC32/CRC32C roundup Ard Biesheuvel
                   ` (4 preceding siblings ...)
  2016-12-04 11:54 ` [PATCH v2 5/6] crypto: arm64/crc32 - accelerated support based on x86 SSE implementation Ard Biesheuvel
@ 2016-12-04 11:54 ` Ard Biesheuvel
  5 siblings, 0 replies; 7+ messages in thread
From: Ard Biesheuvel @ 2016-12-04 11:54 UTC (permalink / raw)
  To: linux-arm-kernel

This is a combination of the the Intel algorithm implemented using SSE
and PCLMULQDQ instructions from arch/x86/crypto/crc32-pclmul_asm.S, and
the new CRC32 extensions introduced for both 32-bit and 64-bit ARM in
version 8 of the architecture. Two versions of the above combo are
provided, one for CRC32 and one for CRC32C.

The PMULL/NEON algorithm is faster, but operates on blocks of at least
64 bytes, and on multiples of 16 bytes only. For the remaining input,
or for all input on systems that lack the PMULL 64x64->128 instructions,
the CRC32 instructions will be used.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
 arch/arm/crypto/Kconfig         |   5 +
 arch/arm/crypto/Makefile        |   2 +
 arch/arm/crypto/crc32-ce-core.S | 306 ++++++++++++++++++++
 arch/arm/crypto/crc32-ce-glue.c | 195 +++++++++++++
 4 files changed, 508 insertions(+)

diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig
index fce801fa52a1..de7bb20815bf 100644
--- a/arch/arm/crypto/Kconfig
+++ b/arch/arm/crypto/Kconfig
@@ -125,4 +125,9 @@ config CRYPTO_CRCT10DIF_ARM_CE
 	depends on KERNEL_MODE_NEON && CRC_T10DIF
 	select CRYPTO_HASH
 
+config CRYPTO_CRC32_ARM_CE
+	tristate "CRC32(C) digest algorithm using CRC and/or PMULL instructions"
+	depends on KERNEL_MODE_NEON && CRC32
+	select CRYPTO_HASH
+
 endif
diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
index fc77265014b7..b578a1820ab1 100644
--- a/arch/arm/crypto/Makefile
+++ b/arch/arm/crypto/Makefile
@@ -14,6 +14,7 @@ ce-obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o
 ce-obj-$(CONFIG_CRYPTO_SHA2_ARM_CE) += sha2-arm-ce.o
 ce-obj-$(CONFIG_CRYPTO_GHASH_ARM_CE) += ghash-arm-ce.o
 ce-obj-$(CONFIG_CRYPTO_CRCT10DIF_ARM_CE) += crct10dif-arm-ce.o
+ce-obj-$(CONFIG_CRYPTO_CRC32_ARM_CE) += crc32-arm-ce.o
 
 ifneq ($(ce-obj-y)$(ce-obj-m),)
 ifeq ($(call as-instr,.fpu crypto-neon-fp-armv8,y,n),y)
@@ -38,6 +39,7 @@ sha2-arm-ce-y	:= sha2-ce-core.o sha2-ce-glue.o
 aes-arm-ce-y	:= aes-ce-core.o aes-ce-glue.o
 ghash-arm-ce-y	:= ghash-ce-core.o ghash-ce-glue.o
 crct10dif-arm-ce-y	:= crct10dif-ce-core.o crct10dif-ce-glue.o
+crc32-arm-ce-y:= crc32-ce-core.o crc32-ce-glue.o
 
 quiet_cmd_perl = PERL    $@
       cmd_perl = $(PERL) $(<) > $(@)
diff --git a/arch/arm/crypto/crc32-ce-core.S b/arch/arm/crypto/crc32-ce-core.S
new file mode 100644
index 000000000000..70e0c8042880
--- /dev/null
+++ b/arch/arm/crypto/crc32-ce-core.S
@@ -0,0 +1,306 @@
+/*
+ * Accelerated CRC32(C) using ARM CRC, NEON and Crypto Extensions instructions
+ *
+ * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see http://www.gnu.org/licenses
+ *
+ * Please  visit http://www.xyratex.com/contact if you need additional
+ * information or have any questions.
+ *
+ * GPL HEADER END
+ */
+
+/*
+ * Copyright 2012 Xyratex Technology Limited
+ *
+ * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32
+ * calculation.
+ * CRC32 polynomial:0x04c11db7(BE)/0xEDB88320(LE)
+ * PCLMULQDQ is a new instruction in Intel SSE4.2, the reference can be found
+ * at:
+ * http://www.intel.com/products/processor/manuals/
+ * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
+ * Volume 2B: Instruction Set Reference, N-Z
+ *
+ * Authors:   Gregory Prestas <Gregory_Prestas@us.xyratex.com>
+ *	      Alexander Boyko <Alexander_Boyko@xyratex.com>
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+	.text
+	.align		6
+	.arch		armv8-a
+	.arch_extension	crc
+	.fpu		crypto-neon-fp-armv8
+
+.Lcrc32_constants:
+	/*
+	 * [x4*128+32 mod P(x) << 32)]'  << 1   = 0x154442bd4
+	 * #define CONSTANT_R1  0x154442bd4LL
+	 *
+	 * [(x4*128-32 mod P(x) << 32)]' << 1   = 0x1c6e41596
+	 * #define CONSTANT_R2  0x1c6e41596LL
+	 */
+	.quad		0x0000000154442bd4
+	.quad		0x00000001c6e41596
+
+	/*
+	 * [(x128+32 mod P(x) << 32)]'   << 1   = 0x1751997d0
+	 * #define CONSTANT_R3  0x1751997d0LL
+	 *
+	 * [(x128-32 mod P(x) << 32)]'   << 1   = 0x0ccaa009e
+	 * #define CONSTANT_R4  0x0ccaa009eLL
+	 */
+	.quad		0x00000001751997d0
+	.quad		0x00000000ccaa009e
+
+	/*
+	 * [(x64 mod P(x) << 32)]'       << 1   = 0x163cd6124
+	 * #define CONSTANT_R5  0x163cd6124LL
+	 */
+	.quad		0x0000000163cd6124
+	.quad		0x00000000FFFFFFFF
+
+	/*
+	 * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL
+	 *
+	 * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))`
+	 *                                                      = 0x1F7011641LL
+	 * #define CONSTANT_RU  0x1F7011641LL
+	 */
+	.quad		0x00000001DB710641
+	.quad		0x00000001F7011641
+
+.Lcrc32c_constants:
+	.quad		0x00000000740eef02
+	.quad		0x000000009e4addf8
+	.quad		0x00000000f20c0dfe
+	.quad		0x000000014cd00bd6
+	.quad		0x00000000dd45aab8
+	.quad		0x00000000FFFFFFFF
+	.quad		0x0000000105ec76f0
+	.quad		0x00000000dea713f1
+
+	dCONSTANTl	.req	d0
+	dCONSTANTh	.req	d1
+	qCONSTANT	.req	q0
+
+	BUF		.req	r0
+	LEN		.req	r1
+	CRC		.req	r2
+
+	qzr		.req	q9
+
+	/**
+	 * Calculate crc32
+	 * BUF - buffer
+	 * LEN - sizeof buffer (multiple of 16 bytes), LEN should be > 63
+	 * CRC - initial crc32
+	 * return %eax crc32
+	 * uint crc32_pmull_le(unsigned char const *buffer,
+	 *                     size_t len, uint crc32)
+	 */
+ENTRY(crc32_pmull_le)
+	adr		r3, .Lcrc32_constants
+	b		0f
+
+ENTRY(crc32c_pmull_le)
+	adr		r3, .Lcrc32c_constants
+
+0:	bic		LEN, LEN, #15
+	vld1.8		{q1-q2}, [BUF]!
+	vld1.8		{q3-q4}, [BUF]!
+	vmov.i8		qzr, #0
+	vmov.i8		qCONSTANT, #0
+	vmov		dCONSTANTl[0], CRC
+	veor.8		d2, d2, dCONSTANTl
+	sub		LEN, LEN, #0x40
+	cmp		LEN, #0x40
+	blt		less_64
+
+	vld1.64		{qCONSTANT}, [r3]
+
+loop_64:		/* 64 bytes Full cache line folding */
+	sub		LEN, LEN, #0x40
+
+	vmull.p64	q5, d3, dCONSTANTh
+	vmull.p64	q6, d5, dCONSTANTh
+	vmull.p64	q7, d7, dCONSTANTh
+	vmull.p64	q8, d9, dCONSTANTh
+
+	vmull.p64	q1, d2, dCONSTANTl
+	vmull.p64	q2, d4, dCONSTANTl
+	vmull.p64	q3, d6, dCONSTANTl
+	vmull.p64	q4, d8, dCONSTANTl
+
+	veor.8		q1, q1, q5
+	vld1.8		{q5}, [BUF]!
+	veor.8		q2, q2, q6
+	vld1.8		{q6}, [BUF]!
+	veor.8		q3, q3, q7
+	vld1.8		{q7}, [BUF]!
+	veor.8		q4, q4, q8
+	vld1.8		{q8}, [BUF]!
+
+	veor.8		q1, q1, q5
+	veor.8		q2, q2, q6
+	veor.8		q3, q3, q7
+	veor.8		q4, q4, q8
+
+	cmp		LEN, #0x40
+	bge		loop_64
+
+less_64:		/* Folding cache line into 128bit */
+	vldr		dCONSTANTl, [r3, #16]
+	vldr		dCONSTANTh, [r3, #24]
+
+	vmull.p64	q5, d3, dCONSTANTh
+	vmull.p64	q1, d2, dCONSTANTl
+	veor.8		q1, q1, q5
+	veor.8		q1, q1, q2
+
+	vmull.p64	q5, d3, dCONSTANTh
+	vmull.p64	q1, d2, dCONSTANTl
+	veor.8		q1, q1, q5
+	veor.8		q1, q1, q3
+
+	vmull.p64	q5, d3, dCONSTANTh
+	vmull.p64	q1, d2, dCONSTANTl
+	veor.8		q1, q1, q5
+	veor.8		q1, q1, q4
+
+	teq		LEN, #0
+	beq		fold_64
+
+loop_16:		/* Folding rest buffer into 128bit */
+	subs		LEN, LEN, #0x10
+
+	vld1.8		{q2}, [BUF]!
+	vmull.p64	q5, d3, dCONSTANTh
+	vmull.p64	q1, d2, dCONSTANTl
+	veor.8		q1, q1, q5
+	veor.8		q1, q1, q2
+
+	bne		loop_16
+
+fold_64:
+	/* perform the last 64 bit fold, also adds 32 zeroes
+	 * to the input stream */
+	vmull.p64	q2, d2, dCONSTANTh
+	vext.8		q1, q1, qzr, #8
+	veor.8		q1, q1, q2
+
+	/* final 32-bit fold */
+	vldr		dCONSTANTl, [r3, #32]
+	vldr		d6, [r3, #40]
+	vmov.i8		d7, #0
+
+	vext.8		q2, q1, qzr, #4
+	vand.8		d2, d2, d6
+	vmull.p64	q1, d2, dCONSTANTl
+	veor.8		q1, q1, q2
+
+	/* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */
+	vldr		dCONSTANTl, [r3, #48]
+	vldr		dCONSTANTh, [r3, #56]
+
+	vand.8		q2, q1, q3
+	vext.8		q2, qzr, q2, #8
+	vmull.p64	q2, d5, dCONSTANTh
+	vand.8		q2, q2, q3
+	vmull.p64	q2, d4, dCONSTANTl
+	veor.8		q1, q1, q2
+	vmov		r0, s5
+
+	bx		lr
+ENDPROC(crc32_pmull_le)
+ENDPROC(crc32c_pmull_le)
+
+	.macro		__crc32, c
+	subs		ip, r2, #8
+	bmi		.Ltail
+
+	tst		r1, #3
+	bne		.Lunaligned
+
+	teq		ip, #0
+.Laligned8\c:
+	ldrd		r2, r3, [r1], #8
+ARM_BE8(rev		r2, r2		)
+ARM_BE8(rev		r3, r3		)
+	crc32\c\()w	r0, r0, r2
+	crc32\c\()w	r0, r0, r3
+	bxeq		lr
+	subs		ip, ip, #8
+	bpl		.Laligned8\c
+
+.Ltail\c:
+	tst		ip, #4
+	beq		2f
+	ldr		r3, [r1], #4
+ARM_BE8(rev		r3, r3		)
+	crc32\c\()w	r0, r0, r3
+
+2:	tst		ip, #2
+	beq		1f
+	ldrh		r3, [r1], #2
+ARM_BE8(rev16		r3, r3		)
+	crc32\c\()h	r0, r0, r3
+
+1:	tst		ip, #1
+	bxeq		lr
+	ldrb		r3, [r1]
+	crc32\c\()b	r0, r0, r3
+	bx		lr
+
+.Lunaligned\c:
+	tst		r1, #1
+	beq		2f
+	ldrb		r3, [r1], #1
+	subs		r2, r2, #1
+	crc32\c\()b	r0, r0, r3
+
+	tst		r1, #2
+	beq		0f
+2:	ldrh		r3, [r1], #2
+	subs		r2, r2, #2
+ARM_BE8(rev16		r3, r3		)
+	crc32\c\()h	r0, r0, r3
+
+0:	subs		ip, r2, #8
+	bpl		.Laligned8\c
+	b		.Ltail\c
+	.endm
+
+	.align		5
+ENTRY(crc32_armv8_le)
+	__crc32
+ENDPROC(crc32_armv8_le)
+
+	.align		5
+ENTRY(crc32c_armv8_le)
+	__crc32		c
+ENDPROC(crc32c_armv8_le)
diff --git a/arch/arm/crypto/crc32-ce-glue.c b/arch/arm/crypto/crc32-ce-glue.c
new file mode 100644
index 000000000000..f78cf1689669
--- /dev/null
+++ b/arch/arm/crypto/crc32-ce-glue.c
@@ -0,0 +1,195 @@
+/*
+ * Accelerated CRC32(C) using ARM CRC, NEON and Crypto Extensions instructions
+ *
+ * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/crc32.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
+
+#include <crypto/internal/hash.h>
+
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+#include <asm/simd.h>
+#include <asm/unaligned.h>
+
+#define PMULL_MIN_LEN		64L	/* minimum size of buffer
+					 * for crc32_pmull_le_16 */
+#define SCALE_F			16L	/* size of NEON register */
+
+asmlinkage u32 crc32_pmull_le(const u8 buf[], u32 len, u32 init_crc);
+asmlinkage u32 crc32_armv8_le(u32 init_crc, const u8 buf[], u32 len);
+
+asmlinkage u32 crc32c_pmull_le(const u8 buf[], u32 len, u32 init_crc);
+asmlinkage u32 crc32c_armv8_le(u32 init_crc, const u8 buf[], u32 len);
+
+static int crc32_pmull_cra_init(struct crypto_tfm *tfm)
+{
+	u32 *key = crypto_tfm_ctx(tfm);
+
+	*key = 0;
+	return 0;
+}
+
+static int crc32c_pmull_cra_init(struct crypto_tfm *tfm)
+{
+	u32 *key = crypto_tfm_ctx(tfm);
+
+	*key = ~0;
+	return 0;
+}
+
+static int crc32_pmull_setkey(struct crypto_shash *hash, const u8 *key,
+			      unsigned int keylen)
+{
+	u32 *mctx = crypto_shash_ctx(hash);
+
+	if (keylen != sizeof(u32)) {
+		crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN);
+		return -EINVAL;
+	}
+	*mctx = le32_to_cpup((__le32 *)key);
+	return 0;
+}
+
+static int crc32_pmull_init(struct shash_desc *desc)
+{
+	u32 *mctx = crypto_shash_ctx(desc->tfm);
+	u32 *crc = shash_desc_ctx(desc);
+
+	*crc = *mctx;
+	return 0;
+}
+
+static int crc32_pmull_update(struct shash_desc *desc, const u8 *data,
+			      unsigned int length)
+{
+	u32 *crc = shash_desc_ctx(desc);
+
+	if (length >= PMULL_MIN_LEN && may_use_simd() &&
+	    (elf_hwcap2 & HWCAP2_PMULL)) {
+		unsigned int l = round_down(length, SCALE_F);
+
+		kernel_neon_begin();
+		*crc = crc32_pmull_le(data, l, *crc);
+		kernel_neon_end();
+
+		data += l;
+		length -= l;
+	}
+
+	if (length > 0) {
+		if (elf_hwcap2 & HWCAP2_CRC32)
+			*crc = crc32_armv8_le(*crc, data, length);
+		else
+			*crc = crc32_le(*crc, data, length);
+	}
+
+	return 0;
+}
+
+static int crc32c_pmull_update(struct shash_desc *desc, const u8 *data,
+			       unsigned int length)
+{
+	u32 *crc = shash_desc_ctx(desc);
+
+	if (length >= PMULL_MIN_LEN && may_use_simd() &&
+	    (elf_hwcap2 & HWCAP2_PMULL)) {
+		unsigned int l = round_down(length, SCALE_F);
+
+		kernel_neon_begin();
+		*crc = crc32c_pmull_le(data, l, *crc);
+		kernel_neon_end();
+
+		data += l;
+		length -= l;
+	}
+
+	if (length > 0) {
+		if (elf_hwcap2 & HWCAP2_CRC32)
+			*crc = crc32c_armv8_le(*crc, data, length);
+		else
+			*crc = __crc32c_le(*crc, data, length);
+	}
+
+	return 0;
+}
+
+static int crc32_pmull_final(struct shash_desc *desc, u8 *out)
+{
+	u32 *crc = shash_desc_ctx(desc);
+
+	put_unaligned_le32(*crc, out);
+	return 0;
+}
+
+static int crc32c_pmull_final(struct shash_desc *desc, u8 *out)
+{
+	u32 *crc = shash_desc_ctx(desc);
+
+	put_unaligned_le32(~*crc, out);
+	return 0;
+}
+
+static struct shash_alg crc32_pmull_algs[] = { {
+	.setkey			= crc32_pmull_setkey,
+	.init			= crc32_pmull_init,
+	.update			= crc32_pmull_update,
+	.final			= crc32_pmull_final,
+	.descsize		= sizeof(u32),
+	.digestsize		= sizeof(u32),
+
+	.base.cra_ctxsize	= sizeof(u32),
+	.base.cra_init		= crc32_pmull_cra_init,
+	.base.cra_name		= "crc32",
+	.base.cra_driver_name	= "crc32-arm-ce",
+	.base.cra_priority	= 200,
+	.base.cra_blocksize	= 1,
+	.base.cra_module	= THIS_MODULE,
+}, {
+	.setkey			= crc32_pmull_setkey,
+	.init			= crc32_pmull_init,
+	.update			= crc32c_pmull_update,
+	.final			= crc32c_pmull_final,
+	.descsize		= sizeof(u32),
+	.digestsize		= sizeof(u32),
+
+	.base.cra_ctxsize	= sizeof(u32),
+	.base.cra_init		= crc32c_pmull_cra_init,
+	.base.cra_name		= "crc32c",
+	.base.cra_driver_name	= "crc32c-arm-ce",
+	.base.cra_priority	= 200,
+	.base.cra_blocksize	= 1,
+	.base.cra_module	= THIS_MODULE,
+} };
+
+static int __init crc32_pmull_mod_init(void)
+{
+	if (!(elf_hwcap2 & (HWCAP2_PMULL|HWCAP2_CRC32)))
+		return -ENODEV;
+
+	return crypto_register_shashes(crc32_pmull_algs,
+				       ARRAY_SIZE(crc32_pmull_algs));
+}
+
+static void __exit crc32_pmull_mod_exit(void)
+{
+	crypto_unregister_shashes(crc32_pmull_algs,
+				  ARRAY_SIZE(crc32_pmull_algs));
+}
+
+module_init(crc32_pmull_mod_init);
+module_exit(crc32_pmull_mod_exit);
+
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_CRYPTO("crc32");
+MODULE_ALIAS_CRYPTO("crc32c");
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2016-12-04 11:54 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2016-12-04 11:54 [PATCH v2 0/6] crypto: ARM/arm64 CRC-T10DIF/CRC32/CRC32C roundup Ard Biesheuvel
2016-12-04 11:54 ` [PATCH v2 1/6] crypto: testmgr - avoid overlap in chunked tests Ard Biesheuvel
2016-12-04 11:54 ` [PATCH v2 2/6] crypto: testmgr - add/enhance test cases for CRC-T10DIF Ard Biesheuvel
2016-12-04 11:54 ` [PATCH v2 3/6] crypto: arm64/crct10dif - port x86 SSE implementation to arm64 Ard Biesheuvel
2016-12-04 11:54 ` [PATCH v2 4/6] crypto: arm/crct10dif - port x86 SSE implementation to ARM Ard Biesheuvel
2016-12-04 11:54 ` [PATCH v2 5/6] crypto: arm64/crc32 - accelerated support based on x86 SSE implementation Ard Biesheuvel
2016-12-04 11:54 ` [PATCH v2 6/6] crypto: arm/crc32 " Ard Biesheuvel

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).