public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/4] crypto: [sha] Intel SHA Extensions optimized SHA1 transform function
       [not found] <cover.1441929717.git.tim.c.chen@linux.intel.com>
@ 2015-09-10 22:26 ` Tim Chen
  2015-09-10 22:27 ` [PATCH 2/4] crypto: [sha] Intel SHA Extensions optimized SHA256 " Tim Chen
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 12+ messages in thread
From: Tim Chen @ 2015-09-10 22:26 UTC (permalink / raw)
  To: Herbert Xu, H. Peter Anvin, David S.Miller
  Cc: Sean Gulley, Chandramouli Narayanan, Vinodh Gopal, James Guilford,
	Wajdi Feghali, Tim Chen, Jussi Kivilinna, linux-crypto,
	linux-kernel


This patch includes the Intel SHA Extensions optimized implementation
of SHA-1 update function. This function has been tested on Broxton
platform and measured a speed up of 3.6x over the SSSE3 implementation
for 4K blocks.

Originally-by: Chandramouli Narayanan <mouli_7982@yahoo.com>
Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
---
 arch/x86/crypto/sha1_ni_asm.S | 302 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 302 insertions(+)
 create mode 100644 arch/x86/crypto/sha1_ni_asm.S

diff --git a/arch/x86/crypto/sha1_ni_asm.S b/arch/x86/crypto/sha1_ni_asm.S
new file mode 100644
index 0000000..874a651
--- /dev/null
+++ b/arch/x86/crypto/sha1_ni_asm.S
@@ -0,0 +1,302 @@
+/*
+ * Intel SHA Extensions optimized implementation of a SHA-1 update function
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * 	Sean Gulley <sean.m.gulley@intel.com>
+ * 	Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 	* Redistributions of source code must retain the above copyright
+ * 	  notice, this list of conditions and the following disclaimer.
+ * 	* Redistributions in binary form must reproduce the above copyright
+ * 	  notice, this list of conditions and the following disclaimer in
+ * 	  the documentation and/or other materials provided with the
+ * 	  distribution.
+ * 	* Neither the name of Intel Corporation nor the names of its
+ * 	  contributors may be used to endorse or promote products derived
+ * 	  from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/linkage.h>
+
+#define DIGEST_PTR	%rdi	/* 1st arg */
+#define DATA_PTR	%rsi	/* 2nd arg */
+#define NUM_BLKS	%rdx	/* 3rd arg */
+
+#define RSPSAVE		%rax
+
+/* gcc conversion */
+#define FRAME_SIZE	32	/* space for 2x16 bytes */
+
+#define ABCD		%xmm0
+#define E0		%xmm1	/* Need two E's b/c they ping pong */
+#define E1		%xmm2
+#define MSG0		%xmm3
+#define MSG1		%xmm4
+#define MSG2		%xmm5
+#define MSG3		%xmm6
+#define SHUF_MASK	%xmm7
+
+
+/*
+ * Intel SHA Extensions optimized implementation of a SHA-1 update function
+ *
+ * The function takes a pointer to the current hash values, a pointer to the
+ * input data, and a number of 64 byte blocks to process.  Once all blocks have
+ * been processed, the digest pointer is  updated with the resulting hash value.
+ * The function only processes complete blocks, there is no functionality to
+ * store partial blocks. All message padding and hash value initialization must
+ * be done outside the update function.
+ *
+ * The indented lines in the loop are instructions related to rounds processing.
+ * The non-indented lines are instructions related to the message schedule.
+ *
+ * void sha1_ni_transform(uint32_t *digest, const void *data,
+		uint32_t numBlocks)
+ * digest : pointer to digest
+ * data: pointer to input data
+ * numBlocks: Number of blocks to process
+ */
+.text
+.align 32
+ENTRY(sha1_ni_transform)
+	mov		%rsp, RSPSAVE
+	sub		$FRAME_SIZE, %rsp
+	and		$~0xF, %rsp
+
+	shl		$6, NUM_BLKS		/* convert to bytes */
+	jz		.Ldone_hash
+	add		DATA_PTR, NUM_BLKS	/* pointer to end of data */
+
+	/* load initial hash values */
+	pinsrd		$3, 1*16(DIGEST_PTR), E0
+	movdqu		0*16(DIGEST_PTR), ABCD
+	pand		UPPER_WORD_MASK(%rip), E0
+	pshufd		$0x1B, ABCD, ABCD
+
+	movdqa		PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
+
+.Lloop0:
+	/* Save hash values for addition after rounds */
+	movdqa		E0, (0*16)(%rsp)
+	movdqa		ABCD, (1*16)(%rsp)
+
+	/* Rounds 0-3 */
+	movdqu		0*16(DATA_PTR), MSG0
+	pshufb		SHUF_MASK, MSG0
+		paddd		MSG0, E0
+		movdqa		ABCD, E1
+		sha1rnds4	$0, E0, ABCD
+
+	/* Rounds 4-7 */
+	movdqu		1*16(DATA_PTR), MSG1
+	pshufb		SHUF_MASK, MSG1
+		sha1nexte	MSG1, E1
+		movdqa		ABCD, E0
+		sha1rnds4	$0, E1, ABCD
+	sha1msg1	MSG1, MSG0
+
+	/* Rounds 8-11 */
+	movdqu		2*16(DATA_PTR), MSG2
+	pshufb		SHUF_MASK, MSG2
+		sha1nexte	MSG2, E0
+		movdqa		ABCD, E1
+		sha1rnds4	$0, E0, ABCD
+	sha1msg1	MSG2, MSG1
+	pxor		MSG2, MSG0
+
+	/* Rounds 12-15 */
+	movdqu		3*16(DATA_PTR), MSG3
+	pshufb		SHUF_MASK, MSG3
+		sha1nexte	MSG3, E1
+		movdqa		ABCD, E0
+	sha1msg2	MSG3, MSG0
+		sha1rnds4	$0, E1, ABCD
+	sha1msg1	MSG3, MSG2
+	pxor		MSG3, MSG1
+
+	/* Rounds 16-19 */
+		sha1nexte	MSG0, E0
+		movdqa		ABCD, E1
+	sha1msg2	MSG0, MSG1
+		sha1rnds4	$0, E0, ABCD
+	sha1msg1	MSG0, MSG3
+	pxor		MSG0, MSG2
+
+	/* Rounds 20-23 */
+		sha1nexte	MSG1, E1
+		movdqa		ABCD, E0
+	sha1msg2	MSG1, MSG2
+		sha1rnds4	$1, E1, ABCD
+	sha1msg1	MSG1, MSG0
+	pxor		MSG1, MSG3
+
+	/* Rounds 24-27 */
+		sha1nexte	MSG2, E0
+		movdqa		ABCD, E1
+	sha1msg2	MSG2, MSG3
+		sha1rnds4	$1, E0, ABCD
+	sha1msg1	MSG2, MSG1
+	pxor		MSG2, MSG0
+
+	/* Rounds 28-31 */
+		sha1nexte	MSG3, E1
+		movdqa		ABCD, E0
+	sha1msg2	MSG3, MSG0
+		sha1rnds4	$1, E1, ABCD
+	sha1msg1	MSG3, MSG2
+	pxor		MSG3, MSG1
+
+	/* Rounds 32-35 */
+		sha1nexte	MSG0, E0
+		movdqa		ABCD, E1
+	sha1msg2	MSG0, MSG1
+		sha1rnds4	$1, E0, ABCD
+	sha1msg1	MSG0, MSG3
+	pxor		MSG0, MSG2
+
+	/* Rounds 36-39 */
+		sha1nexte	MSG1, E1
+		movdqa		ABCD, E0
+	sha1msg2	MSG1, MSG2
+		sha1rnds4	$1, E1, ABCD
+	sha1msg1	MSG1, MSG0
+	pxor		MSG1, MSG3
+
+	/* Rounds 40-43 */
+		sha1nexte	MSG2, E0
+		movdqa		ABCD, E1
+	sha1msg2	MSG2, MSG3
+		sha1rnds4	$2, E0, ABCD
+	sha1msg1	MSG2, MSG1
+	pxor		MSG2, MSG0
+
+	/* Rounds 44-47 */
+		sha1nexte	MSG3, E1
+		movdqa		ABCD, E0
+	sha1msg2	MSG3, MSG0
+		sha1rnds4	$2, E1, ABCD
+	sha1msg1	MSG3, MSG2
+	pxor		MSG3, MSG1
+
+	/* Rounds 48-51 */
+		sha1nexte	MSG0, E0
+		movdqa		ABCD, E1
+	sha1msg2	MSG0, MSG1
+		sha1rnds4	$2, E0, ABCD
+	sha1msg1	MSG0, MSG3
+	pxor		MSG0, MSG2
+
+	/* Rounds 52-55 */
+		sha1nexte	MSG1, E1
+		movdqa		ABCD, E0
+	sha1msg2	MSG1, MSG2
+		sha1rnds4	$2, E1, ABCD
+	sha1msg1	MSG1, MSG0
+	pxor		MSG1, MSG3
+
+	/* Rounds 56-59 */
+		sha1nexte	MSG2, E0
+		movdqa		ABCD, E1
+	sha1msg2	MSG2, MSG3
+		sha1rnds4	$2, E0, ABCD
+	sha1msg1	MSG2, MSG1
+	pxor		MSG2, MSG0
+
+	/* Rounds 60-63 */
+		sha1nexte	MSG3, E1
+		movdqa		ABCD, E0
+	sha1msg2	MSG3, MSG0
+		sha1rnds4	$3, E1, ABCD
+	sha1msg1	MSG3, MSG2
+	pxor		MSG3, MSG1
+
+	/* Rounds 64-67 */
+		sha1nexte	MSG0, E0
+		movdqa		ABCD, E1
+	sha1msg2	MSG0, MSG1
+		sha1rnds4	$3, E0, ABCD
+	sha1msg1	MSG0, MSG3
+	pxor		MSG0, MSG2
+
+	/* Rounds 68-71 */
+		sha1nexte	MSG1, E1
+		movdqa		ABCD, E0
+	sha1msg2	MSG1, MSG2
+		sha1rnds4	$3, E1, ABCD
+	pxor		MSG1, MSG3
+
+	/* Rounds 72-75 */
+		sha1nexte	MSG2, E0
+		movdqa		ABCD, E1
+	sha1msg2	MSG2, MSG3
+		sha1rnds4	$3, E0, ABCD
+
+	/* Rounds 76-79 */
+		sha1nexte	MSG3, E1
+		movdqa		ABCD, E0
+		sha1rnds4	$3, E1, ABCD
+
+	/* Add current hash values with previously saved */
+	sha1nexte	(0*16)(%rsp), E0
+	paddd		(1*16)(%rsp), ABCD
+
+	/* Increment data pointer and loop if more to process */
+	add		$64, DATA_PTR
+	cmp		NUM_BLKS, DATA_PTR
+	jne		.Lloop0
+
+	/* Write hash values back in the correct order */
+	pshufd		$0x1B, ABCD, ABCD
+	movdqu		ABCD, 0*16(DIGEST_PTR)
+	pextrd		$3, E0, 1*16(DIGEST_PTR)
+
+.Ldone_hash:
+	mov		RSPSAVE, %rsp
+
+	ret
+ENDPROC(sha1_ni_transform)
+
+.data
+
+.align 64
+PSHUFFLE_BYTE_FLIP_MASK:
+	.octa 0x000102030405060708090a0b0c0d0e0f
+UPPER_WORD_MASK:
+	.octa 0xFFFFFFFF000000000000000000000000
-- 
2.4.2




^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH 2/4] crypto: [sha] Intel SHA Extensions optimized SHA256 transform function
       [not found] <cover.1441929717.git.tim.c.chen@linux.intel.com>
  2015-09-10 22:26 ` [PATCH 1/4] crypto: [sha] Intel SHA Extensions optimized SHA1 transform function Tim Chen
@ 2015-09-10 22:27 ` Tim Chen
  2015-09-10 22:27 ` [PATCH 3/4] crypto: [sha] glue code for Intel SHA extensions optimized SHA1 & SHA256 Tim Chen
  2015-09-10 22:27 ` [PATCH 4/4] crypto: [sha] Add build support for Intel SHA Extensions optimized SHA1 and SHA256 Tim Chen
  3 siblings, 0 replies; 12+ messages in thread
From: Tim Chen @ 2015-09-10 22:27 UTC (permalink / raw)
  To: Herbert Xu, H. Peter Anvin, David S.Miller
  Cc: Sean Gulley, Chandramouli Narayanan, Vinodh Gopal, James Guilford,
	Wajdi Feghali, Tim Chen, Jussi Kivilinna, linux-crypto,
	linux-kernel


This patch includes the Intel SHA Extensions optimized implementation
of SHA-256 update function. This function has been tested on Broxton
platform and measured a speed up of 3.6x over the SSSE3 implementation
for 4K blocks.

Originally-by: Chandramouli Narayanan <mouli_7982@yahoo.com>
Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
---
 arch/x86/crypto/sha256_ni_asm.S | 353 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 353 insertions(+)
 create mode 100644 arch/x86/crypto/sha256_ni_asm.S

diff --git a/arch/x86/crypto/sha256_ni_asm.S b/arch/x86/crypto/sha256_ni_asm.S
new file mode 100644
index 0000000..748cdf2
--- /dev/null
+++ b/arch/x86/crypto/sha256_ni_asm.S
@@ -0,0 +1,353 @@
+/*
+ * Intel SHA Extensions optimized implementation of a SHA-256 update function
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * 	Sean Gulley <sean.m.gulley@intel.com>
+ * 	Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 	* Redistributions of source code must retain the above copyright
+ * 	  notice, this list of conditions and the following disclaimer.
+ * 	* Redistributions in binary form must reproduce the above copyright
+ * 	  notice, this list of conditions and the following disclaimer in
+ * 	  the documentation and/or other materials provided with the
+ * 	  distribution.
+ * 	* Neither the name of Intel Corporation nor the names of its
+ * 	  contributors may be used to endorse or promote products derived
+ * 	  from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/linkage.h>
+
+#define DIGEST_PTR	%rdi	/* 1st arg */
+#define DATA_PTR	%rsi	/* 2nd arg */
+#define NUM_BLKS	%rdx	/* 3rd arg */
+
+#define SHA256CONSTANTS	%rax
+
+#define MSG		%xmm0
+#define STATE0		%xmm1
+#define STATE1		%xmm2
+#define MSGTMP0		%xmm3
+#define MSGTMP1		%xmm4
+#define MSGTMP2		%xmm5
+#define MSGTMP3		%xmm6
+#define MSGTMP4		%xmm7
+
+#define SHUF_MASK	%xmm8
+
+#define ABEF_SAVE	%xmm9
+#define CDGH_SAVE	%xmm10
+
+/*
+ * Intel SHA Extensions optimized implementation of a SHA-256 update function
+ *
+ * The function takes a pointer to the current hash values, a pointer to the
+ * input data, and a number of 64 byte blocks to process.  Once all blocks have
+ * been processed, the digest pointer is  updated with the resulting hash value.
+ * The function only processes complete blocks, there is no functionality to
+ * store partial blocks.  All message padding and hash value initialization must
+ * be done outside the update function.
+ *
+ * The indented lines in the loop are instructions related to rounds processing.
+ * The non-indented lines are instructions related to the message schedule.
+ *
+ * void sha256_ni_transform(uint32_t *digest, const void *data,
+		uint32_t numBlocks);
+ * digest : pointer to digest
+ * data: pointer to input data
+ * numBlocks: Number of blocks to process
+ */
+
+.text
+.align 32
+ENTRY(sha256_ni_transform)
+
+	shl		$6, NUM_BLKS		/*  convert to bytes */
+	jz		.Ldone_hash
+	add		DATA_PTR, NUM_BLKS	/* pointer to end of data */
+
+	/*
+	 * load initial hash values
+	 * Need to reorder these appropriately
+	 * DCBA, HGFE -> ABEF, CDGH
+	 */
+	movdqu		0*16(DIGEST_PTR), STATE0
+	movdqu		1*16(DIGEST_PTR), STATE1
+
+	pshufd		$0xB1, STATE0,  STATE0		/* CDAB */
+	pshufd		$0x1B, STATE1,  STATE1		/* EFGH */
+	movdqa		STATE0, MSGTMP4
+	palignr		$8, STATE1,  STATE0		/* ABEF */
+	pblendw		$0xF0, MSGTMP4, STATE1		/* CDGH */
+
+	movdqa		PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
+	lea		K256(%rip), SHA256CONSTANTS
+
+.Lloop0:
+	/* Save hash values for addition after rounds */
+	movdqa		STATE0, ABEF_SAVE
+	movdqa		STATE1, CDGH_SAVE
+
+	/* Rounds 0-3 */
+	movdqu		0*16(DATA_PTR), MSG
+	pshufb		SHUF_MASK, MSG
+	movdqa		MSG, MSGTMP0
+		paddd		0*16(SHA256CONSTANTS), MSG
+		sha256rnds2	STATE0, STATE1
+		pshufd 		$0x0E, MSG, MSG
+		sha256rnds2	STATE1, STATE0
+
+	/* Rounds 4-7 */
+	movdqu		1*16(DATA_PTR), MSG
+	pshufb		SHUF_MASK, MSG
+	movdqa		MSG, MSGTMP1
+		paddd		1*16(SHA256CONSTANTS), MSG
+		sha256rnds2	STATE0, STATE1
+		pshufd 		$0x0E, MSG, MSG
+		sha256rnds2	STATE1, STATE0
+	sha256msg1	MSGTMP1, MSGTMP0
+
+	/* Rounds 8-11 */
+	movdqu		2*16(DATA_PTR), MSG
+	pshufb		SHUF_MASK, MSG
+	movdqa		MSG, MSGTMP2
+		paddd		2*16(SHA256CONSTANTS), MSG
+		sha256rnds2	STATE0, STATE1
+		pshufd 		$0x0E, MSG, MSG
+		sha256rnds2	STATE1, STATE0
+	sha256msg1	MSGTMP2, MSGTMP1
+
+	/* Rounds 12-15 */
+	movdqu		3*16(DATA_PTR), MSG
+	pshufb		SHUF_MASK, MSG
+	movdqa		MSG, MSGTMP3
+		paddd		3*16(SHA256CONSTANTS), MSG
+		sha256rnds2	STATE0, STATE1
+	movdqa		MSGTMP3, MSGTMP4
+	palignr		$4, MSGTMP2, MSGTMP4
+	paddd		MSGTMP4, MSGTMP0
+	sha256msg2	MSGTMP3, MSGTMP0
+		pshufd 		$0x0E, MSG, MSG
+		sha256rnds2	STATE1, STATE0
+	sha256msg1	MSGTMP3, MSGTMP2
+
+	/* Rounds 16-19 */
+	movdqa		MSGTMP0, MSG
+		paddd		4*16(SHA256CONSTANTS), MSG
+		sha256rnds2	STATE0, STATE1
+	movdqa		MSGTMP0, MSGTMP4
+	palignr		$4, MSGTMP3, MSGTMP4
+	paddd		MSGTMP4, MSGTMP1
+	sha256msg2	MSGTMP0, MSGTMP1
+		pshufd 		$0x0E, MSG, MSG
+		sha256rnds2	STATE1, STATE0
+	sha256msg1	MSGTMP0, MSGTMP3
+
+	/* Rounds 20-23 */
+	movdqa		MSGTMP1, MSG
+		paddd		5*16(SHA256CONSTANTS), MSG
+		sha256rnds2	STATE0, STATE1
+	movdqa		MSGTMP1, MSGTMP4
+	palignr		$4, MSGTMP0, MSGTMP4
+	paddd		MSGTMP4, MSGTMP2
+	sha256msg2	MSGTMP1, MSGTMP2
+		pshufd 		$0x0E, MSG, MSG
+		sha256rnds2	STATE1, STATE0
+	sha256msg1	MSGTMP1, MSGTMP0
+
+	/* Rounds 24-27 */
+	movdqa		MSGTMP2, MSG
+		paddd		6*16(SHA256CONSTANTS), MSG
+		sha256rnds2	STATE0, STATE1
+	movdqa		MSGTMP2, MSGTMP4
+	palignr		$4, MSGTMP1, MSGTMP4
+	paddd		MSGTMP4, MSGTMP3
+	sha256msg2	MSGTMP2, MSGTMP3
+		pshufd 		$0x0E, MSG, MSG
+		sha256rnds2	STATE1, STATE0
+	sha256msg1	MSGTMP2, MSGTMP1
+
+	/* Rounds 28-31 */
+	movdqa		MSGTMP3, MSG
+		paddd		7*16(SHA256CONSTANTS), MSG
+		sha256rnds2	STATE0, STATE1
+	movdqa		MSGTMP3, MSGTMP4
+	palignr		$4, MSGTMP2, MSGTMP4
+	paddd		MSGTMP4, MSGTMP0
+	sha256msg2	MSGTMP3, MSGTMP0
+		pshufd 		$0x0E, MSG, MSG
+		sha256rnds2	STATE1, STATE0
+	sha256msg1	MSGTMP3, MSGTMP2
+
+	/* Rounds 32-35 */
+	movdqa		MSGTMP0, MSG
+		paddd		8*16(SHA256CONSTANTS), MSG
+		sha256rnds2	STATE0, STATE1
+	movdqa		MSGTMP0, MSGTMP4
+	palignr		$4, MSGTMP3, MSGTMP4
+	paddd		MSGTMP4, MSGTMP1
+	sha256msg2	MSGTMP0, MSGTMP1
+		pshufd 		$0x0E, MSG, MSG
+		sha256rnds2	STATE1, STATE0
+	sha256msg1	MSGTMP0, MSGTMP3
+
+	/* Rounds 36-39 */
+	movdqa		MSGTMP1, MSG
+		paddd		9*16(SHA256CONSTANTS), MSG
+		sha256rnds2	STATE0, STATE1
+	movdqa		MSGTMP1, MSGTMP4
+	palignr		$4, MSGTMP0, MSGTMP4
+	paddd		MSGTMP4, MSGTMP2
+	sha256msg2	MSGTMP1, MSGTMP2
+		pshufd 		$0x0E, MSG, MSG
+		sha256rnds2	STATE1, STATE0
+	sha256msg1	MSGTMP1, MSGTMP0
+
+	/* Rounds 40-43 */
+	movdqa		MSGTMP2, MSG
+		paddd		10*16(SHA256CONSTANTS), MSG
+		sha256rnds2	STATE0, STATE1
+	movdqa		MSGTMP2, MSGTMP4
+	palignr		$4, MSGTMP1, MSGTMP4
+	paddd		MSGTMP4, MSGTMP3
+	sha256msg2	MSGTMP2, MSGTMP3
+		pshufd 		$0x0E, MSG, MSG
+		sha256rnds2	STATE1, STATE0
+	sha256msg1	MSGTMP2, MSGTMP1
+
+	/* Rounds 44-47 */
+	movdqa		MSGTMP3, MSG
+		paddd		11*16(SHA256CONSTANTS), MSG
+		sha256rnds2	STATE0, STATE1
+	movdqa		MSGTMP3, MSGTMP4
+	palignr		$4, MSGTMP2, MSGTMP4
+	paddd		MSGTMP4, MSGTMP0
+	sha256msg2	MSGTMP3, MSGTMP0
+		pshufd 		$0x0E, MSG, MSG
+		sha256rnds2	STATE1, STATE0
+	sha256msg1	MSGTMP3, MSGTMP2
+
+	/* Rounds 48-51 */
+	movdqa		MSGTMP0, MSG
+		paddd		12*16(SHA256CONSTANTS), MSG
+		sha256rnds2	STATE0, STATE1
+	movdqa		MSGTMP0, MSGTMP4
+	palignr		$4, MSGTMP3, MSGTMP4
+	paddd		MSGTMP4, MSGTMP1
+	sha256msg2	MSGTMP0, MSGTMP1
+		pshufd 		$0x0E, MSG, MSG
+		sha256rnds2	STATE1, STATE0
+	sha256msg1	MSGTMP0, MSGTMP3
+
+	/* Rounds 52-55 */
+	movdqa		MSGTMP1, MSG
+		paddd		13*16(SHA256CONSTANTS), MSG
+		sha256rnds2	STATE0, STATE1
+	movdqa		MSGTMP1, MSGTMP4
+	palignr		$4, MSGTMP0, MSGTMP4
+	paddd		MSGTMP4, MSGTMP2
+	sha256msg2	MSGTMP1, MSGTMP2
+		pshufd 		$0x0E, MSG, MSG
+		sha256rnds2	STATE1, STATE0
+
+	/* Rounds 56-59 */
+	movdqa		MSGTMP2, MSG
+		paddd		14*16(SHA256CONSTANTS), MSG
+		sha256rnds2	STATE0, STATE1
+	movdqa		MSGTMP2, MSGTMP4
+	palignr		$4, MSGTMP1, MSGTMP4
+	paddd		MSGTMP4, MSGTMP3
+	sha256msg2	MSGTMP2, MSGTMP3
+		pshufd 		$0x0E, MSG, MSG
+		sha256rnds2	STATE1, STATE0
+
+	/* Rounds 60-63 */
+	movdqa		MSGTMP3, MSG
+		paddd		15*16(SHA256CONSTANTS), MSG
+		sha256rnds2	STATE0, STATE1
+		pshufd 		$0x0E, MSG, MSG
+		sha256rnds2	STATE1, STATE0
+
+	/* Add current hash values with previously saved */
+	paddd		ABEF_SAVE, STATE0
+	paddd		CDGH_SAVE, STATE1
+
+	/* Increment data pointer and loop if more to process */
+	add		$64, DATA_PTR
+	cmp		NUM_BLKS, DATA_PTR
+	jne		.Lloop0
+
+	/* Write hash values back in the correct order */
+	pshufd		$0x1B, STATE0,  STATE0		/* FEBA */
+	pshufd		$0xB1, STATE1,  STATE1		/* DCHG */
+	movdqa		STATE0, MSGTMP4
+	pblendw		$0xF0, STATE1,  STATE0		/* DCBA */
+	palignr		$8, MSGTMP4, STATE1		/* HGFE */
+
+	movdqu		STATE0, 0*16(DIGEST_PTR)
+	movdqu		STATE1, 1*16(DIGEST_PTR)
+
+.Ldone_hash:
+
+	ret
+ENDPROC(sha256_ni_transform)
+
+.data
+.align 64
+K256:
+	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+PSHUFFLE_BYTE_FLIP_MASK:
+	.octa 0x0c0d0e0f08090a0b0405060700010203
-- 
2.4.2




^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH 3/4] crypto: [sha] glue code for Intel SHA extensions optimized SHA1 & SHA256
       [not found] <cover.1441929717.git.tim.c.chen@linux.intel.com>
  2015-09-10 22:26 ` [PATCH 1/4] crypto: [sha] Intel SHA Extensions optimized SHA1 transform function Tim Chen
  2015-09-10 22:27 ` [PATCH 2/4] crypto: [sha] Intel SHA Extensions optimized SHA256 " Tim Chen
@ 2015-09-10 22:27 ` Tim Chen
  2015-09-10 22:52   ` Stephan Mueller
  2015-09-10 22:27 ` [PATCH 4/4] crypto: [sha] Add build support for Intel SHA Extensions optimized SHA1 and SHA256 Tim Chen
  3 siblings, 1 reply; 12+ messages in thread
From: Tim Chen @ 2015-09-10 22:27 UTC (permalink / raw)
  To: Herbert Xu, H. Peter Anvin, David S.Miller
  Cc: Sean Gulley, Chandramouli Narayanan, Vinodh Gopal, James Guilford,
	Wajdi Feghali, Tim Chen, Jussi Kivilinna, linux-crypto,
	linux-kernel


This patch adds the glue code to detect and utilize the Intel SHA
extensions optimized SHA1 and SHA256 update transforms when available.

This code has been tested on Broxton for functionality.

Originally-by: Chandramouli Narayanan <mouli_7982@yahoo.com>
Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
---
 arch/x86/crypto/sha1_ssse3_glue.c   | 12 +++++++++++-
 arch/x86/crypto/sha256_ssse3_glue.c | 38 ++++++++++++++++++++++---------------
 2 files changed, 34 insertions(+), 16 deletions(-)

diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c
index 7c48e8b..98be8cc 100644
--- a/arch/x86/crypto/sha1_ssse3_glue.c
+++ b/arch/x86/crypto/sha1_ssse3_glue.c
@@ -44,6 +44,10 @@ asmlinkage void sha1_transform_avx(u32 *digest, const char *data,
 asmlinkage void sha1_transform_avx2(u32 *digest, const char *data,
 				    unsigned int rounds);
 #endif
+#ifdef CONFIG_AS_SHA1_NI
+asmlinkage void sha1_ni_transform(u32 *digest, const char *data,
+				   unsigned int rounds);
+#endif
 
 static void (*sha1_transform_asm)(u32 *, const char *, unsigned int);
 
@@ -166,12 +170,18 @@ static int __init sha1_ssse3_mod_init(void)
 #endif
 	}
 #endif
+#ifdef CONFIG_AS_SHA1_NI
+	if (boot_cpu_has(X86_FEATURE_SHA_NI)) {
+		sha1_transform_asm = sha1_ni_transform;
+		algo_name = "SHA-NI";
+	}
+#endif
 
 	if (sha1_transform_asm) {
 		pr_info("Using %s optimized SHA-1 implementation\n", algo_name);
 		return crypto_register_shash(&alg);
 	}
-	pr_info("Neither AVX nor AVX2 nor SSSE3 is available/usable.\n");
+	pr_info("Neither AVX nor AVX2 nor SSSE3/SHA-NI is available/usable.\n");
 
 	return -ENODEV;
 }
diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c
index f8097fc..9c7b22c 100644
--- a/arch/x86/crypto/sha256_ssse3_glue.c
+++ b/arch/x86/crypto/sha256_ssse3_glue.c
@@ -50,6 +50,10 @@ asmlinkage void sha256_transform_avx(u32 *digest, const char *data,
 asmlinkage void sha256_transform_rorx(u32 *digest, const char *data,
 				      u64 rounds);
 #endif
+#ifdef CONFIG_AS_SHA256_NI
+asmlinkage void sha256_ni_transform(u32 *digest, const char *data,
+				   u64 rounds); /*unsigned int rounds);*/
+#endif
 
 static void (*sha256_transform_asm)(u32 *, const char *, u64);
 
@@ -142,36 +146,40 @@ static bool __init avx_usable(void)
 
 static int __init sha256_ssse3_mod_init(void)
 {
+	char *algo;
+
 	/* test for SSSE3 first */
-	if (cpu_has_ssse3)
+	if (cpu_has_ssse3) {
 		sha256_transform_asm = sha256_transform_ssse3;
+		algo = "SSSE3";
+	}
 
 #ifdef CONFIG_AS_AVX
 	/* allow AVX to override SSSE3, it's a little faster */
 	if (avx_usable()) {
+		sha256_transform_asm = sha256_transform_avx;
+		algo = "AVX";
 #ifdef CONFIG_AS_AVX2
-		if (boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_BMI2))
+		if (boot_cpu_has(X86_FEATURE_AVX2) &&
+		    boot_cpu_has(X86_FEATURE_BMI2)) {
 			sha256_transform_asm = sha256_transform_rorx;
-		else
+			algo = "AVX2";
+		}
+#endif
+	}
 #endif
-			sha256_transform_asm = sha256_transform_avx;
+#ifdef CONFIG_AS_SHA256_NI
+	if (boot_cpu_has(X86_FEATURE_SHA_NI)) {
+		sha256_transform_asm = sha256_ni_transform;
+		algo = "SHA-256-NI";
 	}
 #endif
 
 	if (sha256_transform_asm) {
-#ifdef CONFIG_AS_AVX
-		if (sha256_transform_asm == sha256_transform_avx)
-			pr_info("Using AVX optimized SHA-256 implementation\n");
-#ifdef CONFIG_AS_AVX2
-		else if (sha256_transform_asm == sha256_transform_rorx)
-			pr_info("Using AVX2 optimized SHA-256 implementation\n");
-#endif
-		else
-#endif
-			pr_info("Using SSSE3 optimized SHA-256 implementation\n");
+		pr_info("Using %s optimized SHA-256 implementation\n", algo);
 		return crypto_register_shashes(algs, ARRAY_SIZE(algs));
 	}
-	pr_info("Neither AVX nor SSSE3 is available/usable.\n");
+	pr_info("Neither AVX nor SSSE3/SHA-NI is available/usable.\n");
 
 	return -ENODEV;
 }
-- 
2.4.2




^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH 4/4] crypto: [sha] Add build support for Intel SHA Extensions optimized SHA1 and SHA256
       [not found] <cover.1441929717.git.tim.c.chen@linux.intel.com>
                   ` (2 preceding siblings ...)
  2015-09-10 22:27 ` [PATCH 3/4] crypto: [sha] glue code for Intel SHA extensions optimized SHA1 & SHA256 Tim Chen
@ 2015-09-10 22:27 ` Tim Chen
  3 siblings, 0 replies; 12+ messages in thread
From: Tim Chen @ 2015-09-10 22:27 UTC (permalink / raw)
  To: Herbert Xu, H. Peter Anvin, David S.Miller
  Cc: Sean Gulley, Chandramouli Narayanan, Vinodh Gopal, James Guilford,
	Wajdi Feghali, Tim Chen, Jussi Kivilinna, linux-crypto,
	linux-kernel


This patch provides the configuration and build support to
include and build the optimized SHA1 and SHA256 update transforms
for the kernel's crypto library.

Originally-by: Chandramouli Narayanan <mouli_7982@yahoo.com>
Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
---
 arch/x86/Makefile        |  6 ++++--
 arch/x86/crypto/Makefile |  8 ++++++++
 crypto/Kconfig           | 10 ++++++----
 3 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 747860c..a8009c7 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -165,9 +165,11 @@ asinstr += $(call as-instr,pshufb %xmm0$(comma)%xmm0,-DCONFIG_AS_SSSE3=1)
 asinstr += $(call as-instr,crc32l %eax$(comma)%eax,-DCONFIG_AS_CRC32=1)
 avx_instr := $(call as-instr,vxorps %ymm0$(comma)%ymm1$(comma)%ymm2,-DCONFIG_AS_AVX=1)
 avx2_instr :=$(call as-instr,vpbroadcastb %xmm0$(comma)%ymm1,-DCONFIG_AS_AVX2=1)
+sha1_ni_instr :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,-DCONFIG_AS_SHA1_NI=1)
+sha256_ni_instr :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,-DCONFIG_AS_SHA256_NI=1)
 
-KBUILD_AFLAGS += $(asinstr) $(avx_instr) $(avx2_instr)
-KBUILD_CFLAGS += $(asinstr) $(avx_instr) $(avx2_instr)
+KBUILD_AFLAGS += $(asinstr) $(avx_instr) $(avx2_instr) $(sha1_ni_instr) $(sha256_ni_instr)
+KBUILD_CFLAGS += $(asinstr) $(avx_instr) $(avx2_instr) $(sha1_ni_instr) $(sha256_ni_instr)
 
 LDFLAGS := -m elf_$(UTS_MACHINE)
 
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 9a2838c..b9b912a 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -5,6 +5,8 @@
 avx_supported := $(call as-instr,vpxor %xmm0$(comma)%xmm0$(comma)%xmm0,yes,no)
 avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
 				$(comma)4)$(comma)%ymm2,yes,no)
+sha1_ni_supported :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,yes,no)
+sha256_ni_supported :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,yes,no)
 
 obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o
 
@@ -91,9 +93,15 @@ ifeq ($(avx2_supported),yes)
 sha1-ssse3-y += sha1_avx2_x86_64_asm.o
 poly1305-x86_64-y += poly1305-avx2-x86_64.o
 endif
+ifeq ($(sha1_ni_supported),yes)
+sha1-ssse3-y += sha1_ni_asm.o
+endif
 crc32c-intel-y := crc32c-intel_glue.o
 crc32c-intel-$(CONFIG_64BIT) += crc32c-pcl-intel-asm_64.o
 crc32-pclmul-y := crc32-pclmul_asm.o crc32-pclmul_glue.o
 sha256-ssse3-y := sha256-ssse3-asm.o sha256-avx-asm.o sha256-avx2-asm.o sha256_ssse3_glue.o
+ifeq ($(sha256_ni_supported),yes)
+sha256-ssse3-y += sha256_ni_asm.o
+endif
 sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o
 crct10dif-pclmul-y := crct10dif-pcl-asm_64.o crct10dif-pclmul_glue.o
diff --git a/crypto/Kconfig b/crypto/Kconfig
index 48ee3e1..fc93444 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -597,17 +597,18 @@ config CRYPTO_SHA1
 	  SHA-1 secure hash standard (FIPS 180-1/DFIPS 180-2).
 
 config CRYPTO_SHA1_SSSE3
-	tristate "SHA1 digest algorithm (SSSE3/AVX/AVX2)"
+	tristate "SHA1 digest algorithm (SSSE3/AVX/AVX2/SHA-NI)"
 	depends on X86 && 64BIT
 	select CRYPTO_SHA1
 	select CRYPTO_HASH
 	help
 	  SHA-1 secure hash standard (FIPS 180-1/DFIPS 180-2) implemented
 	  using Supplemental SSE3 (SSSE3) instructions or Advanced Vector
-	  Extensions (AVX/AVX2), when available.
+	  Extensions (AVX/AVX2) or SHA-NI(SHA Extensions New Instructions),
+	  when available.
 
 config CRYPTO_SHA256_SSSE3
-	tristate "SHA256 digest algorithm (SSSE3/AVX/AVX2)"
+	tristate "SHA256 digest algorithm (SSSE3/AVX/AVX2/SHA-NI)"
 	depends on X86 && 64BIT
 	select CRYPTO_SHA256
 	select CRYPTO_HASH
@@ -615,7 +616,8 @@ config CRYPTO_SHA256_SSSE3
 	  SHA-256 secure hash standard (DFIPS 180-2) implemented
 	  using Supplemental SSE3 (SSSE3) instructions, or Advanced Vector
 	  Extensions version 1 (AVX1), or Advanced Vector Extensions
-	  version 2 (AVX2) instructions, when available.
+	  version 2 (AVX2) instructions, or SHA-NI (SHA Extensions New
+	  Instructions) when available.
 
 config CRYPTO_SHA512_SSSE3
 	tristate "SHA512 digest algorithm (SSSE3/AVX/AVX2)"
-- 
2.4.2



^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH 3/4] crypto: [sha] glue code for Intel SHA extensions optimized SHA1 & SHA256
  2015-09-10 22:27 ` [PATCH 3/4] crypto: [sha] glue code for Intel SHA extensions optimized SHA1 & SHA256 Tim Chen
@ 2015-09-10 22:52   ` Stephan Mueller
  2015-09-11  0:04     ` Tim Chen
  0 siblings, 1 reply; 12+ messages in thread
From: Stephan Mueller @ 2015-09-10 22:52 UTC (permalink / raw)
  To: Tim Chen
  Cc: Herbert Xu, H. Peter Anvin, David S.Miller, Sean Gulley,
	Chandramouli Narayanan, Vinodh Gopal, James Guilford,
	Wajdi Feghali, Jussi Kivilinna, linux-crypto, linux-kernel

Am Donnerstag, 10. September 2015, 15:27:20 schrieb Tim Chen:

Hi Tim,

>This patch adds the glue code to detect and utilize the Intel SHA
>extensions optimized SHA1 and SHA256 update transforms when available.
>
>This code has been tested on Broxton for functionality.

A general comment on this file: shouldn't this file be cleaned and use the 
standard mechanisms of the kernel crypto API?

This glue implements its own selection of which SHA implementation to use. But 
the kernel crypto API implements that logic already. The issue with the 
current implementation in this file is that you have no clue which particular 
implementation of SHA is in use in one particular case.

So, may I suggest a restructuring to define independent instances of SHA, such 
as

- cra_name == "sha1", cra_driver_name="sha1_ssse3", cra_priority=300
- cra_name == "sha1", cra_driver_name="sha1_avx", cra_priority=400
- cra_name == "sha1", cra_driver_name="sha1_avx2", cra_priority=500
- cra_name == "sha1", cra_driver_name="sha1_shavx", cra_priority=600

Similarly for the other SHAs?

In all the register functions for the ciphers, you can bail out if the 
hardware does not support an implementation.

Ciao
Stephan

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 3/4] crypto: [sha] glue code for Intel SHA extensions optimized SHA1 & SHA256
  2015-09-10 22:52   ` Stephan Mueller
@ 2015-09-11  0:04     ` Tim Chen
  2015-09-11 17:02       ` Stephan Mueller
  0 siblings, 1 reply; 12+ messages in thread
From: Tim Chen @ 2015-09-11  0:04 UTC (permalink / raw)
  To: Stephan Mueller
  Cc: Herbert Xu, H. Peter Anvin, David S.Miller, Sean Gulley,
	Chandramouli Narayanan, Vinodh Gopal, James Guilford,
	Wajdi Feghali, Jussi Kivilinna, linux-crypto, linux-kernel

On Fri, 2015-09-11 at 00:52 +0200, Stephan Mueller wrote:
> Am Donnerstag, 10. September 2015, 15:27:20 schrieb Tim Chen:
> 
> Hi Tim,
> 
> >This patch adds the glue code to detect and utilize the Intel SHA
> >extensions optimized SHA1 and SHA256 update transforms when available.
> >
> >This code has been tested on Broxton for functionality.
> 
> A general comment on this file: shouldn't this file be cleaned and use the 
> standard mechanisms of the kernel crypto API?
> 
> This glue implements its own selection of which SHA implementation to use. But 
> the kernel crypto API implements that logic already. The issue with the 
> current implementation in this file is that you have no clue which particular 
> implementation of SHA is in use in one particular case.
> 
> So, may I suggest a restructuring to define independent instances of SHA, such 
> as
> 
> - cra_name == "sha1", cra_driver_name="sha1_ssse3", cra_priority=300
> - cra_name == "sha1", cra_driver_name="sha1_avx", cra_priority=400
> - cra_name == "sha1", cra_driver_name="sha1_avx2", cra_priority=500
> - cra_name == "sha1", cra_driver_name="sha1_shavx", cra_priority=600
> 
> Similarly for the other SHAs?
> 
> In all the register functions for the ciphers, you can bail out if the 
> hardware does not support an implementation.

Stephan,

Is there a scenario you can think of 
when a lower performing sha1 transform needs to
be exposed as a separate driver?  

Otherwise the glue code logic will only expose the
best performing one for a cpu and hide the others, which was intentional
on our part to prevent a lower performing sha from getting used.

Tim



^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 3/4] crypto: [sha] glue code for Intel SHA extensions optimized SHA1 & SHA256
  2015-09-11  0:04     ` Tim Chen
@ 2015-09-11 17:02       ` Stephan Mueller
  2015-09-11 18:49         ` Tim Chen
  0 siblings, 1 reply; 12+ messages in thread
From: Stephan Mueller @ 2015-09-11 17:02 UTC (permalink / raw)
  To: Tim Chen
  Cc: Herbert Xu, H. Peter Anvin, David S.Miller, Sean Gulley,
	Chandramouli Narayanan, Vinodh Gopal, James Guilford,
	Wajdi Feghali, Jussi Kivilinna, linux-crypto, linux-kernel

Am Donnerstag, 10. September 2015, 17:04:31 schrieb Tim Chen:

Hi Tim,
>
>Is there a scenario you can think of
>when a lower performing sha1 transform needs to
>be exposed as a separate driver?

My immediate concern is testing: it is hard to test the individual 
implementations.
>
>Otherwise the glue code logic will only expose the
>best performing one for a cpu and hide the others, which was intentional
>on our part to prevent a lower performing sha from getting used.

Agreed, but the kernel crypto API does that already using the priorities -- 
IMHO a very clean and easy to interpret solution.

Furthermore, if somebody really has a need to not use the fastest HW 
implementation, the kernel crypto API allows him to do that. With the hard-
wired approach in the glue file, you are stuck.


Ciao
Stephan

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 3/4] crypto: [sha] glue code for Intel SHA extensions optimized SHA1 & SHA256
  2015-09-11 17:02       ` Stephan Mueller
@ 2015-09-11 18:49         ` Tim Chen
  2015-09-11 19:15           ` David Miller
  2015-09-11 19:15           ` Stephan Mueller
  0 siblings, 2 replies; 12+ messages in thread
From: Tim Chen @ 2015-09-11 18:49 UTC (permalink / raw)
  To: Stephan Mueller
  Cc: Herbert Xu, H. Peter Anvin, David S.Miller, Sean Gulley,
	Chandramouli Narayanan, Vinodh Gopal, James Guilford,
	Wajdi Feghali, Jussi Kivilinna, linux-crypto, linux-kernel

On Fri, 2015-09-11 at 19:02 +0200, Stephan Mueller wrote:
> Am Donnerstag, 10. September 2015, 17:04:31 schrieb Tim Chen:
> 
> Hi Tim,
> >
> >Is there a scenario you can think of
> >when a lower performing sha1 transform needs to
> >be exposed as a separate driver?
> 
> My immediate concern is testing: it is hard to test the individual 
> implementations.
> >

Not hard, just one line in the glue code to set the transform
to the one you need if you really want to test an individual 
implementation.  Usually users of sha don't care which sha driver
they get, but just the highest priority one. 
So you will anyway need to patch and change the priority of the sha
driver to expose a specific one for testing.

> >Otherwise the glue code logic will only expose the
> >best performing one for a cpu and hide the others, which was intentional
> >on our part to prevent a lower performing sha from getting used.
> 
> Agreed, but the kernel crypto API does that already using the priorities -- 
> IMHO a very clean and easy to interpret solution.
> 
> Furthermore, if somebody really has a need to not use the fastest HW 
> implementation, the kernel crypto API allows him to do that. With the hard-
> wired approach in the glue file, you are stuck.

Still, why would some kernel module specifically not want to 
use the fastest HW implementation, and explicitly ask for
a slower driver?

Tim


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 3/4] crypto: [sha] glue code for Intel SHA extensions optimized SHA1 & SHA256
  2015-09-11 18:49         ` Tim Chen
@ 2015-09-11 19:15           ` David Miller
  2015-09-11 20:10             ` Tim Chen
  2015-09-11 19:15           ` Stephan Mueller
  1 sibling, 1 reply; 12+ messages in thread
From: David Miller @ 2015-09-11 19:15 UTC (permalink / raw)
  To: tim.c.chen
  Cc: smueller, herbert, hpa, sean.m.gulley, mouli_7982, vinodh.gopal,
	james.guilford, wajdi.k.feghali, jussi.kivilinna, linux-crypto,
	linux-kernel

From: Tim Chen <tim.c.chen@linux.intel.com>
Date: Fri, 11 Sep 2015 11:49:32 -0700

> Still, why would some kernel module specifically not want to 
> use the fastest HW implementation, and explicitly ask for
> a slower driver?

Temporary workaround if a bug is found.

There is really no reason to prevent the user from having this
flexibility, and in return anyone can test any implementation
their cpu can support.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 3/4] crypto: [sha] glue code for Intel SHA extensions optimized SHA1 & SHA256
  2015-09-11 18:49         ` Tim Chen
  2015-09-11 19:15           ` David Miller
@ 2015-09-11 19:15           ` Stephan Mueller
  1 sibling, 0 replies; 12+ messages in thread
From: Stephan Mueller @ 2015-09-11 19:15 UTC (permalink / raw)
  To: Tim Chen
  Cc: Herbert Xu, H. Peter Anvin, David S.Miller, Sean Gulley,
	Chandramouli Narayanan, Vinodh Gopal, James Guilford,
	Wajdi Feghali, Jussi Kivilinna, linux-crypto, linux-kernel

Am Freitag, 11. September 2015, 11:49:32 schrieb Tim Chen:

Hi Tim,

>On Fri, 2015-09-11 at 19:02 +0200, Stephan Mueller wrote:
>> Am Donnerstag, 10. September 2015, 17:04:31 schrieb Tim Chen:
>> 
>> Hi Tim,
>> 
>> >Is there a scenario you can think of
>> >when a lower performing sha1 transform needs to
>> >be exposed as a separate driver?
>> 
>> My immediate concern is testing: it is hard to test the individual
>> implementations.
>
>Not hard, just one line in the glue code to set the transform
>to the one you need if you really want to test an individual
>implementation.  Usually users of sha don't care which sha driver
>they get, but just the highest priority one.
>So you will anyway need to patch and change the priority of the sha
>driver to expose a specific one for testing.

Sure, it is not hard when you recompile. But when you have to test one given 
kernel binary, it is a challenge.
>
>> >Otherwise the glue code logic will only expose the
>> >best performing one for a cpu and hide the others, which was intentional
>> >on our part to prevent a lower performing sha from getting used.
>> 
>> Agreed, but the kernel crypto API does that already using the priorities --
>> IMHO a very clean and easy to interpret solution.
>> 
>> Furthermore, if somebody really has a need to not use the fastest HW
>> implementation, the kernel crypto API allows him to do that. With the hard-
>> wired approach in the glue file, you are stuck.
>
>Still, why would some kernel module specifically not want to
>use the fastest HW implementation, and explicitly ask for
>a slower driver?

I have seen one instance where a hardware driver was broken on one particular 
hardware. So, the only way was to disable it. In our case here, disabling 
means to go back to the software implementation of SHA.

Ciao
Stephan

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 3/4] crypto: [sha] glue code for Intel SHA extensions optimized SHA1 & SHA256
  2015-09-11 19:15           ` David Miller
@ 2015-09-11 20:10             ` Tim Chen
  2015-09-12  8:09               ` Herbert Xu
  0 siblings, 1 reply; 12+ messages in thread
From: Tim Chen @ 2015-09-11 20:10 UTC (permalink / raw)
  To: David Miller
  Cc: smueller, herbert, hpa, sean.m.gulley, mouli_7982, vinodh.gopal,
	james.guilford, wajdi.k.feghali, jussi.kivilinna, linux-crypto,
	linux-kernel

On Fri, 2015-09-11 at 12:15 -0700, David Miller wrote:
> From: Tim Chen <tim.c.chen@linux.intel.com>
> Date: Fri, 11 Sep 2015 11:49:32 -0700
> 
> > Still, why would some kernel module specifically not want to 
> > use the fastest HW implementation, and explicitly ask for
> > a slower driver?
> 
> Temporary workaround if a bug is found.
> 
> There is really no reason to prevent the user from having this
> flexibility, and in return anyone can test any implementation
> their cpu can support.

Mmmm..., this is a restructuring of the algorithms within
the glue code into multiple drivers instead of one and exposing
them all.  It is a bit orthogonal to the intention of this
patch set.  I think it is better that I create a 
separate patch on the glue code on top of this patch set 
to implement this.

Herbert, do you agree with this approach?

Tim


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 3/4] crypto: [sha] glue code for Intel SHA extensions optimized SHA1 & SHA256
  2015-09-11 20:10             ` Tim Chen
@ 2015-09-12  8:09               ` Herbert Xu
  0 siblings, 0 replies; 12+ messages in thread
From: Herbert Xu @ 2015-09-12  8:09 UTC (permalink / raw)
  To: Tim Chen
  Cc: David Miller, smueller, hpa, sean.m.gulley, mouli_7982,
	vinodh.gopal, james.guilford, wajdi.k.feghali, jussi.kivilinna,
	linux-crypto, linux-kernel

On Fri, Sep 11, 2015 at 01:10:27PM -0700, Tim Chen wrote:
> 
> Mmmm..., this is a restructuring of the algorithms within
> the glue code into multiple drivers instead of one and exposing
> them all.  It is a bit orthogonal to the intention of this
> patch set.  I think it is better that I create a 
> separate patch on the glue code on top of this patch set 
> to implement this.
> 
> Herbert, do you agree with this approach?

Yes I think we can work on the individual crypto registration
separately from this patch-set.

Thanks,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2015-09-12  8:10 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
     [not found] <cover.1441929717.git.tim.c.chen@linux.intel.com>
2015-09-10 22:26 ` [PATCH 1/4] crypto: [sha] Intel SHA Extensions optimized SHA1 transform function Tim Chen
2015-09-10 22:27 ` [PATCH 2/4] crypto: [sha] Intel SHA Extensions optimized SHA256 " Tim Chen
2015-09-10 22:27 ` [PATCH 3/4] crypto: [sha] glue code for Intel SHA extensions optimized SHA1 & SHA256 Tim Chen
2015-09-10 22:52   ` Stephan Mueller
2015-09-11  0:04     ` Tim Chen
2015-09-11 17:02       ` Stephan Mueller
2015-09-11 18:49         ` Tim Chen
2015-09-11 19:15           ` David Miller
2015-09-11 20:10             ` Tim Chen
2015-09-12  8:09               ` Herbert Xu
2015-09-11 19:15           ` Stephan Mueller
2015-09-10 22:27 ` [PATCH 4/4] crypto: [sha] Add build support for Intel SHA Extensions optimized SHA1 and SHA256 Tim Chen

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox