* [PATCH v2 0/4] crypto: x86/sha256-ni - cleanup and optimization
From: Eric Biggers @ 2024-04-11 16:23 UTC
To: linux-crypto; +Cc: linux-kernel, Stefan Kanthak
This patchset reduces the amount of source code duplication in
sha256_ni_asm.S and also reduces the binary code size slightly.
Changed in v2:
- Take advantage of .irp in patch 1
- Added two additional cleanup patches
Eric Biggers (4):
crypto: x86/sha256-ni - convert to use rounds macros
crypto: x86/sha256-ni - rename some register aliases
crypto: x86/sha256-ni - optimize code size
crypto: x86/sha256-ni - simplify do_4rounds
arch/x86/crypto/sha256_ni_asm.S | 253 +++++++-------------------------
1 file changed, 49 insertions(+), 204 deletions(-)
base-commit: 20cb38a7af88dc40095da7c2c9094da3873fea23
--
2.44.0
* [PATCH v2 1/4] crypto: x86/sha256-ni - convert to use rounds macros
From: Eric Biggers @ 2024-04-11 16:23 UTC
To: linux-crypto; +Cc: linux-kernel, Stefan Kanthak

From: Eric Biggers <ebiggers@google.com>

To avoid source code duplication, do the SHA-256 rounds using macros.
This reduces the length of sha256_ni_asm.S by 153 lines while still
producing the exact same object file.

Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 arch/x86/crypto/sha256_ni_asm.S | 211 +++++---------------------------
 1 file changed, 29 insertions(+), 182 deletions(-)

diff --git a/arch/x86/crypto/sha256_ni_asm.S b/arch/x86/crypto/sha256_ni_asm.S
index 537b6dcd7ed8..498f67727b94 100644
--- a/arch/x86/crypto/sha256_ni_asm.S
+++ b/arch/x86/crypto/sha256_ni_asm.S
@@ -74,23 +74,43 @@
 #define SHUF_MASK	%xmm8
 #define ABEF_SAVE	%xmm9
 #define CDGH_SAVE	%xmm10
 
+.macro do_4rounds	i, m0, m1, m2, m3
+.if \i < 16
+	movdqu		\i*4(DATA_PTR), MSG
+	pshufb		SHUF_MASK, MSG
+	movdqa		MSG, \m0
+.else
+	movdqa		\m0, MSG
+.endif
+	paddd		\i*4(SHA256CONSTANTS), MSG
+	sha256rnds2	STATE0, STATE1
+.if \i >= 12 && \i < 60
+	movdqa		\m0, MSGTMP4
+	palignr		$4, \m3, MSGTMP4
+	paddd		MSGTMP4, \m1
+	sha256msg2	\m0, \m1
+.endif
+	pshufd		$0x0E, MSG, MSG
+	sha256rnds2	STATE1, STATE0
+.if \i >= 4 && \i < 52
+	sha256msg1	\m0, \m3
+.endif
+.endm
+
 /*
  * Intel SHA Extensions optimized implementation of a SHA-256 update function
  *
  * The function takes a pointer to the current hash values, a pointer to the
  * input data, and a number of 64 byte blocks to process.  Once all blocks have
  * been processed, the digest pointer is updated with the resulting hash value.
  * The function only processes complete blocks, there is no functionality to
  * store partial blocks.  All message padding and hash value initialization must
  * be done outside the update function.
  *
- * The indented lines in the loop are instructions related to rounds processing.
- * The non-indented lines are instructions related to the message schedule.
- *
  * void sha256_ni_transform(uint32_t *digest, const void *data,
  *		uint32_t numBlocks);
  * digest : pointer to digest
  * data: pointer to input data
  * numBlocks: Number of blocks to process
@@ -123,189 +143,16 @@ SYM_TYPED_FUNC_START(sha256_ni_transform)
 
 .Lloop0:
 	/* Save hash values for addition after rounds */
 	movdqa		STATE0, ABEF_SAVE
 	movdqa		STATE1, CDGH_SAVE
 
-	/* Rounds 0-3 */
-	movdqu		0*16(DATA_PTR), MSG
-	pshufb		SHUF_MASK, MSG
-	movdqa		MSG, MSGTMP0
-	paddd		0*16(SHA256CONSTANTS), MSG
-	sha256rnds2	STATE0, STATE1
-	pshufd		$0x0E, MSG, MSG
-	sha256rnds2	STATE1, STATE0
-
-	/* Rounds 4-7 */
-	movdqu		1*16(DATA_PTR), MSG
-	pshufb		SHUF_MASK, MSG
-	movdqa		MSG, MSGTMP1
-	paddd		1*16(SHA256CONSTANTS), MSG
-	sha256rnds2	STATE0, STATE1
-	pshufd		$0x0E, MSG, MSG
-	sha256rnds2	STATE1, STATE0
-	sha256msg1	MSGTMP1, MSGTMP0
-
-	/* Rounds 8-11 */
-	movdqu		2*16(DATA_PTR), MSG
-	pshufb		SHUF_MASK, MSG
-	movdqa		MSG, MSGTMP2
-	paddd		2*16(SHA256CONSTANTS), MSG
-	sha256rnds2	STATE0, STATE1
-	pshufd		$0x0E, MSG, MSG
-	sha256rnds2	STATE1, STATE0
-	sha256msg1	MSGTMP2, MSGTMP1
-
-	/* Rounds 12-15 */
-	movdqu		3*16(DATA_PTR), MSG
-	pshufb		SHUF_MASK, MSG
-	movdqa		MSG, MSGTMP3
-	paddd		3*16(SHA256CONSTANTS), MSG
-	sha256rnds2	STATE0, STATE1
-	movdqa		MSGTMP3, MSGTMP4
-	palignr		$4, MSGTMP2, MSGTMP4
-	paddd		MSGTMP4, MSGTMP0
-	sha256msg2	MSGTMP3, MSGTMP0
-	pshufd		$0x0E, MSG, MSG
-	sha256rnds2	STATE1, STATE0
-	sha256msg1	MSGTMP3, MSGTMP2
-
-	/* Rounds 16-19 */
-	movdqa		MSGTMP0, MSG
-	paddd		4*16(SHA256CONSTANTS), MSG
-	sha256rnds2	STATE0, STATE1
-	movdqa		MSGTMP0, MSGTMP4
-	palignr		$4, MSGTMP3, MSGTMP4
-	paddd		MSGTMP4, MSGTMP1
-	sha256msg2	MSGTMP0, MSGTMP1
-	pshufd		$0x0E, MSG, MSG
-	sha256rnds2	STATE1, STATE0
-	sha256msg1	MSGTMP0, MSGTMP3
-
-	/* Rounds 20-23 */
-	movdqa		MSGTMP1, MSG
-	paddd		5*16(SHA256CONSTANTS), MSG
-	sha256rnds2	STATE0, STATE1
-	movdqa		MSGTMP1, MSGTMP4
-	palignr		$4, MSGTMP0, MSGTMP4
-	paddd		MSGTMP4, MSGTMP2
-	sha256msg2	MSGTMP1, MSGTMP2
-	pshufd		$0x0E, MSG, MSG
-	sha256rnds2	STATE1, STATE0
-	sha256msg1	MSGTMP1, MSGTMP0
-
-	/* Rounds 24-27 */
-	movdqa		MSGTMP2, MSG
-	paddd		6*16(SHA256CONSTANTS), MSG
-	sha256rnds2	STATE0, STATE1
-	movdqa		MSGTMP2, MSGTMP4
-	palignr		$4, MSGTMP1, MSGTMP4
-	paddd		MSGTMP4, MSGTMP3
-	sha256msg2	MSGTMP2, MSGTMP3
-	pshufd		$0x0E, MSG, MSG
-	sha256rnds2	STATE1, STATE0
-	sha256msg1	MSGTMP2, MSGTMP1
-
-	/* Rounds 28-31 */
-	movdqa		MSGTMP3, MSG
-	paddd		7*16(SHA256CONSTANTS), MSG
-	sha256rnds2	STATE0, STATE1
-	movdqa		MSGTMP3, MSGTMP4
-	palignr		$4, MSGTMP2, MSGTMP4
-	paddd		MSGTMP4, MSGTMP0
-	sha256msg2	MSGTMP3, MSGTMP0
-	pshufd		$0x0E, MSG, MSG
-	sha256rnds2	STATE1, STATE0
-	sha256msg1	MSGTMP3, MSGTMP2
-
-	/* Rounds 32-35 */
-	movdqa		MSGTMP0, MSG
-	paddd		8*16(SHA256CONSTANTS), MSG
-	sha256rnds2	STATE0, STATE1
-	movdqa		MSGTMP0, MSGTMP4
-	palignr		$4, MSGTMP3, MSGTMP4
-	paddd		MSGTMP4, MSGTMP1
-	sha256msg2	MSGTMP0, MSGTMP1
-	pshufd		$0x0E, MSG, MSG
-	sha256rnds2	STATE1, STATE0
-	sha256msg1	MSGTMP0, MSGTMP3
-
-	/* Rounds 36-39 */
-	movdqa		MSGTMP1, MSG
-	paddd		9*16(SHA256CONSTANTS), MSG
-	sha256rnds2	STATE0, STATE1
-	movdqa		MSGTMP1, MSGTMP4
-	palignr		$4, MSGTMP0, MSGTMP4
-	paddd		MSGTMP4, MSGTMP2
-	sha256msg2	MSGTMP1, MSGTMP2
-	pshufd		$0x0E, MSG, MSG
-	sha256rnds2	STATE1, STATE0
-	sha256msg1	MSGTMP1, MSGTMP0
-
-	/* Rounds 40-43 */
-	movdqa		MSGTMP2, MSG
-	paddd		10*16(SHA256CONSTANTS), MSG
-	sha256rnds2	STATE0, STATE1
-	movdqa		MSGTMP2, MSGTMP4
-	palignr		$4, MSGTMP1, MSGTMP4
-	paddd		MSGTMP4, MSGTMP3
-	sha256msg2	MSGTMP2, MSGTMP3
-	pshufd		$0x0E, MSG, MSG
-	sha256rnds2	STATE1, STATE0
-	sha256msg1	MSGTMP2, MSGTMP1
-
-	/* Rounds 44-47 */
-	movdqa		MSGTMP3, MSG
-	paddd		11*16(SHA256CONSTANTS), MSG
-	sha256rnds2	STATE0, STATE1
-	movdqa		MSGTMP3, MSGTMP4
-	palignr		$4, MSGTMP2, MSGTMP4
-	paddd		MSGTMP4, MSGTMP0
-	sha256msg2	MSGTMP3, MSGTMP0
-	pshufd		$0x0E, MSG, MSG
-	sha256rnds2	STATE1, STATE0
-	sha256msg1	MSGTMP3, MSGTMP2
-
-	/* Rounds 48-51 */
-	movdqa		MSGTMP0, MSG
-	paddd		12*16(SHA256CONSTANTS), MSG
-	sha256rnds2	STATE0, STATE1
-	movdqa		MSGTMP0, MSGTMP4
-	palignr		$4, MSGTMP3, MSGTMP4
-	paddd		MSGTMP4, MSGTMP1
-	sha256msg2	MSGTMP0, MSGTMP1
-	pshufd		$0x0E, MSG, MSG
-	sha256rnds2	STATE1, STATE0
-	sha256msg1	MSGTMP0, MSGTMP3
-
-	/* Rounds 52-55 */
-	movdqa		MSGTMP1, MSG
-	paddd		13*16(SHA256CONSTANTS), MSG
-	sha256rnds2	STATE0, STATE1
-	movdqa		MSGTMP1, MSGTMP4
-	palignr		$4, MSGTMP0, MSGTMP4
-	paddd		MSGTMP4, MSGTMP2
-	sha256msg2	MSGTMP1, MSGTMP2
-	pshufd		$0x0E, MSG, MSG
-	sha256rnds2	STATE1, STATE0
-
-	/* Rounds 56-59 */
-	movdqa		MSGTMP2, MSG
-	paddd		14*16(SHA256CONSTANTS), MSG
-	sha256rnds2	STATE0, STATE1
-	movdqa		MSGTMP2, MSGTMP4
-	palignr		$4, MSGTMP1, MSGTMP4
-	paddd		MSGTMP4, MSGTMP3
-	sha256msg2	MSGTMP2, MSGTMP3
-	pshufd		$0x0E, MSG, MSG
-	sha256rnds2	STATE1, STATE0
-
-	/* Rounds 60-63 */
-	movdqa		MSGTMP3, MSG
-	paddd		15*16(SHA256CONSTANTS), MSG
-	sha256rnds2	STATE0, STATE1
-	pshufd		$0x0E, MSG, MSG
-	sha256rnds2	STATE1, STATE0
+.irp i, 0, 16, 32, 48
+	do_4rounds	(\i + 0), MSGTMP0, MSGTMP1, MSGTMP2, MSGTMP3
+	do_4rounds	(\i + 4), MSGTMP1, MSGTMP2, MSGTMP3, MSGTMP0
+	do_4rounds	(\i + 8), MSGTMP2, MSGTMP3, MSGTMP0, MSGTMP1
+	do_4rounds	(\i + 12), MSGTMP3, MSGTMP0, MSGTMP1, MSGTMP2
+.endr
 
 	/* Add current hash values with previously saved */
 	paddd		ABEF_SAVE, STATE0
 	paddd		CDGH_SAVE, STATE1
-- 
2.44.0
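[Editorial note: the register rotation that the `.irp` block in this patch encodes is easier to see in scalar form. The following is a rough Python model, not code from the thread: four 4-word "registers" hold a sliding 16-word window of the FIPS 180-4 message schedule, and each quad fills the oldest register with the next four W values, exactly as the rotating MSGTMP0..MSGTMP3 macro arguments do. The SHA-NI instructions are reduced here to the scalar recurrence, and only the schedule (not the rounds) is modeled.]

```python
MASK = 0xFFFFFFFF

def ror(x, n):
    return ((x >> n) | (x << (32 - n))) & MASK

def sigma0(x):  # lowercase sigma_0 from FIPS 180-4
    return ror(x, 7) ^ ror(x, 18) ^ (x >> 3)

def sigma1(x):  # lowercase sigma_1 from FIPS 180-4
    return ror(x, 17) ^ ror(x, 19) ^ (x >> 10)

def schedule_scalar(words):
    """Direct FIPS 180-4 recurrence: W[t] for t = 0..63."""
    w = list(words)
    for t in range(16, 64):
        w.append((sigma1(w[t - 2]) + w[t - 7] +
                  sigma0(w[t - 15]) + w[t - 16]) & MASK)
    return w

def schedule_quads(words):
    """Same schedule, four words at a time with four rotating 4-word
    'registers', mirroring the MSGTMP0..MSGTMP3 rotation in the .irp."""
    m = [list(words[i:i + 4]) for i in range(0, 16, 4)]
    out = m[0] + m[1] + m[2] + m[3]
    for q in range(4, 16):
        window = m[0] + m[1] + m[2] + m[3]  # W[4q-16 .. 4q-1]
        new = []
        for j in range(4):
            prior = window + new            # rounds j=2,3 need words just made
            new.append((sigma1(prior[14 + j]) + prior[9 + j] +
                        sigma0(prior[1 + j]) + prior[j]) & MASK)
        m = [m[1], m[2], m[3], new]         # rotate: oldest register is refilled
        out += new
    return out
```

Both functions return the same 64-word schedule for any 16-word input block, which is why one macro body can serve all sixteen quads.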
* [PATCH v2 2/4] crypto: x86/sha256-ni - rename some register aliases
From: Eric Biggers @ 2024-04-11 16:23 UTC
To: linux-crypto; +Cc: linux-kernel, Stefan Kanthak

From: Eric Biggers <ebiggers@google.com>

MSGTMP[0-3] are used to hold the message schedule and are not temporary
registers per se.  MSGTMP4 is used as a temporary register for several
different purposes and isn't really related to MSGTMP[0-3].  Rename them
to MSG[0-3] and TMP accordingly.

Also add a comment that clarifies what MSG is.

Suggested-by: Stefan Kanthak <stefan.kanthak@nexgo.de>
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 arch/x86/crypto/sha256_ni_asm.S | 34 ++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/arch/x86/crypto/sha256_ni_asm.S b/arch/x86/crypto/sha256_ni_asm.S
index 498f67727b94..b7e7001dafdf 100644
--- a/arch/x86/crypto/sha256_ni_asm.S
+++ b/arch/x86/crypto/sha256_ni_asm.S
@@ -60,18 +60,18 @@
 #define DATA_PTR	%rsi	/* 2nd arg */
 #define NUM_BLKS	%rdx	/* 3rd arg */
 
 #define SHA256CONSTANTS	%rax
 
-#define MSG		%xmm0
+#define MSG		%xmm0  /* sha256rnds2 implicit operand */
 #define STATE0		%xmm1
 #define STATE1		%xmm2
-#define MSGTMP0		%xmm3
-#define MSGTMP1		%xmm4
-#define MSGTMP2		%xmm5
-#define MSGTMP3		%xmm6
-#define MSGTMP4		%xmm7
+#define MSG0		%xmm3
+#define MSG1		%xmm4
+#define MSG2		%xmm5
+#define MSG3		%xmm6
+#define TMP		%xmm7
 
 #define SHUF_MASK	%xmm8
 
 #define ABEF_SAVE	%xmm9
 #define CDGH_SAVE	%xmm10
@@ -85,13 +85,13 @@
 	movdqa		\m0, MSG
 .endif
 	paddd		\i*4(SHA256CONSTANTS), MSG
 	sha256rnds2	STATE0, STATE1
 .if \i >= 12 && \i < 60
-	movdqa		\m0, MSGTMP4
-	palignr		$4, \m3, MSGTMP4
-	paddd		MSGTMP4, \m1
+	movdqa		\m0, TMP
+	palignr		$4, \m3, TMP
+	paddd		TMP, \m1
 	sha256msg2	\m0, \m1
 .endif
 	pshufd		$0x0E, MSG, MSG
 	sha256rnds2	STATE1, STATE0
 .if \i >= 4 && \i < 52
@@ -131,27 +131,27 @@ SYM_TYPED_FUNC_START(sha256_ni_transform)
 	movdqu		0*16(DIGEST_PTR), STATE0
 	movdqu		1*16(DIGEST_PTR), STATE1
 
 	pshufd		$0xB1, STATE0, STATE0		/* CDAB */
 	pshufd		$0x1B, STATE1, STATE1		/* EFGH */
-	movdqa		STATE0, MSGTMP4
+	movdqa		STATE0, TMP
 	palignr		$8, STATE1, STATE0		/* ABEF */
-	pblendw		$0xF0, MSGTMP4, STATE1		/* CDGH */
+	pblendw		$0xF0, TMP, STATE1		/* CDGH */
 
 	movdqa		PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
 	lea		K256(%rip), SHA256CONSTANTS
 
 .Lloop0:
 	/* Save hash values for addition after rounds */
 	movdqa		STATE0, ABEF_SAVE
 	movdqa		STATE1, CDGH_SAVE
 
 .irp i, 0, 16, 32, 48
-	do_4rounds	(\i + 0), MSGTMP0, MSGTMP1, MSGTMP2, MSGTMP3
-	do_4rounds	(\i + 4), MSGTMP1, MSGTMP2, MSGTMP3, MSGTMP0
-	do_4rounds	(\i + 8), MSGTMP2, MSGTMP3, MSGTMP0, MSGTMP1
-	do_4rounds	(\i + 12), MSGTMP3, MSGTMP0, MSGTMP1, MSGTMP2
+	do_4rounds	(\i + 0), MSG0, MSG1, MSG2, MSG3
+	do_4rounds	(\i + 4), MSG1, MSG2, MSG3, MSG0
+	do_4rounds	(\i + 8), MSG2, MSG3, MSG0, MSG1
+	do_4rounds	(\i + 12), MSG3, MSG0, MSG1, MSG2
 .endr
 
 	/* Add current hash values with previously saved */
 	paddd		ABEF_SAVE, STATE0
 	paddd		CDGH_SAVE, STATE1
@@ -162,13 +162,13 @@ SYM_TYPED_FUNC_START(sha256_ni_transform)
 	jne		.Lloop0
 
 	/* Write hash values back in the correct order */
 	pshufd		$0x1B, STATE0, STATE0		/* FEBA */
 	pshufd		$0xB1, STATE1, STATE1		/* DCHG */
-	movdqa		STATE0, MSGTMP4
+	movdqa		STATE0, TMP
 	pblendw		$0xF0, STATE1, STATE0		/* DCBA */
-	palignr		$8, MSGTMP4, STATE1		/* HGFE */
+	palignr		$8, TMP, STATE1			/* HGFE */
 
 	movdqu		STATE0, 0*16(DIGEST_PTR)
 	movdqu		STATE1, 1*16(DIGEST_PTR)
 
 .Ldone_hash:
-- 
2.44.0
* [PATCH v2 3/4] crypto: x86/sha256-ni - optimize code size
From: Eric Biggers @ 2024-04-11 16:23 UTC
To: linux-crypto; +Cc: linux-kernel, Stefan Kanthak

From: Eric Biggers <ebiggers@google.com>

- Load the SHA-256 round constants relative to a pointer that points
  into the middle of the constants rather than to the beginning.  Since
  x86 instructions use signed offsets, this decreases the instruction
  length required to access some of the later round constants.

- Use punpcklqdq or punpckhqdq instead of longer instructions such as
  pshufd, pblendw, and palignr.  This doesn't harm performance.

The end result is that sha256_ni_transform shrinks from 839 bytes to
791 bytes, with no loss in performance.

Suggested-by: Stefan Kanthak <stefan.kanthak@nexgo.de>
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 arch/x86/crypto/sha256_ni_asm.S | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/arch/x86/crypto/sha256_ni_asm.S b/arch/x86/crypto/sha256_ni_asm.S
index b7e7001dafdf..ffc9f1c75c15 100644
--- a/arch/x86/crypto/sha256_ni_asm.S
+++ b/arch/x86/crypto/sha256_ni_asm.S
@@ -82,19 +82,19 @@
 	pshufb		SHUF_MASK, MSG
 	movdqa		MSG, \m0
 .else
 	movdqa		\m0, MSG
 .endif
-	paddd		\i*4(SHA256CONSTANTS), MSG
+	paddd		(\i-32)*4(SHA256CONSTANTS), MSG
 	sha256rnds2	STATE0, STATE1
 .if \i >= 12 && \i < 60
 	movdqa		\m0, TMP
 	palignr		$4, \m3, TMP
 	paddd		TMP, \m1
 	sha256msg2	\m0, \m1
 .endif
-	pshufd		$0x0E, MSG, MSG
+	punpckhqdq	MSG, MSG
 	sha256rnds2	STATE1, STATE0
 .if \i >= 4 && \i < 52
 	sha256msg1	\m0, \m3
 .endif
 .endm
@@ -126,21 +126,21 @@ SYM_TYPED_FUNC_START(sha256_ni_transform)
 	/*
 	 * load initial hash values
 	 * Need to reorder these appropriately
 	 * DCBA, HGFE -> ABEF, CDGH
 	 */
-	movdqu		0*16(DIGEST_PTR), STATE0
-	movdqu		1*16(DIGEST_PTR), STATE1
+	movdqu		0*16(DIGEST_PTR), STATE0	/* DCBA */
+	movdqu		1*16(DIGEST_PTR), STATE1	/* HGFE */
 
-	pshufd		$0xB1, STATE0, STATE0		/* CDAB */
-	pshufd		$0x1B, STATE1, STATE1		/* EFGH */
 	movdqa		STATE0, TMP
-	palignr		$8, STATE1, STATE0		/* ABEF */
-	pblendw		$0xF0, TMP, STATE1		/* CDGH */
+	punpcklqdq	STATE1, STATE0			/* FEBA */
+	punpckhqdq	TMP, STATE1			/* DCHG */
+	pshufd		$0x1B, STATE0, STATE0		/* ABEF */
+	pshufd		$0xB1, STATE1, STATE1		/* CDGH */
 
 	movdqa		PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
-	lea		K256(%rip), SHA256CONSTANTS
+	lea		K256+32*4(%rip), SHA256CONSTANTS
 
 .Lloop0:
 	/* Save hash values for addition after rounds */
 	movdqa		STATE0, ABEF_SAVE
 	movdqa		STATE1, CDGH_SAVE
@@ -160,18 +160,18 @@ SYM_TYPED_FUNC_START(sha256_ni_transform)
 	add		$64, DATA_PTR
 	cmp		NUM_BLKS, DATA_PTR
 	jne		.Lloop0
 
 	/* Write hash values back in the correct order */
-	pshufd		$0x1B, STATE0, STATE0		/* FEBA */
-	pshufd		$0xB1, STATE1, STATE1		/* DCHG */
 	movdqa		STATE0, TMP
-	pblendw		$0xF0, STATE1, STATE0		/* DCBA */
-	palignr		$8, TMP, STATE1			/* HGFE */
+	punpcklqdq	STATE1, STATE0			/* GHEF */
+	punpckhqdq	TMP, STATE1			/* ABCD */
+	pshufd		$0xB1, STATE0, STATE0		/* HGFE */
+	pshufd		$0x1B, STATE1, STATE1		/* DCBA */
 
-	movdqu		STATE0, 0*16(DIGEST_PTR)
-	movdqu		STATE1, 1*16(DIGEST_PTR)
+	movdqu		STATE1, 0*16(DIGEST_PTR)
+	movdqu		STATE0, 1*16(DIGEST_PTR)
 
 .Ldone_hash:
 
 	RET
 SYM_FUNC_END(sha256_ni_transform)
-- 
2.44.0
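[Editorial note: the first bullet of this patch can be quantified with a quick back-of-the-envelope sketch (ours, not from the patch). An x86 memory operand encodes its displacement in one byte when the value fits in signed 8 bits, and in four bytes otherwise; rebasing SHA256CONSTANTS by 32*4 = 128 bytes moves all sixteen constant-load offsets into the disp8 range.]

```python
def disp_bytes(disp):
    """Displacement size in an x86 ModRM memory operand: 1 byte if it
    fits in signed 8 bits, else 4 bytes (disp32)."""
    return 1 if -128 <= disp <= 127 else 4

# Byte offsets of the sixteen 16-byte groups of K256 round constants.
offsets = [16 * q for q in range(16)]            # 0, 16, ..., 240

before = sum(disp_bytes(d) for d in offsets)           # lea K256(%rip), ...
after = sum(disp_bytes(d - 32 * 4) for d in offsets)   # lea K256+32*4(%rip), ...
print(before - after)  # bytes saved on the paddd memory operands -> 24
```

By this estimate the rebasing accounts for 24 of the 48 bytes saved; the remainder presumably comes from the shorter punpcklqdq/punpckhqdq encodings replacing pshufd, pblendw, and palignr.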
* [PATCH v2 4/4] crypto: x86/sha256-ni - simplify do_4rounds
From: Eric Biggers @ 2024-04-11 16:23 UTC
To: linux-crypto; +Cc: linux-kernel, Stefan Kanthak

From: Eric Biggers <ebiggers@google.com>

Instead of loading the message words into both MSG and \m0 and then
adding the round constants to MSG, load the message words into \m0 and
the round constants into MSG and then add \m0 to MSG.  This shortens the
source code slightly.  It changes the instructions slightly, but it
doesn't affect binary code size and doesn't seem to affect performance.

Suggested-by: Stefan Kanthak <stefan.kanthak@nexgo.de>
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 arch/x86/crypto/sha256_ni_asm.S | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/arch/x86/crypto/sha256_ni_asm.S b/arch/x86/crypto/sha256_ni_asm.S
index ffc9f1c75c15..d515a55a3bc1 100644
--- a/arch/x86/crypto/sha256_ni_asm.S
+++ b/arch/x86/crypto/sha256_ni_asm.S
@@ -76,17 +76,15 @@
 #define ABEF_SAVE	%xmm9
 #define CDGH_SAVE	%xmm10
 
 .macro do_4rounds	i, m0, m1, m2, m3
 .if \i < 16
-	movdqu		\i*4(DATA_PTR), MSG
-	pshufb		SHUF_MASK, MSG
-	movdqa		MSG, \m0
-.else
-	movdqa		\m0, MSG
+	movdqu		\i*4(DATA_PTR), \m0
+	pshufb		SHUF_MASK, \m0
 .endif
-	paddd		(\i-32)*4(SHA256CONSTANTS), MSG
+	movdqa		(\i-32)*4(SHA256CONSTANTS), MSG
+	paddd		\m0, MSG
 	sha256rnds2	STATE0, STATE1
 .if \i >= 12 && \i < 60
 	movdqa		\m0, TMP
 	palignr		$4, \m3, TMP
 	paddd		TMP, \m1
-- 
2.44.0
* Re: [PATCH v2 4/4] crypto: x86/sha256-ni - simplify do_4rounds
From: Stefan Kanthak @ 2024-04-15 20:41 UTC
To: Eric Biggers, linux-crypto; +Cc: linux-kernel

"Eric Biggers" <ebiggers@kernel.org> wrote:

> Instead of loading the message words into both MSG and \m0 and then
> adding the round constants to MSG, load the message words into \m0 and
> the round constants into MSG and then add \m0 to MSG.  This shortens the
> source code slightly.  It changes the instructions slightly, but it
> doesn't affect binary code size and doesn't seem to affect performance.

At last the final change: write the macro straightforward and SIMPLE,
closely matching NIST.FIPS.180-4.pdf and their order of operations.

@@ ...
+.macro sha256	m0 :req, m1 :req, m2 :req, m3 :req
+.if \@ < 4
+	movdqu		\@*16(DATA_PTR), \m0
+	pshufb		SHUF_MASK, \m0	# \m0 = {w(\@*16), w(\@*16+1), w(\@*16+2), w(\@*16+3)}
+.else
+	# \m0 = {w(\@*16-16), w(\@*16-15), w(\@*16-14), w(\@*16-13)}
+	# \m1 = {w(\@*16-12), w(\@*16-11), w(\@*16-10), w(\@*16-9)}
+	# \m2 = {w(\@*16-8), w(\@*16-7), w(\@*16-6), w(\@*16-5)}
+	# \m3 = {w(\@*16-4), w(\@*16-3), w(\@*16-2), w(\@*16-1)}
+	sha256msg1	\m1, \m0
+	movdqa		\m3, TMP
+	palignr		$4, \m2, TMP
+	paddd		TMP, \m0
+	sha256msg2	\m3, \m0	# \m0 = {w(\@*16), w(\@*16+1), w(\@*16+2), w(\@*16+3)}
+.endif
+	movdqa		(\@-8)*16(SHA256CONSTANTS), MSG
+	paddd		\m0, MSG
+	sha256rnds2	STATE0, STATE1	# STATE1 = {f', e', b', a'}
+	punpckhqdq	MSG, MSG
+	sha256rnds2	STATE1, STATE0	# STATE0 = {f", e", b", a"},
+					# STATE1 = {h", g", d", c"}
+.endm

JFTR: you may simplify this further using .altmacro and generate \m0 to \m3
      as MSG%(4-\@&3), MSG%(5-\@&3), MSG%(6-\@&3) and MSG%(7-\@&3) within
      the macro, thus getting rid of its 4 arguments.

@@ ...
+.rept 4	# 4*4*4 rounds
+	sha256	MSG0, MSG1, MSG2, MSG3
+	sha256	MSG1, MSG2, MSG3, MSG0
+	sha256	MSG2, MSG3, MSG0, MSG1
+	sha256	MSG3, MSG0, MSG1, MSG2
+.endr

Now that all code written by Tim Chen and Sean Gulley is gone,
remove their copyright notice and insert your and my name instead.

regards
Stefan

PS: see <https://skanthak.homepage.t-online.de/fips-180.html>
    (which I still polish) not just for this implementation.

PPS: if MASM had a counter like \@, I'd used it there.
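[Editorial note: a side remark on the `punpckhqdq MSG, MSG` used both in patch 3 and in the macro above. It is a drop-in replacement for `pshufd $0x0E, MSG, MSG` only because sha256rnds2 reads just the low 64 bits of its implicit xmm0 operand. A small lane-level model in Python (our sketch, not from the thread) makes this visible; each xmm register is modeled as four 32-bit lanes, lane 0 lowest.]

```python
def pshufd(src, imm):
    """pshufd dst, src, imm: each result dword selected by a 2-bit field."""
    return [src[(imm >> (2 * i)) & 3] for i in range(4)]

def punpckhqdq(dst, src):
    """punpckhqdq dst, src (Intel operand order):
    dst low qword = dst high qword; dst high qword = src high qword."""
    return [dst[2], dst[3], src[2], src[3]]

msg = [0x11111111, 0x22222222, 0x33333333, 0x44444444]
a = pshufd(msg, 0x0E)        # [lane2, lane3, lane0, lane0]
b = punpckhqdq(msg, msg)     # [lane2, lane3, lane2, lane3]
print(a[:2] == b[:2])        # low qword identical -> True
print(a == b)                # high qwords differ, but are unused -> False
```

The two results agree in the low quadword, which is all sha256rnds2 consumes from xmm0; punpckhqdq also encodes one byte shorter than pshufd since it takes no immediate.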
* Re: [PATCH v2 4/4] crypto: x86/sha256-ni - simplify do_4rounds
From: Eric Biggers @ 2024-04-15 21:21 UTC
To: Stefan Kanthak; +Cc: linux-crypto, linux-kernel

On Mon, Apr 15, 2024 at 10:41:07PM +0200, Stefan Kanthak wrote:
> "Eric Biggers" <ebiggers@kernel.org> wrote:
>
> > Instead of loading the message words into both MSG and \m0 and then
> > adding the round constants to MSG, load the message words into \m0 and
> > the round constants into MSG and then add \m0 to MSG.  This shortens the
> > source code slightly.  It changes the instructions slightly, but it
> > doesn't affect binary code size and doesn't seem to affect performance.
>
> At last the final change: write the macro straightforward and SIMPLE,
> closely matching NIST.FIPS.180-4.pdf and their order of operations.
>
> @@ ...
> +.macro sha256	m0 :req, m1 :req, m2 :req, m3 :req
> +.if \@ < 4
> +	movdqu		\@*16(DATA_PTR), \m0
> +	pshufb		SHUF_MASK, \m0	# \m0 = {w(\@*16), w(\@*16+1), w(\@*16+2), w(\@*16+3)}
> +.else
> +	# \m0 = {w(\@*16-16), w(\@*16-15), w(\@*16-14), w(\@*16-13)}
> +	# \m1 = {w(\@*16-12), w(\@*16-11), w(\@*16-10), w(\@*16-9)}
> +	# \m2 = {w(\@*16-8), w(\@*16-7), w(\@*16-6), w(\@*16-5)}
> +	# \m3 = {w(\@*16-4), w(\@*16-3), w(\@*16-2), w(\@*16-1)}
> +	sha256msg1	\m1, \m0
> +	movdqa		\m3, TMP
> +	palignr		$4, \m2, TMP
> +	paddd		TMP, \m0
> +	sha256msg2	\m3, \m0	# \m0 = {w(\@*16), w(\@*16+1), w(\@*16+2), w(\@*16+3)}
> +.endif
> +	movdqa		(\@-8)*16(SHA256CONSTANTS), MSG
> +	paddd		\m0, MSG
> +	sha256rnds2	STATE0, STATE1	# STATE1 = {f', e', b', a'}
> +	punpckhqdq	MSG, MSG
> +	sha256rnds2	STATE1, STATE0	# STATE0 = {f", e", b", a"},
> +					# STATE1 = {h", g", d", c"}
> +.endm
>
> JFTR: you may simplify this further using .altmacro and generate \m0 to \m3
>       as MSG%(4-\@&3), MSG%(5-\@&3), MSG%(6-\@&3) and MSG%(7-\@&3) within
>       the macro, thus getting rid of its 4 arguments.
>
> @@ ...
> +.rept 4	# 4*4*4 rounds
> +	sha256	MSG0, MSG1, MSG2, MSG3
> +	sha256	MSG1, MSG2, MSG3, MSG0
> +	sha256	MSG2, MSG3, MSG0, MSG1
> +	sha256	MSG3, MSG0, MSG1, MSG2
> +.endr

Could you please send a real patch, following
Documentation/process/submitting-patches.rst?  It's hard to understand
what you're proposing here.

> Now that all code written by Tim Chen and Sean Gulley is gone,
> remove their copyright notice and insert your and my name instead.

Well, their code has been cleaned up.  We have to keep copyright notices
around unless we're certain they can go.

> regards
> Stefan
>
> PS: see <https://skanthak.homepage.t-online.de/fips-180.html>
>     (which I still polish) not just for this implementation.
>
> PPS: if MASM had a counter like \@, I'd used it there.

Thanks,

- Eric
* Re: [PATCH v2 4/4] crypto: x86/sha256-ni - simplify do_4rounds
From: Stefan Kanthak @ 2024-04-15 22:04 UTC
To: Eric Biggers; +Cc: linux-crypto, linux-kernel

"Eric Biggers" <ebiggers@kernel.org> wrote:

> On Mon, Apr 15, 2024 at 10:41:07PM +0200, Stefan Kanthak wrote:
> [...]
>> At last the final change: write the macro straightforward and SIMPLE,
>> closely matching NIST.FIPS.180-4.pdf and their order of operations.
>>
>> @@ ...
>> +.macro sha256	m0 :req, m1 :req, m2 :req, m3 :req
>> +.if \@ < 4
>> +	movdqu		\@*16(DATA_PTR), \m0
>> +	pshufb		SHUF_MASK, \m0	# \m0 = {w(\@*16), w(\@*16+1), w(\@*16+2), w(\@*16+3)}
>> +.else
>> +	# \m0 = {w(\@*16-16), w(\@*16-15), w(\@*16-14), w(\@*16-13)}
>> +	# \m1 = {w(\@*16-12), w(\@*16-11), w(\@*16-10), w(\@*16-9)}
>> +	# \m2 = {w(\@*16-8), w(\@*16-7), w(\@*16-6), w(\@*16-5)}
>> +	# \m3 = {w(\@*16-4), w(\@*16-3), w(\@*16-2), w(\@*16-1)}
>> +	sha256msg1	\m1, \m0
>> +	movdqa		\m3, TMP
>> +	palignr		$4, \m2, TMP
>> +	paddd		TMP, \m0
>> +	sha256msg2	\m3, \m0	# \m0 = {w(\@*16), w(\@*16+1), w(\@*16+2), w(\@*16+3)}
>> +.endif
>> +	movdqa		(\@-8)*16(SHA256CONSTANTS), MSG
>> +	paddd		\m0, MSG
>> +	sha256rnds2	STATE0, STATE1	# STATE1 = {f', e', b', a'}
>> +	punpckhqdq	MSG, MSG
>> +	sha256rnds2	STATE1, STATE0	# STATE0 = {f", e", b", a"},
>> +					# STATE1 = {h", g", d", c"}
>> +.endm
>>
>> JFTR: you may simplify this further using .altmacro and generate \m0 to \m3
>>       as MSG%(4-\@&3), MSG%(5-\@&3), MSG%(6-\@&3) and MSG%(7-\@&3) within
>>       the macro, thus getting rid of its 4 arguments.
>>
>> @@ ...
>> +.rept 4	# 4*4*4 rounds
>> +	sha256	MSG0, MSG1, MSG2, MSG3
>> +	sha256	MSG1, MSG2, MSG3, MSG0
>> +	sha256	MSG2, MSG3, MSG0, MSG1
>> +	sha256	MSG3, MSG0, MSG1, MSG2
>> +.endr
>
> Could you please send a real patch, following
> Documentation/process/submitting-patches.rst?  It's hard to understand
> what you're proposing here.

1) I replace your macro (which unfortunately follows Tim Chens twisted code)
   COMPLETELY with a clean and simple implementation: message schedule first,
   update of state variables last.
   You don't need ".if \i >= 12 && \i < 60"/".if \i >= 4 && \i < 52" at all!

2) I replace the .irp which invokes your macro with a .rept: my macro uses \@
   instead of an argument for the round number.

<https://sourceware.org/binutils/docs/as.html#Macro>
| \@
| as maintains a counter of how many macros it has executed in this pseudo-
| variable; you can copy that number to your output with '\@', but only
| within a macro definition.

That's all.

regards
Stefan
* Re: [PATCH v2 4/4] crypto: x86/sha256-ni - simplify do_4rounds
From: Eric Biggers @ 2024-04-15 22:46 UTC
To: Stefan Kanthak; +Cc: linux-crypto, linux-kernel

On Tue, Apr 16, 2024 at 12:04:56AM +0200, Stefan Kanthak wrote:
> "Eric Biggers" <ebiggers@kernel.org> wrote:
>
> > On Mon, Apr 15, 2024 at 10:41:07PM +0200, Stefan Kanthak wrote:
> [...]
> >> At last the final change: write the macro straightforward and SIMPLE,
> >> closely matching NIST.FIPS.180-4.pdf and their order of operations.
> >>
> >> @@ ...
> >> +.macro sha256	m0 :req, m1 :req, m2 :req, m3 :req
> >> +.if \@ < 4
> >> +	movdqu		\@*16(DATA_PTR), \m0
> >> +	pshufb		SHUF_MASK, \m0	# \m0 = {w(\@*16), w(\@*16+1), w(\@*16+2), w(\@*16+3)}
> >> +.else
> >> +	# \m0 = {w(\@*16-16), w(\@*16-15), w(\@*16-14), w(\@*16-13)}
> >> +	# \m1 = {w(\@*16-12), w(\@*16-11), w(\@*16-10), w(\@*16-9)}
> >> +	# \m2 = {w(\@*16-8), w(\@*16-7), w(\@*16-6), w(\@*16-5)}
> >> +	# \m3 = {w(\@*16-4), w(\@*16-3), w(\@*16-2), w(\@*16-1)}
> >> +	sha256msg1	\m1, \m0
> >> +	movdqa		\m3, TMP
> >> +	palignr		$4, \m2, TMP
> >> +	paddd		TMP, \m0
> >> +	sha256msg2	\m3, \m0	# \m0 = {w(\@*16), w(\@*16+1), w(\@*16+2), w(\@*16+3)}
> >> +.endif
> >> +	movdqa		(\@-8)*16(SHA256CONSTANTS), MSG
> >> +	paddd		\m0, MSG
> >> +	sha256rnds2	STATE0, STATE1	# STATE1 = {f', e', b', a'}
> >> +	punpckhqdq	MSG, MSG
> >> +	sha256rnds2	STATE1, STATE0	# STATE0 = {f", e", b", a"},
> >> +					# STATE1 = {h", g", d", c"}
> >> +.endm
> >>
> >> JFTR: you may simplify this further using .altmacro and generate \m0 to \m3
> >>       as MSG%(4-\@&3), MSG%(5-\@&3), MSG%(6-\@&3) and MSG%(7-\@&3) within
> >>       the macro, thus getting rid of its 4 arguments.
> >>
> >> @@ ...
> >> +.rept 4	# 4*4*4 rounds
> >> +	sha256	MSG0, MSG1, MSG2, MSG3
> >> +	sha256	MSG1, MSG2, MSG3, MSG0
> >> +	sha256	MSG2, MSG3, MSG0, MSG1
> >> +	sha256	MSG3, MSG0, MSG1, MSG2
> >> +.endr
> >
> > Could you please send a real patch, following
> > Documentation/process/submitting-patches.rst?  It's hard to understand what
> > you're proposing here.
>
> 1) I replace your macro (which unfortunately follows Tim Chens twisted code)
>    COMPLETELY with a clean and simple implementation: message schedule first,
>    update of state variables last.
>    You don't need ".if \i >= 12 && \i < 60"/".if \i >= 4 && \i < 52" at all!

It's probably intentional that the code does the message schedule
computations a bit ahead of time.  This might improve performance by
reducing the time spent waiting for the message schedule.

It would be worth trying a few different variants on different CPUs and
seeing how they actually perform in practice, though.

> 2) I replace the .irp which invokes your macro with a .rept: my macro uses \@
>    instead of an argument for the round number.
>
> <https://sourceware.org/binutils/docs/as.html#Macro>

The \@ feature is a bit obscure and maybe is best avoided.

- Eric
* Re: [PATCH v2 4/4] crypto: x86/sha256-ni - simplify do_4rounds
From: Stefan Kanthak @ 2024-04-16  0:17 UTC
To: Eric Biggers; +Cc: linux-crypto, linux-kernel

"Eric Biggers" <ebiggers@kernel.org> wrote:

> On Tue, Apr 16, 2024 at 12:04:56AM +0200, Stefan Kanthak wrote:
>> "Eric Biggers" <ebiggers@kernel.org> wrote:
>>
>> > On Mon, Apr 15, 2024 at 10:41:07PM +0200, Stefan Kanthak wrote:
>> [...]
>> >> At last the final change: write the macro straightforward and SIMPLE,
>> >> closely matching NIST.FIPS.180-4.pdf and their order of operations.
>> >>
>> >> @@ ...
>> >> +.macro sha256	m0 :req, m1 :req, m2 :req, m3 :req
>> >> +.if \@ < 4
>> >> +	movdqu		\@*16(DATA_PTR), \m0
>> >> +	pshufb		SHUF_MASK, \m0	# \m0 = {w(\@*16), w(\@*16+1), w(\@*16+2), w(\@*16+3)}
>> >> +.else
>> >> +	# \m0 = {w(\@*16-16), w(\@*16-15), w(\@*16-14), w(\@*16-13)}
>> >> +	# \m1 = {w(\@*16-12), w(\@*16-11), w(\@*16-10), w(\@*16-9)}
>> >> +	# \m2 = {w(\@*16-8), w(\@*16-7), w(\@*16-6), w(\@*16-5)}
>> >> +	# \m3 = {w(\@*16-4), w(\@*16-3), w(\@*16-2), w(\@*16-1)}
>> >> +	sha256msg1	\m1, \m0
>> >> +	movdqa		\m3, TMP
>> >> +	palignr		$4, \m2, TMP
>> >> +	paddd		TMP, \m0
>> >> +	sha256msg2	\m3, \m0	# \m0 = {w(\@*16), w(\@*16+1), w(\@*16+2), w(\@*16+3)}
>> >> +.endif
>> >> +	movdqa		(\@-8)*16(SHA256CONSTANTS), MSG
>> >> +	paddd		\m0, MSG
>> >> +	sha256rnds2	STATE0, STATE1	# STATE1 = {f', e', b', a'}
>> >> +	punpckhqdq	MSG, MSG
>> >> +	sha256rnds2	STATE1, STATE0	# STATE0 = {f", e", b", a"},
>> >> +					# STATE1 = {h", g", d", c"}
>> >> +.endm
>> >>
>> >> JFTR: you may simplify this further using .altmacro and generate \m0 to \m3
>> >>       as MSG%(4-\@&3), MSG%(5-\@&3), MSG%(6-\@&3) and MSG%(7-\@&3) within
>> >>       the macro, thus getting rid of its 4 arguments.
>> >>
>> >> @@ ...
>> >> +.rept 4	# 4*4*4 rounds
>> >> +	sha256	MSG0, MSG1, MSG2, MSG3
>> >> +	sha256	MSG1, MSG2, MSG3, MSG0
>> >> +	sha256	MSG2, MSG3, MSG0, MSG1
>> >> +	sha256	MSG3, MSG0, MSG1, MSG2
>> >> +.endr
>> >
>> > Could you please send a real patch, following
>> > Documentation/process/submitting-patches.rst?  It's hard to understand what
>> > you're proposing here.
>>
>> 1) I replace your macro (which unfortunately follows Tim Chens twisted code)
>>    COMPLETELY with a clean and simple implementation: message schedule first,
>>    update of state variables last.
>>    You don't need ".if \i >= 12 && \i < 60"/".if \i >= 4 && \i < 52" at all!
>
> It's probably intentional that the code does the message schedule computations a
> bit ahead of time.  This might improve performance by reducing the time spent
> waiting for the message schedule.

While this is a valid point, Intel states in
<https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sha-extensions.html>
only for SHA-1:

| In other words, the rounds processing is the critical path and the latency of
| sha1rnds4 determines the performance of SHA-1 calculations.

For SHA-256 no such explicit statement is given: did the authors consider it
not worthwhile?

JFTR: while Tim Chen's code (following the paper) executes 3 instructions and
1 sha256msg2 between every 2 sha256rnds2, my macro executes them back to back,
so my code would be slower if their latency determines performance.

> It would be worth trying a few different variants on different CPUs and seeing
> how they actually perform in practice, though.

Right. I noticed no difference on Zen2+ and Zen3; Intel CPUs with SHA-NI are
not available to me (I didn't bother to use __asm__ on Compiler Explorer).

Stefan
* Re: [PATCH v2 0/4] crypto: x86/sha256-ni - cleanup and optimization
From: Herbert Xu @ 2024-04-19 11:00 UTC
To: Eric Biggers; +Cc: linux-crypto, linux-kernel, stefan.kanthak

Eric Biggers <ebiggers@kernel.org> wrote:
> This patchset reduces the amount of source code duplication in
> sha256_ni_asm.S and also reduces the binary code size slightly.
>
> Changed in v2:
>    - Take advantage of .irp in patch 1
>    - Added two additional cleanup patches
>
> Eric Biggers (4):
>   crypto: x86/sha256-ni - convert to use rounds macros
>   crypto: x86/sha256-ni - rename some register aliases
>   crypto: x86/sha256-ni - optimize code size
>   crypto: x86/sha256-ni - simplify do_4rounds
>
>  arch/x86/crypto/sha256_ni_asm.S | 253 +++++++-------------------------
>  1 file changed, 49 insertions(+), 204 deletions(-)
>
> base-commit: 20cb38a7af88dc40095da7c2c9094da3873fea23

All applied.  Thanks.
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt