[PATCH] crypto: arm64/aes-ce - Simplify round key load sequence

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH] crypto: arm64/aes-ce - Simplify round key load sequence
@ 2024-04-15 13:04 Ard Biesheuvel
  2024-04-17  6:14 ` Eric Biggers
  2024-04-26  9:30 ` Herbert Xu
  0 siblings, 2 replies; 3+ messages in thread
From: Ard Biesheuvel @ 2024-04-15 13:04 UTC (permalink / raw)
  To: linux-crypto; +Cc: herbert, ebiggers, Ard Biesheuvel

From: Ard Biesheuvel <ardb@kernel.org>

Tweak the round key logic so that the round keys can be loaded using a
single branchless sequence of overlapping loads. This is shorter and
simpler, and puts the conditional branches based on the key size further
apart, which might benefit microarchitectures that cannot record taken
branches at every instruction. For these branches, use test-bit-branch
instructions that don't clobber the condition flags.

Note that none of this has any impact on performance, positive or
otherwise (and the branch prediction improvement would only help AES-192,
which nobody uses). It does make for nicer code, though.

While at it, use \@ to generate the labels inside the macros, which is
more robust than using fixed numbers, which could clash inadvertently.
Also, bring aes-neon.S in line with these changes, including the switch
to branch instructions that leave the condition flags untouched (cbz),
to avoid surprises in the future when we might start relying on the
condition flags being preserved in the chaining mode wrappers in
aes-modes.S.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 arch/arm64/crypto/aes-ce.S   | 34 ++++++++++++++--------------------
 arch/arm64/crypto/aes-neon.S | 20 ++++++++++----------
 2 files changed, 24 insertions(+), 30 deletions(-)

diff --git a/arch/arm64/crypto/aes-ce.S b/arch/arm64/crypto/aes-ce.S
index 1dc5bbbfeed2..b262eaa9170c 100644
--- a/arch/arm64/crypto/aes-ce.S
+++ b/arch/arm64/crypto/aes-ce.S
@@ -25,33 +25,28 @@
 	.endm
 
 	/* preload all round keys */
-	.macro		load_round_keys, rounds, rk
-	cmp		\rounds, #12
-	blo		2222f		/* 128 bits */
-	beq		1111f		/* 192 bits */
-	ld1		{v17.4s-v18.4s}, [\rk], #32
-1111:	ld1		{v19.4s-v20.4s}, [\rk], #32
-2222:	ld1		{v21.4s-v24.4s}, [\rk], #64
-	ld1		{v25.4s-v28.4s}, [\rk], #64
-	ld1		{v29.4s-v31.4s}, [\rk]
+	.macro		load_round_keys, rk, nr, tmp
+	add		\tmp, \rk, \nr, sxtw #4
+	sub		\tmp, \tmp, #160
+	ld1		{v17.4s-v20.4s}, [\rk]
+	ld1		{v21.4s-v24.4s}, [\tmp], #64
+	ld1		{v25.4s-v28.4s}, [\tmp], #64
+	ld1		{v29.4s-v31.4s}, [\tmp]
 	.endm
 
 	/* prepare for encryption with key in rk[] */
 	.macro		enc_prepare, rounds, rk, temp
-	mov		\temp, \rk
-	load_round_keys	\rounds, \temp
+	load_round_keys	\rk, \rounds, \temp
 	.endm
 
 	/* prepare for encryption (again) but with new key in rk[] */
 	.macro		enc_switch_key, rounds, rk, temp
-	mov		\temp, \rk
-	load_round_keys	\rounds, \temp
+	load_round_keys	\rk, \rounds, \temp
 	.endm
 
 	/* prepare for decryption with key in rk[] */
 	.macro		dec_prepare, rounds, rk, temp
-	mov		\temp, \rk
-	load_round_keys	\rounds, \temp
+	load_round_keys	\rk, \rounds, \temp
 	.endm
 
 	.macro		do_enc_Nx, de, mc, k, i0, i1, i2, i3, i4
@@ -110,14 +105,13 @@
 
 	/* up to 5 interleaved blocks */
 	.macro		do_block_Nx, enc, rounds, i0, i1, i2, i3, i4
-	cmp		\rounds, #12
-	blo		2222f		/* 128 bits */
-	beq		1111f		/* 192 bits */
+	tbz		\rounds, #2, .L\@	/* 128 bits */
 	round_Nx	\enc, v17, \i0, \i1, \i2, \i3, \i4
 	round_Nx	\enc, v18, \i0, \i1, \i2, \i3, \i4
-1111:	round_Nx	\enc, v19, \i0, \i1, \i2, \i3, \i4
+	tbz		\rounds, #1, .L\@	/* 192 bits */
+	round_Nx	\enc, v19, \i0, \i1, \i2, \i3, \i4
 	round_Nx	\enc, v20, \i0, \i1, \i2, \i3, \i4
-2222:	.irp		key, v21, v22, v23, v24, v25, v26, v27, v28, v29
+.L\@:	.irp		key, v21, v22, v23, v24, v25, v26, v27, v28, v29
 	round_Nx	\enc, \key, \i0, \i1, \i2, \i3, \i4
 	.endr
 	fin_round_Nx	\enc, v30, v31, \i0, \i1, \i2, \i3, \i4
diff --git a/arch/arm64/crypto/aes-neon.S b/arch/arm64/crypto/aes-neon.S
index 9de7fbc797af..3a8961b6ea51 100644
--- a/arch/arm64/crypto/aes-neon.S
+++ b/arch/arm64/crypto/aes-neon.S
@@ -99,16 +99,16 @@
 	ld1		{v15.4s}, [\rk]
 	add		\rkp, \rk, #16
 	mov		\i, \rounds
-1111:	eor		\in\().16b, \in\().16b, v15.16b		/* ^round key */
+.La\@:	eor		\in\().16b, \in\().16b, v15.16b		/* ^round key */
 	movi		v15.16b, #0x40
 	tbl		\in\().16b, {\in\().16b}, v13.16b	/* ShiftRows */
 	sub_bytes	\in
-	subs		\i, \i, #1
+	sub		\i, \i, #1
 	ld1		{v15.4s}, [\rkp], #16
-	beq		2222f
+	cbz		\i, .Lb\@
 	mix_columns	\in, \enc
-	b		1111b
-2222:	eor		\in\().16b, \in\().16b, v15.16b		/* ^round key */
+	b		.La\@
+.Lb\@:	eor		\in\().16b, \in\().16b, v15.16b		/* ^round key */
 	.endm
 
 	.macro		encrypt_block, in, rounds, rk, rkp, i
@@ -206,7 +206,7 @@
 	ld1		{v15.4s}, [\rk]
 	add		\rkp, \rk, #16
 	mov		\i, \rounds
-1111:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
+.La\@:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
 	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
 	eor		\in2\().16b, \in2\().16b, v15.16b	/* ^round key */
 	eor		\in3\().16b, \in3\().16b, v15.16b	/* ^round key */
@@ -216,13 +216,13 @@
 	tbl		\in2\().16b, {\in2\().16b}, v13.16b	/* ShiftRows */
 	tbl		\in3\().16b, {\in3\().16b}, v13.16b	/* ShiftRows */
 	sub_bytes_4x	\in0, \in1, \in2, \in3
-	subs		\i, \i, #1
+	sub		\i, \i, #1
 	ld1		{v15.4s}, [\rkp], #16
-	beq		2222f
+	cbz		\i, .Lb\@
 	mix_columns_2x	\in0, \in1, \enc
 	mix_columns_2x	\in2, \in3, \enc
-	b		1111b
-2222:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
+	b		.La\@
+.Lb\@:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
 	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
 	eor		\in2\().16b, \in2\().16b, v15.16b	/* ^round key */
 	eor		\in3\().16b, \in3\().16b, v15.16b	/* ^round key */
-- 
2.44.0.683.g7961c838ac-goog


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: [PATCH] crypto: arm64/aes-ce - Simplify round key load sequence
  2024-04-15 13:04 [PATCH] crypto: arm64/aes-ce - Simplify round key load sequence Ard Biesheuvel
@ 2024-04-17  6:14 ` Eric Biggers
  2024-04-26  9:30 ` Herbert Xu
  1 sibling, 0 replies; 3+ messages in thread
From: Eric Biggers @ 2024-04-17  6:14 UTC (permalink / raw)
  To: Ard Biesheuvel; +Cc: linux-crypto, herbert, Ard Biesheuvel

On Mon, Apr 15, 2024 at 03:04:26PM +0200, Ard Biesheuvel wrote:
> From: Ard Biesheuvel <ardb@kernel.org>
> 
> Tweak the round key logic so that they can be loaded using a single
> branchless sequence using overlapping loads. This is shorter and
> simpler, and puts the conditional branches based on the key size further
> apart, which might benefit microarchitectures that cannot record taken
> branches at every instruction. For these branches, use test-bit-branch
> instructions that don't clobber the condition flags.
> 
> Note that none of this has any impact on performance, positive or
> otherwise (and the branch prediction benefit would only benefit AES-192
> which nobody uses). It does make for nicer code, though.
> 
> While at it, use \@ to generate the labels inside the macros, which is
> more robust than using fixed numbers, which could clash inadvertently.
> Also, bring aes-neon.S in line with these changes, including the switch
> to test-and-branch instructions, to avoid surprises in the future when
> we might start relying on the condition flags being preserved in the
> chaining mode wrappers in aes-modes.S
> 
> Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
> ---
>  arch/arm64/crypto/aes-ce.S   | 34 ++++++++++++++--------------------
>  arch/arm64/crypto/aes-neon.S | 20 ++++++++++----------
>  2 files changed, 24 insertions(+), 30 deletions(-)

Reviewed-by: Eric Biggers <ebiggers@google.com>

- Eric

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH] crypto: arm64/aes-ce - Simplify round key load sequence
  2024-04-15 13:04 [PATCH] crypto: arm64/aes-ce - Simplify round key load sequence Ard Biesheuvel
  2024-04-17  6:14 ` Eric Biggers
@ 2024-04-26  9:30 ` Herbert Xu
  1 sibling, 0 replies; 3+ messages in thread
From: Herbert Xu @ 2024-04-26  9:30 UTC (permalink / raw)
  To: Ard Biesheuvel; +Cc: linux-crypto, ebiggers, Ard Biesheuvel

On Mon, Apr 15, 2024 at 03:04:26PM +0200, Ard Biesheuvel wrote:
> From: Ard Biesheuvel <ardb@kernel.org>
> 
> Tweak the round key logic so that they can be loaded using a single
> branchless sequence using overlapping loads. This is shorter and
> simpler, and puts the conditional branches based on the key size further
> apart, which might benefit microarchitectures that cannot record taken
> branches at every instruction. For these branches, use test-bit-branch
> instructions that don't clobber the condition flags.
> 
> Note that none of this has any impact on performance, positive or
> otherwise (and the branch prediction benefit would only benefit AES-192
> which nobody uses). It does make for nicer code, though.
> 
> While at it, use \@ to generate the labels inside the macros, which is
> more robust than using fixed numbers, which could clash inadvertently.
> Also, bring aes-neon.S in line with these changes, including the switch
> to test-and-branch instructions, to avoid surprises in the future when
> we might start relying on the condition flags being preserved in the
> chaining mode wrappers in aes-modes.S
> 
> Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
> ---
>  arch/arm64/crypto/aes-ce.S   | 34 ++++++++++++++--------------------
>  arch/arm64/crypto/aes-neon.S | 20 ++++++++++----------
>  2 files changed, 24 insertions(+), 30 deletions(-)

Patch applied.  Thanks.
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2024-04-26  9:29 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-04-15 13:04 [PATCH] crypto: arm64/aes-ce - Simplify round key load sequence Ard Biesheuvel
2024-04-17  6:14 ` Eric Biggers
2024-04-26  9:30 ` Herbert Xu

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.