All of lore.kernel.org
 help / color / mirror / Atom feed
From: Eric Biggers <ebiggers@kernel.org>
To: linux-crypto@vger.kernel.org
Cc: linux-kernel@vger.kernel.org, Ard Biesheuvel <ardb@kernel.org>,
	"Jason A . Donenfeld" <Jason@zx2c4.com>,
	Herbert Xu <herbert@gondor.apana.org.au>,
	linux-arm-kernel@lists.infradead.org,
	linuxppc-dev@lists.ozlabs.org, linux-riscv@lists.infradead.org,
	linux-s390@vger.kernel.org, sparclinux@vger.kernel.org,
	x86@kernel.org, Holger Dengler <dengler@linux.ibm.com>,
	Harald Freudenberger <freude@linux.ibm.com>,
	Eric Biggers <ebiggers@kernel.org>
Subject: [PATCH 17/36] lib/crypto: x86/aes: Add AES-NI optimization
Date: Sun,  4 Jan 2026 21:12:50 -0800	[thread overview]
Message-ID: <20260105051311.1607207-18-ebiggers@kernel.org> (raw)
In-Reply-To: <20260105051311.1607207-1-ebiggers@kernel.org>

Optimize the AES library with x86 AES-NI instructions.

The relevant existing assembly functions, aesni_set_key(), aesni_enc(),
and aesni_dec(), are a bit difficult to extract into the library:

- They're coupled to the code for the AES modes.
- They operate on struct crypto_aes_ctx.  The AES library now uses
  different structs.
- They assume the key is 16-byte aligned.  The AES library only
  *prefers* 16-byte alignment; it doesn't require it.

Moreover, they're not all that great in the first place:

- They use unrolled loops, which isn't a great choice on x86.
- They use the 'aeskeygenassist' instruction, which is unnecessary, is
  slow on Intel CPUs, and forces the loop to be unrolled.
- They have special code for AES-192 key expansion, despite that being
  kind of useless.  AES-128 and AES-256 are the ones used in practice.

These are small functions anyway.

Therefore, I opted to just write replacements of these functions for the
library.  They address all the above issues.

Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
 lib/crypto/Kconfig         |   1 +
 lib/crypto/Makefile        |   1 +
 lib/crypto/x86/aes-aesni.S | 261 +++++++++++++++++++++++++++++++++++++
 lib/crypto/x86/aes.h       |  85 ++++++++++++
 4 files changed, 348 insertions(+)
 create mode 100644 lib/crypto/x86/aes-aesni.S
 create mode 100644 lib/crypto/x86/aes.h

diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig
index 222887c04240..e3ee31217988 100644
--- a/lib/crypto/Kconfig
+++ b/lib/crypto/Kconfig
@@ -19,10 +19,11 @@ config CRYPTO_LIB_AES_ARCH
 	default y if PPC && (SPE || (PPC64 && VSX))
 	default y if RISCV && 64BIT && TOOLCHAIN_HAS_VECTOR_CRYPTO && \
 		     RISCV_EFFICIENT_VECTOR_UNALIGNED_ACCESS
 	default y if S390
 	default y if SPARC64
+	default y if X86
 
 config CRYPTO_LIB_AESCFB
 	tristate
 	select CRYPTO_LIB_AES
 	select CRYPTO_LIB_UTILS
diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile
index 761d52d91f92..725eef05b758 100644
--- a/lib/crypto/Makefile
+++ b/lib/crypto/Makefile
@@ -50,10 +50,11 @@ OBJECT_FILES_NON_STANDARD_powerpc/aesp8-ppc.o := y
 endif # !CONFIG_SPE
 endif # CONFIG_PPC
 
 libaes-$(CONFIG_RISCV) += riscv/aes-riscv64-zvkned.o
 libaes-$(CONFIG_SPARC) += sparc/aes_asm.o
+libaes-$(CONFIG_X86) += x86/aes-aesni.o
 endif # CONFIG_CRYPTO_LIB_AES_ARCH
 
 ################################################################################
 
 obj-$(CONFIG_CRYPTO_LIB_AESCFB)			+= libaescfb.o
diff --git a/lib/crypto/x86/aes-aesni.S b/lib/crypto/x86/aes-aesni.S
new file mode 100644
index 000000000000..b8c3e104a3be
--- /dev/null
+++ b/lib/crypto/x86/aes-aesni.S
@@ -0,0 +1,261 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+//
+// AES block cipher using AES-NI instructions
+//
+// Copyright 2026 Google LLC
+//
+// The code in this file supports 32-bit and 64-bit CPUs, and it doesn't require
+// AVX.  It does use up to SSE4.1, which all CPUs with AES-NI have.
+#include <linux/linkage.h>
+
+.section .rodata
+#ifdef __x86_64__
+#define RODATA(label)	label(%rip)
+#else
+#define RODATA(label)	label
+#endif
+
+	// A mask for pshufb that extracts the last dword, rotates it right by 8
+	// bits, and copies the result to all four dwords.
+.p2align 4
+.Lmask:
+	.byte	13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12
+
+	// The AES round constants, used during key expansion (16-byte aligned)
+.Lrcon:
+	.long	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36
+
+.text
+
+// Transform four dwords [a0, a1, a2, a3] in \a into
+// [a0, a0^a1, a0^a1^a2, a0^a1^a2^a3].  \tmp is a temporary xmm register.
+//
+// Note: this could be done in four instructions, shufps + pxor + shufps + pxor,
+// if the temporary register were zero-initialized ahead of time.  We instead do
+// it in an easier-to-understand way that doesn't require zero-initialization
+// and avoids the unusual shufps instruction.  movdqa is usually "free" anyway.
+.macro	_prefix_sum	a, tmp
+	movdqa		\a, \tmp	// [a0, a1, a2, a3]
+	pslldq		$4, \a		// [0, a0, a1, a2]
+	pxor		\tmp, \a	// [a0, a0^a1, a1^a2, a2^a3]
+	movdqa		\a, \tmp	// Save these partial sums
+	pslldq		$8, \a		// [0, 0, a0, a0^a1]
+	pxor		\tmp, \a	// [a0, a0^a1, a0^a1^a2, a0^a1^a2^a3]
+.endm
+
+.macro	_gen_round_key	a, b
+	// Compute four copies of rcon[i] ^ SubBytes(ror32(w, 8)), where w is
+	// the last dword of the previous round key (given in \b).
+	//
+	// 'aesenclast src, dst' does dst = src XOR SubBytes(ShiftRows(dst)).
+	// It is used here solely for the SubBytes and the XOR.  The ShiftRows
+	// is a no-op because all four columns are the same here.
+	//
+	// Don't use the 'aeskeygenassist' instruction, since:
+	//  - On most Intel CPUs it is microcoded, making it have a much higher
+	//    latency and use more execution ports than 'aesenclast'.
+	//  - It cannot be used in a loop, since it requires an immediate.
+	//  - It doesn't do much more than 'aesenclast' in the first place.
+	movdqa		\b, %xmm2	// Work on a copy of \b
+	pshufb		MASK, %xmm2	// Four copies of ror32(w, 8)
+	aesenclast	RCON, %xmm2	// xmm2 = RCON ^ SubBytes(xmm2)
+
+	// XOR in the prefix sum of the four dwords of \a, which is the
+	// previous round key (AES-128) or the first round key in the previous
+	// pair of round keys (AES-256).  The result is the next round key.
+	_prefix_sum	\a, tmp=%xmm3
+	pxor		%xmm2, \a	// \a = next round key
+
+	// Store the next round key to memory.  Also leave it in \a.
+	movdqu		\a, (RNDKEYS)
+.endm
+
+.macro	_aes_expandkey_aesni	is_aes128
+#ifdef __x86_64__
+	// Arguments
+	.set	RNDKEYS,	%rdi	// 1st arg: rndkeys[]
+	.set	INV_RNDKEYS,	%rsi	// 2nd arg: inv_rndkeys, may be NULL
+	.set	IN_KEY,		%rdx	// 3rd arg: in_key
+
+	// Other local variables
+	.set	RCON_PTR,	%rcx	// Next entry of .Lrcon (AES-128)
+	.set	COUNTER,	%eax	// Loop iteration counter
+#else
+	// Arguments, assuming -mregparm=3
+	.set	RNDKEYS,	%eax	// 1st arg: rndkeys[]
+	.set	INV_RNDKEYS,	%edx	// 2nd arg: inv_rndkeys, may be NULL
+	.set	IN_KEY,		%ecx	// 3rd arg: in_key
+
+	// Other local variables
+	.set	RCON_PTR,	%ebx	// Next entry of .Lrcon (AES-128)
+	.set	COUNTER,	%esi	// Loop iteration counter
+#endif
+	.set	RCON,		%xmm6	// Round constant in all 4 dwords
+	.set	MASK,		%xmm7	// pshufb mask from .Lmask
+
+#ifdef __i386__
+	push		%ebx		// Callee-saved; used as RCON_PTR
+	push		%esi		// Callee-saved; used as COUNTER
+#endif
+
+.if \is_aes128
+	// AES-128: the first round key is simply a copy of the raw key.
+	movdqu		(IN_KEY), %xmm0
+	movdqu		%xmm0, (RNDKEYS)
+.else
+	// AES-256: the first two round keys are simply a copy of the raw key.
+	movdqu		(IN_KEY), %xmm0
+	movdqu		%xmm0, (RNDKEYS)
+	movdqu		16(IN_KEY), %xmm1
+	movdqu		%xmm1, 16(RNDKEYS)
+	add		$32, RNDKEYS
+.endif
+
+	// Generate the remaining round keys.
+	movdqa		RODATA(.Lmask), MASK
+.if \is_aes128
+	lea		RODATA(.Lrcon), RCON_PTR
+	mov		$10, COUNTER	// 10 more round keys to generate
+.Lgen_next_aes128_round_key:
+	add		$16, RNDKEYS
+	movd		(RCON_PTR), RCON	// Load the next round constant
+	pshufd		$0x00, RCON, RCON	// Copy it to all four dwords
+	add		$4, RCON_PTR
+	_gen_round_key	%xmm0, %xmm0
+	dec		COUNTER
+	jnz		.Lgen_next_aes128_round_key
+.else
+	// AES-256: only the first 7 round constants are needed, so instead of
+	// loading each one from memory, just start by loading [1, 1, 1, 1] and
+	// then generate the rest by doubling.
+	pshufd		$0x00, RODATA(.Lrcon), RCON	// .Lrcon is 16-byte aligned
+	pxor		%xmm5, %xmm5	// All-zeroes
+	mov		$7, COUNTER	// One per round constant
+.Lgen_next_aes256_round_key_pair:
+	// Generate the next AES-256 round key: either the first of a pair of
+	// two, or the last one.
+	_gen_round_key	%xmm0, %xmm1
+
+	dec		COUNTER
+	jz		.Lgen_aes256_round_keys_done
+
+	// Generate the second AES-256 round key of the pair.  Compared to the
+	// first, there's no rotation and no XOR of a round constant.
+	pshufd		$0xff, %xmm0, %xmm2	// Get four copies of last dword
+	aesenclast	%xmm5, %xmm2		// Just does SubBytes
+	_prefix_sum	%xmm1, tmp=%xmm3
+	pxor		%xmm2, %xmm1
+	movdqu		%xmm1, 16(RNDKEYS)
+	add		$32, RNDKEYS
+	paddd		RCON, RCON		// RCON <<= 1
+	jmp		.Lgen_next_aes256_round_key_pair
+.Lgen_aes256_round_keys_done:
+.endif
+
+	// If INV_RNDKEYS is non-NULL, write the round keys for the Equivalent
+	// Inverse Cipher to it.  To do that, reverse the standard round keys,
+	// and apply aesimc (InvMixColumns) to each except the first and last.
+	test		INV_RNDKEYS, INV_RNDKEYS
+	jz		.Ldone\@
+	movdqu		(RNDKEYS), %xmm0	// Last standard round key
+	movdqu		%xmm0, (INV_RNDKEYS)	// => First inverse round key
+.if \is_aes128
+	mov		$9, COUNTER	// Round keys 9 down to 1
+.else
+	mov		$13, COUNTER	// Round keys 13 down to 1
+.endif
+.Lgen_next_inv_round_key\@:
+	sub		$16, RNDKEYS
+	add		$16, INV_RNDKEYS
+	movdqu		(RNDKEYS), %xmm0
+	aesimc		%xmm0, %xmm0
+	movdqu		%xmm0, (INV_RNDKEYS)
+	dec		COUNTER
+	jnz		.Lgen_next_inv_round_key\@
+	movdqu		-16(RNDKEYS), %xmm0	// First standard round key
+	movdqu		%xmm0, 16(INV_RNDKEYS)	// => Last inverse round key
+
+.Ldone\@:
+#ifdef __i386__
+	pop		%esi
+	pop		%ebx
+#endif
+	RET
+.endm
+
+// void aes128_expandkey_aesni(u32 rndkeys[], u32 *inv_rndkeys,
+//			       const u8 in_key[AES_KEYSIZE_128]);
+SYM_FUNC_START(aes128_expandkey_aesni)
+	_aes_expandkey_aesni	1	// is_aes128 = 1
+SYM_FUNC_END(aes128_expandkey_aesni)
+
+// void aes256_expandkey_aesni(u32 rndkeys[], u32 *inv_rndkeys,
+//			       const u8 in_key[AES_KEYSIZE_256]);
+SYM_FUNC_START(aes256_expandkey_aesni)
+	_aes_expandkey_aesni	0	// is_aes128 = 0, i.e. AES-256
+SYM_FUNC_END(aes256_expandkey_aesni)
+
+.macro	_aes_crypt_aesni	enc
+#ifdef __x86_64__
+	.set	RNDKEYS,	%rdi	// 1st arg: round keys
+	.set	NROUNDS,	%esi	// 2nd arg: nrounds
+	.set	OUT,		%rdx	// 3rd arg: out
+	.set	IN,		%rcx	// 4th arg: in
+#else
+	// Assuming -mregparm=3
+	.set	RNDKEYS,	%eax	// 1st arg: round keys
+	.set	NROUNDS,	%edx	// 2nd arg: nrounds
+	.set	OUT,		%ecx	// 3rd arg: out
+	.set	IN,		%ebx	// Passed on stack
+#endif
+
+#ifdef __i386__
+	push		%ebx		// Callee-saved; used as IN
+	mov		8(%esp), %ebx	// 4th arg, above saved ebx + retaddr
+#endif
+
+	// Zero-th round
+	movdqu		(IN), %xmm0
+	movdqu		(RNDKEYS), %xmm1
+	pxor		%xmm1, %xmm0	// XOR the block with round key 0
+
+	// Normal rounds
+	add		$16, RNDKEYS
+	dec		NROUNDS		// nrounds - 1 normal rounds
+.Lnext_round\@:
+	movdqu		(RNDKEYS), %xmm1
+.if \enc
+	aesenc		%xmm1, %xmm0
+.else
+	aesdec		%xmm1, %xmm0
+.endif
+	add		$16, RNDKEYS
+	dec		NROUNDS
+	jne		.Lnext_round\@
+
+	// Last round
+	movdqu		(RNDKEYS), %xmm1
+.if \enc
+	aesenclast	%xmm1, %xmm0
+.else
+	aesdeclast	%xmm1, %xmm0
+.endif
+	movdqu		%xmm0, (OUT)
+
+#ifdef __i386__
+	pop		%ebx
+#endif
+	RET
+.endm
+
+// void aes_encrypt_aesni(const u32 rndkeys[], int nrounds,
+//			  u8 out[AES_BLOCK_SIZE], const u8 in[AES_BLOCK_SIZE]);
+SYM_FUNC_START(aes_encrypt_aesni)
+	_aes_crypt_aesni	1	// enc = 1: aesenc / aesenclast
+SYM_FUNC_END(aes_encrypt_aesni)
+
+// void aes_decrypt_aesni(const u32 inv_rndkeys[], int nrounds,
+//			  u8 out[AES_BLOCK_SIZE], const u8 in[AES_BLOCK_SIZE]);
+SYM_FUNC_START(aes_decrypt_aesni)
+	_aes_crypt_aesni	0	// enc = 0: aesdec / aesdeclast
+SYM_FUNC_END(aes_decrypt_aesni)
diff --git a/lib/crypto/x86/aes.h b/lib/crypto/x86/aes.h
new file mode 100644
index 000000000000..b047dee94f57
--- /dev/null
+++ b/lib/crypto/x86/aes.h
@@ -0,0 +1,85 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * AES block cipher using AES-NI instructions
+ *
+ * Copyright 2026 Google LLC
+ */
+
+#include <asm/fpu/api.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_aes);
+
+void aes128_expandkey_aesni(u32 rndkeys[], u32 *inv_rndkeys,
+			    const u8 in_key[AES_KEYSIZE_128]);
+void aes256_expandkey_aesni(u32 rndkeys[], u32 *inv_rndkeys,
+			    const u8 in_key[AES_KEYSIZE_256]);
+void aes_encrypt_aesni(const u32 rndkeys[], int nrounds,
+		       u8 out[AES_BLOCK_SIZE], const u8 in[AES_BLOCK_SIZE]);
+void aes_decrypt_aesni(const u32 inv_rndkeys[], int nrounds,
+		       u8 out[AES_BLOCK_SIZE], const u8 in[AES_BLOCK_SIZE]);
+
+/*
+ * Expand an AES key using AES-NI if supported and usable or generic code
+ * otherwise.  The expanded key format is compatible between the two cases.  The
+ * outputs are @k->rndkeys (required) and @inv_k->inv_rndkeys (optional).
+ *
+ * We could just always use the generic key expansion code.  AES key expansion
+ * is usually less performance-critical than AES en/decryption.  However,
+ * there's still *some* value in speed here, as well as in non-key-dependent
+ * execution time which AES-NI provides.  So, do use AES-NI to expand AES-128
+ * and AES-256 keys.  (Don't bother with AES-192, as it's almost never used.)
+ */
+static void aes_preparekey_arch(union aes_enckey_arch *k,
+				union aes_invkey_arch *inv_k,
+				const u8 *in_key, int key_len, int nrounds)
+{
+	u32 *rndkeys = k->rndkeys;
+	u32 *inv_rndkeys = inv_k ? inv_k->inv_rndkeys : NULL;
+
+	if (static_branch_likely(&have_aes) && key_len != AES_KEYSIZE_192 &&
+	    irq_fpu_usable()) {
+		kernel_fpu_begin();	/* asm uses XMM registers */
+		if (key_len == AES_KEYSIZE_128)
+			aes128_expandkey_aesni(rndkeys, inv_rndkeys, in_key);
+		else	/* presumably AES_KEYSIZE_256; 192 excluded above */
+			aes256_expandkey_aesni(rndkeys, inv_rndkeys, in_key);
+		kernel_fpu_end();
+	} else {	/* No AES-NI, AES-192, or FPU unusable here */
+		aes_expandkey_generic(rndkeys, inv_rndkeys, in_key, key_len);
+	}
+}
+
+static void aes_encrypt_arch(const struct aes_enckey *key,
+			     u8 out[AES_BLOCK_SIZE],
+			     const u8 in[AES_BLOCK_SIZE])
+{
+	if (static_branch_likely(&have_aes) && irq_fpu_usable()) {
+		kernel_fpu_begin();	/* asm uses XMM registers */
+		aes_encrypt_aesni(key->k.rndkeys, key->nrounds, out, in);
+		kernel_fpu_end();
+	} else {	/* No AES-NI, or FPU unusable here */
+		aes_encrypt_generic(key->k.rndkeys, key->nrounds, out, in);
+	}
+}
+
+static void aes_decrypt_arch(const struct aes_key *key,
+			     u8 out[AES_BLOCK_SIZE],
+			     const u8 in[AES_BLOCK_SIZE])
+{
+	if (static_branch_likely(&have_aes) && irq_fpu_usable()) {
+		kernel_fpu_begin();	/* asm uses XMM registers */
+		aes_decrypt_aesni(key->inv_k.inv_rndkeys, key->nrounds,
+				  out, in);
+		kernel_fpu_end();
+	} else {	/* No AES-NI, or FPU unusable here */
+		aes_decrypt_generic(key->inv_k.inv_rndkeys, key->nrounds,
+				    out, in);
+	}
+}
+
+#define aes_mod_init_arch aes_mod_init_arch
+static void aes_mod_init_arch(void)
+{
+	if (boot_cpu_has(X86_FEATURE_AES))	/* CPU has AES-NI */
+		static_branch_enable(&have_aes);	/* Use the asm paths */
+}
-- 
2.52.0



WARNING: multiple messages have this Message-ID (diff)
From: Eric Biggers <ebiggers@kernel.org>
To: linux-crypto@vger.kernel.org
Cc: linux-kernel@vger.kernel.org, Ard Biesheuvel <ardb@kernel.org>,
	"Jason A . Donenfeld" <Jason@zx2c4.com>,
	Herbert Xu <herbert@gondor.apana.org.au>,
	linux-arm-kernel@lists.infradead.org,
	linuxppc-dev@lists.ozlabs.org, linux-riscv@lists.infradead.org,
	linux-s390@vger.kernel.org, sparclinux@vger.kernel.org,
	x86@kernel.org, Holger Dengler <dengler@linux.ibm.com>,
	Harald Freudenberger <freude@linux.ibm.com>,
	Eric Biggers <ebiggers@kernel.org>
Subject: [PATCH 17/36] lib/crypto: x86/aes: Add AES-NI optimization
Date: Sun,  4 Jan 2026 21:12:50 -0800	[thread overview]
Message-ID: <20260105051311.1607207-18-ebiggers@kernel.org> (raw)
In-Reply-To: <20260105051311.1607207-1-ebiggers@kernel.org>

Optimize the AES library with x86 AES-NI instructions.

The relevant existing assembly functions, aesni_set_key(), aesni_enc(),
and aesni_dec(), are a bit difficult to extract into the library:

- They're coupled to the code for the AES modes.
- They operate on struct crypto_aes_ctx.  The AES library now uses
  different structs.
- They assume the key is 16-byte aligned.  The AES library only
  *prefers* 16-byte alignment; it doesn't require it.

Moreover, they're not all that great in the first place:

- They use unrolled loops, which isn't a great choice on x86.
- They use the 'aeskeygenassist' instruction, which is unnecessary, is
  slow on Intel CPUs, and forces the loop to be unrolled.
- They have special code for AES-192 key expansion, despite that being
  kind of useless.  AES-128 and AES-256 are the ones used in practice.

These are small functions anyway.

Therefore, I opted to just write replacements of these functions for the
library.  They address all the above issues.

Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
 lib/crypto/Kconfig         |   1 +
 lib/crypto/Makefile        |   1 +
 lib/crypto/x86/aes-aesni.S | 261 +++++++++++++++++++++++++++++++++++++
 lib/crypto/x86/aes.h       |  85 ++++++++++++
 4 files changed, 348 insertions(+)
 create mode 100644 lib/crypto/x86/aes-aesni.S
 create mode 100644 lib/crypto/x86/aes.h

diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig
index 222887c04240..e3ee31217988 100644
--- a/lib/crypto/Kconfig
+++ b/lib/crypto/Kconfig
@@ -19,10 +19,11 @@ config CRYPTO_LIB_AES_ARCH
 	default y if PPC && (SPE || (PPC64 && VSX))
 	default y if RISCV && 64BIT && TOOLCHAIN_HAS_VECTOR_CRYPTO && \
 		     RISCV_EFFICIENT_VECTOR_UNALIGNED_ACCESS
 	default y if S390
 	default y if SPARC64
+	default y if X86
 
 config CRYPTO_LIB_AESCFB
 	tristate
 	select CRYPTO_LIB_AES
 	select CRYPTO_LIB_UTILS
diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile
index 761d52d91f92..725eef05b758 100644
--- a/lib/crypto/Makefile
+++ b/lib/crypto/Makefile
@@ -50,10 +50,11 @@ OBJECT_FILES_NON_STANDARD_powerpc/aesp8-ppc.o := y
 endif # !CONFIG_SPE
 endif # CONFIG_PPC
 
 libaes-$(CONFIG_RISCV) += riscv/aes-riscv64-zvkned.o
 libaes-$(CONFIG_SPARC) += sparc/aes_asm.o
+libaes-$(CONFIG_X86) += x86/aes-aesni.o
 endif # CONFIG_CRYPTO_LIB_AES_ARCH
 
 ################################################################################
 
 obj-$(CONFIG_CRYPTO_LIB_AESCFB)			+= libaescfb.o
diff --git a/lib/crypto/x86/aes-aesni.S b/lib/crypto/x86/aes-aesni.S
new file mode 100644
index 000000000000..b8c3e104a3be
--- /dev/null
+++ b/lib/crypto/x86/aes-aesni.S
@@ -0,0 +1,261 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+//
+// AES block cipher using AES-NI instructions
+//
+// Copyright 2026 Google LLC
+//
+// The code in this file supports 32-bit and 64-bit CPUs, and it doesn't require
+// AVX.  It does use up to SSE4.1, which all CPUs with AES-NI have.
+#include <linux/linkage.h>
+
+.section .rodata
+#ifdef __x86_64__
+#define RODATA(label)	label(%rip)
+#else
+#define RODATA(label)	label
+#endif
+
+	// A mask for pshufb that extracts the last dword, rotates it right by 8
+	// bits, and copies the result to all four dwords.
+.p2align 4
+.Lmask:
+	.byte	13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12
+
+	// The AES round constants, used during key expansion (16-byte aligned)
+.Lrcon:
+	.long	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36
+
+.text
+
+// Transform four dwords [a0, a1, a2, a3] in \a into
+// [a0, a0^a1, a0^a1^a2, a0^a1^a2^a3].  \tmp is a temporary xmm register.
+//
+// Note: this could be done in four instructions, shufps + pxor + shufps + pxor,
+// if the temporary register were zero-initialized ahead of time.  We instead do
+// it in an easier-to-understand way that doesn't require zero-initialization
+// and avoids the unusual shufps instruction.  movdqa is usually "free" anyway.
+.macro	_prefix_sum	a, tmp
+	movdqa		\a, \tmp	// [a0, a1, a2, a3]
+	pslldq		$4, \a		// [0, a0, a1, a2]
+	pxor		\tmp, \a	// [a0, a0^a1, a1^a2, a2^a3]
+	movdqa		\a, \tmp	// Save these partial sums
+	pslldq		$8, \a		// [0, 0, a0, a0^a1]
+	pxor		\tmp, \a	// [a0, a0^a1, a0^a1^a2, a0^a1^a2^a3]
+.endm
+
+.macro	_gen_round_key	a, b
+	// Compute four copies of rcon[i] ^ SubBytes(ror32(w, 8)), where w is
+	// the last dword of the previous round key (given in \b).
+	//
+	// 'aesenclast src, dst' does dst = src XOR SubBytes(ShiftRows(dst)).
+	// It is used here solely for the SubBytes and the XOR.  The ShiftRows
+	// is a no-op because all four columns are the same here.
+	//
+	// Don't use the 'aeskeygenassist' instruction, since:
+	//  - On most Intel CPUs it is microcoded, making it have a much higher
+	//    latency and use more execution ports than 'aesenclast'.
+	//  - It cannot be used in a loop, since it requires an immediate.
+	//  - It doesn't do much more than 'aesenclast' in the first place.
+	movdqa		\b, %xmm2	// Work on a copy of \b
+	pshufb		MASK, %xmm2	// Four copies of ror32(w, 8)
+	aesenclast	RCON, %xmm2	// xmm2 = RCON ^ SubBytes(xmm2)
+
+	// XOR in the prefix sum of the four dwords of \a, which is the
+	// previous round key (AES-128) or the first round key in the previous
+	// pair of round keys (AES-256).  The result is the next round key.
+	_prefix_sum	\a, tmp=%xmm3
+	pxor		%xmm2, \a	// \a = next round key
+
+	// Store the next round key to memory.  Also leave it in \a.
+	movdqu		\a, (RNDKEYS)
+.endm
+
+.macro	_aes_expandkey_aesni	is_aes128
+#ifdef __x86_64__
+	// Arguments
+	.set	RNDKEYS,	%rdi	// 1st arg: rndkeys[]
+	.set	INV_RNDKEYS,	%rsi	// 2nd arg: inv_rndkeys, may be NULL
+	.set	IN_KEY,		%rdx	// 3rd arg: in_key
+
+	// Other local variables
+	.set	RCON_PTR,	%rcx	// Next entry of .Lrcon (AES-128)
+	.set	COUNTER,	%eax	// Loop iteration counter
+#else
+	// Arguments, assuming -mregparm=3
+	.set	RNDKEYS,	%eax	// 1st arg: rndkeys[]
+	.set	INV_RNDKEYS,	%edx	// 2nd arg: inv_rndkeys, may be NULL
+	.set	IN_KEY,		%ecx	// 3rd arg: in_key
+
+	// Other local variables
+	.set	RCON_PTR,	%ebx	// Next entry of .Lrcon (AES-128)
+	.set	COUNTER,	%esi	// Loop iteration counter
+#endif
+	.set	RCON,		%xmm6	// Round constant in all 4 dwords
+	.set	MASK,		%xmm7	// pshufb mask from .Lmask
+
+#ifdef __i386__
+	push		%ebx		// Callee-saved; used as RCON_PTR
+	push		%esi		// Callee-saved; used as COUNTER
+#endif
+
+.if \is_aes128
+	// AES-128: the first round key is simply a copy of the raw key.
+	movdqu		(IN_KEY), %xmm0
+	movdqu		%xmm0, (RNDKEYS)
+.else
+	// AES-256: the first two round keys are simply a copy of the raw key.
+	movdqu		(IN_KEY), %xmm0
+	movdqu		%xmm0, (RNDKEYS)
+	movdqu		16(IN_KEY), %xmm1
+	movdqu		%xmm1, 16(RNDKEYS)
+	add		$32, RNDKEYS
+.endif
+
+	// Generate the remaining round keys.
+	movdqa		RODATA(.Lmask), MASK
+.if \is_aes128
+	lea		RODATA(.Lrcon), RCON_PTR
+	mov		$10, COUNTER	// 10 more round keys to generate
+.Lgen_next_aes128_round_key:
+	add		$16, RNDKEYS
+	movd		(RCON_PTR), RCON	// Load the next round constant
+	pshufd		$0x00, RCON, RCON	// Copy it to all four dwords
+	add		$4, RCON_PTR
+	_gen_round_key	%xmm0, %xmm0
+	dec		COUNTER
+	jnz		.Lgen_next_aes128_round_key
+.else
+	// AES-256: only the first 7 round constants are needed, so instead of
+	// loading each one from memory, just start by loading [1, 1, 1, 1] and
+	// then generate the rest by doubling.
+	pshufd		$0x00, RODATA(.Lrcon), RCON	// .Lrcon is 16-byte aligned
+	pxor		%xmm5, %xmm5	// All-zeroes
+	mov		$7, COUNTER	// One per round constant
+.Lgen_next_aes256_round_key_pair:
+	// Generate the next AES-256 round key: either the first of a pair of
+	// two, or the last one.
+	_gen_round_key	%xmm0, %xmm1
+
+	dec		COUNTER
+	jz		.Lgen_aes256_round_keys_done
+
+	// Generate the second AES-256 round key of the pair.  Compared to the
+	// first, there's no rotation and no XOR of a round constant.
+	pshufd		$0xff, %xmm0, %xmm2	// Get four copies of last dword
+	aesenclast	%xmm5, %xmm2		// Just does SubBytes
+	_prefix_sum	%xmm1, tmp=%xmm3
+	pxor		%xmm2, %xmm1
+	movdqu		%xmm1, 16(RNDKEYS)
+	add		$32, RNDKEYS
+	paddd		RCON, RCON		// RCON <<= 1
+	jmp		.Lgen_next_aes256_round_key_pair
+.Lgen_aes256_round_keys_done:
+.endif
+
+	// If INV_RNDKEYS is non-NULL, write the round keys for the Equivalent
+	// Inverse Cipher to it.  To do that, reverse the standard round keys,
+	// and apply aesimc (InvMixColumns) to each except the first and last.
+	test		INV_RNDKEYS, INV_RNDKEYS
+	jz		.Ldone\@
+	movdqu		(RNDKEYS), %xmm0	// Last standard round key
+	movdqu		%xmm0, (INV_RNDKEYS)	// => First inverse round key
+.if \is_aes128
+	mov		$9, COUNTER	// Round keys 9 down to 1
+.else
+	mov		$13, COUNTER	// Round keys 13 down to 1
+.endif
+.Lgen_next_inv_round_key\@:
+	sub		$16, RNDKEYS
+	add		$16, INV_RNDKEYS
+	movdqu		(RNDKEYS), %xmm0
+	aesimc		%xmm0, %xmm0
+	movdqu		%xmm0, (INV_RNDKEYS)
+	dec		COUNTER
+	jnz		.Lgen_next_inv_round_key\@
+	movdqu		-16(RNDKEYS), %xmm0	// First standard round key
+	movdqu		%xmm0, 16(INV_RNDKEYS)	// => Last inverse round key
+
+.Ldone\@:
+#ifdef __i386__
+	pop		%esi
+	pop		%ebx
+#endif
+	RET
+.endm
+
+// void aes128_expandkey_aesni(u32 rndkeys[], u32 *inv_rndkeys,
+//			       const u8 in_key[AES_KEYSIZE_128]);
+SYM_FUNC_START(aes128_expandkey_aesni)
+	_aes_expandkey_aesni	1	// is_aes128 = 1
+SYM_FUNC_END(aes128_expandkey_aesni)
+
+// void aes256_expandkey_aesni(u32 rndkeys[], u32 *inv_rndkeys,
+//			       const u8 in_key[AES_KEYSIZE_256]);
+SYM_FUNC_START(aes256_expandkey_aesni)
+	_aes_expandkey_aesni	0	// is_aes128 = 0, i.e. AES-256
+SYM_FUNC_END(aes256_expandkey_aesni)
+
+.macro	_aes_crypt_aesni	enc
+#ifdef __x86_64__
+	.set	RNDKEYS,	%rdi	// 1st arg: round keys
+	.set	NROUNDS,	%esi	// 2nd arg: nrounds
+	.set	OUT,		%rdx	// 3rd arg: out
+	.set	IN,		%rcx	// 4th arg: in
+#else
+	// Assuming -mregparm=3
+	.set	RNDKEYS,	%eax	// 1st arg: round keys
+	.set	NROUNDS,	%edx	// 2nd arg: nrounds
+	.set	OUT,		%ecx	// 3rd arg: out
+	.set	IN,		%ebx	// Passed on stack
+#endif
+
+#ifdef __i386__
+	push		%ebx		// Callee-saved; used as IN
+	mov		8(%esp), %ebx	// 4th arg, above saved ebx + retaddr
+#endif
+
+	// Zero-th round
+	movdqu		(IN), %xmm0
+	movdqu		(RNDKEYS), %xmm1
+	pxor		%xmm1, %xmm0	// XOR the block with round key 0
+
+	// Normal rounds
+	add		$16, RNDKEYS
+	dec		NROUNDS		// nrounds - 1 normal rounds
+.Lnext_round\@:
+	movdqu		(RNDKEYS), %xmm1
+.if \enc
+	aesenc		%xmm1, %xmm0
+.else
+	aesdec		%xmm1, %xmm0
+.endif
+	add		$16, RNDKEYS
+	dec		NROUNDS
+	jne		.Lnext_round\@
+
+	// Last round
+	movdqu		(RNDKEYS), %xmm1
+.if \enc
+	aesenclast	%xmm1, %xmm0
+.else
+	aesdeclast	%xmm1, %xmm0
+.endif
+	movdqu		%xmm0, (OUT)
+
+#ifdef __i386__
+	pop		%ebx
+#endif
+	RET
+.endm
+
+// void aes_encrypt_aesni(const u32 rndkeys[], int nrounds,
+//			  u8 out[AES_BLOCK_SIZE], const u8 in[AES_BLOCK_SIZE]);
+SYM_FUNC_START(aes_encrypt_aesni)
+	_aes_crypt_aesni	1	// enc = 1: aesenc / aesenclast
+SYM_FUNC_END(aes_encrypt_aesni)
+
+// void aes_decrypt_aesni(const u32 inv_rndkeys[], int nrounds,
+//			  u8 out[AES_BLOCK_SIZE], const u8 in[AES_BLOCK_SIZE]);
+SYM_FUNC_START(aes_decrypt_aesni)
+	_aes_crypt_aesni	0	// enc = 0: aesdec / aesdeclast
+SYM_FUNC_END(aes_decrypt_aesni)
diff --git a/lib/crypto/x86/aes.h b/lib/crypto/x86/aes.h
new file mode 100644
index 000000000000..b047dee94f57
--- /dev/null
+++ b/lib/crypto/x86/aes.h
@@ -0,0 +1,85 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * AES block cipher using AES-NI instructions
+ *
+ * Copyright 2026 Google LLC
+ */
+
+#include <asm/fpu/api.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_aes);
+
+void aes128_expandkey_aesni(u32 rndkeys[], u32 *inv_rndkeys,
+			    const u8 in_key[AES_KEYSIZE_128]);
+void aes256_expandkey_aesni(u32 rndkeys[], u32 *inv_rndkeys,
+			    const u8 in_key[AES_KEYSIZE_256]);
+void aes_encrypt_aesni(const u32 rndkeys[], int nrounds,
+		       u8 out[AES_BLOCK_SIZE], const u8 in[AES_BLOCK_SIZE]);
+void aes_decrypt_aesni(const u32 inv_rndkeys[], int nrounds,
+		       u8 out[AES_BLOCK_SIZE], const u8 in[AES_BLOCK_SIZE]);
+
+/*
+ * Expand an AES key using AES-NI if supported and usable or generic code
+ * otherwise.  The expanded key format is compatible between the two cases.  The
+ * outputs are @k->rndkeys (required) and @inv_k->inv_rndkeys (optional).
+ *
+ * We could just always use the generic key expansion code.  AES key expansion
+ * is usually less performance-critical than AES en/decryption.  However,
+ * there's still *some* value in speed here, as well as in non-key-dependent
+ * execution time which AES-NI provides.  So, do use AES-NI to expand AES-128
+ * and AES-256 keys.  (Don't bother with AES-192, as it's almost never used.)
+ */
+static void aes_preparekey_arch(union aes_enckey_arch *k,
+				union aes_invkey_arch *inv_k,
+				const u8 *in_key, int key_len, int nrounds)
+{
+	u32 *rndkeys = k->rndkeys;
+	u32 *inv_rndkeys = inv_k ? inv_k->inv_rndkeys : NULL;
+
+	if (static_branch_likely(&have_aes) && key_len != AES_KEYSIZE_192 &&
+	    irq_fpu_usable()) {
+		kernel_fpu_begin();	/* asm uses XMM registers */
+		if (key_len == AES_KEYSIZE_128)
+			aes128_expandkey_aesni(rndkeys, inv_rndkeys, in_key);
+		else	/* presumably AES_KEYSIZE_256; 192 excluded above */
+			aes256_expandkey_aesni(rndkeys, inv_rndkeys, in_key);
+		kernel_fpu_end();
+	} else {	/* No AES-NI, AES-192, or FPU unusable here */
+		aes_expandkey_generic(rndkeys, inv_rndkeys, in_key, key_len);
+	}
+}
+
+static void aes_encrypt_arch(const struct aes_enckey *key,
+			     u8 out[AES_BLOCK_SIZE],
+			     const u8 in[AES_BLOCK_SIZE])
+{
+	if (static_branch_likely(&have_aes) && irq_fpu_usable()) {
+		kernel_fpu_begin();	/* asm uses XMM registers */
+		aes_encrypt_aesni(key->k.rndkeys, key->nrounds, out, in);
+		kernel_fpu_end();
+	} else {	/* No AES-NI, or FPU unusable here */
+		aes_encrypt_generic(key->k.rndkeys, key->nrounds, out, in);
+	}
+}
+
+static void aes_decrypt_arch(const struct aes_key *key,
+			     u8 out[AES_BLOCK_SIZE],
+			     const u8 in[AES_BLOCK_SIZE])
+{
+	if (static_branch_likely(&have_aes) && irq_fpu_usable()) {
+		kernel_fpu_begin();	/* asm uses XMM registers */
+		aes_decrypt_aesni(key->inv_k.inv_rndkeys, key->nrounds,
+				  out, in);
+		kernel_fpu_end();
+	} else {	/* No AES-NI, or FPU unusable here */
+		aes_decrypt_generic(key->inv_k.inv_rndkeys, key->nrounds,
+				    out, in);
+	}
+}
+
+#define aes_mod_init_arch aes_mod_init_arch
+static void aes_mod_init_arch(void)
+{
+	if (boot_cpu_has(X86_FEATURE_AES))	/* CPU has AES-NI */
+		static_branch_enable(&have_aes);	/* Use the asm paths */
+}
-- 
2.52.0


_______________________________________________
linux-riscv mailing list
linux-riscv@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-riscv

  parent reply	other threads:[~2026-01-05  5:16 UTC|newest]

Thread overview: 102+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-01-05  5:12 [PATCH 00/36] AES library improvements Eric Biggers
2026-01-05  5:12 ` Eric Biggers
2026-01-05  5:12 ` [PATCH 01/36] crypto: powerpc/aes - Rename struct aes_key Eric Biggers
2026-01-05  5:12   ` Eric Biggers
2026-01-05  5:12 ` [PATCH 02/36] lib/crypto: aes: Introduce improved AES library Eric Biggers
2026-01-05  5:12   ` Eric Biggers
2026-01-05  7:47   ` Qingfang Deng
2026-01-05  7:47     ` Qingfang Deng
2026-01-06  6:36     ` Eric Biggers
2026-01-06  6:36       ` Eric Biggers
2026-01-05  5:12 ` [PATCH 03/36] crypto: arm/aes-neonbs - Use AES library for single blocks Eric Biggers
2026-01-05  5:12   ` Eric Biggers
2026-01-05  5:12 ` [PATCH 04/36] crypto: arm/aes - Switch to aes_enc_tab[] and aes_dec_tab[] Eric Biggers
2026-01-05  5:12   ` Eric Biggers
2026-01-05  5:12 ` [PATCH 05/36] crypto: arm64/aes " Eric Biggers
2026-01-05  5:12   ` Eric Biggers
2026-01-05  5:12 ` [PATCH 06/36] crypto: arm64/aes - Select CRYPTO_LIB_SHA256 from correct places Eric Biggers
2026-01-05  5:12   ` Eric Biggers
2026-01-05  5:12 ` [PATCH 07/36] crypto: aegis - Switch from crypto_ft_tab[] to aes_enc_tab[] Eric Biggers
2026-01-05  5:12   ` Eric Biggers
2026-01-05  5:12 ` [PATCH 08/36] crypto: aes - Remove aes-fixed-time / CONFIG_CRYPTO_AES_TI Eric Biggers
2026-01-05  5:12   ` Eric Biggers
2026-01-05  5:12 ` [PATCH 09/36] crypto: aes - Replace aes-generic with wrapper around lib Eric Biggers
2026-01-05  5:12   ` Eric Biggers
2026-01-05  5:12 ` [PATCH 10/36] lib/crypto: arm/aes: Migrate optimized code into library Eric Biggers
2026-01-05  5:12   ` Eric Biggers
2026-01-05  5:12 ` [PATCH 11/36] lib/crypto: arm64/aes: " Eric Biggers
2026-01-05  5:12   ` Eric Biggers
2026-01-05  5:12 ` [PATCH 12/36] lib/crypto: powerpc/aes: Migrate SPE " Eric Biggers
2026-01-05  5:12   ` Eric Biggers
2026-01-05  5:12 ` [PATCH 13/36] lib/crypto: powerpc/aes: Migrate POWER8 " Eric Biggers
2026-01-05  5:12   ` Eric Biggers
2026-01-05  5:12 ` [PATCH 14/36] lib/crypto: riscv/aes: Migrate " Eric Biggers
2026-01-05  5:12   ` Eric Biggers
2026-01-05  5:12 ` [PATCH 15/36] lib/crypto: s390/aes: " Eric Biggers
2026-01-05  5:12   ` Eric Biggers
2026-01-07  7:41   ` Holger Dengler
2026-01-07  7:41     ` Holger Dengler
2026-01-07 20:34     ` Eric Biggers
2026-01-07 20:34       ` Eric Biggers
2026-01-14 12:12       ` Holger Dengler
2026-01-14 12:12         ` Holger Dengler
2026-01-05  5:12 ` [PATCH 16/36] lib/crypto: sparc/aes: " Eric Biggers
2026-01-05  5:12   ` Eric Biggers
2026-01-05  5:12 ` Eric Biggers [this message]
2026-01-05  5:12   ` [PATCH 17/36] lib/crypto: x86/aes: Add AES-NI optimization Eric Biggers
2026-01-05  5:12 ` [PATCH 18/36] crypto: x86/aes - Remove the superseded AES-NI crypto_cipher Eric Biggers
2026-01-05  5:12   ` Eric Biggers
2026-01-05  5:12 ` [PATCH 19/36] Bluetooth: SMP: Use new AES library API Eric Biggers
2026-01-05  5:12   ` Eric Biggers
2026-01-05 15:40   ` Andrew Cooper
2026-01-05 15:40     ` Andrew Cooper
2026-01-05 19:05     ` David Laight
2026-01-05 19:05       ` David Laight
2026-01-06  6:58       ` Eric Biggers
2026-01-06  6:58         ` Eric Biggers
2026-01-05  5:12 ` [PATCH 20/36] chelsio: " Eric Biggers
2026-01-05  5:12   ` Eric Biggers
2026-01-05  5:12 ` [PATCH 21/36] net: phy: mscc: macsec: " Eric Biggers
2026-01-05  5:12   ` Eric Biggers
2026-01-05  5:12 ` [PATCH 22/36] staging: rtl8723bs: core: " Eric Biggers
2026-01-05  5:12   ` Eric Biggers
2026-01-05  5:12 ` [PATCH 23/36] crypto: arm/ghash - " Eric Biggers
2026-01-05  5:12   ` Eric Biggers
2026-01-05  5:12 ` [PATCH 24/36] crypto: arm64/ghash " Eric Biggers
2026-01-05  5:12   ` Eric Biggers
2026-01-05  5:12 ` [PATCH 25/36] crypto: x86/aes-gcm " Eric Biggers
2026-01-05  5:12   ` Eric Biggers
2026-01-05  5:12 ` [PATCH 26/36] crypto: ccp " Eric Biggers
2026-01-05  5:12   ` Eric Biggers
2026-01-05  5:13 ` [PATCH 27/36] crypto: chelsio " Eric Biggers
2026-01-05  5:13   ` Eric Biggers
2026-01-05  5:13 ` [PATCH 28/36] crypto: crypto4xx " Eric Biggers
2026-01-05  5:13   ` Eric Biggers
2026-01-05  5:13 ` [PATCH 29/36] crypto: drbg " Eric Biggers
2026-01-05  5:13   ` Eric Biggers
2026-01-05  5:13 ` [PATCH 30/36] crypto: inside-secure " Eric Biggers
2026-01-05  5:13   ` Eric Biggers
2026-01-07  3:48   ` Qingfang Deng
2026-01-07  3:48     ` Qingfang Deng
2026-01-07  4:01     ` Eric Biggers
2026-01-07  4:01       ` Eric Biggers
2026-01-05  5:13 ` [PATCH 31/36] crypto: omap " Eric Biggers
2026-01-05  5:13   ` Eric Biggers
2026-01-05  5:13 ` [PATCH 32/36] lib/crypto: aescfb: " Eric Biggers
2026-01-05  5:13   ` Eric Biggers
2026-01-05  5:13 ` [PATCH 33/36] lib/crypto: aesgcm: " Eric Biggers
2026-01-05  5:13   ` Eric Biggers
2026-01-05  5:13 ` [PATCH 34/36] lib/crypto: aes: Remove old AES en/decryption functions Eric Biggers
2026-01-05  5:13   ` Eric Biggers
2026-01-05  5:13 ` [PATCH 35/36] lib/crypto: aes: Drop "_new" suffix from " Eric Biggers
2026-01-05  5:13   ` Eric Biggers
2026-01-05  5:13 ` [PATCH 36/36] lib/crypto: aes: Drop 'volatile' from aes_sbox and aes_inv_sbox Eric Biggers
2026-01-05  5:13   ` Eric Biggers
2026-01-08 11:32 ` [PATCH 00/36] AES library improvements Ard Biesheuvel
2026-01-08 11:32   ` Ard Biesheuvel
2026-01-08 20:26   ` Eric Biggers
2026-01-08 20:26     ` Eric Biggers
2026-01-09  1:27     ` Eric Biggers
2026-01-09  1:27       ` Eric Biggers
2026-01-09  9:08       ` Ard Biesheuvel
2026-01-09  9:08         ` Ard Biesheuvel

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260105051311.1607207-18-ebiggers@kernel.org \
    --to=ebiggers@kernel.org \
    --cc=Jason@zx2c4.com \
    --cc=ardb@kernel.org \
    --cc=dengler@linux.ibm.com \
    --cc=freude@linux.ibm.com \
    --cc=herbert@gondor.apana.org.au \
    --cc=linux-arm-kernel@lists.infradead.org \
    --cc=linux-crypto@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-riscv@lists.infradead.org \
    --cc=linux-s390@vger.kernel.org \
    --cc=linuxppc-dev@lists.ozlabs.org \
    --cc=sparclinux@vger.kernel.org \
    --cc=x86@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.