From: Eric Biggers <ebiggers@kernel.org>
To: linux-crypto@vger.kernel.org
Cc: x86@kernel.org, linux-kernel@vger.kernel.org,
"Chang S . Bae" <chang.seok.bae@intel.com>
Subject: [PATCH 3/3] crypto: x86/aes-xts - optimize size of instructions operating on lengths
Date: Fri, 12 Apr 2024 20:17:28 -0700 [thread overview]
Message-ID: <20240413031728.159495-4-ebiggers@kernel.org> (raw)
In-Reply-To: <20240413031728.159495-1-ebiggers@kernel.org>
From: Eric Biggers <ebiggers@google.com>
x86_64 has the "interesting" property that the instruction size is
generally a bit shorter for instructions that operate on the 32-bit (or
less) part of registers, or registers that are in the original set of 8.
This patch adjusts the AES-XTS code to take advantage of that property
by changing the LEN parameter from size_t to unsigned int (which is all
that's needed and is what the non-AVX implementation uses) and using the
%eax register for KEYLEN.
This decreases the size of aes-xts-avx-x86_64.o by 1.2%.
Note that changing the kmovq to kmovd was going to be needed anyway to
make the AVX10/256 code really work on CPUs that don't support 512-bit
vectors (since the AVX10 spec says that 64-bit opmask instructions will
only be supported on processors that support 512-bit vectors).
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
arch/x86/crypto/aes-xts-avx-x86_64.S | 40 +++++++++++++++-------------
arch/x86/crypto/aesni-intel_glue.c | 18 ++++++-------
2 files changed, 30 insertions(+), 28 deletions(-)
diff --git a/arch/x86/crypto/aes-xts-avx-x86_64.S b/arch/x86/crypto/aes-xts-avx-x86_64.S
index 802d3b90d337..48f97b79f7a9 100644
--- a/arch/x86/crypto/aes-xts-avx-x86_64.S
+++ b/arch/x86/crypto/aes-xts-avx-x86_64.S
@@ -83,18 +83,20 @@
// Function parameters
.set KEY, %rdi // Initially points to crypto_aes_ctx, then is
// advanced to point to 7th-from-last round key
.set SRC, %rsi // Pointer to next source data
.set DST, %rdx // Pointer to next destination data
-.set LEN, %rcx // Remaining length in bytes
+.set LEN, %ecx // Remaining length in bytes
+.set LEN8, %cl
+.set LEN64, %rcx
.set TWEAK, %r8 // Pointer to next tweak
-// %r9 holds the AES key length in bytes.
-.set KEYLEN, %r9d
-.set KEYLEN64, %r9
+// %rax holds the AES key length in bytes.
+.set KEYLEN, %eax
+.set KEYLEN64, %rax
-// %rax and %r10-r11 are available as temporaries.
+// %r9-r11 are available as temporaries.
.macro _define_Vi i
.if VL == 16
.set V\i, %xmm\i
.elseif VL == 32
@@ -563,13 +565,13 @@
// When decrypting a message whose length isn't a multiple of the AES
// block length, exclude the last full block from the main loop by
// subtracting 16 from LEN. This is needed because ciphertext stealing
// decryption uses the last two tweaks in reverse order. We'll handle
// the last full block and the partial block specially at the end.
- lea -16(LEN), %rax
- test $15, LEN
- cmovnz %rax, LEN
+ lea -16(LEN), %eax
+ test $15, LEN8
+ cmovnz %eax, LEN
.endif
// Load the AES key length: 16 (AES-128), 24 (AES-192), or 32 (AES-256).
movl 480(KEY), KEYLEN
@@ -648,11 +650,11 @@
jge .Lmain_loop\@
// Check for the uncommon case where the data length isn't a multiple of
// 4*VL. Handle it out-of-line in order to optimize for the common
// case. In the common case, just fall through to the ret.
- test $4*VL-1, LEN
+ test $4*VL-1, LEN8
jnz .Lhandle_remainder\@
.Ldone\@:
// Store the next tweak back to *TWEAK to support continuation calls.
vmovdqu TWEAK0_XMM, (TWEAK)
.if VL > 16
@@ -716,39 +718,39 @@
_aes_crypt \enc, _XMM, TWEAK1_XMM, %xmm0
.endif
.if USE_AVX10
// Create a mask that has the first LEN bits set.
- mov $-1, %rax
- bzhi LEN, %rax, %rax
- kmovq %rax, %k1
+ mov $-1, %r9d
+ bzhi LEN, %r9d, %r9d
+ kmovd %r9d, %k1
// Swap the first LEN bytes of the en/decryption of the last full block
// with the partial block. Note that to support in-place en/decryption,
// the load from the src partial block must happen before the store to
// the dst partial block.
vmovdqa %xmm0, %xmm1
vmovdqu8 16(SRC), %xmm0{%k1}
vmovdqu8 %xmm1, 16(DST){%k1}
.else
- lea .Lcts_permute_table(%rip), %rax
+ lea .Lcts_permute_table(%rip), %r9
// Load the src partial block, left-aligned. Note that to support
// in-place en/decryption, this must happen before the store to the dst
// partial block.
- vmovdqu (SRC, LEN, 1), %xmm1
+ vmovdqu (SRC, LEN64, 1), %xmm1
// Shift the first LEN bytes of the en/decryption of the last full block
// to the end of a register, then store it to DST+LEN. This stores the
// dst partial block. It also writes to the second part of the dst last
// full block, but that part is overwritten later.
- vpshufb (%rax, LEN, 1), %xmm0, %xmm2
- vmovdqu %xmm2, (DST, LEN, 1)
+ vpshufb (%r9, LEN64, 1), %xmm0, %xmm2
+ vmovdqu %xmm2, (DST, LEN64, 1)
// Make xmm3 contain [16-LEN,16-LEN+1,...,14,15,0x80,0x80,...].
- sub LEN, %rax
- vmovdqu 32(%rax), %xmm3
+ sub LEN64, %r9
+ vmovdqu 32(%r9), %xmm3
// Shift the src partial block to the beginning of its register.
vpshufb %xmm3, %xmm1, %xmm1
// Do a blend to generate the src partial block followed by the second
@@ -793,11 +795,11 @@ SYM_FUNC_END(aes_xts_encrypt_iv)
// Below are the actual AES-XTS encryption and decryption functions,
// instantiated from the above macro. They all have the following prototype:
//
// void (*xts_asm_func)(const struct crypto_aes_ctx *key,
-// const u8 *src, u8 *dst, size_t len,
+// const u8 *src, u8 *dst, unsigned int len,
// u8 tweak[AES_BLOCK_SIZE]);
//
// |key| is the data key. |tweak| contains the next tweak; the encryption of
// the original IV with the tweak key was already done. This function supports
// incremental computation, but |len| must always be >= 16 (AES_BLOCK_SIZE), and
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index e7d21000cb05..110b3282a1f2 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -897,11 +897,11 @@ static int xts_setkey_aesni(struct crypto_skcipher *tfm, const u8 *key,
}
typedef void (*xts_encrypt_iv_func)(const struct crypto_aes_ctx *tweak_key,
u8 iv[AES_BLOCK_SIZE]);
typedef void (*xts_crypt_func)(const struct crypto_aes_ctx *key,
- const u8 *src, u8 *dst, size_t len,
+ const u8 *src, u8 *dst, unsigned int len,
u8 tweak[AES_BLOCK_SIZE]);
/* This handles cases where the source and/or destination span pages. */
static noinline int
xts_crypt_slowpath(struct skcipher_request *req, xts_crypt_func crypt_func)
@@ -1019,18 +1019,18 @@ static void aesni_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key,
{
aesni_enc(tweak_key, iv, iv);
}
static void aesni_xts_encrypt(const struct crypto_aes_ctx *key,
- const u8 *src, u8 *dst, size_t len,
+ const u8 *src, u8 *dst, unsigned int len,
u8 tweak[AES_BLOCK_SIZE])
{
aesni_xts_enc(key, dst, src, len, tweak);
}
static void aesni_xts_decrypt(const struct crypto_aes_ctx *key,
- const u8 *src, u8 *dst, size_t len,
+ const u8 *src, u8 *dst, unsigned int len,
u8 tweak[AES_BLOCK_SIZE])
{
aesni_xts_dec(key, dst, src, len, tweak);
}
@@ -1183,16 +1183,16 @@ static struct simd_skcipher_alg *aesni_simd_xctr;
asmlinkage void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key,
u8 iv[AES_BLOCK_SIZE]);
#define DEFINE_XTS_ALG(suffix, driver_name, priority) \
\
-asmlinkage void aes_xts_encrypt_##suffix(const struct crypto_aes_ctx *key, \
- const u8 *src, u8 *dst, size_t len, \
- u8 tweak[AES_BLOCK_SIZE]); \
-asmlinkage void aes_xts_decrypt_##suffix(const struct crypto_aes_ctx *key, \
- const u8 *src, u8 *dst, size_t len, \
- u8 tweak[AES_BLOCK_SIZE]); \
+asmlinkage void \
+aes_xts_encrypt_##suffix(const struct crypto_aes_ctx *key, const u8 *src, \
+ u8 *dst, unsigned int len, u8 tweak[AES_BLOCK_SIZE]); \
+asmlinkage void \
+aes_xts_decrypt_##suffix(const struct crypto_aes_ctx *key, const u8 *src, \
+ u8 *dst, unsigned int len, u8 tweak[AES_BLOCK_SIZE]); \
\
static int xts_encrypt_##suffix(struct skcipher_request *req) \
{ \
return xts_crypt(req, aes_xts_encrypt_iv, aes_xts_encrypt_##suffix); \
} \
--
2.44.0
next prev parent reply other threads:[~2024-04-13 3:21 UTC|newest]
Thread overview: 6+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-04-13 3:17 [PATCH 0/3] crypto: x86/aes-xts - additional tuning Eric Biggers
2024-04-13 3:17 ` [PATCH 1/3] crypto: x86/aes-xts - handle AES-128 and AES-192 more efficiently Eric Biggers
2024-04-13 3:17 ` [PATCH 2/3] crypto: x86/aes-xts - eliminate a few more instructions Eric Biggers
2024-04-13 3:17 ` Eric Biggers [this message]
2024-04-15 12:11 ` [PATCH 0/3] crypto: x86/aes-xts - additional tuning Ard Biesheuvel
2024-04-19 11:04 ` Herbert Xu
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20240413031728.159495-4-ebiggers@kernel.org \
--to=ebiggers@kernel.org \
--cc=chang.seok.bae@intel.com \
--cc=linux-crypto@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=x86@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.