* [PATCH 00/10] AEGIS x86 assembly tuning
From: Eric Biggers @ 2024-10-07 1:24 UTC
To: linux-crypto; +Cc: x86, Ondrej Mosnacek
This series cleans up the AES-NI optimized implementation of AEGIS-128.
Performance is improved by 1-5% depending on the input lengths. Binary
code size is reduced by about 20% (measuring glue + assembly combined),
and source code length is reduced by about 150 lines.
The first patch also fixes a bug that could theoretically cause
incorrect behavior but seemingly was not being encountered in practice.
Note: future optimizations for AEGIS-128 could involve adding AVX512 /
AVX10 optimized assembly code. Unfortunately, due to the way AEGIS-128
is specified, its level of parallelism is limited, and it can't really
take advantage of vector lengths greater than 128 bits. So this would
probably provide only another modest improvement, coming mostly from
being able to use the ternary logic instructions; see the sketch below.
Eric Biggers (10):
crypto: x86/aegis128 - access 32-bit arguments as 32-bit
crypto: x86/aegis128 - remove no-op init and exit functions
crypto: x86/aegis128 - eliminate some indirect calls
crypto: x86/aegis128 - don't bother with special code for aligned data
crypto: x86/aegis128 - optimize length block preparation using SSE4.1
crypto: x86/aegis128 - improve assembly function prototypes
crypto: x86/aegis128 - optimize partial block handling using SSE4.1
crypto: x86/aegis128 - take advantage of block-aligned len
crypto: x86/aegis128 - remove unneeded FRAME_BEGIN and FRAME_END
crypto: x86/aegis128 - remove unneeded RETs
arch/x86/crypto/Kconfig | 4 +-
arch/x86/crypto/aegis128-aesni-asm.S | 532 ++++++++++----------------
arch/x86/crypto/aegis128-aesni-glue.c | 145 ++++---
3 files changed, 261 insertions(+), 420 deletions(-)
base-commit: 9852d85ec9d492ebef56dc5f229416c925758edc
--
2.46.2
* [PATCH 01/10] crypto: x86/aegis128 - access 32-bit arguments as 32-bit
From: Eric Biggers @ 2024-10-07 1:24 UTC
To: linux-crypto; +Cc: x86, Ondrej Mosnacek, stable
From: Eric Biggers <ebiggers@google.com>
Fix the AEGIS assembly code to access 'unsigned int' arguments as 32-bit
values instead of 64-bit, since the upper bits of the corresponding
64-bit registers are not guaranteed to be zero.
Note: there haven't been any reports of this bug actually causing
incorrect behavior. Neither gcc nor clang guarantees zero-extension to
64 bits, but it is likely to happen in practice anyway, because writing
to a 32-bit register implicitly zeroes the upper 32 bits of the
corresponding 64-bit register.
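To illustrate (a hypothetical instruction sequence, not code from the
file): with an 'unsigned int' argument arriving in %esi, the x86_64
calling convention leaves bits 63:32 of %rsi undefined, so the two ways
of copying it are not equivalent:

	mov	%rsi, %r8	/* buggy: may copy stale bits 63:32 */
	mov	%esi, %r8d	/* correct: 32-bit write zeroes bits 63:32 of %r8 */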
Fixes: 1d373d4e8e15 ("crypto: x86 - Add optimized AEGIS implementations")
Cc: stable@vger.kernel.org
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
arch/x86/crypto/aegis128-aesni-asm.S | 29 ++++++++++++++--------------
1 file changed, 15 insertions(+), 14 deletions(-)
diff --git a/arch/x86/crypto/aegis128-aesni-asm.S b/arch/x86/crypto/aegis128-aesni-asm.S
index ad7f4c891625..2de859173940 100644
--- a/arch/x86/crypto/aegis128-aesni-asm.S
+++ b/arch/x86/crypto/aegis128-aesni-asm.S
@@ -19,11 +19,11 @@
#define MSG %xmm5
#define T0 %xmm6
#define T1 %xmm7
#define STATEP %rdi
-#define LEN %rsi
+#define LEN %esi
#define SRC %rdx
#define DST %rcx
.section .rodata.cst16.aegis128_const, "aM", @progbits, 32
.align 16
@@ -74,50 +74,50 @@
*/
SYM_FUNC_START_LOCAL(__load_partial)
xor %r9d, %r9d
pxor MSG, MSG
- mov LEN, %r8
+ mov LEN, %r8d
and $0x1, %r8
jz .Lld_partial_1
- mov LEN, %r8
+ mov LEN, %r8d
and $0x1E, %r8
add SRC, %r8
mov (%r8), %r9b
.Lld_partial_1:
- mov LEN, %r8
+ mov LEN, %r8d
and $0x2, %r8
jz .Lld_partial_2
- mov LEN, %r8
+ mov LEN, %r8d
and $0x1C, %r8
add SRC, %r8
shl $0x10, %r9
mov (%r8), %r9w
.Lld_partial_2:
- mov LEN, %r8
+ mov LEN, %r8d
and $0x4, %r8
jz .Lld_partial_4
- mov LEN, %r8
+ mov LEN, %r8d
and $0x18, %r8
add SRC, %r8
shl $32, %r9
mov (%r8), %r8d
xor %r8, %r9
.Lld_partial_4:
movq %r9, MSG
- mov LEN, %r8
+ mov LEN, %r8d
and $0x8, %r8
jz .Lld_partial_8
- mov LEN, %r8
+ mov LEN, %r8d
and $0x10, %r8
add SRC, %r8
pslldq $8, MSG
movq (%r8), T0
pxor T0, MSG
@@ -137,11 +137,11 @@ SYM_FUNC_END(__load_partial)
* %r8
* %r9
* %r10
*/
SYM_FUNC_START_LOCAL(__store_partial)
- mov LEN, %r8
+ mov LEN, %r8d
mov DST, %r9
movq T0, %r10
cmp $8, %r8
@@ -675,11 +675,11 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec_tail)
movdqa MSG, T0
call __store_partial
/* mask with byte count: */
- movq LEN, T0
+ movd LEN, T0
punpcklbw T0, T0
punpcklbw T0, T0
punpcklbw T0, T0
punpcklbw T0, T0
movdqa .Laegis128_counter(%rip), T1
@@ -700,11 +700,12 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec_tail)
RET
SYM_FUNC_END(crypto_aegis128_aesni_dec_tail)
/*
* void crypto_aegis128_aesni_final(void *state, void *tag_xor,
- * u64 assoclen, u64 cryptlen);
+ * unsigned int assoclen,
+ * unsigned int cryptlen);
*/
SYM_FUNC_START(crypto_aegis128_aesni_final)
FRAME_BEGIN
/* load the state: */
@@ -713,12 +714,12 @@ SYM_FUNC_START(crypto_aegis128_aesni_final)
movdqu 0x20(STATEP), STATE2
movdqu 0x30(STATEP), STATE3
movdqu 0x40(STATEP), STATE4
/* prepare length block: */
- movq %rdx, MSG
- movq %rcx, T0
+ movd %edx, MSG
+ movd %ecx, T0
pslldq $8, T0
pxor T0, MSG
psllq $3, MSG /* multiply by 8 (to get bit count) */
pxor STATE3, MSG
--
2.46.2
* [PATCH 02/10] crypto: x86/aegis128 - remove no-op init and exit functions
From: Eric Biggers @ 2024-10-07 1:24 UTC
To: linux-crypto; +Cc: x86, Ondrej Mosnacek
From: Eric Biggers <ebiggers@google.com>
Don't bother providing empty stubs for the init and exit methods in
struct aead_alg, since they are optional anyway.
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
arch/x86/crypto/aegis128-aesni-glue.c | 11 -----------
1 file changed, 11 deletions(-)
diff --git a/arch/x86/crypto/aegis128-aesni-glue.c b/arch/x86/crypto/aegis128-aesni-glue.c
index 4623189000d8..96586470154e 100644
--- a/arch/x86/crypto/aegis128-aesni-glue.c
+++ b/arch/x86/crypto/aegis128-aesni-glue.c
@@ -225,26 +225,15 @@ static int crypto_aegis128_aesni_decrypt(struct aead_request *req)
crypto_aegis128_aesni_crypt(req, &tag, cryptlen, &OPS);
return crypto_memneq(tag.bytes, zeros.bytes, authsize) ? -EBADMSG : 0;
}
-static int crypto_aegis128_aesni_init_tfm(struct crypto_aead *aead)
-{
- return 0;
-}
-
-static void crypto_aegis128_aesni_exit_tfm(struct crypto_aead *aead)
-{
-}
-
static struct aead_alg crypto_aegis128_aesni_alg = {
.setkey = crypto_aegis128_aesni_setkey,
.setauthsize = crypto_aegis128_aesni_setauthsize,
.encrypt = crypto_aegis128_aesni_encrypt,
.decrypt = crypto_aegis128_aesni_decrypt,
- .init = crypto_aegis128_aesni_init_tfm,
- .exit = crypto_aegis128_aesni_exit_tfm,
.ivsize = AEGIS128_NONCE_SIZE,
.maxauthsize = AEGIS128_MAX_AUTH_SIZE,
.chunksize = AEGIS128_BLOCK_SIZE,
--
2.46.2
* [PATCH 03/10] crypto: x86/aegis128 - eliminate some indirect calls
From: Eric Biggers @ 2024-10-07 1:24 UTC
To: linux-crypto; +Cc: x86, Ondrej Mosnacek
From: Eric Biggers <ebiggers@google.com>
Instead of using a struct of function pointers to select between the
encryption and decryption assembly functions, use a conditional branch
on a bool. Force-inline the glue helpers that take the bool, so that
each caller passes a compile-time constant and the branch is optimized
away rather than actually generated. This improves performance
slightly, since indirect calls are slow. Remove the now-unnecessary CFI
stubs.
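At the call site, the difference is roughly the following (a
hypothetical sketch of the generated code, not taken from the patch;
the struct offset name is made up):

	/* before: indirect call through the ops struct (or through a
	 * retpoline thunk when mitigations are enabled) */
	mov	OPS_CRYPT_BLOCKS(%rbx), %rax
	call	*%rax
	/* after: 'enc' is a compile-time constant in each inlined copy,
	 * so the compiler emits a direct call */
	call	crypto_aegis128_aesni_enc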
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
arch/x86/crypto/aegis128-aesni-asm.S | 9 ++--
arch/x86/crypto/aegis128-aesni-glue.c | 74 +++++++++++++--------------
2 files changed, 40 insertions(+), 43 deletions(-)
diff --git a/arch/x86/crypto/aegis128-aesni-asm.S b/arch/x86/crypto/aegis128-aesni-asm.S
index 2de859173940..1b57558548c7 100644
--- a/arch/x86/crypto/aegis128-aesni-asm.S
+++ b/arch/x86/crypto/aegis128-aesni-asm.S
@@ -5,11 +5,10 @@
* Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
* Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
*/
#include <linux/linkage.h>
-#include <linux/cfi_types.h>
#include <asm/frame.h>
#define STATE0 %xmm0
#define STATE1 %xmm1
#define STATE2 %xmm2
@@ -401,11 +400,11 @@ SYM_FUNC_END(crypto_aegis128_aesni_ad)
/*
* void crypto_aegis128_aesni_enc(void *state, unsigned int length,
* const void *src, void *dst);
*/
-SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc)
+SYM_FUNC_START(crypto_aegis128_aesni_enc)
FRAME_BEGIN
cmp $0x10, LEN
jb .Lenc_out
@@ -498,11 +497,11 @@ SYM_FUNC_END(crypto_aegis128_aesni_enc)
/*
* void crypto_aegis128_aesni_enc_tail(void *state, unsigned int length,
* const void *src, void *dst);
*/
-SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc_tail)
+SYM_FUNC_START(crypto_aegis128_aesni_enc_tail)
FRAME_BEGIN
/* load the state: */
movdqu 0x00(STATEP), STATE0
movdqu 0x10(STATEP), STATE1
@@ -555,11 +554,11 @@ SYM_FUNC_END(crypto_aegis128_aesni_enc_tail)
/*
* void crypto_aegis128_aesni_dec(void *state, unsigned int length,
* const void *src, void *dst);
*/
-SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec)
+SYM_FUNC_START(crypto_aegis128_aesni_dec)
FRAME_BEGIN
cmp $0x10, LEN
jb .Ldec_out
@@ -652,11 +651,11 @@ SYM_FUNC_END(crypto_aegis128_aesni_dec)
/*
* void crypto_aegis128_aesni_dec_tail(void *state, unsigned int length,
* const void *src, void *dst);
*/
-SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec_tail)
+SYM_FUNC_START(crypto_aegis128_aesni_dec_tail)
FRAME_BEGIN
/* load the state: */
movdqu 0x00(STATEP), STATE0
movdqu 0x10(STATEP), STATE1
diff --git a/arch/x86/crypto/aegis128-aesni-glue.c b/arch/x86/crypto/aegis128-aesni-glue.c
index 96586470154e..deb39cef0be1 100644
--- a/arch/x86/crypto/aegis128-aesni-glue.c
+++ b/arch/x86/crypto/aegis128-aesni-glue.c
@@ -54,20 +54,10 @@ struct aegis_state {
struct aegis_ctx {
struct aegis_block key;
};
-struct aegis_crypt_ops {
- int (*skcipher_walk_init)(struct skcipher_walk *walk,
- struct aead_request *req, bool atomic);
-
- void (*crypt_blocks)(void *state, unsigned int length, const void *src,
- void *dst);
- void (*crypt_tail)(void *state, unsigned int length, const void *src,
- void *dst);
-};
-
static void crypto_aegis128_aesni_process_ad(
struct aegis_state *state, struct scatterlist *sg_src,
unsigned int assoclen)
{
struct scatter_walk walk;
@@ -112,24 +102,41 @@ static void crypto_aegis128_aesni_process_ad(
memset(buf.bytes + pos, 0, AEGIS128_BLOCK_SIZE - pos);
crypto_aegis128_aesni_ad(state, AEGIS128_BLOCK_SIZE, buf.bytes);
}
}
-static void crypto_aegis128_aesni_process_crypt(
- struct aegis_state *state, struct skcipher_walk *walk,
- const struct aegis_crypt_ops *ops)
+static __always_inline void
+crypto_aegis128_aesni_process_crypt(struct aegis_state *state,
+ struct skcipher_walk *walk, bool enc)
{
while (walk->nbytes >= AEGIS128_BLOCK_SIZE) {
- ops->crypt_blocks(state,
- round_down(walk->nbytes, AEGIS128_BLOCK_SIZE),
- walk->src.virt.addr, walk->dst.virt.addr);
+ if (enc)
+ crypto_aegis128_aesni_enc(
+ state,
+ round_down(walk->nbytes,
+ AEGIS128_BLOCK_SIZE),
+ walk->src.virt.addr,
+ walk->dst.virt.addr);
+ else
+ crypto_aegis128_aesni_dec(
+ state,
+ round_down(walk->nbytes,
+ AEGIS128_BLOCK_SIZE),
+ walk->src.virt.addr,
+ walk->dst.virt.addr);
skcipher_walk_done(walk, walk->nbytes % AEGIS128_BLOCK_SIZE);
}
if (walk->nbytes) {
- ops->crypt_tail(state, walk->nbytes, walk->src.virt.addr,
- walk->dst.virt.addr);
+ if (enc)
+ crypto_aegis128_aesni_enc_tail(state, walk->nbytes,
+ walk->src.virt.addr,
+ walk->dst.virt.addr);
+ else
+ crypto_aegis128_aesni_dec_tail(state, walk->nbytes,
+ walk->src.virt.addr,
+ walk->dst.virt.addr);
skcipher_walk_done(walk, 0);
}
}
static struct aegis_ctx *crypto_aegis128_aesni_ctx(struct crypto_aead *aead)
@@ -160,71 +167,62 @@ static int crypto_aegis128_aesni_setauthsize(struct crypto_aead *tfm,
if (authsize < AEGIS128_MIN_AUTH_SIZE)
return -EINVAL;
return 0;
}
-static void crypto_aegis128_aesni_crypt(struct aead_request *req,
- struct aegis_block *tag_xor,
- unsigned int cryptlen,
- const struct aegis_crypt_ops *ops)
+static __always_inline void
+crypto_aegis128_aesni_crypt(struct aead_request *req,
+ struct aegis_block *tag_xor,
+ unsigned int cryptlen, bool enc)
{
struct crypto_aead *tfm = crypto_aead_reqtfm(req);
struct aegis_ctx *ctx = crypto_aegis128_aesni_ctx(tfm);
struct skcipher_walk walk;
struct aegis_state state;
- ops->skcipher_walk_init(&walk, req, true);
+ if (enc)
+ skcipher_walk_aead_encrypt(&walk, req, true);
+ else
+ skcipher_walk_aead_decrypt(&walk, req, true);
kernel_fpu_begin();
crypto_aegis128_aesni_init(&state, ctx->key.bytes, req->iv);
crypto_aegis128_aesni_process_ad(&state, req->src, req->assoclen);
- crypto_aegis128_aesni_process_crypt(&state, &walk, ops);
+ crypto_aegis128_aesni_process_crypt(&state, &walk, enc);
crypto_aegis128_aesni_final(&state, tag_xor, req->assoclen, cryptlen);
kernel_fpu_end();
}
static int crypto_aegis128_aesni_encrypt(struct aead_request *req)
{
- static const struct aegis_crypt_ops OPS = {
- .skcipher_walk_init = skcipher_walk_aead_encrypt,
- .crypt_blocks = crypto_aegis128_aesni_enc,
- .crypt_tail = crypto_aegis128_aesni_enc_tail,
- };
-
struct crypto_aead *tfm = crypto_aead_reqtfm(req);
struct aegis_block tag = {};
unsigned int authsize = crypto_aead_authsize(tfm);
unsigned int cryptlen = req->cryptlen;
- crypto_aegis128_aesni_crypt(req, &tag, cryptlen, &OPS);
+ crypto_aegis128_aesni_crypt(req, &tag, cryptlen, true);
scatterwalk_map_and_copy(tag.bytes, req->dst,
req->assoclen + cryptlen, authsize, 1);
return 0;
}
static int crypto_aegis128_aesni_decrypt(struct aead_request *req)
{
static const struct aegis_block zeros = {};
- static const struct aegis_crypt_ops OPS = {
- .skcipher_walk_init = skcipher_walk_aead_decrypt,
- .crypt_blocks = crypto_aegis128_aesni_dec,
- .crypt_tail = crypto_aegis128_aesni_dec_tail,
- };
-
struct crypto_aead *tfm = crypto_aead_reqtfm(req);
struct aegis_block tag;
unsigned int authsize = crypto_aead_authsize(tfm);
unsigned int cryptlen = req->cryptlen - authsize;
scatterwalk_map_and_copy(tag.bytes, req->src,
req->assoclen + cryptlen, authsize, 0);
- crypto_aegis128_aesni_crypt(req, &tag, cryptlen, &OPS);
+ crypto_aegis128_aesni_crypt(req, &tag, cryptlen, false);
return crypto_memneq(tag.bytes, zeros.bytes, authsize) ? -EBADMSG : 0;
}
static struct aead_alg crypto_aegis128_aesni_alg = {
--
2.46.2
* [PATCH 04/10] crypto: x86/aegis128 - don't bother with special code for aligned data
From: Eric Biggers @ 2024-10-07 1:24 UTC
To: linux-crypto; +Cc: x86, Ondrej Mosnacek
From: Eric Biggers <ebiggers@google.com>
Remove the AEGIS assembly code paths that were "optimized" to operate on
16-byte aligned data using movdqa, and instead just use the code paths
that use movdqu and can handle data with any alignment.
This does not reduce performance. movdqa is basically a historical
artifact; on aligned data, movdqu and movdqa have had the same
performance since Intel Nehalem (2008) and AMD Bulldozer (2011). And
code that requires AES-NI cannot run on CPUs older than those anyway.
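For reference, the semantic difference between the two loads
(hypothetical operands, reusing the file's register aliases):

	movdqa	(SRC), MSG	/* faults (#GP) unless SRC is 16-byte aligned */
	movdqu	(SRC), MSG	/* any alignment; equally fast on aligned
				 * data on every CPU that supports AES-NI */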
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
arch/x86/crypto/aegis128-aesni-asm.S | 122 +++++----------------------
1 file changed, 22 insertions(+), 100 deletions(-)
diff --git a/arch/x86/crypto/aegis128-aesni-asm.S b/arch/x86/crypto/aegis128-aesni-asm.S
index 1b57558548c7..5541aca2fd0d 100644
--- a/arch/x86/crypto/aegis128-aesni-asm.S
+++ b/arch/x86/crypto/aegis128-aesni-asm.S
@@ -243,56 +243,12 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad)
movdqu 0x10(STATEP), STATE1
movdqu 0x20(STATEP), STATE2
movdqu 0x30(STATEP), STATE3
movdqu 0x40(STATEP), STATE4
- mov SRC, %r8
- and $0xF, %r8
- jnz .Lad_u_loop
-
-.align 8
-.Lad_a_loop:
- movdqa 0x00(SRC), MSG
- aegis128_update
- pxor MSG, STATE4
- sub $0x10, LEN
- cmp $0x10, LEN
- jl .Lad_out_1
-
- movdqa 0x10(SRC), MSG
- aegis128_update
- pxor MSG, STATE3
- sub $0x10, LEN
- cmp $0x10, LEN
- jl .Lad_out_2
-
- movdqa 0x20(SRC), MSG
- aegis128_update
- pxor MSG, STATE2
- sub $0x10, LEN
- cmp $0x10, LEN
- jl .Lad_out_3
-
- movdqa 0x30(SRC), MSG
- aegis128_update
- pxor MSG, STATE1
- sub $0x10, LEN
- cmp $0x10, LEN
- jl .Lad_out_4
-
- movdqa 0x40(SRC), MSG
- aegis128_update
- pxor MSG, STATE0
- sub $0x10, LEN
- cmp $0x10, LEN
- jl .Lad_out_0
-
- add $0x50, SRC
- jmp .Lad_a_loop
-
.align 8
-.Lad_u_loop:
+.Lad_loop:
movdqu 0x00(SRC), MSG
aegis128_update
pxor MSG, STATE4
sub $0x10, LEN
cmp $0x10, LEN
@@ -325,11 +281,11 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad)
sub $0x10, LEN
cmp $0x10, LEN
jl .Lad_out_0
add $0x50, SRC
- jmp .Lad_u_loop
+ jmp .Lad_loop
/* store the state: */
.Lad_out_0:
movdqu STATE0, 0x00(STATEP)
movdqu STATE1, 0x10(STATEP)
@@ -378,19 +334,19 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad)
.Lad_out:
FRAME_END
RET
SYM_FUNC_END(crypto_aegis128_aesni_ad)
-.macro encrypt_block a s0 s1 s2 s3 s4 i
- movdq\a (\i * 0x10)(SRC), MSG
+.macro encrypt_block s0 s1 s2 s3 s4 i
+ movdqu (\i * 0x10)(SRC), MSG
movdqa MSG, T0
pxor \s1, T0
pxor \s4, T0
movdqa \s2, T1
pand \s3, T1
pxor T1, T0
- movdq\a T0, (\i * 0x10)(DST)
+ movdqu T0, (\i * 0x10)(DST)
aegis128_update
pxor MSG, \s4
sub $0x10, LEN
@@ -413,38 +369,21 @@ SYM_FUNC_START(crypto_aegis128_aesni_enc)
movdqu 0x10(STATEP), STATE1
movdqu 0x20(STATEP), STATE2
movdqu 0x30(STATEP), STATE3
movdqu 0x40(STATEP), STATE4
- mov SRC, %r8
- or DST, %r8
- and $0xF, %r8
- jnz .Lenc_u_loop
-
.align 8
-.Lenc_a_loop:
- encrypt_block a STATE0 STATE1 STATE2 STATE3 STATE4 0
- encrypt_block a STATE4 STATE0 STATE1 STATE2 STATE3 1
- encrypt_block a STATE3 STATE4 STATE0 STATE1 STATE2 2
- encrypt_block a STATE2 STATE3 STATE4 STATE0 STATE1 3
- encrypt_block a STATE1 STATE2 STATE3 STATE4 STATE0 4
+.Lenc_loop:
+ encrypt_block STATE0 STATE1 STATE2 STATE3 STATE4 0
+ encrypt_block STATE4 STATE0 STATE1 STATE2 STATE3 1
+ encrypt_block STATE3 STATE4 STATE0 STATE1 STATE2 2
+ encrypt_block STATE2 STATE3 STATE4 STATE0 STATE1 3
+ encrypt_block STATE1 STATE2 STATE3 STATE4 STATE0 4
add $0x50, SRC
add $0x50, DST
- jmp .Lenc_a_loop
-
-.align 8
-.Lenc_u_loop:
- encrypt_block u STATE0 STATE1 STATE2 STATE3 STATE4 0
- encrypt_block u STATE4 STATE0 STATE1 STATE2 STATE3 1
- encrypt_block u STATE3 STATE4 STATE0 STATE1 STATE2 2
- encrypt_block u STATE2 STATE3 STATE4 STATE0 STATE1 3
- encrypt_block u STATE1 STATE2 STATE3 STATE4 STATE0 4
-
- add $0x50, SRC
- add $0x50, DST
- jmp .Lenc_u_loop
+ jmp .Lenc_loop
/* store the state: */
.Lenc_out_0:
movdqu STATE4, 0x00(STATEP)
movdqu STATE0, 0x10(STATEP)
@@ -533,18 +472,18 @@ SYM_FUNC_START(crypto_aegis128_aesni_enc_tail)
FRAME_END
RET
SYM_FUNC_END(crypto_aegis128_aesni_enc_tail)
-.macro decrypt_block a s0 s1 s2 s3 s4 i
- movdq\a (\i * 0x10)(SRC), MSG
+.macro decrypt_block s0 s1 s2 s3 s4 i
+ movdqu (\i * 0x10)(SRC), MSG
pxor \s1, MSG
pxor \s4, MSG
movdqa \s2, T1
pand \s3, T1
pxor T1, MSG
- movdq\a MSG, (\i * 0x10)(DST)
+ movdqu MSG, (\i * 0x10)(DST)
aegis128_update
pxor MSG, \s4
sub $0x10, LEN
@@ -567,38 +506,21 @@ SYM_FUNC_START(crypto_aegis128_aesni_dec)
movdqu 0x10(STATEP), STATE1
movdqu 0x20(STATEP), STATE2
movdqu 0x30(STATEP), STATE3
movdqu 0x40(STATEP), STATE4
- mov SRC, %r8
- or DST, %r8
- and $0xF, %r8
- jnz .Ldec_u_loop
-
-.align 8
-.Ldec_a_loop:
- decrypt_block a STATE0 STATE1 STATE2 STATE3 STATE4 0
- decrypt_block a STATE4 STATE0 STATE1 STATE2 STATE3 1
- decrypt_block a STATE3 STATE4 STATE0 STATE1 STATE2 2
- decrypt_block a STATE2 STATE3 STATE4 STATE0 STATE1 3
- decrypt_block a STATE1 STATE2 STATE3 STATE4 STATE0 4
-
- add $0x50, SRC
- add $0x50, DST
- jmp .Ldec_a_loop
-
.align 8
-.Ldec_u_loop:
- decrypt_block u STATE0 STATE1 STATE2 STATE3 STATE4 0
- decrypt_block u STATE4 STATE0 STATE1 STATE2 STATE3 1
- decrypt_block u STATE3 STATE4 STATE0 STATE1 STATE2 2
- decrypt_block u STATE2 STATE3 STATE4 STATE0 STATE1 3
- decrypt_block u STATE1 STATE2 STATE3 STATE4 STATE0 4
+.Ldec_loop:
+ decrypt_block STATE0 STATE1 STATE2 STATE3 STATE4 0
+ decrypt_block STATE4 STATE0 STATE1 STATE2 STATE3 1
+ decrypt_block STATE3 STATE4 STATE0 STATE1 STATE2 2
+ decrypt_block STATE2 STATE3 STATE4 STATE0 STATE1 3
+ decrypt_block STATE1 STATE2 STATE3 STATE4 STATE0 4
add $0x50, SRC
add $0x50, DST
- jmp .Ldec_u_loop
+ jmp .Ldec_loop
/* store the state: */
.Ldec_out_0:
movdqu STATE4, 0x00(STATEP)
movdqu STATE0, 0x10(STATEP)
--
2.46.2
* [PATCH 05/10] crypto: x86/aegis128 - optimize length block preparation using SSE4.1
From: Eric Biggers @ 2024-10-07 1:24 UTC
To: linux-crypto; +Cc: x86, Ondrej Mosnacek
From: Eric Biggers <ebiggers@google.com>
Start using SSE4.1 instructions in the AES-NI AEGIS code, with the first
use case being preparing the length block in fewer instructions.
In practice this does not reduce the set of CPUs on which the code can
run, because all Intel and AMD CPUs with AES-NI also have SSE4.1.
Upgrade the existing SSE2 feature check to SSE4.1, though it seems this
check is not strictly necessary; the aesni-intel module has been getting
away with using SSE4.1 despite checking for AES-NI only.
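Lane by lane, the new length-block preparation works like this (an
annotated copy of the instructions from the diff below; dwords shown
low lane first):

	movd	%edx, MSG	/* MSG = { assoclen, 0, 0, 0 } */
	pinsrd	$2, %ecx, MSG	/* MSG = { assoclen, 0, cryptlen, 0 } */
	psllq	$3, MSG		/* each 64-bit half *= 8: bytes -> bits */

This yields LE64(assoclen * 8) || LE64(cryptlen * 8), as finalization
requires, without the old movd/pslldq/pxor shuffle.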
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
arch/x86/crypto/Kconfig | 4 ++--
arch/x86/crypto/aegis128-aesni-asm.S | 6 ++----
arch/x86/crypto/aegis128-aesni-glue.c | 6 +++---
3 files changed, 7 insertions(+), 9 deletions(-)
diff --git a/arch/x86/crypto/Kconfig b/arch/x86/crypto/Kconfig
index 7b1bebed879d..3d2e38ba5240 100644
--- a/arch/x86/crypto/Kconfig
+++ b/arch/x86/crypto/Kconfig
@@ -361,20 +361,20 @@ config CRYPTO_CHACHA20_X86_64
- SSSE3 (Supplemental SSE3)
- AVX2 (Advanced Vector Extensions 2)
- AVX-512VL (Advanced Vector Extensions-512VL)
config CRYPTO_AEGIS128_AESNI_SSE2
- tristate "AEAD ciphers: AEGIS-128 (AES-NI/SSE2)"
+ tristate "AEAD ciphers: AEGIS-128 (AES-NI/SSE4.1)"
depends on X86 && 64BIT
select CRYPTO_AEAD
select CRYPTO_SIMD
help
AEGIS-128 AEAD algorithm
Architecture: x86_64 using:
- AES-NI (AES New Instructions)
- - SSE2 (Streaming SIMD Extensions 2)
+ - SSE4.1 (Streaming SIMD Extensions 4.1)
config CRYPTO_NHPOLY1305_SSE2
tristate "Hash functions: NHPoly1305 (SSE2)"
depends on X86 && 64BIT
select CRYPTO_NHPOLY1305
diff --git a/arch/x86/crypto/aegis128-aesni-asm.S b/arch/x86/crypto/aegis128-aesni-asm.S
index 5541aca2fd0d..6ed4bc452c29 100644
--- a/arch/x86/crypto/aegis128-aesni-asm.S
+++ b/arch/x86/crypto/aegis128-aesni-asm.S
@@ -1,8 +1,8 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
- * AES-NI + SSE2 implementation of AEGIS-128
+ * AES-NI + SSE4.1 implementation of AEGIS-128
*
* Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
* Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
*/
@@ -636,13 +636,11 @@ SYM_FUNC_START(crypto_aegis128_aesni_final)
movdqu 0x30(STATEP), STATE3
movdqu 0x40(STATEP), STATE4
/* prepare length block: */
movd %edx, MSG
- movd %ecx, T0
- pslldq $8, T0
- pxor T0, MSG
+ pinsrd $2, %ecx, MSG
psllq $3, MSG /* multiply by 8 (to get bit count) */
pxor STATE3, MSG
/* update state: */
diff --git a/arch/x86/crypto/aegis128-aesni-glue.c b/arch/x86/crypto/aegis128-aesni-glue.c
index deb39cef0be1..4dd2d981a514 100644
--- a/arch/x86/crypto/aegis128-aesni-glue.c
+++ b/arch/x86/crypto/aegis128-aesni-glue.c
@@ -1,9 +1,9 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* The AEGIS-128 Authenticated-Encryption Algorithm
- * Glue for AES-NI + SSE2 implementation
+ * Glue for AES-NI + SSE4.1 implementation
*
* Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
* Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
*/
@@ -252,11 +252,11 @@ static struct aead_alg crypto_aegis128_aesni_alg = {
static struct simd_aead_alg *simd_alg;
static int __init crypto_aegis128_aesni_module_init(void)
{
- if (!boot_cpu_has(X86_FEATURE_XMM2) ||
+ if (!boot_cpu_has(X86_FEATURE_XMM4_1) ||
!boot_cpu_has(X86_FEATURE_AES) ||
!cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL))
return -ENODEV;
return simd_register_aeads_compat(&crypto_aegis128_aesni_alg, 1,
@@ -271,8 +271,8 @@ static void __exit crypto_aegis128_aesni_module_exit(void)
module_init(crypto_aegis128_aesni_module_init);
module_exit(crypto_aegis128_aesni_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>");
-MODULE_DESCRIPTION("AEGIS-128 AEAD algorithm -- AESNI+SSE2 implementation");
+MODULE_DESCRIPTION("AEGIS-128 AEAD algorithm -- AESNI+SSE4.1 implementation");
MODULE_ALIAS_CRYPTO("aegis128");
MODULE_ALIAS_CRYPTO("aegis128-aesni");
--
2.46.2
* [PATCH 06/10] crypto: x86/aegis128 - improve assembly function prototypes
From: Eric Biggers @ 2024-10-07 1:24 UTC
To: linux-crypto; +Cc: x86, Ondrej Mosnacek
From: Eric Biggers <ebiggers@google.com>
Adjust the prototypes of the AEGIS assembly functions:
- Use proper types instead of 'void *', when applicable.
- Move the length parameter to after the buffers it describes rather
than before, to match the usual convention. Also shorten its name to
just len (which is the name used in the assembly code).
- Declare register aliases at the beginning of each function rather than
once per file. This was necessary because len was moved, and it also
allows adding aliases where raw registers were used before.
- Remove the unnecessary "crypto_" prefix.
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
arch/x86/crypto/aegis128-aesni-asm.S | 105 ++++++++++++++++----------
arch/x86/crypto/aegis128-aesni-glue.c | 92 +++++++++++-----------
2 files changed, 112 insertions(+), 85 deletions(-)
diff --git a/arch/x86/crypto/aegis128-aesni-asm.S b/arch/x86/crypto/aegis128-aesni-asm.S
index 6ed4bc452c29..8131903cc7ff 100644
--- a/arch/x86/crypto/aegis128-aesni-asm.S
+++ b/arch/x86/crypto/aegis128-aesni-asm.S
@@ -17,15 +17,10 @@
#define KEY %xmm5
#define MSG %xmm5
#define T0 %xmm6
#define T1 %xmm7
-#define STATEP %rdi
-#define LEN %esi
-#define SRC %rdx
-#define DST %rcx
-
.section .rodata.cst16.aegis128_const, "aM", @progbits, 32
.align 16
.Laegis128_const_0:
.byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d
.byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62
@@ -70,10 +65,12 @@
* T0
* %r8
* %r9
*/
SYM_FUNC_START_LOCAL(__load_partial)
+ .set LEN, %ecx
+ .set SRC, %rsi
xor %r9d, %r9d
pxor MSG, MSG
mov LEN, %r8d
and $0x1, %r8
@@ -136,10 +133,12 @@ SYM_FUNC_END(__load_partial)
* %r8
* %r9
* %r10
*/
SYM_FUNC_START_LOCAL(__store_partial)
+ .set LEN, %ecx
+ .set DST, %rdx
mov LEN, %r8d
mov DST, %r9
movq T0, %r10
@@ -182,20 +181,25 @@ SYM_FUNC_START_LOCAL(__store_partial)
.Lst_partial_1:
RET
SYM_FUNC_END(__store_partial)
/*
- * void crypto_aegis128_aesni_init(void *state, const void *key, const void *iv);
+ * void aegis128_aesni_init(struct aegis_state *state,
+ * const struct aegis_block *key,
+ * const u8 iv[AEGIS128_NONCE_SIZE]);
*/
-SYM_FUNC_START(crypto_aegis128_aesni_init)
+SYM_FUNC_START(aegis128_aesni_init)
+ .set STATEP, %rdi
+ .set KEYP, %rsi
+ .set IVP, %rdx
FRAME_BEGIN
/* load IV: */
- movdqu (%rdx), T1
+ movdqu (IVP), T1
/* load key: */
- movdqa (%rsi), KEY
+ movdqa (KEYP), KEY
pxor KEY, T1
movdqa T1, STATE0
movdqa KEY, STATE3
movdqa KEY, STATE4
@@ -224,17 +228,20 @@ SYM_FUNC_START(crypto_aegis128_aesni_init)
movdqu STATE3, 0x30(STATEP)
movdqu STATE4, 0x40(STATEP)
FRAME_END
RET
-SYM_FUNC_END(crypto_aegis128_aesni_init)
+SYM_FUNC_END(aegis128_aesni_init)
/*
- * void crypto_aegis128_aesni_ad(void *state, unsigned int length,
- * const void *data);
+ * void aegis128_aesni_ad(struct aegis_state *state, const u8 *data,
+ * unsigned int len);
*/
-SYM_FUNC_START(crypto_aegis128_aesni_ad)
+SYM_FUNC_START(aegis128_aesni_ad)
+ .set STATEP, %rdi
+ .set SRC, %rsi
+ .set LEN, %edx
FRAME_BEGIN
cmp $0x10, LEN
jb .Lad_out
@@ -332,11 +339,11 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad)
RET
.Lad_out:
FRAME_END
RET
-SYM_FUNC_END(crypto_aegis128_aesni_ad)
+SYM_FUNC_END(aegis128_aesni_ad)
.macro encrypt_block s0 s1 s2 s3 s4 i
movdqu (\i * 0x10)(SRC), MSG
movdqa MSG, T0
pxor \s1, T0
@@ -353,14 +360,18 @@ SYM_FUNC_END(crypto_aegis128_aesni_ad)
cmp $0x10, LEN
jl .Lenc_out_\i
.endm
/*
- * void crypto_aegis128_aesni_enc(void *state, unsigned int length,
- * const void *src, void *dst);
+ * void aegis128_aesni_enc(struct aegis_state *state, const u8 *src, u8 *dst,
+ * unsigned int len);
*/
-SYM_FUNC_START(crypto_aegis128_aesni_enc)
+SYM_FUNC_START(aegis128_aesni_enc)
+ .set STATEP, %rdi
+ .set SRC, %rsi
+ .set DST, %rdx
+ .set LEN, %ecx
FRAME_BEGIN
cmp $0x10, LEN
jb .Lenc_out
@@ -430,17 +441,21 @@ SYM_FUNC_START(crypto_aegis128_aesni_enc)
RET
.Lenc_out:
FRAME_END
RET
-SYM_FUNC_END(crypto_aegis128_aesni_enc)
+SYM_FUNC_END(aegis128_aesni_enc)
/*
- * void crypto_aegis128_aesni_enc_tail(void *state, unsigned int length,
- * const void *src, void *dst);
+ * void aegis128_aesni_enc_tail(struct aegis_state *state, const u8 *src,
+ * u8 *dst, unsigned int len);
*/
-SYM_FUNC_START(crypto_aegis128_aesni_enc_tail)
+SYM_FUNC_START(aegis128_aesni_enc_tail)
+ .set STATEP, %rdi
+ .set SRC, %rsi
+ .set DST, %rdx
+ .set LEN, %ecx
FRAME_BEGIN
/* load the state: */
movdqu 0x00(STATEP), STATE0
movdqu 0x10(STATEP), STATE1
@@ -470,11 +485,11 @@ SYM_FUNC_START(crypto_aegis128_aesni_enc_tail)
movdqu STATE2, 0x30(STATEP)
movdqu STATE3, 0x40(STATEP)
FRAME_END
RET
-SYM_FUNC_END(crypto_aegis128_aesni_enc_tail)
+SYM_FUNC_END(aegis128_aesni_enc_tail)
.macro decrypt_block s0 s1 s2 s3 s4 i
movdqu (\i * 0x10)(SRC), MSG
pxor \s1, MSG
pxor \s4, MSG
@@ -490,14 +505,18 @@ SYM_FUNC_END(crypto_aegis128_aesni_enc_tail)
cmp $0x10, LEN
jl .Ldec_out_\i
.endm
/*
- * void crypto_aegis128_aesni_dec(void *state, unsigned int length,
- * const void *src, void *dst);
+ * void aegis128_aesni_dec(struct aegis_state *state, const u8 *src, u8 *dst,
+ * unsigned int len);
*/
-SYM_FUNC_START(crypto_aegis128_aesni_dec)
+SYM_FUNC_START(aegis128_aesni_dec)
+ .set STATEP, %rdi
+ .set SRC, %rsi
+ .set DST, %rdx
+ .set LEN, %ecx
FRAME_BEGIN
cmp $0x10, LEN
jb .Ldec_out
@@ -567,17 +586,21 @@ SYM_FUNC_START(crypto_aegis128_aesni_dec)
RET
.Ldec_out:
FRAME_END
RET
-SYM_FUNC_END(crypto_aegis128_aesni_dec)
+SYM_FUNC_END(aegis128_aesni_dec)
/*
- * void crypto_aegis128_aesni_dec_tail(void *state, unsigned int length,
- * const void *src, void *dst);
+ * void aegis128_aesni_dec_tail(struct aegis_state *state, const u8 *src,
+ * u8 *dst, unsigned int len);
*/
-SYM_FUNC_START(crypto_aegis128_aesni_dec_tail)
+SYM_FUNC_START(aegis128_aesni_dec_tail)
+ .set STATEP, %rdi
+ .set SRC, %rsi
+ .set DST, %rdx
+ .set LEN, %ecx
FRAME_BEGIN
/* load the state: */
movdqu 0x00(STATEP), STATE0
movdqu 0x10(STATEP), STATE1
@@ -617,30 +640,34 @@ SYM_FUNC_START(crypto_aegis128_aesni_dec_tail)
movdqu STATE2, 0x30(STATEP)
movdqu STATE3, 0x40(STATEP)
FRAME_END
RET
-SYM_FUNC_END(crypto_aegis128_aesni_dec_tail)
+SYM_FUNC_END(aegis128_aesni_dec_tail)
/*
- * void crypto_aegis128_aesni_final(void *state, void *tag_xor,
- * unsigned int assoclen,
- * unsigned int cryptlen);
+ * void aegis128_aesni_final(struct aegis_state *state,
+ * struct aegis_block *tag_xor,
+ * unsigned int assoclen, unsigned int cryptlen);
*/
-SYM_FUNC_START(crypto_aegis128_aesni_final)
+SYM_FUNC_START(aegis128_aesni_final)
+ .set STATEP, %rdi
+ .set TAG_XOR, %rsi
+ .set ASSOCLEN, %edx
+ .set CRYPTLEN, %ecx
FRAME_BEGIN
/* load the state: */
movdqu 0x00(STATEP), STATE0
movdqu 0x10(STATEP), STATE1
movdqu 0x20(STATEP), STATE2
movdqu 0x30(STATEP), STATE3
movdqu 0x40(STATEP), STATE4
/* prepare length block: */
- movd %edx, MSG
- pinsrd $2, %ecx, MSG
+ movd ASSOCLEN, MSG
+ pinsrd $2, CRYPTLEN, MSG
psllq $3, MSG /* multiply by 8 (to get bit count) */
pxor STATE3, MSG
/* update state: */
@@ -651,18 +678,18 @@ SYM_FUNC_START(crypto_aegis128_aesni_final)
aegis128_update; pxor MSG, STATE0
aegis128_update; pxor MSG, STATE4
aegis128_update; pxor MSG, STATE3
/* xor tag: */
- movdqu (%rsi), MSG
+ movdqu (TAG_XOR), MSG
pxor STATE0, MSG
pxor STATE1, MSG
pxor STATE2, MSG
pxor STATE3, MSG
pxor STATE4, MSG
- movdqu MSG, (%rsi)
+ movdqu MSG, (TAG_XOR)
FRAME_END
RET
-SYM_FUNC_END(crypto_aegis128_aesni_final)
+SYM_FUNC_END(aegis128_aesni_final)
diff --git a/arch/x86/crypto/aegis128-aesni-glue.c b/arch/x86/crypto/aegis128-aesni-glue.c
index 4dd2d981a514..739d92c85790 100644
--- a/arch/x86/crypto/aegis128-aesni-glue.c
+++ b/arch/x86/crypto/aegis128-aesni-glue.c
@@ -21,31 +21,10 @@
#define AEGIS128_STATE_BLOCKS 5
#define AEGIS128_KEY_SIZE 16
#define AEGIS128_MIN_AUTH_SIZE 8
#define AEGIS128_MAX_AUTH_SIZE 16
-asmlinkage void crypto_aegis128_aesni_init(void *state, void *key, void *iv);
-
-asmlinkage void crypto_aegis128_aesni_ad(
- void *state, unsigned int length, const void *data);
-
-asmlinkage void crypto_aegis128_aesni_enc(
- void *state, unsigned int length, const void *src, void *dst);
-
-asmlinkage void crypto_aegis128_aesni_dec(
- void *state, unsigned int length, const void *src, void *dst);
-
-asmlinkage void crypto_aegis128_aesni_enc_tail(
- void *state, unsigned int length, const void *src, void *dst);
-
-asmlinkage void crypto_aegis128_aesni_dec_tail(
- void *state, unsigned int length, const void *src, void *dst);
-
-asmlinkage void crypto_aegis128_aesni_final(
- void *state, void *tag_xor, unsigned int cryptlen,
- unsigned int assoclen);
-
struct aegis_block {
u8 bytes[AEGIS128_BLOCK_SIZE] __aligned(AEGIS128_BLOCK_ALIGN);
};
struct aegis_state {
@@ -54,10 +33,36 @@ struct aegis_state {
struct aegis_ctx {
struct aegis_block key;
};
+asmlinkage void aegis128_aesni_init(struct aegis_state *state,
+ const struct aegis_block *key,
+ const u8 iv[AEGIS128_NONCE_SIZE]);
+
+asmlinkage void aegis128_aesni_ad(struct aegis_state *state, const u8 *data,
+ unsigned int len);
+
+asmlinkage void aegis128_aesni_enc(struct aegis_state *state, const u8 *src,
+ u8 *dst, unsigned int len);
+
+asmlinkage void aegis128_aesni_dec(struct aegis_state *state, const u8 *src,
+ u8 *dst, unsigned int len);
+
+asmlinkage void aegis128_aesni_enc_tail(struct aegis_state *state,
+ const u8 *src, u8 *dst,
+ unsigned int len);
+
+asmlinkage void aegis128_aesni_dec_tail(struct aegis_state *state,
+ const u8 *src, u8 *dst,
+ unsigned int len);
+
+asmlinkage void aegis128_aesni_final(struct aegis_state *state,
+ struct aegis_block *tag_xor,
+ unsigned int assoclen,
+ unsigned int cryptlen);
+
static void crypto_aegis128_aesni_process_ad(
struct aegis_state *state, struct scatterlist *sg_src,
unsigned int assoclen)
{
struct scatter_walk walk;
@@ -73,19 +78,18 @@ static void crypto_aegis128_aesni_process_ad(
if (pos + size >= AEGIS128_BLOCK_SIZE) {
if (pos > 0) {
unsigned int fill = AEGIS128_BLOCK_SIZE - pos;
memcpy(buf.bytes + pos, src, fill);
- crypto_aegis128_aesni_ad(state,
- AEGIS128_BLOCK_SIZE,
- buf.bytes);
+ aegis128_aesni_ad(state, buf.bytes,
+ AEGIS128_BLOCK_SIZE);
pos = 0;
left -= fill;
src += fill;
}
- crypto_aegis128_aesni_ad(state, left, src);
+ aegis128_aesni_ad(state, src, left);
src += left & ~(AEGIS128_BLOCK_SIZE - 1);
left &= AEGIS128_BLOCK_SIZE - 1;
}
@@ -98,45 +102,41 @@ static void crypto_aegis128_aesni_process_ad(
scatterwalk_done(&walk, 0, assoclen);
}
if (pos > 0) {
memset(buf.bytes + pos, 0, AEGIS128_BLOCK_SIZE - pos);
- crypto_aegis128_aesni_ad(state, AEGIS128_BLOCK_SIZE, buf.bytes);
+ aegis128_aesni_ad(state, buf.bytes, AEGIS128_BLOCK_SIZE);
}
}
static __always_inline void
crypto_aegis128_aesni_process_crypt(struct aegis_state *state,
struct skcipher_walk *walk, bool enc)
{
while (walk->nbytes >= AEGIS128_BLOCK_SIZE) {
if (enc)
- crypto_aegis128_aesni_enc(
- state,
- round_down(walk->nbytes,
- AEGIS128_BLOCK_SIZE),
- walk->src.virt.addr,
- walk->dst.virt.addr);
+ aegis128_aesni_enc(state, walk->src.virt.addr,
+ walk->dst.virt.addr,
+ round_down(walk->nbytes,
+ AEGIS128_BLOCK_SIZE));
else
- crypto_aegis128_aesni_dec(
- state,
- round_down(walk->nbytes,
- AEGIS128_BLOCK_SIZE),
- walk->src.virt.addr,
- walk->dst.virt.addr);
+ aegis128_aesni_dec(state, walk->src.virt.addr,
+ walk->dst.virt.addr,
+ round_down(walk->nbytes,
+ AEGIS128_BLOCK_SIZE));
skcipher_walk_done(walk, walk->nbytes % AEGIS128_BLOCK_SIZE);
}
if (walk->nbytes) {
if (enc)
- crypto_aegis128_aesni_enc_tail(state, walk->nbytes,
- walk->src.virt.addr,
- walk->dst.virt.addr);
+ aegis128_aesni_enc_tail(state, walk->src.virt.addr,
+ walk->dst.virt.addr,
+ walk->nbytes);
else
- crypto_aegis128_aesni_dec_tail(state, walk->nbytes,
- walk->src.virt.addr,
- walk->dst.virt.addr);
+ aegis128_aesni_dec_tail(state, walk->src.virt.addr,
+ walk->dst.virt.addr,
+ walk->nbytes);
skcipher_walk_done(walk, 0);
}
}
static struct aegis_ctx *crypto_aegis128_aesni_ctx(struct crypto_aead *aead)
@@ -184,14 +184,14 @@ crypto_aegis128_aesni_crypt(struct aead_request *req,
else
skcipher_walk_aead_decrypt(&walk, req, true);
kernel_fpu_begin();
- crypto_aegis128_aesni_init(&state, ctx->key.bytes, req->iv);
+ aegis128_aesni_init(&state, &ctx->key, req->iv);
crypto_aegis128_aesni_process_ad(&state, req->src, req->assoclen);
crypto_aegis128_aesni_process_crypt(&state, &walk, enc);
- crypto_aegis128_aesni_final(&state, tag_xor, req->assoclen, cryptlen);
+ aegis128_aesni_final(&state, tag_xor, req->assoclen, cryptlen);
kernel_fpu_end();
}
static int crypto_aegis128_aesni_encrypt(struct aead_request *req)
--
2.46.2
* [PATCH 07/10] crypto: x86/aegis128 - optimize partial block handling using SSE4.1
From: Eric Biggers @ 2024-10-07 1:24 UTC
To: linux-crypto; +Cc: x86, Ondrej Mosnacek
From: Eric Biggers <ebiggers@google.com>
Optimize the code that loads and stores partial blocks, taking advantage
of SSE4.1. The code is adapted from that in aes-gcm-aesni-x86_64.S.
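As a sanity check, here is a hypothetical trace of the new load_partial
path for LEN = 11 (instructions as in the diff below, with the computed
operand values filled in):

	movq	(SRC), MSG	/* load bytes 0..7 */
	mov	3(SRC), %rax	/* (SRC,%rcx) with %rcx = LEN - 8 = 3:
				 * loads bytes 3..10 */
	shr	$40, %rax	/* neg+shl set %cl = -24 = 40 (mod 64),
				 * discarding overlapping bytes 3..7 */
	pinsrq	$1, %rax, MSG	/* MSG = bytes 0..10, rest zero */

The new store path and the dec_tail masking follow the same idea; e.g.,
loading 16 bytes at .Lzeropad_mask + 16 - LEN yields LEN 0xff bytes
followed by zeroes, replacing the punpcklbw/pcmpgtb sequence.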
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
arch/x86/crypto/aegis128-aesni-asm.S | 236 +++++++++++----------------
1 file changed, 95 insertions(+), 141 deletions(-)
diff --git a/arch/x86/crypto/aegis128-aesni-asm.S b/arch/x86/crypto/aegis128-aesni-asm.S
index 8131903cc7ff..b5c7abc9a0d4 100644
--- a/arch/x86/crypto/aegis128-aesni-asm.S
+++ b/arch/x86/crypto/aegis128-aesni-asm.S
@@ -2,10 +2,11 @@
/*
* AES-NI + SSE4.1 implementation of AEGIS-128
*
* Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
* Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
+ * Copyright 2024 Google LLC
*/
#include <linux/linkage.h>
#include <asm/frame.h>
@@ -26,15 +27,15 @@
.byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62
.Laegis128_const_1:
.byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1
.byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd
-.section .rodata.cst16.aegis128_counter, "aM", @progbits, 16
-.align 16
-.Laegis128_counter:
- .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
- .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+.section .rodata.cst32.zeropad_mask, "aM", @progbits, 32
+.align 32
+.Lzeropad_mask:
+ .octa 0xffffffffffffffffffffffffffffffff
+ .octa 0
.text
/*
* aegis128_update
@@ -53,136 +54,90 @@
aesenc STATE3, STATE2
aesenc T0, STATE3
.endm
/*
- * __load_partial: internal ABI
- * input:
- * LEN - bytes
- * SRC - src
- * output:
- * MSG - message block
- * changed:
- * T0
- * %r8
- * %r9
+ * Load 1 <= LEN (%ecx) <= 15 bytes from the pointer SRC into the xmm register
+ * MSG and zeroize any remaining bytes. Clobbers %rax, %rcx, and %r8.
*/
-SYM_FUNC_START_LOCAL(__load_partial)
- .set LEN, %ecx
- .set SRC, %rsi
- xor %r9d, %r9d
- pxor MSG, MSG
-
- mov LEN, %r8d
- and $0x1, %r8
- jz .Lld_partial_1
-
- mov LEN, %r8d
- and $0x1E, %r8
- add SRC, %r8
- mov (%r8), %r9b
-
-.Lld_partial_1:
- mov LEN, %r8d
- and $0x2, %r8
- jz .Lld_partial_2
-
- mov LEN, %r8d
- and $0x1C, %r8
- add SRC, %r8
- shl $0x10, %r9
- mov (%r8), %r9w
-
-.Lld_partial_2:
- mov LEN, %r8d
- and $0x4, %r8
- jz .Lld_partial_4
-
- mov LEN, %r8d
- and $0x18, %r8
- add SRC, %r8
- shl $32, %r9
- mov (%r8), %r8d
- xor %r8, %r9
-
-.Lld_partial_4:
- movq %r9, MSG
-
- mov LEN, %r8d
- and $0x8, %r8
- jz .Lld_partial_8
-
- mov LEN, %r8d
- and $0x10, %r8
- add SRC, %r8
- pslldq $8, MSG
- movq (%r8), T0
- pxor T0, MSG
-
-.Lld_partial_8:
- RET
-SYM_FUNC_END(__load_partial)
+.macro load_partial
+ sub $8, %ecx /* LEN - 8 */
+ jle .Lle8\@
+
+ /* Load 9 <= LEN <= 15 bytes: */
+ movq (SRC), MSG /* Load first 8 bytes */
+ mov (SRC, %rcx), %rax /* Load last 8 bytes */
+ neg %ecx
+ shl $3, %ecx
+ shr %cl, %rax /* Discard overlapping bytes */
+ pinsrq $1, %rax, MSG
+ jmp .Ldone\@
+
+.Lle8\@:
+ add $4, %ecx /* LEN - 4 */
+ jl .Llt4\@
+
+ /* Load 4 <= LEN <= 8 bytes: */
+ mov (SRC), %eax /* Load first 4 bytes */
+ mov (SRC, %rcx), %r8d /* Load last 4 bytes */
+ jmp .Lcombine\@
+
+.Llt4\@:
+ /* Load 1 <= LEN <= 3 bytes: */
+ add $2, %ecx /* LEN - 2 */
+ movzbl (SRC), %eax /* Load first byte */
+ jl .Lmovq\@
+ movzwl (SRC, %rcx), %r8d /* Load last 2 bytes */
+.Lcombine\@:
+ shl $3, %ecx
+ shl %cl, %r8
+ or %r8, %rax /* Combine the two parts */
+.Lmovq\@:
+ movq %rax, MSG
+.Ldone\@:
+.endm
/*
- * __store_partial: internal ABI
- * input:
- * LEN - bytes
- * DST - dst
- * output:
- * T0 - message block
- * changed:
- * %r8
- * %r9
- * %r10
+ * Store 1 <= LEN (%ecx) <= 15 bytes from the xmm register \msg to the pointer
+ * DST. Clobbers %rax, %rcx, and %r8.
*/
-SYM_FUNC_START_LOCAL(__store_partial)
- .set LEN, %ecx
- .set DST, %rdx
- mov LEN, %r8d
- mov DST, %r9
-
- movq T0, %r10
-
- cmp $8, %r8
- jl .Lst_partial_8
-
- mov %r10, (%r9)
- psrldq $8, T0
- movq T0, %r10
-
- sub $8, %r8
- add $8, %r9
-
-.Lst_partial_8:
- cmp $4, %r8
- jl .Lst_partial_4
-
- mov %r10d, (%r9)
- shr $32, %r10
-
- sub $4, %r8
- add $4, %r9
-
-.Lst_partial_4:
- cmp $2, %r8
- jl .Lst_partial_2
-
- mov %r10w, (%r9)
- shr $0x10, %r10
-
- sub $2, %r8
- add $2, %r9
-
-.Lst_partial_2:
- cmp $1, %r8
- jl .Lst_partial_1
-
- mov %r10b, (%r9)
-
-.Lst_partial_1:
- RET
-SYM_FUNC_END(__store_partial)
+.macro store_partial msg
+ sub $8, %ecx /* LEN - 8 */
+ jl .Llt8\@
+
+ /* Store 8 <= LEN <= 15 bytes: */
+ pextrq $1, \msg, %rax
+ mov %ecx, %r8d
+ shl $3, %ecx
+ ror %cl, %rax
+ mov %rax, (DST, %r8) /* Store last LEN - 8 bytes */
+ movq \msg, (DST) /* Store first 8 bytes */
+ jmp .Ldone\@
+
+.Llt8\@:
+ add $4, %ecx /* LEN - 4 */
+ jl .Llt4\@
+
+ /* Store 4 <= LEN <= 7 bytes: */
+ pextrd $1, \msg, %eax
+ mov %ecx, %r8d
+ shl $3, %ecx
+ ror %cl, %eax
+ mov %eax, (DST, %r8) /* Store last LEN - 4 bytes */
+ movd \msg, (DST) /* Store first 4 bytes */
+ jmp .Ldone\@
+
+.Llt4\@:
+ /* Store 1 <= LEN <= 3 bytes: */
+ pextrb $0, \msg, 0(DST)
+ cmp $-2, %ecx /* LEN - 4 == -2, i.e. LEN == 2? */
+ jl .Ldone\@
+ pextrb $1, \msg, 1(DST)
+ je .Ldone\@
+ pextrb $2, \msg, 2(DST)
+.Ldone\@:
+.endm
/*
* void aegis128_aesni_init(struct aegis_state *state,
* const struct aegis_block *key,
* const u8 iv[AEGIS128_NONCE_SIZE]);
@@ -451,31 +406,33 @@ SYM_FUNC_END(aegis128_aesni_enc)
*/
SYM_FUNC_START(aegis128_aesni_enc_tail)
.set STATEP, %rdi
.set SRC, %rsi
.set DST, %rdx
- .set LEN, %ecx
+ .set LEN, %ecx /* {load,store}_partial rely on this being %ecx */
FRAME_BEGIN
/* load the state: */
movdqu 0x00(STATEP), STATE0
movdqu 0x10(STATEP), STATE1
movdqu 0x20(STATEP), STATE2
movdqu 0x30(STATEP), STATE3
movdqu 0x40(STATEP), STATE4
/* encrypt message: */
- call __load_partial
+ mov LEN, %r9d
+ load_partial
movdqa MSG, T0
pxor STATE1, T0
pxor STATE4, T0
movdqa STATE2, T1
pand STATE3, T1
pxor T1, T0
- call __store_partial
+ mov %r9d, LEN
+ store_partial T0
aegis128_update
pxor MSG, STATE4
/* store the state: */
@@ -596,40 +553,37 @@ SYM_FUNC_END(aegis128_aesni_dec)
*/
SYM_FUNC_START(aegis128_aesni_dec_tail)
.set STATEP, %rdi
.set SRC, %rsi
.set DST, %rdx
- .set LEN, %ecx
+ .set LEN, %ecx /* {load,store}_partial rely on this being %ecx */
FRAME_BEGIN
/* load the state: */
movdqu 0x00(STATEP), STATE0
movdqu 0x10(STATEP), STATE1
movdqu 0x20(STATEP), STATE2
movdqu 0x30(STATEP), STATE3
movdqu 0x40(STATEP), STATE4
/* decrypt message: */
- call __load_partial
+ mov LEN, %r9d
+ load_partial
pxor STATE1, MSG
pxor STATE4, MSG
movdqa STATE2, T1
pand STATE3, T1
pxor T1, MSG
- movdqa MSG, T0
- call __store_partial
+ mov %r9d, LEN
+ store_partial MSG
/* mask with byte count: */
- movd LEN, T0
- punpcklbw T0, T0
- punpcklbw T0, T0
- punpcklbw T0, T0
- punpcklbw T0, T0
- movdqa .Laegis128_counter(%rip), T1
- pcmpgtb T1, T0
+ lea .Lzeropad_mask+16(%rip), %rax
+ sub %r9, %rax
+ movdqu (%rax), T0
pand T0, MSG
aegis128_update
pxor MSG, STATE4
--
2.46.2
* [PATCH 08/10] crypto: x86/aegis128 - take advantage of block-aligned len
From: Eric Biggers @ 2024-10-07 1:24 UTC
To: linux-crypto; +Cc: x86, Ondrej Mosnacek
From: Eric Biggers <ebiggers@google.com>
Update a caller of aegis128_aesni_ad() to round down the length to a
block boundary. After that, aegis128_aesni_ad(), aegis128_aesni_enc(),
and aegis128_aesni_dec() are only passed whole blocks. Update the
assembly code to take advantage of that, which eliminates some unneeded
instructions. For aegis128_aesni_enc() and aegis128_aesni_dec(), the
length is also always nonzero, so stop checking for zero length.
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
arch/x86/crypto/aegis128-aesni-asm.S | 37 +++++++++++----------------
arch/x86/crypto/aegis128-aesni-glue.c | 4 +--
2 files changed, 17 insertions(+), 24 deletions(-)
diff --git a/arch/x86/crypto/aegis128-aesni-asm.S b/arch/x86/crypto/aegis128-aesni-asm.S
index b5c7abc9a0d4..583e4515e1f1 100644
--- a/arch/x86/crypto/aegis128-aesni-asm.S
+++ b/arch/x86/crypto/aegis128-aesni-asm.S
@@ -188,19 +188,21 @@ SYM_FUNC_START(aegis128_aesni_init)
SYM_FUNC_END(aegis128_aesni_init)
/*
* void aegis128_aesni_ad(struct aegis_state *state, const u8 *data,
* unsigned int len);
+ *
+ * len must be a multiple of 16.
*/
SYM_FUNC_START(aegis128_aesni_ad)
.set STATEP, %rdi
.set SRC, %rsi
.set LEN, %edx
FRAME_BEGIN
- cmp $0x10, LEN
- jb .Lad_out
+ test LEN, LEN
+ jz .Lad_out
/* load the state: */
movdqu 0x00(STATEP), STATE0
movdqu 0x10(STATEP), STATE1
movdqu 0x20(STATEP), STATE2
@@ -211,40 +213,35 @@ SYM_FUNC_START(aegis128_aesni_ad)
.Lad_loop:
movdqu 0x00(SRC), MSG
aegis128_update
pxor MSG, STATE4
sub $0x10, LEN
- cmp $0x10, LEN
- jl .Lad_out_1
+ jz .Lad_out_1
movdqu 0x10(SRC), MSG
aegis128_update
pxor MSG, STATE3
sub $0x10, LEN
- cmp $0x10, LEN
- jl .Lad_out_2
+ jz .Lad_out_2
movdqu 0x20(SRC), MSG
aegis128_update
pxor MSG, STATE2
sub $0x10, LEN
- cmp $0x10, LEN
- jl .Lad_out_3
+ jz .Lad_out_3
movdqu 0x30(SRC), MSG
aegis128_update
pxor MSG, STATE1
sub $0x10, LEN
- cmp $0x10, LEN
- jl .Lad_out_4
+ jz .Lad_out_4
movdqu 0x40(SRC), MSG
aegis128_update
pxor MSG, STATE0
sub $0x10, LEN
- cmp $0x10, LEN
- jl .Lad_out_0
+ jz .Lad_out_0
add $0x50, SRC
jmp .Lad_loop
/* store the state: */
@@ -310,28 +307,26 @@ SYM_FUNC_END(aegis128_aesni_ad)
aegis128_update
pxor MSG, \s4
sub $0x10, LEN
- cmp $0x10, LEN
- jl .Lenc_out_\i
+ jz .Lenc_out_\i
.endm
/*
* void aegis128_aesni_enc(struct aegis_state *state, const u8 *src, u8 *dst,
* unsigned int len);
+ *
+ * len must be nonzero and a multiple of 16.
*/
SYM_FUNC_START(aegis128_aesni_enc)
.set STATEP, %rdi
.set SRC, %rsi
.set DST, %rdx
.set LEN, %ecx
FRAME_BEGIN
- cmp $0x10, LEN
- jb .Lenc_out
-
/* load the state: */
movdqu 0x00(STATEP), STATE0
movdqu 0x10(STATEP), STATE1
movdqu 0x20(STATEP), STATE2
movdqu 0x30(STATEP), STATE3
@@ -457,28 +452,26 @@ SYM_FUNC_END(aegis128_aesni_enc_tail)
aegis128_update
pxor MSG, \s4
sub $0x10, LEN
- cmp $0x10, LEN
- jl .Ldec_out_\i
+ jz .Ldec_out_\i
.endm
/*
* void aegis128_aesni_dec(struct aegis_state *state, const u8 *src, u8 *dst,
* unsigned int len);
+ *
+ * len must be nonzero and a multiple of 16.
*/
SYM_FUNC_START(aegis128_aesni_dec)
.set STATEP, %rdi
.set SRC, %rsi
.set DST, %rdx
.set LEN, %ecx
FRAME_BEGIN
- cmp $0x10, LEN
- jb .Ldec_out
-
/* load the state: */
movdqu 0x00(STATEP), STATE0
movdqu 0x10(STATEP), STATE1
movdqu 0x20(STATEP), STATE2
movdqu 0x30(STATEP), STATE3
diff --git a/arch/x86/crypto/aegis128-aesni-glue.c b/arch/x86/crypto/aegis128-aesni-glue.c
index 739d92c85790..32a42a7dcd3b 100644
--- a/arch/x86/crypto/aegis128-aesni-glue.c
+++ b/arch/x86/crypto/aegis128-aesni-glue.c
@@ -85,12 +85,12 @@ static void crypto_aegis128_aesni_process_ad(
pos = 0;
left -= fill;
src += fill;
}
- aegis128_aesni_ad(state, src, left);
-
+ aegis128_aesni_ad(state, src,
+ left & ~(AEGIS128_BLOCK_SIZE - 1));
src += left & ~(AEGIS128_BLOCK_SIZE - 1);
left &= AEGIS128_BLOCK_SIZE - 1;
}
memcpy(buf.bytes + pos, src, left);
--
2.46.2
* [PATCH 09/10] crypto: x86/aegis128 - remove unneeded FRAME_BEGIN and FRAME_END
From: Eric Biggers @ 2024-10-07 1:24 UTC
To: linux-crypto; +Cc: x86, Ondrej Mosnacek
From: Eric Biggers <ebiggers@google.com>
Stop using FRAME_BEGIN and FRAME_END in the AEGIS assembly functions,
since all these functions are now leaf functions. This eliminates some
unnecessary instructions.
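For context, with CONFIG_FRAME_POINTER=y these macros from asm/frame.h
expand to roughly the following (and to nothing otherwise):

	push	%rbp		/* FRAME_BEGIN */
	mov	%rsp, %rbp
	/* ... function body ... */
	pop	%rbp		/* FRAME_END */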
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
arch/x86/crypto/aegis128-aesni-asm.S | 34 ----------------------------
1 file changed, 34 deletions(-)
diff --git a/arch/x86/crypto/aegis128-aesni-asm.S b/arch/x86/crypto/aegis128-aesni-asm.S
index 583e4515e1f1..e025c6bfadbd 100644
--- a/arch/x86/crypto/aegis128-aesni-asm.S
+++ b/arch/x86/crypto/aegis128-aesni-asm.S
@@ -6,11 +6,10 @@
* Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
* Copyright 2024 Google LLC
*/
#include <linux/linkage.h>
-#include <asm/frame.h>
#define STATE0 %xmm0
#define STATE1 %xmm1
#define STATE2 %xmm2
#define STATE3 %xmm3
@@ -144,11 +143,10 @@
*/
SYM_FUNC_START(aegis128_aesni_init)
.set STATEP, %rdi
.set KEYP, %rsi
.set IVP, %rdx
- FRAME_BEGIN
/* load IV: */
movdqu (IVP), T1
/* load key: */
@@ -180,12 +178,10 @@ SYM_FUNC_START(aegis128_aesni_init)
movdqu STATE0, 0x00(STATEP)
movdqu STATE1, 0x10(STATEP)
movdqu STATE2, 0x20(STATEP)
movdqu STATE3, 0x30(STATEP)
movdqu STATE4, 0x40(STATEP)
-
- FRAME_END
RET
SYM_FUNC_END(aegis128_aesni_init)
/*
* void aegis128_aesni_ad(struct aegis_state *state, const u8 *data,
@@ -195,11 +191,10 @@ SYM_FUNC_END(aegis128_aesni_init)
*/
SYM_FUNC_START(aegis128_aesni_ad)
.set STATEP, %rdi
.set SRC, %rsi
.set LEN, %edx
- FRAME_BEGIN
test LEN, LEN
jz .Lad_out
/* load the state: */
@@ -249,51 +244,45 @@ SYM_FUNC_START(aegis128_aesni_ad)
movdqu STATE0, 0x00(STATEP)
movdqu STATE1, 0x10(STATEP)
movdqu STATE2, 0x20(STATEP)
movdqu STATE3, 0x30(STATEP)
movdqu STATE4, 0x40(STATEP)
- FRAME_END
RET
.Lad_out_1:
movdqu STATE4, 0x00(STATEP)
movdqu STATE0, 0x10(STATEP)
movdqu STATE1, 0x20(STATEP)
movdqu STATE2, 0x30(STATEP)
movdqu STATE3, 0x40(STATEP)
- FRAME_END
RET
.Lad_out_2:
movdqu STATE3, 0x00(STATEP)
movdqu STATE4, 0x10(STATEP)
movdqu STATE0, 0x20(STATEP)
movdqu STATE1, 0x30(STATEP)
movdqu STATE2, 0x40(STATEP)
- FRAME_END
RET
.Lad_out_3:
movdqu STATE2, 0x00(STATEP)
movdqu STATE3, 0x10(STATEP)
movdqu STATE4, 0x20(STATEP)
movdqu STATE0, 0x30(STATEP)
movdqu STATE1, 0x40(STATEP)
- FRAME_END
RET
.Lad_out_4:
movdqu STATE1, 0x00(STATEP)
movdqu STATE2, 0x10(STATEP)
movdqu STATE3, 0x20(STATEP)
movdqu STATE4, 0x30(STATEP)
movdqu STATE0, 0x40(STATEP)
- FRAME_END
RET
.Lad_out:
- FRAME_END
RET
SYM_FUNC_END(aegis128_aesni_ad)
.macro encrypt_block s0 s1 s2 s3 s4 i
movdqu (\i * 0x10)(SRC), MSG
@@ -321,11 +310,10 @@ SYM_FUNC_END(aegis128_aesni_ad)
SYM_FUNC_START(aegis128_aesni_enc)
.set STATEP, %rdi
.set SRC, %rsi
.set DST, %rdx
.set LEN, %ecx
- FRAME_BEGIN
/* load the state: */
movdqu 0x00(STATEP), STATE0
movdqu 0x10(STATEP), STATE1
movdqu 0x20(STATEP), STATE2
@@ -349,51 +337,45 @@ SYM_FUNC_START(aegis128_aesni_enc)
movdqu STATE4, 0x00(STATEP)
movdqu STATE0, 0x10(STATEP)
movdqu STATE1, 0x20(STATEP)
movdqu STATE2, 0x30(STATEP)
movdqu STATE3, 0x40(STATEP)
- FRAME_END
RET
.Lenc_out_1:
movdqu STATE3, 0x00(STATEP)
movdqu STATE4, 0x10(STATEP)
movdqu STATE0, 0x20(STATEP)
movdqu STATE1, 0x30(STATEP)
movdqu STATE2, 0x40(STATEP)
- FRAME_END
RET
.Lenc_out_2:
movdqu STATE2, 0x00(STATEP)
movdqu STATE3, 0x10(STATEP)
movdqu STATE4, 0x20(STATEP)
movdqu STATE0, 0x30(STATEP)
movdqu STATE1, 0x40(STATEP)
- FRAME_END
RET
.Lenc_out_3:
movdqu STATE1, 0x00(STATEP)
movdqu STATE2, 0x10(STATEP)
movdqu STATE3, 0x20(STATEP)
movdqu STATE4, 0x30(STATEP)
movdqu STATE0, 0x40(STATEP)
- FRAME_END
RET
.Lenc_out_4:
movdqu STATE0, 0x00(STATEP)
movdqu STATE1, 0x10(STATEP)
movdqu STATE2, 0x20(STATEP)
movdqu STATE3, 0x30(STATEP)
movdqu STATE4, 0x40(STATEP)
- FRAME_END
RET
.Lenc_out:
- FRAME_END
RET
SYM_FUNC_END(aegis128_aesni_enc)
/*
* void aegis128_aesni_enc_tail(struct aegis_state *state, const u8 *src,
@@ -402,11 +384,10 @@ SYM_FUNC_END(aegis128_aesni_enc)
SYM_FUNC_START(aegis128_aesni_enc_tail)
.set STATEP, %rdi
.set SRC, %rsi
.set DST, %rdx
.set LEN, %ecx /* {load,store}_partial rely on this being %ecx */
- FRAME_BEGIN
/* load the state: */
movdqu 0x00(STATEP), STATE0
movdqu 0x10(STATEP), STATE1
movdqu 0x20(STATEP), STATE2
@@ -434,12 +415,10 @@ SYM_FUNC_START(aegis128_aesni_enc_tail)
movdqu STATE4, 0x00(STATEP)
movdqu STATE0, 0x10(STATEP)
movdqu STATE1, 0x20(STATEP)
movdqu STATE2, 0x30(STATEP)
movdqu STATE3, 0x40(STATEP)
-
- FRAME_END
RET
SYM_FUNC_END(aegis128_aesni_enc_tail)
.macro decrypt_block s0 s1 s2 s3 s4 i
movdqu (\i * 0x10)(SRC), MSG
@@ -466,11 +445,10 @@ SYM_FUNC_END(aegis128_aesni_enc_tail)
SYM_FUNC_START(aegis128_aesni_dec)
.set STATEP, %rdi
.set SRC, %rsi
.set DST, %rdx
.set LEN, %ecx
- FRAME_BEGIN
/* load the state: */
movdqu 0x00(STATEP), STATE0
movdqu 0x10(STATEP), STATE1
movdqu 0x20(STATEP), STATE2
@@ -494,51 +472,45 @@ SYM_FUNC_START(aegis128_aesni_dec)
movdqu STATE4, 0x00(STATEP)
movdqu STATE0, 0x10(STATEP)
movdqu STATE1, 0x20(STATEP)
movdqu STATE2, 0x30(STATEP)
movdqu STATE3, 0x40(STATEP)
- FRAME_END
RET
.Ldec_out_1:
movdqu STATE3, 0x00(STATEP)
movdqu STATE4, 0x10(STATEP)
movdqu STATE0, 0x20(STATEP)
movdqu STATE1, 0x30(STATEP)
movdqu STATE2, 0x40(STATEP)
- FRAME_END
RET
.Ldec_out_2:
movdqu STATE2, 0x00(STATEP)
movdqu STATE3, 0x10(STATEP)
movdqu STATE4, 0x20(STATEP)
movdqu STATE0, 0x30(STATEP)
movdqu STATE1, 0x40(STATEP)
- FRAME_END
RET
.Ldec_out_3:
movdqu STATE1, 0x00(STATEP)
movdqu STATE2, 0x10(STATEP)
movdqu STATE3, 0x20(STATEP)
movdqu STATE4, 0x30(STATEP)
movdqu STATE0, 0x40(STATEP)
- FRAME_END
RET
.Ldec_out_4:
movdqu STATE0, 0x00(STATEP)
movdqu STATE1, 0x10(STATEP)
movdqu STATE2, 0x20(STATEP)
movdqu STATE3, 0x30(STATEP)
movdqu STATE4, 0x40(STATEP)
- FRAME_END
RET
.Ldec_out:
- FRAME_END
RET
SYM_FUNC_END(aegis128_aesni_dec)
/*
* void aegis128_aesni_dec_tail(struct aegis_state *state, const u8 *src,
@@ -547,11 +519,10 @@ SYM_FUNC_END(aegis128_aesni_dec)
SYM_FUNC_START(aegis128_aesni_dec_tail)
.set STATEP, %rdi
.set SRC, %rsi
.set DST, %rdx
.set LEN, %ecx /* {load,store}_partial rely on this being %ecx */
- FRAME_BEGIN
/* load the state: */
movdqu 0x00(STATEP), STATE0
movdqu 0x10(STATEP), STATE1
movdqu 0x20(STATEP), STATE2
@@ -584,12 +555,10 @@ SYM_FUNC_START(aegis128_aesni_dec_tail)
movdqu STATE4, 0x00(STATEP)
movdqu STATE0, 0x10(STATEP)
movdqu STATE1, 0x20(STATEP)
movdqu STATE2, 0x30(STATEP)
movdqu STATE3, 0x40(STATEP)
-
- FRAME_END
RET
SYM_FUNC_END(aegis128_aesni_dec_tail)
/*
* void aegis128_aesni_final(struct aegis_state *state,
@@ -599,11 +568,10 @@ SYM_FUNC_END(aegis128_aesni_dec_tail)
SYM_FUNC_START(aegis128_aesni_final)
.set STATEP, %rdi
.set TAG_XOR, %rsi
.set ASSOCLEN, %edx
.set CRYPTLEN, %ecx
- FRAME_BEGIN
/* load the state: */
movdqu 0x00(STATEP), STATE0
movdqu 0x10(STATEP), STATE1
movdqu 0x20(STATEP), STATE2
@@ -634,9 +602,7 @@ SYM_FUNC_START(aegis128_aesni_final)
pxor STATE2, MSG
pxor STATE3, MSG
pxor STATE4, MSG
movdqu MSG, (TAG_XOR)
-
- FRAME_END
RET
SYM_FUNC_END(aegis128_aesni_final)
--
2.46.2
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH 10/10] crypto: x86/aegis128 - remove unneeded RETs
2024-10-07 1:24 [PATCH 00/10] AEGIS x86 assembly tuning Eric Biggers
` (8 preceding siblings ...)
2024-10-07 1:24 ` [PATCH 09/10] crypto: x86/aegis128 - remove unneeded FRAME_BEGIN and FRAME_END Eric Biggers
@ 2024-10-07 1:24 ` Eric Biggers
2024-10-15 12:48 ` [PATCH 00/10] AEGIS x86 assembly tuning Ondrej Mosnacek
10 siblings, 0 replies; 14+ messages in thread
From: Eric Biggers @ 2024-10-07 1:24 UTC (permalink / raw)
To: linux-crypto; +Cc: x86, Ondrej Mosnacek
From: Eric Biggers <ebiggers@google.com>
Remove returns that are immediately followed by another return.
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
arch/x86/crypto/aegis128-aesni-asm.S | 6 ------
1 file changed, 6 deletions(-)
diff --git a/arch/x86/crypto/aegis128-aesni-asm.S b/arch/x86/crypto/aegis128-aesni-asm.S
index e025c6bfadbd..c899948d24c9 100644
--- a/arch/x86/crypto/aegis128-aesni-asm.S
+++ b/arch/x86/crypto/aegis128-aesni-asm.S
@@ -276,12 +276,10 @@ SYM_FUNC_START(aegis128_aesni_ad)
movdqu STATE1, 0x00(STATEP)
movdqu STATE2, 0x10(STATEP)
movdqu STATE3, 0x20(STATEP)
movdqu STATE4, 0x30(STATEP)
movdqu STATE0, 0x40(STATEP)
- RET
-
.Lad_out:
RET
SYM_FUNC_END(aegis128_aesni_ad)
.macro encrypt_block s0 s1 s2 s3 s4 i
@@ -369,12 +367,10 @@ SYM_FUNC_START(aegis128_aesni_enc)
movdqu STATE0, 0x00(STATEP)
movdqu STATE1, 0x10(STATEP)
movdqu STATE2, 0x20(STATEP)
movdqu STATE3, 0x30(STATEP)
movdqu STATE4, 0x40(STATEP)
- RET
-
.Lenc_out:
RET
SYM_FUNC_END(aegis128_aesni_enc)
/*
@@ -504,12 +500,10 @@ SYM_FUNC_START(aegis128_aesni_dec)
movdqu STATE0, 0x00(STATEP)
movdqu STATE1, 0x10(STATEP)
movdqu STATE2, 0x20(STATEP)
movdqu STATE3, 0x30(STATEP)
movdqu STATE4, 0x40(STATEP)
- RET
-
.Ldec_out:
RET
SYM_FUNC_END(aegis128_aesni_dec)
/*
--
2.46.2
^ permalink raw reply related [flat|nested] 14+ messages in thread
* Re: [PATCH 03/10] crypto: x86/aegis128 - eliminate some indirect calls
2024-10-07 1:24 ` [PATCH 03/10] crypto: x86/aegis128 - eliminate some indirect calls Eric Biggers
@ 2024-10-15 12:41 ` Ondrej Mosnacek
2024-10-15 15:43 ` Eric Biggers
0 siblings, 1 reply; 14+ messages in thread
From: Ondrej Mosnacek @ 2024-10-15 12:41 UTC (permalink / raw)
To: Eric Biggers; +Cc: linux-crypto, x86
On Mon, Oct 7, 2024 at 3:33 AM Eric Biggers <ebiggers@kernel.org> wrote:
>
> From: Eric Biggers <ebiggers@google.com>
>
> Instead of using a struct of function pointers to decide whether to call
> the encryption or decryption assembly functions, use a conditional
> branch on a bool. Force-inline the functions to avoid actually
> generating the branch. This improves performance slightly since
> indirect calls are slow. Remove the now-unnecessary CFI stubs.
Wouldn't the compiler be able to optimize out the indirect calls
already if you merely force-inline the functions without the other
changes? Then again, it's just a few places that grow the if-else, so
I'm fine with the boolean approach, too.
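To make the two dispatch styles concrete, here is a minimal user-space
sketch (hypothetical names, not the actual aegis128 symbols):

    #include <stdbool.h>
    #include <stdio.h>

    static void enc_blocks(const char *src) { printf("enc %s\n", src); }
    static void dec_blocks(const char *src) { printf("dec %s\n", src); }

    /* Old style: dispatch through a struct of function pointers.
     * Whether ops->crypt() becomes a direct call depends on the
     * optimizer proving the pointer constant after inlining --
     * likely, but not guaranteed. */
    struct crypt_ops {
            void (*crypt)(const char *src);
    };

    static inline __attribute__((always_inline))
    void process_ops(const struct crypt_ops *ops, const char *src)
    {
            ops->crypt(src);        /* indirect unless devirtualized */
    }

    /* New style: dispatch on a bool. With a compile-time-constant
     * 'enc' and the wrapper force-inlined, the branch folds away and
     * only a direct call remains. */
    static inline __attribute__((always_inline))
    void process_bool(bool enc, const char *src)
    {
            if (enc)
                    enc_blocks(src);        /* direct call */
            else
                    dec_blocks(src);        /* direct call */
    }

    int main(void)
    {
            static const struct crypt_ops enc_ops = { .crypt = enc_blocks };

            process_ops(&enc_ops, "data");  /* may or may not devirtualize */
            process_bool(true, "data");     /* branch folds at compile time */
            return 0;
    }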
>
> Signed-off-by: Eric Biggers <ebiggers@google.com>
> ---
> arch/x86/crypto/aegis128-aesni-asm.S | 9 ++--
> arch/x86/crypto/aegis128-aesni-glue.c | 74 +++++++++++++--------------
> 2 files changed, 40 insertions(+), 43 deletions(-)
>
> diff --git a/arch/x86/crypto/aegis128-aesni-asm.S b/arch/x86/crypto/aegis128-aesni-asm.S
> index 2de859173940..1b57558548c7 100644
> --- a/arch/x86/crypto/aegis128-aesni-asm.S
> +++ b/arch/x86/crypto/aegis128-aesni-asm.S
> @@ -5,11 +5,10 @@
> * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
> * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
> */
>
> #include <linux/linkage.h>
> -#include <linux/cfi_types.h>
> #include <asm/frame.h>
>
> #define STATE0 %xmm0
> #define STATE1 %xmm1
> #define STATE2 %xmm2
> @@ -401,11 +400,11 @@ SYM_FUNC_END(crypto_aegis128_aesni_ad)
>
> /*
> * void crypto_aegis128_aesni_enc(void *state, unsigned int length,
> * const void *src, void *dst);
> */
> -SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc)
> +SYM_FUNC_START(crypto_aegis128_aesni_enc)
> FRAME_BEGIN
>
> cmp $0x10, LEN
> jb .Lenc_out
>
> @@ -498,11 +497,11 @@ SYM_FUNC_END(crypto_aegis128_aesni_enc)
>
> /*
> * void crypto_aegis128_aesni_enc_tail(void *state, unsigned int length,
> * const void *src, void *dst);
> */
> -SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc_tail)
> +SYM_FUNC_START(crypto_aegis128_aesni_enc_tail)
> FRAME_BEGIN
>
> /* load the state: */
> movdqu 0x00(STATEP), STATE0
> movdqu 0x10(STATEP), STATE1
> @@ -555,11 +554,11 @@ SYM_FUNC_END(crypto_aegis128_aesni_enc_tail)
>
> /*
> * void crypto_aegis128_aesni_dec(void *state, unsigned int length,
> * const void *src, void *dst);
> */
> -SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec)
> +SYM_FUNC_START(crypto_aegis128_aesni_dec)
> FRAME_BEGIN
>
> cmp $0x10, LEN
> jb .Ldec_out
>
> @@ -652,11 +651,11 @@ SYM_FUNC_END(crypto_aegis128_aesni_dec)
>
> /*
> * void crypto_aegis128_aesni_dec_tail(void *state, unsigned int length,
> * const void *src, void *dst);
> */
> -SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec_tail)
> +SYM_FUNC_START(crypto_aegis128_aesni_dec_tail)
> FRAME_BEGIN
>
> /* load the state: */
> movdqu 0x00(STATEP), STATE0
> movdqu 0x10(STATEP), STATE1
> diff --git a/arch/x86/crypto/aegis128-aesni-glue.c b/arch/x86/crypto/aegis128-aesni-glue.c
> index 96586470154e..deb39cef0be1 100644
> --- a/arch/x86/crypto/aegis128-aesni-glue.c
> +++ b/arch/x86/crypto/aegis128-aesni-glue.c
> @@ -54,20 +54,10 @@ struct aegis_state {
>
> struct aegis_ctx {
> struct aegis_block key;
> };
>
> -struct aegis_crypt_ops {
> - int (*skcipher_walk_init)(struct skcipher_walk *walk,
> - struct aead_request *req, bool atomic);
> -
> - void (*crypt_blocks)(void *state, unsigned int length, const void *src,
> - void *dst);
> - void (*crypt_tail)(void *state, unsigned int length, const void *src,
> - void *dst);
> -};
> -
> static void crypto_aegis128_aesni_process_ad(
> struct aegis_state *state, struct scatterlist *sg_src,
> unsigned int assoclen)
> {
> struct scatter_walk walk;
> @@ -112,24 +102,41 @@ static void crypto_aegis128_aesni_process_ad(
> memset(buf.bytes + pos, 0, AEGIS128_BLOCK_SIZE - pos);
> crypto_aegis128_aesni_ad(state, AEGIS128_BLOCK_SIZE, buf.bytes);
> }
> }
>
> -static void crypto_aegis128_aesni_process_crypt(
> - struct aegis_state *state, struct skcipher_walk *walk,
> - const struct aegis_crypt_ops *ops)
> +static __always_inline void
> +crypto_aegis128_aesni_process_crypt(struct aegis_state *state,
> + struct skcipher_walk *walk, bool enc)
> {
> while (walk->nbytes >= AEGIS128_BLOCK_SIZE) {
> - ops->crypt_blocks(state,
> - round_down(walk->nbytes, AEGIS128_BLOCK_SIZE),
> - walk->src.virt.addr, walk->dst.virt.addr);
> + if (enc)
> + crypto_aegis128_aesni_enc(
> + state,
> + round_down(walk->nbytes,
> + AEGIS128_BLOCK_SIZE),
> + walk->src.virt.addr,
> + walk->dst.virt.addr);
> + else
> + crypto_aegis128_aesni_dec(
> + state,
> + round_down(walk->nbytes,
> + AEGIS128_BLOCK_SIZE),
> + walk->src.virt.addr,
> + walk->dst.virt.addr);
> skcipher_walk_done(walk, walk->nbytes % AEGIS128_BLOCK_SIZE);
> }
>
> if (walk->nbytes) {
> - ops->crypt_tail(state, walk->nbytes, walk->src.virt.addr,
> - walk->dst.virt.addr);
> + if (enc)
> + crypto_aegis128_aesni_enc_tail(state, walk->nbytes,
> + walk->src.virt.addr,
> + walk->dst.virt.addr);
> + else
> + crypto_aegis128_aesni_dec_tail(state, walk->nbytes,
> + walk->src.virt.addr,
> + walk->dst.virt.addr);
> skcipher_walk_done(walk, 0);
> }
> }
>
> static struct aegis_ctx *crypto_aegis128_aesni_ctx(struct crypto_aead *aead)
> @@ -160,71 +167,62 @@ static int crypto_aegis128_aesni_setauthsize(struct crypto_aead *tfm,
> if (authsize < AEGIS128_MIN_AUTH_SIZE)
> return -EINVAL;
> return 0;
> }
>
> -static void crypto_aegis128_aesni_crypt(struct aead_request *req,
> - struct aegis_block *tag_xor,
> - unsigned int cryptlen,
> - const struct aegis_crypt_ops *ops)
> +static __always_inline void
> +crypto_aegis128_aesni_crypt(struct aead_request *req,
> + struct aegis_block *tag_xor,
> + unsigned int cryptlen, bool enc)
> {
> struct crypto_aead *tfm = crypto_aead_reqtfm(req);
> struct aegis_ctx *ctx = crypto_aegis128_aesni_ctx(tfm);
> struct skcipher_walk walk;
> struct aegis_state state;
>
> - ops->skcipher_walk_init(&walk, req, true);
> + if (enc)
> + skcipher_walk_aead_encrypt(&walk, req, true);
> + else
> + skcipher_walk_aead_decrypt(&walk, req, true);
>
> kernel_fpu_begin();
>
> crypto_aegis128_aesni_init(&state, ctx->key.bytes, req->iv);
> crypto_aegis128_aesni_process_ad(&state, req->src, req->assoclen);
> - crypto_aegis128_aesni_process_crypt(&state, &walk, ops);
> + crypto_aegis128_aesni_process_crypt(&state, &walk, enc);
> crypto_aegis128_aesni_final(&state, tag_xor, req->assoclen, cryptlen);
>
> kernel_fpu_end();
> }
>
> static int crypto_aegis128_aesni_encrypt(struct aead_request *req)
> {
> - static const struct aegis_crypt_ops OPS = {
> - .skcipher_walk_init = skcipher_walk_aead_encrypt,
> - .crypt_blocks = crypto_aegis128_aesni_enc,
> - .crypt_tail = crypto_aegis128_aesni_enc_tail,
> - };
> -
> struct crypto_aead *tfm = crypto_aead_reqtfm(req);
> struct aegis_block tag = {};
> unsigned int authsize = crypto_aead_authsize(tfm);
> unsigned int cryptlen = req->cryptlen;
>
> - crypto_aegis128_aesni_crypt(req, &tag, cryptlen, &OPS);
> + crypto_aegis128_aesni_crypt(req, &tag, cryptlen, true);
>
> scatterwalk_map_and_copy(tag.bytes, req->dst,
> req->assoclen + cryptlen, authsize, 1);
> return 0;
> }
>
> static int crypto_aegis128_aesni_decrypt(struct aead_request *req)
> {
> static const struct aegis_block zeros = {};
>
> - static const struct aegis_crypt_ops OPS = {
> - .skcipher_walk_init = skcipher_walk_aead_decrypt,
> - .crypt_blocks = crypto_aegis128_aesni_dec,
> - .crypt_tail = crypto_aegis128_aesni_dec_tail,
> - };
> -
> struct crypto_aead *tfm = crypto_aead_reqtfm(req);
> struct aegis_block tag;
> unsigned int authsize = crypto_aead_authsize(tfm);
> unsigned int cryptlen = req->cryptlen - authsize;
>
> scatterwalk_map_and_copy(tag.bytes, req->src,
> req->assoclen + cryptlen, authsize, 0);
>
> - crypto_aegis128_aesni_crypt(req, &tag, cryptlen, &OPS);
> + crypto_aegis128_aesni_crypt(req, &tag, cryptlen, false);
>
> return crypto_memneq(tag.bytes, zeros.bytes, authsize) ? -EBADMSG : 0;
> }
>
> static struct aead_alg crypto_aegis128_aesni_alg = {
> --
> 2.46.2
>
--
Ondrej Mosnacek
Senior Software Engineer, Linux Security - SELinux kernel
Red Hat, Inc.
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH 00/10] AEGIS x86 assembly tuning
2024-10-07 1:24 [PATCH 00/10] AEGIS x86 assembly tuning Eric Biggers
` (9 preceding siblings ...)
2024-10-07 1:24 ` [PATCH 10/10] crypto: x86/aegis128 - remove unneeded RETs Eric Biggers
@ 2024-10-15 12:48 ` Ondrej Mosnacek
10 siblings, 0 replies; 14+ messages in thread
From: Ondrej Mosnacek @ 2024-10-15 12:48 UTC (permalink / raw)
To: Eric Biggers; +Cc: linux-crypto, x86
On Mon, Oct 7, 2024 at 3:33 AM Eric Biggers <ebiggers@kernel.org> wrote:
>
> This series cleans up the AES-NI optimized implementation of AEGIS-128.
>
> Performance is improved by 1-5% depending on the input lengths. Binary
> code size is reduced by about 20% (measuring glue + assembly combined),
> and source code length is reduced by about 150 lines.
>
> The first patch also fixes a bug which could theoretically cause
> incorrect behavior but was seemingly not being encountered in practice.
>
> Note: future optimizations for AEGIS-128 could involve adding AVX512 /
> AVX10 optimized assembly code. However, unfortunately due to the way
> that AEGIS-128 is specified, its level of parallelism is limited, and it
> can't really take advantage of vector lengths greater than 128 bits.
> So, probably this would provide only another modest improvement, mostly
> coming from being able to use the ternary logic instructions.
>
> Eric Biggers (10):
> crypto: x86/aegis128 - access 32-bit arguments as 32-bit
> crypto: x86/aegis128 - remove no-op init and exit functions
> crypto: x86/aegis128 - eliminate some indirect calls
> crypto: x86/aegis128 - don't bother with special code for aligned data
> crypto: x86/aegis128 - optimize length block preparation using SSE4.1
> crypto: x86/aegis128 - improve assembly function prototypes
> crypto: x86/aegis128 - optimize partial block handling using SSE4.1
> crypto: x86/aegis128 - take advantage of block-aligned len
> crypto: x86/aegis128 - remove unneeded FRAME_BEGIN and FRAME_END
> crypto: x86/aegis128 - remove unneeded RETs
>
> arch/x86/crypto/Kconfig | 4 +-
> arch/x86/crypto/aegis128-aesni-asm.S | 532 ++++++++++----------------
> arch/x86/crypto/aegis128-aesni-glue.c | 145 ++++---
> 3 files changed, 261 insertions(+), 420 deletions(-)
>
>
> base-commit: 9852d85ec9d492ebef56dc5f229416c925758edc
> --
> 2.46.2
>
Nice work!
Notwithstanding my non-blocking comment on patch #3:
Reviewed-by: Ondrej Mosnacek <omosnace@redhat.com>
--
Ondrej Mosnacek
Senior Software Engineer, Linux Security - SELinux kernel
Red Hat, Inc.
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH 03/10] crypto: x86/aegis128 - eliminate some indirect calls
2024-10-15 12:41 ` Ondrej Mosnacek
@ 2024-10-15 15:43 ` Eric Biggers
0 siblings, 0 replies; 14+ messages in thread
From: Eric Biggers @ 2024-10-15 15:43 UTC (permalink / raw)
To: Ondrej Mosnacek; +Cc: linux-crypto, x86
On Tue, Oct 15, 2024 at 02:41:34PM +0200, Ondrej Mosnacek wrote:
> On Mon, Oct 7, 2024 at 3:33 AM Eric Biggers <ebiggers@kernel.org> wrote:
> >
> > From: Eric Biggers <ebiggers@google.com>
> >
> > Instead of using a struct of function pointers to decide whether to call
> > the encryption or decryption assembly functions, use a conditional
> > branch on a bool. Force-inline the functions to avoid actually
> > generating the branch. This improves performance slightly since
> > indirect calls are slow. Remove the now-unnecessary CFI stubs.
>
> Wouldn't the compiler be able to optimize out the indirect calls
> already if you merely force-inline the functions without the other
> changes? Then again, it's just a few places that grow the if-else, so
> I'm fine with the boolean approach, too.
There's no guarantee that the compiler will actually optimize out the indirect
calls that way.
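For example (a user-space sketch with hypothetical names), even with the
wrapper force-inlined, the call through the pointer only becomes direct
if the optimizer chooses to prove the pointer constant:

    static void some_fn(void) { }

    struct ops { void (*crypt)(void); };

    static inline __attribute__((always_inline))
    void do_crypt(const struct ops *ops)
    {
            ops->crypt();   /* inlined into the caller, still a load
                             * plus an indirect call as written */
    }

    void caller(void)
    {
            static const struct ops OPS = { .crypt = some_fn };

            /* Typically folded to a direct 'call some_fn' at -O2, but
             * that is a quality-of-implementation outcome, not
             * something the language or __always_inline guarantees
             * (compare 'gcc -O2 -S' against 'gcc -O0 -S'). */
            do_crypt(&OPS);
    }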
- Eric
^ permalink raw reply [flat|nested] 14+ messages in thread