linux-crypto.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v2 00/10] AEGIS x86 assembly tuning
@ 2024-10-17  0:00 Eric Biggers
  2024-10-17  0:00 ` [PATCH v2 01/10] crypto: x86/aegis128 - access 32-bit arguments as 32-bit Eric Biggers
                   ` (10 more replies)
  0 siblings, 11 replies; 12+ messages in thread
From: Eric Biggers @ 2024-10-17  0:00 UTC (permalink / raw)
  To: linux-crypto; +Cc: x86, Ondrej Mosnacek, linux-kernel

This series cleans up the AES-NI optimized implementation of AEGIS-128.

Performance is improved by 1-5% depending on the input lengths.  Binary
code size is reduced by about 20% (measuring glue + assembly combined),
and source code length is reduced by about 150 lines.

The first patch also fixes a bug that could theoretically cause
incorrect behavior but has seemingly not been encountered in practice.

Note: future optimizations for AEGIS-128 could involve adding AVX512 /
AVX10 optimized assembly code.  However, unfortunately due to the way
that AEGIS-128 is specified, its level of parallelism is limited, and it
can't really take advantage of vector lengths greater than 128 bits.
So this would probably provide only another modest improvement, mostly
coming from being able to use the ternary logic instructions.

Changed in v2:
- Put assoclen and cryptlen in the correct order in the prototype of
  aegis128_aesni_final().
- Expanded commit message of "eliminate some indirect calls"
- Added Ondrej's Reviewed-by.

Eric Biggers (10):
  crypto: x86/aegis128 - access 32-bit arguments as 32-bit
  crypto: x86/aegis128 - remove no-op init and exit functions
  crypto: x86/aegis128 - eliminate some indirect calls
  crypto: x86/aegis128 - don't bother with special code for aligned data
  crypto: x86/aegis128 - optimize length block preparation using SSE4.1
  crypto: x86/aegis128 - improve assembly function prototypes
  crypto: x86/aegis128 - optimize partial block handling using SSE4.1
  crypto: x86/aegis128 - take advantage of block-aligned len
  crypto: x86/aegis128 - remove unneeded FRAME_BEGIN and FRAME_END
  crypto: x86/aegis128 - remove unneeded RETs

 arch/x86/crypto/Kconfig               |   4 +-
 arch/x86/crypto/aegis128-aesni-asm.S  | 532 ++++++++++----------------
 arch/x86/crypto/aegis128-aesni-glue.c | 145 ++++---
 3 files changed, 261 insertions(+), 420 deletions(-)

base-commit: 5c20772738e1d1d7bec41664eb9d61497e53c10e
-- 
2.47.0


^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH v2 01/10] crypto: x86/aegis128 - access 32-bit arguments as 32-bit
  2024-10-17  0:00 [PATCH v2 00/10] AEGIS x86 assembly tuning Eric Biggers
@ 2024-10-17  0:00 ` Eric Biggers
  2024-10-17  0:00 ` [PATCH v2 02/10] crypto: x86/aegis128 - remove no-op init and exit functions Eric Biggers
                   ` (9 subsequent siblings)
  10 siblings, 0 replies; 12+ messages in thread
From: Eric Biggers @ 2024-10-17  0:00 UTC (permalink / raw)
  To: linux-crypto; +Cc: x86, Ondrej Mosnacek, linux-kernel, stable

From: Eric Biggers <ebiggers@google.com>

Fix the AEGIS assembly code to access 'unsigned int' arguments as 32-bit
values instead of 64-bit, since the upper bits of the corresponding
64-bit registers are not guaranteed to be zero.

Note: there haven't been any reports of this bug actually causing
incorrect behavior.  Neither gcc nor clang guarantees zero-extension to
64 bits, but zero-extension is likely to happen in practice because most
instructions that operate on 32-bit registers zero-extend to 64 bits.

Fixes: 1d373d4e8e15 ("crypto: x86 - Add optimized AEGIS implementations")
Cc: stable@vger.kernel.org
Reviewed-by: Ondrej Mosnacek <omosnace@redhat.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 arch/x86/crypto/aegis128-aesni-asm.S | 29 ++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/arch/x86/crypto/aegis128-aesni-asm.S b/arch/x86/crypto/aegis128-aesni-asm.S
index ad7f4c8916256..2de859173940e 100644
--- a/arch/x86/crypto/aegis128-aesni-asm.S
+++ b/arch/x86/crypto/aegis128-aesni-asm.S
@@ -19,11 +19,11 @@
 #define MSG	%xmm5
 #define T0	%xmm6
 #define T1	%xmm7
 
 #define STATEP	%rdi
-#define LEN	%rsi
+#define LEN	%esi
 #define SRC	%rdx
 #define DST	%rcx
 
 .section .rodata.cst16.aegis128_const, "aM", @progbits, 32
 .align 16
@@ -74,50 +74,50 @@
  */
 SYM_FUNC_START_LOCAL(__load_partial)
 	xor %r9d, %r9d
 	pxor MSG, MSG
 
-	mov LEN, %r8
+	mov LEN, %r8d
 	and $0x1, %r8
 	jz .Lld_partial_1
 
-	mov LEN, %r8
+	mov LEN, %r8d
 	and $0x1E, %r8
 	add SRC, %r8
 	mov (%r8), %r9b
 
 .Lld_partial_1:
-	mov LEN, %r8
+	mov LEN, %r8d
 	and $0x2, %r8
 	jz .Lld_partial_2
 
-	mov LEN, %r8
+	mov LEN, %r8d
 	and $0x1C, %r8
 	add SRC, %r8
 	shl $0x10, %r9
 	mov (%r8), %r9w
 
 .Lld_partial_2:
-	mov LEN, %r8
+	mov LEN, %r8d
 	and $0x4, %r8
 	jz .Lld_partial_4
 
-	mov LEN, %r8
+	mov LEN, %r8d
 	and $0x18, %r8
 	add SRC, %r8
 	shl $32, %r9
 	mov (%r8), %r8d
 	xor %r8, %r9
 
 .Lld_partial_4:
 	movq %r9, MSG
 
-	mov LEN, %r8
+	mov LEN, %r8d
 	and $0x8, %r8
 	jz .Lld_partial_8
 
-	mov LEN, %r8
+	mov LEN, %r8d
 	and $0x10, %r8
 	add SRC, %r8
 	pslldq $8, MSG
 	movq (%r8), T0
 	pxor T0, MSG
@@ -137,11 +137,11 @@ SYM_FUNC_END(__load_partial)
  *   %r8
  *   %r9
  *   %r10
  */
 SYM_FUNC_START_LOCAL(__store_partial)
-	mov LEN, %r8
+	mov LEN, %r8d
 	mov DST, %r9
 
 	movq T0, %r10
 
 	cmp $8, %r8
@@ -675,11 +675,11 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec_tail)
 
 	movdqa MSG, T0
 	call __store_partial
 
 	/* mask with byte count: */
-	movq LEN, T0
+	movd LEN, T0
 	punpcklbw T0, T0
 	punpcklbw T0, T0
 	punpcklbw T0, T0
 	punpcklbw T0, T0
 	movdqa .Laegis128_counter(%rip), T1
@@ -700,11 +700,12 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec_tail)
 	RET
 SYM_FUNC_END(crypto_aegis128_aesni_dec_tail)
 
 /*
  * void crypto_aegis128_aesni_final(void *state, void *tag_xor,
- *                                  u64 assoclen, u64 cryptlen);
+ *                                  unsigned int assoclen,
+ *                                  unsigned int cryptlen);
  */
 SYM_FUNC_START(crypto_aegis128_aesni_final)
 	FRAME_BEGIN
 
 	/* load the state: */
@@ -713,12 +714,12 @@ SYM_FUNC_START(crypto_aegis128_aesni_final)
 	movdqu 0x20(STATEP), STATE2
 	movdqu 0x30(STATEP), STATE3
 	movdqu 0x40(STATEP), STATE4
 
 	/* prepare length block: */
-	movq %rdx, MSG
-	movq %rcx, T0
+	movd %edx, MSG
+	movd %ecx, T0
 	pslldq $8, T0
 	pxor T0, MSG
 	psllq $3, MSG /* multiply by 8 (to get bit count) */
 
 	pxor STATE3, MSG
-- 
2.47.0


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH v2 02/10] crypto: x86/aegis128 - remove no-op init and exit functions
  2024-10-17  0:00 [PATCH v2 00/10] AEGIS x86 assembly tuning Eric Biggers
  2024-10-17  0:00 ` [PATCH v2 01/10] crypto: x86/aegis128 - access 32-bit arguments as 32-bit Eric Biggers
@ 2024-10-17  0:00 ` Eric Biggers
  2024-10-17  0:00 ` [PATCH v2 03/10] crypto: x86/aegis128 - eliminate some indirect calls Eric Biggers
                   ` (8 subsequent siblings)
  10 siblings, 0 replies; 12+ messages in thread
From: Eric Biggers @ 2024-10-17  0:00 UTC (permalink / raw)
  To: linux-crypto; +Cc: x86, Ondrej Mosnacek, linux-kernel

From: Eric Biggers <ebiggers@google.com>

Don't bother providing empty stubs for the init and exit methods in
struct aead_alg, since they are optional anyway.

Reviewed-by: Ondrej Mosnacek <omosnace@redhat.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 arch/x86/crypto/aegis128-aesni-glue.c | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/arch/x86/crypto/aegis128-aesni-glue.c b/arch/x86/crypto/aegis128-aesni-glue.c
index 4623189000d89..96586470154e0 100644
--- a/arch/x86/crypto/aegis128-aesni-glue.c
+++ b/arch/x86/crypto/aegis128-aesni-glue.c
@@ -225,26 +225,15 @@ static int crypto_aegis128_aesni_decrypt(struct aead_request *req)
 	crypto_aegis128_aesni_crypt(req, &tag, cryptlen, &OPS);
 
 	return crypto_memneq(tag.bytes, zeros.bytes, authsize) ? -EBADMSG : 0;
 }
 
-static int crypto_aegis128_aesni_init_tfm(struct crypto_aead *aead)
-{
-	return 0;
-}
-
-static void crypto_aegis128_aesni_exit_tfm(struct crypto_aead *aead)
-{
-}
-
 static struct aead_alg crypto_aegis128_aesni_alg = {
 	.setkey = crypto_aegis128_aesni_setkey,
 	.setauthsize = crypto_aegis128_aesni_setauthsize,
 	.encrypt = crypto_aegis128_aesni_encrypt,
 	.decrypt = crypto_aegis128_aesni_decrypt,
-	.init = crypto_aegis128_aesni_init_tfm,
-	.exit = crypto_aegis128_aesni_exit_tfm,
 
 	.ivsize = AEGIS128_NONCE_SIZE,
 	.maxauthsize = AEGIS128_MAX_AUTH_SIZE,
 	.chunksize = AEGIS128_BLOCK_SIZE,
 
-- 
2.47.0


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH v2 03/10] crypto: x86/aegis128 - eliminate some indirect calls
  2024-10-17  0:00 [PATCH v2 00/10] AEGIS x86 assembly tuning Eric Biggers
  2024-10-17  0:00 ` [PATCH v2 01/10] crypto: x86/aegis128 - access 32-bit arguments as 32-bit Eric Biggers
  2024-10-17  0:00 ` [PATCH v2 02/10] crypto: x86/aegis128 - remove no-op init and exit functions Eric Biggers
@ 2024-10-17  0:00 ` Eric Biggers
  2024-10-17  0:00 ` [PATCH v2 04/10] crypto: x86/aegis128 - don't bother with special code for aligned data Eric Biggers
                   ` (7 subsequent siblings)
  10 siblings, 0 replies; 12+ messages in thread
From: Eric Biggers @ 2024-10-17  0:00 UTC (permalink / raw)
  To: linux-crypto; +Cc: x86, Ondrej Mosnacek, linux-kernel

From: Eric Biggers <ebiggers@google.com>

Instead of using a struct of function pointers to decide whether to call
the encryption or decryption assembly functions, use a conditional
branch on a bool.  Force-inline the functions to avoid actually
generating the branch.  This improves performance slightly since
indirect calls are slow.  Remove the now-unnecessary CFI stubs.

Note that just force-inlining the existing functions might cause the
compiler to optimize out the indirect branches, but that would not be a
reliable way to do it and the CFI stubs would still be required.

Reviewed-by: Ondrej Mosnacek <omosnace@redhat.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 arch/x86/crypto/aegis128-aesni-asm.S  |  9 ++--
 arch/x86/crypto/aegis128-aesni-glue.c | 74 +++++++++++++--------------
 2 files changed, 40 insertions(+), 43 deletions(-)

diff --git a/arch/x86/crypto/aegis128-aesni-asm.S b/arch/x86/crypto/aegis128-aesni-asm.S
index 2de859173940e..1b57558548c78 100644
--- a/arch/x86/crypto/aegis128-aesni-asm.S
+++ b/arch/x86/crypto/aegis128-aesni-asm.S
@@ -5,11 +5,10 @@
  * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
  * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
  */
 
 #include <linux/linkage.h>
-#include <linux/cfi_types.h>
 #include <asm/frame.h>
 
 #define STATE0	%xmm0
 #define STATE1	%xmm1
 #define STATE2	%xmm2
@@ -401,11 +400,11 @@ SYM_FUNC_END(crypto_aegis128_aesni_ad)
 
 /*
  * void crypto_aegis128_aesni_enc(void *state, unsigned int length,
  *                                const void *src, void *dst);
  */
-SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc)
+SYM_FUNC_START(crypto_aegis128_aesni_enc)
 	FRAME_BEGIN
 
 	cmp $0x10, LEN
 	jb .Lenc_out
 
@@ -498,11 +497,11 @@ SYM_FUNC_END(crypto_aegis128_aesni_enc)
 
 /*
  * void crypto_aegis128_aesni_enc_tail(void *state, unsigned int length,
  *                                     const void *src, void *dst);
  */
-SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc_tail)
+SYM_FUNC_START(crypto_aegis128_aesni_enc_tail)
 	FRAME_BEGIN
 
 	/* load the state: */
 	movdqu 0x00(STATEP), STATE0
 	movdqu 0x10(STATEP), STATE1
@@ -555,11 +554,11 @@ SYM_FUNC_END(crypto_aegis128_aesni_enc_tail)
 
 /*
  * void crypto_aegis128_aesni_dec(void *state, unsigned int length,
  *                                const void *src, void *dst);
  */
-SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec)
+SYM_FUNC_START(crypto_aegis128_aesni_dec)
 	FRAME_BEGIN
 
 	cmp $0x10, LEN
 	jb .Ldec_out
 
@@ -652,11 +651,11 @@ SYM_FUNC_END(crypto_aegis128_aesni_dec)
 
 /*
  * void crypto_aegis128_aesni_dec_tail(void *state, unsigned int length,
  *                                     const void *src, void *dst);
  */
-SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec_tail)
+SYM_FUNC_START(crypto_aegis128_aesni_dec_tail)
 	FRAME_BEGIN
 
 	/* load the state: */
 	movdqu 0x00(STATEP), STATE0
 	movdqu 0x10(STATEP), STATE1
diff --git a/arch/x86/crypto/aegis128-aesni-glue.c b/arch/x86/crypto/aegis128-aesni-glue.c
index 96586470154e0..deb39cef0be1a 100644
--- a/arch/x86/crypto/aegis128-aesni-glue.c
+++ b/arch/x86/crypto/aegis128-aesni-glue.c
@@ -54,20 +54,10 @@ struct aegis_state {
 
 struct aegis_ctx {
 	struct aegis_block key;
 };
 
-struct aegis_crypt_ops {
-	int (*skcipher_walk_init)(struct skcipher_walk *walk,
-				  struct aead_request *req, bool atomic);
-
-	void (*crypt_blocks)(void *state, unsigned int length, const void *src,
-			     void *dst);
-	void (*crypt_tail)(void *state, unsigned int length, const void *src,
-			   void *dst);
-};
-
 static void crypto_aegis128_aesni_process_ad(
 		struct aegis_state *state, struct scatterlist *sg_src,
 		unsigned int assoclen)
 {
 	struct scatter_walk walk;
@@ -112,24 +102,41 @@ static void crypto_aegis128_aesni_process_ad(
 		memset(buf.bytes + pos, 0, AEGIS128_BLOCK_SIZE - pos);
 		crypto_aegis128_aesni_ad(state, AEGIS128_BLOCK_SIZE, buf.bytes);
 	}
 }
 
-static void crypto_aegis128_aesni_process_crypt(
-		struct aegis_state *state, struct skcipher_walk *walk,
-		const struct aegis_crypt_ops *ops)
+static __always_inline void
+crypto_aegis128_aesni_process_crypt(struct aegis_state *state,
+				    struct skcipher_walk *walk, bool enc)
 {
 	while (walk->nbytes >= AEGIS128_BLOCK_SIZE) {
-		ops->crypt_blocks(state,
-				  round_down(walk->nbytes, AEGIS128_BLOCK_SIZE),
-				  walk->src.virt.addr, walk->dst.virt.addr);
+		if (enc)
+			crypto_aegis128_aesni_enc(
+					state,
+					round_down(walk->nbytes,
+						   AEGIS128_BLOCK_SIZE),
+					walk->src.virt.addr,
+					walk->dst.virt.addr);
+		else
+			crypto_aegis128_aesni_dec(
+					state,
+					round_down(walk->nbytes,
+						   AEGIS128_BLOCK_SIZE),
+					walk->src.virt.addr,
+					walk->dst.virt.addr);
 		skcipher_walk_done(walk, walk->nbytes % AEGIS128_BLOCK_SIZE);
 	}
 
 	if (walk->nbytes) {
-		ops->crypt_tail(state, walk->nbytes, walk->src.virt.addr,
-				walk->dst.virt.addr);
+		if (enc)
+			crypto_aegis128_aesni_enc_tail(state, walk->nbytes,
+						       walk->src.virt.addr,
+						       walk->dst.virt.addr);
+		else
+			crypto_aegis128_aesni_dec_tail(state, walk->nbytes,
+						       walk->src.virt.addr,
+						       walk->dst.virt.addr);
 		skcipher_walk_done(walk, 0);
 	}
 }
 
 static struct aegis_ctx *crypto_aegis128_aesni_ctx(struct crypto_aead *aead)
@@ -160,71 +167,62 @@ static int crypto_aegis128_aesni_setauthsize(struct crypto_aead *tfm,
 	if (authsize < AEGIS128_MIN_AUTH_SIZE)
 		return -EINVAL;
 	return 0;
 }
 
-static void crypto_aegis128_aesni_crypt(struct aead_request *req,
-					struct aegis_block *tag_xor,
-					unsigned int cryptlen,
-					const struct aegis_crypt_ops *ops)
+static __always_inline void
+crypto_aegis128_aesni_crypt(struct aead_request *req,
+			    struct aegis_block *tag_xor,
+			    unsigned int cryptlen, bool enc)
 {
 	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
 	struct aegis_ctx *ctx = crypto_aegis128_aesni_ctx(tfm);
 	struct skcipher_walk walk;
 	struct aegis_state state;
 
-	ops->skcipher_walk_init(&walk, req, true);
+	if (enc)
+		skcipher_walk_aead_encrypt(&walk, req, true);
+	else
+		skcipher_walk_aead_decrypt(&walk, req, true);
 
 	kernel_fpu_begin();
 
 	crypto_aegis128_aesni_init(&state, ctx->key.bytes, req->iv);
 	crypto_aegis128_aesni_process_ad(&state, req->src, req->assoclen);
-	crypto_aegis128_aesni_process_crypt(&state, &walk, ops);
+	crypto_aegis128_aesni_process_crypt(&state, &walk, enc);
 	crypto_aegis128_aesni_final(&state, tag_xor, req->assoclen, cryptlen);
 
 	kernel_fpu_end();
 }
 
 static int crypto_aegis128_aesni_encrypt(struct aead_request *req)
 {
-	static const struct aegis_crypt_ops OPS = {
-		.skcipher_walk_init = skcipher_walk_aead_encrypt,
-		.crypt_blocks = crypto_aegis128_aesni_enc,
-		.crypt_tail = crypto_aegis128_aesni_enc_tail,
-	};
-
 	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
 	struct aegis_block tag = {};
 	unsigned int authsize = crypto_aead_authsize(tfm);
 	unsigned int cryptlen = req->cryptlen;
 
-	crypto_aegis128_aesni_crypt(req, &tag, cryptlen, &OPS);
+	crypto_aegis128_aesni_crypt(req, &tag, cryptlen, true);
 
 	scatterwalk_map_and_copy(tag.bytes, req->dst,
 				 req->assoclen + cryptlen, authsize, 1);
 	return 0;
 }
 
 static int crypto_aegis128_aesni_decrypt(struct aead_request *req)
 {
 	static const struct aegis_block zeros = {};
 
-	static const struct aegis_crypt_ops OPS = {
-		.skcipher_walk_init = skcipher_walk_aead_decrypt,
-		.crypt_blocks = crypto_aegis128_aesni_dec,
-		.crypt_tail = crypto_aegis128_aesni_dec_tail,
-	};
-
 	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
 	struct aegis_block tag;
 	unsigned int authsize = crypto_aead_authsize(tfm);
 	unsigned int cryptlen = req->cryptlen - authsize;
 
 	scatterwalk_map_and_copy(tag.bytes, req->src,
 				 req->assoclen + cryptlen, authsize, 0);
 
-	crypto_aegis128_aesni_crypt(req, &tag, cryptlen, &OPS);
+	crypto_aegis128_aesni_crypt(req, &tag, cryptlen, false);
 
 	return crypto_memneq(tag.bytes, zeros.bytes, authsize) ? -EBADMSG : 0;
 }
 
 static struct aead_alg crypto_aegis128_aesni_alg = {
-- 
2.47.0


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH v2 04/10] crypto: x86/aegis128 - don't bother with special code for aligned data
  2024-10-17  0:00 [PATCH v2 00/10] AEGIS x86 assembly tuning Eric Biggers
                   ` (2 preceding siblings ...)
  2024-10-17  0:00 ` [PATCH v2 03/10] crypto: x86/aegis128 - eliminate some indirect calls Eric Biggers
@ 2024-10-17  0:00 ` Eric Biggers
  2024-10-17  0:00 ` [PATCH v2 05/10] crypto: x86/aegis128 - optimize length block preparation using SSE4.1 Eric Biggers
                   ` (6 subsequent siblings)
  10 siblings, 0 replies; 12+ messages in thread
From: Eric Biggers @ 2024-10-17  0:00 UTC (permalink / raw)
  To: linux-crypto; +Cc: x86, Ondrej Mosnacek, linux-kernel

From: Eric Biggers <ebiggers@google.com>

Remove the AEGIS assembly code paths that were "optimized" to operate on
16-byte aligned data using movdqa, and instead just use the code paths
that use movdqu and can handle data with any alignment.

This does not reduce performance.  movdqa is basically a historical
artifact; on aligned data, movdqu and movdqa have had the same
performance since Intel Nehalem (2008) and AMD Bulldozer (2011).  And
code that requires AES-NI cannot run on CPUs older than those anyway.

Reviewed-by: Ondrej Mosnacek <omosnace@redhat.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 arch/x86/crypto/aegis128-aesni-asm.S | 122 +++++----------------------
 1 file changed, 22 insertions(+), 100 deletions(-)

diff --git a/arch/x86/crypto/aegis128-aesni-asm.S b/arch/x86/crypto/aegis128-aesni-asm.S
index 1b57558548c78..5541aca2fd0dd 100644
--- a/arch/x86/crypto/aegis128-aesni-asm.S
+++ b/arch/x86/crypto/aegis128-aesni-asm.S
@@ -243,56 +243,12 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad)
 	movdqu 0x10(STATEP), STATE1
 	movdqu 0x20(STATEP), STATE2
 	movdqu 0x30(STATEP), STATE3
 	movdqu 0x40(STATEP), STATE4
 
-	mov SRC, %r8
-	and $0xF, %r8
-	jnz .Lad_u_loop
-
-.align 8
-.Lad_a_loop:
-	movdqa 0x00(SRC), MSG
-	aegis128_update
-	pxor MSG, STATE4
-	sub $0x10, LEN
-	cmp $0x10, LEN
-	jl .Lad_out_1
-
-	movdqa 0x10(SRC), MSG
-	aegis128_update
-	pxor MSG, STATE3
-	sub $0x10, LEN
-	cmp $0x10, LEN
-	jl .Lad_out_2
-
-	movdqa 0x20(SRC), MSG
-	aegis128_update
-	pxor MSG, STATE2
-	sub $0x10, LEN
-	cmp $0x10, LEN
-	jl .Lad_out_3
-
-	movdqa 0x30(SRC), MSG
-	aegis128_update
-	pxor MSG, STATE1
-	sub $0x10, LEN
-	cmp $0x10, LEN
-	jl .Lad_out_4
-
-	movdqa 0x40(SRC), MSG
-	aegis128_update
-	pxor MSG, STATE0
-	sub $0x10, LEN
-	cmp $0x10, LEN
-	jl .Lad_out_0
-
-	add $0x50, SRC
-	jmp .Lad_a_loop
-
 .align 8
-.Lad_u_loop:
+.Lad_loop:
 	movdqu 0x00(SRC), MSG
 	aegis128_update
 	pxor MSG, STATE4
 	sub $0x10, LEN
 	cmp $0x10, LEN
@@ -325,11 +281,11 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad)
 	sub $0x10, LEN
 	cmp $0x10, LEN
 	jl .Lad_out_0
 
 	add $0x50, SRC
-	jmp .Lad_u_loop
+	jmp .Lad_loop
 
 	/* store the state: */
 .Lad_out_0:
 	movdqu STATE0, 0x00(STATEP)
 	movdqu STATE1, 0x10(STATEP)
@@ -378,19 +334,19 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad)
 .Lad_out:
 	FRAME_END
 	RET
 SYM_FUNC_END(crypto_aegis128_aesni_ad)
 
-.macro encrypt_block a s0 s1 s2 s3 s4 i
-	movdq\a (\i * 0x10)(SRC), MSG
+.macro encrypt_block s0 s1 s2 s3 s4 i
+	movdqu (\i * 0x10)(SRC), MSG
 	movdqa MSG, T0
 	pxor \s1, T0
 	pxor \s4, T0
 	movdqa \s2, T1
 	pand \s3, T1
 	pxor T1, T0
-	movdq\a T0, (\i * 0x10)(DST)
+	movdqu T0, (\i * 0x10)(DST)
 
 	aegis128_update
 	pxor MSG, \s4
 
 	sub $0x10, LEN
@@ -413,38 +369,21 @@ SYM_FUNC_START(crypto_aegis128_aesni_enc)
 	movdqu 0x10(STATEP), STATE1
 	movdqu 0x20(STATEP), STATE2
 	movdqu 0x30(STATEP), STATE3
 	movdqu 0x40(STATEP), STATE4
 
-	mov  SRC,  %r8
-	or   DST,  %r8
-	and $0xF, %r8
-	jnz .Lenc_u_loop
-
 .align 8
-.Lenc_a_loop:
-	encrypt_block a STATE0 STATE1 STATE2 STATE3 STATE4 0
-	encrypt_block a STATE4 STATE0 STATE1 STATE2 STATE3 1
-	encrypt_block a STATE3 STATE4 STATE0 STATE1 STATE2 2
-	encrypt_block a STATE2 STATE3 STATE4 STATE0 STATE1 3
-	encrypt_block a STATE1 STATE2 STATE3 STATE4 STATE0 4
+.Lenc_loop:
+	encrypt_block STATE0 STATE1 STATE2 STATE3 STATE4 0
+	encrypt_block STATE4 STATE0 STATE1 STATE2 STATE3 1
+	encrypt_block STATE3 STATE4 STATE0 STATE1 STATE2 2
+	encrypt_block STATE2 STATE3 STATE4 STATE0 STATE1 3
+	encrypt_block STATE1 STATE2 STATE3 STATE4 STATE0 4
 
 	add $0x50, SRC
 	add $0x50, DST
-	jmp .Lenc_a_loop
-
-.align 8
-.Lenc_u_loop:
-	encrypt_block u STATE0 STATE1 STATE2 STATE3 STATE4 0
-	encrypt_block u STATE4 STATE0 STATE1 STATE2 STATE3 1
-	encrypt_block u STATE3 STATE4 STATE0 STATE1 STATE2 2
-	encrypt_block u STATE2 STATE3 STATE4 STATE0 STATE1 3
-	encrypt_block u STATE1 STATE2 STATE3 STATE4 STATE0 4
-
-	add $0x50, SRC
-	add $0x50, DST
-	jmp .Lenc_u_loop
+	jmp .Lenc_loop
 
 	/* store the state: */
 .Lenc_out_0:
 	movdqu STATE4, 0x00(STATEP)
 	movdqu STATE0, 0x10(STATEP)
@@ -533,18 +472,18 @@ SYM_FUNC_START(crypto_aegis128_aesni_enc_tail)
 
 	FRAME_END
 	RET
 SYM_FUNC_END(crypto_aegis128_aesni_enc_tail)
 
-.macro decrypt_block a s0 s1 s2 s3 s4 i
-	movdq\a (\i * 0x10)(SRC), MSG
+.macro decrypt_block s0 s1 s2 s3 s4 i
+	movdqu (\i * 0x10)(SRC), MSG
 	pxor \s1, MSG
 	pxor \s4, MSG
 	movdqa \s2, T1
 	pand \s3, T1
 	pxor T1, MSG
-	movdq\a MSG, (\i * 0x10)(DST)
+	movdqu MSG, (\i * 0x10)(DST)
 
 	aegis128_update
 	pxor MSG, \s4
 
 	sub $0x10, LEN
@@ -567,38 +506,21 @@ SYM_FUNC_START(crypto_aegis128_aesni_dec)
 	movdqu 0x10(STATEP), STATE1
 	movdqu 0x20(STATEP), STATE2
 	movdqu 0x30(STATEP), STATE3
 	movdqu 0x40(STATEP), STATE4
 
-	mov  SRC, %r8
-	or   DST, %r8
-	and $0xF, %r8
-	jnz .Ldec_u_loop
-
-.align 8
-.Ldec_a_loop:
-	decrypt_block a STATE0 STATE1 STATE2 STATE3 STATE4 0
-	decrypt_block a STATE4 STATE0 STATE1 STATE2 STATE3 1
-	decrypt_block a STATE3 STATE4 STATE0 STATE1 STATE2 2
-	decrypt_block a STATE2 STATE3 STATE4 STATE0 STATE1 3
-	decrypt_block a STATE1 STATE2 STATE3 STATE4 STATE0 4
-
-	add $0x50, SRC
-	add $0x50, DST
-	jmp .Ldec_a_loop
-
 .align 8
-.Ldec_u_loop:
-	decrypt_block u STATE0 STATE1 STATE2 STATE3 STATE4 0
-	decrypt_block u STATE4 STATE0 STATE1 STATE2 STATE3 1
-	decrypt_block u STATE3 STATE4 STATE0 STATE1 STATE2 2
-	decrypt_block u STATE2 STATE3 STATE4 STATE0 STATE1 3
-	decrypt_block u STATE1 STATE2 STATE3 STATE4 STATE0 4
+.Ldec_loop:
+	decrypt_block STATE0 STATE1 STATE2 STATE3 STATE4 0
+	decrypt_block STATE4 STATE0 STATE1 STATE2 STATE3 1
+	decrypt_block STATE3 STATE4 STATE0 STATE1 STATE2 2
+	decrypt_block STATE2 STATE3 STATE4 STATE0 STATE1 3
+	decrypt_block STATE1 STATE2 STATE3 STATE4 STATE0 4
 
 	add $0x50, SRC
 	add $0x50, DST
-	jmp .Ldec_u_loop
+	jmp .Ldec_loop
 
 	/* store the state: */
 .Ldec_out_0:
 	movdqu STATE4, 0x00(STATEP)
 	movdqu STATE0, 0x10(STATEP)
-- 
2.47.0


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH v2 05/10] crypto: x86/aegis128 - optimize length block preparation using SSE4.1
  2024-10-17  0:00 [PATCH v2 00/10] AEGIS x86 assembly tuning Eric Biggers
                   ` (3 preceding siblings ...)
  2024-10-17  0:00 ` [PATCH v2 04/10] crypto: x86/aegis128 - don't bother with special code for aligned data Eric Biggers
@ 2024-10-17  0:00 ` Eric Biggers
  2024-10-17  0:00 ` [PATCH v2 06/10] crypto: x86/aegis128 - improve assembly function prototypes Eric Biggers
                   ` (5 subsequent siblings)
  10 siblings, 0 replies; 12+ messages in thread
From: Eric Biggers @ 2024-10-17  0:00 UTC (permalink / raw)
  To: linux-crypto; +Cc: x86, Ondrej Mosnacek, linux-kernel

From: Eric Biggers <ebiggers@google.com>

Start using SSE4.1 instructions in the AES-NI AEGIS code, with the first
use case being preparing the length block in fewer instructions.

In practice this does not reduce the set of CPUs on which the code can
run, because all Intel and AMD CPUs with AES-NI also have SSE4.1.

Upgrade the existing SSE2 feature check to SSE4.1, though it seems this
check is not strictly necessary; the aesni-intel module has been getting
away with using SSE4.1 despite checking for AES-NI only.

Reviewed-by: Ondrej Mosnacek <omosnace@redhat.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 arch/x86/crypto/Kconfig               | 4 ++--
 arch/x86/crypto/aegis128-aesni-asm.S  | 6 ++----
 arch/x86/crypto/aegis128-aesni-glue.c | 6 +++---
 3 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/arch/x86/crypto/Kconfig b/arch/x86/crypto/Kconfig
index 7b1bebed879df..3d2e38ba52403 100644
--- a/arch/x86/crypto/Kconfig
+++ b/arch/x86/crypto/Kconfig
@@ -361,20 +361,20 @@ config CRYPTO_CHACHA20_X86_64
 	  - SSSE3 (Supplemental SSE3)
 	  - AVX2 (Advanced Vector Extensions 2)
 	  - AVX-512VL (Advanced Vector Extensions-512VL)
 
 config CRYPTO_AEGIS128_AESNI_SSE2
-	tristate "AEAD ciphers: AEGIS-128 (AES-NI/SSE2)"
+	tristate "AEAD ciphers: AEGIS-128 (AES-NI/SSE4.1)"
 	depends on X86 && 64BIT
 	select CRYPTO_AEAD
 	select CRYPTO_SIMD
 	help
 	  AEGIS-128 AEAD algorithm
 
 	  Architecture: x86_64 using:
 	  - AES-NI (AES New Instructions)
-	  - SSE2 (Streaming SIMD Extensions 2)
+	  - SSE4.1 (Streaming SIMD Extensions 4.1)
 
 config CRYPTO_NHPOLY1305_SSE2
 	tristate "Hash functions: NHPoly1305 (SSE2)"
 	depends on X86 && 64BIT
 	select CRYPTO_NHPOLY1305
diff --git a/arch/x86/crypto/aegis128-aesni-asm.S b/arch/x86/crypto/aegis128-aesni-asm.S
index 5541aca2fd0dd..6ed4bc452c292 100644
--- a/arch/x86/crypto/aegis128-aesni-asm.S
+++ b/arch/x86/crypto/aegis128-aesni-asm.S
@@ -1,8 +1,8 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
 /*
- * AES-NI + SSE2 implementation of AEGIS-128
+ * AES-NI + SSE4.1 implementation of AEGIS-128
  *
  * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
  * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
  */
 
@@ -636,13 +636,11 @@ SYM_FUNC_START(crypto_aegis128_aesni_final)
 	movdqu 0x30(STATEP), STATE3
 	movdqu 0x40(STATEP), STATE4
 
 	/* prepare length block: */
 	movd %edx, MSG
-	movd %ecx, T0
-	pslldq $8, T0
-	pxor T0, MSG
+	pinsrd $2, %ecx, MSG
 	psllq $3, MSG /* multiply by 8 (to get bit count) */
 
 	pxor STATE3, MSG
 
 	/* update state: */
diff --git a/arch/x86/crypto/aegis128-aesni-glue.c b/arch/x86/crypto/aegis128-aesni-glue.c
index deb39cef0be1a..4dd2d981a514f 100644
--- a/arch/x86/crypto/aegis128-aesni-glue.c
+++ b/arch/x86/crypto/aegis128-aesni-glue.c
@@ -1,9 +1,9 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * The AEGIS-128 Authenticated-Encryption Algorithm
- *   Glue for AES-NI + SSE2 implementation
+ *   Glue for AES-NI + SSE4.1 implementation
  *
  * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
  * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
  */
 
@@ -252,11 +252,11 @@ static struct aead_alg crypto_aegis128_aesni_alg = {
 
 static struct simd_aead_alg *simd_alg;
 
 static int __init crypto_aegis128_aesni_module_init(void)
 {
-	if (!boot_cpu_has(X86_FEATURE_XMM2) ||
+	if (!boot_cpu_has(X86_FEATURE_XMM4_1) ||
 	    !boot_cpu_has(X86_FEATURE_AES) ||
 	    !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL))
 		return -ENODEV;
 
 	return simd_register_aeads_compat(&crypto_aegis128_aesni_alg, 1,
@@ -271,8 +271,8 @@ static void __exit crypto_aegis128_aesni_module_exit(void)
 module_init(crypto_aegis128_aesni_module_init);
 module_exit(crypto_aegis128_aesni_module_exit);
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>");
-MODULE_DESCRIPTION("AEGIS-128 AEAD algorithm -- AESNI+SSE2 implementation");
+MODULE_DESCRIPTION("AEGIS-128 AEAD algorithm -- AESNI+SSE4.1 implementation");
 MODULE_ALIAS_CRYPTO("aegis128");
 MODULE_ALIAS_CRYPTO("aegis128-aesni");
-- 
2.47.0


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH v2 06/10] crypto: x86/aegis128 - improve assembly function prototypes
  2024-10-17  0:00 [PATCH v2 00/10] AEGIS x86 assembly tuning Eric Biggers
                   ` (4 preceding siblings ...)
  2024-10-17  0:00 ` [PATCH v2 05/10] crypto: x86/aegis128 - optimize length block preparation using SSE4.1 Eric Biggers
@ 2024-10-17  0:00 ` Eric Biggers
  2024-10-17  0:00 ` [PATCH v2 07/10] crypto: x86/aegis128 - optimize partial block handling using SSE4.1 Eric Biggers
                   ` (4 subsequent siblings)
  10 siblings, 0 replies; 12+ messages in thread
From: Eric Biggers @ 2024-10-17  0:00 UTC (permalink / raw)
  To: linux-crypto; +Cc: x86, Ondrej Mosnacek, linux-kernel

From: Eric Biggers <ebiggers@google.com>

Adjust the prototypes of the AEGIS assembly functions:

- Use proper types instead of 'void *', when applicable.

- Move the length parameter to after the buffers it describes rather
  than before, to match the usual convention.  Also shorten its name to
  just len (which is the name used in the assembly code).

- Declare register aliases at the beginning of each function rather than
  once per file.  This was necessary because len was moved, but also it
  allows adding some aliases where raw registers were used before.

- Put assoclen and cryptlen in the correct order when declaring the
  finalization function in the .c file.

- Remove the unnecessary "crypto_" prefix.

Reviewed-by: Ondrej Mosnacek <omosnace@redhat.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 arch/x86/crypto/aegis128-aesni-asm.S  | 105 ++++++++++++++++----------
 arch/x86/crypto/aegis128-aesni-glue.c |  92 +++++++++++-----------
 2 files changed, 112 insertions(+), 85 deletions(-)

diff --git a/arch/x86/crypto/aegis128-aesni-asm.S b/arch/x86/crypto/aegis128-aesni-asm.S
index 6ed4bc452c292..9dfdbe0b1fb83 100644
--- a/arch/x86/crypto/aegis128-aesni-asm.S
+++ b/arch/x86/crypto/aegis128-aesni-asm.S
@@ -17,15 +17,10 @@
 #define KEY	%xmm5
 #define MSG	%xmm5
 #define T0	%xmm6
 #define T1	%xmm7
 
-#define STATEP	%rdi
-#define LEN	%esi
-#define SRC	%rdx
-#define DST	%rcx
-
 .section .rodata.cst16.aegis128_const, "aM", @progbits, 32
 .align 16
 .Laegis128_const_0:
 	.byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d
 	.byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62
@@ -70,10 +65,12 @@
  *   T0
  *   %r8
  *   %r9
  */
 SYM_FUNC_START_LOCAL(__load_partial)
+	.set LEN, %ecx
+	.set SRC, %rsi
 	xor %r9d, %r9d
 	pxor MSG, MSG
 
 	mov LEN, %r8d
 	and $0x1, %r8
@@ -136,10 +133,12 @@ SYM_FUNC_END(__load_partial)
  *   %r8
  *   %r9
  *   %r10
  */
 SYM_FUNC_START_LOCAL(__store_partial)
+	.set LEN, %ecx
+	.set DST, %rdx
 	mov LEN, %r8d
 	mov DST, %r9
 
 	movq T0, %r10
 
@@ -182,20 +181,25 @@ SYM_FUNC_START_LOCAL(__store_partial)
 .Lst_partial_1:
 	RET
 SYM_FUNC_END(__store_partial)
 
 /*
- * void crypto_aegis128_aesni_init(void *state, const void *key, const void *iv);
+ * void aegis128_aesni_init(struct aegis_state *state,
+ *			    const struct aegis_block *key,
+ *			    const u8 iv[AEGIS128_NONCE_SIZE]);
  */
-SYM_FUNC_START(crypto_aegis128_aesni_init)
+SYM_FUNC_START(aegis128_aesni_init)
+	.set STATEP, %rdi
+	.set KEYP, %rsi
+	.set IVP, %rdx
 	FRAME_BEGIN
 
 	/* load IV: */
-	movdqu (%rdx), T1
+	movdqu (IVP), T1
 
 	/* load key: */
-	movdqa (%rsi), KEY
+	movdqa (KEYP), KEY
 	pxor KEY, T1
 	movdqa T1, STATE0
 	movdqa KEY, STATE3
 	movdqa KEY, STATE4
 
@@ -224,17 +228,20 @@ SYM_FUNC_START(crypto_aegis128_aesni_init)
 	movdqu STATE3, 0x30(STATEP)
 	movdqu STATE4, 0x40(STATEP)
 
 	FRAME_END
 	RET
-SYM_FUNC_END(crypto_aegis128_aesni_init)
+SYM_FUNC_END(aegis128_aesni_init)
 
 /*
- * void crypto_aegis128_aesni_ad(void *state, unsigned int length,
- *                               const void *data);
+ * void aegis128_aesni_ad(struct aegis_state *state, const u8 *data,
+ *			  unsigned int len);
  */
-SYM_FUNC_START(crypto_aegis128_aesni_ad)
+SYM_FUNC_START(aegis128_aesni_ad)
+	.set STATEP, %rdi
+	.set SRC, %rsi
+	.set LEN, %edx
 	FRAME_BEGIN
 
 	cmp $0x10, LEN
 	jb .Lad_out
 
@@ -332,11 +339,11 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad)
 	RET
 
 .Lad_out:
 	FRAME_END
 	RET
-SYM_FUNC_END(crypto_aegis128_aesni_ad)
+SYM_FUNC_END(aegis128_aesni_ad)
 
 .macro encrypt_block s0 s1 s2 s3 s4 i
 	movdqu (\i * 0x10)(SRC), MSG
 	movdqa MSG, T0
 	pxor \s1, T0
@@ -353,14 +360,18 @@ SYM_FUNC_END(crypto_aegis128_aesni_ad)
 	cmp $0x10, LEN
 	jl .Lenc_out_\i
 .endm
 
 /*
- * void crypto_aegis128_aesni_enc(void *state, unsigned int length,
- *                                const void *src, void *dst);
+ * void aegis128_aesni_enc(struct aegis_state *state, const u8 *src, u8 *dst,
+ *			   unsigned int len);
  */
-SYM_FUNC_START(crypto_aegis128_aesni_enc)
+SYM_FUNC_START(aegis128_aesni_enc)
+	.set STATEP, %rdi
+	.set SRC, %rsi
+	.set DST, %rdx
+	.set LEN, %ecx
 	FRAME_BEGIN
 
 	cmp $0x10, LEN
 	jb .Lenc_out
 
@@ -430,17 +441,21 @@ SYM_FUNC_START(crypto_aegis128_aesni_enc)
 	RET
 
 .Lenc_out:
 	FRAME_END
 	RET
-SYM_FUNC_END(crypto_aegis128_aesni_enc)
+SYM_FUNC_END(aegis128_aesni_enc)
 
 /*
- * void crypto_aegis128_aesni_enc_tail(void *state, unsigned int length,
- *                                     const void *src, void *dst);
+ * void aegis128_aesni_enc_tail(struct aegis_state *state, const u8 *src,
+ *				u8 *dst, unsigned int len);
  */
-SYM_FUNC_START(crypto_aegis128_aesni_enc_tail)
+SYM_FUNC_START(aegis128_aesni_enc_tail)
+	.set STATEP, %rdi
+	.set SRC, %rsi
+	.set DST, %rdx
+	.set LEN, %ecx
 	FRAME_BEGIN
 
 	/* load the state: */
 	movdqu 0x00(STATEP), STATE0
 	movdqu 0x10(STATEP), STATE1
@@ -470,11 +485,11 @@ SYM_FUNC_START(crypto_aegis128_aesni_enc_tail)
 	movdqu STATE2, 0x30(STATEP)
 	movdqu STATE3, 0x40(STATEP)
 
 	FRAME_END
 	RET
-SYM_FUNC_END(crypto_aegis128_aesni_enc_tail)
+SYM_FUNC_END(aegis128_aesni_enc_tail)
 
 .macro decrypt_block s0 s1 s2 s3 s4 i
 	movdqu (\i * 0x10)(SRC), MSG
 	pxor \s1, MSG
 	pxor \s4, MSG
@@ -490,14 +505,18 @@ SYM_FUNC_END(crypto_aegis128_aesni_enc_tail)
 	cmp $0x10, LEN
 	jl .Ldec_out_\i
 .endm
 
 /*
- * void crypto_aegis128_aesni_dec(void *state, unsigned int length,
- *                                const void *src, void *dst);
+ * void aegis128_aesni_dec(struct aegis_state *state, const u8 *src, u8 *dst,
+ *			   unsigned int len);
  */
-SYM_FUNC_START(crypto_aegis128_aesni_dec)
+SYM_FUNC_START(aegis128_aesni_dec)
+	.set STATEP, %rdi
+	.set SRC, %rsi
+	.set DST, %rdx
+	.set LEN, %ecx
 	FRAME_BEGIN
 
 	cmp $0x10, LEN
 	jb .Ldec_out
 
@@ -567,17 +586,21 @@ SYM_FUNC_START(crypto_aegis128_aesni_dec)
 	RET
 
 .Ldec_out:
 	FRAME_END
 	RET
-SYM_FUNC_END(crypto_aegis128_aesni_dec)
+SYM_FUNC_END(aegis128_aesni_dec)
 
 /*
- * void crypto_aegis128_aesni_dec_tail(void *state, unsigned int length,
- *                                     const void *src, void *dst);
+ * void aegis128_aesni_dec_tail(struct aegis_state *state, const u8 *src,
+ *				u8 *dst, unsigned int len);
  */
-SYM_FUNC_START(crypto_aegis128_aesni_dec_tail)
+SYM_FUNC_START(aegis128_aesni_dec_tail)
+	.set STATEP, %rdi
+	.set SRC, %rsi
+	.set DST, %rdx
+	.set LEN, %ecx
 	FRAME_BEGIN
 
 	/* load the state: */
 	movdqu 0x00(STATEP), STATE0
 	movdqu 0x10(STATEP), STATE1
@@ -617,30 +640,34 @@ SYM_FUNC_START(crypto_aegis128_aesni_dec_tail)
 	movdqu STATE2, 0x30(STATEP)
 	movdqu STATE3, 0x40(STATEP)
 
 	FRAME_END
 	RET
-SYM_FUNC_END(crypto_aegis128_aesni_dec_tail)
+SYM_FUNC_END(aegis128_aesni_dec_tail)
 
 /*
- * void crypto_aegis128_aesni_final(void *state, void *tag_xor,
- *                                  unsigned int assoclen,
- *                                  unsigned int cryptlen);
+ * void aegis128_aesni_final(struct aegis_state *state,
+ *			     struct aegis_block *tag_xor,
+ *			     unsigned int assoclen, unsigned int cryptlen);
  */
-SYM_FUNC_START(crypto_aegis128_aesni_final)
+SYM_FUNC_START(aegis128_aesni_final)
+	.set STATEP, %rdi
+	.set TAG_XOR, %rsi
+	.set ASSOCLEN, %edx
+	.set CRYPTLEN, %ecx
 	FRAME_BEGIN
 
 	/* load the state: */
 	movdqu 0x00(STATEP), STATE0
 	movdqu 0x10(STATEP), STATE1
 	movdqu 0x20(STATEP), STATE2
 	movdqu 0x30(STATEP), STATE3
 	movdqu 0x40(STATEP), STATE4
 
 	/* prepare length block: */
-	movd %edx, MSG
-	pinsrd $2, %ecx, MSG
+	movd ASSOCLEN, MSG
+	pinsrd $2, CRYPTLEN, MSG
 	psllq $3, MSG /* multiply by 8 (to get bit count) */
 
 	pxor STATE3, MSG
 
 	/* update state: */
@@ -651,18 +678,18 @@ SYM_FUNC_START(crypto_aegis128_aesni_final)
 	aegis128_update; pxor MSG, STATE0
 	aegis128_update; pxor MSG, STATE4
 	aegis128_update; pxor MSG, STATE3
 
 	/* xor tag: */
-	movdqu (%rsi), MSG
+	movdqu (TAG_XOR), MSG
 
 	pxor STATE0, MSG
 	pxor STATE1, MSG
 	pxor STATE2, MSG
 	pxor STATE3, MSG
 	pxor STATE4, MSG
 
-	movdqu MSG, (%rsi)
+	movdqu MSG, (TAG_XOR)
 
 	FRAME_END
 	RET
-SYM_FUNC_END(crypto_aegis128_aesni_final)
+SYM_FUNC_END(aegis128_aesni_final)
diff --git a/arch/x86/crypto/aegis128-aesni-glue.c b/arch/x86/crypto/aegis128-aesni-glue.c
index 4dd2d981a514f..9555958e4089d 100644
--- a/arch/x86/crypto/aegis128-aesni-glue.c
+++ b/arch/x86/crypto/aegis128-aesni-glue.c
@@ -21,31 +21,10 @@
 #define AEGIS128_STATE_BLOCKS 5
 #define AEGIS128_KEY_SIZE 16
 #define AEGIS128_MIN_AUTH_SIZE 8
 #define AEGIS128_MAX_AUTH_SIZE 16
 
-asmlinkage void crypto_aegis128_aesni_init(void *state, void *key, void *iv);
-
-asmlinkage void crypto_aegis128_aesni_ad(
-		void *state, unsigned int length, const void *data);
-
-asmlinkage void crypto_aegis128_aesni_enc(
-		void *state, unsigned int length, const void *src, void *dst);
-
-asmlinkage void crypto_aegis128_aesni_dec(
-		void *state, unsigned int length, const void *src, void *dst);
-
-asmlinkage void crypto_aegis128_aesni_enc_tail(
-		void *state, unsigned int length, const void *src, void *dst);
-
-asmlinkage void crypto_aegis128_aesni_dec_tail(
-		void *state, unsigned int length, const void *src, void *dst);
-
-asmlinkage void crypto_aegis128_aesni_final(
-		void *state, void *tag_xor, unsigned int cryptlen,
-		unsigned int assoclen);
-
 struct aegis_block {
 	u8 bytes[AEGIS128_BLOCK_SIZE] __aligned(AEGIS128_BLOCK_ALIGN);
 };
 
 struct aegis_state {
@@ -54,10 +33,36 @@ struct aegis_state {
 
 struct aegis_ctx {
 	struct aegis_block key;
 };
 
+asmlinkage void aegis128_aesni_init(struct aegis_state *state,
+				    const struct aegis_block *key,
+				    const u8 iv[AEGIS128_NONCE_SIZE]);
+
+asmlinkage void aegis128_aesni_ad(struct aegis_state *state, const u8 *data,
+				  unsigned int len);
+
+asmlinkage void aegis128_aesni_enc(struct aegis_state *state, const u8 *src,
+				   u8 *dst, unsigned int len);
+
+asmlinkage void aegis128_aesni_dec(struct aegis_state *state, const u8 *src,
+				   u8 *dst, unsigned int len);
+
+asmlinkage void aegis128_aesni_enc_tail(struct aegis_state *state,
+					const u8 *src, u8 *dst,
+					unsigned int len);
+
+asmlinkage void aegis128_aesni_dec_tail(struct aegis_state *state,
+					const u8 *src, u8 *dst,
+					unsigned int len);
+
+asmlinkage void aegis128_aesni_final(struct aegis_state *state,
+				     struct aegis_block *tag_xor,
+				     unsigned int assoclen,
+				     unsigned int cryptlen);
+
 static void crypto_aegis128_aesni_process_ad(
 		struct aegis_state *state, struct scatterlist *sg_src,
 		unsigned int assoclen)
 {
 	struct scatter_walk walk;
@@ -73,19 +78,18 @@ static void crypto_aegis128_aesni_process_ad(
 
 		if (pos + size >= AEGIS128_BLOCK_SIZE) {
 			if (pos > 0) {
 				unsigned int fill = AEGIS128_BLOCK_SIZE - pos;
 				memcpy(buf.bytes + pos, src, fill);
-				crypto_aegis128_aesni_ad(state,
-							 AEGIS128_BLOCK_SIZE,
-							 buf.bytes);
+				aegis128_aesni_ad(state, buf.bytes,
+						  AEGIS128_BLOCK_SIZE);
 				pos = 0;
 				left -= fill;
 				src += fill;
 			}
 
-			crypto_aegis128_aesni_ad(state, left, src);
+			aegis128_aesni_ad(state, src, left);
 
 			src += left & ~(AEGIS128_BLOCK_SIZE - 1);
 			left &= AEGIS128_BLOCK_SIZE - 1;
 		}
 
@@ -98,45 +102,41 @@ static void crypto_aegis128_aesni_process_ad(
 		scatterwalk_done(&walk, 0, assoclen);
 	}
 
 	if (pos > 0) {
 		memset(buf.bytes + pos, 0, AEGIS128_BLOCK_SIZE - pos);
-		crypto_aegis128_aesni_ad(state, AEGIS128_BLOCK_SIZE, buf.bytes);
+		aegis128_aesni_ad(state, buf.bytes, AEGIS128_BLOCK_SIZE);
 	}
 }
 
 static __always_inline void
 crypto_aegis128_aesni_process_crypt(struct aegis_state *state,
 				    struct skcipher_walk *walk, bool enc)
 {
 	while (walk->nbytes >= AEGIS128_BLOCK_SIZE) {
 		if (enc)
-			crypto_aegis128_aesni_enc(
-					state,
-					round_down(walk->nbytes,
-						   AEGIS128_BLOCK_SIZE),
-					walk->src.virt.addr,
-					walk->dst.virt.addr);
+			aegis128_aesni_enc(state, walk->src.virt.addr,
+					   walk->dst.virt.addr,
+					   round_down(walk->nbytes,
+						      AEGIS128_BLOCK_SIZE));
 		else
-			crypto_aegis128_aesni_dec(
-					state,
-					round_down(walk->nbytes,
-						   AEGIS128_BLOCK_SIZE),
-					walk->src.virt.addr,
-					walk->dst.virt.addr);
+			aegis128_aesni_dec(state, walk->src.virt.addr,
+					   walk->dst.virt.addr,
+					   round_down(walk->nbytes,
+						      AEGIS128_BLOCK_SIZE));
 		skcipher_walk_done(walk, walk->nbytes % AEGIS128_BLOCK_SIZE);
 	}
 
 	if (walk->nbytes) {
 		if (enc)
-			crypto_aegis128_aesni_enc_tail(state, walk->nbytes,
-						       walk->src.virt.addr,
-						       walk->dst.virt.addr);
+			aegis128_aesni_enc_tail(state, walk->src.virt.addr,
+						walk->dst.virt.addr,
+						walk->nbytes);
 		else
-			crypto_aegis128_aesni_dec_tail(state, walk->nbytes,
-						       walk->src.virt.addr,
-						       walk->dst.virt.addr);
+			aegis128_aesni_dec_tail(state, walk->src.virt.addr,
+						walk->dst.virt.addr,
+						walk->nbytes);
 		skcipher_walk_done(walk, 0);
 	}
 }
 
 static struct aegis_ctx *crypto_aegis128_aesni_ctx(struct crypto_aead *aead)
@@ -184,14 +184,14 @@ crypto_aegis128_aesni_crypt(struct aead_request *req,
 	else
 		skcipher_walk_aead_decrypt(&walk, req, true);
 
 	kernel_fpu_begin();
 
-	crypto_aegis128_aesni_init(&state, ctx->key.bytes, req->iv);
+	aegis128_aesni_init(&state, &ctx->key, req->iv);
 	crypto_aegis128_aesni_process_ad(&state, req->src, req->assoclen);
 	crypto_aegis128_aesni_process_crypt(&state, &walk, enc);
-	crypto_aegis128_aesni_final(&state, tag_xor, req->assoclen, cryptlen);
+	aegis128_aesni_final(&state, tag_xor, req->assoclen, cryptlen);
 
 	kernel_fpu_end();
 }
 
 static int crypto_aegis128_aesni_encrypt(struct aead_request *req)
-- 
2.47.0


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH v2 07/10] crypto: x86/aegis128 - optimize partial block handling using SSE4.1
  2024-10-17  0:00 [PATCH v2 00/10] AEGIS x86 assembly tuning Eric Biggers
                   ` (5 preceding siblings ...)
  2024-10-17  0:00 ` [PATCH v2 06/10] crypto: x86/aegis128 - improve assembly function prototypes Eric Biggers
@ 2024-10-17  0:00 ` Eric Biggers
  2024-10-17  0:00 ` [PATCH v2 08/10] crypto: x86/aegis128 - take advantage of block-aligned len Eric Biggers
                   ` (3 subsequent siblings)
  10 siblings, 0 replies; 12+ messages in thread
From: Eric Biggers @ 2024-10-17  0:00 UTC (permalink / raw)
  To: linux-crypto; +Cc: x86, Ondrej Mosnacek, linux-kernel

From: Eric Biggers <ebiggers@google.com>

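Optimize the code that loads and stores partial blocks, taking advantage
of SSE4.1.  The code is adapted from that in aes-gcm-aesni-x86_64.S.
The __load_partial and __store_partial helper functions are replaced by
the load_partial and store_partial macros, so aegis128_aesni_enc_tail()
and aegis128_aesni_dec_tail() no longer make any calls.  Also, the byte
count mask in aegis128_aesni_dec_tail() is now loaded from a constant
table (.Lzeropad_mask) instead of being computed.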

Reviewed-by: Ondrej Mosnacek <omosnace@redhat.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 arch/x86/crypto/aegis128-aesni-asm.S | 236 +++++++++++----------------
 1 file changed, 95 insertions(+), 141 deletions(-)

diff --git a/arch/x86/crypto/aegis128-aesni-asm.S b/arch/x86/crypto/aegis128-aesni-asm.S
index 9dfdbe0b1fb83..e650330ef6951 100644
--- a/arch/x86/crypto/aegis128-aesni-asm.S
+++ b/arch/x86/crypto/aegis128-aesni-asm.S
@@ -2,10 +2,11 @@
 /*
  * AES-NI + SSE4.1 implementation of AEGIS-128
  *
  * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
  * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
+ * Copyright 2024 Google LLC
  */
 
 #include <linux/linkage.h>
 #include <asm/frame.h>
 
@@ -26,15 +27,15 @@
 	.byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62
 .Laegis128_const_1:
 	.byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1
 	.byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd
 
-.section .rodata.cst16.aegis128_counter, "aM", @progbits, 16
-.align 16
-.Laegis128_counter:
-	.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
-	.byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+.section .rodata.cst32.zeropad_mask, "aM", @progbits, 32
+.align 32
+.Lzeropad_mask:
+	.octa 0xffffffffffffffffffffffffffffffff
+	.octa 0
 
 .text
 
 /*
  * aegis128_update
@@ -53,136 +54,90 @@
 	aesenc STATE3, STATE2
 	aesenc T0,     STATE3
 .endm
 
 /*
- * __load_partial: internal ABI
- * input:
- *   LEN - bytes
- *   SRC - src
- * output:
- *   MSG  - message block
- * changed:
- *   T0
- *   %r8
- *   %r9
+ * Load 1 <= LEN (%ecx) <= 15 bytes from the pointer SRC into the xmm register
+ * MSG and zeroize any remaining bytes.  Clobbers %rax, %rcx, and %r8.
  */
-SYM_FUNC_START_LOCAL(__load_partial)
-	.set LEN, %ecx
-	.set SRC, %rsi
-	xor %r9d, %r9d
-	pxor MSG, MSG
-
-	mov LEN, %r8d
-	and $0x1, %r8
-	jz .Lld_partial_1
-
-	mov LEN, %r8d
-	and $0x1E, %r8
-	add SRC, %r8
-	mov (%r8), %r9b
-
-.Lld_partial_1:
-	mov LEN, %r8d
-	and $0x2, %r8
-	jz .Lld_partial_2
-
-	mov LEN, %r8d
-	and $0x1C, %r8
-	add SRC, %r8
-	shl $0x10, %r9
-	mov (%r8), %r9w
-
-.Lld_partial_2:
-	mov LEN, %r8d
-	and $0x4, %r8
-	jz .Lld_partial_4
-
-	mov LEN, %r8d
-	and $0x18, %r8
-	add SRC, %r8
-	shl $32, %r9
-	mov (%r8), %r8d
-	xor %r8, %r9
-
-.Lld_partial_4:
-	movq %r9, MSG
-
-	mov LEN, %r8d
-	and $0x8, %r8
-	jz .Lld_partial_8
-
-	mov LEN, %r8d
-	and $0x10, %r8
-	add SRC, %r8
-	pslldq $8, MSG
-	movq (%r8), T0
-	pxor T0, MSG
-
-.Lld_partial_8:
-	RET
-SYM_FUNC_END(__load_partial)
+.macro load_partial
+	sub $8, %ecx			/* LEN - 8 */
+	jle .Lle8\@
+
+	/* Load 9 <= LEN <= 15 bytes: */
+	movq (SRC), MSG			/* Load first 8 bytes */
+	mov (SRC, %rcx), %rax		/* Load last 8 bytes */
+	neg %ecx
+	shl $3, %ecx
+	shr %cl, %rax			/* Discard overlapping bytes */
+	pinsrq $1, %rax, MSG
+	jmp .Ldone\@
+
+.Lle8\@:
+	add $4, %ecx			/* LEN - 4 */
+	jl .Llt4\@
+
+	/* Load 4 <= LEN <= 8 bytes: */
+	mov (SRC), %eax			/* Load first 4 bytes */
+	mov (SRC, %rcx), %r8d		/* Load last 4 bytes */
+	jmp .Lcombine\@
+
+.Llt4\@:
+	/* Load 1 <= LEN <= 3 bytes: */
+	add $2, %ecx			/* LEN - 2 */
+	movzbl (SRC), %eax		/* Load first byte */
+	jl .Lmovq\@
+	movzwl (SRC, %rcx), %r8d	/* Load last 2 bytes */
+.Lcombine\@:
+	shl $3, %ecx
+	shl %cl, %r8
+	or %r8, %rax			/* Combine the two parts */
+.Lmovq\@:
+	movq %rax, MSG
+.Ldone\@:
+.endm
 
 /*
- * __store_partial: internal ABI
- * input:
- *   LEN - bytes
- *   DST - dst
- * output:
- *   T0   - message block
- * changed:
- *   %r8
- *   %r9
- *   %r10
+ * Store 1 <= LEN (%ecx) <= 15 bytes from the xmm register \msg to the pointer
+ * DST.  Clobbers %rax, %rcx, and %r8.
  */
-SYM_FUNC_START_LOCAL(__store_partial)
-	.set LEN, %ecx
-	.set DST, %rdx
-	mov LEN, %r8d
-	mov DST, %r9
-
-	movq T0, %r10
-
-	cmp $8, %r8
-	jl .Lst_partial_8
-
-	mov %r10, (%r9)
-	psrldq $8, T0
-	movq T0, %r10
-
-	sub $8, %r8
-	add $8, %r9
-
-.Lst_partial_8:
-	cmp $4, %r8
-	jl .Lst_partial_4
-
-	mov %r10d, (%r9)
-	shr $32, %r10
-
-	sub $4, %r8
-	add $4, %r9
-
-.Lst_partial_4:
-	cmp $2, %r8
-	jl .Lst_partial_2
-
-	mov %r10w, (%r9)
-	shr $0x10, %r10
-
-	sub $2, %r8
-	add $2, %r9
-
-.Lst_partial_2:
-	cmp $1, %r8
-	jl .Lst_partial_1
-
-	mov %r10b, (%r9)
-
-.Lst_partial_1:
-	RET
-SYM_FUNC_END(__store_partial)
+.macro store_partial msg
+	sub $8, %ecx			/* LEN - 8 */
+	jl .Llt8\@
+
+	/* Store 8 <= LEN <= 15 bytes: */
+	pextrq $1, \msg, %rax
+	mov %ecx, %r8d
+	shl $3, %ecx
+	ror %cl, %rax
+	mov %rax, (DST, %r8)		/* Store last LEN - 8 bytes */
+	movq \msg, (DST)		/* Store first 8 bytes */
+	jmp .Ldone\@
+
+.Llt8\@:
+	add $4, %ecx			/* LEN - 4 */
+	jl .Llt4\@
+
+	/* Store 4 <= LEN <= 7 bytes: */
+	pextrd $1, \msg, %eax
+	mov %ecx, %r8d
+	shl $3, %ecx
+	ror %cl, %eax
+	mov %eax, (DST, %r8)		/* Store last LEN - 4 bytes */
+	movd \msg, (DST)		/* Store first 4 bytes */
+	jmp .Ldone\@
+
+.Llt4\@:
+	/* Store 1 <= LEN <= 3 bytes: */
+	pextrb $0, \msg, 0(DST)
+	cmp $-2, %ecx			/* LEN - 4 == -2, i.e. LEN == 2? */
+	jl .Ldone\@
+	pextrb $1, \msg, 1(DST)
+	je .Ldone\@
+	pextrb $2, \msg, 2(DST)
+.Ldone\@:
+.endm
 
 /*
  * void aegis128_aesni_init(struct aegis_state *state,
  *			    const struct aegis_block *key,
  *			    const u8 iv[AEGIS128_NONCE_SIZE]);
@@ -451,31 +406,33 @@ SYM_FUNC_END(aegis128_aesni_enc)
  */
 SYM_FUNC_START(aegis128_aesni_enc_tail)
 	.set STATEP, %rdi
 	.set SRC, %rsi
 	.set DST, %rdx
-	.set LEN, %ecx
+	.set LEN, %ecx	/* {load,store}_partial rely on this being %ecx */
 	FRAME_BEGIN
 
 	/* load the state: */
 	movdqu 0x00(STATEP), STATE0
 	movdqu 0x10(STATEP), STATE1
 	movdqu 0x20(STATEP), STATE2
 	movdqu 0x30(STATEP), STATE3
 	movdqu 0x40(STATEP), STATE4
 
 	/* encrypt message: */
-	call __load_partial
+	mov LEN, %r9d
+	load_partial
 
 	movdqa MSG, T0
 	pxor STATE1, T0
 	pxor STATE4, T0
 	movdqa STATE2, T1
 	pand STATE3, T1
 	pxor T1, T0
 
-	call __store_partial
+	mov %r9d, LEN
+	store_partial T0
 
 	aegis128_update
 	pxor MSG, STATE4
 
 	/* store the state: */
@@ -596,40 +553,37 @@ SYM_FUNC_END(aegis128_aesni_dec)
  */
 SYM_FUNC_START(aegis128_aesni_dec_tail)
 	.set STATEP, %rdi
 	.set SRC, %rsi
 	.set DST, %rdx
-	.set LEN, %ecx
+	.set LEN, %ecx	/* {load,store}_partial rely on this being %ecx */
 	FRAME_BEGIN
 
 	/* load the state: */
 	movdqu 0x00(STATEP), STATE0
 	movdqu 0x10(STATEP), STATE1
 	movdqu 0x20(STATEP), STATE2
 	movdqu 0x30(STATEP), STATE3
 	movdqu 0x40(STATEP), STATE4
 
 	/* decrypt message: */
-	call __load_partial
+	mov LEN, %r9d
+	load_partial
 
 	pxor STATE1, MSG
 	pxor STATE4, MSG
 	movdqa STATE2, T1
 	pand STATE3, T1
 	pxor T1, MSG
 
-	movdqa MSG, T0
-	call __store_partial
+	mov %r9d, LEN
+	store_partial MSG
 
 	/* mask with byte count: */
-	movd LEN, T0
-	punpcklbw T0, T0
-	punpcklbw T0, T0
-	punpcklbw T0, T0
-	punpcklbw T0, T0
-	movdqa .Laegis128_counter(%rip), T1
-	pcmpgtb T1, T0
+	lea .Lzeropad_mask+16(%rip), %rax
+	sub %r9, %rax
+	movdqu (%rax), T0
 	pand T0, MSG
 
 	aegis128_update
 	pxor MSG, STATE4
 
-- 
2.47.0


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH v2 08/10] crypto: x86/aegis128 - take advantage of block-aligned len
  2024-10-17  0:00 [PATCH v2 00/10] AEGIS x86 assembly tuning Eric Biggers
                   ` (6 preceding siblings ...)
  2024-10-17  0:00 ` [PATCH v2 07/10] crypto: x86/aegis128 - optimize partial block handling using SSE4.1 Eric Biggers
@ 2024-10-17  0:00 ` Eric Biggers
  2024-10-17  0:00 ` [PATCH v2 09/10] crypto: x86/aegis128 - remove unneeded FRAME_BEGIN and FRAME_END Eric Biggers
                   ` (2 subsequent siblings)
  10 siblings, 0 replies; 12+ messages in thread
From: Eric Biggers @ 2024-10-17  0:00 UTC (permalink / raw)
  To: linux-crypto; +Cc: x86, Ondrej Mosnacek, linux-kernel

From: Eric Biggers <ebiggers@google.com>

Update a caller of aegis128_aesni_ad() to round down the length to a
block boundary.  After that, aegis128_aesni_ad(), aegis128_aesni_enc(),
and aegis128_aesni_dec() are only passed whole blocks.  Update the
assembly code to take advantage of that, which eliminates some unneeded
instructions.  For aegis128_aesni_enc() and aegis128_aesni_dec(), the
length is also always nonzero, so stop checking for zero length.

Reviewed-by: Ondrej Mosnacek <omosnace@redhat.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 arch/x86/crypto/aegis128-aesni-asm.S  | 37 +++++++++++----------------
 arch/x86/crypto/aegis128-aesni-glue.c |  4 +--
 2 files changed, 17 insertions(+), 24 deletions(-)

diff --git a/arch/x86/crypto/aegis128-aesni-asm.S b/arch/x86/crypto/aegis128-aesni-asm.S
index e650330ef6951..345b1eafe45af 100644
--- a/arch/x86/crypto/aegis128-aesni-asm.S
+++ b/arch/x86/crypto/aegis128-aesni-asm.S
@@ -188,19 +188,21 @@ SYM_FUNC_START(aegis128_aesni_init)
 SYM_FUNC_END(aegis128_aesni_init)
 
 /*
  * void aegis128_aesni_ad(struct aegis_state *state, const u8 *data,
  *			  unsigned int len);
+ *
+ * len must be a multiple of 16.
  */
 SYM_FUNC_START(aegis128_aesni_ad)
 	.set STATEP, %rdi
 	.set SRC, %rsi
 	.set LEN, %edx
 	FRAME_BEGIN
 
-	cmp $0x10, LEN
-	jb .Lad_out
+	test LEN, LEN
+	jz .Lad_out
 
 	/* load the state: */
 	movdqu 0x00(STATEP), STATE0
 	movdqu 0x10(STATEP), STATE1
 	movdqu 0x20(STATEP), STATE2
@@ -211,40 +213,35 @@ SYM_FUNC_START(aegis128_aesni_ad)
 .Lad_loop:
 	movdqu 0x00(SRC), MSG
 	aegis128_update
 	pxor MSG, STATE4
 	sub $0x10, LEN
-	cmp $0x10, LEN
-	jl .Lad_out_1
+	jz .Lad_out_1
 
 	movdqu 0x10(SRC), MSG
 	aegis128_update
 	pxor MSG, STATE3
 	sub $0x10, LEN
-	cmp $0x10, LEN
-	jl .Lad_out_2
+	jz .Lad_out_2
 
 	movdqu 0x20(SRC), MSG
 	aegis128_update
 	pxor MSG, STATE2
 	sub $0x10, LEN
-	cmp $0x10, LEN
-	jl .Lad_out_3
+	jz .Lad_out_3
 
 	movdqu 0x30(SRC), MSG
 	aegis128_update
 	pxor MSG, STATE1
 	sub $0x10, LEN
-	cmp $0x10, LEN
-	jl .Lad_out_4
+	jz .Lad_out_4
 
 	movdqu 0x40(SRC), MSG
 	aegis128_update
 	pxor MSG, STATE0
 	sub $0x10, LEN
-	cmp $0x10, LEN
-	jl .Lad_out_0
+	jz .Lad_out_0
 
 	add $0x50, SRC
 	jmp .Lad_loop
 
 	/* store the state: */
@@ -310,28 +307,26 @@ SYM_FUNC_END(aegis128_aesni_ad)
 
 	aegis128_update
 	pxor MSG, \s4
 
 	sub $0x10, LEN
-	cmp $0x10, LEN
-	jl .Lenc_out_\i
+	jz .Lenc_out_\i
 .endm
 
 /*
  * void aegis128_aesni_enc(struct aegis_state *state, const u8 *src, u8 *dst,
  *			   unsigned int len);
+ *
+ * len must be nonzero and a multiple of 16.
  */
 SYM_FUNC_START(aegis128_aesni_enc)
 	.set STATEP, %rdi
 	.set SRC, %rsi
 	.set DST, %rdx
 	.set LEN, %ecx
 	FRAME_BEGIN
 
-	cmp $0x10, LEN
-	jb .Lenc_out
-
 	/* load the state: */
 	movdqu 0x00(STATEP), STATE0
 	movdqu 0x10(STATEP), STATE1
 	movdqu 0x20(STATEP), STATE2
 	movdqu 0x30(STATEP), STATE3
@@ -457,28 +452,26 @@ SYM_FUNC_END(aegis128_aesni_enc_tail)
 
 	aegis128_update
 	pxor MSG, \s4
 
 	sub $0x10, LEN
-	cmp $0x10, LEN
-	jl .Ldec_out_\i
+	jz .Ldec_out_\i
 .endm
 
 /*
  * void aegis128_aesni_dec(struct aegis_state *state, const u8 *src, u8 *dst,
  *			   unsigned int len);
+ *
+ * len must be nonzero and a multiple of 16.
  */
 SYM_FUNC_START(aegis128_aesni_dec)
 	.set STATEP, %rdi
 	.set SRC, %rsi
 	.set DST, %rdx
 	.set LEN, %ecx
 	FRAME_BEGIN
 
-	cmp $0x10, LEN
-	jb .Ldec_out
-
 	/* load the state: */
 	movdqu 0x00(STATEP), STATE0
 	movdqu 0x10(STATEP), STATE1
 	movdqu 0x20(STATEP), STATE2
 	movdqu 0x30(STATEP), STATE3
diff --git a/arch/x86/crypto/aegis128-aesni-glue.c b/arch/x86/crypto/aegis128-aesni-glue.c
index 9555958e4089d..c19d8e3d96a35 100644
--- a/arch/x86/crypto/aegis128-aesni-glue.c
+++ b/arch/x86/crypto/aegis128-aesni-glue.c
@@ -85,12 +85,12 @@ static void crypto_aegis128_aesni_process_ad(
 				pos = 0;
 				left -= fill;
 				src += fill;
 			}
 
-			aegis128_aesni_ad(state, src, left);
-
+			aegis128_aesni_ad(state, src,
+					  left & ~(AEGIS128_BLOCK_SIZE - 1));
 			src += left & ~(AEGIS128_BLOCK_SIZE - 1);
 			left &= AEGIS128_BLOCK_SIZE - 1;
 		}
 
 		memcpy(buf.bytes + pos, src, left);
-- 
2.47.0


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH v2 09/10] crypto: x86/aegis128 - remove unneeded FRAME_BEGIN and FRAME_END
  2024-10-17  0:00 [PATCH v2 00/10] AEGIS x86 assembly tuning Eric Biggers
                   ` (7 preceding siblings ...)
  2024-10-17  0:00 ` [PATCH v2 08/10] crypto: x86/aegis128 - take advantage of block-aligned len Eric Biggers
@ 2024-10-17  0:00 ` Eric Biggers
  2024-10-17  0:00 ` [PATCH v2 10/10] crypto: x86/aegis128 - remove unneeded RETs Eric Biggers
  2024-10-26  6:59 ` [PATCH v2 00/10] AEGIS x86 assembly tuning Herbert Xu
  10 siblings, 0 replies; 12+ messages in thread
From: Eric Biggers @ 2024-10-17  0:00 UTC (permalink / raw)
  To: linux-crypto; +Cc: x86, Ondrej Mosnacek, linux-kernel

From: Eric Biggers <ebiggers@google.com>

Stop using FRAME_BEGIN and FRAME_END in the AEGIS assembly functions,
since all these functions are now leaf functions (the calls to
__load_partial and __store_partial were replaced by macros earlier in
this series).  This eliminates some unnecessary instructions.

Reviewed-by: Ondrej Mosnacek <omosnace@redhat.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 arch/x86/crypto/aegis128-aesni-asm.S | 34 ----------------------------
 1 file changed, 34 deletions(-)

diff --git a/arch/x86/crypto/aegis128-aesni-asm.S b/arch/x86/crypto/aegis128-aesni-asm.S
index 345b1eafe45af..42f25fea4e082 100644
--- a/arch/x86/crypto/aegis128-aesni-asm.S
+++ b/arch/x86/crypto/aegis128-aesni-asm.S
@@ -6,11 +6,10 @@
  * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
  * Copyright 2024 Google LLC
  */
 
 #include <linux/linkage.h>
-#include <asm/frame.h>
 
 #define STATE0	%xmm0
 #define STATE1	%xmm1
 #define STATE2	%xmm2
 #define STATE3	%xmm3
@@ -144,11 +143,10 @@
  */
 SYM_FUNC_START(aegis128_aesni_init)
 	.set STATEP, %rdi
 	.set KEYP, %rsi
 	.set IVP, %rdx
-	FRAME_BEGIN
 
 	/* load IV: */
 	movdqu (IVP), T1
 
 	/* load key: */
@@ -180,12 +178,10 @@ SYM_FUNC_START(aegis128_aesni_init)
 	movdqu STATE0, 0x00(STATEP)
 	movdqu STATE1, 0x10(STATEP)
 	movdqu STATE2, 0x20(STATEP)
 	movdqu STATE3, 0x30(STATEP)
 	movdqu STATE4, 0x40(STATEP)
-
-	FRAME_END
 	RET
 SYM_FUNC_END(aegis128_aesni_init)
 
 /*
  * void aegis128_aesni_ad(struct aegis_state *state, const u8 *data,
@@ -195,11 +191,10 @@ SYM_FUNC_END(aegis128_aesni_init)
  */
 SYM_FUNC_START(aegis128_aesni_ad)
 	.set STATEP, %rdi
 	.set SRC, %rsi
 	.set LEN, %edx
-	FRAME_BEGIN
 
 	test LEN, LEN
 	jz .Lad_out
 
 	/* load the state: */
@@ -249,51 +244,45 @@ SYM_FUNC_START(aegis128_aesni_ad)
 	movdqu STATE0, 0x00(STATEP)
 	movdqu STATE1, 0x10(STATEP)
 	movdqu STATE2, 0x20(STATEP)
 	movdqu STATE3, 0x30(STATEP)
 	movdqu STATE4, 0x40(STATEP)
-	FRAME_END
 	RET
 
 .Lad_out_1:
 	movdqu STATE4, 0x00(STATEP)
 	movdqu STATE0, 0x10(STATEP)
 	movdqu STATE1, 0x20(STATEP)
 	movdqu STATE2, 0x30(STATEP)
 	movdqu STATE3, 0x40(STATEP)
-	FRAME_END
 	RET
 
 .Lad_out_2:
 	movdqu STATE3, 0x00(STATEP)
 	movdqu STATE4, 0x10(STATEP)
 	movdqu STATE0, 0x20(STATEP)
 	movdqu STATE1, 0x30(STATEP)
 	movdqu STATE2, 0x40(STATEP)
-	FRAME_END
 	RET
 
 .Lad_out_3:
 	movdqu STATE2, 0x00(STATEP)
 	movdqu STATE3, 0x10(STATEP)
 	movdqu STATE4, 0x20(STATEP)
 	movdqu STATE0, 0x30(STATEP)
 	movdqu STATE1, 0x40(STATEP)
-	FRAME_END
 	RET
 
 .Lad_out_4:
 	movdqu STATE1, 0x00(STATEP)
 	movdqu STATE2, 0x10(STATEP)
 	movdqu STATE3, 0x20(STATEP)
 	movdqu STATE4, 0x30(STATEP)
 	movdqu STATE0, 0x40(STATEP)
-	FRAME_END
 	RET
 
 .Lad_out:
-	FRAME_END
 	RET
 SYM_FUNC_END(aegis128_aesni_ad)
 
 .macro encrypt_block s0 s1 s2 s3 s4 i
 	movdqu (\i * 0x10)(SRC), MSG
@@ -321,11 +310,10 @@ SYM_FUNC_END(aegis128_aesni_ad)
 SYM_FUNC_START(aegis128_aesni_enc)
 	.set STATEP, %rdi
 	.set SRC, %rsi
 	.set DST, %rdx
 	.set LEN, %ecx
-	FRAME_BEGIN
 
 	/* load the state: */
 	movdqu 0x00(STATEP), STATE0
 	movdqu 0x10(STATEP), STATE1
 	movdqu 0x20(STATEP), STATE2
@@ -349,51 +337,45 @@ SYM_FUNC_START(aegis128_aesni_enc)
 	movdqu STATE4, 0x00(STATEP)
 	movdqu STATE0, 0x10(STATEP)
 	movdqu STATE1, 0x20(STATEP)
 	movdqu STATE2, 0x30(STATEP)
 	movdqu STATE3, 0x40(STATEP)
-	FRAME_END
 	RET
 
 .Lenc_out_1:
 	movdqu STATE3, 0x00(STATEP)
 	movdqu STATE4, 0x10(STATEP)
 	movdqu STATE0, 0x20(STATEP)
 	movdqu STATE1, 0x30(STATEP)
 	movdqu STATE2, 0x40(STATEP)
-	FRAME_END
 	RET
 
 .Lenc_out_2:
 	movdqu STATE2, 0x00(STATEP)
 	movdqu STATE3, 0x10(STATEP)
 	movdqu STATE4, 0x20(STATEP)
 	movdqu STATE0, 0x30(STATEP)
 	movdqu STATE1, 0x40(STATEP)
-	FRAME_END
 	RET
 
 .Lenc_out_3:
 	movdqu STATE1, 0x00(STATEP)
 	movdqu STATE2, 0x10(STATEP)
 	movdqu STATE3, 0x20(STATEP)
 	movdqu STATE4, 0x30(STATEP)
 	movdqu STATE0, 0x40(STATEP)
-	FRAME_END
 	RET
 
 .Lenc_out_4:
 	movdqu STATE0, 0x00(STATEP)
 	movdqu STATE1, 0x10(STATEP)
 	movdqu STATE2, 0x20(STATEP)
 	movdqu STATE3, 0x30(STATEP)
 	movdqu STATE4, 0x40(STATEP)
-	FRAME_END
 	RET
 
 .Lenc_out:
-	FRAME_END
 	RET
 SYM_FUNC_END(aegis128_aesni_enc)
 
 /*
  * void aegis128_aesni_enc_tail(struct aegis_state *state, const u8 *src,
@@ -402,11 +384,10 @@ SYM_FUNC_END(aegis128_aesni_enc)
 SYM_FUNC_START(aegis128_aesni_enc_tail)
 	.set STATEP, %rdi
 	.set SRC, %rsi
 	.set DST, %rdx
 	.set LEN, %ecx	/* {load,store}_partial rely on this being %ecx */
-	FRAME_BEGIN
 
 	/* load the state: */
 	movdqu 0x00(STATEP), STATE0
 	movdqu 0x10(STATEP), STATE1
 	movdqu 0x20(STATEP), STATE2
@@ -434,12 +415,10 @@ SYM_FUNC_START(aegis128_aesni_enc_tail)
 	movdqu STATE4, 0x00(STATEP)
 	movdqu STATE0, 0x10(STATEP)
 	movdqu STATE1, 0x20(STATEP)
 	movdqu STATE2, 0x30(STATEP)
 	movdqu STATE3, 0x40(STATEP)
-
-	FRAME_END
 	RET
 SYM_FUNC_END(aegis128_aesni_enc_tail)
 
 .macro decrypt_block s0 s1 s2 s3 s4 i
 	movdqu (\i * 0x10)(SRC), MSG
@@ -466,11 +445,10 @@ SYM_FUNC_END(aegis128_aesni_enc_tail)
 SYM_FUNC_START(aegis128_aesni_dec)
 	.set STATEP, %rdi
 	.set SRC, %rsi
 	.set DST, %rdx
 	.set LEN, %ecx
-	FRAME_BEGIN
 
 	/* load the state: */
 	movdqu 0x00(STATEP), STATE0
 	movdqu 0x10(STATEP), STATE1
 	movdqu 0x20(STATEP), STATE2
@@ -494,51 +472,45 @@ SYM_FUNC_START(aegis128_aesni_dec)
 	movdqu STATE4, 0x00(STATEP)
 	movdqu STATE0, 0x10(STATEP)
 	movdqu STATE1, 0x20(STATEP)
 	movdqu STATE2, 0x30(STATEP)
 	movdqu STATE3, 0x40(STATEP)
-	FRAME_END
 	RET
 
 .Ldec_out_1:
 	movdqu STATE3, 0x00(STATEP)
 	movdqu STATE4, 0x10(STATEP)
 	movdqu STATE0, 0x20(STATEP)
 	movdqu STATE1, 0x30(STATEP)
 	movdqu STATE2, 0x40(STATEP)
-	FRAME_END
 	RET
 
 .Ldec_out_2:
 	movdqu STATE2, 0x00(STATEP)
 	movdqu STATE3, 0x10(STATEP)
 	movdqu STATE4, 0x20(STATEP)
 	movdqu STATE0, 0x30(STATEP)
 	movdqu STATE1, 0x40(STATEP)
-	FRAME_END
 	RET
 
 .Ldec_out_3:
 	movdqu STATE1, 0x00(STATEP)
 	movdqu STATE2, 0x10(STATEP)
 	movdqu STATE3, 0x20(STATEP)
 	movdqu STATE4, 0x30(STATEP)
 	movdqu STATE0, 0x40(STATEP)
-	FRAME_END
 	RET
 
 .Ldec_out_4:
 	movdqu STATE0, 0x00(STATEP)
 	movdqu STATE1, 0x10(STATEP)
 	movdqu STATE2, 0x20(STATEP)
 	movdqu STATE3, 0x30(STATEP)
 	movdqu STATE4, 0x40(STATEP)
-	FRAME_END
 	RET
 
 .Ldec_out:
-	FRAME_END
 	RET
 SYM_FUNC_END(aegis128_aesni_dec)
 
 /*
  * void aegis128_aesni_dec_tail(struct aegis_state *state, const u8 *src,
@@ -547,11 +519,10 @@ SYM_FUNC_END(aegis128_aesni_dec)
 SYM_FUNC_START(aegis128_aesni_dec_tail)
 	.set STATEP, %rdi
 	.set SRC, %rsi
 	.set DST, %rdx
 	.set LEN, %ecx	/* {load,store}_partial rely on this being %ecx */
-	FRAME_BEGIN
 
 	/* load the state: */
 	movdqu 0x00(STATEP), STATE0
 	movdqu 0x10(STATEP), STATE1
 	movdqu 0x20(STATEP), STATE2
@@ -584,12 +555,10 @@ SYM_FUNC_START(aegis128_aesni_dec_tail)
 	movdqu STATE4, 0x00(STATEP)
 	movdqu STATE0, 0x10(STATEP)
 	movdqu STATE1, 0x20(STATEP)
 	movdqu STATE2, 0x30(STATEP)
 	movdqu STATE3, 0x40(STATEP)
-
-	FRAME_END
 	RET
 SYM_FUNC_END(aegis128_aesni_dec_tail)
 
 /*
  * void aegis128_aesni_final(struct aegis_state *state,
@@ -599,11 +568,10 @@ SYM_FUNC_END(aegis128_aesni_dec_tail)
 SYM_FUNC_START(aegis128_aesni_final)
 	.set STATEP, %rdi
 	.set TAG_XOR, %rsi
 	.set ASSOCLEN, %edx
 	.set CRYPTLEN, %ecx
-	FRAME_BEGIN
 
 	/* load the state: */
 	movdqu 0x00(STATEP), STATE0
 	movdqu 0x10(STATEP), STATE1
 	movdqu 0x20(STATEP), STATE2
@@ -634,9 +602,7 @@ SYM_FUNC_START(aegis128_aesni_final)
 	pxor STATE2, MSG
 	pxor STATE3, MSG
 	pxor STATE4, MSG
 
 	movdqu MSG, (TAG_XOR)
-
-	FRAME_END
 	RET
 SYM_FUNC_END(aegis128_aesni_final)
-- 
2.47.0


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH v2 10/10] crypto: x86/aegis128 - remove unneeded RETs
  2024-10-17  0:00 [PATCH v2 00/10] AEGIS x86 assembly tuning Eric Biggers
                   ` (8 preceding siblings ...)
  2024-10-17  0:00 ` [PATCH v2 09/10] crypto: x86/aegis128 - remove unneeded FRAME_BEGIN and FRAME_END Eric Biggers
@ 2024-10-17  0:00 ` Eric Biggers
  2024-10-26  6:59 ` [PATCH v2 00/10] AEGIS x86 assembly tuning Herbert Xu
  10 siblings, 0 replies; 12+ messages in thread
From: Eric Biggers @ 2024-10-17  0:00 UTC (permalink / raw)
  To: linux-crypto; +Cc: x86, Ondrej Mosnacek, linux-kernel

From: Eric Biggers <ebiggers@google.com>

Remove returns that are immediately followed by another return.

Reviewed-by: Ondrej Mosnacek <omosnace@redhat.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 arch/x86/crypto/aegis128-aesni-asm.S | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/arch/x86/crypto/aegis128-aesni-asm.S b/arch/x86/crypto/aegis128-aesni-asm.S
index 42f25fea4e082..7294dc0ee7baa 100644
--- a/arch/x86/crypto/aegis128-aesni-asm.S
+++ b/arch/x86/crypto/aegis128-aesni-asm.S
@@ -276,12 +276,10 @@ SYM_FUNC_START(aegis128_aesni_ad)
 	movdqu STATE1, 0x00(STATEP)
 	movdqu STATE2, 0x10(STATEP)
 	movdqu STATE3, 0x20(STATEP)
 	movdqu STATE4, 0x30(STATEP)
 	movdqu STATE0, 0x40(STATEP)
-	RET
-
 .Lad_out:
 	RET
 SYM_FUNC_END(aegis128_aesni_ad)
 
 .macro encrypt_block s0 s1 s2 s3 s4 i
@@ -369,12 +367,10 @@ SYM_FUNC_START(aegis128_aesni_enc)
 	movdqu STATE0, 0x00(STATEP)
 	movdqu STATE1, 0x10(STATEP)
 	movdqu STATE2, 0x20(STATEP)
 	movdqu STATE3, 0x30(STATEP)
 	movdqu STATE4, 0x40(STATEP)
-	RET
-
 .Lenc_out:
 	RET
 SYM_FUNC_END(aegis128_aesni_enc)
 
 /*
@@ -504,12 +500,10 @@ SYM_FUNC_START(aegis128_aesni_dec)
 	movdqu STATE0, 0x00(STATEP)
 	movdqu STATE1, 0x10(STATEP)
 	movdqu STATE2, 0x20(STATEP)
 	movdqu STATE3, 0x30(STATEP)
 	movdqu STATE4, 0x40(STATEP)
-	RET
-
 .Ldec_out:
 	RET
 SYM_FUNC_END(aegis128_aesni_dec)
 
 /*
-- 
2.47.0


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH v2 00/10] AEGIS x86 assembly tuning
  2024-10-17  0:00 [PATCH v2 00/10] AEGIS x86 assembly tuning Eric Biggers
                   ` (9 preceding siblings ...)
  2024-10-17  0:00 ` [PATCH v2 10/10] crypto: x86/aegis128 - remove unneeded RETs Eric Biggers
@ 2024-10-26  6:59 ` Herbert Xu
  10 siblings, 0 replies; 12+ messages in thread
From: Herbert Xu @ 2024-10-26  6:59 UTC (permalink / raw)
  To: Eric Biggers; +Cc: linux-crypto, x86, omosnace, linux-kernel

Eric Biggers <ebiggers@kernel.org> wrote:
> This series cleans up the AES-NI optimized implementation of AEGIS-128.
> 
> Performance is improved by 1-5% depending on the input lengths.  Binary
> code size is reduced by about 20% (measuring glue + assembly combined),
> and source code length is reduced by about 150 lines.
> 
> The first patch also fixes a bug which could theoretically cause
> incorrect behavior but was seemingly not being encountered in practice.
> 
> Note: future optimizations for AEGIS-128 could involve adding AVX512 /
> AVX10 optimized assembly code.  However, unfortunately due to the way
> that AEGIS-128 is specified, its level of parallelism is limited, and it
> can't really take advantage of vector lengths greater than 128 bits.
> So, probably this would provide only another modest improvement, mostly
> coming from being able to use the ternary logic instructions.
> 
> Changed in v2:
> - Put assoclen and cryptlen in the correct order in the prototype of
>  aegis128_aesni_final().
> - Expanded commit message of "eliminate some indirect calls"
> - Added Ondrej's Reviewed-by.
> 
> Eric Biggers (10):
>  crypto: x86/aegis128 - access 32-bit arguments as 32-bit
>  crypto: x86/aegis128 - remove no-op init and exit functions
>  crypto: x86/aegis128 - eliminate some indirect calls
>  crypto: x86/aegis128 - don't bother with special code for aligned data
>  crypto: x86/aegis128 - optimize length block preparation using SSE4.1
>  crypto: x86/aegis128 - improve assembly function prototypes
>  crypto: x86/aegis128 - optimize partial block handling using SSE4.1
>  crypto: x86/aegis128 - take advantage of block-aligned len
>  crypto: x86/aegis128 - remove unneeded FRAME_BEGIN and FRAME_END
>  crypto: x86/aegis128 - remove unneeded RETs
> 
> arch/x86/crypto/Kconfig               |   4 +-
> arch/x86/crypto/aegis128-aesni-asm.S  | 532 ++++++++++----------------
> arch/x86/crypto/aegis128-aesni-glue.c | 145 ++++---
> 3 files changed, 261 insertions(+), 420 deletions(-)
> 
> base-commit: 5c20772738e1d1d7bec41664eb9d61497e53c10e

All applied.  Thanks.
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2024-10-26  6:59 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-10-17  0:00 [PATCH v2 00/10] AEGIS x86 assembly tuning Eric Biggers
2024-10-17  0:00 ` [PATCH v2 01/10] crypto: x86/aegis128 - access 32-bit arguments as 32-bit Eric Biggers
2024-10-17  0:00 ` [PATCH v2 02/10] crypto: x86/aegis128 - remove no-op init and exit functions Eric Biggers
2024-10-17  0:00 ` [PATCH v2 03/10] crypto: x86/aegis128 - eliminate some indirect calls Eric Biggers
2024-10-17  0:00 ` [PATCH v2 04/10] crypto: x86/aegis128 - don't bother with special code for aligned data Eric Biggers
2024-10-17  0:00 ` [PATCH v2 05/10] crypto: x86/aegis128 - optimize length block preparation using SSE4.1 Eric Biggers
2024-10-17  0:00 ` [PATCH v2 06/10] crypto: x86/aegis128 - improve assembly function prototypes Eric Biggers
2024-10-17  0:00 ` [PATCH v2 07/10] crypto: x86/aegis128 - optimize partial block handling using SSE4.1 Eric Biggers
2024-10-17  0:00 ` [PATCH v2 08/10] crypto: x86/aegis128 - take advantage of block-aligned len Eric Biggers
2024-10-17  0:00 ` [PATCH v2 09/10] crypto: x86/aegis128 - remove unneeded FRAME_BEGIN and FRAME_END Eric Biggers
2024-10-17  0:00 ` [PATCH v2 10/10] crypto: x86/aegis128 - remove unneeded RETs Eric Biggers
2024-10-26  6:59 ` [PATCH v2 00/10] AEGIS x86 assembly tuning Herbert Xu

This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for NNTP newsgroup(s).