All of lore.kernel.org
 help / color / mirror / Atom feed
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
To: linux-kernel@vger.kernel.org
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>,
	stable@vger.kernel.org, Ard Biesheuvel <ardb@kernel.org>,
	Herbert Xu <herbert@gondor.apana.org.au>,
	Eric Biggers <ebiggers@google.com>
Subject: [PATCH 5.11 31/31] crypto: x86/aes-ni-xts - use direct calls to and 4-way stride
Date: Fri, 19 Mar 2021 13:19:25 +0100	[thread overview]
Message-ID: <20210319121748.211622571@linuxfoundation.org> (raw)
In-Reply-To: <20210319121747.203523570@linuxfoundation.org>

From: Ard Biesheuvel <ardb@kernel.org>

commit 86ad60a65f29dd862a11c22bb4b5be28d6c5cef1 upstream.

The XTS asm helper arrangement is a bit odd: the 8-way stride helper
consists of back-to-back calls to the 4-way core transforms, which
are called indirectly, based on a boolean that indicates whether we
are performing encryption or decryption.

Given how costly indirect calls are on x86, let's switch to direct
calls, and given how the 8-way stride doesn't really add anything
substantial, use a 4-way stride instead, and make the asm core
routine deal with any multiple of 4 blocks. Since 512 byte sectors
or 4 KB blocks are the typical quantities XTS operates on, increase
the stride exported to the glue helper to 512 bytes as well.

As a result, the number of indirect calls is reduced from 3 per 64 bytes
of in/output to 1 per 512 bytes of in/output, which produces a 65% speedup
when operating on 1 KB blocks (measured on an Intel(R) Core(TM) i7-8650U CPU).

Fixes: 9697fa39efd3f ("x86/retpoline/crypto: Convert crypto assembler indirect jumps")
Tested-by: Eric Biggers <ebiggers@google.com> # x86_64
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/crypto/aesni-intel_asm.S  |  115 ++++++++++++++++++++++---------------
 arch/x86/crypto/aesni-intel_glue.c |   25 ++++----
 2 files changed, 84 insertions(+), 56 deletions(-)

--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -2715,25 +2715,18 @@ SYM_FUNC_END(aesni_ctr_enc)
 	pxor CTR, IV;
 
 /*
- * void aesni_xts_crypt8(const struct crypto_aes_ctx *ctx, u8 *dst,
- *			 const u8 *src, bool enc, le128 *iv)
+ * void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
+ *			  const u8 *src, unsigned int len, le128 *iv)
  */
-SYM_FUNC_START(aesni_xts_crypt8)
+SYM_FUNC_START(aesni_xts_encrypt)
 	FRAME_BEGIN
-	testb %cl, %cl
-	movl $0, %ecx
-	movl $240, %r10d
-	leaq _aesni_enc4, %r11
-	leaq _aesni_dec4, %rax
-	cmovel %r10d, %ecx
-	cmoveq %rax, %r11
 
 	movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
 	movups (IVP), IV
 
 	mov 480(KEYP), KLEN
-	addq %rcx, KEYP
 
+.Lxts_enc_loop4:
 	movdqa IV, STATE1
 	movdqu 0x00(INP), INC
 	pxor INC, STATE1
@@ -2757,71 +2750,103 @@ SYM_FUNC_START(aesni_xts_crypt8)
 	pxor INC, STATE4
 	movdqu IV, 0x30(OUTP)
 
-	CALL_NOSPEC r11
+	call _aesni_enc4
 
 	movdqu 0x00(OUTP), INC
 	pxor INC, STATE1
 	movdqu STATE1, 0x00(OUTP)
 
-	_aesni_gf128mul_x_ble()
-	movdqa IV, STATE1
-	movdqu 0x40(INP), INC
-	pxor INC, STATE1
-	movdqu IV, 0x40(OUTP)
-
 	movdqu 0x10(OUTP), INC
 	pxor INC, STATE2
 	movdqu STATE2, 0x10(OUTP)
 
-	_aesni_gf128mul_x_ble()
-	movdqa IV, STATE2
-	movdqu 0x50(INP), INC
-	pxor INC, STATE2
-	movdqu IV, 0x50(OUTP)
-
 	movdqu 0x20(OUTP), INC
 	pxor INC, STATE3
 	movdqu STATE3, 0x20(OUTP)
 
-	_aesni_gf128mul_x_ble()
-	movdqa IV, STATE3
-	movdqu 0x60(INP), INC
-	pxor INC, STATE3
-	movdqu IV, 0x60(OUTP)
-
 	movdqu 0x30(OUTP), INC
 	pxor INC, STATE4
 	movdqu STATE4, 0x30(OUTP)
 
 	_aesni_gf128mul_x_ble()
-	movdqa IV, STATE4
-	movdqu 0x70(INP), INC
-	pxor INC, STATE4
-	movdqu IV, 0x70(OUTP)
 
-	_aesni_gf128mul_x_ble()
+	add $64, INP
+	add $64, OUTP
+	sub $64, LEN
+	ja .Lxts_enc_loop4
+
 	movups IV, (IVP)
 
-	CALL_NOSPEC r11
+	FRAME_END
+	ret
+SYM_FUNC_END(aesni_xts_encrypt)
+
+/*
+ * void aesni_xts_decrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
+ *			  const u8 *src, unsigned int len, le128 *iv)
+ */
+SYM_FUNC_START(aesni_xts_decrypt)
+	FRAME_BEGIN
+
+	movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
+	movups (IVP), IV
+
+	mov 480(KEYP), KLEN
+	add $240, KEYP
+
+.Lxts_dec_loop4:
+	movdqa IV, STATE1
+	movdqu 0x00(INP), INC
+	pxor INC, STATE1
+	movdqu IV, 0x00(OUTP)
+
+	_aesni_gf128mul_x_ble()
+	movdqa IV, STATE2
+	movdqu 0x10(INP), INC
+	pxor INC, STATE2
+	movdqu IV, 0x10(OUTP)
+
+	_aesni_gf128mul_x_ble()
+	movdqa IV, STATE3
+	movdqu 0x20(INP), INC
+	pxor INC, STATE3
+	movdqu IV, 0x20(OUTP)
+
+	_aesni_gf128mul_x_ble()
+	movdqa IV, STATE4
+	movdqu 0x30(INP), INC
+	pxor INC, STATE4
+	movdqu IV, 0x30(OUTP)
+
+	call _aesni_dec4
 
-	movdqu 0x40(OUTP), INC
+	movdqu 0x00(OUTP), INC
 	pxor INC, STATE1
-	movdqu STATE1, 0x40(OUTP)
+	movdqu STATE1, 0x00(OUTP)
 
-	movdqu 0x50(OUTP), INC
+	movdqu 0x10(OUTP), INC
 	pxor INC, STATE2
-	movdqu STATE2, 0x50(OUTP)
+	movdqu STATE2, 0x10(OUTP)
 
-	movdqu 0x60(OUTP), INC
+	movdqu 0x20(OUTP), INC
 	pxor INC, STATE3
-	movdqu STATE3, 0x60(OUTP)
+	movdqu STATE3, 0x20(OUTP)
 
-	movdqu 0x70(OUTP), INC
+	movdqu 0x30(OUTP), INC
 	pxor INC, STATE4
-	movdqu STATE4, 0x70(OUTP)
+	movdqu STATE4, 0x30(OUTP)
+
+	_aesni_gf128mul_x_ble()
+
+	add $64, INP
+	add $64, OUTP
+	sub $64, LEN
+	ja .Lxts_dec_loop4
+
+	movups IV, (IVP)
 
 	FRAME_END
 	ret
-SYM_FUNC_END(aesni_xts_crypt8)
+SYM_FUNC_END(aesni_xts_decrypt)
 
 #endif
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -97,6 +97,12 @@ asmlinkage void aesni_cbc_dec(struct cry
 #define AVX_GEN2_OPTSIZE 640
 #define AVX_GEN4_OPTSIZE 4096
 
+asmlinkage void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *out,
+				  const u8 *in, unsigned int len, u8 *iv);
+
+asmlinkage void aesni_xts_decrypt(const struct crypto_aes_ctx *ctx, u8 *out,
+				  const u8 *in, unsigned int len, u8 *iv);
+
 #ifdef CONFIG_X86_64
 
 static void (*aesni_ctr_enc_tfm)(struct crypto_aes_ctx *ctx, u8 *out,
@@ -104,9 +110,6 @@ static void (*aesni_ctr_enc_tfm)(struct
 asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
 			      const u8 *in, unsigned int len, u8 *iv);
 
-asmlinkage void aesni_xts_crypt8(const struct crypto_aes_ctx *ctx, u8 *out,
-				 const u8 *in, bool enc, le128 *iv);
-
 /* asmlinkage void aesni_gcm_enc()
  * void *ctx,  AES Key schedule. Starts on a 16 byte boundary.
  * struct gcm_context_data.  May be uninitialized.
@@ -547,14 +550,14 @@ static void aesni_xts_dec(const void *ct
 	glue_xts_crypt_128bit_one(ctx, dst, src, iv, aesni_dec);
 }
 
-static void aesni_xts_enc8(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
+static void aesni_xts_enc32(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
 {
-	aesni_xts_crypt8(ctx, dst, src, true, iv);
+	aesni_xts_encrypt(ctx, dst, src, 32 * AES_BLOCK_SIZE, (u8 *)iv);
 }
 
-static void aesni_xts_dec8(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
+static void aesni_xts_dec32(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
 {
-	aesni_xts_crypt8(ctx, dst, src, false, iv);
+	aesni_xts_decrypt(ctx, dst, src, 32 * AES_BLOCK_SIZE, (u8 *)iv);
 }
 
 static const struct common_glue_ctx aesni_enc_xts = {
@@ -562,8 +565,8 @@ static const struct common_glue_ctx aesn
 	.fpu_blocks_limit = 1,
 
 	.funcs = { {
-		.num_blocks = 8,
-		.fn_u = { .xts = aesni_xts_enc8 }
+		.num_blocks = 32,
+		.fn_u = { .xts = aesni_xts_enc32 }
 	}, {
 		.num_blocks = 1,
 		.fn_u = { .xts = aesni_xts_enc }
@@ -575,8 +578,8 @@ static const struct common_glue_ctx aesn
 	.fpu_blocks_limit = 1,
 
 	.funcs = { {
-		.num_blocks = 8,
-		.fn_u = { .xts = aesni_xts_dec8 }
+		.num_blocks = 32,
+		.fn_u = { .xts = aesni_xts_dec32 }
 	}, {
 		.num_blocks = 1,
 		.fn_u = { .xts = aesni_xts_dec }



  parent reply	other threads:[~2021-03-19 12:22 UTC|newest]

Thread overview: 43+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-03-19 12:18 [PATCH 5.11 00/31] 5.11.8-rc1 review Greg Kroah-Hartman
2021-03-19 12:18 ` [PATCH 5.11 01/31] io_uring: dont attempt IO reissue from the ring exit path Greg Kroah-Hartman
2021-03-19 12:18 ` [PATCH 5.11 02/31] KVM: x86/mmu: Expand on the comment in kvm_vcpu_ad_need_write_protect() Greg Kroah-Hartman
2021-03-19 12:18 ` [PATCH 5.11 03/31] KVM: x86/mmu: Set SPTE_AD_WRPROT_ONLY_MASK if and only if PML is enabled Greg Kroah-Hartman
2021-03-19 12:18 ` [PATCH 5.11 04/31] mptcp: send ack for every add_addr Greg Kroah-Hartman
2021-03-19 12:18 ` [PATCH 5.11 05/31] mptcp: pm: add lockdep assertions Greg Kroah-Hartman
2021-03-19 12:19 ` [PATCH 5.11 06/31] mptcp: dispose initial struct socket when its subflow is closed Greg Kroah-Hartman
2021-03-19 12:19 ` [PATCH 5.11 07/31] io_uring: refactor scheduling in io_cqring_wait Greg Kroah-Hartman
2021-03-19 12:19 ` [PATCH 5.11 08/31] io_uring: refactor io_cqring_wait Greg Kroah-Hartman
2021-03-19 12:19 ` [PATCH 5.11 09/31] io_uring: dont keep looping for more events if we cant flush overflow Greg Kroah-Hartman
2021-03-19 12:19 ` [PATCH 5.11 10/31] io_uring: simplify do_read return parsing Greg Kroah-Hartman
2021-03-19 12:19 ` [PATCH 5.11 11/31] io_uring: clear IOCB_WAITQ for non -EIOCBQUEUED return Greg Kroah-Hartman
2021-03-19 12:19 ` [PATCH 5.11 12/31] gpiolib: Read "gpio-line-names" from a firmware node Greg Kroah-Hartman
2021-03-19 12:27   ` Marek Vasut
2021-03-19 12:36     ` Greg Kroah-Hartman
2021-03-19 12:45       ` Marek Vasut
2021-03-19 12:19 ` [PATCH 5.11 13/31] net: bonding: fix error return code of bond_neigh_init() Greg Kroah-Hartman
2021-03-19 14:12   ` Jiri Kosina
2021-03-19 14:24     ` Jiri Kosina
2021-03-19 14:29       ` Greg Kroah-Hartman
2021-03-19 14:25     ` Greg Kroah-Hartman
2021-03-19 15:14       ` Jiri Kosina
2021-03-19 12:19 ` [PATCH 5.11 14/31] regulator: pca9450: Add SD_VSEL GPIO for LDO5 Greg Kroah-Hartman
2021-03-19 12:19 ` [PATCH 5.11 15/31] regulator: pca9450: Enable system reset on WDOG_B assertion Greg Kroah-Hartman
2021-03-19 12:19 ` [PATCH 5.11 16/31] regulator: pca9450: Clear PRESET_EN bit to fix BUCK1/2/3 voltage setting Greg Kroah-Hartman
2021-03-19 12:19 ` [PATCH 5.11 17/31] gfs2: Add common helper for holding and releasing the freeze glock Greg Kroah-Hartman
2021-03-19 12:19 ` [PATCH 5.11 18/31] gfs2: move freeze glock outside the make_fs_rw and _ro functions Greg Kroah-Hartman
2021-03-19 12:19 ` [PATCH 5.11 19/31] gfs2: bypass signal_our_withdraw if no journal Greg Kroah-Hartman
2021-03-19 12:19 ` [PATCH 5.11 20/31] bpf: Prohibit alu ops for pointer types not defining ptr_limit Greg Kroah-Hartman
2021-03-19 12:19 ` [PATCH 5.11 21/31] bpf: Fix off-by-one for area size in creating mask to left Greg Kroah-Hartman
2021-03-19 12:19 ` [PATCH 5.11 22/31] bpf: Simplify alu_limit masking for pointer arithmetic Greg Kroah-Hartman
2021-03-19 12:19 ` [PATCH 5.11 23/31] bpf: Add sanity check for upper ptr_limit Greg Kroah-Hartman
2021-03-19 12:19 ` [PATCH 5.11 24/31] bpf, selftests: Fix up some test_verifier cases for unprivileged Greg Kroah-Hartman
2021-03-19 12:19 ` [PATCH 5.11 25/31] arm64: Unconditionally set virtual cpu id registers Greg Kroah-Hartman
2021-03-19 12:19 ` [PATCH 5.11 26/31] RDMA/srp: Fix support for unpopulated and unbalanced NUMA nodes Greg Kroah-Hartman
2021-03-19 12:19 ` [PATCH 5.11 27/31] fuse: fix live lock in fuse_iget() Greg Kroah-Hartman
2021-03-19 12:19 ` [PATCH 5.11 28/31] Revert "nfsd4: remove check_conflicting_opens warning" Greg Kroah-Hartman
2021-03-19 12:19 ` [PATCH 5.11 29/31] Revert "nfsd4: a clients own opens neednt prevent delegations" Greg Kroah-Hartman
2021-03-19 12:19 ` [PATCH 5.11 30/31] net: dsa: b53: Support setting learning on port Greg Kroah-Hartman
2021-03-19 12:19 ` Greg Kroah-Hartman [this message]
2021-03-19 19:38 ` [PATCH 5.11 00/31] 5.11.8-rc1 review Naresh Kamboju
2021-03-20  9:52   ` Greg Kroah-Hartman
2021-03-19 21:23 ` Guenter Roeck

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210319121748.211622571@linuxfoundation.org \
    --to=gregkh@linuxfoundation.org \
    --cc=ardb@kernel.org \
    --cc=ebiggers@google.com \
    --cc=herbert@gondor.apana.org.au \
    --cc=linux-kernel@vger.kernel.org \
    --cc=stable@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.