All of lore.kernel.org
 help / color / mirror / Atom feed
From: Mark Rutland <mark.rutland@arm.com>
To: Ard Biesheuvel <ardb@google.com>
Cc: linux-arm-kernel@lists.infradead.org,
	linux-crypto@vger.kernel.org, Ard Biesheuvel <ardb@kernel.org>,
	Marc Zyngier <maz@kernel.org>, Will Deacon <will@kernel.org>,
	Kees Cook <keescook@chromium.org>,
	Catalin Marinas <catalin.marinas@arm.com>,
	Mark Brown <broonie@kernel.org>,
	Eric Biggers <ebiggers@google.com>,
	Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Subject: Re: [PATCH v3 4/5] arm64: crypto: Remove conditional yield logic
Date: Mon, 27 Nov 2023 13:46:25 +0000	[thread overview]
Message-ID: <ZWSdsUIcCCuWlVhM@FVFF77S0Q05N> (raw)
In-Reply-To: <20231127122259.2265164-11-ardb@google.com>

On Mon, Nov 27, 2023 at 01:23:04PM +0100, Ard Biesheuvel wrote:
> From: Ard Biesheuvel <ardb@kernel.org>
> 
> Some classes of crypto algorithms (such as skciphers or aeads) have
> natural yield points, but SIMD based shashes yield the NEON unit
> manually to avoid causing scheduling blackouts when operating on large
> inputs.
> 
> This is no longer necessary now that kernel mode NEON runs with
> preemption enabled, so remove this logic from the crypto assembler code,
> along with the macro that implements the TIF_NEED_RESCHED check.
> 
> Signed-off-by: Ard Biesheuvel <ardb@kernel.org>

I definitely want to get rid of all of the voluntary preemption points, but
IIUC for the moment we need to keep these for PREEMPT_NONE and
PREEMPT_VOLUNTARY (and consequently for PREEMPT_DYNAMIC). Once the preemption
rework lands, these should no longer be necessary and can be removed:

  https://lore.kernel.org/lkml/20231107215742.363031-1-ankur.a.arora@oracle.com/

Thanks,
Mark.

> ---
>  arch/arm64/crypto/aes-glue.c       | 21 +++++---------
>  arch/arm64/crypto/aes-modes.S      |  2 --
>  arch/arm64/crypto/sha1-ce-core.S   |  6 ++--
>  arch/arm64/crypto/sha1-ce-glue.c   | 19 ++++---------
>  arch/arm64/crypto/sha2-ce-core.S   |  6 ++--
>  arch/arm64/crypto/sha2-ce-glue.c   | 19 ++++---------
>  arch/arm64/crypto/sha3-ce-core.S   |  6 ++--
>  arch/arm64/crypto/sha3-ce-glue.c   | 14 ++++------
>  arch/arm64/crypto/sha512-ce-core.S |  8 ++----
>  arch/arm64/crypto/sha512-ce-glue.c | 16 ++++-------
>  arch/arm64/include/asm/assembler.h | 29 --------------------
>  arch/arm64/kernel/asm-offsets.c    |  4 ---
>  12 files changed, 38 insertions(+), 112 deletions(-)
> 
> diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c
> index 162787c7aa86..c42c903b7d60 100644
> --- a/arch/arm64/crypto/aes-glue.c
> +++ b/arch/arm64/crypto/aes-glue.c
> @@ -109,9 +109,9 @@ asmlinkage void aes_essiv_cbc_decrypt(u8 out[], u8 const in[], u32 const rk1[],
>  				      int rounds, int blocks, u8 iv[],
>  				      u32 const rk2[]);
>  
> -asmlinkage int aes_mac_update(u8 const in[], u32 const rk[], int rounds,
> -			      int blocks, u8 dg[], int enc_before,
> -			      int enc_after);
> +asmlinkage void aes_mac_update(u8 const in[], u32 const rk[], int rounds,
> +			       int blocks, u8 dg[], int enc_before,
> +			       int enc_after);
>  
>  struct crypto_aes_xts_ctx {
>  	struct crypto_aes_ctx key1;
> @@ -880,17 +880,10 @@ static void mac_do_update(struct crypto_aes_ctx *ctx, u8 const in[], int blocks,
>  	int rounds = 6 + ctx->key_length / 4;
>  
>  	if (crypto_simd_usable()) {
> -		int rem;
> -
> -		do {
> -			kernel_neon_begin();
> -			rem = aes_mac_update(in, ctx->key_enc, rounds, blocks,
> -					     dg, enc_before, enc_after);
> -			kernel_neon_end();
> -			in += (blocks - rem) * AES_BLOCK_SIZE;
> -			blocks = rem;
> -			enc_before = 0;
> -		} while (blocks);
> +		kernel_neon_begin();
> +		aes_mac_update(in, ctx->key_enc, rounds, blocks, dg,
> +			       enc_before, enc_after);
> +		kernel_neon_end();
>  	} else {
>  		if (enc_before)
>  			aes_encrypt(ctx, dg, dg);
> diff --git a/arch/arm64/crypto/aes-modes.S b/arch/arm64/crypto/aes-modes.S
> index 0e834a2c062c..4d68853d0caf 100644
> --- a/arch/arm64/crypto/aes-modes.S
> +++ b/arch/arm64/crypto/aes-modes.S
> @@ -842,7 +842,6 @@ AES_FUNC_START(aes_mac_update)
>  	cbz		w5, .Lmacout
>  	encrypt_block	v0, w2, x1, x7, w8
>  	st1		{v0.16b}, [x4]			/* return dg */
> -	cond_yield	.Lmacout, x7, x8
>  	b		.Lmacloop4x
>  .Lmac1x:
>  	add		w3, w3, #4
> @@ -861,6 +860,5 @@ AES_FUNC_START(aes_mac_update)
>  
>  .Lmacout:
>  	st1		{v0.16b}, [x4]			/* return dg */
> -	mov		w0, w3
>  	ret
>  AES_FUNC_END(aes_mac_update)
> diff --git a/arch/arm64/crypto/sha1-ce-core.S b/arch/arm64/crypto/sha1-ce-core.S
> index 9b1f2d82a6fe..9e37bc09c3a5 100644
> --- a/arch/arm64/crypto/sha1-ce-core.S
> +++ b/arch/arm64/crypto/sha1-ce-core.S
> @@ -62,8 +62,8 @@
>  	.endm
>  
>  	/*
> -	 * int __sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src,
> -	 *			   int blocks)
> +	 * void __sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src,
> +	 *			    int blocks)
>  	 */
>  SYM_FUNC_START(__sha1_ce_transform)
>  	/* load round constants */
> @@ -121,7 +121,6 @@ CPU_LE(	rev32		v11.16b, v11.16b	)
>  	add		dgav.4s, dgav.4s, dg0v.4s
>  
>  	cbz		w2, 2f
> -	cond_yield	3f, x5, x6
>  	b		0b
>  
>  	/*
> @@ -145,6 +144,5 @@ CPU_LE(	rev32		v11.16b, v11.16b	)
>  	/* store new state */
>  3:	st1		{dgav.4s}, [x0]
>  	str		dgb, [x0, #16]
> -	mov		w0, w2
>  	ret
>  SYM_FUNC_END(__sha1_ce_transform)
> diff --git a/arch/arm64/crypto/sha1-ce-glue.c b/arch/arm64/crypto/sha1-ce-glue.c
> index 1dd93e1fcb39..c1c5c5cb104b 100644
> --- a/arch/arm64/crypto/sha1-ce-glue.c
> +++ b/arch/arm64/crypto/sha1-ce-glue.c
> @@ -29,23 +29,16 @@ struct sha1_ce_state {
>  extern const u32 sha1_ce_offsetof_count;
>  extern const u32 sha1_ce_offsetof_finalize;
>  
> -asmlinkage int __sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src,
> -				   int blocks);
> +asmlinkage void __sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src,
> +				    int blocks);
>  
>  static void sha1_ce_transform(struct sha1_state *sst, u8 const *src,
>  			      int blocks)
>  {
> -	while (blocks) {
> -		int rem;
> -
> -		kernel_neon_begin();
> -		rem = __sha1_ce_transform(container_of(sst,
> -						       struct sha1_ce_state,
> -						       sst), src, blocks);
> -		kernel_neon_end();
> -		src += (blocks - rem) * SHA1_BLOCK_SIZE;
> -		blocks = rem;
> -	}
> +	kernel_neon_begin();
> +	__sha1_ce_transform(container_of(sst, struct sha1_ce_state, sst), src,
> +			    blocks);
> +	kernel_neon_end();
>  }
>  
>  const u32 sha1_ce_offsetof_count = offsetof(struct sha1_ce_state, sst.count);
> diff --git a/arch/arm64/crypto/sha2-ce-core.S b/arch/arm64/crypto/sha2-ce-core.S
> index fce84d88ddb2..112d772b29db 100644
> --- a/arch/arm64/crypto/sha2-ce-core.S
> +++ b/arch/arm64/crypto/sha2-ce-core.S
> @@ -71,8 +71,8 @@
>  	.word		0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
>  
>  	/*
> -	 * int __sha256_ce_transform(struct sha256_ce_state *sst, u8 const *src,
> -	 *			     int blocks)
> +	 * void __sha256_ce_transform(struct sha256_ce_state *sst, u8 const *src,
> +	 *			      int blocks)
>  	 */
>  	.text
>  SYM_FUNC_START(__sha256_ce_transform)
> @@ -129,7 +129,6 @@ CPU_LE(	rev32		v19.16b, v19.16b	)
>  
>  	/* handled all input blocks? */
>  	cbz		w2, 2f
> -	cond_yield	3f, x5, x6
>  	b		0b
>  
>  	/*
> @@ -152,6 +151,5 @@ CPU_LE(	rev32		v19.16b, v19.16b	)
>  
>  	/* store new state */
>  3:	st1		{dgav.4s, dgbv.4s}, [x0]
> -	mov		w0, w2
>  	ret
>  SYM_FUNC_END(__sha256_ce_transform)
> diff --git a/arch/arm64/crypto/sha2-ce-glue.c b/arch/arm64/crypto/sha2-ce-glue.c
> index 0a44d2e7ee1f..f785a66a1de4 100644
> --- a/arch/arm64/crypto/sha2-ce-glue.c
> +++ b/arch/arm64/crypto/sha2-ce-glue.c
> @@ -30,23 +30,16 @@ struct sha256_ce_state {
>  extern const u32 sha256_ce_offsetof_count;
>  extern const u32 sha256_ce_offsetof_finalize;
>  
> -asmlinkage int __sha256_ce_transform(struct sha256_ce_state *sst, u8 const *src,
> -				     int blocks);
> +asmlinkage void __sha256_ce_transform(struct sha256_ce_state *sst, u8 const *src,
> +				      int blocks);
>  
>  static void sha256_ce_transform(struct sha256_state *sst, u8 const *src,
>  				int blocks)
>  {
> -	while (blocks) {
> -		int rem;
> -
> -		kernel_neon_begin();
> -		rem = __sha256_ce_transform(container_of(sst,
> -							 struct sha256_ce_state,
> -							 sst), src, blocks);
> -		kernel_neon_end();
> -		src += (blocks - rem) * SHA256_BLOCK_SIZE;
> -		blocks = rem;
> -	}
> +	kernel_neon_begin();
> +	__sha256_ce_transform(container_of(sst, struct sha256_ce_state, sst),
> +			      src, blocks);
> +	kernel_neon_end();
>  }
>  
>  const u32 sha256_ce_offsetof_count = offsetof(struct sha256_ce_state,
> diff --git a/arch/arm64/crypto/sha3-ce-core.S b/arch/arm64/crypto/sha3-ce-core.S
> index 9c77313f5a60..db64831ad35d 100644
> --- a/arch/arm64/crypto/sha3-ce-core.S
> +++ b/arch/arm64/crypto/sha3-ce-core.S
> @@ -37,7 +37,7 @@
>  	.endm
>  
>  	/*
> -	 * int sha3_ce_transform(u64 *st, const u8 *data, int blocks, int dg_size)
> +	 * void sha3_ce_transform(u64 *st, const u8 *data, int blocks, int dg_size)
>  	 */
>  	.text
>  SYM_FUNC_START(sha3_ce_transform)
> @@ -184,18 +184,16 @@ SYM_FUNC_START(sha3_ce_transform)
>  	eor	 v0.16b,  v0.16b, v31.16b
>  
>  	cbnz	w8, 3b
> -	cond_yield 4f, x8, x9
>  	cbnz	w2, 0b
>  
>  	/* save state */
> -4:	st1	{ v0.1d- v3.1d}, [x0], #32
> +	st1	{ v0.1d- v3.1d}, [x0], #32
>  	st1	{ v4.1d- v7.1d}, [x0], #32
>  	st1	{ v8.1d-v11.1d}, [x0], #32
>  	st1	{v12.1d-v15.1d}, [x0], #32
>  	st1	{v16.1d-v19.1d}, [x0], #32
>  	st1	{v20.1d-v23.1d}, [x0], #32
>  	st1	{v24.1d}, [x0]
> -	mov	w0, w2
>  	ret
>  SYM_FUNC_END(sha3_ce_transform)
>  
> diff --git a/arch/arm64/crypto/sha3-ce-glue.c b/arch/arm64/crypto/sha3-ce-glue.c
> index 250e1377c481..d689cd2bf4cf 100644
> --- a/arch/arm64/crypto/sha3-ce-glue.c
> +++ b/arch/arm64/crypto/sha3-ce-glue.c
> @@ -28,8 +28,8 @@ MODULE_ALIAS_CRYPTO("sha3-256");
>  MODULE_ALIAS_CRYPTO("sha3-384");
>  MODULE_ALIAS_CRYPTO("sha3-512");
>  
> -asmlinkage int sha3_ce_transform(u64 *st, const u8 *data, int blocks,
> -				 int md_len);
> +asmlinkage void sha3_ce_transform(u64 *st, const u8 *data, int blocks,
> +				  int md_len);
>  
>  static int sha3_update(struct shash_desc *desc, const u8 *data,
>  		       unsigned int len)
> @@ -59,15 +59,11 @@ static int sha3_update(struct shash_desc *desc, const u8 *data,
>  		blocks = len / sctx->rsiz;
>  		len %= sctx->rsiz;
>  
> -		while (blocks) {
> -			int rem;
> -
> +		if (blocks) {
>  			kernel_neon_begin();
> -			rem = sha3_ce_transform(sctx->st, data, blocks,
> -						digest_size);
> +			sha3_ce_transform(sctx->st, data, blocks, digest_size);
>  			kernel_neon_end();
> -			data += (blocks - rem) * sctx->rsiz;
> -			blocks = rem;
> +			data += blocks * sctx->rsiz;
>  		}
>  	}
>  
> diff --git a/arch/arm64/crypto/sha512-ce-core.S b/arch/arm64/crypto/sha512-ce-core.S
> index 91ef68b15fcc..96acc9295230 100644
> --- a/arch/arm64/crypto/sha512-ce-core.S
> +++ b/arch/arm64/crypto/sha512-ce-core.S
> @@ -102,8 +102,8 @@
>  	.endm
>  
>  	/*
> -	 * int __sha512_ce_transform(struct sha512_state *sst, u8 const *src,
> -	 *			     int blocks)
> +	 * void __sha512_ce_transform(struct sha512_state *sst, u8 const *src,
> +	 *			      int blocks)
>  	 */
>  	.text
>  SYM_FUNC_START(__sha512_ce_transform)
> @@ -195,12 +195,10 @@ CPU_LE(	rev64		v19.16b, v19.16b	)
>  	add		v10.2d, v10.2d, v2.2d
>  	add		v11.2d, v11.2d, v3.2d
>  
> -	cond_yield	3f, x4, x5
>  	/* handled all input blocks? */
>  	cbnz		w2, 0b
>  
>  	/* store new state */
> -3:	st1		{v8.2d-v11.2d}, [x0]
> -	mov		w0, w2
> +	st1		{v8.2d-v11.2d}, [x0]
>  	ret
>  SYM_FUNC_END(__sha512_ce_transform)
> diff --git a/arch/arm64/crypto/sha512-ce-glue.c b/arch/arm64/crypto/sha512-ce-glue.c
> index f3431fc62315..70eef74fe031 100644
> --- a/arch/arm64/crypto/sha512-ce-glue.c
> +++ b/arch/arm64/crypto/sha512-ce-glue.c
> @@ -26,23 +26,17 @@ MODULE_LICENSE("GPL v2");
>  MODULE_ALIAS_CRYPTO("sha384");
>  MODULE_ALIAS_CRYPTO("sha512");
>  
> -asmlinkage int __sha512_ce_transform(struct sha512_state *sst, u8 const *src,
> -				     int blocks);
> +asmlinkage void __sha512_ce_transform(struct sha512_state *sst, u8 const *src,
> +				      int blocks);
>  
>  asmlinkage void sha512_block_data_order(u64 *digest, u8 const *src, int blocks);
>  
>  static void sha512_ce_transform(struct sha512_state *sst, u8 const *src,
>  				int blocks)
>  {
> -	while (blocks) {
> -		int rem;
> -
> -		kernel_neon_begin();
> -		rem = __sha512_ce_transform(sst, src, blocks);
> -		kernel_neon_end();
> -		src += (blocks - rem) * SHA512_BLOCK_SIZE;
> -		blocks = rem;
> -	}
> +	kernel_neon_begin();
> +	__sha512_ce_transform(sst, src, blocks);
> +	kernel_neon_end();
>  }
>  
>  static void sha512_arm64_transform(struct sha512_state *sst, u8 const *src,
> diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h
> index 376a980f2bad..f0da53a0388f 100644
> --- a/arch/arm64/include/asm/assembler.h
> +++ b/arch/arm64/include/asm/assembler.h
> @@ -759,35 +759,6 @@ alternative_endif
>  	set_sctlr sctlr_el2, \reg
>  .endm
>  
> -	/*
> -	 * Check whether preempt/bh-disabled asm code should yield as soon as
> -	 * it is able. This is the case if we are currently running in task
> -	 * context, and either a softirq is pending, or the TIF_NEED_RESCHED
> -	 * flag is set and re-enabling preemption a single time would result in
> -	 * a preempt count of zero. (Note that the TIF_NEED_RESCHED flag is
> -	 * stored negated in the top word of the thread_info::preempt_count
> -	 * field)
> -	 */
> -	.macro		cond_yield, lbl:req, tmp:req, tmp2:req
> -	get_current_task \tmp
> -	ldr		\tmp, [\tmp, #TSK_TI_PREEMPT]
> -	/*
> -	 * If we are serving a softirq, there is no point in yielding: the
> -	 * softirq will not be preempted no matter what we do, so we should
> -	 * run to completion as quickly as we can.
> -	 */
> -	tbnz		\tmp, #SOFTIRQ_SHIFT, .Lnoyield_\@
> -#ifdef CONFIG_PREEMPTION
> -	sub		\tmp, \tmp, #PREEMPT_DISABLE_OFFSET
> -	cbz		\tmp, \lbl
> -#endif
> -	adr_l		\tmp, irq_stat + IRQ_CPUSTAT_SOFTIRQ_PENDING
> -	get_this_cpu_offset	\tmp2
> -	ldr		w\tmp, [\tmp, \tmp2]
> -	cbnz		w\tmp, \lbl	// yield on pending softirq in task context
> -.Lnoyield_\@:
> -	.endm
> -
>  /*
>   * Branch Target Identifier (BTI)
>   */
> diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
> index 5ff1942b04fc..fb9e9ef9b527 100644
> --- a/arch/arm64/kernel/asm-offsets.c
> +++ b/arch/arm64/kernel/asm-offsets.c
> @@ -116,10 +116,6 @@ int main(void)
>    DEFINE(DMA_TO_DEVICE,		DMA_TO_DEVICE);
>    DEFINE(DMA_FROM_DEVICE,	DMA_FROM_DEVICE);
>    BLANK();
> -  DEFINE(PREEMPT_DISABLE_OFFSET, PREEMPT_DISABLE_OFFSET);
> -  DEFINE(SOFTIRQ_SHIFT, SOFTIRQ_SHIFT);
> -  DEFINE(IRQ_CPUSTAT_SOFTIRQ_PENDING, offsetof(irq_cpustat_t, __softirq_pending));
> -  BLANK();
>    DEFINE(CPU_BOOT_TASK,		offsetof(struct secondary_data, task));
>    BLANK();
>    DEFINE(FTR_OVR_VAL_OFFSET,	offsetof(struct arm64_ftr_override, val));
> -- 
> 2.43.0.rc1.413.gea7ed67945-goog
> 

WARNING: multiple messages have this Message-ID (diff)
From: Mark Rutland <mark.rutland@arm.com>
To: Ard Biesheuvel <ardb@google.com>
Cc: linux-arm-kernel@lists.infradead.org,
	linux-crypto@vger.kernel.org, Ard Biesheuvel <ardb@kernel.org>,
	Marc Zyngier <maz@kernel.org>, Will Deacon <will@kernel.org>,
	Kees Cook <keescook@chromium.org>,
	Catalin Marinas <catalin.marinas@arm.com>,
	Mark Brown <broonie@kernel.org>,
	Eric Biggers <ebiggers@google.com>,
	Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Subject: Re: [PATCH v3 4/5] arm64: crypto: Remove conditional yield logic
Date: Mon, 27 Nov 2023 13:46:25 +0000	[thread overview]
Message-ID: <ZWSdsUIcCCuWlVhM@FVFF77S0Q05N> (raw)
In-Reply-To: <20231127122259.2265164-11-ardb@google.com>

On Mon, Nov 27, 2023 at 01:23:04PM +0100, Ard Biesheuvel wrote:
> From: Ard Biesheuvel <ardb@kernel.org>
> 
> Some classes of crypto algorithms (such as skciphers or aeads) have
> natural yield points, but SIMD based shashes yield the NEON unit
> manually to avoid causing scheduling blackouts when operating on large
> inputs.
> 
> This is no longer necessary now that kernel mode NEON runs with
> preemption enabled, so remove this logic from the crypto assembler code,
> along with the macro that implements the TIF_NEED_RESCHED check.
> 
> Signed-off-by: Ard Biesheuvel <ardb@kernel.org>

I definitely want to get rid of all of the voluntary preemption points, but
IIUC for the moment we need to keep these for PREEMPT_NONE and
PREEMPT_VOLUNTARY (and consequently for PREEMPT_DYNAMIC). Once the preemption
rework lands, these should no longer be necessary and can be removed:

  https://lore.kernel.org/lkml/20231107215742.363031-1-ankur.a.arora@oracle.com/

Thanks,
Mark.

> ---
>  arch/arm64/crypto/aes-glue.c       | 21 +++++---------
>  arch/arm64/crypto/aes-modes.S      |  2 --
>  arch/arm64/crypto/sha1-ce-core.S   |  6 ++--
>  arch/arm64/crypto/sha1-ce-glue.c   | 19 ++++---------
>  arch/arm64/crypto/sha2-ce-core.S   |  6 ++--
>  arch/arm64/crypto/sha2-ce-glue.c   | 19 ++++---------
>  arch/arm64/crypto/sha3-ce-core.S   |  6 ++--
>  arch/arm64/crypto/sha3-ce-glue.c   | 14 ++++------
>  arch/arm64/crypto/sha512-ce-core.S |  8 ++----
>  arch/arm64/crypto/sha512-ce-glue.c | 16 ++++-------
>  arch/arm64/include/asm/assembler.h | 29 --------------------
>  arch/arm64/kernel/asm-offsets.c    |  4 ---
>  12 files changed, 38 insertions(+), 112 deletions(-)
> 
> diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c
> index 162787c7aa86..c42c903b7d60 100644
> --- a/arch/arm64/crypto/aes-glue.c
> +++ b/arch/arm64/crypto/aes-glue.c
> @@ -109,9 +109,9 @@ asmlinkage void aes_essiv_cbc_decrypt(u8 out[], u8 const in[], u32 const rk1[],
>  				      int rounds, int blocks, u8 iv[],
>  				      u32 const rk2[]);
>  
> -asmlinkage int aes_mac_update(u8 const in[], u32 const rk[], int rounds,
> -			      int blocks, u8 dg[], int enc_before,
> -			      int enc_after);
> +asmlinkage void aes_mac_update(u8 const in[], u32 const rk[], int rounds,
> +			       int blocks, u8 dg[], int enc_before,
> +			       int enc_after);
>  
>  struct crypto_aes_xts_ctx {
>  	struct crypto_aes_ctx key1;
> @@ -880,17 +880,10 @@ static void mac_do_update(struct crypto_aes_ctx *ctx, u8 const in[], int blocks,
>  	int rounds = 6 + ctx->key_length / 4;
>  
>  	if (crypto_simd_usable()) {
> -		int rem;
> -
> -		do {
> -			kernel_neon_begin();
> -			rem = aes_mac_update(in, ctx->key_enc, rounds, blocks,
> -					     dg, enc_before, enc_after);
> -			kernel_neon_end();
> -			in += (blocks - rem) * AES_BLOCK_SIZE;
> -			blocks = rem;
> -			enc_before = 0;
> -		} while (blocks);
> +		kernel_neon_begin();
> +		aes_mac_update(in, ctx->key_enc, rounds, blocks, dg,
> +			       enc_before, enc_after);
> +		kernel_neon_end();
>  	} else {
>  		if (enc_before)
>  			aes_encrypt(ctx, dg, dg);
> diff --git a/arch/arm64/crypto/aes-modes.S b/arch/arm64/crypto/aes-modes.S
> index 0e834a2c062c..4d68853d0caf 100644
> --- a/arch/arm64/crypto/aes-modes.S
> +++ b/arch/arm64/crypto/aes-modes.S
> @@ -842,7 +842,6 @@ AES_FUNC_START(aes_mac_update)
>  	cbz		w5, .Lmacout
>  	encrypt_block	v0, w2, x1, x7, w8
>  	st1		{v0.16b}, [x4]			/* return dg */
> -	cond_yield	.Lmacout, x7, x8
>  	b		.Lmacloop4x
>  .Lmac1x:
>  	add		w3, w3, #4
> @@ -861,6 +860,5 @@ AES_FUNC_START(aes_mac_update)
>  
>  .Lmacout:
>  	st1		{v0.16b}, [x4]			/* return dg */
> -	mov		w0, w3
>  	ret
>  AES_FUNC_END(aes_mac_update)
> diff --git a/arch/arm64/crypto/sha1-ce-core.S b/arch/arm64/crypto/sha1-ce-core.S
> index 9b1f2d82a6fe..9e37bc09c3a5 100644
> --- a/arch/arm64/crypto/sha1-ce-core.S
> +++ b/arch/arm64/crypto/sha1-ce-core.S
> @@ -62,8 +62,8 @@
>  	.endm
>  
>  	/*
> -	 * int __sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src,
> -	 *			   int blocks)
> +	 * void __sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src,
> +	 *			    int blocks)
>  	 */
>  SYM_FUNC_START(__sha1_ce_transform)
>  	/* load round constants */
> @@ -121,7 +121,6 @@ CPU_LE(	rev32		v11.16b, v11.16b	)
>  	add		dgav.4s, dgav.4s, dg0v.4s
>  
>  	cbz		w2, 2f
> -	cond_yield	3f, x5, x6
>  	b		0b
>  
>  	/*
> @@ -145,6 +144,5 @@ CPU_LE(	rev32		v11.16b, v11.16b	)
>  	/* store new state */
>  3:	st1		{dgav.4s}, [x0]
>  	str		dgb, [x0, #16]
> -	mov		w0, w2
>  	ret
>  SYM_FUNC_END(__sha1_ce_transform)
> diff --git a/arch/arm64/crypto/sha1-ce-glue.c b/arch/arm64/crypto/sha1-ce-glue.c
> index 1dd93e1fcb39..c1c5c5cb104b 100644
> --- a/arch/arm64/crypto/sha1-ce-glue.c
> +++ b/arch/arm64/crypto/sha1-ce-glue.c
> @@ -29,23 +29,16 @@ struct sha1_ce_state {
>  extern const u32 sha1_ce_offsetof_count;
>  extern const u32 sha1_ce_offsetof_finalize;
>  
> -asmlinkage int __sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src,
> -				   int blocks);
> +asmlinkage void __sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src,
> +				    int blocks);
>  
>  static void sha1_ce_transform(struct sha1_state *sst, u8 const *src,
>  			      int blocks)
>  {
> -	while (blocks) {
> -		int rem;
> -
> -		kernel_neon_begin();
> -		rem = __sha1_ce_transform(container_of(sst,
> -						       struct sha1_ce_state,
> -						       sst), src, blocks);
> -		kernel_neon_end();
> -		src += (blocks - rem) * SHA1_BLOCK_SIZE;
> -		blocks = rem;
> -	}
> +	kernel_neon_begin();
> +	__sha1_ce_transform(container_of(sst, struct sha1_ce_state, sst), src,
> +			    blocks);
> +	kernel_neon_end();
>  }
>  
>  const u32 sha1_ce_offsetof_count = offsetof(struct sha1_ce_state, sst.count);
> diff --git a/arch/arm64/crypto/sha2-ce-core.S b/arch/arm64/crypto/sha2-ce-core.S
> index fce84d88ddb2..112d772b29db 100644
> --- a/arch/arm64/crypto/sha2-ce-core.S
> +++ b/arch/arm64/crypto/sha2-ce-core.S
> @@ -71,8 +71,8 @@
>  	.word		0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
>  
>  	/*
> -	 * int __sha256_ce_transform(struct sha256_ce_state *sst, u8 const *src,
> -	 *			     int blocks)
> +	 * void __sha256_ce_transform(struct sha256_ce_state *sst, u8 const *src,
> +	 *			      int blocks)
>  	 */
>  	.text
>  SYM_FUNC_START(__sha256_ce_transform)
> @@ -129,7 +129,6 @@ CPU_LE(	rev32		v19.16b, v19.16b	)
>  
>  	/* handled all input blocks? */
>  	cbz		w2, 2f
> -	cond_yield	3f, x5, x6
>  	b		0b
>  
>  	/*
> @@ -152,6 +151,5 @@ CPU_LE(	rev32		v19.16b, v19.16b	)
>  
>  	/* store new state */
>  3:	st1		{dgav.4s, dgbv.4s}, [x0]
> -	mov		w0, w2
>  	ret
>  SYM_FUNC_END(__sha256_ce_transform)
> diff --git a/arch/arm64/crypto/sha2-ce-glue.c b/arch/arm64/crypto/sha2-ce-glue.c
> index 0a44d2e7ee1f..f785a66a1de4 100644
> --- a/arch/arm64/crypto/sha2-ce-glue.c
> +++ b/arch/arm64/crypto/sha2-ce-glue.c
> @@ -30,23 +30,16 @@ struct sha256_ce_state {
>  extern const u32 sha256_ce_offsetof_count;
>  extern const u32 sha256_ce_offsetof_finalize;
>  
> -asmlinkage int __sha256_ce_transform(struct sha256_ce_state *sst, u8 const *src,
> -				     int blocks);
> +asmlinkage void __sha256_ce_transform(struct sha256_ce_state *sst, u8 const *src,
> +				      int blocks);
>  
>  static void sha256_ce_transform(struct sha256_state *sst, u8 const *src,
>  				int blocks)
>  {
> -	while (blocks) {
> -		int rem;
> -
> -		kernel_neon_begin();
> -		rem = __sha256_ce_transform(container_of(sst,
> -							 struct sha256_ce_state,
> -							 sst), src, blocks);
> -		kernel_neon_end();
> -		src += (blocks - rem) * SHA256_BLOCK_SIZE;
> -		blocks = rem;
> -	}
> +	kernel_neon_begin();
> +	__sha256_ce_transform(container_of(sst, struct sha256_ce_state, sst),
> +			      src, blocks);
> +	kernel_neon_end();
>  }
>  
>  const u32 sha256_ce_offsetof_count = offsetof(struct sha256_ce_state,
> diff --git a/arch/arm64/crypto/sha3-ce-core.S b/arch/arm64/crypto/sha3-ce-core.S
> index 9c77313f5a60..db64831ad35d 100644
> --- a/arch/arm64/crypto/sha3-ce-core.S
> +++ b/arch/arm64/crypto/sha3-ce-core.S
> @@ -37,7 +37,7 @@
>  	.endm
>  
>  	/*
> -	 * int sha3_ce_transform(u64 *st, const u8 *data, int blocks, int dg_size)
> +	 * void sha3_ce_transform(u64 *st, const u8 *data, int blocks, int dg_size)
>  	 */
>  	.text
>  SYM_FUNC_START(sha3_ce_transform)
> @@ -184,18 +184,16 @@ SYM_FUNC_START(sha3_ce_transform)
>  	eor	 v0.16b,  v0.16b, v31.16b
>  
>  	cbnz	w8, 3b
> -	cond_yield 4f, x8, x9
>  	cbnz	w2, 0b
>  
>  	/* save state */
> -4:	st1	{ v0.1d- v3.1d}, [x0], #32
> +	st1	{ v0.1d- v3.1d}, [x0], #32
>  	st1	{ v4.1d- v7.1d}, [x0], #32
>  	st1	{ v8.1d-v11.1d}, [x0], #32
>  	st1	{v12.1d-v15.1d}, [x0], #32
>  	st1	{v16.1d-v19.1d}, [x0], #32
>  	st1	{v20.1d-v23.1d}, [x0], #32
>  	st1	{v24.1d}, [x0]
> -	mov	w0, w2
>  	ret
>  SYM_FUNC_END(sha3_ce_transform)
>  
> diff --git a/arch/arm64/crypto/sha3-ce-glue.c b/arch/arm64/crypto/sha3-ce-glue.c
> index 250e1377c481..d689cd2bf4cf 100644
> --- a/arch/arm64/crypto/sha3-ce-glue.c
> +++ b/arch/arm64/crypto/sha3-ce-glue.c
> @@ -28,8 +28,8 @@ MODULE_ALIAS_CRYPTO("sha3-256");
>  MODULE_ALIAS_CRYPTO("sha3-384");
>  MODULE_ALIAS_CRYPTO("sha3-512");
>  
> -asmlinkage int sha3_ce_transform(u64 *st, const u8 *data, int blocks,
> -				 int md_len);
> +asmlinkage void sha3_ce_transform(u64 *st, const u8 *data, int blocks,
> +				  int md_len);
>  
>  static int sha3_update(struct shash_desc *desc, const u8 *data,
>  		       unsigned int len)
> @@ -59,15 +59,11 @@ static int sha3_update(struct shash_desc *desc, const u8 *data,
>  		blocks = len / sctx->rsiz;
>  		len %= sctx->rsiz;
>  
> -		while (blocks) {
> -			int rem;
> -
> +		if (blocks) {
>  			kernel_neon_begin();
> -			rem = sha3_ce_transform(sctx->st, data, blocks,
> -						digest_size);
> +			sha3_ce_transform(sctx->st, data, blocks, digest_size);
>  			kernel_neon_end();
> -			data += (blocks - rem) * sctx->rsiz;
> -			blocks = rem;
> +			data += blocks * sctx->rsiz;
>  		}
>  	}
>  
> diff --git a/arch/arm64/crypto/sha512-ce-core.S b/arch/arm64/crypto/sha512-ce-core.S
> index 91ef68b15fcc..96acc9295230 100644
> --- a/arch/arm64/crypto/sha512-ce-core.S
> +++ b/arch/arm64/crypto/sha512-ce-core.S
> @@ -102,8 +102,8 @@
>  	.endm
>  
>  	/*
> -	 * int __sha512_ce_transform(struct sha512_state *sst, u8 const *src,
> -	 *			     int blocks)
> +	 * void __sha512_ce_transform(struct sha512_state *sst, u8 const *src,
> +	 *			      int blocks)
>  	 */
>  	.text
>  SYM_FUNC_START(__sha512_ce_transform)
> @@ -195,12 +195,10 @@ CPU_LE(	rev64		v19.16b, v19.16b	)
>  	add		v10.2d, v10.2d, v2.2d
>  	add		v11.2d, v11.2d, v3.2d
>  
> -	cond_yield	3f, x4, x5
>  	/* handled all input blocks? */
>  	cbnz		w2, 0b
>  
>  	/* store new state */
> -3:	st1		{v8.2d-v11.2d}, [x0]
> -	mov		w0, w2
> +	st1		{v8.2d-v11.2d}, [x0]
>  	ret
>  SYM_FUNC_END(__sha512_ce_transform)
> diff --git a/arch/arm64/crypto/sha512-ce-glue.c b/arch/arm64/crypto/sha512-ce-glue.c
> index f3431fc62315..70eef74fe031 100644
> --- a/arch/arm64/crypto/sha512-ce-glue.c
> +++ b/arch/arm64/crypto/sha512-ce-glue.c
> @@ -26,23 +26,17 @@ MODULE_LICENSE("GPL v2");
>  MODULE_ALIAS_CRYPTO("sha384");
>  MODULE_ALIAS_CRYPTO("sha512");
>  
> -asmlinkage int __sha512_ce_transform(struct sha512_state *sst, u8 const *src,
> -				     int blocks);
> +asmlinkage void __sha512_ce_transform(struct sha512_state *sst, u8 const *src,
> +				      int blocks);
>  
>  asmlinkage void sha512_block_data_order(u64 *digest, u8 const *src, int blocks);
>  
>  static void sha512_ce_transform(struct sha512_state *sst, u8 const *src,
>  				int blocks)
>  {
> -	while (blocks) {
> -		int rem;
> -
> -		kernel_neon_begin();
> -		rem = __sha512_ce_transform(sst, src, blocks);
> -		kernel_neon_end();
> -		src += (blocks - rem) * SHA512_BLOCK_SIZE;
> -		blocks = rem;
> -	}
> +	kernel_neon_begin();
> +	__sha512_ce_transform(sst, src, blocks);
> +	kernel_neon_end();
>  }
>  
>  static void sha512_arm64_transform(struct sha512_state *sst, u8 const *src,
> diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h
> index 376a980f2bad..f0da53a0388f 100644
> --- a/arch/arm64/include/asm/assembler.h
> +++ b/arch/arm64/include/asm/assembler.h
> @@ -759,35 +759,6 @@ alternative_endif
>  	set_sctlr sctlr_el2, \reg
>  .endm
>  
> -	/*
> -	 * Check whether preempt/bh-disabled asm code should yield as soon as
> -	 * it is able. This is the case if we are currently running in task
> -	 * context, and either a softirq is pending, or the TIF_NEED_RESCHED
> -	 * flag is set and re-enabling preemption a single time would result in
> -	 * a preempt count of zero. (Note that the TIF_NEED_RESCHED flag is
> -	 * stored negated in the top word of the thread_info::preempt_count
> -	 * field)
> -	 */
> -	.macro		cond_yield, lbl:req, tmp:req, tmp2:req
> -	get_current_task \tmp
> -	ldr		\tmp, [\tmp, #TSK_TI_PREEMPT]
> -	/*
> -	 * If we are serving a softirq, there is no point in yielding: the
> -	 * softirq will not be preempted no matter what we do, so we should
> -	 * run to completion as quickly as we can.
> -	 */
> -	tbnz		\tmp, #SOFTIRQ_SHIFT, .Lnoyield_\@
> -#ifdef CONFIG_PREEMPTION
> -	sub		\tmp, \tmp, #PREEMPT_DISABLE_OFFSET
> -	cbz		\tmp, \lbl
> -#endif
> -	adr_l		\tmp, irq_stat + IRQ_CPUSTAT_SOFTIRQ_PENDING
> -	get_this_cpu_offset	\tmp2
> -	ldr		w\tmp, [\tmp, \tmp2]
> -	cbnz		w\tmp, \lbl	// yield on pending softirq in task context
> -.Lnoyield_\@:
> -	.endm
> -
>  /*
>   * Branch Target Identifier (BTI)
>   */
> diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
> index 5ff1942b04fc..fb9e9ef9b527 100644
> --- a/arch/arm64/kernel/asm-offsets.c
> +++ b/arch/arm64/kernel/asm-offsets.c
> @@ -116,10 +116,6 @@ int main(void)
>    DEFINE(DMA_TO_DEVICE,		DMA_TO_DEVICE);
>    DEFINE(DMA_FROM_DEVICE,	DMA_FROM_DEVICE);
>    BLANK();
> -  DEFINE(PREEMPT_DISABLE_OFFSET, PREEMPT_DISABLE_OFFSET);
> -  DEFINE(SOFTIRQ_SHIFT, SOFTIRQ_SHIFT);
> -  DEFINE(IRQ_CPUSTAT_SOFTIRQ_PENDING, offsetof(irq_cpustat_t, __softirq_pending));
> -  BLANK();
>    DEFINE(CPU_BOOT_TASK,		offsetof(struct secondary_data, task));
>    BLANK();
>    DEFINE(FTR_OVR_VAL_OFFSET,	offsetof(struct arm64_ftr_override, val));
> -- 
> 2.43.0.rc1.413.gea7ed67945-goog
> 

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

  reply	other threads:[~2023-11-27 13:46 UTC|newest]

Thread overview: 20+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-11-27 12:23 [PATCH v3 0/5] arm64: Run kernel mode NEON with preemption enabled Ard Biesheuvel
2023-11-27 12:23 ` Ard Biesheuvel
2023-11-27 12:23 ` [PATCH v3 1/5] arm64: fpsimd: Drop unneeded 'busy' flag Ard Biesheuvel
2023-11-27 12:23   ` Ard Biesheuvel
2023-11-27 12:23 ` [PATCH v3 2/5] arm64: fpsimd: Preserve/restore kernel mode NEON at context switch Ard Biesheuvel
2023-11-27 12:23   ` Ard Biesheuvel
2023-11-27 13:09   ` Mark Rutland
2023-11-27 13:09     ` Mark Rutland
2023-11-27 12:23 ` [PATCH v3 3/5] arm64: fpsimd: Implement lazy restore for kernel mode FPSIMD Ard Biesheuvel
2023-11-27 12:23   ` Ard Biesheuvel
2023-11-27 13:32   ` Mark Rutland
2023-11-27 13:32     ` Mark Rutland
2023-11-27 12:23 ` [PATCH v3 4/5] arm64: crypto: Remove conditional yield logic Ard Biesheuvel
2023-11-27 12:23   ` Ard Biesheuvel
2023-11-27 13:46   ` Mark Rutland [this message]
2023-11-27 13:46     ` Mark Rutland
2023-11-27 15:17     ` Ard Biesheuvel
2023-11-27 15:17       ` Ard Biesheuvel
2023-11-27 12:23 ` [PATCH v3 5/5] arm64: crypto: Remove FPSIMD yield logic from glue code Ard Biesheuvel
2023-11-27 12:23   ` Ard Biesheuvel

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=ZWSdsUIcCCuWlVhM@FVFF77S0Q05N \
    --to=mark.rutland@arm.com \
    --cc=ardb@google.com \
    --cc=ardb@kernel.org \
    --cc=bigeasy@linutronix.de \
    --cc=broonie@kernel.org \
    --cc=catalin.marinas@arm.com \
    --cc=ebiggers@google.com \
    --cc=keescook@chromium.org \
    --cc=linux-arm-kernel@lists.infradead.org \
    --cc=linux-crypto@vger.kernel.org \
    --cc=maz@kernel.org \
    --cc=will@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.