public inbox for linux-arm-kernel@lists.infradead.org
 help / color / mirror / Atom feed
From: Eric Biggers <ebiggers@kernel.org>
To: Nathan Huckleberry <nhuck@google.com>
Cc: linux-crypto@vger.kernel.org,
	Herbert Xu <herbert@gondor.apana.org.au>,
	"David S. Miller" <davem@davemloft.net>,
	linux-arm-kernel@lists.infradead.org,
	Paul Crowley <paulcrowley@google.com>,
	Sami Tolvanen <samitolvanen@google.com>,
	Ard Biesheuvel <ardb@kernel.org>
Subject: Re: [PATCH v4 4/8] crypto: x86/aesni-xctr: Add accelerated implementation of XCTR
Date: Mon, 18 Apr 2022 16:44:23 -0700	[thread overview]
Message-ID: <Yl3319lf33hgZniP@sol.localdomain> (raw)
In-Reply-To: <20220412172816.917723-5-nhuck@google.com>

On Tue, Apr 12, 2022 at 05:28:12PM +0000, Nathan Huckleberry wrote:
> diff --git a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S
> index 43852ba6e19c..9e20d7d3d6da 100644
> --- a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S
> +++ b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S
> @@ -53,6 +53,10 @@
>  #define KEY_192		2
>  #define KEY_256		3
>  
> +// XCTR mode only
> +#define counter		%r9
> +#define xiv		%xmm8
> +

It would be helpful if the registers were listed in order, and if the
CTR-specific ones were marked as being specific to CTR.  This would make it easy
to verify that there are no collisions in register allocation.  I.e.:

[...]
#define xdata7		%xmm7
#define xcounter	%xmm8	// CTR mode only
#define xiv		%xmm8	// XCTR mode only
#define xbyteswap	%xmm9	// CTR mode only
#define xkey0		%xmm10
[...]
#define num_bytes	%r8
#define counter		%r9	// XCTR mode only
#define tmp		%r10
[...]


I'm also not a fan of the naming, with "xcounter" being used by CTR only and
"counter" being used by XCTR only...  I see why you did it, though, as the
existing code uses the "x" prefix to mean "this is an xmm register".  It could
at least use a comment that makes this super clear, though:

// Note: the "x" prefix in these aliases means "this is an xmm register".
// No relation to XCTR where the "X" prefix means "XOR counter".
#define xdata0		%xmm0

> +	.if (\xctr == 1)

As \xctr is either 0 or 1, this can be written as simply '.if \xctr'.

> +		.set i, 0
> +		.rept (by)
> +			club XDATA, i
> +			movq counter, var_xdata
> +			.set i, (i +1)
> +		.endr
> +	.endif
> +

Since the 3-operand add instruction (vpaddq) is available here, and in fact is
being used already, it isn't necessary to move 'counter' into all (up to 8) of
the var_xdata registers.  Just move it into the last var_xdata register, or into
a temporary register, and use it as a source operand for all the additions.

> -	vpshufb	xbyteswap, xcounter, xdata0
> -
> -	.set i, 1
> -	.rept (by - 1)
> -		club XDATA, i
> -		vpaddq	(ddq_add_1 + 16 * (i - 1))(%rip), xcounter, var_xdata
> -		vptest	ddq_low_msk(%rip), var_xdata
> -		jnz 1f
> -		vpaddq	ddq_high_add_1(%rip), var_xdata, var_xdata
> -		vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
> -		1:
> -		vpshufb	xbyteswap, var_xdata, var_xdata
> -		.set i, (i +1)
> -	.endr
> +	.if (\xctr == 0)
> +		vpshufb	xbyteswap, xcounter, xdata0
> +		.set i, 1
> +		.rept (by - 1)
> +			club XDATA, i
> +			vpaddq	(ddq_add_1 + 16 * (i - 1))(%rip), xcounter, var_xdata
> +			vptest	ddq_low_msk(%rip), var_xdata
> +			jnz 1f
> +			vpaddq	ddq_high_add_1(%rip), var_xdata, var_xdata
> +			vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
> +			1:
> +			vpshufb	xbyteswap, var_xdata, var_xdata
> +			.set i, (i +1)
> +		.endr
> +	.endif
> +	.if (\xctr == 1)
> +		.set i, 0
> +		.rept (by)
> +			club XDATA, i
> +			vpaddq	(ddq_add_1 + 16 * i)(%rip), var_xdata, var_xdata
> +			.set i, (i +1)
> +		.endr
> +		.set i, 0
> +		.rept (by)
> +			club	XDATA, i
> +			vpxor	xiv, var_xdata, var_xdata
> +			.set i, (i +1)
> +		.endr
> +	.endif

This can be written as:

	.if \xctr
	[second part above]
	.else
	[first part above]
	.endif

> -	vpaddq	(ddq_add_1 + 16 * (by - 1))(%rip), xcounter, xcounter
> -	vptest	ddq_low_msk(%rip), xcounter
> -	jnz	1f
> -	vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
> -	1:
> +	.if (\xctr == 0)
> +		vpaddq	(ddq_add_1 + 16 * (by - 1))(%rip), xcounter, xcounter
> +		vptest	ddq_low_msk(%rip), xcounter
> +		jnz	1f
> +		vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
> +		1:
> +	.endif
> +	.if (\xctr == 1)
> +		add $by, counter
> +	.endif

Likewise here.

> +.macro do_aes_ctrmain key_len, xctr
>  	cmp	$16, num_bytes
> -	jb	.Ldo_return2\key_len
> +	jb	.Ldo_return2\xctr\key_len
>  
>  	vmovdqa	byteswap_const(%rip), xbyteswap
> -	vmovdqu	(p_iv), xcounter
> -	vpshufb	xbyteswap, xcounter, xcounter
> +	.if (\xctr == 0)
> +		vmovdqu	(p_iv), xcounter
> +		vpshufb	xbyteswap, xcounter, xcounter
> +	.endif
> +	.if (\xctr == 1)
> +		andq	$(~0xf), num_bytes
> +		shr	$4, counter
> +		vmovdqu	(p_iv), xiv
> +	.endif

And likewise here.  Also, the load of byteswap_const can be moved into the
!\xctr block.

- Eric

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

  parent reply	other threads:[~2022-04-18 23:45 UTC|newest]

Thread overview: 28+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-04-12 17:28 [PATCH v4 0/8] crypto: HCTR2 support Nathan Huckleberry
2022-04-12 17:28 ` [PATCH v4 1/8] crypto: xctr - Add XCTR support Nathan Huckleberry
2022-04-18 19:03   ` Eric Biggers
2022-04-12 17:28 ` [PATCH v4 2/8] crypto: polyval - Add POLYVAL support Nathan Huckleberry
2022-04-18 19:25   ` Eric Biggers
2022-04-12 17:28 ` [PATCH v4 3/8] crypto: hctr2 - Add HCTR2 support Nathan Huckleberry
2022-04-13  4:20   ` Eric Biggers
2022-04-18 20:46   ` Eric Biggers
2022-04-12 17:28 ` [PATCH v4 4/8] crypto: x86/aesni-xctr: Add accelerated implementation of XCTR Nathan Huckleberry
2022-04-14  7:00   ` Eric Biggers
2022-04-18 23:44   ` Eric Biggers [this message]
2022-04-19  0:13   ` Eric Biggers
2022-04-21 21:59     ` Nathan Huckleberry
2022-04-21 22:29       ` Eric Biggers
2022-04-12 17:28 ` [PATCH v4 5/8] crypto: arm64/aes-xctr: " Nathan Huckleberry
2022-04-19  4:33   ` Eric Biggers
2022-04-12 17:28 ` [PATCH v4 6/8] crypto: x86/polyval: Add PCLMULQDQ accelerated implementation of POLYVAL Nathan Huckleberry
2022-04-13  5:18   ` Eric Biggers
2022-04-18 21:36   ` Eric Biggers
2022-04-12 17:28 ` [PATCH v4 7/8] crypto: arm64/polyval: Add PMULL " Nathan Huckleberry
2022-04-13  5:53   ` Eric Biggers
2022-04-12 17:28 ` [PATCH v4 8/8] fscrypt: Add HCTR2 support for filename encryption Nathan Huckleberry
2022-04-13  6:10   ` Eric Biggers
2022-04-13  6:16     ` Ard Biesheuvel
2022-04-14  7:12       ` Eric Biggers
2022-04-14  7:15         ` Ard Biesheuvel
2022-04-18 18:05   ` Eric Biggers
2022-04-14 14:18 ` [PATCH v4 0/8] crypto: HCTR2 support Ard Biesheuvel

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=Yl3319lf33hgZniP@sol.localdomain \
    --to=ebiggers@kernel.org \
    --cc=ardb@kernel.org \
    --cc=davem@davemloft.net \
    --cc=herbert@gondor.apana.org.au \
    --cc=linux-arm-kernel@lists.infradead.org \
    --cc=linux-crypto@vger.kernel.org \
    --cc=nhuck@google.com \
    --cc=paulcrowley@google.com \
    --cc=samitolvanen@google.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.

  Be sure your reply has a Subject: header at the top and a blank line
  before the message body.
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox.