[PATCH] crypto: arm/chacha-neon - optimize for non-block size multiples

Linux cryptographic layer development
 help / color / mirror / Atom feed

* [PATCH] crypto: arm/chacha-neon - optimize for non-block size multiples
@ 2020-11-01 16:33 Ard Biesheuvel
  2020-11-02  0:30 ` Jason A. Donenfeld
  0 siblings, 1 reply; 3+ messages in thread
From: Ard Biesheuvel @ 2020-11-01 16:33 UTC (permalink / raw)
  To: linux-crypto; +Cc: herbert, Ard Biesheuvel, Eric Biggers, Jason A . Donenfeld

The current NEON based ChaCha implementation for ARM is optimized for
multiples of 4x the ChaCha block size (64 bytes). This makes sense for
block encryption, but given that ChaCha is also often used in the
context of networking, it makes sense to consider arbitrary length
inputs as well.

For example, WireGuard typically uses 1420 byte packets, and performing
ChaCha encryption involves 5 invocations of chacha_4block_xor_neon()
and 3 invocations of chacha_block_xor_neon(), where the last one also
involves a memcpy() using a buffer on the stack to process the final
chunk of 1420 % 64 == 12 bytes.

Let's optimize for this case as well, by letting chacha_4block_xor_neon()
deal with any input size between 64 and 256 bytes, using NEON permutation
instructions and overlapping loads and stores. This way, the 140 byte
tail of a 1420 byte input buffer can simply be processed in one go.

On out-of-order microarchitectures such as Cortex-A57, this results in
a speedup for 1420 byte blocks of about 21%, without any significant
performance impact on the power-of-2 block sizes. On lower end cores
such as Cortex-A53, the speedup for 1420 byte blocks is only about 2%,
but also without impacting other input sizes.

Cc: Eric Biggers <ebiggers@google.com>
Cc: "Jason A . Donenfeld" <Jason@zx2c4.com>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 arch/arm/crypto/chacha-glue.c      |  23 ++--
 arch/arm/crypto/chacha-neon-core.S | 124 +++++++++++++++++---
 2 files changed, 116 insertions(+), 31 deletions(-)

diff --git a/arch/arm/crypto/chacha-glue.c b/arch/arm/crypto/chacha-glue.c
index 59da6c0b63b6..9924143f63d7 100644
--- a/arch/arm/crypto/chacha-glue.c
+++ b/arch/arm/crypto/chacha-glue.c
@@ -23,7 +23,7 @@
 asmlinkage void chacha_block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
 				      int nrounds);
 asmlinkage void chacha_4block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
-				       int nrounds);
+				       int nrounds, unsigned int nbytes);
 asmlinkage void hchacha_block_arm(const u32 *state, u32 *out, int nrounds);
 asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds);
 
@@ -42,19 +42,14 @@ static void chacha_doneon(u32 *state, u8 *dst, const u8 *src,
 {
 	u8 buf[CHACHA_BLOCK_SIZE];
 
-	while (bytes >= CHACHA_BLOCK_SIZE * 4) {
-		chacha_4block_xor_neon(state, dst, src, nrounds);
-		bytes -= CHACHA_BLOCK_SIZE * 4;
-		src += CHACHA_BLOCK_SIZE * 4;
-		dst += CHACHA_BLOCK_SIZE * 4;
-		state[12] += 4;
-	}
-	while (bytes >= CHACHA_BLOCK_SIZE) {
-		chacha_block_xor_neon(state, dst, src, nrounds);
-		bytes -= CHACHA_BLOCK_SIZE;
-		src += CHACHA_BLOCK_SIZE;
-		dst += CHACHA_BLOCK_SIZE;
-		state[12]++;
+	while (bytes > CHACHA_BLOCK_SIZE) {
+		unsigned int l = min(bytes, CHACHA_BLOCK_SIZE * 4U);
+
+		chacha_4block_xor_neon(state, dst, src, nrounds, l);
+		bytes -= l;
+		src += l;
+		dst += l;
+		state[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE);
 	}
 	if (bytes) {
 		memcpy(buf, src, bytes);
diff --git a/arch/arm/crypto/chacha-neon-core.S b/arch/arm/crypto/chacha-neon-core.S
index eb22926d4912..38bcb49b3b39 100644
--- a/arch/arm/crypto/chacha-neon-core.S
+++ b/arch/arm/crypto/chacha-neon-core.S
@@ -47,6 +47,7 @@
   */
 
 #include <linux/linkage.h>
+#include <asm/cache.h>
 
 	.text
 	.fpu		neon
@@ -205,8 +206,9 @@ ENDPROC(hchacha_block_neon)
 
 	.align		5
 ENTRY(chacha_4block_xor_neon)
-	push		{r4-r5}
-	mov		r4, sp			// preserve the stack pointer
+	push		{r4-r6, lr}
+	ldr		r4, [sp, #16]
+	mov		r6, sp			// preserve the stack pointer
 	sub		ip, sp, #0x20		// allocate a 32 byte buffer
 	bic		ip, ip, #0x1f		// aligned to 32 bytes
 	mov		sp, ip
@@ -215,6 +217,7 @@ ENTRY(chacha_4block_xor_neon)
 	// r1: 4 data blocks output, o
 	// r2: 4 data blocks input, i
 	// r3: nrounds
+	// r4: number of bytes
 
 	//
 	// This function encrypts four consecutive ChaCha blocks by loading
@@ -503,6 +506,13 @@ ENTRY(chacha_4block_xor_neon)
 	vswp		d17, d20
 	vswp		d19, d22
 
+	mov		sp, r6		// restore original stack pointer
+
+	subs		r4, r4, #96	// set up lr and ip for overlapping
+	mov		lr, #32		// loads and stores
+	addcc		ip, r4, #32
+	movcs		ip, #32
+
 	// Last two rows of each block are (q8 q12) (q10 q14) (q9 q13) (q11 q15)
 
 	// x8..11[0-3] += s8..11[0-3]	(add orig state to 3rd row of each block)
@@ -519,42 +529,122 @@ ENTRY(chacha_4block_xor_neon)
 
 	// XOR the rest of the data with the keystream
 
-	vld1.8		{q0-q1}, [r2]!
+	vld1.8		{q0-q1}, [r2], ip
 	veor		q0, q0, q8
 	veor		q1, q1, q12
-	vst1.8		{q0-q1}, [r1]!
+	ble		.Lle96
+	subs		r4, r4, #32
+	addcc		lr, r4, #32
+	vst1.8		{q0-q1}, [r1], ip
 
-	vld1.8		{q0-q1}, [r2]!
+	vld1.8		{q0-q1}, [r2], lr
 	veor		q0, q0, q2
 	veor		q1, q1, q6
-	vst1.8		{q0-q1}, [r1]!
+	ble		.Lle128
+	subs		r4, r4, #32
+	addcc		ip, r4, #32
+	vst1.8		{q0-q1}, [r1], lr
 
-	vld1.8		{q0-q1}, [r2]!
+	vld1.8		{q0-q1}, [r2], ip
 	veor		q0, q0, q10
 	veor		q1, q1, q14
-	vst1.8		{q0-q1}, [r1]!
+	ble		.Lle160
+	subs		r4, r4, #32
+	addcc		lr, r4, #32
+	vst1.8		{q0-q1}, [r1], ip
 
-	vld1.8		{q0-q1}, [r2]!
+	vld1.8		{q0-q1}, [r2], lr
 	veor		q0, q0, q4
 	veor		q1, q1, q5
-	vst1.8		{q0-q1}, [r1]!
+	ble		.Lle192
+	subs		r4, r4, #32
+	addcc		ip, r4, #32
+	vst1.8		{q0-q1}, [r1], lr
 
-	vld1.8		{q0-q1}, [r2]!
+	vld1.8		{q0-q1}, [r2], ip
 	veor		q0, q0, q9
 	veor		q1, q1, q13
-	vst1.8		{q0-q1}, [r1]!
+	ble		.Lle224
+	subs		r4, r4, #32
+	addcc		lr, r4, #32
+	vst1.8		{q0-q1}, [r1], ip
 
-	vld1.8		{q0-q1}, [r2]!
+	vld1.8		{q0-q1}, [r2], lr
 	veor		q0, q0, q3
 	veor		q1, q1, q7
-	vst1.8		{q0-q1}, [r1]!
+	blt		.Llt256
+	vst1.8		{q0-q1}, [r1], lr
 
+.Llastblock:
 	vld1.8		{q0-q1}, [r2]
-	  mov		sp, r4		// restore original stack pointer
 	veor		q0, q0, q11
 	veor		q1, q1, q15
 	vst1.8		{q0-q1}, [r1]
 
-	pop		{r4-r5}
-	bx		lr
+	pop		{r4-r6, pc}
+
+.Lle192:
+	mov		ip, lr
+	vmov		q4, q9
+	vmov		q5, q13
+
+.Lle160:
+	// Process the final block if processing less than 4 full blocks.
+	// Entered with 32 bytes of ChaCha cipher stream in q4-q5, and the
+	// previous 32 byte output block that still needs to be written at [r1]
+	beq		.Lfullblock
+	adr		lr, .Lpermute
+	add		lr, lr, ip
+	add		ip, ip, r1
+
+	vld1.8		{q2-q3}, [lr]
+	vld1.8		{q6-q7}, [r2]
+
+	vtbl.8		d4, {q4-q5}, d4
+	vtbl.8		d5, {q4-q5}, d5
+	vtbl.8		d6, {q4-q5}, d6
+	vtbl.8		d7, {q4-q5}, d7
+
+	veor		q6, q6, q2
+	veor		q7, q7, q3
+
+	vst1.8		{q6-q7}, [ip]	// overlapping stores
+	vst1.8		{q0-q1}, [r1]
+	pop		{r4-r6, pc}
+
+.Lfullblock:
+	vst1.8		{q0-q1}, [r1]!
+	vmov		q11, q4
+	vmov		q15, q5
+	b		.Llastblock
+
+.Lle96:
+	vmov		q4, q2
+	vmov		q5, q6
+	b		.Lle160
+.Lle128:
+	mov		ip, lr
+	vmov		q4, q10
+	vmov		q5, q14
+	b		.Lle160
+.Lle224:
+	vmov		q4, q3
+	vmov		q5, q7
+	b		.Lle160
+.Llt256:
+	mov		ip, lr
+	vmov		q4, q11
+	vmov		q5, q15
+	b		.Lle160
 ENDPROC(chacha_4block_xor_neon)
+
+	.align		L1_CACHE_SHIFT
+.Lpermute:
+	.byte		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
+	.byte		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+	.byte		0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
+	.byte		0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
+	.byte		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
+	.byte		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+	.byte		0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
+	.byte		0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: [PATCH] crypto: arm/chacha-neon - optimize for non-block size multiples
  2020-11-01 16:33 [PATCH] crypto: arm/chacha-neon - optimize for non-block size multiples Ard Biesheuvel
@ 2020-11-02  0:30 ` Jason A. Donenfeld
  2020-11-02  6:44   ` Ard Biesheuvel
  0 siblings, 1 reply; 3+ messages in thread
From: Jason A. Donenfeld @ 2020-11-02  0:30 UTC (permalink / raw)
  To: Ard Biesheuvel; +Cc: Linux Crypto Mailing List, Herbert Xu, Eric Biggers

Cool patch! I look forward to getting out the old arm32 rig and
benching this. One question:

On Sun, Nov 1, 2020 at 5:33 PM Ard Biesheuvel <ardb@kernel.org> wrote:
> On out-of-order microarchitectures such as Cortex-A57, this results in
> a speedup for 1420 byte blocks of about 21%, without any significant
> performance impact on the power-of-2 block sizes. On lower end cores
> such as Cortex-A53, the speedup for 1420 byte blocks is only about 2%,
> but also without impacting other input sizes.

A57 and A53 are 64-bit, but this is code for 32-bit arm, right? So the
comparison is more like A15 vs A5? Or are you running 32-bit kernels
on armv8 hardware?

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH] crypto: arm/chacha-neon - optimize for non-block size multiples
  2020-11-02  0:30 ` Jason A. Donenfeld
@ 2020-11-02  6:44   ` Ard Biesheuvel
  0 siblings, 0 replies; 3+ messages in thread
From: Ard Biesheuvel @ 2020-11-02  6:44 UTC (permalink / raw)
  To: Jason A. Donenfeld; +Cc: Linux Crypto Mailing List, Herbert Xu, Eric Biggers

On Mon, 2 Nov 2020 at 01:30, Jason A. Donenfeld <Jason@zx2c4.com> wrote:
>
> Cool patch! I look forward to getting out the old arm32 rig and
> benching this. One question:
>
> On Sun, Nov 1, 2020 at 5:33 PM Ard Biesheuvel <ardb@kernel.org> wrote:
> > On out-of-order microarchitectures such as Cortex-A57, this results in
> > a speedup for 1420 byte blocks of about 21%, without any significant
> > performance impact on the power-of-2 block sizes. On lower end cores
> > such as Cortex-A53, the speedup for 1420 byte blocks is only about 2%,
> > but also without impacting other input sizes.
>
> A57 and A53 are 64-bit, but this is code for 32-bit arm, right? So the
> comparison is more like A15 vs A5? Or are you running 32-bit kernels
> on armv8 hardware?

The latter. The only 32-bit hardware I have in my drawer is Cortex-A8,
which I expect to benefit from this change, but the way its
micro-architecture integrates the NEON stages into the pipeline is a
bit odd, and therefore, you cannot really extrapolate from those
results for other cores.

Cortex-A57 and Cortex-A15 should be fairly similar, so that is really
the target for this optimization. Cortex-A5 and A7 already omit the
NEON code path entirely, so they are not affected in the first place.
Cortex-A53 is significant because this is what the Raspberry Pi3 uses
(and it ships with a 32-bit kernel).

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2020-11-02  6:44 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-11-01 16:33 [PATCH] crypto: arm/chacha-neon - optimize for non-block size multiples Ard Biesheuvel
2020-11-02  0:30 ` Jason A. Donenfeld
2020-11-02  6:44   ` Ard Biesheuvel

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox