* [PATCH 1/2] crypto/arm64: aes-ce-gcm - operate on two input blocks at a time
From: Ard Biesheuvel @ 2018-07-28 18:53 UTC
To: linux-arm-kernel
Update the core AES/GCM transform and the associated plumbing to operate
on 2 AES/GHASH blocks at a time. By itself, this is not expected to
result in a noticeable speedup, but it paves the way for reimplementing
the GHASH component using 2-way aggregation.
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
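(Not part of the commit: a minimal, hypothetical user space sketch of the
new walking scheme in the glue code. The hot loop now rounds each walk
chunk down to an even number of AES blocks, so the leftover handed back to
the walker - and eventually to the tail path - can be up to 31 bytes, i.e.
a full block plus a partial one. process_pair_blocks() is a made-up
stand-in for pmull_gcm_encrypt()/pmull_gcm_decrypt().)

  #include <stdio.h>

  #define AES_BLOCK_SIZE 16

  static void process_pair_blocks(int blocks, unsigned int off)
  {
          /* stand-in for the NEON routine handling 2*n blocks */
          printf("NEON call: %d blocks at offset %u\n", blocks, off);
  }

  int main(void)
  {
          unsigned int nbytes = 6 * AES_BLOCK_SIZE + 5; /* arbitrary length */
          unsigned int off = 0;

          while (nbytes >= 2 * AES_BLOCK_SIZE) {
                  /* round down to a multiple of two blocks */
                  int blocks = nbytes / (2 * AES_BLOCK_SIZE) * 2;

                  process_pair_blocks(blocks, off);

                  off += blocks * AES_BLOCK_SIZE;
                  nbytes %= 2 * AES_BLOCK_SIZE;
          }

          /*
           * 0..31 bytes remain; the driver xors them with the stashed
           * keystream and folds them into GHASH as head block + padded tail.
           */
          printf("tail handled separately: %u bytes\n", nbytes);
          return 0;
  }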
arch/arm64/crypto/ghash-ce-core.S | 129 +++++++++++++++-----
arch/arm64/crypto/ghash-ce-glue.c | 84 +++++++++----
2 files changed, 155 insertions(+), 58 deletions(-)
diff --git a/arch/arm64/crypto/ghash-ce-core.S b/arch/arm64/crypto/ghash-ce-core.S
index dcffb9e77589..437a2fb0f7f9 100644
--- a/arch/arm64/crypto/ghash-ce-core.S
+++ b/arch/arm64/crypto/ghash-ce-core.S
@@ -286,9 +286,10 @@ ENTRY(pmull_ghash_update_p8)
__pmull_ghash p8
ENDPROC(pmull_ghash_update_p8)
- KS .req v8
- CTR .req v9
- INP .req v10
+ KS0 .req v8
+ KS1 .req v9
+ INP0 .req v10
+ INP1 .req v11
.macro load_round_keys, rounds, rk
cmp \rounds, #12
@@ -350,90 +351,152 @@ CPU_LE( rev x28, x28 )
eor SHASH2.16b, SHASH2.16b, SHASH.16b
.if \enc == 1
- ld1 {KS.16b}, [x27]
+ ld1 {KS0.16b-KS1.16b}, [x27]
.endif
-1: ld1 {CTR.8b}, [x24] // load upper counter
- ld1 {INP.16b}, [x22], #16
+1: ld1 {INP0.16b-INP1.16b}, [x22], #32
+
rev x9, x28
- add x28, x28, #1
- sub w19, w19, #1
- ins CTR.d[1], x9 // set lower counter
+ add x10, x28, #1
+ add x28, x28, #2
.if \enc == 1
- eor INP.16b, INP.16b, KS.16b // encrypt input
- st1 {INP.16b}, [x21], #16
+ eor INP0.16b, INP0.16b, KS0.16b // encrypt input
+ eor INP1.16b, INP1.16b, KS1.16b
.endif
- rev64 T1.16b, INP.16b
+ ld1 {KS0.8b}, [x24] // load upper counter
+ rev x10, x10
+ sub w19, w19, #2
+ mov KS1.8b, KS0.8b
+ ins KS0.d[1], x9 // set lower counter
+ ins KS1.d[1], x10
+
+ rev64 T1.16b, INP0.16b
cmp w26, #12
b.ge 4f // AES-192/256?
-2: enc_round CTR, v21
+2: enc_round KS0, v21
+
+ ext T2.16b, XL.16b, XL.16b, #8
+ ext IN1.16b, T1.16b, T1.16b, #8
+
+ enc_round KS1, v21
+
+ eor T1.16b, T1.16b, T2.16b
+ eor XL.16b, XL.16b, IN1.16b
+
+ enc_round KS0, v22
+
+ pmull2 XH.1q, SHASH.2d, XL.2d // a1 * b1
+ eor T1.16b, T1.16b, XL.16b
+
+ enc_round KS1, v22
+
+ pmull XL.1q, SHASH.1d, XL.1d // a0 * b0
+ pmull XM.1q, SHASH2.1d, T1.1d // (a1 + a0)(b1 + b0)
+
+ enc_round KS0, v23
+
+ ext T1.16b, XL.16b, XH.16b, #8
+ eor T2.16b, XL.16b, XH.16b
+ eor XM.16b, XM.16b, T1.16b
+
+ enc_round KS1, v23
+
+ eor XM.16b, XM.16b, T2.16b
+ pmull T2.1q, XL.1d, MASK.1d
+
+ enc_round KS0, v24
+
+ mov XH.d[0], XM.d[1]
+ mov XM.d[1], XL.d[0]
+
+ enc_round KS1, v24
+
+ eor XL.16b, XM.16b, T2.16b
+
+ enc_round KS0, v25
+
+ ext T2.16b, XL.16b, XL.16b, #8
+
+ enc_round KS1, v25
+
+ pmull XL.1q, XL.1d, MASK.1d
+ eor T2.16b, T2.16b, XH.16b
+
+ enc_round KS0, v26
+
+ eor XL.16b, XL.16b, T2.16b
+ rev64 T1.16b, INP1.16b
+
+ enc_round KS1, v26
ext T2.16b, XL.16b, XL.16b, #8
ext IN1.16b, T1.16b, T1.16b, #8
- enc_round CTR, v22
+ enc_round KS0, v27
eor T1.16b, T1.16b, T2.16b
eor XL.16b, XL.16b, IN1.16b
- enc_round CTR, v23
+ enc_round KS1, v27
pmull2 XH.1q, SHASH.2d, XL.2d // a1 * b1
eor T1.16b, T1.16b, XL.16b
- enc_round CTR, v24
+ enc_round KS0, v28
pmull XL.1q, SHASH.1d, XL.1d // a0 * b0
pmull XM.1q, SHASH2.1d, T1.1d // (a1 + a0)(b1 + b0)
- enc_round CTR, v25
+ enc_round KS1, v28
ext T1.16b, XL.16b, XH.16b, #8
eor T2.16b, XL.16b, XH.16b
eor XM.16b, XM.16b, T1.16b
- enc_round CTR, v26
+ enc_round KS0, v29
eor XM.16b, XM.16b, T2.16b
pmull T2.1q, XL.1d, MASK.1d
- enc_round CTR, v27
+ enc_round KS1, v29
mov XH.d[0], XM.d[1]
mov XM.d[1], XL.d[0]
- enc_round CTR, v28
+ aese KS0.16b, v30.16b
eor XL.16b, XM.16b, T2.16b
- enc_round CTR, v29
+ aese KS1.16b, v30.16b
ext T2.16b, XL.16b, XL.16b, #8
- aese CTR.16b, v30.16b
+ eor KS0.16b, KS0.16b, v31.16b
pmull XL.1q, XL.1d, MASK.1d
eor T2.16b, T2.16b, XH.16b
- eor KS.16b, CTR.16b, v31.16b
+ eor KS1.16b, KS1.16b, v31.16b
eor XL.16b, XL.16b, T2.16b
.if \enc == 0
- eor INP.16b, INP.16b, KS.16b
- st1 {INP.16b}, [x21], #16
+ eor INP0.16b, INP0.16b, KS0.16b
+ eor INP1.16b, INP1.16b, KS1.16b
.endif
+ st1 {INP0.16b-INP1.16b}, [x21], #32
+
cbz w19, 3f
if_will_cond_yield_neon
st1 {XL.2d}, [x20]
.if \enc == 1
- st1 {KS.16b}, [x27]
+ st1 {KS0.16b-KS1.16b}, [x27]
.endif
do_cond_yield_neon
b 0b
@@ -443,7 +506,7 @@ CPU_LE( rev x28, x28 )
3: st1 {XL.2d}, [x20]
.if \enc == 1
- st1 {KS.16b}, [x27]
+ st1 {KS0.16b-KS1.16b}, [x27]
.endif
CPU_LE( rev x28, x28 )
@@ -453,10 +516,14 @@ CPU_LE( rev x28, x28 )
ret
4: b.eq 5f // AES-192?
- enc_round CTR, v17
- enc_round CTR, v18
-5: enc_round CTR, v19
- enc_round CTR, v20
+ enc_round KS0, v17
+ enc_round KS1, v17
+ enc_round KS0, v18
+ enc_round KS1, v18
+5: enc_round KS0, v19
+ enc_round KS1, v19
+ enc_round KS0, v20
+ enc_round KS1, v20
b 2b
.endm
diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c
index 8a10f1d7199a..371f8368c196 100644
--- a/arch/arm64/crypto/ghash-ce-glue.c
+++ b/arch/arm64/crypto/ghash-ce-glue.c
@@ -349,7 +349,7 @@ static int gcm_encrypt(struct aead_request *req)
struct gcm_aes_ctx *ctx = crypto_aead_ctx(aead);
struct skcipher_walk walk;
u8 iv[AES_BLOCK_SIZE];
- u8 ks[AES_BLOCK_SIZE];
+ u8 ks[2 * AES_BLOCK_SIZE];
u8 tag[AES_BLOCK_SIZE];
u64 dg[2] = {};
int err;
@@ -369,12 +369,15 @@ static int gcm_encrypt(struct aead_request *req)
pmull_gcm_encrypt_block(ks, iv, NULL,
num_rounds(&ctx->aes_key));
put_unaligned_be32(3, iv + GCM_IV_SIZE);
+ pmull_gcm_encrypt_block(ks + AES_BLOCK_SIZE, iv, NULL,
+ num_rounds(&ctx->aes_key));
+ put_unaligned_be32(4, iv + GCM_IV_SIZE);
kernel_neon_end();
err = skcipher_walk_aead_encrypt(&walk, req, false);
- while (walk.nbytes >= AES_BLOCK_SIZE) {
- int blocks = walk.nbytes / AES_BLOCK_SIZE;
+ while (walk.nbytes >= 2 * AES_BLOCK_SIZE) {
+ int blocks = walk.nbytes / (2 * AES_BLOCK_SIZE) * 2;
kernel_neon_begin();
pmull_gcm_encrypt(blocks, dg, walk.dst.virt.addr,
@@ -384,7 +387,7 @@ static int gcm_encrypt(struct aead_request *req)
kernel_neon_end();
err = skcipher_walk_done(&walk,
- walk.nbytes % AES_BLOCK_SIZE);
+ walk.nbytes % (2 * AES_BLOCK_SIZE));
}
} else {
__aes_arm64_encrypt(ctx->aes_key.key_enc, tag, iv,
@@ -424,13 +427,21 @@ static int gcm_encrypt(struct aead_request *req)
/* handle the tail */
if (walk.nbytes) {
u8 buf[GHASH_BLOCK_SIZE];
+ unsigned int nbytes = walk.nbytes;
+ u8 *dst = walk.dst.virt.addr;
+ u8 *head = NULL;
- crypto_xor_cpy(walk.dst.virt.addr, walk.src.virt.addr, ks,
- walk.nbytes);
+ crypto_xor_cpy(dst, walk.src.virt.addr, ks, nbytes);
- memcpy(buf, walk.dst.virt.addr, walk.nbytes);
- memset(buf + walk.nbytes, 0, GHASH_BLOCK_SIZE - walk.nbytes);
- ghash_do_update(1, dg, buf, &ctx->ghash_key, NULL);
+ if (walk.nbytes > GHASH_BLOCK_SIZE) {
+ head = dst;
+ dst += GHASH_BLOCK_SIZE;
+ nbytes %= GHASH_BLOCK_SIZE;
+ }
+
+ memcpy(buf, dst, nbytes);
+ memset(buf + nbytes, 0, GHASH_BLOCK_SIZE - nbytes);
+ ghash_do_update(!!nbytes, dg, buf, &ctx->ghash_key, head);
err = skcipher_walk_done(&walk, 0);
}
@@ -453,10 +464,11 @@ static int gcm_decrypt(struct aead_request *req)
struct gcm_aes_ctx *ctx = crypto_aead_ctx(aead);
unsigned int authsize = crypto_aead_authsize(aead);
struct skcipher_walk walk;
- u8 iv[AES_BLOCK_SIZE];
+ u8 iv[2 * AES_BLOCK_SIZE];
u8 tag[AES_BLOCK_SIZE];
- u8 buf[GHASH_BLOCK_SIZE];
+ u8 buf[2 * GHASH_BLOCK_SIZE];
u64 dg[2] = {};
+ int nrounds = num_rounds(&ctx->aes_key);
int err;
if (req->assoclen)
@@ -467,31 +479,40 @@ static int gcm_decrypt(struct aead_request *req)
if (likely(may_use_simd())) {
kernel_neon_begin();
-
- pmull_gcm_encrypt_block(tag, iv, ctx->aes_key.key_enc,
- num_rounds(&ctx->aes_key));
+ pmull_gcm_encrypt_block(tag, iv, ctx->aes_key.key_enc, nrounds);
put_unaligned_be32(2, iv + GCM_IV_SIZE);
kernel_neon_end();
err = skcipher_walk_aead_decrypt(&walk, req, false);
- while (walk.nbytes >= AES_BLOCK_SIZE) {
- int blocks = walk.nbytes / AES_BLOCK_SIZE;
+ while (walk.nbytes >= 2 * AES_BLOCK_SIZE) {
+ int blocks = walk.nbytes / (2 * AES_BLOCK_SIZE) * 2;
kernel_neon_begin();
pmull_gcm_decrypt(blocks, dg, walk.dst.virt.addr,
walk.src.virt.addr, &ctx->ghash_key,
- iv, ctx->aes_key.key_enc,
- num_rounds(&ctx->aes_key));
+ iv, ctx->aes_key.key_enc, nrounds);
kernel_neon_end();
err = skcipher_walk_done(&walk,
- walk.nbytes % AES_BLOCK_SIZE);
+ walk.nbytes % (2 * AES_BLOCK_SIZE));
+ }
+ if (walk.nbytes > AES_BLOCK_SIZE) {
+ u32 ctr = get_unaligned_be32(iv + GCM_IV_SIZE);
+
+ memcpy(iv + AES_BLOCK_SIZE, iv, GCM_IV_SIZE);
+ put_unaligned_be32(ctr + 1,
+ iv + AES_BLOCK_SIZE + GCM_IV_SIZE);
}
if (walk.nbytes) {
kernel_neon_begin();
pmull_gcm_encrypt_block(iv, iv, ctx->aes_key.key_enc,
- num_rounds(&ctx->aes_key));
+ nrounds);
+
+ if (walk.nbytes > AES_BLOCK_SIZE)
+ pmull_gcm_encrypt_block(iv + AES_BLOCK_SIZE,
+ iv + AES_BLOCK_SIZE,
+ NULL, nrounds);
kernel_neon_end();
}
@@ -512,8 +533,7 @@ static int gcm_decrypt(struct aead_request *req)
do {
__aes_arm64_encrypt(ctx->aes_key.key_enc,
- buf, iv,
- num_rounds(&ctx->aes_key));
+ buf, iv, nrounds);
crypto_xor_cpy(dst, src, buf, AES_BLOCK_SIZE);
crypto_inc(iv, AES_BLOCK_SIZE);
@@ -526,14 +546,24 @@ static int gcm_decrypt(struct aead_request *req)
}
if (walk.nbytes)
__aes_arm64_encrypt(ctx->aes_key.key_enc, iv, iv,
- num_rounds(&ctx->aes_key));
+ nrounds);
}
/* handle the tail */
if (walk.nbytes) {
- memcpy(buf, walk.src.virt.addr, walk.nbytes);
- memset(buf + walk.nbytes, 0, GHASH_BLOCK_SIZE - walk.nbytes);
- ghash_do_update(1, dg, buf, &ctx->ghash_key, NULL);
+ const u8 *src = walk.src.virt.addr;
+ const u8 *head = NULL;
+ unsigned int nbytes = walk.nbytes;
+
+ if (walk.nbytes > GHASH_BLOCK_SIZE) {
+ head = src;
+ src += GHASH_BLOCK_SIZE;
+ nbytes %= GHASH_BLOCK_SIZE;
+ }
+
+ memcpy(buf, src, nbytes);
+ memset(buf + nbytes, 0, GHASH_BLOCK_SIZE - nbytes);
+ ghash_do_update(!!nbytes, dg, buf, &ctx->ghash_key, head);
crypto_xor_cpy(walk.dst.virt.addr, walk.src.virt.addr, iv,
walk.nbytes);
@@ -558,7 +588,7 @@ static int gcm_decrypt(struct aead_request *req)
static struct aead_alg gcm_aes_alg = {
.ivsize = GCM_IV_SIZE,
- .chunksize = AES_BLOCK_SIZE,
+ .chunksize = 2 * AES_BLOCK_SIZE,
.maxauthsize = AES_BLOCK_SIZE,
.setkey = gcm_setkey,
.setauthsize = gcm_setauthsize,
--
2.18.0
* [PATCH 2/2] crypto/arm64: aes-ce-gcm - implement 2-way aggregation
From: Ard Biesheuvel @ 2018-07-28 18:54 UTC
To: linux-arm-kernel
Implement a faster version of the GHASH transform which amortizes the
reduction modulo the characteristic polynomial across two input blocks at
a time. This is based on the Intel white paper "Carry-Less Multiplication
Instruction and its Usage for Computing the GCM Mode".
On a Cortex-A53, gcm(aes) throughput increases by 24%, from 3.0 cycles per
byte to 2.4 cycles per byte for large input sizes.
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
Raw numbers after the patch
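For context, the identity the aggregated code relies on, written in plain
notation (+ is xor, * is multiplication in GF(2^128), C1/C2 are two
consecutive input blocks and X is the running GHASH state):

  serial:      X' = ((X + C1) * H + C2) * H    two multiplications, each
                                               followed by a reduction
  aggregated:  X' = (X + C1) * H^2 + C2 * H    the unreduced products are
                                               xored together and reduced
                                               modulo the polynomial once

The new HH register holds H^2, and Hhl holds the xor of the high and low
halves of H and of H^2, which feed the Karatsuba-style middle
multiplications for the two blocks.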
arch/arm64/crypto/ghash-ce-core.S | 87 +++++++-------------
arch/arm64/crypto/ghash-ce-glue.c | 33 ++++++--
2 files changed, 54 insertions(+), 66 deletions(-)
diff --git a/arch/arm64/crypto/ghash-ce-core.S b/arch/arm64/crypto/ghash-ce-core.S
index 437a2fb0f7f9..c144b526abe6 100644
--- a/arch/arm64/crypto/ghash-ce-core.S
+++ b/arch/arm64/crypto/ghash-ce-core.S
@@ -290,6 +290,11 @@ ENDPROC(pmull_ghash_update_p8)
KS1 .req v9
INP0 .req v10
INP1 .req v11
+ HH .req v12
+ Hhl .req v13
+ XLn .req v14
+ XMn .req v15
+ XHn .req v16
.macro load_round_keys, rounds, rk
cmp \rounds, #12
@@ -342,13 +347,13 @@ CPU_LE( rev x28, x28 )
0: mov x0, x25
load_round_keys w26, x0
- ld1 {SHASH.2d}, [x23]
+ add x1, x23, #32
+ ld1 {HH.2d-Hhl.2d}, [x23]
+ ld1 {SHASH.2d}, [x1]
ld1 {XL.2d}, [x20]
movi MASK.16b, #0xe1
- ext SHASH2.16b, SHASH.16b, SHASH.16b, #8
shl MASK.2d, MASK.2d, #57
- eor SHASH2.16b, SHASH2.16b, SHASH.16b
.if \enc == 1
ld1 {KS0.16b-KS1.16b}, [x27]
@@ -372,116 +377,82 @@ CPU_LE( rev x28, x28 )
ins KS0.d[1], x9 // set lower counter
ins KS1.d[1], x10
- rev64 T1.16b, INP0.16b
+ rev64 T1.16b, INP1.16b
cmp w26, #12
b.ge 4f // AES-192/256?
2: enc_round KS0, v21
-
- ext T2.16b, XL.16b, XL.16b, #8
ext IN1.16b, T1.16b, T1.16b, #8
enc_round KS1, v21
-
- eor T1.16b, T1.16b, T2.16b
- eor XL.16b, XL.16b, IN1.16b
+ pmull2 XHn.1q, SHASH.2d, IN1.2d // a1 * b1
enc_round KS0, v22
-
- pmull2 XH.1q, SHASH.2d, XL.2d // a1 * b1
- eor T1.16b, T1.16b, XL.16b
+ eor T1.16b, T1.16b, IN1.16b
enc_round KS1, v22
-
- pmull XL.1q, SHASH.1d, XL.1d // a0 * b0
- pmull XM.1q, SHASH2.1d, T1.1d // (a1 + a0)(b1 + b0)
+ pmull XLn.1q, SHASH.1d, IN1.1d // a0 * b0
enc_round KS0, v23
-
- ext T1.16b, XL.16b, XH.16b, #8
- eor T2.16b, XL.16b, XH.16b
- eor XM.16b, XM.16b, T1.16b
+ pmull XMn.1q, Hhl.1d, T1.1d // (a1 + a0)(b1 + b0)
enc_round KS1, v23
-
- eor XM.16b, XM.16b, T2.16b
- pmull T2.1q, XL.1d, MASK.1d
+ rev64 T1.16b, INP0.16b
+ ext T2.16b, XL.16b, XL.16b, #8
enc_round KS0, v24
-
- mov XH.d[0], XM.d[1]
- mov XM.d[1], XL.d[0]
+ ext IN1.16b, T1.16b, T1.16b, #8
+ eor T1.16b, T1.16b, T2.16b
enc_round KS1, v24
-
- eor XL.16b, XM.16b, T2.16b
+ eor XL.16b, XL.16b, IN1.16b
enc_round KS0, v25
-
- ext T2.16b, XL.16b, XL.16b, #8
+ pmull2 XH.1q, HH.2d, XL.2d // a1 * b1
enc_round KS1, v25
-
- pmull XL.1q, XL.1d, MASK.1d
- eor T2.16b, T2.16b, XH.16b
+ eor T1.16b, T1.16b, XL.16b
enc_round KS0, v26
-
- eor XL.16b, XL.16b, T2.16b
- rev64 T1.16b, INP1.16b
+ pmull XL.1q, HH.1d, XL.1d // a0 * b0
enc_round KS1, v26
-
- ext T2.16b, XL.16b, XL.16b, #8
- ext IN1.16b, T1.16b, T1.16b, #8
+ pmull2 XM.1q, Hhl.2d, T1.2d // (a1 + a0)(b1 + b0)
enc_round KS0, v27
-
- eor T1.16b, T1.16b, T2.16b
- eor XL.16b, XL.16b, IN1.16b
+ eor XH.16b, XH.16b, XHn.16b
+ eor XM.16b, XM.16b, XMn.16b
enc_round KS1, v27
-
- pmull2 XH.1q, SHASH.2d, XL.2d // a1 * b1
- eor T1.16b, T1.16b, XL.16b
+ eor XL.16b, XL.16b, XLn.16b
+ ext T1.16b, XL.16b, XH.16b, #8
enc_round KS0, v28
-
- pmull XL.1q, SHASH.1d, XL.1d // a0 * b0
- pmull XM.1q, SHASH2.1d, T1.1d // (a1 + a0)(b1 + b0)
-
- enc_round KS1, v28
-
- ext T1.16b, XL.16b, XH.16b, #8
eor T2.16b, XL.16b, XH.16b
eor XM.16b, XM.16b, T1.16b
- enc_round KS0, v29
-
+ enc_round KS1, v28
eor XM.16b, XM.16b, T2.16b
+
+ enc_round KS0, v29
pmull T2.1q, XL.1d, MASK.1d
enc_round KS1, v29
-
mov XH.d[0], XM.d[1]
mov XM.d[1], XL.d[0]
aese KS0.16b, v30.16b
-
eor XL.16b, XM.16b, T2.16b
aese KS1.16b, v30.16b
-
ext T2.16b, XL.16b, XL.16b, #8
eor KS0.16b, KS0.16b, v31.16b
-
pmull XL.1q, XL.1d, MASK.1d
eor T2.16b, T2.16b, XH.16b
eor KS1.16b, KS1.16b, v31.16b
-
eor XL.16b, XL.16b, T2.16b
.if \enc == 0
diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c
index 371f8368c196..65a0b8239620 100644
--- a/arch/arm64/crypto/ghash-ce-glue.c
+++ b/arch/arm64/crypto/ghash-ce-glue.c
@@ -46,6 +46,8 @@ struct ghash_desc_ctx {
struct gcm_aes_ctx {
struct crypto_aes_ctx aes_key;
+ u64 h2[2];
+ u64 hhl[2];
struct ghash_key ghash_key;
};
@@ -62,12 +64,11 @@ static void (*pmull_ghash_update)(int blocks, u64 dg[], const char *src,
const char *head);
asmlinkage void pmull_gcm_encrypt(int blocks, u64 dg[], u8 dst[],
- const u8 src[], struct ghash_key const *k,
- u8 ctr[], u32 const rk[], int rounds,
- u8 ks[]);
+ const u8 src[], u64 const *k, u8 ctr[],
+ u32 const rk[], int rounds, u8 ks[]);
asmlinkage void pmull_gcm_decrypt(int blocks, u64 dg[], u8 dst[],
- const u8 src[], struct ghash_key const *k,
+ const u8 src[], u64 const *k,
u8 ctr[], u32 const rk[], int rounds);
asmlinkage void pmull_gcm_encrypt_block(u8 dst[], u8 const src[],
@@ -233,7 +234,8 @@ static int gcm_setkey(struct crypto_aead *tfm, const u8 *inkey,
unsigned int keylen)
{
struct gcm_aes_ctx *ctx = crypto_aead_ctx(tfm);
- u8 key[GHASH_BLOCK_SIZE];
+ be128 h1, h2;
+ u8 *key = (u8 *)&h1;
int ret;
ret = crypto_aes_expand_key(&ctx->aes_key, inkey, keylen);
@@ -245,7 +247,22 @@ static int gcm_setkey(struct crypto_aead *tfm, const u8 *inkey,
__aes_arm64_encrypt(ctx->aes_key.key_enc, key, (u8[AES_BLOCK_SIZE]){},
num_rounds(&ctx->aes_key));
- return __ghash_setkey(&ctx->ghash_key, key, sizeof(key));
+ __ghash_setkey(&ctx->ghash_key, key, sizeof(be128));
+
+ /* calculate H^2 and Hhl (used for 2-way aggregation) */
+ h2 = h1;
+ gf128mul_lle(&h2, &h1);
+
+ ctx->h2[0] = (be64_to_cpu(h2.b) << 1) | (be64_to_cpu(h2.a) >> 63);
+ ctx->h2[1] = (be64_to_cpu(h2.a) << 1) | (be64_to_cpu(h2.b) >> 63);
+
+ if (be64_to_cpu(h2.a) >> 63)
+ ctx->h2[1] ^= 0xc200000000000000UL;
+
+ ctx->hhl[0] = ctx->ghash_key.a ^ ctx->ghash_key.b;
+ ctx->hhl[1] = ctx->h2[0] ^ ctx->h2[1];
+
+ return 0;
}
static int gcm_setauthsize(struct crypto_aead *tfm, unsigned int authsize)
@@ -381,7 +398,7 @@ static int gcm_encrypt(struct aead_request *req)
kernel_neon_begin();
pmull_gcm_encrypt(blocks, dg, walk.dst.virt.addr,
- walk.src.virt.addr, &ctx->ghash_key,
+ walk.src.virt.addr, ctx->h2,
iv, ctx->aes_key.key_enc,
num_rounds(&ctx->aes_key), ks);
kernel_neon_end();
@@ -490,7 +507,7 @@ static int gcm_decrypt(struct aead_request *req)
kernel_neon_begin();
pmull_gcm_decrypt(blocks, dg, walk.dst.virt.addr,
- walk.src.virt.addr, &ctx->ghash_key,
+ walk.src.virt.addr, ctx->h2,
iv, ctx->aes_key.key_enc, nrounds);
kernel_neon_end();
--
2.18.0
tcrypt performance numbers for a Cortex-A53 @ 1 GHz (CONFIG_PREEMPT disabled)
BASELINE:
=========
test 0 (128 bit key, 16 byte blocks): 445165 operations in 1 seconds ( 7122640 bytes)
test 1 (128 bit key, 64 byte blocks): 437076 operations in 1 seconds ( 27972864 bytes)
test 2 (128 bit key, 256 byte blocks): 354203 operations in 1 seconds ( 90675968 bytes)
test 3 (128 bit key, 512 byte blocks): 284031 operations in 1 seconds (145423872 bytes)
test 4 (128 bit key, 1024 byte blocks): 203473 operations in 1 seconds (208356352 bytes)
test 5 (128 bit key, 2048 byte blocks): 129855 operations in 1 seconds (265943040 bytes)
test 6 (128 bit key, 4096 byte blocks): 75686 operations in 1 seconds (310009856 bytes)
test 7 (128 bit key, 8192 byte blocks): 40167 operations in 1 seconds (329048064 bytes)
test 8 (192 bit key, 16 byte blocks): 441610 operations in 1 seconds ( 7065760 bytes)
test 9 (192 bit key, 64 byte blocks): 429364 operations in 1 seconds ( 27479296 bytes)
test 10 (192 bit key, 256 byte blocks): 343303 operations in 1 seconds ( 87885568 bytes)
test 11 (192 bit key, 512 byte blocks): 272029 operations in 1 seconds (139278848 bytes)
test 12 (192 bit key, 1024 byte blocks): 192399 operations in 1 seconds (197016576 bytes)
test 13 (192 bit key, 2048 byte blocks): 121298 operations in 1 seconds (248418304 bytes)
test 14 (192 bit key, 4096 byte blocks): 69994 operations in 1 seconds (286695424 bytes)
test 15 (192 bit key, 8192 byte blocks): 37045 operations in 1 seconds (303472640 bytes)
test 16 (256 bit key, 16 byte blocks): 438244 operations in 1 seconds ( 7011904 bytes)
test 17 (256 bit key, 64 byte blocks): 423345 operations in 1 seconds ( 27094080 bytes)
test 18 (256 bit key, 256 byte blocks): 336844 operations in 1 seconds ( 86232064 bytes)
test 19 (256 bit key, 512 byte blocks): 265711 operations in 1 seconds (136044032 bytes)
test 20 (256 bit key, 1024 byte blocks): 186853 operations in 1 seconds (191337472 bytes)
test 21 (256 bit key, 2048 byte blocks): 117301 operations in 1 seconds (240232448 bytes)
test 22 (256 bit key, 4096 byte blocks): 67513 operations in 1 seconds (276533248 bytes)
test 23 (256 bit key, 8192 byte blocks): 35629 operations in 1 seconds (291872768 bytes)
THIS PATCH:
===========
test 0 (128 bit key, 16 byte blocks): 441257 operations in 1 seconds ( 7060112 bytes)
test 1 (128 bit key, 64 byte blocks): 436595 operations in 1 seconds ( 27942080 bytes)
test 2 (128 bit key, 256 byte blocks): 369839 operations in 1 seconds ( 94678784 bytes)
test 3 (128 bit key, 512 byte blocks): 308239 operations in 1 seconds (157818368 bytes)
test 4 (128 bit key, 1024 byte blocks): 231004 operations in 1 seconds (236548096 bytes)
test 5 (128 bit key, 2048 byte blocks): 153930 operations in 1 seconds (315248640 bytes)
test 6 (128 bit key, 4096 byte blocks): 92739 operations in 1 seconds (379858944 bytes)
test 7 (128 bit key, 8192 byte blocks): 49934 operations in 1 seconds (409059328 bytes)
test 8 (192 bit key, 16 byte blocks): 437427 operations in 1 seconds ( 6998832 bytes)
test 9 (192 bit key, 64 byte blocks): 429462 operations in 1 seconds ( 27485568 bytes)
test 10 (192 bit key, 256 byte blocks): 358183 operations in 1 seconds ( 91694848 bytes)
test 11 (192 bit key, 512 byte blocks): 294539 operations in 1 seconds (150803968 bytes)
test 12 (192 bit key, 1024 byte blocks): 217082 operations in 1 seconds (222291968 bytes)
test 13 (192 bit key, 2048 byte blocks): 140672 operations in 1 seconds (288096256 bytes)
test 14 (192 bit key, 4096 byte blocks): 84369 operations in 1 seconds (345575424 bytes)
test 15 (192 bit key, 8192 byte blocks): 45280 operations in 1 seconds (370933760 bytes)
test 16 (256 bit key, 16 byte blocks): 434127 operations in 1 seconds ( 6946032 bytes)
test 17 (256 bit key, 64 byte blocks): 423837 operations in 1 seconds ( 27125568 bytes)
test 18 (256 bit key, 256 byte blocks): 351244 operations in 1 seconds ( 89918464 bytes)
test 19 (256 bit key, 512 byte blocks): 286884 operations in 1 seconds (146884608 bytes)
test 20 (256 bit key, 1024 byte blocks): 209954 operations in 1 seconds (214992896 bytes)
test 21 (256 bit key, 2048 byte blocks): 136553 operations in 1 seconds (279660544 bytes)
test 22 (256 bit key, 4096 byte blocks): 80749 operations in 1 seconds (330747904 bytes)
test 23 (256 bit key, 8192 byte blocks): 43118 operations in 1 seconds (353222656 bytes)
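For reference, the cycles-per-byte figures quoted in the commit message
follow from the 128 bit key, 8192-byte results above (the core runs at
1 GHz, i.e. 10^9 cycles per second):

  baseline:    10^9 / 329048064 bytes/s  ~ 3.04 cycles per byte
  this patch:  10^9 / 409059328 bytes/s  ~ 2.44 cycles per byte

i.e. a throughput ratio of 409059328 / 329048064 ~ 1.24, or roughly 24%.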