[RFC/RFT PATCH 01/18] crypto: x86/poly1305 - fix overflow during partial reduction

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Eric Biggers <ebiggers@kernel.org>
To: linux-crypto@vger.kernel.org
Cc: stable@vger.kernel.org, Martin Willi <martin@strongswan.org>,
	"Jason A . Donenfeld" <Jason@zx2c4.com>
Subject: [RFC/RFT PATCH 01/18] crypto: x86/poly1305 - fix overflow during partial reduction
Date: Sun, 31 Mar 2019 13:04:11 -0700	[thread overview]
Message-ID: <20190331200428.26597-2-ebiggers@kernel.org> (raw)
In-Reply-To: <20190331200428.26597-1-ebiggers@kernel.org>

From: Eric Biggers <ebiggers@google.com>

The x86_64 implementation of Poly1305 produces the wrong result on some
inputs because poly1305_4block_avx2() incorrectly assumes that when
partially reducing the accumulator, the bits carried from limb 'd4' to
limb 'h0' fit in a 32-bit integer.  This is true for poly1305-generic
which processes only one block at a time.  However, it's not true for
the AVX2 implementation, which processes 4 blocks at a time and
therefore can produce intermediate limbs about 4x larger.

Fix it by making the relevant calculations use 64-bit arithmetic rather
than 32-bit.  Note that most of the carries already used 64-bit
arithmetic, but the d4 -> h0 carry was different for some reason.

To be safe I also made the same change to the corresponding SSE2 code,
though that only operates on 1 or 2 blocks at a time.  I don't think
it's really needed for poly1305_block_sse2(), but it doesn't hurt
because it's already x86_64 code.  It *might* be needed for
poly1305_2block_sse2(), but overflows aren't easy to reproduce there.

This bug was originally detected by my patches that improve testmgr to
fuzz algorithms against their generic implementation.  But also add a
test vector which reproduces it directly (in the AVX2 case).

Fixes: b1ccc8f4b631 ("crypto: poly1305 - Add a four block AVX2 variant for x86_64")
Fixes: c70f4abef07a ("crypto: poly1305 - Add a SSE2 SIMD variant for x86_64")
Cc: <stable@vger.kernel.org> # v4.3+
Cc: Martin Willi <martin@strongswan.org>
Cc: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 arch/x86/crypto/poly1305-avx2-x86_64.S | 14 +++++---
 arch/x86/crypto/poly1305-sse2-x86_64.S | 22 ++++++++-----
 crypto/testmgr.h                       | 44 +++++++++++++++++++++++++-
 3 files changed, 67 insertions(+), 13 deletions(-)

diff --git a/arch/x86/crypto/poly1305-avx2-x86_64.S b/arch/x86/crypto/poly1305-avx2-x86_64.S
index 3b6e70d085da8..8457cdd47f751 100644
--- a/arch/x86/crypto/poly1305-avx2-x86_64.S
+++ b/arch/x86/crypto/poly1305-avx2-x86_64.S
@@ -323,6 +323,12 @@ ENTRY(poly1305_4block_avx2)
 	vpaddq		t2,t1,t1
 	vmovq		t1x,d4
 
+	# Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 ->
+	# h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small
+	# amount.  Careful: we must not assume the carry bits 'd0 >> 26',
+	# 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit
+	# integers.  It's true in a single-block implementation, but not here.
+
 	# d1 += d0 >> 26
 	mov		d0,%rax
 	shr		$26,%rax
@@ -361,16 +367,16 @@ ENTRY(poly1305_4block_avx2)
 	# h0 += (d4 >> 26) * 5
 	mov		d4,%rax
 	shr		$26,%rax
-	lea		(%eax,%eax,4),%eax
-	add		%eax,%ebx
+	lea		(%rax,%rax,4),%rax
+	add		%rax,%rbx
 	# h4 = d4 & 0x3ffffff
 	mov		d4,%rax
 	and		$0x3ffffff,%eax
 	mov		%eax,h4
 
 	# h1 += h0 >> 26
-	mov		%ebx,%eax
-	shr		$26,%eax
+	mov		%rbx,%rax
+	shr		$26,%rax
 	add		%eax,h1
 	# h0 = h0 & 0x3ffffff
 	andl		$0x3ffffff,%ebx
diff --git a/arch/x86/crypto/poly1305-sse2-x86_64.S b/arch/x86/crypto/poly1305-sse2-x86_64.S
index e6add74d78a59..6f0be7a869641 100644
--- a/arch/x86/crypto/poly1305-sse2-x86_64.S
+++ b/arch/x86/crypto/poly1305-sse2-x86_64.S
@@ -253,16 +253,16 @@ ENTRY(poly1305_block_sse2)
 	# h0 += (d4 >> 26) * 5
 	mov		d4,%rax
 	shr		$26,%rax
-	lea		(%eax,%eax,4),%eax
-	add		%eax,%ebx
+	lea		(%rax,%rax,4),%rax
+	add		%rax,%rbx
 	# h4 = d4 & 0x3ffffff
 	mov		d4,%rax
 	and		$0x3ffffff,%eax
 	mov		%eax,h4
 
 	# h1 += h0 >> 26
-	mov		%ebx,%eax
-	shr		$26,%eax
+	mov		%rbx,%rax
+	shr		$26,%rax
 	add		%eax,h1
 	# h0 = h0 & 0x3ffffff
 	andl		$0x3ffffff,%ebx
@@ -524,6 +524,12 @@ ENTRY(poly1305_2block_sse2)
 	paddq		t2,t1
 	movq		t1,d4
 
+	# Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 ->
+	# h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small
+	# amount.  Careful: we must not assume the carry bits 'd0 >> 26',
+	# 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit
+	# integers.  It's true in a single-block implementation, but not here.
+
 	# d1 += d0 >> 26
 	mov		d0,%rax
 	shr		$26,%rax
@@ -562,16 +568,16 @@ ENTRY(poly1305_2block_sse2)
 	# h0 += (d4 >> 26) * 5
 	mov		d4,%rax
 	shr		$26,%rax
-	lea		(%eax,%eax,4),%eax
-	add		%eax,%ebx
+	lea		(%rax,%rax,4),%rax
+	add		%rax,%rbx
 	# h4 = d4 & 0x3ffffff
 	mov		d4,%rax
 	and		$0x3ffffff,%eax
 	mov		%eax,h4
 
 	# h1 += h0 >> 26
-	mov		%ebx,%eax
-	shr		$26,%eax
+	mov		%rbx,%rax
+	shr		$26,%rax
 	add		%eax,h1
 	# h0 = h0 & 0x3ffffff
 	andl		$0x3ffffff,%ebx
diff --git a/crypto/testmgr.h b/crypto/testmgr.h
index f267633cf13ac..d18a37629f053 100644
--- a/crypto/testmgr.h
+++ b/crypto/testmgr.h
@@ -5634,7 +5634,49 @@ static const struct hash_testvec poly1305_tv_template[] = {
 		.psize		= 80,
 		.digest		= "\x13\x00\x00\x00\x00\x00\x00\x00"
 				  "\x00\x00\x00\x00\x00\x00\x00\x00",
-	},
+	}, { /* Regression test for overflow in AVX2 implementation */
+		.plaintext	= "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff\xff\xff\xff\xff"
+				  "\xff\xff\xff\xff",
+		.psize		= 300,
+		.digest		= "\xfb\x5e\x96\xd8\x61\xd5\xc7\xc8"
+				  "\x78\xe5\x87\xcc\x2d\x5a\x22\xe1",
+	}
 };
 
 /* NHPoly1305 test vectors from https://github.com/google/adiantum */
-- 
2.21.0

next prev parent reply	other threads:[~2019-03-31 20:06 UTC|newest]

Thread overview: 35+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-03-31 20:04 [RFC/RFT PATCH 00/18] crypto: fuzz algorithms against their generic implementation Eric Biggers
2019-03-31 20:04 ` Eric Biggers [this message]
2019-04-01  7:52   ` [RFC/RFT PATCH 01/18] crypto: x86/poly1305 - fix overflow during partial reduction Martin Willi
2019-03-31 20:04 ` [RFC/RFT PATCH 02/18] crypto: crct10dif-generic - fix use via crypto_shash_digest() Eric Biggers
2019-03-31 20:04 ` [RFC/RFT PATCH 03/18] crypto: x86/crct10dif-pcl " Eric Biggers
2019-03-31 20:04 ` [RFC/RFT PATCH 04/18] crypto: skcipher - restore default skcipher_walk::iv on error Eric Biggers
2019-04-08  6:23   ` Herbert Xu
2019-04-08 17:27     ` Eric Biggers
2019-04-09  6:37       ` Herbert Xu
2019-03-31 20:04 ` [RFC/RFT PATCH 05/18] crypto: skcipher - don't WARN on unprocessed data after slow walk step Eric Biggers
2019-03-31 20:04 ` [RFC/RFT PATCH 06/18] crypto: chacha20poly1305 - set cra_name correctly Eric Biggers
2019-04-01  7:57   ` Martin Willi
2019-03-31 20:04 ` [RFC/RFT PATCH 07/18] crypto: gcm - fix incompatibility between "gcm" and "gcm_base" Eric Biggers
2019-04-01 15:56   ` Eric Biggers
2019-03-31 20:04 ` [RFC/RFT PATCH 08/18] crypto: ccm - fix incompatibility between "ccm" and "ccm_base" Eric Biggers
2019-04-02 15:48   ` Sasha Levin
2019-03-31 20:04 ` [RFC/RFT PATCH 09/18] crypto: streebog - fix unaligned memory accesses Eric Biggers
2019-03-31 21:47   ` Vitaly Chikunov
2019-04-02 16:15     ` Vitaly Chikunov
2019-04-02 16:57       ` Eric Biggers
2019-03-31 20:04 ` [RFC/RFT PATCH 10/18] crypto: cts - don't support empty messages Eric Biggers
2019-03-31 20:04 ` [RFC/RFT PATCH 11/18] crypto: arm64/cbcmac - handle empty messages in same way as template Eric Biggers
2019-03-31 20:04 ` [RFC/RFT PATCH 12/18] crypto: testmgr - expand ability to test for errors Eric Biggers
2019-03-31 20:04 ` [RFC/RFT PATCH 13/18] crypto: testmgr - identify test vectors by name rather than number Eric Biggers
2019-03-31 20:04 ` [RFC/RFT PATCH 14/18] crypto: testmgr - add helpers for fuzzing against generic implementation Eric Biggers
2019-03-31 20:04 ` [RFC/RFT PATCH 15/18] crypto: testmgr - fuzz hashes against their " Eric Biggers
2019-03-31 20:04 ` [RFC/RFT PATCH 16/18] crypto: testmgr - fuzz skciphers " Eric Biggers
2019-03-31 20:04 ` [RFC/RFT PATCH 17/18] crypto: testmgr - fuzz AEADs " Eric Biggers
2019-03-31 20:04 ` [RFC/RFT PATCH 18/18] crypto: run initcalls for generic implementations earlier Eric Biggers
2019-04-08  5:53   ` Herbert Xu
2019-04-08 17:44     ` Eric Biggers
2019-04-09  6:24       ` Herbert Xu
2019-04-09 18:16         ` Eric Biggers
2019-04-11  6:26           ` Herbert Xu
2019-04-08  6:44 ` [RFC/RFT PATCH 00/18] crypto: fuzz algorithms against their generic implementation Herbert Xu

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:3b6e70d085da dfblob:8457cdd47f75 dfblob:e6add74d78a5
dfblob:6f0be7a86964 dfblob:f267633cf13a dfblob:d18a37629f05 )
 OR (
bs:"[RFC/RFT PATCH 01/18] crypto: x86/poly1305 - fix overflow during partial reduction" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20190331200428.26597-2-ebiggers@kernel.org \
    --to=ebiggers@kernel.org \
    --cc=Jason@zx2c4.com \
    --cc=linux-crypto@vger.kernel.org \
    --cc=martin@strongswan.org \
    --cc=stable@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.