From: Eric Biggers <ebiggers@kernel.org>
To: linux-crypto@vger.kernel.org
Cc: stable@vger.kernel.org, Martin Willi <martin@strongswan.org>,
"Jason A . Donenfeld" <Jason@zx2c4.com>
Subject: [RFC/RFT PATCH 01/18] crypto: x86/poly1305 - fix overflow during partial reduction
Date: Sun, 31 Mar 2019 13:04:11 -0700 [thread overview]
Message-ID: <20190331200428.26597-2-ebiggers@kernel.org> (raw)
In-Reply-To: <20190331200428.26597-1-ebiggers@kernel.org>
From: Eric Biggers <ebiggers@google.com>
The x86_64 implementation of Poly1305 produces the wrong result on some
inputs because poly1305_4block_avx2() incorrectly assumes that when
partially reducing the accumulator, the bits carried from limb 'd4' to
limb 'h0' fit in a 32-bit integer. This is true for poly1305-generic
which processes only one block at a time. However, it's not true for
the AVX2 implementation, which processes 4 blocks at a time and
therefore can produce intermediate limbs about 4x larger.
Fix it by making the relevant calculations use 64-bit arithmetic rather
than 32-bit. Note that most of the carries already used 64-bit
arithmetic, but the d4 -> h0 carry was different for some reason.
To be safe I also made the same change to the corresponding SSE2 code,
though that only operates on 1 or 2 blocks at a time. I don't think
it's really needed for poly1305_block_sse2(), but it doesn't hurt
because it's already x86_64 code. It *might* be needed for
poly1305_2block_sse2(), but overflows aren't easy to reproduce there.
This bug was originally detected by my patches that improve testmgr to
fuzz algorithms against their generic implementation. But also add a
test vector which reproduces it directly (in the AVX2 case).
Fixes: b1ccc8f4b631 ("crypto: poly1305 - Add a four block AVX2 variant for x86_64")
Fixes: c70f4abef07a ("crypto: poly1305 - Add a SSE2 SIMD variant for x86_64")
Cc: <stable@vger.kernel.org> # v4.3+
Cc: Martin Willi <martin@strongswan.org>
Cc: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
arch/x86/crypto/poly1305-avx2-x86_64.S | 14 +++++---
arch/x86/crypto/poly1305-sse2-x86_64.S | 22 ++++++++-----
crypto/testmgr.h | 44 +++++++++++++++++++++++++-
3 files changed, 67 insertions(+), 13 deletions(-)
diff --git a/arch/x86/crypto/poly1305-avx2-x86_64.S b/arch/x86/crypto/poly1305-avx2-x86_64.S
index 3b6e70d085da8..8457cdd47f751 100644
--- a/arch/x86/crypto/poly1305-avx2-x86_64.S
+++ b/arch/x86/crypto/poly1305-avx2-x86_64.S
@@ -323,6 +323,12 @@ ENTRY(poly1305_4block_avx2)
vpaddq t2,t1,t1
vmovq t1x,d4
+ # Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 ->
+ # h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small
+ # amount. Careful: we must not assume the carry bits 'd0 >> 26',
+ # 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit
+ # integers. It's true in a single-block implementation, but not here.
+
# d1 += d0 >> 26
mov d0,%rax
shr $26,%rax
@@ -361,16 +367,16 @@ ENTRY(poly1305_4block_avx2)
# h0 += (d4 >> 26) * 5
mov d4,%rax
shr $26,%rax
- lea (%eax,%eax,4),%eax
- add %eax,%ebx
+ lea (%rax,%rax,4),%rax
+ add %rax,%rbx
# h4 = d4 & 0x3ffffff
mov d4,%rax
and $0x3ffffff,%eax
mov %eax,h4
# h1 += h0 >> 26
- mov %ebx,%eax
- shr $26,%eax
+ mov %rbx,%rax
+ shr $26,%rax
add %eax,h1
# h0 = h0 & 0x3ffffff
andl $0x3ffffff,%ebx
diff --git a/arch/x86/crypto/poly1305-sse2-x86_64.S b/arch/x86/crypto/poly1305-sse2-x86_64.S
index e6add74d78a59..6f0be7a869641 100644
--- a/arch/x86/crypto/poly1305-sse2-x86_64.S
+++ b/arch/x86/crypto/poly1305-sse2-x86_64.S
@@ -253,16 +253,16 @@ ENTRY(poly1305_block_sse2)
# h0 += (d4 >> 26) * 5
mov d4,%rax
shr $26,%rax
- lea (%eax,%eax,4),%eax
- add %eax,%ebx
+ lea (%rax,%rax,4),%rax
+ add %rax,%rbx
# h4 = d4 & 0x3ffffff
mov d4,%rax
and $0x3ffffff,%eax
mov %eax,h4
# h1 += h0 >> 26
- mov %ebx,%eax
- shr $26,%eax
+ mov %rbx,%rax
+ shr $26,%rax
add %eax,h1
# h0 = h0 & 0x3ffffff
andl $0x3ffffff,%ebx
@@ -524,6 +524,12 @@ ENTRY(poly1305_2block_sse2)
paddq t2,t1
movq t1,d4
+ # Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 ->
+ # h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small
+ # amount. Careful: we must not assume the carry bits 'd0 >> 26',
+ # 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit
+ # integers. It's true in a single-block implementation, but not here.
+
# d1 += d0 >> 26
mov d0,%rax
shr $26,%rax
@@ -562,16 +568,16 @@ ENTRY(poly1305_2block_sse2)
# h0 += (d4 >> 26) * 5
mov d4,%rax
shr $26,%rax
- lea (%eax,%eax,4),%eax
- add %eax,%ebx
+ lea (%rax,%rax,4),%rax
+ add %rax,%rbx
# h4 = d4 & 0x3ffffff
mov d4,%rax
and $0x3ffffff,%eax
mov %eax,h4
# h1 += h0 >> 26
- mov %ebx,%eax
- shr $26,%eax
+ mov %rbx,%rax
+ shr $26,%rax
add %eax,h1
# h0 = h0 & 0x3ffffff
andl $0x3ffffff,%ebx
diff --git a/crypto/testmgr.h b/crypto/testmgr.h
index f267633cf13ac..d18a37629f053 100644
--- a/crypto/testmgr.h
+++ b/crypto/testmgr.h
@@ -5634,7 +5634,49 @@ static const struct hash_testvec poly1305_tv_template[] = {
.psize = 80,
.digest = "\x13\x00\x00\x00\x00\x00\x00\x00"
"\x00\x00\x00\x00\x00\x00\x00\x00",
- },
+ }, { /* Regression test for overflow in AVX2 implementation */
+ .plaintext = "\xff\xff\xff\xff\xff\xff\xff\xff"
+ "\xff\xff\xff\xff\xff\xff\xff\xff"
+ "\xff\xff\xff\xff\xff\xff\xff\xff"
+ "\xff\xff\xff\xff\xff\xff\xff\xff"
+ "\xff\xff\xff\xff\xff\xff\xff\xff"
+ "\xff\xff\xff\xff\xff\xff\xff\xff"
+ "\xff\xff\xff\xff\xff\xff\xff\xff"
+ "\xff\xff\xff\xff\xff\xff\xff\xff"
+ "\xff\xff\xff\xff\xff\xff\xff\xff"
+ "\xff\xff\xff\xff\xff\xff\xff\xff"
+ "\xff\xff\xff\xff\xff\xff\xff\xff"
+ "\xff\xff\xff\xff\xff\xff\xff\xff"
+ "\xff\xff\xff\xff\xff\xff\xff\xff"
+ "\xff\xff\xff\xff\xff\xff\xff\xff"
+ "\xff\xff\xff\xff\xff\xff\xff\xff"
+ "\xff\xff\xff\xff\xff\xff\xff\xff"
+ "\xff\xff\xff\xff\xff\xff\xff\xff"
+ "\xff\xff\xff\xff\xff\xff\xff\xff"
+ "\xff\xff\xff\xff\xff\xff\xff\xff"
+ "\xff\xff\xff\xff\xff\xff\xff\xff"
+ "\xff\xff\xff\xff\xff\xff\xff\xff"
+ "\xff\xff\xff\xff\xff\xff\xff\xff"
+ "\xff\xff\xff\xff\xff\xff\xff\xff"
+ "\xff\xff\xff\xff\xff\xff\xff\xff"
+ "\xff\xff\xff\xff\xff\xff\xff\xff"
+ "\xff\xff\xff\xff\xff\xff\xff\xff"
+ "\xff\xff\xff\xff\xff\xff\xff\xff"
+ "\xff\xff\xff\xff\xff\xff\xff\xff"
+ "\xff\xff\xff\xff\xff\xff\xff\xff"
+ "\xff\xff\xff\xff\xff\xff\xff\xff"
+ "\xff\xff\xff\xff\xff\xff\xff\xff"
+ "\xff\xff\xff\xff\xff\xff\xff\xff"
+ "\xff\xff\xff\xff\xff\xff\xff\xff"
+ "\xff\xff\xff\xff\xff\xff\xff\xff"
+ "\xff\xff\xff\xff\xff\xff\xff\xff"
+ "\xff\xff\xff\xff\xff\xff\xff\xff"
+ "\xff\xff\xff\xff\xff\xff\xff\xff"
+ "\xff\xff\xff\xff",
+ .psize = 300,
+ .digest = "\xfb\x5e\x96\xd8\x61\xd5\xc7\xc8"
+ "\x78\xe5\x87\xcc\x2d\x5a\x22\xe1",
+ }
};
/* NHPoly1305 test vectors from https://github.com/google/adiantum */
--
2.21.0
next parent reply other threads:[~2019-03-31 20:06 UTC|newest]
Thread overview: 15+ messages / expand[flat|nested] mbox.gz Atom feed top
[not found] <20190331200428.26597-1-ebiggers@kernel.org>
2019-03-31 20:04 ` Eric Biggers [this message]
2019-04-01 7:52 ` [RFC/RFT PATCH 01/18] crypto: x86/poly1305 - fix overflow during partial reduction Martin Willi
2019-03-31 20:04 ` [RFC/RFT PATCH 02/18] crypto: crct10dif-generic - fix use via crypto_shash_digest() Eric Biggers
2019-03-31 20:04 ` [RFC/RFT PATCH 03/18] crypto: x86/crct10dif-pcl " Eric Biggers
2019-03-31 20:04 ` [RFC/RFT PATCH 04/18] crypto: skcipher - restore default skcipher_walk::iv on error Eric Biggers
2019-04-08 6:23 ` Herbert Xu
2019-04-08 17:27 ` Eric Biggers
2019-04-09 6:37 ` Herbert Xu
2019-03-31 20:04 ` [RFC/RFT PATCH 05/18] crypto: skcipher - don't WARN on unprocessed data after slow walk step Eric Biggers
2019-03-31 20:04 ` [RFC/RFT PATCH 06/18] crypto: chacha20poly1305 - set cra_name correctly Eric Biggers
2019-04-01 7:57 ` Martin Willi
2019-03-31 20:04 ` [RFC/RFT PATCH 07/18] crypto: gcm - fix incompatibility between "gcm" and "gcm_base" Eric Biggers
2019-04-01 15:56 ` Eric Biggers
2019-03-31 20:04 ` [RFC/RFT PATCH 08/18] crypto: ccm - fix incompatibility between "ccm" and "ccm_base" Eric Biggers
2019-04-02 15:48 ` Sasha Levin
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20190331200428.26597-2-ebiggers@kernel.org \
--to=ebiggers@kernel.org \
--cc=Jason@zx2c4.com \
--cc=linux-crypto@vger.kernel.org \
--cc=martin@strongswan.org \
--cc=stable@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).