From: Ard Biesheuvel <ardb@kernel.org>
To: linux-crypto@vger.kernel.org
Cc: herbert@gondor.apana.org.au,
linux-arm-kernel@lists.infradead.org,
Ard Biesheuvel <ardb@kernel.org>,
Ondrej Mosnacek <omosnacek@gmail.com>,
Eric Biggers <ebiggers@kernel.org>
Subject: [PATCH v3 2/4] crypto: aegis128/neon - optimize tail block handling
Date: Tue, 17 Nov 2020 14:32:12 +0100 [thread overview]
Message-ID: <20201117133214.29114-3-ardb@kernel.org> (raw)
In-Reply-To: <20201117133214.29114-1-ardb@kernel.org>
Avoid copying the tail block via a stack buffer if the total size
exceeds a single AEGIS block. In this case, we can use overlapping
loads and stores and NEON permutation instructions instead, which
leads to a modest performance improvement on some cores (< 5%),
and is slightly cleaner. Note that we still need to use a stack
buffer if the entire input is smaller than 16 bytes, given that
we cannot use 16 byte NEON loads and stores safely in this case.
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
crypto/aegis128-neon-inner.c | 89 +++++++++++++++++---
1 file changed, 75 insertions(+), 14 deletions(-)
diff --git a/crypto/aegis128-neon-inner.c b/crypto/aegis128-neon-inner.c
index 2a660ac1bc3a..cd1b3ad1d1f3 100644
--- a/crypto/aegis128-neon-inner.c
+++ b/crypto/aegis128-neon-inner.c
@@ -20,7 +20,6 @@
extern int aegis128_have_aes_insn;
void *memcpy(void *dest, const void *src, size_t n);
-void *memset(void *s, int c, size_t n);
struct aegis128_state {
uint8x16_t v[5];
@@ -173,10 +172,46 @@ void crypto_aegis128_update_neon(void *state, const void *msg)
aegis128_save_state_neon(st, state);
}
+#ifdef CONFIG_ARM
+/*
+ * AArch32 does not provide these intrinsics natively because it does not
+ * implement the underlying instructions. AArch32 only provides 64-bit
+ * wide vtbl.8/vtbx.8 instruction, so use those instead.
+ */
+static uint8x16_t vqtbl1q_u8(uint8x16_t a, uint8x16_t b)
+{
+ union {
+ uint8x16_t val;
+ uint8x8x2_t pair;
+ } __a = { a };
+
+ return vcombine_u8(vtbl2_u8(__a.pair, vget_low_u8(b)),
+ vtbl2_u8(__a.pair, vget_high_u8(b)));
+}
+
+static uint8x16_t vqtbx1q_u8(uint8x16_t v, uint8x16_t a, uint8x16_t b)
+{
+ union {
+ uint8x16_t val;
+ uint8x8x2_t pair;
+ } __a = { a };
+
+ return vcombine_u8(vtbx2_u8(vget_low_u8(v), __a.pair, vget_low_u8(b)),
+ vtbx2_u8(vget_high_u8(v), __a.pair, vget_high_u8(b)));
+}
+#endif
+
+static const uint8_t permute[] __aligned(64) = {
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+};
+
void crypto_aegis128_encrypt_chunk_neon(void *state, void *dst, const void *src,
unsigned int size)
{
struct aegis128_state st = aegis128_load_state_neon(state);
+ const int short_input = size < AEGIS_BLOCK_SIZE;
uint8x16_t msg;
preload_sbox();
@@ -186,7 +221,8 @@ void crypto_aegis128_encrypt_chunk_neon(void *state, void *dst, const void *src,
msg = vld1q_u8(src);
st = aegis128_update_neon(st, msg);
- vst1q_u8(dst, msg ^ s);
+ msg ^= s;
+ vst1q_u8(dst, msg);
size -= AEGIS_BLOCK_SIZE;
src += AEGIS_BLOCK_SIZE;
@@ -195,13 +231,26 @@ void crypto_aegis128_encrypt_chunk_neon(void *state, void *dst, const void *src,
if (size > 0) {
uint8x16_t s = st.v[1] ^ (st.v[2] & st.v[3]) ^ st.v[4];
- uint8_t buf[AEGIS_BLOCK_SIZE] = {};
+ uint8_t buf[AEGIS_BLOCK_SIZE];
+ const void *in = src;
+ void *out = dst;
+ uint8x16_t m;
- memcpy(buf, src, size);
- msg = vld1q_u8(buf);
- st = aegis128_update_neon(st, msg);
- vst1q_u8(buf, msg ^ s);
- memcpy(dst, buf, size);
+ if (__builtin_expect(short_input, 0))
+ in = out = memcpy(buf + AEGIS_BLOCK_SIZE - size, src, size);
+
+ m = vqtbl1q_u8(vld1q_u8(in + size - AEGIS_BLOCK_SIZE),
+ vld1q_u8(permute + 32 - size));
+
+ st = aegis128_update_neon(st, m);
+
+ vst1q_u8(out + size - AEGIS_BLOCK_SIZE,
+ vqtbl1q_u8(m ^ s, vld1q_u8(permute + size)));
+
+ if (__builtin_expect(short_input, 0))
+ memcpy(dst, out, size);
+ else
+ vst1q_u8(out - AEGIS_BLOCK_SIZE, msg);
}
aegis128_save_state_neon(st, state);
@@ -211,6 +260,7 @@ void crypto_aegis128_decrypt_chunk_neon(void *state, void *dst, const void *src,
unsigned int size)
{
struct aegis128_state st = aegis128_load_state_neon(state);
+ const int short_input = size < AEGIS_BLOCK_SIZE;
uint8x16_t msg;
preload_sbox();
@@ -228,14 +278,25 @@ void crypto_aegis128_decrypt_chunk_neon(void *state, void *dst, const void *src,
if (size > 0) {
uint8x16_t s = st.v[1] ^ (st.v[2] & st.v[3]) ^ st.v[4];
uint8_t buf[AEGIS_BLOCK_SIZE];
+ const void *in = src;
+ void *out = dst;
+ uint8x16_t m;
- vst1q_u8(buf, s);
- memcpy(buf, src, size);
- msg = vld1q_u8(buf) ^ s;
- vst1q_u8(buf, msg);
- memcpy(dst, buf, size);
+ if (__builtin_expect(short_input, 0))
+ in = out = memcpy(buf + AEGIS_BLOCK_SIZE - size, src, size);
- st = aegis128_update_neon(st, msg);
+ m = s ^ vqtbx1q_u8(s, vld1q_u8(in + size - AEGIS_BLOCK_SIZE),
+ vld1q_u8(permute + 32 - size));
+
+ st = aegis128_update_neon(st, m);
+
+ vst1q_u8(out + size - AEGIS_BLOCK_SIZE,
+ vqtbl1q_u8(m, vld1q_u8(permute + size)));
+
+ if (__builtin_expect(short_input, 0))
+ memcpy(dst, out, size);
+ else
+ vst1q_u8(out - AEGIS_BLOCK_SIZE, msg);
}
aegis128_save_state_neon(st, state);
--
2.17.1
WARNING: multiple messages have this Message-ID (diff)
From: Ard Biesheuvel <ardb@kernel.org>
To: linux-crypto@vger.kernel.org
Cc: Eric Biggers <ebiggers@kernel.org>,
Ondrej Mosnacek <omosnacek@gmail.com>,
herbert@gondor.apana.org.au,
linux-arm-kernel@lists.infradead.org,
Ard Biesheuvel <ardb@kernel.org>
Subject: [PATCH v3 2/4] crypto: aegis128/neon - optimize tail block handling
Date: Tue, 17 Nov 2020 14:32:12 +0100 [thread overview]
Message-ID: <20201117133214.29114-3-ardb@kernel.org> (raw)
In-Reply-To: <20201117133214.29114-1-ardb@kernel.org>
Avoid copying the tail block via a stack buffer if the total size
exceeds a single AEGIS block. In this case, we can use overlapping
loads and stores and NEON permutation instructions instead, which
leads to a modest performance improvement on some cores (< 5%),
and is slightly cleaner. Note that we still need to use a stack
buffer if the entire input is smaller than 16 bytes, given that
we cannot use 16 byte NEON loads and stores safely in this case.
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
crypto/aegis128-neon-inner.c | 89 +++++++++++++++++---
1 file changed, 75 insertions(+), 14 deletions(-)
diff --git a/crypto/aegis128-neon-inner.c b/crypto/aegis128-neon-inner.c
index 2a660ac1bc3a..cd1b3ad1d1f3 100644
--- a/crypto/aegis128-neon-inner.c
+++ b/crypto/aegis128-neon-inner.c
@@ -20,7 +20,6 @@
extern int aegis128_have_aes_insn;
void *memcpy(void *dest, const void *src, size_t n);
-void *memset(void *s, int c, size_t n);
struct aegis128_state {
uint8x16_t v[5];
@@ -173,10 +172,46 @@ void crypto_aegis128_update_neon(void *state, const void *msg)
aegis128_save_state_neon(st, state);
}
+#ifdef CONFIG_ARM
+/*
+ * AArch32 does not provide these intrinsics natively because it does not
+ * implement the underlying instructions. AArch32 only provides 64-bit
+ * wide vtbl.8/vtbx.8 instruction, so use those instead.
+ */
+static uint8x16_t vqtbl1q_u8(uint8x16_t a, uint8x16_t b)
+{
+ union {
+ uint8x16_t val;
+ uint8x8x2_t pair;
+ } __a = { a };
+
+ return vcombine_u8(vtbl2_u8(__a.pair, vget_low_u8(b)),
+ vtbl2_u8(__a.pair, vget_high_u8(b)));
+}
+
+static uint8x16_t vqtbx1q_u8(uint8x16_t v, uint8x16_t a, uint8x16_t b)
+{
+ union {
+ uint8x16_t val;
+ uint8x8x2_t pair;
+ } __a = { a };
+
+ return vcombine_u8(vtbx2_u8(vget_low_u8(v), __a.pair, vget_low_u8(b)),
+ vtbx2_u8(vget_high_u8(v), __a.pair, vget_high_u8(b)));
+}
+#endif
+
+static const uint8_t permute[] __aligned(64) = {
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+};
+
void crypto_aegis128_encrypt_chunk_neon(void *state, void *dst, const void *src,
unsigned int size)
{
struct aegis128_state st = aegis128_load_state_neon(state);
+ const int short_input = size < AEGIS_BLOCK_SIZE;
uint8x16_t msg;
preload_sbox();
@@ -186,7 +221,8 @@ void crypto_aegis128_encrypt_chunk_neon(void *state, void *dst, const void *src,
msg = vld1q_u8(src);
st = aegis128_update_neon(st, msg);
- vst1q_u8(dst, msg ^ s);
+ msg ^= s;
+ vst1q_u8(dst, msg);
size -= AEGIS_BLOCK_SIZE;
src += AEGIS_BLOCK_SIZE;
@@ -195,13 +231,26 @@ void crypto_aegis128_encrypt_chunk_neon(void *state, void *dst, const void *src,
if (size > 0) {
uint8x16_t s = st.v[1] ^ (st.v[2] & st.v[3]) ^ st.v[4];
- uint8_t buf[AEGIS_BLOCK_SIZE] = {};
+ uint8_t buf[AEGIS_BLOCK_SIZE];
+ const void *in = src;
+ void *out = dst;
+ uint8x16_t m;
- memcpy(buf, src, size);
- msg = vld1q_u8(buf);
- st = aegis128_update_neon(st, msg);
- vst1q_u8(buf, msg ^ s);
- memcpy(dst, buf, size);
+ if (__builtin_expect(short_input, 0))
+ in = out = memcpy(buf + AEGIS_BLOCK_SIZE - size, src, size);
+
+ m = vqtbl1q_u8(vld1q_u8(in + size - AEGIS_BLOCK_SIZE),
+ vld1q_u8(permute + 32 - size));
+
+ st = aegis128_update_neon(st, m);
+
+ vst1q_u8(out + size - AEGIS_BLOCK_SIZE,
+ vqtbl1q_u8(m ^ s, vld1q_u8(permute + size)));
+
+ if (__builtin_expect(short_input, 0))
+ memcpy(dst, out, size);
+ else
+ vst1q_u8(out - AEGIS_BLOCK_SIZE, msg);
}
aegis128_save_state_neon(st, state);
@@ -211,6 +260,7 @@ void crypto_aegis128_decrypt_chunk_neon(void *state, void *dst, const void *src,
unsigned int size)
{
struct aegis128_state st = aegis128_load_state_neon(state);
+ const int short_input = size < AEGIS_BLOCK_SIZE;
uint8x16_t msg;
preload_sbox();
@@ -228,14 +278,25 @@ void crypto_aegis128_decrypt_chunk_neon(void *state, void *dst, const void *src,
if (size > 0) {
uint8x16_t s = st.v[1] ^ (st.v[2] & st.v[3]) ^ st.v[4];
uint8_t buf[AEGIS_BLOCK_SIZE];
+ const void *in = src;
+ void *out = dst;
+ uint8x16_t m;
- vst1q_u8(buf, s);
- memcpy(buf, src, size);
- msg = vld1q_u8(buf) ^ s;
- vst1q_u8(buf, msg);
- memcpy(dst, buf, size);
+ if (__builtin_expect(short_input, 0))
+ in = out = memcpy(buf + AEGIS_BLOCK_SIZE - size, src, size);
- st = aegis128_update_neon(st, msg);
+ m = s ^ vqtbx1q_u8(s, vld1q_u8(in + size - AEGIS_BLOCK_SIZE),
+ vld1q_u8(permute + 32 - size));
+
+ st = aegis128_update_neon(st, m);
+
+ vst1q_u8(out + size - AEGIS_BLOCK_SIZE,
+ vqtbl1q_u8(m, vld1q_u8(permute + size)));
+
+ if (__builtin_expect(short_input, 0))
+ memcpy(dst, out, size);
+ else
+ vst1q_u8(out - AEGIS_BLOCK_SIZE, msg);
}
aegis128_save_state_neon(st, state);
--
2.17.1
_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
next prev parent reply other threads:[~2020-11-17 13:33 UTC|newest]
Thread overview: 23+ messages / expand[flat|nested] mbox.gz Atom feed top
2020-11-17 13:32 [PATCH v3 0/4] crypto: aegis128 enhancements Ard Biesheuvel
2020-11-17 13:32 ` Ard Biesheuvel
2020-11-17 13:32 ` [PATCH v3 1/4] crypto: aegis128 - wipe plaintext and tag if decryption fails Ard Biesheuvel
2020-11-17 13:32 ` Ard Biesheuvel
2020-11-17 13:32 ` Ard Biesheuvel [this message]
2020-11-17 13:32 ` [PATCH v3 2/4] crypto: aegis128/neon - optimize tail block handling Ard Biesheuvel
2020-11-17 13:32 ` [PATCH v3 3/4] crypto: aegis128/neon - move final tag check to SIMD domain Ard Biesheuvel
2020-11-17 13:32 ` Ard Biesheuvel
2020-11-17 13:32 ` [PATCH v3 4/4] crypto: aegis128 - expose SIMD code path as separate driver Ard Biesheuvel
2020-11-17 13:32 ` Ard Biesheuvel
2020-11-20 8:55 ` Ondrej Mosnáček
2020-11-20 8:55 ` Ondrej Mosnáček
2020-11-27 6:24 ` [PATCH v3 0/4] crypto: aegis128 enhancements Herbert Xu
2020-11-27 6:24 ` Herbert Xu
2020-11-30 9:37 ` Geert Uytterhoeven
2020-11-30 9:37 ` Geert Uytterhoeven
2020-11-30 9:43 ` Ard Biesheuvel
2020-11-30 9:43 ` Ard Biesheuvel
2020-11-30 9:45 ` Ard Biesheuvel
2020-11-30 9:45 ` Ard Biesheuvel
2020-11-30 12:14 ` Geert Uytterhoeven
2020-11-30 12:14 ` Geert Uytterhoeven
-- strict thread matches above, loose matches on Subject: below --
2020-11-18 4:25 [PATCH v3 2/4] crypto: aegis128/neon - optimize tail block handling kernel test robot
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20201117133214.29114-3-ardb@kernel.org \
--to=ardb@kernel.org \
--cc=ebiggers@kernel.org \
--cc=herbert@gondor.apana.org.au \
--cc=linux-arm-kernel@lists.infradead.org \
--cc=linux-crypto@vger.kernel.org \
--cc=omosnacek@gmail.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.