* [PATCH 01/18] crypto: Add generic 8-bit carry-less multiply routines
2023-07-13 21:14 [RFC PATCH for-8.2 00/18] crypto: Provide clmul.h and host accel Richard Henderson
@ 2023-07-13 21:14 ` Richard Henderson
2023-07-13 21:14 ` [PATCH 02/18] target/arm: Use clmul_8* routines Richard Henderson
` (17 subsequent siblings)
18 siblings, 0 replies; 23+ messages in thread
From: Richard Henderson @ 2023-07-13 21:14 UTC (permalink / raw)
To: qemu-devel; +Cc: berrange, ardb
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
host/include/generic/host/crypto/clmul.h | 17 ++++++
include/crypto/clmul.h | 61 +++++++++++++++++++
crypto/clmul.c | 76 ++++++++++++++++++++++++
crypto/meson.build | 9 ++-
4 files changed, 160 insertions(+), 3 deletions(-)
create mode 100644 host/include/generic/host/crypto/clmul.h
create mode 100644 include/crypto/clmul.h
create mode 100644 crypto/clmul.c
diff --git a/host/include/generic/host/crypto/clmul.h b/host/include/generic/host/crypto/clmul.h
new file mode 100644
index 0000000000..694705f703
--- /dev/null
+++ b/host/include/generic/host/crypto/clmul.h
@@ -0,0 +1,17 @@
+/*
+ * No host specific carry-less multiply acceleration.
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#ifndef GENERIC_HOST_CRYPTO_CLMUL_H
+#define GENERIC_HOST_CRYPTO_CLMUL_H
+
+/* Defer everything to the generic routines. */
+#define clmul_8x8_low clmul_8x8_low_gen
+#define clmul_8x4_even clmul_8x4_even_gen
+#define clmul_8x4_odd clmul_8x4_odd_gen
+#define clmul_8x8_even clmul_8x8_even_gen
+#define clmul_8x8_odd clmul_8x8_odd_gen
+#define clmul_8x8_packed clmul_8x8_packed_gen
+
+#endif /* GENERIC_HOST_CRYPTO_CLMUL_H */
diff --git a/include/crypto/clmul.h b/include/crypto/clmul.h
new file mode 100644
index 0000000000..7f19205d6f
--- /dev/null
+++ b/include/crypto/clmul.h
@@ -0,0 +1,61 @@
+/*
+ * Carry-less multiply
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ *
+ * Copyright (C) 2023 Linaro, Ltd.
+ */
+
+#ifndef CRYPTO_CLMUL_H
+#define CRYPTO_CLMUL_H
+
+#include "qemu/int128.h"
+
+/**
+ * clmul_8x8_low:
+ *
+ * Perform eight 8x8->8 carry-less multiplies.
+ */
+uint64_t clmul_8x8_low_gen(uint64_t, uint64_t);
+
+/**
+ * clmul_8x4_even:
+ *
+ * Perform four 8x8->16 carry-less multiplies.
+ * The odd bytes of the inputs are ignored.
+ */
+uint64_t clmul_8x4_even_gen(uint64_t, uint64_t);
+
+/**
+ * clmul_8x4_odd:
+ *
+ * Perform four 8x8->16 carry-less multiplies.
+ * The even bytes of the inputs are ignored.
+ */
+uint64_t clmul_8x4_odd_gen(uint64_t, uint64_t);
+
+/**
+ * clmul_8x8_even:
+ *
+ * Perform eight 8x8->16 carry-less multiplies.
+ * The odd bytes of the inputs are ignored.
+ */
+Int128 clmul_8x8_even_gen(Int128, Int128);
+
+/**
+ * clmul_8x8_odd:
+ *
+ * Perform eight 8x8->16 carry-less multiplies.
+ * The even bytes of the inputs are ignored.
+ */
+Int128 clmul_8x8_odd_gen(Int128, Int128);
+
+/**
+ * clmul_8x8_packed:
+ *
+ * Perform eight 8x8->16 carry-less multiplies.
+ */
+Int128 clmul_8x8_packed_gen(uint64_t, uint64_t);
+
+#include "host/crypto/clmul.h"
+
+#endif /* CRYPTO_CLMUL_H */
diff --git a/crypto/clmul.c b/crypto/clmul.c
new file mode 100644
index 0000000000..866704e751
--- /dev/null
+++ b/crypto/clmul.c
@@ -0,0 +1,76 @@
+/*
+ * Generic carry-less multiply routines.
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+#include "crypto/clmul.h"
+
+uint64_t clmul_8x8_low_gen(uint64_t n, uint64_t m)
+{
+ uint64_t r = 0;
+
+ for (int i = 0; i < 8; ++i) {
+ uint64_t mask = (n & 0x0101010101010101ull) * 0xff;
+ r ^= m & mask;
+ m = (m << 1) & 0xfefefefefefefefeull;
+ n >>= 1;
+ }
+ return r;
+}
+
+uint64_t clmul_8x4_even_gen(uint64_t n, uint64_t m)
+{
+ uint64_t r = 0;
+
+ n &= 0x00ff00ff00ff00ffull;
+ m &= 0x00ff00ff00ff00ffull;
+
+ for (int i = 0; i < 8; ++i) {
+ uint64_t mask = (n & 0x0001000100010001ull) * 0xffff;
+ r ^= m & mask;
+ n >>= 1;
+ m <<= 1;
+ }
+ return r;
+}
+
+uint64_t clmul_8x4_odd_gen(uint64_t n, uint64_t m)
+{
+ return clmul_8x4_even_gen(n >> 8, m >> 8);
+}
+
+Int128 clmul_8x8_even_gen(Int128 n, Int128 m)
+{
+ uint64_t rl, rh;
+
+ rl = clmul_8x4_even_gen(int128_getlo(n), int128_getlo(m));
+ rh = clmul_8x4_even_gen(int128_gethi(n), int128_gethi(m));
+ return int128_make128(rl, rh);
+}
+
+Int128 clmul_8x8_odd_gen(Int128 n, Int128 m)
+{
+ uint64_t rl, rh;
+
+ rl = clmul_8x4_odd_gen(int128_getlo(n), int128_getlo(m));
+ rh = clmul_8x4_odd_gen(int128_gethi(n), int128_gethi(m));
+ return int128_make128(rl, rh);
+}
+
+static uint64_t unpack_8_to_16(uint64_t x)
+{
+ return (x & 0x000000ff)
+ | ((x & 0x0000ff00) << 8)
+ | ((x & 0x00ff0000) << 16)
+ | ((x & 0xff000000) << 24);
+}
+
+Int128 clmul_8x8_packed_gen(uint64_t n, uint64_t m)
+{
+ uint64_t rl, rh;
+
+ rl = clmul_8x4_even_gen(unpack_8_to_16(n), unpack_8_to_16(m));
+ rh = clmul_8x4_even_gen(unpack_8_to_16(n >> 32), unpack_8_to_16(m >> 32));
+ return int128_make128(rl, rh);
+}
diff --git a/crypto/meson.build b/crypto/meson.build
index 5f03a30d34..9ac1a89802 100644
--- a/crypto/meson.build
+++ b/crypto/meson.build
@@ -48,9 +48,12 @@ if have_afalg
endif
crypto_ss.add(when: gnutls, if_true: files('tls-cipher-suites.c'))
-util_ss.add(files('sm4.c'))
-util_ss.add(files('aes.c'))
-util_ss.add(files('init.c'))
+util_ss.add(files(
+ 'aes.c',
+ 'clmul.c',
+ 'init.c',
+ 'sm4.c',
+))
if gnutls.found()
util_ss.add(gnutls)
endif
--
2.34.1
^ permalink raw reply related [flat|nested] 23+ messages in thread
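The bit-sliced loop in clmul_8x8_low_gen runs all eight byte lanes at once: each iteration expands bit i of every byte of n into a full byte mask, XORs in the correspondingly shifted m, and the 0xfe mask keeps the shift of m from spilling across lane boundaries. A self-contained sketch (hypothetical test program, not part of the series) checking it against a naive per-byte reference; it also shows the defining property of carry-less arithmetic, namely that partial products are XORed rather than added, so 3 multiplied carry-lessly by 3 is 5, not 9:

    #include <assert.h>
    #include <stdint.h>

    /* Copy of the bit-sliced loop from crypto/clmul.c above. */
    static uint64_t clmul_8x8_low_copy(uint64_t n, uint64_t m)
    {
        uint64_t r = 0;

        for (int i = 0; i < 8; ++i) {
            uint64_t mask = (n & 0x0101010101010101ull) * 0xff;
            r ^= m & mask;
            m = (m << 1) & 0xfefefefefefefefeull;
            n >>= 1;
        }
        return r;
    }

    /* Naive one-byte reference: XOR (not add) the partial products. */
    static uint8_t clmul8_ref(uint8_t n, uint8_t m)
    {
        uint8_t r = 0;

        for (int i = 0; i < 8; ++i) {
            if (n & (1u << i)) {
                r ^= (uint8_t)(m << i);
            }
        }
        return r;
    }

    int main(void)
    {
        uint64_t n = 0xfedcba9876543210ull;
        uint64_t m = 0x0123456789abcdefull;
        uint64_t r = clmul_8x8_low_copy(n, m);

        for (int i = 0; i < 8; ++i) {
            assert((uint8_t)(r >> (i * 8)) ==
                   clmul8_ref(n >> (i * 8), m >> (i * 8)));
        }
        assert(clmul8_ref(0x3, 0x3) == 0x5);    /* integer 3 * 3 is 9 */
        return 0;
    }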
* [PATCH 02/18] target/arm: Use clmul_8* routines
2023-07-13 21:14 [RFC PATCH for-8.2 00/18] crypto: Provide clmul.h and host accel Richard Henderson
2023-07-13 21:14 ` [PATCH 01/18] crypto: Add generic 8-bit carry-less multiply routines Richard Henderson
@ 2023-07-13 21:14 ` Richard Henderson
2023-07-13 21:43 ` Philippe Mathieu-Daudé
2023-07-13 21:14 ` [PATCH 03/18] target/s390x: " Richard Henderson
` (16 subsequent siblings)
18 siblings, 1 reply; 23+ messages in thread
From: Richard Henderson @ 2023-07-13 21:14 UTC (permalink / raw)
To: qemu-devel; +Cc: berrange, ardb
Use generic routines for 8-bit carry-less multiply.
Remove our local version of pmull_h.
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
target/arm/tcg/vec_internal.h | 5 ---
target/arm/tcg/mve_helper.c | 8 ++---
target/arm/tcg/vec_helper.c | 63 +++++++----------------------------
3 files changed, 15 insertions(+), 61 deletions(-)
diff --git a/target/arm/tcg/vec_internal.h b/target/arm/tcg/vec_internal.h
index 1f4ed80ff7..c4afba6d9f 100644
--- a/target/arm/tcg/vec_internal.h
+++ b/target/arm/tcg/vec_internal.h
@@ -219,11 +219,6 @@ int16_t do_sqrdmlah_h(int16_t, int16_t, int16_t, bool, bool, uint32_t *);
int32_t do_sqrdmlah_s(int32_t, int32_t, int32_t, bool, bool, uint32_t *);
int64_t do_sqrdmlah_d(int64_t, int64_t, int64_t, bool, bool);
-/*
- * 8 x 8 -> 16 vector polynomial multiply where the inputs are
- * in the low 8 bits of each 16-bit element
-*/
-uint64_t pmull_h(uint64_t op1, uint64_t op2);
/*
* 16 x 16 -> 32 vector polynomial multiply where the inputs are
* in the low 16 bits of each 32-bit element
diff --git a/target/arm/tcg/mve_helper.c b/target/arm/tcg/mve_helper.c
index 403b345ea3..96ddfb4b3a 100644
--- a/target/arm/tcg/mve_helper.c
+++ b/target/arm/tcg/mve_helper.c
@@ -26,6 +26,7 @@
#include "exec/exec-all.h"
#include "tcg/tcg.h"
#include "fpu/softfloat.h"
+#include "crypto/clmul.h"
static uint16_t mve_eci_mask(CPUARMState *env)
{
@@ -984,15 +985,12 @@ DO_2OP_L(vmulltuw, 1, 4, uint32_t, 8, uint64_t, DO_MUL)
* Polynomial multiply. We can always do this generating 64 bits
* of the result at a time, so we don't need to use DO_2OP_L.
*/
-#define VMULLPH_MASK 0x00ff00ff00ff00ffULL
#define VMULLPW_MASK 0x0000ffff0000ffffULL
-#define DO_VMULLPBH(N, M) pmull_h((N) & VMULLPH_MASK, (M) & VMULLPH_MASK)
-#define DO_VMULLPTH(N, M) DO_VMULLPBH((N) >> 8, (M) >> 8)
#define DO_VMULLPBW(N, M) pmull_w((N) & VMULLPW_MASK, (M) & VMULLPW_MASK)
#define DO_VMULLPTW(N, M) DO_VMULLPBW((N) >> 16, (M) >> 16)
-DO_2OP(vmullpbh, 8, uint64_t, DO_VMULLPBH)
-DO_2OP(vmullpth, 8, uint64_t, DO_VMULLPTH)
+DO_2OP(vmullpbh, 8, uint64_t, clmul_8x4_even)
+DO_2OP(vmullpth, 8, uint64_t, clmul_8x4_odd)
DO_2OP(vmullpbw, 8, uint64_t, DO_VMULLPBW)
DO_2OP(vmullptw, 8, uint64_t, DO_VMULLPTW)
diff --git a/target/arm/tcg/vec_helper.c b/target/arm/tcg/vec_helper.c
index f59d3b26ea..4384b6c188 100644
--- a/target/arm/tcg/vec_helper.c
+++ b/target/arm/tcg/vec_helper.c
@@ -23,6 +23,7 @@
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
#include "qemu/int128.h"
+#include "crypto/clmul.h"
#include "vec_internal.h"
/*
@@ -1986,21 +1987,11 @@ void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc)
*/
void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
- intptr_t i, j, opr_sz = simd_oprsz(desc);
+ intptr_t i, opr_sz = simd_oprsz(desc);
uint64_t *d = vd, *n = vn, *m = vm;
for (i = 0; i < opr_sz / 8; ++i) {
- uint64_t nn = n[i];
- uint64_t mm = m[i];
- uint64_t rr = 0;
-
- for (j = 0; j < 8; ++j) {
- uint64_t mask = (nn & 0x0101010101010101ull) * 0xff;
- rr ^= mm & mask;
- mm = (mm << 1) & 0xfefefefefefefefeull;
- nn >>= 1;
- }
- d[i] = rr;
+ d[i] = clmul_8x8_low(n[i], m[i]);
}
clear_tail(d, opr_sz, simd_maxsz(desc));
}
@@ -2038,22 +2029,6 @@ void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
clear_tail(d, opr_sz, simd_maxsz(desc));
}
-/*
- * 8x8->16 polynomial multiply.
- *
- * The byte inputs are expanded to (or extracted from) half-words.
- * Note that neon and sve2 get the inputs from different positions.
- * This allows 4 bytes to be processed in parallel with uint64_t.
- */
-
-static uint64_t expand_byte_to_half(uint64_t x)
-{
- return (x & 0x000000ff)
- | ((x & 0x0000ff00) << 8)
- | ((x & 0x00ff0000) << 16)
- | ((x & 0xff000000) << 24);
-}
-
uint64_t pmull_w(uint64_t op1, uint64_t op2)
{
uint64_t result = 0;
@@ -2067,30 +2042,14 @@ uint64_t pmull_w(uint64_t op1, uint64_t op2)
return result;
}
-uint64_t pmull_h(uint64_t op1, uint64_t op2)
-{
- uint64_t result = 0;
- int i;
- for (i = 0; i < 8; ++i) {
- uint64_t mask = (op1 & 0x0001000100010001ull) * 0xffff;
- result ^= op2 & mask;
- op1 >>= 1;
- op2 <<= 1;
- }
- return result;
-}
-
void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
int hi = simd_data(desc);
uint64_t *d = vd, *n = vn, *m = vm;
- uint64_t nn = n[hi], mm = m[hi];
-
- d[0] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm));
- nn >>= 32;
- mm >>= 32;
- d[1] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm));
+ Int128 r = clmul_8x8_packed(n[hi], m[hi]);
+ d[0] = int128_getlo(r);
+ d[1] = int128_gethi(r);
clear_tail(d, 16, simd_maxsz(desc));
}
@@ -2101,11 +2060,13 @@ void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
intptr_t i, opr_sz = simd_oprsz(desc);
uint64_t *d = vd, *n = vn, *m = vm;
- for (i = 0; i < opr_sz / 8; ++i) {
- uint64_t nn = (n[i] >> shift) & 0x00ff00ff00ff00ffull;
- uint64_t mm = (m[i] >> shift) & 0x00ff00ff00ff00ffull;
+ for (i = 0; i < opr_sz / 8; i += 2) {
+ Int128 nn = int128_make128(n[i] >> shift, n[i + 1] >> shift);
+ Int128 mm = int128_make128(m[i] >> shift, m[i + 1] >> shift);
+ Int128 r = clmul_8x8_even(nn, mm);
- d[i] = pmull_h(nn, mm);
+ d[i] = int128_getlo(r);
+ d[i + 1] = int128_gethi(r);
}
}
--
2.34.1
^ permalink raw reply related [flat|nested] 23+ messages in thread
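A note on the MVE hunk above: the old DO_VMULLPBH/DO_VMULLPTH macros masked or shifted their inputs before calling pmull_h, whereas clmul_8x4_even and clmul_8x4_odd do that lane selection internally, which is why the macro layer can be dropped entirely. The identity being relied on, as a sketch (prototypes assumed from the generic implementation in patch 01):

    #include <stdint.h>

    /* Generic routines from crypto/clmul.c (patch 01). */
    uint64_t clmul_8x4_even_gen(uint64_t, uint64_t);

    /* Old DO_VMULLPBH: mask to the even bytes, then multiply.  Since
     * clmul_8x4_even_gen applies the same mask itself, this equals
     * clmul_8x4_even_gen(n, m) directly. */
    static uint64_t old_vmullpbh(uint64_t n, uint64_t m)
    {
        return clmul_8x4_even_gen(n & 0x00ff00ff00ff00ffull,
                                  m & 0x00ff00ff00ff00ffull);
    }

    /* Old DO_VMULLPTH: shift the odd bytes down first.  This is
     * exactly the definition of clmul_8x4_odd_gen(n, m). */
    static uint64_t old_vmullpth(uint64_t n, uint64_t m)
    {
        return clmul_8x4_even_gen(n >> 8, m >> 8);
    }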
* Re: [PATCH 02/18] target/arm: Use clmul_8* routines
2023-07-13 21:14 ` [PATCH 02/18] target/arm: Use clmul_8* routines Richard Henderson
@ 2023-07-13 21:43 ` Philippe Mathieu-Daudé
0 siblings, 0 replies; 23+ messages in thread
From: Philippe Mathieu-Daudé @ 2023-07-13 21:43 UTC (permalink / raw)
To: Richard Henderson, qemu-devel; +Cc: berrange, ardb
On 13/7/23 23:14, Richard Henderson wrote:
> Use generic routines for 8-bit carry-less multiply.
> Remove our local version of pmull_h.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
> target/arm/tcg/vec_internal.h | 5 ---
> target/arm/tcg/mve_helper.c | 8 ++---
> target/arm/tcg/vec_helper.c | 63 +++++++----------------------------
> 3 files changed, 15 insertions(+), 61 deletions(-)
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
^ permalink raw reply [flat|nested] 23+ messages in thread
* [PATCH 03/18] target/s390x: Use clmul_8* routines
2023-07-13 21:14 [RFC PATCH for-8.2 00/18] crypto: Provide clmul.h and host accel Richard Henderson
2023-07-13 21:14 ` [PATCH 01/18] crypto: Add generic 8-bit carry-less multiply routines Richard Henderson
2023-07-13 21:14 ` [PATCH 02/18] target/arm: Use clmul_8* routines Richard Henderson
@ 2023-07-13 21:14 ` Richard Henderson
2023-07-13 21:14 ` [PATCH 04/18] target/ppc: " Richard Henderson
` (15 subsequent siblings)
18 siblings, 0 replies; 23+ messages in thread
From: Richard Henderson @ 2023-07-13 21:14 UTC (permalink / raw)
To: qemu-devel; +Cc: berrange, ardb
Use generic routines for 8-bit carry-less multiply.
Remove our local version of galois_multiply8.
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
target/s390x/tcg/vec_int_helper.c | 27 ++++++++++++++++++++++++---
1 file changed, 24 insertions(+), 3 deletions(-)
diff --git a/target/s390x/tcg/vec_int_helper.c b/target/s390x/tcg/vec_int_helper.c
index 53ab5c5eb3..e110a7581a 100644
--- a/target/s390x/tcg/vec_int_helper.c
+++ b/target/s390x/tcg/vec_int_helper.c
@@ -14,6 +14,7 @@
#include "vec.h"
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
+#include "crypto/clmul.h"
static bool s390_vec_is_zero(const S390Vector *v)
{
@@ -179,7 +180,6 @@ static uint##TBITS##_t galois_multiply##BITS(uint##TBITS##_t a, \
} \
return res; \
}
-DEF_GALOIS_MULTIPLY(8, 16)
DEF_GALOIS_MULTIPLY(16, 32)
DEF_GALOIS_MULTIPLY(32, 64)
@@ -203,6 +203,29 @@ static S390Vector galois_multiply64(uint64_t a, uint64_t b)
return res;
}
+static Int128 do_gfm8(Int128 n, Int128 m)
+{
+ Int128 e = clmul_8x8_even(n, m);
+ Int128 o = clmul_8x8_odd(n, m);
+ return int128_xor(e, o);
+}
+
+void HELPER(gvec_vgfm8)(void *v1, const void *v2, const void *v3, uint32_t d)
+{
+ /*
+ * There is no carry across the two doublewords, so their order
+ * does not matter, so we need not care for host endianness.
+ */
+ *(Int128 *)v1 = do_gfm8(*(const Int128 *)v2, *(const Int128 *)v3);
+}
+
+void HELPER(gvec_vgfma8)(void *v1, const void *v2, const void *v3,
+ const void *v4, uint32_t d)
+{
+ Int128 r = do_gfm8(*(const Int128 *)v2, *(const Int128 *)v3);
+ *(Int128 *)v1 = int128_xor(r, *(Int128 *)v4);
+}
+
#define DEF_VGFM(BITS, TBITS) \
void HELPER(gvec_vgfm##BITS)(void *v1, const void *v2, const void *v3, \
uint32_t desc) \
@@ -220,7 +243,6 @@ void HELPER(gvec_vgfm##BITS)(void *v1, const void *v2, const void *v3, \
s390_vec_write_element##TBITS(v1, i, d); \
} \
}
-DEF_VGFM(8, 16)
DEF_VGFM(16, 32)
DEF_VGFM(32, 64)
@@ -257,7 +279,6 @@ void HELPER(gvec_vgfma##BITS)(void *v1, const void *v2, const void *v3, \
s390_vec_write_element##TBITS(v1, i, d); \
} \
}
-DEF_VGFMA(8, 16)
DEF_VGFMA(16, 32)
DEF_VGFMA(32, 64)
--
2.34.1
^ permalink raw reply related [flat|nested] 23+ messages in thread
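The shape of do_gfm8 is the whole of the s390x VGFM operation: each doubled-width lane of the result is the XOR of the carry-less products of an even/odd element pair, and computing all even products and all odd products as two Int128 values lets a single XOR finish the job. One 16-bit lane spelled out as a reference (hypothetical helper, for illustration only):

    #include <stdint.h>

    /* One halfword lane of vgfm8 (sketch): carry-less multiply the
     * even byte pair and the odd byte pair, XOR the widened products. */
    static uint16_t vgfm8_lane(uint8_t a_even, uint8_t b_even,
                               uint8_t a_odd, uint8_t b_odd)
    {
        uint16_t r = 0;

        for (int i = 0; i < 8; ++i) {
            if (a_even & (1u << i)) {
                r ^= (uint16_t)b_even << i;
            }
            if (a_odd & (1u << i)) {
                r ^= (uint16_t)b_odd << i;
            }
        }
        return r;
    }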
* [PATCH 04/18] target/ppc: Use clmul_8* routines
2023-07-13 21:14 [RFC PATCH for-8.2 00/18] crypto: Provide clmul.h and host accel Richard Henderson
` (2 preceding siblings ...)
2023-07-13 21:14 ` [PATCH 03/18] target/s390x: " Richard Henderson
@ 2023-07-13 21:14 ` Richard Henderson
2023-07-13 21:14 ` [PATCH 05/18] crypto: Add generic 16-bit carry-less multiply routines Richard Henderson
` (14 subsequent siblings)
18 siblings, 0 replies; 23+ messages in thread
From: Richard Henderson @ 2023-07-13 21:14 UTC (permalink / raw)
To: qemu-devel; +Cc: berrange, ardb
Use generic routines for 8-bit carry-less multiply.
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
target/ppc/int_helper.c | 11 ++++++++++-
1 file changed, 10 insertions(+), 1 deletion(-)
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index 834da80fe3..3bf0f5dbe5 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -26,6 +26,7 @@
#include "exec/helper-proto.h"
#include "crypto/aes.h"
#include "crypto/aes-round.h"
+#include "crypto/clmul.h"
#include "fpu/softfloat.h"
#include "qapi/error.h"
#include "qemu/guest-random.h"
@@ -1425,6 +1426,15 @@ void helper_vbpermq(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
#undef VBPERMQ_INDEX
#undef VBPERMQ_DW
+void helper_vpmsumb(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
+{
+ Int128 ia = a->s128;
+ Int128 ib = b->s128;
+ Int128 e = clmul_8x8_even(ia, ib);
+ Int128 o = clmul_8x8_odd(ia, ib);
+ r->s128 = int128_xor(e, o);
+}
+
#define PMSUM(name, srcfld, trgfld, trgtyp) \
void helper_##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) \
{ \
@@ -1445,7 +1455,6 @@ void helper_##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) \
} \
}
-PMSUM(vpmsumb, u8, u16, uint16_t)
PMSUM(vpmsumh, u16, u32, uint32_t)
PMSUM(vpmsumw, u32, u64, uint64_t)
--
2.34.1
^ permalink raw reply related [flat|nested] 23+ messages in thread
* [PATCH 05/18] crypto: Add generic 16-bit carry-less multiply routines
2023-07-13 21:14 [RFC PATCH for-8.2 00/18] crypto: Provide clmul.h and host accel Richard Henderson
` (3 preceding siblings ...)
2023-07-13 21:14 ` [PATCH 04/18] target/ppc: " Richard Henderson
@ 2023-07-13 21:14 ` Richard Henderson
2023-07-13 21:14 ` [PATCH 06/18] target/arm: Use clmul_16* routines Richard Henderson
` (13 subsequent siblings)
18 siblings, 0 replies; 23+ messages in thread
From: Richard Henderson @ 2023-07-13 21:14 UTC (permalink / raw)
To: qemu-devel; +Cc: berrange, ardb
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
host/include/generic/host/crypto/clmul.h | 5 +++
include/crypto/clmul.h | 32 +++++++++++++++++++
crypto/clmul.c | 39 ++++++++++++++++++++++++
3 files changed, 76 insertions(+)
diff --git a/host/include/generic/host/crypto/clmul.h b/host/include/generic/host/crypto/clmul.h
index 694705f703..cba8bbf3e4 100644
--- a/host/include/generic/host/crypto/clmul.h
+++ b/host/include/generic/host/crypto/clmul.h
@@ -14,4 +14,9 @@
#define clmul_8x8_odd clmul_8x8_odd_gen
#define clmul_8x8_packed clmul_8x8_packed_gen
+#define clmul_16x2_even clmul_16x2_even_gen
+#define clmul_16x2_odd clmul_16x2_odd_gen
+#define clmul_16x4_even clmul_16x4_even_gen
+#define clmul_16x4_odd clmul_16x4_odd_gen
+
#endif /* GENERIC_HOST_CRYPTO_CLMUL_H */
diff --git a/include/crypto/clmul.h b/include/crypto/clmul.h
index 7f19205d6f..b701bac9d6 100644
--- a/include/crypto/clmul.h
+++ b/include/crypto/clmul.h
@@ -56,6 +56,38 @@ Int128 clmul_8x8_odd_gen(Int128, Int128);
*/
Int128 clmul_8x8_packed_gen(uint64_t, uint64_t);
+/**
+ * clmul_16x2_even:
+ *
+ * Perform two 16x16->32 carry-less multiplies.
+ * The odd words of the inputs are ignored.
+ */
+uint64_t clmul_16x2_even_gen(uint64_t, uint64_t);
+
+/**
+ * clmul_16x2_odd:
+ *
+ * Perform two 16x16->32 carry-less multiplies.
+ * The even words of the inputs are ignored.
+ */
+uint64_t clmul_16x2_odd_gen(uint64_t, uint64_t);
+
+/**
+ * clmul_16x4_even:
+ *
+ * Perform four 16x16->32 carry-less multiplies.
+ * The odd words of the inputs are ignored.
+ */
+Int128 clmul_16x4_even_gen(Int128, Int128);
+
+/**
+ * clmul_16x4_odd:
+ *
+ * Perform four 16x16->32 carry-less multiplies.
+ * The even words of the inputs are ignored.
+ */
+Int128 clmul_16x4_odd_gen(Int128, Int128);
+
#include "host/crypto/clmul.h"
#endif /* CRYPTO_CLMUL_H */
diff --git a/crypto/clmul.c b/crypto/clmul.c
index 866704e751..69a3b6f7ff 100644
--- a/crypto/clmul.c
+++ b/crypto/clmul.c
@@ -74,3 +74,42 @@ Int128 clmul_8x8_packed_gen(uint64_t n, uint64_t m)
rh = clmul_8x4_even_gen(unpack_8_to_16(n >> 32), unpack_8_to_16(m >> 32));
return int128_make128(rl, rh);
}
+
+uint64_t clmul_16x2_even_gen(uint64_t n, uint64_t m)
+{
+ uint64_t r = 0;
+
+ n &= 0x0000ffff0000ffffull;
+ m &= 0x0000ffff0000ffffull;
+
+ for (int i = 0; i < 16; ++i) {
+ uint64_t mask = (n & 0x0000000100000001ull) * 0xffffffffull;
+ r ^= m & mask;
+ n >>= 1;
+ m <<= 1;
+ }
+ return r;
+}
+
+uint64_t clmul_16x2_odd_gen(uint64_t n, uint64_t m)
+{
+ return clmul_16x2_even_gen(n >> 16, m >> 16);
+}
+
+Int128 clmul_16x4_even_gen(Int128 n, Int128 m)
+{
+ uint64_t rl, rh;
+
+ rl = clmul_16x2_even_gen(int128_getlo(n), int128_getlo(m));
+ rh = clmul_16x2_even_gen(int128_gethi(n), int128_gethi(m));
+ return int128_make128(rl, rh);
+}
+
+Int128 clmul_16x4_odd_gen(Int128 n, Int128 m)
+{
+ uint64_t rl, rh;
+
+ rl = clmul_16x2_odd_gen(int128_getlo(n), int128_getlo(m));
+ rh = clmul_16x2_odd_gen(int128_gethi(n), int128_gethi(m));
+ return int128_make128(rl, rh);
+}
--
2.34.1
^ permalink raw reply related [flat|nested] 23+ messages in thread
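As with the 8-bit version, the 16-bit loop is bit-sliced: the 0x0000000100000001 mask picks up bit i of both even halfword lanes at once, and the multiply by 0xffffffff widens each picked bit into a full 32-bit lane mask. A hypothetical check (not part of the series) of one lane against a scalar reference; a 16x16 product needs at most 31 bits, so it fits its 32-bit lane exactly:

    #include <stdint.h>

    /* Scalar 16x16->32 carry-less reference for one lane. */
    static uint32_t clmul16_ref(uint16_t n, uint16_t m)
    {
        uint32_t r = 0;

        for (int i = 0; i < 16; ++i) {
            if (n & (1u << i)) {
                r ^= (uint32_t)m << i;
            }
        }
        return r;
    }

    /* Usage, assuming a link against crypto/clmul.c:
     *   uint64_t r = clmul_16x2_even_gen(n, m);
     *   assert((uint32_t)r == clmul16_ref(n, m));
     *   assert((uint32_t)(r >> 32) == clmul16_ref(n >> 32, m >> 32));
     */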
* [PATCH 06/18] target/arm: Use clmul_16* routines
2023-07-13 21:14 [RFC PATCH for-8.2 00/18] crypto: Provide clmul.h and host accel Richard Henderson
` (4 preceding siblings ...)
2023-07-13 21:14 ` [PATCH 05/18] crypto: Add generic 16-bit carry-less multiply routines Richard Henderson
@ 2023-07-13 21:14 ` Richard Henderson
2023-07-13 21:14 ` [PATCH 07/18] target/s390x: " Richard Henderson
` (12 subsequent siblings)
18 siblings, 0 replies; 23+ messages in thread
From: Richard Henderson @ 2023-07-13 21:14 UTC (permalink / raw)
To: qemu-devel; +Cc: berrange, ardb
Use generic routines for 16-bit carry-less multiply.
Remove our local version of pmull_w.
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
target/arm/tcg/vec_internal.h | 6 ------
target/arm/tcg/mve_helper.c | 8 ++------
target/arm/tcg/vec_helper.c | 13 -------------
3 files changed, 2 insertions(+), 25 deletions(-)
diff --git a/target/arm/tcg/vec_internal.h b/target/arm/tcg/vec_internal.h
index c4afba6d9f..3ca1b94ccf 100644
--- a/target/arm/tcg/vec_internal.h
+++ b/target/arm/tcg/vec_internal.h
@@ -219,12 +219,6 @@ int16_t do_sqrdmlah_h(int16_t, int16_t, int16_t, bool, bool, uint32_t *);
int32_t do_sqrdmlah_s(int32_t, int32_t, int32_t, bool, bool, uint32_t *);
int64_t do_sqrdmlah_d(int64_t, int64_t, int64_t, bool, bool);
-/*
- * 16 x 16 -> 32 vector polynomial multiply where the inputs are
- * in the low 16 bits of each 32-bit element
- */
-uint64_t pmull_w(uint64_t op1, uint64_t op2);
-
/**
* bfdotadd:
* @sum: addend
diff --git a/target/arm/tcg/mve_helper.c b/target/arm/tcg/mve_helper.c
index 96ddfb4b3a..c666a96ba1 100644
--- a/target/arm/tcg/mve_helper.c
+++ b/target/arm/tcg/mve_helper.c
@@ -985,14 +985,10 @@ DO_2OP_L(vmulltuw, 1, 4, uint32_t, 8, uint64_t, DO_MUL)
* Polynomial multiply. We can always do this generating 64 bits
* of the result at a time, so we don't need to use DO_2OP_L.
*/
-#define VMULLPW_MASK 0x0000ffff0000ffffULL
-#define DO_VMULLPBW(N, M) pmull_w((N) & VMULLPW_MASK, (M) & VMULLPW_MASK)
-#define DO_VMULLPTW(N, M) DO_VMULLPBW((N) >> 16, (M) >> 16)
-
DO_2OP(vmullpbh, 8, uint64_t, clmul_8x4_even)
DO_2OP(vmullpth, 8, uint64_t, clmul_8x4_odd)
-DO_2OP(vmullpbw, 8, uint64_t, DO_VMULLPBW)
-DO_2OP(vmullptw, 8, uint64_t, DO_VMULLPTW)
+DO_2OP(vmullpbw, 8, uint64_t, clmul_16x2_even)
+DO_2OP(vmullptw, 8, uint64_t, clmul_16x2_odd)
/*
* Because the computation type is at least twice as large as required,
diff --git a/target/arm/tcg/vec_helper.c b/target/arm/tcg/vec_helper.c
index 4384b6c188..1b1d5fccbc 100644
--- a/target/arm/tcg/vec_helper.c
+++ b/target/arm/tcg/vec_helper.c
@@ -2029,19 +2029,6 @@ void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
clear_tail(d, opr_sz, simd_maxsz(desc));
}
-uint64_t pmull_w(uint64_t op1, uint64_t op2)
-{
- uint64_t result = 0;
- int i;
- for (i = 0; i < 16; ++i) {
- uint64_t mask = (op1 & 0x0000000100000001ull) * 0xffffffff;
- result ^= op2 & mask;
- op1 >>= 1;
- op2 <<= 1;
- }
- return result;
-}
-
void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
int hi = simd_data(desc);
--
2.34.1
^ permalink raw reply related [flat|nested] 23+ messages in thread
* [PATCH 07/18] target/s390x: Use clmul_16* routines
2023-07-13 21:14 [RFC PATCH for-8.2 00/18] crypto: Provide clmul.h and host accel Richard Henderson
` (5 preceding siblings ...)
2023-07-13 21:14 ` [PATCH 06/18] target/arm: Use clmul_16* routines Richard Henderson
@ 2023-07-13 21:14 ` Richard Henderson
2023-07-13 21:14 ` [PATCH 08/18] target/ppc: " Richard Henderson
` (11 subsequent siblings)
18 siblings, 0 replies; 23+ messages in thread
From: Richard Henderson @ 2023-07-13 21:14 UTC (permalink / raw)
To: qemu-devel; +Cc: berrange, ardb
Use generic routines for 16-bit carry-less multiply.
Remove our local version of galois_multiply16.
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
target/s390x/tcg/vec_int_helper.c | 22 +++++++++++++++++++---
1 file changed, 19 insertions(+), 3 deletions(-)
diff --git a/target/s390x/tcg/vec_int_helper.c b/target/s390x/tcg/vec_int_helper.c
index e110a7581a..523d6375bb 100644
--- a/target/s390x/tcg/vec_int_helper.c
+++ b/target/s390x/tcg/vec_int_helper.c
@@ -180,7 +180,6 @@ static uint##TBITS##_t galois_multiply##BITS(uint##TBITS##_t a, \
} \
return res; \
}
-DEF_GALOIS_MULTIPLY(16, 32)
DEF_GALOIS_MULTIPLY(32, 64)
static S390Vector galois_multiply64(uint64_t a, uint64_t b)
@@ -226,6 +225,25 @@ void HELPER(gvec_vgfma8)(void *v1, const void *v2, const void *v3,
*(Int128 *)v1 = int128_xor(r, *(Int128 *)v4);
}
+static Int128 do_gfm16(Int128 n, Int128 m)
+{
+ Int128 e = clmul_16x4_even(n, m);
+ Int128 o = clmul_16x4_odd(n, m);
+ return int128_xor(e, o);
+}
+
+void HELPER(gvec_vgfm16)(void *v1, const void *v2, const void *v3, uint32_t d)
+{
+ *(Int128 *)v1 = do_gfm16(*(const Int128 *)v2, *(const Int128 *)v3);
+}
+
+void HELPER(gvec_vgfma16)(void *v1, const void *v2, const void *v3,
+ const void *v4, uint32_t d)
+{
+ Int128 r = do_gfm16(*(const Int128 *)v2, *(const Int128 *)v3);
+ *(Int128 *)v1 = int128_xor(r, *(Int128 *)v4);
+}
+
#define DEF_VGFM(BITS, TBITS) \
void HELPER(gvec_vgfm##BITS)(void *v1, const void *v2, const void *v3, \
uint32_t desc) \
@@ -243,7 +261,6 @@ void HELPER(gvec_vgfm##BITS)(void *v1, const void *v2, const void *v3, \
s390_vec_write_element##TBITS(v1, i, d); \
} \
}
-DEF_VGFM(16, 32)
DEF_VGFM(32, 64)
void HELPER(gvec_vgfm64)(void *v1, const void *v2, const void *v3,
@@ -279,7 +296,6 @@ void HELPER(gvec_vgfma##BITS)(void *v1, const void *v2, const void *v3, \
s390_vec_write_element##TBITS(v1, i, d); \
} \
}
-DEF_VGFMA(16, 32)
DEF_VGFMA(32, 64)
void HELPER(gvec_vgfma64)(void *v1, const void *v2, const void *v3,
--
2.34.1
^ permalink raw reply related [flat|nested] 23+ messages in thread
* [PATCH 08/18] target/ppc: Use clmul_16* routines
2023-07-13 21:14 [RFC PATCH for-8.2 00/18] crypto: Provide clmul.h and host accel Richard Henderson
` (6 preceding siblings ...)
2023-07-13 21:14 ` [PATCH 07/18] target/s390x: " Richard Henderson
@ 2023-07-13 21:14 ` Richard Henderson
2023-07-13 21:14 ` [PATCH 09/18] crypto: Add generic 32-bit carry-less multiply routines Richard Henderson
` (10 subsequent siblings)
18 siblings, 0 replies; 23+ messages in thread
From: Richard Henderson @ 2023-07-13 21:14 UTC (permalink / raw)
To: qemu-devel; +Cc: berrange, ardb
Use generic routines for 16-bit carry-less multiply.
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
target/ppc/int_helper.c | 10 +++++++++-
1 file changed, 9 insertions(+), 1 deletion(-)
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index 3bf0f5dbe5..98d6310f59 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -1435,6 +1435,15 @@ void helper_vpmsumb(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
r->s128 = int128_xor(e, o);
}
+void helper_vpmsumh(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
+{
+ Int128 ia = a->s128;
+ Int128 ib = b->s128;
+ Int128 e = clmul_16x4_even(ia, ib);
+ Int128 o = clmul_16x4_odd(ia, ib);
+ r->s128 = int128_xor(e, o);
+}
+
#define PMSUM(name, srcfld, trgfld, trgtyp) \
void helper_##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) \
{ \
@@ -1455,7 +1464,6 @@ void helper_##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) \
} \
}
-PMSUM(vpmsumh, u16, u32, uint32_t)
PMSUM(vpmsumw, u32, u64, uint64_t)
void helper_VPMSUMD(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
--
2.34.1
^ permalink raw reply related [flat|nested] 23+ messages in thread
* [PATCH 09/18] crypto: Add generic 32-bit carry-less multiply routines
2023-07-13 21:14 [RFC PATCH for-8.2 00/18] crypto: Provide clmul.h and host accel Richard Henderson
` (7 preceding siblings ...)
2023-07-13 21:14 ` [PATCH 08/18] target/ppc: " Richard Henderson
@ 2023-07-13 21:14 ` Richard Henderson
2023-07-13 21:14 ` [PATCH 10/18] target/arm: Use clmul_32* routines Richard Henderson
` (9 subsequent siblings)
18 siblings, 0 replies; 23+ messages in thread
From: Richard Henderson @ 2023-07-13 21:14 UTC (permalink / raw)
To: qemu-devel; +Cc: berrange, ardb
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
host/include/generic/host/crypto/clmul.h | 4 +++
include/crypto/clmul.h | 23 ++++++++++++++++++
crypto/clmul.c | 31 ++++++++++++++++++++++++
3 files changed, 58 insertions(+)
diff --git a/host/include/generic/host/crypto/clmul.h b/host/include/generic/host/crypto/clmul.h
index cba8bbf3e4..3fbb1576cf 100644
--- a/host/include/generic/host/crypto/clmul.h
+++ b/host/include/generic/host/crypto/clmul.h
@@ -19,4 +19,8 @@
#define clmul_16x4_even clmul_16x4_even_gen
#define clmul_16x4_odd clmul_16x4_odd_gen
+#define clmul_32 clmul_32_gen
+#define clmul_32x2_even clmul_32x2_even_gen
+#define clmul_32x2_odd clmul_32x2_odd_gen
+
#endif /* GENERIC_HOST_CRYPTO_CLMUL_H */
diff --git a/include/crypto/clmul.h b/include/crypto/clmul.h
index b701bac9d6..ce43c9aeb1 100644
--- a/include/crypto/clmul.h
+++ b/include/crypto/clmul.h
@@ -88,6 +88,29 @@ Int128 clmul_16x4_even_gen(Int128, Int128);
*/
Int128 clmul_16x4_odd_gen(Int128, Int128);
+/**
+ * clmul_32:
+ *
+ * Perform a 32x32->64 carry-less multiply.
+ */
+uint64_t clmul_32_gen(uint32_t, uint32_t);
+
+/**
+ * clmul_32x2_even:
+ *
+ * Perform two 32x32->64 carry-less multiplies.
+ * The odd words of the inputs are ignored.
+ */
+Int128 clmul_32x2_even_gen(Int128, Int128);
+
+/**
+ * clmul_32x2_odd:
+ *
+ * Perform two 32x32->64 carry-less multiplies.
+ * The even words of the inputs are ignored.
+ */
+Int128 clmul_32x2_odd_gen(Int128, Int128);
+
#include "host/crypto/clmul.h"
#endif /* CRYPTO_CLMUL_H */
diff --git a/crypto/clmul.c b/crypto/clmul.c
index 69a3b6f7ff..c197cd5f21 100644
--- a/crypto/clmul.c
+++ b/crypto/clmul.c
@@ -113,3 +113,34 @@ Int128 clmul_16x4_odd_gen(Int128 n, Int128 m)
rh = clmul_16x2_odd_gen(int128_gethi(n), int128_gethi(m));
return int128_make128(rl, rh);
}
+
+uint64_t clmul_32_gen(uint32_t n, uint32_t m32)
+{
+ uint64_t r = 0;
+ uint64_t m = m32;
+
+ for (int i = 0; i < 32; ++i) {
+ r ^= n & 1 ? m : 0;
+ n >>= 1;
+ m <<= 1;
+ }
+ return r;
+}
+
+Int128 clmul_32x2_even_gen(Int128 n, Int128 m)
+{
+ uint64_t rl, rh;
+
+ rl = clmul_32_gen(int128_getlo(n), int128_getlo(m));
+ rh = clmul_32_gen(int128_gethi(n), int128_gethi(m));
+ return int128_make128(rl, rh);
+}
+
+Int128 clmul_32x2_odd_gen(Int128 n, Int128 m)
+{
+ uint64_t rl, rh;
+
+ rl = clmul_32_gen(int128_getlo(n) >> 32, int128_getlo(m) >> 32);
+ rh = clmul_32_gen(int128_gethi(n) >> 32, int128_gethi(m) >> 32);
+ return int128_make128(rl, rh);
+}
--
2.34.1
^ permalink raw reply related [flat|nested] 23+ messages in thread
* [PATCH 10/18] target/arm: Use clmul_32* routines
2023-07-13 21:14 [RFC PATCH for-8.2 00/18] crypto: Provide clmul.h and host accel Richard Henderson
` (8 preceding siblings ...)
2023-07-13 21:14 ` [PATCH 09/18] crypto: Add generic 32-bit carry-less multiply routines Richard Henderson
@ 2023-07-13 21:14 ` Richard Henderson
2023-07-13 21:14 ` [PATCH 11/18] target/s390x: " Richard Henderson
` (8 subsequent siblings)
18 siblings, 0 replies; 23+ messages in thread
From: Richard Henderson @ 2023-07-13 21:14 UTC (permalink / raw)
To: qemu-devel; +Cc: berrange, ardb
Use generic routines for 32-bit carry-less multiply.
Remove our local version of pmull_d.
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
target/arm/tcg/vec_helper.c | 14 +-------------
1 file changed, 1 insertion(+), 13 deletions(-)
diff --git a/target/arm/tcg/vec_helper.c b/target/arm/tcg/vec_helper.c
index 1b1d5fccbc..c81447e674 100644
--- a/target/arm/tcg/vec_helper.c
+++ b/target/arm/tcg/vec_helper.c
@@ -2057,18 +2057,6 @@ void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
}
}
-static uint64_t pmull_d(uint64_t op1, uint64_t op2)
-{
- uint64_t result = 0;
- int i;
-
- for (i = 0; i < 32; ++i) {
- uint64_t mask = -((op1 >> i) & 1);
- result ^= (op2 << i) & mask;
- }
- return result;
-}
-
void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
intptr_t sel = H4(simd_data(desc));
@@ -2077,7 +2065,7 @@ void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc)
uint64_t *d = vd;
for (i = 0; i < opr_sz / 8; ++i) {
- d[i] = pmull_d(n[2 * i + sel], m[2 * i + sel]);
+ d[i] = clmul_32(n[2 * i + sel], m[2 * i + sel]);
}
}
#endif
--
2.34.1
^ permalink raw reply related [flat|nested] 23+ messages in thread
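The removed pmull_d used the branchless spelling of the same bit loop that clmul_32_gen writes with a conditional; the two idioms are interchangeable, and compilers generally produce similar code for both (sketch of one step each):

    #include <stdint.h>

    /* "XOR in (m << i) if bit i of n is set", two ways. */
    static uint64_t step_mask(uint64_t r, uint64_t n, uint64_t m, int i)
    {
        uint64_t mask = -((n >> i) & 1);    /* all-ones or all-zeros */
        return r ^ ((m << i) & mask);
    }

    static uint64_t step_cond(uint64_t r, uint64_t n, uint64_t m, int i)
    {
        return ((n >> i) & 1) ? r ^ (m << i) : r;
    }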
* [PATCH 11/18] target/s390x: Use clmul_32* routines
2023-07-13 21:14 [RFC PATCH for-8.2 00/18] crypto: Provide clmul.h and host accel Richard Henderson
` (9 preceding siblings ...)
2023-07-13 21:14 ` [PATCH 10/18] target/arm: Use clmul_32* routines Richard Henderson
@ 2023-07-13 21:14 ` Richard Henderson
2023-07-13 21:14 ` [PATCH 12/18] target/ppc: " Richard Henderson
` (7 subsequent siblings)
18 siblings, 0 replies; 23+ messages in thread
From: Richard Henderson @ 2023-07-13 21:14 UTC (permalink / raw)
To: qemu-devel; +Cc: berrange, ardb
Use generic routines for 32-bit carry-less multiply.
Remove our local version of galois_multiply32.
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
target/s390x/tcg/vec_int_helper.c | 70 ++++++++-----------------------
1 file changed, 17 insertions(+), 53 deletions(-)
diff --git a/target/s390x/tcg/vec_int_helper.c b/target/s390x/tcg/vec_int_helper.c
index 523d6375bb..f5eea2330a 100644
--- a/target/s390x/tcg/vec_int_helper.c
+++ b/target/s390x/tcg/vec_int_helper.c
@@ -165,22 +165,6 @@ DEF_VCTZ(8)
DEF_VCTZ(16)
/* like binary multiplication, but XOR instead of addition */
-#define DEF_GALOIS_MULTIPLY(BITS, TBITS) \
-static uint##TBITS##_t galois_multiply##BITS(uint##TBITS##_t a, \
- uint##TBITS##_t b) \
-{ \
- uint##TBITS##_t res = 0; \
- \
- while (b) { \
- if (b & 0x1) { \
- res = res ^ a; \
- } \
- a = a << 1; \
- b = b >> 1; \
- } \
- return res; \
-}
-DEF_GALOIS_MULTIPLY(32, 64)
static S390Vector galois_multiply64(uint64_t a, uint64_t b)
{
@@ -244,24 +228,24 @@ void HELPER(gvec_vgfma16)(void *v1, const void *v2, const void *v3,
*(Int128 *)v1 = int128_xor(r, *(Int128 *)v4);
}
-#define DEF_VGFM(BITS, TBITS) \
-void HELPER(gvec_vgfm##BITS)(void *v1, const void *v2, const void *v3, \
- uint32_t desc) \
-{ \
- int i; \
- \
- for (i = 0; i < (128 / TBITS); i++) { \
- uint##BITS##_t a = s390_vec_read_element##BITS(v2, i * 2); \
- uint##BITS##_t b = s390_vec_read_element##BITS(v3, i * 2); \
- uint##TBITS##_t d = galois_multiply##BITS(a, b); \
- \
- a = s390_vec_read_element##BITS(v2, i * 2 + 1); \
- b = s390_vec_read_element##BITS(v3, i * 2 + 1); \
- d = d ^ galois_multiply32(a, b); \
- s390_vec_write_element##TBITS(v1, i, d); \
- } \
+static Int128 do_gfm32(Int128 n, Int128 m)
+{
+ Int128 e = clmul_32x2_even(n, m);
+ Int128 o = clmul_32x2_odd(n, m);
+ return int128_xor(e, o);
+}
+
+void HELPER(gvec_vgfm32)(void *v1, const void *v2, const void *v3, uint32_t d)
+{
+ *(Int128 *)v1 = do_gfm32(*(const Int128 *)v2, *(const Int128 *)v3);
+}
+
+void HELPER(gvec_vgfma32)(void *v1, const void *v2, const void *v3,
+ const void *v4, uint32_t d)
+{
+ Int128 r = do_gfm32(*(const Int128 *)v2, *(const Int128 *)v3);
+ *(Int128 *)v1 = int128_xor(r, *(Int128 *)v4);
}
-DEF_VGFM(32, 64)
void HELPER(gvec_vgfm64)(void *v1, const void *v2, const void *v3,
uint32_t desc)
@@ -278,26 +262,6 @@ void HELPER(gvec_vgfm64)(void *v1, const void *v2, const void *v3,
s390_vec_xor(v1, &tmp1, &tmp2);
}
-#define DEF_VGFMA(BITS, TBITS) \
-void HELPER(gvec_vgfma##BITS)(void *v1, const void *v2, const void *v3, \
- const void *v4, uint32_t desc) \
-{ \
- int i; \
- \
- for (i = 0; i < (128 / TBITS); i++) { \
- uint##BITS##_t a = s390_vec_read_element##BITS(v2, i * 2); \
- uint##BITS##_t b = s390_vec_read_element##BITS(v3, i * 2); \
- uint##TBITS##_t d = galois_multiply##BITS(a, b); \
- \
- a = s390_vec_read_element##BITS(v2, i * 2 + 1); \
- b = s390_vec_read_element##BITS(v3, i * 2 + 1); \
- d = d ^ galois_multiply32(a, b); \
- d = d ^ s390_vec_read_element##TBITS(v4, i); \
- s390_vec_write_element##TBITS(v1, i, d); \
- } \
-}
-DEF_VGFMA(32, 64)
-
void HELPER(gvec_vgfma64)(void *v1, const void *v2, const void *v3,
const void *v4, uint32_t desc)
{
--
2.34.1
^ permalink raw reply related [flat|nested] 23+ messages in thread
* [PATCH 12/18] target/ppc: Use clmul_32* routines
2023-07-13 21:14 [RFC PATCH for-8.2 00/18] crypto: Provide clmul.h and host accel Richard Henderson
` (10 preceding siblings ...)
2023-07-13 21:14 ` [PATCH 11/18] target/s390x: " Richard Henderson
@ 2023-07-13 21:14 ` Richard Henderson
2023-07-13 21:14 ` [PATCH 13/18] crypto: Add generic 64-bit carry-less multiply routine Richard Henderson
` (6 subsequent siblings)
18 siblings, 0 replies; 23+ messages in thread
From: Richard Henderson @ 2023-07-13 21:14 UTC (permalink / raw)
To: qemu-devel; +Cc: berrange, ardb
Use generic routines for 32-bit carry-less multiply.
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
target/ppc/int_helper.c | 27 +++++++--------------------
1 file changed, 7 insertions(+), 20 deletions(-)
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index 98d6310f59..828f04bce7 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -1444,28 +1444,15 @@ void helper_vpmsumh(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
r->s128 = int128_xor(e, o);
}
-#define PMSUM(name, srcfld, trgfld, trgtyp) \
-void helper_##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) \
-{ \
- int i, j; \
- trgtyp prod[sizeof(ppc_avr_t) / sizeof(a->srcfld[0])]; \
- \
- VECTOR_FOR_INORDER_I(i, srcfld) { \
- prod[i] = 0; \
- for (j = 0; j < sizeof(a->srcfld[0]) * 8; j++) { \
- if (a->srcfld[i] & (1ull << j)) { \
- prod[i] ^= ((trgtyp)b->srcfld[i] << j); \
- } \
- } \
- } \
- \
- VECTOR_FOR_INORDER_I(i, trgfld) { \
- r->trgfld[i] = prod[2 * i] ^ prod[2 * i + 1]; \
- } \
+void helper_vpmsumw(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
+{
+ Int128 ia = a->s128;
+ Int128 ib = b->s128;
+ Int128 e = clmul_32x2_even(ia, ib);
+ Int128 o = clmul_32x2_odd(ia, ib);
+ r->s128 = int128_xor(e, o);
}
-PMSUM(vpmsumw, u32, u64, uint64_t)
-
void helper_VPMSUMD(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
{
int i, j;
--
2.34.1
^ permalink raw reply related [flat|nested] 23+ messages in thread
* [PATCH 13/18] crypto: Add generic 64-bit carry-less multiply routine
2023-07-13 21:14 [RFC PATCH for-8.2 00/18] crypto: Provide clmul.h and host accel Richard Henderson
` (11 preceding siblings ...)
2023-07-13 21:14 ` [PATCH 12/18] target/ppc: " Richard Henderson
@ 2023-07-13 21:14 ` Richard Henderson
2023-07-13 21:14 ` [PATCH 14/18] target/arm: Use clmul_64 Richard Henderson
` (5 subsequent siblings)
18 siblings, 0 replies; 23+ messages in thread
From: Richard Henderson @ 2023-07-13 21:14 UTC (permalink / raw)
To: qemu-devel; +Cc: berrange, ardb
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
host/include/generic/host/crypto/clmul.h | 2 ++
include/crypto/clmul.h | 7 +++++++
crypto/clmul.c | 17 +++++++++++++++++
3 files changed, 26 insertions(+)
diff --git a/host/include/generic/host/crypto/clmul.h b/host/include/generic/host/crypto/clmul.h
index 3fbb1576cf..7f70afeb57 100644
--- a/host/include/generic/host/crypto/clmul.h
+++ b/host/include/generic/host/crypto/clmul.h
@@ -23,4 +23,6 @@
#define clmul_32x2_even clmul_32x2_even_gen
#define clmul_32x2_odd clmul_32x2_odd_gen
+#define clmul_64 clmul_64_gen
+
#endif /* GENERIC_HOST_CRYPTO_CLMUL_H */
diff --git a/include/crypto/clmul.h b/include/crypto/clmul.h
index ce43c9aeb1..8b4c263459 100644
--- a/include/crypto/clmul.h
+++ b/include/crypto/clmul.h
@@ -111,6 +111,13 @@ Int128 clmul_32x2_even_gen(Int128, Int128);
*/
Int128 clmul_32x2_odd_gen(Int128, Int128);
+/**
+ * clmul_64:
+ *
+ * Perform a 64x64->128 carry-less multiply.
+ */
+Int128 clmul_64_gen(uint64_t, uint64_t);
+
#include "host/crypto/clmul.h"
#endif /* CRYPTO_CLMUL_H */
diff --git a/crypto/clmul.c b/crypto/clmul.c
index c197cd5f21..0be06073f0 100644
--- a/crypto/clmul.c
+++ b/crypto/clmul.c
@@ -144,3 +144,20 @@ Int128 clmul_32x2_odd_gen(Int128 n, Int128 m)
rh = clmul_32_gen(int128_gethi(n) >> 32, int128_gethi(m) >> 32);
return int128_make128(rl, rh);
}
+
+Int128 clmul_64_gen(uint64_t n, uint64_t m)
+{
+ uint64_t rl = 0, rh = 0;
+
+ /* Bit 0 can only influence the low 64-bit result. */
+ if (n & 1) {
+ rl = m;
+ }
+
+ for (int i = 1; i < 64; ++i) {
+ uint64_t mask = -((n >> i) & 1);
+ rl ^= (m << i) & mask;
+ rh ^= (m >> (64 - i)) & mask;
+ }
+ return int128_make128(rl, rh);
+}
--
2.34.1
^ permalink raw reply related [flat|nested] 23+ messages in thread
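The rl/rh split in clmul_64_gen is a 128-bit left shift in disguise: bit i of n contributes the 128-bit value (m << i), whose low and high halves are (m << i) and (m >> (64 - i)). Bit 0 is peeled off because m >> 64 would be undefined behavior in C. One loop step restated with a native 128-bit type (sketch, assuming a compiler with unsigned __int128):

    #include <stdint.h>

    /* One iteration of the clmul_64_gen loop, for 0 <= i < 64. */
    static void clmul64_step(uint64_t m, int i, uint64_t *rl, uint64_t *rh)
    {
        unsigned __int128 t = (unsigned __int128)m << i;

        *rl ^= (uint64_t)t;
        *rh ^= (uint64_t)(t >> 64);
    }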
* [PATCH 14/18] target/arm: Use clmul_64
2023-07-13 21:14 [RFC PATCH for-8.2 00/18] crypto: Provide clmul.h and host accel Richard Henderson
` (12 preceding siblings ...)
2023-07-13 21:14 ` [PATCH 13/18] crypto: Add generic 64-bit carry-less multiply routine Richard Henderson
@ 2023-07-13 21:14 ` Richard Henderson
2023-07-13 21:14 ` [PATCH 15/18] target/s390x: " Richard Henderson
` (4 subsequent siblings)
18 siblings, 0 replies; 23+ messages in thread
From: Richard Henderson @ 2023-07-13 21:14 UTC (permalink / raw)
To: qemu-devel; +Cc: berrange, ardb
Use generic routine for 64-bit carry-less multiply.
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
target/arm/tcg/vec_helper.c | 22 ++++------------------
1 file changed, 4 insertions(+), 18 deletions(-)
diff --git a/target/arm/tcg/vec_helper.c b/target/arm/tcg/vec_helper.c
index c81447e674..1a21aff4d9 100644
--- a/target/arm/tcg/vec_helper.c
+++ b/target/arm/tcg/vec_helper.c
@@ -2003,28 +2003,14 @@ void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
*/
void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
{
- intptr_t i, j, opr_sz = simd_oprsz(desc);
+ intptr_t i, opr_sz = simd_oprsz(desc);
intptr_t hi = simd_data(desc);
uint64_t *d = vd, *n = vn, *m = vm;
for (i = 0; i < opr_sz / 8; i += 2) {
- uint64_t nn = n[i + hi];
- uint64_t mm = m[i + hi];
- uint64_t rhi = 0;
- uint64_t rlo = 0;
-
- /* Bit 0 can only influence the low 64-bit result. */
- if (nn & 1) {
- rlo = mm;
- }
-
- for (j = 1; j < 64; ++j) {
- uint64_t mask = -((nn >> j) & 1);
- rlo ^= (mm << j) & mask;
- rhi ^= (mm >> (64 - j)) & mask;
- }
- d[i] = rlo;
- d[i + 1] = rhi;
+ Int128 r = clmul_64(n[i + hi], m[i + hi]);
+ d[i] = int128_getlo(r);
+ d[i + 1] = int128_gethi(r);
}
clear_tail(d, opr_sz, simd_maxsz(desc));
}
--
2.34.1
^ permalink raw reply related [flat|nested] 23+ messages in thread
* [PATCH 15/18] target/s390x: Use clmul_64
2023-07-13 21:14 [RFC PATCH for-8.2 00/18] crypto: Provide clmul.h and host accel Richard Henderson
` (13 preceding siblings ...)
2023-07-13 21:14 ` [PATCH 14/18] target/arm: Use clmul_64 Richard Henderson
@ 2023-07-13 21:14 ` Richard Henderson
2023-07-13 21:14 ` [PATCH 16/18] target/ppc: " Richard Henderson
` (3 subsequent siblings)
18 siblings, 0 replies; 23+ messages in thread
From: Richard Henderson @ 2023-07-13 21:14 UTC (permalink / raw)
To: qemu-devel; +Cc: berrange, ardb
Use the generic routine for 64-bit carry-less multiply.
Remove our local version of galois_multiply64.
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
target/s390x/tcg/vec_int_helper.c | 62 +++++++------------------------
1 file changed, 14 insertions(+), 48 deletions(-)
diff --git a/target/s390x/tcg/vec_int_helper.c b/target/s390x/tcg/vec_int_helper.c
index f5eea2330a..002ba67b11 100644
--- a/target/s390x/tcg/vec_int_helper.c
+++ b/target/s390x/tcg/vec_int_helper.c
@@ -21,13 +21,6 @@ static bool s390_vec_is_zero(const S390Vector *v)
return !v->doubleword[0] && !v->doubleword[1];
}
-static void s390_vec_xor(S390Vector *res, const S390Vector *a,
- const S390Vector *b)
-{
- res->doubleword[0] = a->doubleword[0] ^ b->doubleword[0];
- res->doubleword[1] = a->doubleword[1] ^ b->doubleword[1];
-}
-
static void s390_vec_and(S390Vector *res, const S390Vector *a,
const S390Vector *b)
{
@@ -166,26 +159,6 @@ DEF_VCTZ(16)
/* like binary multiplication, but XOR instead of addition */
-static S390Vector galois_multiply64(uint64_t a, uint64_t b)
-{
- S390Vector res = {};
- S390Vector va = {
- .doubleword[1] = a,
- };
- S390Vector vb = {
- .doubleword[1] = b,
- };
-
- while (!s390_vec_is_zero(&vb)) {
- if (vb.doubleword[1] & 0x1) {
- s390_vec_xor(&res, &res, &va);
- }
- s390_vec_shl(&va, &va, 1);
- s390_vec_shr(&vb, &vb, 1);
- }
- return res;
-}
-
static Int128 do_gfm8(Int128 n, Int128 m)
{
Int128 e = clmul_8x8_even(n, m);
@@ -247,35 +220,28 @@ void HELPER(gvec_vgfma32)(void *v1, const void *v2, const void *v3,
*(Int128 *)v1 = int128_xor(r, *(Int128 *)v4);
}
+static Int128 do_gfm64(Int128 n, Int128 m)
+{
+ /*
+ * The two 64-bit halves are treated identically,
+ * therefore host ordering does not matter.
+ */
+ Int128 e = clmul_64(int128_getlo(n), int128_getlo(m));
+ Int128 o = clmul_64(int128_gethi(n), int128_gethi(m));
+ return int128_xor(e, o);
+}
+
void HELPER(gvec_vgfm64)(void *v1, const void *v2, const void *v3,
uint32_t desc)
{
- S390Vector tmp1, tmp2;
- uint64_t a, b;
-
- a = s390_vec_read_element64(v2, 0);
- b = s390_vec_read_element64(v3, 0);
- tmp1 = galois_multiply64(a, b);
- a = s390_vec_read_element64(v2, 1);
- b = s390_vec_read_element64(v3, 1);
- tmp2 = galois_multiply64(a, b);
- s390_vec_xor(v1, &tmp1, &tmp2);
+ *(Int128 *)v1 = do_gfm64(*(const Int128 *)v2, *(const Int128 *)v3);
}
void HELPER(gvec_vgfma64)(void *v1, const void *v2, const void *v3,
const void *v4, uint32_t desc)
{
- S390Vector tmp1, tmp2;
- uint64_t a, b;
-
- a = s390_vec_read_element64(v2, 0);
- b = s390_vec_read_element64(v3, 0);
- tmp1 = galois_multiply64(a, b);
- a = s390_vec_read_element64(v2, 1);
- b = s390_vec_read_element64(v3, 1);
- tmp2 = galois_multiply64(a, b);
- s390_vec_xor(&tmp1, &tmp1, &tmp2);
- s390_vec_xor(v1, &tmp1, v4);
+ Int128 r = do_gfm64(*(const Int128 *)v2, *(const Int128 *)v3);
+ *(Int128 *)v1 = int128_xor(r, *(Int128 *)v4);
}
#define DEF_VMAL(BITS) \
--
2.34.1
^ permalink raw reply related [flat|nested] 23+ messages in thread
* [PATCH 16/18] target/ppc: Use clmul_64
2023-07-13 21:14 [RFC PATCH for-8.2 00/18] crypto: Provide clmul.h and host accel Richard Henderson
` (14 preceding siblings ...)
2023-07-13 21:14 ` [PATCH 15/18] target/s390x: " Richard Henderson
@ 2023-07-13 21:14 ` Richard Henderson
2023-07-13 21:14 ` [PATCH 17/18] host/include/i386: Implement clmul.h Richard Henderson
` (2 subsequent siblings)
18 siblings, 0 replies; 23+ messages in thread
From: Richard Henderson @ 2023-07-13 21:14 UTC (permalink / raw)
To: qemu-devel; +Cc: berrange, ardb
Use generic routine for 64-bit carry-less multiply.
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
target/ppc/int_helper.c | 17 +++--------------
1 file changed, 3 insertions(+), 14 deletions(-)
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index 828f04bce7..4e1fa2fd68 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -1455,20 +1455,9 @@ void helper_vpmsumw(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
void helper_VPMSUMD(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
{
- int i, j;
- Int128 tmp, prod[2] = {int128_zero(), int128_zero()};
-
- for (j = 0; j < 64; j++) {
- for (i = 0; i < ARRAY_SIZE(r->u64); i++) {
- if (a->VsrD(i) & (1ull << j)) {
- tmp = int128_make64(b->VsrD(i));
- tmp = int128_lshift(tmp, j);
- prod[i] = int128_xor(prod[i], tmp);
- }
- }
- }
-
- r->s128 = int128_xor(prod[0], prod[1]);
+ Int128 e = clmul_64(a->u64[0], b->u64[0]);
+ Int128 o = clmul_64(a->u64[1], b->u64[1]);
+ r->s128 = int128_xor(e, o);
}
#if HOST_BIG_ENDIAN
--
2.34.1
^ permalink raw reply related [flat|nested] 23+ messages in thread
* [PATCH 17/18] host/include/i386: Implement clmul.h
2023-07-13 21:14 [RFC PATCH for-8.2 00/18] crypto: Provide clmul.h and host accel Richard Henderson
` (15 preceding siblings ...)
2023-07-13 21:14 ` [PATCH 16/18] target/ppc: " Richard Henderson
@ 2023-07-13 21:14 ` Richard Henderson
2023-07-19 11:52 ` Ilya Leoshkevich
2023-07-13 21:14 ` [PATCH 18/18] host/include/aarch64: " Richard Henderson
2023-08-03 14:02 ` [RFC PATCH for-8.2 00/18] crypto: Provide clmul.h and host accel Ard Biesheuvel
18 siblings, 1 reply; 23+ messages in thread
From: Richard Henderson @ 2023-07-13 21:14 UTC (permalink / raw)
To: qemu-devel; +Cc: berrange, ardb
Detect PCLMUL in cpuinfo; implement the accel hooks.
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
host/include/i386/host/cpuinfo.h | 1 +
host/include/i386/host/crypto/clmul.h | 187 ++++++++++++++++++++++++
host/include/x86_64/host/crypto/clmul.h | 1 +
util/cpuinfo-i386.c | 1 +
4 files changed, 190 insertions(+)
create mode 100644 host/include/i386/host/crypto/clmul.h
create mode 100644 host/include/x86_64/host/crypto/clmul.h
diff --git a/host/include/i386/host/cpuinfo.h b/host/include/i386/host/cpuinfo.h
index 073d0a426f..7ae21568f7 100644
--- a/host/include/i386/host/cpuinfo.h
+++ b/host/include/i386/host/cpuinfo.h
@@ -27,6 +27,7 @@
#define CPUINFO_ATOMIC_VMOVDQA (1u << 16)
#define CPUINFO_ATOMIC_VMOVDQU (1u << 17)
#define CPUINFO_AES (1u << 18)
+#define CPUINFO_PCLMUL (1u << 19)
/* Initialized with a constructor. */
extern unsigned cpuinfo;
diff --git a/host/include/i386/host/crypto/clmul.h b/host/include/i386/host/crypto/clmul.h
new file mode 100644
index 0000000000..0877d65ab6
--- /dev/null
+++ b/host/include/i386/host/crypto/clmul.h
@@ -0,0 +1,187 @@
+/*
+ * x86 specific clmul acceleration.
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#ifndef X86_HOST_CRYPTO_CLMUL_H
+#define X86_HOST_CRYPTO_CLMUL_H
+
+#include "host/cpuinfo.h"
+#include <immintrin.h>
+
+#if defined(__PCLMUL__)
+# define HAVE_CLMUL_ACCEL true
+# define ATTR_CLMUL_ACCEL
+#else
+# define HAVE_CLMUL_ACCEL likely(cpuinfo & CPUINFO_PCLMUL)
+# define ATTR_CLMUL_ACCEL __attribute__((target("pclmul")))
+#endif
+
+static inline Int128 ATTR_CLMUL_ACCEL
+clmul_64(uint64_t n, uint64_t m)
+{
+ union { __m128i v; Int128 s; } u;
+
+ if (!HAVE_CLMUL_ACCEL) {
+ return clmul_64_gen(n, m);
+ }
+
+ u.v = _mm_clmulepi64_si128(_mm_set_epi64x(0, n), _mm_set_epi64x(0, m), 0);
+ return u.s;
+}
+
+static inline uint64_t ATTR_CLMUL_ACCEL
+clmul_32(uint32_t n, uint32_t m)
+{
+ __m128i r;
+
+ if (!HAVE_CLMUL_ACCEL) {
+ return clmul_32_gen(n, m);
+ }
+
+ r = _mm_clmulepi64_si128(_mm_cvtsi32_si128(n), _mm_cvtsi32_si128(m), 0);
+ return ((__v2di)r)[0];
+}
+
+static inline Int128 ATTR_CLMUL_ACCEL
+clmul_32x2_even(Int128 n, Int128 m)
+{
+ union { __m128i v; Int128 s; } ur, un, um;
+ __m128i n02, m02, r0, r2;
+
+ if (!HAVE_CLMUL_ACCEL) {
+ return clmul_32x2_even_gen(n, m);
+ }
+
+ un.s = n;
+ um.s = m;
+ n02 = _mm_slli_epi64(un.v, 32);
+ m02 = _mm_slli_epi64(um.v, 32);
+ r0 = _mm_clmulepi64_si128(n02, m02, 0x00);
+ r2 = _mm_clmulepi64_si128(n02, m02, 0x11);
+ ur.v = _mm_unpackhi_epi64(r0, r2);
+ return ur.s;
+}
+
+static inline Int128 ATTR_CLMUL_ACCEL
+clmul_32x2_odd(Int128 n, Int128 m)
+{
+ union { __m128i v; Int128 s; } ur, un, um;
+ __m128i n13, m13, r1, r3;
+
+ if (!HAVE_CLMUL_ACCEL) {
+ return clmul_32x2_odd_gen(n, m);
+ }
+
+ un.s = n;
+ um.s = m;
+ n13 = _mm_srli_epi64(un.v, 32);
+ m13 = _mm_srli_epi64(um.v, 32);
+ r1 = _mm_clmulepi64_si128(n13, m13, 0x00);
+ r3 = _mm_clmulepi64_si128(n13, m13, 0x11);
+ ur.v = _mm_unpacklo_epi64(r1, r3);
+ return ur.s;
+}
+
+static inline uint64_t ATTR_CLMUL_ACCEL
+clmul_16x2_even(uint64_t n, uint64_t m)
+{
+ __m128i r0, r2;
+
+ if (!HAVE_CLMUL_ACCEL) {
+ return clmul_16x2_even_gen(n, m);
+ }
+
+ r0 = _mm_clmulepi64_si128(_mm_cvtsi32_si128(n & 0xffff),
+ _mm_cvtsi32_si128(m & 0xffff), 0);
+ r2 = _mm_clmulepi64_si128(_mm_cvtsi32_si128((n >> 32) & 0xffff),
+ _mm_cvtsi32_si128((m >> 32) & 0xffff), 0);
+ r0 = _mm_unpacklo_epi32(r0, r2);
+ return ((__v2di)r0)[0];
+}
+
+static inline uint64_t ATTR_CLMUL_ACCEL
+clmul_16x2_odd(uint64_t n, uint64_t m)
+{
+ __m128i r1, r3;
+
+ if (!HAVE_CLMUL_ACCEL) {
+ return clmul_16x2_odd_gen(n, m);
+ }
+
+ r1 = _mm_clmulepi64_si128(_mm_cvtsi32_si128((n >> 16) & 0xffff),
+ _mm_cvtsi32_si128((m >> 16) & 0xffff), 0);
+ r3 = _mm_clmulepi64_si128(_mm_cvtsi32_si128((n >> 48) & 0xffff),
+ _mm_cvtsi32_si128((m >> 48) & 0xffff), 0);
+ r1 = _mm_unpacklo_epi32(r1, r3);
+ return ((__v2di)r1)[0];
+}
+
+static inline Int128 ATTR_CLMUL_ACCEL
+clmul_16x4_even(Int128 n, Int128 m)
+{
+ union { __m128i v; Int128 s; } ur, un, um;
+ __m128i mask = _mm_set_epi16(0, 0, 0, -1, 0, 0, 0, -1);
+ __m128i n04, m04, n26, m26, r0, r2, r4, r6;
+
+ if (!HAVE_CLMUL_ACCEL) {
+ return clmul_16x4_even_gen(n, m);
+ }
+
+ un.s = n;
+ um.s = m;
+ n04 = _mm_and_si128(un.v, mask);
+ m04 = _mm_and_si128(um.v, mask);
+ r0 = _mm_clmulepi64_si128(n04, m04, 0x00);
+ r4 = _mm_clmulepi64_si128(n04, m04, 0x11);
+ n26 = _mm_and_si128(_mm_srli_epi64(un.v, 32), mask);
+ m26 = _mm_and_si128(_mm_srli_epi64(um.v, 32), mask);
+ r2 = _mm_clmulepi64_si128(n26, m26, 0x00);
+ r6 = _mm_clmulepi64_si128(n26, m26, 0x11);
+
+ r0 = _mm_unpacklo_epi32(r0, r2);
+ r4 = _mm_unpacklo_epi32(r4, r6);
+ ur.v = _mm_unpacklo_epi64(r0, r4);
+ return ur.s;
+}
+
+static inline Int128 ATTR_CLMUL_ACCEL
+clmul_16x4_odd(Int128 n, Int128 m)
+{
+ union { __m128i v; Int128 s; } ur, un, um;
+ __m128i mask = _mm_set_epi16(0, 0, 0, -1, 0, 0, 0, -1);
+ __m128i n15, m15, n37, m37, r1, r3, r5, r7;
+
+ if (!HAVE_CLMUL_ACCEL) {
+ return clmul_16x4_odd_gen(n, m);
+ }
+
+ un.s = n;
+ um.s = m;
+ n15 = _mm_and_si128(_mm_srli_epi64(un.v, 16), mask);
+ m15 = _mm_and_si128(_mm_srli_epi64(um.v, 16), mask);
+ r1 = _mm_clmulepi64_si128(n15, m15, 0x00);
+ r5 = _mm_clmulepi64_si128(n15, m15, 0x11);
+ n37 = _mm_srli_epi64(un.v, 48);
+ m37 = _mm_srli_epi64(um.v, 48);
+ r3 = _mm_clmulepi64_si128(n37, m37, 0x00);
+ r7 = _mm_clmulepi64_si128(n37, m37, 0x11);
+
+ r1 = _mm_unpacklo_epi32(r1, r3);
+ r5 = _mm_unpacklo_epi32(r5, r7);
+ ur.v = _mm_unpacklo_epi64(r1, r5);
+ return ur.s;
+}
+
+/*
+ * Defer everything else to the generic routines.
+ * We could implement them with even more element manipulation.
+ */
+#define clmul_8x8_low clmul_8x8_low_gen
+#define clmul_8x4_even clmul_8x4_even_gen
+#define clmul_8x4_odd clmul_8x4_odd_gen
+#define clmul_8x8_even clmul_8x8_even_gen
+#define clmul_8x8_odd clmul_8x8_odd_gen
+#define clmul_8x8_packed clmul_8x8_packed_gen
+
+#endif /* X86_HOST_CRYPTO_CLMUL_H */
diff --git a/host/include/x86_64/host/crypto/clmul.h b/host/include/x86_64/host/crypto/clmul.h
new file mode 100644
index 0000000000..f25eced416
--- /dev/null
+++ b/host/include/x86_64/host/crypto/clmul.h
@@ -0,0 +1 @@
+#include "host/include/i386/host/crypto/clmul.h"
diff --git a/util/cpuinfo-i386.c b/util/cpuinfo-i386.c
index 3a7b7e0ad1..c6f6364826 100644
--- a/util/cpuinfo-i386.c
+++ b/util/cpuinfo-i386.c
@@ -39,6 +39,7 @@ unsigned __attribute__((constructor)) cpuinfo_init(void)
info |= (c & bit_SSE4_1 ? CPUINFO_SSE4 : 0);
info |= (c & bit_MOVBE ? CPUINFO_MOVBE : 0);
info |= (c & bit_POPCNT ? CPUINFO_POPCNT : 0);
+ info |= (c & bit_PCLMULQDQ ? CPUINFO_PCLMUL : 0);
/* Our AES support requires PSHUFB as well. */
info |= ((c & bit_AES) && (c & bit_SSSE3) ? CPUINFO_AES : 0);
--
2.34.1
^ permalink raw reply related [flat|nested] 23+ messages in thread
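Two details worth calling out in the i386 file. The immediate to _mm_clmulepi64_si128 selects which 64-bit half of each source is multiplied: bit 0 picks the half of the first operand and bit 4 the half of the second, so 0x00 is low-times-low and 0x11 is high-times-high, as used in clmul_32x2_even above. And the target("pclmul") attribute lets a baseline build emit the instruction anyway, provided every caller is gated by the runtime cpuinfo test. A minimal standalone sketch of the pattern (hypothetical function, not part of the patch):

    #include <immintrin.h>
    #include <stdint.h>

    /* 64x64 carry-less multiply of the low halves, returning the low
     * 64 bits of the 128-bit product.  Compiled for pclmul regardless
     * of the baseline ISA; callers must check CPUID at run time. */
    __attribute__((target("pclmul")))
    static uint64_t clmul_low64(uint64_t n, uint64_t m)
    {
        __m128i r = _mm_clmulepi64_si128(_mm_set_epi64x(0, n),
                                         _mm_set_epi64x(0, m), 0x00);
        return ((__v2di)r)[0];
    }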
* Re: [PATCH 17/18] host/include/i386: Implement clmul.h
2023-07-13 21:14 ` [PATCH 17/18] host/include/i386: Implement clmul.h Richard Henderson
@ 2023-07-19 11:52 ` Ilya Leoshkevich
2023-07-22 11:47 ` Richard Henderson
0 siblings, 1 reply; 23+ messages in thread
From: Ilya Leoshkevich @ 2023-07-19 11:52 UTC (permalink / raw)
To: Richard Henderson, qemu-devel; +Cc: berrange, ardb
On Thu, 2023-07-13 at 22:14 +0100, Richard Henderson wrote:
> Detect PCLMUL in cpuinfo; implement the accel hooks.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
> host/include/i386/host/cpuinfo.h | 1 +
> host/include/i386/host/crypto/clmul.h | 187
> ++++++++++++++++++++++++
> host/include/x86_64/host/crypto/clmul.h | 1 +
> util/cpuinfo-i386.c | 1 +
> 4 files changed, 190 insertions(+)
> create mode 100644 host/include/i386/host/crypto/clmul.h
> create mode 100644 host/include/x86_64/host/crypto/clmul.h
...
> diff --git a/util/cpuinfo-i386.c b/util/cpuinfo-i386.c
> index 3a7b7e0ad1..c6f6364826 100644
> --- a/util/cpuinfo-i386.c
> +++ b/util/cpuinfo-i386.c
> @@ -39,6 +39,7 @@ unsigned __attribute__((constructor))
> cpuinfo_init(void)
> info |= (c & bit_SSE4_1 ? CPUINFO_SSE4 : 0);
> info |= (c & bit_MOVBE ? CPUINFO_MOVBE : 0);
> info |= (c & bit_POPCNT ? CPUINFO_POPCNT : 0);
> + info |= (c & bit_PCLMULQDQ ? CPUINFO_PCLMUL : 0);
I wanted to give this a try, but my cpuid.h has only
bit_VPCLMULQDQ, and I don't see it in [1] either.
Where is this supposed to come from? Does qemu/cpuid.h need an update?
...
[1]
https://gcc.gnu.org/git/?p=gcc.git;a=blob;f=gcc/config/i386/cpuid.h;h=03fd6fc9478e8ef87d0f7191b9f80539e9c3e939;hb=refs/heads/master
* Re: [PATCH 17/18] host/include/i386: Implement clmul.h
2023-07-19 11:52 ` Ilya Leoshkevich
@ 2023-07-22 11:47 ` Richard Henderson
0 siblings, 0 replies; 23+ messages in thread
From: Richard Henderson @ 2023-07-22 11:47 UTC (permalink / raw)
To: Ilya Leoshkevich, qemu-devel; +Cc: berrange, ardb
On 7/19/23 12:52, Ilya Leoshkevich wrote:
> On Thu, 2023-07-13 at 22:14 +0100, Richard Henderson wrote:
>> Detect PCLMUL in cpuinfo; implement the accel hooks.
>>
>> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
>> ---
>> host/include/i386/host/cpuinfo.h | 1 +
>> host/include/i386/host/crypto/clmul.h | 187 ++++++++++++++++++++++++
>> host/include/x86_64/host/crypto/clmul.h | 1 +
>> util/cpuinfo-i386.c | 1 +
>> 4 files changed, 190 insertions(+)
>> create mode 100644 host/include/i386/host/crypto/clmul.h
>> create mode 100644 host/include/x86_64/host/crypto/clmul.h
>
> ...
>
>> diff --git a/util/cpuinfo-i386.c b/util/cpuinfo-i386.c
>> index 3a7b7e0ad1..c6f6364826 100644
>> --- a/util/cpuinfo-i386.c
>> +++ b/util/cpuinfo-i386.c
>> @@ -39,6 +39,7 @@ unsigned __attribute__((constructor)) cpuinfo_init(void)
>> info |= (c & bit_SSE4_1 ? CPUINFO_SSE4 : 0);
>> info |= (c & bit_MOVBE ? CPUINFO_MOVBE : 0);
>> info |= (c & bit_POPCNT ? CPUINFO_POPCNT : 0);
>> + info |= (c & bit_PCLMULQDQ ? CPUINFO_PCLMUL : 0);
>
> I wanted to give this a try, but my cpuid.h has only
> bit_VPCLMULQDQ, and I don't see it in [1] either.
Should have been bit_PCLMUL (VPCLMULQDQ is for the 256- and 512-bit inputs). I'll have to
track down why this compiled for me.
> Where is this supposed to come from? Does qemu/cpuid.h need an update?
Yes, an update is required.
r~
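For context on the fix: GCC's <cpuid.h> names the CPUID.01H:ECX bit-1 feature flag bit_PCLMUL, and QEMU carries fallback definitions in include/qemu/cpuid.h for toolchains whose headers lack a given flag. The update mentioned here would presumably take a shape along these lines (a sketch of the likely fallback, not the committed change):

/* include/qemu/cpuid.h -- sketch of the missing fallback definition.
 * PCLMULQDQ support is reported in CPUID leaf 1, ECX bit 1. */
#ifndef bit_PCLMUL
#define bit_PCLMUL (1 << 1)
#endif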
* [PATCH 18/18] host/include/aarch64: Implement clmul.h
2023-07-13 21:14 [RFC PATCH for-8.2 00/18] crypto: Provide clmul.h and host accel Richard Henderson
` (16 preceding siblings ...)
2023-07-13 21:14 ` [PATCH 17/18] host/include/i386: Implement clmul.h Richard Henderson
@ 2023-07-13 21:14 ` Richard Henderson
2023-08-03 14:02 ` [RFC PATCH for-8.2 00/18] crypto: Provide clmul.h and host accel Ard Biesheuvel
18 siblings, 0 replies; 23+ messages in thread
From: Richard Henderson @ 2023-07-13 21:14 UTC (permalink / raw)
To: qemu-devel; +Cc: berrange, ardb
Detect PMULL in cpuinfo; implement the accel hooks.
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
host/include/aarch64/host/cpuinfo.h | 1 +
host/include/aarch64/host/crypto/clmul.h | 230 +++++++++++++++++++++++
util/cpuinfo-aarch64.c | 4 +-
3 files changed, 234 insertions(+), 1 deletion(-)
create mode 100644 host/include/aarch64/host/crypto/clmul.h
diff --git a/host/include/aarch64/host/cpuinfo.h b/host/include/aarch64/host/cpuinfo.h
index 05feeb4f43..da268dce13 100644
--- a/host/include/aarch64/host/cpuinfo.h
+++ b/host/include/aarch64/host/cpuinfo.h
@@ -10,6 +10,7 @@
#define CPUINFO_LSE (1u << 1)
#define CPUINFO_LSE2 (1u << 2)
#define CPUINFO_AES (1u << 3)
+#define CPUINFO_PMULL (1u << 4)
/* Initialized with a constructor. */
extern unsigned cpuinfo;
diff --git a/host/include/aarch64/host/crypto/clmul.h b/host/include/aarch64/host/crypto/clmul.h
new file mode 100644
index 0000000000..7fd827898b
--- /dev/null
+++ b/host/include/aarch64/host/crypto/clmul.h
@@ -0,0 +1,230 @@
+/*
+ * AArch64 specific clmul acceleration.
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#ifndef AARCH64_HOST_CRYPTO_CLMUL_H
+#define AARCH64_HOST_CRYPTO_CLMUL_H
+
+#include "host/cpuinfo.h"
+#include <arm_neon.h>
+
+/* Both FEAT_AES and FEAT_PMULL are covered under the same macro. */
+#ifdef __ARM_FEATURE_AES
+# define HAVE_CLMUL_ACCEL true
+#else
+# define HAVE_CLMUL_ACCEL likely(cpuinfo & CPUINFO_PMULL)
+#endif
+#if !defined(__ARM_FEATURE_AES) && defined(CONFIG_ARM_AES_BUILTIN)
+# define ATTR_CLMUL_ACCEL __attribute__((target("+crypto")))
+#else
+# define ATTR_CLMUL_ACCEL
+#endif
+
+/*
+ * The 8x8->8 pmul and 8x8->16 pmull are available unconditionally.
+ */
+
+static inline uint64_t clmul_8x8_low(uint64_t n, uint64_t m)
+{
+ return (uint64_t)vmul_p8((poly8x8_t)n, (poly8x8_t)m);
+}
+
+static inline Int128 clmul_8x8_packed(uint64_t n, uint64_t m)
+{
+ union { poly16x8_t v; Int128 s; } u;
+ u.v = vmull_p8((poly8x8_t)n, (poly8x8_t)m);
+ return u.s;
+}
+
+static inline Int128 clmul_8x8_even(Int128 n, Int128 m)
+{
+ union { uint16x8_t v; Int128 s; } un, um;
+ uint8x8_t pn, pm;
+
+ un.s = n;
+ um.s = m;
+ pn = vmovn_u16(un.v);
+ pm = vmovn_u16(um.v);
+ return clmul_8x8_packed((uint64_t)pn, (uint64_t)pm);
+}
+
+static inline Int128 clmul_8x8_odd(Int128 n, Int128 m)
+{
+ union { uint8x16_t v; Int128 s; } un, um;
+ uint8x8_t pn, pm;
+
+ un.s = n;
+ um.s = m;
+ pn = vqtbl1_u8(un.v, (uint8x8_t){ 1, 3, 5, 7, 9, 11, 13, 15 });
+ pm = vqtbl1_u8(um.v, (uint8x8_t){ 1, 3, 5, 7, 9, 11, 13, 15 });
+ return clmul_8x8_packed((uint64_t)pn, (uint64_t)pm);
+}
+
+static inline uint64_t clmul_8x4_even(uint64_t n, uint64_t m)
+{
+ return int128_getlo(clmul_8x8_even(int128_make64(n), int128_make64(m)));
+}
+
+static inline uint64_t clmul_8x4_odd(uint64_t n, uint64_t m)
+{
+ return int128_getlo(clmul_8x8_odd(int128_make64(n), int128_make64(m)));
+}
+
+static inline Int128 clmul_16x4_packed_accel(uint16x4_t n, uint16x4_t m)
+{
+ union { uint32x4_t v; Int128 s; } u;
+ uint32x4_t r0, r1, r2;
+
+ /*
+ * Considering the per-byte multiplication:
+ * ab
+ * cd
+ * -----
+ * bd << 0
+ * bc << 8
+ * ad << 8
+ * ac << 16
+ *
+ * We get the ac and bd rows of the result for free from the expanding
+ * packed multiply. Reverse the two bytes in M, repeat, and we get the
+ * ad and bc results, but in the wrong column; shift to fix and sum all.
+ */
+ r0 = (uint32x4_t)vmull_p8((poly8x8_t)n, (poly8x8_t)m);
+ r1 = (uint32x4_t)vmull_p8((poly8x8_t)n, vrev16_p8((poly8x8_t)m));
+ r2 = r1 << 8; /* bc */
+ r1 = r1 >> 8; /* ad */
+ r1 &= (uint32x4_t){ 0x00ffff00, 0x00ffff00, 0x00ffff00, 0x00ffff00 };
+ r2 &= (uint32x4_t){ 0x00ffff00, 0x00ffff00, 0x00ffff00, 0x00ffff00 };
+ r0 = r0 ^ r1 ^ r2;
+
+ u.v = r0;
+ return u.s;
+}
+
+static inline Int128 clmul_16x4_even(Int128 n, Int128 m)
+{
+ union { uint32x4_t v; Int128 s; } um, un;
+ uint16x4_t pn, pm;
+
+ /* Extract even uint16_t. */
+ un.s = n;
+ um.s = m;
+ pn = vmovn_u32(un.v);
+ pm = vmovn_u32(um.v);
+ return clmul_16x4_packed_accel(pn, pm);
+}
+
+static inline Int128 clmul_16x4_odd(Int128 n, Int128 m)
+{
+ union { uint8x16_t v; Int128 s; } um, un;
+ uint16x4_t pn, pm;
+
+ /* Extract odd uint16_t. */
+ un.s = n;
+ um.s = m;
+ pn = (uint16x4_t)vqtbl1_u8(un.v, (uint8x8_t){ 2, 3, 6, 7, 10, 11, 14, 15 });
+ pm = (uint16x4_t)vqtbl1_u8(um.v, (uint8x8_t){ 2, 3, 6, 7, 10, 11, 14, 15 });
+ return clmul_16x4_packed_accel(pn, pm);
+}
+
+static inline uint64_t clmul_16x2_even(uint64_t n, uint64_t m)
+{
+ return int128_getlo(clmul_16x4_even(int128_make64(n), int128_make64(m)));
+}
+
+static inline uint64_t clmul_16x2_odd(uint64_t n, uint64_t m)
+{
+ return int128_getlo(clmul_16x4_odd(int128_make64(n), int128_make64(m)));
+}
+
+/*
+ * The 64x64->128 pmull is available with FEAT_PMULL.
+ */
+
+static inline Int128 ATTR_CLMUL_ACCEL
+clmul_64(uint64_t n, uint64_t m)
+{
+ union { poly128_t v; Int128 s; } u;
+
+ if (!HAVE_CLMUL_ACCEL) {
+ return clmul_64_gen(n, m);
+ }
+
+#ifdef CONFIG_ARM_AES_BUILTIN
+ u.v = vmull_p64((poly64_t)n, (poly64_t)m);
+#else
+ asm(".arch_extension aes\n\t"
+ "pmull %0.1q, %1.1d, %2.1d" : "=w"(u.v) : "w"(n), "w"(m));
+#endif
+ return u.s;
+}
+
+static inline uint64_t ATTR_CLMUL_ACCEL
+clmul_32(uint32_t n, uint32_t m)
+{
+ if (!HAVE_CLMUL_ACCEL) {
+ return clmul_32_gen(n, m);
+ }
+ return int128_getlo(clmul_64(n, m));
+}
+
+static inline Int128 ATTR_CLMUL_ACCEL
+clmul_32x2_even(Int128 n, Int128 m)
+{
+ union { uint64x2_t v; poly64_t h; Int128 s; } um, un, ur;
+ uint64x2_t r0, r2;
+
+ if (!HAVE_CLMUL_ACCEL) {
+ return clmul_32x2_even_gen(n, m);
+ }
+
+ un.s = n;
+ um.s = m;
+ un.v &= (uint64x2_t){ 0xffffffffu, 0xffffffffu };
+ um.v &= (uint64x2_t){ 0xffffffffu, 0xffffffffu };
+
+#ifdef CONFIG_ARM_AES_BUILTIN
+ r0 = (uint64x2_t)vmull_p64(un.h, um.h);
+ r2 = (uint64x2_t)vmull_high_p64((poly64x2_t)un.v, (poly64x2_t)um.v);
+#else
+ asm(".arch_extension aes\n\t"
+ "pmull %0.1q, %2.1d, %3.1d\n\t"
+ "pmull2 %1.1q, %2.2d, %3.2d"
+ : "=&w"(r0), "=w"(r2) : "w"(un.v), "w"(um.v));
+#endif
+
+ ur.v = vzip1q_u64(r0, r2);
+ return ur.s;
+}
+
+static inline Int128 ATTR_CLMUL_ACCEL
+clmul_32x2_odd(Int128 n, Int128 m)
+{
+ union { uint64x2_t v; poly64_t h; Int128 s; } um, un, ur;
+ uint64x2_t r0, r2;
+
+ if (!HAVE_CLMUL_ACCEL) {
+ return clmul_32x2_odd_gen(n, m);
+ }
+
+ un.s = n;
+ um.s = m;
+ un.v &= (uint64x2_t){ 0xffffffff00000000ull, 0xffffffff00000000ull };
+ um.v &= (uint64x2_t){ 0xffffffff00000000ull, 0xffffffff00000000ull };
+
+#ifdef CONFIG_ARM_AES_BUILTIN
+ r0 = (uint64x2_t)vmull_p64(un.h, um.h);
+ r2 = (uint64x2_t)vmull_high_p64((poly64x2_t)un.v, (poly64x2_t)um.v);
+#else
+ asm(".arch_extension aes\n\t"
+ "pmull %0.1q, %2.1d, %3.1d\n\t"
+ "pmull2 %1.1q, %2.2d, %3.2d"
+ : "=&w"(r0), "=w"(r2) : "w"(un.v), "w"(um.v));
+#endif
+
+ ur.v = vzip2q_u64(r0, r2);
+ return ur.s;
+}
+
+#endif /* AARCH64_HOST_CRYPTO_CLMUL_H */
diff --git a/util/cpuinfo-aarch64.c b/util/cpuinfo-aarch64.c
index ababc39550..1d565b8420 100644
--- a/util/cpuinfo-aarch64.c
+++ b/util/cpuinfo-aarch64.c
@@ -56,12 +56,14 @@ unsigned __attribute__((constructor)) cpuinfo_init(void)
unsigned long hwcap = qemu_getauxval(AT_HWCAP);
info |= (hwcap & HWCAP_ATOMICS ? CPUINFO_LSE : 0);
info |= (hwcap & HWCAP_USCAT ? CPUINFO_LSE2 : 0);
- info |= (hwcap & HWCAP_AES ? CPUINFO_AES: 0);
+ info |= (hwcap & HWCAP_AES ? CPUINFO_AES : 0);
+ info |= (hwcap & HWCAP_PMULL ? CPUINFO_PMULL : 0);
#endif
#ifdef CONFIG_DARWIN
info |= sysctl_for_bool("hw.optional.arm.FEAT_LSE") * CPUINFO_LSE;
info |= sysctl_for_bool("hw.optional.arm.FEAT_LSE2") * CPUINFO_LSE2;
info |= sysctl_for_bool("hw.optional.arm.FEAT_AES") * CPUINFO_AES;
+ info |= sysctl_for_bool("hw.optional.arm.FEAT_PMULL") * CPUINFO_PMULL;
#endif
cpuinfo = info;
--
2.34.1
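The per-byte decomposition sketched in the comment inside clmul_16x4_packed_accel can be verified with a scalar model: writing the 16-bit inputs as n = (a << 8) | b and m = (c << 8) | d, the carry-less product is bd ^ (bc << 8) ^ (ad << 8) ^ (ac << 16), exactly the four rows the comment lists. A self-contained check (illustrative names, not part of the patch):

#include <assert.h>
#include <stdint.h>

/* One 8x8 carry-less multiply; the product fits in 15 bits. */
static uint32_t clmul8(uint8_t a, uint8_t b)
{
    uint32_t r = 0;

    for (int i = 0; i < 8; i++) {
        if (b & (1u << i)) {
            r ^= (uint32_t)a << i;
        }
    }
    return r;
}

/* Reference 16x16->32 carry-less multiply, bit by bit. */
static uint32_t clmul16(uint16_t n, uint16_t m)
{
    uint32_t r = 0;

    for (int i = 0; i < 16; i++) {
        if (m & (1u << i)) {
            r ^= (uint32_t)n << i;
        }
    }
    return r;
}

static void check(uint16_t n, uint16_t m)
{
    uint8_t a = n >> 8, b = n & 0xff, c = m >> 8, d = m & 0xff;
    uint32_t sum = clmul8(b, d)
                 ^ (clmul8(b, c) << 8)
                 ^ (clmul8(a, d) << 8)
                 ^ (clmul8(a, c) << 16);

    assert(sum == clmul16(n, m));
}

Running check() over all 2^32 input pairs (or a random sample) confirms the identity that the vmull_p8/vrev16_p8 sequence exploits.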
* Re: [RFC PATCH for-8.2 00/18] crypto: Provide clmul.h and host accel
2023-07-13 21:14 [RFC PATCH for-8.2 00/18] crypto: Provide clmul.h and host accel Richard Henderson
` (17 preceding siblings ...)
2023-07-13 21:14 ` [PATCH 18/18] host/include/aarch64: " Richard Henderson
@ 2023-08-03 14:02 ` Ard Biesheuvel
18 siblings, 0 replies; 23+ messages in thread
From: Ard Biesheuvel @ 2023-08-03 14:02 UTC (permalink / raw)
To: Richard Henderson; +Cc: qemu-devel, berrange
On Thu, 13 Jul 2023 at 23:14, Richard Henderson
<richard.henderson@linaro.org> wrote:
>
> Inspired by Ard Biesheuvel's RFC patches [1] for accelerating
> carry-less multiply under emulation.
>
> This is less polished than the AES patch set:
>
> (1) Should I split HAVE_CLMUL_ACCEL into per-width HAVE_CLMUL{N}_ACCEL?
> The "_generic" and "_accel" split is different from aes-round.h
> because of the difference in support for different widths, and it
> means that each host accel has more boilerplate.
>
> (2) Should I bother trying to accelerate anything other than 64x64->128?
That is the only compelling use case afaict.
> That seems to be the one that GCM really wants anyway. I'd keep all
> of the sizes implemented generically, since that centralizes the 3
> target implementations.
>
> (3) The use of Int128 isn't fantastic -- better would be a vector type,
> though that has its own special problems for ppc64le (see the
> endianness hoops within aes-round.h). Perhaps leave things in
> env memory, like I was mostly able to do with AES?
>
> (4) No guest test case(s).
>
>
> r~
>
>
> [1] https://patchew.org/QEMU/20230601123332.3297404-1-ardb@kernel.org/
>
> Richard Henderson (18):
> crypto: Add generic 8-bit carry-less multiply routines
> target/arm: Use clmul_8* routines
> target/s390x: Use clmul_8* routines
> target/ppc: Use clmul_8* routines
> crypto: Add generic 16-bit carry-less multiply routines
> target/arm: Use clmul_16* routines
> target/s390x: Use clmul_16* routines
> target/ppc: Use clmul_16* routines
> crypto: Add generic 32-bit carry-less multiply routines
> target/arm: Use clmul_32* routines
> target/s390x: Use clmul_32* routines
> target/ppc: Use clmul_32* routines
> crypto: Add generic 64-bit carry-less multiply routine
> target/arm: Use clmul_64
> target/s390x: Use clmul_64
> target/ppc: Use clmul_64
> host/include/i386: Implement clmul.h
> host/include/aarch64: Implement clmul.h
>
> host/include/aarch64/host/cpuinfo.h | 1 +
> host/include/aarch64/host/crypto/clmul.h | 230 +++++++++++++++++++++++
> host/include/generic/host/crypto/clmul.h | 28 +++
> host/include/i386/host/cpuinfo.h | 1 +
> host/include/i386/host/crypto/clmul.h | 187 ++++++++++++++++++
> host/include/x86_64/host/crypto/clmul.h | 1 +
> include/crypto/clmul.h | 123 ++++++++++++
> target/arm/tcg/vec_internal.h | 11 --
> crypto/clmul.c | 163 ++++++++++++++++
> target/arm/tcg/mve_helper.c | 16 +-
> target/arm/tcg/vec_helper.c | 112 ++---------
> target/ppc/int_helper.c | 63 +++----
> target/s390x/tcg/vec_int_helper.c | 175 +++++++----------
> util/cpuinfo-aarch64.c | 4 +-
> util/cpuinfo-i386.c | 1 +
> crypto/meson.build | 9 +-
> 16 files changed, 865 insertions(+), 260 deletions(-)
> create mode 100644 host/include/aarch64/host/crypto/clmul.h
> create mode 100644 host/include/generic/host/crypto/clmul.h
> create mode 100644 host/include/i386/host/crypto/clmul.h
> create mode 100644 host/include/x86_64/host/crypto/clmul.h
> create mode 100644 include/crypto/clmul.h
> create mode 100644 crypto/clmul.c
>
> --
> 2.34.1
>
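On point (2) above: the 64x64->128 primitive is the one GCM's GHASH layer needs, because a multiplication in GF(2^128) decomposes into 64-bit carry-less halves. A rough sketch of that composition, using clmul_64 from this series plus Int128 helpers assumed from qemu/int128.h (Karatsuba over GF(2)[x]; the GHASH reduction modulo x^128 + x^7 + x^2 + x + 1 is omitted):

/* Sketch: 128x128->256 carry-less multiply from three clmul_64 calls.
 * r_hi:r_lo receives the 256-bit product of n_hi:n_lo and m_hi:m_lo. */
static void clmul_128(uint64_t n_hi, uint64_t n_lo,
                      uint64_t m_hi, uint64_t m_lo,
                      Int128 *r_hi, Int128 *r_lo)
{
    Int128 lo  = clmul_64(n_lo, m_lo);
    Int128 hi  = clmul_64(n_hi, m_hi);
    Int128 mid = clmul_64(n_lo ^ n_hi, m_lo ^ m_hi);

    /* Karatsuba: reduce mid to the cross terms nl*mh ^ nh*ml. */
    mid = int128_xor(mid, int128_xor(lo, hi));

    /* Fold the middle term in at bit offset 64. */
    *r_lo = int128_xor(lo, int128_lshift(mid, 64));
    *r_hi = int128_xor(hi, int128_urshift(mid, 64));
}

Because XOR replaces addition there are no carries between the three partial products, so this recombination is exact.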