qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v2 00/18] crypto: Provide clmul.h and host accel
@ 2023-08-19  1:02 Richard Henderson
  2023-08-19  1:02 ` [PATCH v2 01/18] crypto: Add generic 8-bit carry-less multiply routines Richard Henderson
                   ` (18 more replies)
  0 siblings, 19 replies; 34+ messages in thread
From: Richard Henderson @ 2023-08-19  1:02 UTC (permalink / raw)
  To: qemu-devel; +Cc: berrange, ardb

Inspired by Ard Biesheuvel's RFC patches [1] for accelerating
carry-less multiply under emulation.

Changes for v2:
  * Only accelerate clmul_64; keep generic helpers for other sizes.
  * Drop most of the Int128 interfaces, except for clmul_64.
  * Use the same acceleration format as aes-round.h.


r~


[1] https://patchew.org/QEMU/20230601123332.3297404-1-ardb@kernel.org/

Richard Henderson (18):
  crypto: Add generic 8-bit carry-less multiply routines
  target/arm: Use clmul_8* routines
  target/s390x: Use clmul_8* routines
  target/ppc: Use clmul_8* routines
  crypto: Add generic 16-bit carry-less multiply routines
  target/arm: Use clmul_16* routines
  target/s390x: Use clmul_16* routines
  target/ppc: Use clmul_16* routines
  crypto: Add generic 32-bit carry-less multiply routines
  target/arm: Use clmul_32* routines
  target/s390x: Use clmul_32* routines
  target/ppc: Use clmul_32* routines
  crypto: Add generic 64-bit carry-less multiply routine
  target/arm: Use clmul_64
  target/s390x: Use clmul_64
  target/ppc: Use clmul_64
  host/include/i386: Implement clmul.h
  host/include/aarch64: Implement clmul.h

 host/include/aarch64/host/cpuinfo.h      |   1 +
 host/include/aarch64/host/crypto/clmul.h |  41 +++++
 host/include/generic/host/crypto/clmul.h |  15 ++
 host/include/i386/host/cpuinfo.h         |   1 +
 host/include/i386/host/crypto/clmul.h    |  29 ++++
 host/include/x86_64/host/crypto/clmul.h  |   1 +
 include/crypto/clmul.h                   |  83 ++++++++++
 include/qemu/cpuid.h                     |   3 +
 target/arm/tcg/vec_internal.h            |  11 --
 crypto/clmul.c                           | 112 ++++++++++++++
 target/arm/tcg/mve_helper.c              |  16 +-
 target/arm/tcg/vec_helper.c              | 102 ++-----------
 target/ppc/int_helper.c                  |  64 ++++----
 target/s390x/tcg/vec_int_helper.c        | 186 ++++++++++-------------
 util/cpuinfo-aarch64.c                   |   4 +-
 util/cpuinfo-i386.c                      |   1 +
 crypto/meson.build                       |   9 +-
 17 files changed, 425 insertions(+), 254 deletions(-)
 create mode 100644 host/include/aarch64/host/crypto/clmul.h
 create mode 100644 host/include/generic/host/crypto/clmul.h
 create mode 100644 host/include/i386/host/crypto/clmul.h
 create mode 100644 host/include/x86_64/host/crypto/clmul.h
 create mode 100644 include/crypto/clmul.h
 create mode 100644 crypto/clmul.c

-- 
2.34.1



^ permalink raw reply	[flat|nested] 34+ messages in thread

* [PATCH v2 01/18] crypto: Add generic 8-bit carry-less multiply routines
  2023-08-19  1:02 [PATCH v2 00/18] crypto: Provide clmul.h and host accel Richard Henderson
@ 2023-08-19  1:02 ` Richard Henderson
  2023-09-10 12:19   ` Ard Biesheuvel
  2023-08-19  1:02 ` [PATCH v2 02/18] target/arm: Use clmul_8* routines Richard Henderson
                   ` (17 subsequent siblings)
  18 siblings, 1 reply; 34+ messages in thread
From: Richard Henderson @ 2023-08-19  1:02 UTC (permalink / raw)
  To: qemu-devel; +Cc: berrange, ardb

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/crypto/clmul.h | 41 +++++++++++++++++++++++++++++
 crypto/clmul.c         | 60 ++++++++++++++++++++++++++++++++++++++++++
 crypto/meson.build     |  9 ++++---
 3 files changed, 107 insertions(+), 3 deletions(-)
 create mode 100644 include/crypto/clmul.h
 create mode 100644 crypto/clmul.c

diff --git a/include/crypto/clmul.h b/include/crypto/clmul.h
new file mode 100644
index 0000000000..153b5e3057
--- /dev/null
+++ b/include/crypto/clmul.h
@@ -0,0 +1,41 @@
+/*
+ * Carry-less multiply operations.
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ *
+ * Copyright (C) 2023 Linaro, Ltd.
+ */
+
+#ifndef CRYPTO_CLMUL_H
+#define CRYPTO_CLMUL_H
+
+/**
+ * clmul_8x8_low:
+ *
+ * Perform eight 8x8->8 carry-less multiplies.
+ */
+uint64_t clmul_8x8_low(uint64_t, uint64_t);
+
+/**
+ * clmul_8x4_even:
+ *
+ * Perform four 8x8->16 carry-less multiplies.
+ * The odd bytes of the inputs are ignored.
+ */
+uint64_t clmul_8x4_even(uint64_t, uint64_t);
+
+/**
+ * clmul_8x4_odd:
+ *
+ * Perform four 8x8->16 carry-less multiplies.
+ * The even bytes of the inputs are ignored.
+ */
+uint64_t clmul_8x4_odd(uint64_t, uint64_t);
+
+/**
+ * clmul_8x4_packed:
+ *
+ * Perform four 8x8->16 carry-less multiplies.
+ */
+uint64_t clmul_8x4_packed(uint32_t, uint32_t);
+
+#endif /* CRYPTO_CLMUL_H */
diff --git a/crypto/clmul.c b/crypto/clmul.c
new file mode 100644
index 0000000000..82d873fee5
--- /dev/null
+++ b/crypto/clmul.c
@@ -0,0 +1,60 @@
+/*
+ * Carry-less multiply operations.
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ *
+ * Copyright (C) 2023 Linaro, Ltd.
+ */
+
+#include "qemu/osdep.h"
+#include "crypto/clmul.h"
+
+uint64_t clmul_8x8_low(uint64_t n, uint64_t m)
+{
+    uint64_t r = 0;
+
+    for (int i = 0; i < 8; ++i) {
+        uint64_t mask = (n & 0x0101010101010101ull) * 0xff;
+        r ^= m & mask;
+        m = (m << 1) & 0xfefefefefefefefeull;
+        n >>= 1;
+    }
+    return r;
+}
+
+static uint64_t clmul_8x4_even_int(uint64_t n, uint64_t m)
+{
+    uint64_t r = 0;
+
+    for (int i = 0; i < 8; ++i) {
+        uint64_t mask = (n & 0x0001000100010001ull) * 0xffff;
+        r ^= m & mask;
+        n >>= 1;
+        m <<= 1;
+    }
+    return r;
+}
+
+uint64_t clmul_8x4_even(uint64_t n, uint64_t m)
+{
+    n &= 0x00ff00ff00ff00ffull;
+    m &= 0x00ff00ff00ff00ffull;
+    return clmul_8x4_even_int(n, m);
+}
+
+uint64_t clmul_8x4_odd(uint64_t n, uint64_t m)
+{
+    return clmul_8x4_even(n >> 8, m >> 8);
+}
+
+static uint64_t unpack_8_to_16(uint64_t x)
+{
+    return  (x & 0x000000ff)
+         | ((x & 0x0000ff00) << 8)
+         | ((x & 0x00ff0000) << 16)
+         | ((x & 0xff000000) << 24);
+}
+
+uint64_t clmul_8x4_packed(uint32_t n, uint32_t m)
+{
+    return clmul_8x4_even_int(unpack_8_to_16(n), unpack_8_to_16(m));
+}
diff --git a/crypto/meson.build b/crypto/meson.build
index 5f03a30d34..9ac1a89802 100644
--- a/crypto/meson.build
+++ b/crypto/meson.build
@@ -48,9 +48,12 @@ if have_afalg
 endif
 crypto_ss.add(when: gnutls, if_true: files('tls-cipher-suites.c'))
 
-util_ss.add(files('sm4.c'))
-util_ss.add(files('aes.c'))
-util_ss.add(files('init.c'))
+util_ss.add(files(
+  'aes.c',
+  'clmul.c',
+  'init.c',
+  'sm4.c',
+))
 if gnutls.found()
   util_ss.add(gnutls)
 endif
-- 
2.34.1



^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [PATCH v2 02/18] target/arm: Use clmul_8* routines
  2023-08-19  1:02 [PATCH v2 00/18] crypto: Provide clmul.h and host accel Richard Henderson
  2023-08-19  1:02 ` [PATCH v2 01/18] crypto: Add generic 8-bit carry-less multiply routines Richard Henderson
@ 2023-08-19  1:02 ` Richard Henderson
  2023-08-21  7:49   ` Philippe Mathieu-Daudé
  2023-08-19  1:02 ` [PATCH v2 03/18] target/s390x: " Richard Henderson
                   ` (16 subsequent siblings)
  18 siblings, 1 reply; 34+ messages in thread
From: Richard Henderson @ 2023-08-19  1:02 UTC (permalink / raw)
  To: qemu-devel; +Cc: berrange, ardb

Use generic routines for 8-bit carry-less multiply.
Remove our local version of pmull_h.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/arm/tcg/vec_internal.h |  5 ----
 target/arm/tcg/mve_helper.c   |  8 ++----
 target/arm/tcg/vec_helper.c   | 53 ++++-------------------------------
 3 files changed, 9 insertions(+), 57 deletions(-)

diff --git a/target/arm/tcg/vec_internal.h b/target/arm/tcg/vec_internal.h
index 1f4ed80ff7..c4afba6d9f 100644
--- a/target/arm/tcg/vec_internal.h
+++ b/target/arm/tcg/vec_internal.h
@@ -219,11 +219,6 @@ int16_t do_sqrdmlah_h(int16_t, int16_t, int16_t, bool, bool, uint32_t *);
 int32_t do_sqrdmlah_s(int32_t, int32_t, int32_t, bool, bool, uint32_t *);
 int64_t do_sqrdmlah_d(int64_t, int64_t, int64_t, bool, bool);
 
-/*
- * 8 x 8 -> 16 vector polynomial multiply where the inputs are
- * in the low 8 bits of each 16-bit element
-*/
-uint64_t pmull_h(uint64_t op1, uint64_t op2);
 /*
  * 16 x 16 -> 32 vector polynomial multiply where the inputs are
  * in the low 16 bits of each 32-bit element
diff --git a/target/arm/tcg/mve_helper.c b/target/arm/tcg/mve_helper.c
index 403b345ea3..96ddfb4b3a 100644
--- a/target/arm/tcg/mve_helper.c
+++ b/target/arm/tcg/mve_helper.c
@@ -26,6 +26,7 @@
 #include "exec/exec-all.h"
 #include "tcg/tcg.h"
 #include "fpu/softfloat.h"
+#include "crypto/clmul.h"
 
 static uint16_t mve_eci_mask(CPUARMState *env)
 {
@@ -984,15 +985,12 @@ DO_2OP_L(vmulltuw, 1, 4, uint32_t, 8, uint64_t, DO_MUL)
  * Polynomial multiply. We can always do this generating 64 bits
  * of the result at a time, so we don't need to use DO_2OP_L.
  */
-#define VMULLPH_MASK 0x00ff00ff00ff00ffULL
 #define VMULLPW_MASK 0x0000ffff0000ffffULL
-#define DO_VMULLPBH(N, M) pmull_h((N) & VMULLPH_MASK, (M) & VMULLPH_MASK)
-#define DO_VMULLPTH(N, M) DO_VMULLPBH((N) >> 8, (M) >> 8)
 #define DO_VMULLPBW(N, M) pmull_w((N) & VMULLPW_MASK, (M) & VMULLPW_MASK)
 #define DO_VMULLPTW(N, M) DO_VMULLPBW((N) >> 16, (M) >> 16)
 
-DO_2OP(vmullpbh, 8, uint64_t, DO_VMULLPBH)
-DO_2OP(vmullpth, 8, uint64_t, DO_VMULLPTH)
+DO_2OP(vmullpbh, 8, uint64_t, clmul_8x4_even)
+DO_2OP(vmullpth, 8, uint64_t, clmul_8x4_odd)
 DO_2OP(vmullpbw, 8, uint64_t, DO_VMULLPBW)
 DO_2OP(vmullptw, 8, uint64_t, DO_VMULLPTW)
 
diff --git a/target/arm/tcg/vec_helper.c b/target/arm/tcg/vec_helper.c
index 6712a2c790..cd630ff905 100644
--- a/target/arm/tcg/vec_helper.c
+++ b/target/arm/tcg/vec_helper.c
@@ -23,6 +23,7 @@
 #include "tcg/tcg-gvec-desc.h"
 #include "fpu/softfloat.h"
 #include "qemu/int128.h"
+#include "crypto/clmul.h"
 #include "vec_internal.h"
 
 /*
@@ -1986,21 +1987,11 @@ void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc)
  */
 void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
 {
-    intptr_t i, j, opr_sz = simd_oprsz(desc);
+    intptr_t i, opr_sz = simd_oprsz(desc);
     uint64_t *d = vd, *n = vn, *m = vm;
 
     for (i = 0; i < opr_sz / 8; ++i) {
-        uint64_t nn = n[i];
-        uint64_t mm = m[i];
-        uint64_t rr = 0;
-
-        for (j = 0; j < 8; ++j) {
-            uint64_t mask = (nn & 0x0101010101010101ull) * 0xff;
-            rr ^= mm & mask;
-            mm = (mm << 1) & 0xfefefefefefefefeull;
-            nn >>= 1;
-        }
-        d[i] = rr;
+        d[i] = clmul_8x8_low(n[i], m[i]);
     }
     clear_tail(d, opr_sz, simd_maxsz(desc));
 }
@@ -2038,22 +2029,6 @@ void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
     clear_tail(d, opr_sz, simd_maxsz(desc));
 }
 
-/*
- * 8x8->16 polynomial multiply.
- *
- * The byte inputs are expanded to (or extracted from) half-words.
- * Note that neon and sve2 get the inputs from different positions.
- * This allows 4 bytes to be processed in parallel with uint64_t.
- */
-
-static uint64_t expand_byte_to_half(uint64_t x)
-{
-    return  (x & 0x000000ff)
-         | ((x & 0x0000ff00) << 8)
-         | ((x & 0x00ff0000) << 16)
-         | ((x & 0xff000000) << 24);
-}
-
 uint64_t pmull_w(uint64_t op1, uint64_t op2)
 {
     uint64_t result = 0;
@@ -2067,29 +2042,16 @@ uint64_t pmull_w(uint64_t op1, uint64_t op2)
     return result;
 }
 
-uint64_t pmull_h(uint64_t op1, uint64_t op2)
-{
-    uint64_t result = 0;
-    int i;
-    for (i = 0; i < 8; ++i) {
-        uint64_t mask = (op1 & 0x0001000100010001ull) * 0xffff;
-        result ^= op2 & mask;
-        op1 >>= 1;
-        op2 <<= 1;
-    }
-    return result;
-}
-
 void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
 {
     int hi = simd_data(desc);
     uint64_t *d = vd, *n = vn, *m = vm;
     uint64_t nn = n[hi], mm = m[hi];
 
-    d[0] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm));
+    d[0] = clmul_8x4_packed(nn, mm);
     nn >>= 32;
     mm >>= 32;
-    d[1] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm));
+    d[1] = clmul_8x4_packed(nn, mm);
 
     clear_tail(d, 16, simd_maxsz(desc));
 }
@@ -2102,10 +2064,7 @@ void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
     uint64_t *d = vd, *n = vn, *m = vm;
 
     for (i = 0; i < opr_sz / 8; ++i) {
-        uint64_t nn = (n[i] >> shift) & 0x00ff00ff00ff00ffull;
-        uint64_t mm = (m[i] >> shift) & 0x00ff00ff00ff00ffull;
-
-        d[i] = pmull_h(nn, mm);
+        d[i] = clmul_8x4_even(n[i] >> shift, m[i] >> shift);
     }
 }
 
-- 
2.34.1



^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [PATCH v2 03/18] target/s390x: Use clmul_8* routines
  2023-08-19  1:02 [PATCH v2 00/18] crypto: Provide clmul.h and host accel Richard Henderson
  2023-08-19  1:02 ` [PATCH v2 01/18] crypto: Add generic 8-bit carry-less multiply routines Richard Henderson
  2023-08-19  1:02 ` [PATCH v2 02/18] target/arm: Use clmul_8* routines Richard Henderson
@ 2023-08-19  1:02 ` Richard Henderson
  2023-08-21 12:45   ` Philippe Mathieu-Daudé
  2023-08-19  1:02 ` [PATCH v2 04/18] target/ppc: " Richard Henderson
                   ` (15 subsequent siblings)
  18 siblings, 1 reply; 34+ messages in thread
From: Richard Henderson @ 2023-08-19  1:02 UTC (permalink / raw)
  To: qemu-devel; +Cc: berrange, ardb

Use generic routines for 8-bit carry-less multiply.
Remove our local version of galois_multiply8.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/s390x/tcg/vec_int_helper.c | 32 ++++++++++++++++++++++++++++---
 1 file changed, 29 insertions(+), 3 deletions(-)

diff --git a/target/s390x/tcg/vec_int_helper.c b/target/s390x/tcg/vec_int_helper.c
index 53ab5c5eb3..edff4d6b2b 100644
--- a/target/s390x/tcg/vec_int_helper.c
+++ b/target/s390x/tcg/vec_int_helper.c
@@ -14,6 +14,7 @@
 #include "vec.h"
 #include "exec/helper-proto.h"
 #include "tcg/tcg-gvec-desc.h"
+#include "crypto/clmul.h"
 
 static bool s390_vec_is_zero(const S390Vector *v)
 {
@@ -179,7 +180,6 @@ static uint##TBITS##_t galois_multiply##BITS(uint##TBITS##_t a,                \
     }                                                                          \
     return res;                                                                \
 }
-DEF_GALOIS_MULTIPLY(8, 16)
 DEF_GALOIS_MULTIPLY(16, 32)
 DEF_GALOIS_MULTIPLY(32, 64)
 
@@ -203,6 +203,34 @@ static S390Vector galois_multiply64(uint64_t a, uint64_t b)
     return res;
 }
 
+/*
+ * There is no carry across the two doublewords, so their order does
+ * not matter.  Nor is there partial overlap between registers.
+ */
+static inline uint64_t do_gfma8(uint64_t n, uint64_t m, uint64_t a)
+{
+    return clmul_8x4_even(n, m) ^ clmul_8x4_odd(n, m) ^ a;
+}
+
+void HELPER(gvec_vgfm8)(void *v1, const void *v2, const void *v3, uint32_t d)
+{
+    uint64_t *q1 = v1;
+    const uint64_t *q2 = v2, *q3 = v3;
+
+    q1[0] = do_gfma8(q2[0], q3[0], 0);
+    q1[1] = do_gfma8(q2[1], q3[1], 0);
+}
+
+void HELPER(gvec_vgfma8)(void *v1, const void *v2, const void *v3,
+                         const void *v4, uint32_t desc)
+{
+    uint64_t *q1 = v1;
+    const uint64_t *q2 = v2, *q3 = v3, *q4 = v4;
+
+    q1[0] = do_gfma8(q2[0], q3[0], q4[0]);
+    q1[1] = do_gfma8(q2[1], q3[1], q4[1]);
+}
+
 #define DEF_VGFM(BITS, TBITS)                                                  \
 void HELPER(gvec_vgfm##BITS)(void *v1, const void *v2, const void *v3,         \
                              uint32_t desc)                                    \
@@ -220,7 +248,6 @@ void HELPER(gvec_vgfm##BITS)(void *v1, const void *v2, const void *v3,         \
         s390_vec_write_element##TBITS(v1, i, d);                               \
     }                                                                          \
 }
-DEF_VGFM(8, 16)
 DEF_VGFM(16, 32)
 DEF_VGFM(32, 64)
 
@@ -257,7 +284,6 @@ void HELPER(gvec_vgfma##BITS)(void *v1, const void *v2, const void *v3,        \
         s390_vec_write_element##TBITS(v1, i, d);                               \
     }                                                                          \
 }
-DEF_VGFMA(8, 16)
 DEF_VGFMA(16, 32)
 DEF_VGFMA(32, 64)
 
-- 
2.34.1



^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [PATCH v2 04/18] target/ppc: Use clmul_8* routines
  2023-08-19  1:02 [PATCH v2 00/18] crypto: Provide clmul.h and host accel Richard Henderson
                   ` (2 preceding siblings ...)
  2023-08-19  1:02 ` [PATCH v2 03/18] target/s390x: " Richard Henderson
@ 2023-08-19  1:02 ` Richard Henderson
  2023-08-19  1:02 ` [PATCH v2 05/18] crypto: Add generic 16-bit carry-less multiply routines Richard Henderson
                   ` (14 subsequent siblings)
  18 siblings, 0 replies; 34+ messages in thread
From: Richard Henderson @ 2023-08-19  1:02 UTC (permalink / raw)
  To: qemu-devel; +Cc: berrange, ardb

Use generic routines for 8-bit carry-less multiply.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/ppc/int_helper.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index 834da80fe3..343874863a 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -26,6 +26,7 @@
 #include "exec/helper-proto.h"
 #include "crypto/aes.h"
 #include "crypto/aes-round.h"
+#include "crypto/clmul.h"
 #include "fpu/softfloat.h"
 #include "qapi/error.h"
 #include "qemu/guest-random.h"
@@ -1425,6 +1426,18 @@ void helper_vbpermq(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
 #undef VBPERMQ_INDEX
 #undef VBPERMQ_DW
 
+/*
+ * There is no carry across the two doublewords, so their order does
+ * not matter.  Nor is there partial overlap between registers.
+ */
+void helper_vpmsumb(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
+{
+    for (int i = 0; i < 2; ++i) {
+        uint64_t aa = a->u64[i], bb = b->u64[i];
+        r->u64[i] = clmul_8x4_even(aa, bb) ^ clmul_8x4_odd(aa, bb);
+    }
+}
+
 #define PMSUM(name, srcfld, trgfld, trgtyp)                   \
 void helper_##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)  \
 {                                                             \
@@ -1445,7 +1458,6 @@ void helper_##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)  \
     }                                                         \
 }
 
-PMSUM(vpmsumb, u8, u16, uint16_t)
 PMSUM(vpmsumh, u16, u32, uint32_t)
 PMSUM(vpmsumw, u32, u64, uint64_t)
 
-- 
2.34.1



^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [PATCH v2 05/18] crypto: Add generic 16-bit carry-less multiply routines
  2023-08-19  1:02 [PATCH v2 00/18] crypto: Provide clmul.h and host accel Richard Henderson
                   ` (3 preceding siblings ...)
  2023-08-19  1:02 ` [PATCH v2 04/18] target/ppc: " Richard Henderson
@ 2023-08-19  1:02 ` Richard Henderson
  2023-09-10 12:22   ` Ard Biesheuvel
  2023-08-19  1:02 ` [PATCH v2 06/18] target/arm: Use clmul_16* routines Richard Henderson
                   ` (13 subsequent siblings)
  18 siblings, 1 reply; 34+ messages in thread
From: Richard Henderson @ 2023-08-19  1:02 UTC (permalink / raw)
  To: qemu-devel; +Cc: berrange, ardb

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/crypto/clmul.h | 16 ++++++++++++++++
 crypto/clmul.c         | 21 +++++++++++++++++++++
 2 files changed, 37 insertions(+)

diff --git a/include/crypto/clmul.h b/include/crypto/clmul.h
index 153b5e3057..c7ad28aa85 100644
--- a/include/crypto/clmul.h
+++ b/include/crypto/clmul.h
@@ -38,4 +38,20 @@ uint64_t clmul_8x4_odd(uint64_t, uint64_t);
  */
 uint64_t clmul_8x4_packed(uint32_t, uint32_t);
 
+/**
+ * clmul_16x2_even:
+ *
+ * Perform two 16x16->32 carry-less multiplies.
+ * The odd words of the inputs are ignored.
+ */
+uint64_t clmul_16x2_even(uint64_t, uint64_t);
+
+/**
+ * clmul_16x2_odd:
+ *
+ * Perform two 16x16->32 carry-less multiplies.
+ * The even words of the inputs are ignored.
+ */
+uint64_t clmul_16x2_odd(uint64_t, uint64_t);
+
 #endif /* CRYPTO_CLMUL_H */
diff --git a/crypto/clmul.c b/crypto/clmul.c
index 82d873fee5..2c87cfbf8a 100644
--- a/crypto/clmul.c
+++ b/crypto/clmul.c
@@ -58,3 +58,24 @@ uint64_t clmul_8x4_packed(uint32_t n, uint32_t m)
 {
     return clmul_8x4_even_int(unpack_8_to_16(n), unpack_8_to_16(m));
 }
+
+uint64_t clmul_16x2_even(uint64_t n, uint64_t m)
+{
+    uint64_t r = 0;
+
+    n &= 0x0000ffff0000ffffull;
+    m &= 0x0000ffff0000ffffull;
+
+    for (int i = 0; i < 16; ++i) {
+        uint64_t mask = (n & 0x0000000100000001ull) * 0xffffffffull;
+        r ^= m & mask;
+        n >>= 1;
+        m <<= 1;
+    }
+    return r;
+}
+
+uint64_t clmul_16x2_odd(uint64_t n, uint64_t m)
+{
+    return clmul_16x2_even(n >> 16, m >> 16);
+}
-- 
2.34.1



^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [PATCH v2 06/18] target/arm: Use clmul_16* routines
  2023-08-19  1:02 [PATCH v2 00/18] crypto: Provide clmul.h and host accel Richard Henderson
                   ` (4 preceding siblings ...)
  2023-08-19  1:02 ` [PATCH v2 05/18] crypto: Add generic 16-bit carry-less multiply routines Richard Henderson
@ 2023-08-19  1:02 ` Richard Henderson
  2023-08-21  7:51   ` Philippe Mathieu-Daudé
  2023-08-19  1:02 ` [PATCH v2 07/18] target/s390x: " Richard Henderson
                   ` (12 subsequent siblings)
  18 siblings, 1 reply; 34+ messages in thread
From: Richard Henderson @ 2023-08-19  1:02 UTC (permalink / raw)
  To: qemu-devel; +Cc: berrange, ardb

Use generic routines for 16-bit carry-less multiply.
Remove our local version of pmull_w.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/arm/tcg/vec_internal.h |  6 ------
 target/arm/tcg/mve_helper.c   |  8 ++------
 target/arm/tcg/vec_helper.c   | 13 -------------
 3 files changed, 2 insertions(+), 25 deletions(-)

diff --git a/target/arm/tcg/vec_internal.h b/target/arm/tcg/vec_internal.h
index c4afba6d9f..3ca1b94ccf 100644
--- a/target/arm/tcg/vec_internal.h
+++ b/target/arm/tcg/vec_internal.h
@@ -219,12 +219,6 @@ int16_t do_sqrdmlah_h(int16_t, int16_t, int16_t, bool, bool, uint32_t *);
 int32_t do_sqrdmlah_s(int32_t, int32_t, int32_t, bool, bool, uint32_t *);
 int64_t do_sqrdmlah_d(int64_t, int64_t, int64_t, bool, bool);
 
-/*
- * 16 x 16 -> 32 vector polynomial multiply where the inputs are
- * in the low 16 bits of each 32-bit element
- */
-uint64_t pmull_w(uint64_t op1, uint64_t op2);
-
 /**
  * bfdotadd:
  * @sum: addend
diff --git a/target/arm/tcg/mve_helper.c b/target/arm/tcg/mve_helper.c
index 96ddfb4b3a..c666a96ba1 100644
--- a/target/arm/tcg/mve_helper.c
+++ b/target/arm/tcg/mve_helper.c
@@ -985,14 +985,10 @@ DO_2OP_L(vmulltuw, 1, 4, uint32_t, 8, uint64_t, DO_MUL)
  * Polynomial multiply. We can always do this generating 64 bits
  * of the result at a time, so we don't need to use DO_2OP_L.
  */
-#define VMULLPW_MASK 0x0000ffff0000ffffULL
-#define DO_VMULLPBW(N, M) pmull_w((N) & VMULLPW_MASK, (M) & VMULLPW_MASK)
-#define DO_VMULLPTW(N, M) DO_VMULLPBW((N) >> 16, (M) >> 16)
-
 DO_2OP(vmullpbh, 8, uint64_t, clmul_8x4_even)
 DO_2OP(vmullpth, 8, uint64_t, clmul_8x4_odd)
-DO_2OP(vmullpbw, 8, uint64_t, DO_VMULLPBW)
-DO_2OP(vmullptw, 8, uint64_t, DO_VMULLPTW)
+DO_2OP(vmullpbw, 8, uint64_t, clmul_16x2_even)
+DO_2OP(vmullptw, 8, uint64_t, clmul_16x2_odd)
 
 /*
  * Because the computation type is at least twice as large as required,
diff --git a/target/arm/tcg/vec_helper.c b/target/arm/tcg/vec_helper.c
index cd630ff905..5def86b573 100644
--- a/target/arm/tcg/vec_helper.c
+++ b/target/arm/tcg/vec_helper.c
@@ -2029,19 +2029,6 @@ void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
     clear_tail(d, opr_sz, simd_maxsz(desc));
 }
 
-uint64_t pmull_w(uint64_t op1, uint64_t op2)
-{
-    uint64_t result = 0;
-    int i;
-    for (i = 0; i < 16; ++i) {
-        uint64_t mask = (op1 & 0x0000000100000001ull) * 0xffffffff;
-        result ^= op2 & mask;
-        op1 >>= 1;
-        op2 <<= 1;
-    }
-    return result;
-}
-
 void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
 {
     int hi = simd_data(desc);
-- 
2.34.1



^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [PATCH v2 07/18] target/s390x: Use clmul_16* routines
  2023-08-19  1:02 [PATCH v2 00/18] crypto: Provide clmul.h and host accel Richard Henderson
                   ` (5 preceding siblings ...)
  2023-08-19  1:02 ` [PATCH v2 06/18] target/arm: Use clmul_16* routines Richard Henderson
@ 2023-08-19  1:02 ` Richard Henderson
  2023-08-21 12:44   ` Philippe Mathieu-Daudé
  2023-08-19  1:02 ` [PATCH v2 08/18] target/ppc: " Richard Henderson
                   ` (11 subsequent siblings)
  18 siblings, 1 reply; 34+ messages in thread
From: Richard Henderson @ 2023-08-19  1:02 UTC (permalink / raw)
  To: qemu-devel; +Cc: berrange, ardb

Use generic routines for 16-bit carry-less multiply.
Remove our local version of galois_multiply16.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/s390x/tcg/vec_int_helper.c | 27 ++++++++++++++++++++++++---
 1 file changed, 24 insertions(+), 3 deletions(-)

diff --git a/target/s390x/tcg/vec_int_helper.c b/target/s390x/tcg/vec_int_helper.c
index edff4d6b2b..11477556e5 100644
--- a/target/s390x/tcg/vec_int_helper.c
+++ b/target/s390x/tcg/vec_int_helper.c
@@ -180,7 +180,6 @@ static uint##TBITS##_t galois_multiply##BITS(uint##TBITS##_t a,                \
     }                                                                          \
     return res;                                                                \
 }
-DEF_GALOIS_MULTIPLY(16, 32)
 DEF_GALOIS_MULTIPLY(32, 64)
 
 static S390Vector galois_multiply64(uint64_t a, uint64_t b)
@@ -231,6 +230,30 @@ void HELPER(gvec_vgfma8)(void *v1, const void *v2, const void *v3,
     q1[1] = do_gfma8(q2[1], q3[1], q4[1]);
 }
 
+static inline uint64_t do_gfma16(uint64_t n, uint64_t m, uint64_t a)
+{
+    return clmul_16x2_even(n, m) ^ clmul_16x2_odd(n, m) ^ a;
+}
+
+void HELPER(gvec_vgfm16)(void *v1, const void *v2, const void *v3, uint32_t d)
+{
+    uint64_t *q1 = v1;
+    const uint64_t *q2 = v2, *q3 = v3;
+
+    q1[0] = do_gfma16(q2[0], q3[0], 0);
+    q1[1] = do_gfma16(q2[1], q3[1], 0);
+}
+
+void HELPER(gvec_vgfma16)(void *v1, const void *v2, const void *v3,
+                         const void *v4, uint32_t d)
+{
+    uint64_t *q1 = v1;
+    const uint64_t *q2 = v2, *q3 = v3, *q4 = v4;
+
+    q1[0] = do_gfma16(q2[0], q3[0], q4[0]);
+    q1[1] = do_gfma16(q2[1], q3[1], q4[1]);
+}
+
 #define DEF_VGFM(BITS, TBITS)                                                  \
 void HELPER(gvec_vgfm##BITS)(void *v1, const void *v2, const void *v3,         \
                              uint32_t desc)                                    \
@@ -248,7 +271,6 @@ void HELPER(gvec_vgfm##BITS)(void *v1, const void *v2, const void *v3,         \
         s390_vec_write_element##TBITS(v1, i, d);                               \
     }                                                                          \
 }
-DEF_VGFM(16, 32)
 DEF_VGFM(32, 64)
 
 void HELPER(gvec_vgfm64)(void *v1, const void *v2, const void *v3,
@@ -284,7 +306,6 @@ void HELPER(gvec_vgfma##BITS)(void *v1, const void *v2, const void *v3,        \
         s390_vec_write_element##TBITS(v1, i, d);                               \
     }                                                                          \
 }
-DEF_VGFMA(16, 32)
 DEF_VGFMA(32, 64)
 
 void HELPER(gvec_vgfma64)(void *v1, const void *v2, const void *v3,
-- 
2.34.1



^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [PATCH v2 08/18] target/ppc: Use clmul_16* routines
  2023-08-19  1:02 [PATCH v2 00/18] crypto: Provide clmul.h and host accel Richard Henderson
                   ` (6 preceding siblings ...)
  2023-08-19  1:02 ` [PATCH v2 07/18] target/s390x: " Richard Henderson
@ 2023-08-19  1:02 ` Richard Henderson
  2023-08-19  1:02 ` [PATCH v2 09/18] crypto: Add generic 32-bit carry-less multiply routines Richard Henderson
                   ` (10 subsequent siblings)
  18 siblings, 0 replies; 34+ messages in thread
From: Richard Henderson @ 2023-08-19  1:02 UTC (permalink / raw)
  To: qemu-devel; +Cc: berrange, ardb

Use generic routines for 16-bit carry-less multiply.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/ppc/int_helper.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index 343874863a..10e19d8c9b 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -1438,6 +1438,14 @@ void helper_vpmsumb(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
     }
 }
 
+void helper_vpmsumh(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
+{
+    for (int i = 0; i < 2; ++i) {
+        uint64_t aa = a->u64[i], bb = b->u64[i];
+        r->u64[i] = clmul_16x2_even(aa, bb) ^ clmul_16x2_odd(aa, bb);
+    }
+}
+
 #define PMSUM(name, srcfld, trgfld, trgtyp)                   \
 void helper_##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)  \
 {                                                             \
@@ -1458,7 +1466,6 @@ void helper_##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)  \
     }                                                         \
 }
 
-PMSUM(vpmsumh, u16, u32, uint32_t)
 PMSUM(vpmsumw, u32, u64, uint64_t)
 
 void helper_VPMSUMD(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
-- 
2.34.1



^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [PATCH v2 09/18] crypto: Add generic 32-bit carry-less multiply routines
  2023-08-19  1:02 [PATCH v2 00/18] crypto: Provide clmul.h and host accel Richard Henderson
                   ` (7 preceding siblings ...)
  2023-08-19  1:02 ` [PATCH v2 08/18] target/ppc: " Richard Henderson
@ 2023-08-19  1:02 ` Richard Henderson
  2023-09-10 12:23   ` Ard Biesheuvel
  2023-08-19  1:02 ` [PATCH v2 10/18] target/arm: Use clmul_32* routines Richard Henderson
                   ` (9 subsequent siblings)
  18 siblings, 1 reply; 34+ messages in thread
From: Richard Henderson @ 2023-08-19  1:02 UTC (permalink / raw)
  To: qemu-devel; +Cc: berrange, ardb

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/crypto/clmul.h |  7 +++++++
 crypto/clmul.c         | 13 +++++++++++++
 2 files changed, 20 insertions(+)

diff --git a/include/crypto/clmul.h b/include/crypto/clmul.h
index c7ad28aa85..0ea25a252c 100644
--- a/include/crypto/clmul.h
+++ b/include/crypto/clmul.h
@@ -54,4 +54,11 @@ uint64_t clmul_16x2_even(uint64_t, uint64_t);
  */
 uint64_t clmul_16x2_odd(uint64_t, uint64_t);
 
+/**
+ * clmul_32:
+ *
+ * Perform a 32x32->64 carry-less multiply.
+ */
+uint64_t clmul_32(uint32_t, uint32_t);
+
 #endif /* CRYPTO_CLMUL_H */
diff --git a/crypto/clmul.c b/crypto/clmul.c
index 2c87cfbf8a..36ada1be9d 100644
--- a/crypto/clmul.c
+++ b/crypto/clmul.c
@@ -79,3 +79,16 @@ uint64_t clmul_16x2_odd(uint64_t n, uint64_t m)
 {
     return clmul_16x2_even(n >> 16, m >> 16);
 }
+
+uint64_t clmul_32(uint32_t n, uint32_t m32)
+{
+    uint64_t r = 0;
+    uint64_t m = m32;
+
+    for (int i = 0; i < 32; ++i) {
+        r ^= n & 1 ? m : 0;
+        n >>= 1;
+        m <<= 1;
+    }
+    return r;
+}
-- 
2.34.1



^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [PATCH v2 10/18] target/arm: Use clmul_32* routines
  2023-08-19  1:02 [PATCH v2 00/18] crypto: Provide clmul.h and host accel Richard Henderson
                   ` (8 preceding siblings ...)
  2023-08-19  1:02 ` [PATCH v2 09/18] crypto: Add generic 32-bit carry-less multiply routines Richard Henderson
@ 2023-08-19  1:02 ` Richard Henderson
  2023-08-21  7:53   ` Philippe Mathieu-Daudé
  2023-08-19  1:02 ` [PATCH v2 11/18] target/s390x: " Richard Henderson
                   ` (8 subsequent siblings)
  18 siblings, 1 reply; 34+ messages in thread
From: Richard Henderson @ 2023-08-19  1:02 UTC (permalink / raw)
  To: qemu-devel; +Cc: berrange, ardb

Use generic routines for 32-bit carry-less multiply.
Remove our local version of pmull_d.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/arm/tcg/vec_helper.c | 14 +-------------
 1 file changed, 1 insertion(+), 13 deletions(-)

diff --git a/target/arm/tcg/vec_helper.c b/target/arm/tcg/vec_helper.c
index 5def86b573..ffb4b44ce4 100644
--- a/target/arm/tcg/vec_helper.c
+++ b/target/arm/tcg/vec_helper.c
@@ -2055,18 +2055,6 @@ void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
     }
 }
 
-static uint64_t pmull_d(uint64_t op1, uint64_t op2)
-{
-    uint64_t result = 0;
-    int i;
-
-    for (i = 0; i < 32; ++i) {
-        uint64_t mask = -((op1 >> i) & 1);
-        result ^= (op2 << i) & mask;
-    }
-    return result;
-}
-
 void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc)
 {
     intptr_t sel = H4(simd_data(desc));
@@ -2075,7 +2063,7 @@ void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc)
     uint64_t *d = vd;
 
     for (i = 0; i < opr_sz / 8; ++i) {
-        d[i] = pmull_d(n[2 * i + sel], m[2 * i + sel]);
+        d[i] = clmul_32(n[2 * i + sel], m[2 * i + sel]);
     }
 }
 #endif
-- 
2.34.1



^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [PATCH v2 11/18] target/s390x: Use clmul_32* routines
  2023-08-19  1:02 [PATCH v2 00/18] crypto: Provide clmul.h and host accel Richard Henderson
                   ` (9 preceding siblings ...)
  2023-08-19  1:02 ` [PATCH v2 10/18] target/arm: Use clmul_32* routines Richard Henderson
@ 2023-08-19  1:02 ` Richard Henderson
  2023-08-21 12:39   ` Philippe Mathieu-Daudé
  2023-08-19  1:02 ` [PATCH v2 12/18] target/ppc: " Richard Henderson
                   ` (7 subsequent siblings)
  18 siblings, 1 reply; 34+ messages in thread
From: Richard Henderson @ 2023-08-19  1:02 UTC (permalink / raw)
  To: qemu-devel; +Cc: berrange, ardb

Use generic routines for 32-bit carry-less multiply.
Remove our local version of galois_multiply32.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/s390x/tcg/vec_int_helper.c | 75 +++++++++----------------------
 1 file changed, 22 insertions(+), 53 deletions(-)

diff --git a/target/s390x/tcg/vec_int_helper.c b/target/s390x/tcg/vec_int_helper.c
index 11477556e5..ba284b5379 100644
--- a/target/s390x/tcg/vec_int_helper.c
+++ b/target/s390x/tcg/vec_int_helper.c
@@ -165,22 +165,6 @@ DEF_VCTZ(8)
 DEF_VCTZ(16)
 
 /* like binary multiplication, but XOR instead of addition */
-#define DEF_GALOIS_MULTIPLY(BITS, TBITS)                                       \
-static uint##TBITS##_t galois_multiply##BITS(uint##TBITS##_t a,                \
-                                             uint##TBITS##_t b)                \
-{                                                                              \
-    uint##TBITS##_t res = 0;                                                   \
-                                                                               \
-    while (b) {                                                                \
-        if (b & 0x1) {                                                         \
-            res = res ^ a;                                                     \
-        }                                                                      \
-        a = a << 1;                                                            \
-        b = b >> 1;                                                            \
-    }                                                                          \
-    return res;                                                                \
-}
-DEF_GALOIS_MULTIPLY(32, 64)
 
 static S390Vector galois_multiply64(uint64_t a, uint64_t b)
 {
@@ -254,24 +238,29 @@ void HELPER(gvec_vgfma16)(void *v1, const void *v2, const void *v3,
     q1[1] = do_gfma16(q2[1], q3[1], q4[1]);
 }
 
-#define DEF_VGFM(BITS, TBITS)                                                  \
-void HELPER(gvec_vgfm##BITS)(void *v1, const void *v2, const void *v3,         \
-                             uint32_t desc)                                    \
-{                                                                              \
-    int i;                                                                     \
-                                                                               \
-    for (i = 0; i < (128 / TBITS); i++) {                                      \
-        uint##BITS##_t a = s390_vec_read_element##BITS(v2, i * 2);             \
-        uint##BITS##_t b = s390_vec_read_element##BITS(v3, i * 2);             \
-        uint##TBITS##_t d = galois_multiply##BITS(a, b);                       \
-                                                                               \
-        a = s390_vec_read_element##BITS(v2, i * 2 + 1);                        \
-        b = s390_vec_read_element##BITS(v3, i * 2 + 1);                        \
-        d = d ^ galois_multiply32(a, b);                                       \
-        s390_vec_write_element##TBITS(v1, i, d);                               \
-    }                                                                          \
+static inline uint64_t do_gfma32(uint64_t n, uint64_t m, uint64_t a)
+{
+    return clmul_32(n, m) ^ clmul_32(n >> 32, m >> 32) ^ a;
+}
+
+void HELPER(gvec_vgfm32)(void *v1, const void *v2, const void *v3, uint32_t d)
+{
+    uint64_t *q1 = v1;
+    const uint64_t *q2 = v2, *q3 = v3;
+
+    q1[0] = do_gfma32(q2[0], q3[0], 0);
+    q1[1] = do_gfma32(q2[1], q3[1], 0);
+}
+
+void HELPER(gvec_vgfma32)(void *v1, const void *v2, const void *v3,
+                         const void *v4, uint32_t d)
+{
+    uint64_t *q1 = v1;
+    const uint64_t *q2 = v2, *q3 = v3, *q4 = v4;
+
+    q1[0] = do_gfma32(q2[0], q3[0], q4[0]);
+    q1[1] = do_gfma32(q2[1], q3[1], q4[1]);
 }
-DEF_VGFM(32, 64)
 
 void HELPER(gvec_vgfm64)(void *v1, const void *v2, const void *v3,
                          uint32_t desc)
@@ -288,26 +277,6 @@ void HELPER(gvec_vgfm64)(void *v1, const void *v2, const void *v3,
     s390_vec_xor(v1, &tmp1, &tmp2);
 }
 
-#define DEF_VGFMA(BITS, TBITS)                                                 \
-void HELPER(gvec_vgfma##BITS)(void *v1, const void *v2, const void *v3,        \
-                              const void *v4, uint32_t desc)                   \
-{                                                                              \
-    int i;                                                                     \
-                                                                               \
-    for (i = 0; i < (128 / TBITS); i++) {                                      \
-        uint##BITS##_t a = s390_vec_read_element##BITS(v2, i * 2);             \
-        uint##BITS##_t b = s390_vec_read_element##BITS(v3, i * 2);             \
-        uint##TBITS##_t d = galois_multiply##BITS(a, b);                       \
-                                                                               \
-        a = s390_vec_read_element##BITS(v2, i * 2 + 1);                        \
-        b = s390_vec_read_element##BITS(v3, i * 2 + 1);                        \
-        d = d ^ galois_multiply32(a, b);                                       \
-        d = d ^ s390_vec_read_element##TBITS(v4, i);                           \
-        s390_vec_write_element##TBITS(v1, i, d);                               \
-    }                                                                          \
-}
-DEF_VGFMA(32, 64)
-
 void HELPER(gvec_vgfma64)(void *v1, const void *v2, const void *v3,
                           const void *v4, uint32_t desc)
 {
-- 
2.34.1



^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [PATCH v2 12/18] target/ppc: Use clmul_32* routines
  2023-08-19  1:02 [PATCH v2 00/18] crypto: Provide clmul.h and host accel Richard Henderson
                   ` (10 preceding siblings ...)
  2023-08-19  1:02 ` [PATCH v2 11/18] target/s390x: " Richard Henderson
@ 2023-08-19  1:02 ` Richard Henderson
  2023-08-19  1:02 ` [PATCH v2 13/18] crypto: Add generic 64-bit carry-less multiply routine Richard Henderson
                   ` (6 subsequent siblings)
  18 siblings, 0 replies; 34+ messages in thread
From: Richard Henderson @ 2023-08-19  1:02 UTC (permalink / raw)
  To: qemu-devel; +Cc: berrange, ardb

Use generic routines for 32-bit carry-less multiply.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/ppc/int_helper.c | 26 ++++++--------------------
 1 file changed, 6 insertions(+), 20 deletions(-)

diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index 10e19d8c9b..ce793cf163 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -1446,28 +1446,14 @@ void helper_vpmsumh(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
     }
 }
 
-#define PMSUM(name, srcfld, trgfld, trgtyp)                   \
-void helper_##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)  \
-{                                                             \
-    int i, j;                                                 \
-    trgtyp prod[sizeof(ppc_avr_t) / sizeof(a->srcfld[0])];    \
-                                                              \
-    VECTOR_FOR_INORDER_I(i, srcfld) {                         \
-        prod[i] = 0;                                          \
-        for (j = 0; j < sizeof(a->srcfld[0]) * 8; j++) {      \
-            if (a->srcfld[i] & (1ull << j)) {                 \
-                prod[i] ^= ((trgtyp)b->srcfld[i] << j);       \
-            }                                                 \
-        }                                                     \
-    }                                                         \
-                                                              \
-    VECTOR_FOR_INORDER_I(i, trgfld) {                         \
-        r->trgfld[i] = prod[2 * i] ^ prod[2 * i + 1];         \
-    }                                                         \
+void helper_vpmsumw(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
+{
+    for (int i = 0; i < 2; ++i) {
+        uint64_t aa = a->u64[i], bb = b->u64[i];
+        r->u64[i] = clmul_32(aa, bb) ^ clmul_32(aa >> 32, bb >> 32);
+    }
 }
 
-PMSUM(vpmsumw, u32, u64, uint64_t)
-
 void helper_VPMSUMD(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
 {
     int i, j;
-- 
2.34.1



^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [PATCH v2 13/18] crypto: Add generic 64-bit carry-less multiply routine
  2023-08-19  1:02 [PATCH v2 00/18] crypto: Provide clmul.h and host accel Richard Henderson
                   ` (11 preceding siblings ...)
  2023-08-19  1:02 ` [PATCH v2 12/18] target/ppc: " Richard Henderson
@ 2023-08-19  1:02 ` Richard Henderson
  2023-08-19  1:02 ` [PATCH v2 14/18] target/arm: Use clmul_64 Richard Henderson
                   ` (5 subsequent siblings)
  18 siblings, 0 replies; 34+ messages in thread
From: Richard Henderson @ 2023-08-19  1:02 UTC (permalink / raw)
  To: qemu-devel; +Cc: berrange, ardb

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 host/include/generic/host/crypto/clmul.h | 15 +++++++++++++++
 include/crypto/clmul.h                   | 19 +++++++++++++++++++
 crypto/clmul.c                           | 18 ++++++++++++++++++
 3 files changed, 52 insertions(+)
 create mode 100644 host/include/generic/host/crypto/clmul.h

diff --git a/host/include/generic/host/crypto/clmul.h b/host/include/generic/host/crypto/clmul.h
new file mode 100644
index 0000000000..915bfb88d3
--- /dev/null
+++ b/host/include/generic/host/crypto/clmul.h
@@ -0,0 +1,15 @@
+/*
+ * No host specific carry-less multiply acceleration.
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#ifndef GENERIC_HOST_CRYPTO_CLMUL_H
+#define GENERIC_HOST_CRYPTO_CLMUL_H
+
+#define HAVE_CLMUL_ACCEL  false
+#define ATTR_CLMUL_ACCEL
+
+Int128 clmul_64_accel(uint64_t, uint64_t)
+    QEMU_ERROR("unsupported accel");
+
+#endif /* GENERIC_HOST_CRYPTO_CLMUL_H */
diff --git a/include/crypto/clmul.h b/include/crypto/clmul.h
index 0ea25a252c..c82d2d7559 100644
--- a/include/crypto/clmul.h
+++ b/include/crypto/clmul.h
@@ -8,6 +8,9 @@
 #ifndef CRYPTO_CLMUL_H
 #define CRYPTO_CLMUL_H
 
+#include "qemu/int128.h"
+#include "host/crypto/clmul.h"
+
 /**
  * clmul_8x8_low:
  *
@@ -61,4 +64,20 @@ uint64_t clmul_16x2_odd(uint64_t, uint64_t);
  */
 uint64_t clmul_32(uint32_t, uint32_t);
 
+/**
+ * clmul_64:
+ *
+ * Perform a 64x64->128 carry-less multiply.
+ */
+Int128 clmul_64_gen(uint64_t, uint64_t);
+
+static inline Int128 clmul_64(uint64_t a, uint64_t b)
+{
+    if (HAVE_CLMUL_ACCEL) {
+        return clmul_64_accel(a, b);
+    } else {
+        return clmul_64_gen(a, b);
+    }
+}
+
 #endif /* CRYPTO_CLMUL_H */
diff --git a/crypto/clmul.c b/crypto/clmul.c
index 36ada1be9d..abf79cc49a 100644
--- a/crypto/clmul.c
+++ b/crypto/clmul.c
@@ -92,3 +92,21 @@ uint64_t clmul_32(uint32_t n, uint32_t m32)
     }
     return r;
 }
+
+Int128 clmul_64_gen(uint64_t n, uint64_t m)
+{
+    uint64_t rl = 0, rh = 0;
+
+    /* Bit 0 can only influence the low 64-bit result.  */
+    if (n & 1) {
+        rl = m;
+    }
+
+    for (int i = 1; i < 64; ++i) {
+        /* Bit i of n adds (m << i), split across the low and high words. */
+        uint64_t mask = -((n >> i) & 1);
+        rl ^= (m << i) & mask;
+        rh ^= (m >> (64 - i)) & mask;
+    }
+    return int128_make128(rl, rh);
+}
-- 
2.34.1



^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [PATCH v2 14/18] target/arm: Use clmul_64
  2023-08-19  1:02 [PATCH v2 00/18] crypto: Provide clmul.h and host accel Richard Henderson
                   ` (12 preceding siblings ...)
  2023-08-19  1:02 ` [PATCH v2 13/18] crypto: Add generic 64-bit carry-less multiply routine Richard Henderson
@ 2023-08-19  1:02 ` Richard Henderson
  2023-08-21 12:27   ` Philippe Mathieu-Daudé
  2023-08-19  1:02 ` [PATCH v2 15/18] target/s390x: " Richard Henderson
                   ` (4 subsequent siblings)
  18 siblings, 1 reply; 34+ messages in thread
From: Richard Henderson @ 2023-08-19  1:02 UTC (permalink / raw)
  To: qemu-devel; +Cc: berrange, ardb

Use generic routine for 64-bit carry-less multiply.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/arm/tcg/vec_helper.c | 22 ++++------------------
 1 file changed, 4 insertions(+), 18 deletions(-)

diff --git a/target/arm/tcg/vec_helper.c b/target/arm/tcg/vec_helper.c
index ffb4b44ce4..1f93510b85 100644
--- a/target/arm/tcg/vec_helper.c
+++ b/target/arm/tcg/vec_helper.c
@@ -2003,28 +2003,14 @@ void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
  */
 void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
 {
-    intptr_t i, j, opr_sz = simd_oprsz(desc);
+    intptr_t i, opr_sz = simd_oprsz(desc);
     intptr_t hi = simd_data(desc);
     uint64_t *d = vd, *n = vn, *m = vm;
 
     for (i = 0; i < opr_sz / 8; i += 2) {
-        uint64_t nn = n[i + hi];
-        uint64_t mm = m[i + hi];
-        uint64_t rhi = 0;
-        uint64_t rlo = 0;
-
-        /* Bit 0 can only influence the low 64-bit result.  */
-        if (nn & 1) {
-            rlo = mm;
-        }
-
-        for (j = 1; j < 64; ++j) {
-            uint64_t mask = -((nn >> j) & 1);
-            rlo ^= (mm << j) & mask;
-            rhi ^= (mm >> (64 - j)) & mask;
-        }
-        d[i] = rlo;
-        d[i + 1] = rhi;
+        Int128 r = clmul_64(n[i + hi], m[i + hi]);
+        d[i] = int128_getlo(r);
+        d[i + 1] = int128_gethi(r);
     }
     clear_tail(d, opr_sz, simd_maxsz(desc));
 }
-- 
2.34.1



^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [PATCH v2 15/18] target/s390x: Use clmul_64
  2023-08-19  1:02 [PATCH v2 00/18] crypto: Provide clmul.h and host accel Richard Henderson
                   ` (13 preceding siblings ...)
  2023-08-19  1:02 ` [PATCH v2 14/18] target/arm: Use clmul_64 Richard Henderson
@ 2023-08-19  1:02 ` Richard Henderson
  2023-08-21 12:33   ` Philippe Mathieu-Daudé
  2023-08-19  1:02 ` [PATCH v2 16/18] target/ppc: " Richard Henderson
                   ` (3 subsequent siblings)
  18 siblings, 1 reply; 34+ messages in thread
From: Richard Henderson @ 2023-08-19  1:02 UTC (permalink / raw)
  To: qemu-devel; +Cc: berrange, ardb

Use the generic routine for 64-bit carry-less multiply.
Remove our local version of galois_multiply64.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/s390x/tcg/vec_int_helper.c | 58 +++++++------------------------
 1 file changed, 12 insertions(+), 46 deletions(-)

diff --git a/target/s390x/tcg/vec_int_helper.c b/target/s390x/tcg/vec_int_helper.c
index ba284b5379..b18d8a6d16 100644
--- a/target/s390x/tcg/vec_int_helper.c
+++ b/target/s390x/tcg/vec_int_helper.c
@@ -21,13 +21,6 @@ static bool s390_vec_is_zero(const S390Vector *v)
     return !v->doubleword[0] && !v->doubleword[1];
 }
 
-static void s390_vec_xor(S390Vector *res, const S390Vector *a,
-                         const S390Vector *b)
-{
-    res->doubleword[0] = a->doubleword[0] ^ b->doubleword[0];
-    res->doubleword[1] = a->doubleword[1] ^ b->doubleword[1];
-}
-
 static void s390_vec_and(S390Vector *res, const S390Vector *a,
                          const S390Vector *b)
 {
@@ -166,26 +159,6 @@ DEF_VCTZ(16)
 
 /* like binary multiplication, but XOR instead of addition */
 
-static S390Vector galois_multiply64(uint64_t a, uint64_t b)
-{
-    S390Vector res = {};
-    S390Vector va = {
-        .doubleword[1] = a,
-    };
-    S390Vector vb = {
-        .doubleword[1] = b,
-    };
-
-    while (!s390_vec_is_zero(&vb)) {
-        if (vb.doubleword[1] & 0x1) {
-            s390_vec_xor(&res, &res, &va);
-        }
-        s390_vec_shl(&va, &va, 1);
-        s390_vec_shr(&vb, &vb, 1);
-    }
-    return res;
-}
-
 /*
  * There is no carry across the two doublewords, so their order does
  * not matter.  Nor is there partial overlap between registers.
@@ -265,32 +238,25 @@ void HELPER(gvec_vgfma32)(void *v1, const void *v2, const void *v3,
 void HELPER(gvec_vgfm64)(void *v1, const void *v2, const void *v3,
                          uint32_t desc)
 {
-    S390Vector tmp1, tmp2;
-    uint64_t a, b;
+    uint64_t *q1 = v1;
+    const uint64_t *q2 = v2, *q3 = v3;
+    Int128 r;
 
-    a = s390_vec_read_element64(v2, 0);
-    b = s390_vec_read_element64(v3, 0);
-    tmp1 = galois_multiply64(a, b);
-    a = s390_vec_read_element64(v2, 1);
-    b = s390_vec_read_element64(v3, 1);
-    tmp2 = galois_multiply64(a, b);
-    s390_vec_xor(v1, &tmp1, &tmp2);
+    r = int128_xor(clmul_64(q2[0], q3[0]), clmul_64(q2[1], q3[1]));
+    q1[0] = int128_gethi(r);
+    q1[1] = int128_getlo(r);
 }
 
 void HELPER(gvec_vgfma64)(void *v1, const void *v2, const void *v3,
                           const void *v4, uint32_t desc)
 {
-    S390Vector tmp1, tmp2;
-    uint64_t a, b;
+    uint64_t *q1 = v1;
+    const uint64_t *q2 = v2, *q3 = v3, *q4 = v4;
+    Int128 r;
 
-    a = s390_vec_read_element64(v2, 0);
-    b = s390_vec_read_element64(v3, 0);
-    tmp1 = galois_multiply64(a, b);
-    a = s390_vec_read_element64(v2, 1);
-    b = s390_vec_read_element64(v3, 1);
-    tmp2 = galois_multiply64(a, b);
-    s390_vec_xor(&tmp1, &tmp1, &tmp2);
-    s390_vec_xor(v1, &tmp1, v4);
+    r = int128_xor(clmul_64(q2[0], q3[0]), clmul_64(q2[1], q3[1]));
+    q1[0] = q4[0] ^ int128_gethi(r);
+    q1[1] = q4[1] ^ int128_getlo(r);
 }
 
 #define DEF_VMAL(BITS)                                                         \
-- 
2.34.1



^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [PATCH v2 16/18] target/ppc: Use clmul_64
  2023-08-19  1:02 [PATCH v2 00/18] crypto: Provide clmul.h and host accel Richard Henderson
                   ` (14 preceding siblings ...)
  2023-08-19  1:02 ` [PATCH v2 15/18] target/s390x: " Richard Henderson
@ 2023-08-19  1:02 ` Richard Henderson
  2023-08-21 12:34   ` Philippe Mathieu-Daudé
  2023-08-19  1:02 ` [PATCH v2 17/18] host/include/i386: Implement clmul.h Richard Henderson
                   ` (2 subsequent siblings)
  18 siblings, 1 reply; 34+ messages in thread
From: Richard Henderson @ 2023-08-19  1:02 UTC (permalink / raw)
  To: qemu-devel; +Cc: berrange, ardb

Use generic routine for 64-bit carry-less multiply.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/ppc/int_helper.c | 17 +++--------------
 1 file changed, 3 insertions(+), 14 deletions(-)

diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index ce793cf163..432834c7d5 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -1456,20 +1456,9 @@ void helper_vpmsumw(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
 
 void helper_VPMSUMD(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
 {
-    int i, j;
-    Int128 tmp, prod[2] = {int128_zero(), int128_zero()};
-
-    for (j = 0; j < 64; j++) {
-        for (i = 0; i < ARRAY_SIZE(r->u64); i++) {
-            if (a->VsrD(i) & (1ull << j)) {
-                tmp = int128_make64(b->VsrD(i));
-                tmp = int128_lshift(tmp, j);
-                prod[i] = int128_xor(prod[i], tmp);
-            }
-        }
-    }
-
-    r->s128 = int128_xor(prod[0], prod[1]);
+    Int128 e = clmul_64(a->u64[0], b->u64[0]);
+    Int128 o = clmul_64(a->u64[1], b->u64[1]);
+    r->s128 = int128_xor(e, o);
 }
 
 #if HOST_BIG_ENDIAN
-- 
2.34.1



^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [PATCH v2 17/18] host/include/i386: Implement clmul.h
  2023-08-19  1:02 [PATCH v2 00/18] crypto: Provide clmul.h and host accel Richard Henderson
                   ` (15 preceding siblings ...)
  2023-08-19  1:02 ` [PATCH v2 16/18] target/ppc: " Richard Henderson
@ 2023-08-19  1:02 ` Richard Henderson
  2023-08-19  1:02 ` [PATCH v2 18/18] host/include/aarch64: " Richard Henderson
  2023-08-21 14:57 ` [PATCH v2 00/18] crypto: Provide clmul.h and host accel Ard Biesheuvel
  18 siblings, 0 replies; 34+ messages in thread
From: Richard Henderson @ 2023-08-19  1:02 UTC (permalink / raw)
  To: qemu-devel; +Cc: berrange, ardb

Detect PCLMUL in cpuinfo; implement the accel hook.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 host/include/i386/host/cpuinfo.h        |  1 +
 host/include/i386/host/crypto/clmul.h   | 29 +++++++++++++++++++++++++
 host/include/x86_64/host/crypto/clmul.h |  1 +
 include/qemu/cpuid.h                    |  3 +++
 util/cpuinfo-i386.c                     |  1 +
 5 files changed, 35 insertions(+)
 create mode 100644 host/include/i386/host/crypto/clmul.h
 create mode 100644 host/include/x86_64/host/crypto/clmul.h

diff --git a/host/include/i386/host/cpuinfo.h b/host/include/i386/host/cpuinfo.h
index 073d0a426f..7ae21568f7 100644
--- a/host/include/i386/host/cpuinfo.h
+++ b/host/include/i386/host/cpuinfo.h
@@ -27,6 +27,7 @@
 #define CPUINFO_ATOMIC_VMOVDQA  (1u << 16)
 #define CPUINFO_ATOMIC_VMOVDQU  (1u << 17)
 #define CPUINFO_AES             (1u << 18)
+#define CPUINFO_PCLMUL          (1u << 19)
 
 /* Initialized with a constructor. */
 extern unsigned cpuinfo;
diff --git a/host/include/i386/host/crypto/clmul.h b/host/include/i386/host/crypto/clmul.h
new file mode 100644
index 0000000000..dc3c814797
--- /dev/null
+++ b/host/include/i386/host/crypto/clmul.h
@@ -0,0 +1,29 @@
+/*
+ * x86 specific clmul acceleration.
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#ifndef X86_HOST_CRYPTO_CLMUL_H
+#define X86_HOST_CRYPTO_CLMUL_H
+
+#include "host/cpuinfo.h"
+#include <immintrin.h>
+
+#if defined(__PCLMUL__)
+# define HAVE_CLMUL_ACCEL  true
+# define ATTR_CLMUL_ACCEL
+#else
+# define HAVE_CLMUL_ACCEL  likely(cpuinfo & CPUINFO_PCLMUL)
+# define ATTR_CLMUL_ACCEL  __attribute__((target("pclmul")))
+#endif
+
+static inline Int128 ATTR_CLMUL_ACCEL
+clmul_64_accel(uint64_t n, uint64_t m)
+{
+    union { __m128i v; Int128 s; } u;
+
+    u.v = _mm_clmulepi64_si128(_mm_set_epi64x(0, n), _mm_set_epi64x(0, m), 0);
+    return u.s;
+}
+
+#endif /* X86_HOST_CRYPTO_CLMUL_H */
diff --git a/host/include/x86_64/host/crypto/clmul.h b/host/include/x86_64/host/crypto/clmul.h
new file mode 100644
index 0000000000..f25eced416
--- /dev/null
+++ b/host/include/x86_64/host/crypto/clmul.h
@@ -0,0 +1 @@
+#include "host/include/i386/host/crypto/clmul.h"
diff --git a/include/qemu/cpuid.h b/include/qemu/cpuid.h
index 35325f1995..b11161555b 100644
--- a/include/qemu/cpuid.h
+++ b/include/qemu/cpuid.h
@@ -25,6 +25,9 @@
 #endif
 
 /* Leaf 1, %ecx */
+#ifndef bit_PCLMUL
+#define bit_PCLMUL      (1 << 1)
+#endif
 #ifndef bit_SSE4_1
 #define bit_SSE4_1      (1 << 19)
 #endif
diff --git a/util/cpuinfo-i386.c b/util/cpuinfo-i386.c
index 3a7b7e0ad1..36783fd199 100644
--- a/util/cpuinfo-i386.c
+++ b/util/cpuinfo-i386.c
@@ -39,6 +39,7 @@ unsigned __attribute__((constructor)) cpuinfo_init(void)
         info |= (c & bit_SSE4_1 ? CPUINFO_SSE4 : 0);
         info |= (c & bit_MOVBE ? CPUINFO_MOVBE : 0);
         info |= (c & bit_POPCNT ? CPUINFO_POPCNT : 0);
+        info |= (c & bit_PCLMUL ? CPUINFO_PCLMUL : 0);
 
         /* Our AES support requires PSHUFB as well. */
         info |= ((c & bit_AES) && (c & bit_SSSE3) ? CPUINFO_AES : 0);
-- 
2.34.1



^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [PATCH v2 18/18] host/include/aarch64: Implement clmul.h
  2023-08-19  1:02 [PATCH v2 00/18] crypto: Provide clmul.h and host accel Richard Henderson
                   ` (16 preceding siblings ...)
  2023-08-19  1:02 ` [PATCH v2 17/18] host/include/i386: Implement clmul.h Richard Henderson
@ 2023-08-19  1:02 ` Richard Henderson
  2023-08-21 14:57 ` [PATCH v2 00/18] crypto: Provide clmul.h and host accel Ard Biesheuvel
  18 siblings, 0 replies; 34+ messages in thread
From: Richard Henderson @ 2023-08-19  1:02 UTC (permalink / raw)
  To: qemu-devel; +Cc: berrange, ardb

Detect PMULL in cpuinfo; implement the accel hook.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 host/include/aarch64/host/cpuinfo.h      |  1 +
 host/include/aarch64/host/crypto/clmul.h | 41 ++++++++++++++++++++++++
 util/cpuinfo-aarch64.c                   |  4 ++-
 3 files changed, 45 insertions(+), 1 deletion(-)
 create mode 100644 host/include/aarch64/host/crypto/clmul.h

diff --git a/host/include/aarch64/host/cpuinfo.h b/host/include/aarch64/host/cpuinfo.h
index 769626b098..fe8c3b3fd1 100644
--- a/host/include/aarch64/host/cpuinfo.h
+++ b/host/include/aarch64/host/cpuinfo.h
@@ -10,6 +10,7 @@
 #define CPUINFO_LSE             (1u << 1)
 #define CPUINFO_LSE2            (1u << 2)
 #define CPUINFO_AES             (1u << 3)
+#define CPUINFO_PMULL           (1u << 4)
 
 /* Initialized with a constructor. */
 extern unsigned cpuinfo;
diff --git a/host/include/aarch64/host/crypto/clmul.h b/host/include/aarch64/host/crypto/clmul.h
new file mode 100644
index 0000000000..bb516d8b2f
--- /dev/null
+++ b/host/include/aarch64/host/crypto/clmul.h
@@ -0,0 +1,41 @@
+/*
+ * AArch64 specific clmul acceleration.
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#ifndef AARCH64_HOST_CRYPTO_CLMUL_H
+#define AARCH64_HOST_CRYPTO_CLMUL_H
+
+#include "host/cpuinfo.h"
+#include <arm_neon.h>
+
+/*
+ * 64x64->128 pmull is available with FEAT_PMULL.
+ * Both FEAT_AES and FEAT_PMULL are covered under the same macro.
+ */
+#ifdef __ARM_FEATURE_AES
+# define HAVE_CLMUL_ACCEL  true
+#else
+# define HAVE_CLMUL_ACCEL  likely(cpuinfo & CPUINFO_PMULL)
+#endif
+#if !defined(__ARM_FEATURE_AES) && defined(CONFIG_ARM_AES_BUILTIN)
+# define ATTR_CLMUL_ACCEL  __attribute__((target("+crypto")))
+#else
+# define ATTR_CLMUL_ACCEL
+#endif
+
+static inline Int128 ATTR_CLMUL_ACCEL
+clmul_64_accel(uint64_t n, uint64_t m)
+{
+    union { poly128_t v; Int128 s; } u;
+
+#ifdef CONFIG_ARM_AES_BUILTIN
+    u.v = vmull_p64((poly64_t)n, (poly64_t)m);
+#else
+    asm(".arch_extension aes\n\t"
+        "pmull %0.1q, %1.1d, %2.1d" : "=w"(u.v) : "w"(n), "w"(m));
+#endif
+    return u.s;
+}
+
+#endif /* AARCH64_HOST_CRYPTO_CLMUL_H */
diff --git a/util/cpuinfo-aarch64.c b/util/cpuinfo-aarch64.c
index ababc39550..1d565b8420 100644
--- a/util/cpuinfo-aarch64.c
+++ b/util/cpuinfo-aarch64.c
@@ -56,12 +56,14 @@ unsigned __attribute__((constructor)) cpuinfo_init(void)
     unsigned long hwcap = qemu_getauxval(AT_HWCAP);
     info |= (hwcap & HWCAP_ATOMICS ? CPUINFO_LSE : 0);
     info |= (hwcap & HWCAP_USCAT ? CPUINFO_LSE2 : 0);
-    info |= (hwcap & HWCAP_AES ? CPUINFO_AES: 0);
+    info |= (hwcap & HWCAP_AES ? CPUINFO_AES : 0);
+    info |= (hwcap & HWCAP_PMULL ? CPUINFO_PMULL : 0);
 #endif
 #ifdef CONFIG_DARWIN
     info |= sysctl_for_bool("hw.optional.arm.FEAT_LSE") * CPUINFO_LSE;
     info |= sysctl_for_bool("hw.optional.arm.FEAT_LSE2") * CPUINFO_LSE2;
     info |= sysctl_for_bool("hw.optional.arm.FEAT_AES") * CPUINFO_AES;
+    info |= sysctl_for_bool("hw.optional.arm.FEAT_PMULL") * CPUINFO_PMULL;
 #endif
 
     cpuinfo = info;
-- 
2.34.1



^ permalink raw reply related	[flat|nested] 34+ messages in thread

* Re: [PATCH v2 02/18] target/arm: Use clmul_8* routines
  2023-08-19  1:02 ` [PATCH v2 02/18] target/arm: Use clmul_8* routines Richard Henderson
@ 2023-08-21  7:49   ` Philippe Mathieu-Daudé
  0 siblings, 0 replies; 34+ messages in thread
From: Philippe Mathieu-Daudé @ 2023-08-21  7:49 UTC (permalink / raw)
  To: Richard Henderson, qemu-devel; +Cc: berrange, ardb

On 19/8/23 03:02, Richard Henderson wrote:
> Use generic routines for 8-bit carry-less multiply.
> Remove our local version of pmull_h.
> 
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>   target/arm/tcg/vec_internal.h |  5 ----
>   target/arm/tcg/mve_helper.c   |  8 ++----
>   target/arm/tcg/vec_helper.c   | 53 ++++-------------------------------
>   3 files changed, 9 insertions(+), 57 deletions(-)

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>



^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v2 06/18] target/arm: Use clmul_16* routines
  2023-08-19  1:02 ` [PATCH v2 06/18] target/arm: Use clmul_16* routines Richard Henderson
@ 2023-08-21  7:51   ` Philippe Mathieu-Daudé
  0 siblings, 0 replies; 34+ messages in thread
From: Philippe Mathieu-Daudé @ 2023-08-21  7:51 UTC (permalink / raw)
  To: Richard Henderson, qemu-devel; +Cc: berrange, ardb

On 19/8/23 03:02, Richard Henderson wrote:
> Use generic routines for 16-bit carry-less multiply.
> Remove our local version of pmull_w.
> 
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>   target/arm/tcg/vec_internal.h |  6 ------
>   target/arm/tcg/mve_helper.c   |  8 ++------
>   target/arm/tcg/vec_helper.c   | 13 -------------
>   3 files changed, 2 insertions(+), 25 deletions(-)

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>



^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v2 10/18] target/arm: Use clmul_32* routines
  2023-08-19  1:02 ` [PATCH v2 10/18] target/arm: Use clmul_32* routines Richard Henderson
@ 2023-08-21  7:53   ` Philippe Mathieu-Daudé
  0 siblings, 0 replies; 34+ messages in thread
From: Philippe Mathieu-Daudé @ 2023-08-21  7:53 UTC (permalink / raw)
  To: Richard Henderson, qemu-devel; +Cc: berrange, ardb

On 19/8/23 03:02, Richard Henderson wrote:
> Use generic routines for 32-bit carry-less multiply.
> Remove our local version of pmull_d.
> 
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>   target/arm/tcg/vec_helper.c | 14 +-------------
>   1 file changed, 1 insertion(+), 13 deletions(-)

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>



^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v2 14/18] target/arm: Use clmul_64
  2023-08-19  1:02 ` [PATCH v2 14/18] target/arm: Use clmul_64 Richard Henderson
@ 2023-08-21 12:27   ` Philippe Mathieu-Daudé
  0 siblings, 0 replies; 34+ messages in thread
From: Philippe Mathieu-Daudé @ 2023-08-21 12:27 UTC (permalink / raw)
  To: Richard Henderson, qemu-devel; +Cc: berrange, ardb

On 19/8/23 03:02, Richard Henderson wrote:
> Use generic routine for 64-bit carry-less multiply.
> 
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>   target/arm/tcg/vec_helper.c | 22 ++++------------------
>   1 file changed, 4 insertions(+), 18 deletions(-)

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>



^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v2 15/18] target/s390x: Use clmul_64
  2023-08-19  1:02 ` [PATCH v2 15/18] target/s390x: " Richard Henderson
@ 2023-08-21 12:33   ` Philippe Mathieu-Daudé
  0 siblings, 0 replies; 34+ messages in thread
From: Philippe Mathieu-Daudé @ 2023-08-21 12:33 UTC (permalink / raw)
  To: Richard Henderson, qemu-devel; +Cc: berrange, ardb

On 19/8/23 03:02, Richard Henderson wrote:
> Use the generic routine for 64-bit carry-less multiply.
> Remove our local version of galois_multiply64.
> 
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>   target/s390x/tcg/vec_int_helper.c | 58 +++++++------------------------
>   1 file changed, 12 insertions(+), 46 deletions(-)

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>



^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v2 16/18] target/ppc: Use clmul_64
  2023-08-19  1:02 ` [PATCH v2 16/18] target/ppc: " Richard Henderson
@ 2023-08-21 12:34   ` Philippe Mathieu-Daudé
  0 siblings, 0 replies; 34+ messages in thread
From: Philippe Mathieu-Daudé @ 2023-08-21 12:34 UTC (permalink / raw)
  To: Richard Henderson, qemu-devel; +Cc: berrange, ardb

On 19/8/23 03:02, Richard Henderson wrote:
> Use generic routine for 64-bit carry-less multiply.
> 
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>   target/ppc/int_helper.c | 17 +++--------------
>   1 file changed, 3 insertions(+), 14 deletions(-)

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>



^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v2 11/18] target/s390x: Use clmul_32* routines
  2023-08-19  1:02 ` [PATCH v2 11/18] target/s390x: " Richard Henderson
@ 2023-08-21 12:39   ` Philippe Mathieu-Daudé
  0 siblings, 0 replies; 34+ messages in thread
From: Philippe Mathieu-Daudé @ 2023-08-21 12:39 UTC (permalink / raw)
  To: Richard Henderson, qemu-devel; +Cc: berrange, ardb

On 19/8/23 03:02, Richard Henderson wrote:
> Use generic routines for 32-bit carry-less multiply.
> Remove our local version of galois_multiply32.
> 
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>   target/s390x/tcg/vec_int_helper.c | 75 +++++++++----------------------
>   1 file changed, 22 insertions(+), 53 deletions(-)

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>



^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v2 07/18] target/s390x: Use clmul_16* routines
  2023-08-19  1:02 ` [PATCH v2 07/18] target/s390x: " Richard Henderson
@ 2023-08-21 12:44   ` Philippe Mathieu-Daudé
  0 siblings, 0 replies; 34+ messages in thread
From: Philippe Mathieu-Daudé @ 2023-08-21 12:44 UTC (permalink / raw)
  To: Richard Henderson, qemu-devel; +Cc: berrange, ardb

On 19/8/23 03:02, Richard Henderson wrote:
> Use generic routines for 16-bit carry-less multiply.
> Remove our local version of galois_multiply16.
> 
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>   target/s390x/tcg/vec_int_helper.c | 27 ++++++++++++++++++++++++---
>   1 file changed, 24 insertions(+), 3 deletions(-)

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>



^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v2 03/18] target/s390x: Use clmul_8* routines
  2023-08-19  1:02 ` [PATCH v2 03/18] target/s390x: " Richard Henderson
@ 2023-08-21 12:45   ` Philippe Mathieu-Daudé
  0 siblings, 0 replies; 34+ messages in thread
From: Philippe Mathieu-Daudé @ 2023-08-21 12:45 UTC (permalink / raw)
  To: Richard Henderson, qemu-devel; +Cc: berrange, ardb

On 19/8/23 03:02, Richard Henderson wrote:
> Use generic routines for 8-bit carry-less multiply.
> Remove our local version of galois_multiply8.
> 
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>   target/s390x/tcg/vec_int_helper.c | 32 ++++++++++++++++++++++++++++---
>   1 file changed, 29 insertions(+), 3 deletions(-)

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>



^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v2 00/18] crypto: Provide clmul.h and host accel
  2023-08-19  1:02 [PATCH v2 00/18] crypto: Provide clmul.h and host accel Richard Henderson
                   ` (17 preceding siblings ...)
  2023-08-19  1:02 ` [PATCH v2 18/18] host/include/aarch64: " Richard Henderson
@ 2023-08-21 14:57 ` Ard Biesheuvel
  2023-08-21 15:14   ` Richard Henderson
  18 siblings, 1 reply; 34+ messages in thread
From: Ard Biesheuvel @ 2023-08-21 14:57 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel, berrange

On Sat, 19 Aug 2023 at 03:02, Richard Henderson
<richard.henderson@linaro.org> wrote:
>
> Inspired by Ard Biesheuvel's RFC patches [1] for accelerating
> carry-less multiply under emulation.
>
> Changes for v2:
>   * Only accelerate clmul_64; keep generic helpers for other sizes.
>   * Drop most of the Int128 interfaces, except for clmul_64.
>   * Use the same acceleration format as aes-round.h.
>
>
> r~
>
>
> [1] https://patchew.org/QEMU/20230601123332.3297404-1-ardb@kernel.org/
>
> Richard Henderson (18):
>   crypto: Add generic 8-bit carry-less multiply routines
>   target/arm: Use clmul_8* routines
>   target/s390x: Use clmul_8* routines
>   target/ppc: Use clmul_8* routines
>   crypto: Add generic 16-bit carry-less multiply routines
>   target/arm: Use clmul_16* routines
>   target/s390x: Use clmul_16* routines
>   target/ppc: Use clmul_16* routines
>   crypto: Add generic 32-bit carry-less multiply routines
>   target/arm: Use clmul_32* routines
>   target/s390x: Use clmul_32* routines
>   target/ppc: Use clmul_32* routines
>   crypto: Add generic 64-bit carry-less multiply routine
>   target/arm: Use clmul_64
>   target/s390x: Use clmul_64
>   target/ppc: Use clmul_64
>   host/include/i386: Implement clmul.h
>   host/include/aarch64: Implement clmul.h
>

I didn't re-run the OpenSSL benchmark, but the x86 Linux kernel still
passes all its crypto selftests when running under TCG emulation on a
TX2 arm64 host, so

Tested-by: Ard Biesheuvel <ardb@kernel.org>

for the series.

Thanks,
Ard.


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v2 00/18] crypto: Provide clmul.h and host accel
  2023-08-21 14:57 ` [PATCH v2 00/18] crypto: Provide clmul.h and host accel Ard Biesheuvel
@ 2023-08-21 15:14   ` Richard Henderson
  2023-08-21 15:44     ` Ard Biesheuvel
  0 siblings, 1 reply; 34+ messages in thread
From: Richard Henderson @ 2023-08-21 15:14 UTC (permalink / raw)
  To: Ard Biesheuvel; +Cc: qemu-devel, berrange

On 8/21/23 07:57, Ard Biesheuvel wrote:
>> Richard Henderson (18):
>>    crypto: Add generic 8-bit carry-less multiply routines
>>    target/arm: Use clmul_8* routines
>>    target/s390x: Use clmul_8* routines
>>    target/ppc: Use clmul_8* routines
>>    crypto: Add generic 16-bit carry-less multiply routines
>>    target/arm: Use clmul_16* routines
>>    target/s390x: Use clmul_16* routines
>>    target/ppc: Use clmul_16* routines
>>    crypto: Add generic 32-bit carry-less multiply routines
>>    target/arm: Use clmul_32* routines
>>    target/s390x: Use clmul_32* routines
>>    target/ppc: Use clmul_32* routines
>>    crypto: Add generic 64-bit carry-less multiply routine
>>    target/arm: Use clmul_64
>>    target/s390x: Use clmul_64
>>    target/ppc: Use clmul_64
>>    host/include/i386: Implement clmul.h
>>    host/include/aarch64: Implement clmul.h
>>
> 
> I didn't re-run the OpenSSL benchmark, but the x86 Linux kernel still
> passes all its crypto selftests when running under TCG emulation on a
> TX2 arm64 host, so
> 
> Tested-by: Ard Biesheuvel <ardb@kernel.org>

Oh, whoops.  What's missing here?  Any target/i386 changes.


r~



^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v2 00/18] crypto: Provide clmul.h and host accel
  2023-08-21 15:14   ` Richard Henderson
@ 2023-08-21 15:44     ` Ard Biesheuvel
  0 siblings, 0 replies; 34+ messages in thread
From: Ard Biesheuvel @ 2023-08-21 15:44 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel, berrange

On Mon, 21 Aug 2023 at 17:15, Richard Henderson
<richard.henderson@linaro.org> wrote:
>
> On 8/21/23 07:57, Ard Biesheuvel wrote:
> >> Richard Henderson (18):
> >>    crypto: Add generic 8-bit carry-less multiply routines
> >>    target/arm: Use clmul_8* routines
> >>    target/s390x: Use clmul_8* routines
> >>    target/ppc: Use clmul_8* routines
> >>    crypto: Add generic 16-bit carry-less multiply routines
> >>    target/arm: Use clmul_16* routines
> >>    target/s390x: Use clmul_16* routines
> >>    target/ppc: Use clmul_16* routines
> >>    crypto: Add generic 32-bit carry-less multiply routines
> >>    target/arm: Use clmul_32* routines
> >>    target/s390x: Use clmul_32* routines
> >>    target/ppc: Use clmul_32* routines
> >>    crypto: Add generic 64-bit carry-less multiply routine
> >>    target/arm: Use clmul_64
> >>    target/s390x: Use clmul_64
> >>    target/ppc: Use clmul_64
> >>    host/include/i386: Implement clmul.h
> >>    host/include/aarch64: Implement clmul.h
> >>
> >
> > I didn't re-run the OpenSSL benchmark, but the x86 Linux kernel still
> > passes all its crypto selftests when running under TCG emulation on a
> > TX2 arm64 host, so
> >
> > Tested-by: Ard Biesheuvel <ardb@kernel.org>
>
> Oh, whoops.  What's missing here?  Any target/i386 changes.
>

Ah yes - I hadn't spotted that. The below seems to do the trick.

--- a/target/i386/ops_sse.h
+++ b/target/i386/ops_sse.h
@@ -2156,7 +2156,10 @@ void glue(helper_pclmulqdq, SUFFIX)(CPUX86State
*env, Reg *d, Reg *v, Reg *s,
     for (i = 0; i < 1 << SHIFT; i += 2) {
         a = v->Q(((ctrl & 1) != 0) + i);
         b = s->Q(((ctrl & 16) != 0) + i);
-        clmulq(&d->Q(i), &d->Q(i + 1), a, b);
+
+        Int128 r = clmul_64(a, b);
+        d->Q(i) = int128_getlo(r);
+        d->Q(i + 1) = int128_gethi(r);
     }
 }

[and the #include added and clmulq() dropped]

I did a quick RFC4106 benchmark with tcrypt (which doesn't speed up as
much as OpenSSL but it is a bit of a hassle cross-rebuilding that)

no acceleration:

tcrypt: test 7 (160 bit key, 8192 byte blocks): 1547 operations in 1
seconds (12673024 bytes)

AES only:

tcrypt: test 7 (160 bit key, 8192 byte blocks): 1679 operations in 1
seconds (13754368 bytes)

AES and PMULL:

tcrypt: test 7 (160 bit key, 8192 byte blocks): 3298 operations in 1
seconds (27017216 bytes)


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v2 01/18] crypto: Add generic 8-bit carry-less multiply routines
  2023-08-19  1:02 ` [PATCH v2 01/18] crypto: Add generic 8-bit carry-less multiply routines Richard Henderson
@ 2023-09-10 12:19   ` Ard Biesheuvel
  0 siblings, 0 replies; 34+ messages in thread
From: Ard Biesheuvel @ 2023-09-10 12:19 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel, berrange

On Sat, 19 Aug 2023 at 03:02, Richard Henderson
<richard.henderson@linaro.org> wrote:
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

Reviewed-by: Ard Biesheuvel <ardb@kernel.org>

> ---
>  include/crypto/clmul.h | 41 +++++++++++++++++++++++++++++
>  crypto/clmul.c         | 60 ++++++++++++++++++++++++++++++++++++++++++
>  crypto/meson.build     |  9 ++++---
>  3 files changed, 107 insertions(+), 3 deletions(-)
>  create mode 100644 include/crypto/clmul.h
>  create mode 100644 crypto/clmul.c
>
> diff --git a/include/crypto/clmul.h b/include/crypto/clmul.h
> new file mode 100644
> index 0000000000..153b5e3057
> --- /dev/null
> +++ b/include/crypto/clmul.h
> @@ -0,0 +1,41 @@
> +/*
> + * Carry-less multiply operations.
> + * SPDX-License-Identifier: GPL-2.0-or-later
> + *
> + * Copyright (C) 2023 Linaro, Ltd.
> + */
> +
> +#ifndef CRYPTO_CLMUL_H
> +#define CRYPTO_CLMUL_H
> +
> +/**
> + * clmul_8x8_low:
> + *
> + * Perform eight 8x8->8 carry-less multiplies.
> + */
> +uint64_t clmul_8x8_low(uint64_t, uint64_t);
> +
> +/**
> + * clmul_8x4_even:
> + *
> + * Perform four 8x8->16 carry-less multiplies.
> + * The odd bytes of the inputs are ignored.
> + */
> +uint64_t clmul_8x4_even(uint64_t, uint64_t);
> +
> +/**
> + * clmul_8x4_odd:
> + *
> + * Perform four 8x8->16 carry-less multiplies.
> + * The even bytes of the inputs are ignored.
> + */
> +uint64_t clmul_8x4_odd(uint64_t, uint64_t);
> +
> +/**
> + * clmul_8x4_packed:
> + *
> + * Perform four 8x8->16 carry-less multiplies.
> + */
> +uint64_t clmul_8x4_packed(uint32_t, uint32_t);
> +
> +#endif /* CRYPTO_CLMUL_H */
> diff --git a/crypto/clmul.c b/crypto/clmul.c
> new file mode 100644
> index 0000000000..82d873fee5
> --- /dev/null
> +++ b/crypto/clmul.c
> @@ -0,0 +1,60 @@
> +/*
> + * Carry-less multiply operations.
> + * SPDX-License-Identifier: GPL-2.0-or-later
> + *
> + * Copyright (C) 2023 Linaro, Ltd.
> + */
> +
> +#include "qemu/osdep.h"
> +#include "crypto/clmul.h"
> +
> +uint64_t clmul_8x8_low(uint64_t n, uint64_t m)
> +{
> +    uint64_t r = 0;
> +
> +    for (int i = 0; i < 8; ++i) {
> +        uint64_t mask = (n & 0x0101010101010101ull) * 0xff;
> +        r ^= m & mask;
> +        m = (m << 1) & 0xfefefefefefefefeull;
> +        n >>= 1;
> +    }
> +    return r;
> +}
> +
> +static uint64_t clmul_8x4_even_int(uint64_t n, uint64_t m)
> +{
> +    uint64_t r = 0;
> +
> +    for (int i = 0; i < 8; ++i) {
> +        uint64_t mask = (n & 0x0001000100010001ull) * 0xffff;
> +        r ^= m & mask;
> +        n >>= 1;
> +        m <<= 1;
> +    }
> +    return r;
> +}
> +
> +uint64_t clmul_8x4_even(uint64_t n, uint64_t m)
> +{
> +    n &= 0x00ff00ff00ff00ffull;
> +    m &= 0x00ff00ff00ff00ffull;
> +    return clmul_8x4_even_int(n, m);
> +}
> +
> +uint64_t clmul_8x4_odd(uint64_t n, uint64_t m)
> +{
> +    return clmul_8x4_even(n >> 8, m >> 8);
> +}
> +
> +static uint64_t unpack_8_to_16(uint64_t x)
> +{
> +    return  (x & 0x000000ff)
> +         | ((x & 0x0000ff00) << 8)
> +         | ((x & 0x00ff0000) << 16)
> +         | ((x & 0xff000000) << 24);
> +}
> +
> +uint64_t clmul_8x4_packed(uint32_t n, uint32_t m)
> +{
> +    return clmul_8x4_even_int(unpack_8_to_16(n), unpack_8_to_16(m));
> +}
> diff --git a/crypto/meson.build b/crypto/meson.build
> index 5f03a30d34..9ac1a89802 100644
> --- a/crypto/meson.build
> +++ b/crypto/meson.build
> @@ -48,9 +48,12 @@ if have_afalg
>  endif
>  crypto_ss.add(when: gnutls, if_true: files('tls-cipher-suites.c'))
>
> -util_ss.add(files('sm4.c'))
> -util_ss.add(files('aes.c'))
> -util_ss.add(files('init.c'))
> +util_ss.add(files(
> +  'aes.c',
> +  'clmul.c',
> +  'init.c',
> +  'sm4.c',
> +))
>  if gnutls.found()
>    util_ss.add(gnutls)
>  endif
> --
> 2.34.1
>


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v2 05/18] crypto: Add generic 16-bit carry-less multiply routines
  2023-08-19  1:02 ` [PATCH v2 05/18] crypto: Add generic 16-bit carry-less multiply routines Richard Henderson
@ 2023-09-10 12:22   ` Ard Biesheuvel
  0 siblings, 0 replies; 34+ messages in thread
From: Ard Biesheuvel @ 2023-09-10 12:22 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel, berrange

On Sat, 19 Aug 2023 at 03:02, Richard Henderson
<richard.henderson@linaro.org> wrote:
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

Reviewed-by: Ard Biesheuvel <ardb@kernel.org>

> ---
>  include/crypto/clmul.h | 16 ++++++++++++++++
>  crypto/clmul.c         | 21 +++++++++++++++++++++
>  2 files changed, 37 insertions(+)
>
> diff --git a/include/crypto/clmul.h b/include/crypto/clmul.h
> index 153b5e3057..c7ad28aa85 100644
> --- a/include/crypto/clmul.h
> +++ b/include/crypto/clmul.h
> @@ -38,4 +38,20 @@ uint64_t clmul_8x4_odd(uint64_t, uint64_t);
>   */
>  uint64_t clmul_8x4_packed(uint32_t, uint32_t);
>
> +/**
> + * clmul_16x2_even:
> + *
> + * Perform two 16x16->32 carry-less multiplies.
> + * The odd words of the inputs are ignored.
> + */
> +uint64_t clmul_16x2_even(uint64_t, uint64_t);
> +
> +/**
> + * clmul_16x2_odd:
> + *
> + * Perform two 16x16->32 carry-less multiplies.
> + * The even words of the inputs are ignored.
> + */
> +uint64_t clmul_16x2_odd(uint64_t, uint64_t);
> +
>  #endif /* CRYPTO_CLMUL_H */
> diff --git a/crypto/clmul.c b/crypto/clmul.c
> index 82d873fee5..2c87cfbf8a 100644
> --- a/crypto/clmul.c
> +++ b/crypto/clmul.c
> @@ -58,3 +58,24 @@ uint64_t clmul_8x4_packed(uint32_t n, uint32_t m)
>  {
>      return clmul_8x4_even_int(unpack_8_to_16(n), unpack_8_to_16(m));
>  }
> +
> +uint64_t clmul_16x2_even(uint64_t n, uint64_t m)
> +{
> +    uint64_t r = 0;
> +
> +    n &= 0x0000ffff0000ffffull;
> +    m &= 0x0000ffff0000ffffull;
> +
> +    for (int i = 0; i < 16; ++i) {
> +        uint64_t mask = (n & 0x0000000100000001ull) * 0xffffffffull;
> +        r ^= m & mask;
> +        n >>= 1;
> +        m <<= 1;
> +    }
> +    return r;
> +}
> +
> +uint64_t clmul_16x2_odd(uint64_t n, uint64_t m)
> +{
> +    return clmul_16x2_even(n >> 16, m >> 16);
> +}
> --
> 2.34.1
>


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v2 09/18] crypto: Add generic 32-bit carry-less multiply routines
  2023-08-19  1:02 ` [PATCH v2 09/18] crypto: Add generic 32-bit carry-less multiply routines Richard Henderson
@ 2023-09-10 12:23   ` Ard Biesheuvel
  0 siblings, 0 replies; 34+ messages in thread
From: Ard Biesheuvel @ 2023-09-10 12:23 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel, berrange

On Sat, 19 Aug 2023 at 03:02, Richard Henderson
<richard.henderson@linaro.org> wrote:
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

Reviewed-by: Ard Biesheuvel <ardb@kernel.org>

> ---
>  include/crypto/clmul.h |  7 +++++++
>  crypto/clmul.c         | 13 +++++++++++++
>  2 files changed, 20 insertions(+)
>
> diff --git a/include/crypto/clmul.h b/include/crypto/clmul.h
> index c7ad28aa85..0ea25a252c 100644
> --- a/include/crypto/clmul.h
> +++ b/include/crypto/clmul.h
> @@ -54,4 +54,11 @@ uint64_t clmul_16x2_even(uint64_t, uint64_t);
>   */
>  uint64_t clmul_16x2_odd(uint64_t, uint64_t);
>
> +/**
> + * clmul_32:
> + *
> + * Perform a 32x32->64 carry-less multiply.
> + */
> +uint64_t clmul_32(uint32_t, uint32_t);
> +
>  #endif /* CRYPTO_CLMUL_H */
> diff --git a/crypto/clmul.c b/crypto/clmul.c
> index 2c87cfbf8a..36ada1be9d 100644
> --- a/crypto/clmul.c
> +++ b/crypto/clmul.c
> @@ -79,3 +79,16 @@ uint64_t clmul_16x2_odd(uint64_t n, uint64_t m)
>  {
>      return clmul_16x2_even(n >> 16, m >> 16);
>  }
> +
> +uint64_t clmul_32(uint32_t n, uint32_t m32)
> +{
> +    uint64_t r = 0;
> +    uint64_t m = m32;
> +
> +    for (int i = 0; i < 32; ++i) {
> +        r ^= n & 1 ? m : 0;
> +        n >>= 1;
> +        m <<= 1;
> +    }
> +    return r;
> +}
> --
> 2.34.1
>


^ permalink raw reply	[flat|nested] 34+ messages in thread

end of thread, other threads:[~2023-09-10 12:23 UTC | newest]

Thread overview: 34+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-08-19  1:02 [PATCH v2 00/18] crypto: Provide clmul.h and host accel Richard Henderson
2023-08-19  1:02 ` [PATCH v2 01/18] crypto: Add generic 8-bit carry-less multiply routines Richard Henderson
2023-09-10 12:19   ` Ard Biesheuvel
2023-08-19  1:02 ` [PATCH v2 02/18] target/arm: Use clmul_8* routines Richard Henderson
2023-08-21  7:49   ` Philippe Mathieu-Daudé
2023-08-19  1:02 ` [PATCH v2 03/18] target/s390x: " Richard Henderson
2023-08-21 12:45   ` Philippe Mathieu-Daudé
2023-08-19  1:02 ` [PATCH v2 04/18] target/ppc: " Richard Henderson
2023-08-19  1:02 ` [PATCH v2 05/18] crypto: Add generic 16-bit carry-less multiply routines Richard Henderson
2023-09-10 12:22   ` Ard Biesheuvel
2023-08-19  1:02 ` [PATCH v2 06/18] target/arm: Use clmul_16* routines Richard Henderson
2023-08-21  7:51   ` Philippe Mathieu-Daudé
2023-08-19  1:02 ` [PATCH v2 07/18] target/s390x: " Richard Henderson
2023-08-21 12:44   ` Philippe Mathieu-Daudé
2023-08-19  1:02 ` [PATCH v2 08/18] target/ppc: " Richard Henderson
2023-08-19  1:02 ` [PATCH v2 09/18] crypto: Add generic 32-bit carry-less multiply routines Richard Henderson
2023-09-10 12:23   ` Ard Biesheuvel
2023-08-19  1:02 ` [PATCH v2 10/18] target/arm: Use clmul_32* routines Richard Henderson
2023-08-21  7:53   ` Philippe Mathieu-Daudé
2023-08-19  1:02 ` [PATCH v2 11/18] target/s390x: " Richard Henderson
2023-08-21 12:39   ` Philippe Mathieu-Daudé
2023-08-19  1:02 ` [PATCH v2 12/18] target/ppc: " Richard Henderson
2023-08-19  1:02 ` [PATCH v2 13/18] crypto: Add generic 64-bit carry-less multiply routine Richard Henderson
2023-08-19  1:02 ` [PATCH v2 14/18] target/arm: Use clmul_64 Richard Henderson
2023-08-21 12:27   ` Philippe Mathieu-Daudé
2023-08-19  1:02 ` [PATCH v2 15/18] target/s390x: " Richard Henderson
2023-08-21 12:33   ` Philippe Mathieu-Daudé
2023-08-19  1:02 ` [PATCH v2 16/18] target/ppc: " Richard Henderson
2023-08-21 12:34   ` Philippe Mathieu-Daudé
2023-08-19  1:02 ` [PATCH v2 17/18] host/include/i386: Implement clmul.h Richard Henderson
2023-08-19  1:02 ` [PATCH v2 18/18] host/include/aarch64: " Richard Henderson
2023-08-21 14:57 ` [PATCH v2 00/18] crypto: Provide clmul.h and host accel Ard Biesheuvel
2023-08-21 15:14   ` Richard Henderson
2023-08-21 15:44     ` Ard Biesheuvel

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).