[PULL 41/52] target/arm: Convert PMULL.8 to gvec

qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed

From: Peter Maydell <peter.maydell@linaro.org>
To: qemu-devel@nongnu.org
Subject: [PULL 41/52] target/arm: Convert PMULL.8 to gvec
Date: Fri, 21 Feb 2020 13:07:29 +0000	[thread overview]
Message-ID: <20200221130740.7583-42-peter.maydell@linaro.org> (raw)
In-Reply-To: <20200221130740.7583-1-peter.maydell@linaro.org>

From: Richard Henderson <richard.henderson@linaro.org>

We still need two different helpers, since NEON and SVE2 get the
inputs from different locations within the source vector.  However,
we can convert both to the same internal form for computation.

The sve2 helper is not used yet, but adding it with this patch
helps illustrate why the neon changes are helpful.

Tested-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20200216214232.4230-5-richard.henderson@linaro.org
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
---
 target/arm/helper-sve.h    |  2 ++
 target/arm/helper.h        |  3 +-
 target/arm/neon_helper.c   | 32 --------------------
 target/arm/translate-a64.c | 27 +++++++++++------
 target/arm/translate.c     | 26 ++++++++---------
 target/arm/vec_helper.c    | 60 ++++++++++++++++++++++++++++++++++++++
 6 files changed, 95 insertions(+), 55 deletions(-)

diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index 9e79182ab46..2f472791558 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -1574,3 +1574,5 @@ DEF_HELPER_FLAGS_6(sve_stdd_le_zd, TCG_CALL_NO_WG,
                    void, env, ptr, ptr, ptr, tl, i32)
 DEF_HELPER_FLAGS_6(sve_stdd_be_zd, TCG_CALL_NO_WG,
                    void, env, ptr, ptr, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_4(sve2_pmull_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
diff --git a/target/arm/helper.h b/target/arm/helper.h
index 4352fae3dbf..fcbf5041213 100644
--- a/target/arm/helper.h
+++ b/target/arm/helper.h
@@ -342,7 +342,6 @@ DEF_HELPER_2(neon_sub_u8, i32, i32, i32)
 DEF_HELPER_2(neon_sub_u16, i32, i32, i32)
 DEF_HELPER_2(neon_mul_u8, i32, i32, i32)
 DEF_HELPER_2(neon_mul_u16, i32, i32, i32)
-DEF_HELPER_2(neon_mull_p8, i64, i32, i32)
 
 DEF_HELPER_2(neon_tst_u8, i32, i32, i32)
 DEF_HELPER_2(neon_tst_u16, i32, i32, i32)
@@ -695,6 +694,8 @@ DEF_HELPER_FLAGS_4(gvec_ushl_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(gvec_pmul_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(gvec_pmull_q, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 
+DEF_HELPER_FLAGS_4(neon_pmull_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
 #ifdef TARGET_AARCH64
 #include "helper-a64.h"
 #include "helper-sve.h"
diff --git a/target/arm/neon_helper.c b/target/arm/neon_helper.c
index 6a107da0e11..c7a8438b42a 100644
--- a/target/arm/neon_helper.c
+++ b/target/arm/neon_helper.c
@@ -1129,38 +1129,6 @@ NEON_VOP(mul_u8, neon_u8, 4)
 NEON_VOP(mul_u16, neon_u16, 2)
 #undef NEON_FN
 
-/* Polynomial multiplication is like integer multiplication except the
-   partial products are XORed, not added.  */
-uint64_t HELPER(neon_mull_p8)(uint32_t op1, uint32_t op2)
-{
-    uint64_t result = 0;
-    uint64_t mask;
-    uint64_t op2ex = op2;
-    op2ex = (op2ex & 0xff) |
-        ((op2ex & 0xff00) << 8) |
-        ((op2ex & 0xff0000) << 16) |
-        ((op2ex & 0xff000000) << 24);
-    while (op1) {
-        mask = 0;
-        if (op1 & 1) {
-            mask |= 0xffff;
-        }
-        if (op1 & (1 << 8)) {
-            mask |= (0xffffU << 16);
-        }
-        if (op1 & (1 << 16)) {
-            mask |= (0xffffULL << 32);
-        }
-        if (op1 & (1 << 24)) {
-            mask |= (0xffffULL << 48);
-        }
-        result ^= op2ex & mask;
-        op1 = (op1 >> 1) & 0x7f7f7f7f;
-        op2ex <<= 1;
-    }
-    return result;
-}
-
 #define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0
 NEON_VOP(tst_u8, neon_u8, 4)
 NEON_VOP(tst_u16, neon_u16, 2)
diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
index 03ce879497d..596bf4cf734 100644
--- a/target/arm/translate-a64.c
+++ b/target/arm/translate-a64.c
@@ -10542,10 +10542,6 @@ static void handle_3rd_widening(DisasContext *s, int is_q, int is_u, int size,
                 gen_helper_neon_addl_saturate_s32(tcg_passres, cpu_env,
                                                   tcg_passres, tcg_passres);
                 break;
-            case 14: /* PMULL */
-                assert(size == 0);
-                gen_helper_neon_mull_p8(tcg_passres, tcg_op1, tcg_op2);
-                break;
             default:
                 g_assert_not_reached();
             }
@@ -10709,11 +10705,21 @@ static void disas_simd_three_reg_diff(DisasContext *s, uint32_t insn)
         handle_3rd_narrowing(s, is_q, is_u, size, opcode, rd, rn, rm);
         break;
     case 14: /* PMULL, PMULL2 */
-        if (is_u || size == 1 || size == 2) {
+        if (is_u) {
             unallocated_encoding(s);
             return;
         }
-        if (size == 3) {
+        switch (size) {
+        case 0: /* PMULL.P8 */
+            if (!fp_access_check(s)) {
+                return;
+            }
+            /* The Q field specifies lo/hi half input for this insn.  */
+            gen_gvec_op3_ool(s, true, rd, rn, rm, is_q,
+                             gen_helper_neon_pmull_h);
+            break;
+
+        case 3: /* PMULL.P64 */
             if (!dc_isar_feature(aa64_pmull, s)) {
                 unallocated_encoding(s);
                 return;
@@ -10724,9 +10730,13 @@ static void disas_simd_three_reg_diff(DisasContext *s, uint32_t insn)
             /* The Q field specifies lo/hi half input for this insn.  */
             gen_gvec_op3_ool(s, true, rd, rn, rm, is_q,
                              gen_helper_gvec_pmull_q);
-            return;
+            break;
+
+        default:
+            unallocated_encoding(s);
+            break;
         }
-        goto is_widening;
+        return;
     case 9: /* SQDMLAL, SQDMLAL2 */
     case 11: /* SQDMLSL, SQDMLSL2 */
     case 13: /* SQDMULL, SQDMULL2 */
@@ -10747,7 +10757,6 @@ static void disas_simd_three_reg_diff(DisasContext *s, uint32_t insn)
             unallocated_encoding(s);
             return;
         }
-    is_widening:
         if (!fp_access_check(s)) {
             return;
         }
diff --git a/target/arm/translate.c b/target/arm/translate.c
index 57d61c4aa57..ea6e984da65 100644
--- a/target/arm/translate.c
+++ b/target/arm/translate.c
@@ -5866,15 +5866,20 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn)
                     return 1;
                 }
 
-                /* Handle VMULL.P64 (Polynomial 64x64 to 128 bit multiply)
-                 * outside the loop below as it only performs a single pass.
-                 */
-                if (op == 14 && size == 2) {
-                    if (!dc_isar_feature(aa32_pmull, s)) {
-                        return 1;
+                /* Handle polynomial VMULL in a single pass.  */
+                if (op == 14) {
+                    if (size == 0) {
+                        /* VMULL.P8 */
+                        tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, 16, 16,
+                                           0, gen_helper_neon_pmull_h);
+                    } else {
+                        /* VMULL.P64 */
+                        if (!dc_isar_feature(aa32_pmull, s)) {
+                            return 1;
+                        }
+                        tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, 16, 16,
+                                           0, gen_helper_gvec_pmull_q);
                     }
-                    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, 16, 16,
-                                       0, gen_helper_gvec_pmull_q);
                     return 0;
                 }
 
@@ -5952,11 +5957,6 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn)
                         /* VMLAL, VQDMLAL, VMLSL, VQDMLSL, VMULL, VQDMULL */
                         gen_neon_mull(cpu_V0, tmp, tmp2, size, u);
                         break;
-                    case 14: /* Polynomial VMULL */
-                        gen_helper_neon_mull_p8(cpu_V0, tmp, tmp2);
-                        tcg_temp_free_i32(tmp2);
-                        tcg_temp_free_i32(tmp);
-                        break;
                     default: /* 15 is RESERVED: caught earlier  */
                         abort();
                     }
diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c
index 79d2624f7b1..8017bd88c4c 100644
--- a/target/arm/vec_helper.c
+++ b/target/arm/vec_helper.c
@@ -1197,3 +1197,63 @@ void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
     }
     clear_tail(d, opr_sz, simd_maxsz(desc));
 }
+
+/*
+ * 8x8->16 polynomial multiply.
+ *
+ * The byte inputs are expanded to (or extracted from) half-words.
+ * Note that neon and sve2 get the inputs from different positions.
+ * This allows 4 bytes to be processed in parallel with uint64_t.
+ */
+
+static uint64_t expand_byte_to_half(uint64_t x)
+{
+    return  (x & 0x000000ff)
+         | ((x & 0x0000ff00) << 8)
+         | ((x & 0x00ff0000) << 16)
+         | ((x & 0xff000000) << 24);
+}
+
+static uint64_t pmull_h(uint64_t op1, uint64_t op2)
+{
+    uint64_t result = 0;
+    int i;
+
+    for (i = 0; i < 8; ++i) {
+        uint64_t mask = (op1 & 0x0001000100010001ull) * 0xffff;
+        result ^= op2 & mask;
+        op1 >>= 1;
+        op2 <<= 1;
+    }
+    return result;
+}
+
+void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+    int hi = simd_data(desc);
+    uint64_t *d = vd, *n = vn, *m = vm;
+    uint64_t nn = n[hi], mm = m[hi];
+
+    d[0] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm));
+    nn >>= 32;
+    mm >>= 32;
+    d[1] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm));
+
+    clear_tail(d, 16, simd_maxsz(desc));
+}
+
+#ifdef TARGET_AARCH64
+void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+    int shift = simd_data(desc) * 8;
+    intptr_t i, opr_sz = simd_oprsz(desc);
+    uint64_t *d = vd, *n = vn, *m = vm;
+
+    for (i = 0; i < opr_sz / 8; ++i) {
+        uint64_t nn = (n[i] >> shift) & 0x00ff00ff00ff00ffull;
+        uint64_t mm = (m[i] >> shift) & 0x00ff00ff00ff00ffull;
+
+        d[i] = pmull_h(nn, mm);
+    }
+}
+#endif
-- 
2.20.1

next prev parent reply	other threads:[~2020-02-21 13:40 UTC|newest]

Thread overview: 56+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-02-21 13:06 [PULL 00/52] target-arm queue Peter Maydell
2020-02-21 13:06 ` [PULL 01/52] aspeed/scu: Create separate write callbacks Peter Maydell
2020-02-21 13:06 ` [PULL 02/52] aspeed/scu: Implement chip ID register Peter Maydell
2020-02-21 13:06 ` [PULL 03/52] hw/misc/iotkit-secctl: Fix writing to 'PPC Interrupt Clear' register Peter Maydell
2020-02-21 13:06 ` [PULL 04/52] mainstone: Make providing flash images non-mandatory Peter Maydell
2020-02-21 13:06 ` [PULL 05/52] z2: " Peter Maydell
2020-02-21 13:06 ` [PULL 06/52] target/arm: Flush high bits of sve register after AdvSIMD EXT Peter Maydell
2020-02-21 13:06 ` [PULL 07/52] target/arm: Flush high bits of sve register after AdvSIMD TBL/TBX Peter Maydell
2020-02-21 13:06 ` [PULL 08/52] target/arm: Flush high bits of sve register after AdvSIMD ZIP/UZP/TRN Peter Maydell
2020-02-21 13:06 ` [PULL 09/52] target/arm: Flush high bits of sve register after AdvSIMD INS Peter Maydell
2020-02-21 13:06 ` [PULL 10/52] target/arm: Use bit 55 explicitly for pauth Peter Maydell
2020-02-21 13:06 ` [PULL 11/52] target/arm: Fix select for aa64_va_parameters_both Peter Maydell
2020-02-21 13:07 ` [PULL 12/52] target/arm: Remove ttbr1_valid check from get_phys_addr_lpae Peter Maydell
2020-02-21 13:07 ` [PULL 13/52] target/arm: Split out aa64_va_parameter_tbi, aa64_va_parameter_tbid Peter Maydell
2020-02-21 13:07 ` [PULL 14/52] target/arm: Add _aa32_ to isar_feature functions testing 32-bit ID registers Peter Maydell
2020-02-21 13:07 ` [PULL 15/52] target/arm: Check aa32_pan in take_aarch32_exception(), not aa64_pan Peter Maydell
2020-02-21 13:07 ` [PULL 16/52] target/arm: Add isar_feature_any_fp16 and document naming/usage conventions Peter Maydell
2020-02-21 13:07 ` [PULL 17/52] target/arm: Define and use any_predinv isar_feature test Peter Maydell
2020-02-21 13:07 ` [PULL 18/52] target/arm: Factor out PMU register definitions Peter Maydell
2020-02-21 13:07 ` [PULL 19/52] target/arm: Add and use FIELD definitions for ID_AA64DFR0_EL1 Peter Maydell
2020-02-21 13:07 ` [PULL 20/52] target/arm: Use FIELD macros for clearing ID_DFR0 PERFMON field Peter Maydell
2020-02-21 13:07 ` [PULL 21/52] target/arm: Define an aa32_pmu_8_1 isar feature test function Peter Maydell
2020-02-21 13:07 ` [PULL 22/52] target/arm: Add _aa64_ and _any_ versions of pmu_8_1 isar checks Peter Maydell
2020-02-21 13:07 ` [PULL 23/52] target/arm: Stop assuming DBGDIDR always exists Peter Maydell
2020-02-21 13:07 ` [PULL 24/52] target/arm: Move DBGDIDR into ARMISARegisters Peter Maydell
2020-02-21 13:07 ` [PULL 25/52] target/arm: Read debug-related ID registers from KVM Peter Maydell
2020-02-21 13:07 ` [PULL 26/52] target/arm: Implement ARMv8.1-PMU extension Peter Maydell
2020-02-21 13:07 ` [PULL 27/52] target/arm: Implement ARMv8.4-PMU extension Peter Maydell
2020-02-21 13:07 ` [PULL 28/52] target/arm: Provide ARMv8.4-PMU in '-cpu max' Peter Maydell
2020-02-21 13:07 ` [PULL 29/52] target/arm: Correct definition of PMCRDP Peter Maydell
2020-02-21 13:07 ` [PULL 30/52] target/arm: Correct handling of PMCR_EL0.LC bit Peter Maydell
2020-02-21 13:07 ` [PULL 31/52] target/arm: Test correct register in aa32_pan and aa32_ats1e1 checks Peter Maydell
2020-02-21 13:07 ` [PULL 32/52] target/arm: Use isar_feature function for testing AA32HPD feature Peter Maydell
2020-02-21 13:07 ` [PULL 33/52] target/arm: Use FIELD_EX32 for testing 32-bit fields Peter Maydell
2020-02-21 13:07 ` [PULL 34/52] target/arm: Correctly implement ACTLR2, HACTLR2 Peter Maydell
2020-02-21 13:07 ` [PULL 35/52] hw: usb: hcd-ohci: Move OHCISysBusState and TYPE_SYSBUS_OHCI to include file Peter Maydell
2020-02-21 13:07 ` [PULL 36/52] hcd-ehci: Introduce "companion-enable" sysbus property Peter Maydell
2020-02-21 13:07 ` [PULL 37/52] arm: allwinner: Wire up USB ports Peter Maydell
2020-02-21 13:07 ` [PULL 38/52] target/arm: Vectorize USHL and SSHL Peter Maydell
2020-02-21 13:07 ` [PULL 39/52] target/arm: Convert PMUL.8 to gvec Peter Maydell
2020-02-21 13:07 ` [PULL 40/52] target/arm: Convert PMULL.64 " Peter Maydell
2020-02-21 13:07 ` Peter Maydell [this message]
2020-02-21 13:07 ` [PULL 42/52] xilinx_spips: Correct the number of dummy cycles for the FAST_READ_4 cmd Peter Maydell
2020-02-21 13:07 ` [PULL 43/52] sh4: Fix PCI ISA IO memory subregion Peter Maydell
2020-02-21 13:07 ` [PULL 44/52] target/arm: Rename isar_feature_aa32_simd_r32 Peter Maydell
2020-02-21 13:07 ` [PULL 45/52] target/arm: Use isar_feature_aa32_simd_r32 more places Peter Maydell
2020-02-21 13:07 ` [PULL 46/52] target/arm: Set MVFR0.FPSP for ARMv5 cpus Peter Maydell
2020-02-21 13:07 ` [PULL 47/52] target/arm: Add isar_feature_aa32_simd_r16 Peter Maydell
2020-02-21 13:07 ` [PULL 48/52] target/arm: Rename isar_feature_aa32_fpdp_v2 Peter Maydell
2020-02-21 13:07 ` [PULL 49/52] target/arm: Add isar_feature_aa32_{fpsp_v2, fpsp_v3, fpdp_v3} Peter Maydell
2020-02-21 13:07 ` [PULL 50/52] target/arm: Perform fpdp_v2 check first Peter Maydell
2020-02-21 13:07 ` [PULL 51/52] target/arm: Replace ARM_FEATURE_VFP3 checks with fp{sp, dp}_v3 Peter Maydell
2020-02-21 13:07 ` [PULL 52/52] target/arm: Add missing checks for fpsp_v2 Peter Maydell
2020-02-21 14:17 ` [PULL 00/52] target-arm queue no-reply
2020-02-21 16:06 ` no-reply
2020-02-21 16:10 ` no-reply

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:9e79182ab4 dfblob:2f47279155 dfblob:4352fae3db
dfblob:fcbf504121 dfblob:6a107da0e1 dfblob:c7a8438b42 dfblob:03ce879497
dfblob:596bf4cf73 dfblob:57d61c4aa5 dfblob:ea6e984da6 dfblob:79d2624f7b
dfblob:8017bd88c4 )
 OR (
bs:"[PULL 41/52] target/arm: Convert PMULL.8 to gvec" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200221130740.7583-42-peter.maydell@linaro.org \
    --to=peter.maydell@linaro.org \
    --cc=qemu-devel@nongnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).