From: Richard Henderson <richard.henderson@linaro.org>
To: qemu-devel@nongnu.org
Cc: rajav@quicinc.com, qemu-arm@nongnu.org, apazos@quicinc.com
Subject: [PATCH 23/31] target/arm: Create arm_gen_gvec_[us]sra
Date: Thu, 26 Mar 2020 16:08:30 -0700
Message-ID: <20200326230838.31112-24-richard.henderson@linaro.org>
In-Reply-To: <20200326230838.31112-1-richard.henderson@linaro.org>
These functions eliminate duplication of the special cases for
this operation, and they match the GVecGen2iFn typedef (sketched
below), so both the AArch64 and Neon decoders can pass them to the
generic gvec expansion directly.

Add out-of-line helpers as well. So far we have gotten away with
only inline expanders, because the Neon vector size is only 16
bytes and the inline expansion is known to always succeed. When
this is reused for SVE, tcg-op-gvec may decide to use an
out-of-line helper because of the longer vector lengths.
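
For reference, a minimal sketch of the shape these expanders take.
The typedef comes from tcg/tcg-op-gvec.h (parameter names are added
here for readability), and the call site below is only illustrative:
the offsets and shift are placeholders, not values from this patch.

    /* Function-pointer shape of a gvec expander taking an immediate. */
    typedef void GVecGen2iFn(unsigned vece, uint32_t dofs, uint32_t aofs,
                             int64_t c, uint32_t oprsz, uint32_t maxsz);

    /* Hypothetical use: SSRA #3 on one 16-byte Neon register at MO_32. */
    arm_gen_gvec_ssra(MO_32, rd_ofs, rm_ofs, 3, 16, 16);

Because the new functions have exactly this signature, the a64
translator can hand them straight to gen_gvec_fn2i() instead of
going through gen_gvec_op2i() with the old GVecGen2i tables.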
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
target/arm/helper.h | 10 +++
target/arm/translate.h | 7 +-
target/arm/translate-a64.c | 15 +---
target/arm/translate.c | 161 ++++++++++++++++++++++---------------
target/arm/vec_helper.c | 25 ++++++
5 files changed, 139 insertions(+), 79 deletions(-)
diff --git a/target/arm/helper.h b/target/arm/helper.h
index 938fdbc362..dc6a43dbd8 100644
--- a/target/arm/helper.h
+++ b/target/arm/helper.h
@@ -708,6 +708,16 @@ DEF_HELPER_FLAGS_4(gvec_pmull_q, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(neon_pmull_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_ssra_b, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_ssra_h, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_ssra_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_ssra_d, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_3(gvec_usra_b, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_usra_h, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_usra_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_usra_d, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+
#ifdef TARGET_AARCH64
#include "helper-a64.h"
#include "helper-sve.h"
diff --git a/target/arm/translate.h b/target/arm/translate.h
index 5552ee5a94..1c5cdf13e3 100644
--- a/target/arm/translate.h
+++ b/target/arm/translate.h
@@ -291,8 +291,6 @@ extern const GVecGen3 mls_op[4];
extern const GVecGen3 cmtst_op[4];
extern const GVecGen3 sshl_op[4];
extern const GVecGen3 ushl_op[4];
-extern const GVecGen2i ssra_op[4];
-extern const GVecGen2i usra_op[4];
extern const GVecGen2i sri_op[4];
extern const GVecGen2i sli_op[4];
extern const GVecGen4 uqadd_op[4];
@@ -305,6 +303,11 @@ void gen_sshl_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b);
void gen_ushl_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
void gen_sshl_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
+void arm_gen_gvec_ssra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
+ int64_t shift, uint32_t opr_sz, uint32_t max_sz);
+void arm_gen_gvec_usra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
+ int64_t shift, uint32_t opr_sz, uint32_t max_sz);
+
/*
* Forward to the isar_feature_* tests given a DisasContext pointer.
*/
diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
index 2bcf643069..d50207fcfb 100644
--- a/target/arm/translate-a64.c
+++ b/target/arm/translate-a64.c
@@ -10682,19 +10682,8 @@ static void handle_vec_simd_shri(DisasContext *s, bool is_q, bool is_u,
switch (opcode) {
case 0x02: /* SSRA / USRA (accumulate) */
- if (is_u) {
- /* Shift count same as element size produces zero to add. */
- if (shift == 8 << size) {
- goto done;
- }
- gen_gvec_op2i(s, is_q, rd, rn, shift, &usra_op[size]);
- } else {
- /* Shift count same as element size produces all sign to add. */
- if (shift == 8 << size) {
- shift -= 1;
- }
- gen_gvec_op2i(s, is_q, rd, rn, shift, &ssra_op[size]);
- }
+ gen_gvec_fn2i(s, is_q, rd, rn, shift,
+ is_u ? arm_gen_gvec_usra : arm_gen_gvec_ssra, size);
return;
case 0x08: /* SRI */
/* Shift count same as element size is valid but does nothing. */
diff --git a/target/arm/translate.c b/target/arm/translate.c
index cba84987db..f5768014d1 100644
--- a/target/arm/translate.c
+++ b/target/arm/translate.c
@@ -3947,33 +3947,51 @@ static void gen_ssra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
tcg_gen_add_vec(vece, d, d, a);
}
-static const TCGOpcode vecop_list_ssra[] = {
- INDEX_op_sari_vec, INDEX_op_add_vec, 0
-};
+void arm_gen_gvec_ssra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
+ int64_t shift, uint32_t opr_sz, uint32_t max_sz)
+{
+ static const TCGOpcode vecop_list[] = {
+ INDEX_op_sari_vec, INDEX_op_add_vec, 0
+ };
+ static const GVecGen2i ops[4] = {
+ { .fni8 = gen_ssra8_i64,
+ .fniv = gen_ssra_vec,
+ .fno = gen_helper_gvec_ssra_b,
+ .load_dest = true,
+ .opt_opc = vecop_list,
+ .vece = MO_8 },
+ { .fni8 = gen_ssra16_i64,
+ .fniv = gen_ssra_vec,
+ .fno = gen_helper_gvec_ssra_h,
+ .load_dest = true,
+ .opt_opc = vecop_list,
+ .vece = MO_16 },
+ { .fni4 = gen_ssra32_i32,
+ .fniv = gen_ssra_vec,
+ .fno = gen_helper_gvec_ssra_s,
+ .load_dest = true,
+ .opt_opc = vecop_list,
+ .vece = MO_32 },
+ { .fni8 = gen_ssra64_i64,
+ .fniv = gen_ssra_vec,
+ .fno = gen_helper_gvec_ssra_d,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ .opt_opc = vecop_list,
+ .load_dest = true,
+ .vece = MO_64 },
+ };
-const GVecGen2i ssra_op[4] = {
- { .fni8 = gen_ssra8_i64,
- .fniv = gen_ssra_vec,
- .load_dest = true,
- .opt_opc = vecop_list_ssra,
- .vece = MO_8 },
- { .fni8 = gen_ssra16_i64,
- .fniv = gen_ssra_vec,
- .load_dest = true,
- .opt_opc = vecop_list_ssra,
- .vece = MO_16 },
- { .fni4 = gen_ssra32_i32,
- .fniv = gen_ssra_vec,
- .load_dest = true,
- .opt_opc = vecop_list_ssra,
- .vece = MO_32 },
- { .fni8 = gen_ssra64_i64,
- .fniv = gen_ssra_vec,
- .prefer_i64 = TCG_TARGET_REG_BITS == 64,
- .opt_opc = vecop_list_ssra,
- .load_dest = true,
- .vece = MO_64 },
-};
+ /* tszimm encoding produces immediates in the range [1..esize]. */
+ tcg_debug_assert(shift > 0);
+ tcg_debug_assert(shift <= (8 << vece));
+
+ /*
+ * Shifts larger than the element size are architecturally valid.
+ * Signed results in all sign bits.
+ */
+ shift = MIN(shift, (8 << vece) - 1);
+ tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
+}
static void gen_usra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
@@ -4005,33 +4023,55 @@ static void gen_usra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
tcg_gen_add_vec(vece, d, d, a);
}
-static const TCGOpcode vecop_list_usra[] = {
- INDEX_op_shri_vec, INDEX_op_add_vec, 0
-};
+void arm_gen_gvec_usra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
+ int64_t shift, uint32_t opr_sz, uint32_t max_sz)
+{
+ static const TCGOpcode vecop_list[] = {
+ INDEX_op_shri_vec, INDEX_op_add_vec, 0
+ };
+ static const GVecGen2i ops[4] = {
+ { .fni8 = gen_usra8_i64,
+ .fniv = gen_usra_vec,
+ .fno = gen_helper_gvec_usra_b,
+ .load_dest = true,
+ .opt_opc = vecop_list,
+ .vece = MO_8, },
+ { .fni8 = gen_usra16_i64,
+ .fniv = gen_usra_vec,
+ .fno = gen_helper_gvec_usra_h,
+ .load_dest = true,
+ .opt_opc = vecop_list,
+ .vece = MO_16, },
+ { .fni4 = gen_usra32_i32,
+ .fniv = gen_usra_vec,
+ .fno = gen_helper_gvec_usra_s,
+ .load_dest = true,
+ .opt_opc = vecop_list,
+ .vece = MO_32, },
+ { .fni8 = gen_usra64_i64,
+ .fniv = gen_usra_vec,
+ .fno = gen_helper_gvec_usra_d,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ .load_dest = true,
+ .opt_opc = vecop_list,
+ .vece = MO_64, },
+ };
-const GVecGen2i usra_op[4] = {
- { .fni8 = gen_usra8_i64,
- .fniv = gen_usra_vec,
- .load_dest = true,
- .opt_opc = vecop_list_usra,
- .vece = MO_8, },
- { .fni8 = gen_usra16_i64,
- .fniv = gen_usra_vec,
- .load_dest = true,
- .opt_opc = vecop_list_usra,
- .vece = MO_16, },
- { .fni4 = gen_usra32_i32,
- .fniv = gen_usra_vec,
- .load_dest = true,
- .opt_opc = vecop_list_usra,
- .vece = MO_32, },
- { .fni8 = gen_usra64_i64,
- .fniv = gen_usra_vec,
- .prefer_i64 = TCG_TARGET_REG_BITS == 64,
- .load_dest = true,
- .opt_opc = vecop_list_usra,
- .vece = MO_64, },
-};
+ /* tszimm encoding produces immediates in the range [1..esize]. */
+ tcg_debug_assert(shift > 0);
+ tcg_debug_assert(shift <= (8 << vece));
+
+ /*
+ * Shifts larger than the element size are architecturally valid.
+ * Unsigned results in all zeros as input to accumulate: nop.
+ */
+ if (shift < (8 << vece)) {
+ tcg_gen_gvec_2i(rd_ofs, rm_ofs, opr_sz, max_sz, shift, &ops[vece]);
+ } else {
+ /* Nop, but we do need to clear the tail. */
+ tcg_gen_gvec_mov(vece, rd_ofs, rd_ofs, opr_sz, max_sz);
+ }
+}
static void gen_shr8_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
@@ -5396,19 +5436,12 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn)
case 1: /* VSRA */
/* Right shift comes here negative. */
shift = -shift;
- /* Shifts larger than the element size are architecturally
- * valid. Unsigned results in all zeros; signed results
- * in all sign bits.
- */
- if (!u) {
- tcg_gen_gvec_2i(rd_ofs, rm_ofs, vec_size, vec_size,
- MIN(shift, (8 << size) - 1),
- &ssra_op[size]);
- } else if (shift >= 8 << size) {
- /* rd += 0 */
+ if (u) {
+ arm_gen_gvec_usra(size, rd_ofs, rm_ofs, shift,
+ vec_size, vec_size);
} else {
- tcg_gen_gvec_2i(rd_ofs, rm_ofs, vec_size, vec_size,
- shift, &usra_op[size]);
+ arm_gen_gvec_ssra(size, rd_ofs, rm_ofs, shift,
+ vec_size, vec_size);
}
return 0;
diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c
index 154d32518a..aaaccc0a2d 100644
--- a/target/arm/vec_helper.c
+++ b/target/arm/vec_helper.c
@@ -899,6 +899,31 @@ void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn,
clear_tail(d, oprsz, simd_maxsz(desc));
}
+
+#define DO_SRA(NAME, TYPE) \
+void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
+{ \
+ intptr_t i, oprsz = simd_oprsz(desc); \
+ int shift = simd_data(desc); \
+ TYPE *d = vd, *n = vn; \
+ for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
+ d[i] += n[i] >> shift; \
+ } \
+ clear_tail(d, oprsz, simd_maxsz(desc)); \
+}
+
+DO_SRA(gvec_ssra_b, int8_t)
+DO_SRA(gvec_ssra_h, int16_t)
+DO_SRA(gvec_ssra_s, int32_t)
+DO_SRA(gvec_ssra_d, int64_t)
+
+DO_SRA(gvec_usra_b, uint8_t)
+DO_SRA(gvec_usra_h, uint16_t)
+DO_SRA(gvec_usra_s, uint32_t)
+DO_SRA(gvec_usra_d, uint64_t)
+
+#undef DO_SRA
+
/*
* Convert float16 to float32, raising no exceptions and
* preserving exceptional values, including SNaN.
--
2.20.1