* [Qemu-devel] [PATCH v2 1/8] target-arm: A64: Add SIMD three-different multiply accumulate insns
From: Peter Maydell @ 2014-01-23 15:28 UTC (permalink / raw)
To: qemu-devel
Cc: patches, Michael Matz, Alexander Graf, Claudio Fontana,
Dirk Mueller, Will Newton, Laurent Desnogues, Alex Bennée,
kvmarm, Christoffer Dall, Richard Henderson
Add support for the multiply-accumulate instructions from the
SIMD three-different instructions group (C3.6.15):
* skeleton decode of unallocated encodings and split of
the group into its three sub-parts
* framework for handling the 64x64->128 widening subpart
* implementation of the multiply-accumulate instructions
SMLAL, SMLAL2, UMLAL, UMLAL2, SMLSL, SMLSL2, UMLSL, UMLSL2,
UMULL, UMULL2, SMULL, SMULL2
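For reference, the per-element semantics being implemented can be modelled
in a few lines of plain C. This is only an illustrative sketch of the
architectural operation (the function name is invented); the patch itself
generates TCG ops rather than calling anything like this:

#include <stdint.h>

/* One SMLAL-style element step for the size==2 case: widen the
 * 32-bit inputs, multiply at 64-bit precision, then accumulate
 * into the 64-bit destination element.
 */
static inline int64_t smlal_element(int64_t acc, int32_t n, int32_t m)
{
    return acc + (int64_t)n * (int64_t)m;
}

The SMLSL variants subtract the product instead, and the U* variants widen
unsigned; that is exactly the accop/is_u split in the decode below.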
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
---
target-arm/translate-a64.c | 233 ++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 232 insertions(+), 1 deletion(-)
diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
index 7cfb55b..924a539 100644
--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c
@@ -700,6 +700,9 @@ static void do_fp_ld(DisasContext *s, int destidx, TCGv_i64 tcg_addr, int size)
* zero extend as we are filling a partial chunk of the vector register.
* These functions don't support 128 bit loads/stores, which would be
* normal load/store operations.
+ *
+ * The _i32 versions are useful when operating on 32 bit quantities
+ * (e.g. for single-precision floating point, or when using Neon helper functions).
*/
/* Get value of an element within a vector register */
@@ -735,6 +738,32 @@ static void read_vec_element(DisasContext *s, TCGv_i64 tcg_dest, int srcidx,
}
}
+static void read_vec_element_i32(DisasContext *s, TCGv_i32 tcg_dest, int srcidx,
+ int element, TCGMemOp memop)
+{
+ int vect_off = vec_reg_offset(srcidx, element, memop & MO_SIZE);
+ switch (memop) {
+ case MO_8:
+ tcg_gen_ld8u_i32(tcg_dest, cpu_env, vect_off);
+ break;
+ case MO_16:
+ tcg_gen_ld16u_i32(tcg_dest, cpu_env, vect_off);
+ break;
+ case MO_8|MO_SIGN:
+ tcg_gen_ld8s_i32(tcg_dest, cpu_env, vect_off);
+ break;
+ case MO_16|MO_SIGN:
+ tcg_gen_ld16s_i32(tcg_dest, cpu_env, vect_off);
+ break;
+ case MO_32:
+ case MO_32|MO_SIGN:
+ tcg_gen_ld_i32(tcg_dest, cpu_env, vect_off);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+}
+
/* Set value of an element within a vector register */
static void write_vec_element(DisasContext *s, TCGv_i64 tcg_src, int destidx,
int element, TCGMemOp memop)
@@ -5546,6 +5575,150 @@ static void disas_simd_shift_imm(DisasContext *s, uint32_t insn)
unsupported_encoding(s, insn);
}
+static void handle_3rd_widening(DisasContext *s, int is_q, int is_u, int size,
+ int opcode, int rd, int rn, int rm)
+{
+ /* 3-reg-different widening insns: 64 x 64 -> 128 */
+ TCGv_i64 tcg_res[2];
+ int pass, accop;
+
+ tcg_res[0] = tcg_temp_new_i64();
+ tcg_res[1] = tcg_temp_new_i64();
+
+ /* Does this op do an adding accumulate, a subtracting accumulate,
+ * or no accumulate at all?
+ */
+ switch (opcode) {
+ case 5:
+ case 8:
+ case 9:
+ accop = 1;
+ break;
+ case 10:
+ case 11:
+ accop = -1;
+ break;
+ default:
+ accop = 0;
+ break;
+ }
+
+ if (accop != 0) {
+ read_vec_element(s, tcg_res[0], rd, 0, MO_64);
+ read_vec_element(s, tcg_res[1], rd, 1, MO_64);
+ }
+
+ /* size == 2 means two 32x32->64 operations; this is worth special
+ * casing because we can generally handle it inline.
+ */
+ if (size == 2) {
+ for (pass = 0; pass < 2; pass++) {
+ TCGv_i64 tcg_op1 = tcg_temp_new_i64();
+ TCGv_i64 tcg_op2 = tcg_temp_new_i64();
+ TCGv_i64 tcg_passres;
+ TCGMemOp memop = MO_32 | (is_u ? 0 : MO_SIGN);
+
+ int elt = pass + is_q * 2;
+
+ read_vec_element(s, tcg_op1, rn, elt, memop);
+ read_vec_element(s, tcg_op2, rm, elt, memop);
+
+ if (accop == 0) {
+ tcg_passres = tcg_res[pass];
+ } else {
+ tcg_passres = tcg_temp_new_i64();
+ }
+
+ switch (opcode) {
+ case 8: /* SMLAL, SMLAL2, UMLAL, UMLAL2 */
+ case 10: /* SMLSL, SMLSL2, UMLSL, UMLSL2 */
+ case 12: /* UMULL, UMULL2, SMULL, SMULL2 */
+ tcg_gen_mul_i64(tcg_passres, tcg_op1, tcg_op2);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+
+ if (accop > 0) {
+ tcg_gen_add_i64(tcg_res[pass], tcg_res[pass], tcg_passres);
+ tcg_temp_free_i64(tcg_passres);
+ } else if (accop < 0) {
+ tcg_gen_sub_i64(tcg_res[pass], tcg_res[pass], tcg_passres);
+ tcg_temp_free_i64(tcg_passres);
+ }
+
+ tcg_temp_free_i64(tcg_op1);
+ tcg_temp_free_i64(tcg_op2);
+ }
+ } else {
+ /* size 0 or 1, generally helper functions */
+ for (pass = 0; pass < 2; pass++) {
+ TCGv_i32 tcg_op1 = tcg_temp_new_i32();
+ TCGv_i32 tcg_op2 = tcg_temp_new_i32();
+ TCGv_i64 tcg_passres;
+ int elt = pass + is_q * 2;
+
+ read_vec_element_i32(s, tcg_op1, rn, elt, MO_32);
+ read_vec_element_i32(s, tcg_op2, rm, elt, MO_32);
+
+ if (accop == 0) {
+ tcg_passres = tcg_res[pass];
+ } else {
+ tcg_passres = tcg_temp_new_i64();
+ }
+
+ switch (opcode) {
+ case 8: /* SMLAL, SMLAL2, UMLAL, UMLAL2 */
+ case 10: /* SMLSL, SMLSL2, UMLSL, UMLSL2 */
+ case 12: /* UMULL, UMULL2, SMULL, SMULL2 */
+ if (size == 0) {
+ if (is_u) {
+ gen_helper_neon_mull_u8(tcg_passres, tcg_op1, tcg_op2);
+ } else {
+ gen_helper_neon_mull_s8(tcg_passres, tcg_op1, tcg_op2);
+ }
+ } else {
+ if (is_u) {
+ gen_helper_neon_mull_u16(tcg_passres, tcg_op1, tcg_op2);
+ } else {
+ gen_helper_neon_mull_s16(tcg_passres, tcg_op1, tcg_op2);
+ }
+ }
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ tcg_temp_free_i32(tcg_op1);
+ tcg_temp_free_i32(tcg_op2);
+
+ if (accop > 0) {
+ if (size == 0) {
+ gen_helper_neon_addl_u16(tcg_res[pass], tcg_res[pass],
+ tcg_passres);
+ } else {
+ gen_helper_neon_addl_u32(tcg_res[pass], tcg_res[pass],
+ tcg_passres);
+ }
+ tcg_temp_free_i64(tcg_passres);
+ } else if (accop < 0) {
+ if (size == 0) {
+ gen_helper_neon_subl_u16(tcg_res[pass], tcg_res[pass],
+ tcg_passres);
+ } else {
+ gen_helper_neon_subl_u32(tcg_res[pass], tcg_res[pass],
+ tcg_passres);
+ }
+ tcg_temp_free_i64(tcg_passres);
+ }
+ }
+ }
+
+ write_vec_element(s, tcg_res[0], rd, 0, MO_64);
+ write_vec_element(s, tcg_res[1], rd, 1, MO_64);
+ tcg_temp_free_i64(tcg_res[0]);
+ tcg_temp_free_i64(tcg_res[1]);
+}
+
/* C3.6.15 AdvSIMD three different
* 31 30 29 28 24 23 22 21 20 16 15 12 11 10 9 5 4 0
* +---+---+---+-----------+------+---+------+--------+-----+------+------+
@@ -5554,7 +5727,65 @@ static void disas_simd_shift_imm(DisasContext *s, uint32_t insn)
*/
static void disas_simd_three_reg_diff(DisasContext *s, uint32_t insn)
{
- unsupported_encoding(s, insn);
+ /* Instructions in this group fall into three basic classes
+ * (in each case with the operation working on each element in
+ * the input vectors):
+ * (1) widening 64 x 64 -> 128 (with possibly Vd as an extra
+ * 128 bit input)
+ * (2) wide 64 x 128 -> 128
+ * (3) narrowing 128 x 128 -> 64
+ * Here we do initial decode, catch unallocated cases and
+ * dispatch to separate functions for each class.
+ */
+ int is_q = extract32(insn, 30, 1);
+ int is_u = extract32(insn, 29, 1);
+ int size = extract32(insn, 22, 2);
+ int opcode = extract32(insn, 12, 4);
+ int rm = extract32(insn, 16, 5);
+ int rn = extract32(insn, 5, 5);
+ int rd = extract32(insn, 0, 5);
+
+ switch (opcode) {
+ case 1: /* SADDW, SADDW2, UADDW, UADDW2 */
+ case 3: /* SSUBW, SSUBW2, USUBW, USUBW2 */
+ /* 64 x 128 -> 128 */
+ unsupported_encoding(s, insn);
+ break;
+ case 4: /* ADDHN, ADDHN2, RADDHN, RADDHN2 */
+ case 6: /* SUBHN, SUBHN2, RSUBHN, RSUBHN2 */
+ /* 128 x 128 -> 64 */
+ unsupported_encoding(s, insn);
+ break;
+ case 9:
+ case 11:
+ case 13:
+ case 14:
+ if (is_u) {
+ unallocated_encoding(s);
+ return;
+ }
+ /* fall through */
+ case 0:
+ case 2:
+ case 5:
+ case 7:
+ unsupported_encoding(s, insn);
+ break;
+ case 8:
+ case 10:
+ case 12:
+ /* 64 x 64 -> 128 */
+ if (size == 3) {
+ unallocated_encoding(s);
+ return;
+ }
+ handle_3rd_widening(s, is_q, is_u, size, opcode, rd, rn, rm);
+ break;
+ default:
+ /* opcode 15 not allocated */
+ unallocated_encoding(s);
+ break;
+ }
}
/* C3.6.16 AdvSIMD three same
--
1.8.5
* Re: [Qemu-devel] [PATCH v2 1/8] target-arm: A64: Add SIMD three-different multiply accumulate insns
From: Richard Henderson @ 2014-01-23 16:42 UTC (permalink / raw)
To: Peter Maydell, qemu-devel
Cc: patches, Michael Matz, Alexander Graf, Claudio Fontana,
Dirk Mueller, Will Newton, Laurent Desnogues, Alex Bennée,
kvmarm, Christoffer Dall
On 01/23/2014 07:28 AM, Peter Maydell wrote:
> Add support for the multiply-accumulate instructions from the
> SIMD three-different instructions group (C3.6.15):
> * skeleton decode of unallocated encodings and split of
> the group into its three sub-parts
> * framework for handling the 64x64->128 widening subpart
> * implementation of the multiply-accumulate instructions
> SMLAL, SMLAL2, UMLAL, UMLAL2, SMLSL, SMLSL2, UMLSL, UMLSL2,
> UMULL, UMULL2, SMULL, SMULL2
>
> Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
> ---
> target-arm/translate-a64.c | 233 ++++++++++++++++++++++++++++++++++++++++++++-
> 1 file changed, 232 insertions(+), 1 deletion(-)
Reviewed-by: Richard Henderson <rth@twiddle.net>
r~
* [Qemu-devel] [PATCH v2 2/8] target-arm: A64: Add SIMD three-different ABDL instructions
From: Peter Maydell @ 2014-01-23 15:28 UTC (permalink / raw)
To: qemu-devel
Cc: patches, Michael Matz, Alexander Graf, Claudio Fontana,
Dirk Mueller, Will Newton, Laurent Desnogues, Alex Bennée,
kvmarm, Christoffer Dall, Richard Henderson
Implement the absolute-difference instructions in the SIMD
three-different group: SABAL, SABAL2, UABAL, UABAL2, SABDL,
SABDL2, UABDL, UABDL2.
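As an aside for readers, the absolute-difference step can be modelled in
plain C like this (an illustrative sketch with an invented name, unsigned
64-bit case; the signed variant differs only in the comparison used):

#include <stdint.h>

/* ABD element step: subtract in whichever order cannot wrap,
 * i.e. compute |a - b|. The patch expresses this as two
 * subtractions plus a movcond choosing between them.
 */
static inline uint64_t uabd_element(uint64_t a, uint64_t b)
{
    return a >= b ? a - b : b - a;
}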
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <rth@twiddle.net>
---
target-arm/translate-a64.c | 35 +++++++++++++++++++++++++++++++++--
1 file changed, 33 insertions(+), 2 deletions(-)
diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
index 924a539..145125e 100644
--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c
@@ -5630,6 +5630,21 @@ static void handle_3rd_widening(DisasContext *s, int is_q, int is_u, int size,
}
switch (opcode) {
+ case 5: /* SABAL, SABAL2, UABAL, UABAL2 */
+ case 7: /* SABDL, SABDL2, UABDL, UABDL2 */
+ {
+ TCGv_i64 tcg_tmp1 = tcg_temp_new_i64();
+ TCGv_i64 tcg_tmp2 = tcg_temp_new_i64();
+
+ tcg_gen_sub_i64(tcg_tmp1, tcg_op1, tcg_op2);
+ tcg_gen_sub_i64(tcg_tmp2, tcg_op2, tcg_op1);
+ tcg_gen_movcond_i64(is_u ? TCG_COND_GEU : TCG_COND_GE,
+ tcg_passres,
+ tcg_op1, tcg_op2, tcg_tmp1, tcg_tmp2);
+ tcg_temp_free_i64(tcg_tmp1);
+ tcg_temp_free_i64(tcg_tmp2);
+ break;
+ }
case 8: /* SMLAL, SMLAL2, UMLAL, UMLAL2 */
case 10: /* SMLSL, SMLSL2, UMLSL, UMLSL2 */
case 12: /* UMULL, UMULL2, SMULL, SMULL2 */
@@ -5668,6 +5683,22 @@ static void handle_3rd_widening(DisasContext *s, int is_q, int is_u, int size,
}
switch (opcode) {
+ case 5: /* SABAL, SABAL2, UABAL, UABAL2 */
+ case 7: /* SABDL, SABDL2, UABDL, UABDL2 */
+ if (size == 0) {
+ if (is_u) {
+ gen_helper_neon_abdl_u16(tcg_passres, tcg_op1, tcg_op2);
+ } else {
+ gen_helper_neon_abdl_s16(tcg_passres, tcg_op1, tcg_op2);
+ }
+ } else {
+ if (is_u) {
+ gen_helper_neon_abdl_u32(tcg_passres, tcg_op1, tcg_op2);
+ } else {
+ gen_helper_neon_abdl_s32(tcg_passres, tcg_op1, tcg_op2);
+ }
+ }
+ break;
case 8: /* SMLAL, SMLAL2, UMLAL, UMLAL2 */
case 10: /* SMLSL, SMLSL2, UMLSL, UMLSL2 */
case 12: /* UMULL, UMULL2, SMULL, SMULL2 */
@@ -5767,10 +5798,10 @@ static void disas_simd_three_reg_diff(DisasContext *s, uint32_t insn)
/* fall through */
case 0:
case 2:
- case 5:
- case 7:
unsupported_encoding(s, insn);
break;
+ case 5:
+ case 7:
case 8:
case 10:
case 12:
--
1.8.5
* [Qemu-devel] [PATCH v2 3/8] target-arm: A64: Add SIMD scalar 3 same add, sub and compare ops
From: Peter Maydell @ 2014-01-23 15:28 UTC (permalink / raw)
To: qemu-devel
Cc: patches, Michael Matz, Alexander Graf, Claudio Fontana,
Dirk Mueller, Will Newton, Laurent Desnogues, Alex Bennée,
kvmarm, Christoffer Dall, Richard Henderson
Implement the add, sub and compare ops from the SIMD "scalar three same"
group.
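The vector comparisons in this group return all-ones for true and
all-zeros for false, and the patch builds that from setcond plus negate.
A minimal C model of the idiom (names invented; 64-bit signed CMGT case):

#include <stdint.h>

static inline uint64_t cmgt_element(int64_t n, int64_t m)
{
    uint64_t t = (n > m); /* setcond result: 0 or 1 */
    return -t;            /* negate: 0 or 0xffffffffffffffff */
}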
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <rth@twiddle.net>
---
target-arm/translate-a64.c | 131 ++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 130 insertions(+), 1 deletion(-)
diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
index 145125e..6ff3e43 100644
--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c
@@ -5531,6 +5531,58 @@ static void disas_simd_scalar_three_reg_diff(DisasContext *s, uint32_t insn)
unsupported_encoding(s, insn);
}
+static void handle_3same_64(DisasContext *s, int opcode, bool u,
+ TCGv_i64 tcg_rd, TCGv_i64 tcg_rn, TCGv_i64 tcg_rm)
+{
+ /* Handle 64x64->64 opcodes which are shared between the scalar
+ * and vector 3-same groups. We cover every opcode where size == 3
+ * is valid in either the three-reg-same (integer, not pairwise)
+ * or scalar-three-reg-same groups. (Some opcodes are not yet
+ * implemented.)
+ */
+ TCGCond cond;
+
+ switch (opcode) {
+ case 0x6: /* CMGT, CMHI */
+ /* 64 bit integer comparison, result = test ? (2^64 - 1) : 0.
+ * We implement this using setcond (test) and then negating.
+ */
+ cond = u ? TCG_COND_GTU : TCG_COND_GT;
+ do_cmop:
+ tcg_gen_setcond_i64(cond, tcg_rd, tcg_rn, tcg_rm);
+ tcg_gen_neg_i64(tcg_rd, tcg_rd);
+ break;
+ case 0x7: /* CMGE, CMHS */
+ cond = u ? TCG_COND_GEU : TCG_COND_GE;
+ goto do_cmop;
+ case 0x11: /* CMTST, CMEQ */
+ if (u) {
+ cond = TCG_COND_EQ;
+ goto do_cmop;
+ }
+ /* CMTST : test is "if (X & Y != 0)". */
+ tcg_gen_and_i64(tcg_rd, tcg_rn, tcg_rm);
+ tcg_gen_setcondi_i64(TCG_COND_NE, tcg_rd, tcg_rd, 0);
+ tcg_gen_neg_i64(tcg_rd, tcg_rd);
+ break;
+ case 0x10: /* ADD, SUB */
+ if (u) {
+ tcg_gen_sub_i64(tcg_rd, tcg_rn, tcg_rm);
+ } else {
+ tcg_gen_add_i64(tcg_rd, tcg_rn, tcg_rm);
+ }
+ break;
+ case 0x1: /* SQADD */
+ case 0x5: /* SQSUB */
+ case 0x8: /* SSHL, USHL */
+ case 0x9: /* SQSHL, UQSHL */
+ case 0xa: /* SRSHL, URSHL */
+ case 0xb: /* SQRSHL, UQRSHL */
+ default:
+ g_assert_not_reached();
+ }
+}
+
/* C3.6.11 AdvSIMD scalar three same
* 31 30 29 28 24 23 22 21 20 16 15 11 10 9 5 4 0
* +-----+---+-----------+------+---+------+--------+---+------+------+
@@ -5539,7 +5591,84 @@ static void disas_simd_scalar_three_reg_diff(DisasContext *s, uint32_t insn)
*/
static void disas_simd_scalar_three_reg_same(DisasContext *s, uint32_t insn)
{
- unsupported_encoding(s, insn);
+ int rd = extract32(insn, 0, 5);
+ int rn = extract32(insn, 5, 5);
+ int opcode = extract32(insn, 11, 5);
+ int rm = extract32(insn, 16, 5);
+ int size = extract32(insn, 22, 2);
+ bool u = extract32(insn, 29, 1);
+ TCGv_i64 tcg_rn;
+ TCGv_i64 tcg_rm;
+ TCGv_i64 tcg_rd;
+
+ if (opcode >= 0x18) {
+ /* Floating point: U, size[1] and opcode indicate operation */
+ int fpopcode = opcode | (extract32(size, 1, 1) << 5) | (u << 6);
+ switch (fpopcode) {
+ case 0x1b: /* FMULX */
+ case 0x1c: /* FCMEQ */
+ case 0x1f: /* FRECPS */
+ case 0x3f: /* FRSQRTS */
+ case 0x5c: /* FCMGE */
+ case 0x5d: /* FACGE */
+ case 0x7a: /* FABD */
+ case 0x7c: /* FCMGT */
+ case 0x7d: /* FACGT */
+ unsupported_encoding(s, insn);
+ return;
+ default:
+ unallocated_encoding(s);
+ return;
+ }
+ }
+
+ switch (opcode) {
+ case 0x1: /* SQADD, UQADD */
+ case 0x5: /* SQSUB, UQSUB */
+ case 0x8: /* SSHL, USHL */
+ case 0xa: /* SRSHL, URSHL */
+ unsupported_encoding(s, insn);
+ return;
+ case 0x6: /* CMGT, CMHI */
+ case 0x7: /* CMGE, CMHS */
+ case 0x11: /* CMTST, CMEQ */
+ case 0x10: /* ADD, SUB (vector) */
+ if (size != 3) {
+ unallocated_encoding(s);
+ return;
+ }
+ break;
+ case 0x9: /* SQSHL, UQSHL */
+ case 0xb: /* SQRSHL, UQRSHL */
+ unsupported_encoding(s, insn);
+ return;
+ case 0x16: /* SQDMULH, SQRDMULH (vector) */
+ if (size != 1 && size != 2) {
+ unallocated_encoding(s);
+ return;
+ }
+ unsupported_encoding(s, insn);
+ return;
+ default:
+ unallocated_encoding(s);
+ return;
+ }
+
+ tcg_rn = read_fp_dreg(s, rn); /* op1 */
+ tcg_rm = read_fp_dreg(s, rm); /* op2 */
+ tcg_rd = tcg_temp_new_i64();
+
+ /* For the moment we only support the opcodes which are
+ * 64-bit-width only. The size != 3 cases will
+ * be handled later when the relevant ops are implemented.
+ */
+ handle_3same_64(s, opcode, u, tcg_rd, tcg_rn, tcg_rm);
+
+ write_fp_dreg(s, rd, tcg_rd);
+
+ tcg_temp_free_i64(tcg_rn);
+ tcg_temp_free_i64(tcg_rm);
+ tcg_temp_free_i64(tcg_rd);
}
/* C3.6.12 AdvSIMD scalar two reg misc
--
1.8.5
* [Qemu-devel] [PATCH v2 4/8] target-arm: A64: Add top level decode for SIMD 3-same group
From: Peter Maydell @ 2014-01-23 15:28 UTC (permalink / raw)
To: qemu-devel
Cc: patches, Michael Matz, Alexander Graf, Claudio Fontana,
Dirk Mueller, Will Newton, Laurent Desnogues, Alex Bennée,
kvmarm, Christoffer Dall, Richard Henderson
Add top level decode for the A64 SIMD three regs same group
(C3.6.16), splitting it into the pairwise, logical, float and
integer subgroups.
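All of the decode here leans on extract32() to pull fixed bit fields out
of the instruction word. A simplified model of its behaviour, for readers
following along (QEMU's real version also asserts that start/length are
in range):

#include <stdint.h>

/* Return bits [start, start+length) of value; e.g. the C3.6.16
 * opcode field is extract32(insn, 11, 5).
 */
static inline uint32_t extract32_model(uint32_t value, int start, int length)
{
    return (value >> start) & (~0U >> (32 - length));
}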
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <rth@twiddle.net>
---
target-arm/translate-a64.c | 45 ++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 44 insertions(+), 1 deletion(-)
diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
index 6ff3e43..2079c96 100644
--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c
@@ -5948,6 +5948,30 @@ static void disas_simd_three_reg_diff(DisasContext *s, uint32_t insn)
}
}
+/* Logic op (opcode == 3) subgroup of C3.6.16. */
+static void disas_simd_3same_logic(DisasContext *s, uint32_t insn)
+{
+ unsupported_encoding(s, insn);
+}
+
+/* Pairwise op subgroup of C3.6.16. */
+static void disas_simd_3same_pair(DisasContext *s, uint32_t insn)
+{
+ unsupported_encoding(s, insn);
+}
+
+/* Floating point op subgroup of C3.6.16. */
+static void disas_simd_3same_float(DisasContext *s, uint32_t insn)
+{
+ unsupported_encoding(s, insn);
+}
+
+/* Integer op subgroup of C3.6.16. */
+static void disas_simd_3same_int(DisasContext *s, uint32_t insn)
+{
+ unsupported_encoding(s, insn);
+}
+
/* C3.6.16 AdvSIMD three same
* 31 30 29 28 24 23 22 21 20 16 15 11 10 9 5 4 0
* +---+---+---+-----------+------+---+------+--------+---+------+------+
@@ -5956,7 +5980,26 @@ static void disas_simd_three_reg_diff(DisasContext *s, uint32_t insn)
*/
static void disas_simd_three_reg_same(DisasContext *s, uint32_t insn)
{
- unsupported_encoding(s, insn);
+ int opcode = extract32(insn, 11, 5);
+
+ switch (opcode) {
+ case 0x3: /* logic ops */
+ disas_simd_3same_logic(s, insn);
+ break;
+ case 0x17: /* ADDP */
+ case 0x14: /* SMAXP, UMAXP */
+ case 0x15: /* SMINP, UMINP */
+ /* Pairwise operations */
+ disas_simd_3same_pair(s, insn);
+ break;
+ case 0x18 ... 0x31:
+ /* floating point ops, sz[1] and U are part of opcode */
+ disas_simd_3same_float(s, insn);
+ break;
+ default:
+ disas_simd_3same_int(s, insn);
+ break;
+ }
}
/* C3.6.17 AdvSIMD two reg misc
--
1.8.5
* [Qemu-devel] [PATCH v2 5/8] target-arm: A64: Add logic ops from SIMD 3 same group
From: Peter Maydell @ 2014-01-23 15:28 UTC (permalink / raw)
To: qemu-devel
Cc: patches, Michael Matz, Alexander Graf, Claudio Fontana,
Dirk Mueller, Will Newton, Laurent Desnogues, Alex Bennée,
kvmarm, Christoffer Dall, Richard Henderson
Add support for the logical operations (ORR, AND, BIC, ORN, EOR, BSL,
BIT and BIF) from the SIMD 3 register same group (C3.6.16).
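BSL, BIT and BIF all come down to the same branch-free xor trick,
differing only in which register supplies the selection mask. A small C
model of the select (illustrative only; the name is invented):

#include <stdint.h>

/* Bitwise select: take bits from a where sel is 1, from b where
 * sel is 0. This is the b ^ ((a ^ b) & sel) form that the
 * generated TCG ops below compute.
 */
static inline uint64_t bit_select(uint64_t sel, uint64_t a, uint64_t b)
{
    return b ^ ((a ^ b) & sel);
}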
Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <rth@twiddle.net>
---
target-arm/translate-a64.c | 73 +++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 72 insertions(+), 1 deletion(-)
diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
index 2079c96..4767cbf 100644
--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c
@@ -5951,7 +5951,78 @@ static void disas_simd_three_reg_diff(DisasContext *s, uint32_t insn)
/* Logic op (opcode == 3) subgroup of C3.6.16. */
static void disas_simd_3same_logic(DisasContext *s, uint32_t insn)
{
- unsupported_encoding(s, insn);
+ int rd = extract32(insn, 0, 5);
+ int rn = extract32(insn, 5, 5);
+ int rm = extract32(insn, 16, 5);
+ int size = extract32(insn, 22, 2);
+ bool is_u = extract32(insn, 29, 1);
+ bool is_q = extract32(insn, 30, 1);
+ TCGv_i64 tcg_op1 = tcg_temp_new_i64();
+ TCGv_i64 tcg_op2 = tcg_temp_new_i64();
+ TCGv_i64 tcg_res[2];
+ int pass;
+
+ tcg_res[0] = tcg_temp_new_i64();
+ tcg_res[1] = tcg_temp_new_i64();
+
+ for (pass = 0; pass < (is_q ? 2 : 1); pass++) {
+ read_vec_element(s, tcg_op1, rn, pass, MO_64);
+ read_vec_element(s, tcg_op2, rm, pass, MO_64);
+
+ if (!is_u) {
+ switch (size) {
+ case 0: /* AND */
+ tcg_gen_and_i64(tcg_res[pass], tcg_op1, tcg_op2);
+ break;
+ case 1: /* BIC */
+ tcg_gen_andc_i64(tcg_res[pass], tcg_op1, tcg_op2);
+ break;
+ case 2: /* ORR */
+ tcg_gen_or_i64(tcg_res[pass], tcg_op1, tcg_op2);
+ break;
+ case 3: /* ORN */
+ tcg_gen_orc_i64(tcg_res[pass], tcg_op1, tcg_op2);
+ break;
+ }
+ } else {
+ if (size != 0) {
+ /* B* ops need res loaded to operate on */
+ read_vec_element(s, tcg_res[pass], rd, pass, MO_64);
+ }
+
+ switch (size) {
+ case 0: /* EOR */
+ tcg_gen_xor_i64(tcg_res[pass], tcg_op1, tcg_op2);
+ break;
+ case 1: /* BSL bitwise select */
+ tcg_gen_xor_i64(tcg_op1, tcg_op1, tcg_op2);
+ tcg_gen_and_i64(tcg_op1, tcg_op1, tcg_res[pass]);
+ tcg_gen_xor_i64(tcg_res[pass], tcg_op2, tcg_op1);
+ break;
+ case 2: /* BIT, bitwise insert if true */
+ tcg_gen_xor_i64(tcg_op1, tcg_op1, tcg_res[pass]);
+ tcg_gen_and_i64(tcg_op1, tcg_op1, tcg_op2);
+ tcg_gen_xor_i64(tcg_res[pass], tcg_res[pass], tcg_op1);
+ break;
+ case 3: /* BIF, bitwise insert if false */
+ tcg_gen_xor_i64(tcg_op1, tcg_op1, tcg_res[pass]);
+ tcg_gen_andc_i64(tcg_op1, tcg_op1, tcg_op2);
+ tcg_gen_xor_i64(tcg_res[pass], tcg_res[pass], tcg_op1);
+ break;
+ }
+ }
+ }
+
+ write_vec_element(s, tcg_res[0], rd, 0, MO_64);
+ if (!is_q) {
+ tcg_gen_movi_i64(tcg_res[1], 0);
+ }
+ write_vec_element(s, tcg_res[1], rd, 1, MO_64);
+
+ tcg_temp_free_i64(tcg_op1);
+ tcg_temp_free_i64(tcg_op2);
+ tcg_temp_free_i64(tcg_res[0]);
+ tcg_temp_free_i64(tcg_res[1]);
}
/* Pairwise op subgroup of C3.6.16. */
--
1.8.5
* [Qemu-devel] [PATCH v2 6/8] target-arm: A64: Add integer ops from SIMD 3-same group
From: Peter Maydell @ 2014-01-23 15:28 UTC (permalink / raw)
To: qemu-devel
Cc: patches, Michael Matz, Alexander Graf, Claudio Fontana,
Dirk Mueller, Will Newton, Laurent Desnogues, Alex Bennée,
kvmarm, Christoffer Dall, Richard Henderson
Add some of the integer operations in the SIMD 3-same group:
specifically, the comparisons, addition and subtraction.
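For the sub-64-bit element sizes the patch dispatches through small
constant tables of helper functions indexed by element size and the U
bit. A stripped-down sketch of the pattern (types and names invented):

#include <stdint.h>

typedef uint32_t two_op_fn(uint32_t a, uint32_t b);

static uint32_t op_add(uint32_t a, uint32_t b) { return a + b; }
static uint32_t op_sub(uint32_t a, uint32_t b) { return a - b; }

/* indexed by the U bit, like the fns[size][u] tables in the patch */
static two_op_fn * const addsub_fns[2] = { op_add, op_sub };

static uint32_t do_addsub(int u, uint32_t a, uint32_t b)
{
    return addsub_fns[u](a, b);
}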
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
---
target-arm/translate-a64.c | 158 ++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 157 insertions(+), 1 deletion(-)
diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
index 4767cbf..d24ce97 100644
--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c
@@ -72,6 +72,9 @@ typedef struct AArch64DecodeTable {
AArch64DecodeFn *disas_fn;
} AArch64DecodeTable;
+/* Function prototype for gen_ functions for calling Neon helpers */
+typedef void NeonGenTwoOpFn(TCGv_i32, TCGv_i32, TCGv_i32);
+
/* initialize TCG globals. */
void a64_translate_init(void)
{
@@ -787,6 +790,25 @@ static void write_vec_element(DisasContext *s, TCGv_i64 tcg_src, int destidx,
}
}
+static void write_vec_element_i32(DisasContext *s, TCGv_i32 tcg_src, int destidx,
+ int element, TCGMemOp memop)
+{
+ int vect_off = vec_reg_offset(destidx, element, memop & MO_SIZE);
+ switch (memop) {
+ case MO_8:
+ tcg_gen_st8_i32(tcg_src, cpu_env, vect_off);
+ break;
+ case MO_16:
+ tcg_gen_st16_i32(tcg_src, cpu_env, vect_off);
+ break;
+ case MO_32:
+ tcg_gen_st_i32(tcg_src, cpu_env, vect_off);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+}
+
/* Clear the high 64 bits of a 128 bit vector (in general non-quad
* vector ops all need to do this).
*/
@@ -6040,7 +6062,141 @@ static void disas_simd_3same_float(DisasContext *s, uint32_t insn)
/* Integer op subgroup of C3.6.16. */
static void disas_simd_3same_int(DisasContext *s, uint32_t insn)
{
- unsupported_encoding(s, insn);
+ int is_q = extract32(insn, 30, 1);
+ int u = extract32(insn, 29, 1);
+ int size = extract32(insn, 22, 2);
+ int opcode = extract32(insn, 11, 5);
+ int rm = extract32(insn, 16, 5);
+ int rn = extract32(insn, 5, 5);
+ int rd = extract32(insn, 0, 5);
+ int pass;
+
+ switch (opcode) {
+ case 0x13: /* MUL, PMUL */
+ if (u && size != 0) {
+ unallocated_encoding(s);
+ return;
+ }
+ /* fall through */
+ case 0x0: /* SHADD, UHADD */
+ case 0x2: /* SRHADD, URHADD */
+ case 0x4: /* SHSUB, UHSUB */
+ case 0xc: /* SMAX, UMAX */
+ case 0xd: /* SMIN, UMIN */
+ case 0xe: /* SABD, UABD */
+ case 0xf: /* SABA, UABA */
+ case 0x12: /* MLA, MLS */
+ if (size == 3) {
+ unallocated_encoding(s);
+ return;
+ }
+ unsupported_encoding(s, insn);
+ return;
+ case 0x1: /* SQADD */
+ case 0x5: /* SQSUB */
+ case 0x8: /* SSHL, USHL */
+ case 0x9: /* SQSHL, UQSHL */
+ case 0xa: /* SRSHL, URSHL */
+ case 0xb: /* SQRSHL, UQRSHL */
+ if (size == 3 && !is_q) {
+ unallocated_encoding(s);
+ return;
+ }
+ unsupported_encoding(s, insn);
+ return;
+ default:
+ if (size == 3 && !is_q) {
+ unallocated_encoding(s);
+ return;
+ }
+ break;
+ }
+
+ if (size == 3) {
+ for (pass = 0; pass < (is_q ? 2 : 1); pass++) {
+ TCGv_i64 tcg_op1 = tcg_temp_new_i64();
+ TCGv_i64 tcg_op2 = tcg_temp_new_i64();
+ TCGv_i64 tcg_res = tcg_temp_new_i64();
+
+ read_vec_element(s, tcg_op1, rn, pass, MO_64);
+ read_vec_element(s, tcg_op2, rm, pass, MO_64);
+
+ handle_3same_64(s, opcode, u, tcg_res, tcg_op1, tcg_op2);
+
+ write_vec_element(s, tcg_res, rd, pass, MO_64);
+
+ tcg_temp_free_i64(tcg_res);
+ tcg_temp_free_i64(tcg_op1);
+ tcg_temp_free_i64(tcg_op2);
+ }
+ } else {
+ for (pass = 0; pass < (is_q ? 4 : 2); pass++) {
+ TCGv_i32 tcg_op1 = tcg_temp_new_i32();
+ TCGv_i32 tcg_op2 = tcg_temp_new_i32();
+ TCGv_i32 tcg_res = tcg_temp_new_i32();
+ NeonGenTwoOpFn *genfn;
+
+ read_vec_element_i32(s, tcg_op1, rn, pass, MO_32);
+ read_vec_element_i32(s, tcg_op2, rm, pass, MO_32);
+
+ switch (opcode) {
+ case 0x6: /* CMGT, CMHI */
+ {
+ static NeonGenTwoOpFn * const fns[3][2] = {
+ { gen_helper_neon_cgt_s8, gen_helper_neon_cgt_u8 },
+ { gen_helper_neon_cgt_s16, gen_helper_neon_cgt_u16 },
+ { gen_helper_neon_cgt_s32, gen_helper_neon_cgt_u32 },
+ };
+ genfn = fns[size][u];
+ break;
+ }
+ case 0x7: /* CMGE, CMHS */
+ {
+ static NeonGenTwoOpFn * const fns[3][2] = {
+ { gen_helper_neon_cge_s8, gen_helper_neon_cge_u8 },
+ { gen_helper_neon_cge_s16, gen_helper_neon_cge_u16 },
+ { gen_helper_neon_cge_s32, gen_helper_neon_cge_u32 },
+ };
+ genfn = fns[size][u];
+ break;
+ }
+ case 0x10: /* ADD, SUB */
+ {
+ static NeonGenTwoOpFn * const fns[3][2] = {
+ { gen_helper_neon_add_u8, gen_helper_neon_sub_u8 },
+ { gen_helper_neon_add_u16, gen_helper_neon_sub_u16 },
+ { tcg_gen_add_i32, tcg_gen_sub_i32 },
+ };
+ genfn = fns[size][u];
+ break;
+ }
+ case 0x11: /* CMTST, CMEQ */
+ {
+ static NeonGenTwoOpFn * const fns[3][2] = {
+ { gen_helper_neon_tst_u8, gen_helper_neon_ceq_u8 },
+ { gen_helper_neon_tst_u16, gen_helper_neon_ceq_u16 },
+ { gen_helper_neon_tst_u32, gen_helper_neon_ceq_u32 },
+ };
+ genfn = fns[size][u];
+ break;
+ }
+ default:
+ g_assert_not_reached();
+ }
+
+ genfn(tcg_res, tcg_op1, tcg_op2);
+
+ write_vec_element_i32(s, tcg_res, rd, pass, MO_32);
+
+ tcg_temp_free_i32(tcg_res);
+ tcg_temp_free_i32(tcg_op1);
+ tcg_temp_free_i32(tcg_op2);
+ }
+ }
+
+ if (!is_q) {
+ clear_vec_high(s, rd);
+ }
}
/* C3.6.16 AdvSIMD three same
--
1.8.5
* Re: [Qemu-devel] [PATCH v2 6/8] target-arm: A64: Add integer ops from SIMD 3-same group
From: Richard Henderson @ 2014-01-23 16:53 UTC (permalink / raw)
To: Peter Maydell, qemu-devel
Cc: patches, Michael Matz, Alexander Graf, Claudio Fontana,
Dirk Mueller, Will Newton, Laurent Desnogues, Alex Bennée,
kvmarm, Christoffer Dall
On 01/23/2014 07:28 AM, Peter Maydell wrote:
> Add some of the integer operations in the SIMD 3-same group:
> specifically, the comparisons, addition and subtraction.
>
> Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
> ---
> target-arm/translate-a64.c | 158 ++++++++++++++++++++++++++++++++++++++++++++-
> 1 file changed, 157 insertions(+), 1 deletion(-)
Reviewed-by: Richard Henderson <rth@twiddle.net>
r~
* Re: [Qemu-devel] [PATCH v2 6/8] target-arm: A64: Add integer ops from SIMD 3-same group
From: Peter Maydell @ 2014-01-25 18:35 UTC (permalink / raw)
To: QEMU Developers
Cc: Laurent Desnogues, Patch Tracking, Michael Matz, Claudio Fontana,
Dirk Mueller, Will Newton, kvmarm@lists.cs.columbia.edu,
Richard Henderson
On 23 January 2014 15:28, Peter Maydell <peter.maydell@linaro.org> wrote:
> Add some of the integer operations in the SIMD 3-same group:
> specifically, the comparisons, addition and subtraction.
>
> @@ -6040,7 +6062,141 @@ static void disas_simd_3same_float(DisasContext *s, uint32_t insn)
> /* Integer op subgroup of C3.6.16. */
> static void disas_simd_3same_int(DisasContext *s, uint32_t insn)
> {
> - unsupported_encoding(s, insn);
> + int is_q = extract32(insn, 30, 1);
> + int u = extract32(insn, 29, 1);
> + int size = extract32(insn, 22, 2);
> + int opcode = extract32(insn, 11, 5);
> + int rm = extract32(insn, 16, 5);
> + int rn = extract32(insn, 5, 5);
> + int rd = extract32(insn, 0, 5);
> + int pass;
> +
> + switch (opcode) {
> + case 0x13: /* MUL, PMUL */
> + if (u && size != 0) {
> + unallocated_encoding(s);
> + return;
> + }
> + /* fall through */
> + case 0x0: /* SHADD, UHADD */
> + case 0x2: /* SRHADD, URHADD */
> + case 0x4: /* SHSUB, UHSUB */
> + case 0xc: /* SMAX, UMAX */
> + case 0xd: /* SMIN, UMIN */
> + case 0xe: /* SABD, UABD */
> + case 0xf: /* SABA, UABA */
> + case 0x12: /* MLA, MLS */
> + if (size == 3) {
> + unallocated_encoding(s);
> + return;
> + }
> + unsupported_encoding(s, insn);
> + return;
> + case 0x1: /* SQADD */
> + case 0x5: /* SQSUB */
> + case 0x8: /* SSHL, USHL */
> + case 0x9: /* SQSHL, UQSHL */
> + case 0xa: /* SRSHL, URSHL */
> + case 0xb: /* SQRSHL, UQRSHL */
> + if (size == 3 && !is_q) {
> + unallocated_encoding(s);
> + return;
> + }
> + unsupported_encoding(s, insn);
> + return;
> + default:
> + if (size == 3 && !is_q) {
> + unallocated_encoding(s);
> + return;
> + }
> + break;
> + }
Just noticed this switch is missing a case:
case 0x16: /* SQDMULH, SQRDMULH */
if (size == 0 || size == 3) {
unallocated_encoding(s);
return;
}
break;
thanks
-- PMM
* [Qemu-devel] [PATCH v2 7/8] target-arm: A64: Add simple SIMD 3-same floating point ops
From: Peter Maydell @ 2014-01-23 15:28 UTC (permalink / raw)
To: qemu-devel
Cc: patches, Michael Matz, Alexander Graf, Claudio Fontana,
Dirk Mueller, Will Newton, Laurent Desnogues, Alex Bennée,
kvmarm, Christoffer Dall, Richard Henderson
Implement a simple subset of the SIMD 3-same floating point
operations. This includes a common helper function used for both
scalar and vector ops; FABD is the only currently implemented
shared op.
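FABD is composed from the existing subtract and abs helpers rather than
getting a dedicated one. In plain C terms the element operation is simply
this (a sketch of the double-precision case; real FP flag and NaN
behaviour comes from the softfloat helpers):

#include <math.h>

static inline double fabd_element(double a, double b)
{
    return fabs(a - b); /* subtract, then clear the sign bit */
}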
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <rth@twiddle.net>
---
target-arm/translate-a64.c | 191 ++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 189 insertions(+), 2 deletions(-)
diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
index d24ce97..5eabf24 100644
--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c
@@ -5605,6 +5605,132 @@ static void handle_3same_64(DisasContext *s, int opcode, bool u,
}
}
+/* Handle the 3-same-operands float operations; shared by the scalar
+ * and vector encodings. The caller must filter out any encodings
+ * not allocated for the encoding it is dealing with.
+ */
+static void handle_3same_float(DisasContext *s, int size, int elements,
+ int fpopcode, int rd, int rn, int rm)
+{
+ int pass;
+ TCGv_ptr fpst = get_fpstatus_ptr();
+
+ for (pass = 0; pass < elements; pass++) {
+ if (size) {
+ /* Double */
+ TCGv_i64 tcg_op1 = tcg_temp_new_i64();
+ TCGv_i64 tcg_op2 = tcg_temp_new_i64();
+ TCGv_i64 tcg_res = tcg_temp_new_i64();
+
+ read_vec_element(s, tcg_op1, rn, pass, MO_64);
+ read_vec_element(s, tcg_op2, rm, pass, MO_64);
+
+ switch (fpopcode) {
+ case 0x18: /* FMAXNM */
+ gen_helper_vfp_maxnumd(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x1a: /* FADD */
+ gen_helper_vfp_addd(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x1e: /* FMAX */
+ gen_helper_vfp_maxd(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x38: /* FMINNM */
+ gen_helper_vfp_minnumd(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x3a: /* FSUB */
+ gen_helper_vfp_subd(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x3e: /* FMIN */
+ gen_helper_vfp_mind(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x5b: /* FMUL */
+ gen_helper_vfp_muld(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x5f: /* FDIV */
+ gen_helper_vfp_divd(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x7a: /* FABD */
+ gen_helper_vfp_subd(tcg_res, tcg_op1, tcg_op2, fpst);
+ gen_helper_vfp_absd(tcg_res, tcg_res);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+
+ write_vec_element(s, tcg_res, rd, pass, MO_64);
+
+ tcg_temp_free_i64(tcg_res);
+ tcg_temp_free_i64(tcg_op1);
+ tcg_temp_free_i64(tcg_op2);
+ } else {
+ /* Single */
+ TCGv_i32 tcg_op1 = tcg_temp_new_i32();
+ TCGv_i32 tcg_op2 = tcg_temp_new_i32();
+ TCGv_i32 tcg_res = tcg_temp_new_i32();
+ TCGv_i64 tcg_tmp = tcg_temp_new_i64();
+
+ read_vec_element(s, tcg_tmp, rn, pass, MO_32);
+ tcg_gen_trunc_i64_i32(tcg_op1, tcg_tmp);
+ read_vec_element(s, tcg_tmp, rm, pass, MO_32);
+ tcg_gen_trunc_i64_i32(tcg_op2, tcg_tmp);
+
+ switch (fpopcode) {
+ case 0x1a: /* FADD */
+ gen_helper_vfp_adds(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x1e: /* FMAX */
+ gen_helper_vfp_maxs(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x18: /* FMAXNM */
+ gen_helper_vfp_maxnums(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x38: /* FMINNM */
+ gen_helper_vfp_minnums(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x3a: /* FSUB */
+ gen_helper_vfp_subs(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x3e: /* FMIN */
+ gen_helper_vfp_mins(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x5b: /* FMUL */
+ gen_helper_vfp_muls(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x5f: /* FDIV */
+ gen_helper_vfp_divs(tcg_res, tcg_op1, tcg_op2, fpst);
+ break;
+ case 0x7a: /* FABD */
+ gen_helper_vfp_subs(tcg_res, tcg_op1, tcg_op2, fpst);
+ gen_helper_vfp_abss(tcg_res, tcg_res);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+
+ tcg_gen_extu_i32_i64(tcg_tmp, tcg_res);
+ if (elements == 1) {
+ /* scalar single so clear high part */
+ write_vec_element(s, tcg_tmp, rd, pass, MO_64);
+ } else {
+ write_vec_element(s, tcg_tmp, rd, pass, MO_32);
+ }
+
+ tcg_temp_free_i64(tcg_tmp);
+ tcg_temp_free_i32(tcg_res);
+ tcg_temp_free_i32(tcg_op1);
+ tcg_temp_free_i32(tcg_op2);
+ }
+ }
+
+ tcg_temp_free_ptr(fpst);
+
+ if ((elements << size) < 4) {
+ /* scalar, or non-quad vector op */
+ clear_vec_high(s, rd);
+ }
+}
+
/* C3.6.11 AdvSIMD scalar three same
* 31 30 29 28 24 23 22 21 20 16 15 11 10 9 5 4 0
* +-----+---+-----------+------+---+------+--------+---+------+------+
@@ -5633,15 +5759,19 @@ static void disas_simd_scalar_three_reg_same(DisasContext *s, uint32_t insn)
case 0x3f: /* FRSQRTS */
case 0x5c: /* FCMGE */
case 0x5d: /* FACGE */
- case 0x7a: /* FABD */
case 0x7c: /* FCMGT */
case 0x7d: /* FACGT */
unsupported_encoding(s, insn);
return;
+ case 0x7a: /* FABD */
+ break;
default:
unallocated_encoding(s);
return;
}
+
+ handle_3same_float(s, extract32(size, 0, 1), 1, fpopcode, rd, rn, rm);
+ return;
}
switch (opcode) {
@@ -6056,7 +6186,64 @@ static void disas_simd_3same_pair(DisasContext *s, uint32_t insn)
/* Floating point op subgroup of C3.6.16. */
static void disas_simd_3same_float(DisasContext *s, uint32_t insn)
{
- unsupported_encoding(s, insn);
+ /* For floating point ops, the U, size[1] and opcode bits
+ * together indicate the operation. size[0] indicates single
+ * or double.
+ */
+ int fpopcode = extract32(insn, 11, 5)
+ | (extract32(insn, 23, 1) << 5)
+ | (extract32(insn, 29, 1) << 6);
+ int is_q = extract32(insn, 30, 1);
+ int size = extract32(insn, 22, 1);
+ int rm = extract32(insn, 16, 5);
+ int rn = extract32(insn, 5, 5);
+ int rd = extract32(insn, 0, 5);
+
+ int datasize = is_q ? 128 : 64;
+ int esize = 32 << size;
+ int elements = datasize / esize;
+
+ if (size == 1 && !is_q) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ switch (fpopcode) {
+ case 0x58: /* FMAXNMP */
+ case 0x5a: /* FADDP */
+ case 0x5e: /* FMAXP */
+ case 0x78: /* FMINNMP */
+ case 0x7e: /* FMINP */
+ /* pairwise ops */
+ unsupported_encoding(s, insn);
+ return;
+ case 0x1b: /* FMULX */
+ case 0x1c: /* FCMEQ */
+ case 0x1f: /* FRECPS */
+ case 0x3f: /* FRSQRTS */
+ case 0x5c: /* FCMGE */
+ case 0x5d: /* FACGE */
+ case 0x7c: /* FCMGT */
+ case 0x7d: /* FACGT */
+ case 0x19: /* FMLA */
+ case 0x39: /* FMLS */
+ unsupported_encoding(s, insn);
+ return;
+ case 0x18: /* FMAXNM */
+ case 0x1a: /* FADD */
+ case 0x1e: /* FMAX */
+ case 0x38: /* FMINNM */
+ case 0x3a: /* FSUB */
+ case 0x3e: /* FMIN */
+ case 0x5b: /* FMUL */
+ case 0x5f: /* FDIV */
+ case 0x7a: /* FABD */
+ handle_3same_float(s, size, elements, fpopcode, rd, rn, rm);
+ return;
+ default:
+ unallocated_encoding(s);
+ return;
+ }
}
/* Integer op subgroup of C3.6.16. */
--
1.8.5
* Re: [Qemu-devel] [PATCH v2 7/8] target-arm: A64: Add simple SIMD 3-same floating point ops
From: Richard Henderson @ 2014-01-23 16:55 UTC (permalink / raw)
To: Peter Maydell, qemu-devel
Cc: patches, Michael Matz, Alexander Graf, Claudio Fontana,
Dirk Mueller, Will Newton, Laurent Desnogues, Alex Bennée,
kvmarm, Christoffer Dall
On 01/23/2014 07:28 AM, Peter Maydell wrote:
> + /* Single */
> + TCGv_i32 tcg_op1 = tcg_temp_new_i32();
> + TCGv_i32 tcg_op2 = tcg_temp_new_i32();
> + TCGv_i32 tcg_res = tcg_temp_new_i32();
> + TCGv_i64 tcg_tmp = tcg_temp_new_i64();
> +
> + read_vec_element(s, tcg_tmp, rn, pass, MO_32);
> + tcg_gen_trunc_i64_i32(tcg_op1, tcg_tmp);
> + read_vec_element(s, tcg_tmp, rm, pass, MO_32);
> + tcg_gen_trunc_i64_i32(tcg_op2, tcg_tmp);
Update for new _i32 helpers.
r~
* Re: [Qemu-devel] [PATCH v2 7/8] target-arm: A64: Add simple SIMD 3-same floating point ops
From: Peter Maydell @ 2014-01-23 17:26 UTC (permalink / raw)
To: Richard Henderson
Cc: Patch Tracking, Michael Matz, Alexander Graf, QEMU Developers,
Claudio Fontana, Dirk Mueller, Will Newton, Laurent Desnogues,
Alex Bennée, kvmarm@lists.cs.columbia.edu, Christoffer Dall
On 23 January 2014 16:55, Richard Henderson <rth@twiddle.net> wrote:
> On 01/23/2014 07:28 AM, Peter Maydell wrote:
>> + /* Single */
>> + TCGv_i32 tcg_op1 = tcg_temp_new_i32();
>> + TCGv_i32 tcg_op2 = tcg_temp_new_i32();
>> + TCGv_i32 tcg_res = tcg_temp_new_i32();
>> + TCGv_i64 tcg_tmp = tcg_temp_new_i64();
>> +
>> + read_vec_element(s, tcg_tmp, rn, pass, MO_32);
>> + tcg_gen_trunc_i64_i32(tcg_op1, tcg_tmp);
>> + read_vec_element(s, tcg_tmp, rm, pass, MO_32);
>> + tcg_gen_trunc_i64_i32(tcg_op2, tcg_tmp);
>
> Update for new _i32 helpers.
Doh. Fixup patch (respin of series available on demand):
===begin===
diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
index 5eabf24..6bc0314 100644
--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c
@@ -5668,12 +5668,9 @@ static void handle_3same_float(DisasContext *s, int size, int elements,
TCGv_i32 tcg_op1 = tcg_temp_new_i32();
TCGv_i32 tcg_op2 = tcg_temp_new_i32();
TCGv_i32 tcg_res = tcg_temp_new_i32();
- TCGv_i64 tcg_tmp = tcg_temp_new_i64();
- read_vec_element(s, tcg_tmp, rn, pass, MO_32);
- tcg_gen_trunc_i64_i32(tcg_op1, tcg_tmp);
- read_vec_element(s, tcg_tmp, rm, pass, MO_32);
- tcg_gen_trunc_i64_i32(tcg_op2, tcg_tmp);
+ read_vec_element_i32(s, tcg_op1, rn, pass, MO_32);
+ read_vec_element_i32(s, tcg_op2, rm, pass, MO_32);
switch (fpopcode) {
case 0x1a: /* FADD */
@@ -5708,15 +5705,17 @@ static void handle_3same_float(DisasContext *s, int size, int elements,
g_assert_not_reached();
}
- tcg_gen_extu_i32_i64(tcg_tmp, tcg_res);
if (elements == 1) {
/* scalar single so clear high part */
+ TCGv_i64 tcg_tmp = tcg_temp_new_i64();
+
+ tcg_gen_extu_i32_i64(tcg_tmp, tcg_res);
write_vec_element(s, tcg_tmp, rd, pass, MO_64);
+ tcg_temp_free_i64(tcg_tmp);
} else {
- write_vec_element(s, tcg_tmp, rd, pass, MO_32);
+ write_vec_element_i32(s, tcg_res, rd, pass, MO_32);
}
- tcg_temp_free_i64(tcg_tmp);
tcg_temp_free_i32(tcg_res);
tcg_temp_free_i32(tcg_op1);
tcg_temp_free_i32(tcg_op2);
===endit===
thanks
-- PMM
* [Qemu-devel] [PATCH v2 8/8] target-arm: A64: Add SIMD shift by immediate
From: Peter Maydell @ 2014-01-23 15:28 UTC (permalink / raw)
To: qemu-devel
Cc: patches, Michael Matz, Alexander Graf, Claudio Fontana,
Dirk Mueller, Will Newton, Laurent Desnogues, Alex Bennée,
kvmarm, Christoffer Dall, Richard Henderson
From: Alex Bennée <alex.bennee@linaro.org>
This implements a subset of the AdvSIMD shift operations (namely all the
non-saturating, non-narrowing ones). The actual shift generation code
is shared between the scalar and vector cases, wrapped in either vector
element iteration or FP register access.
The rounding operations need extra care: adding the rounding constant
can carry out of bit 63, so the intermediate result must be kept wider
than 64 bits rather than being truncated.
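A concrete example of that intermediate-width problem: URSHR #1 on the
64-bit value 0xffffffffffffffff should produce 0x8000000000000000, but a
naive 64-bit rounding add wraps to zero first. A sketch, using GCC's
unsigned __int128 purely for illustration, where the patch instead keeps
an explicit high word via tcg_gen_add2_i64:

#include <stdint.h>

uint64_t urshr1_naive(uint64_t x)
{
    return (x + 1) >> 1; /* x + 1 wraps: returns 0 for UINT64_MAX */
}

uint64_t urshr1_widened(uint64_t x)
{
    unsigned __int128 t = (unsigned __int128)x + 1;
    return (uint64_t)(t >> 1); /* 0x8000000000000000 for UINT64_MAX */
}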
Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Richard Henderson <rth@twiddle.net>
---
target-arm/translate-a64.c | 381 ++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 379 insertions(+), 2 deletions(-)
diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
index 5eabf24..9eb91fc4 100644
--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c
@@ -5531,15 +5531,220 @@ static void disas_simd_scalar_pairwise(DisasContext *s, uint32_t insn)
unsupported_encoding(s, insn);
}
+/*
+ * Common SSHR[RA]/USHR[RA] - Shift right (optional rounding/accumulate)
+ *
+ * This function handles the common shift logic and is used by both
+ * the vector and scalar code.
+ */
+static void handle_shri_with_rndacc(TCGv_i64 tcg_res, TCGv_i64 tcg_src,
+ TCGv_i64 tcg_rnd, bool accumulate,
+ bool is_u, int size, int shift)
+{
+ bool extended_result = false;
+ bool round = !TCGV_IS_UNUSED_I64(tcg_rnd);
+ int ext_lshift = 0;
+ TCGv_i64 tcg_src_hi;
+
+ if (round && size == 3) {
+ extended_result = true;
+ ext_lshift = 64 - shift;
+ tcg_src_hi = tcg_temp_new_i64();
+ } else if (shift == 64) {
+ if (!accumulate && is_u) {
+ /* result is zero */
+ tcg_gen_movi_i64(tcg_res, 0);
+ return;
+ }
+ }
+
+ /* Deal with the rounding step */
+ if (round) {
+ if (extended_result) {
+ TCGv_i64 tcg_zero = tcg_const_i64(0);
+ if (!is_u) {
+ /* take care of sign extending tcg_res */
+ tcg_gen_sari_i64(tcg_src_hi, tcg_src, 63);
+ tcg_gen_add2_i64(tcg_src, tcg_src_hi,
+ tcg_src, tcg_src_hi,
+ tcg_rnd, tcg_zero);
+ } else {
+ tcg_gen_add2_i64(tcg_src, tcg_src_hi,
+ tcg_src, tcg_zero,
+ tcg_rnd, tcg_zero);
+ }
+ tcg_temp_free_i64(tcg_zero);
+ } else {
+ tcg_gen_add_i64(tcg_src, tcg_src, tcg_rnd);
+ }
+ }
+
+ /* Now do the shift right */
+ if (round && extended_result) {
+ /* extended case, >64 bit precision required */
+ if (ext_lshift == 0) {
+ /* special case, only high bits matter */
+ tcg_gen_mov_i64(tcg_src, tcg_src_hi);
+ } else {
+ tcg_gen_shri_i64(tcg_src, tcg_src, shift);
+ tcg_gen_shli_i64(tcg_src_hi, tcg_src_hi, ext_lshift);
+ tcg_gen_or_i64(tcg_src, tcg_src, tcg_src_hi);
+ }
+ } else {
+ if (is_u) {
+ if (shift == 64) {
+ /* essentially shifting in 64 zeros */
+ tcg_gen_movi_i64(tcg_src, 0);
+ } else {
+ tcg_gen_shri_i64(tcg_src, tcg_src, shift);
+ }
+ } else {
+ if (shift == 64) {
+ /* effectively extending the sign-bit */
+ tcg_gen_sari_i64(tcg_src, tcg_src, 63);
+ } else {
+ tcg_gen_sari_i64(tcg_src, tcg_src, shift);
+ }
+ }
+ }
+
+ if (accumulate) {
+ tcg_gen_add_i64(tcg_res, tcg_res, tcg_src);
+ } else {
+ tcg_gen_mov_i64(tcg_res, tcg_src);
+ }
+
+ if (extended_result) {
+ tcg_temp_free_i64(tcg_src_hi);
+ }
+}
+
+/* Common SHL/SLI - Shift left with an optional insert */
+static void handle_shli_with_ins(TCGv_i64 tcg_res, TCGv_i64 tcg_src,
+ bool insert, int shift)
+{
+ if (insert) { /* SLI */
+ tcg_gen_deposit_i64(tcg_res, tcg_res, tcg_src, shift, 64 - shift);
+ } else { /* SHL */
+ tcg_gen_shli_i64(tcg_res, tcg_src, shift);
+ }
+}
+
+/* SSHR[RA]/USHR[RA] - Scalar shift right (optional rounding/accumulate) */
+static void handle_scalar_simd_shri(DisasContext *s,
+ bool is_u, int immh, int immb,
+ int opcode, int rn, int rd)
+{
+ const int size = 3;
+ int immhb = immh << 3 | immb;
+ int shift = 2 * (8 << size) - immhb;
+ bool accumulate = false;
+ bool round = false;
+ TCGv_i64 tcg_rn;
+ TCGv_i64 tcg_rd;
+ TCGv_i64 tcg_round;
+
+ if (!extract32(immh, 3, 1)) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ switch (opcode) {
+ case 0x02: /* SSRA / USRA (accumulate) */
+ accumulate = true;
+ break;
+ case 0x04: /* SRSHR / URSHR (rounding) */
+ round = true;
+ break;
+ case 0x06: /* SRSRA / URSRA (accum + rounding) */
+ accumulate = round = true;
+ break;
+ }
+
+ if (round) {
+ uint64_t round_const = 1ULL << (shift - 1);
+ tcg_round = tcg_const_i64(round_const);
+ } else {
+ TCGV_UNUSED_I64(tcg_round);
+ }
+
+ tcg_rn = read_fp_dreg(s, rn);
+ tcg_rd = accumulate ? read_fp_dreg(s, rd) : tcg_temp_new_i64();
+
+ handle_shri_with_rndacc(tcg_rd, tcg_rn, tcg_round,
+ accumulate, is_u, size, shift);
+
+ write_fp_dreg(s, rd, tcg_rd);
+
+ tcg_temp_free_i64(tcg_rn);
+ tcg_temp_free_i64(tcg_rd);
+ if (round) {
+ tcg_temp_free_i64(tcg_round);
+ }
+}
+
+/* SHL/SLI - Scalar shift left */
+static void handle_scalar_simd_shli(DisasContext *s, bool insert,
+ int immh, int immb, int opcode,
+ int rn, int rd)
+{
+ int size = 32 - clz32(immh) - 1;
+ int immhb = immh << 3 | immb;
+ int shift = immhb - (8 << size);
+ TCGv_i64 tcg_rn;
+ TCGv_i64 tcg_rd;
+
+ if (!extract32(immh, 3, 1)) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ tcg_rn = read_fp_dreg(s, rn);
+ tcg_rd = insert ? read_fp_dreg(s, rd) : tcg_temp_new_i64();
+
+ handle_shli_with_ins(tcg_rd, tcg_rn, insert, shift);
+
+ write_fp_dreg(s, rd, tcg_rd);
+
+ tcg_temp_free_i64(tcg_rn);
+ tcg_temp_free_i64(tcg_rd);
+
+ return;
+}
+
/* C3.6.9 AdvSIMD scalar shift by immediate
* 31 30 29 28 23 22 19 18 16 15 11 10 9 5 4 0
* +-----+---+-------------+------+------+--------+---+------+------+
* | 0 1 | U | 1 1 1 1 1 0 | immh | immb | opcode | 1 | Rn | Rd |
* +-----+---+-------------+------+------+--------+---+------+------+
+ *
+ * This is the scalar version, so it works on fixed-size registers
*/
static void disas_simd_scalar_shift_imm(DisasContext *s, uint32_t insn)
{
- unsupported_encoding(s, insn);
+ int rd = extract32(insn, 0, 5);
+ int rn = extract32(insn, 5, 5);
+ int opcode = extract32(insn, 11, 5);
+ int immb = extract32(insn, 16, 3);
+ int immh = extract32(insn, 19, 4);
+ bool is_u = extract32(insn, 29, 1);
+
+ switch (opcode) {
+ case 0x00: /* SSHR / USHR */
+ case 0x02: /* SSRA / USRA */
+ case 0x04: /* SRSHR / URSHR */
+ case 0x06: /* SRSRA / URSRA */
+ handle_scalar_simd_shri(s, is_u, immh, immb, opcode, rn, rd);
+ break;
+ case 0x0a: /* SHL / SLI */
+ handle_scalar_simd_shli(s, is_u, immh, immb, opcode, rn, rd);
+ break;
+ default:
+ unsupported_encoding(s, insn);
+ break;
+ }
+
+ return;
}
/* C3.6.10 AdvSIMD scalar three different
@@ -5845,6 +6050,150 @@ static void disas_simd_scalar_indexed(DisasContext *s, uint32_t insn)
unsupported_encoding(s, insn);
}
+/* SSHR[RA]/USHR[RA] - Vector shift right (optional rounding/accumulate) */
+static void handle_vec_simd_shri(DisasContext *s, bool is_q, bool is_u,
+ int immh, int immb, int opcode, int rn, int rd)
+{
+ int size = 32 - clz32(immh) - 1;
+ int immhb = immh << 3 | immb;
+ int shift = 2 * (8 << size) - immhb;
+ bool accumulate = false;
+ bool round = false;
+ int dsize = is_q ? 128 : 64;
+ int esize = 8 << size;
+ int elements = dsize/esize;
+ TCGMemOp memop = size | (is_u ? 0 : MO_SIGN);
+ TCGv_i64 tcg_rn = new_tmp_a64(s);
+ TCGv_i64 tcg_rd = new_tmp_a64(s);
+ TCGv_i64 tcg_round;
+ int i;
+
+ if (extract32(immh, 3, 1) && !is_q) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ if (size > 3 && !is_q) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ switch (opcode) {
+ case 0x02: /* SSRA / USRA (accumulate) */
+ accumulate = true;
+ break;
+ case 0x04: /* SRSHR / URSHR (rounding) */
+ round = true;
+ break;
+ case 0x06: /* SRSRA / URSRA (accum + rounding) */
+ accumulate = round = true;
+ break;
+ }
+
+ if (round) {
+ uint64_t round_const = 1ULL << (shift - 1);
+ tcg_round = tcg_const_i64(round_const);
+ } else {
+ TCGV_UNUSED_I64(tcg_round);
+ }
+
+ for (i = 0; i < elements; i++) {
+ read_vec_element(s, tcg_rn, rn, i, memop);
+ if (accumulate) {
+ read_vec_element(s, tcg_rd, rd, i, memop);
+ }
+
+ handle_shri_with_rndacc(tcg_rd, tcg_rn, tcg_round,
+ accumulate, is_u, size, shift);
+
+ write_vec_element(s, tcg_rd, rd, i, size);
+ }
+
+ if (!is_q) {
+ clear_vec_high(s, rd);
+ }
+
+ if (round) {
+ tcg_temp_free_i64(tcg_round);
+ }
+}
+
+/* SHL/SLI - Vector shift left */
+static void handle_vec_simd_shli(DisasContext *s, bool is_q, bool insert,
+ int immh, int immb, int opcode, int rn, int rd)
+{
+ int size = 32 - clz32(immh) - 1;
+ int immhb = immh << 3 | immb;
+ int shift = immhb - (8 << size);
+ int dsize = is_q ? 128 : 64;
+ int esize = 8 << size;
+ int elements = dsize/esize;
+ TCGv_i64 tcg_rn = new_tmp_a64(s);
+ TCGv_i64 tcg_rd = new_tmp_a64(s);
+ int i;
+
+ if (extract32(immh, 3, 1) && !is_q) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ if (size > 3 && !is_q) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ for (i = 0; i < elements; i++) {
+ read_vec_element(s, tcg_rn, rn, i, size);
+ if (insert) {
+ read_vec_element(s, tcg_rd, rd, i, size);
+ }
+
+ handle_shli_with_ins(tcg_rd, tcg_rn, insert, shift);
+
+ write_vec_element(s, tcg_rd, rd, i, size);
+ }
+
+ if (!is_q) {
+ clear_vec_high(s, rd);
+ }
+
+ return;
+}
+
+/* USHLL/SHLL - Vector shift left with widening */
+static void handle_vec_simd_wshli(DisasContext *s, bool is_q, bool is_u,
+ int immh, int immb, int opcode, int rn, int rd)
+{
+ int size = 32 - clz32(immh) - 1;
+ int immhb = immh << 3 | immb;
+ int shift = immhb - (8 << size);
+ int dsize = 64;
+ int esize = 8 << size;
+ int elements = dsize/esize;
+ TCGv_i64 tcg_rn = new_tmp_a64(s);
+ TCGv_i64 tcg_rd = new_tmp_a64(s);
+ int i;
+
+ if (size >= 3) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ /* For the LL variants the store is larger than the load,
+ * so if rd == rn we would overwrite parts of our input.
+ * So load everything right now and use shifts in the main loop.
+ */
+ read_vec_element(s, tcg_rn, rn, is_q ? 1 : 0, MO_64);
+
+ for (i = 0; i < elements; i++) {
+ tcg_gen_shri_i64(tcg_rd, tcg_rn, i * esize);
+ ext_and_shift_reg(tcg_rd, tcg_rd, size | (!is_u << 2), 0);
+ tcg_gen_shli_i64(tcg_rd, tcg_rd, shift);
+ write_vec_element(s, tcg_rd, rd, i, size + 1);
+ }
+}
+
+
/* C3.6.14 AdvSIMD shift by immediate
* 31 30 29 28 23 22 19 18 16 15 11 10 9 5 4 0
* +---+---+---+-------------+------+------+--------+---+------+------+
@@ -5853,7 +6202,35 @@ static void disas_simd_scalar_indexed(DisasContext *s, uint32_t insn)
*/
static void disas_simd_shift_imm(DisasContext *s, uint32_t insn)
{
- unsupported_encoding(s, insn);
+ int rd = extract32(insn, 0, 5);
+ int rn = extract32(insn, 5, 5);
+ int opcode = extract32(insn, 11, 5);
+ int immb = extract32(insn, 16, 3);
+ int immh = extract32(insn, 19, 4);
+ bool is_u = extract32(insn, 29, 1);
+ bool is_q = extract32(insn, 30, 1);
+
+ switch (opcode) {
+ case 0x00: /* SSHR / USHR */
+ case 0x02: /* SSRA / USRA (accumulate) */
+ case 0x04: /* SRSHR / URSHR (rounding) */
+ case 0x06: /* SRSRA / URSRA (accum + rounding) */
+ handle_vec_simd_shri(s, is_q, is_u, immh, immb, opcode, rn, rd);
+ break;
+ case 0x0a: /* SHL / SLI */
+ handle_vec_simd_shli(s, is_q, is_u, immh, immb, opcode, rn, rd);
+ break;
+ case 0x14: /* SSHLL / USHLL */
+ handle_vec_simd_wshli(s, is_q, is_u, immh, immb, opcode, rn, rd);
+ break;
+ default:
+ /* We don't currently implement any of the narrowing or saturating shifts;
+ * nor do we implement the fixed-point conversions in this
+ * encoding group (SCVTF, FCVTZS, UCVTF, FCVTZU).
+ */
+ unsupported_encoding(s, insn);
+ return;
+ }
}
static void handle_3rd_widening(DisasContext *s, int is_q, int is_u, int size,
--
1.8.5
* [Qemu-devel] [PATCH v2 8/8] target-arm: A64: Add SIMD shift by immediate
2014-01-23 15:28 ` [Qemu-devel] [PATCH v2 8/8] target-arm: A64: Add SIMD shift by immediate Peter Maydell
@ 2014-01-23 20:08 ` C Fontana
2014-01-23 20:11 ` Peter Maydell
0 siblings, 1 reply; 16+ messages in thread
From: C Fontana @ 2014-01-23 20:08 UTC (permalink / raw)
To: Peter Maydell; +Cc: Alex Bennée, qemu-devel@nongnu.org
Hi Peter, just two nits; answering from the tablet, so apologies if this
arrives with strange formatting.
On Thursday, January 23, 2014, Peter Maydell <peter.maydell@linaro.org> wrote:
> From: Alex Bennée <alex.bennee@linaro.org>
>
> This implements a subset of the AdvSIMD shift operations (namely all the
> non-saturating, non-narrowing ones). The shift generation code itself is
> common to the scalar and vector cases, but is wrapped with either vector
> element iteration or FP register access.
>
> The rounding operations need special care: adding the rounding constant
> can carry into the high bits, so the intermediate result must be kept
> wider than 64 bits rather than truncated.
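As a minimal sketch of that carry issue (assumed values, not from the
patch; requires a compiler with __int128): consider URSHR #1 on an
all-ones 64-bit element.

    /* A 64-bit add would wrap: 0xffffffffffffffff + 1 == 0, result 0. */
    /* With a 128-bit intermediate the carry survives:                 */
    unsigned __int128 v = (unsigned __int128)0xffffffffffffffffULL + 1;
    uint64_t res = (uint64_t)(v >> 1);   /* 0x8000000000000000 */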
>
> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
> Reviewed-by: Richard Henderson <rth@twiddle.net>
> ---
> target-arm/translate-a64.c | 381 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
> 1 file changed, 379 insertions(+), 2 deletions(-)
>
> diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
> index 5eabf24..9eb91fc4 100644
> --- a/target-arm/translate-a64.c
> +++ b/target-arm/translate-a64.c
> @@ -5531,15 +5531,220 @@ static void disas_simd_scalar_pairwise(DisasContext *s, uint32_t insn)
> unsupported_encoding(s, insn);
> }
>
> +/*
> + * Common SSHR[RA]/USHR[RA] - Shift right (optional rounding/accumulate)
> + *
> + * This handles the common shift generation and is used by both
> + * the vector and scalar code.
> + */
> +static void handle_shri_with_rndacc(TCGv_i64 tcg_res, TCGv_i64 tcg_src,
> + TCGv_i64 tcg_rnd, bool accumulate,
> + bool is_u, int size, int shift)
> +{
> + bool extended_result = false;
> + bool round = !TCGV_IS_UNUSED_I64(tcg_rnd);
> + int ext_lshift = 0;
> + TCGv_i64 tcg_src_hi;
> +
> + if (round && size == 3) {
> + extended_result = true;
> + ext_lshift = 64 - shift;
> + tcg_src_hi = tcg_temp_new_i64();
> + } else if (shift == 64) {
> + if (!accumulate && is_u) {
> + /* result is zero */
> + tcg_gen_movi_i64(tcg_res, 0);
> + return;
> + }
> + }
> +
> + /* Deal with the rounding step */
> + if (round) {
> + if (extended_result) {
> + TCGv_i64 tcg_zero = tcg_const_i64(0);
> + if (!is_u) {
> + /* take care of sign extending tcg_res */
> + tcg_gen_sari_i64(tcg_src_hi, tcg_src, 63);
> + tcg_gen_add2_i64(tcg_src, tcg_src_hi,
> + tcg_src, tcg_src_hi,
> + tcg_rnd, tcg_zero);
> + } else {
> + tcg_gen_add2_i64(tcg_src, tcg_src_hi,
> + tcg_src, tcg_zero,
> + tcg_rnd, tcg_zero);
> + }
> + tcg_temp_free_i64(tcg_zero);
> + } else {
> + tcg_gen_add_i64(tcg_src, tcg_src, tcg_rnd);
> + }
> + }
> +
> + /* Now do the shift right */
> + if (round && extended_result) {
> + /* extended case, >64 bit precision required */
> + if (ext_lshift == 0) {
> + /* special case, only high bits matter */
> + tcg_gen_mov_i64(tcg_src, tcg_src_hi);
> + } else {
> + tcg_gen_shri_i64(tcg_src, tcg_src, shift);
> + tcg_gen_shli_i64(tcg_src_hi, tcg_src_hi, ext_lshift);
> + tcg_gen_or_i64(tcg_src, tcg_src, tcg_src_hi);
> + }
> + } else {
> + if (is_u) {
> + if (shift == 64) {
> + /* essentially shifting in 64 zeros */
> + tcg_gen_movi_i64(tcg_src, 0);
> + } else {
> + tcg_gen_shri_i64(tcg_src, tcg_src, shift);
> + }
> + } else {
> + if (shift == 64) {
> + /* effectively extending the sign-bit */
> + tcg_gen_sari_i64(tcg_src, tcg_src, 63);
> + } else {
> + tcg_gen_sari_i64(tcg_src, tcg_src, shift);
> + }
> + }
> + }
> +
> + if (accumulate) {
> + tcg_gen_add_i64(tcg_res, tcg_res, tcg_src);
> + } else {
> + tcg_gen_mov_i64(tcg_res, tcg_src);
> + }
> +
> + if (extended_result) {
> + tcg_temp_free(tcg_src_hi);
Should this be tcg_temp_free_i64()?
> + }
> +}
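As a cross-check, a plain-C reference model for one element (a sketch
assuming a compiler with __int128; the helper name is illustrative,
not from the patch):

    static uint64_t shr_rndacc_ref(uint64_t acc, uint64_t src, int shift,
                                   bool is_u, bool round, bool accumulate)
    {
        /* widen first so the rounding add cannot truncate */
        __int128 v = is_u ? (__int128)src : (__int128)(int64_t)src;
        if (round) {
            v += (__int128)1 << (shift - 1);
        }
        v >>= shift;                 /* arithmetic shift preserves sign */
        return accumulate ? acc + (uint64_t)v : (uint64_t)v;
    }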
> +
> +/* Common SHL/SLI - Shift left with an optional insert */
> +static void handle_shli_with_ins(TCGv_i64 tcg_res, TCGv_i64 tcg_src,
> + bool insert, int shift)
> +{
> + if (insert) { /* SLI */
> + tcg_gen_deposit_i64(tcg_res, tcg_res, tcg_src, shift, 64 - shift);
> + } else { /* SHL */
> + tcg_gen_shli_i64(tcg_res, tcg_src, shift);
> + }
> +}
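The deposit keeps the low 'shift' bits of the destination, which is
exactly the SLI insert semantics; an equivalent plain-C sketch
(illustrative name; shift is 0..63 here):

    static uint64_t sli_ref(uint64_t dst, uint64_t src, unsigned shift)
    {
        uint64_t low_mask = (1ULL << shift) - 1;  /* bits below the insert */
        return (dst & low_mask) | (src << shift);
    }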
> +
> +/* SSHR[RA]/USHR[RA] - Scalar shift right (optional rounding/accumulate) */
> +static void handle_scalar_simd_shri(DisasContext *s,
> + bool is_u, int immh, int immb,
> + int opcode, int rn, int rd)
> +{
> + const int size = 3;
> + int immhb = immh << 3 | immb;
> + int shift = 2 * (8 << size) - immhb;
> + bool accumulate = false;
> + bool round = false;
> + TCGv_i64 tcg_rn;
> + TCGv_i64 tcg_rd;
> + TCGv_i64 tcg_round;
> +
> + if (!extract32(immh, 3, 1)) {
> + unallocated_encoding(s);
> + return;
> + }
> +
> + switch (opcode) {
> + case 0x02: /* SSRA / USRA (accumulate) */
> + accumulate = true;
> + break;
> + case 0x04: /* SRSHR / URSHR (rounding) */
> + round = true;
> + break;
> + case 0x06: /* SRSRA / URSRA (accum + rounding) */
> + accumulate = round = true;
> + break;
> + }
> +
> + if (round) {
> + uint64_t round_const = 1ULL << (shift - 1);
> + tcg_round = tcg_const_i64(round_const);
> + } else {
> + TCGV_UNUSED_I64(tcg_round);
> + }
> +
> + tcg_rn = read_fp_dreg(s, rn);
> + tcg_rd = accumulate ? read_fp_dreg(s, rd) : tcg_temp_new_i64();
> +
> + handle_shri_with_rndacc(tcg_rd, tcg_rn, tcg_round,
> + accumulate, is_u, size, shift);
> +
> + write_fp_dreg(s, rd, tcg_rd);
> +
> + tcg_temp_free_i64(tcg_rn);
> + tcg_temp_free_i64(tcg_rd);
> + if (round) {
> + tcg_temp_free_i64(tcg_round);
> + }
> +}
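Decode note: with size fixed at 3, the shift amount works out as
128 - immhb; e.g. (assumed encoding values, for illustration only):

    int immhb = (0xf << 3) | 0x7;      /* immh=0b1111, immb=0b111 -> 127 */
    int shift = 2 * (8 << 3) - immhb;  /* 128 - 127 = 1 */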
> +
> +/* SHL/SLI - Scalar shift left */
> +static void handle_scalar_simd_shli(DisasContext *s, bool insert,
> + int immh, int immb, int opcode,
> + int rn, int rd)
> +{
> + int size = 32 - clz32(immh) - 1;
> + int immhb = immh << 3 | immb;
> + int shift = immhb - (8 << size);
> + TCGv_i64 tcg_rn = new_tmp_a64(s);
> + TCGv_i64 tcg_rd = new_tmp_a64(s);
> +
> + if (!extract32(immh, 3, 1)) {
> + unallocated_encoding(s);
> + return;
> + }
> +
> + tcg_rn = read_fp_dreg(s, rn);
> + tcg_rd = insert ? read_fp_dreg(s, rd) : tcg_temp_new_i64();
> +
> + handle_shli_with_ins(tcg_rd, tcg_rn, insert, shift);
> +
> + write_fp_dreg(s, rd, tcg_rd);
> +
> + tcg_temp_free_i64(tcg_rn);
> + tcg_temp_free_i64(tcg_rd);
> +
> + return;
No harm, but maybe remove the return?
> +}
> +
> /* C3.6.9 AdvSIMD scalar shift by immediate
> * 31 30 29 28 23 22 19 18 16 15 11 10 9 5 4 0
> * +-----+---+-------------+------+------+--------+---+------+------+
> * | 0 1 | U | 1 1 1 1 1 0 | immh | immb | opcode | 1 | Rn | Rd |
> * +-----+---+-------------+------+------+--------+---+------+------+
> + *
> + * This is the scalar version, so it works on fixed-size registers
> + */
> static void disas_simd_scalar_shift_imm(DisasContext *s, uint32_t insn)
> {
> - unsupported_encoding(s, insn);
> + int rd = extract32(insn, 0, 5);
> + int rn = extract32(insn, 5, 5);
> + int opcode = extract32(insn, 11, 5);
> + int immb = extract32(insn, 16, 3);
> + int immh = extract32(insn, 19, 4);
> + bool is_u = extract32(insn, 29, 1);
> +
> + switch (opcode) {
> + case 0x00: /* SSHR / USHR */
> + case 0x02: /* SSRA / USRA */
> + case 0x04: /* SRSHR / URSHR */
> + case 0x06: /* SRSRA / URSRA */
> + handle_scalar_simd_shri(s, is_u, immh, immb, opcode, rn, rd);
> + break;
> + case 0x0a: /* SHL / SLI */
> + handle_scalar_simd_shli(s, is_u, immh, immb, opcode, rn, rd);
> + break;
> + default:
> + unsupported_encoding(s, insn);
> + break;
> + }
> +
> + return;
Also here.
> }
>
> /* C3.6.10 AdvSIMD scalar three different
> @@ -5845,6 +6050,150 @@ static void disas_simd_scalar_indexed(DisasContext *s, uint32_t insn)
> unsupported_encoding(s, insn);
> }
>
> +/* SSHR[RA]/USHR[RA] - Vector shift right (optional rounding/accumulate) */
> +static void handle_vec_simd_shri(DisasContext *s, bool is_q, bool is_u,
> + int immh, int immb, int opcode, int rn, int rd)
> +{
> + int size = 32 - clz32(immh) - 1;
> + int immhb = immh << 3 | immb;
> + int shift = 2 * (8 << size) - immhb;
> + bool accumulate = false;
> + bool round = false;
> + int dsize = is_q ? 128 : 64;
> + int esize = 8 << size;
> + int elements = dsize/esize;
> + TCGMemOp memop = size | (is_u ? 0 : MO_SIGN);
> + TCGv_i64 tcg_rn = new_tmp_a64(s);
> + TCGv_i64 tcg_rd = new_tmp_a64(s);
> + TCGv_i64 tcg_round;
> + int i;
> +
> + if (extract32(immh, 3, 1) && !is_q) {
> + unallocated_encoding(s);
> + return;
> + }
> +
> + if (size > 3 && !is_q) {
> + unallocated_encoding(s);
> + return;
> + }
> +
> + switch (opcode) {
> + case 0x02: /* SSRA / USRA (accumulate) */
> + accumulate = true;
> + break;
> + case 0x04: /* SRSHR / URSHR (rounding) */
> + round = true;
> + break;
> + case 0x06: /* SRSRA / URSRA (accum + rounding) */
> + accumulate = round = true;
> + break;
> + }
> +
> + if (round) {
> + uint64_t round_const = 1ULL << (shift - 1);
> + tcg_round = tcg_const_i64(round_const);
> + } else {
> + TCGV_UNUSED_I64(tcg_round);
> + }
> +
> + for (i = 0; i < elements; i++) {
> + read_vec_element(s, tcg_rn, rn, i, memop);
> + if (accumulate) {
> + read_vec_element(s, tcg_rd, rd, i, memop);
> + }
> +
> + handle_shri_with_rndacc(tcg_rd, tcg_rn, tcg_round,
> + accumulate, is_u, size, shift);
> +
> + write_vec_element(s, tcg_rd, rd, i, size);
> + }
> +
> + if (!is_q) {
> + clear_vec_high(s, rd);
> + }
> +
> + if (round) {
> + tcg_temp_free_i64(tcg_round);
> + }
> +}
> +
> +/* SHL/SLI - Vector shift left */
> +static void handle_vec_simd_shli(DisasContext *s, bool is_q, bool insert,
> + int immh, int immb, int opcode, int rn, int rd)
> +{
> + int size = 32 - clz32(immh) - 1;
> + int immhb = immh << 3 | immb;
> + int shift = immhb - (8 << size);
> + int dsize = is_q ? 128 : 64;
> + int esize = 8 << size;
> + int elements = dsize/esize;
> + TCGv_i64 tcg_rn = new_tmp_a64(s);
> + TCGv_i64 tcg_rd = new_tmp_a64(s);
> + int i;
> +
> + if (extract32(immh, 3, 1) && !is_q) {
> + unallocated_encoding(s);
> + return;
> + }
> +
> + if (size > 3 && !is_q) {
> + unallocated_encoding(s);
> + return;
> + }
> +
> + for (i = 0; i < elements; i++) {
> + read_vec_element(s, tcg_rn, rn, i, size);
> + if (insert) {
> + read_vec_element(s, tcg_rd, rd, i, size);
> + }
> +
> + handle_shli_with_ins(tcg_rd, tcg_rn, insert, shift);
> +
> + write_vec_element(s, tcg_rd, rd, i, size);
> + }
> +
> + if (!is_q) {
> + clear_vec_high(s, rd);
> + }
> +
> + return;
Also here.
Ciao
Claudio
> +}
> +
> +/* SSHLL/USHLL - Vector shift left with widening */
> +static void handle_vec_simd_wshli(DisasContext *s, bool is_q, bool is_u,
> + int immh, int immb, int opcode, int rn, int rd)
> +{
> + int size = 32 - clz32(immh) - 1;
> + int immhb = immh << 3 | immb;
> + int shift = immhb - (8 << size);
> + int dsize = 64;
> + int esize = 8 << size;
> + int elements = dsize/esize;
> + TCGv_i64 tcg_rn = new_tmp_a64(s);
> + TCGv_i64 tcg_rd = new_tmp_a64(s);
> + int i;
> +
> + if (size >= 3) {
> + unallocated_encoding(s);
> + return;
> + }
> +
> + /* For the LL variants the store is larger than the load,
> + * so if rd == rn we would overwrite parts of our input.
> + * So load everything right now and use shifts in the main loop.
> + */
> + read_vec_element(s, tcg_rn, rn, is_q ? 1 : 0, MO_64);
> +
> + for (i = 0; i < elements; i++) {
> + tcg_gen_shri_i64(tcg_rd, tcg_rn, i * esize);
> + ext_and_shift_reg(tcg_rd, tcg_rd, size | (!is_u << 2), 0);
> + tcg_gen_shli_i64(tcg_rd, tcg_rd, shift);
> + write_vec_element(s, tcg_rd, rd, i, size + 1);
> + }
> +}
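For one widened element this computes, in the signed case (a sketch
with assumed example operands, e.g. SSHLL v0.8h, v1.8b, #3):

    static int16_t sshll_ref(int8_t b, int shift)
    {
        /* widen the signed byte to 16 bits, then shift left */
        return (int16_t)((int)b << shift);
    }

Reading rn once up front (rather than per element) is what makes the
rd == rn case safe, as the comment above notes.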
> +
> +
> /* C3.6.14 --
> 1.8.5
>
>
* Re: [Qemu-devel] [PATCH v2 8/8] target-arm: A64: Add SIMD shift by immediate
2014-01-23 20:08 ` C Fontana
@ 2014-01-23 20:11 ` Peter Maydell
0 siblings, 0 replies; 16+ messages in thread
From: Peter Maydell @ 2014-01-23 20:11 UTC (permalink / raw)
To: C Fontana; +Cc: Alex Bennée, qemu-devel@nongnu.org
On 23 January 2014 20:08, C Fontana <claudio.fontana@linaro.org> wrote:
>> + if (extended_result) {
>> + tcg_temp_free(tcg_src_hi);
>
> Should this be tcg_temp_free_i64()?
Yes, though it doesn't actually make a difference here,
since we know we're building a 64-bit target QEMU.
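(Roughly speaking, and as an assumed simplification of the tcg.h
definitions rather than a verbatim quote:

    #if TARGET_LONG_BITS == 64
    #define TCGv            TCGv_i64
    #define tcg_temp_free   tcg_temp_free_i64
    #endif

so the generic call resolves to the _i64 variant on such builds.)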
>> + return;
>
> No harm, but maybe remove the return?
Agreed, and again on the second one.
thanks
-- PMM