* [PATCH v3 01/29] target/arm: Replace tcg_gen_dupi_vec with constants in gengvec.c
From: Richard Henderson @ 2024-09-12 2:40 UTC (permalink / raw)
To: qemu-devel; +Cc: qemu-arm, Peter Maydell
Instead of copying a constant into a temporary with dupi,
use a vector constant directly.
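As a minimal illustration of the pattern (distilled from the
gen_srshr_vec hunk below; this is only a sketch, assumes the usual
QEMU TCG declarations, and is not part of the patch):

    /* Before: allocate a temporary and duplicate the immediate into it. */
    TCGv_vec ones = tcg_temp_new_vec_matching(d);
    tcg_gen_dupi_vec(vece, ones, 1);
    tcg_gen_and_vec(vece, t, t, ones);

    /* After: request a matching constant vector directly; no temporary
     * is allocated and the constant value is interned by TCG. */
    TCGv_vec ones = tcg_constant_vec_matching(d, vece, 1);
    tcg_gen_and_vec(vece, t, t, ones);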
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
target/arm/tcg/gengvec.c | 43 ++++++++++++++++++----------------------
1 file changed, 19 insertions(+), 24 deletions(-)
diff --git a/target/arm/tcg/gengvec.c b/target/arm/tcg/gengvec.c
index 56a1dc1f75..726a1383ae 100644
--- a/target/arm/tcg/gengvec.c
+++ b/target/arm/tcg/gengvec.c
@@ -297,10 +297,9 @@ void gen_srshr32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh)
static void gen_srshr_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
{
TCGv_vec t = tcg_temp_new_vec_matching(d);
- TCGv_vec ones = tcg_temp_new_vec_matching(d);
+ TCGv_vec ones = tcg_constant_vec_matching(d, vece, 1);
tcg_gen_shri_vec(vece, t, a, sh - 1);
- tcg_gen_dupi_vec(vece, ones, 1);
tcg_gen_and_vec(vece, t, t, ones);
tcg_gen_sari_vec(vece, d, a, sh);
tcg_gen_add_vec(vece, d, d, t);
@@ -492,10 +491,9 @@ void gen_urshr64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
static void gen_urshr_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t shift)
{
TCGv_vec t = tcg_temp_new_vec_matching(d);
- TCGv_vec ones = tcg_temp_new_vec_matching(d);
+ TCGv_vec ones = tcg_constant_vec_matching(d, vece, 1);
tcg_gen_shri_vec(vece, t, a, shift - 1);
- tcg_gen_dupi_vec(vece, ones, 1);
tcg_gen_and_vec(vece, t, t, ones);
tcg_gen_shri_vec(vece, d, a, shift);
tcg_gen_add_vec(vece, d, d, t);
@@ -685,9 +683,9 @@ static void gen_shr64_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
static void gen_shr_ins_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
{
TCGv_vec t = tcg_temp_new_vec_matching(d);
- TCGv_vec m = tcg_temp_new_vec_matching(d);
+ int64_t mi = MAKE_64BIT_MASK((8 << vece) - sh, sh);
+ TCGv_vec m = tcg_constant_vec_matching(d, vece, mi);
- tcg_gen_dupi_vec(vece, m, MAKE_64BIT_MASK((8 << vece) - sh, sh));
tcg_gen_shri_vec(vece, t, a, sh);
tcg_gen_and_vec(vece, d, d, m);
tcg_gen_or_vec(vece, d, d, t);
@@ -773,10 +771,9 @@ static void gen_shl64_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
static void gen_shl_ins_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
{
TCGv_vec t = tcg_temp_new_vec_matching(d);
- TCGv_vec m = tcg_temp_new_vec_matching(d);
+ TCGv_vec m = tcg_constant_vec_matching(d, vece, MAKE_64BIT_MASK(0, sh));
tcg_gen_shli_vec(vece, t, a, sh);
- tcg_gen_dupi_vec(vece, m, MAKE_64BIT_MASK(0, sh));
tcg_gen_and_vec(vece, d, d, m);
tcg_gen_or_vec(vece, d, d, t);
}
@@ -1044,14 +1041,13 @@ static void gen_ushl_vec(unsigned vece, TCGv_vec dst,
TCGv_vec rval = tcg_temp_new_vec_matching(dst);
TCGv_vec lsh = tcg_temp_new_vec_matching(dst);
TCGv_vec rsh = tcg_temp_new_vec_matching(dst);
- TCGv_vec msk, max;
+ TCGv_vec max;
tcg_gen_neg_vec(vece, rsh, shift);
if (vece == MO_8) {
tcg_gen_mov_vec(lsh, shift);
} else {
- msk = tcg_temp_new_vec_matching(dst);
- tcg_gen_dupi_vec(vece, msk, 0xff);
+ TCGv_vec msk = tcg_constant_vec_matching(dst, vece, 0xff);
tcg_gen_and_vec(vece, lsh, shift, msk);
tcg_gen_and_vec(vece, rsh, rsh, msk);
}
@@ -1064,9 +1060,6 @@ static void gen_ushl_vec(unsigned vece, TCGv_vec dst,
tcg_gen_shlv_vec(vece, lval, src, lsh);
tcg_gen_shrv_vec(vece, rval, src, rsh);
- max = tcg_temp_new_vec_matching(dst);
- tcg_gen_dupi_vec(vece, max, 8 << vece);
-
/*
* The choice of LT (signed) and GEU (unsigned) are biased toward
* the instructions of the x86_64 host. For MO_8, the whole byte
@@ -1074,6 +1067,7 @@ static void gen_ushl_vec(unsigned vece, TCGv_vec dst,
* have already masked to a byte and so a signed compare works.
* Other tcg hosts have a full set of comparisons and do not care.
*/
+ max = tcg_constant_vec_matching(dst, vece, 8 << vece);
if (vece == MO_8) {
tcg_gen_cmp_vec(TCG_COND_GEU, vece, lsh, lsh, max);
tcg_gen_cmp_vec(TCG_COND_GEU, vece, rsh, rsh, max);
@@ -1170,6 +1164,7 @@ static void gen_sshl_vec(unsigned vece, TCGv_vec dst,
TCGv_vec lsh = tcg_temp_new_vec_matching(dst);
TCGv_vec rsh = tcg_temp_new_vec_matching(dst);
TCGv_vec tmp = tcg_temp_new_vec_matching(dst);
+ TCGv_vec max, zero;
/*
* Rely on the TCG guarantee that out of range shifts produce
@@ -1180,15 +1175,15 @@ static void gen_sshl_vec(unsigned vece, TCGv_vec dst,
if (vece == MO_8) {
tcg_gen_mov_vec(lsh, shift);
} else {
- tcg_gen_dupi_vec(vece, tmp, 0xff);
- tcg_gen_and_vec(vece, lsh, shift, tmp);
- tcg_gen_and_vec(vece, rsh, rsh, tmp);
+ TCGv_vec msk = tcg_constant_vec_matching(dst, vece, 0xff);
+ tcg_gen_and_vec(vece, lsh, shift, msk);
+ tcg_gen_and_vec(vece, rsh, rsh, msk);
}
/* Bound rsh so out of bound right shift gets -1. */
- tcg_gen_dupi_vec(vece, tmp, (8 << vece) - 1);
- tcg_gen_umin_vec(vece, rsh, rsh, tmp);
- tcg_gen_cmp_vec(TCG_COND_GT, vece, tmp, lsh, tmp);
+ max = tcg_constant_vec_matching(dst, vece, (8 << vece) - 1);
+ tcg_gen_umin_vec(vece, rsh, rsh, max);
+ tcg_gen_cmp_vec(TCG_COND_GT, vece, tmp, lsh, max);
tcg_gen_shlv_vec(vece, lval, src, lsh);
tcg_gen_sarv_vec(vece, rval, src, rsh);
@@ -1197,12 +1192,12 @@ static void gen_sshl_vec(unsigned vece, TCGv_vec dst,
tcg_gen_andc_vec(vece, lval, lval, tmp);
/* Select between left and right shift. */
+ zero = tcg_constant_vec_matching(dst, vece, 0);
if (vece == MO_8) {
- tcg_gen_dupi_vec(vece, tmp, 0);
- tcg_gen_cmpsel_vec(TCG_COND_LT, vece, dst, lsh, tmp, rval, lval);
+ tcg_gen_cmpsel_vec(TCG_COND_LT, vece, dst, lsh, zero, rval, lval);
} else {
- tcg_gen_dupi_vec(vece, tmp, 0x80);
- tcg_gen_cmpsel_vec(TCG_COND_LT, vece, dst, lsh, tmp, lval, rval);
+ TCGv_vec sgn = tcg_constant_vec_matching(dst, vece, 0x80);
+ tcg_gen_cmpsel_vec(TCG_COND_LT, vece, dst, lsh, sgn, lval, rval);
}
}
--
2.43.0
* [PATCH v3 02/29] target/arm: Replace tcg_gen_dupi_vec with constants in translate-sve.c
From: Richard Henderson @ 2024-09-12 2:40 UTC (permalink / raw)
To: qemu-devel; +Cc: qemu-arm, Peter Maydell
Instead of copying a constant into a temporary with dupi,
use a vector constant directly.
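One detail worth noting in this file (sketch based on the
gen_uqxtnt_vec hunk below, assuming the surrounding context): constant
vectors are read-only values, so a single one can safely feed several
operations:

    TCGv_vec maxv = tcg_constant_vec_matching(d, vece, max);

    tcg_gen_umin_vec(vece, n, n, maxv);      /* unsigned saturation */
    tcg_gen_shli_vec(vece, n, n, halfbits);
    tcg_gen_bitsel_vec(vece, d, maxv, d, n); /* reused as bitsel mask */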
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
target/arm/tcg/translate-sve.c | 128 +++++++++++++--------------------
1 file changed, 49 insertions(+), 79 deletions(-)
diff --git a/target/arm/tcg/translate-sve.c b/target/arm/tcg/translate-sve.c
index 9e2536dfe9..49d32fabc9 100644
--- a/target/arm/tcg/translate-sve.c
+++ b/target/arm/tcg/translate-sve.c
@@ -6081,9 +6081,9 @@ static void gen_sshll_vec(unsigned vece, TCGv_vec d, TCGv_vec n, int64_t imm)
if (top) {
if (shl == halfbits) {
- TCGv_vec t = tcg_temp_new_vec_matching(d);
- tcg_gen_dupi_vec(vece, t, MAKE_64BIT_MASK(halfbits, halfbits));
- tcg_gen_and_vec(vece, d, n, t);
+ tcg_gen_and_vec(vece, d, n,
+ tcg_constant_vec_matching(d, vece,
+ MAKE_64BIT_MASK(halfbits, halfbits)));
} else {
tcg_gen_sari_vec(vece, d, n, halfbits);
tcg_gen_shli_vec(vece, d, d, shl);
@@ -6138,18 +6138,18 @@ static void gen_ushll_vec(unsigned vece, TCGv_vec d, TCGv_vec n, int64_t imm)
if (top) {
if (shl == halfbits) {
- TCGv_vec t = tcg_temp_new_vec_matching(d);
- tcg_gen_dupi_vec(vece, t, MAKE_64BIT_MASK(halfbits, halfbits));
- tcg_gen_and_vec(vece, d, n, t);
+ tcg_gen_and_vec(vece, d, n,
+ tcg_constant_vec_matching(d, vece,
+ MAKE_64BIT_MASK(halfbits, halfbits)));
} else {
tcg_gen_shri_vec(vece, d, n, halfbits);
tcg_gen_shli_vec(vece, d, d, shl);
}
} else {
if (shl == 0) {
- TCGv_vec t = tcg_temp_new_vec_matching(d);
- tcg_gen_dupi_vec(vece, t, MAKE_64BIT_MASK(0, halfbits));
- tcg_gen_and_vec(vece, d, n, t);
+ tcg_gen_and_vec(vece, d, n,
+ tcg_constant_vec_matching(d, vece,
+ MAKE_64BIT_MASK(0, halfbits)));
} else {
tcg_gen_shli_vec(vece, d, n, halfbits);
tcg_gen_shri_vec(vece, d, d, halfbits - shl);
@@ -6317,18 +6317,14 @@ static const TCGOpcode sqxtn_list[] = {
static void gen_sqxtnb_vec(unsigned vece, TCGv_vec d, TCGv_vec n)
{
- TCGv_vec t = tcg_temp_new_vec_matching(d);
int halfbits = 4 << vece;
int64_t mask = (1ull << halfbits) - 1;
int64_t min = -1ull << (halfbits - 1);
int64_t max = -min - 1;
- tcg_gen_dupi_vec(vece, t, min);
- tcg_gen_smax_vec(vece, d, n, t);
- tcg_gen_dupi_vec(vece, t, max);
- tcg_gen_smin_vec(vece, d, d, t);
- tcg_gen_dupi_vec(vece, t, mask);
- tcg_gen_and_vec(vece, d, d, t);
+ tcg_gen_smax_vec(vece, d, n, tcg_constant_vec_matching(d, vece, min));
+ tcg_gen_smin_vec(vece, d, d, tcg_constant_vec_matching(d, vece, max));
+ tcg_gen_and_vec(vece, d, d, tcg_constant_vec_matching(d, vece, mask));
}
static const GVecGen2 sqxtnb_ops[3] = {
@@ -6349,19 +6345,15 @@ TRANS_FEAT(SQXTNB, aa64_sve2, do_narrow_extract, a, sqxtnb_ops)
static void gen_sqxtnt_vec(unsigned vece, TCGv_vec d, TCGv_vec n)
{
- TCGv_vec t = tcg_temp_new_vec_matching(d);
int halfbits = 4 << vece;
int64_t mask = (1ull << halfbits) - 1;
int64_t min = -1ull << (halfbits - 1);
int64_t max = -min - 1;
- tcg_gen_dupi_vec(vece, t, min);
- tcg_gen_smax_vec(vece, n, n, t);
- tcg_gen_dupi_vec(vece, t, max);
- tcg_gen_smin_vec(vece, n, n, t);
+ tcg_gen_smax_vec(vece, n, n, tcg_constant_vec_matching(d, vece, min));
+ tcg_gen_smin_vec(vece, n, n, tcg_constant_vec_matching(d, vece, max));
tcg_gen_shli_vec(vece, n, n, halfbits);
- tcg_gen_dupi_vec(vece, t, mask);
- tcg_gen_bitsel_vec(vece, d, t, d, n);
+ tcg_gen_bitsel_vec(vece, d, tcg_constant_vec_matching(d, vece, mask), d, n);
}
static const GVecGen2 sqxtnt_ops[3] = {
@@ -6389,12 +6381,10 @@ static const TCGOpcode uqxtn_list[] = {
static void gen_uqxtnb_vec(unsigned vece, TCGv_vec d, TCGv_vec n)
{
- TCGv_vec t = tcg_temp_new_vec_matching(d);
int halfbits = 4 << vece;
int64_t max = (1ull << halfbits) - 1;
- tcg_gen_dupi_vec(vece, t, max);
- tcg_gen_umin_vec(vece, d, n, t);
+ tcg_gen_umin_vec(vece, d, n, tcg_constant_vec_matching(d, vece, max));
}
static const GVecGen2 uqxtnb_ops[3] = {
@@ -6415,14 +6405,13 @@ TRANS_FEAT(UQXTNB, aa64_sve2, do_narrow_extract, a, uqxtnb_ops)
static void gen_uqxtnt_vec(unsigned vece, TCGv_vec d, TCGv_vec n)
{
- TCGv_vec t = tcg_temp_new_vec_matching(d);
int halfbits = 4 << vece;
int64_t max = (1ull << halfbits) - 1;
+ TCGv_vec maxv = tcg_constant_vec_matching(d, vece, max);
- tcg_gen_dupi_vec(vece, t, max);
- tcg_gen_umin_vec(vece, n, n, t);
+ tcg_gen_umin_vec(vece, n, n, maxv);
tcg_gen_shli_vec(vece, n, n, halfbits);
- tcg_gen_bitsel_vec(vece, d, t, d, n);
+ tcg_gen_bitsel_vec(vece, d, maxv, d, n);
}
static const GVecGen2 uqxtnt_ops[3] = {
@@ -6450,14 +6439,11 @@ static const TCGOpcode sqxtun_list[] = {
static void gen_sqxtunb_vec(unsigned vece, TCGv_vec d, TCGv_vec n)
{
- TCGv_vec t = tcg_temp_new_vec_matching(d);
int halfbits = 4 << vece;
int64_t max = (1ull << halfbits) - 1;
- tcg_gen_dupi_vec(vece, t, 0);
- tcg_gen_smax_vec(vece, d, n, t);
- tcg_gen_dupi_vec(vece, t, max);
- tcg_gen_umin_vec(vece, d, d, t);
+ tcg_gen_smax_vec(vece, d, n, tcg_constant_vec_matching(d, vece, 0));
+ tcg_gen_umin_vec(vece, d, d, tcg_constant_vec_matching(d, vece, max));
}
static const GVecGen2 sqxtunb_ops[3] = {
@@ -6478,16 +6464,14 @@ TRANS_FEAT(SQXTUNB, aa64_sve2, do_narrow_extract, a, sqxtunb_ops)
static void gen_sqxtunt_vec(unsigned vece, TCGv_vec d, TCGv_vec n)
{
- TCGv_vec t = tcg_temp_new_vec_matching(d);
int halfbits = 4 << vece;
int64_t max = (1ull << halfbits) - 1;
+ TCGv_vec maxv = tcg_constant_vec_matching(d, vece, max);
- tcg_gen_dupi_vec(vece, t, 0);
- tcg_gen_smax_vec(vece, n, n, t);
- tcg_gen_dupi_vec(vece, t, max);
- tcg_gen_umin_vec(vece, n, n, t);
+ tcg_gen_smax_vec(vece, n, n, tcg_constant_vec_matching(d, vece, 0));
+ tcg_gen_umin_vec(vece, n, n, maxv);
tcg_gen_shli_vec(vece, n, n, halfbits);
- tcg_gen_bitsel_vec(vece, d, t, d, n);
+ tcg_gen_bitsel_vec(vece, d, maxv, d, n);
}
static const GVecGen2 sqxtunt_ops[3] = {
@@ -6551,13 +6535,11 @@ static void gen_shrnb64_i64(TCGv_i64 d, TCGv_i64 n, int64_t shr)
static void gen_shrnb_vec(unsigned vece, TCGv_vec d, TCGv_vec n, int64_t shr)
{
- TCGv_vec t = tcg_temp_new_vec_matching(d);
int halfbits = 4 << vece;
uint64_t mask = MAKE_64BIT_MASK(0, halfbits);
tcg_gen_shri_vec(vece, n, n, shr);
- tcg_gen_dupi_vec(vece, t, mask);
- tcg_gen_and_vec(vece, d, n, t);
+ tcg_gen_and_vec(vece, d, n, tcg_constant_vec_matching(d, vece, mask));
}
static const TCGOpcode shrnb_vec_list[] = { INDEX_op_shri_vec, 0 };
@@ -6609,13 +6591,11 @@ static void gen_shrnt64_i64(TCGv_i64 d, TCGv_i64 n, int64_t shr)
static void gen_shrnt_vec(unsigned vece, TCGv_vec d, TCGv_vec n, int64_t shr)
{
- TCGv_vec t = tcg_temp_new_vec_matching(d);
int halfbits = 4 << vece;
uint64_t mask = MAKE_64BIT_MASK(0, halfbits);
tcg_gen_shli_vec(vece, n, n, halfbits - shr);
- tcg_gen_dupi_vec(vece, t, mask);
- tcg_gen_bitsel_vec(vece, d, t, d, n);
+ tcg_gen_bitsel_vec(vece, d, tcg_constant_vec_matching(d, vece, mask), d, n);
}
static const TCGOpcode shrnt_vec_list[] = { INDEX_op_shli_vec, 0 };
@@ -6658,14 +6638,12 @@ TRANS_FEAT(RSHRNT, aa64_sve2, do_shr_narrow, a, rshrnt_ops)
static void gen_sqshrunb_vec(unsigned vece, TCGv_vec d,
TCGv_vec n, int64_t shr)
{
- TCGv_vec t = tcg_temp_new_vec_matching(d);
int halfbits = 4 << vece;
+ uint64_t max = MAKE_64BIT_MASK(0, halfbits);
tcg_gen_sari_vec(vece, n, n, shr);
- tcg_gen_dupi_vec(vece, t, 0);
- tcg_gen_smax_vec(vece, n, n, t);
- tcg_gen_dupi_vec(vece, t, MAKE_64BIT_MASK(0, halfbits));
- tcg_gen_umin_vec(vece, d, n, t);
+ tcg_gen_smax_vec(vece, n, n, tcg_constant_vec_matching(d, vece, 0));
+ tcg_gen_umin_vec(vece, d, n, tcg_constant_vec_matching(d, vece, max));
}
static const TCGOpcode sqshrunb_vec_list[] = {
@@ -6690,16 +6668,15 @@ TRANS_FEAT(SQSHRUNB, aa64_sve2, do_shr_narrow, a, sqshrunb_ops)
static void gen_sqshrunt_vec(unsigned vece, TCGv_vec d,
TCGv_vec n, int64_t shr)
{
- TCGv_vec t = tcg_temp_new_vec_matching(d);
int halfbits = 4 << vece;
+ uint64_t max = MAKE_64BIT_MASK(0, halfbits);
+ TCGv_vec maxv = tcg_constant_vec_matching(d, vece, max);
tcg_gen_sari_vec(vece, n, n, shr);
- tcg_gen_dupi_vec(vece, t, 0);
- tcg_gen_smax_vec(vece, n, n, t);
- tcg_gen_dupi_vec(vece, t, MAKE_64BIT_MASK(0, halfbits));
- tcg_gen_umin_vec(vece, n, n, t);
+ tcg_gen_smax_vec(vece, n, n, tcg_constant_vec_matching(d, vece, 0));
+ tcg_gen_umin_vec(vece, n, n, maxv);
tcg_gen_shli_vec(vece, n, n, halfbits);
- tcg_gen_bitsel_vec(vece, d, t, d, n);
+ tcg_gen_bitsel_vec(vece, d, maxv, d, n);
}
static const TCGOpcode sqshrunt_vec_list[] = {
@@ -6742,18 +6719,15 @@ TRANS_FEAT(SQRSHRUNT, aa64_sve2, do_shr_narrow, a, sqrshrunt_ops)
static void gen_sqshrnb_vec(unsigned vece, TCGv_vec d,
TCGv_vec n, int64_t shr)
{
- TCGv_vec t = tcg_temp_new_vec_matching(d);
int halfbits = 4 << vece;
int64_t max = MAKE_64BIT_MASK(0, halfbits - 1);
int64_t min = -max - 1;
+ int64_t mask = MAKE_64BIT_MASK(0, halfbits);
tcg_gen_sari_vec(vece, n, n, shr);
- tcg_gen_dupi_vec(vece, t, min);
- tcg_gen_smax_vec(vece, n, n, t);
- tcg_gen_dupi_vec(vece, t, max);
- tcg_gen_smin_vec(vece, n, n, t);
- tcg_gen_dupi_vec(vece, t, MAKE_64BIT_MASK(0, halfbits));
- tcg_gen_and_vec(vece, d, n, t);
+ tcg_gen_smax_vec(vece, n, n, tcg_constant_vec_matching(d, vece, min));
+ tcg_gen_smin_vec(vece, n, n, tcg_constant_vec_matching(d, vece, max));
+ tcg_gen_and_vec(vece, d, n, tcg_constant_vec_matching(d, vece, mask));
}
static const TCGOpcode sqshrnb_vec_list[] = {
@@ -6778,19 +6752,16 @@ TRANS_FEAT(SQSHRNB, aa64_sve2, do_shr_narrow, a, sqshrnb_ops)
static void gen_sqshrnt_vec(unsigned vece, TCGv_vec d,
TCGv_vec n, int64_t shr)
{
- TCGv_vec t = tcg_temp_new_vec_matching(d);
int halfbits = 4 << vece;
int64_t max = MAKE_64BIT_MASK(0, halfbits - 1);
int64_t min = -max - 1;
+ int64_t mask = MAKE_64BIT_MASK(0, halfbits);
tcg_gen_sari_vec(vece, n, n, shr);
- tcg_gen_dupi_vec(vece, t, min);
- tcg_gen_smax_vec(vece, n, n, t);
- tcg_gen_dupi_vec(vece, t, max);
- tcg_gen_smin_vec(vece, n, n, t);
+ tcg_gen_smax_vec(vece, n, n, tcg_constant_vec_matching(d, vece, min));
+ tcg_gen_smin_vec(vece, n, n, tcg_constant_vec_matching(d, vece, max));
tcg_gen_shli_vec(vece, n, n, halfbits);
- tcg_gen_dupi_vec(vece, t, MAKE_64BIT_MASK(0, halfbits));
- tcg_gen_bitsel_vec(vece, d, t, d, n);
+ tcg_gen_bitsel_vec(vece, d, tcg_constant_vec_matching(d, vece, mask), d, n);
}
static const TCGOpcode sqshrnt_vec_list[] = {
@@ -6833,12 +6804,11 @@ TRANS_FEAT(SQRSHRNT, aa64_sve2, do_shr_narrow, a, sqrshrnt_ops)
static void gen_uqshrnb_vec(unsigned vece, TCGv_vec d,
TCGv_vec n, int64_t shr)
{
- TCGv_vec t = tcg_temp_new_vec_matching(d);
int halfbits = 4 << vece;
+ int64_t max = MAKE_64BIT_MASK(0, halfbits);
tcg_gen_shri_vec(vece, n, n, shr);
- tcg_gen_dupi_vec(vece, t, MAKE_64BIT_MASK(0, halfbits));
- tcg_gen_umin_vec(vece, d, n, t);
+ tcg_gen_umin_vec(vece, d, n, tcg_constant_vec_matching(d, vece, max));
}
static const TCGOpcode uqshrnb_vec_list[] = {
@@ -6863,14 +6833,14 @@ TRANS_FEAT(UQSHRNB, aa64_sve2, do_shr_narrow, a, uqshrnb_ops)
static void gen_uqshrnt_vec(unsigned vece, TCGv_vec d,
TCGv_vec n, int64_t shr)
{
- TCGv_vec t = tcg_temp_new_vec_matching(d);
int halfbits = 4 << vece;
+ int64_t max = MAKE_64BIT_MASK(0, halfbits);
+ TCGv_vec maxv = tcg_constant_vec_matching(d, vece, max);
tcg_gen_shri_vec(vece, n, n, shr);
- tcg_gen_dupi_vec(vece, t, MAKE_64BIT_MASK(0, halfbits));
- tcg_gen_umin_vec(vece, n, n, t);
+ tcg_gen_umin_vec(vece, n, n, maxv);
tcg_gen_shli_vec(vece, n, n, halfbits);
- tcg_gen_bitsel_vec(vece, d, t, d, n);
+ tcg_gen_bitsel_vec(vece, d, maxv, d, n);
}
static const TCGOpcode uqshrnt_vec_list[] = {
--
2.43.0
* [PATCH v3 03/29] target/arm: Use cmpsel in gen_ushl_vec
From: Richard Henderson @ 2024-09-12 2:40 UTC (permalink / raw)
To: qemu-devel; +Cc: qemu-arm, Peter Maydell
Instead of cmp+and or cmp+andc, use cmpsel. This will
be better for hosts that use predicate registers for cmp.
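A minimal sketch of the transformation, taken from the MO_8 path of
gen_ushl_vec below (assumes the surrounding QEMU TCG context; not part
of the patch):

    /* Before: materialise an all-ones/all-zeros mask with cmp, then
     * apply it with andc. */
    tcg_gen_cmp_vec(TCG_COND_GEU, vece, lsh, lsh, max);
    tcg_gen_andc_vec(vece, lval, lval, lsh);

    /* After: a single cmpsel computes lval = (lsh >= max ? zero : lval),
     * so the mask never needs to be materialised in a vector register. */
    tcg_gen_cmpsel_vec(TCG_COND_GEU, vece, lval, lsh, max, zero, lval);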
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
target/arm/tcg/gengvec.c | 19 ++++++++-----------
1 file changed, 8 insertions(+), 11 deletions(-)
diff --git a/target/arm/tcg/gengvec.c b/target/arm/tcg/gengvec.c
index 726a1383ae..3edbf3a262 100644
--- a/target/arm/tcg/gengvec.c
+++ b/target/arm/tcg/gengvec.c
@@ -1041,7 +1041,7 @@ static void gen_ushl_vec(unsigned vece, TCGv_vec dst,
TCGv_vec rval = tcg_temp_new_vec_matching(dst);
TCGv_vec lsh = tcg_temp_new_vec_matching(dst);
TCGv_vec rsh = tcg_temp_new_vec_matching(dst);
- TCGv_vec max;
+ TCGv_vec max, zero;
tcg_gen_neg_vec(vece, rsh, shift);
if (vece == MO_8) {
@@ -1061,23 +1061,20 @@ static void gen_ushl_vec(unsigned vece, TCGv_vec dst,
tcg_gen_shrv_vec(vece, rval, src, rsh);
/*
- * The choice of LT (signed) and GEU (unsigned) are biased toward
+ * The choice of GE (signed) and GEU (unsigned) are biased toward
* the instructions of the x86_64 host. For MO_8, the whole byte
* is significant so we must use an unsigned compare; otherwise we
* have already masked to a byte and so a signed compare works.
* Other tcg hosts have a full set of comparisons and do not care.
*/
+ zero = tcg_constant_vec_matching(dst, vece, 0);
max = tcg_constant_vec_matching(dst, vece, 8 << vece);
if (vece == MO_8) {
- tcg_gen_cmp_vec(TCG_COND_GEU, vece, lsh, lsh, max);
- tcg_gen_cmp_vec(TCG_COND_GEU, vece, rsh, rsh, max);
- tcg_gen_andc_vec(vece, lval, lval, lsh);
- tcg_gen_andc_vec(vece, rval, rval, rsh);
+ tcg_gen_cmpsel_vec(TCG_COND_GEU, vece, lval, lsh, max, zero, lval);
+ tcg_gen_cmpsel_vec(TCG_COND_GEU, vece, rval, rsh, max, zero, rval);
} else {
- tcg_gen_cmp_vec(TCG_COND_LT, vece, lsh, lsh, max);
- tcg_gen_cmp_vec(TCG_COND_LT, vece, rsh, rsh, max);
- tcg_gen_and_vec(vece, lval, lval, lsh);
- tcg_gen_and_vec(vece, rval, rval, rsh);
+ tcg_gen_cmpsel_vec(TCG_COND_GE, vece, lval, lsh, max, zero, lval);
+ tcg_gen_cmpsel_vec(TCG_COND_GE, vece, rval, rsh, max, zero, rval);
}
tcg_gen_or_vec(vece, dst, lval, rval);
}
@@ -1087,7 +1084,7 @@ void gen_gvec_ushl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
{
static const TCGOpcode vecop_list[] = {
INDEX_op_neg_vec, INDEX_op_shlv_vec,
- INDEX_op_shrv_vec, INDEX_op_cmp_vec, 0
+ INDEX_op_shrv_vec, INDEX_op_cmpsel_vec, 0
};
static const GVecGen3 ops[4] = {
{ .fniv = gen_ushl_vec,
--
2.43.0
* [PATCH v3 04/29] target/arm: Use cmpsel in gen_sshl_vec
From: Richard Henderson @ 2024-09-12 2:40 UTC (permalink / raw)
To: qemu-devel; +Cc: qemu-arm, Peter Maydell
Instead of cmp+and or cmp+andc, use cmpsel. This will
be better for hosts that use predicate registers for cmp.
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
target/arm/tcg/gengvec.c | 8 +++-----
1 file changed, 3 insertions(+), 5 deletions(-)
diff --git a/target/arm/tcg/gengvec.c b/target/arm/tcg/gengvec.c
index 3edbf3a262..c5fc1b6cfb 100644
--- a/target/arm/tcg/gengvec.c
+++ b/target/arm/tcg/gengvec.c
@@ -1160,7 +1160,6 @@ static void gen_sshl_vec(unsigned vece, TCGv_vec dst,
TCGv_vec rval = tcg_temp_new_vec_matching(dst);
TCGv_vec lsh = tcg_temp_new_vec_matching(dst);
TCGv_vec rsh = tcg_temp_new_vec_matching(dst);
- TCGv_vec tmp = tcg_temp_new_vec_matching(dst);
TCGv_vec max, zero;
/*
@@ -1180,16 +1179,15 @@ static void gen_sshl_vec(unsigned vece, TCGv_vec dst,
/* Bound rsh so out of bound right shift gets -1. */
max = tcg_constant_vec_matching(dst, vece, (8 << vece) - 1);
tcg_gen_umin_vec(vece, rsh, rsh, max);
- tcg_gen_cmp_vec(TCG_COND_GT, vece, tmp, lsh, max);
tcg_gen_shlv_vec(vece, lval, src, lsh);
tcg_gen_sarv_vec(vece, rval, src, rsh);
/* Select in-bound left shift. */
- tcg_gen_andc_vec(vece, lval, lval, tmp);
+ zero = tcg_constant_vec_matching(dst, vece, 0);
+ tcg_gen_cmpsel_vec(TCG_COND_GT, vece, lval, lsh, max, zero, lval);
/* Select between left and right shift. */
- zero = tcg_constant_vec_matching(dst, vece, 0);
if (vece == MO_8) {
tcg_gen_cmpsel_vec(TCG_COND_LT, vece, dst, lsh, zero, rval, lval);
} else {
@@ -1203,7 +1201,7 @@ void gen_gvec_sshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
{
static const TCGOpcode vecop_list[] = {
INDEX_op_neg_vec, INDEX_op_umin_vec, INDEX_op_shlv_vec,
- INDEX_op_sarv_vec, INDEX_op_cmp_vec, INDEX_op_cmpsel_vec, 0
+ INDEX_op_sarv_vec, INDEX_op_cmpsel_vec, 0
};
static const GVecGen3 ops[4] = {
{ .fniv = gen_sshl_vec,
--
2.43.0
* [PATCH v3 05/29] target/arm: Use tcg_gen_extract2_i64 for EXT
From: Richard Henderson @ 2024-09-12 2:40 UTC (permalink / raw)
To: qemu-devel; +Cc: qemu-arm, Philippe Mathieu-Daudé, Peter Maydell
The extract2 tcg op performs the same operation
as the do_ext64 function.
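For reference, the equivalence being used here is (sketch only, with
0 < pos < 64 as do_ext64 already asserted):

    /* do_ext64(s, hi, lo, pos) computed
     *     lo = (lo >> pos) | (hi << (64 - pos));
     * i.e. 64 bits taken from the 128-bit concatenation hi:lo starting
     * at bit 'pos', which is exactly what extract2 does: */
    tcg_gen_extract2_i64(lo, lo, hi, pos);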
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
target/arm/tcg/translate-a64.c | 23 +++--------------------
1 file changed, 3 insertions(+), 20 deletions(-)
diff --git a/target/arm/tcg/translate-a64.c b/target/arm/tcg/translate-a64.c
index 6d5f12e8f5..1a0b2bb33b 100644
--- a/target/arm/tcg/translate-a64.c
+++ b/target/arm/tcg/translate-a64.c
@@ -8890,23 +8890,6 @@ static void disas_data_proc_fp(DisasContext *s, uint32_t insn)
}
}
-static void do_ext64(DisasContext *s, TCGv_i64 tcg_left, TCGv_i64 tcg_right,
- int pos)
-{
- /* Extract 64 bits from the middle of two concatenated 64 bit
- * vector register slices left:right. The extracted bits start
- * at 'pos' bits into the right (least significant) side.
- * We return the result in tcg_right, and guarantee not to
- * trash tcg_left.
- */
- TCGv_i64 tcg_tmp = tcg_temp_new_i64();
- assert(pos > 0 && pos < 64);
-
- tcg_gen_shri_i64(tcg_right, tcg_right, pos);
- tcg_gen_shli_i64(tcg_tmp, tcg_left, 64 - pos);
- tcg_gen_or_i64(tcg_right, tcg_right, tcg_tmp);
-}
-
/* EXT
* 31 30 29 24 23 22 21 20 16 15 14 11 10 9 5 4 0
* +---+---+-------------+-----+---+------+---+------+---+------+------+
@@ -8944,7 +8927,7 @@ static void disas_simd_ext(DisasContext *s, uint32_t insn)
read_vec_element(s, tcg_resl, rn, 0, MO_64);
if (pos != 0) {
read_vec_element(s, tcg_resh, rm, 0, MO_64);
- do_ext64(s, tcg_resh, tcg_resl, pos);
+ tcg_gen_extract2_i64(tcg_resl, tcg_resl, tcg_resh, pos);
}
} else {
TCGv_i64 tcg_hh;
@@ -8965,10 +8948,10 @@ static void disas_simd_ext(DisasContext *s, uint32_t insn)
read_vec_element(s, tcg_resh, elt->reg, elt->elt, MO_64);
elt++;
if (pos != 0) {
- do_ext64(s, tcg_resh, tcg_resl, pos);
+ tcg_gen_extract2_i64(tcg_resl, tcg_resl, tcg_resh, pos);
tcg_hh = tcg_temp_new_i64();
read_vec_element(s, tcg_hh, elt->reg, elt->elt, MO_64);
- do_ext64(s, tcg_hh, tcg_resh, pos);
+ tcg_gen_extract2_i64(tcg_resh, tcg_resh, tcg_hh, pos);
}
}
--
2.43.0
* [PATCH v3 06/29] target/arm: Convert EXT to decodetree
From: Richard Henderson @ 2024-09-12 2:40 UTC (permalink / raw)
To: qemu-devel; +Cc: qemu-arm, Philippe Mathieu-Daudé, Peter Maydell
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
target/arm/tcg/translate-a64.c | 121 +++++++++++++--------------------
target/arm/tcg/a64.decode | 5 ++
2 files changed, 53 insertions(+), 73 deletions(-)
diff --git a/target/arm/tcg/translate-a64.c b/target/arm/tcg/translate-a64.c
index 1a0b2bb33b..48188d4116 100644
--- a/target/arm/tcg/translate-a64.c
+++ b/target/arm/tcg/translate-a64.c
@@ -6582,6 +6582,54 @@ static bool trans_FCSEL(DisasContext *s, arg_FCSEL *a)
return true;
}
+/*
+ * Advanced SIMD Extract
+ */
+
+static bool trans_EXT_d(DisasContext *s, arg_EXT_d *a)
+{
+ if (fp_access_check(s)) {
+ TCGv_i64 lo = read_fp_dreg(s, a->rn);
+ if (a->imm != 0) {
+ TCGv_i64 hi = read_fp_dreg(s, a->rm);
+ tcg_gen_extract2_i64(lo, lo, hi, a->imm * 8);
+ }
+ write_fp_dreg(s, a->rd, lo);
+ }
+ return true;
+}
+
+static bool trans_EXT_q(DisasContext *s, arg_EXT_q *a)
+{
+ TCGv_i64 lo, hi;
+ int pos = (a->imm & 7) * 8;
+ int elt = a->imm >> 3;
+
+ if (!fp_access_check(s)) {
+ return true;
+ }
+
+ lo = tcg_temp_new_i64();
+ hi = tcg_temp_new_i64();
+
+ read_vec_element(s, lo, a->rn, elt, MO_64);
+ elt++;
+ read_vec_element(s, hi, elt & 2 ? a->rm : a->rn, elt & 1, MO_64);
+ elt++;
+
+ if (pos != 0) {
+ TCGv_i64 hh = tcg_temp_new_i64();
+ tcg_gen_extract2_i64(lo, lo, hi, pos);
+ read_vec_element(s, hh, a->rm, elt & 1, MO_64);
+ tcg_gen_extract2_i64(hi, hi, hh, pos);
+ }
+
+ write_vec_element(s, lo, a->rd, 0, MO_64);
+ write_vec_element(s, hi, a->rd, 1, MO_64);
+ clear_vec_high(s, true, a->rd);
+ return true;
+}
+
/*
* Floating-point data-processing (3 source)
*/
@@ -8890,78 +8938,6 @@ static void disas_data_proc_fp(DisasContext *s, uint32_t insn)
}
}
-/* EXT
- * 31 30 29 24 23 22 21 20 16 15 14 11 10 9 5 4 0
- * +---+---+-------------+-----+---+------+---+------+---+------+------+
- * | 0 | Q | 1 0 1 1 1 0 | op2 | 0 | Rm | 0 | imm4 | 0 | Rn | Rd |
- * +---+---+-------------+-----+---+------+---+------+---+------+------+
- */
-static void disas_simd_ext(DisasContext *s, uint32_t insn)
-{
- int is_q = extract32(insn, 30, 1);
- int op2 = extract32(insn, 22, 2);
- int imm4 = extract32(insn, 11, 4);
- int rm = extract32(insn, 16, 5);
- int rn = extract32(insn, 5, 5);
- int rd = extract32(insn, 0, 5);
- int pos = imm4 << 3;
- TCGv_i64 tcg_resl, tcg_resh;
-
- if (op2 != 0 || (!is_q && extract32(imm4, 3, 1))) {
- unallocated_encoding(s);
- return;
- }
-
- if (!fp_access_check(s)) {
- return;
- }
-
- tcg_resh = tcg_temp_new_i64();
- tcg_resl = tcg_temp_new_i64();
-
- /* Vd gets bits starting at pos bits into Vm:Vn. This is
- * either extracting 128 bits from a 128:128 concatenation, or
- * extracting 64 bits from a 64:64 concatenation.
- */
- if (!is_q) {
- read_vec_element(s, tcg_resl, rn, 0, MO_64);
- if (pos != 0) {
- read_vec_element(s, tcg_resh, rm, 0, MO_64);
- tcg_gen_extract2_i64(tcg_resl, tcg_resl, tcg_resh, pos);
- }
- } else {
- TCGv_i64 tcg_hh;
- typedef struct {
- int reg;
- int elt;
- } EltPosns;
- EltPosns eltposns[] = { {rn, 0}, {rn, 1}, {rm, 0}, {rm, 1} };
- EltPosns *elt = eltposns;
-
- if (pos >= 64) {
- elt++;
- pos -= 64;
- }
-
- read_vec_element(s, tcg_resl, elt->reg, elt->elt, MO_64);
- elt++;
- read_vec_element(s, tcg_resh, elt->reg, elt->elt, MO_64);
- elt++;
- if (pos != 0) {
- tcg_gen_extract2_i64(tcg_resl, tcg_resl, tcg_resh, pos);
- tcg_hh = tcg_temp_new_i64();
- read_vec_element(s, tcg_hh, elt->reg, elt->elt, MO_64);
- tcg_gen_extract2_i64(tcg_resh, tcg_resh, tcg_hh, pos);
- }
- }
-
- write_vec_element(s, tcg_resl, rd, 0, MO_64);
- if (is_q) {
- write_vec_element(s, tcg_resh, rd, 1, MO_64);
- }
- clear_vec_high(s, is_q, rd);
-}
-
/* TBL/TBX
* 31 30 29 24 23 22 21 20 16 15 14 13 12 11 10 9 5 4 0
* +---+---+-------------+-----+---+------+---+-----+----+-----+------+------+
@@ -11860,7 +11836,6 @@ static const AArch64DecodeTable data_proc_simd[] = {
{ 0x0f000400, 0x9f800400, disas_simd_shift_imm },
{ 0x0e000000, 0xbf208c00, disas_simd_tb },
{ 0x0e000800, 0xbf208c00, disas_simd_zip_trn },
- { 0x2e000000, 0xbf208400, disas_simd_ext },
{ 0x5e200800, 0xdf3e0c00, disas_simd_scalar_two_reg_misc },
{ 0x5f000400, 0xdf800400, disas_simd_scalar_shift_imm },
{ 0x0e780800, 0x8f7e0c00, disas_simd_two_reg_misc_fp16 },
diff --git a/target/arm/tcg/a64.decode b/target/arm/tcg/a64.decode
index 62df4c4ceb..f72f95865f 100644
--- a/target/arm/tcg/a64.decode
+++ b/target/arm/tcg/a64.decode
@@ -1136,3 +1136,8 @@ FMADD 0001 1111 .. 0 ..... 0 ..... ..... ..... @rrrr_hsd
FMSUB 0001 1111 .. 0 ..... 1 ..... ..... ..... @rrrr_hsd
FNMADD 0001 1111 .. 1 ..... 0 ..... ..... ..... @rrrr_hsd
FNMSUB 0001 1111 .. 1 ..... 1 ..... ..... ..... @rrrr_hsd
+
+# Advanced SIMD Extract
+
+EXT_d 0010 1110 00 0 rm:5 00 imm:3 0 rn:5 rd:5
+EXT_q 0110 1110 00 0 rm:5 0 imm:4 0 rn:5 rd:5
--
2.43.0
* [PATCH v3 07/29] target/arm: Convert TBL, TBX to decodetree
From: Richard Henderson @ 2024-09-12 2:40 UTC (permalink / raw)
To: qemu-devel; +Cc: qemu-arm, Philippe Mathieu-Daudé, Peter Maydell
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
target/arm/tcg/translate-a64.c | 47 ++++++++++------------------------
target/arm/tcg/a64.decode | 4 +++
2 files changed, 18 insertions(+), 33 deletions(-)
diff --git a/target/arm/tcg/translate-a64.c b/target/arm/tcg/translate-a64.c
index 48188d4116..70173c67c2 100644
--- a/target/arm/tcg/translate-a64.c
+++ b/target/arm/tcg/translate-a64.c
@@ -4680,6 +4680,20 @@ static bool trans_EXTR(DisasContext *s, arg_extract *a)
return true;
}
+static bool trans_TBL_TBX(DisasContext *s, arg_TBL_TBX *a)
+{
+ if (fp_access_check(s)) {
+ int len = (a->len + 1) * 16;
+
+ tcg_gen_gvec_2_ptr(vec_full_reg_offset(s, a->rd),
+ vec_full_reg_offset(s, a->rm), tcg_env,
+ a->q ? 16 : 8, vec_full_reg_size(s),
+ (len << 6) | (a->tbx << 5) | a->rn,
+ gen_helper_simd_tblx);
+ }
+ return true;
+}
+
/*
* Cryptographic AES, SHA, SHA512
*/
@@ -8938,38 +8952,6 @@ static void disas_data_proc_fp(DisasContext *s, uint32_t insn)
}
}
-/* TBL/TBX
- * 31 30 29 24 23 22 21 20 16 15 14 13 12 11 10 9 5 4 0
- * +---+---+-------------+-----+---+------+---+-----+----+-----+------+------+
- * | 0 | Q | 0 0 1 1 1 0 | op2 | 0 | Rm | 0 | len | op | 0 0 | Rn | Rd |
- * +---+---+-------------+-----+---+------+---+-----+----+-----+------+------+
- */
-static void disas_simd_tb(DisasContext *s, uint32_t insn)
-{
- int op2 = extract32(insn, 22, 2);
- int is_q = extract32(insn, 30, 1);
- int rm = extract32(insn, 16, 5);
- int rn = extract32(insn, 5, 5);
- int rd = extract32(insn, 0, 5);
- int is_tbx = extract32(insn, 12, 1);
- int len = (extract32(insn, 13, 2) + 1) * 16;
-
- if (op2 != 0) {
- unallocated_encoding(s);
- return;
- }
-
- if (!fp_access_check(s)) {
- return;
- }
-
- tcg_gen_gvec_2_ptr(vec_full_reg_offset(s, rd),
- vec_full_reg_offset(s, rm), tcg_env,
- is_q ? 16 : 8, vec_full_reg_size(s),
- (len << 6) | (is_tbx << 5) | rn,
- gen_helper_simd_tblx);
-}
-
/* ZIP/UZP/TRN
* 31 30 29 24 23 22 21 20 16 15 14 12 11 10 9 5 4 0
* +---+---+-------------+------+---+------+---+------------------+------+
@@ -11834,7 +11816,6 @@ static const AArch64DecodeTable data_proc_simd[] = {
/* simd_mod_imm decode is a subset of simd_shift_imm, so must precede it */
{ 0x0f000400, 0x9ff80400, disas_simd_mod_imm },
{ 0x0f000400, 0x9f800400, disas_simd_shift_imm },
- { 0x0e000000, 0xbf208c00, disas_simd_tb },
{ 0x0e000800, 0xbf208c00, disas_simd_zip_trn },
{ 0x5e200800, 0xdf3e0c00, disas_simd_scalar_two_reg_misc },
{ 0x5f000400, 0xdf800400, disas_simd_scalar_shift_imm },
diff --git a/target/arm/tcg/a64.decode b/target/arm/tcg/a64.decode
index f72f95865f..e2a3ef62ef 100644
--- a/target/arm/tcg/a64.decode
+++ b/target/arm/tcg/a64.decode
@@ -1141,3 +1141,7 @@ FNMSUB 0001 1111 .. 1 ..... 1 ..... ..... ..... @rrrr_hsd
EXT_d 0010 1110 00 0 rm:5 00 imm:3 0 rn:5 rd:5
EXT_q 0110 1110 00 0 rm:5 0 imm:4 0 rn:5 rd:5
+
+# Advanced SIMD Table Lookup
+
+TBL_TBX 0 q:1 00 1110 000 rm:5 0 len:2 tbx:1 00 rn:5 rd:5
--
2.43.0
* [PATCH v3 08/29] target/arm: Convert UZP, TRN, ZIP to decodetree
From: Richard Henderson @ 2024-09-12 2:40 UTC (permalink / raw)
To: qemu-devel; +Cc: qemu-arm, Peter Maydell
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
target/arm/tcg/translate-a64.c | 158 ++++++++++++++-------------------
target/arm/tcg/a64.decode | 9 ++
2 files changed, 77 insertions(+), 90 deletions(-)
diff --git a/target/arm/tcg/translate-a64.c b/target/arm/tcg/translate-a64.c
index 70173c67c2..04160b2513 100644
--- a/target/arm/tcg/translate-a64.c
+++ b/target/arm/tcg/translate-a64.c
@@ -4694,6 +4694,74 @@ static bool trans_TBL_TBX(DisasContext *s, arg_TBL_TBX *a)
return true;
}
+typedef int simd_permute_idx_fn(int i, int part, int elements);
+
+static bool do_simd_permute(DisasContext *s, arg_qrrr_e *a,
+ simd_permute_idx_fn *fn, int part)
+{
+ MemOp esz = a->esz;
+ int datasize = a->q ? 16 : 8;
+ int elements = datasize >> esz;
+ TCGv_i64 tcg_res[2], tcg_ele;
+
+ if (esz == MO_64 && !a->q) {
+ return false;
+ }
+ if (!fp_access_check(s)) {
+ return true;
+ }
+
+ tcg_res[0] = tcg_temp_new_i64();
+ tcg_res[1] = a->q ? tcg_temp_new_i64() : NULL;
+ tcg_ele = tcg_temp_new_i64();
+
+ for (int i = 0; i < elements; i++) {
+ int o, w, idx;
+
+ idx = fn(i, part, elements);
+ read_vec_element(s, tcg_ele, (idx & elements ? a->rm : a->rn),
+ idx & (elements - 1), esz);
+
+ w = (i << (esz + 3)) / 64;
+ o = (i << (esz + 3)) % 64;
+ if (o == 0) {
+ tcg_gen_mov_i64(tcg_res[w], tcg_ele);
+ } else {
+ tcg_gen_deposit_i64(tcg_res[w], tcg_res[w], tcg_ele, o, 8 << esz);
+ }
+ }
+
+ for (int i = a->q; i >= 0; --i) {
+ write_vec_element(s, tcg_res[i], a->rd, i, MO_64);
+ }
+ clear_vec_high(s, a->q, a->rd);
+ return true;
+}
+
+static int permute_load_uzp(int i, int part, int elements)
+{
+ return 2 * i + part;
+}
+
+TRANS(UZP1, do_simd_permute, a, permute_load_uzp, 0)
+TRANS(UZP2, do_simd_permute, a, permute_load_uzp, 1)
+
+static int permute_load_trn(int i, int part, int elements)
+{
+ return (i & 1) * elements + (i & ~1) + part;
+}
+
+TRANS(TRN1, do_simd_permute, a, permute_load_trn, 0)
+TRANS(TRN2, do_simd_permute, a, permute_load_trn, 1)
+
+static int permute_load_zip(int i, int part, int elements)
+{
+ return (i & 1) * elements + ((part * elements + i) >> 1);
+}
+
+TRANS(ZIP1, do_simd_permute, a, permute_load_zip, 0)
+TRANS(ZIP2, do_simd_permute, a, permute_load_zip, 1)
+
/*
* Cryptographic AES, SHA, SHA512
*/
@@ -8952,95 +9020,6 @@ static void disas_data_proc_fp(DisasContext *s, uint32_t insn)
}
}
-/* ZIP/UZP/TRN
- * 31 30 29 24 23 22 21 20 16 15 14 12 11 10 9 5 4 0
- * +---+---+-------------+------+---+------+---+------------------+------+
- * | 0 | Q | 0 0 1 1 1 0 | size | 0 | Rm | 0 | opc | 1 0 | Rn | Rd |
- * +---+---+-------------+------+---+------+---+------------------+------+
- */
-static void disas_simd_zip_trn(DisasContext *s, uint32_t insn)
-{
- int rd = extract32(insn, 0, 5);
- int rn = extract32(insn, 5, 5);
- int rm = extract32(insn, 16, 5);
- int size = extract32(insn, 22, 2);
- /* opc field bits [1:0] indicate ZIP/UZP/TRN;
- * bit 2 indicates 1 vs 2 variant of the insn.
- */
- int opcode = extract32(insn, 12, 2);
- bool part = extract32(insn, 14, 1);
- bool is_q = extract32(insn, 30, 1);
- int esize = 8 << size;
- int i;
- int datasize = is_q ? 128 : 64;
- int elements = datasize / esize;
- TCGv_i64 tcg_res[2], tcg_ele;
-
- if (opcode == 0 || (size == 3 && !is_q)) {
- unallocated_encoding(s);
- return;
- }
-
- if (!fp_access_check(s)) {
- return;
- }
-
- tcg_res[0] = tcg_temp_new_i64();
- tcg_res[1] = is_q ? tcg_temp_new_i64() : NULL;
- tcg_ele = tcg_temp_new_i64();
-
- for (i = 0; i < elements; i++) {
- int o, w;
-
- switch (opcode) {
- case 1: /* UZP1/2 */
- {
- int midpoint = elements / 2;
- if (i < midpoint) {
- read_vec_element(s, tcg_ele, rn, 2 * i + part, size);
- } else {
- read_vec_element(s, tcg_ele, rm,
- 2 * (i - midpoint) + part, size);
- }
- break;
- }
- case 2: /* TRN1/2 */
- if (i & 1) {
- read_vec_element(s, tcg_ele, rm, (i & ~1) + part, size);
- } else {
- read_vec_element(s, tcg_ele, rn, (i & ~1) + part, size);
- }
- break;
- case 3: /* ZIP1/2 */
- {
- int base = part * elements / 2;
- if (i & 1) {
- read_vec_element(s, tcg_ele, rm, base + (i >> 1), size);
- } else {
- read_vec_element(s, tcg_ele, rn, base + (i >> 1), size);
- }
- break;
- }
- default:
- g_assert_not_reached();
- }
-
- w = (i * esize) / 64;
- o = (i * esize) % 64;
- if (o == 0) {
- tcg_gen_mov_i64(tcg_res[w], tcg_ele);
- } else {
- tcg_gen_shli_i64(tcg_ele, tcg_ele, o);
- tcg_gen_or_i64(tcg_res[w], tcg_res[w], tcg_ele);
- }
- }
-
- for (i = 0; i <= is_q; ++i) {
- write_vec_element(s, tcg_res[i], rd, i, MO_64);
- }
- clear_vec_high(s, is_q, rd);
-}
-
/*
* do_reduction_op helper
*
@@ -11816,7 +11795,6 @@ static const AArch64DecodeTable data_proc_simd[] = {
/* simd_mod_imm decode is a subset of simd_shift_imm, so must precede it */
{ 0x0f000400, 0x9ff80400, disas_simd_mod_imm },
{ 0x0f000400, 0x9f800400, disas_simd_shift_imm },
- { 0x0e000800, 0xbf208c00, disas_simd_zip_trn },
{ 0x5e200800, 0xdf3e0c00, disas_simd_scalar_two_reg_misc },
{ 0x5f000400, 0xdf800400, disas_simd_scalar_shift_imm },
{ 0x0e780800, 0x8f7e0c00, disas_simd_two_reg_misc_fp16 },
diff --git a/target/arm/tcg/a64.decode b/target/arm/tcg/a64.decode
index e2a3ef62ef..7f71c56f83 100644
--- a/target/arm/tcg/a64.decode
+++ b/target/arm/tcg/a64.decode
@@ -1145,3 +1145,12 @@ EXT_q 0110 1110 00 0 rm:5 0 imm:4 0 rn:5 rd:5
# Advanced SIMD Table Lookup
TBL_TBX 0 q:1 00 1110 000 rm:5 0 len:2 tbx:1 00 rn:5 rd:5
+
+# Advanced SIMD Permute
+
+UZP1 0.00 1110 .. 0 ..... 0 001 10 ..... ..... @qrrr_e
+UZP2 0.00 1110 .. 0 ..... 0 101 10 ..... ..... @qrrr_e
+TRN1 0.00 1110 .. 0 ..... 0 010 10 ..... ..... @qrrr_e
+TRN2 0.00 1110 .. 0 ..... 0 110 10 ..... ..... @qrrr_e
+ZIP1 0.00 1110 .. 0 ..... 0 011 10 ..... ..... @qrrr_e
+ZIP2 0.00 1110 .. 0 ..... 0 111 10 ..... ..... @qrrr_e
--
2.43.0
* [PATCH v3 09/29] target/arm: Simplify do_reduction_op
From: Richard Henderson @ 2024-09-12 2:40 UTC (permalink / raw)
To: qemu-devel; +Cc: qemu-arm, Peter Maydell
Use simple shift and add instead of ctpop, ctz, shift and mask.
Unlike SVE, there is no predicate to disable elements.
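A sketch of the change in how the active element range is tracked
(assumes the surrounding translate-a64.c context; not part of the
patch itself):

    /* Before: the range was a bitmap of elements, decoded at every
     * level of the recursion with population count and masking. */
    int shift   = ctpop8(vmap) / 2;
    int vmap_lo = (vmap >> shift) & vmap;
    int vmap_hi = (vmap & ~vmap_lo);

    /* After: the range is just (ebase, ecount); splitting it for the
     * two recursive calls is a shift and an add:
     * high half = (ebase + half, half), low half = (ebase, half). */
    int half = ecount >> 1;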
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
target/arm/tcg/translate-a64.c | 40 +++++++++++-----------------------
1 file changed, 13 insertions(+), 27 deletions(-)
diff --git a/target/arm/tcg/translate-a64.c b/target/arm/tcg/translate-a64.c
index 04160b2513..74efb35164 100644
--- a/target/arm/tcg/translate-a64.c
+++ b/target/arm/tcg/translate-a64.c
@@ -9027,34 +9027,23 @@ static void disas_data_proc_fp(DisasContext *s, uint32_t insn)
* important for correct NaN propagation that we do these
* operations in exactly the order specified by the pseudocode.
*
- * This is a recursive function, TCG temps should be freed by the
- * calling function once it is done with the values.
+ * This is a recursive function.
*/
static TCGv_i32 do_reduction_op(DisasContext *s, int fpopcode, int rn,
- int esize, int size, int vmap, TCGv_ptr fpst)
+ MemOp esz, int ebase, int ecount, TCGv_ptr fpst)
{
- if (esize == size) {
- int element;
- MemOp msize = esize == 16 ? MO_16 : MO_32;
- TCGv_i32 tcg_elem;
-
- /* We should have one register left here */
- assert(ctpop8(vmap) == 1);
- element = ctz32(vmap);
- assert(element < 8);
-
- tcg_elem = tcg_temp_new_i32();
- read_vec_element_i32(s, tcg_elem, rn, element, msize);
+ if (ecount == 1) {
+ TCGv_i32 tcg_elem = tcg_temp_new_i32();
+ read_vec_element_i32(s, tcg_elem, rn, ebase, esz);
return tcg_elem;
} else {
- int bits = size / 2;
- int shift = ctpop8(vmap) / 2;
- int vmap_lo = (vmap >> shift) & vmap;
- int vmap_hi = (vmap & ~vmap_lo);
+ int half = ecount >> 1;
TCGv_i32 tcg_hi, tcg_lo, tcg_res;
- tcg_hi = do_reduction_op(s, fpopcode, rn, esize, bits, vmap_hi, fpst);
- tcg_lo = do_reduction_op(s, fpopcode, rn, esize, bits, vmap_lo, fpst);
+ tcg_hi = do_reduction_op(s, fpopcode, rn, esz,
+ ebase + half, half, fpst);
+ tcg_lo = do_reduction_op(s, fpopcode, rn, esz,
+ ebase, half, fpst);
tcg_res = tcg_temp_new_i32();
switch (fpopcode) {
@@ -9105,7 +9094,6 @@ static void disas_simd_across_lanes(DisasContext *s, uint32_t insn)
bool is_u = extract32(insn, 29, 1);
bool is_fp = false;
bool is_min = false;
- int esize;
int elements;
int i;
TCGv_i64 tcg_res, tcg_elt;
@@ -9152,8 +9140,7 @@ static void disas_simd_across_lanes(DisasContext *s, uint32_t insn)
return;
}
- esize = 8 << size;
- elements = (is_q ? 128 : 64) / esize;
+ elements = (is_q ? 16 : 8) >> size;
tcg_res = tcg_temp_new_i64();
tcg_elt = tcg_temp_new_i64();
@@ -9208,9 +9195,8 @@ static void disas_simd_across_lanes(DisasContext *s, uint32_t insn)
*/
TCGv_ptr fpst = fpstatus_ptr(size == MO_16 ? FPST_FPCR_F16 : FPST_FPCR);
int fpopcode = opcode | is_min << 4 | is_u << 5;
- int vmap = (1 << elements) - 1;
- TCGv_i32 tcg_res32 = do_reduction_op(s, fpopcode, rn, esize,
- (is_q ? 128 : 64), vmap, fpst);
+ TCGv_i32 tcg_res32 = do_reduction_op(s, fpopcode, rn, size,
+ 0, elements, fpst);
tcg_gen_extu_i32_i64(tcg_res, tcg_res32);
}
--
2.43.0
* [PATCH v3 10/29] target/arm: Convert ADDV, *ADDLV, *MAXV, *MINV to decodetree
From: Richard Henderson @ 2024-09-12 2:40 UTC (permalink / raw)
To: qemu-devel; +Cc: qemu-arm, Peter Maydell
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
target/arm/tcg/translate-a64.c | 140 ++++++++++++---------------------
target/arm/tcg/a64.decode | 12 +++
2 files changed, 61 insertions(+), 91 deletions(-)
diff --git a/target/arm/tcg/translate-a64.c b/target/arm/tcg/translate-a64.c
index 74efb35164..593a1774d8 100644
--- a/target/arm/tcg/translate-a64.c
+++ b/target/arm/tcg/translate-a64.c
@@ -6794,6 +6794,47 @@ TRANS(FNMADD, do_fmadd, a, true, true)
TRANS(FMSUB, do_fmadd, a, false, true)
TRANS(FNMSUB, do_fmadd, a, true, false)
+/*
+ * Advanced SIMD Across Lanes
+ */
+
+static bool do_int_reduction(DisasContext *s, arg_qrr_e *a, bool widen,
+ MemOp src_sign, NeonGenTwo64OpFn *fn)
+{
+ TCGv_i64 tcg_res, tcg_elt;
+ MemOp src_mop = a->esz | src_sign;
+ int elements = (a->q ? 16 : 8) >> a->esz;
+
+ /* Reject MO_64, and MO_32 without Q: a minimum of 4 elements. */
+ if (elements < 4) {
+ return false;
+ }
+ if (!fp_access_check(s)) {
+ return true;
+ }
+
+ tcg_res = tcg_temp_new_i64();
+ tcg_elt = tcg_temp_new_i64();
+
+ read_vec_element(s, tcg_res, a->rn, 0, src_mop);
+ for (int i = 1; i < elements; i++) {
+ read_vec_element(s, tcg_elt, a->rn, i, src_mop);
+ fn(tcg_res, tcg_res, tcg_elt);
+ }
+
+ tcg_gen_ext_i64(tcg_res, tcg_res, a->esz + widen);
+ write_fp_dreg(s, a->rd, tcg_res);
+ return true;
+}
+
+TRANS(ADDV, do_int_reduction, a, false, 0, tcg_gen_add_i64)
+TRANS(SADDLV, do_int_reduction, a, true, MO_SIGN, tcg_gen_add_i64)
+TRANS(UADDLV, do_int_reduction, a, true, 0, tcg_gen_add_i64)
+TRANS(SMAXV, do_int_reduction, a, false, MO_SIGN, tcg_gen_smax_i64)
+TRANS(UMAXV, do_int_reduction, a, false, 0, tcg_gen_umax_i64)
+TRANS(SMINV, do_int_reduction, a, false, MO_SIGN, tcg_gen_smin_i64)
+TRANS(UMINV, do_int_reduction, a, false, 0, tcg_gen_umin_i64)
+
/* Shift a TCGv src by TCGv shift_amount, put result in dst.
* Note that it is the caller's responsibility to ensure that the
* shift amount is in range (ie 0..31 or 0..63) and provide the ARM
@@ -9092,27 +9133,10 @@ static void disas_simd_across_lanes(DisasContext *s, uint32_t insn)
int opcode = extract32(insn, 12, 5);
bool is_q = extract32(insn, 30, 1);
bool is_u = extract32(insn, 29, 1);
- bool is_fp = false;
bool is_min = false;
int elements;
- int i;
- TCGv_i64 tcg_res, tcg_elt;
switch (opcode) {
- case 0x1b: /* ADDV */
- if (is_u) {
- unallocated_encoding(s);
- return;
- }
- /* fall through */
- case 0x3: /* SADDLV, UADDLV */
- case 0xa: /* SMAXV, UMAXV */
- case 0x1a: /* SMINV, UMINV */
- if (size == 3 || (size == 2 && !is_q)) {
- unallocated_encoding(s);
- return;
- }
- break;
case 0xc: /* FMAXNMV, FMINNMV */
case 0xf: /* FMAXV, FMINV */
/* Bit 1 of size field encodes min vs max and the actual size
@@ -9121,7 +9145,6 @@ static void disas_simd_across_lanes(DisasContext *s, uint32_t insn)
* precision.
*/
is_min = extract32(size, 1, 1);
- is_fp = true;
if (!is_u && dc_isar_feature(aa64_fp16, s)) {
size = 1;
} else if (!is_u || !is_q || extract32(size, 0, 1)) {
@@ -9132,6 +9155,10 @@ static void disas_simd_across_lanes(DisasContext *s, uint32_t insn)
}
break;
default:
+ case 0x3: /* SADDLV, UADDLV */
+ case 0xa: /* SMAXV, UMAXV */
+ case 0x1a: /* SMINV, UMINV */
+ case 0x1b: /* ADDV */
unallocated_encoding(s);
return;
}
@@ -9142,52 +9169,7 @@ static void disas_simd_across_lanes(DisasContext *s, uint32_t insn)
elements = (is_q ? 16 : 8) >> size;
- tcg_res = tcg_temp_new_i64();
- tcg_elt = tcg_temp_new_i64();
-
- /* These instructions operate across all lanes of a vector
- * to produce a single result. We can guarantee that a 64
- * bit intermediate is sufficient:
- * + for [US]ADDLV the maximum element size is 32 bits, and
- * the result type is 64 bits
- * + for FMAX*V, FMIN*V, ADDV the intermediate type is the
- * same as the element size, which is 32 bits at most
- * For the integer operations we can choose to work at 64
- * or 32 bits and truncate at the end; for simplicity
- * we use 64 bits always. The floating point
- * ops do require 32 bit intermediates, though.
- */
- if (!is_fp) {
- read_vec_element(s, tcg_res, rn, 0, size | (is_u ? 0 : MO_SIGN));
-
- for (i = 1; i < elements; i++) {
- read_vec_element(s, tcg_elt, rn, i, size | (is_u ? 0 : MO_SIGN));
-
- switch (opcode) {
- case 0x03: /* SADDLV / UADDLV */
- case 0x1b: /* ADDV */
- tcg_gen_add_i64(tcg_res, tcg_res, tcg_elt);
- break;
- case 0x0a: /* SMAXV / UMAXV */
- if (is_u) {
- tcg_gen_umax_i64(tcg_res, tcg_res, tcg_elt);
- } else {
- tcg_gen_smax_i64(tcg_res, tcg_res, tcg_elt);
- }
- break;
- case 0x1a: /* SMINV / UMINV */
- if (is_u) {
- tcg_gen_umin_i64(tcg_res, tcg_res, tcg_elt);
- } else {
- tcg_gen_smin_i64(tcg_res, tcg_res, tcg_elt);
- }
- break;
- default:
- g_assert_not_reached();
- }
-
- }
- } else {
+ {
/* Floating point vector reduction ops which work across 32
* bit (single) or 16 bit (half-precision) intermediates.
* Note that correct NaN propagation requires that we do these
@@ -9195,34 +9177,10 @@ static void disas_simd_across_lanes(DisasContext *s, uint32_t insn)
*/
TCGv_ptr fpst = fpstatus_ptr(size == MO_16 ? FPST_FPCR_F16 : FPST_FPCR);
int fpopcode = opcode | is_min << 4 | is_u << 5;
- TCGv_i32 tcg_res32 = do_reduction_op(s, fpopcode, rn, size,
- 0, elements, fpst);
- tcg_gen_extu_i32_i64(tcg_res, tcg_res32);
+ TCGv_i32 tcg_res = do_reduction_op(s, fpopcode, rn, size,
+ 0, elements, fpst);
+ write_fp_sreg(s, rd, tcg_res);
}
-
- /* Now truncate the result to the width required for the final output */
- if (opcode == 0x03) {
- /* SADDLV, UADDLV: result is 2*esize */
- size++;
- }
-
- switch (size) {
- case 0:
- tcg_gen_ext8u_i64(tcg_res, tcg_res);
- break;
- case 1:
- tcg_gen_ext16u_i64(tcg_res, tcg_res);
- break;
- case 2:
- tcg_gen_ext32u_i64(tcg_res, tcg_res);
- break;
- case 3:
- break;
- default:
- g_assert_not_reached();
- }
-
- write_fp_dreg(s, rd, tcg_res);
}
/* AdvSIMD modified immediate
diff --git a/target/arm/tcg/a64.decode b/target/arm/tcg/a64.decode
index 7f71c56f83..5ab4b11781 100644
--- a/target/arm/tcg/a64.decode
+++ b/target/arm/tcg/a64.decode
@@ -59,6 +59,8 @@
@rrr_q1e3 ........ ... rm:5 ...... rn:5 rd:5 &qrrr_e q=1 esz=3
@rrrr_q1e3 ........ ... rm:5 . ra:5 rn:5 rd:5 &qrrrr_e q=1 esz=3
+@qrr_e . q:1 ...... esz:2 ...... ...... rn:5 rd:5 &qrr_e
+
@qrrr_b . q:1 ...... ... rm:5 ...... rn:5 rd:5 &qrrr_e esz=0
@qrrr_h . q:1 ...... ... rm:5 ...... rn:5 rd:5 &qrrr_e esz=1
@qrrr_s . q:1 ...... ... rm:5 ...... rn:5 rd:5 &qrrr_e esz=2
@@ -1154,3 +1156,13 @@ TRN1 0.00 1110 .. 0 ..... 0 010 10 ..... ..... @qrrr_e
TRN2 0.00 1110 .. 0 ..... 0 110 10 ..... ..... @qrrr_e
ZIP1 0.00 1110 .. 0 ..... 0 011 10 ..... ..... @qrrr_e
ZIP2 0.00 1110 .. 0 ..... 0 111 10 ..... ..... @qrrr_e
+
+# Advanced SIMD Across Lanes
+
+ADDV 0.00 1110 .. 11000 11011 10 ..... ..... @qrr_e
+SADDLV 0.00 1110 .. 11000 00011 10 ..... ..... @qrr_e
+UADDLV 0.10 1110 .. 11000 00011 10 ..... ..... @qrr_e
+SMAXV 0.00 1110 .. 11000 01010 10 ..... ..... @qrr_e
+UMAXV 0.10 1110 .. 11000 01010 10 ..... ..... @qrr_e
+SMINV 0.00 1110 .. 11000 11010 10 ..... ..... @qrr_e
+UMINV 0.10 1110 .. 11000 11010 10 ..... ..... @qrr_e
--
2.43.0
* [PATCH v3 11/29] target/arm: Convert FMAXNMV, FMINNMV, FMAXV, FMINV to decodetree
From: Richard Henderson @ 2024-09-12 2:40 UTC (permalink / raw)
To: qemu-devel; +Cc: qemu-arm, Peter Maydell
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
target/arm/tcg/translate-a64.c | 176 ++++++++++-----------------------
target/arm/tcg/a64.decode | 14 +++
2 files changed, 67 insertions(+), 123 deletions(-)
diff --git a/target/arm/tcg/translate-a64.c b/target/arm/tcg/translate-a64.c
index 593a1774d8..aec2f6a542 100644
--- a/target/arm/tcg/translate-a64.c
+++ b/target/arm/tcg/translate-a64.c
@@ -6835,6 +6835,59 @@ TRANS(UMAXV, do_int_reduction, a, false, 0, tcg_gen_umax_i64)
TRANS(SMINV, do_int_reduction, a, false, MO_SIGN, tcg_gen_smin_i64)
TRANS(UMINV, do_int_reduction, a, false, 0, tcg_gen_umin_i64)
+/*
+ * do_fp_reduction helper
+ *
+ * This mirrors the Reduce() pseudocode in the ARM ARM. It is
+ * important for correct NaN propagation that we do these
+ * operations in exactly the order specified by the pseudocode.
+ *
+ * This is a recursive function.
+ */
+static TCGv_i32 do_reduction_op(DisasContext *s, int rn, MemOp esz,
+ int ebase, int ecount, TCGv_ptr fpst,
+ NeonGenTwoSingleOpFn *fn)
+{
+ if (ecount == 1) {
+ TCGv_i32 tcg_elem = tcg_temp_new_i32();
+ read_vec_element_i32(s, tcg_elem, rn, ebase, esz);
+ return tcg_elem;
+ } else {
+ int half = ecount >> 1;
+ TCGv_i32 tcg_hi, tcg_lo, tcg_res;
+
+ tcg_hi = do_reduction_op(s, rn, esz, ebase + half, half, fpst, fn);
+ tcg_lo = do_reduction_op(s, rn, esz, ebase, half, fpst, fn);
+ tcg_res = tcg_temp_new_i32();
+
+ fn(tcg_res, tcg_lo, tcg_hi, fpst);
+ return tcg_res;
+ }
+}
+
+static bool do_fp_reduction(DisasContext *s, arg_qrr_e *a,
+ NeonGenTwoSingleOpFn *fn)
+{
+ if (fp_access_check(s)) {
+ MemOp esz = a->esz;
+ int elts = (a->q ? 16 : 8) >> esz;
+ TCGv_ptr fpst = fpstatus_ptr(esz == MO_16 ? FPST_FPCR_F16 : FPST_FPCR);
+ TCGv_i32 res = do_reduction_op(s, a->rn, esz, 0, elts, fpst, fn);
+ write_fp_sreg(s, a->rd, res);
+ }
+ return true;
+}
+
+TRANS_FEAT(FMAXNMV_h, aa64_fp16, do_fp_reduction, a, gen_helper_advsimd_maxnumh)
+TRANS_FEAT(FMINNMV_h, aa64_fp16, do_fp_reduction, a, gen_helper_advsimd_minnumh)
+TRANS_FEAT(FMAXV_h, aa64_fp16, do_fp_reduction, a, gen_helper_advsimd_maxh)
+TRANS_FEAT(FMINV_h, aa64_fp16, do_fp_reduction, a, gen_helper_advsimd_minh)
+
+TRANS(FMAXNMV_s, do_fp_reduction, a, gen_helper_vfp_maxnums)
+TRANS(FMINNMV_s, do_fp_reduction, a, gen_helper_vfp_minnums)
+TRANS(FMAXV_s, do_fp_reduction, a, gen_helper_vfp_maxs)
+TRANS(FMINV_s, do_fp_reduction, a, gen_helper_vfp_mins)
+
/* Shift a TCGv src by TCGv shift_amount, put result in dst.
* Note that it is the caller's responsibility to ensure that the
* shift amount is in range (ie 0..31 or 0..63) and provide the ARM
@@ -9061,128 +9114,6 @@ static void disas_data_proc_fp(DisasContext *s, uint32_t insn)
}
}
-/*
- * do_reduction_op helper
- *
- * This mirrors the Reduce() pseudocode in the ARM ARM. It is
- * important for correct NaN propagation that we do these
- * operations in exactly the order specified by the pseudocode.
- *
- * This is a recursive function.
- */
-static TCGv_i32 do_reduction_op(DisasContext *s, int fpopcode, int rn,
- MemOp esz, int ebase, int ecount, TCGv_ptr fpst)
-{
- if (ecount == 1) {
- TCGv_i32 tcg_elem = tcg_temp_new_i32();
- read_vec_element_i32(s, tcg_elem, rn, ebase, esz);
- return tcg_elem;
- } else {
- int half = ecount >> 1;
- TCGv_i32 tcg_hi, tcg_lo, tcg_res;
-
- tcg_hi = do_reduction_op(s, fpopcode, rn, esz,
- ebase + half, half, fpst);
- tcg_lo = do_reduction_op(s, fpopcode, rn, esz,
- ebase, half, fpst);
- tcg_res = tcg_temp_new_i32();
-
- switch (fpopcode) {
- case 0x0c: /* fmaxnmv half-precision */
- gen_helper_advsimd_maxnumh(tcg_res, tcg_lo, tcg_hi, fpst);
- break;
- case 0x0f: /* fmaxv half-precision */
- gen_helper_advsimd_maxh(tcg_res, tcg_lo, tcg_hi, fpst);
- break;
- case 0x1c: /* fminnmv half-precision */
- gen_helper_advsimd_minnumh(tcg_res, tcg_lo, tcg_hi, fpst);
- break;
- case 0x1f: /* fminv half-precision */
- gen_helper_advsimd_minh(tcg_res, tcg_lo, tcg_hi, fpst);
- break;
- case 0x2c: /* fmaxnmv */
- gen_helper_vfp_maxnums(tcg_res, tcg_lo, tcg_hi, fpst);
- break;
- case 0x2f: /* fmaxv */
- gen_helper_vfp_maxs(tcg_res, tcg_lo, tcg_hi, fpst);
- break;
- case 0x3c: /* fminnmv */
- gen_helper_vfp_minnums(tcg_res, tcg_lo, tcg_hi, fpst);
- break;
- case 0x3f: /* fminv */
- gen_helper_vfp_mins(tcg_res, tcg_lo, tcg_hi, fpst);
- break;
- default:
- g_assert_not_reached();
- }
- return tcg_res;
- }
-}
-
-/* AdvSIMD across lanes
- * 31 30 29 28 24 23 22 21 17 16 12 11 10 9 5 4 0
- * +---+---+---+-----------+------+-----------+--------+-----+------+------+
- * | 0 | Q | U | 0 1 1 1 0 | size | 1 1 0 0 0 | opcode | 1 0 | Rn | Rd |
- * +---+---+---+-----------+------+-----------+--------+-----+------+------+
- */
-static void disas_simd_across_lanes(DisasContext *s, uint32_t insn)
-{
- int rd = extract32(insn, 0, 5);
- int rn = extract32(insn, 5, 5);
- int size = extract32(insn, 22, 2);
- int opcode = extract32(insn, 12, 5);
- bool is_q = extract32(insn, 30, 1);
- bool is_u = extract32(insn, 29, 1);
- bool is_min = false;
- int elements;
-
- switch (opcode) {
- case 0xc: /* FMAXNMV, FMINNMV */
- case 0xf: /* FMAXV, FMINV */
- /* Bit 1 of size field encodes min vs max and the actual size
- * depends on the encoding of the U bit. If not set (and FP16
- * enabled) then we do half-precision float instead of single
- * precision.
- */
- is_min = extract32(size, 1, 1);
- if (!is_u && dc_isar_feature(aa64_fp16, s)) {
- size = 1;
- } else if (!is_u || !is_q || extract32(size, 0, 1)) {
- unallocated_encoding(s);
- return;
- } else {
- size = 2;
- }
- break;
- default:
- case 0x3: /* SADDLV, UADDLV */
- case 0xa: /* SMAXV, UMAXV */
- case 0x1a: /* SMINV, UMINV */
- case 0x1b: /* ADDV */
- unallocated_encoding(s);
- return;
- }
-
- if (!fp_access_check(s)) {
- return;
- }
-
- elements = (is_q ? 16 : 8) >> size;
-
- {
- /* Floating point vector reduction ops which work across 32
- * bit (single) or 16 bit (half-precision) intermediates.
- * Note that correct NaN propagation requires that we do these
- * operations in exactly the order specified by the pseudocode.
- */
- TCGv_ptr fpst = fpstatus_ptr(size == MO_16 ? FPST_FPCR_F16 : FPST_FPCR);
- int fpopcode = opcode | is_min << 4 | is_u << 5;
- TCGv_i32 tcg_res = do_reduction_op(s, fpopcode, rn, size,
- 0, elements, fpst);
- write_fp_sreg(s, rd, tcg_res);
- }
-}
-
/* AdvSIMD modified immediate
* 31 30 29 28 19 18 16 15 12 11 10 9 5 4 0
* +---+---+----+---------------------+-----+-------+----+---+-------+------+
@@ -11735,7 +11666,6 @@ static void disas_simd_two_reg_misc_fp16(DisasContext *s, uint32_t insn)
static const AArch64DecodeTable data_proc_simd[] = {
/* pattern , mask , fn */
{ 0x0e200800, 0x9f3e0c00, disas_simd_two_reg_misc },
- { 0x0e300800, 0x9f3e0c00, disas_simd_across_lanes },
/* simd_mod_imm decode is a subset of simd_shift_imm, so must precede it */
{ 0x0f000400, 0x9ff80400, disas_simd_mod_imm },
{ 0x0f000400, 0x9f800400, disas_simd_shift_imm },
diff --git a/target/arm/tcg/a64.decode b/target/arm/tcg/a64.decode
index 5ab4b11781..c77f9fc987 100644
--- a/target/arm/tcg/a64.decode
+++ b/target/arm/tcg/a64.decode
@@ -54,11 +54,13 @@
@rrx_d ........ .. . rm:5 .... idx:1 . rn:5 rd:5 &rrx_e esz=3
@rr_q1e0 ........ ........ ...... rn:5 rd:5 &qrr_e q=1 esz=0
+@rr_q1e2 ........ ........ ...... rn:5 rd:5 &qrr_e q=1 esz=2
@r2r_q1e0 ........ ........ ...... rm:5 rd:5 &qrrr_e rn=%rd q=1 esz=0
@rrr_q1e0 ........ ... rm:5 ...... rn:5 rd:5 &qrrr_e q=1 esz=0
@rrr_q1e3 ........ ... rm:5 ...... rn:5 rd:5 &qrrr_e q=1 esz=3
@rrrr_q1e3 ........ ... rm:5 . ra:5 rn:5 rd:5 &qrrrr_e q=1 esz=3
+@qrr_h . q:1 ...... .. ...... ...... rn:5 rd:5 &qrr_e esz=1
@qrr_e . q:1 ...... esz:2 ...... ...... rn:5 rd:5 &qrr_e
@qrrr_b . q:1 ...... ... rm:5 ...... rn:5 rd:5 &qrrr_e esz=0
@@ -1166,3 +1168,15 @@ SMAXV 0.00 1110 .. 11000 01010 10 ..... ..... @qrr_e
UMAXV 0.10 1110 .. 11000 01010 10 ..... ..... @qrr_e
SMINV 0.00 1110 .. 11000 11010 10 ..... ..... @qrr_e
UMINV 0.10 1110 .. 11000 11010 10 ..... ..... @qrr_e
+
+FMAXNMV_h 0.00 1110 00 11000 01100 10 ..... ..... @qrr_h
+FMAXNMV_s 0110 1110 00 11000 01100 10 ..... ..... @rr_q1e2
+
+FMINNMV_h 0.00 1110 10 11000 01100 10 ..... ..... @qrr_h
+FMINNMV_s 0110 1110 10 11000 01100 10 ..... ..... @rr_q1e2
+
+FMAXV_h 0.00 1110 00 11000 01111 10 ..... ..... @qrr_h
+FMAXV_s 0110 1110 00 11000 01111 10 ..... ..... @rr_q1e2
+
+FMINV_h 0.00 1110 10 11000 01111 10 ..... ..... @qrr_h
+FMINV_s 0110 1110 10 11000 01111 10 ..... ..... @rr_q1e2
--
2.43.0
* [PATCH v3 12/29] target/arm: Convert FMOVI (scalar, immediate) to decodetree
2024-09-12 2:40 [PATCH v3 00/29] target/arm: AdvSIMD decodetree conversion, part 4 Richard Henderson
` (10 preceding siblings ...)
2024-09-12 2:40 ` [PATCH v3 11/29] target/arm: Convert FMAXNMV, FMINNMV, FMAXV, FMINV " Richard Henderson
@ 2024-09-12 2:40 ` Richard Henderson
2024-09-12 2:40 ` [PATCH v3 13/29] target/arm: Convert MOVI, FMOV, ORR, BIC (vector " Richard Henderson
` (17 subsequent siblings)
29 siblings, 0 replies; 32+ messages in thread
From: Richard Henderson @ 2024-09-12 2:40 UTC (permalink / raw)
To: qemu-devel; +Cc: qemu-arm, Peter Maydell
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
target/arm/tcg/translate-a64.c | 74 ++++++++++++----------------------
target/arm/tcg/a64.decode | 4 ++
2 files changed, 30 insertions(+), 48 deletions(-)
diff --git a/target/arm/tcg/translate-a64.c b/target/arm/tcg/translate-a64.c
index aec2f6a542..0e290062ef 100644
--- a/target/arm/tcg/translate-a64.c
+++ b/target/arm/tcg/translate-a64.c
@@ -6888,6 +6888,31 @@ TRANS(FMINNMV_s, do_fp_reduction, a, gen_helper_vfp_minnums)
TRANS(FMAXV_s, do_fp_reduction, a, gen_helper_vfp_maxs)
TRANS(FMINV_s, do_fp_reduction, a, gen_helper_vfp_mins)
+/*
+ * Floating-point Immediate
+ */
+
+static bool trans_FMOVI_s(DisasContext *s, arg_FMOVI_s *a)
+{
+ switch (a->esz) {
+ case MO_32:
+ case MO_64:
+ break;
+ case MO_16:
+ if (!dc_isar_feature(aa64_fp16, s)) {
+ return false;
+ }
+ break;
+ default:
+ return false;
+ }
+ if (fp_access_check(s)) {
+ uint64_t imm = vfp_expand_imm(a->esz, a->imm);
+ write_fp_dreg(s, a->rd, tcg_constant_i64(imm));
+ }
+ return true;
+}
+
/* Shift a TCGv src by TCGv shift_amount, put result in dst.
* Note that it is the caller's responsibility to ensure that the
* shift amount is in range (ie 0..31 or 0..63) and provide the ARM
@@ -8625,53 +8650,6 @@ static void disas_fp_1src(DisasContext *s, uint32_t insn)
}
}
-/* Floating point immediate
- * 31 30 29 28 24 23 22 21 20 13 12 10 9 5 4 0
- * +---+---+---+-----------+------+---+------------+-------+------+------+
- * | M | 0 | S | 1 1 1 1 0 | type | 1 | imm8 | 1 0 0 | imm5 | Rd |
- * +---+---+---+-----------+------+---+------------+-------+------+------+
- */
-static void disas_fp_imm(DisasContext *s, uint32_t insn)
-{
- int rd = extract32(insn, 0, 5);
- int imm5 = extract32(insn, 5, 5);
- int imm8 = extract32(insn, 13, 8);
- int type = extract32(insn, 22, 2);
- int mos = extract32(insn, 29, 3);
- uint64_t imm;
- MemOp sz;
-
- if (mos || imm5) {
- unallocated_encoding(s);
- return;
- }
-
- switch (type) {
- case 0:
- sz = MO_32;
- break;
- case 1:
- sz = MO_64;
- break;
- case 3:
- sz = MO_16;
- if (dc_isar_feature(aa64_fp16, s)) {
- break;
- }
- /* fallthru */
- default:
- unallocated_encoding(s);
- return;
- }
-
- if (!fp_access_check(s)) {
- return;
- }
-
- imm = vfp_expand_imm(sz, imm8);
- write_fp_dreg(s, rd, tcg_constant_i64(imm));
-}
-
/* Handle floating point <=> fixed point conversions. Note that we can
* also deal with fp <=> integer conversions as a special case (scale == 64)
* OPTME: consider handling that special case specially or at least skipping
@@ -9091,7 +9069,7 @@ static void disas_data_proc_fp(DisasContext *s, uint32_t insn)
switch (ctz32(extract32(insn, 12, 4))) {
case 0: /* [15:12] == xxx1 */
/* Floating point immediate */
- disas_fp_imm(s, insn);
+ unallocated_encoding(s); /* in decodetree */
break;
case 1: /* [15:12] == xx10 */
/* Floating point compare */
diff --git a/target/arm/tcg/a64.decode b/target/arm/tcg/a64.decode
index c77f9fc987..e11e293631 100644
--- a/target/arm/tcg/a64.decode
+++ b/target/arm/tcg/a64.decode
@@ -1180,3 +1180,7 @@ FMAXV_s 0110 1110 00 11000 01111 10 ..... ..... @rr_q1e2
FMINV_h 0.00 1110 10 11000 01111 10 ..... ..... @qrr_h
FMINV_s 0110 1110 10 11000 01111 10 ..... ..... @rr_q1e2
+
+# Floating-point Immediate
+
+FMOVI_s 0001 1110 .. 1 imm:8 100 00000 rd:5 esz=%esz_hsd
--
2.43.0
* [PATCH v3 13/29] target/arm: Convert MOVI, FMOV, ORR, BIC (vector immediate) to decodetree
2024-09-12 2:40 [PATCH v3 00/29] target/arm: AdvSIMD decodetree conversion, part 4 Richard Henderson
` (11 preceding siblings ...)
2024-09-12 2:40 ` [PATCH v3 12/29] target/arm: Convert FMOVI (scalar, immediate) " Richard Henderson
@ 2024-09-12 2:40 ` Richard Henderson
2024-09-12 2:40 ` [PATCH v3 14/29] target/arm: Introduce gen_gvec_sshr, gen_gvec_ushr Richard Henderson
` (16 subsequent siblings)
29 siblings, 0 replies; 32+ messages in thread
From: Richard Henderson @ 2024-09-12 2:40 UTC (permalink / raw)
To: qemu-devel; +Cc: qemu-arm, Peter Maydell
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
target/arm/tcg/translate-a64.c | 117 ++++++++++++++-------------------
target/arm/tcg/a64.decode | 9 +++
2 files changed, 59 insertions(+), 67 deletions(-)
diff --git a/target/arm/tcg/translate-a64.c b/target/arm/tcg/translate-a64.c
index 0e290062ef..53022f4fc0 100644
--- a/target/arm/tcg/translate-a64.c
+++ b/target/arm/tcg/translate-a64.c
@@ -6913,6 +6913,52 @@ static bool trans_FMOVI_s(DisasContext *s, arg_FMOVI_s *a)
return true;
}
+/*
+ * Advanced SIMD Modified Immediate
+ */
+
+static bool trans_FMOVI_v_h(DisasContext *s, arg_FMOVI_v_h *a)
+{
+ if (!dc_isar_feature(aa64_fp16, s)) {
+ return false;
+ }
+ if (fp_access_check(s)) {
+ tcg_gen_gvec_dup_imm(MO_16, vec_full_reg_offset(s, a->rd),
+ a->q ? 16 : 8, vec_full_reg_size(s),
+ vfp_expand_imm(MO_16, a->abcdefgh));
+ }
+ return true;
+}
+
+static void gen_movi(unsigned vece, uint32_t dofs, uint32_t aofs,
+ int64_t c, uint32_t oprsz, uint32_t maxsz)
+{
+ tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, c);
+}
+
+static bool trans_Vimm(DisasContext *s, arg_Vimm *a)
+{
+ GVecGen2iFn *fn;
+
+ /* Handle decode of cmode/op here between ORR/BIC/MOVI */
+ if ((a->cmode & 1) && a->cmode < 12) {
+ /* For op=1, the imm will be inverted, so BIC becomes AND. */
+ fn = a->op ? tcg_gen_gvec_andi : tcg_gen_gvec_ori;
+ } else {
+ /* There is one unallocated cmode/op combination in this space */
+ if (a->cmode == 15 && a->op == 1 && a->q == 0) {
+ return false;
+ }
+ fn = gen_movi;
+ }
+
+ if (fp_access_check(s)) {
+ uint64_t imm = asimd_imm_const(a->abcdefgh, a->cmode, a->op);
+ gen_gvec_fn2i(s, a->q, a->rd, a->rd, imm, fn, MO_64);
+ }
+ return true;
+}
+
/* Shift a TCGv src by TCGv shift_amount, put result in dst.
* Note that it is the caller's responsibility to ensure that the
* shift amount is in range (ie 0..31 or 0..63) and provide the ARM
@@ -9092,69 +9138,6 @@ static void disas_data_proc_fp(DisasContext *s, uint32_t insn)
}
}
-/* AdvSIMD modified immediate
- * 31 30 29 28 19 18 16 15 12 11 10 9 5 4 0
- * +---+---+----+---------------------+-----+-------+----+---+-------+------+
- * | 0 | Q | op | 0 1 1 1 1 0 0 0 0 0 | abc | cmode | o2 | 1 | defgh | Rd |
- * +---+---+----+---------------------+-----+-------+----+---+-------+------+
- *
- * There are a number of operations that can be carried out here:
- * MOVI - move (shifted) imm into register
- * MVNI - move inverted (shifted) imm into register
- * ORR - bitwise OR of (shifted) imm with register
- * BIC - bitwise clear of (shifted) imm with register
- * With ARMv8.2 we also have:
- * FMOV half-precision
- */
-static void disas_simd_mod_imm(DisasContext *s, uint32_t insn)
-{
- int rd = extract32(insn, 0, 5);
- int cmode = extract32(insn, 12, 4);
- int o2 = extract32(insn, 11, 1);
- uint64_t abcdefgh = extract32(insn, 5, 5) | (extract32(insn, 16, 3) << 5);
- bool is_neg = extract32(insn, 29, 1);
- bool is_q = extract32(insn, 30, 1);
- uint64_t imm = 0;
-
- if (o2) {
- if (cmode != 0xf || is_neg) {
- unallocated_encoding(s);
- return;
- }
- /* FMOV (vector, immediate) - half-precision */
- if (!dc_isar_feature(aa64_fp16, s)) {
- unallocated_encoding(s);
- return;
- }
- imm = vfp_expand_imm(MO_16, abcdefgh);
- /* now duplicate across the lanes */
- imm = dup_const(MO_16, imm);
- } else {
- if (cmode == 0xf && is_neg && !is_q) {
- unallocated_encoding(s);
- return;
- }
- imm = asimd_imm_const(abcdefgh, cmode, is_neg);
- }
-
- if (!fp_access_check(s)) {
- return;
- }
-
- if (!((cmode & 0x9) == 0x1 || (cmode & 0xd) == 0x9)) {
- /* MOVI or MVNI, with MVNI negation handled above. */
- tcg_gen_gvec_dup_imm(MO_64, vec_full_reg_offset(s, rd), is_q ? 16 : 8,
- vec_full_reg_size(s), imm);
- } else {
- /* ORR or BIC, with BIC negation to AND handled above. */
- if (is_neg) {
- gen_gvec_fn2i(s, is_q, rd, rd, imm, tcg_gen_gvec_andi, MO_64);
- } else {
- gen_gvec_fn2i(s, is_q, rd, rd, imm, tcg_gen_gvec_ori, MO_64);
- }
- }
-}
-
/*
* Common SSHR[RA]/USHR[RA] - Shift right (optional rounding/accumulate)
*
@@ -10635,8 +10618,10 @@ static void disas_simd_shift_imm(DisasContext *s, uint32_t insn)
bool is_u = extract32(insn, 29, 1);
bool is_q = extract32(insn, 30, 1);
- /* data_proc_simd[] has sent immh == 0 to disas_simd_mod_imm. */
- assert(immh != 0);
+ if (immh == 0) {
+ unallocated_encoding(s);
+ return;
+ }
switch (opcode) {
case 0x08: /* SRI */
@@ -11644,8 +11629,6 @@ static void disas_simd_two_reg_misc_fp16(DisasContext *s, uint32_t insn)
static const AArch64DecodeTable data_proc_simd[] = {
/* pattern , mask , fn */
{ 0x0e200800, 0x9f3e0c00, disas_simd_two_reg_misc },
- /* simd_mod_imm decode is a subset of simd_shift_imm, so must precede it */
- { 0x0f000400, 0x9ff80400, disas_simd_mod_imm },
{ 0x0f000400, 0x9f800400, disas_simd_shift_imm },
{ 0x5e200800, 0xdf3e0c00, disas_simd_scalar_two_reg_misc },
{ 0x5f000400, 0xdf800400, disas_simd_scalar_shift_imm },
diff --git a/target/arm/tcg/a64.decode b/target/arm/tcg/a64.decode
index e11e293631..278d7873c2 100644
--- a/target/arm/tcg/a64.decode
+++ b/target/arm/tcg/a64.decode
@@ -1184,3 +1184,12 @@ FMINV_s 0110 1110 10 11000 01111 10 ..... ..... @rr_q1e2
# Floating-point Immediate
FMOVI_s 0001 1110 .. 1 imm:8 100 00000 rd:5 esz=%esz_hsd
+
+# Advanced SIMD Modified Immediate
+
+%abcdefgh 16:3 5:5
+
+FMOVI_v_h 0 q:1 00 1111 00000 ... 1111 11 ..... rd:5 %abcdefgh
+
+# MOVI, MVNI, ORR, BIC, FMOV are all intermixed via cmode.
+Vimm 0 q:1 op:1 0 1111 00000 ... cmode:4 01 ..... rd:5 %abcdefgh
--
2.43.0
* [PATCH v3 14/29] target/arm: Introduce gen_gvec_sshr, gen_gvec_ushr
2024-09-12 2:40 [PATCH v3 00/29] target/arm: AdvSIMD decodetree conversion, part 4 Richard Henderson
` (12 preceding siblings ...)
2024-09-12 2:40 ` [PATCH v3 13/29] target/arm: Convert MOVI, FMOV, ORR, BIC (vector " Richard Henderson
@ 2024-09-12 2:40 ` Richard Henderson
2024-09-12 2:41 ` [PATCH v3 15/29] target/arm: Fix whitespace near gen_srshr64_i64 Richard Henderson
` (15 subsequent siblings)
29 siblings, 0 replies; 32+ messages in thread
From: Richard Henderson @ 2024-09-12 2:40 UTC (permalink / raw)
To: qemu-devel; +Cc: qemu-arm, Peter Maydell
Handle the two special cases within these new
functions instead of higher in the call stack.
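For illustration, the two special cases behave like this scalar sketch (plain C,
not the TCG implementation; the element value is assumed to be held sign- or
zero-extended in a 64-bit scalar, and the function names are made up for the
example):

    #include <stdint.h>

    /* SSHR: shifting by the full element width leaves only sign bits. */
    static int64_t sshr_elem(int64_t x, int shift, int ebits)
    {
        if (shift >= ebits) {
            shift = ebits - 1;
        }
        return x >> shift;
    }

    /* USHR: shifting by the full element width leaves zero. */
    static uint64_t ushr_elem(uint64_t x, int shift, int ebits)
    {
        return shift >= ebits ? 0 : x >> shift;
    }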
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
target/arm/tcg/translate.h | 5 +++++
target/arm/tcg/gengvec.c | 19 +++++++++++++++++++
target/arm/tcg/translate-a64.c | 16 +---------------
target/arm/tcg/translate-neon.c | 25 ++-----------------------
4 files changed, 27 insertions(+), 38 deletions(-)
diff --git a/target/arm/tcg/translate.h b/target/arm/tcg/translate.h
index 3f0e9ceaa3..45990ae292 100644
--- a/target/arm/tcg/translate.h
+++ b/target/arm/tcg/translate.h
@@ -514,6 +514,11 @@ void gen_sqsub_d(TCGv_i64 d, TCGv_i64 q, TCGv_i64 a, TCGv_i64 b);
void gen_gvec_sqsub_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz);
+void gen_gvec_sshr(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
+ int64_t shift, uint32_t opr_sz, uint32_t max_sz);
+void gen_gvec_ushr(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
+ int64_t shift, uint32_t opr_sz, uint32_t max_sz);
+
void gen_gvec_ssra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
int64_t shift, uint32_t opr_sz, uint32_t max_sz);
void gen_gvec_usra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
diff --git a/target/arm/tcg/gengvec.c b/target/arm/tcg/gengvec.c
index c5fc1b6cfb..33c5084ea6 100644
--- a/target/arm/tcg/gengvec.c
+++ b/target/arm/tcg/gengvec.c
@@ -88,6 +88,25 @@ GEN_CMP0(gen_gvec_cgt0, TCG_COND_GT)
#undef GEN_CMP0
+void gen_gvec_sshr(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
+ int64_t shift, uint32_t opr_sz, uint32_t max_sz)
+{
+ /* Signed shift out of range results in all-sign-bits */
+ shift = MIN(shift, (8 << vece) - 1);
+ tcg_gen_gvec_sari(vece, rd_ofs, rm_ofs, shift, opr_sz, max_sz);
+}
+
+void gen_gvec_ushr(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
+ int64_t shift, uint32_t opr_sz, uint32_t max_sz)
+{
+ /* Unsigned shift out of range results in all-zero-bits */
+ if (shift >= (8 << vece)) {
+ tcg_gen_gvec_dup_imm(vece, rd_ofs, opr_sz, max_sz, 0);
+ } else {
+ tcg_gen_gvec_shri(vece, rd_ofs, rm_ofs, shift, opr_sz, max_sz);
+ }
+}
+
static void gen_ssra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
{
tcg_gen_vec_sar8i_i64(a, a, shift);
diff --git a/target/arm/tcg/translate-a64.c b/target/arm/tcg/translate-a64.c
index 53022f4fc0..032bd33650 100644
--- a/target/arm/tcg/translate-a64.c
+++ b/target/arm/tcg/translate-a64.c
@@ -10452,21 +10452,7 @@ static void handle_vec_simd_shri(DisasContext *s, bool is_q, bool is_u,
break;
case 0x00: /* SSHR / USHR */
- if (is_u) {
- if (shift == 8 << size) {
- /* Shift count the same size as element size produces zero. */
- tcg_gen_gvec_dup_imm(size, vec_full_reg_offset(s, rd),
- is_q ? 16 : 8, vec_full_reg_size(s), 0);
- return;
- }
- gvec_fn = tcg_gen_gvec_shri;
- } else {
- /* Shift count the same size as element size produces all sign. */
- if (shift == 8 << size) {
- shift -= 1;
- }
- gvec_fn = tcg_gen_gvec_sari;
- }
+ gvec_fn = is_u ? gen_gvec_ushr : gen_gvec_sshr;
break;
case 0x04: /* SRSHR / URSHR (rounding) */
diff --git a/target/arm/tcg/translate-neon.c b/target/arm/tcg/translate-neon.c
index 13cd31aad4..a31a78c347 100644
--- a/target/arm/tcg/translate-neon.c
+++ b/target/arm/tcg/translate-neon.c
@@ -1099,29 +1099,8 @@ DO_2SH(VRSHR_S, gen_gvec_srshr)
DO_2SH(VRSHR_U, gen_gvec_urshr)
DO_2SH(VRSRA_S, gen_gvec_srsra)
DO_2SH(VRSRA_U, gen_gvec_ursra)
-
-static bool trans_VSHR_S_2sh(DisasContext *s, arg_2reg_shift *a)
-{
- /* Signed shift out of range results in all-sign-bits */
- a->shift = MIN(a->shift, (8 << a->size) - 1);
- return do_vector_2sh(s, a, tcg_gen_gvec_sari);
-}
-
-static void gen_zero_rd_2sh(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
- int64_t shift, uint32_t oprsz, uint32_t maxsz)
-{
- tcg_gen_gvec_dup_imm(vece, rd_ofs, oprsz, maxsz, 0);
-}
-
-static bool trans_VSHR_U_2sh(DisasContext *s, arg_2reg_shift *a)
-{
- /* Shift out of range is architecturally valid and results in zero. */
- if (a->shift >= (8 << a->size)) {
- return do_vector_2sh(s, a, gen_zero_rd_2sh);
- } else {
- return do_vector_2sh(s, a, tcg_gen_gvec_shri);
- }
-}
+DO_2SH(VSHR_S, gen_gvec_sshr)
+DO_2SH(VSHR_U, gen_gvec_ushr)
static bool do_2shift_env_64(DisasContext *s, arg_2reg_shift *a,
NeonGenTwo64OpEnvFn *fn)
--
2.43.0
* [PATCH v3 15/29] target/arm: Fix whitespace near gen_srshr64_i64
2024-09-12 2:40 [PATCH v3 00/29] target/arm: AdvSIMD decodetree conversion, part 4 Richard Henderson
` (13 preceding siblings ...)
2024-09-12 2:40 ` [PATCH v3 14/29] target/arm: Introduce gen_gvec_sshr, gen_gvec_ushr Richard Henderson
@ 2024-09-12 2:41 ` Richard Henderson
2024-09-12 2:41 ` [PATCH v3 16/29] target/arm: Convert handle_vec_simd_shri to decodetree Richard Henderson
` (14 subsequent siblings)
29 siblings, 0 replies; 32+ messages in thread
From: Richard Henderson @ 2024-09-12 2:41 UTC (permalink / raw)
To: qemu-devel; +Cc: qemu-arm, Philippe Mathieu-Daudé, Peter Maydell
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
target/arm/tcg/gengvec.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/target/arm/tcg/gengvec.c b/target/arm/tcg/gengvec.c
index 33c5084ea6..3abdc57202 100644
--- a/target/arm/tcg/gengvec.c
+++ b/target/arm/tcg/gengvec.c
@@ -304,7 +304,7 @@ void gen_srshr32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh)
tcg_gen_add_i32(d, d, t);
}
- void gen_srshr64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
+void gen_srshr64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
{
TCGv_i64 t = tcg_temp_new_i64();
--
2.43.0
* [PATCH v3 16/29] target/arm: Convert handle_vec_simd_shri to decodetree
2024-09-12 2:40 [PATCH v3 00/29] target/arm: AdvSIMD decodetree conversion, part 4 Richard Henderson
` (14 preceding siblings ...)
2024-09-12 2:41 ` [PATCH v3 15/29] target/arm: Fix whitespace near gen_srshr64_i64 Richard Henderson
@ 2024-09-12 2:41 ` Richard Henderson
2024-09-12 2:41 ` [PATCH v3 17/29] target/arm: Convert handle_vec_simd_shli " Richard Henderson
` (13 subsequent siblings)
29 siblings, 0 replies; 32+ messages in thread
From: Richard Henderson @ 2024-09-12 2:41 UTC (permalink / raw)
To: qemu-devel; +Cc: qemu-arm, Peter Maydell
This includes SSHR, USHR, SSRA, USRA, SRSHR, URSHR, SRSRA, URSRA, SRI.
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
target/arm/tcg/translate-a64.c | 86 +++++++++++-----------------------
target/arm/tcg/a64.decode | 63 ++++++++++++++++++++++++-
2 files changed, 89 insertions(+), 60 deletions(-)
diff --git a/target/arm/tcg/translate-a64.c b/target/arm/tcg/translate-a64.c
index 032bd33650..5c76cdf101 100644
--- a/target/arm/tcg/translate-a64.c
+++ b/target/arm/tcg/translate-a64.c
@@ -6959,6 +6959,28 @@ static bool trans_Vimm(DisasContext *s, arg_Vimm *a)
return true;
}
+/*
+ * Advanced SIMD Shift by Immediate
+ */
+
+static bool do_vec_shift_imm(DisasContext *s, arg_qrri_e *a, GVecGen2iFn *fn)
+{
+ if (fp_access_check(s)) {
+ gen_gvec_fn2i(s, a->q, a->rd, a->rn, a->imm, fn, a->esz);
+ }
+ return true;
+}
+
+TRANS(SSHR_v, do_vec_shift_imm, a, gen_gvec_sshr)
+TRANS(USHR_v, do_vec_shift_imm, a, gen_gvec_ushr)
+TRANS(SSRA_v, do_vec_shift_imm, a, gen_gvec_ssra)
+TRANS(USRA_v, do_vec_shift_imm, a, gen_gvec_usra)
+TRANS(SRSHR_v, do_vec_shift_imm, a, gen_gvec_srshr)
+TRANS(URSHR_v, do_vec_shift_imm, a, gen_gvec_urshr)
+TRANS(SRSRA_v, do_vec_shift_imm, a, gen_gvec_srsra)
+TRANS(URSRA_v, do_vec_shift_imm, a, gen_gvec_ursra)
+TRANS(SRI_v, do_vec_shift_imm, a, gen_gvec_sri)
+
/* Shift a TCGv src by TCGv shift_amount, put result in dst.
* Note that it is the caller's responsibility to ensure that the
* shift amount is in range (ie 0..31 or 0..63) and provide the ARM
@@ -10423,53 +10445,6 @@ static void disas_simd_scalar_two_reg_misc(DisasContext *s, uint32_t insn)
}
}
-/* SSHR[RA]/USHR[RA] - Vector shift right (optional rounding/accumulate) */
-static void handle_vec_simd_shri(DisasContext *s, bool is_q, bool is_u,
- int immh, int immb, int opcode, int rn, int rd)
-{
- int size = 32 - clz32(immh) - 1;
- int immhb = immh << 3 | immb;
- int shift = 2 * (8 << size) - immhb;
- GVecGen2iFn *gvec_fn;
-
- if (extract32(immh, 3, 1) && !is_q) {
- unallocated_encoding(s);
- return;
- }
- tcg_debug_assert(size <= 3);
-
- if (!fp_access_check(s)) {
- return;
- }
-
- switch (opcode) {
- case 0x02: /* SSRA / USRA (accumulate) */
- gvec_fn = is_u ? gen_gvec_usra : gen_gvec_ssra;
- break;
-
- case 0x08: /* SRI */
- gvec_fn = gen_gvec_sri;
- break;
-
- case 0x00: /* SSHR / USHR */
- gvec_fn = is_u ? gen_gvec_ushr : gen_gvec_sshr;
- break;
-
- case 0x04: /* SRSHR / URSHR (rounding) */
- gvec_fn = is_u ? gen_gvec_urshr : gen_gvec_srshr;
- break;
-
- case 0x06: /* SRSRA / URSRA (accum + rounding) */
- gvec_fn = is_u ? gen_gvec_ursra : gen_gvec_srsra;
- break;
-
- default:
- g_assert_not_reached();
- }
-
- gen_gvec_fn2i(s, is_q, rd, rn, shift, gvec_fn, size);
-}
-
/* SHL/SLI - Vector shift left */
static void handle_vec_simd_shli(DisasContext *s, bool is_q, bool insert,
int immh, int immb, int opcode, int rn, int rd)
@@ -10610,18 +10585,6 @@ static void disas_simd_shift_imm(DisasContext *s, uint32_t insn)
}
switch (opcode) {
- case 0x08: /* SRI */
- if (!is_u) {
- unallocated_encoding(s);
- return;
- }
- /* fall through */
- case 0x00: /* SSHR / USHR */
- case 0x02: /* SSRA / USRA (accumulate) */
- case 0x04: /* SRSHR / URSHR (rounding) */
- case 0x06: /* SRSRA / URSRA (accum + rounding) */
- handle_vec_simd_shri(s, is_q, is_u, immh, immb, opcode, rn, rd);
- break;
case 0x0a: /* SHL / SLI */
handle_vec_simd_shli(s, is_q, is_u, immh, immb, opcode, rn, rd);
break;
@@ -10660,6 +10623,11 @@ static void disas_simd_shift_imm(DisasContext *s, uint32_t insn)
handle_simd_shift_fpint_conv(s, false, is_q, is_u, immh, immb, rn, rd);
return;
default:
+ case 0x00: /* SSHR / USHR */
+ case 0x02: /* SSRA / USRA (accumulate) */
+ case 0x04: /* SRSHR / URSHR (rounding) */
+ case 0x06: /* SRSRA / URSRA (accum + rounding) */
+ case 0x08: /* SRI */
unallocated_encoding(s);
return;
}
diff --git a/target/arm/tcg/a64.decode b/target/arm/tcg/a64.decode
index 278d7873c2..74ba1fa07c 100644
--- a/target/arm/tcg/a64.decode
+++ b/target/arm/tcg/a64.decode
@@ -34,6 +34,7 @@
&rrx_e rd rn rm idx esz
&rrrr_e rd rn rm ra esz
&qrr_e q rd rn esz
+&qrri_e q rd rn imm esz
&qrrr_e q rd rn rm esz
&qrrx_e q rd rn rm idx esz
&qrrrr_e q rd rn rm ra esz
@@ -1185,11 +1186,71 @@ FMINV_s 0110 1110 10 11000 01111 10 ..... ..... @rr_q1e2
FMOVI_s 0001 1110 .. 1 imm:8 100 00000 rd:5 esz=%esz_hsd
-# Advanced SIMD Modified Immediate
+# Advanced SIMD Modified Immediate / Shift by Immediate
%abcdefgh 16:3 5:5
+# Right shifts are encoded as N - shift, where N is the element size in bits.
+%neon_rshift_i6 16:6 !function=rsub_64
+%neon_rshift_i5 16:5 !function=rsub_32
+%neon_rshift_i4 16:4 !function=rsub_16
+%neon_rshift_i3 16:3 !function=rsub_8
+
+@q_shri_b . q:1 .. ..... 0001 ... ..... . rn:5 rd:5 \
+ &qrri_e esz=0 imm=%neon_rshift_i3
+@q_shri_h . q:1 .. ..... 001 .... ..... . rn:5 rd:5 \
+ &qrri_e esz=1 imm=%neon_rshift_i4
+@q_shri_s . q:1 .. ..... 01 ..... ..... . rn:5 rd:5 \
+ &qrri_e esz=2 imm=%neon_rshift_i5
+@q_shri_d . 1 .. ..... 1 ...... ..... . rn:5 rd:5 \
+ &qrri_e esz=3 imm=%neon_rshift_i6 q=1
+
FMOVI_v_h 0 q:1 00 1111 00000 ... 1111 11 ..... rd:5 %abcdefgh
# MOVI, MVNI, ORR, BIC, FMOV are all intermixed via cmode.
Vimm 0 q:1 op:1 0 1111 00000 ... cmode:4 01 ..... rd:5 %abcdefgh
+
+SSHR_v 0.00 11110 .... ... 00000 1 ..... ..... @q_shri_b
+SSHR_v 0.00 11110 .... ... 00000 1 ..... ..... @q_shri_h
+SSHR_v 0.00 11110 .... ... 00000 1 ..... ..... @q_shri_s
+SSHR_v 0.00 11110 .... ... 00000 1 ..... ..... @q_shri_d
+
+USHR_v 0.10 11110 .... ... 00000 1 ..... ..... @q_shri_b
+USHR_v 0.10 11110 .... ... 00000 1 ..... ..... @q_shri_h
+USHR_v 0.10 11110 .... ... 00000 1 ..... ..... @q_shri_s
+USHR_v 0.10 11110 .... ... 00000 1 ..... ..... @q_shri_d
+
+SSRA_v 0.00 11110 .... ... 00010 1 ..... ..... @q_shri_b
+SSRA_v 0.00 11110 .... ... 00010 1 ..... ..... @q_shri_h
+SSRA_v 0.00 11110 .... ... 00010 1 ..... ..... @q_shri_s
+SSRA_v 0.00 11110 .... ... 00010 1 ..... ..... @q_shri_d
+
+USRA_v 0.10 11110 .... ... 00010 1 ..... ..... @q_shri_b
+USRA_v 0.10 11110 .... ... 00010 1 ..... ..... @q_shri_h
+USRA_v 0.10 11110 .... ... 00010 1 ..... ..... @q_shri_s
+USRA_v 0.10 11110 .... ... 00010 1 ..... ..... @q_shri_d
+
+SRSHR_v 0.00 11110 .... ... 00100 1 ..... ..... @q_shri_b
+SRSHR_v 0.00 11110 .... ... 00100 1 ..... ..... @q_shri_h
+SRSHR_v 0.00 11110 .... ... 00100 1 ..... ..... @q_shri_s
+SRSHR_v 0.00 11110 .... ... 00100 1 ..... ..... @q_shri_d
+
+URSHR_v 0.10 11110 .... ... 00100 1 ..... ..... @q_shri_b
+URSHR_v 0.10 11110 .... ... 00100 1 ..... ..... @q_shri_h
+URSHR_v 0.10 11110 .... ... 00100 1 ..... ..... @q_shri_s
+URSHR_v 0.10 11110 .... ... 00100 1 ..... ..... @q_shri_d
+
+SRSRA_v 0.00 11110 .... ... 00110 1 ..... ..... @q_shri_b
+SRSRA_v 0.00 11110 .... ... 00110 1 ..... ..... @q_shri_h
+SRSRA_v 0.00 11110 .... ... 00110 1 ..... ..... @q_shri_s
+SRSRA_v 0.00 11110 .... ... 00110 1 ..... ..... @q_shri_d
+
+URSRA_v 0.10 11110 .... ... 00110 1 ..... ..... @q_shri_b
+URSRA_v 0.10 11110 .... ... 00110 1 ..... ..... @q_shri_h
+URSRA_v 0.10 11110 .... ... 00110 1 ..... ..... @q_shri_s
+URSRA_v 0.10 11110 .... ... 00110 1 ..... ..... @q_shri_d
+
+SRI_v 0.10 11110 .... ... 01000 1 ..... ..... @q_shri_b
+SRI_v 0.10 11110 .... ... 01000 1 ..... ..... @q_shri_h
+SRI_v 0.10 11110 .... ... 01000 1 ..... ..... @q_shri_s
+SRI_v 0.10 11110 .... ... 01000 1 ..... ..... @q_shri_d
--
2.43.0
* [PATCH v3 17/29] target/arm: Convert handle_vec_simd_shli to decodetree
2024-09-12 2:40 [PATCH v3 00/29] target/arm: AdvSIMD decodetree conversion, part 4 Richard Henderson
` (15 preceding siblings ...)
2024-09-12 2:41 ` [PATCH v3 16/29] target/arm: Convert handle_vec_simd_shri to decodetree Richard Henderson
@ 2024-09-12 2:41 ` Richard Henderson
2024-09-12 2:41 ` [PATCH v3 18/29] target/arm: Use {, s}extract in handle_vec_simd_wshli Richard Henderson
` (12 subsequent siblings)
29 siblings, 0 replies; 32+ messages in thread
From: Richard Henderson @ 2024-09-12 2:41 UTC (permalink / raw)
To: qemu-devel; +Cc: qemu-arm, Peter Maydell
This includes SHL and SLI.
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
target/arm/tcg/translate-a64.c | 33 +++------------------------------
target/arm/tcg/a64.decode | 15 +++++++++++++++
2 files changed, 18 insertions(+), 30 deletions(-)
diff --git a/target/arm/tcg/translate-a64.c b/target/arm/tcg/translate-a64.c
index 5c76cdf101..1225aac665 100644
--- a/target/arm/tcg/translate-a64.c
+++ b/target/arm/tcg/translate-a64.c
@@ -6980,6 +6980,8 @@ TRANS(URSHR_v, do_vec_shift_imm, a, gen_gvec_urshr)
TRANS(SRSRA_v, do_vec_shift_imm, a, gen_gvec_srsra)
TRANS(URSRA_v, do_vec_shift_imm, a, gen_gvec_ursra)
TRANS(SRI_v, do_vec_shift_imm, a, gen_gvec_sri)
+TRANS(SHL_v, do_vec_shift_imm, a, tcg_gen_gvec_shli)
+TRANS(SLI_v, do_vec_shift_imm, a, gen_gvec_sli);
/* Shift a TCGv src by TCGv shift_amount, put result in dst.
* Note that it is the caller's responsibility to ensure that the
@@ -10445,33 +10447,6 @@ static void disas_simd_scalar_two_reg_misc(DisasContext *s, uint32_t insn)
}
}
-/* SHL/SLI - Vector shift left */
-static void handle_vec_simd_shli(DisasContext *s, bool is_q, bool insert,
- int immh, int immb, int opcode, int rn, int rd)
-{
- int size = 32 - clz32(immh) - 1;
- int immhb = immh << 3 | immb;
- int shift = immhb - (8 << size);
-
- /* Range of size is limited by decode: immh is a non-zero 4 bit field */
- assert(size >= 0 && size <= 3);
-
- if (extract32(immh, 3, 1) && !is_q) {
- unallocated_encoding(s);
- return;
- }
-
- if (!fp_access_check(s)) {
- return;
- }
-
- if (insert) {
- gen_gvec_fn2i(s, is_q, rd, rn, shift, gen_gvec_sli, size);
- } else {
- gen_gvec_fn2i(s, is_q, rd, rn, shift, tcg_gen_gvec_shli, size);
- }
-}
-
/* USHLL/SHLL - Vector shift left with widening */
static void handle_vec_simd_wshli(DisasContext *s, bool is_q, bool is_u,
int immh, int immb, int opcode, int rn, int rd)
@@ -10585,9 +10560,6 @@ static void disas_simd_shift_imm(DisasContext *s, uint32_t insn)
}
switch (opcode) {
- case 0x0a: /* SHL / SLI */
- handle_vec_simd_shli(s, is_q, is_u, immh, immb, opcode, rn, rd);
- break;
case 0x10: /* SHRN */
case 0x11: /* RSHRN / SQRSHRUN */
if (is_u) {
@@ -10628,6 +10600,7 @@ static void disas_simd_shift_imm(DisasContext *s, uint32_t insn)
case 0x04: /* SRSHR / URSHR (rounding) */
case 0x06: /* SRSRA / URSRA (accum + rounding) */
case 0x08: /* SRI */
+ case 0x0a: /* SHL / SLI */
unallocated_encoding(s);
return;
}
diff --git a/target/arm/tcg/a64.decode b/target/arm/tcg/a64.decode
index 74ba1fa07c..77b860a3f2 100644
--- a/target/arm/tcg/a64.decode
+++ b/target/arm/tcg/a64.decode
@@ -1205,6 +1205,11 @@ FMOVI_s 0001 1110 .. 1 imm:8 100 00000 rd:5 esz=%esz_hsd
@q_shri_d . 1 .. ..... 1 ...... ..... . rn:5 rd:5 \
&qrri_e esz=3 imm=%neon_rshift_i6 q=1
+@q_shli_b . q:1 .. ..... 0001 imm:3 ..... . rn:5 rd:5 &qrri_e esz=0
+@q_shli_h . q:1 .. ..... 001 imm:4 ..... . rn:5 rd:5 &qrri_e esz=1
+@q_shli_s . q:1 .. ..... 01 imm:5 ..... . rn:5 rd:5 &qrri_e esz=2
+@q_shli_d . 1 .. ..... 1 imm:6 ..... . rn:5 rd:5 &qrri_e esz=3 q=1
+
FMOVI_v_h 0 q:1 00 1111 00000 ... 1111 11 ..... rd:5 %abcdefgh
# MOVI, MVNI, ORR, BIC, FMOV are all intermixed via cmode.
@@ -1254,3 +1259,13 @@ SRI_v 0.10 11110 .... ... 01000 1 ..... ..... @q_shri_b
SRI_v 0.10 11110 .... ... 01000 1 ..... ..... @q_shri_h
SRI_v 0.10 11110 .... ... 01000 1 ..... ..... @q_shri_s
SRI_v 0.10 11110 .... ... 01000 1 ..... ..... @q_shri_d
+
+SHL_v 0.00 11110 .... ... 01010 1 ..... ..... @q_shli_b
+SHL_v 0.00 11110 .... ... 01010 1 ..... ..... @q_shli_h
+SHL_v 0.00 11110 .... ... 01010 1 ..... ..... @q_shli_s
+SHL_v 0.00 11110 .... ... 01010 1 ..... ..... @q_shli_d
+
+SLI_v 0.10 11110 .... ... 01010 1 ..... ..... @q_shli_b
+SLI_v 0.10 11110 .... ... 01010 1 ..... ..... @q_shli_h
+SLI_v 0.10 11110 .... ... 01010 1 ..... ..... @q_shli_s
+SLI_v 0.10 11110 .... ... 01010 1 ..... ..... @q_shli_d
--
2.43.0
* [PATCH v3 18/29] target/arm: Use {, s}extract in handle_vec_simd_wshli
2024-09-12 2:40 [PATCH v3 00/29] target/arm: AdvSIMD decodetree conversion, part 4 Richard Henderson
` (16 preceding siblings ...)
2024-09-12 2:41 ` [PATCH v3 17/29] target/arm: Convert handle_vec_simd_shli " Richard Henderson
@ 2024-09-12 2:41 ` Richard Henderson
2024-09-12 2:41 ` [PATCH v3 19/29] target/arm: Convert SSHLL, USHLL to decodetree Richard Henderson
` (11 subsequent siblings)
29 siblings, 0 replies; 32+ messages in thread
From: Richard Henderson @ 2024-09-12 2:41 UTC (permalink / raw)
To: qemu-devel; +Cc: qemu-arm, Peter Maydell
Combine the right shift with the extension via
the tcg extract operations.
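For illustration, extract and sextract fuse the right shift with the zero- or
sign-extension that was previously a separate step; a plain-C sketch of the
semantics (not the TCG ops themselves, and the helper names are made up):

    #include <stdint.h>

    /* Zero-extending field extract: shift right, keep the low len bits. */
    static uint64_t extract_u(uint64_t x, int pos, int len)
    {
        return (x >> pos) & (~0ull >> (64 - len));
    }

    /* Sign-extending extract of the same field. */
    static int64_t extract_s(uint64_t x, int pos, int len)
    {
        return (int64_t)(x << (64 - pos - len)) >> (64 - len);
    }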
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
target/arm/tcg/translate-a64.c | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/target/arm/tcg/translate-a64.c b/target/arm/tcg/translate-a64.c
index 1225aac665..740620074a 100644
--- a/target/arm/tcg/translate-a64.c
+++ b/target/arm/tcg/translate-a64.c
@@ -10477,8 +10477,11 @@ static void handle_vec_simd_wshli(DisasContext *s, bool is_q, bool is_u,
read_vec_element(s, tcg_rn, rn, is_q ? 1 : 0, MO_64);
for (i = 0; i < elements; i++) {
- tcg_gen_shri_i64(tcg_rd, tcg_rn, i * esize);
- ext_and_shift_reg(tcg_rd, tcg_rd, size | (!is_u << 2), 0);
+ if (is_u) {
+ tcg_gen_extract_i64(tcg_rd, tcg_rn, i * esize, esize);
+ } else {
+ tcg_gen_sextract_i64(tcg_rd, tcg_rn, i * esize, esize);
+ }
tcg_gen_shli_i64(tcg_rd, tcg_rd, shift);
write_vec_element(s, tcg_rd, rd, i, size + 1);
}
--
2.43.0
* [PATCH v3 19/29] target/arm: Convert SSHLL, USHLL to decodetree
2024-09-12 2:40 [PATCH v3 00/29] target/arm: AdvSIMD decodetree conversion, part 4 Richard Henderson
` (17 preceding siblings ...)
2024-09-12 2:41 ` [PATCH v3 18/29] target/arm: Use {, s}extract in handle_vec_simd_wshli Richard Henderson
@ 2024-09-12 2:41 ` Richard Henderson
2024-09-12 2:41 ` [PATCH v3 20/29] target/arm: Push tcg_rnd into handle_shri_with_rndacc Richard Henderson
` (10 subsequent siblings)
29 siblings, 0 replies; 32+ messages in thread
From: Richard Henderson @ 2024-09-12 2:41 UTC (permalink / raw)
To: qemu-devel; +Cc: qemu-arm, Peter Maydell
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
target/arm/tcg/translate-a64.c | 81 ++++++++++++++++------------------
target/arm/tcg/a64.decode | 8 ++++
2 files changed, 45 insertions(+), 44 deletions(-)
diff --git a/target/arm/tcg/translate-a64.c b/target/arm/tcg/translate-a64.c
index 740620074a..e00d7fbf48 100644
--- a/target/arm/tcg/translate-a64.c
+++ b/target/arm/tcg/translate-a64.c
@@ -6983,6 +6983,42 @@ TRANS(SRI_v, do_vec_shift_imm, a, gen_gvec_sri)
TRANS(SHL_v, do_vec_shift_imm, a, tcg_gen_gvec_shli)
TRANS(SLI_v, do_vec_shift_imm, a, gen_gvec_sli);
+static bool do_vec_shift_imm_wide(DisasContext *s, arg_qrri_e *a, bool is_u)
+{
+ TCGv_i64 tcg_rn, tcg_rd;
+ int esz = a->esz;
+ int esize;
+
+ if (!fp_access_check(s)) {
+ return true;
+ }
+
+ /*
+ * For the LL variants the store is larger than the load,
+ * so if rd == rn we would overwrite parts of our input.
+ * So load everything right now and use shifts in the main loop.
+ */
+ tcg_rd = tcg_temp_new_i64();
+ tcg_rn = tcg_temp_new_i64();
+ read_vec_element(s, tcg_rn, a->rn, a->q, MO_64);
+
+ esize = 8 << esz;
+ for (int i = 0, elements = 8 >> esz; i < elements; i++) {
+ if (is_u) {
+ tcg_gen_extract_i64(tcg_rd, tcg_rn, i * esize, esize);
+ } else {
+ tcg_gen_sextract_i64(tcg_rd, tcg_rn, i * esize, esize);
+ }
+ tcg_gen_shli_i64(tcg_rd, tcg_rd, a->imm);
+ write_vec_element(s, tcg_rd, a->rd, i, esz + 1);
+ }
+ clear_vec_high(s, true, a->rd);
+ return true;
+}
+
+TRANS(SSHLL_v, do_vec_shift_imm_wide, a, false)
+TRANS(USHLL_v, do_vec_shift_imm_wide, a, true)
+
/* Shift a TCGv src by TCGv shift_amount, put result in dst.
* Note that it is the caller's responsibility to ensure that the
* shift amount is in range (ie 0..31 or 0..63) and provide the ARM
@@ -10447,47 +10483,6 @@ static void disas_simd_scalar_two_reg_misc(DisasContext *s, uint32_t insn)
}
}
-/* USHLL/SHLL - Vector shift left with widening */
-static void handle_vec_simd_wshli(DisasContext *s, bool is_q, bool is_u,
- int immh, int immb, int opcode, int rn, int rd)
-{
- int size = 32 - clz32(immh) - 1;
- int immhb = immh << 3 | immb;
- int shift = immhb - (8 << size);
- int dsize = 64;
- int esize = 8 << size;
- int elements = dsize/esize;
- TCGv_i64 tcg_rn = tcg_temp_new_i64();
- TCGv_i64 tcg_rd = tcg_temp_new_i64();
- int i;
-
- if (size >= 3) {
- unallocated_encoding(s);
- return;
- }
-
- if (!fp_access_check(s)) {
- return;
- }
-
- /* For the LL variants the store is larger than the load,
- * so if rd == rn we would overwrite parts of our input.
- * So load everything right now and use shifts in the main loop.
- */
- read_vec_element(s, tcg_rn, rn, is_q ? 1 : 0, MO_64);
-
- for (i = 0; i < elements; i++) {
- if (is_u) {
- tcg_gen_extract_i64(tcg_rd, tcg_rn, i * esize, esize);
- } else {
- tcg_gen_sextract_i64(tcg_rd, tcg_rn, i * esize, esize);
- }
- tcg_gen_shli_i64(tcg_rd, tcg_rd, shift);
- write_vec_element(s, tcg_rd, rd, i, size + 1);
- }
- clear_vec_high(s, true, rd);
-}
-
/* SHRN/RSHRN - Shift right with narrowing (and potential rounding) */
static void handle_vec_simd_shrn(DisasContext *s, bool is_q,
int immh, int immb, int opcode, int rn, int rd)
@@ -10577,9 +10572,6 @@ static void disas_simd_shift_imm(DisasContext *s, uint32_t insn)
handle_vec_simd_sqshrn(s, false, is_q, is_u, is_u, immh, immb,
opcode, rn, rd);
break;
- case 0x14: /* SSHLL / USHLL */
- handle_vec_simd_wshli(s, is_q, is_u, immh, immb, opcode, rn, rd);
- break;
case 0x1c: /* SCVTF / UCVTF */
handle_simd_shift_intfp_conv(s, false, is_q, is_u, immh, immb,
opcode, rn, rd);
@@ -10604,6 +10596,7 @@ static void disas_simd_shift_imm(DisasContext *s, uint32_t insn)
case 0x06: /* SRSRA / URSRA (accum + rounding) */
case 0x08: /* SRI */
case 0x0a: /* SHL / SLI */
+ case 0x14: /* SSHLL / USHLL */
unallocated_encoding(s);
return;
}
diff --git a/target/arm/tcg/a64.decode b/target/arm/tcg/a64.decode
index 77b860a3f2..bf67f8a357 100644
--- a/target/arm/tcg/a64.decode
+++ b/target/arm/tcg/a64.decode
@@ -1269,3 +1269,11 @@ SLI_v 0.10 11110 .... ... 01010 1 ..... ..... @q_shli_b
SLI_v 0.10 11110 .... ... 01010 1 ..... ..... @q_shli_h
SLI_v 0.10 11110 .... ... 01010 1 ..... ..... @q_shli_s
SLI_v 0.10 11110 .... ... 01010 1 ..... ..... @q_shli_d
+
+SSHLL_v 0.00 11110 .... ... 10100 1 ..... ..... @q_shli_b
+SSHLL_v 0.00 11110 .... ... 10100 1 ..... ..... @q_shli_h
+SSHLL_v 0.00 11110 .... ... 10100 1 ..... ..... @q_shli_s
+
+USHLL_v 0.10 11110 .... ... 10100 1 ..... ..... @q_shli_b
+USHLL_v 0.10 11110 .... ... 10100 1 ..... ..... @q_shli_h
+USHLL_v 0.10 11110 .... ... 10100 1 ..... ..... @q_shli_s
--
2.43.0
* [PATCH v3 20/29] target/arm: Push tcg_rnd into handle_shri_with_rndacc
2024-09-12 2:40 [PATCH v3 00/29] target/arm: AdvSIMD decodetree conversion, part 4 Richard Henderson
` (18 preceding siblings ...)
2024-09-12 2:41 ` [PATCH v3 19/29] target/arm: Convert SSHLL, USHLL to decodetree Richard Henderson
@ 2024-09-12 2:41 ` Richard Henderson
2024-09-12 2:41 ` [PATCH v3 21/29] target/arm: Split out subroutines of handle_shri_with_rndacc Richard Henderson
` (9 subsequent siblings)
29 siblings, 0 replies; 32+ messages in thread
From: Richard Henderson @ 2024-09-12 2:41 UTC (permalink / raw)
To: qemu-devel; +Cc: qemu-arm, Philippe Mathieu-Daudé, Peter Maydell
We always pass the same value for round; compute it
within common code.
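For reference, the rounding constant every caller built is 1 << (shift - 1);
in scalar terms the rounded shift is (rough sketch, not the TCG code, with an
illustrative name):

    #include <stdint.h>

    /* Rounded unsigned right shift: add the rounding increment first.
     * For elements of 32 bits or fewer the addition cannot overflow the
     * 64-bit temporary; the 64-bit case still needs wider arithmetic here.
     */
    static uint64_t rounded_ushr(uint64_t x, int shift)
    {
        return shift ? (x + (1ull << (shift - 1))) >> shift : x;
    }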
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
target/arm/tcg/translate-a64.c | 32 ++++++--------------------------
1 file changed, 6 insertions(+), 26 deletions(-)
diff --git a/target/arm/tcg/translate-a64.c b/target/arm/tcg/translate-a64.c
index e00d7fbf48..e59236330a 100644
--- a/target/arm/tcg/translate-a64.c
+++ b/target/arm/tcg/translate-a64.c
@@ -9205,11 +9205,10 @@ static void disas_data_proc_fp(DisasContext *s, uint32_t insn)
* the vector and scalar code.
*/
static void handle_shri_with_rndacc(TCGv_i64 tcg_res, TCGv_i64 tcg_src,
- TCGv_i64 tcg_rnd, bool accumulate,
+ bool round, bool accumulate,
bool is_u, int size, int shift)
{
bool extended_result = false;
- bool round = tcg_rnd != NULL;
int ext_lshift = 0;
TCGv_i64 tcg_src_hi;
@@ -9227,6 +9226,7 @@ static void handle_shri_with_rndacc(TCGv_i64 tcg_res, TCGv_i64 tcg_src,
/* Deal with the rounding step */
if (round) {
+ TCGv_i64 tcg_rnd = tcg_constant_i64(1ull << (shift - 1));
if (extended_result) {
TCGv_i64 tcg_zero = tcg_constant_i64(0);
if (!is_u) {
@@ -9294,7 +9294,6 @@ static void handle_scalar_simd_shri(DisasContext *s,
bool insert = false;
TCGv_i64 tcg_rn;
TCGv_i64 tcg_rd;
- TCGv_i64 tcg_round;
if (!extract32(immh, 3, 1)) {
unallocated_encoding(s);
@@ -9320,12 +9319,6 @@ static void handle_scalar_simd_shri(DisasContext *s,
break;
}
- if (round) {
- tcg_round = tcg_constant_i64(1ULL << (shift - 1));
- } else {
- tcg_round = NULL;
- }
-
tcg_rn = read_fp_dreg(s, rn);
tcg_rd = (accumulate || insert) ? read_fp_dreg(s, rd) : tcg_temp_new_i64();
@@ -9339,7 +9332,7 @@ static void handle_scalar_simd_shri(DisasContext *s,
tcg_gen_deposit_i64(tcg_rd, tcg_rd, tcg_rn, 0, esize - shift);
}
} else {
- handle_shri_with_rndacc(tcg_rd, tcg_rn, tcg_round,
+ handle_shri_with_rndacc(tcg_rd, tcg_rn, round,
accumulate, is_u, size, shift);
}
@@ -9392,7 +9385,7 @@ static void handle_vec_simd_sqshrn(DisasContext *s, bool is_scalar, bool is_q,
int elements = is_scalar ? 1 : (64 / esize);
bool round = extract32(opcode, 0, 1);
MemOp ldop = (size + 1) | (is_u_shift ? 0 : MO_SIGN);
- TCGv_i64 tcg_rn, tcg_rd, tcg_round;
+ TCGv_i64 tcg_rn, tcg_rd;
TCGv_i32 tcg_rd_narrowed;
TCGv_i64 tcg_final;
@@ -9437,15 +9430,9 @@ static void handle_vec_simd_sqshrn(DisasContext *s, bool is_scalar, bool is_q,
tcg_rd_narrowed = tcg_temp_new_i32();
tcg_final = tcg_temp_new_i64();
- if (round) {
- tcg_round = tcg_constant_i64(1ULL << (shift - 1));
- } else {
- tcg_round = NULL;
- }
-
for (i = 0; i < elements; i++) {
read_vec_element(s, tcg_rn, rn, i, ldop);
- handle_shri_with_rndacc(tcg_rd, tcg_rn, tcg_round,
+ handle_shri_with_rndacc(tcg_rd, tcg_rn, round,
false, is_u_shift, size+1, shift);
narrowfn(tcg_rd_narrowed, tcg_env, tcg_rd);
tcg_gen_extu_i32_i64(tcg_rd, tcg_rd_narrowed);
@@ -10495,7 +10482,6 @@ static void handle_vec_simd_shrn(DisasContext *s, bool is_q,
int shift = (2 * esize) - immhb;
bool round = extract32(opcode, 0, 1);
TCGv_i64 tcg_rn, tcg_rd, tcg_final;
- TCGv_i64 tcg_round;
int i;
if (extract32(immh, 3, 1)) {
@@ -10512,15 +10498,9 @@ static void handle_vec_simd_shrn(DisasContext *s, bool is_q,
tcg_final = tcg_temp_new_i64();
read_vec_element(s, tcg_final, rd, is_q ? 1 : 0, MO_64);
- if (round) {
- tcg_round = tcg_constant_i64(1ULL << (shift - 1));
- } else {
- tcg_round = NULL;
- }
-
for (i = 0; i < elements; i++) {
read_vec_element(s, tcg_rn, rn, i, size+1);
- handle_shri_with_rndacc(tcg_rd, tcg_rn, tcg_round,
+ handle_shri_with_rndacc(tcg_rd, tcg_rn, round,
false, true, size+1, shift);
tcg_gen_deposit_i64(tcg_final, tcg_final, tcg_rd, esize * i, esize);
--
2.43.0
* [PATCH v3 21/29] target/arm: Split out subroutines of handle_shri_with_rndacc
2024-09-12 2:40 [PATCH v3 00/29] target/arm: AdvSIMD decodetree conversion, part 4 Richard Henderson
` (19 preceding siblings ...)
2024-09-12 2:41 ` [PATCH v3 20/29] target/arm: Push tcg_rnd into handle_shri_with_rndacc Richard Henderson
@ 2024-09-12 2:41 ` Richard Henderson
2024-09-12 2:41 ` [PATCH v3 22/29] target/arm: Convert SHRN, RSHRN to decodetree Richard Henderson
` (8 subsequent siblings)
29 siblings, 0 replies; 32+ messages in thread
From: Richard Henderson @ 2024-09-12 2:41 UTC (permalink / raw)
To: qemu-devel; +Cc: qemu-arm, Peter Maydell
There isn't a lot of commonality along the different paths of
handle_shri_with_rndacc. Split them out to separate functions,
which will be usable during the decodetree conversion.
Simplify 64-bit rounding operations to not require double-word arithmetic.
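A rough plain-C sketch of the 64-bit simplification (function names are
illustrative; assumes 0 < shift < 64): instead of widening before adding the
rounding constant, extract the bit that is shifted out last and add it after
the shift, where it can no longer overflow:

    #include <stdint.h>

    static uint64_t urshr64(uint64_t x, int shift)
    {
        uint64_t round = (x >> (shift - 1)) & 1;   /* last bit shifted out */
        return (x >> shift) + round;
    }

    static int64_t srshr64(int64_t x, int shift)
    {
        int64_t round = (x >> (shift - 1)) & 1;    /* same, arithmetic shift */
        return (x >> shift) + round;
    }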
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
target/arm/tcg/translate-a64.c | 138 ++++++++++++++++++++-------------
1 file changed, 82 insertions(+), 56 deletions(-)
diff --git a/target/arm/tcg/translate-a64.c b/target/arm/tcg/translate-a64.c
index e59236330a..f4deacd554 100644
--- a/target/arm/tcg/translate-a64.c
+++ b/target/arm/tcg/translate-a64.c
@@ -7019,6 +7019,78 @@ static bool do_vec_shift_imm_wide(DisasContext *s, arg_qrri_e *a, bool is_u)
TRANS(SSHLL_v, do_vec_shift_imm_wide, a, false)
TRANS(USHLL_v, do_vec_shift_imm_wide, a, true)
+static void gen_sshr_d(TCGv_i64 dst, TCGv_i64 src, int64_t shift)
+{
+ assert(shift >= 0 && shift <= 64);
+ tcg_gen_sari_i64(dst, src, MIN(shift, 63));
+}
+
+static void gen_ushr_d(TCGv_i64 dst, TCGv_i64 src, int64_t shift)
+{
+ assert(shift >= 0 && shift <= 64);
+ if (shift == 64) {
+ tcg_gen_movi_i64(dst, 0);
+ } else {
+ tcg_gen_shri_i64(dst, src, shift);
+ }
+}
+
+static void gen_srshr_bhs(TCGv_i64 dst, TCGv_i64 src, int64_t shift)
+{
+ assert(shift >= 0 && shift <= 32);
+ if (shift) {
+ TCGv_i64 rnd = tcg_constant_i64(1ull << (shift - 1));
+ tcg_gen_add_i64(dst, src, rnd);
+ tcg_gen_sari_i64(dst, dst, shift);
+ } else {
+ tcg_gen_mov_i64(dst, src);
+ }
+}
+
+static void gen_urshr_bhs(TCGv_i64 dst, TCGv_i64 src, int64_t shift)
+{
+ assert(shift >= 0 && shift <= 32);
+ if (shift) {
+ TCGv_i64 rnd = tcg_constant_i64(1ull << (shift - 1));
+ tcg_gen_add_i64(dst, src, rnd);
+ tcg_gen_shri_i64(dst, dst, shift);
+ } else {
+ tcg_gen_mov_i64(dst, src);
+ }
+}
+
+static void gen_srshr_d(TCGv_i64 dst, TCGv_i64 src, int64_t shift)
+{
+ assert(shift >= 0 && shift <= 64);
+ if (shift == 0) {
+ tcg_gen_mov_i64(dst, src);
+ } else if (shift == 64) {
+ /* Extension of sign bit (0,-1) plus sign bit (0,1) is zero. */
+ tcg_gen_movi_i64(dst, 0);
+ } else {
+ TCGv_i64 rnd = tcg_temp_new_i64();
+ tcg_gen_extract_i64(rnd, src, shift - 1, 1);
+ tcg_gen_sari_i64(dst, src, shift);
+ tcg_gen_add_i64(dst, dst, rnd);
+ }
+}
+
+static void gen_urshr_d(TCGv_i64 dst, TCGv_i64 src, int64_t shift)
+{
+ assert(shift >= 0 && shift <= 64);
+ if (shift == 0) {
+ tcg_gen_mov_i64(dst, src);
+ } else if (shift == 64) {
+ /* Rounding will propagate bit 63 into bit 64. */
+ tcg_gen_shri_i64(dst, src, 63);
+ } else {
+ TCGv_i64 rnd = tcg_temp_new_i64();
+ tcg_gen_extract_i64(rnd, src, shift - 1, 1);
+ tcg_gen_shri_i64(dst, src, shift);
+ tcg_gen_add_i64(dst, dst, rnd);
+ }
+}
+
/* Shift a TCGv src by TCGv shift_amount, put result in dst.
* Note that it is the caller's responsibility to ensure that the
* shift amount is in range (ie 0..31 or 0..63) and provide the ARM
@@ -9208,69 +9280,23 @@ static void handle_shri_with_rndacc(TCGv_i64 tcg_res, TCGv_i64 tcg_src,
bool round, bool accumulate,
bool is_u, int size, int shift)
{
- bool extended_result = false;
- int ext_lshift = 0;
- TCGv_i64 tcg_src_hi;
-
- if (round && size == 3) {
- extended_result = true;
- ext_lshift = 64 - shift;
- tcg_src_hi = tcg_temp_new_i64();
- } else if (shift == 64) {
- if (!accumulate && is_u) {
- /* result is zero */
- tcg_gen_movi_i64(tcg_res, 0);
- return;
- }
- }
-
- /* Deal with the rounding step */
- if (round) {
- TCGv_i64 tcg_rnd = tcg_constant_i64(1ull << (shift - 1));
- if (extended_result) {
- TCGv_i64 tcg_zero = tcg_constant_i64(0);
- if (!is_u) {
- /* take care of sign extending tcg_res */
- tcg_gen_sari_i64(tcg_src_hi, tcg_src, 63);
- tcg_gen_add2_i64(tcg_src, tcg_src_hi,
- tcg_src, tcg_src_hi,
- tcg_rnd, tcg_zero);
- } else {
- tcg_gen_add2_i64(tcg_src, tcg_src_hi,
- tcg_src, tcg_zero,
- tcg_rnd, tcg_zero);
- }
+ if (!round) {
+ if (is_u) {
+ gen_ushr_d(tcg_src, tcg_src, shift);
} else {
- tcg_gen_add_i64(tcg_src, tcg_src, tcg_rnd);
+ gen_sshr_d(tcg_src, tcg_src, shift);
}
- }
-
- /* Now do the shift right */
- if (round && extended_result) {
- /* extended case, >64 bit precision required */
- if (ext_lshift == 0) {
- /* special case, only high bits matter */
- tcg_gen_mov_i64(tcg_src, tcg_src_hi);
+ } else if (size == MO_64) {
+ if (is_u) {
+ gen_urshr_d(tcg_src, tcg_src, shift);
} else {
- tcg_gen_shri_i64(tcg_src, tcg_src, shift);
- tcg_gen_shli_i64(tcg_src_hi, tcg_src_hi, ext_lshift);
- tcg_gen_or_i64(tcg_src, tcg_src, tcg_src_hi);
+ gen_srshr_d(tcg_src, tcg_src, shift);
}
} else {
if (is_u) {
- if (shift == 64) {
- /* essentially shifting in 64 zeros */
- tcg_gen_movi_i64(tcg_src, 0);
- } else {
- tcg_gen_shri_i64(tcg_src, tcg_src, shift);
- }
+ gen_urshr_bhs(tcg_src, tcg_src, shift);
} else {
- if (shift == 64) {
- /* effectively extending the sign-bit */
- tcg_gen_sari_i64(tcg_src, tcg_src, 63);
- } else {
- tcg_gen_sari_i64(tcg_src, tcg_src, shift);
- }
+ gen_srshr_bhs(tcg_src, tcg_src, shift);
}
}
--
2.43.0
* [PATCH v3 22/29] target/arm: Convert SHRN, RSHRN to decodetree
2024-09-12 2:40 [PATCH v3 00/29] target/arm: AdvSIMD decodetree conversion, part 4 Richard Henderson
` (20 preceding siblings ...)
2024-09-12 2:41 ` [PATCH v3 21/29] target/arm: Split out subroutines of handle_shri_with_rndacc Richard Henderson
@ 2024-09-12 2:41 ` Richard Henderson
2024-09-12 2:41 ` [PATCH v3 23/29] target/arm: Convert handle_scalar_simd_shri " Richard Henderson
` (7 subsequent siblings)
29 siblings, 0 replies; 32+ messages in thread
From: Richard Henderson @ 2024-09-12 2:41 UTC (permalink / raw)
To: qemu-devel; +Cc: qemu-arm, Peter Maydell
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
target/arm/tcg/translate-a64.c | 95 +++++++++++++++++-----------------
target/arm/tcg/a64.decode | 8 +++
2 files changed, 55 insertions(+), 48 deletions(-)
diff --git a/target/arm/tcg/translate-a64.c b/target/arm/tcg/translate-a64.c
index f4deacd554..8871087af0 100644
--- a/target/arm/tcg/translate-a64.c
+++ b/target/arm/tcg/translate-a64.c
@@ -7091,6 +7091,51 @@ static void gen_urshr_d(TCGv_i64 dst, TCGv_i64 src, int64_t shift)
}
}
+static bool do_vec_shift_imm_narrow(DisasContext *s, arg_qrri_e *a,
+ WideShiftImmFn * const fns[3], MemOp sign)
+{
+ TCGv_i64 tcg_rn, tcg_rd;
+ int esz = a->esz;
+ int esize;
+ WideShiftImmFn *fn;
+
+ tcg_debug_assert(esz >= MO_8 && esz <= MO_32);
+
+ if (!fp_access_check(s)) {
+ return true;
+ }
+
+ tcg_rn = tcg_temp_new_i64();
+ tcg_rd = tcg_temp_new_i64();
+ tcg_gen_movi_i64(tcg_rd, 0);
+
+ fn = fns[esz];
+ esize = 8 << esz;
+ for (int i = 0, elements = 8 >> esz; i < elements; i++) {
+ read_vec_element(s, tcg_rn, a->rn, i, (esz + 1) | sign);
+ fn(tcg_rn, tcg_rn, a->imm);
+ tcg_gen_deposit_i64(tcg_rd, tcg_rd, tcg_rn, esize * i, esize);
+ }
+
+ write_vec_element(s, tcg_rd, a->rd, a->q, MO_64);
+ clear_vec_high(s, a->q, a->rd);
+ return true;
+}
+
+static WideShiftImmFn * const shrn_fns[] = {
+ tcg_gen_shri_i64,
+ tcg_gen_shri_i64,
+ gen_ushr_d,
+};
+TRANS(SHRN_v, do_vec_shift_imm_narrow, a, shrn_fns, 0)
+
+static WideShiftImmFn * const rshrn_fns[] = {
+ gen_urshr_bhs,
+ gen_urshr_bhs,
+ gen_urshr_d,
+};
+TRANS(RSHRN_v, do_vec_shift_imm_narrow, a, rshrn_fns, 0)
+
/* Shift a TCGv src by TCGv shift_amount, put result in dst.
* Note that it is the caller's responsibility to ensure that the
* shift amount is in range (ie 0..31 or 0..63) and provide the ARM
@@ -10496,52 +10541,6 @@ static void disas_simd_scalar_two_reg_misc(DisasContext *s, uint32_t insn)
}
}
-/* SHRN/RSHRN - Shift right with narrowing (and potential rounding) */
-static void handle_vec_simd_shrn(DisasContext *s, bool is_q,
- int immh, int immb, int opcode, int rn, int rd)
-{
- int immhb = immh << 3 | immb;
- int size = 32 - clz32(immh) - 1;
- int dsize = 64;
- int esize = 8 << size;
- int elements = dsize/esize;
- int shift = (2 * esize) - immhb;
- bool round = extract32(opcode, 0, 1);
- TCGv_i64 tcg_rn, tcg_rd, tcg_final;
- int i;
-
- if (extract32(immh, 3, 1)) {
- unallocated_encoding(s);
- return;
- }
-
- if (!fp_access_check(s)) {
- return;
- }
-
- tcg_rn = tcg_temp_new_i64();
- tcg_rd = tcg_temp_new_i64();
- tcg_final = tcg_temp_new_i64();
- read_vec_element(s, tcg_final, rd, is_q ? 1 : 0, MO_64);
-
- for (i = 0; i < elements; i++) {
- read_vec_element(s, tcg_rn, rn, i, size+1);
- handle_shri_with_rndacc(tcg_rd, tcg_rn, round,
- false, true, size+1, shift);
-
- tcg_gen_deposit_i64(tcg_final, tcg_final, tcg_rd, esize * i, esize);
- }
-
- if (!is_q) {
- write_vec_element(s, tcg_final, rd, 0, MO_64);
- } else {
- write_vec_element(s, tcg_final, rd, 1, MO_64);
- }
-
- clear_vec_high(s, is_q, rd);
-}
-
-
/* AdvSIMD shift by immediate
* 31 30 29 28 23 22 19 18 16 15 11 10 9 5 4 0
* +---+---+---+-------------+------+------+--------+---+------+------+
@@ -10564,13 +10563,13 @@ static void disas_simd_shift_imm(DisasContext *s, uint32_t insn)
}
switch (opcode) {
- case 0x10: /* SHRN */
+ case 0x10: /* SHRN / SQSHRUN */
case 0x11: /* RSHRN / SQRSHRUN */
if (is_u) {
handle_vec_simd_sqshrn(s, false, is_q, false, true, immh, immb,
opcode, rn, rd);
} else {
- handle_vec_simd_shrn(s, is_q, immh, immb, opcode, rn, rd);
+ unallocated_encoding(s);
}
break;
case 0x12: /* SQSHRN / UQSHRN */
diff --git a/target/arm/tcg/a64.decode b/target/arm/tcg/a64.decode
index bf67f8a357..164ed575b9 100644
--- a/target/arm/tcg/a64.decode
+++ b/target/arm/tcg/a64.decode
@@ -1277,3 +1277,11 @@ SSHLL_v 0.00 11110 .... ... 10100 1 ..... ..... @q_shli_s
USHLL_v 0.10 11110 .... ... 10100 1 ..... ..... @q_shli_b
USHLL_v 0.10 11110 .... ... 10100 1 ..... ..... @q_shli_h
USHLL_v 0.10 11110 .... ... 10100 1 ..... ..... @q_shli_s
+
+SHRN_v 0.00 11110 .... ... 10000 1 ..... ..... @q_shri_b
+SHRN_v 0.00 11110 .... ... 10000 1 ..... ..... @q_shri_h
+SHRN_v 0.00 11110 .... ... 10000 1 ..... ..... @q_shri_s
+
+RSHRN_v 0.00 11110 .... ... 10001 1 ..... ..... @q_shri_b
+RSHRN_v 0.00 11110 .... ... 10001 1 ..... ..... @q_shri_h
+RSHRN_v 0.00 11110 .... ... 10001 1 ..... ..... @q_shri_s
--
2.43.0
* [PATCH v3 23/29] target/arm: Convert handle_scalar_simd_shri to decodetree
2024-09-12 2:40 [PATCH v3 00/29] target/arm: AdvSIMD decodetree conversion, part 4 Richard Henderson
` (21 preceding siblings ...)
2024-09-12 2:41 ` [PATCH v3 22/29] target/arm: Convert SHRN, RSHRN to decodetree Richard Henderson
@ 2024-09-12 2:41 ` Richard Henderson
2024-09-12 2:41 ` [PATCH v3 24/29] target/arm: Convert handle_scalar_simd_shli " Richard Henderson
` (6 subsequent siblings)
29 siblings, 0 replies; 32+ messages in thread
From: Richard Henderson @ 2024-09-12 2:41 UTC (permalink / raw)
To: qemu-devel; +Cc: qemu-arm, Peter Maydell
This includes SSHR, USHR, SSRA, USRA, SRSHR, URSHR,
SRSRA, URSRA, SRI.
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
target/arm/tcg/translate-a64.c | 140 ++++++++++++++++-----------------
target/arm/tcg/a64.decode | 16 ++++
2 files changed, 86 insertions(+), 70 deletions(-)
diff --git a/target/arm/tcg/translate-a64.c b/target/arm/tcg/translate-a64.c
index 8871087af0..efd93a7f23 100644
--- a/target/arm/tcg/translate-a64.c
+++ b/target/arm/tcg/translate-a64.c
@@ -7035,6 +7035,18 @@ static void gen_ushr_d(TCGv_i64 dst, TCGv_i64 src, int64_t shift)
}
}
+static void gen_ssra_d(TCGv_i64 dst, TCGv_i64 src, int64_t shift)
+{
+ gen_sshr_d(src, src, shift);
+ tcg_gen_add_i64(dst, dst, src);
+}
+
+static void gen_usra_d(TCGv_i64 dst, TCGv_i64 src, int64_t shift)
+{
+ gen_ushr_d(src, src, shift);
+ tcg_gen_add_i64(dst, dst, src);
+}
+
static void gen_srshr_bhs(TCGv_i64 dst, TCGv_i64 src, int64_t shift)
{
assert(shift >= 0 && shift <= 32);
@@ -7091,6 +7103,27 @@ static void gen_urshr_d(TCGv_i64 dst, TCGv_i64 src, int64_t shift)
}
}
+static void gen_srsra_d(TCGv_i64 dst, TCGv_i64 src, int64_t shift)
+{
+ gen_srshr_d(src, src, shift);
+ tcg_gen_add_i64(dst, dst, src);
+}
+
+static void gen_ursra_d(TCGv_i64 dst, TCGv_i64 src, int64_t shift)
+{
+ gen_urshr_d(src, src, shift);
+ tcg_gen_add_i64(dst, dst, src);
+}
+
+static void gen_sri_d(TCGv_i64 dst, TCGv_i64 src, int64_t shift)
+{
+ /* If shift is 64, dst is unchanged. */
+ if (shift != 64) {
+ tcg_gen_shri_i64(src, src, shift);
+ tcg_gen_deposit_i64(dst, dst, src, 0, 64 - shift);
+ }
+}
+
static bool do_vec_shift_imm_narrow(DisasContext *s, arg_qrri_e *a,
WideShiftImmFn * const fns[3], MemOp sign)
{
@@ -7136,6 +7169,38 @@ static WideShiftImmFn * const rshrn_fns[] = {
};
TRANS(RSHRN_v, do_vec_shift_imm_narrow, a, rshrn_fns, 0)
+/*
+ * Advanced SIMD Scalar Shift by Immediate
+ */
+
+static bool do_scalar_shift_imm(DisasContext *s, arg_rri_e *a,
+ WideShiftImmFn *fn, bool accumulate,
+ MemOp sign)
+{
+ if (fp_access_check(s)) {
+ TCGv_i64 rd = tcg_temp_new_i64();
+ TCGv_i64 rn = tcg_temp_new_i64();
+
+ read_vec_element(s, rn, a->rn, 0, a->esz | sign);
+ if (accumulate) {
+ read_vec_element(s, rd, a->rd, 0, a->esz | sign);
+ }
+ fn(rd, rn, a->imm);
+ write_fp_dreg(s, a->rd, rd);
+ }
+ return true;
+}
+
+TRANS(SSHR_s, do_scalar_shift_imm, a, gen_sshr_d, false, 0)
+TRANS(USHR_s, do_scalar_shift_imm, a, gen_ushr_d, false, 0)
+TRANS(SSRA_s, do_scalar_shift_imm, a, gen_ssra_d, true, 0)
+TRANS(USRA_s, do_scalar_shift_imm, a, gen_usra_d, true, 0)
+TRANS(SRSHR_s, do_scalar_shift_imm, a, gen_srshr_d, false, 0)
+TRANS(URSHR_s, do_scalar_shift_imm, a, gen_urshr_d, false, 0)
+TRANS(SRSRA_s, do_scalar_shift_imm, a, gen_srsra_d, true, 0)
+TRANS(URSRA_s, do_scalar_shift_imm, a, gen_ursra_d, true, 0)
+TRANS(SRI_s, do_scalar_shift_imm, a, gen_sri_d, true, 0)
+
/* Shift a TCGv src by TCGv shift_amount, put result in dst.
* Note that it is the caller's responsibility to ensure that the
* shift amount is in range (ie 0..31 or 0..63) and provide the ARM
@@ -9352,64 +9417,6 @@ static void handle_shri_with_rndacc(TCGv_i64 tcg_res, TCGv_i64 tcg_src,
}
}
-/* SSHR[RA]/USHR[RA] - Scalar shift right (optional rounding/accumulate) */
-static void handle_scalar_simd_shri(DisasContext *s,
- bool is_u, int immh, int immb,
- int opcode, int rn, int rd)
-{
- const int size = 3;
- int immhb = immh << 3 | immb;
- int shift = 2 * (8 << size) - immhb;
- bool accumulate = false;
- bool round = false;
- bool insert = false;
- TCGv_i64 tcg_rn;
- TCGv_i64 tcg_rd;
-
- if (!extract32(immh, 3, 1)) {
- unallocated_encoding(s);
- return;
- }
-
- if (!fp_access_check(s)) {
- return;
- }
-
- switch (opcode) {
- case 0x02: /* SSRA / USRA (accumulate) */
- accumulate = true;
- break;
- case 0x04: /* SRSHR / URSHR (rounding) */
- round = true;
- break;
- case 0x06: /* SRSRA / URSRA (accum + rounding) */
- accumulate = round = true;
- break;
- case 0x08: /* SRI */
- insert = true;
- break;
- }
-
- tcg_rn = read_fp_dreg(s, rn);
- tcg_rd = (accumulate || insert) ? read_fp_dreg(s, rd) : tcg_temp_new_i64();
-
- if (insert) {
- /* shift count same as element size is valid but does nothing;
- * special case to avoid potential shift by 64.
- */
- int esize = 8 << size;
- if (shift != esize) {
- tcg_gen_shri_i64(tcg_rn, tcg_rn, shift);
- tcg_gen_deposit_i64(tcg_rd, tcg_rd, tcg_rn, 0, esize - shift);
- }
- } else {
- handle_shri_with_rndacc(tcg_rd, tcg_rn, round,
- accumulate, is_u, size, shift);
- }
-
- write_fp_dreg(s, rd, tcg_rd);
-}
-
/* SHL/SLI - Scalar shift left */
static void handle_scalar_simd_shli(DisasContext *s, bool insert,
int immh, int immb, int opcode,
@@ -9893,18 +9900,6 @@ static void disas_simd_scalar_shift_imm(DisasContext *s, uint32_t insn)
}
switch (opcode) {
- case 0x08: /* SRI */
- if (!is_u) {
- unallocated_encoding(s);
- return;
- }
- /* fall through */
- case 0x00: /* SSHR / USHR */
- case 0x02: /* SSRA / USRA */
- case 0x04: /* SRSHR / URSHR */
- case 0x06: /* SRSRA / URSRA */
- handle_scalar_simd_shri(s, is_u, immh, immb, opcode, rn, rd);
- break;
case 0x0a: /* SHL / SLI */
handle_scalar_simd_shli(s, is_u, immh, immb, opcode, rn, rd);
break;
@@ -9940,6 +9935,11 @@ static void disas_simd_scalar_shift_imm(DisasContext *s, uint32_t insn)
handle_simd_shift_fpint_conv(s, true, false, is_u, immh, immb, rn, rd);
break;
default:
+ case 0x00: /* SSHR / USHR */
+ case 0x02: /* SSRA / USRA */
+ case 0x04: /* SRSHR / URSHR */
+ case 0x06: /* SRSRA / URSRA */
+ case 0x08: /* SRI */
unallocated_encoding(s);
break;
}
diff --git a/target/arm/tcg/a64.decode b/target/arm/tcg/a64.decode
index 164ed575b9..6c2362b3bb 100644
--- a/target/arm/tcg/a64.decode
+++ b/target/arm/tcg/a64.decode
@@ -30,6 +30,7 @@
&rri_sf rd rn imm sf
&i imm
&rr_e rd rn esz
+&rri_e rd rn imm esz
&rrr_e rd rn rm esz
&rrx_e rd rn rm idx esz
&rrrr_e rd rn rm ra esz
@@ -1285,3 +1286,18 @@ SHRN_v 0.00 11110 .... ... 10000 1 ..... ..... @q_shri_s
RSHRN_v 0.00 11110 .... ... 10001 1 ..... ..... @q_shri_b
RSHRN_v 0.00 11110 .... ... 10001 1 ..... ..... @q_shri_h
RSHRN_v 0.00 11110 .... ... 10001 1 ..... ..... @q_shri_s
+
+# Advanced SIMD scalar shift by immediate
+
+@shri_d .... ..... 1 ...... ..... . rn:5 rd:5 \
+ &rri_e esz=3 imm=%neon_rshift_i6
+
+SSHR_s 0101 11110 .... ... 00000 1 ..... ..... @shri_d
+USHR_s 0111 11110 .... ... 00000 1 ..... ..... @shri_d
+SSRA_s 0101 11110 .... ... 00010 1 ..... ..... @shri_d
+USRA_s 0111 11110 .... ... 00010 1 ..... ..... @shri_d
+SRSHR_s 0101 11110 .... ... 00100 1 ..... ..... @shri_d
+URSHR_s 0111 11110 .... ... 00100 1 ..... ..... @shri_d
+SRSRA_s 0101 11110 .... ... 00110 1 ..... ..... @shri_d
+URSRA_s 0111 11110 .... ... 00110 1 ..... ..... @shri_d
+SRI_s 0111 11110 .... ... 01000 1 ..... ..... @shri_d
--
2.43.0
* [PATCH v3 24/29] target/arm: Convert handle_scalar_simd_shli to decodetree
2024-09-12 2:40 [PATCH v3 00/29] target/arm: AdvSIMD decodetree conversion, part 4 Richard Henderson
` (22 preceding siblings ...)
2024-09-12 2:41 ` [PATCH v3 23/29] target/arm: Convert handle_scalar_simd_shri " Richard Henderson
@ 2024-09-12 2:41 ` Richard Henderson
2024-09-12 2:41 ` [PATCH v3 25/29] target/arm: Convert VQSHL, VQSHLU to gvec Richard Henderson
` (5 subsequent siblings)
29 siblings, 0 replies; 32+ messages in thread
From: Richard Henderson @ 2024-09-12 2:41 UTC (permalink / raw)
To: qemu-devel; +Cc: qemu-arm, Peter Maydell
This includes SHL and SLI.
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
target/arm/tcg/translate-a64.c | 44 +++++++---------------------------
target/arm/tcg/a64.decode | 4 ++++
2 files changed, 13 insertions(+), 35 deletions(-)
diff --git a/target/arm/tcg/translate-a64.c b/target/arm/tcg/translate-a64.c
index efd93a7f23..934746d2f2 100644
--- a/target/arm/tcg/translate-a64.c
+++ b/target/arm/tcg/translate-a64.c
@@ -7124,6 +7124,11 @@ static void gen_sri_d(TCGv_i64 dst, TCGv_i64 src, int64_t shift)
}
}
+static void gen_sli_d(TCGv_i64 dst, TCGv_i64 src, int64_t shift)
+{
+ tcg_gen_deposit_i64(dst, dst, src, shift, 64 - shift);
+}
+
static bool do_vec_shift_imm_narrow(DisasContext *s, arg_qrri_e *a,
WideShiftImmFn * const fns[3], MemOp sign)
{
@@ -7201,6 +7206,9 @@ TRANS(SRSRA_s, do_scalar_shift_imm, a, gen_srsra_d, true, 0)
TRANS(URSRA_s, do_scalar_shift_imm, a, gen_ursra_d, true, 0)
TRANS(SRI_s, do_scalar_shift_imm, a, gen_sri_d, true, 0)
+TRANS(SHL_s, do_scalar_shift_imm, a, tcg_gen_shli_i64, false, 0)
+TRANS(SLI_s, do_scalar_shift_imm, a, gen_sli_d, true, 0)
+
/* Shift a TCGv src by TCGv shift_amount, put result in dst.
* Note that it is the caller's responsibility to ensure that the
* shift amount is in range (ie 0..31 or 0..63) and provide the ARM
@@ -9417,38 +9425,6 @@ static void handle_shri_with_rndacc(TCGv_i64 tcg_res, TCGv_i64 tcg_src,
}
}
-/* SHL/SLI - Scalar shift left */
-static void handle_scalar_simd_shli(DisasContext *s, bool insert,
- int immh, int immb, int opcode,
- int rn, int rd)
-{
- int size = 32 - clz32(immh) - 1;
- int immhb = immh << 3 | immb;
- int shift = immhb - (8 << size);
- TCGv_i64 tcg_rn;
- TCGv_i64 tcg_rd;
-
- if (!extract32(immh, 3, 1)) {
- unallocated_encoding(s);
- return;
- }
-
- if (!fp_access_check(s)) {
- return;
- }
-
- tcg_rn = read_fp_dreg(s, rn);
- tcg_rd = insert ? read_fp_dreg(s, rd) : tcg_temp_new_i64();
-
- if (insert) {
- tcg_gen_deposit_i64(tcg_rd, tcg_rd, tcg_rn, shift, 64 - shift);
- } else {
- tcg_gen_shli_i64(tcg_rd, tcg_rn, shift);
- }
-
- write_fp_dreg(s, rd, tcg_rd);
-}
-
/* SQSHRN/SQSHRUN - Saturating (signed/unsigned) shift right with
* (signed/unsigned) narrowing */
static void handle_vec_simd_sqshrn(DisasContext *s, bool is_scalar, bool is_q,
@@ -9900,9 +9876,6 @@ static void disas_simd_scalar_shift_imm(DisasContext *s, uint32_t insn)
}
switch (opcode) {
- case 0x0a: /* SHL / SLI */
- handle_scalar_simd_shli(s, is_u, immh, immb, opcode, rn, rd);
- break;
case 0x1c: /* SCVTF, UCVTF */
handle_simd_shift_intfp_conv(s, true, false, is_u, immh, immb,
opcode, rn, rd);
@@ -9940,6 +9913,7 @@ static void disas_simd_scalar_shift_imm(DisasContext *s, uint32_t insn)
case 0x04: /* SRSHR / URSHR */
case 0x06: /* SRSRA / URSRA */
case 0x08: /* SRI */
+ case 0x0a: /* SHL / SLI */
unallocated_encoding(s);
break;
}
diff --git a/target/arm/tcg/a64.decode b/target/arm/tcg/a64.decode
index 6c2362b3bb..96803fe6e4 100644
--- a/target/arm/tcg/a64.decode
+++ b/target/arm/tcg/a64.decode
@@ -1291,6 +1291,7 @@ RSHRN_v 0.00 11110 .... ... 10001 1 ..... ..... @q_shri_s
@shri_d .... ..... 1 ...... ..... . rn:5 rd:5 \
&rri_e esz=3 imm=%neon_rshift_i6
+@shli_d .... ..... 1 imm:6 ..... . rn:5 rd:5 &rri_e esz=3
SSHR_s 0101 11110 .... ... 00000 1 ..... ..... @shri_d
USHR_s 0111 11110 .... ... 00000 1 ..... ..... @shri_d
@@ -1301,3 +1302,6 @@ URSHR_s 0111 11110 .... ... 00100 1 ..... ..... @shri_d
SRSRA_s 0101 11110 .... ... 00110 1 ..... ..... @shri_d
URSRA_s 0111 11110 .... ... 00110 1 ..... ..... @shri_d
SRI_s 0111 11110 .... ... 01000 1 ..... ..... @shri_d
+
+SHL_s 0101 11110 .... ... 01010 1 ..... ..... @shli_d
+SLI_s 0111 11110 .... ... 01010 1 ..... ..... @shli_d
--
2.43.0
* [PATCH v3 25/29] target/arm: Convert VQSHL, VQSHLU to gvec
2024-09-12 2:40 [PATCH v3 00/29] target/arm: AdvSIMD decodetree conversion, part 4 Richard Henderson
` (23 preceding siblings ...)
2024-09-12 2:41 ` [PATCH v3 24/29] target/arm: Convert handle_scalar_simd_shli " Richard Henderson
@ 2024-09-12 2:41 ` Richard Henderson
2024-09-12 2:41 ` [PATCH v3 26/29] target/arm: Widen NeonGenNarrowEnvFn return to 64 bits Richard Henderson
` (4 subsequent siblings)
29 siblings, 0 replies; 32+ messages in thread
From: Richard Henderson @ 2024-09-12 2:41 UTC (permalink / raw)
To: qemu-devel; +Cc: qemu-arm, Peter Maydell
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
target/arm/helper.h | 12 ++++
target/arm/tcg/translate.h | 7 ++
target/arm/tcg/gengvec.c | 36 +++++++++++
target/arm/tcg/neon_helper.c | 33 ++++++++++
target/arm/tcg/translate-neon.c | 110 +-------------------------------
target/arm/tcg/neon-dp.decode | 6 +-
6 files changed, 94 insertions(+), 110 deletions(-)
diff --git a/target/arm/helper.h b/target/arm/helper.h
index b463be38c5..b40589d329 100644
--- a/target/arm/helper.h
+++ b/target/arm/helper.h
@@ -324,6 +324,18 @@ DEF_HELPER_FLAGS_5(neon_uqrshl_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32
DEF_HELPER_FLAGS_5(neon_uqrshl_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_5(neon_uqrshl_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_5(neon_uqrshl_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(neon_sqshli_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(neon_sqshli_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(neon_sqshli_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(neon_sqshli_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(neon_uqshli_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(neon_uqshli_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(neon_uqshli_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(neon_uqshli_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(neon_sqshlui_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(neon_sqshlui_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(neon_sqshlui_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(neon_sqshlui_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_srshl_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_srshl_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
diff --git a/target/arm/tcg/translate.h b/target/arm/tcg/translate.h
index 45990ae292..7721c627e9 100644
--- a/target/arm/tcg/translate.h
+++ b/target/arm/tcg/translate.h
@@ -471,6 +471,13 @@ void gen_neon_sqrshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
void gen_neon_uqrshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz);
+void gen_neon_sqshli(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
+ int64_t c, uint32_t opr_sz, uint32_t max_sz);
+void gen_neon_uqshli(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
+ int64_t c, uint32_t opr_sz, uint32_t max_sz);
+void gen_neon_sqshlui(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
+ int64_t c, uint32_t opr_sz, uint32_t max_sz);
+
void gen_gvec_shadd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz);
void gen_gvec_uhadd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
diff --git a/target/arm/tcg/gengvec.c b/target/arm/tcg/gengvec.c
index 3abdc57202..f652520b65 100644
--- a/target/arm/tcg/gengvec.c
+++ b/target/arm/tcg/gengvec.c
@@ -1313,6 +1313,42 @@ void gen_neon_uqrshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
opr_sz, max_sz, 0, fns[vece]);
}
+void gen_neon_sqshli(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
+ int64_t c, uint32_t opr_sz, uint32_t max_sz)
+{
+ static gen_helper_gvec_2_ptr * const fns[] = {
+ gen_helper_neon_sqshli_b, gen_helper_neon_sqshli_h,
+ gen_helper_neon_sqshli_s, gen_helper_neon_sqshli_d,
+ };
+ tcg_debug_assert(vece <= MO_64);
+ tcg_debug_assert(c >= 0 && c <= (8 << vece));
+ tcg_gen_gvec_2_ptr(rd_ofs, rn_ofs, tcg_env, opr_sz, max_sz, c, fns[vece]);
+}
+
+void gen_neon_uqshli(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
+ int64_t c, uint32_t opr_sz, uint32_t max_sz)
+{
+ static gen_helper_gvec_2_ptr * const fns[] = {
+ gen_helper_neon_uqshli_b, gen_helper_neon_uqshli_h,
+ gen_helper_neon_uqshli_s, gen_helper_neon_uqshli_d,
+ };
+ tcg_debug_assert(vece <= MO_64);
+ tcg_debug_assert(c >= 0 && c <= (8 << vece));
+ tcg_gen_gvec_2_ptr(rd_ofs, rn_ofs, tcg_env, opr_sz, max_sz, c, fns[vece]);
+}
+
+void gen_neon_sqshlui(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
+ int64_t c, uint32_t opr_sz, uint32_t max_sz)
+{
+ static gen_helper_gvec_2_ptr * const fns[] = {
+ gen_helper_neon_sqshlui_b, gen_helper_neon_sqshlui_h,
+ gen_helper_neon_sqshlui_s, gen_helper_neon_sqshlui_d,
+ };
+ tcg_debug_assert(vece <= MO_64);
+ tcg_debug_assert(c >= 0 && c <= (8 << vece));
+ tcg_gen_gvec_2_ptr(rd_ofs, rn_ofs, tcg_env, opr_sz, max_sz, c, fns[vece]);
+}
+
void gen_uqadd_bhs(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b, MemOp esz)
{
uint64_t max = MAKE_64BIT_MASK(0, 8 << esz);
diff --git a/target/arm/tcg/neon_helper.c b/target/arm/tcg/neon_helper.c
index 082bfd88ad..739e16e441 100644
--- a/target/arm/tcg/neon_helper.c
+++ b/target/arm/tcg/neon_helper.c
@@ -141,6 +141,19 @@ void HELPER(name)(void *vd, void *vn, void *vm, void *venv, uint32_t desc) \
clear_tail(d, opr_sz, simd_maxsz(desc)); \
}
+#define NEON_GVEC_VOP2i_ENV(name, vtype) \
+void HELPER(name)(void *vd, void *vn, void *venv, uint32_t desc) \
+{ \
+ intptr_t i, opr_sz = simd_oprsz(desc); \
+ int imm = simd_data(desc); \
+ vtype *d = vd, *n = vn; \
+ CPUARMState *env = venv; \
+ for (i = 0; i < opr_sz / sizeof(vtype); i++) { \
+ NEON_FN(d[i], n[i], imm); \
+ } \
+ clear_tail(d, opr_sz, simd_maxsz(desc)); \
+}
+
/* Pairwise operations. */
/* For 32-bit elements each segment only contains a single element, so
the elementwise and pairwise operations are the same. */
@@ -271,22 +284,26 @@ uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shift)
(dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshl_u8, neon_u8, 4)
NEON_GVEC_VOP2_ENV(neon_uqshl_b, uint8_t)
+NEON_GVEC_VOP2i_ENV(neon_uqshli_b, uint8_t)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) \
(dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshl_u16, neon_u16, 2)
NEON_GVEC_VOP2_ENV(neon_uqshl_h, uint16_t)
+NEON_GVEC_VOP2i_ENV(neon_uqshli_h, uint16_t)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) \
(dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_uqshl_s, uint32_t)
+NEON_GVEC_VOP2i_ENV(neon_uqshli_s, uint32_t)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) \
(dest = do_uqrshl_d(src1, (int8_t)src2, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_uqshl_d, uint64_t)
+NEON_GVEC_VOP2i_ENV(neon_uqshli_d, uint64_t)
#undef NEON_FN
uint32_t HELPER(neon_qshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
@@ -303,22 +320,26 @@ uint64_t HELPER(neon_qshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
(dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshl_s8, neon_s8, 4)
NEON_GVEC_VOP2_ENV(neon_sqshl_b, int8_t)
+NEON_GVEC_VOP2i_ENV(neon_sqshli_b, int8_t)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) \
(dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshl_s16, neon_s16, 2)
NEON_GVEC_VOP2_ENV(neon_sqshl_h, int16_t)
+NEON_GVEC_VOP2i_ENV(neon_sqshli_h, int16_t)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) \
(dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_sqshl_s, int32_t)
+NEON_GVEC_VOP2i_ENV(neon_sqshli_s, int32_t)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) \
(dest = do_sqrshl_d(src1, (int8_t)src2, false, env->vfp.qc))
NEON_GVEC_VOP2_ENV(neon_sqshl_d, int64_t)
+NEON_GVEC_VOP2i_ENV(neon_sqshli_d, int64_t)
#undef NEON_FN
uint32_t HELPER(neon_qshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
@@ -334,11 +355,13 @@ uint64_t HELPER(neon_qshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
#define NEON_FN(dest, src1, src2) \
(dest = do_suqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshlu_s8, neon_s8, 4)
+NEON_GVEC_VOP2i_ENV(neon_sqshlui_b, int8_t)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) \
(dest = do_suqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshlu_s16, neon_s16, 2)
+NEON_GVEC_VOP2i_ENV(neon_sqshlui_h, int16_t)
#undef NEON_FN
uint32_t HELPER(neon_qshlu_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
@@ -351,6 +374,16 @@ uint64_t HELPER(neon_qshlu_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
return do_suqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}
+#define NEON_FN(dest, src1, src2) \
+ (dest = do_suqrshl_bhs(src1, (int8_t)src2, 32, false, env->vfp.qc))
+NEON_GVEC_VOP2i_ENV(neon_sqshlui_s, int32_t)
+#undef NEON_FN
+
+#define NEON_FN(dest, src1, src2) \
+ (dest = do_suqrshl_d(src1, (int8_t)src2, false, env->vfp.qc))
+NEON_GVEC_VOP2i_ENV(neon_sqshlui_d, int64_t)
+#undef NEON_FN
+
#define NEON_FN(dest, src1, src2) \
(dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_u8, neon_u8, 4)
diff --git a/target/arm/tcg/translate-neon.c b/target/arm/tcg/translate-neon.c
index a31a78c347..6dd70d1c53 100644
--- a/target/arm/tcg/translate-neon.c
+++ b/target/arm/tcg/translate-neon.c
@@ -1101,113 +1101,9 @@ DO_2SH(VRSRA_S, gen_gvec_srsra)
DO_2SH(VRSRA_U, gen_gvec_ursra)
DO_2SH(VSHR_S, gen_gvec_sshr)
DO_2SH(VSHR_U, gen_gvec_ushr)
-
-static bool do_2shift_env_64(DisasContext *s, arg_2reg_shift *a,
- NeonGenTwo64OpEnvFn *fn)
-{
- /*
- * 2-reg-and-shift operations, size == 3 case, where the
- * function needs to be passed tcg_env.
- */
- TCGv_i64 constimm;
- int pass;
-
- if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
- return false;
- }
-
- /* UNDEF accesses to D16-D31 if they don't exist. */
- if (!dc_isar_feature(aa32_simd_r32, s) &&
- ((a->vd | a->vm) & 0x10)) {
- return false;
- }
-
- if ((a->vm | a->vd) & a->q) {
- return false;
- }
-
- if (!vfp_access_check(s)) {
- return true;
- }
-
- /*
- * To avoid excessive duplication of ops we implement shift
- * by immediate using the variable shift operations.
- */
- constimm = tcg_constant_i64(dup_const(a->size, a->shift));
-
- for (pass = 0; pass < a->q + 1; pass++) {
- TCGv_i64 tmp = tcg_temp_new_i64();
-
- read_neon_element64(tmp, a->vm, pass, MO_64);
- fn(tmp, tcg_env, tmp, constimm);
- write_neon_element64(tmp, a->vd, pass, MO_64);
- }
- return true;
-}
-
-static bool do_2shift_env_32(DisasContext *s, arg_2reg_shift *a,
- NeonGenTwoOpEnvFn *fn)
-{
- /*
- * 2-reg-and-shift operations, size < 3 case, where the
- * helper needs to be passed tcg_env.
- */
- TCGv_i32 constimm, tmp;
- int pass;
-
- if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
- return false;
- }
-
- /* UNDEF accesses to D16-D31 if they don't exist. */
- if (!dc_isar_feature(aa32_simd_r32, s) &&
- ((a->vd | a->vm) & 0x10)) {
- return false;
- }
-
- if ((a->vm | a->vd) & a->q) {
- return false;
- }
-
- if (!vfp_access_check(s)) {
- return true;
- }
-
- /*
- * To avoid excessive duplication of ops we implement shift
- * by immediate using the variable shift operations.
- */
- constimm = tcg_constant_i32(dup_const(a->size, a->shift));
- tmp = tcg_temp_new_i32();
-
- for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
- read_neon_element32(tmp, a->vm, pass, MO_32);
- fn(tmp, tcg_env, tmp, constimm);
- write_neon_element32(tmp, a->vd, pass, MO_32);
- }
- return true;
-}
-
-#define DO_2SHIFT_ENV(INSN, FUNC) \
- static bool trans_##INSN##_64_2sh(DisasContext *s, arg_2reg_shift *a) \
- { \
- return do_2shift_env_64(s, a, gen_helper_neon_##FUNC##64); \
- } \
- static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a) \
- { \
- static NeonGenTwoOpEnvFn * const fns[] = { \
- gen_helper_neon_##FUNC##8, \
- gen_helper_neon_##FUNC##16, \
- gen_helper_neon_##FUNC##32, \
- }; \
- assert(a->size < ARRAY_SIZE(fns)); \
- return do_2shift_env_32(s, a, fns[a->size]); \
- }
-
-DO_2SHIFT_ENV(VQSHLU, qshlu_s)
-DO_2SHIFT_ENV(VQSHL_U, qshl_u)
-DO_2SHIFT_ENV(VQSHL_S, qshl_s)
+DO_2SH(VQSHLU, gen_neon_sqshlui)
+DO_2SH(VQSHL_U, gen_neon_uqshli)
+DO_2SH(VQSHL_S, gen_neon_sqshli)
static bool do_2shift_narrow_64(DisasContext *s, arg_2reg_shift *a,
NeonGenTwo64OpFn *shiftfn,
diff --git a/target/arm/tcg/neon-dp.decode b/target/arm/tcg/neon-dp.decode
index 788578c8fa..e883c6ab58 100644
--- a/target/arm/tcg/neon-dp.decode
+++ b/target/arm/tcg/neon-dp.decode
@@ -291,17 +291,17 @@ VSLI_2sh 1111 001 1 1 . ...... .... 0101 . . . 1 .... @2reg_shl_s
VSLI_2sh 1111 001 1 1 . ...... .... 0101 . . . 1 .... @2reg_shl_h
VSLI_2sh 1111 001 1 1 . ...... .... 0101 . . . 1 .... @2reg_shl_b
-VQSHLU_64_2sh 1111 001 1 1 . ...... .... 0110 . . . 1 .... @2reg_shl_d
+VQSHLU_2sh 1111 001 1 1 . ...... .... 0110 . . . 1 .... @2reg_shl_d
VQSHLU_2sh 1111 001 1 1 . ...... .... 0110 . . . 1 .... @2reg_shl_s
VQSHLU_2sh 1111 001 1 1 . ...... .... 0110 . . . 1 .... @2reg_shl_h
VQSHLU_2sh 1111 001 1 1 . ...... .... 0110 . . . 1 .... @2reg_shl_b
-VQSHL_S_64_2sh 1111 001 0 1 . ...... .... 0111 . . . 1 .... @2reg_shl_d
+VQSHL_S_2sh 1111 001 0 1 . ...... .... 0111 . . . 1 .... @2reg_shl_d
VQSHL_S_2sh 1111 001 0 1 . ...... .... 0111 . . . 1 .... @2reg_shl_s
VQSHL_S_2sh 1111 001 0 1 . ...... .... 0111 . . . 1 .... @2reg_shl_h
VQSHL_S_2sh 1111 001 0 1 . ...... .... 0111 . . . 1 .... @2reg_shl_b
-VQSHL_U_64_2sh 1111 001 1 1 . ...... .... 0111 . . . 1 .... @2reg_shl_d
+VQSHL_U_2sh 1111 001 1 1 . ...... .... 0111 . . . 1 .... @2reg_shl_d
VQSHL_U_2sh 1111 001 1 1 . ...... .... 0111 . . . 1 .... @2reg_shl_s
VQSHL_U_2sh 1111 001 1 1 . ...... .... 0111 . . . 1 .... @2reg_shl_h
VQSHL_U_2sh 1111 001 1 1 . ...... .... 0111 . . . 1 .... @2reg_shl_b
--
2.43.0
* [PATCH v3 26/29] target/arm: Widen NeonGenNarrowEnvFn return to 64 bits
2024-09-12 2:40 [PATCH v3 00/29] target/arm: AdvSIMD decodetree conversion, part 4 Richard Henderson
` (24 preceding siblings ...)
2024-09-12 2:41 ` [PATCH v3 25/29] target/arm: Convert VQSHL, VQSHLU to gvec Richard Henderson
@ 2024-09-12 2:41 ` Richard Henderson
2024-09-12 12:46 ` Peter Maydell
2024-09-12 2:41 ` [PATCH v3 27/29] target/arm: Convert SQSHL, UQSHL, SQSHLU (immediate) to decodetree Richard Henderson
` (3 subsequent siblings)
29 siblings, 1 reply; 32+ messages in thread
From: Richard Henderson @ 2024-09-12 2:41 UTC (permalink / raw)
To: qemu-devel; +Cc: qemu-arm
While these functions really do return a 32-bit value,
widening the return type means that we need to do less
marshalling between TCG types.
Remove NeonGenNarrowEnvFn typedef; add NeonGenOne64OpEnvFn.
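For illustration only (not part of the patch): a minimal sketch of the
call-site simplification this enables in handle_vec_simd_sqshrn, assuming
QEMU's TCG translator context; the names (narrowfn, tcg_rd, tcg_env)
match the code touched by the diff below.
    /* Before: the helper returned i32, so each call needed an extra
     * temporary plus a widening move back into the i64 element. */
    TCGv_i32 tcg_rd_narrowed = tcg_temp_new_i32();
    narrowfn(tcg_rd_narrowed, tcg_env, tcg_rd);
    tcg_gen_extu_i32_i64(tcg_rd, tcg_rd_narrowed);
    /* After: the helper returns i64 (only the low 32 bits are
     * significant), so the result lands directly in the i64 temp. */
    narrowfn(tcg_rd, tcg_env, tcg_rd);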
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
target/arm/helper.h | 22 ++++++------
target/arm/tcg/translate.h | 2 +-
target/arm/tcg/neon_helper.c | 43 ++++++++++++++---------
target/arm/tcg/translate-a64.c | 60 ++++++++++++++++++---------------
target/arm/tcg/translate-neon.c | 44 ++++++++++++------------
5 files changed, 93 insertions(+), 78 deletions(-)
diff --git a/target/arm/helper.h b/target/arm/helper.h
index b40589d329..58919b670e 100644
--- a/target/arm/helper.h
+++ b/target/arm/helper.h
@@ -375,17 +375,17 @@ DEF_HELPER_3(neon_qrdmulh_s32, i32, env, i32, i32)
DEF_HELPER_4(neon_qrdmlah_s32, i32, env, s32, s32, s32)
DEF_HELPER_4(neon_qrdmlsh_s32, i32, env, s32, s32, s32)
-DEF_HELPER_1(neon_narrow_u8, i32, i64)
-DEF_HELPER_1(neon_narrow_u16, i32, i64)
-DEF_HELPER_2(neon_unarrow_sat8, i32, env, i64)
-DEF_HELPER_2(neon_narrow_sat_u8, i32, env, i64)
-DEF_HELPER_2(neon_narrow_sat_s8, i32, env, i64)
-DEF_HELPER_2(neon_unarrow_sat16, i32, env, i64)
-DEF_HELPER_2(neon_narrow_sat_u16, i32, env, i64)
-DEF_HELPER_2(neon_narrow_sat_s16, i32, env, i64)
-DEF_HELPER_2(neon_unarrow_sat32, i32, env, i64)
-DEF_HELPER_2(neon_narrow_sat_u32, i32, env, i64)
-DEF_HELPER_2(neon_narrow_sat_s32, i32, env, i64)
+DEF_HELPER_1(neon_narrow_u8, i64, i64)
+DEF_HELPER_1(neon_narrow_u16, i64, i64)
+DEF_HELPER_2(neon_unarrow_sat8, i64, env, i64)
+DEF_HELPER_2(neon_narrow_sat_u8, i64, env, i64)
+DEF_HELPER_2(neon_narrow_sat_s8, i64, env, i64)
+DEF_HELPER_2(neon_unarrow_sat16, i64, env, i64)
+DEF_HELPER_2(neon_narrow_sat_u16, i64, env, i64)
+DEF_HELPER_2(neon_narrow_sat_s16, i64, env, i64)
+DEF_HELPER_2(neon_unarrow_sat32, i64, env, i64)
+DEF_HELPER_2(neon_narrow_sat_u32, i64, env, i64)
+DEF_HELPER_2(neon_narrow_sat_s32, i64, env, i64)
DEF_HELPER_1(neon_narrow_high_u8, i32, i64)
DEF_HELPER_1(neon_narrow_high_u16, i32, i64)
DEF_HELPER_1(neon_narrow_round_high_u8, i32, i64)
diff --git a/target/arm/tcg/translate.h b/target/arm/tcg/translate.h
index 7721c627e9..5a2e10d64d 100644
--- a/target/arm/tcg/translate.h
+++ b/target/arm/tcg/translate.h
@@ -605,13 +605,13 @@ typedef void NeonGenThreeOpEnvFn(TCGv_i32, TCGv_env, TCGv_i32,
typedef void NeonGenTwo64OpFn(TCGv_i64, TCGv_i64, TCGv_i64);
typedef void NeonGenTwo64OpEnvFn(TCGv_i64, TCGv_ptr, TCGv_i64, TCGv_i64);
typedef void NeonGenNarrowFn(TCGv_i32, TCGv_i64);
-typedef void NeonGenNarrowEnvFn(TCGv_i32, TCGv_ptr, TCGv_i64);
typedef void NeonGenWidenFn(TCGv_i64, TCGv_i32);
typedef void NeonGenTwoOpWidenFn(TCGv_i64, TCGv_i32, TCGv_i32);
typedef void NeonGenOneSingleOpFn(TCGv_i32, TCGv_i32, TCGv_ptr);
typedef void NeonGenTwoSingleOpFn(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_ptr);
typedef void NeonGenTwoDoubleOpFn(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_ptr);
typedef void NeonGenOne64OpFn(TCGv_i64, TCGv_i64);
+typedef void NeonGenOne64OpEnvFn(TCGv_i64, TCGv_env, TCGv_i64);
typedef void CryptoTwoOpFn(TCGv_ptr, TCGv_ptr);
typedef void CryptoThreeOpIntFn(TCGv_ptr, TCGv_ptr, TCGv_i32);
typedef void CryptoThreeOpFn(TCGv_ptr, TCGv_ptr, TCGv_ptr);
diff --git a/target/arm/tcg/neon_helper.c b/target/arm/tcg/neon_helper.c
index 739e16e441..93b2076c64 100644
--- a/target/arm/tcg/neon_helper.c
+++ b/target/arm/tcg/neon_helper.c
@@ -598,13 +598,15 @@ NEON_VOP_ENV(qrdmulh_s32, neon_s32, 1)
#undef NEON_FN
#undef NEON_QDMULH32
-uint32_t HELPER(neon_narrow_u8)(uint64_t x)
+/* Only the low 32-bits of output are significant. */
+uint64_t HELPER(neon_narrow_u8)(uint64_t x)
{
return (x & 0xffu) | ((x >> 8) & 0xff00u) | ((x >> 16) & 0xff0000u)
| ((x >> 24) & 0xff000000u);
}
-uint32_t HELPER(neon_narrow_u16)(uint64_t x)
+/* Only the low 32-bits of output are significant. */
+uint64_t HELPER(neon_narrow_u16)(uint64_t x)
{
return (x & 0xffffu) | ((x >> 16) & 0xffff0000u);
}
@@ -635,7 +637,8 @@ uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x)
return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
}
-uint32_t HELPER(neon_unarrow_sat8)(CPUARMState *env, uint64_t x)
+/* Only the low 32-bits of output are significant. */
+uint64_t HELPER(neon_unarrow_sat8)(CPUARMState *env, uint64_t x)
{
uint16_t s;
uint8_t d;
@@ -662,7 +665,8 @@ uint32_t HELPER(neon_unarrow_sat8)(CPUARMState *env, uint64_t x)
return res;
}
-uint32_t HELPER(neon_narrow_sat_u8)(CPUARMState *env, uint64_t x)
+/* Only the low 32-bits of output are significant. */
+uint64_t HELPER(neon_narrow_sat_u8)(CPUARMState *env, uint64_t x)
{
uint16_t s;
uint8_t d;
@@ -685,7 +689,8 @@ uint32_t HELPER(neon_narrow_sat_u8)(CPUARMState *env, uint64_t x)
return res;
}
-uint32_t HELPER(neon_narrow_sat_s8)(CPUARMState *env, uint64_t x)
+/* Only the low 32-bits of output are significant. */
+uint64_t HELPER(neon_narrow_sat_s8)(CPUARMState *env, uint64_t x)
{
int16_t s;
uint8_t d;
@@ -708,7 +713,8 @@ uint32_t HELPER(neon_narrow_sat_s8)(CPUARMState *env, uint64_t x)
return res;
}
-uint32_t HELPER(neon_unarrow_sat16)(CPUARMState *env, uint64_t x)
+/* Only the low 32-bits of output are significant. */
+uint64_t HELPER(neon_unarrow_sat16)(CPUARMState *env, uint64_t x)
{
uint32_t high;
uint32_t low;
@@ -728,10 +734,11 @@ uint32_t HELPER(neon_unarrow_sat16)(CPUARMState *env, uint64_t x)
high = 0xffff;
SET_QC();
}
- return low | (high << 16);
+ return deposit32(low, 16, 16, high);
}
-uint32_t HELPER(neon_narrow_sat_u16)(CPUARMState *env, uint64_t x)
+/* Only the low 32-bits of output are significant. */
+uint64_t HELPER(neon_narrow_sat_u16)(CPUARMState *env, uint64_t x)
{
uint32_t high;
uint32_t low;
@@ -745,10 +752,11 @@ uint32_t HELPER(neon_narrow_sat_u16)(CPUARMState *env, uint64_t x)
high = 0xffff;
SET_QC();
}
- return low | (high << 16);
+ return deposit32(low, 16, 16, high);
}
-uint32_t HELPER(neon_narrow_sat_s16)(CPUARMState *env, uint64_t x)
+/* Only the low 32-bits of output are significant. */
+uint64_t HELPER(neon_narrow_sat_s16)(CPUARMState *env, uint64_t x)
{
int32_t low;
int32_t high;
@@ -762,10 +770,11 @@ uint32_t HELPER(neon_narrow_sat_s16)(CPUARMState *env, uint64_t x)
high = (high >> 31) ^ 0x7fff;
SET_QC();
}
- return (uint16_t)low | (high << 16);
+ return deposit32(low, 16, 16, high);
}
-uint32_t HELPER(neon_unarrow_sat32)(CPUARMState *env, uint64_t x)
+/* Only the low 32-bits of output are significant. */
+uint64_t HELPER(neon_unarrow_sat32)(CPUARMState *env, uint64_t x)
{
if (x & 0x8000000000000000ull) {
SET_QC();
@@ -778,7 +787,8 @@ uint32_t HELPER(neon_unarrow_sat32)(CPUARMState *env, uint64_t x)
return x;
}
-uint32_t HELPER(neon_narrow_sat_u32)(CPUARMState *env, uint64_t x)
+/* Only the low 32-bits of output are significant. */
+uint64_t HELPER(neon_narrow_sat_u32)(CPUARMState *env, uint64_t x)
{
if (x > 0xffffffffu) {
SET_QC();
@@ -787,13 +797,14 @@ uint32_t HELPER(neon_narrow_sat_u32)(CPUARMState *env, uint64_t x)
return x;
}
-uint32_t HELPER(neon_narrow_sat_s32)(CPUARMState *env, uint64_t x)
+/* Only the low 32-bits of output are significant. */
+uint64_t HELPER(neon_narrow_sat_s32)(CPUARMState *env, uint64_t x)
{
if ((int64_t)x != (int32_t)x) {
SET_QC();
- return ((int64_t)x >> 63) ^ 0x7fffffff;
+ return (uint32_t)((int64_t)x >> 63) ^ 0x7fffffff;
}
- return x;
+ return (uint32_t)x;
}
uint64_t HELPER(neon_widen_u8)(uint32_t x)
diff --git a/target/arm/tcg/translate-a64.c b/target/arm/tcg/translate-a64.c
index 934746d2f2..7918720d9b 100644
--- a/target/arm/tcg/translate-a64.c
+++ b/target/arm/tcg/translate-a64.c
@@ -9439,11 +9439,9 @@ static void handle_vec_simd_sqshrn(DisasContext *s, bool is_scalar, bool is_q,
int elements = is_scalar ? 1 : (64 / esize);
bool round = extract32(opcode, 0, 1);
MemOp ldop = (size + 1) | (is_u_shift ? 0 : MO_SIGN);
- TCGv_i64 tcg_rn, tcg_rd;
- TCGv_i32 tcg_rd_narrowed;
- TCGv_i64 tcg_final;
+ TCGv_i64 tcg_rn, tcg_rd, tcg_final;
- static NeonGenNarrowEnvFn * const signed_narrow_fns[4][2] = {
+ static NeonGenOne64OpEnvFn * const signed_narrow_fns[4][2] = {
{ gen_helper_neon_narrow_sat_s8,
gen_helper_neon_unarrow_sat8 },
{ gen_helper_neon_narrow_sat_s16,
@@ -9452,13 +9450,13 @@ static void handle_vec_simd_sqshrn(DisasContext *s, bool is_scalar, bool is_q,
gen_helper_neon_unarrow_sat32 },
{ NULL, NULL },
};
- static NeonGenNarrowEnvFn * const unsigned_narrow_fns[4] = {
+ static NeonGenOne64OpEnvFn * const unsigned_narrow_fns[4] = {
gen_helper_neon_narrow_sat_u8,
gen_helper_neon_narrow_sat_u16,
gen_helper_neon_narrow_sat_u32,
NULL
};
- NeonGenNarrowEnvFn *narrowfn;
+ NeonGenOne64OpEnvFn *narrowfn;
int i;
@@ -9481,15 +9479,13 @@ static void handle_vec_simd_sqshrn(DisasContext *s, bool is_scalar, bool is_q,
tcg_rn = tcg_temp_new_i64();
tcg_rd = tcg_temp_new_i64();
- tcg_rd_narrowed = tcg_temp_new_i32();
tcg_final = tcg_temp_new_i64();
for (i = 0; i < elements; i++) {
read_vec_element(s, tcg_rn, rn, i, ldop);
handle_shri_with_rndacc(tcg_rd, tcg_rn, round,
false, is_u_shift, size+1, shift);
- narrowfn(tcg_rd_narrowed, tcg_env, tcg_rd);
- tcg_gen_extu_i32_i64(tcg_rd, tcg_rd_narrowed);
+ narrowfn(tcg_rd, tcg_env, tcg_rd);
if (i == 0) {
tcg_gen_extract_i64(tcg_final, tcg_rd, 0, esize);
} else {
@@ -10228,35 +10224,35 @@ static void handle_2misc_narrow(DisasContext *s, bool scalar,
* in the source becomes a size element in the destination).
*/
int pass;
- TCGv_i32 tcg_res[2];
+ TCGv_i64 tcg_res[2];
int destelt = is_q ? 2 : 0;
int passes = scalar ? 1 : 2;
if (scalar) {
- tcg_res[1] = tcg_constant_i32(0);
+ tcg_res[1] = tcg_constant_i64(0);
}
for (pass = 0; pass < passes; pass++) {
TCGv_i64 tcg_op = tcg_temp_new_i64();
- NeonGenNarrowFn *genfn = NULL;
- NeonGenNarrowEnvFn *genenvfn = NULL;
+ NeonGenOne64OpFn *genfn = NULL;
+ NeonGenOne64OpEnvFn *genenvfn = NULL;
if (scalar) {
read_vec_element(s, tcg_op, rn, pass, size + 1);
} else {
read_vec_element(s, tcg_op, rn, pass, MO_64);
}
- tcg_res[pass] = tcg_temp_new_i32();
+ tcg_res[pass] = tcg_temp_new_i64();
switch (opcode) {
case 0x12: /* XTN, SQXTUN */
{
- static NeonGenNarrowFn * const xtnfns[3] = {
+ static NeonGenOne64OpFn * const xtnfns[3] = {
gen_helper_neon_narrow_u8,
gen_helper_neon_narrow_u16,
- tcg_gen_extrl_i64_i32,
+ tcg_gen_ext32u_i64,
};
- static NeonGenNarrowEnvFn * const sqxtunfns[3] = {
+ static NeonGenOne64OpEnvFn * const sqxtunfns[3] = {
gen_helper_neon_unarrow_sat8,
gen_helper_neon_unarrow_sat16,
gen_helper_neon_unarrow_sat32,
@@ -10270,7 +10266,7 @@ static void handle_2misc_narrow(DisasContext *s, bool scalar,
}
case 0x14: /* SQXTN, UQXTN */
{
- static NeonGenNarrowEnvFn * const fns[3][2] = {
+ static NeonGenOne64OpEnvFn * const fns[3][2] = {
{ gen_helper_neon_narrow_sat_s8,
gen_helper_neon_narrow_sat_u8 },
{ gen_helper_neon_narrow_sat_s16,
@@ -10284,7 +10280,9 @@ static void handle_2misc_narrow(DisasContext *s, bool scalar,
case 0x16: /* FCVTN, FCVTN2 */
/* 32 bit to 16 bit or 64 bit to 32 bit float conversion */
if (size == 2) {
- gen_helper_vfp_fcvtsd(tcg_res[pass], tcg_op, tcg_env);
+ TCGv_i32 tmp = tcg_temp_new_i32();
+ gen_helper_vfp_fcvtsd(tmp, tcg_op, tcg_env);
+ tcg_gen_extu_i32_i64(tcg_res[pass], tmp);
} else {
TCGv_i32 tcg_lo = tcg_temp_new_i32();
TCGv_i32 tcg_hi = tcg_temp_new_i32();
@@ -10294,21 +10292,29 @@ static void handle_2misc_narrow(DisasContext *s, bool scalar,
tcg_gen_extr_i64_i32(tcg_lo, tcg_hi, tcg_op);
gen_helper_vfp_fcvt_f32_to_f16(tcg_lo, tcg_lo, fpst, ahp);
gen_helper_vfp_fcvt_f32_to_f16(tcg_hi, tcg_hi, fpst, ahp);
- tcg_gen_deposit_i32(tcg_res[pass], tcg_lo, tcg_hi, 16, 16);
+ tcg_gen_deposit_i32(tcg_lo, tcg_lo, tcg_hi, 16, 16);
+ tcg_gen_extu_i32_i64(tcg_res[pass], tcg_lo);
}
break;
case 0x36: /* BFCVTN, BFCVTN2 */
{
TCGv_ptr fpst = fpstatus_ptr(FPST_FPCR);
- gen_helper_bfcvt_pair(tcg_res[pass], tcg_op, fpst);
+ TCGv_i32 tmp = tcg_temp_new_i32();
+ gen_helper_bfcvt_pair(tmp, tcg_op, fpst);
+ tcg_gen_extu_i32_i64(tcg_res[pass], tmp);
}
break;
case 0x56: /* FCVTXN, FCVTXN2 */
- /* 64 bit to 32 bit float conversion
- * with von Neumann rounding (round to odd)
- */
- assert(size == 2);
- gen_helper_fcvtx_f64_to_f32(tcg_res[pass], tcg_op, tcg_env);
+ {
+ /*
+ * 64 bit to 32 bit float conversion
+ * with von Neumann rounding (round to odd)
+ */
+ TCGv_i32 tmp = tcg_temp_new_i32();
+ assert(size == 2);
+ gen_helper_fcvtx_f64_to_f32(tmp, tcg_op, tcg_env);
+ tcg_gen_extu_i32_i64(tcg_res[pass], tmp);
+ }
break;
default:
g_assert_not_reached();
@@ -10322,7 +10328,7 @@ static void handle_2misc_narrow(DisasContext *s, bool scalar,
}
for (pass = 0; pass < 2; pass++) {
- write_vec_element_i32(s, tcg_res[pass], rd, destelt + pass, MO_32);
+ write_vec_element(s, tcg_res[pass], rd, destelt + pass, MO_32);
}
clear_vec_high(s, is_q, rd);
}
diff --git a/target/arm/tcg/translate-neon.c b/target/arm/tcg/translate-neon.c
index 6dd70d1c53..9c8829ad7d 100644
--- a/target/arm/tcg/translate-neon.c
+++ b/target/arm/tcg/translate-neon.c
@@ -1107,11 +1107,10 @@ DO_2SH(VQSHL_S, gen_neon_sqshli)
static bool do_2shift_narrow_64(DisasContext *s, arg_2reg_shift *a,
NeonGenTwo64OpFn *shiftfn,
- NeonGenNarrowEnvFn *narrowfn)
+ NeonGenOne64OpEnvFn *narrowfn)
{
/* 2-reg-and-shift narrowing-shift operations, size == 3 case */
- TCGv_i64 constimm, rm1, rm2;
- TCGv_i32 rd;
+ TCGv_i64 constimm, rm1, rm2, rd;
if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
return false;
@@ -1138,7 +1137,7 @@ static bool do_2shift_narrow_64(DisasContext *s, arg_2reg_shift *a,
constimm = tcg_constant_i64(-a->shift);
rm1 = tcg_temp_new_i64();
rm2 = tcg_temp_new_i64();
- rd = tcg_temp_new_i32();
+ rd = tcg_temp_new_i64();
/* Load both inputs first to avoid potential overwrite if rm == rd */
read_neon_element64(rm1, a->vm, 0, MO_64);
@@ -1146,18 +1145,18 @@ static bool do_2shift_narrow_64(DisasContext *s, arg_2reg_shift *a,
shiftfn(rm1, rm1, constimm);
narrowfn(rd, tcg_env, rm1);
- write_neon_element32(rd, a->vd, 0, MO_32);
+ write_neon_element64(rd, a->vd, 0, MO_32);
shiftfn(rm2, rm2, constimm);
narrowfn(rd, tcg_env, rm2);
- write_neon_element32(rd, a->vd, 1, MO_32);
+ write_neon_element64(rd, a->vd, 1, MO_32);
return true;
}
static bool do_2shift_narrow_32(DisasContext *s, arg_2reg_shift *a,
NeonGenTwoOpFn *shiftfn,
- NeonGenNarrowEnvFn *narrowfn)
+ NeonGenOne64OpEnvFn *narrowfn)
{
/* 2-reg-and-shift narrowing-shift operations, size < 3 case */
TCGv_i32 constimm, rm1, rm2, rm3, rm4;
@@ -1212,16 +1211,16 @@ static bool do_2shift_narrow_32(DisasContext *s, arg_2reg_shift *a,
tcg_gen_concat_i32_i64(rtmp, rm1, rm2);
- narrowfn(rm1, tcg_env, rtmp);
- write_neon_element32(rm1, a->vd, 0, MO_32);
+ narrowfn(rtmp, tcg_env, rtmp);
+ write_neon_element64(rtmp, a->vd, 0, MO_32);
shiftfn(rm3, rm3, constimm);
shiftfn(rm4, rm4, constimm);
tcg_gen_concat_i32_i64(rtmp, rm3, rm4);
- narrowfn(rm3, tcg_env, rtmp);
- write_neon_element32(rm3, a->vd, 1, MO_32);
+ narrowfn(rtmp, tcg_env, rtmp);
+ write_neon_element64(rtmp, a->vd, 1, MO_32);
return true;
}
@@ -1236,17 +1235,17 @@ static bool do_2shift_narrow_32(DisasContext *s, arg_2reg_shift *a,
return do_2shift_narrow_32(s, a, FUNC, NARROWFUNC); \
}
-static void gen_neon_narrow_u32(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
+static void gen_neon_narrow_u32(TCGv_i64 dest, TCGv_ptr env, TCGv_i64 src)
{
- tcg_gen_extrl_i64_i32(dest, src);
+ tcg_gen_ext32u_i64(dest, src);
}
-static void gen_neon_narrow_u16(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
+static void gen_neon_narrow_u16(TCGv_i64 dest, TCGv_ptr env, TCGv_i64 src)
{
gen_helper_neon_narrow_u16(dest, src);
}
-static void gen_neon_narrow_u8(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
+static void gen_neon_narrow_u8(TCGv_i64 dest, TCGv_ptr env, TCGv_i64 src)
{
gen_helper_neon_narrow_u8(dest, src);
}
@@ -2837,10 +2836,9 @@ static bool trans_VZIP(DisasContext *s, arg_2misc *a)
}
static bool do_vmovn(DisasContext *s, arg_2misc *a,
- NeonGenNarrowEnvFn *narrowfn)
+ NeonGenOne64OpEnvFn *narrowfn)
{
- TCGv_i64 rm;
- TCGv_i32 rd0, rd1;
+ TCGv_i64 rm, rd0, rd1;
if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
return false;
@@ -2865,22 +2863,22 @@ static bool do_vmovn(DisasContext *s, arg_2misc *a,
}
rm = tcg_temp_new_i64();
- rd0 = tcg_temp_new_i32();
- rd1 = tcg_temp_new_i32();
+ rd0 = tcg_temp_new_i64();
+ rd1 = tcg_temp_new_i64();
read_neon_element64(rm, a->vm, 0, MO_64);
narrowfn(rd0, tcg_env, rm);
read_neon_element64(rm, a->vm, 1, MO_64);
narrowfn(rd1, tcg_env, rm);
- write_neon_element32(rd0, a->vd, 0, MO_32);
- write_neon_element32(rd1, a->vd, 1, MO_32);
+ write_neon_element64(rd0, a->vd, 0, MO_32);
+ write_neon_element64(rd1, a->vd, 1, MO_32);
return true;
}
#define DO_VMOVN(INSN, FUNC) \
static bool trans_##INSN(DisasContext *s, arg_2misc *a) \
{ \
- static NeonGenNarrowEnvFn * const narrowfn[] = { \
+ static NeonGenOne64OpEnvFn * const narrowfn[] = { \
FUNC##8, \
FUNC##16, \
FUNC##32, \
--
2.43.0
* [PATCH v3 27/29] target/arm: Convert SQSHL, UQSHL, SQSHLU (immediate) to decodetree
2024-09-12 2:40 [PATCH v3 00/29] target/arm: AdvSIMD decodetree conversion, part 4 Richard Henderson
` (25 preceding siblings ...)
2024-09-12 2:41 ` [PATCH v3 26/29] target/arm: Widen NeonGenNarrowEnvFn return to 64 bits Richard Henderson
@ 2024-09-12 2:41 ` Richard Henderson
2024-09-12 2:41 ` [PATCH v3 28/29] target/arm: Convert vector [US]QSHRN, [US]QRSHRN, SQSHRUN " Richard Henderson
` (2 subsequent siblings)
29 siblings, 0 replies; 32+ messages in thread
From: Richard Henderson @ 2024-09-12 2:41 UTC (permalink / raw)
To: qemu-devel; +Cc: qemu-arm, Peter Maydell
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
target/arm/tcg/translate-a64.c | 223 ++++++++++++++-------------------
target/arm/tcg/a64.decode | 36 +++++-
2 files changed, 128 insertions(+), 131 deletions(-)
diff --git a/target/arm/tcg/translate-a64.c b/target/arm/tcg/translate-a64.c
index 7918720d9b..77324e0145 100644
--- a/target/arm/tcg/translate-a64.c
+++ b/target/arm/tcg/translate-a64.c
@@ -6982,6 +6982,9 @@ TRANS(URSRA_v, do_vec_shift_imm, a, gen_gvec_ursra)
TRANS(SRI_v, do_vec_shift_imm, a, gen_gvec_sri)
TRANS(SHL_v, do_vec_shift_imm, a, tcg_gen_gvec_shli)
TRANS(SLI_v, do_vec_shift_imm, a, gen_gvec_sli);
+TRANS(SQSHL_vi, do_vec_shift_imm, a, gen_neon_sqshli)
+TRANS(UQSHL_vi, do_vec_shift_imm, a, gen_neon_uqshli)
+TRANS(SQSHLU_vi, do_vec_shift_imm, a, gen_neon_sqshlui)
static bool do_vec_shift_imm_wide(DisasContext *s, arg_qrri_e *a, bool is_u)
{
@@ -7209,6 +7212,92 @@ TRANS(SRI_s, do_scalar_shift_imm, a, gen_sri_d, true, 0)
TRANS(SHL_s, do_scalar_shift_imm, a, tcg_gen_shli_i64, false, 0)
TRANS(SLI_s, do_scalar_shift_imm, a, gen_sli_d, true, 0)
+static void trunc_i64_env_imm(TCGv_i64 d, TCGv_i64 s, int64_t i,
+ NeonGenTwoOpEnvFn *fn)
+{
+ TCGv_i32 t = tcg_temp_new_i32();
+ tcg_gen_extrl_i64_i32(t, s);
+ fn(t, tcg_env, t, tcg_constant_i32(i));
+ tcg_gen_extu_i32_i64(d, t);
+}
+
+static void gen_sqshli_b(TCGv_i64 d, TCGv_i64 s, int64_t i)
+{
+ trunc_i64_env_imm(d, s, i, gen_helper_neon_qshl_s8);
+}
+
+static void gen_sqshli_h(TCGv_i64 d, TCGv_i64 s, int64_t i)
+{
+ trunc_i64_env_imm(d, s, i, gen_helper_neon_qshl_s16);
+}
+
+static void gen_sqshli_s(TCGv_i64 d, TCGv_i64 s, int64_t i)
+{
+ trunc_i64_env_imm(d, s, i, gen_helper_neon_qshl_s32);
+}
+
+static void gen_sqshli_d(TCGv_i64 d, TCGv_i64 s, int64_t i)
+{
+ gen_helper_neon_qshl_s64(d, tcg_env, s, tcg_constant_i64(i));
+}
+
+static void gen_uqshli_b(TCGv_i64 d, TCGv_i64 s, int64_t i)
+{
+ trunc_i64_env_imm(d, s, i, gen_helper_neon_qshl_u8);
+}
+
+static void gen_uqshli_h(TCGv_i64 d, TCGv_i64 s, int64_t i)
+{
+ trunc_i64_env_imm(d, s, i, gen_helper_neon_qshl_u16);
+}
+
+static void gen_uqshli_s(TCGv_i64 d, TCGv_i64 s, int64_t i)
+{
+ trunc_i64_env_imm(d, s, i, gen_helper_neon_qshl_u32);
+}
+
+static void gen_uqshli_d(TCGv_i64 d, TCGv_i64 s, int64_t i)
+{
+ gen_helper_neon_qshl_u64(d, tcg_env, s, tcg_constant_i64(i));
+}
+
+static void gen_sqshlui_b(TCGv_i64 d, TCGv_i64 s, int64_t i)
+{
+ trunc_i64_env_imm(d, s, i, gen_helper_neon_qshlu_s8);
+}
+
+static void gen_sqshlui_h(TCGv_i64 d, TCGv_i64 s, int64_t i)
+{
+ trunc_i64_env_imm(d, s, i, gen_helper_neon_qshlu_s16);
+}
+
+static void gen_sqshlui_s(TCGv_i64 d, TCGv_i64 s, int64_t i)
+{
+ trunc_i64_env_imm(d, s, i, gen_helper_neon_qshlu_s32);
+}
+
+static void gen_sqshlui_d(TCGv_i64 d, TCGv_i64 s, int64_t i)
+{
+ gen_helper_neon_qshlu_s64(d, tcg_env, s, tcg_constant_i64(i));
+}
+
+static WideShiftImmFn * const f_scalar_sqshli[] = {
+ gen_sqshli_b, gen_sqshli_h, gen_sqshli_s, gen_sqshli_d
+};
+
+static WideShiftImmFn * const f_scalar_uqshli[] = {
+ gen_uqshli_b, gen_uqshli_h, gen_uqshli_s, gen_uqshli_d
+};
+
+static WideShiftImmFn * const f_scalar_sqshlui[] = {
+ gen_sqshlui_b, gen_sqshlui_h, gen_sqshlui_s, gen_sqshlui_d
+};
+
+/* Note that the helpers sign-extend their inputs, so don't do it here. */
+TRANS(SQSHL_si, do_scalar_shift_imm, a, f_scalar_sqshli[a->esz], false, 0)
+TRANS(UQSHL_si, do_scalar_shift_imm, a, f_scalar_uqshli[a->esz], false, 0)
+TRANS(SQSHLU_si, do_scalar_shift_imm, a, f_scalar_sqshlui[a->esz], false, 0)
+
/* Shift a TCGv src by TCGv shift_amount, put result in dst.
* Note that it is the caller's responsibility to ensure that the
* shift amount is in range (ie 0..31 or 0..63) and provide the ARM
@@ -9501,116 +9590,6 @@ static void handle_vec_simd_sqshrn(DisasContext *s, bool is_scalar, bool is_q,
clear_vec_high(s, is_q, rd);
}
-/* SQSHLU, UQSHL, SQSHL: saturating left shifts */
-static void handle_simd_qshl(DisasContext *s, bool scalar, bool is_q,
- bool src_unsigned, bool dst_unsigned,
- int immh, int immb, int rn, int rd)
-{
- int immhb = immh << 3 | immb;
- int size = 32 - clz32(immh) - 1;
- int shift = immhb - (8 << size);
- int pass;
-
- assert(immh != 0);
- assert(!(scalar && is_q));
-
- if (!scalar) {
- if (!is_q && extract32(immh, 3, 1)) {
- unallocated_encoding(s);
- return;
- }
-
- /* Since we use the variable-shift helpers we must
- * replicate the shift count into each element of
- * the tcg_shift value.
- */
- switch (size) {
- case 0:
- shift |= shift << 8;
- /* fall through */
- case 1:
- shift |= shift << 16;
- break;
- case 2:
- case 3:
- break;
- default:
- g_assert_not_reached();
- }
- }
-
- if (!fp_access_check(s)) {
- return;
- }
-
- if (size == 3) {
- TCGv_i64 tcg_shift = tcg_constant_i64(shift);
- static NeonGenTwo64OpEnvFn * const fns[2][2] = {
- { gen_helper_neon_qshl_s64, gen_helper_neon_qshlu_s64 },
- { NULL, gen_helper_neon_qshl_u64 },
- };
- NeonGenTwo64OpEnvFn *genfn = fns[src_unsigned][dst_unsigned];
- int maxpass = is_q ? 2 : 1;
-
- for (pass = 0; pass < maxpass; pass++) {
- TCGv_i64 tcg_op = tcg_temp_new_i64();
-
- read_vec_element(s, tcg_op, rn, pass, MO_64);
- genfn(tcg_op, tcg_env, tcg_op, tcg_shift);
- write_vec_element(s, tcg_op, rd, pass, MO_64);
- }
- clear_vec_high(s, is_q, rd);
- } else {
- TCGv_i32 tcg_shift = tcg_constant_i32(shift);
- static NeonGenTwoOpEnvFn * const fns[2][2][3] = {
- {
- { gen_helper_neon_qshl_s8,
- gen_helper_neon_qshl_s16,
- gen_helper_neon_qshl_s32 },
- { gen_helper_neon_qshlu_s8,
- gen_helper_neon_qshlu_s16,
- gen_helper_neon_qshlu_s32 }
- }, {
- { NULL, NULL, NULL },
- { gen_helper_neon_qshl_u8,
- gen_helper_neon_qshl_u16,
- gen_helper_neon_qshl_u32 }
- }
- };
- NeonGenTwoOpEnvFn *genfn = fns[src_unsigned][dst_unsigned][size];
- MemOp memop = scalar ? size : MO_32;
- int maxpass = scalar ? 1 : is_q ? 4 : 2;
-
- for (pass = 0; pass < maxpass; pass++) {
- TCGv_i32 tcg_op = tcg_temp_new_i32();
-
- read_vec_element_i32(s, tcg_op, rn, pass, memop);
- genfn(tcg_op, tcg_env, tcg_op, tcg_shift);
- if (scalar) {
- switch (size) {
- case 0:
- tcg_gen_ext8u_i32(tcg_op, tcg_op);
- break;
- case 1:
- tcg_gen_ext16u_i32(tcg_op, tcg_op);
- break;
- case 2:
- break;
- default:
- g_assert_not_reached();
- }
- write_fp_sreg(s, rd, tcg_op);
- } else {
- write_vec_element_i32(s, tcg_op, rd, pass, MO_32);
- }
- }
-
- if (!scalar) {
- clear_vec_high(s, is_q, rd);
- }
- }
-}
-
/* Common vector code for handling integer to FP conversion */
static void handle_simd_intfp_conv(DisasContext *s, int rd, int rn,
int elements, int is_signed,
@@ -9890,16 +9869,6 @@ static void disas_simd_scalar_shift_imm(DisasContext *s, uint32_t insn)
handle_vec_simd_sqshrn(s, true, false, is_u, is_u,
immh, immb, opcode, rn, rd);
break;
- case 0xc: /* SQSHLU */
- if (!is_u) {
- unallocated_encoding(s);
- return;
- }
- handle_simd_qshl(s, true, false, false, true, immh, immb, rn, rd);
- break;
- case 0xe: /* SQSHL, UQSHL */
- handle_simd_qshl(s, true, false, is_u, is_u, immh, immb, rn, rd);
- break;
case 0x1f: /* FCVTZS, FCVTZU */
handle_simd_shift_fpint_conv(s, true, false, is_u, immh, immb, rn, rd);
break;
@@ -9910,6 +9879,8 @@ static void disas_simd_scalar_shift_imm(DisasContext *s, uint32_t insn)
case 0x06: /* SRSRA / URSRA */
case 0x08: /* SRI */
case 0x0a: /* SHL / SLI */
+ case 0x0c: /* SQSHLU */
+ case 0x0e: /* SQSHL, UQSHL */
unallocated_encoding(s);
break;
}
@@ -10561,16 +10532,6 @@ static void disas_simd_shift_imm(DisasContext *s, uint32_t insn)
handle_simd_shift_intfp_conv(s, false, is_q, is_u, immh, immb,
opcode, rn, rd);
break;
- case 0xc: /* SQSHLU */
- if (!is_u) {
- unallocated_encoding(s);
- return;
- }
- handle_simd_qshl(s, false, is_q, false, true, immh, immb, rn, rd);
- break;
- case 0xe: /* SQSHL, UQSHL */
- handle_simd_qshl(s, false, is_q, is_u, is_u, immh, immb, rn, rd);
- break;
case 0x1f: /* FCVTZS/ FCVTZU */
handle_simd_shift_fpint_conv(s, false, is_q, is_u, immh, immb, rn, rd);
return;
@@ -10581,6 +10542,8 @@ static void disas_simd_shift_imm(DisasContext *s, uint32_t insn)
case 0x06: /* SRSRA / URSRA (accum + rounding) */
case 0x08: /* SRI */
case 0x0a: /* SHL / SLI */
+ case 0x0c: /* SQSHLU */
+ case 0x0e: /* SQSHL, UQSHL */
case 0x14: /* SSHLL / USHLL */
unallocated_encoding(s);
return;
diff --git a/target/arm/tcg/a64.decode b/target/arm/tcg/a64.decode
index 96803fe6e4..63e04ddfcd 100644
--- a/target/arm/tcg/a64.decode
+++ b/target/arm/tcg/a64.decode
@@ -1287,11 +1287,30 @@ RSHRN_v 0.00 11110 .... ... 10001 1 ..... ..... @q_shri_b
RSHRN_v 0.00 11110 .... ... 10001 1 ..... ..... @q_shri_h
RSHRN_v 0.00 11110 .... ... 10001 1 ..... ..... @q_shri_s
+SQSHL_vi 0.00 11110 .... ... 01110 1 ..... ..... @q_shli_b
+SQSHL_vi 0.00 11110 .... ... 01110 1 ..... ..... @q_shli_h
+SQSHL_vi 0.00 11110 .... ... 01110 1 ..... ..... @q_shli_s
+SQSHL_vi 0.00 11110 .... ... 01110 1 ..... ..... @q_shli_d
+
+UQSHL_vi 0.10 11110 .... ... 01110 1 ..... ..... @q_shli_b
+UQSHL_vi 0.10 11110 .... ... 01110 1 ..... ..... @q_shli_h
+UQSHL_vi 0.10 11110 .... ... 01110 1 ..... ..... @q_shli_s
+UQSHL_vi 0.10 11110 .... ... 01110 1 ..... ..... @q_shli_d
+
+SQSHLU_vi 0.10 11110 .... ... 01100 1 ..... ..... @q_shli_b
+SQSHLU_vi 0.10 11110 .... ... 01100 1 ..... ..... @q_shli_h
+SQSHLU_vi 0.10 11110 .... ... 01100 1 ..... ..... @q_shli_s
+SQSHLU_vi 0.10 11110 .... ... 01100 1 ..... ..... @q_shli_d
+
# Advanced SIMD scalar shift by immediate
@shri_d .... ..... 1 ...... ..... . rn:5 rd:5 \
&rri_e esz=3 imm=%neon_rshift_i6
-@shli_d .... ..... 1 imm:6 ..... . rn:5 rd:5 &rri_e esz=3
+
+@shli_b .... ..... 0001 imm:3 ..... . rn:5 rd:5 &rri_e esz=0
+@shli_h .... ..... 001 imm:4 ..... . rn:5 rd:5 &rri_e esz=1
+@shli_s .... ..... 01 imm:5 ..... . rn:5 rd:5 &rri_e esz=2
+@shli_d .... ..... 1 imm:6 ..... . rn:5 rd:5 &rri_e esz=3
SSHR_s 0101 11110 .... ... 00000 1 ..... ..... @shri_d
USHR_s 0111 11110 .... ... 00000 1 ..... ..... @shri_d
@@ -1305,3 +1324,18 @@ SRI_s 0111 11110 .... ... 01000 1 ..... ..... @shri_d
SHL_s 0101 11110 .... ... 01010 1 ..... ..... @shli_d
SLI_s 0111 11110 .... ... 01010 1 ..... ..... @shli_d
+
+SQSHL_si 0101 11110 .... ... 01110 1 ..... ..... @shli_b
+SQSHL_si 0101 11110 .... ... 01110 1 ..... ..... @shli_h
+SQSHL_si 0101 11110 .... ... 01110 1 ..... ..... @shli_s
+SQSHL_si 0101 11110 .... ... 01110 1 ..... ..... @shli_d
+
+UQSHL_si 0111 11110 .... ... 01110 1 ..... ..... @shli_b
+UQSHL_si 0111 11110 .... ... 01110 1 ..... ..... @shli_h
+UQSHL_si 0111 11110 .... ... 01110 1 ..... ..... @shli_s
+UQSHL_si 0111 11110 .... ... 01110 1 ..... ..... @shli_d
+
+SQSHLU_si 0111 11110 .... ... 01100 1 ..... ..... @shli_b
+SQSHLU_si 0111 11110 .... ... 01100 1 ..... ..... @shli_h
+SQSHLU_si 0111 11110 .... ... 01100 1 ..... ..... @shli_s
+SQSHLU_si 0111 11110 .... ... 01100 1 ..... ..... @shli_d
--
2.43.0
* [PATCH v3 28/29] target/arm: Convert vector [US]QSHRN, [US]QRSHRN, SQSHRUN to decodetree
2024-09-12 2:40 [PATCH v3 00/29] target/arm: AdvSIMD decodetree conversion, part 4 Richard Henderson
` (26 preceding siblings ...)
2024-09-12 2:41 ` [PATCH v3 27/29] target/arm: Convert SQSHL, UQSHL, SQSHLU (immediate) to decodetree Richard Henderson
@ 2024-09-12 2:41 ` Richard Henderson
2024-09-12 2:41 ` [PATCH v3 29/29] target/arm: Convert scalar " Richard Henderson
2024-09-16 14:25 ` [PATCH v3 00/29] target/arm: AdvSIMD decodetree conversion, part 4 Peter Maydell
29 siblings, 0 replies; 32+ messages in thread
From: Richard Henderson @ 2024-09-12 2:41 UTC (permalink / raw)
To: qemu-devel; +Cc: qemu-arm, Peter Maydell
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
target/arm/tcg/translate-a64.c | 176 ++++++++++++++++++++++++++++++---
target/arm/tcg/a64.decode | 24 +++++
2 files changed, 186 insertions(+), 14 deletions(-)
diff --git a/target/arm/tcg/translate-a64.c b/target/arm/tcg/translate-a64.c
index 77324e0145..e6290e1145 100644
--- a/target/arm/tcg/translate-a64.c
+++ b/target/arm/tcg/translate-a64.c
@@ -7163,6 +7163,122 @@ static bool do_vec_shift_imm_narrow(DisasContext *s, arg_qrri_e *a,
return true;
}
+static void gen_sqshrn_b(TCGv_i64 d, TCGv_i64 s, int64_t i)
+{
+ tcg_gen_sari_i64(d, s, i);
+ tcg_gen_ext16u_i64(d, d);
+ gen_helper_neon_narrow_sat_s8(d, tcg_env, d);
+}
+
+static void gen_sqshrn_h(TCGv_i64 d, TCGv_i64 s, int64_t i)
+{
+ tcg_gen_sari_i64(d, s, i);
+ tcg_gen_ext32u_i64(d, d);
+ gen_helper_neon_narrow_sat_s16(d, tcg_env, d);
+}
+
+static void gen_sqshrn_s(TCGv_i64 d, TCGv_i64 s, int64_t i)
+{
+ gen_sshr_d(d, s, i);
+ gen_helper_neon_narrow_sat_s32(d, tcg_env, d);
+}
+
+static void gen_uqshrn_b(TCGv_i64 d, TCGv_i64 s, int64_t i)
+{
+ tcg_gen_shri_i64(d, s, i);
+ gen_helper_neon_narrow_sat_u8(d, tcg_env, d);
+}
+
+static void gen_uqshrn_h(TCGv_i64 d, TCGv_i64 s, int64_t i)
+{
+ tcg_gen_shri_i64(d, s, i);
+ gen_helper_neon_narrow_sat_u16(d, tcg_env, d);
+}
+
+static void gen_uqshrn_s(TCGv_i64 d, TCGv_i64 s, int64_t i)
+{
+ gen_ushr_d(d, s, i);
+ gen_helper_neon_narrow_sat_u32(d, tcg_env, d);
+}
+
+static void gen_sqshrun_b(TCGv_i64 d, TCGv_i64 s, int64_t i)
+{
+ tcg_gen_sari_i64(d, s, i);
+ tcg_gen_ext16u_i64(d, d);
+ gen_helper_neon_unarrow_sat8(d, tcg_env, d);
+}
+
+static void gen_sqshrun_h(TCGv_i64 d, TCGv_i64 s, int64_t i)
+{
+ tcg_gen_sari_i64(d, s, i);
+ tcg_gen_ext32u_i64(d, d);
+ gen_helper_neon_unarrow_sat16(d, tcg_env, d);
+}
+
+static void gen_sqshrun_s(TCGv_i64 d, TCGv_i64 s, int64_t i)
+{
+ gen_sshr_d(d, s, i);
+ gen_helper_neon_unarrow_sat32(d, tcg_env, d);
+}
+
+static void gen_sqrshrn_b(TCGv_i64 d, TCGv_i64 s, int64_t i)
+{
+ gen_srshr_bhs(d, s, i);
+ tcg_gen_ext16u_i64(d, d);
+ gen_helper_neon_narrow_sat_s8(d, tcg_env, d);
+}
+
+static void gen_sqrshrn_h(TCGv_i64 d, TCGv_i64 s, int64_t i)
+{
+ gen_srshr_bhs(d, s, i);
+ tcg_gen_ext32u_i64(d, d);
+ gen_helper_neon_narrow_sat_s16(d, tcg_env, d);
+}
+
+static void gen_sqrshrn_s(TCGv_i64 d, TCGv_i64 s, int64_t i)
+{
+ gen_srshr_d(d, s, i);
+ gen_helper_neon_narrow_sat_s32(d, tcg_env, d);
+}
+
+static void gen_uqrshrn_b(TCGv_i64 d, TCGv_i64 s, int64_t i)
+{
+ gen_urshr_bhs(d, s, i);
+ gen_helper_neon_narrow_sat_u8(d, tcg_env, d);
+}
+
+static void gen_uqrshrn_h(TCGv_i64 d, TCGv_i64 s, int64_t i)
+{
+ gen_urshr_bhs(d, s, i);
+ gen_helper_neon_narrow_sat_u16(d, tcg_env, d);
+}
+
+static void gen_uqrshrn_s(TCGv_i64 d, TCGv_i64 s, int64_t i)
+{
+ gen_urshr_d(d, s, i);
+ gen_helper_neon_narrow_sat_u32(d, tcg_env, d);
+}
+
+static void gen_sqrshrun_b(TCGv_i64 d, TCGv_i64 s, int64_t i)
+{
+ gen_srshr_bhs(d, s, i);
+ tcg_gen_ext16u_i64(d, d);
+ gen_helper_neon_unarrow_sat8(d, tcg_env, d);
+}
+
+static void gen_sqrshrun_h(TCGv_i64 d, TCGv_i64 s, int64_t i)
+{
+ gen_srshr_bhs(d, s, i);
+ tcg_gen_ext32u_i64(d, d);
+ gen_helper_neon_unarrow_sat16(d, tcg_env, d);
+}
+
+static void gen_sqrshrun_s(TCGv_i64 d, TCGv_i64 s, int64_t i)
+{
+ gen_srshr_d(d, s, i);
+ gen_helper_neon_unarrow_sat32(d, tcg_env, d);
+}
+
static WideShiftImmFn * const shrn_fns[] = {
tcg_gen_shri_i64,
tcg_gen_shri_i64,
@@ -7177,6 +7293,48 @@ static WideShiftImmFn * const rshrn_fns[] = {
};
TRANS(RSHRN_v, do_vec_shift_imm_narrow, a, rshrn_fns, 0)
+static WideShiftImmFn * const sqshrn_fns[] = {
+ gen_sqshrn_b,
+ gen_sqshrn_h,
+ gen_sqshrn_s,
+};
+TRANS(SQSHRN_v, do_vec_shift_imm_narrow, a, sqshrn_fns, MO_SIGN)
+
+static WideShiftImmFn * const uqshrn_fns[] = {
+ gen_uqshrn_b,
+ gen_uqshrn_h,
+ gen_uqshrn_s,
+};
+TRANS(UQSHRN_v, do_vec_shift_imm_narrow, a, uqshrn_fns, 0)
+
+static WideShiftImmFn * const sqshrun_fns[] = {
+ gen_sqshrun_b,
+ gen_sqshrun_h,
+ gen_sqshrun_s,
+};
+TRANS(SQSHRUN_v, do_vec_shift_imm_narrow, a, sqshrun_fns, MO_SIGN)
+
+static WideShiftImmFn * const sqrshrn_fns[] = {
+ gen_sqrshrn_b,
+ gen_sqrshrn_h,
+ gen_sqrshrn_s,
+};
+TRANS(SQRSHRN_v, do_vec_shift_imm_narrow, a, sqrshrn_fns, MO_SIGN)
+
+static WideShiftImmFn * const uqrshrn_fns[] = {
+ gen_uqrshrn_b,
+ gen_uqrshrn_h,
+ gen_uqrshrn_s,
+};
+TRANS(UQRSHRN_v, do_vec_shift_imm_narrow, a, uqrshrn_fns, 0)
+
+static WideShiftImmFn * const sqrshrun_fns[] = {
+ gen_sqrshrun_b,
+ gen_sqrshrun_h,
+ gen_sqrshrun_s,
+};
+TRANS(SQRSHRUN_v, do_vec_shift_imm_narrow, a, sqrshrun_fns, MO_SIGN)
+
/*
* Advanced SIMD Scalar Shift by Immediate
*/
@@ -10514,20 +10672,6 @@ static void disas_simd_shift_imm(DisasContext *s, uint32_t insn)
}
switch (opcode) {
- case 0x10: /* SHRN / SQSHRUN */
- case 0x11: /* RSHRN / SQRSHRUN */
- if (is_u) {
- handle_vec_simd_sqshrn(s, false, is_q, false, true, immh, immb,
- opcode, rn, rd);
- } else {
- unallocated_encoding(s);
- }
- break;
- case 0x12: /* SQSHRN / UQSHRN */
- case 0x13: /* SQRSHRN / UQRSHRN */
- handle_vec_simd_sqshrn(s, false, is_q, is_u, is_u, immh, immb,
- opcode, rn, rd);
- break;
case 0x1c: /* SCVTF / UCVTF */
handle_simd_shift_intfp_conv(s, false, is_q, is_u, immh, immb,
opcode, rn, rd);
@@ -10544,6 +10688,10 @@ static void disas_simd_shift_imm(DisasContext *s, uint32_t insn)
case 0x0a: /* SHL / SLI */
case 0x0c: /* SQSHLU */
case 0x0e: /* SQSHL, UQSHL */
+ case 0x10: /* SHRN / SQSHRUN */
+ case 0x11: /* RSHRN / SQRSHRUN */
+ case 0x12: /* SQSHRN / UQSHRN */
+ case 0x13: /* SQRSHRN / UQRSHRN */
case 0x14: /* SSHLL / USHLL */
unallocated_encoding(s);
return;
diff --git a/target/arm/tcg/a64.decode b/target/arm/tcg/a64.decode
index 63e04ddfcd..042dc79d88 100644
--- a/target/arm/tcg/a64.decode
+++ b/target/arm/tcg/a64.decode
@@ -1302,6 +1302,30 @@ SQSHLU_vi 0.10 11110 .... ... 01100 1 ..... ..... @q_shli_h
SQSHLU_vi 0.10 11110 .... ... 01100 1 ..... ..... @q_shli_s
SQSHLU_vi 0.10 11110 .... ... 01100 1 ..... ..... @q_shli_d
+SQSHRN_v 0.00 11110 .... ... 10010 1 ..... ..... @q_shri_b
+SQSHRN_v 0.00 11110 .... ... 10010 1 ..... ..... @q_shri_h
+SQSHRN_v 0.00 11110 .... ... 10010 1 ..... ..... @q_shri_s
+
+UQSHRN_v 0.10 11110 .... ... 10010 1 ..... ..... @q_shri_b
+UQSHRN_v 0.10 11110 .... ... 10010 1 ..... ..... @q_shri_h
+UQSHRN_v 0.10 11110 .... ... 10010 1 ..... ..... @q_shri_s
+
+SQSHRUN_v 0.10 11110 .... ... 10000 1 ..... ..... @q_shri_b
+SQSHRUN_v 0.10 11110 .... ... 10000 1 ..... ..... @q_shri_h
+SQSHRUN_v 0.10 11110 .... ... 10000 1 ..... ..... @q_shri_s
+
+SQRSHRN_v 0.00 11110 .... ... 10011 1 ..... ..... @q_shri_b
+SQRSHRN_v 0.00 11110 .... ... 10011 1 ..... ..... @q_shri_h
+SQRSHRN_v 0.00 11110 .... ... 10011 1 ..... ..... @q_shri_s
+
+UQRSHRN_v 0.10 11110 .... ... 10011 1 ..... ..... @q_shri_b
+UQRSHRN_v 0.10 11110 .... ... 10011 1 ..... ..... @q_shri_h
+UQRSHRN_v 0.10 11110 .... ... 10011 1 ..... ..... @q_shri_s
+
+SQRSHRUN_v 0.10 11110 .... ... 10001 1 ..... ..... @q_shri_b
+SQRSHRUN_v 0.10 11110 .... ... 10001 1 ..... ..... @q_shri_h
+SQRSHRUN_v 0.10 11110 .... ... 10001 1 ..... ..... @q_shri_s
+
# Advanced SIMD scalar shift by immediate
@shri_d .... ..... 1 ...... ..... . rn:5 rd:5 \
--
2.43.0
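
The gen_sqshrn_* and gen_uqshrn_* helpers above build each narrowing lane from a plain shift followed by one of the existing neon_narrow_sat helpers. For reference, what a single SQSHRN byte lane computes can be sketched as below; this is illustrative only, not QEMU code, and it assumes the compiler implements right shift of negative values arithmetically (true of gcc and clang).

#include <stdint.h>
#include <stdio.h>

/* Reference model for one SQSHRN lane narrowing 16 -> 8 bits:
 * arithmetic shift right, then saturate to the signed 8-bit range. */
static int8_t sqshrn_b_ref(int16_t x, int shift)    /* 1 <= shift <= 8 */
{
    int32_t v = x >> shift;

    if (v > INT8_MAX) {
        v = INT8_MAX;
    } else if (v < INT8_MIN) {
        v = INT8_MIN;
    }
    return (int8_t)v;
}

int main(void)
{
    /* 0x4000 >> 3 = 2048 does not fit in int8_t, so it saturates to 127 */
    printf("%d\n", sqshrn_b_ref(0x4000, 3));
    /* INT16_MIN >> 8 = -128 is exactly representable */
    printf("%d\n", sqshrn_b_ref(INT16_MIN, 8));
    return 0;
}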
^ permalink raw reply related [flat|nested] 32+ messages in thread
* [PATCH v3 29/29] target/arm: Convert scalar [US]QSHRN, [US]QRSHRN, SQSHRUN to decodetree
2024-09-12 2:40 [PATCH v3 00/29] target/arm: AdvSIMD decodetree conversion, part 4 Richard Henderson
` (27 preceding siblings ...)
2024-09-12 2:41 ` [PATCH v3 28/29] target/arm: Convert vector [US]QSHRN, [US]QRSHRN, SQSHRUN " Richard Henderson
@ 2024-09-12 2:41 ` Richard Henderson
2024-09-16 14:25 ` [PATCH v3 00/29] target/arm: AdvSIMD decodetree conversion, part 4 Peter Maydell
29 siblings, 0 replies; 32+ messages in thread
From: Richard Henderson @ 2024-09-12 2:41 UTC (permalink / raw)
To: qemu-devel; +Cc: qemu-arm, Peter Maydell
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
target/arm/tcg/translate-a64.c | 160 +++++++--------------------------
target/arm/tcg/a64.decode | 30 +++++++
2 files changed, 63 insertions(+), 127 deletions(-)
diff --git a/target/arm/tcg/translate-a64.c b/target/arm/tcg/translate-a64.c
index e6290e1145..071b6349fc 100644
--- a/target/arm/tcg/translate-a64.c
+++ b/target/arm/tcg/translate-a64.c
@@ -7456,6 +7456,35 @@ TRANS(SQSHL_si, do_scalar_shift_imm, a, f_scalar_sqshli[a->esz], false, 0)
TRANS(UQSHL_si, do_scalar_shift_imm, a, f_scalar_uqshli[a->esz], false, 0)
TRANS(SQSHLU_si, do_scalar_shift_imm, a, f_scalar_sqshlui[a->esz], false, 0)
+static bool do_scalar_shift_imm_narrow(DisasContext *s, arg_rri_e *a,
+ WideShiftImmFn * const fns[3],
+ MemOp sign, bool zext)
+{
+ MemOp esz = a->esz;
+
+ tcg_debug_assert(esz >= MO_8 && esz <= MO_32);
+
+ if (fp_access_check(s)) {
+ TCGv_i64 rd = tcg_temp_new_i64();
+ TCGv_i64 rn = tcg_temp_new_i64();
+
+ read_vec_element(s, rn, a->rn, 0, (esz + 1) | sign);
+ fns[esz](rd, rn, a->imm);
+ if (zext) {
+ tcg_gen_ext_i64(rd, rd, esz);
+ }
+ write_fp_dreg(s, a->rd, rd);
+ }
+ return true;
+}
+
+TRANS(SQSHRN_si, do_scalar_shift_imm_narrow, a, sqshrn_fns, MO_SIGN, true)
+TRANS(SQRSHRN_si, do_scalar_shift_imm_narrow, a, sqrshrn_fns, MO_SIGN, true)
+TRANS(UQSHRN_si, do_scalar_shift_imm_narrow, a, uqshrn_fns, 0, false)
+TRANS(UQRSHRN_si, do_scalar_shift_imm_narrow, a, uqrshrn_fns, 0, false)
+TRANS(SQSHRUN_si, do_scalar_shift_imm_narrow, a, sqshrun_fns, MO_SIGN, false)
+TRANS(SQRSHRUN_si, do_scalar_shift_imm_narrow, a, sqrshrun_fns, MO_SIGN, false)
+
/* Shift a TCGv src by TCGv shift_amount, put result in dst.
* Note that it is the caller's responsibility to ensure that the
* shift amount is in range (ie 0..31 or 0..63) and provide the ARM
@@ -9635,119 +9664,6 @@ static void disas_data_proc_fp(DisasContext *s, uint32_t insn)
}
}
-/*
- * Common SSHR[RA]/USHR[RA] - Shift right (optional rounding/accumulate)
- *
- * This code is handles the common shifting code and is used by both
- * the vector and scalar code.
- */
-static void handle_shri_with_rndacc(TCGv_i64 tcg_res, TCGv_i64 tcg_src,
- bool round, bool accumulate,
- bool is_u, int size, int shift)
-{
- if (!round) {
- if (is_u) {
- gen_ushr_d(tcg_src, tcg_src, shift);
- } else {
- gen_sshr_d(tcg_src, tcg_src, shift);
- }
- } else if (size == MO_64) {
- if (is_u) {
- gen_urshr_d(tcg_src, tcg_src, shift);
- } else {
- gen_srshr_d(tcg_src, tcg_src, shift);
- }
- } else {
- if (is_u) {
- gen_urshr_bhs(tcg_src, tcg_src, shift);
- } else {
- gen_srshr_bhs(tcg_src, tcg_src, shift);
- }
- }
-
- if (accumulate) {
- tcg_gen_add_i64(tcg_res, tcg_res, tcg_src);
- } else {
- tcg_gen_mov_i64(tcg_res, tcg_src);
- }
-}
-
-/* SQSHRN/SQSHRUN - Saturating (signed/unsigned) shift right with
- * (signed/unsigned) narrowing */
-static void handle_vec_simd_sqshrn(DisasContext *s, bool is_scalar, bool is_q,
- bool is_u_shift, bool is_u_narrow,
- int immh, int immb, int opcode,
- int rn, int rd)
-{
- int immhb = immh << 3 | immb;
- int size = 32 - clz32(immh) - 1;
- int esize = 8 << size;
- int shift = (2 * esize) - immhb;
- int elements = is_scalar ? 1 : (64 / esize);
- bool round = extract32(opcode, 0, 1);
- MemOp ldop = (size + 1) | (is_u_shift ? 0 : MO_SIGN);
- TCGv_i64 tcg_rn, tcg_rd, tcg_final;
-
- static NeonGenOne64OpEnvFn * const signed_narrow_fns[4][2] = {
- { gen_helper_neon_narrow_sat_s8,
- gen_helper_neon_unarrow_sat8 },
- { gen_helper_neon_narrow_sat_s16,
- gen_helper_neon_unarrow_sat16 },
- { gen_helper_neon_narrow_sat_s32,
- gen_helper_neon_unarrow_sat32 },
- { NULL, NULL },
- };
- static NeonGenOne64OpEnvFn * const unsigned_narrow_fns[4] = {
- gen_helper_neon_narrow_sat_u8,
- gen_helper_neon_narrow_sat_u16,
- gen_helper_neon_narrow_sat_u32,
- NULL
- };
- NeonGenOne64OpEnvFn *narrowfn;
-
- int i;
-
- assert(size < 4);
-
- if (extract32(immh, 3, 1)) {
- unallocated_encoding(s);
- return;
- }
-
- if (!fp_access_check(s)) {
- return;
- }
-
- if (is_u_shift) {
- narrowfn = unsigned_narrow_fns[size];
- } else {
- narrowfn = signed_narrow_fns[size][is_u_narrow ? 1 : 0];
- }
-
- tcg_rn = tcg_temp_new_i64();
- tcg_rd = tcg_temp_new_i64();
- tcg_final = tcg_temp_new_i64();
-
- for (i = 0; i < elements; i++) {
- read_vec_element(s, tcg_rn, rn, i, ldop);
- handle_shri_with_rndacc(tcg_rd, tcg_rn, round,
- false, is_u_shift, size+1, shift);
- narrowfn(tcg_rd, tcg_env, tcg_rd);
- if (i == 0) {
- tcg_gen_extract_i64(tcg_final, tcg_rd, 0, esize);
- } else {
- tcg_gen_deposit_i64(tcg_final, tcg_final, tcg_rd, esize * i, esize);
- }
- }
-
- if (!is_q) {
- write_vec_element(s, tcg_final, rd, 0, MO_64);
- } else {
- write_vec_element(s, tcg_final, rd, 1, MO_64);
- }
- clear_vec_high(s, is_q, rd);
-}
-
/* Common vector code for handling integer to FP conversion */
static void handle_simd_intfp_conv(DisasContext *s, int rd, int rn,
int elements, int is_signed,
@@ -10013,20 +9929,6 @@ static void disas_simd_scalar_shift_imm(DisasContext *s, uint32_t insn)
handle_simd_shift_intfp_conv(s, true, false, is_u, immh, immb,
opcode, rn, rd);
break;
- case 0x10: /* SQSHRUN, SQSHRUN2 */
- case 0x11: /* SQRSHRUN, SQRSHRUN2 */
- if (!is_u) {
- unallocated_encoding(s);
- return;
- }
- handle_vec_simd_sqshrn(s, true, false, false, true,
- immh, immb, opcode, rn, rd);
- break;
- case 0x12: /* SQSHRN, SQSHRN2, UQSHRN */
- case 0x13: /* SQRSHRN, SQRSHRN2, UQRSHRN, UQRSHRN2 */
- handle_vec_simd_sqshrn(s, true, false, is_u, is_u,
- immh, immb, opcode, rn, rd);
- break;
case 0x1f: /* FCVTZS, FCVTZU */
handle_simd_shift_fpint_conv(s, true, false, is_u, immh, immb, rn, rd);
break;
@@ -10039,6 +9941,10 @@ static void disas_simd_scalar_shift_imm(DisasContext *s, uint32_t insn)
case 0x0a: /* SHL / SLI */
case 0x0c: /* SQSHLU */
case 0x0e: /* SQSHL, UQSHL */
+ case 0x10: /* SQSHRUN */
+ case 0x11: /* SQRSHRUN */
+ case 0x12: /* SQSHRN, UQSHRN */
+ case 0x13: /* SQRSHRN, UQRSHRN */
unallocated_encoding(s);
break;
}
diff --git a/target/arm/tcg/a64.decode b/target/arm/tcg/a64.decode
index 042dc79d88..331a8e180c 100644
--- a/target/arm/tcg/a64.decode
+++ b/target/arm/tcg/a64.decode
@@ -1328,6 +1328,12 @@ SQRSHRUN_v 0.10 11110 .... ... 10001 1 ..... ..... @q_shri_s
# Advanced SIMD scalar shift by immediate
+@shri_b .... ..... 0001 ... ..... . rn:5 rd:5 \
+ &rri_e esz=0 imm=%neon_rshift_i3
+@shri_h .... ..... 001 .... ..... . rn:5 rd:5 \
+ &rri_e esz=1 imm=%neon_rshift_i4
+@shri_s .... ..... 01 ..... ..... . rn:5 rd:5 \
+ &rri_e esz=2 imm=%neon_rshift_i5
@shri_d .... ..... 1 ...... ..... . rn:5 rd:5 \
&rri_e esz=3 imm=%neon_rshift_i6
@@ -1363,3 +1369,27 @@ SQSHLU_si 0111 11110 .... ... 01100 1 ..... ..... @shli_b
SQSHLU_si 0111 11110 .... ... 01100 1 ..... ..... @shli_h
SQSHLU_si 0111 11110 .... ... 01100 1 ..... ..... @shli_s
SQSHLU_si 0111 11110 .... ... 01100 1 ..... ..... @shli_d
+
+SQSHRN_si 0101 11110 .... ... 10010 1 ..... ..... @shri_b
+SQSHRN_si 0101 11110 .... ... 10010 1 ..... ..... @shri_h
+SQSHRN_si 0101 11110 .... ... 10010 1 ..... ..... @shri_s
+
+UQSHRN_si 0111 11110 .... ... 10010 1 ..... ..... @shri_b
+UQSHRN_si 0111 11110 .... ... 10010 1 ..... ..... @shri_h
+UQSHRN_si 0111 11110 .... ... 10010 1 ..... ..... @shri_s
+
+SQSHRUN_si 0111 11110 .... ... 10000 1 ..... ..... @shri_b
+SQSHRUN_si 0111 11110 .... ... 10000 1 ..... ..... @shri_h
+SQSHRUN_si 0111 11110 .... ... 10000 1 ..... ..... @shri_s
+
+SQRSHRN_si 0101 11110 .... ... 10011 1 ..... ..... @shri_b
+SQRSHRN_si 0101 11110 .... ... 10011 1 ..... ..... @shri_h
+SQRSHRN_si 0101 11110 .... ... 10011 1 ..... ..... @shri_s
+
+UQRSHRN_si 0111 11110 .... ... 10011 1 ..... ..... @shri_b
+UQRSHRN_si 0111 11110 .... ... 10011 1 ..... ..... @shri_h
+UQRSHRN_si 0111 11110 .... ... 10011 1 ..... ..... @shri_s
+
+SQRSHRUN_si 0111 11110 .... ... 10001 1 ..... ..... @shri_b
+SQRSHRUN_si 0111 11110 .... ... 10001 1 ..... ..... @shri_h
+SQRSHRUN_si 0111 11110 .... ... 10001 1 ..... ..... @shri_s
--
2.43.0
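
The rounding forms (SQRSHRN, UQRSHRN, SQRSHRUN and their scalar variants) go through gen_srshr_*/gen_urshr_* rather than a plain shift; the rounding amounts to adding 1 << (shift - 1) before shifting. The sketch below is illustrative only, not QEMU code; it ignores the overflow corner case that the real 64-bit helpers handle separately, and assumes arithmetic right shift of negative values.

#include <stdint.h>
#include <stdio.h>

/* Rounding right shift: add half of 1 << shift, then shift.
 * This rounds the quotient to nearest, with ties toward +infinity. */
static int64_t srshr_ref(int64_t x, int shift)   /* 1 <= shift <= 32 here */
{
    int64_t round = (int64_t)1 << (shift - 1);

    return (x + round) >> shift;
}

int main(void)
{
    printf("%lld\n", (long long)srshr_ref(5, 1));    /* (5 + 1) >> 1  == 3  */
    printf("%lld\n", (long long)srshr_ref(-5, 1));   /* (-5 + 1) >> 1 == -2 */
    printf("%lld\n", (long long)srshr_ref(7, 2));    /* (7 + 2) >> 2  == 2  */
    return 0;
}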
^ permalink raw reply related [flat|nested] 32+ messages in thread
* Re: [PATCH v3 00/29] target/arm: AdvSIMD decodetree conversion, part 4
2024-09-12 2:40 [PATCH v3 00/29] target/arm: AdvSIMD decodetree conversion, part 4 Richard Henderson
` (28 preceding siblings ...)
2024-09-12 2:41 ` [PATCH v3 29/29] target/arm: Convert scalar " Richard Henderson
@ 2024-09-16 14:25 ` Peter Maydell
29 siblings, 0 replies; 32+ messages in thread
From: Peter Maydell @ 2024-09-16 14:25 UTC (permalink / raw)
To: Richard Henderson; +Cc: qemu-devel, qemu-arm
On Thu, 12 Sept 2024 at 03:43, Richard Henderson
<richard.henderson@linaro.org> wrote:
>
> Changes for v3:
> - Zero-extend results in Widen NeonGenNarrowEnvFn return to 64 bits
>
> Only patch 26 needs review.
>
Applied to target-arm.next, thanks.
-- PMM
^ permalink raw reply [flat|nested] 32+ messages in thread