* [Qemu-devel] [PATCH v2 1/2] target-arm: Move Neon VUZP to helper functions
2011-02-14 10:22 [Qemu-devel] [PATCH v2 0/2] target-arm: fix Neon VUZP, VZIP instructions Peter Maydell
@ 2011-02-14 10:22 ` Peter Maydell
2011-02-14 10:22 ` [Qemu-devel] [PATCH v2 2/2] target-arm: Move Neon VZIP " Peter Maydell
2011-02-20 16:35 ` [Qemu-devel] [PATCH v2 0/2] target-arm: fix Neon VUZP, VZIP instructions Aurelien Jarno
2 siblings, 0 replies; 4+ messages in thread
From: Peter Maydell @ 2011-02-14 10:22 UTC (permalink / raw)
To: qemu-devel; +Cc: patches
Move the implementation of the Neon VUZP unzip instruction from inline
code to helper functions. (At 50+ TCG ops it was well over the
recommended limit for coding inline.) The helper implementations also
fix the handling of the quadword version of the instruction.
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
---
target-arm/helpers.h | 6 +++
target-arm/neon_helper.c | 94 ++++++++++++++++++++++++++++++++++++++
target-arm/translate.c | 112 +++++++++++++++-------------------------------
3 files changed, 137 insertions(+), 75 deletions(-)
diff --git a/target-arm/helpers.h b/target-arm/helpers.h
index 77f1635..88c2fce 100644
--- a/target-arm/helpers.h
+++ b/target-arm/helpers.h
@@ -460,4 +460,10 @@ DEF_HELPER_3(iwmmxt_muladdswl, i64, i64, i32, i32)
DEF_HELPER_2(set_teecr, void, env, i32)
+DEF_HELPER_3(neon_unzip8, void, env, i32, i32)
+DEF_HELPER_3(neon_unzip16, void, env, i32, i32)
+DEF_HELPER_3(neon_qunzip8, void, env, i32, i32)
+DEF_HELPER_3(neon_qunzip16, void, env, i32, i32)
+DEF_HELPER_3(neon_qunzip32, void, env, i32, i32)
+
#include "def-helper.h"
diff --git a/target-arm/neon_helper.c b/target-arm/neon_helper.c
index dc09968..6e2a2fd 100644
--- a/target-arm/neon_helper.c
+++ b/target-arm/neon_helper.c
@@ -1663,3 +1663,97 @@ uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b)
float32 f1 = float32_abs(vfp_itos(b));
return (float32_compare_quiet(f0, f1, NFS) > 0) ? ~0 : 0;
}
+
+#define ELEM(V, N, SIZE) (((V) >> ((N) * (SIZE))) & ((1ull << (SIZE)) - 1))
+
+void HELPER(neon_qunzip8)(CPUState *env, uint32_t rd, uint32_t rm)
+{
+ uint64_t zm0 = float64_val(env->vfp.regs[rm]);
+ uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
+ uint64_t zd0 = float64_val(env->vfp.regs[rd]);
+ uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]);
+ uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zd0, 2, 8) << 8)
+ | (ELEM(zd0, 4, 8) << 16) | (ELEM(zd0, 6, 8) << 24)
+ | (ELEM(zd1, 0, 8) << 32) | (ELEM(zd1, 2, 8) << 40)
+ | (ELEM(zd1, 4, 8) << 48) | (ELEM(zd1, 6, 8) << 56);
+ uint64_t d1 = ELEM(zm0, 0, 8) | (ELEM(zm0, 2, 8) << 8)
+ | (ELEM(zm0, 4, 8) << 16) | (ELEM(zm0, 6, 8) << 24)
+ | (ELEM(zm1, 0, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
+ | (ELEM(zm1, 4, 8) << 48) | (ELEM(zm1, 6, 8) << 56);
+ uint64_t m0 = ELEM(zd0, 1, 8) | (ELEM(zd0, 3, 8) << 8)
+ | (ELEM(zd0, 5, 8) << 16) | (ELEM(zd0, 7, 8) << 24)
+ | (ELEM(zd1, 1, 8) << 32) | (ELEM(zd1, 3, 8) << 40)
+ | (ELEM(zd1, 5, 8) << 48) | (ELEM(zd1, 7, 8) << 56);
+ uint64_t m1 = ELEM(zm0, 1, 8) | (ELEM(zm0, 3, 8) << 8)
+ | (ELEM(zm0, 5, 8) << 16) | (ELEM(zm0, 7, 8) << 24)
+ | (ELEM(zm1, 1, 8) << 32) | (ELEM(zm1, 3, 8) << 40)
+ | (ELEM(zm1, 5, 8) << 48) | (ELEM(zm1, 7, 8) << 56);
+ env->vfp.regs[rm] = make_float64(m0);
+ env->vfp.regs[rm + 1] = make_float64(m1);
+ env->vfp.regs[rd] = make_float64(d0);
+ env->vfp.regs[rd + 1] = make_float64(d1);
+}
+
+void HELPER(neon_qunzip16)(CPUState *env, uint32_t rd, uint32_t rm)
+{
+ uint64_t zm0 = float64_val(env->vfp.regs[rm]);
+ uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
+ uint64_t zd0 = float64_val(env->vfp.regs[rd]);
+ uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]);
+ uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zd0, 2, 16) << 16)
+ | (ELEM(zd1, 0, 16) << 32) | (ELEM(zd1, 2, 16) << 48);
+ uint64_t d1 = ELEM(zm0, 0, 16) | (ELEM(zm0, 2, 16) << 16)
+ | (ELEM(zm1, 0, 16) << 32) | (ELEM(zm1, 2, 16) << 48);
+ uint64_t m0 = ELEM(zd0, 1, 16) | (ELEM(zd0, 3, 16) << 16)
+ | (ELEM(zd1, 1, 16) << 32) | (ELEM(zd1, 3, 16) << 48);
+ uint64_t m1 = ELEM(zm0, 1, 16) | (ELEM(zm0, 3, 16) << 16)
+ | (ELEM(zm1, 1, 16) << 32) | (ELEM(zm1, 3, 16) << 48);
+ env->vfp.regs[rm] = make_float64(m0);
+ env->vfp.regs[rm + 1] = make_float64(m1);
+ env->vfp.regs[rd] = make_float64(d0);
+ env->vfp.regs[rd + 1] = make_float64(d1);
+}
+
+void HELPER(neon_qunzip32)(CPUState *env, uint32_t rd, uint32_t rm)
+{
+ uint64_t zm0 = float64_val(env->vfp.regs[rm]);
+ uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
+ uint64_t zd0 = float64_val(env->vfp.regs[rd]);
+ uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]);
+ uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zd1, 0, 32) << 32);
+ uint64_t d1 = ELEM(zm0, 0, 32) | (ELEM(zm1, 0, 32) << 32);
+ uint64_t m0 = ELEM(zd0, 1, 32) | (ELEM(zd1, 1, 32) << 32);
+ uint64_t m1 = ELEM(zm0, 1, 32) | (ELEM(zm1, 1, 32) << 32);
+ env->vfp.regs[rm] = make_float64(m0);
+ env->vfp.regs[rm + 1] = make_float64(m1);
+ env->vfp.regs[rd] = make_float64(d0);
+ env->vfp.regs[rd + 1] = make_float64(d1);
+}
+
+void HELPER(neon_unzip8)(CPUState *env, uint32_t rd, uint32_t rm)
+{
+ uint64_t zm = float64_val(env->vfp.regs[rm]);
+ uint64_t zd = float64_val(env->vfp.regs[rd]);
+ uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zd, 2, 8) << 8)
+ | (ELEM(zd, 4, 8) << 16) | (ELEM(zd, 6, 8) << 24)
+ | (ELEM(zm, 0, 8) << 32) | (ELEM(zm, 2, 8) << 40)
+ | (ELEM(zm, 4, 8) << 48) | (ELEM(zm, 6, 8) << 56);
+ uint64_t m0 = ELEM(zd, 1, 8) | (ELEM(zd, 3, 8) << 8)
+ | (ELEM(zd, 5, 8) << 16) | (ELEM(zd, 7, 8) << 24)
+ | (ELEM(zm, 1, 8) << 32) | (ELEM(zm, 3, 8) << 40)
+ | (ELEM(zm, 5, 8) << 48) | (ELEM(zm, 7, 8) << 56);
+ env->vfp.regs[rm] = make_float64(m0);
+ env->vfp.regs[rd] = make_float64(d0);
+}
+
+void HELPER(neon_unzip16)(CPUState *env, uint32_t rd, uint32_t rm)
+{
+ uint64_t zm = float64_val(env->vfp.regs[rm]);
+ uint64_t zd = float64_val(env->vfp.regs[rd]);
+ uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zd, 2, 16) << 16)
+ | (ELEM(zm, 0, 16) << 32) | (ELEM(zm, 2, 16) << 48);
+ uint64_t m0 = ELEM(zd, 1, 16) | (ELEM(zd, 3, 16) << 16)
+ | (ELEM(zm, 1, 16) << 32) | (ELEM(zm, 3, 16) << 48);
+ env->vfp.regs[rm] = make_float64(m0);
+ env->vfp.regs[rd] = make_float64(d0);
+}
diff --git a/target-arm/translate.c b/target-arm/translate.c
index 362d1d0..6d94223 100644
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@@ -3614,40 +3614,43 @@ static inline TCGv neon_get_scalar(int size, int reg)
return tmp;
}
-static void gen_neon_unzip_u8(TCGv t0, TCGv t1)
+static int gen_neon_unzip(int rd, int rm, int size, int q)
{
- TCGv rd, rm, tmp;
-
- rd = new_tmp();
- rm = new_tmp();
- tmp = new_tmp();
-
- tcg_gen_andi_i32(rd, t0, 0xff);
- tcg_gen_shri_i32(tmp, t0, 8);
- tcg_gen_andi_i32(tmp, tmp, 0xff00);
- tcg_gen_or_i32(rd, rd, tmp);
- tcg_gen_shli_i32(tmp, t1, 16);
- tcg_gen_andi_i32(tmp, tmp, 0xff0000);
- tcg_gen_or_i32(rd, rd, tmp);
- tcg_gen_shli_i32(tmp, t1, 8);
- tcg_gen_andi_i32(tmp, tmp, 0xff000000);
- tcg_gen_or_i32(rd, rd, tmp);
-
- tcg_gen_shri_i32(rm, t0, 8);
- tcg_gen_andi_i32(rm, rm, 0xff);
- tcg_gen_shri_i32(tmp, t0, 16);
- tcg_gen_andi_i32(tmp, tmp, 0xff00);
- tcg_gen_or_i32(rm, rm, tmp);
- tcg_gen_shli_i32(tmp, t1, 8);
- tcg_gen_andi_i32(tmp, tmp, 0xff0000);
- tcg_gen_or_i32(rm, rm, tmp);
- tcg_gen_andi_i32(tmp, t1, 0xff000000);
- tcg_gen_or_i32(t1, rm, tmp);
- tcg_gen_mov_i32(t0, rd);
-
- dead_tmp(tmp);
- dead_tmp(rm);
- dead_tmp(rd);
+ TCGv tmp, tmp2;
+ if (size == 3 || (!q && size == 2)) {
+ return 1;
+ }
+ tmp = tcg_const_i32(rd);
+ tmp2 = tcg_const_i32(rm);
+ if (q) {
+ switch (size) {
+ case 0:
+ gen_helper_neon_qunzip8(cpu_env, tmp, tmp2);
+ break;
+ case 1:
+ gen_helper_neon_qunzip16(cpu_env, tmp, tmp2);
+ break;
+ case 2:
+ gen_helper_neon_qunzip32(cpu_env, tmp, tmp2);
+ break;
+ default:
+ abort();
+ }
+ } else {
+ switch (size) {
+ case 0:
+ gen_helper_neon_unzip8(cpu_env, tmp, tmp2);
+ break;
+ case 1:
+ gen_helper_neon_unzip16(cpu_env, tmp, tmp2);
+ break;
+ default:
+ abort();
+ }
+ }
+ tcg_temp_free_i32(tmp);
+ tcg_temp_free_i32(tmp2);
+ return 0;
}
static void gen_neon_zip_u8(TCGv t0, TCGv t1)
@@ -3705,25 +3708,6 @@ static void gen_neon_zip_u16(TCGv t0, TCGv t1)
dead_tmp(tmp);
}
-static void gen_neon_unzip(int reg, int q, int tmp, int size)
-{
- int n;
- TCGv t0, t1;
-
- for (n = 0; n < q + 1; n += 2) {
- t0 = neon_load_reg(reg, n);
- t1 = neon_load_reg(reg, n + 1);
- switch (size) {
- case 0: gen_neon_unzip_u8(t0, t1); break;
- case 1: gen_neon_zip_u16(t0, t1); break; /* zip and unzip are the same. */
- case 2: /* no-op */; break;
- default: abort();
- }
- neon_store_scratch(tmp + n, t0);
- neon_store_scratch(tmp + n + 1, t1);
- }
-}
-
static void gen_neon_trn_u8(TCGv t0, TCGv t1)
{
TCGv rd, tmp;
@@ -5436,30 +5420,8 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
}
break;
case 34: /* VUZP */
- /* Reg Before After
- Rd A3 A2 A1 A0 B2 B0 A2 A0
- Rm B3 B2 B1 B0 B3 B1 A3 A1
- */
- if (size == 3)
+ if (gen_neon_unzip(rd, rm, size, q)) {
return 1;
- gen_neon_unzip(rd, q, 0, size);
- gen_neon_unzip(rm, q, 4, size);
- if (q) {
- static int unzip_order_q[8] =
- {0, 2, 4, 6, 1, 3, 5, 7};
- for (n = 0; n < 8; n++) {
- int reg = (n < 4) ? rd : rm;
- tmp = neon_load_scratch(unzip_order_q[n]);
- neon_store_reg(reg, n % 4, tmp);
- }
- } else {
- static int unzip_order[4] =
- {0, 4, 1, 5};
- for (n = 0; n < 4; n++) {
- int reg = (n < 2) ? rd : rm;
- tmp = neon_load_scratch(unzip_order[n]);
- neon_store_reg(reg, n % 2, tmp);
- }
}
break;
case 35: /* VZIP */
--
1.7.1
^ permalink raw reply related [flat|nested] 4+ messages in thread
* [Qemu-devel] [PATCH v2 2/2] target-arm: Move Neon VZIP to helper functions
2011-02-14 10:22 [Qemu-devel] [PATCH v2 0/2] target-arm: fix Neon VUZP, VZIP instructions Peter Maydell
2011-02-14 10:22 ` [Qemu-devel] [PATCH v2 1/2] target-arm: Move Neon VUZP to helper functions Peter Maydell
@ 2011-02-14 10:22 ` Peter Maydell
2011-02-20 16:35 ` [Qemu-devel] [PATCH v2 0/2] target-arm: fix Neon VUZP, VZIP instructions Aurelien Jarno
2 siblings, 0 replies; 4+ messages in thread
From: Peter Maydell @ 2011-02-14 10:22 UTC (permalink / raw)
To: qemu-devel; +Cc: patches
Move the implementation of the Neon VUZP unzip instruction from inline
code to helper functions. (At 50+ TCG ops it was well over the
recommended limit for coding inline.) The helper implementations also
give the correct answers where the inline implementation did not.
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
---
target-arm/helpers.h | 5 ++
target-arm/neon_helper.c | 92 ++++++++++++++++++++++++++++++++++++++
target-arm/translate.c | 109 +++++++++++++++-------------------------------
3 files changed, 133 insertions(+), 73 deletions(-)
diff --git a/target-arm/helpers.h b/target-arm/helpers.h
index 88c2fce..e494b47 100644
--- a/target-arm/helpers.h
+++ b/target-arm/helpers.h
@@ -465,5 +465,10 @@ DEF_HELPER_3(neon_unzip16, void, env, i32, i32)
DEF_HELPER_3(neon_qunzip8, void, env, i32, i32)
DEF_HELPER_3(neon_qunzip16, void, env, i32, i32)
DEF_HELPER_3(neon_qunzip32, void, env, i32, i32)
+DEF_HELPER_3(neon_zip8, void, env, i32, i32)
+DEF_HELPER_3(neon_zip16, void, env, i32, i32)
+DEF_HELPER_3(neon_qzip8, void, env, i32, i32)
+DEF_HELPER_3(neon_qzip16, void, env, i32, i32)
+DEF_HELPER_3(neon_qzip32, void, env, i32, i32)
#include "def-helper.h"
diff --git a/target-arm/neon_helper.c b/target-arm/neon_helper.c
index 6e2a2fd..cee9ee4 100644
--- a/target-arm/neon_helper.c
+++ b/target-arm/neon_helper.c
@@ -1757,3 +1757,95 @@ void HELPER(neon_unzip16)(CPUState *env, uint32_t rd, uint32_t rm)
env->vfp.regs[rm] = make_float64(m0);
env->vfp.regs[rd] = make_float64(d0);
}
+
+void HELPER(neon_qzip8)(CPUState *env, uint32_t rd, uint32_t rm)
+{
+ uint64_t zm0 = float64_val(env->vfp.regs[rm]);
+ uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
+ uint64_t zd0 = float64_val(env->vfp.regs[rd]);
+ uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]);
+ uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zm0, 0, 8) << 8)
+ | (ELEM(zd0, 1, 8) << 16) | (ELEM(zm0, 1, 8) << 24)
+ | (ELEM(zd0, 2, 8) << 32) | (ELEM(zm0, 2, 8) << 40)
+ | (ELEM(zd0, 3, 8) << 48) | (ELEM(zm0, 3, 8) << 56);
+ uint64_t d1 = ELEM(zd0, 4, 8) | (ELEM(zm0, 4, 8) << 8)
+ | (ELEM(zd0, 5, 8) << 16) | (ELEM(zm0, 5, 8) << 24)
+ | (ELEM(zd0, 6, 8) << 32) | (ELEM(zm0, 6, 8) << 40)
+ | (ELEM(zd0, 7, 8) << 48) | (ELEM(zm0, 7, 8) << 56);
+ uint64_t m0 = ELEM(zd1, 0, 8) | (ELEM(zm1, 0, 8) << 8)
+ | (ELEM(zd1, 1, 8) << 16) | (ELEM(zm1, 1, 8) << 24)
+ | (ELEM(zd1, 2, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
+ | (ELEM(zd1, 3, 8) << 48) | (ELEM(zm1, 3, 8) << 56);
+ uint64_t m1 = ELEM(zd1, 4, 8) | (ELEM(zm1, 4, 8) << 8)
+ | (ELEM(zd1, 5, 8) << 16) | (ELEM(zm1, 5, 8) << 24)
+ | (ELEM(zd1, 6, 8) << 32) | (ELEM(zm1, 6, 8) << 40)
+ | (ELEM(zd1, 7, 8) << 48) | (ELEM(zm1, 7, 8) << 56);
+ env->vfp.regs[rm] = make_float64(m0);
+ env->vfp.regs[rm + 1] = make_float64(m1);
+ env->vfp.regs[rd] = make_float64(d0);
+ env->vfp.regs[rd + 1] = make_float64(d1);
+}
+
+void HELPER(neon_qzip16)(CPUState *env, uint32_t rd, uint32_t rm)
+{
+ uint64_t zm0 = float64_val(env->vfp.regs[rm]);
+ uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
+ uint64_t zd0 = float64_val(env->vfp.regs[rd]);
+ uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]);
+ uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zm0, 0, 16) << 16)
+ | (ELEM(zd0, 1, 16) << 32) | (ELEM(zm0, 1, 16) << 48);
+ uint64_t d1 = ELEM(zd0, 2, 16) | (ELEM(zm0, 2, 16) << 16)
+ | (ELEM(zd0, 3, 16) << 32) | (ELEM(zm0, 3, 16) << 48);
+ uint64_t m0 = ELEM(zd1, 0, 16) | (ELEM(zm1, 0, 16) << 16)
+ | (ELEM(zd1, 1, 16) << 32) | (ELEM(zm1, 1, 16) << 48);
+ uint64_t m1 = ELEM(zd1, 2, 16) | (ELEM(zm1, 2, 16) << 16)
+ | (ELEM(zd1, 3, 16) << 32) | (ELEM(zm1, 3, 16) << 48);
+ env->vfp.regs[rm] = make_float64(m0);
+ env->vfp.regs[rm + 1] = make_float64(m1);
+ env->vfp.regs[rd] = make_float64(d0);
+ env->vfp.regs[rd + 1] = make_float64(d1);
+}
+
+void HELPER(neon_qzip32)(CPUState *env, uint32_t rd, uint32_t rm)
+{
+ uint64_t zm0 = float64_val(env->vfp.regs[rm]);
+ uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
+ uint64_t zd0 = float64_val(env->vfp.regs[rd]);
+ uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]);
+ uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zm0, 0, 32) << 32);
+ uint64_t d1 = ELEM(zd0, 1, 32) | (ELEM(zm0, 1, 32) << 32);
+ uint64_t m0 = ELEM(zd1, 0, 32) | (ELEM(zm1, 0, 32) << 32);
+ uint64_t m1 = ELEM(zd1, 1, 32) | (ELEM(zm1, 1, 32) << 32);
+ env->vfp.regs[rm] = make_float64(m0);
+ env->vfp.regs[rm + 1] = make_float64(m1);
+ env->vfp.regs[rd] = make_float64(d0);
+ env->vfp.regs[rd + 1] = make_float64(d1);
+}
+
+void HELPER(neon_zip8)(CPUState *env, uint32_t rd, uint32_t rm)
+{
+ uint64_t zm = float64_val(env->vfp.regs[rm]);
+ uint64_t zd = float64_val(env->vfp.regs[rd]);
+ uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zm, 0, 8) << 8)
+ | (ELEM(zd, 1, 8) << 16) | (ELEM(zm, 1, 8) << 24)
+ | (ELEM(zd, 2, 8) << 32) | (ELEM(zm, 2, 8) << 40)
+ | (ELEM(zd, 3, 8) << 48) | (ELEM(zm, 3, 8) << 56);
+ uint64_t m0 = ELEM(zd, 4, 8) | (ELEM(zm, 4, 8) << 8)
+ | (ELEM(zd, 5, 8) << 16) | (ELEM(zm, 5, 8) << 24)
+ | (ELEM(zd, 6, 8) << 32) | (ELEM(zm, 6, 8) << 40)
+ | (ELEM(zd, 7, 8) << 48) | (ELEM(zm, 7, 8) << 56);
+ env->vfp.regs[rm] = make_float64(m0);
+ env->vfp.regs[rd] = make_float64(d0);
+}
+
+void HELPER(neon_zip16)(CPUState *env, uint32_t rd, uint32_t rm)
+{
+ uint64_t zm = float64_val(env->vfp.regs[rm]);
+ uint64_t zd = float64_val(env->vfp.regs[rd]);
+ uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zm, 0, 16) << 16)
+ | (ELEM(zd, 1, 16) << 32) | (ELEM(zm, 1, 16) << 48);
+ uint64_t m0 = ELEM(zd, 2, 16) | (ELEM(zm, 2, 16) << 16)
+ | (ELEM(zd, 3, 16) << 32) | (ELEM(zm, 3, 16) << 48);
+ env->vfp.regs[rm] = make_float64(m0);
+ env->vfp.regs[rd] = make_float64(d0);
+}
diff --git a/target-arm/translate.c b/target-arm/translate.c
index 6d94223..50c8fe0 100644
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@@ -3653,59 +3653,43 @@ static int gen_neon_unzip(int rd, int rm, int size, int q)
return 0;
}
-static void gen_neon_zip_u8(TCGv t0, TCGv t1)
-{
- TCGv rd, rm, tmp;
-
- rd = new_tmp();
- rm = new_tmp();
- tmp = new_tmp();
-
- tcg_gen_andi_i32(rd, t0, 0xff);
- tcg_gen_shli_i32(tmp, t1, 8);
- tcg_gen_andi_i32(tmp, tmp, 0xff00);
- tcg_gen_or_i32(rd, rd, tmp);
- tcg_gen_shli_i32(tmp, t0, 16);
- tcg_gen_andi_i32(tmp, tmp, 0xff0000);
- tcg_gen_or_i32(rd, rd, tmp);
- tcg_gen_shli_i32(tmp, t1, 24);
- tcg_gen_andi_i32(tmp, tmp, 0xff000000);
- tcg_gen_or_i32(rd, rd, tmp);
-
- tcg_gen_andi_i32(rm, t1, 0xff000000);
- tcg_gen_shri_i32(tmp, t0, 8);
- tcg_gen_andi_i32(tmp, tmp, 0xff0000);
- tcg_gen_or_i32(rm, rm, tmp);
- tcg_gen_shri_i32(tmp, t1, 8);
- tcg_gen_andi_i32(tmp, tmp, 0xff00);
- tcg_gen_or_i32(rm, rm, tmp);
- tcg_gen_shri_i32(tmp, t0, 16);
- tcg_gen_andi_i32(tmp, tmp, 0xff);
- tcg_gen_or_i32(t1, rm, tmp);
- tcg_gen_mov_i32(t0, rd);
-
- dead_tmp(tmp);
- dead_tmp(rm);
- dead_tmp(rd);
-}
-
-static void gen_neon_zip_u16(TCGv t0, TCGv t1)
+static int gen_neon_zip(int rd, int rm, int size, int q)
{
TCGv tmp, tmp2;
-
- tmp = new_tmp();
- tmp2 = new_tmp();
-
- tcg_gen_andi_i32(tmp, t0, 0xffff);
- tcg_gen_shli_i32(tmp2, t1, 16);
- tcg_gen_or_i32(tmp, tmp, tmp2);
- tcg_gen_andi_i32(t1, t1, 0xffff0000);
- tcg_gen_shri_i32(tmp2, t0, 16);
- tcg_gen_or_i32(t1, t1, tmp2);
- tcg_gen_mov_i32(t0, tmp);
-
- dead_tmp(tmp2);
- dead_tmp(tmp);
+ if (size == 3 || (!q && size == 2)) {
+ return 1;
+ }
+ tmp = tcg_const_i32(rd);
+ tmp2 = tcg_const_i32(rm);
+ if (q) {
+ switch (size) {
+ case 0:
+ gen_helper_neon_qzip8(cpu_env, tmp, tmp2);
+ break;
+ case 1:
+ gen_helper_neon_qzip16(cpu_env, tmp, tmp2);
+ break;
+ case 2:
+ gen_helper_neon_qzip32(cpu_env, tmp, tmp2);
+ break;
+ default:
+ abort();
+ }
+ } else {
+ switch (size) {
+ case 0:
+ gen_helper_neon_zip8(cpu_env, tmp, tmp2);
+ break;
+ case 1:
+ gen_helper_neon_zip16(cpu_env, tmp, tmp2);
+ break;
+ default:
+ abort();
+ }
+ }
+ tcg_temp_free_i32(tmp);
+ tcg_temp_free_i32(tmp2);
+ return 0;
}
static void gen_neon_trn_u8(TCGv t0, TCGv t1)
@@ -5425,29 +5409,8 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
}
break;
case 35: /* VZIP */
- /* Reg Before After
- Rd A3 A2 A1 A0 B1 A1 B0 A0
- Rm B3 B2 B1 B0 B3 A3 B2 A2
- */
- if (size == 3)
+ if (gen_neon_zip(rd, rm, size, q)) {
return 1;
- count = (q ? 4 : 2);
- for (n = 0; n < count; n++) {
- tmp = neon_load_reg(rd, n);
- tmp2 = neon_load_reg(rd, n);
- switch (size) {
- case 0: gen_neon_zip_u8(tmp, tmp2); break;
- case 1: gen_neon_zip_u16(tmp, tmp2); break;
- case 2: /* no-op */; break;
- default: abort();
- }
- neon_store_scratch(n * 2, tmp);
- neon_store_scratch(n * 2 + 1, tmp2);
- }
- for (n = 0; n < count * 2; n++) {
- int reg = (n < count) ? rd : rm;
- tmp = neon_load_scratch(n);
- neon_store_reg(reg, n % count, tmp);
}
break;
case 36: case 37: /* VMOVN, VQMOVUN, VQMOVN */
--
1.7.1
^ permalink raw reply related [flat|nested] 4+ messages in thread
* Re: [Qemu-devel] [PATCH v2 0/2] target-arm: fix Neon VUZP, VZIP instructions
2011-02-14 10:22 [Qemu-devel] [PATCH v2 0/2] target-arm: fix Neon VUZP, VZIP instructions Peter Maydell
2011-02-14 10:22 ` [Qemu-devel] [PATCH v2 1/2] target-arm: Move Neon VUZP to helper functions Peter Maydell
2011-02-14 10:22 ` [Qemu-devel] [PATCH v2 2/2] target-arm: Move Neon VZIP " Peter Maydell
@ 2011-02-20 16:35 ` Aurelien Jarno
2 siblings, 0 replies; 4+ messages in thread
From: Aurelien Jarno @ 2011-02-20 16:35 UTC (permalink / raw)
To: Peter Maydell; +Cc: qemu-devel, patches
On Mon, Feb 14, 2011 at 10:22:47AM +0000, Peter Maydell wrote:
> This patch series fixes bugs in the Neon VZIP and VUZP instructions
> by abandoning the existing inline implementations in favour of
> calling out to straightforward helper functions. The inline routines
> could generate 50+ TCG ops each, which is well over the recommended
> limit in tcg/README for using helpers instead; they also did
> not give the correct results...
>
> I've tested these patches using the usual random instruction
> generation approach.
>
> V2 changes: moved the decoding of register numbers, size and q
> flag out of the helper functions into translate.c; split the
> helpers up into one per (size, q) combination.
>
> Peter Maydell (2):
> target-arm: Move Neon VUZP to helper functions
> target-arm: Move Neon VZIP to helper functions
>
> target-arm/helpers.h | 11 +++
> target-arm/neon_helper.c | 186 +++++++++++++++++++++++++++++++++++++++
> target-arm/translate.c | 215 +++++++++++++++-------------------------------
> 3 files changed, 267 insertions(+), 145 deletions(-)
>
Thanks, both applied.
--
Aurelien Jarno GPG: 1024D/F1BCDB73
aurelien@aurel32.net http://www.aurel32.net
^ permalink raw reply [flat|nested] 4+ messages in thread