qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed
* [Qemu-devel] [PATCH 0/5] target-arm: misc optimizations
@ 2012-09-16 23:08 Aurelien Jarno
  2012-09-16 23:08 ` [Qemu-devel] [PATCH 1/5] target-arm: use globals for CC flags Aurelien Jarno
                   ` (4 more replies)
  0 siblings, 5 replies; 18+ messages in thread
From: Aurelien Jarno @ 2012-09-16 23:08 UTC (permalink / raw)
  To: qemu-devel; +Cc: Peter Maydell, Aurelien Jarno

This patch series optimizes the ARM target by:
 - using globals instead of ld/st function
 - using TCG code instead of helpers
 - marking some helpers const and pure

Aurelien Jarno (5):
  target-arm: use globals for CC flags
  target-arm: convert add_cc and sub_cc helpers to TCG
  target-arm: convert shl and shr helpers to TCG
  target-arm: optimize helper_sar
  target-arm: mark a few integer helpers const and pure

 target-arm/helper.h    |   25 +++---
 target-arm/op_helper.c |   38 +-------
 target-arm/translate.c |  229 ++++++++++++++++++++++++++----------------------
 3 files changed, 137 insertions(+), 155 deletions(-)

-- 
1.7.10.4

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [Qemu-devel] [PATCH 1/5] target-arm: use globals for CC flags
  2012-09-16 23:08 [Qemu-devel] [PATCH 0/5] target-arm: misc optimizations Aurelien Jarno
@ 2012-09-16 23:08 ` Aurelien Jarno
  2012-09-16 23:08 ` [Qemu-devel] [PATCH 2/5] target-arm: convert add_cc and sub_cc helpers to TCG Aurelien Jarno
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 18+ messages in thread
From: Aurelien Jarno @ 2012-09-16 23:08 UTC (permalink / raw)
  To: qemu-devel; +Cc: Peter Maydell, Aurelien Jarno

Use globals for CC flags instead of loading/storing them each they are
accessed. This allows some optimizations to be performed by the TCG
optimization passes.

Cc: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Aurelien Jarno <aurelien@aurel32.net>
---
 target-arm/translate.c |  127 ++++++++++++++++++------------------------------
 1 file changed, 46 insertions(+), 81 deletions(-)

diff --git a/target-arm/translate.c b/target-arm/translate.c
index f4b447a..3deb24d 100644
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@@ -85,6 +85,7 @@ static TCGv_ptr cpu_env;
 /* We reuse the same 64-bit temporaries for efficiency.  */
 static TCGv_i64 cpu_V0, cpu_V1, cpu_M0;
 static TCGv_i32 cpu_R[16];
+static TCGv_i32 cpu_CF, cpu_NF, cpu_VF, cpu_ZF;
 static TCGv_i32 cpu_exclusive_addr;
 static TCGv_i32 cpu_exclusive_val;
 static TCGv_i32 cpu_exclusive_high;
@@ -115,6 +116,11 @@ void arm_translate_init(void)
                                           offsetof(CPUARMState, regs[i]),
                                           regnames[i]);
     }
+    cpu_CF = tcg_global_mem_new_i32(TCG_AREG0, offsetof(CPUARMState, CF), "CF");
+    cpu_NF = tcg_global_mem_new_i32(TCG_AREG0, offsetof(CPUARMState, NF), "NF");
+    cpu_VF = tcg_global_mem_new_i32(TCG_AREG0, offsetof(CPUARMState, VF), "VF");
+    cpu_ZF = tcg_global_mem_new_i32(TCG_AREG0, offsetof(CPUARMState, ZF), "ZF");
+
     cpu_exclusive_addr = tcg_global_mem_new_i32(TCG_AREG0,
         offsetof(CPUARMState, exclusive_addr), "exclusive_addr");
     cpu_exclusive_val = tcg_global_mem_new_i32(TCG_AREG0,
@@ -369,53 +375,39 @@ static void gen_add16(TCGv t0, TCGv t1)
     tcg_temp_free_i32(t1);
 }
 
-#define gen_set_CF(var) tcg_gen_st_i32(var, cpu_env, offsetof(CPUARMState, CF))
-
 /* Set CF to the top bit of var.  */
 static void gen_set_CF_bit31(TCGv var)
 {
-    TCGv tmp = tcg_temp_new_i32();
-    tcg_gen_shri_i32(tmp, var, 31);
-    gen_set_CF(tmp);
-    tcg_temp_free_i32(tmp);
+    tcg_gen_shri_i32(cpu_CF, var, 31);
 }
 
 /* Set N and Z flags from var.  */
 static inline void gen_logic_CC(TCGv var)
 {
-    tcg_gen_st_i32(var, cpu_env, offsetof(CPUARMState, NF));
-    tcg_gen_st_i32(var, cpu_env, offsetof(CPUARMState, ZF));
+    tcg_gen_mov_i32(cpu_NF, var);
+    tcg_gen_mov_i32(cpu_ZF, var);
 }
 
 /* T0 += T1 + CF.  */
 static void gen_adc(TCGv t0, TCGv t1)
 {
-    TCGv tmp;
     tcg_gen_add_i32(t0, t0, t1);
-    tmp = load_cpu_field(CF);
-    tcg_gen_add_i32(t0, t0, tmp);
-    tcg_temp_free_i32(tmp);
+    tcg_gen_add_i32(t0, t0, cpu_CF);
 }
 
 /* dest = T0 + T1 + CF. */
 static void gen_add_carry(TCGv dest, TCGv t0, TCGv t1)
 {
-    TCGv tmp;
     tcg_gen_add_i32(dest, t0, t1);
-    tmp = load_cpu_field(CF);
-    tcg_gen_add_i32(dest, dest, tmp);
-    tcg_temp_free_i32(tmp);
+    tcg_gen_add_i32(dest, dest, cpu_CF);
 }
 
 /* dest = T0 - T1 + CF - 1.  */
 static void gen_sub_carry(TCGv dest, TCGv t0, TCGv t1)
 {
-    TCGv tmp;
     tcg_gen_sub_i32(dest, t0, t1);
-    tmp = load_cpu_field(CF);
-    tcg_gen_add_i32(dest, dest, tmp);
+    tcg_gen_add_i32(dest, dest, cpu_CF);
     tcg_gen_subi_i32(dest, dest, 1);
-    tcg_temp_free_i32(tmp);
 }
 
 /* FIXME:  Implement this natively.  */
@@ -423,16 +415,14 @@ static void gen_sub_carry(TCGv dest, TCGv t0, TCGv t1)
 
 static void shifter_out_im(TCGv var, int shift)
 {
-    TCGv tmp = tcg_temp_new_i32();
     if (shift == 0) {
-        tcg_gen_andi_i32(tmp, var, 1);
+        tcg_gen_andi_i32(cpu_CF, var, 1);
     } else {
-        tcg_gen_shri_i32(tmp, var, shift);
-        if (shift != 31)
-            tcg_gen_andi_i32(tmp, tmp, 1);
+        tcg_gen_shri_i32(cpu_CF, var, shift);
+        if (shift != 31) {
+            tcg_gen_andi_i32(cpu_CF, cpu_CF, 1);
+        }
     }
-    gen_set_CF(tmp);
-    tcg_temp_free_i32(tmp);
 }
 
 /* Shift by immediate.  Includes special handling for shift == 0.  */
@@ -449,8 +439,7 @@ static inline void gen_arm_shift_im(TCGv var, int shiftop, int shift, int flags)
     case 1: /* LSR */
         if (shift == 0) {
             if (flags) {
-                tcg_gen_shri_i32(var, var, 31);
-                gen_set_CF(var);
+                tcg_gen_shri_i32(cpu_CF, var, 31);
             }
             tcg_gen_movi_i32(var, 0);
         } else {
@@ -474,11 +463,11 @@ static inline void gen_arm_shift_im(TCGv var, int shiftop, int shift, int flags)
                 shifter_out_im(var, shift - 1);
             tcg_gen_rotri_i32(var, var, shift); break;
         } else {
-            TCGv tmp = load_cpu_field(CF);
+            TCGv tmp = tcg_temp_new_i32();
             if (flags)
                 shifter_out_im(var, 0);
             tcg_gen_shri_i32(var, var, 1);
-            tcg_gen_shli_i32(tmp, tmp, 31);
+            tcg_gen_shli_i32(tmp, cpu_CF, 31);
             tcg_gen_or_i32(var, var, tmp);
             tcg_temp_free_i32(tmp);
         }
@@ -603,99 +592,75 @@ static void gen_thumb2_parallel_addsub(int op1, int op2, TCGv a, TCGv b)
 static void gen_test_cc(int cc, int label)
 {
     TCGv tmp;
-    TCGv tmp2;
     int inv;
 
     switch (cc) {
     case 0: /* eq: Z */
-        tmp = load_cpu_field(ZF);
-        tcg_gen_brcondi_i32(TCG_COND_EQ, tmp, 0, label);
+        tcg_gen_brcondi_i32(TCG_COND_EQ, cpu_ZF, 0, label);
         break;
     case 1: /* ne: !Z */
-        tmp = load_cpu_field(ZF);
-        tcg_gen_brcondi_i32(TCG_COND_NE, tmp, 0, label);
+        tcg_gen_brcondi_i32(TCG_COND_NE, cpu_ZF, 0, label);
         break;
     case 2: /* cs: C */
-        tmp = load_cpu_field(CF);
-        tcg_gen_brcondi_i32(TCG_COND_NE, tmp, 0, label);
+        tcg_gen_brcondi_i32(TCG_COND_NE, cpu_CF, 0, label);
         break;
     case 3: /* cc: !C */
-        tmp = load_cpu_field(CF);
-        tcg_gen_brcondi_i32(TCG_COND_EQ, tmp, 0, label);
+        tcg_gen_brcondi_i32(TCG_COND_EQ, cpu_CF, 0, label);
         break;
     case 4: /* mi: N */
-        tmp = load_cpu_field(NF);
-        tcg_gen_brcondi_i32(TCG_COND_LT, tmp, 0, label);
+        tcg_gen_brcondi_i32(TCG_COND_LT, cpu_NF, 0, label);
         break;
     case 5: /* pl: !N */
-        tmp = load_cpu_field(NF);
-        tcg_gen_brcondi_i32(TCG_COND_GE, tmp, 0, label);
+        tcg_gen_brcondi_i32(TCG_COND_GE, cpu_NF, 0, label);
         break;
     case 6: /* vs: V */
-        tmp = load_cpu_field(VF);
-        tcg_gen_brcondi_i32(TCG_COND_LT, tmp, 0, label);
+        tcg_gen_brcondi_i32(TCG_COND_LT, cpu_VF, 0, label);
         break;
     case 7: /* vc: !V */
-        tmp = load_cpu_field(VF);
-        tcg_gen_brcondi_i32(TCG_COND_GE, tmp, 0, label);
+        tcg_gen_brcondi_i32(TCG_COND_GE, cpu_VF, 0, label);
         break;
     case 8: /* hi: C && !Z */
         inv = gen_new_label();
-        tmp = load_cpu_field(CF);
-        tcg_gen_brcondi_i32(TCG_COND_EQ, tmp, 0, inv);
-        tcg_temp_free_i32(tmp);
-        tmp = load_cpu_field(ZF);
-        tcg_gen_brcondi_i32(TCG_COND_NE, tmp, 0, label);
+        tcg_gen_brcondi_i32(TCG_COND_EQ, cpu_CF, 0, inv);
+        tcg_gen_brcondi_i32(TCG_COND_NE, cpu_ZF, 0, label);
         gen_set_label(inv);
         break;
     case 9: /* ls: !C || Z */
-        tmp = load_cpu_field(CF);
-        tcg_gen_brcondi_i32(TCG_COND_EQ, tmp, 0, label);
-        tcg_temp_free_i32(tmp);
-        tmp = load_cpu_field(ZF);
-        tcg_gen_brcondi_i32(TCG_COND_EQ, tmp, 0, label);
+        tcg_gen_brcondi_i32(TCG_COND_EQ, cpu_CF, 0, label);
+        tcg_gen_brcondi_i32(TCG_COND_EQ, cpu_ZF, 0, label);
         break;
     case 10: /* ge: N == V -> N ^ V == 0 */
-        tmp = load_cpu_field(VF);
-        tmp2 = load_cpu_field(NF);
-        tcg_gen_xor_i32(tmp, tmp, tmp2);
-        tcg_temp_free_i32(tmp2);
+        tmp = tcg_temp_new_i32();
+        tcg_gen_xor_i32(tmp, cpu_VF, cpu_NF);
         tcg_gen_brcondi_i32(TCG_COND_GE, tmp, 0, label);
+        tcg_temp_free_i32(tmp);
         break;
     case 11: /* lt: N != V -> N ^ V != 0 */
-        tmp = load_cpu_field(VF);
-        tmp2 = load_cpu_field(NF);
-        tcg_gen_xor_i32(tmp, tmp, tmp2);
-        tcg_temp_free_i32(tmp2);
+        tmp = tcg_temp_new_i32();
+        tcg_gen_xor_i32(tmp, cpu_VF, cpu_NF);
         tcg_gen_brcondi_i32(TCG_COND_LT, tmp, 0, label);
+        tcg_temp_free_i32(tmp);
         break;
     case 12: /* gt: !Z && N == V */
         inv = gen_new_label();
-        tmp = load_cpu_field(ZF);
-        tcg_gen_brcondi_i32(TCG_COND_EQ, tmp, 0, inv);
-        tcg_temp_free_i32(tmp);
-        tmp = load_cpu_field(VF);
-        tmp2 = load_cpu_field(NF);
-        tcg_gen_xor_i32(tmp, tmp, tmp2);
-        tcg_temp_free_i32(tmp2);
+        tcg_gen_brcondi_i32(TCG_COND_EQ, cpu_ZF, 0, inv);
+        tmp = tcg_temp_new_i32();
+        tcg_gen_xor_i32(tmp, cpu_VF, cpu_NF);
         tcg_gen_brcondi_i32(TCG_COND_GE, tmp, 0, label);
+        tcg_temp_free_i32(tmp);
         gen_set_label(inv);
         break;
     case 13: /* le: Z || N != V */
-        tmp = load_cpu_field(ZF);
-        tcg_gen_brcondi_i32(TCG_COND_EQ, tmp, 0, label);
-        tcg_temp_free_i32(tmp);
-        tmp = load_cpu_field(VF);
-        tmp2 = load_cpu_field(NF);
-        tcg_gen_xor_i32(tmp, tmp, tmp2);
-        tcg_temp_free_i32(tmp2);
+        tcg_gen_brcondi_i32(TCG_COND_EQ, cpu_ZF, 0, label);
+        tmp = tcg_temp_new_i32();
+        tcg_gen_xor_i32(tmp, cpu_VF, cpu_NF);
         tcg_gen_brcondi_i32(TCG_COND_LT, tmp, 0, label);
+        tcg_temp_free_i32(tmp);
         break;
     default:
         fprintf(stderr, "Bad condition code 0x%x\n", cc);
         abort();
     }
-    tcg_temp_free_i32(tmp);
 }
 
 static const uint8_t table_logic_cc[16] = {
-- 
1.7.10.4

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [Qemu-devel] [PATCH 2/5] target-arm: convert add_cc and sub_cc helpers to TCG
  2012-09-16 23:08 [Qemu-devel] [PATCH 0/5] target-arm: misc optimizations Aurelien Jarno
  2012-09-16 23:08 ` [Qemu-devel] [PATCH 1/5] target-arm: use globals for CC flags Aurelien Jarno
@ 2012-09-16 23:08 ` Aurelien Jarno
  2012-09-16 23:08 ` [Qemu-devel] [PATCH 3/5] target-arm: convert shl and shr " Aurelien Jarno
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 18+ messages in thread
From: Aurelien Jarno @ 2012-09-16 23:08 UTC (permalink / raw)
  To: qemu-devel; +Cc: Peter Maydell, Aurelien Jarno

Now that the setcond TCG op is available, it's possible to replace
add_cc and sub_cc helpers by TCG code. The code generated by TCG is
actually very close to the one generated by GCC for the helper, and
this avoid all the consequences of using an helper: globals saved back
to memory, no possible optimization, call overhead, etc.

Cc: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Aurelien Jarno <aurelien@aurel32.net>
---
 target-arm/helper.h    |    2 --
 target-arm/op_helper.c |   20 ---------------
 target-arm/translate.c |   66 +++++++++++++++++++++++++++++++++++-------------
 3 files changed, 48 insertions(+), 40 deletions(-)

diff --git a/target-arm/helper.h b/target-arm/helper.h
index afdb2b5..7151e28 100644
--- a/target-arm/helper.h
+++ b/target-arm/helper.h
@@ -142,9 +142,7 @@ DEF_HELPER_2(recpe_u32, i32, i32, env)
 DEF_HELPER_2(rsqrte_u32, i32, i32, env)
 DEF_HELPER_5(neon_tbl, i32, env, i32, i32, i32, i32)
 
-DEF_HELPER_3(add_cc, i32, env, i32, i32)
 DEF_HELPER_3(adc_cc, i32, env, i32, i32)
-DEF_HELPER_3(sub_cc, i32, env, i32, i32)
 DEF_HELPER_3(sbc_cc, i32, env, i32, i32)
 
 DEF_HELPER_3(shl, i32, env, i32, i32)
diff --git a/target-arm/op_helper.c b/target-arm/op_helper.c
index f13fc3a..6095f24 100644
--- a/target-arm/op_helper.c
+++ b/target-arm/op_helper.c
@@ -323,16 +323,6 @@ uint64_t HELPER(get_cp_reg64)(CPUARMState *env, void *rip)
    The only way to do that in TCG is a conditional branch, which clobbers
    all our temporaries.  For now implement these as helper functions.  */
 
-uint32_t HELPER (add_cc)(CPUARMState *env, uint32_t a, uint32_t b)
-{
-    uint32_t result;
-    result = a + b;
-    env->NF = env->ZF = result;
-    env->CF = result < a;
-    env->VF = (a ^ b ^ -1) & (a ^ result);
-    return result;
-}
-
 uint32_t HELPER(adc_cc)(CPUARMState *env, uint32_t a, uint32_t b)
 {
     uint32_t result;
@@ -348,16 +338,6 @@ uint32_t HELPER(adc_cc)(CPUARMState *env, uint32_t a, uint32_t b)
     return result;
 }
 
-uint32_t HELPER(sub_cc)(CPUARMState *env, uint32_t a, uint32_t b)
-{
-    uint32_t result;
-    result = a - b;
-    env->NF = env->ZF = result;
-    env->CF = a >= b;
-    env->VF = (a ^ b) & (a ^ result);
-    return result;
-}
-
 uint32_t HELPER(sbc_cc)(CPUARMState *env, uint32_t a, uint32_t b)
 {
     uint32_t result;
diff --git a/target-arm/translate.c b/target-arm/translate.c
index 3deb24d..19bb1e8 100644
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@@ -410,6 +410,36 @@ static void gen_sub_carry(TCGv dest, TCGv t0, TCGv t1)
     tcg_gen_subi_i32(dest, dest, 1);
 }
 
+/* dest = T0 + T1. Compute C, N, V and Z flags */
+static void gen_add_CC(TCGv dest, TCGv t0, TCGv t1)
+{
+    TCGv tmp;
+    tcg_gen_add_i32(cpu_NF, t0, t1);
+    tcg_gen_mov_i32(cpu_ZF, cpu_NF);
+    tcg_gen_setcond_i32(TCG_COND_LTU, cpu_CF, cpu_NF, t0);
+    tcg_gen_xor_i32(cpu_VF, cpu_NF, t0);
+    tmp = tcg_temp_new_i32();
+    tcg_gen_xor_i32(tmp, t0, t1);
+    tcg_gen_andc_i32(cpu_VF, cpu_VF, tmp);
+    tcg_temp_free_i32(tmp);
+    tcg_gen_mov_i32(dest, cpu_NF);
+}
+
+/* dest = T0 - T1. Compute C, N, V and Z flags */
+static void gen_sub_CC(TCGv dest, TCGv t0, TCGv t1)
+{
+    TCGv tmp;
+    tcg_gen_sub_i32(cpu_NF, t0, t1);
+    tcg_gen_mov_i32(cpu_ZF, cpu_NF);
+    tcg_gen_setcond_i32(TCG_COND_GEU, cpu_CF, t0, t1);
+    tcg_gen_xor_i32(cpu_VF, cpu_NF, t0);
+    tmp = tcg_temp_new_i32();
+    tcg_gen_xor_i32(tmp, t0, t1);
+    tcg_gen_and_i32(cpu_VF, cpu_VF, tmp);
+    tcg_temp_free_i32(tmp);
+    tcg_gen_mov_i32(dest, cpu_NF);
+}
+
 /* FIXME:  Implement this natively.  */
 #define tcg_gen_abs_i32(t0, t1) gen_helper_abs(t0, t1)
 
@@ -6970,11 +7000,11 @@ static void disas_arm_insn(CPUARMState * env, DisasContext *s)
                 if (IS_USER(s)) {
                     goto illegal_op;
                 }
-                gen_helper_sub_cc(tmp, cpu_env, tmp, tmp2);
+                gen_sub_CC(tmp, tmp, tmp2);
                 gen_exception_return(s, tmp);
             } else {
                 if (set_cc) {
-                    gen_helper_sub_cc(tmp, cpu_env, tmp, tmp2);
+                    gen_sub_CC(tmp, tmp, tmp2);
                 } else {
                     tcg_gen_sub_i32(tmp, tmp, tmp2);
                 }
@@ -6983,7 +7013,7 @@ static void disas_arm_insn(CPUARMState * env, DisasContext *s)
             break;
         case 0x03:
             if (set_cc) {
-                gen_helper_sub_cc(tmp, cpu_env, tmp2, tmp);
+                gen_sub_CC(tmp, tmp2, tmp);
             } else {
                 tcg_gen_sub_i32(tmp, tmp2, tmp);
             }
@@ -6991,7 +7021,7 @@ static void disas_arm_insn(CPUARMState * env, DisasContext *s)
             break;
         case 0x04:
             if (set_cc) {
-                gen_helper_add_cc(tmp, cpu_env, tmp, tmp2);
+                gen_add_CC(tmp, tmp, tmp2);
             } else {
                 tcg_gen_add_i32(tmp, tmp, tmp2);
             }
@@ -7037,13 +7067,13 @@ static void disas_arm_insn(CPUARMState * env, DisasContext *s)
             break;
         case 0x0a:
             if (set_cc) {
-                gen_helper_sub_cc(tmp, cpu_env, tmp, tmp2);
+                gen_sub_CC(tmp, tmp, tmp2);
             }
             tcg_temp_free_i32(tmp);
             break;
         case 0x0b:
             if (set_cc) {
-                gen_helper_add_cc(tmp, cpu_env, tmp, tmp2);
+                gen_add_CC(tmp, tmp, tmp2);
             }
             tcg_temp_free_i32(tmp);
             break;
@@ -7830,7 +7860,7 @@ gen_thumb2_data_op(DisasContext *s, int op, int conds, uint32_t shifter_out, TCG
         break;
     case 8: /* add */
         if (conds)
-            gen_helper_add_cc(t0, cpu_env, t0, t1);
+            gen_add_CC(t0, t0, t1);
         else
             tcg_gen_add_i32(t0, t0, t1);
         break;
@@ -7848,13 +7878,13 @@ gen_thumb2_data_op(DisasContext *s, int op, int conds, uint32_t shifter_out, TCG
         break;
     case 13: /* sub */
         if (conds)
-            gen_helper_sub_cc(t0, cpu_env, t0, t1);
+            gen_sub_CC(t0, t0, t1);
         else
             tcg_gen_sub_i32(t0, t0, t1);
         break;
     case 14: /* rsb */
         if (conds)
-            gen_helper_sub_cc(t0, cpu_env, t1, t0);
+            gen_sub_CC(t0, t1, t0);
         else
             tcg_gen_sub_i32(t0, t1, t0);
         break;
@@ -8982,12 +9012,12 @@ static void disas_thumb_insn(CPUARMState *env, DisasContext *s)
                 if (s->condexec_mask)
                     tcg_gen_sub_i32(tmp, tmp, tmp2);
                 else
-                    gen_helper_sub_cc(tmp, cpu_env, tmp, tmp2);
+                    gen_sub_CC(tmp, tmp, tmp2);
             } else {
                 if (s->condexec_mask)
                     tcg_gen_add_i32(tmp, tmp, tmp2);
                 else
-                    gen_helper_add_cc(tmp, cpu_env, tmp, tmp2);
+                    gen_add_CC(tmp, tmp, tmp2);
             }
             tcg_temp_free_i32(tmp2);
             store_reg(s, rd, tmp);
@@ -9018,7 +9048,7 @@ static void disas_thumb_insn(CPUARMState *env, DisasContext *s)
             tcg_gen_movi_i32(tmp2, insn & 0xff);
             switch (op) {
             case 1: /* cmp */
-                gen_helper_sub_cc(tmp, cpu_env, tmp, tmp2);
+                gen_sub_CC(tmp, tmp, tmp2);
                 tcg_temp_free_i32(tmp);
                 tcg_temp_free_i32(tmp2);
                 break;
@@ -9026,7 +9056,7 @@ static void disas_thumb_insn(CPUARMState *env, DisasContext *s)
                 if (s->condexec_mask)
                     tcg_gen_add_i32(tmp, tmp, tmp2);
                 else
-                    gen_helper_add_cc(tmp, cpu_env, tmp, tmp2);
+                    gen_add_CC(tmp, tmp, tmp2);
                 tcg_temp_free_i32(tmp2);
                 store_reg(s, rd, tmp);
                 break;
@@ -9034,7 +9064,7 @@ static void disas_thumb_insn(CPUARMState *env, DisasContext *s)
                 if (s->condexec_mask)
                     tcg_gen_sub_i32(tmp, tmp, tmp2);
                 else
-                    gen_helper_sub_cc(tmp, cpu_env, tmp, tmp2);
+                    gen_sub_CC(tmp, tmp, tmp2);
                 tcg_temp_free_i32(tmp2);
                 store_reg(s, rd, tmp);
                 break;
@@ -9070,7 +9100,7 @@ static void disas_thumb_insn(CPUARMState *env, DisasContext *s)
             case 1: /* cmp */
                 tmp = load_reg(s, rd);
                 tmp2 = load_reg(s, rm);
-                gen_helper_sub_cc(tmp, cpu_env, tmp, tmp2);
+                gen_sub_CC(tmp, tmp, tmp2);
                 tcg_temp_free_i32(tmp2);
                 tcg_temp_free_i32(tmp);
                 break;
@@ -9183,14 +9213,14 @@ static void disas_thumb_insn(CPUARMState *env, DisasContext *s)
             if (s->condexec_mask)
                 tcg_gen_neg_i32(tmp, tmp2);
             else
-                gen_helper_sub_cc(tmp, cpu_env, tmp, tmp2);
+                gen_sub_CC(tmp, tmp, tmp2);
             break;
         case 0xa: /* cmp */
-            gen_helper_sub_cc(tmp, cpu_env, tmp, tmp2);
+            gen_sub_CC(tmp, tmp, tmp2);
             rd = 16;
             break;
         case 0xb: /* cmn */
-            gen_helper_add_cc(tmp, cpu_env, tmp, tmp2);
+            gen_add_CC(tmp, tmp, tmp2);
             rd = 16;
             break;
         case 0xc: /* orr */
-- 
1.7.10.4

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [Qemu-devel] [PATCH 3/5] target-arm: convert shl and shr helpers to TCG
  2012-09-16 23:08 [Qemu-devel] [PATCH 0/5] target-arm: misc optimizations Aurelien Jarno
  2012-09-16 23:08 ` [Qemu-devel] [PATCH 1/5] target-arm: use globals for CC flags Aurelien Jarno
  2012-09-16 23:08 ` [Qemu-devel] [PATCH 2/5] target-arm: convert add_cc and sub_cc helpers to TCG Aurelien Jarno
@ 2012-09-16 23:08 ` Aurelien Jarno
  2012-09-17  9:30   ` Laurent Desnogues
  2012-09-17 15:36   ` Richard Henderson
  2012-09-16 23:08 ` [Qemu-devel] [PATCH 4/5] target-arm: optimize helper_sar Aurelien Jarno
  2012-09-16 23:08 ` [Qemu-devel] [PATCH 5/5] target-arm: mark a few integer helpers const and pure Aurelien Jarno
  4 siblings, 2 replies; 18+ messages in thread
From: Aurelien Jarno @ 2012-09-16 23:08 UTC (permalink / raw)
  To: qemu-devel; +Cc: Peter Maydell, Aurelien Jarno

Now that the setcond TCG op is available, it's possible to replace
shl and shr helpers by TCG code. The code generated by TCG is slightly
longer than the code generated by GCC for the helper but is still worth
it as this avoid all the consequences of using an helper: globals saved
back to memory, no possible optimization, call overhead, etc.

Cc: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Aurelien Jarno <aurelien@aurel32.net>
---
 target-arm/helper.h    |    2 --
 target-arm/op_helper.c |   16 ----------------
 target-arm/translate.c |   32 ++++++++++++++++++++++++++++----
 3 files changed, 28 insertions(+), 22 deletions(-)

diff --git a/target-arm/helper.h b/target-arm/helper.h
index 7151e28..b123d3e 100644
--- a/target-arm/helper.h
+++ b/target-arm/helper.h
@@ -145,8 +145,6 @@ DEF_HELPER_5(neon_tbl, i32, env, i32, i32, i32, i32)
 DEF_HELPER_3(adc_cc, i32, env, i32, i32)
 DEF_HELPER_3(sbc_cc, i32, env, i32, i32)
 
-DEF_HELPER_3(shl, i32, env, i32, i32)
-DEF_HELPER_3(shr, i32, env, i32, i32)
 DEF_HELPER_3(sar, i32, env, i32, i32)
 DEF_HELPER_3(shl_cc, i32, env, i32, i32)
 DEF_HELPER_3(shr_cc, i32, env, i32, i32)
diff --git a/target-arm/op_helper.c b/target-arm/op_helper.c
index 6095f24..5fcd12c 100644
--- a/target-arm/op_helper.c
+++ b/target-arm/op_helper.c
@@ -355,22 +355,6 @@ uint32_t HELPER(sbc_cc)(CPUARMState *env, uint32_t a, uint32_t b)
 
 /* Similarly for variable shift instructions.  */
 
-uint32_t HELPER(shl)(CPUARMState *env, uint32_t x, uint32_t i)
-{
-    int shift = i & 0xff;
-    if (shift >= 32)
-        return 0;
-    return x << shift;
-}
-
-uint32_t HELPER(shr)(CPUARMState *env, uint32_t x, uint32_t i)
-{
-    int shift = i & 0xff;
-    if (shift >= 32)
-        return 0;
-    return (uint32_t)x >> shift;
-}
-
 uint32_t HELPER(sar)(CPUARMState *env, uint32_t x, uint32_t i)
 {
     int shift = i & 0xff;
diff --git a/target-arm/translate.c b/target-arm/translate.c
index 19bb1e8..9c29065 100644
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@@ -440,6 +440,26 @@ static void gen_sub_CC(TCGv dest, TCGv t0, TCGv t1)
     tcg_gen_mov_i32(dest, cpu_NF);
 }
 
+#define GEN_SHIFT(name)                                \
+static void gen_##name(TCGv dest, TCGv t0, TCGv t1)    \
+{                                                      \
+    TCGv tmp1, tmp2;                                   \
+    tmp1 = tcg_temp_new_i32();                         \
+    tcg_gen_andi_i32(tmp1, t1, 0xff);                  \
+    tmp2 = tcg_temp_new_i32();                         \
+    tcg_gen_setcondi_i32(TCG_COND_GE, tmp2, tmp1, 32); \
+    tcg_gen_andi_i32(tmp1, tmp1, 0x1f);                \
+    tcg_gen_##name##_i32(dest, t0, tmp1);              \
+    tcg_temp_free_i32(tmp1);                           \
+    tcg_gen_subi_i32(tmp2, tmp2, 1);                   \
+    tcg_gen_and_i32(dest, dest, tmp2);                 \
+    tcg_temp_free_i32(tmp2);                           \
+}
+GEN_SHIFT(shl)
+GEN_SHIFT(shr)
+#undef GEN_SHIFT
+
+
 /* FIXME:  Implement this natively.  */
 #define tcg_gen_abs_i32(t0, t1) gen_helper_abs(t0, t1)
 
@@ -516,8 +536,12 @@ static inline void gen_arm_shift_reg(TCGv var, int shiftop,
         }
     } else {
         switch (shiftop) {
-        case 0: gen_helper_shl(var, cpu_env, var, shift); break;
-        case 1: gen_helper_shr(var, cpu_env, var, shift); break;
+        case 0:
+            gen_shl(var, var, shift);
+            break;
+        case 1:
+            gen_shr(var, var, shift);
+            break;
         case 2: gen_helper_sar(var, cpu_env, var, shift); break;
         case 3: tcg_gen_andi_i32(shift, shift, 0x1f);
                 tcg_gen_rotr_i32(var, var, shift); break;
@@ -9161,7 +9185,7 @@ static void disas_thumb_insn(CPUARMState *env, DisasContext *s)
             break;
         case 0x2: /* lsl */
             if (s->condexec_mask) {
-                gen_helper_shl(tmp2, cpu_env, tmp2, tmp);
+                gen_shl(tmp2, tmp2, tmp);
             } else {
                 gen_helper_shl_cc(tmp2, cpu_env, tmp2, tmp);
                 gen_logic_CC(tmp2);
@@ -9169,7 +9193,7 @@ static void disas_thumb_insn(CPUARMState *env, DisasContext *s)
             break;
         case 0x3: /* lsr */
             if (s->condexec_mask) {
-                gen_helper_shr(tmp2, cpu_env, tmp2, tmp);
+                gen_shr(tmp2, tmp2, tmp);
             } else {
                 gen_helper_shr_cc(tmp2, cpu_env, tmp2, tmp);
                 gen_logic_CC(tmp2);
-- 
1.7.10.4

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [Qemu-devel] [PATCH 4/5] target-arm: optimize helper_sar
  2012-09-16 23:08 [Qemu-devel] [PATCH 0/5] target-arm: misc optimizations Aurelien Jarno
                   ` (2 preceding siblings ...)
  2012-09-16 23:08 ` [Qemu-devel] [PATCH 3/5] target-arm: convert shl and shr " Aurelien Jarno
@ 2012-09-16 23:08 ` Aurelien Jarno
  2012-09-16 23:08 ` [Qemu-devel] [PATCH 5/5] target-arm: mark a few integer helpers const and pure Aurelien Jarno
  4 siblings, 0 replies; 18+ messages in thread
From: Aurelien Jarno @ 2012-09-16 23:08 UTC (permalink / raw)
  To: qemu-devel; +Cc: Peter Maydell, Aurelien Jarno

helper_sar doesn't need env. It can also be marked const and pure.

Signed-off-by: Aurelien Jarno <aurelien@aurel32.net>
---
 target-arm/helper.h    |    2 +-
 target-arm/op_helper.c |    2 +-
 target-arm/translate.c |    6 ++++--
 3 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/target-arm/helper.h b/target-arm/helper.h
index b123d3e..b2d2670 100644
--- a/target-arm/helper.h
+++ b/target-arm/helper.h
@@ -145,7 +145,7 @@ DEF_HELPER_5(neon_tbl, i32, env, i32, i32, i32, i32)
 DEF_HELPER_3(adc_cc, i32, env, i32, i32)
 DEF_HELPER_3(sbc_cc, i32, env, i32, i32)
 
-DEF_HELPER_3(sar, i32, env, i32, i32)
+DEF_HELPER_FLAGS_2(sar, TCG_CALL_CONST | TCG_CALL_PURE, i32, i32, i32)
 DEF_HELPER_3(shl_cc, i32, env, i32, i32)
 DEF_HELPER_3(shr_cc, i32, env, i32, i32)
 DEF_HELPER_3(sar_cc, i32, env, i32, i32)
diff --git a/target-arm/op_helper.c b/target-arm/op_helper.c
index 5fcd12c..0148818 100644
--- a/target-arm/op_helper.c
+++ b/target-arm/op_helper.c
@@ -355,7 +355,7 @@ uint32_t HELPER(sbc_cc)(CPUARMState *env, uint32_t a, uint32_t b)
 
 /* Similarly for variable shift instructions.  */
 
-uint32_t HELPER(sar)(CPUARMState *env, uint32_t x, uint32_t i)
+uint32_t HELPER(sar)(uint32_t x, uint32_t i)
 {
     int shift = i & 0xff;
     if (shift >= 32)
diff --git a/target-arm/translate.c b/target-arm/translate.c
index 9c29065..cb5bca4 100644
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@@ -542,7 +542,9 @@ static inline void gen_arm_shift_reg(TCGv var, int shiftop,
         case 1:
             gen_shr(var, var, shift);
             break;
-        case 2: gen_helper_sar(var, cpu_env, var, shift); break;
+        case 2:
+            gen_helper_sar(var, var, shift);
+            break;
         case 3: tcg_gen_andi_i32(shift, shift, 0x1f);
                 tcg_gen_rotr_i32(var, var, shift); break;
         }
@@ -9201,7 +9203,7 @@ static void disas_thumb_insn(CPUARMState *env, DisasContext *s)
             break;
         case 0x4: /* asr */
             if (s->condexec_mask) {
-                gen_helper_sar(tmp2, cpu_env, tmp2, tmp);
+                gen_helper_sar(tmp2, tmp2, tmp);
             } else {
                 gen_helper_sar_cc(tmp2, cpu_env, tmp2, tmp);
                 gen_logic_CC(tmp2);
-- 
1.7.10.4

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [Qemu-devel] [PATCH 5/5] target-arm: mark a few integer helpers const and pure
  2012-09-16 23:08 [Qemu-devel] [PATCH 0/5] target-arm: misc optimizations Aurelien Jarno
                   ` (3 preceding siblings ...)
  2012-09-16 23:08 ` [Qemu-devel] [PATCH 4/5] target-arm: optimize helper_sar Aurelien Jarno
@ 2012-09-16 23:08 ` Aurelien Jarno
  4 siblings, 0 replies; 18+ messages in thread
From: Aurelien Jarno @ 2012-09-16 23:08 UTC (permalink / raw)
  To: qemu-devel; +Cc: Peter Maydell, Aurelien Jarno

Cc: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Aurelien Jarno <aurelien@aurel32.net>
---
 target-arm/helper.h |   19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/target-arm/helper.h b/target-arm/helper.h
index b2d2670..e2d181b 100644
--- a/target-arm/helper.h
+++ b/target-arm/helper.h
@@ -1,8 +1,8 @@
 #include "def-helper.h"
 
-DEF_HELPER_1(clz, i32, i32)
-DEF_HELPER_1(sxtb16, i32, i32)
-DEF_HELPER_1(uxtb16, i32, i32)
+DEF_HELPER_FLAGS_1(clz, TCG_CALL_CONST | TCG_CALL_PURE, i32, i32)
+DEF_HELPER_FLAGS_1(sxtb16, TCG_CALL_CONST | TCG_CALL_PURE, i32, i32)
+DEF_HELPER_FLAGS_1(uxtb16, TCG_CALL_CONST | TCG_CALL_PURE, i32, i32)
 
 DEF_HELPER_3(add_setq, i32, env, i32, i32)
 DEF_HELPER_3(add_saturate, i32, env, i32, i32)
@@ -10,10 +10,10 @@ DEF_HELPER_3(sub_saturate, i32, env, i32, i32)
 DEF_HELPER_3(add_usaturate, i32, env, i32, i32)
 DEF_HELPER_3(sub_usaturate, i32, env, i32, i32)
 DEF_HELPER_2(double_saturate, i32, env, s32)
-DEF_HELPER_2(sdiv, s32, s32, s32)
-DEF_HELPER_2(udiv, i32, i32, i32)
-DEF_HELPER_1(rbit, i32, i32)
-DEF_HELPER_1(abs, i32, i32)
+DEF_HELPER_FLAGS_2(sdiv, TCG_CALL_CONST | TCG_CALL_PURE, s32, s32, s32)
+DEF_HELPER_FLAGS_2(udiv, TCG_CALL_CONST | TCG_CALL_PURE, i32, i32, i32)
+DEF_HELPER_FLAGS_1(rbit, TCG_CALL_CONST | TCG_CALL_PURE, i32, i32)
+DEF_HELPER_FLAGS_1(abs, TCG_CALL_CONST | TCG_CALL_PURE, i32, i32)
 
 #define PAS_OP(pfx)  \
     DEF_HELPER_3(pfx ## add8, i32, i32, i32, ptr) \
@@ -45,11 +45,12 @@ DEF_HELPER_3(usat, i32, env, i32, i32)
 DEF_HELPER_3(ssat16, i32, env, i32, i32)
 DEF_HELPER_3(usat16, i32, env, i32, i32)
 
-DEF_HELPER_2(usad8, i32, i32, i32)
+DEF_HELPER_FLAGS_2(usad8, TCG_CALL_CONST | TCG_CALL_PURE, i32, i32, i32)
 
 DEF_HELPER_1(logicq_cc, i32, i64)
 
-DEF_HELPER_3(sel_flags, i32, i32, i32, i32)
+DEF_HELPER_FLAGS_3(sel_flags, TCG_CALL_CONST | TCG_CALL_PURE,
+                   i32, i32, i32, i32)
 DEF_HELPER_2(exception, void, env, i32)
 DEF_HELPER_1(wfi, void, env)
 
-- 
1.7.10.4

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* Re: [Qemu-devel] [PATCH 3/5] target-arm: convert shl and shr helpers to TCG
  2012-09-16 23:08 ` [Qemu-devel] [PATCH 3/5] target-arm: convert shl and shr " Aurelien Jarno
@ 2012-09-17  9:30   ` Laurent Desnogues
  2012-09-17  9:43     ` Peter Maydell
  2012-09-17 15:36   ` Richard Henderson
  1 sibling, 1 reply; 18+ messages in thread
From: Laurent Desnogues @ 2012-09-17  9:30 UTC (permalink / raw)
  To: Aurelien Jarno; +Cc: Peter Maydell, qemu-devel

On Mon, Sep 17, 2012 at 1:08 AM, Aurelien Jarno <aurelien@aurel32.net> wrote:
> Now that the setcond TCG op is available, it's possible to replace
> shl and shr helpers by TCG code. The code generated by TCG is slightly
> longer than the code generated by GCC for the helper but is still worth
> it as this avoid all the consequences of using an helper: globals saved
> back to memory, no possible optimization, call overhead, etc.
>
> Cc: Peter Maydell <peter.maydell@linaro.org>
> Signed-off-by: Aurelien Jarno <aurelien@aurel32.net>
> ---
>  target-arm/helper.h    |    2 --
>  target-arm/op_helper.c |   16 ----------------
>  target-arm/translate.c |   32 ++++++++++++++++++++++++++++----
>  3 files changed, 28 insertions(+), 22 deletions(-)
>
> diff --git a/target-arm/helper.h b/target-arm/helper.h
> index 7151e28..b123d3e 100644
> --- a/target-arm/helper.h
> +++ b/target-arm/helper.h
> @@ -145,8 +145,6 @@ DEF_HELPER_5(neon_tbl, i32, env, i32, i32, i32, i32)
>  DEF_HELPER_3(adc_cc, i32, env, i32, i32)
>  DEF_HELPER_3(sbc_cc, i32, env, i32, i32)
>
> -DEF_HELPER_3(shl, i32, env, i32, i32)
> -DEF_HELPER_3(shr, i32, env, i32, i32)
>  DEF_HELPER_3(sar, i32, env, i32, i32)
>  DEF_HELPER_3(shl_cc, i32, env, i32, i32)
>  DEF_HELPER_3(shr_cc, i32, env, i32, i32)
> diff --git a/target-arm/op_helper.c b/target-arm/op_helper.c
> index 6095f24..5fcd12c 100644
> --- a/target-arm/op_helper.c
> +++ b/target-arm/op_helper.c
> @@ -355,22 +355,6 @@ uint32_t HELPER(sbc_cc)(CPUARMState *env, uint32_t a, uint32_t b)
>
>  /* Similarly for variable shift instructions.  */
>
> -uint32_t HELPER(shl)(CPUARMState *env, uint32_t x, uint32_t i)
> -{
> -    int shift = i & 0xff;
> -    if (shift >= 32)
> -        return 0;
> -    return x << shift;
> -}
> -
> -uint32_t HELPER(shr)(CPUARMState *env, uint32_t x, uint32_t i)
> -{
> -    int shift = i & 0xff;
> -    if (shift >= 32)
> -        return 0;
> -    return (uint32_t)x >> shift;
> -}
> -
>  uint32_t HELPER(sar)(CPUARMState *env, uint32_t x, uint32_t i)
>  {
>      int shift = i & 0xff;
> diff --git a/target-arm/translate.c b/target-arm/translate.c
> index 19bb1e8..9c29065 100644
> --- a/target-arm/translate.c
> +++ b/target-arm/translate.c
> @@ -440,6 +440,26 @@ static void gen_sub_CC(TCGv dest, TCGv t0, TCGv t1)
>      tcg_gen_mov_i32(dest, cpu_NF);
>  }
>
> +#define GEN_SHIFT(name)                                \
> +static void gen_##name(TCGv dest, TCGv t0, TCGv t1)    \
> +{                                                      \
> +    TCGv tmp1, tmp2;                                   \
> +    tmp1 = tcg_temp_new_i32();                         \
> +    tcg_gen_andi_i32(tmp1, t1, 0xff);                  \
> +    tmp2 = tcg_temp_new_i32();                         \
> +    tcg_gen_setcondi_i32(TCG_COND_GE, tmp2, tmp1, 32); \
> +    tcg_gen_andi_i32(tmp1, tmp1, 0x1f);                \

I don't think the 'and 0x1f' is needed given that later you'll and
with 0 if the shift amount is >= 32.

> +    tcg_gen_##name##_i32(dest, t0, tmp1);              \
> +    tcg_temp_free_i32(tmp1);                           \
> +    tcg_gen_subi_i32(tmp2, tmp2, 1);                   \
> +    tcg_gen_and_i32(dest, dest, tmp2);                 \
> +    tcg_temp_free_i32(tmp2);                           \
> +}


Laurent

> +GEN_SHIFT(shl)
> +GEN_SHIFT(shr)
> +#undef GEN_SHIFT
> +
> +
>  /* FIXME:  Implement this natively.  */
>  #define tcg_gen_abs_i32(t0, t1) gen_helper_abs(t0, t1)
>
> @@ -516,8 +536,12 @@ static inline void gen_arm_shift_reg(TCGv var, int shiftop,
>          }
>      } else {
>          switch (shiftop) {
> -        case 0: gen_helper_shl(var, cpu_env, var, shift); break;
> -        case 1: gen_helper_shr(var, cpu_env, var, shift); break;
> +        case 0:
> +            gen_shl(var, var, shift);
> +            break;
> +        case 1:
> +            gen_shr(var, var, shift);
> +            break;
>          case 2: gen_helper_sar(var, cpu_env, var, shift); break;
>          case 3: tcg_gen_andi_i32(shift, shift, 0x1f);
>                  tcg_gen_rotr_i32(var, var, shift); break;
> @@ -9161,7 +9185,7 @@ static void disas_thumb_insn(CPUARMState *env, DisasContext *s)
>              break;
>          case 0x2: /* lsl */
>              if (s->condexec_mask) {
> -                gen_helper_shl(tmp2, cpu_env, tmp2, tmp);
> +                gen_shl(tmp2, tmp2, tmp);
>              } else {
>                  gen_helper_shl_cc(tmp2, cpu_env, tmp2, tmp);
>                  gen_logic_CC(tmp2);
> @@ -9169,7 +9193,7 @@ static void disas_thumb_insn(CPUARMState *env, DisasContext *s)
>              break;
>          case 0x3: /* lsr */
>              if (s->condexec_mask) {
> -                gen_helper_shr(tmp2, cpu_env, tmp2, tmp);
> +                gen_shr(tmp2, tmp2, tmp);
>              } else {
>                  gen_helper_shr_cc(tmp2, cpu_env, tmp2, tmp);
>                  gen_logic_CC(tmp2);
> --
> 1.7.10.4
>
>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Qemu-devel] [PATCH 3/5] target-arm: convert shl and shr helpers to TCG
  2012-09-17  9:30   ` Laurent Desnogues
@ 2012-09-17  9:43     ` Peter Maydell
  2012-09-17 13:36       ` Laurent Desnogues
  2012-09-17 15:34       ` Richard Henderson
  0 siblings, 2 replies; 18+ messages in thread
From: Peter Maydell @ 2012-09-17  9:43 UTC (permalink / raw)
  To: Laurent Desnogues; +Cc: qemu-devel, Aurelien Jarno

On 17 September 2012 10:30, Laurent Desnogues
<laurent.desnogues@gmail.com> wrote:
> On Mon, Sep 17, 2012 at 1:08 AM, Aurelien Jarno <aurelien@aurel32.net> wrote:
>> +#define GEN_SHIFT(name)                                \
>> +static void gen_##name(TCGv dest, TCGv t0, TCGv t1)    \
>> +{                                                      \
>> +    TCGv tmp1, tmp2;                                   \
>> +    tmp1 = tcg_temp_new_i32();                         \
>> +    tcg_gen_andi_i32(tmp1, t1, 0xff);                  \
>> +    tmp2 = tcg_temp_new_i32();                         \
>> +    tcg_gen_setcondi_i32(TCG_COND_GE, tmp2, tmp1, 32); \
>> +    tcg_gen_andi_i32(tmp1, tmp1, 0x1f);                \
>
> I don't think the 'and 0x1f' is needed given that later you'll and
> with 0 if the shift amount is >= 32.

The TCG shift operations are undefined behaviour (not merely
undefined result) if the shift is >= 32, so we must avoid
doing that even if we're going to throw away the answer.

-- PMM

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Qemu-devel] [PATCH 3/5] target-arm: convert shl and shr helpers to TCG
  2012-09-17  9:43     ` Peter Maydell
@ 2012-09-17 13:36       ` Laurent Desnogues
  2012-09-17 13:50         ` Peter Maydell
  2012-09-18 21:12         ` Edgar E. Iglesias
  2012-09-17 15:34       ` Richard Henderson
  1 sibling, 2 replies; 18+ messages in thread
From: Laurent Desnogues @ 2012-09-17 13:36 UTC (permalink / raw)
  To: Peter Maydell; +Cc: qemu-devel, Aurelien Jarno

On Mon, Sep 17, 2012 at 11:43 AM, Peter Maydell
<peter.maydell@linaro.org> wrote:
> On 17 September 2012 10:30, Laurent Desnogues
> <laurent.desnogues@gmail.com> wrote:
>> On Mon, Sep 17, 2012 at 1:08 AM, Aurelien Jarno <aurelien@aurel32.net> wrote:
>>> +#define GEN_SHIFT(name)                                \
>>> +static void gen_##name(TCGv dest, TCGv t0, TCGv t1)    \
>>> +{                                                      \
>>> +    TCGv tmp1, tmp2;                                   \
>>> +    tmp1 = tcg_temp_new_i32();                         \
>>> +    tcg_gen_andi_i32(tmp1, t1, 0xff);                  \
>>> +    tmp2 = tcg_temp_new_i32();                         \
>>> +    tcg_gen_setcondi_i32(TCG_COND_GE, tmp2, tmp1, 32); \
>>> +    tcg_gen_andi_i32(tmp1, tmp1, 0x1f);                \
>>
>> I don't think the 'and 0x1f' is needed given that later you'll and
>> with 0 if the shift amount is >= 32.
>
> The TCG shift operations are undefined behaviour (not merely
> undefined result) if the shift is >= 32, so we must avoid
> doing that even if we're going to throw away the answer.

That's odd that it doesn't just state that the result is undefined.
I wonder what "undefined behavior" means.  I understand
what undefined behavior (as opposed tu undefined result)
means for divisions by 0, but not for a shift larger than data
type width.

Anyway that makes my comment about removing the & 0x1f
pointless.


Laurent

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Qemu-devel] [PATCH 3/5] target-arm: convert shl and shr helpers to TCG
  2012-09-17 13:36       ` Laurent Desnogues
@ 2012-09-17 13:50         ` Peter Maydell
  2012-09-18 21:12         ` Edgar E. Iglesias
  1 sibling, 0 replies; 18+ messages in thread
From: Peter Maydell @ 2012-09-17 13:50 UTC (permalink / raw)
  To: Laurent Desnogues; +Cc: qemu-devel, Aurelien Jarno

On 17 September 2012 14:36, Laurent Desnogues
<laurent.desnogues@gmail.com> wrote:
> On Mon, Sep 17, 2012 at 11:43 AM, Peter Maydell
> <peter.maydell@linaro.org> wrote:
>> The TCG shift operations are undefined behaviour (not merely
>> undefined result) if the shift is >= 32, so we must avoid
>> doing that even if we're going to throw away the answer.
>
> That's odd that it doesn't just state that the result is undefined.
> I wonder what "undefined behavior" means.  I understand
> what undefined behavior (as opposed tu undefined result)
> means for divisions by 0, but not for a shift larger than data
> type width.

Well, it means anything the implementation likes :-)
The only concrete example I could come up with was the old
pre-286 behaviour where the 8086 didn't mask the shift
count at all, so a variable shift by a large amount would
be implemented as that many one-bit-per-cycle shifts and
could take an extremely long time.

-- PMM

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Qemu-devel] [PATCH 3/5] target-arm: convert shl and shr helpers to TCG
  2012-09-17  9:43     ` Peter Maydell
  2012-09-17 13:36       ` Laurent Desnogues
@ 2012-09-17 15:34       ` Richard Henderson
  2012-09-17 15:41         ` Peter Maydell
  1 sibling, 1 reply; 18+ messages in thread
From: Richard Henderson @ 2012-09-17 15:34 UTC (permalink / raw)
  To: Peter Maydell; +Cc: Laurent Desnogues, qemu-devel, Aurelien Jarno

On 09/17/2012 02:43 AM, Peter Maydell wrote:
> The TCG shift operations are undefined behaviour (not merely
> undefined result) if the shift is >= 32, so we must avoid
> doing that even if we're going to throw away the answer.

I don't think that ought to be true.

In particular, I've just posted a patch to cure the Sparc TCG
backend from producing an illegal insn in the face of a shift
with a count of 64.

I'd much rather we simply had an undefined result.


r~

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Qemu-devel] [PATCH 3/5] target-arm: convert shl and shr helpers to TCG
  2012-09-16 23:08 ` [Qemu-devel] [PATCH 3/5] target-arm: convert shl and shr " Aurelien Jarno
  2012-09-17  9:30   ` Laurent Desnogues
@ 2012-09-17 15:36   ` Richard Henderson
  2012-09-17 18:09     ` Aurelien Jarno
  1 sibling, 1 reply; 18+ messages in thread
From: Richard Henderson @ 2012-09-17 15:36 UTC (permalink / raw)
  To: Aurelien Jarno; +Cc: Peter Maydell, qemu-devel

On 09/16/2012 04:08 PM, Aurelien Jarno wrote:
> +    tcg_gen_setcondi_i32(TCG_COND_GE, tmp2, tmp1, 32); \
> +    tcg_gen_andi_i32(tmp1, tmp1, 0x1f);                \
> +    tcg_gen_##name##_i32(dest, t0, tmp1);              \
> +    tcg_temp_free_i32(tmp1);                           \
> +    tcg_gen_subi_i32(tmp2, tmp2, 1);                   \
> +    tcg_gen_and_i32(dest, dest, tmp2);                 \

Ought I revive my movcond patch?

Surely you can agree it's much more natural to write this
as a conditional move of zero than playing masking games.


r~

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Qemu-devel] [PATCH 3/5] target-arm: convert shl and shr helpers to TCG
  2012-09-17 15:34       ` Richard Henderson
@ 2012-09-17 15:41         ` Peter Maydell
  2012-09-17 15:44           ` Richard Henderson
  0 siblings, 1 reply; 18+ messages in thread
From: Peter Maydell @ 2012-09-17 15:41 UTC (permalink / raw)
  To: Richard Henderson; +Cc: Laurent Desnogues, qemu-devel, Aurelien Jarno

On 17 September 2012 16:34, Richard Henderson <rth@twiddle.net> wrote:
> On 09/17/2012 02:43 AM, Peter Maydell wrote:
>> The TCG shift operations are undefined behaviour (not merely
>> undefined result) if the shift is >= 32, so we must avoid
>> doing that even if we're going to throw away the answer.
>
> I don't think that ought to be true.
>
> In particular, I've just posted a patch to cure the Sparc TCG
> backend from producing an illegal insn in the face of a shift
> with a count of 64.
>
> I'd much rather we simply had an undefined result.

Variously:
 * I didn't really want to have to audit all the TCG backends
   to see if they were taking advantage of the 'undefined behaviour'
   clause
 * the C standard says 'undefined behaviour', not 'unknown value',
   for out of range shifts
 * the TCG spec doesn't currently have the concept of 'unknown
   value' at all and I wasn't sure it was worth introducing it

which all added up to 'not really worth it to save one insn'.

Of these, the first is the most significant. If you want to wade
through architecture manuals for 8 architectures, be my guest :-)

-- PMM

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Qemu-devel] [PATCH 3/5] target-arm: convert shl and shr helpers to TCG
  2012-09-17 15:41         ` Peter Maydell
@ 2012-09-17 15:44           ` Richard Henderson
  0 siblings, 0 replies; 18+ messages in thread
From: Richard Henderson @ 2012-09-17 15:44 UTC (permalink / raw)
  To: Peter Maydell; +Cc: Laurent Desnogues, qemu-devel, Aurelien Jarno

On 09/17/2012 08:41 AM, Peter Maydell wrote:
> Of these, the first is the most significant. If you want to wade
> through architecture manuals for 8 architectures, be my guest :-)

I think that's probably easier than auditing all of our guests now.  ;-)


r~

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Qemu-devel] [PATCH 3/5] target-arm: convert shl and shr helpers to TCG
  2012-09-17 15:36   ` Richard Henderson
@ 2012-09-17 18:09     ` Aurelien Jarno
  2012-09-17 18:47       ` Richard Henderson
  0 siblings, 1 reply; 18+ messages in thread
From: Aurelien Jarno @ 2012-09-17 18:09 UTC (permalink / raw)
  To: Richard Henderson; +Cc: Peter Maydell, qemu-devel

On Mon, Sep 17, 2012 at 08:36:31AM -0700, Richard Henderson wrote:
> On 09/16/2012 04:08 PM, Aurelien Jarno wrote:
> > +    tcg_gen_setcondi_i32(TCG_COND_GE, tmp2, tmp1, 32); \
> > +    tcg_gen_andi_i32(tmp1, tmp1, 0x1f);                \
> > +    tcg_gen_##name##_i32(dest, t0, tmp1);              \
> > +    tcg_temp_free_i32(tmp1);                           \
> > +    tcg_gen_subi_i32(tmp2, tmp2, 1);                   \
> > +    tcg_gen_and_i32(dest, dest, tmp2);                 \
> 
> Ought I revive my movcond patch?
> 
> Surely you can agree it's much more natural to write this
> as a conditional move of zero than playing masking games.
> 

I agree it's more natural, that said in that case it doesn't spare that
many instructions, and I am still not sure we want to introduce such a
TCG op. What is important is to avoid helpers and branches which are 
really killing the performances.

If you insist, maybe we can just add a movcond op that uses setcond, so
it makes the code more readable?

We introduced setcond a lot of time ago, and there are still plenty of
places where it's not used. We can get higher performances already by
fixing that.

-- 
Aurelien Jarno                          GPG: 1024D/F1BCDB73
aurelien@aurel32.net                 http://www.aurel32.net

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Qemu-devel] [PATCH 3/5] target-arm: convert shl and shr helpers to TCG
  2012-09-17 18:09     ` Aurelien Jarno
@ 2012-09-17 18:47       ` Richard Henderson
  2012-09-17 20:03         ` Aurelien Jarno
  0 siblings, 1 reply; 18+ messages in thread
From: Richard Henderson @ 2012-09-17 18:47 UTC (permalink / raw)
  To: Aurelien Jarno; +Cc: Peter Maydell, qemu-devel

On 09/17/2012 11:09 AM, Aurelien Jarno wrote:
> If you insist, maybe we can just add a movcond op that uses setcond, so
> it makes the code more readable?

Well, that's certainly a good first step.  And an easy fall back for
hosts that choose not to implement the full conditional move.  But at
least 4 of our 8 hosts do have a proper cmove insn.

But honestly I can't understand the resistance to movcond.


r~

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Qemu-devel] [PATCH 3/5] target-arm: convert shl and shr helpers to TCG
  2012-09-17 18:47       ` Richard Henderson
@ 2012-09-17 20:03         ` Aurelien Jarno
  0 siblings, 0 replies; 18+ messages in thread
From: Aurelien Jarno @ 2012-09-17 20:03 UTC (permalink / raw)
  To: Richard Henderson; +Cc: Peter Maydell, qemu-devel

On Mon, Sep 17, 2012 at 11:47:05AM -0700, Richard Henderson wrote:
> On 09/17/2012 11:09 AM, Aurelien Jarno wrote:
> > If you insist, maybe we can just add a movcond op that uses setcond, so
> > it makes the code more readable?
> 
> Well, that's certainly a good first step.  And an easy fall back for
> hosts that choose not to implement the full conditional move.  But at
> least 4 of our 8 hosts do have a proper cmove insn.
> 
> But honestly I can't understand the resistance to movcond.
> 

I am not fundamentally opposed to a conditional mov instruction, that
said there are things I don't like in the way you implemented movcond.
When introducing new instructions, especially mandatory ones, we have
to really take care of designing it correctly, as changing it latter
won't be that easy and might requite an audit of all targets (look at
the shifts undefined result/behavior).

What I don't really like in the implementation you proposed:
- It's a mandatory op.
- It's implemented using jumps as a fallback at least on i386 and mips.
  One of the main reasons to have a movcond instruction, is to avoid
  jumps.
- I am not sure it really matches what's the host CPUs are providing,
  and also not sure it matches the needs. What you offer is a mix
  between isel and cmov.

One last argument is that in the patch series you offered, movcond
wasn't used a lot in the targets. Now that we might use it for
implementing shifts this argument has lost parts of its value.

-- 
Aurelien Jarno                          GPG: 1024D/F1BCDB73
aurelien@aurel32.net                 http://www.aurel32.net

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Qemu-devel] [PATCH 3/5] target-arm: convert shl and shr helpers to TCG
  2012-09-17 13:36       ` Laurent Desnogues
  2012-09-17 13:50         ` Peter Maydell
@ 2012-09-18 21:12         ` Edgar E. Iglesias
  1 sibling, 0 replies; 18+ messages in thread
From: Edgar E. Iglesias @ 2012-09-18 21:12 UTC (permalink / raw)
  To: Laurent Desnogues; +Cc: Peter Maydell, qemu-devel, Aurelien Jarno

On Mon, Sep 17, 2012 at 03:36:46PM +0200, Laurent Desnogues wrote:
> On Mon, Sep 17, 2012 at 11:43 AM, Peter Maydell
> <peter.maydell@linaro.org> wrote:
> > On 17 September 2012 10:30, Laurent Desnogues
> > <laurent.desnogues@gmail.com> wrote:
> >> On Mon, Sep 17, 2012 at 1:08 AM, Aurelien Jarno <aurelien@aurel32.net> wrote:
> >>> +#define GEN_SHIFT(name)                                \
> >>> +static void gen_##name(TCGv dest, TCGv t0, TCGv t1)    \
> >>> +{                                                      \
> >>> +    TCGv tmp1, tmp2;                                   \
> >>> +    tmp1 = tcg_temp_new_i32();                         \
> >>> +    tcg_gen_andi_i32(tmp1, t1, 0xff);                  \
> >>> +    tmp2 = tcg_temp_new_i32();                         \
> >>> +    tcg_gen_setcondi_i32(TCG_COND_GE, tmp2, tmp1, 32); \
> >>> +    tcg_gen_andi_i32(tmp1, tmp1, 0x1f);                \
> >>
> >> I don't think the 'and 0x1f' is needed given that later you'll and
> >> with 0 if the shift amount is >= 32.
> >
> > The TCG shift operations are undefined behaviour (not merely
> > undefined result) if the shift is >= 32, so we must avoid
> > doing that even if we're going to throw away the answer.
> 
> That's odd that it doesn't just state that the result is undefined.
> I wonder what "undefined behavior" means.  I understand
> what undefined behavior (as opposed tu undefined result)
> means for divisions by 0, but not for a shift larger than data
> type width.
> 
> Anyway that makes my comment about removing the & 0x1f
> pointless.

Hi,

I'm a bit late with email but I don't think your comment is
irrelevant. It sounds like if we are optimizing for very odd
or nonexisting archs. Is there any modern arch that has
undefined behaviour (in reality and not only in theoretical
specs) these days?

Cheers,
E

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2012-09-18 21:12 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2012-09-16 23:08 [Qemu-devel] [PATCH 0/5] target-arm: misc optimizations Aurelien Jarno
2012-09-16 23:08 ` [Qemu-devel] [PATCH 1/5] target-arm: use globals for CC flags Aurelien Jarno
2012-09-16 23:08 ` [Qemu-devel] [PATCH 2/5] target-arm: convert add_cc and sub_cc helpers to TCG Aurelien Jarno
2012-09-16 23:08 ` [Qemu-devel] [PATCH 3/5] target-arm: convert shl and shr " Aurelien Jarno
2012-09-17  9:30   ` Laurent Desnogues
2012-09-17  9:43     ` Peter Maydell
2012-09-17 13:36       ` Laurent Desnogues
2012-09-17 13:50         ` Peter Maydell
2012-09-18 21:12         ` Edgar E. Iglesias
2012-09-17 15:34       ` Richard Henderson
2012-09-17 15:41         ` Peter Maydell
2012-09-17 15:44           ` Richard Henderson
2012-09-17 15:36   ` Richard Henderson
2012-09-17 18:09     ` Aurelien Jarno
2012-09-17 18:47       ` Richard Henderson
2012-09-17 20:03         ` Aurelien Jarno
2012-09-16 23:08 ` [Qemu-devel] [PATCH 4/5] target-arm: optimize helper_sar Aurelien Jarno
2012-09-16 23:08 ` [Qemu-devel] [PATCH 5/5] target-arm: mark a few integer helpers const and pure Aurelien Jarno

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).