qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed
* [Qemu-devel] [PATCH v2 0/9] target-sh4: optimizations and cleanups
@ 2013-12-22 11:50 Aurelien Jarno
  2013-12-22 11:50 ` [Qemu-devel] [PATCH v2 1/9] target-sh4: use bit number for SR constants Aurelien Jarno
                   ` (8 more replies)
  0 siblings, 9 replies; 19+ messages in thread
From: Aurelien Jarno @ 2013-12-22 11:50 UTC (permalink / raw)
  To: qemu-devel; +Cc: Aurelien Jarno

This patchset improves the SH4 emulation by using the recently added
TCG instructions, namely add2, sub2 and movcond. To that end, the T, Q
and M bits are split out of the SR register.

The last three patches clean up the code.

Changes v1 -> v2:
- rebased
- added last patch

Aurelien Jarno (9):
  target-sh4: use bit number for SR constants
  target-sh4: Split out T from SR
  target-sh4: optimize addc using add2
  target-sh4: optimize subc using sub2
  target-sh4: optimize negc using add2 and sub2
  target-sh4: split out Q and M from of SR and optimize div1
  target-sh4: factorize fmov implementation
  target-sh4: remove dead code
  target-sh4: simplify tas instruction

 target-sh4/cpu.c       |    3 +-
 target-sh4/cpu.h       |   51 +++++---
 target-sh4/gdbstub.c   |    8 +-
 target-sh4/helper.c    |   29 ++---
 target-sh4/helper.h    |    1 -
 target-sh4/op_helper.c |  148 +----------------------
 target-sh4/translate.c |  313 ++++++++++++++++++++++++------------------------
 7 files changed, 221 insertions(+), 332 deletions(-)

--
1.7.10.4

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [Qemu-devel] [PATCH v2 1/9] target-sh4: use bit number for SR constants
  2013-12-22 11:50 [Qemu-devel] [PATCH v2 0/9] target-sh4: optimizations and cleanups Aurelien Jarno
@ 2013-12-22 11:50 ` Aurelien Jarno
  2013-12-24 14:10   ` Richard Henderson
  2013-12-22 11:50 ` [Qemu-devel] [PATCH v2 2/9] target-sh4: Split out T from SR Aurelien Jarno
                   ` (7 subsequent siblings)
  8 siblings, 1 reply; 19+ messages in thread
From: Aurelien Jarno @ 2013-12-22 11:50 UTC (permalink / raw)
  To: qemu-devel; +Cc: Aurelien Jarno

Use the bit number for SR constants instead of using a bit mask. This
makes it possible to also use the constants for shifts.

Signed-off-by: Aurelien Jarno <aurelien@aurel32.net>
---
 target-sh4/cpu.c       |    3 +-
 target-sh4/cpu.h       |   30 +++++++++----------
 target-sh4/gdbstub.c   |    4 +--
 target-sh4/helper.c    |   27 ++++++++---------
 target-sh4/op_helper.c |   26 ++++++++---------
 target-sh4/translate.c |   75 +++++++++++++++++++++++++-----------------------
 6 files changed, 85 insertions(+), 80 deletions(-)

diff --git a/target-sh4/cpu.c b/target-sh4/cpu.c
index c23294d..1d80f47 100644
--- a/target-sh4/cpu.c
+++ b/target-sh4/cpu.c
@@ -56,7 +56,8 @@ static void superh_cpu_reset(CPUState *s)
     env->fpscr = FPSCR_PR; /* value for userspace according to the kernel */
     set_float_rounding_mode(float_round_nearest_even, &env->fp_status); /* ?! */
 #else
-    env->sr = SR_MD | SR_RB | SR_BL | SR_I3 | SR_I2 | SR_I1 | SR_I0;
+    env->sr = (1u << SR_MD) | (1u << SR_RB) | (1u << SR_BL) |
+              (1u << SR_I3) | (1u << SR_I2) | (1u << SR_I1) | (1u << SR_I0);
     env->fpscr = FPSCR_DN | FPSCR_RM_ZERO; /* CPU reset value according to SH4 manual */
     set_float_rounding_mode(float_round_to_zero, &env->fp_status);
     set_flush_to_zero(1, &env->fp_status);
diff --git a/target-sh4/cpu.h b/target-sh4/cpu.h
index c181dda..c8ba70f 100644
--- a/target-sh4/cpu.h
+++ b/target-sh4/cpu.h
@@ -48,18 +48,18 @@
 #define TARGET_PHYS_ADDR_SPACE_BITS 32
 #define TARGET_VIRT_ADDR_SPACE_BITS 32
 
-#define SR_MD (1 << 30)
-#define SR_RB (1 << 29)
-#define SR_BL (1 << 28)
-#define SR_FD (1 << 15)
-#define SR_M  (1 << 9)
-#define SR_Q  (1 << 8)
-#define SR_I3 (1 << 7)
-#define SR_I2 (1 << 6)
-#define SR_I1 (1 << 5)
-#define SR_I0 (1 << 4)
-#define SR_S  (1 << 1)
-#define SR_T  (1 << 0)
+#define SR_MD 30
+#define SR_RB 29
+#define SR_BL 28
+#define SR_FD 15
+#define SR_M  9
+#define SR_Q  8
+#define SR_I3 7
+#define SR_I2 6
+#define SR_I1 5
+#define SR_I0 4
+#define SR_S  1
+#define SR_T  0
 
 #define FPSCR_MASK             (0x003fffff)
 #define FPSCR_FR               (1 << 21)
@@ -242,7 +242,7 @@ static inline CPUSH4State *cpu_init(const char *cpu_model)
 #define MMU_USER_IDX 1
 static inline int cpu_mmu_index (CPUSH4State *env)
 {
-    return (env->sr & SR_MD) == 0 ? 1 : 0;
+    return (env->sr & (1u << SR_MD)) == 0 ? 1 : 0;
 }
 
 #include "exec/cpu-all.h"
@@ -347,8 +347,8 @@ static inline void cpu_get_tb_cpu_state(CPUSH4State *env, target_ulong *pc,
     *flags = (env->flags & (DELAY_SLOT | DELAY_SLOT_CONDITIONAL
                     | DELAY_SLOT_TRUE | DELAY_SLOT_CLEARME))   /* Bits  0- 3 */
             | (env->fpscr & (FPSCR_FR | FPSCR_SZ | FPSCR_PR))  /* Bits 19-21 */
-            | (env->sr & (SR_MD | SR_RB))                      /* Bits 29-30 */
-            | (env->sr & SR_FD)                                /* Bit 15 */
+            | (env->sr & ((1u << SR_MD) | (1u << SR_RB)))      /* Bits 29-30 */
+            | (env->sr & (1u << SR_FD))                        /* Bit 15 */
             | (env->movcal_backup ? TB_FLAG_PENDING_MOVCA : 0); /* Bit 4 */
 }
 
diff --git a/target-sh4/gdbstub.c b/target-sh4/gdbstub.c
index df4fa2a..05ba728 100644
--- a/target-sh4/gdbstub.c
+++ b/target-sh4/gdbstub.c
@@ -31,7 +31,7 @@ int superh_cpu_gdb_read_register(CPUState *cs, uint8_t *mem_buf, int n)
 
     switch (n) {
     case 0 ... 7:
-        if ((env->sr & (SR_MD | SR_RB)) == (SR_MD | SR_RB)) {
+        if ((env->sr & (1u << SR_MD)) && (env->sr & (1u << SR_RB))) {
             return gdb_get_regl(mem_buf, env->gregs[n + 16]);
         } else {
             return gdb_get_regl(mem_buf, env->gregs[n]);
@@ -83,7 +83,7 @@ int superh_cpu_gdb_write_register(CPUState *cs, uint8_t *mem_buf, int n)
 
     switch (n) {
     case 0 ... 7:
-        if ((env->sr & (SR_MD | SR_RB)) == (SR_MD | SR_RB)) {
+        if ((env->sr & (1u << SR_MD)) && (env->sr & (1u << SR_RB))) {
             env->gregs[n + 16] = ldl_p(mem_buf);
         } else {
             env->gregs[n] = ldl_p(mem_buf);
diff --git a/target-sh4/helper.c b/target-sh4/helper.c
index 9ac2825..07f8e91 100644
--- a/target-sh4/helper.c
+++ b/target-sh4/helper.c
@@ -93,7 +93,7 @@ void superh_cpu_do_interrupt(CPUState *cs)
     do_exp = env->exception_index != -1;
     do_irq = do_irq && (env->exception_index == -1);
 
-    if (env->sr & SR_BL) {
+    if (env->sr & (1u << SR_BL)) {
         if (do_exp && env->exception_index != 0x1e0) {
             env->exception_index = 0x000; /* masked exception -> reset */
         }
@@ -165,7 +165,7 @@ void superh_cpu_do_interrupt(CPUState *cs)
     env->ssr = env->sr;
     env->spc = env->pc;
     env->sgr = env->gregs[15];
-    env->sr |= SR_BL | SR_MD | SR_RB;
+    env->sr |= (1u << SR_BL) | (1u << SR_MD) | (1u << SR_RB);
 
     if (env->flags & (DELAY_SLOT | DELAY_SLOT_CONDITIONAL)) {
         /* Branch instruction should be executed again before delay slot. */
@@ -182,7 +182,7 @@ void superh_cpu_do_interrupt(CPUState *cs)
         case 0x000:
         case 0x020:
         case 0x140:
-            env->sr &= ~SR_FD;
+            env->sr &= ~(1u << SR_FD);
             env->sr |= 0xf << 4; /* IMASK */
             env->pc = 0xa0000000;
             break;
@@ -349,23 +349,24 @@ static int get_mmu_address(CPUSH4State * env, target_ulong * physical,
     int use_asid, n;
     tlb_t *matching = NULL;
 
-    use_asid = (env->mmucr & MMUCR_SV) == 0 || (env->sr & SR_MD) == 0;
+    use_asid = !(env->mmucr & MMUCR_SV) || !(env->sr & (1u << SR_MD));
 
     if (rw == 2) {
         n = find_itlb_entry(env, address, use_asid);
 	if (n >= 0) {
 	    matching = &env->itlb[n];
-	    if (!(env->sr & SR_MD) && !(matching->pr & 2))
+            if (!(env->sr & (1u << SR_MD)) && !(matching->pr & 2)) {
 		n = MMU_ITLB_VIOLATION;
-	    else
+            } else {
 		*prot = PAGE_EXEC;
+            }
         } else {
             n = find_utlb_entry(env, address, use_asid);
             if (n >= 0) {
                 n = copy_utlb_entry_itlb(env, n);
                 matching = &env->itlb[n];
-                if (!(env->sr & SR_MD) && !(matching->pr & 2)) {
-                      n = MMU_ITLB_VIOLATION;
+                if (!(env->sr & (1u << SR_MD)) && !(matching->pr & 2)) {
+                    n = MMU_ITLB_VIOLATION;
                 } else {
                     *prot = PAGE_READ | PAGE_EXEC;
                     if ((matching->pr & 1) && matching->d) {
@@ -382,7 +383,7 @@ static int get_mmu_address(CPUSH4State * env, target_ulong * physical,
 	n = find_utlb_entry(env, address, use_asid);
 	if (n >= 0) {
 	    matching = &env->utlb[n];
-            if (!(env->sr & SR_MD) && !(matching->pr & 2)) {
+            if (!(env->sr & (1u << SR_MD)) && !(matching->pr & 2)) {
                 n = (rw == 1) ? MMU_DTLB_VIOLATION_WRITE :
                     MMU_DTLB_VIOLATION_READ;
             } else if ((rw == 1) && !(matching->pr & 1)) {
@@ -415,7 +416,7 @@ static int get_physical_address(CPUSH4State * env, target_ulong * physical,
     /* P1, P2 and P4 areas do not use translation */
     if ((address >= 0x80000000 && address < 0xc0000000) ||
 	address >= 0xe0000000) {
-	if (!(env->sr & SR_MD)
+        if (!(env->sr & (1u << SR_MD))
 	    && (address < 0xe0000000 || address >= 0xe4000000)) {
 	    /* Unauthorized access in user mode (only store queues are available) */
 	    fprintf(stderr, "Unauthorized access\n");
@@ -681,7 +682,7 @@ void cpu_sh4_write_mmaped_utlb_addr(CPUSH4State *s, hwaddr addr,
     uint8_t d = (uint8_t)((mem_value & 0x00000200) >> 9);
     uint8_t v = (uint8_t)((mem_value & 0x00000100) >> 8);
     uint8_t asid = (uint8_t)(mem_value & 0x000000ff);
-    int use_asid = (s->mmucr & MMUCR_SV) == 0 || (s->sr & SR_MD) == 0;
+    int use_asid = !(s->mmucr & MMUCR_SV) || !(s->sr & (1u << SR_MD));
 
     if (associate) {
         int i;
@@ -807,10 +808,10 @@ void cpu_sh4_write_mmaped_utlb_data(CPUSH4State *s, hwaddr addr,
 int cpu_sh4_is_cached(CPUSH4State * env, target_ulong addr)
 {
     int n;
-    int use_asid = (env->mmucr & MMUCR_SV) == 0 || (env->sr & SR_MD) == 0;
+    int use_asid = !(env->mmucr & MMUCR_SV) || !(env->sr & (1u << SR_MD));
 
     /* check area */
-    if (env->sr & SR_MD) {
+    if (env->sr & (1u << SR_MD)) {
         /* For previledged mode, P2 and P4 area is not cachable. */
         if ((0xA0000000 <= addr && addr < 0xC0000000) || 0xE0000000 <= addr)
             return 0;
diff --git a/target-sh4/op_helper.c b/target-sh4/op_helper.c
index e955e81..6d56df2 100644
--- a/target-sh4/op_helper.c
+++ b/target-sh4/op_helper.c
@@ -166,15 +166,15 @@ void helper_ocbi(CPUSH4State *env, uint32_t address)
     }
 }
 
-#define T (env->sr & SR_T)
-#define Q (env->sr & SR_Q ? 1 : 0)
-#define M (env->sr & SR_M ? 1 : 0)
-#define SETT env->sr |= SR_T
-#define CLRT env->sr &= ~SR_T
-#define SETQ env->sr |= SR_Q
-#define CLRQ env->sr &= ~SR_Q
-#define SETM env->sr |= SR_M
-#define CLRM env->sr &= ~SR_M
+#define T (env->sr & (1u << SR_T))
+#define Q (env->sr & (1u << SR_Q) ? 1 : 0)
+#define M (env->sr & (1u << SR_M) ? 1 : 0)
+#define SETT (env->sr |= (1u << SR_T))
+#define CLRT (env->sr &= ~(1u << SR_T))
+#define SETQ (env->sr |= (1u << SR_Q))
+#define CLRQ (env->sr &= ~(1u << SR_Q))
+#define SETM (env->sr |= (1u << SR_M))
+#define CLRM (env->sr &= ~(1u << SR_M))
 
 uint32_t helper_div1(CPUSH4State *env, uint32_t arg0, uint32_t arg1)
 {
@@ -292,7 +292,7 @@ void helper_macl(CPUSH4State *env, uint32_t arg0, uint32_t arg1)
     res += (int64_t) (int32_t) arg0 *(int64_t) (int32_t) arg1;
     env->mach = (res >> 32) & 0xffffffff;
     env->macl = res & 0xffffffff;
-    if (env->sr & SR_S) {
+    if (env->sr & (1u << SR_S)) {
 	if (res < 0)
 	    env->mach |= 0xffff0000;
 	else
@@ -308,7 +308,7 @@ void helper_macw(CPUSH4State *env, uint32_t arg0, uint32_t arg1)
     res += (int64_t) (int16_t) arg0 *(int64_t) (int16_t) arg1;
     env->mach = (res >> 32) & 0xffffffff;
     env->macl = res & 0xffffffff;
-    if (env->sr & SR_S) {
+    if (env->sr & (1u << SR_S)) {
 	if (res < -0x80000000) {
 	    env->mach = 1;
 	    env->macl = 0x80000000;
@@ -321,12 +321,12 @@ void helper_macw(CPUSH4State *env, uint32_t arg0, uint32_t arg1)
 
 static inline void set_t(CPUSH4State *env)
 {
-    env->sr |= SR_T;
+    env->sr |= (1u << SR_T);
 }
 
 static inline void clr_t(CPUSH4State *env)
 {
-    env->sr &= ~SR_T;
+    env->sr &= ~(1u << SR_T);
 }
 
 void helper_ld_fpscr(CPUSH4State *env, uint32_t val)
diff --git a/target-sh4/translate.c b/target-sh4/translate.c
index 661fc6c..214c5c4 100644
--- a/target-sh4/translate.c
+++ b/target-sh4/translate.c
@@ -44,7 +44,7 @@ typedef struct DisasContext {
 #if defined(CONFIG_USER_ONLY)
 #define IS_USER(ctx) 1
 #else
-#define IS_USER(ctx) (!(ctx->flags & SR_MD))
+#define IS_USER(ctx) (!(ctx->flags & (1u << SR_MD)))
 #endif
 
 enum {
@@ -211,7 +211,7 @@ static inline void gen_branch_slot(uint32_t delayed_pc, int t)
     int label = gen_new_label();
     tcg_gen_movi_i32(cpu_delayed_pc, delayed_pc);
     sr = tcg_temp_new();
-    tcg_gen_andi_i32(sr, cpu_sr, SR_T);
+    tcg_gen_andi_i32(sr, cpu_sr, (1u << SR_T));
     tcg_gen_brcondi_i32(t ? TCG_COND_EQ:TCG_COND_NE, sr, 0, label);
     tcg_gen_ori_i32(cpu_flags, cpu_flags, DELAY_SLOT_TRUE);
     gen_set_label(label);
@@ -226,7 +226,7 @@ static void gen_conditional_jump(DisasContext * ctx,
 
     l1 = gen_new_label();
     sr = tcg_temp_new();
-    tcg_gen_andi_i32(sr, cpu_sr, SR_T);
+    tcg_gen_andi_i32(sr, cpu_sr, (1u << SR_T));
     tcg_gen_brcondi_i32(TCG_COND_NE, sr, 0, l1);
     gen_goto_tb(ctx, 0, ifnott);
     gen_set_label(l1);
@@ -255,7 +255,7 @@ static inline void gen_cmp(int cond, TCGv t0, TCGv t1)
 
     t = tcg_temp_new();
     tcg_gen_setcond_i32(cond, t, t1, t0);
-    tcg_gen_andi_i32(cpu_sr, cpu_sr, ~SR_T);
+    tcg_gen_andi_i32(cpu_sr, cpu_sr, ~(1u << SR_T));
     tcg_gen_or_i32(cpu_sr, cpu_sr, t);
 
     tcg_temp_free(t);
@@ -267,7 +267,7 @@ static inline void gen_cmp_imm(int cond, TCGv t0, int32_t imm)
 
     t = tcg_temp_new();
     tcg_gen_setcondi_i32(cond, t, t0, imm);
-    tcg_gen_andi_i32(cpu_sr, cpu_sr, ~SR_T);
+    tcg_gen_andi_i32(cpu_sr, cpu_sr, ~(1u << SR_T));
     tcg_gen_or_i32(cpu_sr, cpu_sr, t);
 
     tcg_temp_free(t);
@@ -323,10 +323,12 @@ static inline void gen_store_fpr64 (TCGv_i64 t, int reg)
 #define B11_8 ((ctx->opcode >> 8) & 0xf)
 #define B15_12 ((ctx->opcode >> 12) & 0xf)
 
-#define REG(x) ((x) < 8 && (ctx->flags & (SR_MD | SR_RB)) == (SR_MD | SR_RB) \
+#define REG(x) ((x) < 8 && (ctx->flags & (1u << SR_MD))\
+                        && (ctx->flags & (1u << SR_RB))\
                 ? (cpu_gregs[x + 16]) : (cpu_gregs[x]))
 
-#define ALTREG(x) ((x) < 8 && (ctx->flags & (SR_MD | SR_RB)) != (SR_MD | SR_RB)\
+#define ALTREG(x) ((x) < 8 && (!(ctx->flags & (1u << SR_MD))\
+                               || !(ctx->flags & (1u << SR_RB)))\
 		? (cpu_gregs[x + 16]) : (cpu_gregs[x]))
 
 #define FREG(x) (ctx->flags & FPSCR_FR ? (x) ^ 0x10 : (x))
@@ -356,7 +358,7 @@ static inline void gen_store_fpr64 (TCGv_i64 t, int reg)
   }
 
 #define CHECK_FPU_ENABLED                                       \
-  if (ctx->flags & SR_FD) {                                     \
+  if (ctx->flags & (1u << SR_FD)) {                             \
       tcg_gen_movi_i32(cpu_pc, ctx->pc);                        \
       if (ctx->flags & (DELAY_SLOT | DELAY_SLOT_CONDITIONAL)) { \
           gen_helper_raise_slot_fpu_disable(cpu_env);           \
@@ -406,7 +408,8 @@ static void _decode_opc(DisasContext * ctx)
 
     switch (ctx->opcode) {
     case 0x0019:		/* div0u */
-	tcg_gen_andi_i32(cpu_sr, cpu_sr, ~(SR_M | SR_Q | SR_T));
+        tcg_gen_andi_i32(cpu_sr, cpu_sr,
+                         ~((1u << SR_M) | (1u << SR_Q) | (1u << SR_T)));
 	return;
     case 0x000b:		/* rts */
 	CHECK_NOT_DELAY_SLOT
@@ -419,10 +422,10 @@ static void _decode_opc(DisasContext * ctx)
 	tcg_gen_movi_i32(cpu_macl, 0);
 	return;
     case 0x0048:		/* clrs */
-	tcg_gen_andi_i32(cpu_sr, cpu_sr, ~SR_S);
+        tcg_gen_andi_i32(cpu_sr, cpu_sr, ~(1u << SR_S));
 	return;
     case 0x0008:		/* clrt */
-        tcg_gen_andi_i32(cpu_sr, cpu_sr, ~SR_T);
+        tcg_gen_andi_i32(cpu_sr, cpu_sr, ~(1u << SR_T));
 	return;
     case 0x0038:		/* ldtlb */
 	CHECK_PRIVILEGED
@@ -437,10 +440,10 @@ static void _decode_opc(DisasContext * ctx)
 	ctx->delayed_pc = (uint32_t) - 1;
 	return;
     case 0x0058:		/* sets */
-	tcg_gen_ori_i32(cpu_sr, cpu_sr, SR_S);
+        tcg_gen_ori_i32(cpu_sr, cpu_sr, (1u << SR_S));
 	return;
     case 0x0018:		/* sett */
-        tcg_gen_ori_i32(cpu_sr, cpu_sr, SR_T);
+        tcg_gen_ori_i32(cpu_sr, cpu_sr, (1u << SR_T));
 	return;
     case 0xfbfd:		/* frchg */
 	tcg_gen_xori_i32(cpu_fpscr, cpu_fpscr, FPSCR_FR);
@@ -658,7 +661,7 @@ static void _decode_opc(DisasContext * ctx)
         {
             TCGv t0, t1, t2;
             t0 = tcg_temp_new();
-            tcg_gen_andi_i32(t0, cpu_sr, SR_T);
+            tcg_gen_andi_i32(t0, cpu_sr, (1u << SR_T));
             t1 = tcg_temp_new();
             tcg_gen_add_i32(t1, REG(B7_4), REG(B11_8));
             tcg_gen_add_i32(t0, t0, t1);
@@ -667,7 +670,7 @@ static void _decode_opc(DisasContext * ctx)
             tcg_gen_setcond_i32(TCG_COND_GTU, t1, t1, t0);
             tcg_gen_or_i32(t1, t1, t2);
             tcg_temp_free(t2);
-            tcg_gen_andi_i32(cpu_sr, cpu_sr, ~SR_T);
+            tcg_gen_andi_i32(cpu_sr, cpu_sr, ~(1u << SR_T));
             tcg_gen_or_i32(cpu_sr, cpu_sr, t1);
             tcg_temp_free(t1);
             tcg_gen_mov_i32(REG(B11_8), t0);
@@ -686,7 +689,7 @@ static void _decode_opc(DisasContext * ctx)
             tcg_gen_andc_i32(t1, t1, t2);
             tcg_temp_free(t2);
             tcg_gen_shri_i32(t1, t1, 31);
-            tcg_gen_andi_i32(cpu_sr, cpu_sr, ~SR_T);
+            tcg_gen_andi_i32(cpu_sr, cpu_sr, ~(1u << SR_T));
             tcg_gen_or_i32(cpu_sr, cpu_sr, t1);
             tcg_temp_free(t1);
             tcg_gen_mov_i32(REG(B7_4), t0);
@@ -715,7 +718,7 @@ static void _decode_opc(DisasContext * ctx)
 	{
 	    TCGv cmp1 = tcg_temp_new();
 	    TCGv cmp2 = tcg_temp_new();
-	    tcg_gen_andi_i32(cpu_sr, cpu_sr, ~SR_T);
+            tcg_gen_andi_i32(cpu_sr, cpu_sr, ~(1u << SR_T));
 	    tcg_gen_xor_i32(cmp1, REG(B7_4), REG(B11_8));
 	    tcg_gen_andi_i32(cmp2, cmp1, 0xff000000);
 	    tcg_gen_setcondi_i32(TCG_COND_EQ, cmp2, cmp2, 0);
@@ -735,11 +738,11 @@ static void _decode_opc(DisasContext * ctx)
 	return;
     case 0x2007:		/* div0s Rm,Rn */
 	{
-	    gen_copy_bit_i32(cpu_sr, 8, REG(B11_8), 31);	/* SR_Q */
-	    gen_copy_bit_i32(cpu_sr, 9, REG(B7_4), 31);		/* SR_M */
+            gen_copy_bit_i32(cpu_sr, SR_Q, REG(B11_8), 31);     /* SR_Q */
+            gen_copy_bit_i32(cpu_sr, SR_M, REG(B7_4), 31);      /* SR_M */
 	    TCGv val = tcg_temp_new();
 	    tcg_gen_xor_i32(val, REG(B7_4), REG(B11_8));
-	    gen_copy_bit_i32(cpu_sr, 0, val, 31);		/* SR_T */
+            gen_copy_bit_i32(cpu_sr, SR_T, val, 31);            /* SR_T */
 	    tcg_temp_free(val);
 	}
 	return;
@@ -828,9 +831,9 @@ static void _decode_opc(DisasContext * ctx)
             t0 = tcg_temp_new();
             tcg_gen_neg_i32(t0, REG(B7_4));
             t1 = tcg_temp_new();
-            tcg_gen_andi_i32(t1, cpu_sr, SR_T);
+            tcg_gen_andi_i32(t1, cpu_sr, (1u << SR_T));
             tcg_gen_sub_i32(REG(B11_8), t0, t1);
-            tcg_gen_andi_i32(cpu_sr, cpu_sr, ~SR_T);
+            tcg_gen_andi_i32(cpu_sr, cpu_sr, ~(1u << SR_T));
             tcg_gen_setcondi_i32(TCG_COND_GTU, t1, t0, 0);
             tcg_gen_or_i32(cpu_sr, cpu_sr, t1);
             tcg_gen_setcond_i32(TCG_COND_GTU, t1, REG(B11_8), t0);
@@ -917,7 +920,7 @@ static void _decode_opc(DisasContext * ctx)
         {
             TCGv t0, t1, t2;
             t0 = tcg_temp_new();
-            tcg_gen_andi_i32(t0, cpu_sr, SR_T);
+            tcg_gen_andi_i32(t0, cpu_sr, (1u << SR_T));
             t1 = tcg_temp_new();
             tcg_gen_sub_i32(t1, REG(B11_8), REG(B7_4));
             tcg_gen_sub_i32(t0, t1, t0);
@@ -926,7 +929,7 @@ static void _decode_opc(DisasContext * ctx)
             tcg_gen_setcond_i32(TCG_COND_LTU, t1, t1, t0);
             tcg_gen_or_i32(t1, t1, t2);
             tcg_temp_free(t2);
-            tcg_gen_andi_i32(cpu_sr, cpu_sr, ~SR_T);
+            tcg_gen_andi_i32(cpu_sr, cpu_sr, ~(1u << SR_T));
             tcg_gen_or_i32(cpu_sr, cpu_sr, t1);
             tcg_temp_free(t1);
             tcg_gen_mov_i32(REG(B11_8), t0);
@@ -945,7 +948,7 @@ static void _decode_opc(DisasContext * ctx)
             tcg_gen_and_i32(t1, t1, t2);
             tcg_temp_free(t2);
             tcg_gen_shri_i32(t1, t1, 31);
-            tcg_gen_andi_i32(cpu_sr, cpu_sr, ~SR_T);
+            tcg_gen_andi_i32(cpu_sr, cpu_sr, ~(1u << SR_T));
             tcg_gen_or_i32(cpu_sr, cpu_sr, t1);
             tcg_temp_free(t1);
             tcg_gen_mov_i32(REG(B11_8), t0);
@@ -1542,7 +1545,7 @@ static void _decode_opc(DisasContext * ctx)
 	tcg_gen_addi_i32(REG(B11_8), REG(B11_8), 4);
 	return;
     case 0x0029:		/* movt Rn */
-	tcg_gen_andi_i32(REG(B11_8), cpu_sr, SR_T);
+        tcg_gen_andi_i32(REG(B11_8), cpu_sr, (1u << SR_T));
 	return;
     case 0x0073:
         /* MOVCO.L
@@ -1552,7 +1555,7 @@ static void _decode_opc(DisasContext * ctx)
         */
         if (ctx->features & SH_FEATURE_SH4A) {
 	    int label = gen_new_label();
-            tcg_gen_andi_i32(cpu_sr, cpu_sr, ~SR_T);
+            tcg_gen_andi_i32(cpu_sr, cpu_sr, ~(1u << SR_T));
 	    tcg_gen_or_i32(cpu_sr, cpu_sr, cpu_ldst);
 	    tcg_gen_brcondi_i32(TCG_COND_EQ, cpu_ldst, 0, label);
             tcg_gen_qemu_st_i32(REG(0), REG(B11_8), ctx->memidx, MO_TEUL);
@@ -1607,9 +1610,9 @@ static void _decode_opc(DisasContext * ctx)
 	{
 	    TCGv tmp = tcg_temp_new();
 	    tcg_gen_mov_i32(tmp, cpu_sr);
-	    gen_copy_bit_i32(cpu_sr, 0, REG(B11_8), 31);
+            gen_copy_bit_i32(cpu_sr, SR_T, REG(B11_8), 31);
 	    tcg_gen_shli_i32(REG(B11_8), REG(B11_8), 1);
-	    gen_copy_bit_i32(REG(B11_8), 0, tmp, 0);
+            gen_copy_bit_i32(REG(B11_8), SR_T, tmp, 0);
 	    tcg_temp_free(tmp);
 	}
 	return;
@@ -1617,7 +1620,7 @@ static void _decode_opc(DisasContext * ctx)
 	{
 	    TCGv tmp = tcg_temp_new();
 	    tcg_gen_mov_i32(tmp, cpu_sr);
-	    gen_copy_bit_i32(cpu_sr, 0, REG(B11_8), 0);
+            gen_copy_bit_i32(cpu_sr, SR_T, REG(B11_8), 0);
 	    tcg_gen_shri_i32(REG(B11_8), REG(B11_8), 1);
 	    gen_copy_bit_i32(REG(B11_8), 31, tmp, 0);
 	    tcg_temp_free(tmp);
@@ -1625,23 +1628,23 @@ static void _decode_opc(DisasContext * ctx)
 	return;
     case 0x4004:		/* rotl Rn */
 	tcg_gen_rotli_i32(REG(B11_8), REG(B11_8), 1);
-	gen_copy_bit_i32(cpu_sr, 0, REG(B11_8), 0);
+        gen_copy_bit_i32(cpu_sr, SR_T, REG(B11_8), 0);
 	return;
     case 0x4005:		/* rotr Rn */
-	gen_copy_bit_i32(cpu_sr, 0, REG(B11_8), 0);
+        gen_copy_bit_i32(cpu_sr, SR_T, REG(B11_8), 0);
 	tcg_gen_rotri_i32(REG(B11_8), REG(B11_8), 1);
 	return;
     case 0x4000:		/* shll Rn */
     case 0x4020:		/* shal Rn */
-	gen_copy_bit_i32(cpu_sr, 0, REG(B11_8), 31);
+        gen_copy_bit_i32(cpu_sr, SR_T, REG(B11_8), 31);
 	tcg_gen_shli_i32(REG(B11_8), REG(B11_8), 1);
 	return;
     case 0x4021:		/* shar Rn */
-	gen_copy_bit_i32(cpu_sr, 0, REG(B11_8), 0);
+        gen_copy_bit_i32(cpu_sr, SR_T, REG(B11_8), 0);
 	tcg_gen_sari_i32(REG(B11_8), REG(B11_8), 1);
 	return;
     case 0x4001:		/* shlr Rn */
-	gen_copy_bit_i32(cpu_sr, 0, REG(B11_8), 0);
+        gen_copy_bit_i32(cpu_sr, SR_T, REG(B11_8), 0);
 	tcg_gen_shri_i32(REG(B11_8), REG(B11_8), 1);
 	return;
     case 0x4008:		/* shll2 Rn */
@@ -1873,7 +1876,7 @@ gen_intermediate_code_internal(SuperHCPU *cpu, TranslationBlock *tb,
     ctx.pc = pc_start;
     ctx.flags = (uint32_t)tb->flags;
     ctx.bstate = BS_NONE;
-    ctx.memidx = (ctx.flags & SR_MD) == 0 ? 1 : 0;
+    ctx.memidx = (ctx.flags & (1u << SR_MD)) == 0 ? 1 : 0;
     /* We don't know if the delayed pc came from a dynamic or static branch,
        so assume it is a dynamic branch.  */
     ctx.delayed_pc = -1; /* use delayed pc from env pointer */
-- 
1.7.10.4

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [Qemu-devel] [PATCH v2 2/9] target-sh4: Split out T from SR
  2013-12-22 11:50 [Qemu-devel] [PATCH v2 0/9] target-sh4: optimizations and cleanups Aurelien Jarno
  2013-12-22 11:50 ` [Qemu-devel] [PATCH v2 1/9] target-sh4: use bit number for SR constants Aurelien Jarno
@ 2013-12-22 11:50 ` Aurelien Jarno
  2013-12-24 14:13   ` Richard Henderson
  2013-12-22 11:50 ` [Qemu-devel] [PATCH v2 3/9] target-sh4: optimize addc using add2 Aurelien Jarno
                   ` (6 subsequent siblings)
  8 siblings, 1 reply; 19+ messages in thread
From: Aurelien Jarno @ 2013-12-22 11:50 UTC (permalink / raw)
  To: qemu-devel; +Cc: Aurelien Jarno

Split the T bit out of SR into its own field, in preparation for
setting it more efficiently.

Signed-off-by: Aurelien Jarno <aurelien@aurel32.net>
---
 target-sh4/cpu.h       |   14 +++-
 target-sh4/gdbstub.c   |    4 +-
 target-sh4/helper.c    |    2 +-
 target-sh4/op_helper.c |   32 ++------
 target-sh4/translate.c |  205 ++++++++++++++++++++----------------------------
 5 files changed, 110 insertions(+), 147 deletions(-)

diff --git a/target-sh4/cpu.h b/target-sh4/cpu.h
index c8ba70f..b7dd7ab 100644
--- a/target-sh4/cpu.h
+++ b/target-sh4/cpu.h
@@ -139,7 +139,8 @@ typedef struct CPUSH4State {
     uint32_t flags;		/* general execution flags */
     uint32_t gregs[24];		/* general registers */
     float32 fregs[32];		/* floating point registers */
-    uint32_t sr;		/* status register */
+    uint32_t sr;                /* status register (with T split out) */
+    uint32_t sr_t;              /* T bit of status register */
     uint32_t ssr;		/* saved status register */
     uint32_t spc;		/* saved program counter */
     uint32_t gbr;		/* global base register */
@@ -339,6 +340,17 @@ static inline int cpu_ptel_pr (uint32_t ptel)
 
 #define TB_FLAG_PENDING_MOVCA  (1 << 4)
 
+static inline target_ulong cpu_read_sr(CPUSH4State *env)
+{
+    return (env->sr & ~(1u << SR_T)) | (env->sr_t << SR_T);
+}
+
+static inline void cpu_write_sr(CPUSH4State *env, target_ulong sr)
+{
+    env->sr_t = sr & (1u << SR_T);
+    env->sr = sr & ~(1u << SR_T);
+}
+
 static inline void cpu_get_tb_cpu_state(CPUSH4State *env, target_ulong *pc,
                                         target_ulong *cs_base, int *flags)
 {
diff --git a/target-sh4/gdbstub.c b/target-sh4/gdbstub.c
index 05ba728..a365a27 100644
--- a/target-sh4/gdbstub.c
+++ b/target-sh4/gdbstub.c
@@ -51,7 +51,7 @@ int superh_cpu_gdb_read_register(CPUState *cs, uint8_t *mem_buf, int n)
     case 21:
         return gdb_get_regl(mem_buf, env->macl);
     case 22:
-        return gdb_get_regl(mem_buf, env->sr);
+        return gdb_get_regl(mem_buf, cpu_read_sr(env));
     case 23:
         return gdb_get_regl(mem_buf, env->fpul);
     case 24:
@@ -111,7 +111,7 @@ int superh_cpu_gdb_write_register(CPUState *cs, uint8_t *mem_buf, int n)
         env->macl = ldl_p(mem_buf);
         break;
     case 22:
-        env->sr = ldl_p(mem_buf);
+        cpu_write_sr(env, ldl_p(mem_buf));
         break;
     case 23:
         env->fpul = ldl_p(mem_buf);
diff --git a/target-sh4/helper.c b/target-sh4/helper.c
index 07f8e91..a33db4d 100644
--- a/target-sh4/helper.c
+++ b/target-sh4/helper.c
@@ -162,7 +162,7 @@ void superh_cpu_do_interrupt(CPUState *cs)
         log_cpu_state(cs, 0);
     }
 
-    env->ssr = env->sr;
+    env->ssr = cpu_read_sr(env);
     env->spc = env->pc;
     env->sgr = env->gregs[15];
     env->sr |= (1u << SR_BL) | (1u << SR_MD) | (1u << SR_RB);
diff --git a/target-sh4/op_helper.c b/target-sh4/op_helper.c
index 6d56df2..0e881a8 100644
--- a/target-sh4/op_helper.c
+++ b/target-sh4/op_helper.c
@@ -166,11 +166,11 @@ void helper_ocbi(CPUSH4State *env, uint32_t address)
     }
 }
 
-#define T (env->sr & (1u << SR_T))
+#define T (env->sr_t)
 #define Q (env->sr & (1u << SR_Q) ? 1 : 0)
 #define M (env->sr & (1u << SR_M) ? 1 : 0)
-#define SETT (env->sr |= (1u << SR_T))
-#define CLRT (env->sr &= ~(1u << SR_T))
+#define SETT (env->sr_t = 1)
+#define CLRT (env->sr_t = 0)
 #define SETQ (env->sr |= (1u << SR_Q))
 #define CLRQ (env->sr &= ~(1u << SR_Q))
 #define SETM (env->sr |= (1u << SR_M))
@@ -319,16 +319,6 @@ void helper_macw(CPUSH4State *env, uint32_t arg0, uint32_t arg1)
     }
 }
 
-static inline void set_t(CPUSH4State *env)
-{
-    env->sr |= (1u << SR_T);
-}
-
-static inline void clr_t(CPUSH4State *env)
-{
-    env->sr &= ~(1u << SR_T);
-}
-
 void helper_ld_fpscr(CPUSH4State *env, uint32_t val)
 {
     env->fpscr = val & FPSCR_MASK;
@@ -413,10 +403,8 @@ void helper_fcmp_eq_FT(CPUSH4State *env, float32 t0, float32 t1)
     relation = float32_compare(t0, t1, &env->fp_status);
     if (unlikely(relation == float_relation_unordered)) {
         update_fpscr(env, GETPC());
-    } else if (relation == float_relation_equal) {
-        set_t(env);
     } else {
-        clr_t(env);
+        env->sr_t = (relation == float_relation_equal);
     }
 }
 
@@ -428,10 +416,8 @@ void helper_fcmp_eq_DT(CPUSH4State *env, float64 t0, float64 t1)
     relation = float64_compare(t0, t1, &env->fp_status);
     if (unlikely(relation == float_relation_unordered)) {
         update_fpscr(env, GETPC());
-    } else if (relation == float_relation_equal) {
-        set_t(env);
     } else {
-        clr_t(env);
+        env->sr_t = (relation == float_relation_equal);
     }
 }
 
@@ -443,10 +429,8 @@ void helper_fcmp_gt_FT(CPUSH4State *env, float32 t0, float32 t1)
     relation = float32_compare(t0, t1, &env->fp_status);
     if (unlikely(relation == float_relation_unordered)) {
         update_fpscr(env, GETPC());
-    } else if (relation == float_relation_greater) {
-        set_t(env);
     } else {
-        clr_t(env);
+        env->sr_t = (relation == float_relation_greater);
     }
 }
 
@@ -458,10 +442,8 @@ void helper_fcmp_gt_DT(CPUSH4State *env, float64 t0, float64 t1)
     relation = float64_compare(t0, t1, &env->fp_status);
     if (unlikely(relation == float_relation_unordered)) {
         update_fpscr(env, GETPC());
-    } else if (relation == float_relation_greater) {
-        set_t(env);
     } else {
-        clr_t(env);
+        env->sr_t = (relation == float_relation_greater);
     }
 }
 
diff --git a/target-sh4/translate.c b/target-sh4/translate.c
index 214c5c4..fad9869 100644
--- a/target-sh4/translate.c
+++ b/target-sh4/translate.c
@@ -59,7 +59,7 @@ enum {
 /* global register indexes */
 static TCGv_ptr cpu_env;
 static TCGv cpu_gregs[24];
-static TCGv cpu_pc, cpu_sr, cpu_ssr, cpu_spc, cpu_gbr;
+static TCGv cpu_pc, cpu_sr, cpu_sr_t, cpu_ssr, cpu_spc, cpu_gbr;
 static TCGv cpu_vbr, cpu_sgr, cpu_dbr, cpu_mach, cpu_macl;
 static TCGv cpu_pr, cpu_fpscr, cpu_fpul, cpu_ldst;
 static TCGv cpu_fregs[32];
@@ -107,6 +107,8 @@ void sh4_translate_init(void)
                                     offsetof(CPUSH4State, pc), "PC");
     cpu_sr = tcg_global_mem_new_i32(TCG_AREG0,
                                     offsetof(CPUSH4State, sr), "SR");
+    cpu_sr_t = tcg_global_mem_new_i32(TCG_AREG0,
+                                    offsetof(CPUSH4State, sr_t), "SR_T");
     cpu_ssr = tcg_global_mem_new_i32(TCG_AREG0,
                                      offsetof(CPUSH4State, ssr), "SSR");
     cpu_spc = tcg_global_mem_new_i32(TCG_AREG0,
@@ -153,7 +155,7 @@ void superh_cpu_dump_state(CPUState *cs, FILE *f,
     CPUSH4State *env = &cpu->env;
     int i;
     cpu_fprintf(f, "pc=0x%08x sr=0x%08x pr=0x%08x fpscr=0x%08x\n",
-		env->pc, env->sr, env->pr, env->fpscr);
+                env->pc, cpu_read_sr(env), env->pr, env->fpscr);
     cpu_fprintf(f, "spc=0x%08x ssr=0x%08x gbr=0x%08x vbr=0x%08x\n",
 		env->spc, env->ssr, env->gbr, env->vbr);
     cpu_fprintf(f, "sgr=0x%08x dbr=0x%08x delayed_pc=0x%08x fpul=0x%08x\n",
@@ -171,6 +173,17 @@ void superh_cpu_dump_state(CPUState *cs, FILE *f,
 		    env->delayed_pc);
     }
 }
+static void gen_read_sr(TCGv dst)
+{
+    tcg_gen_andi_i32(dst, cpu_sr, ~(1u << SR_T));
+    tcg_gen_or_i32(dst, dst, cpu_sr_t);
+}
+
+static void gen_write_sr(TCGv src)
+{
+    tcg_gen_andi_i32(cpu_sr, src, ~(1u << SR_T));
+    tcg_gen_andi_i32(cpu_sr_t, src, (1u << SR_T));
+}
 
 static void gen_goto_tb(DisasContext * ctx, int n, target_ulong dest)
 {
@@ -207,12 +220,9 @@ static void gen_jump(DisasContext * ctx)
 
 static inline void gen_branch_slot(uint32_t delayed_pc, int t)
 {
-    TCGv sr;
     int label = gen_new_label();
     tcg_gen_movi_i32(cpu_delayed_pc, delayed_pc);
-    sr = tcg_temp_new();
-    tcg_gen_andi_i32(sr, cpu_sr, (1u << SR_T));
-    tcg_gen_brcondi_i32(t ? TCG_COND_EQ:TCG_COND_NE, sr, 0, label);
+    tcg_gen_brcondi_i32(t ? TCG_COND_EQ : TCG_COND_NE, cpu_sr_t, 0, label);
     tcg_gen_ori_i32(cpu_flags, cpu_flags, DELAY_SLOT_TRUE);
     gen_set_label(label);
 }
@@ -221,13 +231,8 @@ static inline void gen_branch_slot(uint32_t delayed_pc, int t)
 static void gen_conditional_jump(DisasContext * ctx,
 				 target_ulong ift, target_ulong ifnott)
 {
-    int l1;
-    TCGv sr;
-
-    l1 = gen_new_label();
-    sr = tcg_temp_new();
-    tcg_gen_andi_i32(sr, cpu_sr, (1u << SR_T));
-    tcg_gen_brcondi_i32(TCG_COND_NE, sr, 0, l1);
+    int l1 = gen_new_label();
+    tcg_gen_brcondi_i32(TCG_COND_NE, cpu_sr_t, 0, l1);
     gen_goto_tb(ctx, 0, ifnott);
     gen_set_label(l1);
     gen_goto_tb(ctx, 1, ift);
@@ -249,30 +254,6 @@ static void gen_delayed_conditional_jump(DisasContext * ctx)
     gen_jump(ctx);
 }
 
-static inline void gen_cmp(int cond, TCGv t0, TCGv t1)
-{
-    TCGv t;
-
-    t = tcg_temp_new();
-    tcg_gen_setcond_i32(cond, t, t1, t0);
-    tcg_gen_andi_i32(cpu_sr, cpu_sr, ~(1u << SR_T));
-    tcg_gen_or_i32(cpu_sr, cpu_sr, t);
-
-    tcg_temp_free(t);
-}
-
-static inline void gen_cmp_imm(int cond, TCGv t0, int32_t imm)
-{
-    TCGv t;
-
-    t = tcg_temp_new();
-    tcg_gen_setcondi_i32(cond, t, t0, imm);
-    tcg_gen_andi_i32(cpu_sr, cpu_sr, ~(1u << SR_T));
-    tcg_gen_or_i32(cpu_sr, cpu_sr, t);
-
-    tcg_temp_free(t);
-}
-
 static inline void gen_store_flags(uint32_t flags)
 {
     tcg_gen_andi_i32(cpu_flags, cpu_flags, DELAY_SLOT_TRUE);
@@ -408,8 +389,8 @@ static void _decode_opc(DisasContext * ctx)
 
     switch (ctx->opcode) {
     case 0x0019:		/* div0u */
-        tcg_gen_andi_i32(cpu_sr, cpu_sr,
-                         ~((1u << SR_M) | (1u << SR_Q) | (1u << SR_T)));
+        tcg_gen_andi_i32(cpu_sr, cpu_sr, ~((1u << SR_M) | (1u << SR_Q)));
+        tcg_gen_movi_i32(cpu_sr_t, 0);
 	return;
     case 0x000b:		/* rts */
 	CHECK_NOT_DELAY_SLOT
@@ -425,7 +406,7 @@ static void _decode_opc(DisasContext * ctx)
         tcg_gen_andi_i32(cpu_sr, cpu_sr, ~(1u << SR_S));
 	return;
     case 0x0008:		/* clrt */
-        tcg_gen_andi_i32(cpu_sr, cpu_sr, ~(1u << SR_T));
+        tcg_gen_movi_i32(cpu_sr_t, 0);
 	return;
     case 0x0038:		/* ldtlb */
 	CHECK_PRIVILEGED
@@ -434,7 +415,7 @@ static void _decode_opc(DisasContext * ctx)
     case 0x002b:		/* rte */
 	CHECK_PRIVILEGED
 	CHECK_NOT_DELAY_SLOT
-	tcg_gen_mov_i32(cpu_sr, cpu_ssr);
+        gen_write_sr(cpu_ssr);
 	tcg_gen_mov_i32(cpu_delayed_pc, cpu_spc);
 	ctx->flags |= DELAY_SLOT;
 	ctx->delayed_pc = (uint32_t) - 1;
@@ -443,7 +424,7 @@ static void _decode_opc(DisasContext * ctx)
         tcg_gen_ori_i32(cpu_sr, cpu_sr, (1u << SR_S));
 	return;
     case 0x0018:		/* sett */
-        tcg_gen_ori_i32(cpu_sr, cpu_sr, (1u << SR_T));
+        tcg_gen_movi_i32(cpu_sr_t, 1);
 	return;
     case 0xfbfd:		/* frchg */
 	tcg_gen_xori_i32(cpu_fpscr, cpu_fpscr, FPSCR_FR);
@@ -659,22 +640,17 @@ static void _decode_opc(DisasContext * ctx)
 	return;
     case 0x300e:		/* addc Rm,Rn */
         {
-            TCGv t0, t1, t2;
+            TCGv t0, t1;
             t0 = tcg_temp_new();
-            tcg_gen_andi_i32(t0, cpu_sr, (1u << SR_T));
             t1 = tcg_temp_new();
-            tcg_gen_add_i32(t1, REG(B7_4), REG(B11_8));
-            tcg_gen_add_i32(t0, t0, t1);
-            t2 = tcg_temp_new();
-            tcg_gen_setcond_i32(TCG_COND_GTU, t2, REG(B11_8), t1);
-            tcg_gen_setcond_i32(TCG_COND_GTU, t1, t1, t0);
-            tcg_gen_or_i32(t1, t1, t2);
-            tcg_temp_free(t2);
-            tcg_gen_andi_i32(cpu_sr, cpu_sr, ~(1u << SR_T));
-            tcg_gen_or_i32(cpu_sr, cpu_sr, t1);
-            tcg_temp_free(t1);
-            tcg_gen_mov_i32(REG(B11_8), t0);
+            tcg_gen_add_i32(t0, REG(B7_4), REG(B11_8));
+            tcg_gen_add_i32(t1, cpu_sr_t, t0);
+            tcg_gen_setcond_i32(TCG_COND_GTU, cpu_sr_t, REG(B11_8), t0);
+            tcg_gen_setcond_i32(TCG_COND_GTU, t0, t0, t1);
+            tcg_gen_or_i32(cpu_sr_t, cpu_sr_t, t0);
             tcg_temp_free(t0);
+            tcg_gen_mov_i32(REG(B11_8), t1);
+            tcg_temp_free(t1);
         }
 	return;
     case 0x300f:		/* addv Rm,Rn */
@@ -686,11 +662,9 @@ static void _decode_opc(DisasContext * ctx)
             tcg_gen_xor_i32(t1, t0, REG(B11_8));
             t2 = tcg_temp_new();
             tcg_gen_xor_i32(t2, REG(B7_4), REG(B11_8));
-            tcg_gen_andc_i32(t1, t1, t2);
+            tcg_gen_andc_i32(cpu_sr_t, t1, t2);
             tcg_temp_free(t2);
-            tcg_gen_shri_i32(t1, t1, 31);
-            tcg_gen_andi_i32(cpu_sr, cpu_sr, ~(1u << SR_T));
-            tcg_gen_or_i32(cpu_sr, cpu_sr, t1);
+            tcg_gen_shri_i32(cpu_sr_t, cpu_sr_t, 31);
             tcg_temp_free(t1);
             tcg_gen_mov_i32(REG(B7_4), t0);
             tcg_temp_free(t0);
@@ -700,38 +674,36 @@ static void _decode_opc(DisasContext * ctx)
 	tcg_gen_and_i32(REG(B11_8), REG(B11_8), REG(B7_4));
 	return;
     case 0x3000:		/* cmp/eq Rm,Rn */
-	gen_cmp(TCG_COND_EQ, REG(B7_4), REG(B11_8));
+        tcg_gen_setcond_i32(TCG_COND_EQ, cpu_sr_t, REG(B11_8), REG(B7_4));
 	return;
     case 0x3003:		/* cmp/ge Rm,Rn */
-	gen_cmp(TCG_COND_GE, REG(B7_4), REG(B11_8));
+        tcg_gen_setcond_i32(TCG_COND_GE, cpu_sr_t, REG(B11_8), REG(B7_4));
 	return;
     case 0x3007:		/* cmp/gt Rm,Rn */
-	gen_cmp(TCG_COND_GT, REG(B7_4), REG(B11_8));
+        tcg_gen_setcond_i32(TCG_COND_GT, cpu_sr_t, REG(B11_8), REG(B7_4));
 	return;
     case 0x3006:		/* cmp/hi Rm,Rn */
-	gen_cmp(TCG_COND_GTU, REG(B7_4), REG(B11_8));
+        tcg_gen_setcond_i32(TCG_COND_GTU, cpu_sr_t, REG(B11_8), REG(B7_4));
 	return;
     case 0x3002:		/* cmp/hs Rm,Rn */
-	gen_cmp(TCG_COND_GEU, REG(B7_4), REG(B11_8));
+        tcg_gen_setcond_i32(TCG_COND_GEU, cpu_sr_t, REG(B11_8), REG(B7_4));
 	return;
     case 0x200c:		/* cmp/str Rm,Rn */
 	{
 	    TCGv cmp1 = tcg_temp_new();
 	    TCGv cmp2 = tcg_temp_new();
-            tcg_gen_andi_i32(cpu_sr, cpu_sr, ~(1u << SR_T));
 	    tcg_gen_xor_i32(cmp1, REG(B7_4), REG(B11_8));
 	    tcg_gen_andi_i32(cmp2, cmp1, 0xff000000);
-	    tcg_gen_setcondi_i32(TCG_COND_EQ, cmp2, cmp2, 0);
-	    tcg_gen_or_i32(cpu_sr, cpu_sr, cmp2);
+            tcg_gen_setcondi_i32(TCG_COND_EQ, cpu_sr_t, cmp2, 0);
 	    tcg_gen_andi_i32(cmp2, cmp1, 0x00ff0000);
 	    tcg_gen_setcondi_i32(TCG_COND_EQ, cmp2, cmp2, 0);
-	    tcg_gen_or_i32(cpu_sr, cpu_sr, cmp2);
+            tcg_gen_or_i32(cpu_sr_t, cpu_sr_t, cmp2);
 	    tcg_gen_andi_i32(cmp2, cmp1, 0x0000ff00);
 	    tcg_gen_setcondi_i32(TCG_COND_EQ, cmp2, cmp2, 0);
-	    tcg_gen_or_i32(cpu_sr, cpu_sr, cmp2);
+            tcg_gen_or_i32(cpu_sr_t, cpu_sr_t, cmp2);
 	    tcg_gen_andi_i32(cmp2, cmp1, 0x000000ff);
 	    tcg_gen_setcondi_i32(TCG_COND_EQ, cmp2, cmp2, 0);
-	    tcg_gen_or_i32(cpu_sr, cpu_sr, cmp2);
+            tcg_gen_or_i32(cpu_sr_t, cpu_sr_t, cmp2);
 	    tcg_temp_free(cmp2);
 	    tcg_temp_free(cmp1);
 	}
@@ -741,8 +713,8 @@ static void _decode_opc(DisasContext * ctx)
             gen_copy_bit_i32(cpu_sr, SR_Q, REG(B11_8), 31);     /* SR_Q */
             gen_copy_bit_i32(cpu_sr, SR_M, REG(B7_4), 31);      /* SR_M */
 	    TCGv val = tcg_temp_new();
-	    tcg_gen_xor_i32(val, REG(B7_4), REG(B11_8));
-            gen_copy_bit_i32(cpu_sr, SR_T, val, 31);            /* SR_T */
+            tcg_gen_xor_i32(cpu_sr_t, REG(B7_4), REG(B11_8));
+            tcg_gen_shri_i32(cpu_sr_t, cpu_sr_t, 31);           /* SR_T */
 	    tcg_temp_free(val);
 	}
 	return;
@@ -827,19 +799,13 @@ static void _decode_opc(DisasContext * ctx)
 	return;
     case 0x600a:		/* negc Rm,Rn */
         {
-	    TCGv t0, t1;
-            t0 = tcg_temp_new();
+            TCGv t0 = tcg_temp_new();
             tcg_gen_neg_i32(t0, REG(B7_4));
-            t1 = tcg_temp_new();
-            tcg_gen_andi_i32(t1, cpu_sr, (1u << SR_T));
-            tcg_gen_sub_i32(REG(B11_8), t0, t1);
-            tcg_gen_andi_i32(cpu_sr, cpu_sr, ~(1u << SR_T));
-            tcg_gen_setcondi_i32(TCG_COND_GTU, t1, t0, 0);
-            tcg_gen_or_i32(cpu_sr, cpu_sr, t1);
-            tcg_gen_setcond_i32(TCG_COND_GTU, t1, REG(B11_8), t0);
-            tcg_gen_or_i32(cpu_sr, cpu_sr, t1);
+            tcg_gen_sub_i32(REG(B11_8), t0, cpu_sr_t);
+            tcg_gen_setcondi_i32(TCG_COND_GTU, cpu_sr_t, t0, 0);
+            tcg_gen_setcond_i32(TCG_COND_GTU, t0, REG(B11_8), t0);
+            tcg_gen_or_i32(cpu_sr_t, cpu_sr_t, t0);
             tcg_temp_free(t0);
-            tcg_temp_free(t1);
         }
 	return;
     case 0x6007:		/* not Rm,Rn */
@@ -920,17 +886,14 @@ static void _decode_opc(DisasContext * ctx)
         {
             TCGv t0, t1, t2;
             t0 = tcg_temp_new();
-            tcg_gen_andi_i32(t0, cpu_sr, (1u << SR_T));
             t1 = tcg_temp_new();
             tcg_gen_sub_i32(t1, REG(B11_8), REG(B7_4));
-            tcg_gen_sub_i32(t0, t1, t0);
+            tcg_gen_sub_i32(t0, t1, cpu_sr_t);
             t2 = tcg_temp_new();
             tcg_gen_setcond_i32(TCG_COND_LTU, t2, REG(B11_8), t1);
             tcg_gen_setcond_i32(TCG_COND_LTU, t1, t1, t0);
-            tcg_gen_or_i32(t1, t1, t2);
+            tcg_gen_or_i32(cpu_sr_t, t1, t2);
             tcg_temp_free(t2);
-            tcg_gen_andi_i32(cpu_sr, cpu_sr, ~(1u << SR_T));
-            tcg_gen_or_i32(cpu_sr, cpu_sr, t1);
             tcg_temp_free(t1);
             tcg_gen_mov_i32(REG(B11_8), t0);
             tcg_temp_free(t0);
@@ -947,9 +910,7 @@ static void _decode_opc(DisasContext * ctx)
             tcg_gen_xor_i32(t2, REG(B11_8), REG(B7_4));
             tcg_gen_and_i32(t1, t1, t2);
             tcg_temp_free(t2);
-            tcg_gen_shri_i32(t1, t1, 31);
-            tcg_gen_andi_i32(cpu_sr, cpu_sr, ~(1u << SR_T));
-            tcg_gen_or_i32(cpu_sr, cpu_sr, t1);
+            tcg_gen_shri_i32(cpu_sr_t, t1, 31);
             tcg_temp_free(t1);
             tcg_gen_mov_i32(REG(B11_8), t0);
             tcg_temp_free(t0);
@@ -959,7 +920,7 @@ static void _decode_opc(DisasContext * ctx)
 	{
 	    TCGv val = tcg_temp_new();
 	    tcg_gen_and_i32(val, REG(B7_4), REG(B11_8));
-	    gen_cmp_imm(TCG_COND_EQ, val, 0);
+            tcg_gen_setcondi_i32(TCG_COND_EQ, cpu_sr_t, val, 0);
 	    tcg_temp_free(val);
 	}
 	return;
@@ -1210,7 +1171,7 @@ static void _decode_opc(DisasContext * ctx)
 	ctx->flags |= DELAY_SLOT_CONDITIONAL;
 	return;
     case 0x8800:		/* cmp/eq #imm,R0 */
-	gen_cmp_imm(TCG_COND_EQ, REG(0), B7_0s);
+        tcg_gen_setcondi_i32(TCG_COND_EQ, cpu_sr_t, REG(0), B7_0s);
 	return;
     case 0xc400:		/* mov.b @(disp,GBR),R0 */
 	{
@@ -1326,7 +1287,7 @@ static void _decode_opc(DisasContext * ctx)
 	{
 	    TCGv val = tcg_temp_new();
 	    tcg_gen_andi_i32(val, REG(0), B7_0);
-	    gen_cmp_imm(TCG_COND_EQ, val, 0);
+            tcg_gen_setcondi_i32(TCG_COND_EQ, cpu_sr_t, val, 0);
 	    tcg_temp_free(val);
 	}
 	return;
@@ -1336,7 +1297,7 @@ static void _decode_opc(DisasContext * ctx)
 	    tcg_gen_add_i32(val, REG(0), cpu_gbr);
             tcg_gen_qemu_ld_i32(val, val, ctx->memidx, MO_UB);
 	    tcg_gen_andi_i32(val, val, B7_0);
-	    gen_cmp_imm(TCG_COND_EQ, val, 0);
+            tcg_gen_setcondi_i32(TCG_COND_EQ, cpu_sr_t, val, 0);
 	    tcg_temp_free(val);
 	}
 	return;
@@ -1399,14 +1360,14 @@ static void _decode_opc(DisasContext * ctx)
 	ctx->delayed_pc = (uint32_t) - 1;
 	return;
     case 0x4015:		/* cmp/pl Rn */
-	gen_cmp_imm(TCG_COND_GT, REG(B11_8), 0);
+        tcg_gen_setcondi_i32(TCG_COND_GT, cpu_sr_t, REG(B11_8), 0);
 	return;
     case 0x4011:		/* cmp/pz Rn */
-	gen_cmp_imm(TCG_COND_GE, REG(B11_8), 0);
+        tcg_gen_setcondi_i32(TCG_COND_GE, cpu_sr_t, REG(B11_8), 0);
 	return;
     case 0x4010:		/* dt Rn */
 	tcg_gen_subi_i32(REG(B11_8), REG(B11_8), 1);
-	gen_cmp_imm(TCG_COND_EQ, REG(B11_8), 0);
+        tcg_gen_setcondi_i32(TCG_COND_EQ, cpu_sr_t, REG(B11_8), 0);
 	return;
     case 0x402b:		/* jmp @Rn */
 	CHECK_NOT_DELAY_SLOT
@@ -1423,15 +1384,21 @@ static void _decode_opc(DisasContext * ctx)
 	return;
     case 0x400e:		/* ldc Rm,SR */
 	CHECK_PRIVILEGED
-	tcg_gen_andi_i32(cpu_sr, REG(B11_8), 0x700083f3);
-	ctx->bstate = BS_STOP;
+        {
+            TCGv val = tcg_temp_new();
+            tcg_gen_andi_i32(val, REG(B11_8), 0x700083f3);
+            gen_write_sr(val);
+            tcg_temp_free(val);
+            ctx->bstate = BS_STOP;
+        }
 	return;
     case 0x4007:		/* ldc.l @Rm+,SR */
 	CHECK_PRIVILEGED
 	{
 	    TCGv val = tcg_temp_new();
             tcg_gen_qemu_ld_i32(val, REG(B11_8), ctx->memidx, MO_TESL);
-	    tcg_gen_andi_i32(cpu_sr, val, 0x700083f3);
+            tcg_gen_andi_i32(val, val, 0x700083f3);
+            gen_write_sr(val);
 	    tcg_temp_free(val);
 	    tcg_gen_addi_i32(REG(B11_8), REG(B11_8), 4);
 	    ctx->bstate = BS_STOP;
@@ -1439,15 +1406,18 @@ static void _decode_opc(DisasContext * ctx)
 	return;
     case 0x0002:		/* stc SR,Rn */
 	CHECK_PRIVILEGED
-	tcg_gen_mov_i32(REG(B11_8), cpu_sr);
+        gen_read_sr(REG(B11_8));
 	return;
     case 0x4003:		/* stc SR,@-Rn */
 	CHECK_PRIVILEGED
 	{
 	    TCGv addr = tcg_temp_new();
+            TCGv val = tcg_temp_new();
 	    tcg_gen_subi_i32(addr, REG(B11_8), 4);
-            tcg_gen_qemu_st_i32(cpu_sr, addr, ctx->memidx, MO_TEUL);
+            gen_read_sr(val);
+            tcg_gen_qemu_st_i32(val, addr, ctx->memidx, MO_TEUL);
 	    tcg_gen_mov_i32(REG(B11_8), addr);
+            tcg_temp_free(val);
 	    tcg_temp_free(addr);
 	}
 	return;
@@ -1545,7 +1515,7 @@ static void _decode_opc(DisasContext * ctx)
 	tcg_gen_addi_i32(REG(B11_8), REG(B11_8), 4);
 	return;
     case 0x0029:		/* movt Rn */
-        tcg_gen_andi_i32(REG(B11_8), cpu_sr, (1u << SR_T));
+        tcg_gen_mov_i32(REG(B11_8), cpu_sr_t);
 	return;
     case 0x0073:
         /* MOVCO.L
@@ -1555,8 +1525,7 @@ static void _decode_opc(DisasContext * ctx)
         */
         if (ctx->features & SH_FEATURE_SH4A) {
 	    int label = gen_new_label();
-            tcg_gen_andi_i32(cpu_sr, cpu_sr, ~(1u << SR_T));
-	    tcg_gen_or_i32(cpu_sr, cpu_sr, cpu_ldst);
+            tcg_gen_mov_i32(cpu_sr_t, cpu_ldst);   /* T = LDST only */
 	    tcg_gen_brcondi_i32(TCG_COND_EQ, cpu_ldst, 0, label);
             tcg_gen_qemu_st_i32(REG(0), REG(B11_8), ctx->memidx, MO_TEUL);
 	    gen_set_label(label);
@@ -1609,42 +1578,42 @@ static void _decode_opc(DisasContext * ctx)
     case 0x4024:		/* rotcl Rn */
 	{
 	    TCGv tmp = tcg_temp_new();
-	    tcg_gen_mov_i32(tmp, cpu_sr);
-            gen_copy_bit_i32(cpu_sr, SR_T, REG(B11_8), 31);
+            tcg_gen_mov_i32(tmp, cpu_sr_t);
+            tcg_gen_shri_i32(cpu_sr_t, REG(B11_8), 31);
 	    tcg_gen_shli_i32(REG(B11_8), REG(B11_8), 1);
-            gen_copy_bit_i32(REG(B11_8), SR_T, tmp, 0);
+            tcg_gen_or_i32(REG(B11_8), REG(B11_8), tmp);
 	    tcg_temp_free(tmp);
 	}
 	return;
     case 0x4025:		/* rotcr Rn */
 	{
 	    TCGv tmp = tcg_temp_new();
-	    tcg_gen_mov_i32(tmp, cpu_sr);
-            gen_copy_bit_i32(cpu_sr, SR_T, REG(B11_8), 0);
+            tcg_gen_shli_i32(tmp, cpu_sr_t, 31);
+            tcg_gen_andi_i32(cpu_sr_t, REG(B11_8), 1);
 	    tcg_gen_shri_i32(REG(B11_8), REG(B11_8), 1);
-	    gen_copy_bit_i32(REG(B11_8), 31, tmp, 0);
+            tcg_gen_or_i32(REG(B11_8), REG(B11_8), tmp);
 	    tcg_temp_free(tmp);
 	}
 	return;
     case 0x4004:		/* rotl Rn */
 	tcg_gen_rotli_i32(REG(B11_8), REG(B11_8), 1);
-        gen_copy_bit_i32(cpu_sr, SR_T, REG(B11_8), 0);
+        tcg_gen_andi_i32(cpu_sr_t, REG(B11_8), 1);  /* T = bit 0 (old MSB) */
 	return;
     case 0x4005:		/* rotr Rn */
-        gen_copy_bit_i32(cpu_sr, SR_T, REG(B11_8), 0);
+        tcg_gen_andi_i32(cpu_sr_t, REG(B11_8), 1);  /* T = LSB before rotate */
 	tcg_gen_rotri_i32(REG(B11_8), REG(B11_8), 1);
 	return;
     case 0x4000:		/* shll Rn */
     case 0x4020:		/* shal Rn */
-        gen_copy_bit_i32(cpu_sr, SR_T, REG(B11_8), 31);
+        tcg_gen_shri_i32(cpu_sr_t, REG(B11_8), 31);
 	tcg_gen_shli_i32(REG(B11_8), REG(B11_8), 1);
 	return;
     case 0x4021:		/* shar Rn */
-        gen_copy_bit_i32(cpu_sr, SR_T, REG(B11_8), 0);
+        tcg_gen_andi_i32(cpu_sr_t, REG(B11_8), 1);
 	tcg_gen_sari_i32(REG(B11_8), REG(B11_8), 1);
 	return;
     case 0x4001:		/* shlr Rn */
-        gen_copy_bit_i32(cpu_sr, SR_T, REG(B11_8), 0);
+        tcg_gen_andi_i32(cpu_sr_t, REG(B11_8), 1);
 	tcg_gen_shri_i32(REG(B11_8), REG(B11_8), 1);
 	return;
     case 0x4008:		/* shll2 Rn */
@@ -1672,7 +1641,7 @@ static void _decode_opc(DisasContext * ctx)
 	    tcg_gen_mov_i32(addr, REG(B11_8));
 	    val = tcg_temp_local_new();
             tcg_gen_qemu_ld_i32(val, addr, ctx->memidx, MO_UB);
-	    gen_cmp_imm(TCG_COND_EQ, val, 0);
+            tcg_gen_setcondi_i32(TCG_COND_EQ, cpu_sr_t, val, 0);
 	    tcg_gen_ori_i32(val, val, 0x80);
             tcg_gen_qemu_st_i32(val, addr, ctx->memidx, MO_UB);
 	    tcg_temp_free(val);
-- 
1.7.10.4

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [Qemu-devel] [PATCH v2 3/9] target-sh4: optimize addc using add2
  2013-12-22 11:50 [Qemu-devel] [PATCH v2 0/9] target-sh4: optimizations and cleanups Aurelien Jarno
  2013-12-22 11:50 ` [Qemu-devel] [PATCH v2 1/9] target-sh4: use bit number for SR constants Aurelien Jarno
  2013-12-22 11:50 ` [Qemu-devel] [PATCH v2 2/9] target-sh4: Split out T from SR Aurelien Jarno
@ 2013-12-22 11:50 ` Aurelien Jarno
  2013-12-24 14:18   ` Richard Henderson
  2013-12-22 11:50 ` [Qemu-devel] [PATCH v2 4/9] target-sh4: optimize subc using sub2 Aurelien Jarno
                   ` (5 subsequent siblings)
  8 siblings, 1 reply; 19+ messages in thread
From: Aurelien Jarno @ 2013-12-22 11:50 UTC (permalink / raw)
  To: qemu-devel; +Cc: Aurelien Jarno

Signed-off-by: Aurelien Jarno <aurelien@aurel32.net>
---
 target-sh4/translate.c |   14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/target-sh4/translate.c b/target-sh4/translate.c
index fad9869..31d47b3 100644
--- a/target-sh4/translate.c
+++ b/target-sh4/translate.c
@@ -640,17 +640,15 @@ static void _decode_opc(DisasContext * ctx)
 	return;
     case 0x300e:		/* addc Rm,Rn */
         {
-            TCGv t0, t1;
-            t0 = tcg_temp_new();
+            TCGv t0, t1, t2;
+            t0 = tcg_const_tl(0);
             t1 = tcg_temp_new();
-            tcg_gen_add_i32(t0, REG(B7_4), REG(B11_8));
-            tcg_gen_add_i32(t1, cpu_sr_t, t0);
-            tcg_gen_setcond_i32(TCG_COND_GTU, cpu_sr_t, REG(B11_8), t0);
-            tcg_gen_setcond_i32(TCG_COND_GTU, t0, t0, t1);
-            tcg_gen_or_i32(cpu_sr_t, cpu_sr_t, t0);
+            t2 = tcg_temp_new();
+            tcg_gen_add2_i32(t1, t2, REG(B11_8), t0, REG(B7_4), t0);
+            tcg_gen_add2_i32(REG(B11_8), cpu_sr_t, t1, t2, cpu_sr_t, t0);
             tcg_temp_free(t0);
-            tcg_gen_mov_i32(REG(B11_8), t1);
             tcg_temp_free(t1);
+            tcg_temp_free(t2);
         }
 	return;
     case 0x300f:		/* addv Rm,Rn */
-- 
1.7.10.4

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [Qemu-devel] [PATCH v2 4/9] target-sh4: optimize subc using sub2
  2013-12-22 11:50 [Qemu-devel] [PATCH v2 0/9] target-sh4: optimizations and cleanups Aurelien Jarno
                   ` (2 preceding siblings ...)
  2013-12-22 11:50 ` [Qemu-devel] [PATCH v2 3/9] target-sh4: optimize addc using add2 Aurelien Jarno
@ 2013-12-22 11:50 ` Aurelien Jarno
  2013-12-24 14:22   ` Richard Henderson
  2013-12-22 11:50 ` [Qemu-devel] [PATCH v2 5/9] target-sh4: optimize negc using add2 and sub2 Aurelien Jarno
                   ` (4 subsequent siblings)
  8 siblings, 1 reply; 19+ messages in thread
From: Aurelien Jarno @ 2013-12-22 11:50 UTC (permalink / raw)
  To: qemu-devel; +Cc: Aurelien Jarno

Signed-off-by: Aurelien Jarno <aurelien@aurel32.net>
---
 target-sh4/translate.c |   15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/target-sh4/translate.c b/target-sh4/translate.c
index 31d47b3..21605b0 100644
--- a/target-sh4/translate.c
+++ b/target-sh4/translate.c
@@ -883,18 +883,15 @@ static void _decode_opc(DisasContext * ctx)
     case 0x300a:		/* subc Rm,Rn */
         {
             TCGv t0, t1, t2;
-            t0 = tcg_temp_new();
+            t0 = tcg_const_tl(0);
             t1 = tcg_temp_new();
-            tcg_gen_sub_i32(t1, REG(B11_8), REG(B7_4));
-            tcg_gen_sub_i32(t0, t1, cpu_sr_t);
             t2 = tcg_temp_new();
-            tcg_gen_setcond_i32(TCG_COND_LTU, t2, REG(B11_8), t1);
-            tcg_gen_setcond_i32(TCG_COND_LTU, t1, t1, t0);
-            tcg_gen_or_i32(cpu_sr_t, t1, t2);
-            tcg_temp_free(t2);
-            tcg_temp_free(t1);
-            tcg_gen_mov_i32(REG(B11_8), t0);
+            tcg_gen_sub2_i32(t1, t2, REG(B11_8), t0, REG(B7_4), t0);
+            tcg_gen_sub2_i32(REG(B11_8), cpu_sr_t, t1, t2, cpu_sr_t, t0);
+            tcg_gen_andi_i32(cpu_sr_t, cpu_sr_t, 1);
             tcg_temp_free(t0);
+            tcg_temp_free(t1);
+            tcg_temp_free(t2);
         }
 	return;
     case 0x300b:		/* subv Rm,Rn */
-- 
1.7.10.4

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [Qemu-devel] [PATCH v2 5/9] target-sh4: optimize negc using add2 and sub2
  2013-12-22 11:50 [Qemu-devel] [PATCH v2 0/9] target-sh4: optimizations and cleanups Aurelien Jarno
                   ` (3 preceding siblings ...)
  2013-12-22 11:50 ` [Qemu-devel] [PATCH v2 4/9] target-sh4: optimize subc using sub2 Aurelien Jarno
@ 2013-12-22 11:50 ` Aurelien Jarno
  2013-12-24 14:25   ` Richard Henderson
  2013-12-22 11:50 ` [Qemu-devel] [PATCH v2 6/9] target-sh4: split out Q and M from of SR and optimize div1 Aurelien Jarno
                   ` (3 subsequent siblings)
  8 siblings, 1 reply; 19+ messages in thread
From: Aurelien Jarno @ 2013-12-22 11:50 UTC (permalink / raw)
  To: qemu-devel; +Cc: Aurelien Jarno

Signed-off-by: Aurelien Jarno <aurelien@aurel32.net>
---
 target-sh4/translate.c |   12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/target-sh4/translate.c b/target-sh4/translate.c
index 21605b0..4ef0398 100644
--- a/target-sh4/translate.c
+++ b/target-sh4/translate.c
@@ -797,12 +797,12 @@ static void _decode_opc(DisasContext * ctx)
 	return;
     case 0x600a:		/* negc Rm,Rn */
         {
-            TCGv t0 = tcg_temp_new();
-            tcg_gen_neg_i32(t0, REG(B7_4));
-            tcg_gen_sub_i32(REG(B11_8), t0, cpu_sr_t);
-            tcg_gen_setcondi_i32(TCG_COND_GTU, cpu_sr_t, t0, 0);
-            tcg_gen_setcond_i32(TCG_COND_GTU, t0, REG(B11_8), t0);
-            tcg_gen_or_i32(cpu_sr_t, cpu_sr_t, t0);
+            TCGv t0 = tcg_const_i32(0);
+            tcg_gen_add2_i32(REG(B11_8), cpu_sr_t,
+                             REG(B7_4), t0, cpu_sr_t, t0);
+            tcg_gen_sub2_i32(REG(B11_8), cpu_sr_t,
+                             t0, t0, REG(B11_8), cpu_sr_t);
+            tcg_gen_andi_i32(cpu_sr_t, cpu_sr_t, 1);
             tcg_temp_free(t0);
         }
 	return;
-- 
1.7.10.4

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [Qemu-devel] [PATCH v2 6/9] target-sh4: split out Q and M from of SR and optimize div1
  2013-12-22 11:50 [Qemu-devel] [PATCH v2 0/9] target-sh4: optimizations and cleanups Aurelien Jarno
                   ` (4 preceding siblings ...)
  2013-12-22 11:50 ` [Qemu-devel] [PATCH v2 5/9] target-sh4: optimize negc using add2 and sub2 Aurelien Jarno
@ 2013-12-22 11:50 ` Aurelien Jarno
  2013-12-24 14:44   ` Richard Henderson
  2013-12-22 11:50 ` [Qemu-devel] [PATCH v2 7/9] target-sh4: factorize fmov implementation Aurelien Jarno
                   ` (2 subsequent siblings)
  8 siblings, 1 reply; 19+ messages in thread
From: Aurelien Jarno @ 2013-12-22 11:50 UTC (permalink / raw)
  To: qemu-devel; +Cc: Aurelien Jarno

By splitting Q and M out of SR, it becomes possible to optimize div1 by
using TCG code instead of a helper.

Signed-off-by: Aurelien Jarno <aurelien@aurel32.net>
---
 target-sh4/cpu.h       |   13 ++++--
 target-sh4/helper.h    |    1 -
 target-sh4/op_helper.c |  118 ------------------------------------------------
 target-sh4/translate.c |   67 +++++++++++++++++++++++----
 4 files changed, 68 insertions(+), 131 deletions(-)

diff --git a/target-sh4/cpu.h b/target-sh4/cpu.h
index b7dd7ab..82221c8 100644
--- a/target-sh4/cpu.h
+++ b/target-sh4/cpu.h
@@ -140,6 +140,8 @@ typedef struct CPUSH4State {
     uint32_t gregs[24];		/* general registers */
     float32 fregs[32];		/* floating point registers */
     uint32_t sr;                /* status register (with T split out) */
+    uint32_t sr_m;              /* M bit of status register */
+    uint32_t sr_q;              /* Q bit of status register */
     uint32_t sr_t;              /* T bit of status register */
     uint32_t ssr;		/* saved status register */
     uint32_t spc;		/* saved program counter */
@@ -342,13 +344,18 @@ static inline int cpu_ptel_pr (uint32_t ptel)
 
 static inline target_ulong cpu_read_sr(CPUSH4State *env)
 {
-    return (env->sr & ~(1u << SR_T)) | (env->sr_t << SR_T);
+    return (env->sr & ~((1u << SR_M) | (1u << SR_Q) | (1u << SR_T))) |
+           (env->sr_m << SR_M) |
+           (env->sr_q << SR_Q) |
+           (env->sr_t << SR_T);
 }
 
 static inline void cpu_write_sr(CPUSH4State *env, target_ulong sr)
 {
-    env->sr_t = sr & (1u << SR_T);
-    env->sr = sr & ~(1u << SR_T);
+    env->sr_m = (sr >> SR_M) & 1;
+    env->sr_q = (sr >> SR_Q) & 1;
+    env->sr_t = (sr >> SR_T) & 1;
+    env->sr = sr & ~((1u << SR_M) | (1u << SR_Q) | (1u << SR_T));
 }
 
 static inline void cpu_get_tb_cpu_state(CPUSH4State *env, target_ulong *pc,
diff --git a/target-sh4/helper.h b/target-sh4/helper.h
index 7162448..fbbe264 100644
--- a/target-sh4/helper.h
+++ b/target-sh4/helper.h
@@ -13,7 +13,6 @@ DEF_HELPER_3(movcal, void, env, i32, i32)
 DEF_HELPER_1(discard_movcal_backup, void, env)
 DEF_HELPER_2(ocbi, void, env, i32)
 
-DEF_HELPER_3(div1, i32, env, i32, i32)
 DEF_HELPER_3(macl, void, env, i32, i32)
 DEF_HELPER_3(macw, void, env, i32, i32)
 
diff --git a/target-sh4/op_helper.c b/target-sh4/op_helper.c
index 0e881a8..3ed0e2d 100644
--- a/target-sh4/op_helper.c
+++ b/target-sh4/op_helper.c
@@ -166,124 +166,6 @@ void helper_ocbi(CPUSH4State *env, uint32_t address)
     }
 }
 
-#define T (env->sr_t)
-#define Q (env->sr & (1u << SR_Q) ? 1 : 0)
-#define M (env->sr & (1u << SR_M) ? 1 : 0)
-#define SETT (env->sr_t = 1)
-#define CLRT (env->sr_t = 0)
-#define SETQ (env->sr |= (1u << SR_Q))
-#define CLRQ (env->sr &= ~(1u << SR_Q))
-#define SETM (env->sr |= (1u << SR_M))
-#define CLRM (env->sr &= ~(1u << SR_M))
-
-uint32_t helper_div1(CPUSH4State *env, uint32_t arg0, uint32_t arg1)
-{
-    uint32_t tmp0, tmp2;
-    uint8_t old_q, tmp1 = 0xff;
-
-    //printf("div1 arg0=0x%08x arg1=0x%08x M=%d Q=%d T=%d\n", arg0, arg1, M, Q, T);
-    old_q = Q;
-    if ((0x80000000 & arg1) != 0)
-	SETQ;
-    else
-	CLRQ;
-    tmp2 = arg0;
-    arg1 <<= 1;
-    arg1 |= T;
-    switch (old_q) {
-    case 0:
-	switch (M) {
-	case 0:
-	    tmp0 = arg1;
-	    arg1 -= tmp2;
-	    tmp1 = arg1 > tmp0;
-	    switch (Q) {
-	    case 0:
-		if (tmp1)
-		    SETQ;
-		else
-		    CLRQ;
-		break;
-	    case 1:
-		if (tmp1 == 0)
-		    SETQ;
-		else
-		    CLRQ;
-		break;
-	    }
-	    break;
-	case 1:
-	    tmp0 = arg1;
-	    arg1 += tmp2;
-	    tmp1 = arg1 < tmp0;
-	    switch (Q) {
-	    case 0:
-		if (tmp1 == 0)
-		    SETQ;
-		else
-		    CLRQ;
-		break;
-	    case 1:
-		if (tmp1)
-		    SETQ;
-		else
-		    CLRQ;
-		break;
-	    }
-	    break;
-	}
-	break;
-    case 1:
-	switch (M) {
-	case 0:
-	    tmp0 = arg1;
-	    arg1 += tmp2;
-	    tmp1 = arg1 < tmp0;
-	    switch (Q) {
-	    case 0:
-		if (tmp1)
-		    SETQ;
-		else
-		    CLRQ;
-		break;
-	    case 1:
-		if (tmp1 == 0)
-		    SETQ;
-		else
-		    CLRQ;
-		break;
-	    }
-	    break;
-	case 1:
-	    tmp0 = arg1;
-	    arg1 -= tmp2;
-	    tmp1 = arg1 > tmp0;
-	    switch (Q) {
-	    case 0:
-		if (tmp1 == 0)
-		    SETQ;
-		else
-		    CLRQ;
-		break;
-	    case 1:
-		if (tmp1)
-		    SETQ;
-		else
-		    CLRQ;
-		break;
-	    }
-	    break;
-	}
-	break;
-    }
-    if (Q == M)
-	SETT;
-    else
-	CLRT;
-    //printf("Output: arg1=0x%08x M=%d Q=%d T=%d\n", arg1, M, Q, T);
-    return arg1;
-}
-
 void helper_macl(CPUSH4State *env, uint32_t arg0, uint32_t arg1)
 {
     int64_t res;
diff --git a/target-sh4/translate.c b/target-sh4/translate.c
index 4ef0398..d4046f8 100644
--- a/target-sh4/translate.c
+++ b/target-sh4/translate.c
@@ -59,7 +59,8 @@ enum {
 /* global register indexes */
 static TCGv_ptr cpu_env;
 static TCGv cpu_gregs[24];
-static TCGv cpu_pc, cpu_sr, cpu_sr_t, cpu_ssr, cpu_spc, cpu_gbr;
+static TCGv cpu_sr, cpu_sr_m, cpu_sr_q, cpu_sr_t;
+static TCGv cpu_pc, cpu_ssr, cpu_spc, cpu_gbr;
 static TCGv cpu_vbr, cpu_sgr, cpu_dbr, cpu_mach, cpu_macl;
 static TCGv cpu_pr, cpu_fpscr, cpu_fpul, cpu_ldst;
 static TCGv cpu_fregs[32];
@@ -107,6 +108,10 @@ void sh4_translate_init(void)
                                     offsetof(CPUSH4State, pc), "PC");
     cpu_sr = tcg_global_mem_new_i32(TCG_AREG0,
                                     offsetof(CPUSH4State, sr), "SR");
+    cpu_sr_m = tcg_global_mem_new_i32(TCG_AREG0,
+                                    offsetof(CPUSH4State, sr_m), "SR_M");
+    cpu_sr_q = tcg_global_mem_new_i32(TCG_AREG0,
+                                    offsetof(CPUSH4State, sr_q), "SR_Q");
     cpu_sr_t = tcg_global_mem_new_i32(TCG_AREG0,
                                     offsetof(CPUSH4State, sr_t), "SR_T");
     cpu_ssr = tcg_global_mem_new_i32(TCG_AREG0,
@@ -175,14 +180,28 @@ void superh_cpu_dump_state(CPUState *cs, FILE *f,
 }
 static void gen_read_sr(TCGv dst)
 {
-    tcg_gen_andi_i32(dst, cpu_sr, ~(1u << SR_T));
-    tcg_gen_or_i32(dst, dst, cpu_sr_t);
+    TCGv t0 = tcg_temp_new();
+    tcg_gen_andi_i32(dst, cpu_sr,
+                     ~((1u << SR_Q) | (1u << SR_M) | (1u << SR_T)));
+    tcg_gen_shli_i32(t0, cpu_sr_q, SR_Q);
+    tcg_gen_or_i32(dst, dst, t0);
+    tcg_gen_shli_i32(t0, cpu_sr_m, SR_M);
+    tcg_gen_or_i32(dst, dst, t0);
+    tcg_gen_shli_i32(t0, cpu_sr_t, SR_T);
+    tcg_gen_or_i32(dst, dst, t0);
+    tcg_temp_free_i32(t0);
 }
 
 static void gen_write_sr(TCGv src)
 {
-    tcg_gen_andi_i32(cpu_sr, src, ~(1u << SR_T));
-    tcg_gen_andi_i32(cpu_sr_t, src, (1u << SR_T));
+    tcg_gen_andi_i32(cpu_sr, src,
+                     ~((1u << SR_Q) | (1u << SR_M) | (1u << SR_T)));
+    tcg_gen_shri_i32(cpu_sr_q, src, SR_Q);
+    tcg_gen_andi_i32(cpu_sr_q, cpu_sr_q, 1);
+    tcg_gen_shri_i32(cpu_sr_m, src, SR_M);
+    tcg_gen_andi_i32(cpu_sr_m, cpu_sr_m, 1);
+    tcg_gen_shri_i32(cpu_sr_t, src, SR_T);
+    tcg_gen_andi_i32(cpu_sr_t, cpu_sr_t, 1);
 }
 
 static void gen_goto_tb(DisasContext * ctx, int n, target_ulong dest)
@@ -389,7 +408,8 @@ static void _decode_opc(DisasContext * ctx)
 
     switch (ctx->opcode) {
     case 0x0019:		/* div0u */
-        tcg_gen_andi_i32(cpu_sr, cpu_sr, ~((1u << SR_M) | (1u << SR_Q)));
+        tcg_gen_movi_i32(cpu_sr_m, 0);
+        tcg_gen_movi_i32(cpu_sr_q, 0);
         tcg_gen_movi_i32(cpu_sr_t, 0);
 	return;
     case 0x000b:		/* rts */
@@ -708,8 +728,8 @@ static void _decode_opc(DisasContext * ctx)
 	return;
     case 0x2007:		/* div0s Rm,Rn */
 	{
-            gen_copy_bit_i32(cpu_sr, SR_Q, REG(B11_8), 31);     /* SR_Q */
-            gen_copy_bit_i32(cpu_sr, SR_M, REG(B7_4), 31);      /* SR_M */
+            tcg_gen_shri_i32(cpu_sr_q, REG(B11_8), 31);         /* SR_Q */
+            tcg_gen_shri_i32(cpu_sr_m, REG(B7_4), 31);          /* SR_M */
 	    TCGv val = tcg_temp_new();
             tcg_gen_xor_i32(cpu_sr_t, REG(B7_4), REG(B11_8));
             tcg_gen_shri_i32(cpu_sr_t, cpu_sr_t, 31);           /* SR_T */
@@ -717,7 +737,36 @@ static void _decode_opc(DisasContext * ctx)
 	}
 	return;
     case 0x3004:		/* div1 Rm,Rn */
-        gen_helper_div1(REG(B11_8), cpu_env, REG(B7_4), REG(B11_8));
+        {
+            TCGv t0 = tcg_temp_new();
+            TCGv t1 = tcg_temp_new();
+            TCGv t2 = tcg_temp_new();
+            TCGv zero = tcg_const_i32(0);
+
+            /* shift left arg1, saving the bit being pushed out and inserting
+               T on the right */
+            tcg_gen_shri_i32(t0, REG(B11_8), 31);
+            tcg_gen_shli_i32(REG(B11_8), REG(B11_8), 1);
+            tcg_gen_or_i32(REG(B11_8), REG(B11_8), cpu_sr_t);
+
+            /* add or subtract arg0 from arg1 depending if Q == M */
+            tcg_gen_xor_i32(t1, cpu_sr_q, cpu_sr_m);
+            tcg_gen_subi_i32(t1, t1, 1);
+            tcg_gen_neg_i32(t2, REG(B7_4));
+            tcg_gen_movcond_i32(TCG_COND_EQ, t2, t1, zero, REG(B7_4), t2);
+            tcg_gen_add2_i32(REG(B11_8), t1, REG(B11_8), zero, t2, t1);
+
+            /* compute T and Q depending on carry */
+            tcg_gen_andi_i32(t1, t1, 1);
+            tcg_gen_xor_i32(t1, t1, t0);
+            tcg_gen_xori_i32(cpu_sr_t, t1, 1);
+            tcg_gen_xor_i32(cpu_sr_q, cpu_sr_m, t1);
+
+            tcg_temp_free(zero);
+            tcg_temp_free(t2);
+            tcg_temp_free(t1);
+            tcg_temp_free(t0);
+        }
 	return;
     case 0x300d:		/* dmuls.l Rm,Rn */
         tcg_gen_muls2_i32(cpu_macl, cpu_mach, REG(B7_4), REG(B11_8));
-- 
1.7.10.4

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [Qemu-devel] [PATCH v2 7/9] target-sh4: factorize fmov implementation
  2013-12-22 11:50 [Qemu-devel] [PATCH v2 0/9] target-sh4: optimizations and cleanups Aurelien Jarno
                   ` (5 preceding siblings ...)
  2013-12-22 11:50 ` [Qemu-devel] [PATCH v2 6/9] target-sh4: split out Q and M from of SR and optimize div1 Aurelien Jarno
@ 2013-12-22 11:50 ` Aurelien Jarno
  2013-12-24 14:46   ` Richard Henderson
  2013-12-22 11:50 ` [Qemu-devel] [PATCH v2 8/9] target-sh4: remove dead code Aurelien Jarno
  2013-12-22 11:50 ` [Qemu-devel] [PATCH v2 9/9] target-sh4: simplify tas instruction Aurelien Jarno
  8 siblings, 1 reply; 19+ messages in thread
From: Aurelien Jarno @ 2013-12-22 11:50 UTC (permalink / raw)
  To: qemu-devel; +Cc: Aurelien Jarno

Signed-off-by: Aurelien Jarno <aurelien@aurel32.net>
---
 target-sh4/translate.c |   18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

diff --git a/target-sh4/translate.c b/target-sh4/translate.c
index d4046f8..45eb839 100644
--- a/target-sh4/translate.c
+++ b/target-sh4/translate.c
@@ -1030,24 +1030,18 @@ static void _decode_opc(DisasContext * ctx)
 	return;
     case 0xf00b: /* fmov {F,D,X}Rm,@-Rn - FPSCR: Nothing */
 	CHECK_FPU_ENABLED
+	const int fr = XREG(B7_4);
+	TCGv addr = tcg_temp_new_i32();
+	tcg_gen_subi_i32(addr, REG(B11_8), 4);
         if (ctx->flags & FPSCR_SZ) {
-	    TCGv addr = tcg_temp_new_i32();
-	    int fr = XREG(B7_4);
-	    tcg_gen_subi_i32(addr, REG(B11_8), 4);
             tcg_gen_qemu_st_i32(cpu_fregs[fr+1], addr, ctx->memidx, MO_TEUL);
 	    tcg_gen_subi_i32(addr, addr, 4);
             tcg_gen_qemu_st_i32(cpu_fregs[fr], addr, ctx->memidx, MO_TEUL);
-	    tcg_gen_mov_i32(REG(B11_8), addr);
-	    tcg_temp_free(addr);
 	} else {
-	    TCGv addr;
-	    addr = tcg_temp_new_i32();
-	    tcg_gen_subi_i32(addr, REG(B11_8), 4);
-            tcg_gen_qemu_st_i32(cpu_fregs[FREG(B7_4)], addr,
-                                ctx->memidx, MO_TEUL);
-	    tcg_gen_mov_i32(REG(B11_8), addr);
-	    tcg_temp_free(addr);
+            tcg_gen_qemu_st_i32(cpu_fregs[fr], addr, ctx->memidx, MO_TEUL);
 	}
+	tcg_gen_mov_i32(REG(B11_8), addr);
+	tcg_temp_free(addr);
 	return;
     case 0xf006: /* fmov @(R0,Rm),{F,D,X}Rm - FPSCR: Nothing */
 	CHECK_FPU_ENABLED
-- 
1.7.10.4

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [Qemu-devel] [PATCH v2 8/9] target-sh4: remove dead code
  2013-12-22 11:50 [Qemu-devel] [PATCH v2 0/9] target-sh4: optimizations and cleanups Aurelien Jarno
                   ` (6 preceding siblings ...)
  2013-12-22 11:50 ` [Qemu-devel] [PATCH v2 7/9] target-sh4: factorize fmov implementation Aurelien Jarno
@ 2013-12-22 11:50 ` Aurelien Jarno
  2013-12-24 14:46   ` Richard Henderson
  2013-12-22 11:50 ` [Qemu-devel] [PATCH v2 9/9] target-sh4: simplify tas instruction Aurelien Jarno
  8 siblings, 1 reply; 19+ messages in thread
From: Aurelien Jarno @ 2013-12-22 11:50 UTC (permalink / raw)
  To: qemu-devel; +Cc: Aurelien Jarno

Signed-off-by: Aurelien Jarno <aurelien@aurel32.net>
---
 target-sh4/translate.c |    1 -
 1 file changed, 1 deletion(-)

diff --git a/target-sh4/translate.c b/target-sh4/translate.c
index 45eb839..9a878d0 100644
--- a/target-sh4/translate.c
+++ b/target-sh4/translate.c
@@ -18,7 +18,6 @@
  */
 
 #define DEBUG_DISAS
-//#define SH4_SINGLE_STEP
 
 #include "cpu.h"
 #include "disas/disas.h"
-- 
1.7.10.4

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [Qemu-devel] [PATCH v2 9/9] target-sh4: simplify tas instruction
  2013-12-22 11:50 [Qemu-devel] [PATCH v2 0/9] target-sh4: optimizations and cleanups Aurelien Jarno
                   ` (7 preceding siblings ...)
  2013-12-22 11:50 ` [Qemu-devel] [PATCH v2 8/9] target-sh4: remove dead code Aurelien Jarno
@ 2013-12-22 11:50 ` Aurelien Jarno
  2013-12-24 14:49   ` Richard Henderson
  8 siblings, 1 reply; 19+ messages in thread
From: Aurelien Jarno @ 2013-12-22 11:50 UTC (permalink / raw)
  To: qemu-devel; +Cc: Aurelien Jarno

Now that setcondi is used instead of branches, temp_local variables are no
longer needed.

Signed-off-by: Aurelien Jarno <aurelien@aurel32.net>
---
 target-sh4/translate.c |   10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/target-sh4/translate.c b/target-sh4/translate.c
index 9a878d0..e73932c 100644
--- a/target-sh4/translate.c
+++ b/target-sh4/translate.c
@@ -1673,16 +1673,12 @@ static void _decode_opc(DisasContext * ctx)
 	return;
     case 0x401b:		/* tas.b @Rn */
 	{
-	    TCGv addr, val;
-	    addr = tcg_temp_local_new();
-	    tcg_gen_mov_i32(addr, REG(B11_8));
-	    val = tcg_temp_local_new();
-            tcg_gen_qemu_ld_i32(val, addr, ctx->memidx, MO_UB);
+	    TCGv val = tcg_temp_new();
+            tcg_gen_qemu_ld_i32(val, REG(B11_8), ctx->memidx, MO_UB);
             tcg_gen_setcondi_i32(TCG_COND_EQ, cpu_sr_t, val, 0);
 	    tcg_gen_ori_i32(val, val, 0x80);
-            tcg_gen_qemu_st_i32(val, addr, ctx->memidx, MO_UB);
+            tcg_gen_qemu_st_i32(val, REG(B11_8), ctx->memidx, MO_UB);
 	    tcg_temp_free(val);
-	    tcg_temp_free(addr);
 	}
 	return;
     case 0xf00d: /* fsts FPUL,FRn - FPSCR: Nothing */
-- 
1.7.10.4

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* Re: [Qemu-devel] [PATCH v2 1/9] target-sh4: use bit number for SR constants
  2013-12-22 11:50 ` [Qemu-devel] [PATCH v2 1/9] target-sh4: use bit number for SR constants Aurelien Jarno
@ 2013-12-24 14:10   ` Richard Henderson
  0 siblings, 0 replies; 19+ messages in thread
From: Richard Henderson @ 2013-12-24 14:10 UTC (permalink / raw)
  To: Aurelien Jarno, qemu-devel

On 12/22/2013 03:50 AM, Aurelien Jarno wrote:
> Use the bit number for SR constants instead of using a bit mask. This
> make possible to also use the constants for shifts.
> 
> Signed-off-by: Aurelien Jarno <aurelien@aurel32.net>
> ---
>  target-sh4/cpu.c       |    3 +-
>  target-sh4/cpu.h       |   30 +++++++++----------
>  target-sh4/gdbstub.c   |    4 +--
>  target-sh4/helper.c    |   27 ++++++++---------
>  target-sh4/op_helper.c |   26 ++++++++---------
>  target-sh4/translate.c |   75 +++++++++++++++++++++++++-----------------------
>  6 files changed, 85 insertions(+), 80 deletions(-)

Reviewed-by: Richard Henderson <rth@twiddle.net>


r~

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [Qemu-devel] [PATCH v2 2/9] target-sh4: Split out T from SR
  2013-12-22 11:50 ` [Qemu-devel] [PATCH v2 2/9] target-sh4: Split out T from SR Aurelien Jarno
@ 2013-12-24 14:13   ` Richard Henderson
  0 siblings, 0 replies; 19+ messages in thread
From: Richard Henderson @ 2013-12-24 14:13 UTC (permalink / raw)
  To: Aurelien Jarno, qemu-devel

On 12/22/2013 03:50 AM, Aurelien Jarno wrote:
> +static inline target_ulong cpu_read_sr(CPUSH4State *env)
> +{
> +    return (env->sr & ~(1u << SR_T)) | (env->sr_t << SR_T);
> +}
> +
> +static inline void cpu_write_sr(CPUSH4State *env, target_ulong sr)
> +{
> +    env->sr_t = sr & (1u << SR_T);
> +    env->sr = sr & ~(1u << SR_T);
> +}
...
> +static void gen_read_sr(TCGv dst)
> +{
> +    tcg_gen_andi_i32(dst, cpu_sr, ~(1u << SR_T));
> +    tcg_gen_or_i32(dst, dst, cpu_sr_t);
> +}
> +
> +static void gen_write_sr(TCGv src)
> +{
> +    tcg_gen_andi_i32(cpu_sr, src, ~(1u << SR_T));
> +    tcg_gen_andi_i32(cpu_sr_t, src, (1u << SR_T));
> +}

If the writer always clears SR_T when assigning to env->sr,
then there's no need to clear it when reading from env->sr.
Or vice versa.

Otherwise,

Reviewed-by: Richard Henderson <rth@twiddle.net>


r~

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [Qemu-devel] [PATCH v2 3/9] target-sh4: optimize addc using add2
  2013-12-22 11:50 ` [Qemu-devel] [PATCH v2 3/9] target-sh4: optimize addc using add2 Aurelien Jarno
@ 2013-12-24 14:18   ` Richard Henderson
  0 siblings, 0 replies; 19+ messages in thread
From: Richard Henderson @ 2013-12-24 14:18 UTC (permalink / raw)
  To: Aurelien Jarno, qemu-devel

On 12/22/2013 03:50 AM, Aurelien Jarno wrote:
> +            t2 = tcg_temp_new();
> +            tcg_gen_add2_i32(t1, t2, REG(B11_8), t0, REG(B7_4), t0);
> +            tcg_gen_add2_i32(REG(B11_8), cpu_sr_t, t1, t2, cpu_sr_t, t0);

FWIW, one can avoid an extra temporary by consuming cpu_sr_t in the first add
rather than the second.  But otherwise,

Reviewed-by: Richard Henderson <rth@twiddle.net>


r~

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [Qemu-devel] [PATCH v2 4/9] target-sh4: optimize subc using sub2
  2013-12-22 11:50 ` [Qemu-devel] [PATCH v2 4/9] target-sh4: optimize subc using sub2 Aurelien Jarno
@ 2013-12-24 14:22   ` Richard Henderson
  0 siblings, 0 replies; 19+ messages in thread
From: Richard Henderson @ 2013-12-24 14:22 UTC (permalink / raw)
  To: Aurelien Jarno, qemu-devel

On 12/22/2013 03:50 AM, Aurelien Jarno wrote:
> Signed-off-by: Aurelien Jarno <aurelien@aurel32.net>
> ---
>  target-sh4/translate.c |   15 ++++++---------
>  1 file changed, 6 insertions(+), 9 deletions(-)

Reviewed-by: Richard Henderson <rth@twiddle.net>


r~

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [Qemu-devel] [PATCH v2 5/9] target-sh4: optimize negc using add2 and sub2
  2013-12-22 11:50 ` [Qemu-devel] [PATCH v2 5/9] target-sh4: optimize negc using add2 and sub2 Aurelien Jarno
@ 2013-12-24 14:25   ` Richard Henderson
  0 siblings, 0 replies; 19+ messages in thread
From: Richard Henderson @ 2013-12-24 14:25 UTC (permalink / raw)
  To: Aurelien Jarno, qemu-devel

On 12/22/2013 03:50 AM, Aurelien Jarno wrote:
> +            TCGv t0 = tcg_const_i32(0);
> +            tcg_gen_add2_i32(REG(B11_8), cpu_sr_t,
> +                             REG(B7_4), t0, cpu_sr_t, t0);
> +            tcg_gen_sub2_i32(REG(B11_8), cpu_sr_t,
> +                             t0, t0, REG(B11_8), cpu_sr_t);

While this formulation is correct, I wonder why you strayed from the two
subtracts model of subc, since negc is similarly described in the manual?

Otherwise,

Reviewed-by: Richard Henderson <rth@twiddle.net>


r~

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [Qemu-devel] [PATCH v2 6/9] target-sh4: split out Q and M from of SR and optimize div1
  2013-12-22 11:50 ` [Qemu-devel] [PATCH v2 6/9] target-sh4: split out Q and M from of SR and optimize div1 Aurelien Jarno
@ 2013-12-24 14:44   ` Richard Henderson
  0 siblings, 0 replies; 19+ messages in thread
From: Richard Henderson @ 2013-12-24 14:44 UTC (permalink / raw)
  To: Aurelien Jarno, qemu-devel

On 12/22/2013 03:50 AM, Aurelien Jarno wrote:
>  static void gen_read_sr(TCGv dst)
>  {
> -    tcg_gen_andi_i32(dst, cpu_sr, ~(1u << SR_T));
> -    tcg_gen_or_i32(dst, dst, cpu_sr_t);
> +    TCGv t0 = tcg_temp_new();
> +    tcg_gen_andi_i32(dst, cpu_sr,
> +                     ~((1u << SR_Q) | (1u << SR_M) | (1u << SR_T)));
> +    tcg_gen_shli_i32(t0, cpu_sr_q, SR_Q);
> +    tcg_gen_or_i32(dst, dst, t0);
> +    tcg_gen_shli_i32(t0, cpu_sr_m, SR_M);
> +    tcg_gen_or_i32(dst, dst, t0);
> +    tcg_gen_shli_i32(t0, cpu_sr_t, SR_T);
> +    tcg_gen_or_i32(dst, dst, t0);
> +    tcg_temp_free_i32(t0);
>  }

Similar comments for SR_[QM] as for SR_T wrt who clears the relevant bits in
env->sr.


>      case 0x2007:		/* div0s Rm,Rn */
>  	{
> -            gen_copy_bit_i32(cpu_sr, SR_Q, REG(B11_8), 31);     /* SR_Q */
> -            gen_copy_bit_i32(cpu_sr, SR_M, REG(B7_4), 31);      /* SR_M */
> +            tcg_gen_shri_i32(cpu_sr_q, REG(B11_8), 31);         /* SR_Q */
> +            tcg_gen_mov_i32(cpu_sr_m, cpu_sr_q);                /* SR_M */
>  	    TCGv val = tcg_temp_new();
>              tcg_gen_xor_i32(cpu_sr_t, REG(B7_4), REG(B11_8));
>              tcg_gen_shri_i32(cpu_sr_t, cpu_sr_t, 31);           /* SR_T */

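Error setting M: the new code copies Q into M, but Q and M are set from
different source registers (Q from bit 31 of REG(B11_8), M from bit 31 of
REG(B7_4), as in the lines being removed).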

And as a point of optimization, T no longer needs the shift if one uses the
extracted Q and M as inputs.

> +            /* add or subtract arg0 from arg1 depending if Q == M */
> +            tcg_gen_xor_i32(t1, cpu_sr_q, cpu_sr_m);
> +            tcg_gen_subi_i32(t1, t1, 1);
> +            tcg_gen_neg_i32(t2, REG(B7_4));
> +            tcg_gen_movcond_i32(TCG_COND_EQ, t2, t1, zero, REG(B7_4), t2);
> +            tcg_gen_add2_i32(REG(B11_8), t1, REG(B11_8), zero, t2, t1);

Why so complicated with the comparison?  I'd have expected

  tcg_gen_movcond_i32(TCG_COND_EQ, t2, cpu_sr_q, cpu_sr_m, REG(B7_4), t2);

Hmm... except I see you're re-using the condition as the high-part of the add2.
 That's pretty tricky.  Perhaps expand upon the comment?


r~

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [Qemu-devel] [PATCH v2 7/9] target-sh4: factorize fmov implementation
  2013-12-22 11:50 ` [Qemu-devel] [PATCH v2 7/9] target-sh4: factorize fmov implementation Aurelien Jarno
@ 2013-12-24 14:46   ` Richard Henderson
  0 siblings, 0 replies; 19+ messages in thread
From: Richard Henderson @ 2013-12-24 14:46 UTC (permalink / raw)
  To: Aurelien Jarno, qemu-devel

On 12/22/2013 03:50 AM, Aurelien Jarno wrote:
> Signed-off-by: Aurelien Jarno <aurelien@aurel32.net>
> ---
>  target-sh4/translate.c |   18 ++++++------------
>  1 file changed, 6 insertions(+), 12 deletions(-)

Reviewed-by: Richard Henderson <rth@twiddle.net>


r~

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [Qemu-devel] [PATCH v2 8/9] target-sh4: remove dead code
  2013-12-22 11:50 ` [Qemu-devel] [PATCH v2 8/9] target-sh4: remove dead code Aurelien Jarno
@ 2013-12-24 14:46   ` Richard Henderson
  0 siblings, 0 replies; 19+ messages in thread
From: Richard Henderson @ 2013-12-24 14:46 UTC (permalink / raw)
  To: Aurelien Jarno, qemu-devel

On 12/22/2013 03:50 AM, Aurelien Jarno wrote:
> Signed-off-by: Aurelien Jarno <aurelien@aurel32.net>
> ---
>  target-sh4/translate.c |    1 -
>  1 file changed, 1 deletion(-)

Reviewed-by: Richard Henderson <rth@twiddle.net>


r~

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [Qemu-devel] [PATCH v2 9/9] target-sh4: simplify tas instruction
  2013-12-22 11:50 ` [Qemu-devel] [PATCH v2 9/9] target-sh4: simplify tas instruction Aurelien Jarno
@ 2013-12-24 14:49   ` Richard Henderson
  0 siblings, 0 replies; 19+ messages in thread
From: Richard Henderson @ 2013-12-24 14:49 UTC (permalink / raw)
  To: Aurelien Jarno, qemu-devel

On 12/22/2013 03:50 AM, Aurelien Jarno wrote:
> Now that setcondi is used instead of branches, temp_local are not needed
> anymore.
> 
> Signed-off-by: Aurelien Jarno <aurelien@aurel32.net>
> ---
>  target-sh4/translate.c |   10 +++-------
>  1 file changed, 3 insertions(+), 7 deletions(-)

Reviewed-by: Richard Henderson <rth@twiddle.net>


r~

^ permalink raw reply	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2013-12-24 14:49 UTC | newest]

Thread overview: 19+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-12-22 11:50 [Qemu-devel] [PATCH v2 0/9] target-sh4: optimizations and cleanups Aurelien Jarno
2013-12-22 11:50 ` [Qemu-devel] [PATCH v2 1/9] target-sh4: use bit number for SR constants Aurelien Jarno
2013-12-24 14:10   ` Richard Henderson
2013-12-22 11:50 ` [Qemu-devel] [PATCH v2 2/9] target-sh4: Split out T from SR Aurelien Jarno
2013-12-24 14:13   ` Richard Henderson
2013-12-22 11:50 ` [Qemu-devel] [PATCH v2 3/9] target-sh4: optimize addc using add2 Aurelien Jarno
2013-12-24 14:18   ` Richard Henderson
2013-12-22 11:50 ` [Qemu-devel] [PATCH v2 4/9] target-sh4: optimize subc using sub2 Aurelien Jarno
2013-12-24 14:22   ` Richard Henderson
2013-12-22 11:50 ` [Qemu-devel] [PATCH v2 5/9] target-sh4: optimize negc using add2 and sub2 Aurelien Jarno
2013-12-24 14:25   ` Richard Henderson
2013-12-22 11:50 ` [Qemu-devel] [PATCH v2 6/9] target-sh4: split out Q and M from of SR and optimize div1 Aurelien Jarno
2013-12-24 14:44   ` Richard Henderson
2013-12-22 11:50 ` [Qemu-devel] [PATCH v2 7/9] target-sh4: factorize fmov implementation Aurelien Jarno
2013-12-24 14:46   ` Richard Henderson
2013-12-22 11:50 ` [Qemu-devel] [PATCH v2 8/9] target-sh4: remove dead code Aurelien Jarno
2013-12-24 14:46   ` Richard Henderson
2013-12-22 11:50 ` [Qemu-devel] [PATCH v2 9/9] target-sh4: simplify tas instruction Aurelien Jarno
2013-12-24 14:49   ` Richard Henderson

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).