qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed
* [Qemu-devel] [PATCH v2 0/7] tcg: movcond
@ 2012-09-21 17:13 Richard Henderson
  2012-09-21 17:13 ` [Qemu-devel] [PATCH 1/7] tcg: Introduce movcond Richard Henderson
                   ` (8 more replies)
  0 siblings, 9 replies; 21+ messages in thread
From: Richard Henderson @ 2012-09-21 17:13 UTC (permalink / raw)
  To: qemu-devel; +Cc: Aurelien Jarno

The patch set is available at

   git://repo.or.cz/qemu/rth.git tcg-movcond

Changes v1->v2:
   Patch 1 drops the spurrious return.

   Patch 4 incorporates the fix Aurelian posted.

   Patch 5 drops the non-movcond commutative optimization that is
     present in Aurelian's tcg optimization patch set.

   Patches 6 & 7 are new, incrementally improving 32-bit code generation.

   Rebased vs c26032b2c91721245bfec542d94f37a0238e986e:
     target-xtensa: don't emit extra tcg_gen_goto_tb


r~


Richard Henderson (7):
  tcg: Introduce movcond
  target-alpha: Use movcond
  tcg-i386: Implement movcond
  tcg: Optimize movcond for constant comparisons
  tcg: Optimize two-address commutative operations
  tcg: Streamline movcond_i64 using 32-bit arithmetic
  tcg: Streamline movcond_i64 using movcond_i32

 target-alpha/translate.c | 102 ++++++++++++++++++++++-------------------------
 tcg/README               |   6 +++
 tcg/arm/tcg-target.h     |   1 +
 tcg/hppa/tcg-target.h    |   1 +
 tcg/i386/tcg-target.c    |  29 ++++++++++++++
 tcg/i386/tcg-target.h    |   7 ++++
 tcg/ia64/tcg-target.h    |   2 +
 tcg/mips/tcg-target.h    |   1 +
 tcg/optimize.c           |  53 ++++++++++++++++++++++++
 tcg/ppc/tcg-target.h     |   1 +
 tcg/ppc64/tcg-target.h   |   2 +
 tcg/s390/tcg-target.h    |   2 +
 tcg/sparc/tcg-target.h   |   2 +
 tcg/tcg-op.h             |  68 +++++++++++++++++++++++++++++++
 tcg/tcg-opc.h            |   2 +
 tcg/tcg.c                |  11 +++--
 tcg/tcg.h                |   1 +
 tcg/tci/tcg-target.h     |   2 +
 18 files changed, 233 insertions(+), 60 deletions(-)

-- 
1.7.11.4

^ permalink raw reply	[flat|nested] 21+ messages in thread

* [Qemu-devel] [PATCH 1/7] tcg: Introduce movcond
  2012-09-21 17:13 [Qemu-devel] [PATCH v2 0/7] tcg: movcond Richard Henderson
@ 2012-09-21 17:13 ` Richard Henderson
  2012-09-21 17:13 ` [Qemu-devel] [PATCH 2/7] target-alpha: Use movcond Richard Henderson
                   ` (7 subsequent siblings)
  8 siblings, 0 replies; 21+ messages in thread
From: Richard Henderson @ 2012-09-21 17:13 UTC (permalink / raw)
  To: qemu-devel; +Cc: Aurelien Jarno

Implemented with setcond if the target does not provide
the optional opcode.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 tcg/README             |  6 ++++++
 tcg/arm/tcg-target.h   |  1 +
 tcg/hppa/tcg-target.h  |  1 +
 tcg/i386/tcg-target.h  |  2 ++
 tcg/ia64/tcg-target.h  |  2 ++
 tcg/mips/tcg-target.h  |  1 +
 tcg/ppc/tcg-target.h   |  1 +
 tcg/ppc64/tcg-target.h |  2 ++
 tcg/s390/tcg-target.h  |  2 ++
 tcg/sparc/tcg-target.h |  2 ++
 tcg/tcg-op.h           | 40 ++++++++++++++++++++++++++++++++++++++++
 tcg/tcg-opc.h          |  2 ++
 tcg/tcg.c              | 11 +++++------
 tcg/tcg.h              |  1 +
 tcg/tci/tcg-target.h   |  2 ++
 15 files changed, 70 insertions(+), 6 deletions(-)

diff --git a/tcg/README b/tcg/README
index cfdfd96..d03ae05 100644
--- a/tcg/README
+++ b/tcg/README
@@ -307,6 +307,12 @@ dest = (t1 cond t2)
 
 Set DEST to 1 if (T1 cond T2) is true, otherwise set to 0.
 
+* movcond_i32/i64 cond, dest, c1, c2, v1, v2
+
+dest = (c1 cond c2 ? v1 : v2)
+
+Set DEST to V1 if (C1 cond C2) is true, otherwise set to V2.
+
 ********* Type conversions
 
 * ext_i32_i64 t0, t1
diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h
index c0b8f72..e2299ca 100644
--- a/tcg/arm/tcg-target.h
+++ b/tcg/arm/tcg-target.h
@@ -73,6 +73,7 @@ typedef enum {
 #define TCG_TARGET_HAS_nand_i32         0
 #define TCG_TARGET_HAS_nor_i32          0
 #define TCG_TARGET_HAS_deposit_i32      0
+#define TCG_TARGET_HAS_movcond_i32      0
 
 #define TCG_TARGET_HAS_GUEST_BASE
 
diff --git a/tcg/hppa/tcg-target.h b/tcg/hppa/tcg-target.h
index 01ef960..4defd28 100644
--- a/tcg/hppa/tcg-target.h
+++ b/tcg/hppa/tcg-target.h
@@ -96,6 +96,7 @@ typedef enum {
 #define TCG_TARGET_HAS_nand_i32         0
 #define TCG_TARGET_HAS_nor_i32          0
 #define TCG_TARGET_HAS_deposit_i32      1
+#define TCG_TARGET_HAS_movcond_i32      0
 
 /* optional instructions automatically implemented */
 #define TCG_TARGET_HAS_neg_i32          0 /* sub rd, 0, rs */
diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index 8be42f3..504f953 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -86,6 +86,7 @@ typedef enum {
 #define TCG_TARGET_HAS_nand_i32         0
 #define TCG_TARGET_HAS_nor_i32          0
 #define TCG_TARGET_HAS_deposit_i32      1
+#define TCG_TARGET_HAS_movcond_i32      0
 
 #if TCG_TARGET_REG_BITS == 64
 #define TCG_TARGET_HAS_div2_i64         1
@@ -107,6 +108,7 @@ typedef enum {
 #define TCG_TARGET_HAS_nand_i64         0
 #define TCG_TARGET_HAS_nor_i64          0
 #define TCG_TARGET_HAS_deposit_i64      1
+#define TCG_TARGET_HAS_movcond_i64      0
 #endif
 
 #define TCG_TARGET_deposit_i32_valid(ofs, len) \
diff --git a/tcg/ia64/tcg-target.h b/tcg/ia64/tcg-target.h
index c22962a..368aee4 100644
--- a/tcg/ia64/tcg-target.h
+++ b/tcg/ia64/tcg-target.h
@@ -133,6 +133,8 @@ typedef enum {
 #define TCG_TARGET_HAS_rot_i64          1
 #define TCG_TARGET_HAS_deposit_i32      0
 #define TCG_TARGET_HAS_deposit_i64      0
+#define TCG_TARGET_HAS_movcond_i32      0
+#define TCG_TARGET_HAS_movcond_i64      0
 
 /* optional instructions automatically implemented */
 #define TCG_TARGET_HAS_neg_i32          0 /* sub r1, r0, r3 */
diff --git a/tcg/mips/tcg-target.h b/tcg/mips/tcg-target.h
index 1c61931..9c68a32 100644
--- a/tcg/mips/tcg-target.h
+++ b/tcg/mips/tcg-target.h
@@ -90,6 +90,7 @@ typedef enum {
 #define TCG_TARGET_HAS_eqv_i32          0
 #define TCG_TARGET_HAS_nand_i32         0
 #define TCG_TARGET_HAS_deposit_i32      0
+#define TCG_TARGET_HAS_movcond_i32      0
 
 /* optional instructions automatically implemented */
 #define TCG_TARGET_HAS_neg_i32          0 /* sub  rd, zero, rt   */
diff --git a/tcg/ppc/tcg-target.h b/tcg/ppc/tcg-target.h
index 2f37fd2..177eea1 100644
--- a/tcg/ppc/tcg-target.h
+++ b/tcg/ppc/tcg-target.h
@@ -92,6 +92,7 @@ typedef enum {
 #define TCG_TARGET_HAS_nand_i32         1
 #define TCG_TARGET_HAS_nor_i32          1
 #define TCG_TARGET_HAS_deposit_i32      1
+#define TCG_TARGET_HAS_movcond_i32      0
 
 #define TCG_AREG0 TCG_REG_R27
 
diff --git a/tcg/ppc64/tcg-target.h b/tcg/ppc64/tcg-target.h
index 97eec08..57569e8 100644
--- a/tcg/ppc64/tcg-target.h
+++ b/tcg/ppc64/tcg-target.h
@@ -83,6 +83,7 @@ typedef enum {
 #define TCG_TARGET_HAS_nand_i32         0
 #define TCG_TARGET_HAS_nor_i32          0
 #define TCG_TARGET_HAS_deposit_i32      0
+#define TCG_TARGET_HAS_movcond_i32      0
 
 #define TCG_TARGET_HAS_div_i64          1
 #define TCG_TARGET_HAS_rot_i64          0
@@ -103,6 +104,7 @@ typedef enum {
 #define TCG_TARGET_HAS_nand_i64         0
 #define TCG_TARGET_HAS_nor_i64          0
 #define TCG_TARGET_HAS_deposit_i64      0
+#define TCG_TARGET_HAS_movcond_i64      0
 
 #define TCG_AREG0 TCG_REG_R27
 
diff --git a/tcg/s390/tcg-target.h b/tcg/s390/tcg-target.h
index 4f7dfab..ed55c33 100644
--- a/tcg/s390/tcg-target.h
+++ b/tcg/s390/tcg-target.h
@@ -63,6 +63,7 @@ typedef enum TCGReg {
 #define TCG_TARGET_HAS_nand_i32         0
 #define TCG_TARGET_HAS_nor_i32          0
 #define TCG_TARGET_HAS_deposit_i32      0
+#define TCG_TARGET_HAS_movcond_i32      0
 
 #if TCG_TARGET_REG_BITS == 64
 #define TCG_TARGET_HAS_div2_i64         1
@@ -84,6 +85,7 @@ typedef enum TCGReg {
 #define TCG_TARGET_HAS_nand_i64         0
 #define TCG_TARGET_HAS_nor_i64          0
 #define TCG_TARGET_HAS_deposit_i64      0
+#define TCG_TARGET_HAS_movcond_i64      0
 #endif
 
 #define TCG_TARGET_HAS_GUEST_BASE
diff --git a/tcg/sparc/tcg-target.h b/tcg/sparc/tcg-target.h
index 0ea87be..d762574 100644
--- a/tcg/sparc/tcg-target.h
+++ b/tcg/sparc/tcg-target.h
@@ -102,6 +102,7 @@ typedef enum {
 #define TCG_TARGET_HAS_nand_i32         0
 #define TCG_TARGET_HAS_nor_i32          0
 #define TCG_TARGET_HAS_deposit_i32      0
+#define TCG_TARGET_HAS_movcond_i32      0
 
 #if TCG_TARGET_REG_BITS == 64
 #define TCG_TARGET_HAS_div_i64          1
@@ -123,6 +124,7 @@ typedef enum {
 #define TCG_TARGET_HAS_nand_i64         0
 #define TCG_TARGET_HAS_nor_i64          0
 #define TCG_TARGET_HAS_deposit_i64      0
+#define TCG_TARGET_HAS_movcond_i64      0
 #endif
 
 #ifdef CONFIG_SOLARIS
diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h
index 169d3b2..6d28f82 100644
--- a/tcg/tcg-op.h
+++ b/tcg/tcg-op.h
@@ -2118,6 +2118,44 @@ static inline void tcg_gen_deposit_i64(TCGv_i64 ret, TCGv_i64 arg1,
     tcg_temp_free_i64(t1);
 }
 
+static inline void tcg_gen_movcond_i32(TCGCond cond, TCGv_i32 ret,
+                                       TCGv_i32 c1, TCGv_i32 c2,
+                                       TCGv_i32 v1, TCGv_i32 v2)
+{
+    if (TCG_TARGET_HAS_movcond_i32) {
+        tcg_gen_op6i_i32(INDEX_op_movcond_i32, ret, c1, c2, v1, v2, cond);
+    } else {
+        TCGv_i32 t0 = tcg_temp_new_i32();
+        TCGv_i32 t1 = tcg_temp_new_i32();
+        tcg_gen_setcond_i32(cond, t0, c1, c2);
+        tcg_gen_neg_i32(t0, t0);
+        tcg_gen_and_i32(t1, v1, t0);
+        tcg_gen_andc_i32(ret, v2, t0);
+        tcg_gen_or_i32(ret, ret, t1);
+        tcg_temp_free_i32(t0);
+        tcg_temp_free_i32(t1);
+    }
+}
+
+static inline void tcg_gen_movcond_i64(TCGCond cond, TCGv_i64 ret,
+                                       TCGv_i64 c1, TCGv_i64 c2,
+                                       TCGv_i64 v1, TCGv_i64 v2)
+{
+    if (TCG_TARGET_HAS_movcond_i64) {
+        tcg_gen_op6i_i64(INDEX_op_movcond_i64, ret, c1, c2, v1, v2, cond);
+    } else {
+        TCGv_i64 t0 = tcg_temp_new_i64();
+        TCGv_i64 t1 = tcg_temp_new_i64();
+        tcg_gen_setcond_i64(cond, t0, c1, c2);
+        tcg_gen_neg_i64(t0, t0);
+        tcg_gen_and_i64(t1, v1, t0);
+        tcg_gen_andc_i64(ret, v2, t0);
+        tcg_gen_or_i64(ret, ret, t1);
+        tcg_temp_free_i64(t0);
+        tcg_temp_free_i64(t1);
+    }
+}
+
 /***************************************/
 /* QEMU specific operations. Their type depend on the QEMU CPU
    type. */
@@ -2434,6 +2472,7 @@ static inline void tcg_gen_qemu_st64(TCGv_i64 arg, TCGv addr, int mem_index)
 #define tcg_gen_deposit_tl tcg_gen_deposit_i64
 #define tcg_const_tl tcg_const_i64
 #define tcg_const_local_tl tcg_const_local_i64
+#define tcg_gen_movcond_tl tcg_gen_movcond_i64
 #else
 #define tcg_gen_movi_tl tcg_gen_movi_i32
 #define tcg_gen_mov_tl tcg_gen_mov_i32
@@ -2505,6 +2544,7 @@ static inline void tcg_gen_qemu_st64(TCGv_i64 arg, TCGv addr, int mem_index)
 #define tcg_gen_deposit_tl tcg_gen_deposit_i32
 #define tcg_const_tl tcg_const_i32
 #define tcg_const_local_tl tcg_const_local_i32
+#define tcg_gen_movcond_tl tcg_gen_movcond_i32
 #endif
 
 #if TCG_TARGET_REG_BITS == 32
diff --git a/tcg/tcg-opc.h b/tcg/tcg-opc.h
index d12e8d0..dbb0e39 100644
--- a/tcg/tcg-opc.h
+++ b/tcg/tcg-opc.h
@@ -51,6 +51,7 @@ DEF(br, 0, 0, 1, TCG_OPF_BB_END | TCG_OPF_SIDE_EFFECTS)
 DEF(mov_i32, 1, 1, 0, 0)
 DEF(movi_i32, 1, 0, 1, 0)
 DEF(setcond_i32, 1, 2, 1, 0)
+DEF(movcond_i32, 1, 4, 1, IMPL(TCG_TARGET_HAS_movcond_i32))
 /* load/store */
 DEF(ld8u_i32, 1, 1, 1, 0)
 DEF(ld8s_i32, 1, 1, 1, 0)
@@ -107,6 +108,7 @@ DEF(nor_i32, 1, 2, 0, IMPL(TCG_TARGET_HAS_nor_i32))
 DEF(mov_i64, 1, 1, 0, IMPL64)
 DEF(movi_i64, 1, 0, 1, IMPL64)
 DEF(setcond_i64, 1, 2, 1, IMPL64)
+DEF(movcond_i64, 1, 4, 1, IMPL64 | IMPL(TCG_TARGET_HAS_movcond_i64))
 /* load/store */
 DEF(ld8u_i64, 1, 1, 1, IMPL64)
 DEF(ld8s_i64, 1, 1, 1, IMPL64)
diff --git a/tcg/tcg.c b/tcg/tcg.c
index b8a1bec..bb9c995 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -991,16 +991,15 @@ void tcg_dump_ops(TCGContext *s)
             }
             switch (c) {
             case INDEX_op_brcond_i32:
-#if TCG_TARGET_REG_BITS == 32
-            case INDEX_op_brcond2_i32:
-#elif TCG_TARGET_REG_BITS == 64
-            case INDEX_op_brcond_i64:
-#endif
             case INDEX_op_setcond_i32:
+            case INDEX_op_movcond_i32:
 #if TCG_TARGET_REG_BITS == 32
+            case INDEX_op_brcond2_i32:
             case INDEX_op_setcond2_i32:
-#elif TCG_TARGET_REG_BITS == 64
+#else
+            case INDEX_op_brcond_i64:
             case INDEX_op_setcond_i64:
+            case INDEX_op_movcond_i64:
 #endif
                 if (args[k] < ARRAY_SIZE(cond_name) && cond_name[args[k]]) {
                     qemu_log(",%s", cond_name[args[k++]]);
diff --git a/tcg/tcg.h b/tcg/tcg.h
index 477775d..7e903f3 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -79,6 +79,7 @@ typedef uint64_t TCGRegSet;
 #define TCG_TARGET_HAS_nand_i64         0
 #define TCG_TARGET_HAS_nor_i64          0
 #define TCG_TARGET_HAS_deposit_i64      0
+#define TCG_TARGET_HAS_movcond_i64      0
 #endif
 
 #ifndef TCG_TARGET_deposit_i32_valid
diff --git a/tcg/tci/tcg-target.h b/tcg/tci/tcg-target.h
index 30a0f21..6d89495 100644
--- a/tcg/tci/tcg-target.h
+++ b/tcg/tci/tcg-target.h
@@ -75,6 +75,7 @@
 #define TCG_TARGET_HAS_not_i32          1
 #define TCG_TARGET_HAS_orc_i32          0
 #define TCG_TARGET_HAS_rot_i32          1
+#define TCG_TARGET_HAS_movcond_i32      0
 
 #if TCG_TARGET_REG_BITS == 64
 #define TCG_TARGET_HAS_bswap16_i64      1
@@ -98,6 +99,7 @@
 #define TCG_TARGET_HAS_not_i64          1
 #define TCG_TARGET_HAS_orc_i64          0
 #define TCG_TARGET_HAS_rot_i64          1
+#define TCG_TARGET_HAS_movcond_i64      0
 #endif /* TCG_TARGET_REG_BITS == 64 */
 
 /* Offset to user memory in user mode. */
-- 
1.7.11.4

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [Qemu-devel] [PATCH 2/7] target-alpha: Use movcond
  2012-09-21 17:13 [Qemu-devel] [PATCH v2 0/7] tcg: movcond Richard Henderson
  2012-09-21 17:13 ` [Qemu-devel] [PATCH 1/7] tcg: Introduce movcond Richard Henderson
@ 2012-09-21 17:13 ` Richard Henderson
  2012-09-21 17:13 ` [Qemu-devel] [PATCH 3/7] tcg-i386: Implement movcond Richard Henderson
                   ` (6 subsequent siblings)
  8 siblings, 0 replies; 21+ messages in thread
From: Richard Henderson @ 2012-09-21 17:13 UTC (permalink / raw)
  To: qemu-devel; +Cc: Aurelien Jarno

For proper cmov insns, as well as the non-goto-tb case
of conditional branch.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 target-alpha/translate.c | 102 ++++++++++++++++++++++-------------------------
 1 file changed, 48 insertions(+), 54 deletions(-)

diff --git a/target-alpha/translate.c b/target-alpha/translate.c
index 12de6a3..4a9011a 100644
--- a/target-alpha/translate.c
+++ b/target-alpha/translate.c
@@ -426,27 +426,15 @@ static ExitStatus gen_bcond_internal(DisasContext *ctx, TCGCond cond,
 
         return EXIT_GOTO_TB;
     } else {
-        int lab_over = gen_new_label();
-
-        /* ??? Consider using either
-             movi pc, next
-             addi tmp, pc, disp
-             movcond pc, cond, 0, tmp, pc
-           or
-             setcond tmp, cond, 0
-             movi pc, next
-             neg tmp, tmp
-             andi tmp, tmp, disp
-             add pc, pc, tmp
-           The current diamond subgraph surely isn't efficient.  */
+        TCGv_i64 z = tcg_const_i64(0);
+        TCGv_i64 d = tcg_const_i64(dest);
+        TCGv_i64 p = tcg_const_i64(ctx->pc);
 
-        tcg_gen_brcondi_i64(cond, cmp, 0, lab_true);
-        tcg_gen_movi_i64(cpu_pc, ctx->pc);
-        tcg_gen_br(lab_over);
-        gen_set_label(lab_true);
-        tcg_gen_movi_i64(cpu_pc, dest);
-        gen_set_label(lab_over);
+        tcg_gen_movcond_i64(cond, cpu_pc, cmp, z, d, p);
 
+        tcg_temp_free_i64(z);
+        tcg_temp_free_i64(d);
+        tcg_temp_free_i64(p);
         return EXIT_PC_UPDATED;
     }
 }
@@ -521,61 +509,67 @@ static ExitStatus gen_fbcond(DisasContext *ctx, TCGCond cond, int ra,
 static void gen_cmov(TCGCond cond, int ra, int rb, int rc,
                      int islit, uint8_t lit, int mask)
 {
-    TCGCond inv_cond = tcg_invert_cond(cond);
-    int l1;
+    TCGv_i64 c1, z, v1;
 
-    if (unlikely(rc == 31))
+    if (unlikely(rc == 31)) {
         return;
+    }
 
-    l1 = gen_new_label();
-
-    if (ra != 31) {
-        if (mask) {
-            TCGv tmp = tcg_temp_new();
-            tcg_gen_andi_i64(tmp, cpu_ir[ra], 1);
-            tcg_gen_brcondi_i64(inv_cond, tmp, 0, l1);
-            tcg_temp_free(tmp);
-        } else
-            tcg_gen_brcondi_i64(inv_cond, cpu_ir[ra], 0, l1);
-    } else {
+    if (ra == 31) {
         /* Very uncommon case - Do not bother to optimize.  */
-        TCGv tmp = tcg_const_i64(0);
-        tcg_gen_brcondi_i64(inv_cond, tmp, 0, l1);
-        tcg_temp_free(tmp);
+        c1 = tcg_const_i64(0);
+    } else if (mask) {
+        c1 = tcg_const_i64(1);
+        tcg_gen_and_i64(c1, c1, cpu_ir[ra]);
+    } else {
+        c1 = cpu_ir[ra];
     }
+    if (islit) {
+        v1 = tcg_const_i64(lit);
+    } else {
+        v1 = cpu_ir[rb];
+    }
+    z = tcg_const_i64(0);
 
-    if (islit)
-        tcg_gen_movi_i64(cpu_ir[rc], lit);
-    else
-        tcg_gen_mov_i64(cpu_ir[rc], cpu_ir[rb]);
-    gen_set_label(l1);
+    tcg_gen_movcond_i64(cond, cpu_ir[rc], c1, z, v1, cpu_ir[rc]);
+
+    tcg_temp_free_i64(z);
+    if (ra == 31 || mask) {
+        tcg_temp_free_i64(c1);
+    }
+    if (islit) {
+        tcg_temp_free_i64(v1);
+    }
 }
 
 static void gen_fcmov(TCGCond cond, int ra, int rb, int rc)
 {
-    TCGv cmp_tmp;
-    int l1;
+    TCGv_i64 c1, z, v1;
 
     if (unlikely(rc == 31)) {
         return;
     }
 
-    cmp_tmp = tcg_temp_new();
+    c1 = tcg_temp_new_i64();
     if (unlikely(ra == 31)) {
-        tcg_gen_movi_i64(cmp_tmp, 0);
+        tcg_gen_movi_i64(c1, 0);
+    } else {
+        gen_fold_mzero(cond, c1, cpu_fir[ra]);
+    }
+    if (rb == 31) {
+        v1 = tcg_const_i64(0);
     } else {
-        gen_fold_mzero(cond, cmp_tmp, cpu_fir[ra]);
+        v1 = cpu_fir[rb];
     }
+    z = tcg_const_i64(0);
 
-    l1 = gen_new_label();
-    tcg_gen_brcondi_i64(tcg_invert_cond(cond), cmp_tmp, 0, l1);
-    tcg_temp_free(cmp_tmp);
+    tcg_gen_movcond_i64(cond, cpu_fir[rc], c1, z, v1, cpu_fir[rc]);
 
-    if (rb != 31)
-        tcg_gen_mov_i64(cpu_fir[rc], cpu_fir[rb]);
-    else
-        tcg_gen_movi_i64(cpu_fir[rc], 0);
-    gen_set_label(l1);
+    tcg_temp_free_i64(z);
+    tcg_temp_free_i64(c1);
+    if (rb == 31) {
+        tcg_temp_free_i64(v1);
+    }
 }
 
 #define QUAL_RM_N       0x080   /* Round mode nearest even */
-- 
1.7.11.4

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [Qemu-devel] [PATCH 3/7] tcg-i386: Implement movcond
  2012-09-21 17:13 [Qemu-devel] [PATCH v2 0/7] tcg: movcond Richard Henderson
  2012-09-21 17:13 ` [Qemu-devel] [PATCH 1/7] tcg: Introduce movcond Richard Henderson
  2012-09-21 17:13 ` [Qemu-devel] [PATCH 2/7] target-alpha: Use movcond Richard Henderson
@ 2012-09-21 17:13 ` Richard Henderson
  2012-09-24 21:37   ` Alex Barcelo
  2012-09-21 17:13 ` [Qemu-devel] [PATCH 4/7] tcg: Optimize movcond for constant comparisons Richard Henderson
                   ` (5 subsequent siblings)
  8 siblings, 1 reply; 21+ messages in thread
From: Richard Henderson @ 2012-09-21 17:13 UTC (permalink / raw)
  To: qemu-devel; +Cc: Aurelien Jarno

Signed-off-by: Richard Henderson <rth@twiddle.net>
Reviewed-by: Aurelien Jarno <aurelien@aurel32.net>
---
 tcg/i386/tcg-target.c | 29 +++++++++++++++++++++++++++++
 tcg/i386/tcg-target.h |  7 ++++++-
 2 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
index 3017858..aa1fa9f 100644
--- a/tcg/i386/tcg-target.c
+++ b/tcg/i386/tcg-target.c
@@ -249,6 +249,7 @@ static inline int tcg_target_const_match(tcg_target_long val,
 #define OPC_ADD_GvEv	(OPC_ARITH_GvEv | (ARITH_ADD << 3))
 #define OPC_BSWAP	(0xc8 | P_EXT)
 #define OPC_CALL_Jz	(0xe8)
+#define OPC_CMOVCC      (0x40 | P_EXT)  /* ... plus condition code */
 #define OPC_CMP_GvEv	(OPC_ARITH_GvEv | (ARITH_CMP << 3))
 #define OPC_DEC_r32	(0x48)
 #define OPC_IMUL_GvEv	(0xaf | P_EXT)
@@ -936,6 +937,24 @@ static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
 }
 #endif
 
+static void tcg_out_movcond32(TCGContext *s, TCGCond cond, TCGArg dest,
+                              TCGArg c1, TCGArg c2, int const_c2,
+                              TCGArg v1)
+{
+    tcg_out_cmp(s, c1, c2, const_c2, 0);
+    tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond], dest, v1);
+}
+
+#if TCG_TARGET_REG_BITS == 64
+static void tcg_out_movcond64(TCGContext *s, TCGCond cond, TCGArg dest,
+                              TCGArg c1, TCGArg c2, int const_c2,
+                              TCGArg v1)
+{
+    tcg_out_cmp(s, c1, c2, const_c2, P_REXW);
+    tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond] | P_REXW, dest, v1);
+}
+#endif
+
 static void tcg_out_branch(TCGContext *s, int call, tcg_target_long dest)
 {
     tcg_target_long disp = dest - (tcg_target_long)s->code_ptr - 5;
@@ -1668,6 +1687,10 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
         tcg_out_setcond32(s, args[3], args[0], args[1],
                           args[2], const_args[2]);
         break;
+    case INDEX_op_movcond_i32:
+        tcg_out_movcond32(s, args[5], args[0], args[1],
+                          args[2], const_args[2], args[3]);
+        break;
 
     OP_32_64(bswap16):
         tcg_out_rolw_8(s, args[0]);
@@ -1796,6 +1819,10 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
         tcg_out_setcond64(s, args[3], args[0], args[1],
                           args[2], const_args[2]);
         break;
+    case INDEX_op_movcond_i64:
+        tcg_out_movcond64(s, args[5], args[0], args[1],
+                          args[2], const_args[2], args[3]);
+        break;
 
     case INDEX_op_bswap64_i64:
         tcg_out_bswap64(s, args[0]);
@@ -1880,6 +1907,7 @@ static const TCGTargetOpDef x86_op_defs[] = {
     { INDEX_op_setcond_i32, { "q", "r", "ri" } },
 
     { INDEX_op_deposit_i32, { "Q", "0", "Q" } },
+    { INDEX_op_movcond_i32, { "r", "r", "ri", "r", "0" } },
 
 #if TCG_TARGET_REG_BITS == 32
     { INDEX_op_mulu2_i32, { "a", "d", "a", "r" } },
@@ -1934,6 +1962,7 @@ static const TCGTargetOpDef x86_op_defs[] = {
     { INDEX_op_ext32u_i64, { "r", "r" } },
 
     { INDEX_op_deposit_i64, { "Q", "0", "Q" } },
+    { INDEX_op_movcond_i64, { "r", "r", "re", "r", "0" } },
 #endif
 
 #if TCG_TARGET_REG_BITS == 64
diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index 504f953..b356d76 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -86,7 +86,12 @@ typedef enum {
 #define TCG_TARGET_HAS_nand_i32         0
 #define TCG_TARGET_HAS_nor_i32          0
 #define TCG_TARGET_HAS_deposit_i32      1
+#if defined(__x86_64__) || defined(__i686__)
+/* Use cmov only if the compiler is already doing so.  */
+#define TCG_TARGET_HAS_movcond_i32      1
+#else
 #define TCG_TARGET_HAS_movcond_i32      0
+#endif
 
 #if TCG_TARGET_REG_BITS == 64
 #define TCG_TARGET_HAS_div2_i64         1
@@ -108,7 +113,7 @@ typedef enum {
 #define TCG_TARGET_HAS_nand_i64         0
 #define TCG_TARGET_HAS_nor_i64          0
 #define TCG_TARGET_HAS_deposit_i64      1
-#define TCG_TARGET_HAS_movcond_i64      0
+#define TCG_TARGET_HAS_movcond_i64      1
 #endif
 
 #define TCG_TARGET_deposit_i32_valid(ofs, len) \
-- 
1.7.11.4

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [Qemu-devel] [PATCH 4/7] tcg: Optimize movcond for constant comparisons
  2012-09-21 17:13 [Qemu-devel] [PATCH v2 0/7] tcg: movcond Richard Henderson
                   ` (2 preceding siblings ...)
  2012-09-21 17:13 ` [Qemu-devel] [PATCH 3/7] tcg-i386: Implement movcond Richard Henderson
@ 2012-09-21 17:13 ` Richard Henderson
  2012-09-21 17:13 ` [Qemu-devel] [PATCH 5/7] tcg: Optimize two-address commutative operations Richard Henderson
                   ` (4 subsequent siblings)
  8 siblings, 0 replies; 21+ messages in thread
From: Richard Henderson @ 2012-09-21 17:13 UTC (permalink / raw)
  To: qemu-devel; +Cc: Aurelien Jarno

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 tcg/optimize.c | 40 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index 9da333c..26038a6 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -394,6 +394,14 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
                 args[3] = tcg_swap_cond(args[3]);
             }
             break;
+        CASE_OP_32_64(movcond):
+            if (temps[args[1]].state == TCG_TEMP_CONST
+                && temps[args[2]].state != TCG_TEMP_CONST) {
+                tmp = args[1];
+                args[1] = args[2];
+                args[2] = tmp;
+                args[5] = tcg_swap_cond(args[5]);
+            }
         default:
             break;
         }
@@ -614,6 +622,38 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
             }
             args += 4;
             break;
+        CASE_OP_32_64(movcond):
+            if (temps[args[1]].state == TCG_TEMP_CONST
+                && temps[args[2]].state == TCG_TEMP_CONST) {
+                tmp = do_constant_folding_cond(op, temps[args[1]].val,
+                                               temps[args[2]].val, args[5]);
+                if (args[0] == args[4-tmp]
+                    || (temps[args[4-tmp]].state == TCG_TEMP_COPY
+                        && temps[args[4-tmp]].val == args[0])) {
+                    gen_opc_buf[op_index] = INDEX_op_nop;
+                } else if (temps[args[4-tmp]].state == TCG_TEMP_CONST) {
+                    gen_opc_buf[op_index] = op_to_movi(op);
+                    tcg_opt_gen_movi(gen_args, args[0], temps[args[4-tmp]].val,
+                                     nb_temps, nb_globals);
+                    gen_args += 2;
+                } else {
+                    gen_opc_buf[op_index] = op_to_mov(op);
+                    tcg_opt_gen_mov(gen_args, args[0], args[4-tmp],
+                                    nb_temps, nb_globals);
+                    gen_args += 2;
+                }
+            } else {
+                reset_temp(args[0], nb_temps, nb_globals);
+                gen_args[0] = args[0];
+                gen_args[1] = args[1];
+                gen_args[2] = args[2];
+                gen_args[3] = args[3];
+                gen_args[4] = args[4];
+                gen_args[5] = args[5];
+                gen_args += 6;
+            }
+            args += 6;
+            break;
         case INDEX_op_call:
             nb_call_args = (args[0] >> 16) + (args[0] & 0xffff);
             if (!(args[nb_call_args + 1] & (TCG_CALL_CONST | TCG_CALL_PURE))) {
-- 
1.7.11.4

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [Qemu-devel] [PATCH 5/7] tcg: Optimize two-address commutative operations
  2012-09-21 17:13 [Qemu-devel] [PATCH v2 0/7] tcg: movcond Richard Henderson
                   ` (3 preceding siblings ...)
  2012-09-21 17:13 ` [Qemu-devel] [PATCH 4/7] tcg: Optimize movcond for constant comparisons Richard Henderson
@ 2012-09-21 17:13 ` Richard Henderson
  2012-09-21 17:13 ` [Qemu-devel] [PATCH 6/7] tcg: Streamline movcond_i64 using 32-bit arithmetic Richard Henderson
                   ` (3 subsequent siblings)
  8 siblings, 0 replies; 21+ messages in thread
From: Richard Henderson @ 2012-09-21 17:13 UTC (permalink / raw)
  To: qemu-devel; +Cc: Aurelien Jarno

While swapping constants to the second operand, swap
sources matching destinations to the first operand.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 tcg/optimize.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index 26038a6..1be7631 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -334,6 +334,8 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
     const TCGOpDef *def;
     TCGArg *gen_args;
     TCGArg tmp;
+    TCGCond cond;
+
     /* Array VALS has an element for each temp.
        If this temp holds a constant then its value is kept in VALS' element.
        If this temp is a copy of other ones then this equivalence class'
@@ -395,13 +397,24 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
             }
             break;
         CASE_OP_32_64(movcond):
+            cond = args[5];
             if (temps[args[1]].state == TCG_TEMP_CONST
                 && temps[args[2]].state != TCG_TEMP_CONST) {
                 tmp = args[1];
                 args[1] = args[2];
                 args[2] = tmp;
-                args[5] = tcg_swap_cond(args[5]);
+                cond = tcg_swap_cond(cond);
+            }
+            /* For movcond, we canonicalize the "false" input reg to match
+               the destination reg so that the tcg backend can implement
+               a "move if true" operation.  */
+            if (args[0] == args[3]) {
+                tmp = args[3];
+                args[3] = args[4];
+                args[4] = tmp;
+                cond = tcg_invert_cond(cond);
             }
+            args[5] = cond;
         default:
             break;
         }
-- 
1.7.11.4

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [Qemu-devel] [PATCH 6/7] tcg: Streamline movcond_i64 using 32-bit arithmetic
  2012-09-21 17:13 [Qemu-devel] [PATCH v2 0/7] tcg: movcond Richard Henderson
                   ` (4 preceding siblings ...)
  2012-09-21 17:13 ` [Qemu-devel] [PATCH 5/7] tcg: Optimize two-address commutative operations Richard Henderson
@ 2012-09-21 17:13 ` Richard Henderson
  2012-09-21 21:15   ` Aurelien Jarno
  2012-09-22 18:11   ` Aurelien Jarno
  2012-09-21 17:13 ` [Qemu-devel] [PATCH 7/7] tcg: Streamline movcond_i64 using movcond_i32 Richard Henderson
                   ` (2 subsequent siblings)
  8 siblings, 2 replies; 21+ messages in thread
From: Richard Henderson @ 2012-09-21 17:13 UTC (permalink / raw)
  To: qemu-devel; +Cc: Aurelien Jarno

Avoiding 64-bit arithmetic (outside of the compare) reduces the
generated op count from 15 to 12, and the generated code size on
i686 from 105 to 88 bytes.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 tcg/tcg-op.h | 42 +++++++++++++++++++++++++++++++-----------
 1 file changed, 31 insertions(+), 11 deletions(-)

diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h
index 6d28f82..3e375ea 100644
--- a/tcg/tcg-op.h
+++ b/tcg/tcg-op.h
@@ -2141,18 +2141,38 @@ static inline void tcg_gen_movcond_i64(TCGCond cond, TCGv_i64 ret,
                                        TCGv_i64 c1, TCGv_i64 c2,
                                        TCGv_i64 v1, TCGv_i64 v2)
 {
-    if (TCG_TARGET_HAS_movcond_i64) {
-        tcg_gen_op6i_i64(INDEX_op_movcond_i64, ret, c1, c2, v1, v2, cond);
+    if (TCG_TARGET_REG_BITS == 32) {
+        TCGv_i32 t0 = tcg_temp_new_i32();
+        TCGv_i32 t1 = tcg_temp_new_i32();
+        tcg_gen_op6i_i32(INDEX_op_setcond2_i32, t0,
+                         TCGV_LOW(c1), TCGV_HIGH(c1),
+                         TCGV_LOW(c2), TCGV_HIGH(c2), cond);
+        tcg_gen_neg_i32(t0, t0);
+
+        tcg_gen_and_i32(t1, TCGV_LOW(v1), t0);
+        tcg_gen_andc_i32(TCGV_LOW(ret), TCGV_LOW(v2), t0);
+        tcg_gen_or_i32(TCGV_LOW(ret), TCGV_LOW(ret), t1);
+
+        tcg_gen_and_i32(t1, TCGV_HIGH(v1), t0);
+        tcg_gen_andc_i32(TCGV_HIGH(ret), TCGV_HIGH(v2), t0);
+        tcg_gen_or_i32(TCGV_HIGH(ret), TCGV_HIGH(ret), t1);
+
+        tcg_temp_free_i32(t0);
+        tcg_temp_free_i32(t1);
     } else {
-        TCGv_i64 t0 = tcg_temp_new_i64();
-        TCGv_i64 t1 = tcg_temp_new_i64();
-        tcg_gen_setcond_i64(cond, t0, c1, c2);
-        tcg_gen_neg_i64(t0, t0);
-        tcg_gen_and_i64(t1, v1, t0);
-        tcg_gen_andc_i64(ret, v2, t0);
-        tcg_gen_or_i64(ret, ret, t1);
-        tcg_temp_free_i64(t0);
-        tcg_temp_free_i64(t1);
+        if (TCG_TARGET_HAS_movcond_i64) {
+            tcg_gen_op6i_i64(INDEX_op_movcond_i64, ret, c1, c2, v1, v2, cond);
+        } else {
+            TCGv_i64 t0 = tcg_temp_new_i64();
+            TCGv_i64 t1 = tcg_temp_new_i64();
+            tcg_gen_setcond_i64(cond, t0, c1, c2);
+            tcg_gen_neg_i64(t0, t0);
+            tcg_gen_and_i64(t1, v1, t0);
+            tcg_gen_andc_i64(ret, v2, t0);
+            tcg_gen_or_i64(ret, ret, t1);
+            tcg_temp_free_i64(t0);
+            tcg_temp_free_i64(t1);
+        }
     }
 }
 
-- 
1.7.11.4

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [Qemu-devel] [PATCH 7/7] tcg: Streamline movcond_i64 using movcond_i32
  2012-09-21 17:13 [Qemu-devel] [PATCH v2 0/7] tcg: movcond Richard Henderson
                   ` (5 preceding siblings ...)
  2012-09-21 17:13 ` [Qemu-devel] [PATCH 6/7] tcg: Streamline movcond_i64 using 32-bit arithmetic Richard Henderson
@ 2012-09-21 17:13 ` Richard Henderson
  2012-09-21 21:23   ` Aurelien Jarno
  2012-09-21 18:14 ` [Qemu-devel] [PATCH v2 0/7] tcg: movcond Aurelien Jarno
  2012-09-21 20:10 ` [Qemu-devel] [PATCH v2 0/7] tcg: movcond (ppc32 version) malc
  8 siblings, 1 reply; 21+ messages in thread
From: Richard Henderson @ 2012-09-21 17:13 UTC (permalink / raw)
  To: qemu-devel; +Cc: Aurelien Jarno

When movcond_i32 is available we can further reduce the generated
op count from 12 to 6, and the generated code size on i686 from
88 to 74 bytes.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 tcg/tcg-op.h | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h
index 3e375ea..0145a09 100644
--- a/tcg/tcg-op.h
+++ b/tcg/tcg-op.h
@@ -2147,16 +2147,24 @@ static inline void tcg_gen_movcond_i64(TCGCond cond, TCGv_i64 ret,
         tcg_gen_op6i_i32(INDEX_op_setcond2_i32, t0,
                          TCGV_LOW(c1), TCGV_HIGH(c1),
                          TCGV_LOW(c2), TCGV_HIGH(c2), cond);
-        tcg_gen_neg_i32(t0, t0);
 
-        tcg_gen_and_i32(t1, TCGV_LOW(v1), t0);
-        tcg_gen_andc_i32(TCGV_LOW(ret), TCGV_LOW(v2), t0);
-        tcg_gen_or_i32(TCGV_LOW(ret), TCGV_LOW(ret), t1);
+        if (TCG_TARGET_HAS_movcond_i32) {
+            tcg_gen_movi_i32(t1, 0);
+            tcg_gen_movcond_i32(TCG_COND_NE, TCGV_LOW(ret), t0, t1,
+                                TCGV_LOW(v1), TCGV_LOW(v2));
+            tcg_gen_movcond_i32(TCG_COND_NE, TCGV_HIGH(ret), t0, t1,
+                                TCGV_HIGH(v1), TCGV_HIGH(v2));
+        } else {
+            tcg_gen_neg_i32(t0, t0);
 
-        tcg_gen_and_i32(t1, TCGV_HIGH(v1), t0);
-        tcg_gen_andc_i32(TCGV_HIGH(ret), TCGV_HIGH(v2), t0);
-        tcg_gen_or_i32(TCGV_HIGH(ret), TCGV_HIGH(ret), t1);
+            tcg_gen_and_i32(t1, TCGV_LOW(v1), t0);
+            tcg_gen_andc_i32(TCGV_LOW(ret), TCGV_LOW(v2), t0);
+            tcg_gen_or_i32(TCGV_LOW(ret), TCGV_LOW(ret), t1);
 
+            tcg_gen_and_i32(t1, TCGV_HIGH(v1), t0);
+            tcg_gen_andc_i32(TCGV_HIGH(ret), TCGV_HIGH(v2), t0);
+            tcg_gen_or_i32(TCGV_HIGH(ret), TCGV_HIGH(ret), t1);
+        }
         tcg_temp_free_i32(t0);
         tcg_temp_free_i32(t1);
     } else {
-- 
1.7.11.4

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [Qemu-devel] [PATCH v2 0/7] tcg: movcond
  2012-09-21 17:13 [Qemu-devel] [PATCH v2 0/7] tcg: movcond Richard Henderson
                   ` (6 preceding siblings ...)
  2012-09-21 17:13 ` [Qemu-devel] [PATCH 7/7] tcg: Streamline movcond_i64 using movcond_i32 Richard Henderson
@ 2012-09-21 18:14 ` Aurelien Jarno
  2012-09-21 20:10 ` [Qemu-devel] [PATCH v2 0/7] tcg: movcond (ppc32 version) malc
  8 siblings, 0 replies; 21+ messages in thread
From: Aurelien Jarno @ 2012-09-21 18:14 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel

On Fri, Sep 21, 2012 at 10:13:33AM -0700, Richard Henderson wrote:
> The patch set is available at
> 
>    git://repo.or.cz/qemu/rth.git tcg-movcond
> 
> Changes v1->v2:
>    Patch 1 drops the spurrious return.
> 
>    Patch 4 incorporates the fix Aurelian posted.
> 
>    Patch 5 drops the non-movcond commutative optimization that is
>      present in Aurelian's tcg optimization patch set.
> 
>    Patches 6 & 7 are new, incrementally improving 32-bit code generation.
> 
>    Rebased vs c26032b2c91721245bfec542d94f37a0238e986e:
>      target-xtensa: don't emit extra tcg_gen_goto_tb
> 

Thanks for the new version. I have applied patches 1 to 5. I leave some
time for people to review patches 6 and 7.

-- 
Aurelien Jarno	                        GPG: 1024D/F1BCDB73
aurelien@aurel32.net                 http://www.aurel32.net

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Qemu-devel] [PATCH v2 0/7] tcg: movcond (ppc32 version)
  2012-09-21 17:13 [Qemu-devel] [PATCH v2 0/7] tcg: movcond Richard Henderson
                   ` (7 preceding siblings ...)
  2012-09-21 18:14 ` [Qemu-devel] [PATCH v2 0/7] tcg: movcond Aurelien Jarno
@ 2012-09-21 20:10 ` malc
  2012-09-21 22:21   ` Richard Henderson
  2012-09-22 14:38   ` Blue Swirl
  8 siblings, 2 replies; 21+ messages in thread
From: malc @ 2012-09-21 20:10 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel, Aurelien Jarno

diff --git a/tcg/ppc/tcg-target.c b/tcg/ppc/tcg-target.c
index 26c4b33..0fb6fc7 100644
--- a/tcg/ppc/tcg-target.c
+++ b/tcg/ppc/tcg-target.c
@@ -390,6 +390,7 @@ static int tcg_target_const_match(tcg_target_long val,
 #define ORC    XO31(412)
 #define EQV    XO31(284)
 #define NAND   XO31(476)
+#define ISEL   XO31( 15)
 
 #define LBZX   XO31( 87)
 #define LHZX   XO31(279)
@@ -1269,6 +1270,75 @@ static void tcg_out_setcond2 (TCGContext *s, const TCGArg *args,
         );
 }
 
+static void tcg_out_movcond (TCGContext *s, TCGCond cond,
+                             TCGArg dest,
+                             TCGArg c1, TCGArg c2,
+                             TCGArg v1, TCGArg v2,
+                             int const_c2)
+{
+    tcg_out_cmp (s, cond, c1, c2, const_c2, 7);
+
+    if (1) {
+        /* At least here on 7747A bit twiddling hacks are outperformed
+           by jumpy code (the testing was not scientific) */
+        void *label_ptr;
+
+        if (dest == v2) {
+            label_ptr = s->code_ptr;
+            tcg_out32 (s, tcg_to_bc[tcg_invert_cond (cond)]);
+            tcg_out_mov (s, TCG_TYPE_I32, dest, v1);
+            reloc_pc14 (label_ptr, (tcg_target_long) s->code_ptr);
+        }
+        else {
+            tcg_out_mov (s, TCG_TYPE_I32, dest, v1);
+            label_ptr = s->code_ptr;
+            tcg_out32 (s, tcg_to_bc[cond]);
+            tcg_out_mov (s, TCG_TYPE_I32, dest, v2);
+            reloc_pc14 (label_ptr, (tcg_target_long) s->code_ptr);
+        }
+    }
+    else {
+        /* isel version, if (1) above should be replaced once a way to
+           figure out availability of isel on the underlying hardware
+           is found */
+        int tab, bc;
+
+        switch (cond) {
+        case TCG_COND_EQ:
+            tab = TAB (dest, v1, v2);
+            bc = CR_EQ;
+            break;
+        case TCG_COND_NE:
+            tab = TAB (dest, v2, v1);
+            bc = CR_EQ;
+            break;
+        case TCG_COND_LTU:
+        case TCG_COND_LT:
+            tab = TAB (dest, v1, v2);
+            bc = CR_LT;
+            break;
+        case TCG_COND_GEU:
+        case TCG_COND_GE:
+            tab = TAB (dest, v2, v1);
+            bc = CR_LT;
+            break;
+        case TCG_COND_LEU:
+        case TCG_COND_LE:
+            tab = TAB (dest, v2, v1);
+            bc = CR_GT;
+            break;
+        case TCG_COND_GTU:
+        case TCG_COND_GT:
+            tab = TAB (dest, v1, v2);
+            bc = CR_GT;
+            break;
+        default:
+            tcg_abort ();
+        }
+        tcg_out32 (s, ISEL | tab | ((bc + 28) << 6));
+    }
+}
+
 static void tcg_out_brcond (TCGContext *s, TCGCond cond,
                             TCGArg arg1, TCGArg arg2, int const_arg2,
                             int label_index)
@@ -1826,6 +1896,13 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
             );
         break;
 
+    case INDEX_op_movcond_i32:
+        tcg_out_movcond (s, args[5], args[0],
+                         args[1], args[2],
+                         args[3], args[4],
+                         const_args[2]);
+        break;
+
     default:
         tcg_dump_ops (s);
         tcg_abort ();
@@ -1922,6 +1999,7 @@ static const TCGTargetOpDef ppc_op_defs[] = {
     { INDEX_op_ext16u_i32, { "r", "r" } },
 
     { INDEX_op_deposit_i32, { "r", "0", "r" } },
+    { INDEX_op_movcond_i32, { "r", "r", "ri", "r", "r" } },
 
     { -1 },
 };
diff --git a/tcg/ppc/tcg-target.h b/tcg/ppc/tcg-target.h
index 177eea1..3259d89 100644
--- a/tcg/ppc/tcg-target.h
+++ b/tcg/ppc/tcg-target.h
@@ -92,7 +92,7 @@ typedef enum {
 #define TCG_TARGET_HAS_nand_i32         1
 #define TCG_TARGET_HAS_nor_i32          1
 #define TCG_TARGET_HAS_deposit_i32      1
-#define TCG_TARGET_HAS_movcond_i32      0
+#define TCG_TARGET_HAS_movcond_i32      1
 
 #define TCG_AREG0 TCG_REG_R27
 

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [Qemu-devel] [PATCH 6/7] tcg: Streamline movcond_i64 using 32-bit arithmetic
  2012-09-21 17:13 ` [Qemu-devel] [PATCH 6/7] tcg: Streamline movcond_i64 using 32-bit arithmetic Richard Henderson
@ 2012-09-21 21:15   ` Aurelien Jarno
  2012-09-22 18:11   ` Aurelien Jarno
  1 sibling, 0 replies; 21+ messages in thread
From: Aurelien Jarno @ 2012-09-21 21:15 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel

On Fri, Sep 21, 2012 at 10:13:39AM -0700, Richard Henderson wrote:
> Avoiding 64-bit arithmetic (outside of the compare) reduces the
> generated op count from 15 to 12, and the generated code size on
> i686 from 105 to 88 bytes.
> 
> Signed-off-by: Richard Henderson <rth@twiddle.net>
> ---
>  tcg/tcg-op.h | 42 +++++++++++++++++++++++++++++++-----------
>  1 file changed, 31 insertions(+), 11 deletions(-)
> 
> diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h
> index 6d28f82..3e375ea 100644
> --- a/tcg/tcg-op.h
> +++ b/tcg/tcg-op.h
> @@ -2141,18 +2141,38 @@ static inline void tcg_gen_movcond_i64(TCGCond cond, TCGv_i64 ret,
>                                         TCGv_i64 c1, TCGv_i64 c2,
>                                         TCGv_i64 v1, TCGv_i64 v2)
>  {
> -    if (TCG_TARGET_HAS_movcond_i64) {
> -        tcg_gen_op6i_i64(INDEX_op_movcond_i64, ret, c1, c2, v1, v2, cond);
> +    if (TCG_TARGET_REG_BITS == 32) {
> +        TCGv_i32 t0 = tcg_temp_new_i32();
> +        TCGv_i32 t1 = tcg_temp_new_i32();
> +        tcg_gen_op6i_i32(INDEX_op_setcond2_i32, t0,
> +                         TCGV_LOW(c1), TCGV_HIGH(c1),
> +                         TCGV_LOW(c2), TCGV_HIGH(c2), cond);
> +        tcg_gen_neg_i32(t0, t0);
> +
> +        tcg_gen_and_i32(t1, TCGV_LOW(v1), t0);
> +        tcg_gen_andc_i32(TCGV_LOW(ret), TCGV_LOW(v2), t0);
> +        tcg_gen_or_i32(TCGV_LOW(ret), TCGV_LOW(ret), t1);
> +
> +        tcg_gen_and_i32(t1, TCGV_HIGH(v1), t0);
> +        tcg_gen_andc_i32(TCGV_HIGH(ret), TCGV_HIGH(v2), t0);
> +        tcg_gen_or_i32(TCGV_HIGH(ret), TCGV_HIGH(ret), t1);
> +
> +        tcg_temp_free_i32(t0);
> +        tcg_temp_free_i32(t1);
>      } else {
> -        TCGv_i64 t0 = tcg_temp_new_i64();
> -        TCGv_i64 t1 = tcg_temp_new_i64();
> -        tcg_gen_setcond_i64(cond, t0, c1, c2);
> -        tcg_gen_neg_i64(t0, t0);
> -        tcg_gen_and_i64(t1, v1, t0);
> -        tcg_gen_andc_i64(ret, v2, t0);
> -        tcg_gen_or_i64(ret, ret, t1);
> -        tcg_temp_free_i64(t0);
> -        tcg_temp_free_i64(t1);
> +        if (TCG_TARGET_HAS_movcond_i64) {
> +            tcg_gen_op6i_i64(INDEX_op_movcond_i64, ret, c1, c2, v1, v2, cond);
> +        } else {
> +            TCGv_i64 t0 = tcg_temp_new_i64();
> +            TCGv_i64 t1 = tcg_temp_new_i64();
> +            tcg_gen_setcond_i64(cond, t0, c1, c2);
> +            tcg_gen_neg_i64(t0, t0);
> +            tcg_gen_and_i64(t1, v1, t0);
> +            tcg_gen_andc_i64(ret, v2, t0);
> +            tcg_gen_or_i64(ret, ret, t1);
> +            tcg_temp_free_i64(t0);
> +            tcg_temp_free_i64(t1);
> +        }
>      }
>  }
>  

Reviewed-by: Aurelien Jarno <aurelien@aurel32.net>

-- 
Aurelien Jarno                          GPG: 1024D/F1BCDB73
aurelien@aurel32.net                 http://www.aurel32.net

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Qemu-devel] [PATCH 7/7] tcg: Streamline movcond_i64 using movcond_i32
  2012-09-21 17:13 ` [Qemu-devel] [PATCH 7/7] tcg: Streamline movcond_i64 using movcond_i32 Richard Henderson
@ 2012-09-21 21:23   ` Aurelien Jarno
  2012-09-21 22:27     ` Richard Henderson
  0 siblings, 1 reply; 21+ messages in thread
From: Aurelien Jarno @ 2012-09-21 21:23 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel

On Fri, Sep 21, 2012 at 10:13:40AM -0700, Richard Henderson wrote:
> When movcond_i32 is available we can further reduce the generated
> op count from 12 to 6, and the generated code size on i686 from
> 88 to 74 bytes.
> 
> Signed-off-by: Richard Henderson <rth@twiddle.net>
> ---
>  tcg/tcg-op.h | 22 +++++++++++++++-------
>  1 file changed, 15 insertions(+), 7 deletions(-)
> 
> diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h
> index 3e375ea..0145a09 100644
> --- a/tcg/tcg-op.h
> +++ b/tcg/tcg-op.h
> @@ -2147,16 +2147,24 @@ static inline void tcg_gen_movcond_i64(TCGCond cond, TCGv_i64 ret,
>          tcg_gen_op6i_i32(INDEX_op_setcond2_i32, t0,
>                           TCGV_LOW(c1), TCGV_HIGH(c1),
>                           TCGV_LOW(c2), TCGV_HIGH(c2), cond);
> -        tcg_gen_neg_i32(t0, t0);
>  
> -        tcg_gen_and_i32(t1, TCGV_LOW(v1), t0);
> -        tcg_gen_andc_i32(TCGV_LOW(ret), TCGV_LOW(v2), t0);
> -        tcg_gen_or_i32(TCGV_LOW(ret), TCGV_LOW(ret), t1);
> +        if (TCG_TARGET_HAS_movcond_i32) {
> +            tcg_gen_movi_i32(t1, 0);
> +            tcg_gen_movcond_i32(TCG_COND_NE, TCGV_LOW(ret), t0, t1,
> +                                TCGV_LOW(v1), TCGV_LOW(v2));
> +            tcg_gen_movcond_i32(TCG_COND_NE, TCGV_HIGH(ret), t0, t1,
> +                                TCGV_HIGH(v1), TCGV_HIGH(v2));
> +        } else {
> +            tcg_gen_neg_i32(t0, t0);
>  
> -        tcg_gen_and_i32(t1, TCGV_HIGH(v1), t0);
> -        tcg_gen_andc_i32(TCGV_HIGH(ret), TCGV_HIGH(v2), t0);
> -        tcg_gen_or_i32(TCGV_HIGH(ret), TCGV_HIGH(ret), t1);
> +            tcg_gen_and_i32(t1, TCGV_LOW(v1), t0);
> +            tcg_gen_andc_i32(TCGV_LOW(ret), TCGV_LOW(v2), t0);
> +            tcg_gen_or_i32(TCGV_LOW(ret), TCGV_LOW(ret), t1);
>  
> +            tcg_gen_and_i32(t1, TCGV_HIGH(v1), t0);
> +            tcg_gen_andc_i32(TCGV_HIGH(ret), TCGV_HIGH(v2), t0);
> +            tcg_gen_or_i32(TCGV_HIGH(ret), TCGV_HIGH(ret), t1);
> +        }
>          tcg_temp_free_i32(t0);
>          tcg_temp_free_i32(t1);
>      } else {

At some point I tried to think how to implement movcond_i64 for 
MIPS directly in the backend. I just tried your patch, and I got
this kind of code:

| 0x2bb2ae58:  sltu       at,zero,s4
| 0x2bb2ae5c:  sltu       t0,zero,s3
| 0x2bb2ae60:  or s3,at,t0
| 0x2bb2ae64:  movz       s1,s5,s3
| 0x2bb2ae68:  movz       s2,s6,s3
|
| (in some cases some constants/globals loading appear in the middle, but
| that's not due to movcond).

It's basically the kind of code I would have written. It's clearly
better to implement it directly in TCG.

Now I wonder if it wouldn't be better to write brcond2 as setcond2 +
brcond. And even setcond2 as a pair of setcond in TCG, which would allow
some optimizations in case both high parts are zero.

Tested-by: Aurelien Jarno <aurelien@aurel32.net>
Reviewed-by: Aurelien Jarno <aurelien@aurel32.net>

-- 
Aurelien Jarno                          GPG: 1024D/F1BCDB73
aurelien@aurel32.net                 http://www.aurel32.net

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Qemu-devel] [PATCH v2 0/7] tcg: movcond (ppc32 version)
  2012-09-21 20:10 ` [Qemu-devel] [PATCH v2 0/7] tcg: movcond (ppc32 version) malc
@ 2012-09-21 22:21   ` Richard Henderson
  2012-09-21 22:34     ` malc
  2012-09-22 14:38   ` Blue Swirl
  1 sibling, 1 reply; 21+ messages in thread
From: Richard Henderson @ 2012-09-21 22:21 UTC (permalink / raw)
  To: malc; +Cc: qemu-devel, Aurelien Jarno

On 09/21/2012 01:10 PM, malc wrote:
> +        if (dest == v2) {
> +            label_ptr = s->code_ptr;
> +            tcg_out32 (s, tcg_to_bc[tcg_invert_cond (cond)]);
> +            tcg_out_mov (s, TCG_TYPE_I32, dest, v1);
> +            reloc_pc14 (label_ptr, (tcg_target_long) s->code_ptr);
> +        }
> +        else {
> +            tcg_out_mov (s, TCG_TYPE_I32, dest, v1);
> +            label_ptr = s->code_ptr;
> +            tcg_out32 (s, tcg_to_bc[cond]);
> +            tcg_out_mov (s, TCG_TYPE_I32, dest, v2);
> +            reloc_pc14 (label_ptr, (tcg_target_long) s->code_ptr);
> +        }

How about

    if (dest == v2) {
        cond = tcg_invert_cond(cond);
        v2 = v1;
    } else if (dest != v1) {
        tcg_out_mov(s, TCG_TYPE_I32, dest, v1);
    }
    /* Branch forward over one insn.  */
    tcg_out32 (s, tcg_to_bc[cond] | 4);
    tcg_out_mov(s, TCG_TYPE_I32, dest, v2);

which avoids an extra mov if dest == v1, and also minimizes the code.


r~

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Qemu-devel] [PATCH 7/7] tcg: Streamline movcond_i64 using movcond_i32
  2012-09-21 21:23   ` Aurelien Jarno
@ 2012-09-21 22:27     ` Richard Henderson
  2012-09-22  9:30       ` Aurelien Jarno
  0 siblings, 1 reply; 21+ messages in thread
From: Richard Henderson @ 2012-09-21 22:27 UTC (permalink / raw)
  To: Aurelien Jarno; +Cc: qemu-devel

On 09/21/2012 02:23 PM, Aurelien Jarno wrote:
> Now I wonder if it wouldn't be better to write brcond2 as setcond2 +
> brcond. And even setcond2 as a pair of setcond in TCG, which would allow
> some optimizations in case both high parts are zero.

I think brcond2 vs setcond2 is a choice that has to be made on a
host-by-host basis.  E.g. for i386 we implement setcond2 with branches.
E.g. for hppa setcond2, while not using branches, is twice the size of
brcond2.  But there's nothing saying you couldn't have the mips
version of brcond2 use setcond2 internals to do its job.

On the other hand, having tcg/optimize.c reduce both setcond2 and brcond2
to setcond and brcond with the appropriate values of zero would be a most
welcome improvement.

Also, I've been thinking about having a tcg.h function that produces the
cond-without-equality table that I introduced to fix hppa recently.  Using
that could reduce code size in some of the other backends as well.


r~

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Qemu-devel] [PATCH v2 0/7] tcg: movcond (ppc32 version)
  2012-09-21 22:21   ` Richard Henderson
@ 2012-09-21 22:34     ` malc
  0 siblings, 0 replies; 21+ messages in thread
From: malc @ 2012-09-21 22:34 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel, Aurelien Jarno

On Fri, 21 Sep 2012, Richard Henderson wrote:

> On 09/21/2012 01:10 PM, malc wrote:
> > +        if (dest == v2) {
> > +            label_ptr = s->code_ptr;
> > +            tcg_out32 (s, tcg_to_bc[tcg_invert_cond (cond)]);
> > +            tcg_out_mov (s, TCG_TYPE_I32, dest, v1);
> > +            reloc_pc14 (label_ptr, (tcg_target_long) s->code_ptr);
> > +        }
> > +        else {
> > +            tcg_out_mov (s, TCG_TYPE_I32, dest, v1);
> > +            label_ptr = s->code_ptr;
> > +            tcg_out32 (s, tcg_to_bc[cond]);
> > +            tcg_out_mov (s, TCG_TYPE_I32, dest, v2);
> > +            reloc_pc14 (label_ptr, (tcg_target_long) s->code_ptr);
> > +        }
> 
> How about
> 
>     if (dest == v2) {
>         cond = tcg_invert_cond(cond);
>         v2 = v1;
>     } else if (dest != v1) {
>         tcg_out_mov(s, TCG_TYPE_I32, dest, v1);
>     }
>     /* Branch forward over one insn.  */
>     tcg_out32 (s, tcg_to_bc[cond] | 4);
>     tcg_out_mov(s, TCG_TYPE_I32, dest, v2);
> 
> which avoids an extra mov if dest == v1, and also minimizes the code.

Yes, thanks, that's better (save for | 4 part which is 4 too little)

-- 
mailto:av1474@comtv.ru

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Qemu-devel] [PATCH 7/7] tcg: Streamline movcond_i64 using movcond_i32
  2012-09-21 22:27     ` Richard Henderson
@ 2012-09-22  9:30       ` Aurelien Jarno
  0 siblings, 0 replies; 21+ messages in thread
From: Aurelien Jarno @ 2012-09-22  9:30 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel

On Fri, Sep 21, 2012 at 03:27:52PM -0700, Richard Henderson wrote:
> On 09/21/2012 02:23 PM, Aurelien Jarno wrote:
> > Now I wonder if it wouldn't be better to write brcond2 as setcond2 +
> > brcond. And even setcond2 as a pair of setcond in TCG, which would allow
> > some optimizations in case both high parts are zero.
> 
> I think brcond2 vs setcond2 is a choice that has to be made on a
> host-by-host basis.  E.g. for i386 we implement setcond2 with branches.
> E.g. for hppa setcond2, while not using branches, is twice the size of
> brcond2.  But there's nothing saying you couldn't have the mips
> version of brcond2 use setcond2 internals to do its job.

Yes and now. Currently the MIPS version of setcond2, uses 2 temporary
registers and the return argument. This is therefore not possible to
make brcond2 to use setcond2. 

I have just seen it's possible to reorder the instructions to use two 
temporaries only, but you see the problem there. Doing that as the TCG
level means you can use more temporaries, at the backend level means you
have to play with the registers you have.

> On the other hand, having tcg/optimize.c reduce both setcond2 and brcond2
> to setcond and brcond with the appropriate values of zero would be a most
> welcome improvement.

That's a good idea.

> Also, I've been thinking about having a tcg.h function that produces the
> cond-without-equality table that I introduced to fix hppa recently.  Using
> that could reduce code size in some of the other backends as well.
> 

I am using such a table in the MIPS backend, but done with a big switch.

-- 
Aurelien Jarno                          GPG: 1024D/F1BCDB73
aurelien@aurel32.net                 http://www.aurel32.net

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Qemu-devel] [PATCH v2 0/7] tcg: movcond (ppc32 version)
  2012-09-21 20:10 ` [Qemu-devel] [PATCH v2 0/7] tcg: movcond (ppc32 version) malc
  2012-09-21 22:21   ` Richard Henderson
@ 2012-09-22 14:38   ` Blue Swirl
  1 sibling, 0 replies; 21+ messages in thread
From: Blue Swirl @ 2012-09-22 14:38 UTC (permalink / raw)
  To: malc; +Cc: qemu-devel, Aurelien Jarno, Richard Henderson

On Fri, Sep 21, 2012 at 8:10 PM, malc <av1474@comtv.ru> wrote:
> diff --git a/tcg/ppc/tcg-target.c b/tcg/ppc/tcg-target.c
> index 26c4b33..0fb6fc7 100644
> --- a/tcg/ppc/tcg-target.c
> +++ b/tcg/ppc/tcg-target.c
> @@ -390,6 +390,7 @@ static int tcg_target_const_match(tcg_target_long val,
>  #define ORC    XO31(412)
>  #define EQV    XO31(284)
>  #define NAND   XO31(476)
> +#define ISEL   XO31( 15)
>
>  #define LBZX   XO31( 87)
>  #define LHZX   XO31(279)
> @@ -1269,6 +1270,75 @@ static void tcg_out_setcond2 (TCGContext *s, const TCGArg *args,
>          );
>  }
>
> +static void tcg_out_movcond (TCGContext *s, TCGCond cond,
> +                             TCGArg dest,
> +                             TCGArg c1, TCGArg c2,
> +                             TCGArg v1, TCGArg v2,
> +                             int const_c2)
> +{
> +    tcg_out_cmp (s, cond, c1, c2, const_c2, 7);
> +
> +    if (1) {
> +        /* At least here on 7747A bit twiddling hacks are outperformed
> +           by jumpy code (the testing was not scientific) */
> +        void *label_ptr;
> +
> +        if (dest == v2) {
> +            label_ptr = s->code_ptr;
> +            tcg_out32 (s, tcg_to_bc[tcg_invert_cond (cond)]);
> +            tcg_out_mov (s, TCG_TYPE_I32, dest, v1);
> +            reloc_pc14 (label_ptr, (tcg_target_long) s->code_ptr);
> +        }
> +        else {

} else {

> +            tcg_out_mov (s, TCG_TYPE_I32, dest, v1);
> +            label_ptr = s->code_ptr;
> +            tcg_out32 (s, tcg_to_bc[cond]);
> +            tcg_out_mov (s, TCG_TYPE_I32, dest, v2);
> +            reloc_pc14 (label_ptr, (tcg_target_long) s->code_ptr);
> +        }
> +    }
> +    else {

} else {

> +        /* isel version, if (1) above should be replaced once a way to
> +           figure out availability of isel on the underlying hardware
> +           is found */
> +        int tab, bc;
> +
> +        switch (cond) {
> +        case TCG_COND_EQ:
> +            tab = TAB (dest, v1, v2);
> +            bc = CR_EQ;
> +            break;
> +        case TCG_COND_NE:
> +            tab = TAB (dest, v2, v1);
> +            bc = CR_EQ;
> +            break;
> +        case TCG_COND_LTU:
> +        case TCG_COND_LT:
> +            tab = TAB (dest, v1, v2);
> +            bc = CR_LT;
> +            break;
> +        case TCG_COND_GEU:
> +        case TCG_COND_GE:
> +            tab = TAB (dest, v2, v1);
> +            bc = CR_LT;
> +            break;
> +        case TCG_COND_LEU:
> +        case TCG_COND_LE:
> +            tab = TAB (dest, v2, v1);
> +            bc = CR_GT;
> +            break;
> +        case TCG_COND_GTU:
> +        case TCG_COND_GT:
> +            tab = TAB (dest, v1, v2);
> +            bc = CR_GT;
> +            break;
> +        default:
> +            tcg_abort ();
> +        }
> +        tcg_out32 (s, ISEL | tab | ((bc + 28) << 6));
> +    }
> +}
> +
>  static void tcg_out_brcond (TCGContext *s, TCGCond cond,
>                              TCGArg arg1, TCGArg arg2, int const_arg2,
>                              int label_index)
> @@ -1826,6 +1896,13 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
>              );
>          break;
>
> +    case INDEX_op_movcond_i32:
> +        tcg_out_movcond (s, args[5], args[0],
> +                         args[1], args[2],
> +                         args[3], args[4],
> +                         const_args[2]);
> +        break;
> +
>      default:
>          tcg_dump_ops (s);
>          tcg_abort ();
> @@ -1922,6 +1999,7 @@ static const TCGTargetOpDef ppc_op_defs[] = {
>      { INDEX_op_ext16u_i32, { "r", "r" } },
>
>      { INDEX_op_deposit_i32, { "r", "0", "r" } },
> +    { INDEX_op_movcond_i32, { "r", "r", "ri", "r", "r" } },
>
>      { -1 },
>  };
> diff --git a/tcg/ppc/tcg-target.h b/tcg/ppc/tcg-target.h
> index 177eea1..3259d89 100644
> --- a/tcg/ppc/tcg-target.h
> +++ b/tcg/ppc/tcg-target.h
> @@ -92,7 +92,7 @@ typedef enum {
>  #define TCG_TARGET_HAS_nand_i32         1
>  #define TCG_TARGET_HAS_nor_i32          1
>  #define TCG_TARGET_HAS_deposit_i32      1
> -#define TCG_TARGET_HAS_movcond_i32      0
> +#define TCG_TARGET_HAS_movcond_i32      1
>
>  #define TCG_AREG0 TCG_REG_R27
>
>

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Qemu-devel] [PATCH 6/7] tcg: Streamline movcond_i64 using 32-bit arithmetic
  2012-09-21 17:13 ` [Qemu-devel] [PATCH 6/7] tcg: Streamline movcond_i64 using 32-bit arithmetic Richard Henderson
  2012-09-21 21:15   ` Aurelien Jarno
@ 2012-09-22 18:11   ` Aurelien Jarno
  1 sibling, 0 replies; 21+ messages in thread
From: Aurelien Jarno @ 2012-09-22 18:11 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel

On Fri, Sep 21, 2012 at 10:13:39AM -0700, Richard Henderson wrote:
> Avoiding 64-bit arithmetic (outside of the compare) reduces the
> generated op count from 15 to 12, and the generated code size on
> i686 from 105 to 88 bytes.
> 
> Signed-off-by: Richard Henderson <rth@twiddle.net>
> ---
>  tcg/tcg-op.h | 42 +++++++++++++++++++++++++++++++-----------
>  1 file changed, 31 insertions(+), 11 deletions(-)
> 
> diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h
> index 6d28f82..3e375ea 100644
> --- a/tcg/tcg-op.h
> +++ b/tcg/tcg-op.h
> @@ -2141,18 +2141,38 @@ static inline void tcg_gen_movcond_i64(TCGCond cond, TCGv_i64 ret,
>                                         TCGv_i64 c1, TCGv_i64 c2,
>                                         TCGv_i64 v1, TCGv_i64 v2)
>  {
> -    if (TCG_TARGET_HAS_movcond_i64) {
> -        tcg_gen_op6i_i64(INDEX_op_movcond_i64, ret, c1, c2, v1, v2, cond);
> +    if (TCG_TARGET_REG_BITS == 32) {

Using such a construction doesn't compile on a 64-bit host.

> +        TCGv_i32 t0 = tcg_temp_new_i32();
> +        TCGv_i32 t1 = tcg_temp_new_i32();
> +        tcg_gen_op6i_i32(INDEX_op_setcond2_i32, t0,
> +                         TCGV_LOW(c1), TCGV_HIGH(c1),
> +                         TCGV_LOW(c2), TCGV_HIGH(c2), cond);

Because there TCGV_LOW and TCGV_HIGH do not exist. You should use #ifdef
instead.

> +        tcg_gen_neg_i32(t0, t0);
> +
> +        tcg_gen_and_i32(t1, TCGV_LOW(v1), t0);
> +        tcg_gen_andc_i32(TCGV_LOW(ret), TCGV_LOW(v2), t0);
> +        tcg_gen_or_i32(TCGV_LOW(ret), TCGV_LOW(ret), t1);
> +
> +        tcg_gen_and_i32(t1, TCGV_HIGH(v1), t0);
> +        tcg_gen_andc_i32(TCGV_HIGH(ret), TCGV_HIGH(v2), t0);
> +        tcg_gen_or_i32(TCGV_HIGH(ret), TCGV_HIGH(ret), t1);
> +
> +        tcg_temp_free_i32(t0);
> +        tcg_temp_free_i32(t1);
>      } else {
> -        TCGv_i64 t0 = tcg_temp_new_i64();
> -        TCGv_i64 t1 = tcg_temp_new_i64();
> -        tcg_gen_setcond_i64(cond, t0, c1, c2);
> -        tcg_gen_neg_i64(t0, t0);
> -        tcg_gen_and_i64(t1, v1, t0);
> -        tcg_gen_andc_i64(ret, v2, t0);
> -        tcg_gen_or_i64(ret, ret, t1);
> -        tcg_temp_free_i64(t0);
> -        tcg_temp_free_i64(t1);
> +        if (TCG_TARGET_HAS_movcond_i64) {
> +            tcg_gen_op6i_i64(INDEX_op_movcond_i64, ret, c1, c2, v1, v2, cond);
> +        } else {
> +            TCGv_i64 t0 = tcg_temp_new_i64();
> +            TCGv_i64 t1 = tcg_temp_new_i64();
> +            tcg_gen_setcond_i64(cond, t0, c1, c2);
> +            tcg_gen_neg_i64(t0, t0);
> +            tcg_gen_and_i64(t1, v1, t0);
> +            tcg_gen_andc_i64(ret, v2, t0);
> +            tcg_gen_or_i64(ret, ret, t1);
> +            tcg_temp_free_i64(t0);
> +            tcg_temp_free_i64(t1);
> +        }
>      }
>  }
>  
> -- 
> 1.7.11.4
> 
> 
> 

-- 
Aurelien Jarno                          GPG: 1024D/F1BCDB73
aurelien@aurel32.net                 http://www.aurel32.net

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Qemu-devel] [PATCH 3/7] tcg-i386: Implement movcond
  2012-09-21 17:13 ` [Qemu-devel] [PATCH 3/7] tcg-i386: Implement movcond Richard Henderson
@ 2012-09-24 21:37   ` Alex Barcelo
  2012-09-24 21:54     ` Richard Henderson
  0 siblings, 1 reply; 21+ messages in thread
From: Alex Barcelo @ 2012-09-24 21:37 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel, Aurelien Jarno

just finished a git-bisect and I found this... and now I do not fully
understand why I have the problem.

To replicate the error (in a i386 machine, at least):
$ make clean && ./configure --enable-debug && make -j && make install
[Note: I tried both ppc and i386 targets, so doesn't seem machine-dependent]
$ ./path/to/qemu/bin/qemu-i386 doesntmatter
Invalid op definition for movcond_i32
/mnt/DATA/DARCO/qemu-git/tcg/tcg.c:1170: tcg fatal error
Aborted

"doesntmatter" can be a non-existing file, or an existing file. It
seems to fail before checking file existence.

On Fri, Sep 21, 2012 at 7:13 PM, Richard Henderson <rth@twiddle.net> wrote:
> Signed-off-by: Richard Henderson <rth@twiddle.net>
> Reviewed-by: Aurelien Jarno <aurelien@aurel32.net>
> ---
>  tcg/i386/tcg-target.c | 29 +++++++++++++++++++++++++++++
>  tcg/i386/tcg-target.h |  7 ++++++-
>  2 files changed, 35 insertions(+), 1 deletion(-)
>
> diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
> index 3017858..aa1fa9f 100644
> --- a/tcg/i386/tcg-target.c
> +++ b/tcg/i386/tcg-target.c
> @@ -249,6 +249,7 @@ static inline int tcg_target_const_match(tcg_target_long val,
>  #define OPC_ADD_GvEv   (OPC_ARITH_GvEv | (ARITH_ADD << 3))
>  #define OPC_BSWAP      (0xc8 | P_EXT)
>  #define OPC_CALL_Jz    (0xe8)
> +#define OPC_CMOVCC      (0x40 | P_EXT)  /* ... plus condition code */
>  #define OPC_CMP_GvEv   (OPC_ARITH_GvEv | (ARITH_CMP << 3))
>  #define OPC_DEC_r32    (0x48)
>  #define OPC_IMUL_GvEv  (0xaf | P_EXT)
> @@ -936,6 +937,24 @@ static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
>  }
>  #endif
>
> +static void tcg_out_movcond32(TCGContext *s, TCGCond cond, TCGArg dest,
> +                              TCGArg c1, TCGArg c2, int const_c2,
> +                              TCGArg v1)
> +{
> +    tcg_out_cmp(s, c1, c2, const_c2, 0);
> +    tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond], dest, v1);
> +}
> +
> +#if TCG_TARGET_REG_BITS == 64
> +static void tcg_out_movcond64(TCGContext *s, TCGCond cond, TCGArg dest,
> +                              TCGArg c1, TCGArg c2, int const_c2,
> +                              TCGArg v1)
> +{
> +    tcg_out_cmp(s, c1, c2, const_c2, P_REXW);
> +    tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond] | P_REXW, dest, v1);
> +}
> +#endif
> +
>  static void tcg_out_branch(TCGContext *s, int call, tcg_target_long dest)
>  {
>      tcg_target_long disp = dest - (tcg_target_long)s->code_ptr - 5;
> @@ -1668,6 +1687,10 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
>          tcg_out_setcond32(s, args[3], args[0], args[1],
>                            args[2], const_args[2]);
>          break;
> +    case INDEX_op_movcond_i32:
> +        tcg_out_movcond32(s, args[5], args[0], args[1],
> +                          args[2], const_args[2], args[3]);
> +        break;
>
>      OP_32_64(bswap16):
>          tcg_out_rolw_8(s, args[0]);
> @@ -1796,6 +1819,10 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
>          tcg_out_setcond64(s, args[3], args[0], args[1],
>                            args[2], const_args[2]);
>          break;
> +    case INDEX_op_movcond_i64:
> +        tcg_out_movcond64(s, args[5], args[0], args[1],
> +                          args[2], const_args[2], args[3]);
> +        break;
>
>      case INDEX_op_bswap64_i64:
>          tcg_out_bswap64(s, args[0]);
> @@ -1880,6 +1907,7 @@ static const TCGTargetOpDef x86_op_defs[] = {
>      { INDEX_op_setcond_i32, { "q", "r", "ri" } },
>
>      { INDEX_op_deposit_i32, { "Q", "0", "Q" } },
> +    { INDEX_op_movcond_i32, { "r", "r", "ri", "r", "0" } },
>
>  #if TCG_TARGET_REG_BITS == 32
>      { INDEX_op_mulu2_i32, { "a", "d", "a", "r" } },
> @@ -1934,6 +1962,7 @@ static const TCGTargetOpDef x86_op_defs[] = {
>      { INDEX_op_ext32u_i64, { "r", "r" } },
>
>      { INDEX_op_deposit_i64, { "Q", "0", "Q" } },
> +    { INDEX_op_movcond_i64, { "r", "r", "re", "r", "0" } },
>  #endif
>
>  #if TCG_TARGET_REG_BITS == 64
> diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
> index 504f953..b356d76 100644
> --- a/tcg/i386/tcg-target.h
> +++ b/tcg/i386/tcg-target.h
> @@ -86,7 +86,12 @@ typedef enum {
>  #define TCG_TARGET_HAS_nand_i32         0
>  #define TCG_TARGET_HAS_nor_i32          0
>  #define TCG_TARGET_HAS_deposit_i32      1
> +#if defined(__x86_64__) || defined(__i686__)
> +/* Use cmov only if the compiler is already doing so.  */
> +#define TCG_TARGET_HAS_movcond_i32      1
> +#else
>  #define TCG_TARGET_HAS_movcond_i32      0
> +#endif
>
>  #if TCG_TARGET_REG_BITS == 64
>  #define TCG_TARGET_HAS_div2_i64         1
> @@ -108,7 +113,7 @@ typedef enum {
>  #define TCG_TARGET_HAS_nand_i64         0
>  #define TCG_TARGET_HAS_nor_i64          0
>  #define TCG_TARGET_HAS_deposit_i64      1
> -#define TCG_TARGET_HAS_movcond_i64      0
> +#define TCG_TARGET_HAS_movcond_i64      1
>  #endif
>
>  #define TCG_TARGET_deposit_i32_valid(ofs, len) \
> --
> 1.7.11.4
>
>

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Qemu-devel] [PATCH 3/7] tcg-i386: Implement movcond
  2012-09-24 21:37   ` Alex Barcelo
@ 2012-09-24 21:54     ` Richard Henderson
  2012-09-25 22:48       ` Aurelien Jarno
  0 siblings, 1 reply; 21+ messages in thread
From: Richard Henderson @ 2012-09-24 21:54 UTC (permalink / raw)
  To: Alex Barcelo; +Cc: qemu-devel, Aurelien Jarno

On 09/24/2012 02:37 PM, Alex Barcelo wrote:
> just finished a git-bisect and I found this... and now I do not fully
> understand why I have the problem.
> 
> To replicate the error (in a i386 machine, at least):
> $ make clean && ./configure --enable-debug && make -j && make install
> [Note: I tried both ppc and i386 targets, so doesn't seem machine-dependent]
> $ ./path/to/qemu/bin/qemu-i386 doesntmatter
> Invalid op definition for movcond_i32
> /mnt/DATA/DARCO/qemu-git/tcg/tcg.c:1170: tcg fatal error
> Aborted

Ah, right.  I only tried with -march=i686.

>> +    { INDEX_op_movcond_i32, { "r", "r", "ri", "r", "0" } },

Should be protected by 

#if TCG_TARGET_HAS_movcond_i32

If no one beats me to this, I'll submit a patch this evening.


r~

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Qemu-devel] [PATCH 3/7] tcg-i386: Implement movcond
  2012-09-24 21:54     ` Richard Henderson
@ 2012-09-25 22:48       ` Aurelien Jarno
  0 siblings, 0 replies; 21+ messages in thread
From: Aurelien Jarno @ 2012-09-25 22:48 UTC (permalink / raw)
  To: Richard Henderson; +Cc: Alex Barcelo, qemu-devel

On Mon, Sep 24, 2012 at 02:54:27PM -0700, Richard Henderson wrote:
> On 09/24/2012 02:37 PM, Alex Barcelo wrote:
> > just finished a git-bisect and I found this... and now I do not fully
> > understand why I have the problem.
> > 
> > To replicate the error (in a i386 machine, at least):
> > $ make clean && ./configure --enable-debug && make -j && make install
> > [Note: I tried both ppc and i386 targets, so doesn't seem machine-dependent]
> > $ ./path/to/qemu/bin/qemu-i386 doesntmatter
> > Invalid op definition for movcond_i32
> > /mnt/DATA/DARCO/qemu-git/tcg/tcg.c:1170: tcg fatal error
> > Aborted
> 
> Ah, right.  I only tried with -march=i686.
> 
> >> +    { INDEX_op_movcond_i32, { "r", "r", "ri", "r", "0" } },
> 
> Should be protected by 
> 
> #if TCG_TARGET_HAS_movcond_i32
> 
> If no one beats me to this, I'll submit a patch this evening.
> 
> 

I have applied the following patch to fix the issue.


commit f813cb838f19ee8637d3c365659e6a6bb0c9c974
Author: Aurelien Jarno <aurelien@aurel32.net>
Date:   Wed Sep 26 00:30:12 2012 +0200

    tcg/i386: fix build with -march < i686
    
    The movcond_i32 op has to be protected with TCG_TARGET_HAS_movcond_i32
    to fix the build with -march < i686.
    
    Thanks to Richard Henderson for the hint.
    
    Reported-by: Alex Barcelo <abarcelo@ac.upc.edu>
    Signed-off-by: Aurelien Jarno <aurelien@aurel32.net>

diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
index 122d636..0e218c8 100644
--- a/tcg/i386/tcg-target.c
+++ b/tcg/i386/tcg-target.c
@@ -1893,7 +1893,9 @@ static const TCGTargetOpDef x86_op_defs[] = {
     { INDEX_op_setcond_i32, { "q", "r", "ri" } },
 
     { INDEX_op_deposit_i32, { "Q", "0", "Q" } },
+#if TCG_TARGET_HAS_movcond_i32
     { INDEX_op_movcond_i32, { "r", "r", "ri", "r", "0" } },
+#endif
 
 #if TCG_TARGET_REG_BITS == 32
     { INDEX_op_mulu2_i32, { "a", "d", "a", "r" } },

-- 
Aurelien Jarno                          GPG: 1024D/F1BCDB73
aurelien@aurel32.net                 http://www.aurel32.net

^ permalink raw reply related	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2012-09-25 22:48 UTC | newest]

Thread overview: 21+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2012-09-21 17:13 [Qemu-devel] [PATCH v2 0/7] tcg: movcond Richard Henderson
2012-09-21 17:13 ` [Qemu-devel] [PATCH 1/7] tcg: Introduce movcond Richard Henderson
2012-09-21 17:13 ` [Qemu-devel] [PATCH 2/7] target-alpha: Use movcond Richard Henderson
2012-09-21 17:13 ` [Qemu-devel] [PATCH 3/7] tcg-i386: Implement movcond Richard Henderson
2012-09-24 21:37   ` Alex Barcelo
2012-09-24 21:54     ` Richard Henderson
2012-09-25 22:48       ` Aurelien Jarno
2012-09-21 17:13 ` [Qemu-devel] [PATCH 4/7] tcg: Optimize movcond for constant comparisons Richard Henderson
2012-09-21 17:13 ` [Qemu-devel] [PATCH 5/7] tcg: Optimize two-address commutative operations Richard Henderson
2012-09-21 17:13 ` [Qemu-devel] [PATCH 6/7] tcg: Streamline movcond_i64 using 32-bit arithmetic Richard Henderson
2012-09-21 21:15   ` Aurelien Jarno
2012-09-22 18:11   ` Aurelien Jarno
2012-09-21 17:13 ` [Qemu-devel] [PATCH 7/7] tcg: Streamline movcond_i64 using movcond_i32 Richard Henderson
2012-09-21 21:23   ` Aurelien Jarno
2012-09-21 22:27     ` Richard Henderson
2012-09-22  9:30       ` Aurelien Jarno
2012-09-21 18:14 ` [Qemu-devel] [PATCH v2 0/7] tcg: movcond Aurelien Jarno
2012-09-21 20:10 ` [Qemu-devel] [PATCH v2 0/7] tcg: movcond (ppc32 version) malc
2012-09-21 22:21   ` Richard Henderson
2012-09-21 22:34     ` malc
2012-09-22 14:38   ` Blue Swirl

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).