* [Qemu-devel] [PATCH 1/7] tcg: Split out swap_commutative as a subroutine
2012-09-27 17:19 [Qemu-devel] [PATCH 0/7] Double-word tcg/optimize improvements Richard Henderson
@ 2012-09-27 17:19 ` Richard Henderson
2012-09-27 21:45 ` Aurelien Jarno
2012-09-27 17:19 ` [Qemu-devel] [PATCH 2/7] tcg: Optimize add2 + sub2 Richard Henderson
` (5 subsequent siblings)
6 siblings, 1 reply; 21+ messages in thread
From: Richard Henderson @ 2012-09-27 17:19 UTC (permalink / raw)
To: qemu-devel; +Cc: Aurelien Jarno
Reduces code duplication and prefers
movcond d, c1, c2, const, s
to
movcond d, c1, c2, s, const
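For illustration (operand names made up), when the "false" value input
is a constant the swap turns

    movcond_i32 d, c1, c2, s, $0x10, lt

into

    movcond_i32 d, c1, c2, $0x10, s, ge

with the condition inverted via tcg_invert_cond.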
Signed-off-by: Richard Henderson <rth@twiddle.net>
---
tcg/optimize.c | 54 ++++++++++++++++++++++--------------------------------
1 file changed, 22 insertions(+), 32 deletions(-)
diff --git a/tcg/optimize.c b/tcg/optimize.c
index 35532a1..55f2a24 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -382,6 +382,21 @@ static TCGArg do_constant_folding_cond(TCGOpcode op, TCGArg x,
tcg_abort();
}
+static bool swap_commutative(TCGArg dest, TCGArg *p1, TCGArg *p2)
+{
+ TCGArg a1 = *p1, a2 = *p2;
+ /* Prefer the constant in second argument, and then the form
+ op a, a, b, which is better handled on non-RISC hosts. */
+ if (temps[a1].state == TCG_TEMP_CONST
+ || (dest != (TCGArg)-1 && dest == a2
+ && temps[a2].state != TCG_TEMP_CONST)) {
+ *p1 = a2;
+ *p2 = a1;
+ return true;
+ }
+ return false;
+}
+
/* Propagate constants and copies, fold constant expressions. */
static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
TCGArg *args, TCGOpDef *tcg_op_defs)
@@ -391,7 +406,6 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
const TCGOpDef *def;
TCGArg *gen_args;
TCGArg tmp;
- TCGCond cond;
/* Array VALS has an element for each temp.
If this temp holds a constant then its value is kept in VALS' element.
@@ -434,52 +448,28 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
CASE_OP_32_64(eqv):
CASE_OP_32_64(nand):
CASE_OP_32_64(nor):
- /* Prefer the constant in second argument, and then the form
- op a, a, b, which is better handled on non-RISC hosts. */
- if (temps[args[1]].state == TCG_TEMP_CONST || (args[0] == args[2]
- && temps[args[2]].state != TCG_TEMP_CONST)) {
- tmp = args[1];
- args[1] = args[2];
- args[2] = tmp;
- }
+ swap_commutative(args[0], &args[1], &args[2]);
break;
CASE_OP_32_64(brcond):
- if (temps[args[0]].state == TCG_TEMP_CONST
- && temps[args[1]].state != TCG_TEMP_CONST) {
- tmp = args[0];
- args[0] = args[1];
- args[1] = tmp;
+ if (swap_commutative(-1, &args[0], &args[1])) {
args[2] = tcg_swap_cond(args[2]);
}
break;
CASE_OP_32_64(setcond):
- if (temps[args[1]].state == TCG_TEMP_CONST
- && temps[args[2]].state != TCG_TEMP_CONST) {
- tmp = args[1];
- args[1] = args[2];
- args[2] = tmp;
+ if (swap_commutative(args[0], &args[1], &args[2])) {
args[3] = tcg_swap_cond(args[3]);
}
break;
CASE_OP_32_64(movcond):
- cond = args[5];
- if (temps[args[1]].state == TCG_TEMP_CONST
- && temps[args[2]].state != TCG_TEMP_CONST) {
- tmp = args[1];
- args[1] = args[2];
- args[2] = tmp;
- cond = tcg_swap_cond(cond);
+ if (swap_commutative(-1, &args[1], &args[2])) {
+ args[5] = tcg_swap_cond(args[5]);
}
/* For movcond, we canonicalize the "false" input reg to match
the destination reg so that the tcg backend can implement
a "move if true" operation. */
- if (args[0] == args[3]) {
- tmp = args[3];
- args[3] = args[4];
- args[4] = tmp;
- cond = tcg_invert_cond(cond);
+ if (swap_commutative(args[0], &args[4], &args[3])) {
+ args[5] = tcg_invert_cond(args[5]);
}
- args[5] = cond;
default:
break;
}
--
1.7.11.4
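As a minimal standalone sketch of what the helper does (temp_is_const[]
and the temp indices are illustrative stand-ins for the real optimizer
state, not part of the patch):

    #include <stdbool.h>
    #include <stdio.h>

    typedef unsigned long TCGArg;

    /* Stand-in for temps[i].state == TCG_TEMP_CONST in tcg/optimize.c. */
    static bool temp_is_const[8];

    static bool swap_commutative(TCGArg dest, TCGArg *p1, TCGArg *p2)
    {
        TCGArg a1 = *p1, a2 = *p2;
        /* Prefer the constant in the second argument, and then the form
           op a, a, b, which is better handled on non-RISC hosts. */
        if (temp_is_const[a1]
            || (dest != (TCGArg)-1 && dest == a2 && !temp_is_const[a2])) {
            *p1 = a2;
            *p2 = a1;
            return true;
        }
        return false;
    }

    int main(void)
    {
        /* "add t0, t1, t2" with t1 constant canonicalizes to
           "add t0, t2, t1". */
        TCGArg args[3] = { 0, 1, 2 };
        temp_is_const[1] = true;
        bool swapped = swap_commutative(args[0], &args[1], &args[2]);
        printf("swapped=%d: add t%lu, t%lu, t%lu\n",
               swapped, args[0], args[1], args[2]);
        return 0;
    }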
* Re: [Qemu-devel] [PATCH 1/7] tcg: Split out swap_commutative as a subroutine
2012-09-27 17:19 ` [Qemu-devel] [PATCH 1/7] tcg: Split out swap_commutative as a subroutine Richard Henderson
@ 2012-09-27 21:45 ` Aurelien Jarno
0 siblings, 0 replies; 21+ messages in thread
From: Aurelien Jarno @ 2012-09-27 21:45 UTC (permalink / raw)
To: Richard Henderson; +Cc: qemu-devel
On Thu, Sep 27, 2012 at 10:19:51AM -0700, Richard Henderson wrote:
> Reduces code duplication and prefers
>
> movcond d, c1, c2, const, s
> to
> movcond d, c1, c2, s, const
>
> Signed-off-by: Richard Henderson <rth@twiddle.net>
> ---
> tcg/optimize.c | 54 ++++++++++++++++++++++--------------------------------
> 1 file changed, 22 insertions(+), 32 deletions(-)
>
> diff --git a/tcg/optimize.c b/tcg/optimize.c
> index 35532a1..55f2a24 100644
> --- a/tcg/optimize.c
> +++ b/tcg/optimize.c
> @@ -382,6 +382,21 @@ static TCGArg do_constant_folding_cond(TCGOpcode op, TCGArg x,
> tcg_abort();
> }
>
> +static bool swap_commutative(TCGArg dest, TCGArg *p1, TCGArg *p2)
> +{
> + TCGArg a1 = *p1, a2 = *p2;
> + /* Prefer the constant in second argument, and then the form
> + op a, a, b, which is better handled on non-RISC hosts. */
> + if (temps[a1].state == TCG_TEMP_CONST
> + || (dest != (TCGArg)-1 && dest == a2
> + && temps[a2].state != TCG_TEMP_CONST)) {
> + *p1 = a2;
> + *p2 = a1;
> + return true;
> + }
> + return false;
> +}
> +
> /* Propagate constants and copies, fold constant expressions. */
> static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
> TCGArg *args, TCGOpDef *tcg_op_defs)
> @@ -391,7 +406,6 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
> const TCGOpDef *def;
> TCGArg *gen_args;
> TCGArg tmp;
> - TCGCond cond;
>
> /* Array VALS has an element for each temp.
> If this temp holds a constant then its value is kept in VALS' element.
> @@ -434,52 +448,28 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
> CASE_OP_32_64(eqv):
> CASE_OP_32_64(nand):
> CASE_OP_32_64(nor):
> - /* Prefer the constant in second argument, and then the form
> - op a, a, b, which is better handled on non-RISC hosts. */
> - if (temps[args[1]].state == TCG_TEMP_CONST || (args[0] == args[2]
> - && temps[args[2]].state != TCG_TEMP_CONST)) {
> - tmp = args[1];
> - args[1] = args[2];
> - args[2] = tmp;
> - }
> + swap_commutative(args[0], &args[1], &args[2]);
> break;
> CASE_OP_32_64(brcond):
> - if (temps[args[0]].state == TCG_TEMP_CONST
> - && temps[args[1]].state != TCG_TEMP_CONST) {
> - tmp = args[0];
> - args[0] = args[1];
> - args[1] = tmp;
> + if (swap_commutative(-1, &args[0], &args[1])) {
> args[2] = tcg_swap_cond(args[2]);
> }
> break;
> CASE_OP_32_64(setcond):
> - if (temps[args[1]].state == TCG_TEMP_CONST
> - && temps[args[2]].state != TCG_TEMP_CONST) {
> - tmp = args[1];
> - args[1] = args[2];
> - args[2] = tmp;
> + if (swap_commutative(args[0], &args[1], &args[2])) {
> args[3] = tcg_swap_cond(args[3]);
> }
> break;
> CASE_OP_32_64(movcond):
> - cond = args[5];
> - if (temps[args[1]].state == TCG_TEMP_CONST
> - && temps[args[2]].state != TCG_TEMP_CONST) {
> - tmp = args[1];
> - args[1] = args[2];
> - args[2] = tmp;
> - cond = tcg_swap_cond(cond);
> + if (swap_commutative(-1, &args[1], &args[2])) {
> + args[5] = tcg_swap_cond(args[5]);
> }
> /* For movcond, we canonicalize the "false" input reg to match
> the destination reg so that the tcg backend can implement
> a "move if true" operation. */
> - if (args[0] == args[3]) {
> - tmp = args[3];
> - args[3] = args[4];
> - args[4] = tmp;
> - cond = tcg_invert_cond(cond);
> + if (swap_commutative(args[0], &args[4], &args[3])) {
> + args[5] = tcg_invert_cond(args[5]);
> }
> - args[5] = cond;
> default:
> break;
> }
Reviewed-by: Aurelien Jarno <aurelien@aurel32.net>
--
Aurelien Jarno GPG: 1024D/F1BCDB73
aurelien@aurel32.net http://www.aurel32.net
* [Qemu-devel] [PATCH 2/7] tcg: Optimize add2 + sub2
2012-09-27 17:19 [Qemu-devel] [PATCH 0/7] Double-word tcg/optimize improvements Richard Henderson
2012-09-27 17:19 ` [Qemu-devel] [PATCH 1/7] tcg: Split out swap_commutative as a subroutine Richard Henderson
@ 2012-09-27 17:19 ` Richard Henderson
2012-09-27 23:20 ` Aurelien Jarno
2012-09-30 7:04 ` Blue Swirl
2012-09-27 17:19 ` [Qemu-devel] [PATCH 3/7] tcg: Swap commutative double-word comparisons Richard Henderson
` (4 subsequent siblings)
6 siblings, 2 replies; 21+ messages in thread
From: Richard Henderson @ 2012-09-27 17:19 UTC (permalink / raw)
To: qemu-devel; +Cc: Aurelien Jarno
We can't do complete constant folding because we lack "mov2",
or the ability to insert opcodes in the stream. But we can
at least canonicalize add2 operand ordering and simplify
add2 to add when the lowpart adds a constant 0.
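For illustration (register names made up), with rl and al copies this
turns

    add2_i32 rl, rh, al, ah, $0x0, $bh

into

    add_i32 rh, ah, $bh

and into a nop outright when $bh is also zero and rh, ah are copies.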
Signed-off-by: Richard Henderson <rth@twiddle.net>
---
tcg/optimize.c | 31 +++++++++++++++++++++++++++++++
1 file changed, 31 insertions(+)
diff --git a/tcg/optimize.c b/tcg/optimize.c
index 55f2a24..004c336 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -470,6 +470,11 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
if (swap_commutative(args[0], &args[4], &args[3])) {
args[5] = tcg_invert_cond(args[5]);
}
+ break;
+ case INDEX_op_add2_i32:
+ swap_commutative(args[0], &args[2], &args[4]);
+ swap_commutative(args[1], &args[3], &args[5]);
+ break;
default:
break;
}
@@ -522,6 +527,32 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
continue;
}
break;
+ case INDEX_op_add2_i32:
+ case INDEX_op_sub2_i32:
+ /* Simplify op rl, rh, al, ah, 0, bh => op rh, ah, bh.
+ The zero implies there will be no carry into the high part.
+ But only when rl == al, since we can't insert the extra move
+ that would be required. */
+ if (temps[args[4]].state == TCG_TEMP_CONST
+ && temps[args[4]].val == 0
+ && temps_are_copies(args[0], args[2])) {
+ if (temps[args[5]].state == TCG_TEMP_CONST
+ && temps[args[5]].val == 0
+ && temps_are_copies(args[1], args[3])) {
+ gen_opc_buf[op_index] = INDEX_op_nop;
+ } else {
+ gen_opc_buf[op_index] = (op == INDEX_op_add2_i32
+ ? INDEX_op_add_i32
+ : INDEX_op_sub_i32);
+ args[0] = args[1];
+ args[1] = args[3];
+ args[2] = args[5];
+ gen_args += 3;
+ }
+ args += 6;
+ continue;
+ }
+ break;
default:
break;
}
--
1.7.11.4
* Re: [Qemu-devel] [PATCH 2/7] tcg: Optimize add2 + sub2
2012-09-27 17:19 ` [Qemu-devel] [PATCH 2/7] tcg: Optimize add2 + sub2 Richard Henderson
@ 2012-09-27 23:20 ` Aurelien Jarno
2012-09-27 23:28 ` Richard Henderson
2012-09-30 7:04 ` Blue Swirl
1 sibling, 1 reply; 21+ messages in thread
From: Aurelien Jarno @ 2012-09-27 23:20 UTC (permalink / raw)
To: Richard Henderson; +Cc: qemu-devel
On Thu, Sep 27, 2012 at 10:19:52AM -0700, Richard Henderson wrote:
> We can't do complete constant folding because we lack "mov2",
> or the ability to insert opcodes in the stream. But we can
> at least canonicalize add2 operand ordering and simplify
> add2 to add when the lowpart adds a constant 0.
>
> Signed-off-by: Richard Henderson <rth@twiddle.net>
> ---
> tcg/optimize.c | 31 +++++++++++++++++++++++++++++++
> 1 file changed, 31 insertions(+)
>
> diff --git a/tcg/optimize.c b/tcg/optimize.c
> index 55f2a24..004c336 100644
> --- a/tcg/optimize.c
> +++ b/tcg/optimize.c
> @@ -470,6 +470,11 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
> if (swap_commutative(args[0], &args[4], &args[3])) {
> args[5] = tcg_invert_cond(args[5]);
> }
> + break;
> + case INDEX_op_add2_i32:
> + swap_commutative(args[0], &args[2], &args[4]);
> + swap_commutative(args[1], &args[3], &args[5]);
> + break;
> default:
> break;
> }
> @@ -522,6 +527,32 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
> continue;
> }
> break;
> + case INDEX_op_add2_i32:
> + case INDEX_op_sub2_i32:
> + /* Simplify op rl, rh, al, ah, 0, bh => op rh, ah, bh.
> + The zero implies there will be no carry into the high part.
> + But only when rl == al, since we can't insert the extra move
> + that would be required. */
> + if (temps[args[4]].state == TCG_TEMP_CONST
> + && temps[args[4]].val == 0
> + && temps_are_copies(args[0], args[2])) {
> + if (temps[args[5]].state == TCG_TEMP_CONST
> + && temps[args[5]].val == 0
> + && temps_are_copies(args[1], args[3])) {
> + gen_opc_buf[op_index] = INDEX_op_nop;
> + } else {
> + gen_opc_buf[op_index] = (op == INDEX_op_add2_i32
> + ? INDEX_op_add_i32
> + : INDEX_op_sub_i32);
> + args[0] = args[1];
> + args[1] = args[3];
> + args[2] = args[5];
> + gen_args += 3;
> + }
> + args += 6;
> + continue;
> + }
> + break;
> default:
> break;
> }
> --
> 1.7.11.4
>
I understand that we can't easily insert an instruction, so that is
where the limitation comes from, but is it really something that
happens often? Performing an optimization has a CPU cost, so if it
rarely triggers, it might cost more than it saves.
--
Aurelien Jarno GPG: 1024D/F1BCDB73
aurelien@aurel32.net http://www.aurel32.net
* Re: [Qemu-devel] [PATCH 2/7] tcg: Optimize add2 + sub2
2012-09-27 23:20 ` Aurelien Jarno
@ 2012-09-27 23:28 ` Richard Henderson
2012-10-01 17:46 ` Aurelien Jarno
0 siblings, 1 reply; 21+ messages in thread
From: Richard Henderson @ 2012-09-27 23:28 UTC (permalink / raw)
To: Aurelien Jarno; +Cc: qemu-devel
On 09/27/2012 04:20 PM, Aurelien Jarno wrote:
> I understand that we can't easily insert an instruction, so the
> limitation comes from here, but is it really something happening often?
It will certainly appear sometimes. E.g. s390x has an add immediate
instruction that does exactly: r1 += imm16 << 32.
Or did you mean specifically the full constant being folded? That
would happen quite a bit more often. You can see it with almost any
64-bit RISC guest when it attempts to generate a constant from
addition primitives instead of logical primitives.
For a 32-bit host, we've already decomposed logical primitives to 32-bit
operations. And we can constant-fold through all of those. But when
addition comes into play, we can't constant-fold through add2.
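For illustration (made-up constants), such a decomposed constant-building
addition reaches the optimizer on a 32-bit host as

    add2_i32 rl, rh, $0x5678, $0x0, $0x0, $0x1

where every input is constant, yet without a "mov2" it can't be replaced
by the two movi_i32 it amounts to.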
r~
* Re: [Qemu-devel] [PATCH 2/7] tcg: Optimize add2 + sub2
2012-09-27 23:28 ` Richard Henderson
@ 2012-10-01 17:46 ` Aurelien Jarno
2012-10-01 18:41 ` Richard Henderson
0 siblings, 1 reply; 21+ messages in thread
From: Aurelien Jarno @ 2012-10-01 17:46 UTC (permalink / raw)
To: Richard Henderson; +Cc: qemu-devel
On Thu, Sep 27, 2012 at 04:28:47PM -0700, Richard Henderson wrote:
> On 09/27/2012 04:20 PM, Aurelien Jarno wrote:
> > I understand that we can't easily insert an instruction, so the
> > limitation comes from here, but is it really something happening often?
>
> It will certainly appear sometimes. E.g. s390x has an add immediate
> instruction that does exactly: r1 += imm16 << 32.
>
> Or did you mean specifically the full constant being folded? That
> would happen quite a bit more often. That you can see with most any
> 64-bit RISC guest when they attempt to generate a constant from
> addition primitives instead of logical primitives.
>
> For a 32-bit host, we've already decomposed logical primitives to 32-bit
> operations. And we can constant-fold through all of those. But when
> addition comes into play, we can't constant-fold through add2.
>
I tried this patch on an i386 host running an x86_64 target, but it
fails even to start SeaBIOS; there is probably some wrong logic
somewhere in the patch.
For the part that seemed to have worked correctly, this patch
optimized 0.2% of the add2 ops. I am not sure it is worth it as is.
I think optimizing add2, and in general all *2 ops, is a good idea, but
we should be able to do more aggressive optimization. Maybe, a bit like
Blue was suggesting, add2 should always be followed by a nop, so that we
can do more optimizations?
--
Aurelien Jarno GPG: 1024D/F1BCDB73
aurelien@aurel32.net http://www.aurel32.net
* Re: [Qemu-devel] [PATCH 2/7] tcg: Optimize add2 + sub2
2012-10-01 17:46 ` Aurelien Jarno
@ 2012-10-01 18:41 ` Richard Henderson
0 siblings, 0 replies; 21+ messages in thread
From: Richard Henderson @ 2012-10-01 18:41 UTC (permalink / raw)
To: Aurelien Jarno; +Cc: qemu-devel
On 2012-10-01 10:46, Aurelien Jarno wrote:
> For the part that seemed to have worked correctly, this patch
> optimized 0.2% of the add2 ops. I am not sure it is worth it as is.
You're probably right.
> I think optimizing add2, and in general all *2 ops, is a good idea, but
> we should be able to do more aggressive optimization. Maybe, a bit like
> Blue was suggesting, add2 should always be followed by a nop, so that we
> can do more optimizations?
Adding an extra nop sounds like a better idea than add2_part[12]. And
it's probably easier than adding mov2 opcodes -- one little assert inside
the optimizer and we're golden.
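A sketch of that idea (hypothetical; nothing in the tree does this yet,
and the argument rewriting is omitted):

    /* Every add2/sub2 is guaranteed a trailing nop slot, so an
       all-constant add2 can be expanded in place into two movi. */
    assert(gen_opc_buf[op_index + 1] == INDEX_op_nop);
    gen_opc_buf[op_index]     = INDEX_op_movi_i32;  /* low result */
    gen_opc_buf[op_index + 1] = INDEX_op_movi_i32;  /* high result */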
r~
* Re: [Qemu-devel] [PATCH 2/7] tcg: Optimize add2 + sub2
2012-09-27 17:19 ` [Qemu-devel] [PATCH 2/7] tcg: Optimize add2 + sub2 Richard Henderson
2012-09-27 23:20 ` Aurelien Jarno
@ 2012-09-30 7:04 ` Blue Swirl
2012-10-01 18:36 ` Richard Henderson
1 sibling, 1 reply; 21+ messages in thread
From: Blue Swirl @ 2012-09-30 7:04 UTC (permalink / raw)
To: Richard Henderson; +Cc: qemu-devel, Aurelien Jarno
On Thu, Sep 27, 2012 at 5:19 PM, Richard Henderson <rth@twiddle.net> wrote:
> We can't do complete constant folding because we lack "mov2",
> or the ability to insert opcodes in the stream. But we can
> at least canonicalize add2 operand ordering and simplify
> add2 to add when the lowpart adds a constant 0.
Couldn't we introduce add2_part1 and add2_part2, the latter being nop
for architectures that don't need it?
>
> Signed-off-by: Richard Henderson <rth@twiddle.net>
> ---
> tcg/optimize.c | 31 +++++++++++++++++++++++++++++++
> 1 file changed, 31 insertions(+)
>
> diff --git a/tcg/optimize.c b/tcg/optimize.c
> index 55f2a24..004c336 100644
> --- a/tcg/optimize.c
> +++ b/tcg/optimize.c
> @@ -470,6 +470,11 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
> if (swap_commutative(args[0], &args[4], &args[3])) {
> args[5] = tcg_invert_cond(args[5]);
> }
> + break;
> + case INDEX_op_add2_i32:
> + swap_commutative(args[0], &args[2], &args[4]);
> + swap_commutative(args[1], &args[3], &args[5]);
> + break;
> default:
> break;
> }
> @@ -522,6 +527,32 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
> continue;
> }
> break;
> + case INDEX_op_add2_i32:
> + case INDEX_op_sub2_i32:
> + /* Simplify op rl, rh, al, ah, 0, bh => op rh, ah, bh.
> + The zero implies there will be no carry into the high part.
> + But only when rl == al, since we can't insert the extra move
> + that would be required. */
> + if (temps[args[4]].state == TCG_TEMP_CONST
> + && temps[args[4]].val == 0
> + && temps_are_copies(args[0], args[2])) {
> + if (temps[args[5]].state == TCG_TEMP_CONST
> + && temps[args[5]].val == 0
> + && temps_are_copies(args[1], args[3])) {
> + gen_opc_buf[op_index] = INDEX_op_nop;
> + } else {
> + gen_opc_buf[op_index] = (op == INDEX_op_add2_i32
> + ? INDEX_op_add_i32
> + : INDEX_op_sub_i32);
> + args[0] = args[1];
> + args[1] = args[3];
> + args[2] = args[5];
> + gen_args += 3;
> + }
> + args += 6;
> + continue;
> + }
> + break;
> default:
> break;
> }
> --
> 1.7.11.4
>
>
* Re: [Qemu-devel] [PATCH 2/7] tcg: Optimize add2 + sub2
2012-09-30 7:04 ` Blue Swirl
@ 2012-10-01 18:36 ` Richard Henderson
0 siblings, 0 replies; 21+ messages in thread
From: Richard Henderson @ 2012-10-01 18:36 UTC (permalink / raw)
To: Blue Swirl; +Cc: qemu-devel, Aurelien Jarno
On 2012-09-30 00:04, Blue Swirl wrote:
>> We can't do complete constant folding because we lack "mov2",
>> > or the ability to insert opcodes in the stream. But we can
>> > at least canonicalize add2 operand ordering and simplify
>> > add2 to add when the lowpart adds a constant 0.
> Couldn't we introduce add2_part1 and add2_part2, the latter being nop
> for architectures that don't need it?
Possibly. It certainly would be easy to model these as addcc + addx on
targets like sparc where CC never gets clobbered during moves.
I'm a bit worried about i386 though, since loading 0 wants to use xor
and clobber the flags. We could possibly work around this by taking
care of the constant loading for add2_part2 manually. E.g.
    { INDEX_op_add2_part2, { "r", "ri", "ri" } }

    if (args[2] == args[0] && !const_args[2]) {
        // swap arg1 arg2
    }
    if (const_args[1]) {
        mov $args[1], args[0]
    } else {
        mov args[1], args[0]
    }
    adcl args[2], args[0]
which means that tcg_out_movi will not have to be called in between.
It's all a bit fragile though.
That said, I do wonder if having a synthetic mov2{rr,ri,ii} opcodes
isn't just easier. That could be broken up into two moves by tcg.c
without the backends having to care about it.
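A sketch of that split (hypothetical opcode, pseudocode only):

    case INDEX_op_mov2_i32:
        // two independent 32-bit moves, no carry between them,
        // so the backends never see the synthetic opcode
        emit mov args[0], args[2]
        emit mov args[1], args[3]
        break;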
r~
* [Qemu-devel] [PATCH 3/7] tcg: Swap commutative double-word comparisons
2012-09-27 17:19 [Qemu-devel] [PATCH 0/7] Double-word tcg/optimize improvements Richard Henderson
2012-09-27 17:19 ` [Qemu-devel] [PATCH 1/7] tcg: Split out swap_commutative as a subroutine Richard Henderson
2012-09-27 17:19 ` [Qemu-devel] [PATCH 2/7] tcg: Optimize add2 + sub2 Richard Henderson
@ 2012-09-27 17:19 ` Richard Henderson
2012-09-27 23:22 ` Aurelien Jarno
2012-09-27 17:19 ` [Qemu-devel] [PATCH 4/7] tcg: Optimize double-word comparisons against zero Richard Henderson
` (3 subsequent siblings)
6 siblings, 1 reply; 21+ messages in thread
From: Richard Henderson @ 2012-09-27 17:19 UTC (permalink / raw)
To: qemu-devel; +Cc: Aurelien Jarno
Signed-off-by: Richard Henderson <rth@twiddle.net>
---
tcg/optimize.c | 26 ++++++++++++++++++++++++++
1 file changed, 26 insertions(+)
diff --git a/tcg/optimize.c b/tcg/optimize.c
index 004c336..d39926e 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -397,6 +397,22 @@ static bool swap_commutative(TCGArg dest, TCGArg *p1, TCGArg *p2)
return false;
}
+static bool swap_commutative2(TCGArg *p1, TCGArg *p2)
+{
+ int sum = 0;
+ sum += temps[p1[0]].state == TCG_TEMP_CONST;
+ sum += temps[p1[1]].state == TCG_TEMP_CONST;
+ sum -= temps[p2[0]].state == TCG_TEMP_CONST;
+ sum -= temps[p2[1]].state == TCG_TEMP_CONST;
+ if (sum > 0) {
+ TCGArg t;
+ t = p1[0], p1[0] = p2[0], p2[0] = t;
+ t = p1[1], p1[1] = p2[1], p2[1] = t;
+ return true;
+ }
+ return false;
+}
+
/* Propagate constants and copies, fold constant expressions. */
static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
TCGArg *args, TCGOpDef *tcg_op_defs)
@@ -475,6 +491,16 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
swap_commutative(args[0], &args[2], &args[4]);
swap_commutative(args[1], &args[3], &args[5]);
break;
+ case INDEX_op_brcond2_i32:
+ if (swap_commutative2(&args[0], &args[2])) {
+ args[4] = tcg_swap_cond(args[4]);
+ }
+ break;
+ case INDEX_op_setcond2_i32:
+ if (swap_commutative2(&args[1], &args[3])) {
+ args[5] = tcg_swap_cond(args[5]);
+ }
+ break;
default:
break;
}
--
1.7.11.4
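For illustration (made-up operands), with both halves of the first pair
constant the sum is positive, so the pairs are exchanged:

    brcond2_i32 $0x0, $0x1, t2, t3, lt, $label

becomes

    brcond2_i32 t2, t3, $0x0, $0x1, gt, $label

with the condition swapped via tcg_swap_cond.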
* Re: [Qemu-devel] [PATCH 3/7] tcg: Swap commutative double-word comparisons
2012-09-27 17:19 ` [Qemu-devel] [PATCH 3/7] tcg: Swap commutative double-word comparisons Richard Henderson
@ 2012-09-27 23:22 ` Aurelien Jarno
0 siblings, 0 replies; 21+ messages in thread
From: Aurelien Jarno @ 2012-09-27 23:22 UTC (permalink / raw)
To: Richard Henderson; +Cc: qemu-devel
On Thu, Sep 27, 2012 at 10:19:53AM -0700, Richard Henderson wrote:
> Signed-off-by: Richard Henderson <rth@twiddle.net>
> ---
> tcg/optimize.c | 26 ++++++++++++++++++++++++++
> 1 file changed, 26 insertions(+)
>
> diff --git a/tcg/optimize.c b/tcg/optimize.c
> index 004c336..d39926e 100644
> --- a/tcg/optimize.c
> +++ b/tcg/optimize.c
> @@ -397,6 +397,22 @@ static bool swap_commutative(TCGArg dest, TCGArg *p1, TCGArg *p2)
> return false;
> }
>
> +static bool swap_commutative2(TCGArg *p1, TCGArg *p2)
> +{
> + int sum = 0;
> + sum += temps[p1[0]].state == TCG_TEMP_CONST;
> + sum += temps[p1[1]].state == TCG_TEMP_CONST;
> + sum -= temps[p2[0]].state == TCG_TEMP_CONST;
> + sum -= temps[p2[1]].state == TCG_TEMP_CONST;
> + if (sum > 0) {
> + TCGArg t;
> + t = p1[0], p1[0] = p2[0], p2[0] = t;
> + t = p1[1], p1[1] = p2[1], p2[1] = t;
> + return true;
> + }
> + return false;
> +}
> +
> /* Propagate constants and copies, fold constant expressions. */
> static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
> TCGArg *args, TCGOpDef *tcg_op_defs)
> @@ -475,6 +491,16 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
> swap_commutative(args[0], &args[2], &args[4]);
> swap_commutative(args[1], &args[3], &args[5]);
> break;
> + case INDEX_op_brcond2_i32:
> + if (swap_commutative2(&args[0], &args[2])) {
> + args[4] = tcg_swap_cond(args[4]);
> + }
> + break;
> + case INDEX_op_setcond2_i32:
> + if (swap_commutative2(&args[1], &args[3])) {
> + args[5] = tcg_swap_cond(args[5]);
> + }
> + break;
> default:
> break;
> }
Reviewed-by: Aurelien Jarno <aurelien@aurel32.net>
--
Aurelien Jarno GPG: 1024D/F1BCDB73
aurelien@aurel32.net http://www.aurel32.net
* [Qemu-devel] [PATCH 4/7] tcg: Optimize double-word comparisons against zero
2012-09-27 17:19 [Qemu-devel] [PATCH 0/7] Double-word tcg/optimize improvements Richard Henderson
` (2 preceding siblings ...)
2012-09-27 17:19 ` [Qemu-devel] [PATCH 3/7] tcg: Swap commutative double-word comparisons Richard Henderson
@ 2012-09-27 17:19 ` Richard Henderson
2012-10-01 18:43 ` Aurelien Jarno
2012-09-27 17:19 ` [Qemu-devel] [PATCH 5/7] tcg: Split out subroutines from do_constant_folding_cond Richard Henderson
` (2 subsequent siblings)
6 siblings, 1 reply; 21+ messages in thread
From: Richard Henderson @ 2012-09-27 17:19 UTC (permalink / raw)
To: qemu-devel; +Cc: Aurelien Jarno
Signed-off-by: Richard Henderson <rth@twiddle.net>
---
tcg/optimize.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 51 insertions(+)
diff --git a/tcg/optimize.c b/tcg/optimize.c
index d39926e..c972e4f 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -799,6 +799,57 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
}
args += 6;
break;
+ case INDEX_op_brcond2_i32:
+ /* Simplify LT/GE comparisons vs zero to a single compare
+ vs the high word of the input. */
+ if ((args[4] == TCG_COND_LT || args[4] == TCG_COND_GE)
+ && temps[args[2]].state == TCG_TEMP_CONST
+ && temps[args[3]].state == TCG_TEMP_CONST
+ && temps[args[2]].val == 0
+ && temps[args[2]].val == 0) {
+ gen_opc_buf[op_index] = INDEX_op_brcond_i32;
+ args[0] = args[1];
+ args[1] = args[3];
+ args[2] = args[4];
+ args[3] = args[5];
+ gen_args += 4;
+ } else {
+ gen_args[0] = args[0];
+ gen_args[1] = args[1];
+ gen_args[2] = args[2];
+ gen_args[3] = args[3];
+ gen_args[4] = args[4];
+ gen_args[5] = args[5];
+ gen_args += 6;
+ }
+ memset(temps, 0, nb_temps * sizeof(struct tcg_temp_info));
+ args += 6;
+ break;
+ case INDEX_op_setcond2_i32:
+ /* Simplify LT/GE comparisons vs zero to a single compare
+ vs the high word of the input. */
+ if ((args[5] == TCG_COND_LT || args[5] == TCG_COND_GE)
+ && temps[args[3]].state == TCG_TEMP_CONST
+ && temps[args[4]].state == TCG_TEMP_CONST
+ && temps[args[3]].val == 0
+ && temps[args[4]].val == 0) {
+ gen_opc_buf[op_index] = INDEX_op_setcond_i32;
+ args[1] = args[2];
+ args[2] = args[4];
+ args[3] = args[5];
+ gen_args += 4;
+ } else {
+ reset_temp(args[0]);
+ gen_args[0] = args[0];
+ gen_args[1] = args[1];
+ gen_args[2] = args[2];
+ gen_args[3] = args[3];
+ gen_args[4] = args[4];
+ gen_args[5] = args[5];
+ gen_args += 6;
+ }
+ args += 6;
+ break;
case INDEX_op_call:
nb_call_args = (args[0] >> 16) + (args[0] & 0xffff);
if (!(args[nb_call_args + 1] & (TCG_CALL_CONST | TCG_CALL_PURE))) {
--
1.7.11.4
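The simplification rests on a two's-complement fact that a standalone
sketch can check (illustrative, not from the patch):

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        /* A signed 64-bit comparison against zero depends only on the
           sign bit, which lives in the high word; the low word is
           irrelevant. */
        uint32_t hi = 0xffffffff;
        for (uint32_t lo = 0; lo < 4; lo++) {
            int64_t x = (int64_t)(((uint64_t)hi << 32) | lo);
            assert((x < 0) == ((int32_t)hi < 0));
        }
        return 0;
    }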
* Re: [Qemu-devel] [PATCH 4/7] tcg: Optimize double-word comparisons against zero
2012-09-27 17:19 ` [Qemu-devel] [PATCH 4/7] tcg: Optimize double-word comparisons against zero Richard Henderson
@ 2012-10-01 18:43 ` Aurelien Jarno
2012-10-01 18:47 ` Richard Henderson
0 siblings, 1 reply; 21+ messages in thread
From: Aurelien Jarno @ 2012-10-01 18:43 UTC (permalink / raw)
To: Richard Henderson; +Cc: qemu-devel
On Thu, Sep 27, 2012 at 10:19:54AM -0700, Richard Henderson wrote:
> Signed-off-by: Richard Henderson <rth@twiddle.net>
> ---
> tcg/optimize.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 51 insertions(+)
>
> diff --git a/tcg/optimize.c b/tcg/optimize.c
> index d39926e..c972e4f 100644
> --- a/tcg/optimize.c
> +++ b/tcg/optimize.c
> @@ -799,6 +799,57 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
> }
> args += 6;
> break;
> + case INDEX_op_brcond2_i32:
> + /* Simplify LT/GE comparisons vs zero to a single compare
> + vs the high word of the input. */
> + if ((args[4] == TCG_COND_LT || args[4] == TCG_COND_GE)
> + && temps[args[2]].state == TCG_TEMP_CONST
> + && temps[args[3]].state == TCG_TEMP_CONST
> + && temps[args[2]].val == 0
> + && temps[args[2]].val == 0) {
The value comparison there is wrong, probably a copy & paste issue (the
second test should be on args[3]). I wonder how it could have worked.
> + gen_opc_buf[op_index] = INDEX_op_brcond_i32;
> + args[0] = args[1];
> + args[1] = args[3];
> + args[2] = args[4];
> + args[3] = args[5];
> + gen_args += 4;
> + } else {
> + gen_args[0] = args[0];
> + gen_args[1] = args[1];
> + gen_args[2] = args[2];
> + gen_args[3] = args[3];
> + gen_args[4] = args[4];
> + gen_args[5] = args[5];
> + gen_args += 6;
> + }
> + memset(temps, 0, nb_temps * sizeof(struct tcg_temp_info));
> + args += 6;
> + break;
> + case INDEX_op_setcond2_i32:
> + /* Simplify LT/GE comparisons vs zero to a single compare
> + vs the high word of the input. */
> + if ((args[5] == TCG_COND_LT || args[5] == TCG_COND_GE)
> + && temps[args[3]].state == TCG_TEMP_CONST
> + && temps[args[4]].state == TCG_TEMP_CONST
> + && temps[args[3]].val == 0
> + && temps[args[4]].val == 0) {
Here it is fine.
> + gen_opc_buf[op_index] = INDEX_op_setcond_i32;
> + args[1] = args[2];
> + args[2] = args[4];
> + args[3] = args[5];
> + gen_args += 4;
> + } else {
> + reset_temp(args[0]);
> + gen_args[0] = args[0];
> + gen_args[1] = args[1];
> + gen_args[2] = args[2];
> + gen_args[3] = args[3];
> + gen_args[4] = args[4];
> + gen_args[5] = args[5];
> + gen_args += 6;
> + }
> + args += 6;
> + break;
> case INDEX_op_call:
> nb_call_args = (args[0] >> 16) + (args[0] & 0xffff);
> if (!(args[nb_call_args + 1] & (TCG_CALL_CONST | TCG_CALL_PURE))) {
While it's a nice optimization to have, one that seems to happen a lot
more often is the two high parts being equal. It happens when the guest
is working on (u)int32_t.
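For illustration (a hypothetical fold, not implemented in this series):
with both high inputs copies of the same temp h,

    setcond2_i32 d, a_lo, h, b_lo, h, eq

could be reduced to

    setcond_i32 d, a_lo, b_lo, eq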
--
Aurelien Jarno GPG: 1024D/F1BCDB73
aurelien@aurel32.net http://www.aurel32.net
* Re: [Qemu-devel] [PATCH 4/7] tcg: Optimize double-word comparisons against zero
2012-10-01 18:43 ` Aurelien Jarno
@ 2012-10-01 18:47 ` Richard Henderson
0 siblings, 0 replies; 21+ messages in thread
From: Richard Henderson @ 2012-10-01 18:47 UTC (permalink / raw)
To: Aurelien Jarno; +Cc: qemu-devel
On 2012-10-01 11:43, Aurelien Jarno wrote:
> While it's a nice optimization to have, one that seems to happen a lot
> more often is the two high parts being equal. It happens when the guest
> is working on (u)int32_t.
It depends on what target you're looking at. For an alpha guest, all
branches are comparisons vs zero, so LT/GE happens with some regularity.
r~
* [Qemu-devel] [PATCH 5/7] tcg: Split out subroutines from do_constant_folding_cond
2012-09-27 17:19 [Qemu-devel] [PATCH 0/7] Double-word tcg/optimize improvements Richard Henderson
` (3 preceding siblings ...)
2012-09-27 17:19 ` [Qemu-devel] [PATCH 4/7] tcg: Optimize double-word comparisons against zero Richard Henderson
@ 2012-09-27 17:19 ` Richard Henderson
2012-10-01 18:46 ` Aurelien Jarno
2012-09-27 17:19 ` [Qemu-devel] [PATCH 6/7] tcg: Tidy brcond optimization Richard Henderson
2012-09-27 17:19 ` [Qemu-devel] [PATCH 7/7] tcg: Do constant folding on double-word comparisons Richard Henderson
6 siblings, 1 reply; 21+ messages in thread
From: Richard Henderson @ 2012-09-27 17:19 UTC (permalink / raw)
To: qemu-devel; +Cc: Aurelien Jarno
We can re-use these for implementing double-word folding.
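For illustration, that later reuse combines the constant 32-bit halves
and calls the 64-bit helper directly:

    a = ((uint64_t)temps[ah].val << 32) | (uint32_t)temps[al].val;
    return do_constant_folding_cond_64(a, b, c);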
Signed-off-by: Richard Henderson <rth@twiddle.net>
---
tcg/optimize.c | 146 ++++++++++++++++++++++++++++++++-------------------------
1 file changed, 81 insertions(+), 65 deletions(-)
diff --git a/tcg/optimize.c b/tcg/optimize.c
index c972e4f..c1881fa 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -292,6 +292,82 @@ static TCGArg do_constant_folding(TCGOpcode op, TCGArg x, TCGArg y)
return res;
}
+static bool do_constant_folding_cond_32(uint32_t x, uint32_t y, TCGCond c)
+{
+ switch (c) {
+ case TCG_COND_EQ:
+ return x == y;
+ case TCG_COND_NE:
+ return x != y;
+ case TCG_COND_LT:
+ return (int32_t)x < (int32_t)y;
+ case TCG_COND_GE:
+ return (int32_t)x >= (int32_t)y;
+ case TCG_COND_LE:
+ return (int32_t)x <= (int32_t)y;
+ case TCG_COND_GT:
+ return (int32_t)x > (int32_t)y;
+ case TCG_COND_LTU:
+ return x < y;
+ case TCG_COND_GEU:
+ return x >= y;
+ case TCG_COND_LEU:
+ return x <= y;
+ case TCG_COND_GTU:
+ return x > y;
+ default:
+ tcg_abort();
+ }
+}
+
+static bool do_constant_folding_cond_64(uint64_t x, uint64_t y, TCGCond c)
+{
+ switch (c) {
+ case TCG_COND_EQ:
+ return x == y;
+ case TCG_COND_NE:
+ return x != y;
+ case TCG_COND_LT:
+ return (int64_t)x < (int64_t)y;
+ case TCG_COND_GE:
+ return (int64_t)x >= (int64_t)y;
+ case TCG_COND_LE:
+ return (int64_t)x <= (int64_t)y;
+ case TCG_COND_GT:
+ return (int64_t)x > (int64_t)y;
+ case TCG_COND_LTU:
+ return x < y;
+ case TCG_COND_GEU:
+ return x >= y;
+ case TCG_COND_LEU:
+ return x <= y;
+ case TCG_COND_GTU:
+ return x > y;
+ default:
+ tcg_abort();
+ }
+}
+
+static bool do_constant_folding_cond_eq(TCGCond c)
+{
+ switch (c) {
+ case TCG_COND_GT:
+ case TCG_COND_LTU:
+ case TCG_COND_LT:
+ case TCG_COND_GTU:
+ case TCG_COND_NE:
+ return 0;
+ case TCG_COND_GE:
+ case TCG_COND_GEU:
+ case TCG_COND_LE:
+ case TCG_COND_LEU:
+ case TCG_COND_EQ:
+ return 1;
+ default:
+ tcg_abort();
+ }
+}
+
/* Return 2 if the condition can't be simplified, and the result
of the condition (0 or 1) if it can */
static TCGArg do_constant_folding_cond(TCGOpcode op, TCGArg x,
@@ -300,69 +376,14 @@ static TCGArg do_constant_folding_cond(TCGOpcode op, TCGArg x,
if (temps[x].state == TCG_TEMP_CONST && temps[y].state == TCG_TEMP_CONST) {
switch (op_bits(op)) {
case 32:
- switch (c) {
- case TCG_COND_EQ:
- return (uint32_t)temps[x].val == (uint32_t)temps[y].val;
- case TCG_COND_NE:
- return (uint32_t)temps[x].val != (uint32_t)temps[y].val;
- case TCG_COND_LT:
- return (int32_t)temps[x].val < (int32_t)temps[y].val;
- case TCG_COND_GE:
- return (int32_t)temps[x].val >= (int32_t)temps[y].val;
- case TCG_COND_LE:
- return (int32_t)temps[x].val <= (int32_t)temps[y].val;
- case TCG_COND_GT:
- return (int32_t)temps[x].val > (int32_t)temps[y].val;
- case TCG_COND_LTU:
- return (uint32_t)temps[x].val < (uint32_t)temps[y].val;
- case TCG_COND_GEU:
- return (uint32_t)temps[x].val >= (uint32_t)temps[y].val;
- case TCG_COND_LEU:
- return (uint32_t)temps[x].val <= (uint32_t)temps[y].val;
- case TCG_COND_GTU:
- return (uint32_t)temps[x].val > (uint32_t)temps[y].val;
- }
- break;
+ return do_constant_folding_cond_32(temps[x].val, temps[y].val, c);
case 64:
- switch (c) {
- case TCG_COND_EQ:
- return (uint64_t)temps[x].val == (uint64_t)temps[y].val;
- case TCG_COND_NE:
- return (uint64_t)temps[x].val != (uint64_t)temps[y].val;
- case TCG_COND_LT:
- return (int64_t)temps[x].val < (int64_t)temps[y].val;
- case TCG_COND_GE:
- return (int64_t)temps[x].val >= (int64_t)temps[y].val;
- case TCG_COND_LE:
- return (int64_t)temps[x].val <= (int64_t)temps[y].val;
- case TCG_COND_GT:
- return (int64_t)temps[x].val > (int64_t)temps[y].val;
- case TCG_COND_LTU:
- return (uint64_t)temps[x].val < (uint64_t)temps[y].val;
- case TCG_COND_GEU:
- return (uint64_t)temps[x].val >= (uint64_t)temps[y].val;
- case TCG_COND_LEU:
- return (uint64_t)temps[x].val <= (uint64_t)temps[y].val;
- case TCG_COND_GTU:
- return (uint64_t)temps[x].val > (uint64_t)temps[y].val;
- }
- break;
+ return do_constant_folding_cond_64(temps[x].val, temps[y].val, c);
+ default:
+ tcg_abort();
}
} else if (temps_are_copies(x, y)) {
- switch (c) {
- case TCG_COND_GT:
- case TCG_COND_LTU:
- case TCG_COND_LT:
- case TCG_COND_GTU:
- case TCG_COND_NE:
- return 0;
- case TCG_COND_GE:
- case TCG_COND_GEU:
- case TCG_COND_LE:
- case TCG_COND_LEU:
- case TCG_COND_EQ:
- return 1;
- }
+ return do_constant_folding_cond_eq(c);
} else if (temps[y].state == TCG_TEMP_CONST && temps[y].val == 0) {
switch (c) {
case TCG_COND_LTU:
@@ -375,11 +396,6 @@ static TCGArg do_constant_folding_cond(TCGOpcode op, TCGArg x,
} else {
return 2;
}
-
- fprintf(stderr,
- "Unrecognized bitness %d or condition %d in "
- "do_constant_folding_cond.\n", op_bits(op), c);
- tcg_abort();
}
static bool swap_commutative(TCGArg dest, TCGArg *p1, TCGArg *p2)
--
1.7.11.4
* Re: [Qemu-devel] [PATCH 5/7] tcg: Split out subroutines from do_constant_folding_cond
2012-09-27 17:19 ` [Qemu-devel] [PATCH 5/7] tcg: Split out subroutines from do_constant_folding_cond Richard Henderson
@ 2012-10-01 18:46 ` Aurelien Jarno
0 siblings, 0 replies; 21+ messages in thread
From: Aurelien Jarno @ 2012-10-01 18:46 UTC (permalink / raw)
To: Richard Henderson; +Cc: qemu-devel
On Thu, Sep 27, 2012 at 10:19:55AM -0700, Richard Henderson wrote:
> We can re-use these for implementing double-word folding.
>
> Signed-off-by: Richard Henderson <rth@twiddle.net>
> ---
> tcg/optimize.c | 146 ++++++++++++++++++++++++++++++++-------------------------
> 1 file changed, 81 insertions(+), 65 deletions(-)
>
> diff --git a/tcg/optimize.c b/tcg/optimize.c
> index c972e4f..c1881fa 100644
> --- a/tcg/optimize.c
> +++ b/tcg/optimize.c
> @@ -292,6 +292,82 @@ static TCGArg do_constant_folding(TCGOpcode op, TCGArg x, TCGArg y)
> return res;
> }
>
> +static bool do_constant_folding_cond_32(uint32_t x, uint32_t y, TCGCond c)
> +{
> + switch (c) {
> + case TCG_COND_EQ:
> + return x == y;
> + case TCG_COND_NE:
> + return x != y;
> + case TCG_COND_LT:
> + return (int32_t)x < (int32_t)y;
> + case TCG_COND_GE:
> + return (int32_t)x >= (int32_t)y;
> + case TCG_COND_LE:
> + return (int32_t)x <= (int32_t)y;
> + case TCG_COND_GT:
> + return (int32_t)x > (int32_t)y;
> + case TCG_COND_LTU:
> + return x < y;
> + case TCG_COND_GEU:
> + return x >= y;
> + case TCG_COND_LEU:
> + return x <= y;
> + case TCG_COND_GTU:
> + return x > y;
> + default:
> + tcg_abort();
> + }
> +}
> +
> +static bool do_constant_folding_cond_64(uint64_t x, uint64_t y, TCGCond c)
> +{
> + switch (c) {
> + case TCG_COND_EQ:
> + return x == y;
> + case TCG_COND_NE:
> + return x != y;
> + case TCG_COND_LT:
> + return (int64_t)x < (int64_t)y;
> + case TCG_COND_GE:
> + return (int64_t)x >= (int64_t)y;
> + case TCG_COND_LE:
> + return (int64_t)x <= (int64_t)y;
> + case TCG_COND_GT:
> + return (int64_t)x > (int64_t)y;
> + case TCG_COND_LTU:
> + return x < y;
> + case TCG_COND_GEU:
> + return x >= y;
> + case TCG_COND_LEU:
> + return x <= y;
> + case TCG_COND_GTU:
> + return x > y;
> + default:
> + tcg_abort();
> + }
> +}
> +
> +static bool do_constant_folding_cond_eq(TCGCond c)
> +{
> + switch (c) {
> + case TCG_COND_GT:
> + case TCG_COND_LTU:
> + case TCG_COND_LT:
> + case TCG_COND_GTU:
> + case TCG_COND_NE:
> + return 0;
> + case TCG_COND_GE:
> + case TCG_COND_GEU:
> + case TCG_COND_LE:
> + case TCG_COND_LEU:
> + case TCG_COND_EQ:
> + return 1;
> + default:
> + tcg_abort();
> + }
> +}
> +
> /* Return 2 if the condition can't be simplified, and the result
> of the condition (0 or 1) if it can */
> static TCGArg do_constant_folding_cond(TCGOpcode op, TCGArg x,
> @@ -300,69 +376,14 @@ static TCGArg do_constant_folding_cond(TCGOpcode op, TCGArg x,
> if (temps[x].state == TCG_TEMP_CONST && temps[y].state == TCG_TEMP_CONST) {
> switch (op_bits(op)) {
> case 32:
> - switch (c) {
> - case TCG_COND_EQ:
> - return (uint32_t)temps[x].val == (uint32_t)temps[y].val;
> - case TCG_COND_NE:
> - return (uint32_t)temps[x].val != (uint32_t)temps[y].val;
> - case TCG_COND_LT:
> - return (int32_t)temps[x].val < (int32_t)temps[y].val;
> - case TCG_COND_GE:
> - return (int32_t)temps[x].val >= (int32_t)temps[y].val;
> - case TCG_COND_LE:
> - return (int32_t)temps[x].val <= (int32_t)temps[y].val;
> - case TCG_COND_GT:
> - return (int32_t)temps[x].val > (int32_t)temps[y].val;
> - case TCG_COND_LTU:
> - return (uint32_t)temps[x].val < (uint32_t)temps[y].val;
> - case TCG_COND_GEU:
> - return (uint32_t)temps[x].val >= (uint32_t)temps[y].val;
> - case TCG_COND_LEU:
> - return (uint32_t)temps[x].val <= (uint32_t)temps[y].val;
> - case TCG_COND_GTU:
> - return (uint32_t)temps[x].val > (uint32_t)temps[y].val;
> - }
> - break;
> + return do_constant_folding_cond_32(temps[x].val, temps[y].val, c);
> case 64:
> - switch (c) {
> - case TCG_COND_EQ:
> - return (uint64_t)temps[x].val == (uint64_t)temps[y].val;
> - case TCG_COND_NE:
> - return (uint64_t)temps[x].val != (uint64_t)temps[y].val;
> - case TCG_COND_LT:
> - return (int64_t)temps[x].val < (int64_t)temps[y].val;
> - case TCG_COND_GE:
> - return (int64_t)temps[x].val >= (int64_t)temps[y].val;
> - case TCG_COND_LE:
> - return (int64_t)temps[x].val <= (int64_t)temps[y].val;
> - case TCG_COND_GT:
> - return (int64_t)temps[x].val > (int64_t)temps[y].val;
> - case TCG_COND_LTU:
> - return (uint64_t)temps[x].val < (uint64_t)temps[y].val;
> - case TCG_COND_GEU:
> - return (uint64_t)temps[x].val >= (uint64_t)temps[y].val;
> - case TCG_COND_LEU:
> - return (uint64_t)temps[x].val <= (uint64_t)temps[y].val;
> - case TCG_COND_GTU:
> - return (uint64_t)temps[x].val > (uint64_t)temps[y].val;
> - }
> - break;
> + return do_constant_folding_cond_64(temps[x].val, temps[y].val, c);
> + default:
> + tcg_abort();
> }
> } else if (temps_are_copies(x, y)) {
> - switch (c) {
> - case TCG_COND_GT:
> - case TCG_COND_LTU:
> - case TCG_COND_LT:
> - case TCG_COND_GTU:
> - case TCG_COND_NE:
> - return 0;
> - case TCG_COND_GE:
> - case TCG_COND_GEU:
> - case TCG_COND_LE:
> - case TCG_COND_LEU:
> - case TCG_COND_EQ:
> - return 1;
> - }
> + return do_constant_folding_cond_eq(c);
> } else if (temps[y].state == TCG_TEMP_CONST && temps[y].val == 0) {
> switch (c) {
> case TCG_COND_LTU:
> @@ -375,11 +396,6 @@ static TCGArg do_constant_folding_cond(TCGOpcode op, TCGArg x,
> } else {
> return 2;
> }
> -
> - fprintf(stderr,
> - "Unrecognized bitness %d or condition %d in "
> - "do_constant_folding_cond.\n", op_bits(op), c);
> - tcg_abort();
> }
>
> static bool swap_commutative(TCGArg dest, TCGArg *p1, TCGArg *p2)
Reviewed-by: Aurelien Jarno <aurelien@aurel32.net>
--
Aurelien Jarno GPG: 1024D/F1BCDB73
aurelien@aurel32.net http://www.aurel32.net
* [Qemu-devel] [PATCH 6/7] tcg: Tidy brcond optimization
2012-09-27 17:19 [Qemu-devel] [PATCH 0/7] Double-word tcg/optimize improvements Richard Henderson
` (4 preceding siblings ...)
2012-09-27 17:19 ` [Qemu-devel] [PATCH 5/7] tcg: Split out subroutines from do_constant_folding_cond Richard Henderson
@ 2012-09-27 17:19 ` Richard Henderson
2012-10-01 18:48 ` Aurelien Jarno
2012-09-27 17:19 ` [Qemu-devel] [PATCH 7/7] tcg: Do constant folding on double-word comparisons Richard Henderson
6 siblings, 1 reply; 21+ messages in thread
From: Richard Henderson @ 2012-09-27 17:19 UTC (permalink / raw)
To: qemu-devel; +Cc: Aurelien Jarno
Do the memset once. Don't reset_temp before doing so.
Signed-off-by: Richard Henderson <rth@twiddle.net>
---
tcg/optimize.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/tcg/optimize.c b/tcg/optimize.c
index c1881fa..dfac877 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -771,22 +771,22 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
tmp = do_constant_folding_cond(op, args[0], args[1], args[2]);
if (tmp != 2) {
if (tmp) {
- memset(temps, 0, nb_temps * sizeof(struct tcg_temp_info));
gen_opc_buf[op_index] = INDEX_op_br;
gen_args[0] = args[3];
gen_args += 1;
} else {
gen_opc_buf[op_index] = INDEX_op_nop;
+ args += 4;
+ break;
}
} else {
- memset(temps, 0, nb_temps * sizeof(struct tcg_temp_info));
- reset_temp(args[0]);
gen_args[0] = args[0];
gen_args[1] = args[1];
gen_args[2] = args[2];
gen_args[3] = args[3];
gen_args += 4;
}
+ memset(temps, 0, nb_temps * sizeof(struct tcg_temp_info));
args += 4;
break;
CASE_OP_32_64(movcond):
--
1.7.11.4
* Re: [Qemu-devel] [PATCH 6/7] tcg: Tidy brcond optimization
2012-09-27 17:19 ` [Qemu-devel] [PATCH 6/7] tcg: Tidy brcond optimization Richard Henderson
@ 2012-10-01 18:48 ` Aurelien Jarno
0 siblings, 0 replies; 21+ messages in thread
From: Aurelien Jarno @ 2012-10-01 18:48 UTC (permalink / raw)
To: Richard Henderson; +Cc: qemu-devel
On Thu, Sep 27, 2012 at 10:19:56AM -0700, Richard Henderson wrote:
> Do the memset once. Don't reset_temp before doing so.
>
> Signed-off-by: Richard Henderson <rth@twiddle.net>
> ---
> tcg/optimize.c | 6 +++---
> 1 file changed, 3 insertions(+), 3 deletions(-)
>
> diff --git a/tcg/optimize.c b/tcg/optimize.c
> index c1881fa..dfac877 100644
> --- a/tcg/optimize.c
> +++ b/tcg/optimize.c
> @@ -771,22 +771,22 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
> tmp = do_constant_folding_cond(op, args[0], args[1], args[2]);
> if (tmp != 2) {
> if (tmp) {
> - memset(temps, 0, nb_temps * sizeof(struct tcg_temp_info));
> gen_opc_buf[op_index] = INDEX_op_br;
> gen_args[0] = args[3];
> gen_args += 1;
> } else {
> gen_opc_buf[op_index] = INDEX_op_nop;
> + args += 4;
> + break;
> }
> } else {
> - memset(temps, 0, nb_temps * sizeof(struct tcg_temp_info));
> - reset_temp(args[0]);
> gen_args[0] = args[0];
> gen_args[1] = args[1];
> gen_args[2] = args[2];
> gen_args[3] = args[3];
> gen_args += 4;
> }
> + memset(temps, 0, nb_temps * sizeof(struct tcg_temp_info));
> args += 4;
> break;
> CASE_OP_32_64(movcond):
> --
> 1.7.11.4
>
Removing the useless reset_temp() is indeed worth doing. I am not so
sure that factoring out the memset() and putting a break in the nop case
makes it easier to read. Nevertheless:
Reviewed-by: Aurelien Jarno <aurelien@aurel32.net>
--
Aurelien Jarno GPG: 1024D/F1BCDB73
aurelien@aurel32.net http://www.aurel32.net
* [Qemu-devel] [PATCH 7/7] tcg: Do constant folding on double-word comparisons
2012-09-27 17:19 [Qemu-devel] [PATCH 0/7] Double-word tcg/optimize improvements Richard Henderson
` (5 preceding siblings ...)
2012-09-27 17:19 ` [Qemu-devel] [PATCH 6/7] tcg: Tidy brcond optimization Richard Henderson
@ 2012-09-27 17:19 ` Richard Henderson
2012-10-01 18:50 ` Aurelien Jarno
6 siblings, 1 reply; 21+ messages in thread
From: Richard Henderson @ 2012-09-27 17:19 UTC (permalink / raw)
To: qemu-devel; +Cc: Aurelien Jarno
Signed-off-by: Richard Henderson <rth@twiddle.net>
---
tcg/optimize.c | 134 ++++++++++++++++++++++++++++++++++++++++-----------------
1 file changed, 94 insertions(+), 40 deletions(-)
diff --git a/tcg/optimize.c b/tcg/optimize.c
index dfac877..f6a16fd 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -398,6 +398,40 @@ static TCGArg do_constant_folding_cond(TCGOpcode op, TCGArg x,
}
}
+/* Return 2 if the condition can't be simplified, and the result
+ of the condition (0 or 1) if it can */
+static TCGArg do_constant_folding_cond2(TCGArg *p1, TCGArg *p2, TCGCond c)
+{
+ TCGArg al = p1[0], ah = p1[1];
+ TCGArg bl = p2[0], bh = p2[1];
+
+ if (temps[bl].state == TCG_TEMP_CONST
+ && temps[bh].state == TCG_TEMP_CONST) {
+ uint64_t b = ((uint64_t)temps[bh].val << 32) | (uint32_t)temps[bl].val;
+
+ if (temps[al].state == TCG_TEMP_CONST
+ && temps[ah].state == TCG_TEMP_CONST) {
+ uint64_t a;
+ a = ((uint64_t)temps[ah].val << 32) | (uint32_t)temps[al].val;
+ return do_constant_folding_cond_64(a, b, c);
+ }
+ if (b == 0) {
+ switch (c) {
+ case TCG_COND_LTU:
+ return 0;
+ case TCG_COND_GEU:
+ return 1;
+ default:
+ break;
+ }
+ }
+ }
+ if (temps_are_copies(al, bl) && temps_are_copies(ah, bh)) {
+ return do_constant_folding_cond_eq(c);
+ }
+ return 2;
+}
+
static bool swap_commutative(TCGArg dest, TCGArg *p1, TCGArg *p2)
{
TCGArg a1 = *p1, a2 = *p2;
@@ -816,53 +850,73 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
args += 6;
break;
case INDEX_op_brcond2_i32:
- /* Simplify LT/GE comparisons vs zero to a single compare
- vs the high word of the input. */
- if ((args[4] == TCG_COND_LT || args[4] == TCG_COND_GE)
- && temps[args[2]].state == TCG_TEMP_CONST
- && temps[args[3]].state == TCG_TEMP_CONST
- && temps[args[2]].val == 0
- && temps[args[2]].val == 0) {
- gen_opc_buf[op_index] = INDEX_op_brcond_i32;
- args[0] = args[1];
- args[1] = args[3];
- args[2] = args[4];
- args[3] = args[5];
- gen_args += 4;
+ tmp = do_constant_folding_cond2(&args[0], &args[2], args[4]);
+ if (tmp != 2) {
+ if (tmp) {
+ gen_opc_buf[op_index] = INDEX_op_br;
+ gen_args[0] = args[5];
+ gen_args += 1;
+ } else {
+ gen_opc_buf[op_index] = INDEX_op_nop;
+ args += 6;
+ break;
+ }
} else {
- gen_args[0] = args[0];
- gen_args[1] = args[1];
- gen_args[2] = args[2];
- gen_args[3] = args[3];
- gen_args[4] = args[4];
- gen_args[5] = args[5];
- gen_args += 6;
+ /* Simplify LT/GE comparisons vs zero to a single compare
+ vs the high word of the input. */
+ if ((args[4] == TCG_COND_LT || args[4] == TCG_COND_GE)
+ && temps[args[2]].state == TCG_TEMP_CONST
+ && temps[args[3]].state == TCG_TEMP_CONST
+ && temps[args[2]].val == 0
+ && temps[args[2]].val == 0) {
+ gen_opc_buf[op_index] = INDEX_op_brcond_i32;
+ args[0] = args[1];
+ args[1] = args[3];
+ args[2] = args[4];
+ args[3] = args[5];
+ gen_args += 4;
+ } else {
+ gen_args[0] = args[0];
+ gen_args[1] = args[1];
+ gen_args[2] = args[2];
+ gen_args[3] = args[3];
+ gen_args[4] = args[4];
+ gen_args[5] = args[5];
+ gen_args += 6;
+ }
}
memset(temps, 0, nb_temps * sizeof(struct tcg_temp_info));
args += 6;
break;
case INDEX_op_setcond2_i32:
- /* Simplify LT/GE comparisons vs zero to a single compare
- vs the high word of the input. */
- if ((args[5] == TCG_COND_LT || args[5] == TCG_COND_GE)
- && temps[args[3]].state == TCG_TEMP_CONST
- && temps[args[4]].state == TCG_TEMP_CONST
- && temps[args[3]].val == 0
- && temps[args[4]].val == 0) {
- gen_opc_buf[op_index] = INDEX_op_setcond_i32;
- args[1] = args[2];
- args[2] = args[4];
- args[3] = args[5];
- gen_args += 4;
+ tmp = do_constant_folding_cond2(&args[1], &args[3], args[5]);
+ if (tmp != 2) {
+ gen_opc_buf[op_index] = INDEX_op_movi_i32;
+ tcg_opt_gen_movi(gen_args, args[0], tmp);
+ gen_args += 2;
} else {
- reset_temp(args[0]);
- gen_args[0] = args[0];
- gen_args[1] = args[1];
- gen_args[2] = args[2];
- gen_args[3] = args[3];
- gen_args[4] = args[4];
- gen_args[5] = args[5];
- gen_args += 6;
+ /* Simplify LT/GE comparisons vs zero to a single compare
+ vs the high word of the input. */
+ if ((args[5] == TCG_COND_LT || args[5] == TCG_COND_GE)
+ && temps[args[3]].state == TCG_TEMP_CONST
+ && temps[args[4]].state == TCG_TEMP_CONST
+ && temps[args[3]].val == 0
+ && temps[args[4]].val == 0) {
+ gen_opc_buf[op_index] = INDEX_op_setcond_i32;
+ args[1] = args[2];
+ args[2] = args[4];
+ args[3] = args[5];
+ gen_args += 4;
+ } else {
+ reset_temp(args[0]);
+ gen_args[0] = args[0];
+ gen_args[1] = args[1];
+ gen_args[2] = args[2];
+ gen_args[3] = args[3];
+ gen_args[4] = args[4];
+ gen_args[5] = args[5];
+ gen_args += 6;
+ }
}
args += 6;
break;
--
1.7.11.4
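A standalone sketch of the two folds in do_constant_folding_cond2
(illustrative values, not from the patch):

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        /* Recombining constant halves: (bh, bl) -> one 64-bit value. */
        uint32_t bl = 0x00005678, bh = 0x00000001;
        uint64_t b = ((uint64_t)bh << 32) | bl;
        assert(b == 0x100005678ULL);

        /* With b == 0, unsigned comparisons fold without looking at a:
           a <u 0 is always false and a >=u 0 is always true. */
        for (uint64_t a = 0; a < 4; a++) {
            assert(!(a < (uint64_t)0));
            assert(a >= (uint64_t)0);
        }
        return 0;
    }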
* Re: [Qemu-devel] [PATCH 7/7] tcg: Do constant folding on double-word comparisons
2012-09-27 17:19 ` [Qemu-devel] [PATCH 7/7] tcg: Do constant folding on double-word comparisons Richard Henderson
@ 2012-10-01 18:50 ` Aurelien Jarno
0 siblings, 0 replies; 21+ messages in thread
From: Aurelien Jarno @ 2012-10-01 18:50 UTC (permalink / raw)
To: Richard Henderson; +Cc: qemu-devel
On Thu, Sep 27, 2012 at 10:19:57AM -0700, Richard Henderson wrote:
> Signed-off-by: Richard Henderson <rth@twiddle.net>
> ---
> tcg/optimize.c | 134 ++++++++++++++++++++++++++++++++++++++++-----------------
> 1 file changed, 94 insertions(+), 40 deletions(-)
>
> diff --git a/tcg/optimize.c b/tcg/optimize.c
> index dfac877..f6a16fd 100644
> --- a/tcg/optimize.c
> +++ b/tcg/optimize.c
> @@ -398,6 +398,40 @@ static TCGArg do_constant_folding_cond(TCGOpcode op, TCGArg x,
> }
> }
>
> +/* Return 2 if the condition can't be simplified, and the result
> + of the condition (0 or 1) if it can */
> +static TCGArg do_constant_folding_cond2(TCGArg *p1, TCGArg *p2, TCGCond c)
> +{
> + TCGArg al = p1[0], ah = p1[1];
> + TCGArg bl = p2[0], bh = p2[1];
> +
> + if (temps[bl].state == TCG_TEMP_CONST
> + && temps[bh].state == TCG_TEMP_CONST) {
> + uint64_t b = ((uint64_t)temps[bh].val << 32) | (uint32_t)temps[bl].val;
> +
> + if (temps[al].state == TCG_TEMP_CONST
> + && temps[ah].state == TCG_TEMP_CONST) {
> + uint64_t a;
> + a = ((uint64_t)temps[ah].val << 32) | (uint32_t)temps[al].val;
> + return do_constant_folding_cond_64(a, b, c);
> + }
> + if (b == 0) {
> + switch (c) {
> + case TCG_COND_LTU:
> + return 0;
> + case TCG_COND_GEU:
> + return 1;
> + default:
> + break;
> + }
> + }
> + }
> + if (temps_are_copies(al, bl) && temps_are_copies(ah, bh)) {
> + return do_constant_folding_cond_eq(c);
> + }
> + return 2;
> +}
> +
> static bool swap_commutative(TCGArg dest, TCGArg *p1, TCGArg *p2)
> {
> TCGArg a1 = *p1, a2 = *p2;
> @@ -816,53 +850,73 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
> args += 6;
> break;
> case INDEX_op_brcond2_i32:
> - /* Simplify LT/GE comparisons vs zero to a single compare
> - vs the high word of the input. */
> - if ((args[4] == TCG_COND_LT || args[4] == TCG_COND_GE)
> - && temps[args[2]].state == TCG_TEMP_CONST
> - && temps[args[3]].state == TCG_TEMP_CONST
> - && temps[args[2]].val == 0
> - && temps[args[2]].val == 0) {
> - gen_opc_buf[op_index] = INDEX_op_brcond_i32;
> - args[0] = args[1];
> - args[1] = args[3];
> - args[2] = args[4];
> - args[3] = args[5];
> - gen_args += 4;
> + tmp = do_constant_folding_cond2(&args[0], &args[2], args[4]);
> + if (tmp != 2) {
> + if (tmp) {
> + gen_opc_buf[op_index] = INDEX_op_br;
> + gen_args[0] = args[5];
> + gen_args += 1;
> + } else {
> + gen_opc_buf[op_index] = INDEX_op_nop;
> + args += 6;
> + break;
> + }
> } else {
> - gen_args[0] = args[0];
> - gen_args[1] = args[1];
> - gen_args[2] = args[2];
> - gen_args[3] = args[3];
> - gen_args[4] = args[4];
> - gen_args[5] = args[5];
> - gen_args += 6;
> + /* Simplify LT/GE comparisons vs zero to a single compare
> + vs the high word of the input. */
> + if ((args[4] == TCG_COND_LT || args[4] == TCG_COND_GE)
> + && temps[args[2]].state == TCG_TEMP_CONST
> + && temps[args[3]].state == TCG_TEMP_CONST
> + && temps[args[2]].val == 0
> + && temps[args[2]].val == 0) {
> + gen_opc_buf[op_index] = INDEX_op_brcond_i32;
> + args[0] = args[1];
> + args[1] = args[3];
> + args[2] = args[4];
> + args[3] = args[5];
> + gen_args += 4;
> + } else {
> + gen_args[0] = args[0];
> + gen_args[1] = args[1];
> + gen_args[2] = args[2];
> + gen_args[3] = args[3];
> + gen_args[4] = args[4];
> + gen_args[5] = args[5];
> + gen_args += 6;
> + }
> }
> memset(temps, 0, nb_temps * sizeof(struct tcg_temp_info));
> args += 6;
> break;
> case INDEX_op_setcond2_i32:
> - /* Simplify LT/GE comparisons vs zero to a single compare
> - vs the high word of the input. */
> - if ((args[5] == TCG_COND_LT || args[5] == TCG_COND_GE)
> - && temps[args[3]].state == TCG_TEMP_CONST
> - && temps[args[4]].state == TCG_TEMP_CONST
> - && temps[args[3]].val == 0
> - && temps[args[4]].val == 0) {
> - gen_opc_buf[op_index] = INDEX_op_setcond_i32;
> - args[1] = args[2];
> - args[2] = args[4];
> - args[3] = args[5];
> - gen_args += 4;
> + tmp = do_constant_folding_cond2(&args[1], &args[3], args[5]);
> + if (tmp != 2) {
> + gen_opc_buf[op_index] = INDEX_op_movi_i32;
> + tcg_opt_gen_movi(gen_args, args[0], tmp);
> + gen_args += 2;
> } else {
> - reset_temp(args[0]);
> - gen_args[0] = args[0];
> - gen_args[1] = args[1];
> - gen_args[2] = args[2];
> - gen_args[3] = args[3];
> - gen_args[4] = args[4];
> - gen_args[5] = args[5];
> - gen_args += 6;
> + /* Simplify LT/GE comparisons vs zero to a single compare
> + vs the high word of the input. */
> + if ((args[5] == TCG_COND_LT || args[5] == TCG_COND_GE)
> + && temps[args[3]].state == TCG_TEMP_CONST
> + && temps[args[4]].state == TCG_TEMP_CONST
> + && temps[args[3]].val == 0
> + && temps[args[4]].val == 0) {
> + gen_opc_buf[op_index] = INDEX_op_setcond_i32;
> + args[1] = args[2];
> + args[2] = args[4];
> + args[3] = args[5];
> + gen_args += 4;
> + } else {
> + reset_temp(args[0]);
> + gen_args[0] = args[0];
> + gen_args[1] = args[1];
> + gen_args[2] = args[2];
> + gen_args[3] = args[3];
> + gen_args[4] = args[4];
> + gen_args[5] = args[5];
> + gen_args += 6;
> + }
> }
> args += 6;
> break;
Reviewed-by: Aurelien Jarno <aurelien@aurel32.net>
--
Aurelien Jarno GPG: 1024D/F1BCDB73
aurelien@aurel32.net http://www.aurel32.net