qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed
* [Qemu-devel] [PATCH] RFC: TCG constant propagation.
@ 2009-07-21 14:37 Filip Navara
  2009-07-23  9:08 ` Filip Navara
  2009-08-05  8:13 ` Pablo Virolainen
  0 siblings, 2 replies; 10+ messages in thread
From: Filip Navara @ 2009-07-21 14:37 UTC (permalink / raw)
  To: qemu-devel; +Cc: paul@codesourcery.com

Add support for constant propagation to TCG. This has to be paired with the liveness
analysis to remove the dead code. Not all possible operations are covered, but the
most common ones are. This improves the code generation for several ARM instructions,
like MVN (immediate), and it may help other targets as well.
---
 tcg/tcg.c |  164 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 164 insertions(+), 0 deletions(-)

diff --git a/tcg/tcg.c b/tcg/tcg.c
index 4cb5934..10b6a99 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -1025,7 +1025,169 @@ void tcg_add_target_add_op_defs(const TCGTargetOpDef *tdefs)
 #endif
         tdefs++;
     }
+}
+
+static void tcg_const_analysis(TCGContext *s)
+{
+    int nb_cargs, nb_iargs, nb_oargs, dest, src, src2, del_args, i;
+    TCGArg *args;
+    uint16_t op;
+    uint16_t *opc_ptr;
+    const TCGOpDef *def;
+    uint8_t *const_temps;
+    tcg_target_ulong *temp_values;
+    tcg_target_ulong val, mask;
+    tcg_target_ulong dest_val, src_val, src2_val;
+
+    const_temps = tcg_malloc(s->nb_temps);
+    memset(const_temps, 0, s->nb_temps);
+    temp_values = tcg_malloc(s->nb_temps * sizeof(uint32_t));
+
+    opc_ptr = gen_opc_buf;
+    args = gen_opparam_buf;
+    while (opc_ptr < gen_opc_ptr) {
+        op = *opc_ptr;
+        def = &tcg_op_defs[op];
+        nb_oargs = def->nb_oargs;
+        nb_iargs = def->nb_iargs;
+        nb_cargs = def->nb_cargs;
+        del_args = 0;
+        mask = ~((tcg_target_ulong)0);
 
+        switch(op) {
+        case INDEX_op_movi_i32:
+#if TCG_TARGET_REG_BITS == 64
+        case INDEX_op_movi_i64:
+#endif
+            dest = args[0];
+            val = args[1];
+            const_temps[dest] = 1;
+            temp_values[dest] = val;
+            break;
+        case INDEX_op_mov_i32:
+#if TCG_TARGET_REG_BITS == 64
+        case INDEX_op_mov_i64:
+#endif
+            dest = args[0];
+            src = args[1];
+            const_temps[dest] = const_temps[src];
+            temp_values[dest] = temp_values[src];
+            break;
+        case INDEX_op_not_i32:
+#if TCG_TARGET_REG_BITS == 64
+            mask = 0xffffffff;
+        case INDEX_op_not_i64:
+#endif
+            dest = args[0];
+            src = args[1];
+            if (const_temps[src]) {
+                const_temps[dest] = 1;
+                dest_val = ~temp_values[src];
+                *opc_ptr = INDEX_op_movi_i32;
+                args[1] = temp_values[dest] = dest_val & mask;
+            } else {
+                const_temps[dest] = 0;
+            }
+            break;
+        case INDEX_op_add_i32:
+        case INDEX_op_sub_i32:
+        case INDEX_op_mul_i32:
+        case INDEX_op_and_i32:
+        case INDEX_op_or_i32:
+        case INDEX_op_xor_i32:
+        case INDEX_op_shl_i32:
+        case INDEX_op_shr_i32:
+#if TCG_TARGET_REG_BITS == 64
+            mask = 0xffffffff;
+        case INDEX_op_add_i64:
+        case INDEX_op_sub_i64:
+        case INDEX_op_mul_i64:
+        case INDEX_op_and_i64:
+        case INDEX_op_or_i64:
+        case INDEX_op_xor_i64:
+        case INDEX_op_shl_i64:
+        case INDEX_op_shr_i64:
+#endif
+
+            dest = args[0];
+            src = args[1];
+            src2 = args[2];
+            if (const_temps[src] && const_temps[src2]) {
+                src_val = temp_values[src];
+                src2_val = temp_values[src2];
+                const_temps[dest] = 1;
+                switch (op) {
+                case INDEX_op_add_i32:
+                    dest_val = src_val + src2_val;
+                    break;
+                case INDEX_op_sub_i32:
+                    dest_val = src_val - src2_val;
+                    break;
+                case INDEX_op_mul_i32:
+                    dest_val = src_val * src2_val;
+                    break;
+                case INDEX_op_and_i32:
+                    dest_val = src_val & src2_val;
+                    break;
+                case INDEX_op_or_i32:
+                    dest_val = src_val | src2_val;
+                    break;
+                case INDEX_op_xor_i32:
+                    dest_val = src_val ^ src2_val;
+                    break;
+                case INDEX_op_shl_i32:
+                    dest_val = src_val << src2_val;
+                    break;
+                case INDEX_op_shr_i32:
+                    dest_val = src_val >> src2_val;
+                    break;
+                default:
+                    tcg_abort();
+                    return;
+                }
+                *opc_ptr = INDEX_op_movi_i32;                
+                args[1] = temp_values[dest] = dest_val & mask;
+                del_args = 1;
+            } else {
+                const_temps[dest] = 0;
+            }
+            break;
+        case INDEX_op_call:
+            nb_oargs = args[0] >> 16;
+            nb_iargs = args[0] & 0xffff;
+            nb_cargs = def->nb_cargs;
+            args++;
+            for (i = 0; i < nb_oargs; i++) {
+                const_temps[args[i]] = 0;
+            }
+            break;
+        case INDEX_op_nopn:
+            /* variable number of arguments */
+            nb_cargs = args[0];
+            break;
+        case INDEX_op_set_label:
+            memset(const_temps, 0, s->nb_temps);
+            break;
+        default:
+            if (def->flags & TCG_OPF_BB_END) {
+                memset(const_temps, 0, s->nb_temps);
+            } else {
+                for (i = 0; i < nb_oargs; i++) {
+                    const_temps[args[i]] = 0;
+                }
+            }
+            break;
+        }
+        opc_ptr++;
+        args += nb_iargs + nb_oargs + nb_cargs - del_args;
+        if (del_args > 0) {
+            gen_opparam_ptr -= del_args;
+            memmove(args, args + del_args, (gen_opparam_ptr - args) * sizeof(*args));
+        }
+    }
+
+    if (args != gen_opparam_ptr)
+        tcg_abort();
 }
 
 #ifdef USE_LIVENESS_ANALYSIS
@@ -1895,6 +2057,8 @@ static inline int tcg_gen_code_common(TCGContext *s, uint8_t *gen_code_buf,
     }
 #endif
 
+    tcg_const_analysis(s);
+
 #ifdef CONFIG_PROFILER
     s->la_time -= profile_getclock();
 #endif
-- 
1.6.3.2.1299.gee46c

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [Qemu-devel] [PATCH] RFC: TCG constant propagation.
  2009-07-21 14:37 [Qemu-devel] [PATCH] RFC: TCG constant propagation Filip Navara
@ 2009-07-23  9:08 ` Filip Navara
  2009-07-23  9:25   ` Laurent Desnogues
                     ` (2 more replies)
  2009-08-05  8:13 ` Pablo Virolainen
  1 sibling, 3 replies; 10+ messages in thread
From: Filip Navara @ 2009-07-23  9:08 UTC (permalink / raw)
  To: qemu-devel; +Cc: Paul Brook

[-- Attachment #1: Type: text/plain, Size: 988 bytes --]

On Tue, Jul 21, 2009 at 4:38 PM, Filip Navara<filip.navara@gmail.com> wrote:
> Add support for constant propagation to TCG. This has to be paired with the liveness
> analysis to remove the dead code. Not all possible operations are covered, but the
> most common ones are. This improves the code generation for several ARM instructions,
> like MVN (immediate), and it may help other targets as well.
> ---
>  tcg/tcg.c |  164 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 files changed, 164 insertions(+), 0 deletions(-)

Attached is updated version of the patch which adds profiling code for
the TCG pass and hopefully fixes it for TCG 64-bit targets.

Also it looks that the actual code size is worse for TCG ARM target
with this optimization. To mitigate it I propose the other attached
patch which uses the barrel shifter for more optimal generation of the
"movi" operation. I can't test it, so any help is welcome.

Best regards,
Filip Navara

[-- Attachment #2: 0001-TCG-constant-propagation.patch --]
[-- Type: application/octet-stream, Size: 8340 bytes --]

From 403c7d19df34fb25a8b57d90000697a83488cd48 Mon Sep 17 00:00:00 2001
From: Filip Navara <filip.navara@gmail.com>
Date: Tue, 21 Jul 2009 16:31:40 +0200
Subject: [PATCH 1/2] TCG constant propagation.

Add support for constant propagation to TCG. This has to be paired with the liveness
analysis to remove the dead code. Not all possible operations are covered, but the
most common ones are. This improves the code generation for several ARM instructions,
like MVN (immediate), and it may help other targets as well.

v1 -> v2:
Added profiling code and hopefully fixed for 64-bit TCG targets.
---
 tcg/tcg.c |  202 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 tcg/tcg.h |    1 +
 2 files changed, 203 insertions(+), 0 deletions(-)

diff --git a/tcg/tcg.c b/tcg/tcg.c
index 4cb5934..e23e303 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -1025,7 +1025,199 @@ void tcg_add_target_add_op_defs(const TCGTargetOpDef *tdefs)
 #endif
         tdefs++;
     }
+}
+
+static void tcg_const_analysis(TCGContext *s)
+{
+    int nb_cargs, nb_iargs, nb_oargs, dest, src, src2, del_args, i;
+    TCGArg *args;
+    uint16_t op;
+    uint16_t *opc_ptr;
+    const TCGOpDef *def;
+    uint8_t *const_temps;
+    tcg_target_ulong *temp_values;
+    tcg_target_ulong val, mask;
+    tcg_target_ulong dest_val, src_val, src2_val;
+
+    const_temps = tcg_malloc(s->nb_temps);
+    memset(const_temps, 0, s->nb_temps);
+    temp_values = tcg_malloc(s->nb_temps * sizeof(uint32_t));
+
+    opc_ptr = gen_opc_buf;
+    args = gen_opparam_buf;
+    while (opc_ptr < gen_opc_ptr) {
+        op = *opc_ptr;
+        def = &tcg_op_defs[op];
+        nb_oargs = def->nb_oargs;
+        nb_iargs = def->nb_iargs;
+        nb_cargs = def->nb_cargs;
+        del_args = 0;
+        mask = ~((tcg_target_ulong)0);
+
+        switch(op) {
+        case INDEX_op_movi_i32:
+#if TCG_TARGET_REG_BITS == 64
+        case INDEX_op_movi_i64:
+#endif
+            dest = args[0];
+            val = args[1];
+            const_temps[dest] = 1;
+            temp_values[dest] = val;
+            break;
+        case INDEX_op_mov_i32:
+#if TCG_TARGET_REG_BITS == 64
+        case INDEX_op_mov_i64:
+#endif
+            dest = args[0];
+            src = args[1];
+            const_temps[dest] = const_temps[src];
+            temp_values[dest] = temp_values[src];
+            break;
+#if defined(TCG_TARGET_HAS_not_i32) || defined(TCG_TARGET_HAS_not_i64)
+#ifdef TCG_TARGET_HAS_not_i32
+        case INDEX_op_not_i32:
+#endif
+#if TCG_TARGET_REG_BITS == 64
+            mask = 0xffffffff;
+#ifdef TCG_TARGET_HAS_not_i64
+        case INDEX_op_not_i64:
+#endif
+#endif
+            dest = args[0];
+            src = args[1];
+            if (const_temps[src]) {
+                const_temps[dest] = 1;
+                dest_val = ~temp_values[src];
+                *opc_ptr = INDEX_op_movi_i32;
+                args[1] = temp_values[dest] = dest_val & mask;
+            } else {
+                const_temps[dest] = 0;
+            }
+            break;
+#endif
+        case INDEX_op_add_i32:
+        case INDEX_op_sub_i32:
+        case INDEX_op_mul_i32:
+        case INDEX_op_and_i32:
+        case INDEX_op_or_i32:
+        case INDEX_op_xor_i32:
+        case INDEX_op_shl_i32:
+        case INDEX_op_shr_i32:
+#if TCG_TARGET_REG_BITS == 64
+            mask = 0xffffffff;
+        case INDEX_op_add_i64:
+        case INDEX_op_sub_i64:
+        case INDEX_op_mul_i64:
+        case INDEX_op_and_i64:
+        case INDEX_op_or_i64:
+        case INDEX_op_xor_i64:
+        case INDEX_op_shl_i64:
+        case INDEX_op_shr_i64:
+#endif
+
+            dest = args[0];
+            src = args[1];
+            src2 = args[2];
+            if (const_temps[src] && const_temps[src2]) {
+                src_val = temp_values[src];
+                src2_val = temp_values[src2];
+                const_temps[dest] = 1;
+                switch (op) {
+                case INDEX_op_add_i32:
+#if TCG_TARGET_REG_BITS == 64
+                case INDEX_op_add_i64:
+#endif
+                    dest_val = src_val + src2_val;
+                    break;
+                case INDEX_op_sub_i32:
+#if TCG_TARGET_REG_BITS == 64
+                case INDEX_op_sub_i64:
+#endif
+                    dest_val = src_val - src2_val;
+                    break;
+                case INDEX_op_mul_i32:
+#if TCG_TARGET_REG_BITS == 64
+                case INDEX_op_mul_i64:
+#endif
+                    dest_val = src_val * src2_val;
+                    break;
+                case INDEX_op_and_i32:
+#if TCG_TARGET_REG_BITS == 64
+                case INDEX_op_and_i64:
+#endif
+                    dest_val = src_val & src2_val;
+                    break;
+                case INDEX_op_or_i32:
+#if TCG_TARGET_REG_BITS == 64
+                case INDEX_op_or_i64:
+#endif
+                    dest_val = src_val | src2_val;
+                    break;
+                case INDEX_op_xor_i32:
+#if TCG_TARGET_REG_BITS == 64
+                case INDEX_op_xor_i64:
+#endif
+                    dest_val = src_val ^ src2_val;
+                    break;
+                case INDEX_op_shl_i32:
+#if TCG_TARGET_REG_BITS == 64
+                case INDEX_op_shl_i64:
+#endif
+                    dest_val = src_val << src2_val;
+                    break;
+                case INDEX_op_shr_i32:
+#if TCG_TARGET_REG_BITS == 64
+                case INDEX_op_shr_i64:
+#endif
+                    dest_val = src_val >> src2_val;
+                    break;
+                default:
+                    tcg_abort();
+                    return;
+                }
+                *opc_ptr = INDEX_op_movi_i32;                
+                args[1] = temp_values[dest] = dest_val & mask;
+                del_args = 1;
+            } else {
+                const_temps[dest] = 0;
+            }
+            break;
+        case INDEX_op_call:
+            nb_oargs = args[0] >> 16;
+            nb_iargs = args[0] & 0xffff;
+            nb_cargs = def->nb_cargs;
+            args++;
+            for (i = 0; i < nb_oargs; i++) {
+                const_temps[args[i]] = 0;
+            }
+            break;
+        case INDEX_op_nopn:
+            /* variable number of arguments */
+            nb_cargs = args[0];
+            break;
+        case INDEX_op_set_label:
+            memset(const_temps, 0, s->nb_temps);
+            break;
+        default:
+            if (def->flags & TCG_OPF_BB_END) {
+                memset(const_temps, 0, s->nb_temps);
+            } else {
+                for (i = 0; i < nb_oargs; i++) {
+                    const_temps[args[i]] = 0;
+                }
+            }
+            break;
+        }
+        opc_ptr++;
+        args += nb_iargs + nb_oargs + nb_cargs - del_args;
+        if (del_args > 0) {
+            gen_opparam_ptr -= del_args;
+            memmove(args, args + del_args, (gen_opparam_ptr - args) * sizeof(*args));
+        }
+    }
 
+    if (args != gen_opparam_ptr)
+        tcg_abort();
 }
 
 #ifdef USE_LIVENESS_ANALYSIS
@@ -1896,6 +2088,14 @@ static inline int tcg_gen_code_common(TCGContext *s, uint8_t *gen_code_buf,
 #endif
 
 #ifdef CONFIG_PROFILER
+    s->const_time -= profile_getclock();
+#endif
+    tcg_const_analysis(s);
+#ifdef CONFIG_PROFILER
+    s->const_time += profile_getclock();
+#endif
+
+#ifdef CONFIG_PROFILER
     s->la_time -= profile_getclock();
 #endif
     tcg_liveness_analysis(s);
@@ -2068,6 +2268,8 @@ void tcg_dump_info(FILE *f,
                 (double)s->interm_time / tot * 100.0);
     cpu_fprintf(f, "  gen_code time     %0.1f%%\n", 
                 (double)s->code_time / tot * 100.0);
+    cpu_fprintf(f, "const/code time     %0.1f%%\n", 
+                (double)s->const_time / (s->code_time ? s->code_time : 1) * 100.0);
     cpu_fprintf(f, "liveness/code time  %0.1f%%\n", 
                 (double)s->la_time / (s->code_time ? s->code_time : 1) * 100.0);
     cpu_fprintf(f, "cpu_restore count   %" PRId64 "\n",
diff --git a/tcg/tcg.h b/tcg/tcg.h
index ad0bd14..f8322f2 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -288,6 +288,7 @@ struct TCGContext {
     int64_t code_out_len;
     int64_t interm_time;
     int64_t code_time;
+    int64_t const_time;
     int64_t la_time;
     int64_t restore_count;
     int64_t restore_time;
-- 
1.6.3.2.1299.gee46c


[-- Attachment #3: 0002-Use-ARM-barrel-shifter-for-more-optimal-generation-o.patch --]
[-- Type: application/octet-stream, Size: 1559 bytes --]

From 203b7284477109415989d44b71dbd149d05caadf Mon Sep 17 00:00:00 2001
From: Filip Navara <filip.navara@gmail.com>
Date: Thu, 23 Jul 2009 11:01:32 +0200
Subject: [PATCH 2/2] Use ARM barrel shifter for more optimal generation of MVN, MOV instructions for the MOVI TCG operation.

---
 tcg/arm/tcg-target.c |   12 ++++++++++++
 1 files changed, 12 insertions(+), 0 deletions(-)

diff --git a/tcg/arm/tcg-target.c b/tcg/arm/tcg-target.c
index 7ef2b89..0c59984 100644
--- a/tcg/arm/tcg-target.c
+++ b/tcg/arm/tcg-target.c
@@ -334,6 +334,7 @@ static inline void tcg_out_movi32(TCGContext *s,
                 int cond, int rd, int32_t arg)
 {
     int offset = (uint32_t) arg - ((uint32_t) s->code_ptr + 8);
+    int shift;
 
     /* TODO: This is very suboptimal, we can easily have a constant
      * pool somewhere after all the instructions.  */
@@ -346,6 +347,17 @@ static inline void tcg_out_movi32(TCGContext *s,
                 tcg_out_dat_imm(s, cond, ARITH_ADD, rd, 15, offset) :
                 tcg_out_dat_imm(s, cond, ARITH_SUB, rd, 15, -offset);
 
+    if (arg < 0x100)
+        tcg_out_dat_imm(s, cond, ARITH_MOV, rd, 0, arg & 0xff);
+
+    shift = ctz32(arg) & ~1;
+    if (!((arg >> shift) & ~0xff))
+        return tcg_out_dat_imm(s, cond, ARITH_MOV, rd, 0, (arg >> shift) + (shift << 7));
+
+    shift = ctz32(~arg) & ~1;
+    if (!((~arg >> shift) & ~0xff))
+        return tcg_out_dat_imm(s, cond, ARITH_MVN, rd, 0, (~arg >> shift) + (shift << 7));
+
 #ifdef __ARM_ARCH_7A__
     /* use movw/movt */
     /* movw */
-- 
1.6.3.2.1299.gee46c


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [Qemu-devel] [PATCH] RFC: TCG constant propagation.
  2009-07-23  9:08 ` Filip Navara
@ 2009-07-23  9:25   ` Laurent Desnogues
  2009-07-23  9:30     ` Paul Brook
  2009-07-23  9:49     ` Filip Navara
  2009-07-23 20:10   ` Stuart Brady
  2009-07-23 22:02   ` Daniel Jacobowitz
  2 siblings, 2 replies; 10+ messages in thread
From: Laurent Desnogues @ 2009-07-23  9:25 UTC (permalink / raw)
  To: Filip Navara; +Cc: qemu-devel, Paul Brook

On Thu, Jul 23, 2009 at 11:08 AM, Filip Navara<filip.navara@gmail.com> wrote:
> Also it looks that the actual code size is worse for TCG ARM target
> with this optimization. To mitigate it I propose the other attached
> patch which uses the barrel shifter for more optimal generation of the
> "movi" operation. I can't test it, so any help is welcome.

This is wrong for two reasons:
   - this is a rotation, you only check for shifts
   - the rotation amount is multiplied by 2 before being applied to the
     8-bit immediate field.


Laurent

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [Qemu-devel] [PATCH] RFC: TCG constant propagation.
  2009-07-23  9:25   ` Laurent Desnogues
@ 2009-07-23  9:30     ` Paul Brook
  2009-07-23  9:49     ` Filip Navara
  1 sibling, 0 replies; 10+ messages in thread
From: Paul Brook @ 2009-07-23  9:30 UTC (permalink / raw)
  To: Laurent Desnogues; +Cc: Filip Navara, qemu-devel

On Thursday 23 July 2009, Laurent Desnogues wrote:
> On Thu, Jul 23, 2009 at 11:08 AM, Filip Navara<filip.navara@gmail.com> 
wrote:
> > Also it looks that the actual code size is worse for TCG ARM target
> > with this optimization. To mitigate it I propose the other attached
> > patch which uses the barrel shifter for more optimal generation of the
> > "movi" operation. I can't test it, so any help is welcome.
>
>    - the rotation amount is multiplied by 2 before being applied to the
>      8-bit immediate field.

This is compensated for by the later << 7

Paul

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [Qemu-devel] [PATCH] RFC: TCG constant propagation.
  2009-07-23  9:25   ` Laurent Desnogues
  2009-07-23  9:30     ` Paul Brook
@ 2009-07-23  9:49     ` Filip Navara
  1 sibling, 0 replies; 10+ messages in thread
From: Filip Navara @ 2009-07-23  9:49 UTC (permalink / raw)
  To: Laurent Desnogues; +Cc: qemu-devel, Paul Brook

[-- Attachment #1: Type: text/plain, Size: 963 bytes --]

On Thu, Jul 23, 2009 at 11:25 AM, Laurent
Desnogues<laurent.desnogues@gmail.com> wrote:
> On Thu, Jul 23, 2009 at 11:08 AM, Filip Navara<filip.navara@gmail.com> wrote:
>> Also it looks that the actual code size is worse for TCG ARM target
>> with this optimization. To mitigate it I propose the other attached
>> patch which uses the barrel shifter for more optimal generation of the
>> "movi" operation. I can't test it, so any help is welcome.
>
> This is wrong for two reasons:
>   - this is a rotation, you only check for shifts

I deliberately didn't do it, but it's surely an opportunity for
further optimization.

>   - the rotation amount is multiplied by 2 before being applied to the
>     8-bit immediate field.

As Paul already pointed out, that's compensated by the shift (<< 7
instead of << 8) and the "& ~1" after ctz32 call.

Attached is fixed version of the patch, I forgot one "return" there.

Best regards,
Filip Navara

[-- Attachment #2: 0002-Use-ARM-barrel-shifter-for-more-optimal-generation-o.patch --]
[-- Type: application/octet-stream, Size: 1566 bytes --]

From d326aabe85412a91c17e0be2ebbfd078bec16713 Mon Sep 17 00:00:00 2001
From: Filip Navara <filip.navara@gmail.com>
Date: Thu, 23 Jul 2009 11:01:32 +0200
Subject: [PATCH 2/2] Use ARM barrel shifter for more optimal generation of MVN, MOV instructions for the MOVI TCG operation.

---
 tcg/arm/tcg-target.c |   12 ++++++++++++
 1 files changed, 12 insertions(+), 0 deletions(-)

diff --git a/tcg/arm/tcg-target.c b/tcg/arm/tcg-target.c
index 7ef2b89..9748ce2 100644
--- a/tcg/arm/tcg-target.c
+++ b/tcg/arm/tcg-target.c
@@ -334,6 +334,7 @@ static inline void tcg_out_movi32(TCGContext *s,
                 int cond, int rd, int32_t arg)
 {
     int offset = (uint32_t) arg - ((uint32_t) s->code_ptr + 8);
+    int shift;
 
     /* TODO: This is very suboptimal, we can easily have a constant
      * pool somewhere after all the instructions.  */
@@ -346,6 +347,17 @@ static inline void tcg_out_movi32(TCGContext *s,
                 tcg_out_dat_imm(s, cond, ARITH_ADD, rd, 15, offset) :
                 tcg_out_dat_imm(s, cond, ARITH_SUB, rd, 15, -offset);
 
+    if (arg < 0x100)
+        return tcg_out_dat_imm(s, cond, ARITH_MOV, rd, 0, arg & 0xff);
+
+    shift = ctz32(arg) & ~1;
+    if (!((arg >> shift) & ~0xff))
+        return tcg_out_dat_imm(s, cond, ARITH_MOV, rd, 0, (arg >> shift) + (shift << 7));
+
+    shift = ctz32(~arg) & ~1;
+    if (!((~arg >> shift) & ~0xff))
+        return tcg_out_dat_imm(s, cond, ARITH_MVN, rd, 0, (~arg >> shift) + (shift << 7));
+
 #ifdef __ARM_ARCH_7A__
     /* use movw/movt */
     /* movw */
-- 
1.6.3.2.1299.gee46c


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [Qemu-devel] [PATCH] RFC: TCG constant propagation.
  2009-07-23  9:08 ` Filip Navara
  2009-07-23  9:25   ` Laurent Desnogues
@ 2009-07-23 20:10   ` Stuart Brady
  2009-07-23 20:28     ` Filip Navara
  2009-07-23 22:02   ` Daniel Jacobowitz
  2 siblings, 1 reply; 10+ messages in thread
From: Stuart Brady @ 2009-07-23 20:10 UTC (permalink / raw)
  To: Filip Navara; +Cc: qemu-devel

+            dest = args[0];
+            src = args[1];
+            if (const_temps[src]) {
+                const_temps[dest] = 1;
+                dest_val = ~temp_values[src];
+                *opc_ptr = INDEX_op_movi_i32;

Hrm... is it really right to be setting *opc_ptr = INDEX_op_movi_i32
even for 64-bit ops?  Applies to both 'not'[0] and the binary ops...

Cheers,
-- 
Stuart Brady

[0] BTW, the #ifdefs will need shuffling if other unary ops (e.g. neg)
are wanted... it's trivial, but I can't see a reason not to do it. :-)

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [Qemu-devel] [PATCH] RFC: TCG constant propagation.
  2009-07-23 20:10   ` Stuart Brady
@ 2009-07-23 20:28     ` Filip Navara
  0 siblings, 0 replies; 10+ messages in thread
From: Filip Navara @ 2009-07-23 20:28 UTC (permalink / raw)
  To: Stuart Brady; +Cc: qemu-devel

On Thu, Jul 23, 2009 at 10:10 PM, Stuart Brady<sdbrady@ntlworld.com> wrote:
> +            dest = args[0];
> +            src = args[1];
> +            if (const_temps[src]) {
> +                const_temps[dest] = 1;
> +                dest_val = ~temp_values[src];
> +                *opc_ptr = INDEX_op_movi_i32;
>
> Hrm... is it really right to be setting *opc_ptr = INDEX_op_movi_i32
> even for 64-bit ops?  Applies to both 'not'[0] and the binary ops...

You are right, another bug :) I wish I could test it on some 64-bit
target, but QEMU is not working on MinGW64 (unsigned long !=
intptr_t).

Thanks,
Filip Navara

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [Qemu-devel] [PATCH] RFC: TCG constant propagation.
  2009-07-23  9:08 ` Filip Navara
  2009-07-23  9:25   ` Laurent Desnogues
  2009-07-23 20:10   ` Stuart Brady
@ 2009-07-23 22:02   ` Daniel Jacobowitz
  2 siblings, 0 replies; 10+ messages in thread
From: Daniel Jacobowitz @ 2009-07-23 22:02 UTC (permalink / raw)
  To: Filip Navara; +Cc: qemu-devel, Paul Brook

On Thu, Jul 23, 2009 at 11:08:47AM +0200, Filip Navara wrote:
> +    if (arg < 0x100)
> +        tcg_out_dat_imm(s, cond, ARITH_MOV, rd, 0, arg & 0xff);

return?


-- 
Daniel Jacobowitz
CodeSourcery

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [Qemu-devel] [PATCH] RFC: TCG constant propagation.
  2009-07-21 14:37 [Qemu-devel] [PATCH] RFC: TCG constant propagation Filip Navara
  2009-07-23  9:08 ` Filip Navara
@ 2009-08-05  8:13 ` Pablo Virolainen
  2009-08-05  8:48   ` Filip Navara
  1 sibling, 1 reply; 10+ messages in thread
From: Pablo Virolainen @ 2009-08-05  8:13 UTC (permalink / raw)
  To: qemu-devel

[-- Attachment #1: Type: text/plain, Size: 743 bytes --]

Filip Navara kirjoitti:
> Add support for constant propagation to TCG. This has to be paired with the liveness
> analysis to remove the dead code. Not all possible operations are covered, but the
> most common ones are. This improves the code generation for several ARM instructions,
> like MVN (immediate), and it may help other targets as well.

On my small benchmark, qemu-system-sh4 was about 3% slower on Intel Xeon 
E5405@2.00GHz. I'm running 64-bit mode. My mini benchmark is to build 
zlib 1.2.3, so it's 'real' world work load. Ran the benchmark several 
times and results seems to be pretty constant.

ps. I added INDEX_op_*_i64 cases to the evaluation part. I'm not 
completely sure if those &mask should be there.

Pablo Virolainen

[-- Attachment #2: const.patch --]
[-- Type: text/x-patch, Size: 6493 bytes --]

--- qemu-0.11.0-rc1_orig/tcg/tcg.c	2009-07-30 03:38:26.000000000 +0300
+++ qemu-0.11.0-rc1/tcg/tcg.c	2009-08-05 10:43:48.000000000 +0300
@@ -1021,7 +1021,194 @@
 #endif
         tdefs++;
     }
+}
 
+static void tcg_const_analysis(TCGContext *s)
+{
+    int nb_cargs, nb_iargs, nb_oargs, dest, src, src2, del_args, i;
+    TCGArg *args;
+    uint16_t op;
+    uint16_t *opc_ptr;
+    const TCGOpDef *def;
+    uint8_t *const_temps;
+    tcg_target_ulong *temp_values;
+    tcg_target_ulong val, mask;
+    tcg_target_ulong dest_val, src_val, src2_val;
+
+    const_temps = tcg_malloc(s->nb_temps);
+    memset(const_temps, 0, s->nb_temps);
+    temp_values = tcg_malloc(s->nb_temps * sizeof(uint32_t));
+
+    opc_ptr = gen_opc_buf;
+    args = gen_opparam_buf;
+    while (opc_ptr < gen_opc_ptr) {
+        op = *opc_ptr;
+        def = &tcg_op_defs[op];
+        nb_oargs = def->nb_oargs;
+        nb_iargs = def->nb_iargs;
+        nb_cargs = def->nb_cargs;
+        del_args = 0;
+        mask = ~((tcg_target_ulong)0);
+
+        switch(op) {
+        case INDEX_op_movi_i32:
+#if TCG_TARGET_REG_BITS == 64
+        case INDEX_op_movi_i64:
+#endif
+            dest = args[0];
+            val = args[1];
+            const_temps[dest] = 1;
+            temp_values[dest] = val;
+            break;
+        case INDEX_op_mov_i32:
+#if TCG_TARGET_REG_BITS == 64
+        case INDEX_op_mov_i64:
+#endif
+            dest = args[0];
+            src = args[1];
+            const_temps[dest] = const_temps[src];
+            temp_values[dest] = temp_values[src];
+            break;
+        case INDEX_op_not_i32:
+#if TCG_TARGET_REG_BITS == 64
+            mask = 0xffffffff;
+        case INDEX_op_not_i64:
+#endif
+            dest = args[0];
+            src = args[1];
+            if (const_temps[src]) {
+                const_temps[dest] = 1;
+                dest_val = ~temp_values[src];
+                *opc_ptr = INDEX_op_movi_i32;
+                args[1] = temp_values[dest] = dest_val & mask;
+            } else {
+                const_temps[dest] = 0;
+            }
+            break;
+        case INDEX_op_add_i32:
+        case INDEX_op_sub_i32:
+        case INDEX_op_mul_i32:
+        case INDEX_op_and_i32:
+        case INDEX_op_or_i32:
+        case INDEX_op_xor_i32:
+        case INDEX_op_shl_i32:
+        case INDEX_op_shr_i32:
+#if TCG_TARGET_REG_BITS == 64
+            mask = 0xffffffff;
+        case INDEX_op_add_i64:
+        case INDEX_op_sub_i64:
+        case INDEX_op_mul_i64:
+        case INDEX_op_and_i64:
+        case INDEX_op_or_i64:
+        case INDEX_op_xor_i64:
+        case INDEX_op_shl_i64:
+        case INDEX_op_shr_i64:
+#endif
+
+            dest = args[0];
+            src = args[1];
+            src2 = args[2];
+            if (const_temps[src] && const_temps[src2]) {
+                src_val = temp_values[src];
+                src2_val = temp_values[src2];
+                const_temps[dest] = 1;
+                switch (op) {
+                case INDEX_op_add_i32:
+                    dest_val = src_val + src2_val;
+                    break;
+                case INDEX_op_add_i64:
+		    dest_val = (src_val + src2_val) & mask;
+		    break;
+                case INDEX_op_sub_i32:
+                    dest_val = src_val - src2_val;
+                    break;
+                case INDEX_op_sub_i64:
+		    dest_val = (src_val - src2_val) & mask;
+		    break;
+                case INDEX_op_mul_i32:
+                    dest_val = src_val * src2_val;
+                    break;
+                case INDEX_op_mul_i64:
+		    dest_val = (src_val * src2_val) & mask;
+		    break;
+                case INDEX_op_and_i32:
+                    dest_val = src_val & src2_val;
+                    break;
+                case INDEX_op_and_i64:
+		    dest_val = src_val & src2_val & mask;
+		    break;
+                case INDEX_op_or_i32:
+                    dest_val = src_val | src2_val;
+                    break;
+                case INDEX_op_or_i64:
+		    dest_val = (src_val | src2_val) & mask;
+		    break;
+                case INDEX_op_xor_i32:
+                    dest_val = src_val ^ src2_val;
+                    break;
+                case INDEX_op_xor_i64:
+		    dest_val = (src_val ^ src2_val) & mask;
+		    break;
+                case INDEX_op_shl_i32:
+                    dest_val = src_val << src2_val;
+                    break;
+                case INDEX_op_shl_i64:
+		    dest_val = (src_val << src2_val) & mask;
+		    break;
+                case INDEX_op_shr_i32:
+                    dest_val = src_val >> src2_val;
+                    break;
+                case INDEX_op_shr_i64:
+		    dest_val = (src_val >> src2_val) & mask;
+		    break;
+                default:
+		  fprintf(stderr,"index op %i\n",op);
+                    tcg_abort();
+                    return;
+                }
+                *opc_ptr = INDEX_op_movi_i32;                
+                args[1] = temp_values[dest] = dest_val & mask;
+                del_args = 1;
+            } else {
+                const_temps[dest] = 0;
+            }
+            break;
+        case INDEX_op_call:
+            nb_oargs = args[0] >> 16;
+            nb_iargs = args[0] & 0xffff;
+            nb_cargs = def->nb_cargs;
+            args++;
+            for (i = 0; i < nb_oargs; i++) {
+                const_temps[args[i]] = 0;
+            }
+            break;
+        case INDEX_op_nopn:
+            /* variable number of arguments */
+            nb_cargs = args[0];
+            break;
+        case INDEX_op_set_label:
+            memset(const_temps, 0, s->nb_temps);
+            break;
+        default:
+            if (def->flags & TCG_OPF_BB_END) {
+                memset(const_temps, 0, s->nb_temps);
+            } else {
+                for (i = 0; i < nb_oargs; i++) {
+                    const_temps[args[i]] = 0;
+                }
+            }
+            break;
+        }
+        opc_ptr++;
+        args += nb_iargs + nb_oargs + nb_cargs - del_args;
+        if (del_args > 0) {
+            gen_opparam_ptr -= del_args;
+            memmove(args, args + del_args, (gen_opparam_ptr - args) * sizeof(*args));
+        }
+    }
+
+    if (args != gen_opparam_ptr)
+        tcg_abort();
 }
 
 #ifdef USE_LIVENESS_ANALYSIS
@@ -1891,6 +2078,8 @@
     }
 #endif
 
+    tcg_const_analysis(s);
+
 #ifdef CONFIG_PROFILER
     s->la_time -= profile_getclock();
 #endif

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [Qemu-devel] [PATCH] RFC: TCG constant propagation.
  2009-08-05  8:13 ` Pablo Virolainen
@ 2009-08-05  8:48   ` Filip Navara
  0 siblings, 0 replies; 10+ messages in thread
From: Filip Navara @ 2009-08-05  8:48 UTC (permalink / raw)
  To: Pablo Virolainen; +Cc: qemu-devel

On Wed, Aug 5, 2009 at 10:13 AM, Pablo
Virolainen<Pablo.Virolainen@nomovok.com> wrote:
> Filip Navara kirjoitti:
>>
>> Add support for constant propagation to TCG. This has to be paired with
>> the liveness
>> analysis to remove the dead code. Not all possible operations are covered,
>> but the
>> most common ones are. This improves the code generation for several ARM
>> instructions,
>> like MVN (immediate), and it may help other targets as well.
>
> On my small benchmark, qemu-system-sh4 was about 3% slower on Intel Xeon
> E5405@2.00GHz. I'm running 64-bit mode. My mini benchmark is to build zlib
> 1.2.3, so it's a 'real-world' workload. Ran the benchmark several times and
> results seem to be pretty constant.

Thanks for testing and reporting the results. I'll see if I can reduce
the overhead or if I should ditch the patch.

> ps. I added INDEX_op_*_i64 cases to the evaluation part. I'm not completely
> sure if those &mask should be there.

I've rewritten the patch and fixed the 64-bit mess that was there. It
is available at
http://repo.or.cz/w/qemu/navara.git?a=commit;h=5df3a524fc0b923cf2e5e1883ff550d055d36eb5

Best regards,
Filip Navara

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2009-08-05  8:48 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2009-07-21 14:37 [Qemu-devel] [PATCH] RFC: TCG constant propagation Filip Navara
2009-07-23  9:08 ` Filip Navara
2009-07-23  9:25   ` Laurent Desnogues
2009-07-23  9:30     ` Paul Brook
2009-07-23  9:49     ` Filip Navara
2009-07-23 20:10   ` Stuart Brady
2009-07-23 20:28     ` Filip Navara
2009-07-23 22:02   ` Daniel Jacobowitz
2009-08-05  8:13 ` Pablo Virolainen
2009-08-05  8:48   ` Filip Navara

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).