* Re: [Qemu-devel] [PATCH] RFC: TCG constant propagation.
2009-07-21 14:37 [Qemu-devel] [PATCH] RFC: TCG constant propagation Filip Navara
@ 2009-07-23 9:08 ` Filip Navara
2009-07-23 9:25 ` Laurent Desnogues
` (2 more replies)
2009-08-05 8:13 ` Pablo Virolainen
1 sibling, 3 replies; 10+ messages in thread
From: Filip Navara @ 2009-07-23 9:08 UTC (permalink / raw)
To: qemu-devel; +Cc: Paul Brook
[-- Attachment #1: Type: text/plain, Size: 988 bytes --]
On Tue, Jul 21, 2009 at 4:38 PM, Filip Navara<filip.navara@gmail.com> wrote:
> Add support for constant propagation to TCG. This has to be paired with the liveness
> analysis to remove the dead code. Not all possible operations are covered, but the
> most common ones are. This improves the code generation for several ARM instructions,
> like MVN (immediate), and it may help other targets as well.
> ---
> tcg/tcg.c | 164 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> 1 files changed, 164 insertions(+), 0 deletions(-)
Attached is updated version of the patch which adds profiling code for
the TCG pass and hopefully fixes it for TCG 64-bit targets.
Also it looks like the actual code size is worse for the TCG ARM target
with this optimization. To mitigate it, I propose the other attached
patch, which uses the barrel shifter for more optimal generation of the
"movi" operation. I can't test it, so any help is welcome.
Best regards,
Filip Navara
[-- Attachment #2: 0001-TCG-constant-propagation.patch --]
[-- Type: application/octet-stream, Size: 8340 bytes --]
From 403c7d19df34fb25a8b57d90000697a83488cd48 Mon Sep 17 00:00:00 2001
From: Filip Navara <filip.navara@gmail.com>
Date: Tue, 21 Jul 2009 16:31:40 +0200
Subject: [PATCH 1/2] TCG constant propagation.
Add support for constant propagation to TCG. This has to be paired with the liveness
analysis to remove the dead code. Not all possible operations are covered, but the
most common ones are. This improves the code generation for several ARM instructions,
like MVN (immediate), and it may help other targets as well.
v1 -> v2:
Added profiling code and hopefully fixed for 64-bit TCG targets.
---
tcg/tcg.c | 202 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
tcg/tcg.h | 1 +
2 files changed, 203 insertions(+), 0 deletions(-)
diff --git a/tcg/tcg.c b/tcg/tcg.c
index 4cb5934..e23e303 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -1025,7 +1025,199 @@ void tcg_add_target_add_op_defs(const TCGTargetOpDef *tdefs)
#endif
tdefs++;
}
+}
+
+static void tcg_const_analysis(TCGContext *s)
+{
+ int nb_cargs, nb_iargs, nb_oargs, dest, src, src2, del_args, i;
+ TCGArg *args;
+ uint16_t op;
+ uint16_t *opc_ptr;
+ const TCGOpDef *def;
+ uint8_t *const_temps;
+ tcg_target_ulong *temp_values;
+ tcg_target_ulong val, mask;
+ tcg_target_ulong dest_val, src_val, src2_val;
+
+ const_temps = tcg_malloc(s->nb_temps);
+ memset(const_temps, 0, s->nb_temps);
+ temp_values = tcg_malloc(s->nb_temps * sizeof(uint32_t));
+
+ opc_ptr = gen_opc_buf;
+ args = gen_opparam_buf;
+ while (opc_ptr < gen_opc_ptr) {
+ op = *opc_ptr;
+ def = &tcg_op_defs[op];
+ nb_oargs = def->nb_oargs;
+ nb_iargs = def->nb_iargs;
+ nb_cargs = def->nb_cargs;
+ del_args = 0;
+ mask = ~((tcg_target_ulong)0);
+
+ switch(op) {
+ case INDEX_op_movi_i32:
+#if TCG_TARGET_REG_BITS == 64
+ case INDEX_op_movi_i64:
+#endif
+ dest = args[0];
+ val = args[1];
+ const_temps[dest] = 1;
+ temp_values[dest] = val;
+ break;
+ case INDEX_op_mov_i32:
+#if TCG_TARGET_REG_BITS == 64
+ case INDEX_op_mov_i64:
+#endif
+ dest = args[0];
+ src = args[1];
+ const_temps[dest] = const_temps[src];
+ temp_values[dest] = temp_values[src];
+ break;
+#if defined(TCG_TARGET_HAS_not_i32) || defined(TCG_TARGET_HAS_not_i64)
+#ifdef TCG_TARGET_HAS_not_i32
+ case INDEX_op_not_i32:
+#endif
+#if TCG_TARGET_REG_BITS == 64
+ mask = 0xffffffff;
+#ifdef TCG_TARGET_HAS_not_i64
+ case INDEX_op_not_i64:
+#endif
+#endif
+ dest = args[0];
+ src = args[1];
+ if (const_temps[src]) {
+ const_temps[dest] = 1;
+ dest_val = ~temp_values[src];
+ *opc_ptr = INDEX_op_movi_i32;
+ args[1] = temp_values[dest] = dest_val & mask;
+ } else {
+ const_temps[dest] = 0;
+ }
+ break;
+#endif
+ case INDEX_op_add_i32:
+ case INDEX_op_sub_i32:
+ case INDEX_op_mul_i32:
+ case INDEX_op_and_i32:
+ case INDEX_op_or_i32:
+ case INDEX_op_xor_i32:
+ case INDEX_op_shl_i32:
+ case INDEX_op_shr_i32:
+#if TCG_TARGET_REG_BITS == 64
+ mask = 0xffffffff;
+ case INDEX_op_add_i64:
+ case INDEX_op_sub_i64:
+ case INDEX_op_mul_i64:
+ case INDEX_op_and_i64:
+ case INDEX_op_or_i64:
+ case INDEX_op_xor_i64:
+ case INDEX_op_shl_i64:
+ case INDEX_op_shr_i64:
+#endif
+
+ dest = args[0];
+ src = args[1];
+ src2 = args[2];
+ if (const_temps[src] && const_temps[src2]) {
+ src_val = temp_values[src];
+ src2_val = temp_values[src2];
+ const_temps[dest] = 1;
+ switch (op) {
+ case INDEX_op_add_i32:
+#if TCG_TARGET_REG_BITS == 64
+ case INDEX_op_add_i64:
+#endif
+ dest_val = src_val + src2_val;
+ break;
+ case INDEX_op_sub_i32:
+#if TCG_TARGET_REG_BITS == 64
+ case INDEX_op_sub_i64:
+#endif
+ dest_val = src_val - src2_val;
+ break;
+ case INDEX_op_mul_i32:
+#if TCG_TARGET_REG_BITS == 64
+ case INDEX_op_mul_i64:
+#endif
+ dest_val = src_val * src2_val;
+ break;
+ case INDEX_op_and_i32:
+#if TCG_TARGET_REG_BITS == 64
+ case INDEX_op_and_i64:
+#endif
+ dest_val = src_val & src2_val;
+ break;
+ case INDEX_op_or_i32:
+#if TCG_TARGET_REG_BITS == 64
+ case INDEX_op_or_i64:
+#endif
+ dest_val = src_val | src2_val;
+ break;
+ case INDEX_op_xor_i32:
+#if TCG_TARGET_REG_BITS == 64
+ case INDEX_op_xor_i64:
+#endif
+ dest_val = src_val ^ src2_val;
+ break;
+ case INDEX_op_shl_i32:
+#if TCG_TARGET_REG_BITS == 64
+ case INDEX_op_shl_i64:
+#endif
+ dest_val = src_val << src2_val;
+ break;
+ case INDEX_op_shr_i32:
+#if TCG_TARGET_REG_BITS == 64
+ case INDEX_op_shr_i64:
+#endif
+ dest_val = src_val >> src2_val;
+ break;
+ default:
+ tcg_abort();
+ return;
+ }
+ *opc_ptr = INDEX_op_movi_i32;
+ args[1] = temp_values[dest] = dest_val & mask;
+ del_args = 1;
+ } else {
+ const_temps[dest] = 0;
+ }
+ break;
+ case INDEX_op_call:
+ nb_oargs = args[0] >> 16;
+ nb_iargs = args[0] & 0xffff;
+ nb_cargs = def->nb_cargs;
+ args++;
+ for (i = 0; i < nb_oargs; i++) {
+ const_temps[args[i]] = 0;
+ }
+ break;
+ case INDEX_op_nopn:
+ /* variable number of arguments */
+ nb_cargs = args[0];
+ break;
+ case INDEX_op_set_label:
+ memset(const_temps, 0, s->nb_temps);
+ break;
+ default:
+ if (def->flags & TCG_OPF_BB_END) {
+ memset(const_temps, 0, s->nb_temps);
+ } else {
+ for (i = 0; i < nb_oargs; i++) {
+ const_temps[args[i]] = 0;
+ }
+ }
+ break;
+ }
+ opc_ptr++;
+ args += nb_iargs + nb_oargs + nb_cargs - del_args;
+ if (del_args > 0) {
+ gen_opparam_ptr -= del_args;
+ memmove(args, args + del_args, (gen_opparam_ptr - args) * sizeof(*args));
+ }
+ }
+ if (args != gen_opparam_ptr)
+ tcg_abort();
}
#ifdef USE_LIVENESS_ANALYSIS
@@ -1896,6 +2088,14 @@ static inline int tcg_gen_code_common(TCGContext *s, uint8_t *gen_code_buf,
#endif
#ifdef CONFIG_PROFILER
+ s->const_time -= profile_getclock();
+#endif
+ tcg_const_analysis(s);
+#ifdef CONFIG_PROFILER
+ s->const_time += profile_getclock();
+#endif
+
+#ifdef CONFIG_PROFILER
s->la_time -= profile_getclock();
#endif
tcg_liveness_analysis(s);
@@ -2068,6 +2268,8 @@ void tcg_dump_info(FILE *f,
(double)s->interm_time / tot * 100.0);
cpu_fprintf(f, " gen_code time %0.1f%%\n",
(double)s->code_time / tot * 100.0);
+ cpu_fprintf(f, "const/code time %0.1f%%\n",
+ (double)s->const_time / (s->code_time ? s->code_time : 1) * 100.0);
cpu_fprintf(f, "liveness/code time %0.1f%%\n",
(double)s->la_time / (s->code_time ? s->code_time : 1) * 100.0);
cpu_fprintf(f, "cpu_restore count %" PRId64 "\n",
diff --git a/tcg/tcg.h b/tcg/tcg.h
index ad0bd14..f8322f2 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -288,6 +288,7 @@ struct TCGContext {
int64_t code_out_len;
int64_t interm_time;
int64_t code_time;
+ int64_t const_time;
int64_t la_time;
int64_t restore_count;
int64_t restore_time;
--
1.6.3.2.1299.gee46c
[-- Attachment #3: 0002-Use-ARM-barrel-shifter-for-more-optimal-generation-o.patch --]
[-- Type: application/octet-stream, Size: 1559 bytes --]
From 203b7284477109415989d44b71dbd149d05caadf Mon Sep 17 00:00:00 2001
From: Filip Navara <filip.navara@gmail.com>
Date: Thu, 23 Jul 2009 11:01:32 +0200
Subject: [PATCH 2/2] Use ARM barrel shifter for more optimal generation of MVN, MOV instructions for the MOVI TCG operation.
---
tcg/arm/tcg-target.c | 12 ++++++++++++
1 files changed, 12 insertions(+), 0 deletions(-)
diff --git a/tcg/arm/tcg-target.c b/tcg/arm/tcg-target.c
index 7ef2b89..0c59984 100644
--- a/tcg/arm/tcg-target.c
+++ b/tcg/arm/tcg-target.c
@@ -334,6 +334,7 @@ static inline void tcg_out_movi32(TCGContext *s,
int cond, int rd, int32_t arg)
{
int offset = (uint32_t) arg - ((uint32_t) s->code_ptr + 8);
+ int shift;
/* TODO: This is very suboptimal, we can easily have a constant
* pool somewhere after all the instructions. */
@@ -346,6 +347,17 @@ static inline void tcg_out_movi32(TCGContext *s,
tcg_out_dat_imm(s, cond, ARITH_ADD, rd, 15, offset) :
tcg_out_dat_imm(s, cond, ARITH_SUB, rd, 15, -offset);
+ if (arg < 0x100)
+ tcg_out_dat_imm(s, cond, ARITH_MOV, rd, 0, arg & 0xff);
+
+ shift = ctz32(arg) & ~1;
+ if (!((arg >> shift) & ~0xff))
+ return tcg_out_dat_imm(s, cond, ARITH_MOV, rd, 0, (arg >> shift) + (shift << 7));
+
+ shift = ctz32(~arg) & ~1;
+ if (!((~arg >> shift) & ~0xff))
+ return tcg_out_dat_imm(s, cond, ARITH_MVN, rd, 0, (~arg >> shift) + (shift << 7));
+
#ifdef __ARM_ARCH_7A__
/* use movw/movt */
/* movw */
--
1.6.3.2.1299.gee46c
^ permalink raw reply related [flat|nested] 10+ messages in thread
* Re: [Qemu-devel] [PATCH] RFC: TCG constant propagation.
2009-07-21 14:37 [Qemu-devel] [PATCH] RFC: TCG constant propagation Filip Navara
2009-07-23 9:08 ` Filip Navara
@ 2009-08-05 8:13 ` Pablo Virolainen
2009-08-05 8:48 ` Filip Navara
1 sibling, 1 reply; 10+ messages in thread
From: Pablo Virolainen @ 2009-08-05 8:13 UTC (permalink / raw)
To: qemu-devel
[-- Attachment #1: Type: text/plain, Size: 743 bytes --]
Filip Navara kirjoitti:
> Add support for constant propagation to TCG. This has to be paired with the liveness
> analysis to remove the dead code. Not all possible operations are covered, but the
> most common ones are. This improves the code generation for several ARM instructions,
> like MVN (immediate), and it may help other targets as well.
On my small benchmark, qemu-system-sh4 was about 3% slower on an Intel Xeon
E5405@2.00GHz. I'm running in 64-bit mode. My mini benchmark is to build
zlib 1.2.3, so it's a 'real-world' workload. I ran the benchmark several
times and the results seem to be pretty consistent.
P.S. I added INDEX_op_*_i64 cases to the evaluation part. I'm not
completely sure if those &mask should be there.
Pablo Virolainen
[-- Attachment #2: const.patch --]
[-- Type: text/x-patch, Size: 6493 bytes --]
--- qemu-0.11.0-rc1_orig/tcg/tcg.c 2009-07-30 03:38:26.000000000 +0300
+++ qemu-0.11.0-rc1/tcg/tcg.c 2009-08-05 10:43:48.000000000 +0300
@@ -1021,7 +1021,194 @@
#endif
tdefs++;
}
+}
+static void tcg_const_analysis(TCGContext *s)
+{
+ int nb_cargs, nb_iargs, nb_oargs, dest, src, src2, del_args, i;
+ TCGArg *args;
+ uint16_t op;
+ uint16_t *opc_ptr;
+ const TCGOpDef *def;
+ uint8_t *const_temps;
+ tcg_target_ulong *temp_values;
+ tcg_target_ulong val, mask;
+ tcg_target_ulong dest_val, src_val, src2_val;
+
+ const_temps = tcg_malloc(s->nb_temps);
+ memset(const_temps, 0, s->nb_temps);
+ temp_values = tcg_malloc(s->nb_temps * sizeof(uint32_t));
+
+ opc_ptr = gen_opc_buf;
+ args = gen_opparam_buf;
+ while (opc_ptr < gen_opc_ptr) {
+ op = *opc_ptr;
+ def = &tcg_op_defs[op];
+ nb_oargs = def->nb_oargs;
+ nb_iargs = def->nb_iargs;
+ nb_cargs = def->nb_cargs;
+ del_args = 0;
+ mask = ~((tcg_target_ulong)0);
+
+ switch(op) {
+ case INDEX_op_movi_i32:
+#if TCG_TARGET_REG_BITS == 64
+ case INDEX_op_movi_i64:
+#endif
+ dest = args[0];
+ val = args[1];
+ const_temps[dest] = 1;
+ temp_values[dest] = val;
+ break;
+ case INDEX_op_mov_i32:
+#if TCG_TARGET_REG_BITS == 64
+ case INDEX_op_mov_i64:
+#endif
+ dest = args[0];
+ src = args[1];
+ const_temps[dest] = const_temps[src];
+ temp_values[dest] = temp_values[src];
+ break;
+ case INDEX_op_not_i32:
+#if TCG_TARGET_REG_BITS == 64
+ mask = 0xffffffff;
+ case INDEX_op_not_i64:
+#endif
+ dest = args[0];
+ src = args[1];
+ if (const_temps[src]) {
+ const_temps[dest] = 1;
+ dest_val = ~temp_values[src];
+ *opc_ptr = INDEX_op_movi_i32;
+ args[1] = temp_values[dest] = dest_val & mask;
+ } else {
+ const_temps[dest] = 0;
+ }
+ break;
+ case INDEX_op_add_i32:
+ case INDEX_op_sub_i32:
+ case INDEX_op_mul_i32:
+ case INDEX_op_and_i32:
+ case INDEX_op_or_i32:
+ case INDEX_op_xor_i32:
+ case INDEX_op_shl_i32:
+ case INDEX_op_shr_i32:
+#if TCG_TARGET_REG_BITS == 64
+ mask = 0xffffffff;
+ case INDEX_op_add_i64:
+ case INDEX_op_sub_i64:
+ case INDEX_op_mul_i64:
+ case INDEX_op_and_i64:
+ case INDEX_op_or_i64:
+ case INDEX_op_xor_i64:
+ case INDEX_op_shl_i64:
+ case INDEX_op_shr_i64:
+#endif
+
+ dest = args[0];
+ src = args[1];
+ src2 = args[2];
+ if (const_temps[src] && const_temps[src2]) {
+ src_val = temp_values[src];
+ src2_val = temp_values[src2];
+ const_temps[dest] = 1;
+ switch (op) {
+ case INDEX_op_add_i32:
+ dest_val = src_val + src2_val;
+ break;
+ case INDEX_op_add_i64:
+ dest_val = (src_val + src2_val) & mask;
+ break;
+ case INDEX_op_sub_i32:
+ dest_val = src_val - src2_val;
+ break;
+ case INDEX_op_sub_i64:
+ dest_val = (src_val - src2_val) & mask;
+ break;
+ case INDEX_op_mul_i32:
+ dest_val = src_val * src2_val;
+ break;
+ case INDEX_op_mul_i64:
+ dest_val = (src_val * src2_val) & mask;
+ break;
+ case INDEX_op_and_i32:
+ dest_val = src_val & src2_val;
+ break;
+ case INDEX_op_and_i64:
+ dest_val = src_val & src2_val & mask;
+ break;
+ case INDEX_op_or_i32:
+ dest_val = src_val | src2_val;
+ break;
+ case INDEX_op_or_i64:
+ dest_val = (src_val | src2_val) & mask;
+ break;
+ case INDEX_op_xor_i32:
+ dest_val = src_val ^ src2_val;
+ break;
+ case INDEX_op_xor_i64:
+ dest_val = (src_val ^ src2_val) & mask;
+ break;
+ case INDEX_op_shl_i32:
+ dest_val = src_val << src2_val;
+ break;
+ case INDEX_op_shl_i64:
+ dest_val = (src_val << src2_val) & mask;
+ break;
+ case INDEX_op_shr_i32:
+ dest_val = src_val >> src2_val;
+ break;
+ case INDEX_op_shr_i64:
+ dest_val = (src_val >> src2_val) & mask;
+ break;
+ default:
+ fprintf(stderr,"index op %i\n",op);
+ tcg_abort();
+ return;
+ }
+ *opc_ptr = INDEX_op_movi_i32;
+ args[1] = temp_values[dest] = dest_val & mask;
+ del_args = 1;
+ } else {
+ const_temps[dest] = 0;
+ }
+ break;
+ case INDEX_op_call:
+ nb_oargs = args[0] >> 16;
+ nb_iargs = args[0] & 0xffff;
+ nb_cargs = def->nb_cargs;
+ args++;
+ for (i = 0; i < nb_oargs; i++) {
+ const_temps[args[i]] = 0;
+ }
+ break;
+ case INDEX_op_nopn:
+ /* variable number of arguments */
+ nb_cargs = args[0];
+ break;
+ case INDEX_op_set_label:
+ memset(const_temps, 0, s->nb_temps);
+ break;
+ default:
+ if (def->flags & TCG_OPF_BB_END) {
+ memset(const_temps, 0, s->nb_temps);
+ } else {
+ for (i = 0; i < nb_oargs; i++) {
+ const_temps[args[i]] = 0;
+ }
+ }
+ break;
+ }
+ opc_ptr++;
+ args += nb_iargs + nb_oargs + nb_cargs - del_args;
+ if (del_args > 0) {
+ gen_opparam_ptr -= del_args;
+ memmove(args, args + del_args, (gen_opparam_ptr - args) * sizeof(*args));
+ }
+ }
+
+ if (args != gen_opparam_ptr)
+ tcg_abort();
}
#ifdef USE_LIVENESS_ANALYSIS
@@ -1891,6 +2078,8 @@
}
#endif
+ tcg_const_analysis(s);
+
#ifdef CONFIG_PROFILER
s->la_time -= profile_getclock();
#endif
^ permalink raw reply [flat|nested] 10+ messages in thread