* [Qemu-devel] [RFC][PATCH v0 1/8] Compute additional liveness information for register allocator.
2011-05-23 14:40 [Qemu-devel] [RFC][PATCH v0 0/8] Improve register allocator Kirill Batuzov
@ 2011-05-23 14:40 ` Kirill Batuzov
2011-05-23 14:40 ` [Qemu-devel] [RFC][PATCH v0 2/8] Propagate REG_NEXT_USE value through process of register allocation Kirill Batuzov
` (7 subsequent siblings)
8 siblings, 0 replies; 16+ messages in thread
From: Kirill Batuzov @ 2011-05-23 14:40 UTC (permalink / raw)
To: qemu-devel; +Cc: zhur
Compute next use for each operation argument.
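As an illustration only (a self-contained toy in plain C, with made-up temp
numbers and a fixed three-slot argument layout instead of TCG's real
gen_opc/gen_opparam buffers), the backward scan added here boils down to:
#include <stdio.h>
enum { NB_OPS = 3, NB_ARGS = 3, NB_TEMPS = 4 };
int main(void)
{
    /* Toy op list: slot 0 is the output temp, slots 1-2 are inputs,
     * -1 means the slot is unused.
     *   op 0: t2 = t0 + t1
     *   op 1: t3 = t2 * t2
     *   op 2: store t3
     */
    static const int ops[NB_OPS][NB_ARGS] = {
        {  2,  0,  1 },
        {  3,  2,  2 },
        { -1,  3, -1 },
    };
    int temp_next_use[NB_TEMPS];
    int next_use[NB_OPS][NB_ARGS];
    int op, a, t;
    for (t = 0; t < NB_TEMPS; t++) {
        temp_next_use[t] = -1;              /* no later use seen yet */
    }
    for (op = NB_OPS - 1; op >= 0; op--) {  /* walk the ops backwards */
        /* record, for every argument, where its temp is used next */
        for (a = 0; a < NB_ARGS; a++) {
            t = ops[op][a];
            next_use[op][a] = (t < 0) ? -1 : temp_next_use[t];
        }
        /* the output temp dies here ... */
        if (ops[op][0] >= 0) {
            temp_next_use[ops[op][0]] = -1;
        }
        /* ... and the input temps are now next used at this op */
        for (a = 1; a < NB_ARGS; a++) {
            if (ops[op][a] >= 0) {
                temp_next_use[ops[op][a]] = op;
            }
        }
    }
    for (op = 0; op < NB_OPS; op++) {
        printf("op %d: next use of args = %d %d %d\n",
               op, next_use[op][0], next_use[op][1], next_use[op][2]);
    }
    return 0;
}
For the toy sequence this prints next-use indices 1/-1/-1 for op 0,
2/-1/-1 for op 1 and -1/-1/-1 for op 2; -1 marks arguments whose temp is
never read again.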
Signed-off-by: Kirill Batuzov <batuzovk@ispras.ru>
---
tcg/tcg.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
tcg/tcg.h | 4 +++
2 files changed, 71 insertions(+), 6 deletions(-)
diff --git a/tcg/tcg.c b/tcg/tcg.c
index 8748c05..821ffa7 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -24,6 +24,7 @@
/* define it to use liveness analysis (better code) */
#define USE_LIVENESS_ANALYSIS
+#define USE_ADVANCED_REGALLOC
#include "config.h"
@@ -1177,7 +1178,8 @@ static inline void tcg_la_func_end(TCGContext *s, uint8_t *dead_temps)
/* liveness analysis: end of basic block: globals are live, temps are
dead, local temps are live. */
-static inline void tcg_la_bb_end(TCGContext *s, uint8_t *dead_temps)
+static inline void tcg_la_bb_end(TCGContext *s, uint8_t *dead_temps,
+ int *temp_next_use)
{
int i;
TCGTemp *ts;
@@ -1185,10 +1187,14 @@ static inline void tcg_la_bb_end(TCGContext *s, uint8_t *dead_temps)
memset(dead_temps, 0, s->nb_globals);
ts = &s->temps[s->nb_globals];
for(i = s->nb_globals; i < s->nb_temps; i++) {
- if (ts->temp_local)
+ if (ts->temp_local) {
dead_temps[i] = 0;
- else
+ } else {
dead_temps[i] = 1;
+#ifdef USE_ADVANCED_REGALLOC
+ temp_next_use[i] = -1;
+#endif
+ }
ts++;
}
}
@@ -1201,18 +1207,30 @@ static void tcg_liveness_analysis(TCGContext *s)
int i, op_index, nb_args, nb_iargs, nb_oargs, arg, nb_ops;
TCGOpcode op;
TCGArg *args;
+ int *next_use_ptr = NULL;
const TCGOpDef *def;
uint8_t *dead_temps;
+ int *temp_next_use = NULL;
unsigned int dead_iargs;
-
+
gen_opc_ptr++; /* skip end */
nb_ops = gen_opc_ptr - gen_opc_buf;
s->op_dead_iargs = tcg_malloc(nb_ops * sizeof(uint16_t));
+
+#ifdef USE_ADVANCED_REGALLOC
+ nb_args = gen_opparam_ptr - gen_opparam_buf;
+ s->param_next_use = tcg_malloc(nb_args * sizeof(s->param_next_use[0]));
+ next_use_ptr = s->param_next_use + nb_args;
+#endif
dead_temps = tcg_malloc(s->nb_temps);
memset(dead_temps, 1, s->nb_temps);
+#ifdef USE_ADVANCED_REGALLOC
+ temp_next_use = tcg_malloc(s->nb_temps * sizeof(temp_next_use[0]));
+ memset(temp_next_use, -1, s->nb_temps * sizeof(temp_next_use[0]));
+#endif
args = gen_opparam_ptr;
op_index = nb_ops - 1;
@@ -1226,9 +1244,11 @@ static void tcg_liveness_analysis(TCGContext *s)
nb_args = args[-1];
args -= nb_args;
+ next_use_ptr -= nb_args;
nb_iargs = args[0] & 0xffff;
nb_oargs = args[0] >> 16;
args++;
+ next_use_ptr++;
call_flags = args[nb_oargs + nb_iargs];
/* pure functions can be removed if their result is not
@@ -1244,10 +1264,23 @@ static void tcg_liveness_analysis(TCGContext *s)
} else {
do_not_remove_call:
+#ifdef USE_ADVANCED_REGALLOC
+ for (i = 0; i < nb_iargs + nb_oargs; i++) {
+ if (!dead_temps[args[i]]) {
+ next_use_ptr[i] = temp_next_use[args[i]];
+ } else {
+ next_use_ptr[i] = -1;
+ }
+ }
+#endif
+
/* output args are dead */
for(i = 0; i < nb_oargs; i++) {
arg = args[i];
dead_temps[arg] = 1;
+#ifdef USE_ADVANCED_REGALLOC
+ temp_next_use[arg] = -1;
+#endif
}
if (!(call_flags & TCG_CALL_CONST)) {
@@ -1263,36 +1296,48 @@ static void tcg_liveness_analysis(TCGContext *s)
if (dead_temps[arg]) {
dead_iargs |= (1 << i);
}
+#ifdef USE_ADVANCED_REGALLOC
+ temp_next_use[arg] = op_index;
+#endif
dead_temps[arg] = 0;
}
}
s->op_dead_iargs[op_index] = dead_iargs;
}
args--;
+ next_use_ptr--;
}
break;
case INDEX_op_set_label:
args--;
+ next_use_ptr--;
/* mark end of basic block */
- tcg_la_bb_end(s, dead_temps);
+ tcg_la_bb_end(s, dead_temps, temp_next_use);
break;
case INDEX_op_debug_insn_start:
args -= def->nb_args;
+ next_use_ptr -= def->nb_args;
break;
case INDEX_op_nopn:
nb_args = args[-1];
args -= nb_args;
+ next_use_ptr -= nb_args;
break;
case INDEX_op_discard:
args--;
+ next_use_ptr--;
/* mark the temporary as dead */
dead_temps[args[0]] = 1;
+#ifdef USE_ADVANCED_REGALLOC
+ temp_next_use[args[0]] = -1;
+#endif
break;
case INDEX_op_end:
break;
/* XXX: optimize by hardcoding common cases (e.g. triadic ops) */
default:
args -= def->nb_args;
+ next_use_ptr -= def->nb_args;
nb_iargs = def->nb_iargs;
nb_oargs = def->nb_oargs;
@@ -1312,15 +1357,28 @@ static void tcg_liveness_analysis(TCGContext *s)
} else {
do_not_remove:
+#ifdef USE_ADVANCED_REGALLOC
+ for (i = 0; i < nb_iargs + nb_oargs; i++) {
+ if (!dead_temps[args[i]]) {
+ next_use_ptr[i] = temp_next_use[args[i]];
+ } else {
+ next_use_ptr[i] = -1;
+ }
+ }
+#endif
+
/* output args are dead */
for(i = 0; i < nb_oargs; i++) {
arg = args[i];
dead_temps[arg] = 1;
+#ifdef USE_ADVANCED_REGALLOC
+ temp_next_use[arg] = -1;
+#endif
}
/* if end of basic block, update */
if (def->flags & TCG_OPF_BB_END) {
- tcg_la_bb_end(s, dead_temps);
+ tcg_la_bb_end(s, dead_temps, temp_next_use);
} else if (def->flags & TCG_OPF_CALL_CLOBBER) {
/* globals are live */
memset(dead_temps, 0, s->nb_globals);
@@ -1333,6 +1391,9 @@ static void tcg_liveness_analysis(TCGContext *s)
if (dead_temps[arg]) {
dead_iargs |= (1 << i);
}
+#ifdef USE_ADVANCED_REGALLOC
+ temp_next_use[arg] = op_index;
+#endif
dead_temps[arg] = 0;
}
s->op_dead_iargs[op_index] = dead_iargs;
diff --git a/tcg/tcg.h b/tcg/tcg.h
index cecef63..d8bfa2c 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -288,10 +288,14 @@ struct TCGContext {
/* liveness analysis */
uint16_t *op_dead_iargs; /* for each operation, each bit tells if the
corresponding input argument is dead */
+ int *param_next_use; /* for each operation argument tells where its
+ next use is (USE_ADVANCED_REGALLOC only) */
/* tells in which temporary a given register is. It does not take
into account fixed registers */
int reg_to_temp[TCG_TARGET_NB_REGS];
+ /* tells where the next use of a given reg appears */
+ int reg_next_use[TCG_TARGET_NB_REGS];
TCGRegSet reserved_regs;
tcg_target_long current_frame_offset;
tcg_target_long frame_start;
--
1.7.4.1
* [Qemu-devel] [RFC][PATCH v0 2/8] Propagate REG_NEXT_USE value through process of register allocation.
2011-05-23 14:40 [Qemu-devel] [RFC][PATCH v0 0/8] Improve register allocator Kirill Batuzov
2011-05-23 14:40 ` [Qemu-devel] [RFC][PATCH v0 1/8] Compute additional liveness information for " Kirill Batuzov
@ 2011-05-23 14:40 ` Kirill Batuzov
2011-05-23 14:40 ` [Qemu-devel] [RFC][PATCH v0 3/8] Do better spill choice Kirill Batuzov
` (6 subsequent siblings)
8 siblings, 0 replies; 16+ messages in thread
From: Kirill Batuzov @ 2011-05-23 14:40 UTC (permalink / raw)
To: qemu-devel; +Cc: zhur
Propagate the next use of each register through the process of register
allocation. This will be needed later to make a better spill choice.
Signed-off-by: Kirill Batuzov <batuzovk@ispras.ru>
---
tcg/tcg.c | 36 +++++++++++++++++++++++++++++++++---
1 files changed, 33 insertions(+), 3 deletions(-)
diff --git a/tcg/tcg.c b/tcg/tcg.c
index 821ffa7..c6e920e 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -1643,6 +1643,7 @@ static void tcg_reg_alloc_movi(TCGContext *s, const TCGArg *args)
static void tcg_reg_alloc_mov(TCGContext *s, const TCGOpDef *def,
const TCGArg *args,
+ const int *param_next_use,
unsigned int dead_iargs)
{
TCGTemp *ts, *ots;
@@ -1669,6 +1670,9 @@ static void tcg_reg_alloc_mov(TCGContext *s, const TCGOpDef *def,
reg = tcg_reg_alloc(s, arg_ct->u.regs, s->reserved_regs);
}
if (ts->reg != reg) {
+#ifdef USE_ADVANCED_REGALLOC
+ s->reg_next_use[ts->reg] = param_next_use[1];
+#endif
tcg_out_mov(s, ots->type, reg, ts->reg);
}
}
@@ -1694,6 +1698,9 @@ static void tcg_reg_alloc_mov(TCGContext *s, const TCGOpDef *def,
} else {
tcg_abort();
}
+#ifdef USE_ADVANCED_REGALLOC
+ s->reg_next_use[reg] = param_next_use[0];
+#endif
s->reg_to_temp[reg] = args[0];
ots->reg = reg;
ots->val_type = TEMP_VAL_REG;
@@ -1703,6 +1710,7 @@ static void tcg_reg_alloc_mov(TCGContext *s, const TCGOpDef *def,
static void tcg_reg_alloc_op(TCGContext *s,
const TCGOpDef *def, TCGOpcode opc,
const TCGArg *args,
+ const int *param_next_use,
unsigned int dead_iargs)
{
TCGRegSet allocated_regs;
@@ -1776,6 +1784,9 @@ static void tcg_reg_alloc_op(TCGContext *s,
reg = tcg_reg_alloc(s, arg_ct->u.regs, allocated_regs);
tcg_out_mov(s, ts->type, reg, ts->reg);
}
+#ifdef USE_ADVANCED_REGALLOC
+ s->reg_next_use[reg] = param_next_use[i];
+#endif
new_args[i] = reg;
const_args[i] = 0;
tcg_regset_set_reg(allocated_regs, reg);
@@ -1845,6 +1856,9 @@ static void tcg_reg_alloc_op(TCGContext *s,
}
oarg_end:
new_args[i] = reg;
+#ifdef USE_ADVANCED_REGALLOC
+ s->reg_next_use[reg] = param_next_use[i];
+#endif
}
}
@@ -1869,6 +1883,7 @@ static void tcg_reg_alloc_op(TCGContext *s,
static int tcg_reg_alloc_call(TCGContext *s, const TCGOpDef *def,
TCGOpcode opc, const TCGArg *args,
+ const int *param_next_use,
unsigned int dead_iargs)
{
int nb_iargs, nb_oargs, flags, nb_regs, i, reg, nb_params;
@@ -1953,6 +1968,9 @@ static int tcg_reg_alloc_call(TCGContext *s, const TCGOpDef *def,
tcg_abort();
}
tcg_regset_set_reg(allocated_regs, reg);
+#ifdef USE_ADVANCED_REGALLOC
+ s->reg_next_use[reg] = param_next_use[nb_oargs + i];
+#endif
}
}
@@ -2039,6 +2057,9 @@ static int tcg_reg_alloc_call(TCGContext *s, const TCGOpDef *def,
ts->reg = reg;
ts->mem_coherent = 0;
s->reg_to_temp[reg] = arg;
+#ifdef USE_ADVANCED_REGALLOC
+ s->reg_next_use[reg] = param_next_use[i];
+#endif
}
}
@@ -2068,6 +2089,8 @@ static inline int tcg_gen_code_common(TCGContext *s, uint8_t *gen_code_buf,
TCGOpcode opc;
int op_index;
const TCGOpDef *def;
+ const int *param_next_use_ptr;
+ int nb_args;
unsigned int dead_iargs;
const TCGArg *args;
@@ -2095,6 +2118,8 @@ static inline int tcg_gen_code_common(TCGContext *s, uint8_t *gen_code_buf,
}
#endif
+ param_next_use_ptr = s->param_next_use;
+
tcg_reg_alloc_start(s);
s->code_buf = gen_code_buf;
@@ -2120,7 +2145,7 @@ static inline int tcg_gen_code_common(TCGContext *s, uint8_t *gen_code_buf,
case INDEX_op_mov_i64:
#endif
dead_iargs = s->op_dead_iargs[op_index];
- tcg_reg_alloc_mov(s, def, args, dead_iargs);
+ tcg_reg_alloc_mov(s, def, args, param_next_use_ptr, dead_iargs);
break;
case INDEX_op_movi_i32:
#if TCG_TARGET_REG_BITS == 64
@@ -2137,6 +2162,7 @@ static inline int tcg_gen_code_common(TCGContext *s, uint8_t *gen_code_buf,
case INDEX_op_nop3:
break;
case INDEX_op_nopn:
+ param_next_use_ptr += args[0];
args += args[0];
goto next;
case INDEX_op_discard:
@@ -2157,7 +2183,10 @@ static inline int tcg_gen_code_common(TCGContext *s, uint8_t *gen_code_buf,
break;
case INDEX_op_call:
dead_iargs = s->op_dead_iargs[op_index];
- args += tcg_reg_alloc_call(s, def, opc, args, dead_iargs);
+ nb_args = tcg_reg_alloc_call(s, def, opc, args,
+ param_next_use_ptr, dead_iargs);
+ args += nb_args;
+ param_next_use_ptr += nb_args;
goto next;
case INDEX_op_end:
goto the_end;
@@ -2166,10 +2195,11 @@ static inline int tcg_gen_code_common(TCGContext *s, uint8_t *gen_code_buf,
faster to have specialized register allocator functions for
some common argument patterns */
dead_iargs = s->op_dead_iargs[op_index];
- tcg_reg_alloc_op(s, def, opc, args, dead_iargs);
+ tcg_reg_alloc_op(s, def, opc, args, param_next_use_ptr, dead_iargs);
break;
}
args += def->nb_args;
+ param_next_use_ptr += def->nb_args;
next:
if (search_pc >= 0 && search_pc < s->code_ptr - gen_code_buf) {
return op_index;
--
1.7.4.1
* [Qemu-devel] [RFC][PATCH v0 3/8] Do better spill choice.
2011-05-23 14:40 [Qemu-devel] [RFC][PATCH v0 0/8] Improve register allocator Kirill Batuzov
2011-05-23 14:40 ` [Qemu-devel] [RFC][PATCH v0 1/8] Compute additional liveness information for " Kirill Batuzov
2011-05-23 14:40 ` [Qemu-devel] [RFC][PATCH v0 2/8] Propagate REG_NEXT_USE value through process of register allocation Kirill Batuzov
@ 2011-05-23 14:40 ` Kirill Batuzov
2011-05-23 14:40 ` [Qemu-devel] [RFC][PATCH v0 4/8] Calculate NEXT_CALL liveness information Kirill Batuzov
` (5 subsequent siblings)
8 siblings, 0 replies; 16+ messages in thread
From: Kirill Batuzov @ 2011-05-23 14:40 UTC (permalink / raw)
To: qemu-devel; +Cc: zhur
Choose register with farthest next use for spilling.
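The heart of the change, as a self-contained sketch (a toy array instead of
s->reg_next_use, and ignoring the target's register allocation order and
constraint sets):
/* Sketch only: pick a victim register to spill.  reg_next_use[r] is the
 * op index where the value currently in r is needed next, or -1 if it is
 * never needed again. */
static int pick_spill_victim(const int *reg_next_use, int nb_regs)
{
    int best_reg = -1;
    int best_score = -1;   /* any real op index beats this */
    int r;
    for (r = 0; r < nb_regs; r++) {
        if (reg_next_use[r] == -1) {
            return r;      /* value is never used again: ideal victim */
        }
        if (reg_next_use[r] > best_score) {
            best_score = reg_next_use[r];
            best_reg = r;  /* farthest next use so far */
        }
    }
    return best_reg;       /* -1 only if nb_regs == 0 */
}
The actual patch folds this into tcg_reg_alloc() and encodes the "never
used again" case as a score above OPPARAM_BUF_SIZE rather than returning
early.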
Signed-off-by: Kirill Batuzov <batuzovk@ispras.ru>
---
tcg/tcg.c | 21 +++++++++++++++++++++
1 files changed, 21 insertions(+), 0 deletions(-)
diff --git a/tcg/tcg.c b/tcg/tcg.c
index c6e920e..61689e2 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -1528,6 +1528,9 @@ static void tcg_reg_free(TCGContext *s, int reg)
static int tcg_reg_alloc(TCGContext *s, TCGRegSet reg1, TCGRegSet reg2)
{
int i, reg;
+#ifdef USE_ADVANCED_REGALLOC
+ int best_reg = -1, best_score = -2;
+#endif
TCGRegSet reg_ct;
tcg_regset_andnot(reg_ct, reg1, reg2);
@@ -1543,11 +1546,29 @@ static int tcg_reg_alloc(TCGContext *s, TCGRegSet reg1, TCGRegSet reg2)
for(i = 0; i < ARRAY_SIZE(tcg_target_reg_alloc_order); i++) {
reg = tcg_target_reg_alloc_order[i];
if (tcg_regset_test_reg(reg_ct, reg)) {
+#ifdef USE_ADVANCED_REGALLOC
+ if (s->reg_next_use[reg] > best_score ||
+ s->reg_next_use[reg] == -1) {
+ best_reg = reg;
+ best_score = s->reg_next_use[reg];
+ if (best_score == -1) {
+ best_score = OPPARAM_BUF_SIZE + 1;
+ }
+ }
+#else
tcg_reg_free(s, reg);
return reg;
+#endif
}
}
+#ifdef USE_ADVANCED_REGALLOC
+ if (best_score >= 0 && best_reg >= 0) {
+ tcg_reg_free(s, best_reg);
+ return best_reg;
+ }
+#endif
+
tcg_abort();
}
--
1.7.4.1
* [Qemu-devel] [RFC][PATCH v0 4/8] Calculate NEXT_CALL liveness information.
2011-05-23 14:40 [Qemu-devel] [RFC][PATCH v0 0/8] Improve register allocator Kirill Batuzov
` (2 preceding siblings ...)
2011-05-23 14:40 ` [Qemu-devel] [RFC][PATCH v0 3/8] Do better spill choice Kirill Batuzov
@ 2011-05-23 14:40 ` Kirill Batuzov
2011-05-23 14:40 ` [Qemu-devel] [RFC][PATCH v0 5/8] Track call-clobbered uses of registers Kirill Batuzov
` (4 subsequent siblings)
8 siblings, 0 replies; 16+ messages in thread
From: Kirill Batuzov @ 2011-05-23 14:40 UTC (permalink / raw)
To: qemu-devel; +Cc: zhur
Keep track of where the next call is for each TCG operation.
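The computation is a single backward pass; as a standalone sketch (toy
arrays rather than TCG's op buffer):
static void compute_next_call(const int *op_is_call_clobber,
                              int *next_call, int nb_ops)
{
    int last_call = nb_ops + 1;     /* no call seen yet */
    int i;
    for (i = nb_ops - 1; i >= 0; i--) {
        /* nearest call-clobbering op strictly after op i */
        next_call[i] = last_call;
        if (op_is_call_clobber[i]) {
            last_call = i;
        }
    }
}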
Signed-off-by: Kirill Batuzov <batuzovk@ispras.ru>
---
tcg/tcg.c | 11 +++++++++++
tcg/tcg.h | 2 ++
2 files changed, 13 insertions(+), 0 deletions(-)
diff --git a/tcg/tcg.c b/tcg/tcg.c
index 61689e2..799b245 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -1212,6 +1212,9 @@ static void tcg_liveness_analysis(TCGContext *s)
uint8_t *dead_temps;
int *temp_next_use = NULL;
unsigned int dead_iargs;
+#ifdef USE_ADVANCED_REGALLOC
+ int last_call;
+#endif
gen_opc_ptr++; /* skip end */
@@ -1223,6 +1226,8 @@ static void tcg_liveness_analysis(TCGContext *s)
nb_args = gen_opparam_ptr - gen_opparam_buf;
s->param_next_use = tcg_malloc(nb_args * sizeof(s->param_next_use[0]));
next_use_ptr = s->param_next_use + nb_args;
+ last_call = nb_ops + 1;
+ s->next_call = tcg_malloc(nb_ops * sizeof(s->next_call[0]));
#endif
dead_temps = tcg_malloc(s->nb_temps);
@@ -1237,6 +1242,12 @@ static void tcg_liveness_analysis(TCGContext *s)
while (op_index >= 0) {
op = gen_opc_buf[op_index];
def = &tcg_op_defs[op];
+#ifdef USE_ADVANCED_REGALLOC
+ s->next_call[op_index] = last_call;
+ if (def->flags & TCG_OPF_CALL_CLOBBER) {
+ last_call = op_index;
+ }
+#endif
switch(op) {
case INDEX_op_call:
{
diff --git a/tcg/tcg.h b/tcg/tcg.h
index d8bfa2c..9ff519e 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -290,6 +290,8 @@ struct TCGContext {
corresponding input argument is dead */
int *param_next_use; /* for each operation argument tells where its
next use is (USE_ADVANCED_REGALLOC only) */
+ int *next_call; /* for each operation tells where the next CALL operation
+ occurs */
/* tells in which temporary a given register is. It does not take
into account fixed registers */
--
1.7.4.1
* [Qemu-devel] [RFC][PATCH v0 5/8] Track call-clobbered uses of registers
2011-05-23 14:40 [Qemu-devel] [RFC][PATCH v0 0/8] Improve register allocator Kirill Batuzov
` (3 preceding siblings ...)
2011-05-23 14:40 ` [Qemu-devel] [RFC][PATCH v0 4/8] Calculate NEXT_CALL liveness information Kirill Batuzov
@ 2011-05-23 14:40 ` Kirill Batuzov
2011-05-23 14:40 ` [Qemu-devel] [RFC][PATCH v0 6/8] Spill globals early if their next use is in call Kirill Batuzov
` (3 subsequent siblings)
8 siblings, 0 replies; 16+ messages in thread
From: Kirill Batuzov @ 2011-05-23 14:40 UTC (permalink / raw)
To: qemu-devel; +Cc: zhur
Adjust the next use of call-clobbered registers: a call clobbers them, so
the call itself counts as their next use when choosing what to spill.
Signed-off-by: Kirill Batuzov <batuzovk@ispras.ru>
---
tcg/tcg.c | 12 ++++++++++++
1 files changed, 12 insertions(+), 0 deletions(-)
diff --git a/tcg/tcg.c b/tcg/tcg.c
index 799b245..8ab556d 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -2125,6 +2125,9 @@ static inline int tcg_gen_code_common(TCGContext *s, uint8_t *gen_code_buf,
int nb_args;
unsigned int dead_iargs;
const TCGArg *args;
+#ifdef USE_ADVANCED_REGALLOC
+ int reg;
+#endif
#ifdef DEBUG_DISAS
if (unlikely(qemu_loglevel_mask(CPU_LOG_TB_OP))) {
@@ -2166,6 +2169,15 @@ static inline int tcg_gen_code_common(TCGContext *s, uint8_t *gen_code_buf,
tcg_table_op_count[opc]++;
#endif
def = &tcg_op_defs[opc];
+
+#ifdef USE_ADVANCED_REGALLOC
+ for (reg = 0; reg < TCG_TARGET_NB_REGS; reg++) {
+ if (tcg_regset_test_reg(tcg_target_call_clobber_regs, reg)
+ && s->reg_next_use[reg] > s->next_call[op_index]) {
+ s->reg_next_use[reg] = s->next_call[op_index];
+ }
+ }
+#endif
#if 0
printf("%s: %d %d %d\n", def->name,
def->nb_oargs, def->nb_iargs, def->nb_cargs);
--
1.7.4.1
* [Qemu-devel] [RFC][PATCH v0 6/8] Spill globals early if their next use is in call.
2011-05-23 14:40 [Qemu-devel] [RFC][PATCH v0 0/8] Improve register allocator Kirill Batuzov
` (4 preceding siblings ...)
2011-05-23 14:40 ` [Qemu-devel] [RFC][PATCH v0 5/8] Track call-clobbered uses of registers Kirill Batuzov
@ 2011-05-23 14:40 ` Kirill Batuzov
2011-05-23 14:40 ` [Qemu-devel] [RFC][PATCH v0 7/8] Spill globals early if their next use is at the BB end Kirill Batuzov
` (2 subsequent siblings)
8 siblings, 0 replies; 16+ messages in thread
From: Kirill Batuzov @ 2011-05-23 14:40 UTC (permalink / raw)
To: qemu-devel; +Cc: zhur
Spill globals early if their next use is in a call. They'll be spilled at
that point anyway.
Signed-off-by: Kirill Batuzov <batuzovk@ispras.ru>
---
tcg/tcg.c | 18 ++++++++++++++++++
1 files changed, 18 insertions(+), 0 deletions(-)
diff --git a/tcg/tcg.c b/tcg/tcg.c
index 8ab556d..ad5bd71 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -1297,6 +1297,11 @@ static void tcg_liveness_analysis(TCGContext *s)
if (!(call_flags & TCG_CALL_CONST)) {
/* globals are live (they may be used by the call) */
memset(dead_temps, 0, s->nb_globals);
+#ifdef USE_ADVANCED_REGALLOC
+ for (i = 0; i < s->nb_globals; i++) {
+ temp_next_use[i] = op_index;
+ }
+#endif
}
/* input args are live */
@@ -1393,6 +1398,11 @@ static void tcg_liveness_analysis(TCGContext *s)
} else if (def->flags & TCG_OPF_CALL_CLOBBER) {
/* globals are live */
memset(dead_temps, 0, s->nb_globals);
+#ifdef USE_ADVANCED_REGALLOC
+ for (i = 0; i < s->nb_globals; i++) {
+ temp_next_use[i] = op_index;
+ }
+#endif
}
/* input args are live */
@@ -2190,6 +2200,14 @@ static inline int tcg_gen_code_common(TCGContext *s, uint8_t *gen_code_buf,
#endif
dead_iargs = s->op_dead_iargs[op_index];
tcg_reg_alloc_mov(s, def, args, param_next_use_ptr, dead_iargs);
+#ifdef USE_ADVANCED_REGALLOC
+ if (args[0] < s->nb_globals) {
+ if (tcg_op_defs[gen_opc_buf[param_next_use_ptr[0]]].flags
+ & TCG_OPF_CALL_CLOBBER) {
+ tcg_reg_free(s, s->temps[args[0]].reg);
+ }
+ }
+#endif
break;
case INDEX_op_movi_i32:
#if TCG_TARGET_REG_BITS == 64
--
1.7.4.1
* [Qemu-devel] [RFC][PATCH v0 7/8] Spill globals early if their next use is at the BB end.
2011-05-23 14:40 [Qemu-devel] [RFC][PATCH v0 0/8] Improve register allocator Kirill Batuzov
` (5 preceding siblings ...)
2011-05-23 14:40 ` [Qemu-devel] [RFC][PATCH v0 6/8] Spill globals early if their next use is in call Kirill Batuzov
@ 2011-05-23 14:40 ` Kirill Batuzov
2011-05-23 14:40 ` [Qemu-devel] [RFC][PATCH v0 8/8] Add spill count profiling Kirill Batuzov
2011-05-23 21:22 ` [Qemu-devel] [RFC][PATCH v0 0/8] Improve register allocator Aurelien Jarno
8 siblings, 0 replies; 16+ messages in thread
From: Kirill Batuzov @ 2011-05-23 14:40 UTC (permalink / raw)
To: qemu-devel; +Cc: zhur
Spill globals early if their next use is at the BB end. They'll be spilled
anyway in this case.
Signed-off-by: Kirill Batuzov <batuzovk@ispras.ru>
---
tcg/tcg.c | 13 +++++++++----
1 files changed, 9 insertions(+), 4 deletions(-)
diff --git a/tcg/tcg.c b/tcg/tcg.c
index ad5bd71..022eef9 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -1179,11 +1179,16 @@ static inline void tcg_la_func_end(TCGContext *s, uint8_t *dead_temps)
/* liveness analysis: end of basic block: globals are live, temps are
dead, local temps are live. */
static inline void tcg_la_bb_end(TCGContext *s, uint8_t *dead_temps,
- int *temp_next_use)
+ int *temp_next_use, int op_index)
{
int i;
TCGTemp *ts;
+#ifdef USE_ADVANCED_REGALLOC
+ for (i = 0; i < s->nb_globals; i++) {
+ temp_next_use[i] = op_index;
+ }
+#endif
memset(dead_temps, 0, s->nb_globals);
ts = &s->temps[s->nb_globals];
for(i = s->nb_globals; i < s->nb_temps; i++) {
@@ -1328,7 +1333,7 @@ static void tcg_liveness_analysis(TCGContext *s)
args--;
next_use_ptr--;
/* mark end of basic block */
- tcg_la_bb_end(s, dead_temps, temp_next_use);
+ tcg_la_bb_end(s, dead_temps, temp_next_use, op_index);
break;
case INDEX_op_debug_insn_start:
args -= def->nb_args;
@@ -1394,7 +1399,7 @@ static void tcg_liveness_analysis(TCGContext *s)
/* if end of basic block, update */
if (def->flags & TCG_OPF_BB_END) {
- tcg_la_bb_end(s, dead_temps, temp_next_use);
+ tcg_la_bb_end(s, dead_temps, temp_next_use, op_index);
} else if (def->flags & TCG_OPF_CALL_CLOBBER) {
/* globals are live */
memset(dead_temps, 0, s->nb_globals);
@@ -2203,7 +2208,7 @@ static inline int tcg_gen_code_common(TCGContext *s, uint8_t *gen_code_buf,
#ifdef USE_ADVANCED_REGALLOC
if (args[0] < s->nb_globals) {
if (tcg_op_defs[gen_opc_buf[param_next_use_ptr[0]]].flags
- & TCG_OPF_CALL_CLOBBER) {
+ & (TCG_OPF_CALL_CLOBBER | TCG_OPF_BB_END)) {
tcg_reg_free(s, s->temps[args[0]].reg);
}
}
--
1.7.4.1
* [Qemu-devel] [RFC][PATCH v0 8/8] Add spill count profiling.
2011-05-23 14:40 [Qemu-devel] [RFC][PATCH v0 0/8] Improve register allocator Kirill Batuzov
` (6 preceding siblings ...)
2011-05-23 14:40 ` [Qemu-devel] [RFC][PATCH v0 7/8] Spill globals early if their next use is at the BB end Kirill Batuzov
@ 2011-05-23 14:40 ` Kirill Batuzov
2011-05-23 19:32 ` Blue Swirl
2011-05-23 21:22 ` [Qemu-devel] [RFC][PATCH v0 0/8] Improve register allocator Aurelien Jarno
8 siblings, 1 reply; 16+ messages in thread
From: Kirill Batuzov @ 2011-05-23 14:40 UTC (permalink / raw)
To: qemu-devel; +Cc: zhur
Gather statistics about generated spills. They are useful for debugging and
for evaluating the new register allocator.
Signed-off-by: Kirill Batuzov <batuzovk@ispras.ru>
---
tcg/tcg.c | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
tcg/tcg.h | 6 +++++
2 files changed, 75 insertions(+), 0 deletions(-)
diff --git a/tcg/tcg.c b/tcg/tcg.c
index 022eef9..ba2cddc 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -1530,6 +1530,11 @@ static void temp_allocate_frame(TCGContext *s, int temp)
s->current_frame_offset += sizeof(tcg_target_long);
}
+#ifdef CONFIG_PROFILER
+enum { SPILL_REAL, SPILL_BB_END, SPILL_CALL_HWREG,
+ SPILL_CALL_IARG, SPILL_CALL_CLOBBER } spill_cause;
+#endif
+
/* free register 'reg' by spilling the corresponding temporary if necessary */
static void tcg_reg_free(TCGContext *s, int reg)
{
@@ -1544,6 +1549,26 @@ static void tcg_reg_free(TCGContext *s, int reg)
if (!ts->mem_allocated)
temp_allocate_frame(s, temp);
tcg_out_st(s, ts->type, reg, ts->mem_reg, ts->mem_offset);
+#ifdef CONFIG_PROFILER
+ s->spill_count++;
+ switch (spill_cause) {
+ case SPILL_REAL:
+ s->spill_real++;
+ break;
+ case SPILL_BB_END:
+ s->spill_bb_end++;
+ break;
+ case SPILL_CALL_HWREG:
+ s->spill_call_hwreg++;
+ break;
+ case SPILL_CALL_IARG:
+ s->spill_call_iarg++;
+ break;
+ case SPILL_CALL_CLOBBER:
+ s->spill_call_clobber++;
+ break;
+ }
+#endif
}
ts->val_type = TEMP_VAL_MEM;
s->reg_to_temp[reg] = -1;
@@ -1582,6 +1607,9 @@ static int tcg_reg_alloc(TCGContext *s, TCGRegSet reg1, TCGRegSet reg2)
}
}
#else
+#ifdef CONFIG_PROFILER
+ spill_cause = SPILL_REAL;
+#endif
tcg_reg_free(s, reg);
return reg;
#endif
@@ -1590,6 +1618,9 @@ static int tcg_reg_alloc(TCGContext *s, TCGRegSet reg1, TCGRegSet reg2)
#ifdef USE_ADVANCED_REGALLOC
if (best_score >= 0 && best_reg >= 0) {
+#ifdef CONFIG_PROFILER
+ spill_cause = SPILL_REAL;
+#endif
tcg_reg_free(s, best_reg);
return best_reg;
}
@@ -1653,6 +1684,9 @@ static void tcg_reg_alloc_bb_end(TCGContext *s, TCGRegSet allocated_regs)
for(i = s->nb_globals; i < s->nb_temps; i++) {
ts = &s->temps[i];
if (ts->temp_local) {
+#ifdef CONFIG_PROFILER
+ spill_cause = SPILL_BB_END;
+#endif
temp_save(s, i, allocated_regs);
} else {
if (ts->val_type == TEMP_VAL_REG) {
@@ -1662,6 +1696,10 @@ static void tcg_reg_alloc_bb_end(TCGContext *s, TCGRegSet allocated_regs)
}
}
+#ifdef CONFIG_PROFILER
+ spill_cause = SPILL_BB_END;
+#endif
+
save_globals(s, allocated_regs);
}
@@ -1860,12 +1898,18 @@ static void tcg_reg_alloc_op(TCGContext *s,
/* XXX: permit generic clobber register list ? */
for(reg = 0; reg < TCG_TARGET_NB_REGS; reg++) {
if (tcg_regset_test_reg(tcg_target_call_clobber_regs, reg)) {
+#ifdef CONFIG_PROFILER
+ spill_cause = SPILL_CALL_CLOBBER;
+#endif
tcg_reg_free(s, reg);
}
}
/* XXX: for load/store we could do that only for the slow path
(i.e. when a memory callback is called) */
+#ifdef CONFIG_PROFILER
+ spill_cause = SPILL_CALL_HWREG;
+#endif
/* store globals and free associated registers (we assume the insn
can modify any global. */
save_globals(s, allocated_regs);
@@ -2001,6 +2045,9 @@ static int tcg_reg_alloc_call(TCGContext *s, const TCGOpDef *def,
if (arg != TCG_CALL_DUMMY_ARG) {
ts = &s->temps[arg];
reg = tcg_target_call_iarg_regs[i];
+#ifdef CONFIG_PROFILER
+ spill_cause = SPILL_CALL_IARG;
+#endif
tcg_reg_free(s, reg);
if (ts->val_type == TEMP_VAL_REG) {
if (ts->reg != reg) {
@@ -2071,6 +2118,9 @@ static int tcg_reg_alloc_call(TCGContext *s, const TCGOpDef *def,
/* clobber call registers */
for(reg = 0; reg < TCG_TARGET_NB_REGS; reg++) {
if (tcg_regset_test_reg(tcg_target_call_clobber_regs, reg)) {
+#ifdef CONFIG_PROFILER
+ spill_cause = SPILL_CALL_CLOBBER;
+#endif
tcg_reg_free(s, reg);
}
}
@@ -2078,6 +2128,9 @@ static int tcg_reg_alloc_call(TCGContext *s, const TCGOpDef *def,
/* store globals and free associated registers (we assume the call
can modify any global. */
if (!(flags & TCG_CALL_CONST)) {
+#ifdef CONFIG_PROFILER
+ spill_cause = SPILL_CALL_HWREG;
+#endif
save_globals(s, allocated_regs);
}
@@ -2209,6 +2262,14 @@ static inline int tcg_gen_code_common(TCGContext *s, uint8_t *gen_code_buf,
if (args[0] < s->nb_globals) {
if (tcg_op_defs[gen_opc_buf[param_next_use_ptr[0]]].flags
& (TCG_OPF_CALL_CLOBBER | TCG_OPF_BB_END)) {
+#ifdef CONFIG_PROFILER
+ if (tcg_op_defs[gen_opc_buf[param_next_use_ptr[0]]].flags
+ & TCG_OPF_CALL_CLOBBER) {
+ spill_cause = SPILL_CALL_HWREG;
+ } else {
+ spill_cause = SPILL_BB_END;
+ }
+#endif
tcg_reg_free(s, s->temps[args[0]].reg);
}
}
@@ -2354,6 +2415,14 @@ void tcg_dump_info(FILE *f, fprintf_function cpu_fprintf)
s->restore_count);
cpu_fprintf(f, " avg cycles %0.1f\n",
s->restore_count ? (double)s->restore_time / s->restore_count : 0);
+ cpu_fprintf(f, "spill count %" PRId64 "\n",
+ s->spill_count);
+ cpu_fprintf(f, " real spills %" PRId64 "\n", s->spill_real);
+ cpu_fprintf(f, " spills at bb end %" PRId64 "\n", s->spill_bb_end);
+ cpu_fprintf(f, " spills at call:\n");
+ cpu_fprintf(f, " globals %" PRId64 "\n", s->spill_call_hwreg);
+ cpu_fprintf(f, " iarg passing %" PRId64 "\n", s->spill_call_iarg);
+ cpu_fprintf(f, " call cloobers %" PRId64 "\n", s->spill_call_clobber);
dump_op_count();
}
diff --git a/tcg/tcg.h b/tcg/tcg.h
index 9ff519e..722bd72 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -328,6 +328,12 @@ struct TCGContext {
int64_t la_time;
int64_t restore_count;
int64_t restore_time;
+ int64_t spill_count;
+ int64_t spill_bb_end;
+ int64_t spill_call_hwreg;
+ int64_t spill_call_iarg;
+ int64_t spill_call_clobber;
+ int64_t spill_real;
#endif
#ifdef CONFIG_DEBUG_TCG
--
1.7.4.1
* Re: [Qemu-devel] [RFC][PATCH v0 8/8] Add spill count profiling.
2011-05-23 14:40 ` [Qemu-devel] [RFC][PATCH v0 8/8] Add spill count profiling Kirill Batuzov
@ 2011-05-23 19:32 ` Blue Swirl
0 siblings, 0 replies; 16+ messages in thread
From: Blue Swirl @ 2011-05-23 19:32 UTC (permalink / raw)
To: Kirill Batuzov; +Cc: qemu-devel, zhur
On Mon, May 23, 2011 at 5:40 PM, Kirill Batuzov <batuzovk@ispras.ru> wrote:
> Gather generated spills statistics. It is useful for debugging and evaluating
> of new register allocator.
>
> Signed-off-by: Kirill Batuzov <batuzovk@ispras.ru>
> ---
> tcg/tcg.c | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> tcg/tcg.h | 6 +++++
> 2 files changed, 75 insertions(+), 0 deletions(-)
>
> diff --git a/tcg/tcg.c b/tcg/tcg.c
> index 022eef9..ba2cddc 100644
> --- a/tcg/tcg.c
> +++ b/tcg/tcg.c
> @@ -1530,6 +1530,11 @@ static void temp_allocate_frame(TCGContext *s, int temp)
> s->current_frame_offset += sizeof(tcg_target_long);
> }
>
> +#ifdef CONFIG_PROFILER
> +enum { SPILL_REAL, SPILL_BB_END, SPILL_CALL_HWREG,
> + SPILL_CALL_IARG, SPILL_CALL_CLOBBER } spill_cause;
> +#endif
How about moving this to TCGContext instead of using static variables?
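Something along these lines, perhaps (untested sketch, names invented):
#ifdef CONFIG_PROFILER
typedef enum TCGSpillCause {
    TCG_SPILL_REAL,
    TCG_SPILL_BB_END,
    TCG_SPILL_CALL_HWREG,
    TCG_SPILL_CALL_IARG,
    TCG_SPILL_CALL_CLOBBER,
} TCGSpillCause;
#endif
/* then add "TCGSpillCause spill_cause;" to struct TCGContext next to the
   other profiling counters, and set "s->spill_cause = TCG_SPILL_REAL;"
   etc. before each tcg_reg_free(s, reg) call. */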
> +
> /* free register 'reg' by spilling the corresponding temporary if necessary */
> static void tcg_reg_free(TCGContext *s, int reg)
> {
> @@ -1544,6 +1549,26 @@ static void tcg_reg_free(TCGContext *s, int reg)
> if (!ts->mem_allocated)
> temp_allocate_frame(s, temp);
> tcg_out_st(s, ts->type, reg, ts->mem_reg, ts->mem_offset);
> +#ifdef CONFIG_PROFILER
> + s->spill_count++;
> + switch (spill_cause) {
> + case SPILL_REAL:
> + s->spill_real++;
> + break;
> + case SPILL_BB_END:
> + s->spill_bb_end++;
> + break;
> + case SPILL_CALL_HWREG:
> + s->spill_call_hwreg++;
> + break;
> + case SPILL_CALL_IARG:
> + s->spill_call_iarg++;
> + break;
> + case SPILL_CALL_CLOBBER:
> + s->spill_call_clobber++;
> + break;
> + }
> +#endif
> }
> ts->val_type = TEMP_VAL_MEM;
> s->reg_to_temp[reg] = -1;
> @@ -1582,6 +1607,9 @@ static int tcg_reg_alloc(TCGContext *s, TCGRegSet reg1, TCGRegSet reg2)
> }
> }
> #else
> +#ifdef CONFIG_PROFILER
> + spill_cause = SPILL_REAL;
> +#endif
> tcg_reg_free(s, reg);
> return reg;
> #endif
> @@ -1590,6 +1618,9 @@ static int tcg_reg_alloc(TCGContext *s, TCGRegSet reg1, TCGRegSet reg2)
>
> #ifdef USE_ADVANCED_REGALLOC
> if (best_score >= 0 && best_reg >= 0) {
> +#ifdef CONFIG_PROFILER
> + spill_cause = SPILL_REAL;
> +#endif
> tcg_reg_free(s, best_reg);
> return best_reg;
> }
> @@ -1653,6 +1684,9 @@ static void tcg_reg_alloc_bb_end(TCGContext *s, TCGRegSet allocated_regs)
> for(i = s->nb_globals; i < s->nb_temps; i++) {
> ts = &s->temps[i];
> if (ts->temp_local) {
> +#ifdef CONFIG_PROFILER
> + spill_cause = SPILL_BB_END;
> +#endif
> temp_save(s, i, allocated_regs);
> } else {
> if (ts->val_type == TEMP_VAL_REG) {
> @@ -1662,6 +1696,10 @@ static void tcg_reg_alloc_bb_end(TCGContext *s, TCGRegSet allocated_regs)
> }
> }
>
> +#ifdef CONFIG_PROFILER
> + spill_cause = SPILL_BB_END;
> +#endif
> +
> save_globals(s, allocated_regs);
> }
>
> @@ -1860,12 +1898,18 @@ static void tcg_reg_alloc_op(TCGContext *s,
> /* XXX: permit generic clobber register list ? */
> for(reg = 0; reg < TCG_TARGET_NB_REGS; reg++) {
> if (tcg_regset_test_reg(tcg_target_call_clobber_regs, reg)) {
> +#ifdef CONFIG_PROFILER
> + spill_cause = SPILL_CALL_CLOBBER;
> +#endif
> tcg_reg_free(s, reg);
> }
> }
> /* XXX: for load/store we could do that only for the slow path
> (i.e. when a memory callback is called) */
>
> +#ifdef CONFIG_PROFILER
> + spill_cause = SPILL_CALL_HWREG;
> +#endif
> /* store globals and free associated registers (we assume the insn
> can modify any global. */
> save_globals(s, allocated_regs);
> @@ -2001,6 +2045,9 @@ static int tcg_reg_alloc_call(TCGContext *s, const TCGOpDef *def,
> if (arg != TCG_CALL_DUMMY_ARG) {
> ts = &s->temps[arg];
> reg = tcg_target_call_iarg_regs[i];
> +#ifdef CONFIG_PROFILER
> + spill_cause = SPILL_CALL_IARG;
> +#endif
> tcg_reg_free(s, reg);
> if (ts->val_type == TEMP_VAL_REG) {
> if (ts->reg != reg) {
> @@ -2071,6 +2118,9 @@ static int tcg_reg_alloc_call(TCGContext *s, const TCGOpDef *def,
> /* clobber call registers */
> for(reg = 0; reg < TCG_TARGET_NB_REGS; reg++) {
> if (tcg_regset_test_reg(tcg_target_call_clobber_regs, reg)) {
> +#ifdef CONFIG_PROFILER
> + spill_cause = SPILL_CALL_CLOBBER;
> +#endif
> tcg_reg_free(s, reg);
> }
> }
> @@ -2078,6 +2128,9 @@ static int tcg_reg_alloc_call(TCGContext *s, const TCGOpDef *def,
> /* store globals and free associated registers (we assume the call
> can modify any global. */
> if (!(flags & TCG_CALL_CONST)) {
> +#ifdef CONFIG_PROFILER
> + spill_cause = SPILL_CALL_HWREG;
> +#endif
> save_globals(s, allocated_regs);
> }
>
> @@ -2209,6 +2262,14 @@ static inline int tcg_gen_code_common(TCGContext *s, uint8_t *gen_code_buf,
> if (args[0] < s->nb_globals) {
> if (tcg_op_defs[gen_opc_buf[param_next_use_ptr[0]]].flags
> & (TCG_OPF_CALL_CLOBBER | TCG_OPF_BB_END)) {
> +#ifdef CONFIG_PROFILER
> + if (tcg_op_defs[gen_opc_buf[param_next_use_ptr[0]]].flags
> + & TCG_OPF_CALL_CLOBBER) {
> + spill_cause = SPILL_CALL_HWREG;
> + } else {
> + spill_cause = SPILL_BB_END;
> + }
> +#endif
> tcg_reg_free(s, s->temps[args[0]].reg);
> }
> }
> @@ -2354,6 +2415,14 @@ void tcg_dump_info(FILE *f, fprintf_function cpu_fprintf)
> s->restore_count);
> cpu_fprintf(f, " avg cycles %0.1f\n",
> s->restore_count ? (double)s->restore_time / s->restore_count : 0);
> + cpu_fprintf(f, "spill count %" PRId64 "\n",
> + s->spill_count);
> + cpu_fprintf(f, " real spills %" PRId64 "\n", s->spill_real);
> + cpu_fprintf(f, " spills at bb end %" PRId64 "\n", s->spill_bb_end);
> + cpu_fprintf(f, " spills at call:\n");
> + cpu_fprintf(f, " globals %" PRId64 "\n", s->spill_call_hwreg);
> + cpu_fprintf(f, " iarg passing %" PRId64 "\n", s->spill_call_iarg);
> + cpu_fprintf(f, " call cloobers %" PRId64 "\n", s->spill_call_clobber);
cloober?
>
> dump_op_count();
> }
> diff --git a/tcg/tcg.h b/tcg/tcg.h
> index 9ff519e..722bd72 100644
> --- a/tcg/tcg.h
> +++ b/tcg/tcg.h
> @@ -328,6 +328,12 @@ struct TCGContext {
> int64_t la_time;
> int64_t restore_count;
> int64_t restore_time;
> + int64_t spill_count;
> + int64_t spill_bb_end;
> + int64_t spill_call_hwreg;
> + int64_t spill_call_iarg;
> + int64_t spill_call_clobber;
> + int64_t spill_real;
> #endif
>
> #ifdef CONFIG_DEBUG_TCG
> --
> 1.7.4.1
>
>
>
* Re: [Qemu-devel] [RFC][PATCH v0 0/8] Improve register allocator
2011-05-23 14:40 [Qemu-devel] [RFC][PATCH v0 0/8] Improve register allocator Kirill Batuzov
` (7 preceding siblings ...)
2011-05-23 14:40 ` [Qemu-devel] [RFC][PATCH v0 8/8] Add spill count profiling Kirill Batuzov
@ 2011-05-23 21:22 ` Aurelien Jarno
2011-05-24 11:31 ` Kirill Batuzov
8 siblings, 1 reply; 16+ messages in thread
From: Aurelien Jarno @ 2011-05-23 21:22 UTC (permalink / raw)
To: Kirill Batuzov; +Cc: qemu-devel, zhur
On Mon, May 23, 2011 at 06:40:46PM +0400, Kirill Batuzov wrote:
> This series improves register allocator by keeping track of temp's and
> register's live ranges, doing better spill choice and spilling early unneeded
> globals.
>
> The patches do need testing and performance evaluation before they will be
> ready for final review. I decided to preliminary post them because it seems
> other people are working on the same thing and I'd like to avoid unnecessary
> work duplication if possible.
Thanks for this patch series. Your approach to solving this issue is
really different from mine. Instead I added more states to the dead/live
analysis, and use them to mark some inputs dead even for globals, and to
mark some output arguments as needing to be synced. This information is
then used directly in the tcg_reg_alloc_* functions to make better use of
the available registers. On the other hand, my patch series only tries to
lower the number of spills and doesn't try to make better spill choices.
I guess it would be a good idea for me to continue with this approach (I
basically just have to fix a few cases where some regs are wrongly copied
back to memory), so that we can more easily compare the two approaches.
Your last patch is interesting in any case; having some statistics is
always useful.
In any case I really think we need a better register allocator before we
can do any serious optimization passes like constant or copy propagation,
otherwise we end up with a lot of registers in use for no real reason.
> Kirill Batuzov (8):
> Compute additional liveness information for register allocator.
> Propagate REG_NEXT_USE value through process of register allocation.
> Do better spill choice.
> Calculate NEXT_CALL liveness information.
> Track call-clobbered uses of registers
> Spill globals early if their next use is in call.
> Spill globals early if their next use is at the BB end.
> Add spill count profiling.
>
> tcg/tcg.c | 245 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
> tcg/tcg.h | 12 +++
> 2 files changed, 248 insertions(+), 9 deletions(-)
>
> --
> 1.7.4.1
>
>
>
--
Aurelien Jarno GPG: 1024D/F1BCDB73
aurelien@aurel32.net http://www.aurel32.net
* Re: [Qemu-devel] [RFC][PATCH v0 0/8] Improve register allocator
2011-05-23 21:22 ` [Qemu-devel] [RFC][PATCH v0 0/8] Improve register allocator Aurelien Jarno
@ 2011-05-24 11:31 ` Kirill Batuzov
2011-05-24 12:40 ` Aurelien Jarno
` (2 more replies)
0 siblings, 3 replies; 16+ messages in thread
From: Kirill Batuzov @ 2011-05-24 11:31 UTC (permalink / raw)
To: Aurelien Jarno; +Cc: qemu-devel, zhur
On Mon, 23 May 2011, Aurelien Jarno wrote:
>
> Thanks for this patch series. Your approach to solve this issue is
> really different than mine. Instead I added more state to the dead/live
> states, and use them to mark some input deads even for global, and mark
> some output arguments to be synced. This informations are then used
> directly in the tcg_reg_alloc_* functions to make better usage of the
> available registers. On the other hand my patch series only tries to
> really lower the number of spills and doesn't try to make better spill
> choices.
>
> I guess it would be a good idea that I continue with this approach (I
> basically just have to fix a few cases were some regs are wrongly copied
> back to memory), so that we can more easily compare the two approaches.
> Your last patch is anyway interesting, having some statistics is always
> something interesting.
>
> In any case I really think we need a better register allocator before we
> can do any serious optimization passes like constant or copy propagation,
> otherwise we end up with a lot of register in use for no real reason.
>
When I started working on this patch series I first wanted to write a
better register allocator, something linear-scan based. But TBs currently
have a quite specific and very simple structure. They have globals, which
are alive everywhere, and temps, packed into a number of nests. Each nest
is the result of translating one guest instruction. Live ranges of temps in
one nest always intersect, while live ranges of temps from different
nests never intersect. As a result, a more sophisticated algorithm applied
to this kind of input works very similarly to the simple greedy algorithm
we have right now.
The gathered statistics show some interesting things too. I've run a matrix
multiplication benchmark (guest: ARM, host: x86, linux-user mode, with
my patches applied) and here are the results:
spill count 3916
real spills 32
spills at bb end 1023
spills at call:
globals 2755
iarg passing 0
call cloobers 106
Real spills are spills generated by the register allocator when it runs out
of registers. They account for less than 1% of all spills. Other tests show
similar behavior.
I think any further improvement to the register allocator is useless unless
we somehow relax the conventions about saving globals at calls and at BB
ends.
Currently we are looking into whether we can pass some globals in registers
across basic block boundaries (inside one TB, of course).
* Re: [Qemu-devel] [RFC][PATCH v0 0/8] Improve register allocator
2011-05-24 11:31 ` Kirill Batuzov
@ 2011-05-24 12:40 ` Aurelien Jarno
2011-05-24 13:24 ` Laurent Desnogues
2011-05-24 16:07 ` Richard Henderson
2 siblings, 0 replies; 16+ messages in thread
From: Aurelien Jarno @ 2011-05-24 12:40 UTC (permalink / raw)
To: Kirill Batuzov; +Cc: qemu-devel, zhur
On Tue, May 24, 2011 at 03:31:11PM +0400, Kirill Batuzov wrote:
>
>
> On Mon, 23 May 2011, Aurelien Jarno wrote:
>
> >
> > Thanks for this patch series. Your approach to solve this issue is
> > really different than mine. Instead I added more state to the dead/live
> > states, and use them to mark some input deads even for global, and mark
> > some output arguments to be synced. This informations are then used
> > directly in the tcg_reg_alloc_* functions to make better usage of the
> > available registers. On the other hand my patch series only tries to
> > really lower the number of spills and doesn't try to make better spill
> > choices.
> >
> > I guess it would be a good idea that I continue with this approach (I
> > basically just have to fix a few cases were some regs are wrongly copied
> > back to memory), so that we can more easily compare the two approaches.
> > Your last patch is anyway interesting, having some statistics is always
> > something interesting.
> >
> > In any case I really think we need a better register allocator before we
> > can do any serious optimization passes like constant or copy propagation,
> > otherwise we end up with a lot of register in use for no real reason.
> >
> When I started working on this patch series I first wanted to write a
> better register allocator, something linear scan based. But TBs
> currently have quite specific and very simple structure. They have globals
> which are alive everywhere and temps, packed in a count of nests. Each nest
> is a result of translation of one guest instruction. Live ranges of temps in
> one nest always intersect, while live ranges of temps from different
> nests never intersect. As a result more sophisticated algorithm being
> applied to this test case works very similar to a simple greedy algorithm we
> have right now.
>
> Gathered statistics shows some interesting things too. I've run matrix
> multiplication benchmark (guest - ARM, host - x86, linux-user mode, with
> my patches applied) and here are the results:
>
> spill count 3916
> real spills 32
> spills at bb end 1023
> spills at call:
> globals 2755
> iarg passing 0
> call cloobers 106
>
> Real spills are spills generated by register allocator when it runs out
> of registers. They are less than 1% of all spills. Other tests show
> similar behavior.
>
> I think any further improvements to register allocator without leveling
> conventions about saving globals at calls and BB ends somehow is
> useless.
>
That's actually why in my implementation I distinguish between saving the
global back to memory and merely synchronizing it with memory while keeping
it in a register, if it is not written anymore afterwards. A lot of calls
(especially qemu ld/st) actually do not need to have the global moved back
to memory; they only need the value in memory to be synchronized, in case
an exception happens. Doing so saves the memory-to-register moves after the
call.
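Roughly (this is only a sketch of the idea, reusing the fields visible in
the tcg_reg_free() hunk quoted earlier in the thread, not my actual patch):
static void temp_sync_sketch(TCGContext *s, int temp)
{
    TCGTemp *ts = &s->temps[temp];
    if (ts->val_type == TEMP_VAL_REG && !ts->mem_coherent) {
        if (!ts->mem_allocated) {
            temp_allocate_frame(s, temp);
        }
        tcg_out_st(s, ts->type, ts->reg, ts->mem_reg, ts->mem_offset);
        /* memory now matches the register, but unlike tcg_reg_free()
           we keep val_type == TEMP_VAL_REG and the reg_to_temp[]
           mapping, so no reload is needed after the call */
        ts->mem_coherent = 1;
    }
}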
Anyway, your statistics actually show what I was trying to say in the
TCG_AREG0 thread: if we use the host registers correctly, TCG won't really
be able to use another register.
--
Aurelien Jarno GPG: 1024D/F1BCDB73
aurelien@aurel32.net http://www.aurel32.net
* Re: [Qemu-devel] [RFC][PATCH v0 0/8] Improve register allocator
2011-05-24 11:31 ` Kirill Batuzov
2011-05-24 12:40 ` Aurelien Jarno
@ 2011-05-24 13:24 ` Laurent Desnogues
2011-05-24 13:32 ` Kirill Batuzov
2011-05-24 16:07 ` Richard Henderson
2 siblings, 1 reply; 16+ messages in thread
From: Laurent Desnogues @ 2011-05-24 13:24 UTC (permalink / raw)
To: Kirill Batuzov; +Cc: zhur, qemu-devel, Aurelien Jarno
On Tue, May 24, 2011 at 1:31 PM, Kirill Batuzov <batuzovk@ispras.ru> wrote:
[...]
> Gathered statistics shows some interesting things too. I've run matrix
> multiplication benchmark (guest - ARM, host - x86, linux-user mode, with
> my patches applied) and here are the results:
>
> spill count 3916
> real spills 32
> spills at bb end 1023
> spills at call:
> globals 2755
> iarg passing 0
> call cloobers 106
>
> Real spills are spills generated by register allocator when it runs out
> of registers. They are less than 1% of all spills. Other tests show
> similar behavior.
When you write "host x86", do you mean IA32 or x86_64?
That might change the number of real spills a lot if you meant
x86_64.
> I think any further improvements to register allocator without leveling
> conventions about saving globals at calls and BB ends somehow is
> useless.
>
> Currently we are looking if we can pass some globals on registers
> through basic block boundaries (inside one TB of course).
If by "basic block", you mean BB as implied by TCG br for
instance, I'm not sure all guests will benefit a lot. If you
mean that you intend on putting several guests BB in a
single TB then I guess you'll have to first collect dynamic
statistics before dynamically switching to grouping BB.
Laurent
* Re: [Qemu-devel] [RFC][PATCH v0 0/8] Improve register allocator
2011-05-24 13:24 ` Laurent Desnogues
@ 2011-05-24 13:32 ` Kirill Batuzov
0 siblings, 0 replies; 16+ messages in thread
From: Kirill Batuzov @ 2011-05-24 13:32 UTC (permalink / raw)
To: Laurent Desnogues; +Cc: zhur, qemu-devel, Aurelien Jarno
On Tue, 24 May 2011, Laurent Desnogues wrote:
> On Tue, May 24, 2011 at 1:31 PM, Kirill Batuzov <batuzovk@ispras.ru> wrote:
> [...]
> > Gathered statistics shows some interesting things too. I've run matrix
> > multiplication benchmark (guest - ARM, host - x86, linux-user mode, with
> > my patches applied) and here are the results:
> >
> > spill count 3916
> > real spills 32
> > spills at bb end 1023
> > spills at call:
> > globals 2755
> > iarg passing 0
> > call cloobers 106
> >
> > Real spills are spills generated by register allocator when it runs out
> > of registers. They are less than 1% of all spills. Other tests show
> > similar behavior.
>
> When you write "host x86", do you mean IA32 or x86_64?
> That might change the number of real spills a lot if you meant
> x86_64.
>
I mean IA32, not x86_64.
> > I think any further improvements to register allocator without leveling
> > conventions about saving globals at calls and BB ends somehow is
> > useless.
> >
> > Currently we are looking if we can pass some globals on registers
> > through basic block boundaries (inside one TB of course).
>
> If by "basic block", you mean BB as implied by TCG br for
> instance, I'm not sure all guests will benefit a lot. If you
> mean that you intend on putting several guests BB in a
> single TB then I guess you'll have to first collect dynamic
> statistics before dynamically switching to grouping BB.
>
I mean BB as implied by TCG br. Even if only some guests will benefit
from this, it still looks like a good idea. Other guests should not be
affected much, I believe.
----
Kirill.
* Re: [Qemu-devel] [RFC][PATCH v0 0/8] Improve register allocator
2011-05-24 11:31 ` Kirill Batuzov
2011-05-24 12:40 ` Aurelien Jarno
2011-05-24 13:24 ` Laurent Desnogues
@ 2011-05-24 16:07 ` Richard Henderson
2 siblings, 0 replies; 16+ messages in thread
From: Richard Henderson @ 2011-05-24 16:07 UTC (permalink / raw)
To: Kirill Batuzov; +Cc: zhur, qemu-devel, Aurelien Jarno
On 05/24/2011 04:31 AM, Kirill Batuzov wrote:
>
>
> On Mon, 23 May 2011, Aurelien Jarno wrote:
>
>>
>> Thanks for this patch series. Your approach to solve this issue is
>> really different than mine. Instead I added more state to the dead/live
>> states, and use them to mark some input deads even for global, and mark
>> some output arguments to be synced. This informations are then used
>> directly in the tcg_reg_alloc_* functions to make better usage of the
>> available registers. On the other hand my patch series only tries to
>> really lower the number of spills and doesn't try to make better spill
>> choices.
>>
>> I guess it would be a good idea that I continue with this approach (I
>> basically just have to fix a few cases were some regs are wrongly copied
>> back to memory), so that we can more easily compare the two approaches.
>> Your last patch is anyway interesting, having some statistics is always
>> something interesting.
>>
>> In any case I really think we need a better register allocator before we
>> can do any serious optimization passes like constant or copy propagation,
>> otherwise we end up with a lot of register in use for no real reason.
>>
> When I started working on this patch series I first wanted to write a
> better register allocator, something linear scan based. But TBs
> currently have quite specific and very simple structure. They have globals
> which are alive everywhere and temps, packed in a count of nests. Each nest
> is a result of translation of one guest instruction. Live ranges of temps in
> one nest always intersect, while live ranges of temps from different
> nests never intersect. As a result more sophisticated algorithm being
> applied to this test case works very similar to a simple greedy algorithm we
> have right now.
Something that would be helpful for the RISC hosts would be to add some
mechanism to add constants -- or constant fragments, if you like -- into
the register allocation mix.
If you have access to a Sparc or PPC host (perhaps emulated under qemu),
have a look at the code generated for an i386, or even arm executable.
You'll see lots of similar constants being created, all in a 2-3 insn
sequence. Have a look at the code generated for a 64-bit target like
Alpha and it'll be a 4-6 insn sequence.
Ideally we'd be able to register-allocate these partial constant loads,
and so collapse similar sequences. We have tons of registers that are
not being used on these hosts, which seems a shame.
r~