* [Qemu-devel] [PATCH 0/4] tcg memory usage improvements
@ 2016-06-23 18:02 Richard Henderson
2016-06-23 18:02 ` [Qemu-devel] [PATCH 1/4] tcg: Compress liveness data to 16 bits Richard Henderson
` (3 more replies)
0 siblings, 4 replies; 5+ messages in thread
From: Richard Henderson @ 2016-06-23 18:02 UTC (permalink / raw)
To: qemu-devel; +Cc: aurelien
While continuing to work on the sparc64 on i686 problem, I've run into
a few things that could be done better.
I haven't done any proper measurements of memory usage or boot time,
but altogether my guess is that this saves 10-20k. Not much, I grant,
but improving cache locality ought not to hurt.
r~
Richard Henderson (4):
tcg: Compress liveness data to 16 bits
tcg: Reorg TCGOp chaining
tcg: Fold life data into TCGOp
tcg: Compress dead_temps and mem_temps into a single array
include/exec/gen-icount.h | 2 +-
tcg/optimize.c | 37 +---
tcg/tcg-op.c | 2 +-
tcg/tcg.c | 472 ++++++++++++++++++++++++++++------------------
tcg/tcg.h | 50 +++--
5 files changed, 329 insertions(+), 234 deletions(-)
--
2.5.5
^ permalink raw reply [flat|nested] 5+ messages in thread
* [Qemu-devel] [PATCH 1/4] tcg: Compress liveness data to 16 bits
2016-06-23 18:02 [Qemu-devel] [PATCH 0/4] tcg memory usage improvements Richard Henderson
@ 2016-06-23 18:02 ` Richard Henderson
2016-06-23 18:03 ` [Qemu-devel] [PATCH 2/4] tcg: Reorg TCGOp chaining Richard Henderson
` (2 subsequent siblings)
3 siblings, 0 replies; 5+ messages in thread
From: Richard Henderson @ 2016-06-23 18:02 UTC (permalink / raw)
To: qemu-devel; +Cc: aurelien
This reduces both memory usage and per-insn cacheline usage
during code generation.
Signed-off-by: Richard Henderson <rth@twiddle.net>
---
tcg/tcg.c | 58 ++++++++++++++++++++++------------------------------------
tcg/tcg.h | 16 ++++++++++------
2 files changed, 32 insertions(+), 42 deletions(-)
diff --git a/tcg/tcg.c b/tcg/tcg.c
index 44de991..4b117fd 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -1331,7 +1331,7 @@ static inline void tcg_la_bb_end(TCGContext *s, uint8_t *dead_temps,
}
}
-/* Liveness analysis : update the opc_dead_args array to tell if a
+/* Liveness analysis : update the opc_arg_life array to tell if a
given input arguments is dead. Instructions updating dead
temporaries are removed. */
static void tcg_liveness_analysis(TCGContext *s)
@@ -1340,9 +1340,8 @@ static void tcg_liveness_analysis(TCGContext *s)
int oi, oi_prev, nb_ops;
nb_ops = s->gen_next_op_idx;
- s->op_dead_args = tcg_malloc(nb_ops * sizeof(uint16_t));
- s->op_sync_args = tcg_malloc(nb_ops * sizeof(uint8_t));
-
+ s->op_arg_life = tcg_malloc(nb_ops * sizeof(TCGLifeData));
+
dead_temps = tcg_malloc(s->nb_temps);
mem_temps = tcg_malloc(s->nb_temps);
tcg_la_func_end(s, dead_temps, mem_temps);
@@ -1351,8 +1350,7 @@ static void tcg_liveness_analysis(TCGContext *s)
int i, nb_iargs, nb_oargs;
TCGOpcode opc_new, opc_new2;
bool have_opc_new2;
- uint16_t dead_args;
- uint8_t sync_args;
+ TCGLifeData arg_life = 0;
TCGArg arg;
TCGOp * const op = &s->gen_op_buf[oi];
@@ -1384,15 +1382,13 @@ static void tcg_liveness_analysis(TCGContext *s)
do_not_remove_call:
/* output args are dead */
- dead_args = 0;
- sync_args = 0;
for (i = 0; i < nb_oargs; i++) {
arg = args[i];
if (dead_temps[arg]) {
- dead_args |= (1 << i);
+ arg_life |= DEAD_ARG << i;
}
if (mem_temps[arg]) {
- sync_args |= (1 << i);
+ arg_life |= SYNC_ARG << i;
}
dead_temps[arg] = 1;
mem_temps[arg] = 0;
@@ -1413,7 +1409,7 @@ static void tcg_liveness_analysis(TCGContext *s)
arg = args[i];
if (arg != TCG_CALL_DUMMY_ARG) {
if (dead_temps[arg]) {
- dead_args |= (1 << i);
+ arg_life |= DEAD_ARG << i;
}
}
}
@@ -1422,8 +1418,6 @@ static void tcg_liveness_analysis(TCGContext *s)
arg = args[i];
dead_temps[arg] = 0;
}
- s->op_dead_args[oi] = dead_args;
- s->op_sync_args[oi] = sync_args;
}
}
break;
@@ -1534,15 +1528,13 @@ static void tcg_liveness_analysis(TCGContext *s)
} else {
do_not_remove:
/* output args are dead */
- dead_args = 0;
- sync_args = 0;
for (i = 0; i < nb_oargs; i++) {
arg = args[i];
if (dead_temps[arg]) {
- dead_args |= (1 << i);
+ arg_life |= DEAD_ARG << i;
}
if (mem_temps[arg]) {
- sync_args |= (1 << i);
+ arg_life |= SYNC_ARG << i;
}
dead_temps[arg] = 1;
mem_temps[arg] = 0;
@@ -1560,7 +1552,7 @@ static void tcg_liveness_analysis(TCGContext *s)
for (i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
arg = args[i];
if (dead_temps[arg]) {
- dead_args |= (1 << i);
+ arg_life |= DEAD_ARG << i;
}
}
/* input arguments are live for preceding opcodes */
@@ -1568,11 +1560,10 @@ static void tcg_liveness_analysis(TCGContext *s)
arg = args[i];
dead_temps[arg] = 0;
}
- s->op_dead_args[oi] = dead_args;
- s->op_sync_args[oi] = sync_args;
}
break;
}
+ s->op_arg_life[oi] = arg_life;
}
}
#else
@@ -1911,11 +1902,11 @@ static void tcg_reg_alloc_bb_end(TCGContext *s, TCGRegSet allocated_regs)
save_globals(s, allocated_regs);
}
-#define IS_DEAD_ARG(n) ((dead_args >> (n)) & 1)
-#define NEED_SYNC_ARG(n) ((sync_args >> (n)) & 1)
+#define IS_DEAD_ARG(n) (arg_life & (DEAD_ARG << (n)))
+#define NEED_SYNC_ARG(n) (arg_life & (SYNC_ARG << (n)))
static void tcg_reg_alloc_movi(TCGContext *s, const TCGArg *args,
- uint16_t dead_args, uint8_t sync_args)
+ TCGLifeData arg_life)
{
TCGTemp *ots;
tcg_target_ulong val;
@@ -1944,8 +1935,7 @@ static void tcg_reg_alloc_movi(TCGContext *s, const TCGArg *args,
}
static void tcg_reg_alloc_mov(TCGContext *s, const TCGOpDef *def,
- const TCGArg *args, uint16_t dead_args,
- uint8_t sync_args)
+ const TCGArg *args, TCGLifeData arg_life)
{
TCGRegSet allocated_regs;
TCGTemp *ts, *ots;
@@ -2030,8 +2020,7 @@ static void tcg_reg_alloc_mov(TCGContext *s, const TCGOpDef *def,
static void tcg_reg_alloc_op(TCGContext *s,
const TCGOpDef *def, TCGOpcode opc,
- const TCGArg *args, uint16_t dead_args,
- uint8_t sync_args)
+ const TCGArg *args, TCGLifeData arg_life)
{
TCGRegSet allocated_regs;
int i, k, nb_iargs, nb_oargs;
@@ -2196,8 +2185,7 @@ static void tcg_reg_alloc_op(TCGContext *s,
#endif
static void tcg_reg_alloc_call(TCGContext *s, int nb_oargs, int nb_iargs,
- const TCGArg * const args, uint16_t dead_args,
- uint8_t sync_args)
+ const TCGArg * const args, TCGLifeData arg_life)
{
int flags, nb_regs, i;
TCGReg reg;
@@ -2417,8 +2405,7 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
TCGArg * const args = &s->gen_opparam_buf[op->args];
TCGOpcode opc = op->opc;
const TCGOpDef *def = &tcg_op_defs[opc];
- uint16_t dead_args = s->op_dead_args[oi];
- uint8_t sync_args = s->op_sync_args[oi];
+ TCGLifeData arg_life = s->op_arg_life[oi];
oi_next = op->next;
#ifdef CONFIG_PROFILER
@@ -2428,11 +2415,11 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
switch (opc) {
case INDEX_op_mov_i32:
case INDEX_op_mov_i64:
- tcg_reg_alloc_mov(s, def, args, dead_args, sync_args);
+ tcg_reg_alloc_mov(s, def, args, arg_life);
break;
case INDEX_op_movi_i32:
case INDEX_op_movi_i64:
- tcg_reg_alloc_movi(s, args, dead_args, sync_args);
+ tcg_reg_alloc_movi(s, args, arg_life);
break;
case INDEX_op_insn_start:
if (num_insns >= 0) {
@@ -2457,8 +2444,7 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
tcg_out_label(s, arg_label(args[0]), s->code_ptr);
break;
case INDEX_op_call:
- tcg_reg_alloc_call(s, op->callo, op->calli, args,
- dead_args, sync_args);
+ tcg_reg_alloc_call(s, op->callo, op->calli, args, arg_life);
break;
default:
/* Sanity check that we've not introduced any unhandled opcodes. */
@@ -2468,7 +2454,7 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
/* Note: in order to speed up the code, it would be much
faster to have specialized register allocator functions for
some common argument patterns */
- tcg_reg_alloc_op(s, def, opc, args, dead_args, sync_args);
+ tcg_reg_alloc_op(s, def, opc, args, arg_life);
break;
}
#ifdef CONFIG_DEBUG_TCG
diff --git a/tcg/tcg.h b/tcg/tcg.h
index 66d7fc0..cc14560 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -505,6 +505,14 @@ typedef struct TCGTempSet {
unsigned long l[BITS_TO_LONGS(TCG_MAX_TEMPS)];
} TCGTempSet;
+/* While we limit helpers to 6 arguments, for 32-bit hosts, with padding,
+ this implies a max of 6*2 (64-bit in) + 2 (64-bit out) = 14 operands.
+ There are never more than 2 outputs, which means that we can store all
+ dead + sync data within 16 bits. */
+#define DEAD_ARG 4
+#define SYNC_ARG 1
+typedef uint16_t TCGLifeData;
+
typedef struct TCGOp {
TCGOpcode opc : 8;
@@ -538,12 +546,8 @@ struct TCGContext {
uintptr_t *tb_jmp_target_addr; /* tb->jmp_target_addr if !USE_DIRECT_JUMP */
/* liveness analysis */
- uint16_t *op_dead_args; /* for each operation, each bit tells if the
- corresponding argument is dead */
- uint8_t *op_sync_args; /* for each operation, each bit tells if the
- corresponding output argument needs to be
- sync to memory. */
-
+ TCGLifeData *op_arg_life;
+
TCGRegSet reserved_regs;
intptr_t current_frame_offset;
intptr_t frame_start;
--
2.5.5
^ permalink raw reply related [flat|nested] 5+ messages in thread
* [Qemu-devel] [PATCH 2/4] tcg: Reorg TCGOp chaining
2016-06-23 18:02 [Qemu-devel] [PATCH 0/4] tcg memory usage improvements Richard Henderson
2016-06-23 18:02 ` [Qemu-devel] [PATCH 1/4] tcg: Compress liveness data to 16 bits Richard Henderson
@ 2016-06-23 18:03 ` Richard Henderson
2016-06-23 18:03 ` [Qemu-devel] [PATCH 3/4] tcg: Fold life data into TCGOp Richard Henderson
2016-06-23 18:03 ` [Qemu-devel] [PATCH 4/4] tcg: Compress dead_temps and mem_temps into a single array Richard Henderson
3 siblings, 0 replies; 5+ messages in thread
From: Richard Henderson @ 2016-06-23 18:03 UTC (permalink / raw)
To: qemu-devel; +Cc: aurelien
Instead of using -1 as end of chain, use 0, and link through the 0
entry as a fully circular doubly-linked list.
Signed-off-by: Richard Henderson <rth@twiddle.net>
---
include/exec/gen-icount.h | 2 +-
tcg/optimize.c | 8 ++------
tcg/tcg-op.c | 2 +-
tcg/tcg.c | 32 ++++++++++++--------------------
tcg/tcg.h | 20 ++++++++++++--------
5 files changed, 28 insertions(+), 36 deletions(-)
diff --git a/include/exec/gen-icount.h b/include/exec/gen-icount.h
index a011324..5f16077 100644
--- a/include/exec/gen-icount.h
+++ b/include/exec/gen-icount.h
@@ -59,7 +59,7 @@ static void gen_tb_end(TranslationBlock *tb, int num_insns)
}
/* Terminate the linked list. */
- tcg_ctx.gen_op_buf[tcg_ctx.gen_last_op_idx].next = -1;
+ tcg_ctx.gen_op_buf[tcg_ctx.gen_op_buf[0].prev].next = 0;
}
static inline void gen_io_start(void)
diff --git a/tcg/optimize.c b/tcg/optimize.c
index c0d975b..8df7fc7 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -103,11 +103,7 @@ static TCGOp *insert_op_before(TCGContext *s, TCGOp *old_op,
.prev = prev,
.next = next
};
- if (prev >= 0) {
- s->gen_op_buf[prev].next = oi;
- } else {
- s->gen_first_op_idx = oi;
- }
+ s->gen_op_buf[prev].next = oi;
old_op->prev = oi;
return new_op;
@@ -583,7 +579,7 @@ void tcg_optimize(TCGContext *s)
nb_globals = s->nb_globals;
reset_all_temps(nb_temps);
- for (oi = s->gen_first_op_idx; oi >= 0; oi = oi_next) {
+ for (oi = s->gen_op_buf[0].next; oi != 0; oi = oi_next) {
tcg_target_ulong mask, partmask, affected;
int nb_oargs, nb_iargs, i;
TCGArg tmp;
diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c
index 569cdc6..62d91b4 100644
--- a/tcg/tcg-op.c
+++ b/tcg/tcg-op.c
@@ -52,7 +52,7 @@ static void tcg_emit_op(TCGContext *ctx, TCGOpcode opc, int args)
int pi = oi - 1;
tcg_debug_assert(oi < OPC_BUF_SIZE);
- ctx->gen_last_op_idx = oi;
+ ctx->gen_op_buf[0].prev = oi;
ctx->gen_next_op_idx = ni;
ctx->gen_op_buf[oi] = (TCGOp){
diff --git a/tcg/tcg.c b/tcg/tcg.c
index 4b117fd..3e884e4 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -438,9 +438,9 @@ void tcg_func_start(TCGContext *s)
s->goto_tb_issue_mask = 0;
#endif
- s->gen_first_op_idx = 0;
- s->gen_last_op_idx = -1;
- s->gen_next_op_idx = 0;
+ s->gen_op_buf[0].next = 1;
+ s->gen_op_buf[0].prev = 0;
+ s->gen_next_op_idx = 1;
s->gen_next_parm_idx = 0;
s->be = tcg_malloc(sizeof(TCGBackendData));
@@ -869,7 +869,7 @@ void tcg_gen_callN(TCGContext *s, void *func, TCGArg ret,
/* Make sure the calli field didn't overflow. */
tcg_debug_assert(s->gen_op_buf[i].calli == real_args);
- s->gen_last_op_idx = i;
+ s->gen_op_buf[0].prev = i;
s->gen_next_op_idx = i + 1;
s->gen_next_parm_idx = pi;
@@ -1005,7 +1005,7 @@ void tcg_dump_ops(TCGContext *s)
TCGOp *op;
int oi;
- for (oi = s->gen_first_op_idx; oi >= 0; oi = op->next) {
+ for (oi = s->gen_op_buf[0].next; oi != 0; oi = op->next) {
int i, k, nb_oargs, nb_iargs, nb_cargs;
const TCGOpDef *def;
const TCGArg *args;
@@ -1017,7 +1017,7 @@ void tcg_dump_ops(TCGContext *s)
args = &s->gen_opparam_buf[op->args];
if (c == INDEX_op_insn_start) {
- qemu_log("%s ----", oi != s->gen_first_op_idx ? "\n" : "");
+ qemu_log("%s ----", oi != s->gen_op_buf[0].next ? "\n" : "");
for (i = 0; i < TARGET_INSN_START_WORDS; ++i) {
target_ulong a;
@@ -1288,18 +1288,10 @@ void tcg_op_remove(TCGContext *s, TCGOp *op)
int next = op->next;
int prev = op->prev;
- if (next >= 0) {
- s->gen_op_buf[next].prev = prev;
- } else {
- s->gen_last_op_idx = prev;
- }
- if (prev >= 0) {
- s->gen_op_buf[prev].next = next;
- } else {
- s->gen_first_op_idx = next;
- }
+ s->gen_op_buf[next].prev = prev;
+ s->gen_op_buf[prev].next = next;
- memset(op, -1, sizeof(*op));
+ memset(op, 0, sizeof(*op));
#ifdef CONFIG_PROFILER
s->del_op_count++;
@@ -1346,7 +1338,7 @@ static void tcg_liveness_analysis(TCGContext *s)
mem_temps = tcg_malloc(s->nb_temps);
tcg_la_func_end(s, dead_temps, mem_temps);
- for (oi = s->gen_last_op_idx; oi >= 0; oi = oi_prev) {
+ for (oi = s->gen_op_buf[0].prev; oi != 0; oi = oi_prev) {
int i, nb_iargs, nb_oargs;
TCGOpcode opc_new, opc_new2;
bool have_opc_new2;
@@ -2341,7 +2333,7 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
{
int n;
- n = s->gen_last_op_idx + 1;
+ n = s->gen_op_buf[0].prev + 1;
s->op_count += n;
if (n > s->op_count_max) {
s->op_count_max = n;
@@ -2400,7 +2392,7 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
tcg_out_tb_init(s);
num_insns = -1;
- for (oi = s->gen_first_op_idx; oi >= 0; oi = oi_next) {
+ for (oi = s->gen_op_buf[0].next; oi != 0; oi = oi_next) {
TCGOp * const op = &s->gen_op_buf[oi];
TCGArg * const args = &s->gen_opparam_buf[op->args];
TCGOpcode opc = op->opc;
diff --git a/tcg/tcg.h b/tcg/tcg.h
index cc14560..49b396d 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -520,17 +520,21 @@ typedef struct TCGOp {
unsigned callo : 2;
unsigned calli : 6;
- /* Index of the arguments for this op, or -1 for zero-operand ops. */
- signed args : 16;
+ /* Index of the arguments for this op, or 0 for zero-operand ops. */
+ unsigned args : 16;
- /* Index of the prex/next op, or -1 for the end of the list. */
- signed prev : 16;
- signed next : 16;
+ /* Index of the prex/next op, or 0 for the end of the list. */
+ unsigned prev : 16;
+ unsigned next : 16;
} TCGOp;
-QEMU_BUILD_BUG_ON(NB_OPS > 0xff);
-QEMU_BUILD_BUG_ON(OPC_BUF_SIZE >= 0x7fff);
-QEMU_BUILD_BUG_ON(OPPARAM_BUF_SIZE >= 0x7fff);
+/* Make sure operands fit in the bitfields above. */
+QEMU_BUILD_BUG_ON(NB_OPS > (1 << 8));
+QEMU_BUILD_BUG_ON(OPC_BUF_SIZE > (1 << 16));
+QEMU_BUILD_BUG_ON(OPPARAM_BUF_SIZE > (1 << 16));
+
+/* Make sure that we don't overflow 64 bits without noticing. */
+QEMU_BUILD_BUG_ON(sizeof(TCGOp) > 8);
struct TCGContext {
uint8_t *pool_cur, *pool_end;
--
2.5.5
^ permalink raw reply related [flat|nested] 5+ messages in thread
* [Qemu-devel] [PATCH 3/4] tcg: Fold life data into TCGOp
2016-06-23 18:02 [Qemu-devel] [PATCH 0/4] tcg memory usage improvements Richard Henderson
2016-06-23 18:02 ` [Qemu-devel] [PATCH 1/4] tcg: Compress liveness data to 16 bits Richard Henderson
2016-06-23 18:03 ` [Qemu-devel] [PATCH 2/4] tcg: Reorg TCGOp chaining Richard Henderson
@ 2016-06-23 18:03 ` Richard Henderson
2016-06-23 18:03 ` [Qemu-devel] [PATCH 4/4] tcg: Compress dead_temps and mem_temps into a single array Richard Henderson
3 siblings, 0 replies; 5+ messages in thread
From: Richard Henderson @ 2016-06-23 18:03 UTC (permalink / raw)
To: qemu-devel; +Cc: aurelien
Reduce the size of other bitfields to make room.
This reduces the cache footprint of compilation.
Signed-off-by: Richard Henderson <rth@twiddle.net>
---
tcg/tcg.c | 9 +++------
tcg/tcg.h | 26 ++++++++++++++------------
2 files changed, 17 insertions(+), 18 deletions(-)
diff --git a/tcg/tcg.c b/tcg/tcg.c
index 3e884e4..b0c9dca 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -1329,10 +1329,7 @@ static inline void tcg_la_bb_end(TCGContext *s, uint8_t *dead_temps,
static void tcg_liveness_analysis(TCGContext *s)
{
uint8_t *dead_temps, *mem_temps;
- int oi, oi_prev, nb_ops;
-
- nb_ops = s->gen_next_op_idx;
- s->op_arg_life = tcg_malloc(nb_ops * sizeof(TCGLifeData));
+ int oi, oi_prev;
dead_temps = tcg_malloc(s->nb_temps);
mem_temps = tcg_malloc(s->nb_temps);
@@ -1555,7 +1552,7 @@ static void tcg_liveness_analysis(TCGContext *s)
}
break;
}
- s->op_arg_life[oi] = arg_life;
+ op->life = arg_life;
}
}
#else
@@ -2397,7 +2394,7 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
TCGArg * const args = &s->gen_opparam_buf[op->args];
TCGOpcode opc = op->opc;
const TCGOpDef *def = &tcg_op_defs[opc];
- TCGLifeData arg_life = s->op_arg_life[oi];
+ TCGLifeData arg_life = op->life;
oi_next = op->next;
#ifdef CONFIG_PROFILER
diff --git a/tcg/tcg.h b/tcg/tcg.h
index 49b396d..2ff3ad2 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -513,25 +513,30 @@ typedef struct TCGTempSet {
#define SYNC_ARG 1
typedef uint16_t TCGLifeData;
+/* The layout here is designed to avoid crossing of a 32-bit boundary.
+ If we do so, gcc adds padding, expanding the size to 12. */
typedef struct TCGOp {
- TCGOpcode opc : 8;
+ TCGOpcode opc : 8; /* 8 */
+
+ /* Index of the prev/next op, or 0 for the end of the list. */
+ unsigned prev : 10; /* 18 */
+ unsigned next : 10; /* 28 */
/* The number of out and in parameter for a call. */
- unsigned callo : 2;
- unsigned calli : 6;
+ unsigned calli : 4; /* 32 */
+ unsigned callo : 2; /* 34 */
/* Index of the arguments for this op, or 0 for zero-operand ops. */
- unsigned args : 16;
+ unsigned args : 14; /* 48 */
- /* Index of the prex/next op, or 0 for the end of the list. */
- unsigned prev : 16;
- unsigned next : 16;
+ /* Lifetime data of the operands. */
+ unsigned life : 16; /* 64 */
} TCGOp;
/* Make sure operands fit in the bitfields above. */
QEMU_BUILD_BUG_ON(NB_OPS > (1 << 8));
-QEMU_BUILD_BUG_ON(OPC_BUF_SIZE > (1 << 16));
-QEMU_BUILD_BUG_ON(OPPARAM_BUF_SIZE > (1 << 16));
+QEMU_BUILD_BUG_ON(OPC_BUF_SIZE > (1 << 10));
+QEMU_BUILD_BUG_ON(OPPARAM_BUF_SIZE > (1 << 14));
/* Make sure that we don't overflow 64 bits without noticing. */
QEMU_BUILD_BUG_ON(sizeof(TCGOp) > 8);
@@ -549,9 +554,6 @@ struct TCGContext {
uint16_t *tb_jmp_insn_offset; /* tb->jmp_insn_offset if USE_DIRECT_JUMP */
uintptr_t *tb_jmp_target_addr; /* tb->jmp_target_addr if !USE_DIRECT_JUMP */
- /* liveness analysis */
- TCGLifeData *op_arg_life;
-
TCGRegSet reserved_regs;
intptr_t current_frame_offset;
intptr_t frame_start;
--
2.5.5
^ permalink raw reply related [flat|nested] 5+ messages in thread
* [Qemu-devel] [PATCH 4/4] tcg: Compress dead_temps and mem_temps into a single array
2016-06-23 18:02 [Qemu-devel] [PATCH 0/4] tcg memory usage improvements Richard Henderson
` (2 preceding siblings ...)
2016-06-23 18:03 ` [Qemu-devel] [PATCH 3/4] tcg: Fold life data into TCGOp Richard Henderson
@ 2016-06-23 18:03 ` Richard Henderson
3 siblings, 0 replies; 5+ messages in thread
From: Richard Henderson @ 2016-06-23 18:03 UTC (permalink / raw)
To: qemu-devel; +Cc: aurelien
We only need two bits per temporary. Fold the two bytes into one,
and reduce the memory and cachelines required during compilation.
Signed-off-by: Richard Henderson <rth@twiddle.net>
---
tcg/tcg.c | 119 +++++++++++++++++++++++++++++++-------------------------------
1 file changed, 60 insertions(+), 59 deletions(-)
diff --git a/tcg/tcg.c b/tcg/tcg.c
index b0c9dca..6397a37 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -333,7 +333,7 @@ void tcg_context_init(TCGContext *s)
memset(s, 0, sizeof(*s));
s->nb_globals = 0;
-
+
/* Count total number of arguments and allocate the corresponding
space */
total_args = 0;
@@ -825,16 +825,16 @@ void tcg_gen_callN(TCGContext *s, void *func, TCGArg ret,
real_args++;
}
#endif
- /* If stack grows up, then we will be placing successive
- arguments at lower addresses, which means we need to
- reverse the order compared to how we would normally
- treat either big or little-endian. For those arguments
- that will wind up in registers, this still works for
- HPPA (the only current STACK_GROWSUP target) since the
- argument registers are *also* allocated in decreasing
- order. If another such target is added, this logic may
- have to get more complicated to differentiate between
- stack arguments and register arguments. */
+ /* If stack grows up, then we will be placing successive
+ arguments at lower addresses, which means we need to
+ reverse the order compared to how we would normally
+ treat either big or little-endian. For those arguments
+ that will wind up in registers, this still works for
+ HPPA (the only current STACK_GROWSUP target) since the
+ argument registers are *also* allocated in decreasing
+ order. If another such target is added, this logic may
+ have to get more complicated to differentiate between
+ stack arguments and register arguments. */
#if defined(HOST_WORDS_BIGENDIAN) != defined(TCG_TARGET_STACK_GROWSUP)
s->gen_opparam_buf[pi++] = args[i] + 1;
s->gen_opparam_buf[pi++] = args[i];
@@ -1299,27 +1299,29 @@ void tcg_op_remove(TCGContext *s, TCGOp *op)
}
#ifdef USE_LIVENESS_ANALYSIS
+
+#define TS_DEAD 1
+#define TS_SYNC 2
+
/* liveness analysis: end of function: all temps are dead, and globals
should be in memory. */
-static inline void tcg_la_func_end(TCGContext *s, uint8_t *dead_temps,
- uint8_t *mem_temps)
+static inline void tcg_la_func_end(TCGContext *s, uint8_t *temp_state)
{
- memset(dead_temps, 1, s->nb_temps);
- memset(mem_temps, 1, s->nb_globals);
- memset(mem_temps + s->nb_globals, 0, s->nb_temps - s->nb_globals);
+ memset(temp_state, TS_DEAD | TS_SYNC, s->nb_globals);
+ memset(temp_state + s->nb_globals, TS_DEAD, s->nb_temps - s->nb_globals);
}
/* liveness analysis: end of basic block: all temps are dead, globals
and local temps should be in memory. */
-static inline void tcg_la_bb_end(TCGContext *s, uint8_t *dead_temps,
- uint8_t *mem_temps)
+static inline void tcg_la_bb_end(TCGContext *s, uint8_t *temp_state)
{
- int i;
+ int i, n;
- memset(dead_temps, 1, s->nb_temps);
- memset(mem_temps, 1, s->nb_globals);
- for(i = s->nb_globals; i < s->nb_temps; i++) {
- mem_temps[i] = s->temps[i].temp_local;
+ tcg_la_func_end(s, temp_state);
+ for (i = s->nb_globals, n = s->nb_temps; i < n; i++) {
+ if (s->temps[i].temp_local) {
+ temp_state[i] |= TS_SYNC;
+ }
}
}
@@ -1328,12 +1330,12 @@ static inline void tcg_la_bb_end(TCGContext *s, uint8_t *dead_temps,
temporaries are removed. */
static void tcg_liveness_analysis(TCGContext *s)
{
- uint8_t *dead_temps, *mem_temps;
+ uint8_t *temp_state;
int oi, oi_prev;
+ int nb_globals = s->nb_globals;
- dead_temps = tcg_malloc(s->nb_temps);
- mem_temps = tcg_malloc(s->nb_temps);
- tcg_la_func_end(s, dead_temps, mem_temps);
+ temp_state = tcg_malloc(s->nb_temps);
+ tcg_la_func_end(s, temp_state);
for (oi = s->gen_op_buf[0].prev; oi != 0; oi = oi_prev) {
int i, nb_iargs, nb_oargs;
@@ -1362,7 +1364,7 @@ static void tcg_liveness_analysis(TCGContext *s)
if (call_flags & TCG_CALL_NO_SIDE_EFFECTS) {
for (i = 0; i < nb_oargs; i++) {
arg = args[i];
- if (!dead_temps[arg] || mem_temps[arg]) {
+ if (temp_state[arg] != TS_DEAD) {
goto do_not_remove_call;
}
}
@@ -1373,39 +1375,41 @@ static void tcg_liveness_analysis(TCGContext *s)
/* output args are dead */
for (i = 0; i < nb_oargs; i++) {
arg = args[i];
- if (dead_temps[arg]) {
+ if (temp_state[arg] & TS_DEAD) {
arg_life |= DEAD_ARG << i;
}
- if (mem_temps[arg]) {
+ if (temp_state[arg] & TS_SYNC) {
arg_life |= SYNC_ARG << i;
}
- dead_temps[arg] = 1;
- mem_temps[arg] = 0;
+ temp_state[arg] = TS_DEAD;
}
- if (!(call_flags & TCG_CALL_NO_READ_GLOBALS)) {
- /* globals should be synced to memory */
- memset(mem_temps, 1, s->nb_globals);
- }
if (!(call_flags & (TCG_CALL_NO_WRITE_GLOBALS |
TCG_CALL_NO_READ_GLOBALS))) {
/* globals should go back to memory */
- memset(dead_temps, 1, s->nb_globals);
+ memset(temp_state, TS_DEAD | TS_SYNC, nb_globals);
+ } else if (!(call_flags & TCG_CALL_NO_READ_GLOBALS)) {
+ /* globals should be synced to memory */
+ for (i = 0; i < nb_globals; i++) {
+ temp_state[i] |= TS_SYNC;
+ }
}
/* record arguments that die in this helper */
for (i = nb_oargs; i < nb_iargs + nb_oargs; i++) {
arg = args[i];
if (arg != TCG_CALL_DUMMY_ARG) {
- if (dead_temps[arg]) {
+ if (temp_state[arg] & TS_DEAD) {
arg_life |= DEAD_ARG << i;
}
}
}
/* input arguments are live for preceding opcodes */
- for (i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
+ for (i = nb_oargs; i < nb_iargs + nb_oargs; i++) {
arg = args[i];
- dead_temps[arg] = 0;
+ if (arg != TCG_CALL_DUMMY_ARG) {
+ temp_state[arg] &= ~TS_DEAD;
+ }
}
}
}
@@ -1414,8 +1418,7 @@ static void tcg_liveness_analysis(TCGContext *s)
break;
case INDEX_op_discard:
/* mark the temporary as dead */
- dead_temps[args[0]] = 1;
- mem_temps[args[0]] = 0;
+ temp_state[args[0]] = TS_DEAD;
break;
case INDEX_op_add2_i32:
@@ -1436,8 +1439,8 @@ static void tcg_liveness_analysis(TCGContext *s)
the low part. The result can be optimized to a simple
add or sub. This happens often for x86_64 guest when the
cpu mode is set to 32 bit. */
- if (dead_temps[args[1]] && !mem_temps[args[1]]) {
- if (dead_temps[args[0]] && !mem_temps[args[0]]) {
+ if (temp_state[args[1]] != TS_DEAD) {
+ if (temp_state[args[0]] != TS_DEAD) {
goto do_remove;
}
/* Replace the opcode and adjust the args in place,
@@ -1474,8 +1477,8 @@ static void tcg_liveness_analysis(TCGContext *s)
do_mul2:
nb_iargs = 2;
nb_oargs = 2;
- if (dead_temps[args[1]] && !mem_temps[args[1]]) {
- if (dead_temps[args[0]] && !mem_temps[args[0]]) {
+ if (temp_state[args[1]] != TS_DEAD) {
+ if (temp_state[args[0]] != TS_DEAD) {
/* Both parts of the operation are dead. */
goto do_remove;
}
@@ -1483,8 +1486,7 @@ static void tcg_liveness_analysis(TCGContext *s)
op->opc = opc = opc_new;
args[1] = args[2];
args[2] = args[3];
- } else if (have_opc_new2 && dead_temps[args[0]]
- && !mem_temps[args[0]]) {
+ } else if (temp_state[args[0]] != TS_DEAD && have_opc_new2) {
/* The low part of the operation is dead; generate the high. */
op->opc = opc = opc_new2;
args[0] = args[1];
@@ -1507,8 +1509,7 @@ static void tcg_liveness_analysis(TCGContext *s)
implies side effects */
if (!(def->flags & TCG_OPF_SIDE_EFFECTS) && nb_oargs != 0) {
for (i = 0; i < nb_oargs; i++) {
- arg = args[i];
- if (!dead_temps[arg] || mem_temps[arg]) {
+ if (temp_state[args[i]] != TS_DEAD) {
goto do_not_remove;
}
}
@@ -1519,35 +1520,35 @@ static void tcg_liveness_analysis(TCGContext *s)
/* output args are dead */
for (i = 0; i < nb_oargs; i++) {
arg = args[i];
- if (dead_temps[arg]) {
+ if (temp_state[arg] & TS_DEAD) {
arg_life |= DEAD_ARG << i;
}
- if (mem_temps[arg]) {
+ if (temp_state[arg] & TS_SYNC) {
arg_life |= SYNC_ARG << i;
}
- dead_temps[arg] = 1;
- mem_temps[arg] = 0;
+ temp_state[arg] = TS_DEAD;
}
/* if end of basic block, update */
if (def->flags & TCG_OPF_BB_END) {
- tcg_la_bb_end(s, dead_temps, mem_temps);
+ tcg_la_bb_end(s, temp_state);
} else if (def->flags & TCG_OPF_SIDE_EFFECTS) {
/* globals should be synced to memory */
- memset(mem_temps, 1, s->nb_globals);
+ for (i = 0; i < nb_globals; i++) {
+ temp_state[i] |= TS_SYNC;
+ }
}
/* record arguments that die in this opcode */
for (i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
arg = args[i];
- if (dead_temps[arg]) {
+ if (temp_state[arg] & TS_DEAD) {
arg_life |= DEAD_ARG << i;
}
}
/* input arguments are live for preceding opcodes */
for (i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
- arg = args[i];
- dead_temps[arg] = 0;
+ temp_state[args[i]] &= ~TS_DEAD;
}
}
break;
--
2.5.5
^ permalink raw reply related [flat|nested] 5+ messages in thread
end of thread, other threads:[~2016-06-23 18:03 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz, follow: Atom feed)
-- links below jump to the message on this page --
2016-06-23 18:02 [Qemu-devel] [PATCH 0/4] tcg memory usage improvements Richard Henderson
2016-06-23 18:02 ` [Qemu-devel] [PATCH 1/4] tcg: Compress liveness data to 16 bits Richard Henderson
2016-06-23 18:03 ` [Qemu-devel] [PATCH 2/4] tcg: Reorg TCGOp chaining Richard Henderson
2016-06-23 18:03 ` [Qemu-devel] [PATCH 3/4] tcg: Fold life data into TCGOp Richard Henderson
2016-06-23 18:03 ` [Qemu-devel] [PATCH 4/4] tcg: Compress dead_temps and mem_temps into a single array Richard Henderson
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).