qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed
* [Qemu-devel] [PATCH 0/4] tcg memory usage improvements
@ 2016-06-23 18:02 Richard Henderson
  2016-06-23 18:02 ` [Qemu-devel] [PATCH 1/4] tcg: Compress liveness data to 16 bits Richard Henderson
                   ` (3 more replies)
  0 siblings, 4 replies; 5+ messages in thread
From: Richard Henderson @ 2016-06-23 18:02 UTC (permalink / raw)
  To: qemu-devel; +Cc: aurelien

While continuing to work on the sparc64 on i686 problem, I've run into
a few things that could be done better.

I haven't done any proper measurements of memory usage or boot time,
but all together my guess is that this saves 10-20k.  Not much, I grant,
but maintaining more cache locality ought not hurt.


r~


Richard Henderson (4):
  tcg: Compress liveness data to 16 bits
  tcg: Reorg TCGOp chaining
  tcg: Fold life data into TCGOp
  tcg: Compress dead_temps and mem_temps into a single array

 include/exec/gen-icount.h |   2 +-
 tcg/optimize.c            |  37 +---
 tcg/tcg-op.c              |   2 +-
 tcg/tcg.c                 | 472 ++++++++++++++++++++++++++++------------------
 tcg/tcg.h                 |  50 +++--
 5 files changed, 329 insertions(+), 234 deletions(-)

-- 
2.5.5

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [Qemu-devel] [PATCH 1/4] tcg: Compress liveness data to 16 bits
  2016-06-23 18:02 [Qemu-devel] [PATCH 0/4] tcg memory usage improvements Richard Henderson
@ 2016-06-23 18:02 ` Richard Henderson
  2016-06-23 18:03 ` [Qemu-devel] [PATCH 2/4] tcg: Reorg TCGOp chaining Richard Henderson
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 5+ messages in thread
From: Richard Henderson @ 2016-06-23 18:02 UTC (permalink / raw)
  To: qemu-devel; +Cc: aurelien

This reduces both memory usage and per-insn cacheline usage
during code generation.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 tcg/tcg.c | 58 ++++++++++++++++++++++------------------------------------
 tcg/tcg.h | 16 ++++++++++------
 2 files changed, 32 insertions(+), 42 deletions(-)

diff --git a/tcg/tcg.c b/tcg/tcg.c
index 44de991..4b117fd 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -1331,7 +1331,7 @@ static inline void tcg_la_bb_end(TCGContext *s, uint8_t *dead_temps,
     }
 }
 
-/* Liveness analysis : update the opc_dead_args array to tell if a
+/* Liveness analysis : update the opc_arg_life array to tell if a
    given input arguments is dead. Instructions updating dead
    temporaries are removed. */
 static void tcg_liveness_analysis(TCGContext *s)
@@ -1340,9 +1340,8 @@ static void tcg_liveness_analysis(TCGContext *s)
     int oi, oi_prev, nb_ops;
 
     nb_ops = s->gen_next_op_idx;
-    s->op_dead_args = tcg_malloc(nb_ops * sizeof(uint16_t));
-    s->op_sync_args = tcg_malloc(nb_ops * sizeof(uint8_t));
-    
+    s->op_arg_life = tcg_malloc(nb_ops * sizeof(TCGLifeData));
+
     dead_temps = tcg_malloc(s->nb_temps);
     mem_temps = tcg_malloc(s->nb_temps);
     tcg_la_func_end(s, dead_temps, mem_temps);
@@ -1351,8 +1350,7 @@ static void tcg_liveness_analysis(TCGContext *s)
         int i, nb_iargs, nb_oargs;
         TCGOpcode opc_new, opc_new2;
         bool have_opc_new2;
-        uint16_t dead_args;
-        uint8_t sync_args;
+        TCGLifeData arg_life = 0;
         TCGArg arg;
 
         TCGOp * const op = &s->gen_op_buf[oi];
@@ -1384,15 +1382,13 @@ static void tcg_liveness_analysis(TCGContext *s)
                 do_not_remove_call:
 
                     /* output args are dead */
-                    dead_args = 0;
-                    sync_args = 0;
                     for (i = 0; i < nb_oargs; i++) {
                         arg = args[i];
                         if (dead_temps[arg]) {
-                            dead_args |= (1 << i);
+                            arg_life |= DEAD_ARG << i;
                         }
                         if (mem_temps[arg]) {
-                            sync_args |= (1 << i);
+                            arg_life |= SYNC_ARG << i;
                         }
                         dead_temps[arg] = 1;
                         mem_temps[arg] = 0;
@@ -1413,7 +1409,7 @@ static void tcg_liveness_analysis(TCGContext *s)
                         arg = args[i];
                         if (arg != TCG_CALL_DUMMY_ARG) {
                             if (dead_temps[arg]) {
-                                dead_args |= (1 << i);
+                                arg_life |= DEAD_ARG << i;
                             }
                         }
                     }
@@ -1422,8 +1418,6 @@ static void tcg_liveness_analysis(TCGContext *s)
                         arg = args[i];
                         dead_temps[arg] = 0;
                     }
-                    s->op_dead_args[oi] = dead_args;
-                    s->op_sync_args[oi] = sync_args;
                 }
             }
             break;
@@ -1534,15 +1528,13 @@ static void tcg_liveness_analysis(TCGContext *s)
             } else {
             do_not_remove:
                 /* output args are dead */
-                dead_args = 0;
-                sync_args = 0;
                 for (i = 0; i < nb_oargs; i++) {
                     arg = args[i];
                     if (dead_temps[arg]) {
-                        dead_args |= (1 << i);
+                        arg_life |= DEAD_ARG << i;
                     }
                     if (mem_temps[arg]) {
-                        sync_args |= (1 << i);
+                        arg_life |= SYNC_ARG << i;
                     }
                     dead_temps[arg] = 1;
                     mem_temps[arg] = 0;
@@ -1560,7 +1552,7 @@ static void tcg_liveness_analysis(TCGContext *s)
                 for (i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
                     arg = args[i];
                     if (dead_temps[arg]) {
-                        dead_args |= (1 << i);
+                        arg_life |= DEAD_ARG << i;
                     }
                 }
                 /* input arguments are live for preceding opcodes */
@@ -1568,11 +1560,10 @@ static void tcg_liveness_analysis(TCGContext *s)
                     arg = args[i];
                     dead_temps[arg] = 0;
                 }
-                s->op_dead_args[oi] = dead_args;
-                s->op_sync_args[oi] = sync_args;
             }
             break;
         }
+        s->op_arg_life[oi] = arg_life;
     }
 }
 #else
@@ -1911,11 +1902,11 @@ static void tcg_reg_alloc_bb_end(TCGContext *s, TCGRegSet allocated_regs)
     save_globals(s, allocated_regs);
 }
 
-#define IS_DEAD_ARG(n) ((dead_args >> (n)) & 1)
-#define NEED_SYNC_ARG(n) ((sync_args >> (n)) & 1)
+#define IS_DEAD_ARG(n)   (arg_life & (DEAD_ARG << (n)))
+#define NEED_SYNC_ARG(n) (arg_life & (SYNC_ARG << (n)))
 
 static void tcg_reg_alloc_movi(TCGContext *s, const TCGArg *args,
-                               uint16_t dead_args, uint8_t sync_args)
+                               TCGLifeData arg_life)
 {
     TCGTemp *ots;
     tcg_target_ulong val;
@@ -1944,8 +1935,7 @@ static void tcg_reg_alloc_movi(TCGContext *s, const TCGArg *args,
 }
 
 static void tcg_reg_alloc_mov(TCGContext *s, const TCGOpDef *def,
-                              const TCGArg *args, uint16_t dead_args,
-                              uint8_t sync_args)
+                              const TCGArg *args, TCGLifeData arg_life)
 {
     TCGRegSet allocated_regs;
     TCGTemp *ts, *ots;
@@ -2030,8 +2020,7 @@ static void tcg_reg_alloc_mov(TCGContext *s, const TCGOpDef *def,
 
 static void tcg_reg_alloc_op(TCGContext *s, 
                              const TCGOpDef *def, TCGOpcode opc,
-                             const TCGArg *args, uint16_t dead_args,
-                             uint8_t sync_args)
+                             const TCGArg *args, TCGLifeData arg_life)
 {
     TCGRegSet allocated_regs;
     int i, k, nb_iargs, nb_oargs;
@@ -2196,8 +2185,7 @@ static void tcg_reg_alloc_op(TCGContext *s,
 #endif
 
 static void tcg_reg_alloc_call(TCGContext *s, int nb_oargs, int nb_iargs,
-                               const TCGArg * const args, uint16_t dead_args,
-                               uint8_t sync_args)
+                               const TCGArg * const args, TCGLifeData arg_life)
 {
     int flags, nb_regs, i;
     TCGReg reg;
@@ -2417,8 +2405,7 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
         TCGArg * const args = &s->gen_opparam_buf[op->args];
         TCGOpcode opc = op->opc;
         const TCGOpDef *def = &tcg_op_defs[opc];
-        uint16_t dead_args = s->op_dead_args[oi];
-        uint8_t sync_args = s->op_sync_args[oi];
+        TCGLifeData arg_life = s->op_arg_life[oi];
 
         oi_next = op->next;
 #ifdef CONFIG_PROFILER
@@ -2428,11 +2415,11 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
         switch (opc) {
         case INDEX_op_mov_i32:
         case INDEX_op_mov_i64:
-            tcg_reg_alloc_mov(s, def, args, dead_args, sync_args);
+            tcg_reg_alloc_mov(s, def, args, arg_life);
             break;
         case INDEX_op_movi_i32:
         case INDEX_op_movi_i64:
-            tcg_reg_alloc_movi(s, args, dead_args, sync_args);
+            tcg_reg_alloc_movi(s, args, arg_life);
             break;
         case INDEX_op_insn_start:
             if (num_insns >= 0) {
@@ -2457,8 +2444,7 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
             tcg_out_label(s, arg_label(args[0]), s->code_ptr);
             break;
         case INDEX_op_call:
-            tcg_reg_alloc_call(s, op->callo, op->calli, args,
-                               dead_args, sync_args);
+            tcg_reg_alloc_call(s, op->callo, op->calli, args, arg_life);
             break;
         default:
             /* Sanity check that we've not introduced any unhandled opcodes. */
@@ -2468,7 +2454,7 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
             /* Note: in order to speed up the code, it would be much
                faster to have specialized register allocator functions for
                some common argument patterns */
-            tcg_reg_alloc_op(s, def, opc, args, dead_args, sync_args);
+            tcg_reg_alloc_op(s, def, opc, args, arg_life);
             break;
         }
 #ifdef CONFIG_DEBUG_TCG
diff --git a/tcg/tcg.h b/tcg/tcg.h
index 66d7fc0..cc14560 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -505,6 +505,14 @@ typedef struct TCGTempSet {
     unsigned long l[BITS_TO_LONGS(TCG_MAX_TEMPS)];
 } TCGTempSet;
 
+/* While we limit helpers to 6 arguments, for 32-bit hosts, with padding,
+   this imples a max of 6*2 (64-bit in) + 2 (64-bit out) = 14 operands.
+   There are never more than 2 outputs, which means that we can store all
+   dead + sync data within 16 bits.  */
+#define DEAD_ARG  4
+#define SYNC_ARG  1
+typedef uint16_t TCGLifeData;
+
 typedef struct TCGOp {
     TCGOpcode opc   : 8;
 
@@ -538,12 +546,8 @@ struct TCGContext {
     uintptr_t *tb_jmp_target_addr; /* tb->jmp_target_addr if !USE_DIRECT_JUMP */
 
     /* liveness analysis */
-    uint16_t *op_dead_args; /* for each operation, each bit tells if the
-                               corresponding argument is dead */
-    uint8_t *op_sync_args;  /* for each operation, each bit tells if the
-                               corresponding output argument needs to be
-                               sync to memory. */
-    
+    TCGLifeData *op_arg_life;
+
     TCGRegSet reserved_regs;
     intptr_t current_frame_offset;
     intptr_t frame_start;
-- 
2.5.5

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [Qemu-devel] [PATCH 2/4] tcg: Reorg TCGOp chaining
  2016-06-23 18:02 [Qemu-devel] [PATCH 0/4] tcg memory usage improvements Richard Henderson
  2016-06-23 18:02 ` [Qemu-devel] [PATCH 1/4] tcg: Compress liveness data to 16 bits Richard Henderson
@ 2016-06-23 18:03 ` Richard Henderson
  2016-06-23 18:03 ` [Qemu-devel] [PATCH 3/4] tcg: Fold life data into TCGOp Richard Henderson
  2016-06-23 18:03 ` [Qemu-devel] [PATCH 4/4] tcg: Compress dead_temps and mem_temps into a single array Richard Henderson
  3 siblings, 0 replies; 5+ messages in thread
From: Richard Henderson @ 2016-06-23 18:03 UTC (permalink / raw)
  To: qemu-devel; +Cc: aurelien

Instead of using -1 as end of chain, use 0, and link through the 0
entry as a fully circular double-linked list.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 include/exec/gen-icount.h |  2 +-
 tcg/optimize.c            |  8 ++------
 tcg/tcg-op.c              |  2 +-
 tcg/tcg.c                 | 32 ++++++++++++--------------------
 tcg/tcg.h                 | 20 ++++++++++++--------
 5 files changed, 28 insertions(+), 36 deletions(-)

diff --git a/include/exec/gen-icount.h b/include/exec/gen-icount.h
index a011324..5f16077 100644
--- a/include/exec/gen-icount.h
+++ b/include/exec/gen-icount.h
@@ -59,7 +59,7 @@ static void gen_tb_end(TranslationBlock *tb, int num_insns)
     }
 
     /* Terminate the linked list.  */
-    tcg_ctx.gen_op_buf[tcg_ctx.gen_last_op_idx].next = -1;
+    tcg_ctx.gen_op_buf[tcg_ctx.gen_op_buf[0].prev].next = 0;
 }
 
 static inline void gen_io_start(void)
diff --git a/tcg/optimize.c b/tcg/optimize.c
index c0d975b..8df7fc7 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -103,11 +103,7 @@ static TCGOp *insert_op_before(TCGContext *s, TCGOp *old_op,
         .prev = prev,
         .next = next
     };
-    if (prev >= 0) {
-        s->gen_op_buf[prev].next = oi;
-    } else {
-        s->gen_first_op_idx = oi;
-    }
+    s->gen_op_buf[prev].next = oi;
     old_op->prev = oi;
 
     return new_op;
@@ -583,7 +579,7 @@ void tcg_optimize(TCGContext *s)
     nb_globals = s->nb_globals;
     reset_all_temps(nb_temps);
 
-    for (oi = s->gen_first_op_idx; oi >= 0; oi = oi_next) {
+    for (oi = s->gen_op_buf[0].next; oi != 0; oi = oi_next) {
         tcg_target_ulong mask, partmask, affected;
         int nb_oargs, nb_iargs, i;
         TCGArg tmp;
diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c
index 569cdc6..62d91b4 100644
--- a/tcg/tcg-op.c
+++ b/tcg/tcg-op.c
@@ -52,7 +52,7 @@ static void tcg_emit_op(TCGContext *ctx, TCGOpcode opc, int args)
     int pi = oi - 1;
 
     tcg_debug_assert(oi < OPC_BUF_SIZE);
-    ctx->gen_last_op_idx = oi;
+    ctx->gen_op_buf[0].prev = oi;
     ctx->gen_next_op_idx = ni;
 
     ctx->gen_op_buf[oi] = (TCGOp){
diff --git a/tcg/tcg.c b/tcg/tcg.c
index 4b117fd..3e884e4 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -438,9 +438,9 @@ void tcg_func_start(TCGContext *s)
     s->goto_tb_issue_mask = 0;
 #endif
 
-    s->gen_first_op_idx = 0;
-    s->gen_last_op_idx = -1;
-    s->gen_next_op_idx = 0;
+    s->gen_op_buf[0].next = 1;
+    s->gen_op_buf[0].prev = 0;
+    s->gen_next_op_idx = 1;
     s->gen_next_parm_idx = 0;
 
     s->be = tcg_malloc(sizeof(TCGBackendData));
@@ -869,7 +869,7 @@ void tcg_gen_callN(TCGContext *s, void *func, TCGArg ret,
     /* Make sure the calli field didn't overflow.  */
     tcg_debug_assert(s->gen_op_buf[i].calli == real_args);
 
-    s->gen_last_op_idx = i;
+    s->gen_op_buf[0].prev = i;
     s->gen_next_op_idx = i + 1;
     s->gen_next_parm_idx = pi;
 
@@ -1005,7 +1005,7 @@ void tcg_dump_ops(TCGContext *s)
     TCGOp *op;
     int oi;
 
-    for (oi = s->gen_first_op_idx; oi >= 0; oi = op->next) {
+    for (oi = s->gen_op_buf[0].next; oi != 0; oi = op->next) {
         int i, k, nb_oargs, nb_iargs, nb_cargs;
         const TCGOpDef *def;
         const TCGArg *args;
@@ -1017,7 +1017,7 @@ void tcg_dump_ops(TCGContext *s)
         args = &s->gen_opparam_buf[op->args];
 
         if (c == INDEX_op_insn_start) {
-            qemu_log("%s ----", oi != s->gen_first_op_idx ? "\n" : "");
+            qemu_log("%s ----", oi != s->gen_op_buf[0].next ? "\n" : "");
 
             for (i = 0; i < TARGET_INSN_START_WORDS; ++i) {
                 target_ulong a;
@@ -1288,18 +1288,10 @@ void tcg_op_remove(TCGContext *s, TCGOp *op)
     int next = op->next;
     int prev = op->prev;
 
-    if (next >= 0) {
-        s->gen_op_buf[next].prev = prev;
-    } else {
-        s->gen_last_op_idx = prev;
-    }
-    if (prev >= 0) {
-        s->gen_op_buf[prev].next = next;
-    } else {
-        s->gen_first_op_idx = next;
-    }
+    s->gen_op_buf[next].prev = prev;
+    s->gen_op_buf[prev].next = next;
 
-    memset(op, -1, sizeof(*op));
+    memset(op, 0, sizeof(*op));
 
 #ifdef CONFIG_PROFILER
     s->del_op_count++;
@@ -1346,7 +1338,7 @@ static void tcg_liveness_analysis(TCGContext *s)
     mem_temps = tcg_malloc(s->nb_temps);
     tcg_la_func_end(s, dead_temps, mem_temps);
 
-    for (oi = s->gen_last_op_idx; oi >= 0; oi = oi_prev) {
+    for (oi = s->gen_op_buf[0].prev; oi != 0; oi = oi_prev) {
         int i, nb_iargs, nb_oargs;
         TCGOpcode opc_new, opc_new2;
         bool have_opc_new2;
@@ -2341,7 +2333,7 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
     {
         int n;
 
-        n = s->gen_last_op_idx + 1;
+        n = s->gen_op_buf[0].prev + 1;
         s->op_count += n;
         if (n > s->op_count_max) {
             s->op_count_max = n;
@@ -2400,7 +2392,7 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
     tcg_out_tb_init(s);
 
     num_insns = -1;
-    for (oi = s->gen_first_op_idx; oi >= 0; oi = oi_next) {
+    for (oi = s->gen_op_buf[0].next; oi != 0; oi = oi_next) {
         TCGOp * const op = &s->gen_op_buf[oi];
         TCGArg * const args = &s->gen_opparam_buf[op->args];
         TCGOpcode opc = op->opc;
diff --git a/tcg/tcg.h b/tcg/tcg.h
index cc14560..49b396d 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -520,17 +520,21 @@ typedef struct TCGOp {
     unsigned callo  : 2;
     unsigned calli  : 6;
 
-    /* Index of the arguments for this op, or -1 for zero-operand ops.  */
-    signed args     : 16;
+    /* Index of the arguments for this op, or 0 for zero-operand ops.  */
+    unsigned args   : 16;
 
-    /* Index of the prex/next op, or -1 for the end of the list.  */
-    signed prev     : 16;
-    signed next     : 16;
+    /* Index of the prex/next op, or 0 for the end of the list.  */
+    unsigned prev   : 16;
+    unsigned next   : 16;
 } TCGOp;
 
-QEMU_BUILD_BUG_ON(NB_OPS > 0xff);
-QEMU_BUILD_BUG_ON(OPC_BUF_SIZE >= 0x7fff);
-QEMU_BUILD_BUG_ON(OPPARAM_BUF_SIZE >= 0x7fff);
+/* Make sure operands fit in the bitfields above.  */
+QEMU_BUILD_BUG_ON(NB_OPS > (1 << 8));
+QEMU_BUILD_BUG_ON(OPC_BUF_SIZE > (1 << 16));
+QEMU_BUILD_BUG_ON(OPPARAM_BUF_SIZE > (1 << 16));
+
+/* Make sure that we don't overflow 64 bits without noticing.  */
+QEMU_BUILD_BUG_ON(sizeof(TCGOp) > 8);
 
 struct TCGContext {
     uint8_t *pool_cur, *pool_end;
-- 
2.5.5

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [Qemu-devel] [PATCH 3/4] tcg: Fold life data into TCGOp
  2016-06-23 18:02 [Qemu-devel] [PATCH 0/4] tcg memory usage improvements Richard Henderson
  2016-06-23 18:02 ` [Qemu-devel] [PATCH 1/4] tcg: Compress liveness data to 16 bits Richard Henderson
  2016-06-23 18:03 ` [Qemu-devel] [PATCH 2/4] tcg: Reorg TCGOp chaining Richard Henderson
@ 2016-06-23 18:03 ` Richard Henderson
  2016-06-23 18:03 ` [Qemu-devel] [PATCH 4/4] tcg: Compress dead_temps and mem_temps into a single array Richard Henderson
  3 siblings, 0 replies; 5+ messages in thread
From: Richard Henderson @ 2016-06-23 18:03 UTC (permalink / raw)
  To: qemu-devel; +Cc: aurelien

Reduce the size of other bitfields to make room.
This reduces the cache footprint of compilation.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 tcg/tcg.c |  9 +++------
 tcg/tcg.h | 26 ++++++++++++++------------
 2 files changed, 17 insertions(+), 18 deletions(-)

diff --git a/tcg/tcg.c b/tcg/tcg.c
index 3e884e4..b0c9dca 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -1329,10 +1329,7 @@ static inline void tcg_la_bb_end(TCGContext *s, uint8_t *dead_temps,
 static void tcg_liveness_analysis(TCGContext *s)
 {
     uint8_t *dead_temps, *mem_temps;
-    int oi, oi_prev, nb_ops;
-
-    nb_ops = s->gen_next_op_idx;
-    s->op_arg_life = tcg_malloc(nb_ops * sizeof(TCGLifeData));
+    int oi, oi_prev;
 
     dead_temps = tcg_malloc(s->nb_temps);
     mem_temps = tcg_malloc(s->nb_temps);
@@ -1555,7 +1552,7 @@ static void tcg_liveness_analysis(TCGContext *s)
             }
             break;
         }
-        s->op_arg_life[oi] = arg_life;
+        op->life = arg_life;
     }
 }
 #else
@@ -2397,7 +2394,7 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
         TCGArg * const args = &s->gen_opparam_buf[op->args];
         TCGOpcode opc = op->opc;
         const TCGOpDef *def = &tcg_op_defs[opc];
-        TCGLifeData arg_life = s->op_arg_life[oi];
+        TCGLifeData arg_life = op->life;
 
         oi_next = op->next;
 #ifdef CONFIG_PROFILER
diff --git a/tcg/tcg.h b/tcg/tcg.h
index 49b396d..2ff3ad2 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -513,25 +513,30 @@ typedef struct TCGTempSet {
 #define SYNC_ARG  1
 typedef uint16_t TCGLifeData;
 
+/* The layout here is designed to avoid crossing of a 32-bit boundary.
+   If we do so, gcc adds padding, expanding the size to 12.  */
 typedef struct TCGOp {
-    TCGOpcode opc   : 8;
+    TCGOpcode opc   : 8;        /*  8 */
+
+    /* Index of the prex/next op, or 0 for the end of the list.  */
+    unsigned prev   : 10;       /* 18 */
+    unsigned next   : 10;       /* 28 */
 
     /* The number of out and in parameter for a call.  */
-    unsigned callo  : 2;
-    unsigned calli  : 6;
+    unsigned calli  : 4;        /* 32 */
+    unsigned callo  : 2;        /* 34 */
 
     /* Index of the arguments for this op, or 0 for zero-operand ops.  */
-    unsigned args   : 16;
+    unsigned args   : 14;       /* 48 */
 
-    /* Index of the prex/next op, or 0 for the end of the list.  */
-    unsigned prev   : 16;
-    unsigned next   : 16;
+    /* Lifetime data of the operands.  */
+    unsigned life   : 16;       /* 64 */
 } TCGOp;
 
 /* Make sure operands fit in the bitfields above.  */
 QEMU_BUILD_BUG_ON(NB_OPS > (1 << 8));
-QEMU_BUILD_BUG_ON(OPC_BUF_SIZE > (1 << 16));
-QEMU_BUILD_BUG_ON(OPPARAM_BUF_SIZE > (1 << 16));
+QEMU_BUILD_BUG_ON(OPC_BUF_SIZE > (1 << 10));
+QEMU_BUILD_BUG_ON(OPPARAM_BUF_SIZE > (1 << 14));
 
 /* Make sure that we don't overflow 64 bits without noticing.  */
 QEMU_BUILD_BUG_ON(sizeof(TCGOp) > 8);
@@ -549,9 +554,6 @@ struct TCGContext {
     uint16_t *tb_jmp_insn_offset; /* tb->jmp_insn_offset if USE_DIRECT_JUMP */
     uintptr_t *tb_jmp_target_addr; /* tb->jmp_target_addr if !USE_DIRECT_JUMP */
 
-    /* liveness analysis */
-    TCGLifeData *op_arg_life;
-
     TCGRegSet reserved_regs;
     intptr_t current_frame_offset;
     intptr_t frame_start;
-- 
2.5.5

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [Qemu-devel] [PATCH 4/4] tcg: Compress dead_temps and mem_temps into a single array
  2016-06-23 18:02 [Qemu-devel] [PATCH 0/4] tcg memory usage improvements Richard Henderson
                   ` (2 preceding siblings ...)
  2016-06-23 18:03 ` [Qemu-devel] [PATCH 3/4] tcg: Fold life data into TCGOp Richard Henderson
@ 2016-06-23 18:03 ` Richard Henderson
  3 siblings, 0 replies; 5+ messages in thread
From: Richard Henderson @ 2016-06-23 18:03 UTC (permalink / raw)
  To: qemu-devel; +Cc: aurelien

We only need two bits per temporary.  Fold the two bytes into one,
and reduce the memory and cachelines required during compilation.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 tcg/tcg.c | 119 +++++++++++++++++++++++++++++++-------------------------------
 1 file changed, 60 insertions(+), 59 deletions(-)

diff --git a/tcg/tcg.c b/tcg/tcg.c
index b0c9dca..6397a37 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -333,7 +333,7 @@ void tcg_context_init(TCGContext *s)
 
     memset(s, 0, sizeof(*s));
     s->nb_globals = 0;
-    
+
     /* Count total number of arguments and allocate the corresponding
        space */
     total_args = 0;
@@ -825,16 +825,16 @@ void tcg_gen_callN(TCGContext *s, void *func, TCGArg ret,
                 real_args++;
             }
 #endif
-	    /* If stack grows up, then we will be placing successive
-	       arguments at lower addresses, which means we need to
-	       reverse the order compared to how we would normally
-	       treat either big or little-endian.  For those arguments
-	       that will wind up in registers, this still works for
-	       HPPA (the only current STACK_GROWSUP target) since the
-	       argument registers are *also* allocated in decreasing
-	       order.  If another such target is added, this logic may
-	       have to get more complicated to differentiate between
-	       stack arguments and register arguments.  */
+           /* If stack grows up, then we will be placing successive
+              arguments at lower addresses, which means we need to
+              reverse the order compared to how we would normally
+              treat either big or little-endian.  For those arguments
+              that will wind up in registers, this still works for
+              HPPA (the only current STACK_GROWSUP target) since the
+              argument registers are *also* allocated in decreasing
+              order.  If another such target is added, this logic may
+              have to get more complicated to differentiate between
+              stack arguments and register arguments.  */
 #if defined(HOST_WORDS_BIGENDIAN) != defined(TCG_TARGET_STACK_GROWSUP)
             s->gen_opparam_buf[pi++] = args[i] + 1;
             s->gen_opparam_buf[pi++] = args[i];
@@ -1299,27 +1299,29 @@ void tcg_op_remove(TCGContext *s, TCGOp *op)
 }
 
 #ifdef USE_LIVENESS_ANALYSIS
+
+#define TS_DEAD  1
+#define TS_SYNC  2
+
 /* liveness analysis: end of function: all temps are dead, and globals
    should be in memory. */
-static inline void tcg_la_func_end(TCGContext *s, uint8_t *dead_temps,
-                                   uint8_t *mem_temps)
+static inline void tcg_la_func_end(TCGContext *s, uint8_t *temp_state)
 {
-    memset(dead_temps, 1, s->nb_temps);
-    memset(mem_temps, 1, s->nb_globals);
-    memset(mem_temps + s->nb_globals, 0, s->nb_temps - s->nb_globals);
+    memset(temp_state, TS_DEAD | TS_SYNC, s->nb_globals);
+    memset(temp_state + s->nb_globals, TS_DEAD, s->nb_temps - s->nb_globals);
 }
 
 /* liveness analysis: end of basic block: all temps are dead, globals
    and local temps should be in memory. */
-static inline void tcg_la_bb_end(TCGContext *s, uint8_t *dead_temps,
-                                 uint8_t *mem_temps)
+static inline void tcg_la_bb_end(TCGContext *s, uint8_t *temp_state)
 {
-    int i;
+    int i, n;
 
-    memset(dead_temps, 1, s->nb_temps);
-    memset(mem_temps, 1, s->nb_globals);
-    for(i = s->nb_globals; i < s->nb_temps; i++) {
-        mem_temps[i] = s->temps[i].temp_local;
+    tcg_la_func_end(s, temp_state);
+    for (i = s->nb_globals, n = s->nb_temps; i < n; i++) {
+        if (s->temps[i].temp_local) {
+            temp_state[i] |= TS_SYNC;
+        }
     }
 }
 
@@ -1328,12 +1330,12 @@ static inline void tcg_la_bb_end(TCGContext *s, uint8_t *dead_temps,
    temporaries are removed. */
 static void tcg_liveness_analysis(TCGContext *s)
 {
-    uint8_t *dead_temps, *mem_temps;
+    uint8_t *temp_state;
     int oi, oi_prev;
+    int nb_globals = s->nb_globals;
 
-    dead_temps = tcg_malloc(s->nb_temps);
-    mem_temps = tcg_malloc(s->nb_temps);
-    tcg_la_func_end(s, dead_temps, mem_temps);
+    temp_state = tcg_malloc(s->nb_temps);
+    tcg_la_func_end(s, temp_state);
 
     for (oi = s->gen_op_buf[0].prev; oi != 0; oi = oi_prev) {
         int i, nb_iargs, nb_oargs;
@@ -1362,7 +1364,7 @@ static void tcg_liveness_analysis(TCGContext *s)
                 if (call_flags & TCG_CALL_NO_SIDE_EFFECTS) {
                     for (i = 0; i < nb_oargs; i++) {
                         arg = args[i];
-                        if (!dead_temps[arg] || mem_temps[arg]) {
+                        if (temp_state[arg] != TS_DEAD) {
                             goto do_not_remove_call;
                         }
                     }
@@ -1373,39 +1375,41 @@ static void tcg_liveness_analysis(TCGContext *s)
                     /* output args are dead */
                     for (i = 0; i < nb_oargs; i++) {
                         arg = args[i];
-                        if (dead_temps[arg]) {
+                        if (temp_state[arg] & TS_DEAD) {
                             arg_life |= DEAD_ARG << i;
                         }
-                        if (mem_temps[arg]) {
+                        if (temp_state[arg] & TS_SYNC) {
                             arg_life |= SYNC_ARG << i;
                         }
-                        dead_temps[arg] = 1;
-                        mem_temps[arg] = 0;
+                        temp_state[arg] = TS_DEAD;
                     }
 
-                    if (!(call_flags & TCG_CALL_NO_READ_GLOBALS)) {
-                        /* globals should be synced to memory */
-                        memset(mem_temps, 1, s->nb_globals);
-                    }
                     if (!(call_flags & (TCG_CALL_NO_WRITE_GLOBALS |
                                         TCG_CALL_NO_READ_GLOBALS))) {
                         /* globals should go back to memory */
-                        memset(dead_temps, 1, s->nb_globals);
+                        memset(temp_state, TS_DEAD | TS_SYNC, nb_globals);
+                    } else if (!(call_flags & TCG_CALL_NO_READ_GLOBALS)) {
+                        /* globals should be synced to memory */
+                        for (i = 0; i < nb_globals; i++) {
+                            temp_state[i] |= TS_SYNC;
+                        }
                     }
 
                     /* record arguments that die in this helper */
                     for (i = nb_oargs; i < nb_iargs + nb_oargs; i++) {
                         arg = args[i];
                         if (arg != TCG_CALL_DUMMY_ARG) {
-                            if (dead_temps[arg]) {
+                            if (temp_state[arg] & TS_DEAD) {
                                 arg_life |= DEAD_ARG << i;
                             }
                         }
                     }
                     /* input arguments are live for preceding opcodes */
-                    for (i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
+                    for (i = nb_oargs; i < nb_iargs + nb_oargs; i++) {
                         arg = args[i];
-                        dead_temps[arg] = 0;
+                        if (arg != TCG_CALL_DUMMY_ARG) {
+                            temp_state[arg] &= ~TS_DEAD;
+                        }
                     }
                 }
             }
@@ -1414,8 +1418,7 @@ static void tcg_liveness_analysis(TCGContext *s)
             break;
         case INDEX_op_discard:
             /* mark the temporary as dead */
-            dead_temps[args[0]] = 1;
-            mem_temps[args[0]] = 0;
+            temp_state[args[0]] = TS_DEAD;
             break;
 
         case INDEX_op_add2_i32:
@@ -1436,8 +1439,8 @@ static void tcg_liveness_analysis(TCGContext *s)
                the low part.  The result can be optimized to a simple
                add or sub.  This happens often for x86_64 guest when the
                cpu mode is set to 32 bit.  */
-            if (dead_temps[args[1]] && !mem_temps[args[1]]) {
-                if (dead_temps[args[0]] && !mem_temps[args[0]]) {
+            if (temp_state[args[1]] != TS_DEAD) {
+                if (temp_state[args[0]] != TS_DEAD) {
                     goto do_remove;
                 }
                 /* Replace the opcode and adjust the args in place,
@@ -1474,8 +1477,8 @@ static void tcg_liveness_analysis(TCGContext *s)
         do_mul2:
             nb_iargs = 2;
             nb_oargs = 2;
-            if (dead_temps[args[1]] && !mem_temps[args[1]]) {
-                if (dead_temps[args[0]] && !mem_temps[args[0]]) {
+            if (temp_state[args[1]] != TS_DEAD) {
+                if (temp_state[args[0]] != TS_DEAD) {
                     /* Both parts of the operation are dead.  */
                     goto do_remove;
                 }
@@ -1483,8 +1486,7 @@ static void tcg_liveness_analysis(TCGContext *s)
                 op->opc = opc = opc_new;
                 args[1] = args[2];
                 args[2] = args[3];
-            } else if (have_opc_new2 && dead_temps[args[0]]
-                       && !mem_temps[args[0]]) {
+            } else if (temp_state[args[0]] != TS_DEAD && have_opc_new2) {
                 /* The low part of the operation is dead; generate the high. */
                 op->opc = opc = opc_new2;
                 args[0] = args[1];
@@ -1507,8 +1509,7 @@ static void tcg_liveness_analysis(TCGContext *s)
                implies side effects */
             if (!(def->flags & TCG_OPF_SIDE_EFFECTS) && nb_oargs != 0) {
                 for (i = 0; i < nb_oargs; i++) {
-                    arg = args[i];
-                    if (!dead_temps[arg] || mem_temps[arg]) {
+                    if (temp_state[args[i]] != TS_DEAD) {
                         goto do_not_remove;
                     }
                 }
@@ -1519,35 +1520,35 @@ static void tcg_liveness_analysis(TCGContext *s)
                 /* output args are dead */
                 for (i = 0; i < nb_oargs; i++) {
                     arg = args[i];
-                    if (dead_temps[arg]) {
+                    if (temp_state[arg] & TS_DEAD) {
                         arg_life |= DEAD_ARG << i;
                     }
-                    if (mem_temps[arg]) {
+                    if (temp_state[arg] & TS_SYNC) {
                         arg_life |= SYNC_ARG << i;
                     }
-                    dead_temps[arg] = 1;
-                    mem_temps[arg] = 0;
+                    temp_state[arg] = TS_DEAD;
                 }
 
                 /* if end of basic block, update */
                 if (def->flags & TCG_OPF_BB_END) {
-                    tcg_la_bb_end(s, dead_temps, mem_temps);
+                    tcg_la_bb_end(s, temp_state);
                 } else if (def->flags & TCG_OPF_SIDE_EFFECTS) {
                     /* globals should be synced to memory */
-                    memset(mem_temps, 1, s->nb_globals);
+                    for (i = 0; i < nb_globals; i++) {
+                        temp_state[i] |= TS_SYNC;
+                    }
                 }
 
                 /* record arguments that die in this opcode */
                 for (i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
                     arg = args[i];
-                    if (dead_temps[arg]) {
+                    if (temp_state[arg] & TS_DEAD) {
                         arg_life |= DEAD_ARG << i;
                     }
                 }
                 /* input arguments are live for preceding opcodes */
                 for (i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
-                    arg = args[i];
-                    dead_temps[arg] = 0;
+                    temp_state[args[i]] &= ~TS_DEAD;
                 }
             }
             break;
-- 
2.5.5

^ permalink raw reply related	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2016-06-23 18:03 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2016-06-23 18:02 [Qemu-devel] [PATCH 0/4] tcg memory usage improvements Richard Henderson
2016-06-23 18:02 ` [Qemu-devel] [PATCH 1/4] tcg: Compress liveness data to 16 bits Richard Henderson
2016-06-23 18:03 ` [Qemu-devel] [PATCH 2/4] tcg: Reorg TCGOp chaining Richard Henderson
2016-06-23 18:03 ` [Qemu-devel] [PATCH 3/4] tcg: Fold life data into TCGOp Richard Henderson
2016-06-23 18:03 ` [Qemu-devel] [PATCH 4/4] tcg: Compress dead_temps and mem_temps into a single array Richard Henderson

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).