* [Qemu-devel] [PATCH v4 11/43] tcg: define CF_PARALLEL and use it for TB hashing along with CF_COUNT_MASK
From: Emilio G. Cota @ 2017-07-21 5:59 UTC
To: qemu-devel; +Cc: Richard Henderson
This will enable us to decouple code translation from the value of
parallel_cpus at any given time. It will also help us minimize TB
flushes when generating code via EXCP_ATOMIC.
Note that the declaration of parallel_cpus is moved to exec-all.h so
that the "curr_cflags" inline can be defined there.
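As a hedged sketch of the resulting lookup key (illustrative only;
example_tb_key_hash is a hypothetical helper, the other names come
from this patch), only the CF_COUNT_MASK and CF_PARALLEL bits of
cflags take part in hashing:

    static uint32_t example_tb_key_hash(tb_page_addr_t phys_pc,
                                        target_ulong pc, uint32_t flags,
                                        uint32_t cflags,
                                        uint32_t trace_vcpu_dstate)
    {
        /* mask out cflags bits that do not identify a translation */
        uint32_t cf_mask = cflags & CF_HASH_MASK;

        return tb_hash_func(phys_pc, pc, flags, cf_mask, trace_vcpu_dstate);
    }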
Signed-off-by: Emilio G. Cota <cota@braap.org>
---
include/exec/exec-all.h | 20 +++++++++++++++++++-
include/exec/tb-hash-xx.h | 9 ++++++---
include/exec/tb-hash.h | 4 ++--
include/exec/tb-lookup.h | 6 +++---
tcg/tcg.h | 1 -
accel/tcg/cpu-exec.c | 45 +++++++++++++++++++++++----------------------
accel/tcg/translate-all.c | 13 +++++++++----
exec.c | 2 +-
tcg/tcg-runtime.c | 2 +-
tests/qht-bench.c | 2 +-
10 files changed, 65 insertions(+), 39 deletions(-)
diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
index 8cbd90b..f6a928d 100644
--- a/include/exec/exec-all.h
+++ b/include/exec/exec-all.h
@@ -353,6 +353,9 @@ struct TranslationBlock {
#define CF_USE_ICOUNT 0x20000
#define CF_IGNORE_ICOUNT 0x40000 /* Do not generate icount code */
#define CF_INVALID 0x80000 /* TB is stale. Setters must acquire tb_lock */
+#define CF_PARALLEL 0x100000 /* Generate code for a parallel context */
+/* cflags' mask for hashing/comparison */
+#define CF_HASH_MASK (CF_COUNT_MASK | CF_PARALLEL)
/* Per-vCPU dynamic tracing state used to generate this TB */
uint32_t trace_vcpu_dstate;
@@ -396,11 +399,26 @@ struct TranslationBlock {
uintptr_t jmp_list_first;
};
+extern bool parallel_cpus;
+
+/* Hide the atomic_read to make code a little easier on the eyes */
+static inline uint32_t tb_cflags(const TranslationBlock *tb)
+{
+ return atomic_read(&tb->cflags);
+}
+
+/* current cflags for hashing/comparison */
+static inline uint32_t curr_cflags(void)
+{
+ return parallel_cpus ? CF_PARALLEL : 0;
+}
+
void tb_free(TranslationBlock *tb);
void tb_flush(CPUState *cpu);
void tb_phys_invalidate(TranslationBlock *tb, tb_page_addr_t page_addr);
TranslationBlock *tb_htable_lookup(CPUState *cpu, target_ulong pc,
- target_ulong cs_base, uint32_t flags);
+ target_ulong cs_base, uint32_t flags,
+ uint32_t cf_mask);
#if defined(USE_DIRECT_JUMP)
diff --git a/include/exec/tb-hash-xx.h b/include/exec/tb-hash-xx.h
index 6cd3022..747a9a6 100644
--- a/include/exec/tb-hash-xx.h
+++ b/include/exec/tb-hash-xx.h
@@ -48,8 +48,8 @@
* xxhash32, customized for input variables that are not guaranteed to be
* contiguous in memory.
*/
-static inline
-uint32_t tb_hash_func6(uint64_t a0, uint64_t b0, uint32_t e, uint32_t f)
+static inline uint32_t
+tb_hash_func7(uint64_t a0, uint64_t b0, uint32_t e, uint32_t f, uint32_t g)
{
uint32_t v1 = TB_HASH_XX_SEED + PRIME32_1 + PRIME32_2;
uint32_t v2 = TB_HASH_XX_SEED + PRIME32_2;
@@ -78,7 +78,7 @@ uint32_t tb_hash_func6(uint64_t a0, uint64_t b0, uint32_t e, uint32_t f)
v4 *= PRIME32_1;
h32 = rol32(v1, 1) + rol32(v2, 7) + rol32(v3, 12) + rol32(v4, 18);
- h32 += 24;
+ h32 += 28;
h32 += e * PRIME32_3;
h32 = rol32(h32, 17) * PRIME32_4;
@@ -86,6 +86,9 @@ uint32_t tb_hash_func6(uint64_t a0, uint64_t b0, uint32_t e, uint32_t f)
h32 += f * PRIME32_3;
h32 = rol32(h32, 17) * PRIME32_4;
+ h32 += g * PRIME32_3;
+ h32 = rol32(h32, 17) * PRIME32_4;
+
h32 ^= h32 >> 15;
h32 *= PRIME32_2;
h32 ^= h32 >> 13;
diff --git a/include/exec/tb-hash.h b/include/exec/tb-hash.h
index 17b5ee0..0526c4f 100644
--- a/include/exec/tb-hash.h
+++ b/include/exec/tb-hash.h
@@ -59,9 +59,9 @@ static inline unsigned int tb_jmp_cache_hash_func(target_ulong pc)
static inline
uint32_t tb_hash_func(tb_page_addr_t phys_pc, target_ulong pc, uint32_t flags,
- uint32_t trace_vcpu_dstate)
+ uint32_t cf_mask, uint32_t trace_vcpu_dstate)
{
- return tb_hash_func6(phys_pc, pc, flags, trace_vcpu_dstate);
+ return tb_hash_func7(phys_pc, pc, flags, cf_mask, trace_vcpu_dstate);
}
#endif
diff --git a/include/exec/tb-lookup.h b/include/exec/tb-lookup.h
index 436b6d5..2961385 100644
--- a/include/exec/tb-lookup.h
+++ b/include/exec/tb-lookup.h
@@ -21,7 +21,7 @@
/* Might cause an exception, so have a longjmp destination ready */
static inline TranslationBlock *
tb_lookup__cpu_state(CPUState *cpu, target_ulong *pc, target_ulong *cs_base,
- uint32_t *flags)
+ uint32_t *flags, uint32_t cf_mask)
{
CPUArchState *env = (CPUArchState *)cpu->env_ptr;
TranslationBlock *tb;
@@ -35,10 +35,10 @@ tb_lookup__cpu_state(CPUState *cpu, target_ulong *pc, target_ulong *cs_base,
tb->cs_base == *cs_base &&
tb->flags == *flags &&
tb->trace_vcpu_dstate == *cpu->trace_dstate &&
- !(atomic_read(&tb->cflags) & CF_INVALID))) {
+ (tb_cflags(tb) & (CF_HASH_MASK | CF_INVALID)) == cf_mask)) {
return tb;
}
- tb = tb_htable_lookup(cpu, *pc, *cs_base, *flags);
+ tb = tb_htable_lookup(cpu, *pc, *cs_base, *flags, cf_mask);
if (tb == NULL) {
return NULL;
}
diff --git a/tcg/tcg.h b/tcg/tcg.h
index da78721..96872f8 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -730,7 +730,6 @@ struct TCGContext {
};
extern TCGContext tcg_ctx;
-extern bool parallel_cpus;
static inline void tcg_set_insn_param(int op_idx, int arg, TCGArg v)
{
diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
index fae8c40..b71e015 100644
--- a/accel/tcg/cpu-exec.c
+++ b/accel/tcg/cpu-exec.c
@@ -207,7 +207,8 @@ static void cpu_exec_nocache(CPUState *cpu, int max_cycles,
tb_lock();
tb = tb_gen_code(cpu, orig_tb->pc, orig_tb->cs_base, orig_tb->flags,
max_cycles | CF_NOCACHE
- | (ignore_icount ? CF_IGNORE_ICOUNT : 0));
+ | (ignore_icount ? CF_IGNORE_ICOUNT : 0)
+ | curr_cflags());
tb->orig_tb = orig_tb;
tb_unlock();
@@ -225,31 +226,27 @@ static void cpu_exec_nocache(CPUState *cpu, int max_cycles,
static void cpu_exec_step(CPUState *cpu)
{
CPUClass *cc = CPU_GET_CLASS(cpu);
- CPUArchState *env = (CPUArchState *)cpu->env_ptr;
TranslationBlock *tb;
target_ulong cs_base, pc;
uint32_t flags;
+ uint32_t cflags = 1 | CF_IGNORE_ICOUNT;
- cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
if (sigsetjmp(cpu->jmp_env, 0) == 0) {
- mmap_lock();
- tb_lock();
- tb = tb_gen_code(cpu, pc, cs_base, flags,
- 1 | CF_NOCACHE | CF_IGNORE_ICOUNT);
- tb->orig_tb = NULL;
- tb_unlock();
- mmap_unlock();
+ tb = tb_lookup__cpu_state(cpu, &pc, &cs_base, &flags,
+ cflags & CF_HASH_MASK);
+ if (tb == NULL) {
+ mmap_lock();
+ tb_lock();
+ tb = tb_gen_code(cpu, pc, cs_base, flags, cflags);
+ tb_unlock();
+ mmap_unlock();
+ }
cc->cpu_exec_enter(cpu);
/* execute the generated code */
- trace_exec_tb_nocache(tb, pc);
+ trace_exec_tb(tb, pc);
cpu_tb_exec(cpu, tb);
cc->cpu_exec_exit(cpu);
-
- tb_lock();
- tb_phys_invalidate(tb, -1);
- tb_free(tb);
- tb_unlock();
} else {
/* We may have exited due to another problem here, so we need
* to reset any tb_locks we may have taken but didn't release.
@@ -281,6 +278,7 @@ struct tb_desc {
CPUArchState *env;
tb_page_addr_t phys_page1;
uint32_t flags;
+ uint32_t cf_mask;
uint32_t trace_vcpu_dstate;
};
@@ -294,7 +292,7 @@ static bool tb_cmp(const void *p, const void *d)
tb->cs_base == desc->cs_base &&
tb->flags == desc->flags &&
tb->trace_vcpu_dstate == desc->trace_vcpu_dstate &&
- !(atomic_read(&tb->cflags) & CF_INVALID)) {
+ (tb_cflags(tb) & (CF_HASH_MASK | CF_INVALID)) == desc->cf_mask) {
/* check next page if needed */
if (tb->page_addr[1] == -1) {
return true;
@@ -313,7 +311,8 @@ static bool tb_cmp(const void *p, const void *d)
}
TranslationBlock *tb_htable_lookup(CPUState *cpu, target_ulong pc,
- target_ulong cs_base, uint32_t flags)
+ target_ulong cs_base, uint32_t flags,
+ uint32_t cf_mask)
{
tb_page_addr_t phys_pc;
struct tb_desc desc;
@@ -322,11 +321,12 @@ TranslationBlock *tb_htable_lookup(CPUState *cpu, target_ulong pc,
desc.env = (CPUArchState *)cpu->env_ptr;
desc.cs_base = cs_base;
desc.flags = flags;
+ desc.cf_mask = cf_mask;
desc.trace_vcpu_dstate = *cpu->trace_dstate;
desc.pc = pc;
phys_pc = get_page_addr_code(desc.env, pc);
desc.phys_page1 = phys_pc & TARGET_PAGE_MASK;
- h = tb_hash_func(phys_pc, pc, flags, *cpu->trace_dstate);
+ h = tb_hash_func(phys_pc, pc, flags, cf_mask, *cpu->trace_dstate);
return qht_lookup(&tcg_ctx.tb_ctx.htable, tb_cmp, &desc, h);
}
@@ -338,8 +338,9 @@ static inline TranslationBlock *tb_find(CPUState *cpu,
target_ulong cs_base, pc;
uint32_t flags;
bool acquired_tb_lock = false;
+ uint32_t cf_mask = curr_cflags();
- tb = tb_lookup__cpu_state(cpu, &pc, &cs_base, &flags);
+ tb = tb_lookup__cpu_state(cpu, &pc, &cs_base, &flags, cf_mask);
if (tb == NULL) {
/* mmap_lock is needed by tb_gen_code, and mmap_lock must be
* taken outside tb_lock. As system emulation is currently
@@ -352,10 +353,10 @@ static inline TranslationBlock *tb_find(CPUState *cpu,
/* There's a chance that our desired tb has been translated while
* taking the locks so we check again inside the lock.
*/
- tb = tb_htable_lookup(cpu, pc, cs_base, flags);
+ tb = tb_htable_lookup(cpu, pc, cs_base, flags, cf_mask);
if (likely(tb == NULL)) {
/* if no translated code available, then translate it now */
- tb = tb_gen_code(cpu, pc, cs_base, flags, 0);
+ tb = tb_gen_code(cpu, pc, cs_base, flags, cf_mask);
}
mmap_unlock();
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
index c8abef0..c1ce38f 100644
--- a/accel/tcg/translate-all.c
+++ b/accel/tcg/translate-all.c
@@ -1077,7 +1077,8 @@ void tb_phys_invalidate(TranslationBlock *tb, tb_page_addr_t page_addr)
/* remove the TB from the hash list */
phys_pc = tb->page_addr[0] + (tb->pc & ~TARGET_PAGE_MASK);
- h = tb_hash_func(phys_pc, tb->pc, tb->flags, tb->trace_vcpu_dstate);
+ h = tb_hash_func(phys_pc, tb->pc, tb->flags, tb->cflags & CF_HASH_MASK,
+ tb->trace_vcpu_dstate);
qht_remove(&tcg_ctx.tb_ctx.htable, tb, h);
/* remove the TB from the page list */
@@ -1222,7 +1223,8 @@ static void tb_link_page(TranslationBlock *tb, tb_page_addr_t phys_pc,
}
/* add in the hash table */
- h = tb_hash_func(phys_pc, tb->pc, tb->flags, tb->trace_vcpu_dstate);
+ h = tb_hash_func(phys_pc, tb->pc, tb->flags, tb->cflags & CF_HASH_MASK,
+ tb->trace_vcpu_dstate);
qht_insert(&tcg_ctx.tb_ctx.htable, tb, h);
#ifdef DEBUG_TB_CHECK
@@ -1503,7 +1505,8 @@ void tb_invalidate_phys_page_range(tb_page_addr_t start, tb_page_addr_t end,
/* we generate a block containing just the instruction
modifying the memory. It will ensure that it cannot modify
itself */
- tb_gen_code(cpu, current_pc, current_cs_base, current_flags, 1);
+ tb_gen_code(cpu, current_pc, current_cs_base, current_flags,
+ 1 | curr_cflags());
cpu_loop_exit_noexc(cpu);
}
#endif
@@ -1621,7 +1624,8 @@ static bool tb_invalidate_phys_page(tb_page_addr_t addr, uintptr_t pc)
/* we generate a block containing just the instruction
modifying the memory. It will ensure that it cannot modify
itself */
- tb_gen_code(cpu, current_pc, current_cs_base, current_flags, 1);
+ tb_gen_code(cpu, current_pc, current_cs_base, current_flags,
+ 1 | curr_cflags());
/* tb_lock will be reset after cpu_loop_exit_noexc longjmps
* back into the cpu_exec loop. */
return true;
@@ -1765,6 +1769,7 @@ void cpu_io_recompile(CPUState *cpu, uintptr_t retaddr)
}
cflags = n | CF_LAST_IO;
+ cflags |= curr_cflags();
pc = tb->pc;
cs_base = tb->cs_base;
flags = tb->flags;
diff --git a/exec.c b/exec.c
index 01ac21e..94b0f3e 100644
--- a/exec.c
+++ b/exec.c
@@ -2433,7 +2433,7 @@ static void check_watchpoint(int offset, int len, MemTxAttrs attrs, int flags)
cpu_loop_exit(cpu);
} else {
cpu_get_tb_cpu_state(env, &pc, &cs_base, &cpu_flags);
- tb_gen_code(cpu, pc, cs_base, cpu_flags, 1);
+ tb_gen_code(cpu, pc, cs_base, cpu_flags, 1 | curr_cflags());
cpu_loop_exit_noexc(cpu);
}
}
diff --git a/tcg/tcg-runtime.c b/tcg/tcg-runtime.c
index 7100339..4f873a9 100644
--- a/tcg/tcg-runtime.c
+++ b/tcg/tcg-runtime.c
@@ -151,7 +151,7 @@ void *HELPER(lookup_tb_ptr)(CPUArchState *env)
target_ulong cs_base, pc;
uint32_t flags;
- tb = tb_lookup__cpu_state(cpu, &pc, &cs_base, &flags);
+ tb = tb_lookup__cpu_state(cpu, &pc, &cs_base, &flags, curr_cflags());
if (tb == NULL) {
return tcg_ctx.code_gen_epilogue;
}
diff --git a/tests/qht-bench.c b/tests/qht-bench.c
index 11c1cec..4cabdfd 100644
--- a/tests/qht-bench.c
+++ b/tests/qht-bench.c
@@ -103,7 +103,7 @@ static bool is_equal(const void *obj, const void *userp)
static inline uint32_t h(unsigned long v)
{
- return tb_hash_func6(v, 0, 0, 0);
+ return tb_hash_func7(v, 0, 0, 0, 0);
}
/*
--
2.7.4
* [Qemu-devel] [PATCH v4 35/43] tcg: allocate optimizer temps with tcg_malloc
From: Emilio G. Cota @ 2017-07-21 5:59 UTC
To: qemu-devel; +Cc: Richard Henderson
Groundwork for supporting multiple TCG contexts.
While at it, also allocate temps_used directly as a bitmap of the
required size, instead of using a bitmap of TCG_MAX_TEMPS via
TCGTempSet.
Performance-wise we lose about 1.12% in a translation-heavy workload
such as booting+shutting down debian-arm:
Performance counter stats for 'taskset -c 0 arm-softmmu/qemu-system-arm \
-machine type=virt -nographic -smp 1 -m 4096 \
-netdev user,id=unet,hostfwd=tcp::2222-:22 \
-device virtio-net-device,netdev=unet \
-drive file=die-on-boot.qcow2,id=myblock,index=0,if=none \
-device virtio-blk-device,drive=myblock \
-kernel kernel.img -append console=ttyAMA0 root=/dev/vda1 \
-name arm,debug-threads=on -smp 1' (10 runs):
              exec time (s)   Relative slowdown wrt original (%)
---------------------------------------------------------------
original      20.213321616    0.
tcg_malloc    20.441130078    1.1270214
TCGContext    20.477846517    1.3086662
g_malloc      20.780527895    2.8061013
The other two alternatives shown in the table are:
- TCGContext: embed temps[TCG_MAX_TEMPS] and TCGTempSet used_temps
in TCGContext. This is simple enough but it isn't faster than using
tcg_malloc; moreover, it wastes memory.
- g_malloc: allocate/deallocate both temps and used_temps every time
tcg_optimize is executed.
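As a hedged sketch of the chosen scheme (example_alloc_opt_temps is a
hypothetical wrapper; the allocation calls are the ones this patch adds
to tcg_optimize), both arrays are sized to the actual temp count, and
the memory is reclaimed wholesale when the TCG pool is reset:

    static struct tcg_temp_info *
    example_alloc_opt_temps(TCGContext *s, unsigned long **ptemps_used)
    {
        int nb_temps = s->nb_temps;
        struct tcg_temp_info *temps;

        /* pool memory is recycled via tcg_pool_reset(), so no
         * explicit free is needed per tcg_optimize() run */
        temps = tcg_malloc(nb_temps * sizeof(*temps));
        *ptemps_used = tcg_malloc(BITS_TO_LONGS(nb_temps) *
                                  sizeof(unsigned long));
        bitmap_zero(*ptemps_used, nb_temps);
        return temps;
    }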
Suggested-by: Richard Henderson <rth@twiddle.net>
Signed-off-by: Emilio G. Cota <cota@braap.org>
---
tcg/optimize.c | 306 ++++++++++++++++++++++++++++++---------------------------
1 file changed, 161 insertions(+), 145 deletions(-)
diff --git a/tcg/optimize.c b/tcg/optimize.c
index adfc56c..ce422e9 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -40,21 +40,18 @@ struct tcg_temp_info {
tcg_target_ulong mask;
};
-static struct tcg_temp_info temps[TCG_MAX_TEMPS];
-static TCGTempSet temps_used;
-
-static inline bool temp_is_const(TCGArg arg)
+static inline bool temp_is_const(const struct tcg_temp_info *temps, TCGArg arg)
{
return temps[arg].is_const;
}
-static inline bool temp_is_copy(TCGArg arg)
+static inline bool temp_is_copy(const struct tcg_temp_info *temps, TCGArg arg)
{
return temps[arg].next_copy != arg;
}
/* Reset TEMP's state, possibly removing the temp for the list of copies. */
-static void reset_temp(TCGArg temp)
+static void reset_temp(struct tcg_temp_info *temps, TCGArg temp)
{
temps[temps[temp].next_copy].prev_copy = temps[temp].prev_copy;
temps[temps[temp].prev_copy].next_copy = temps[temp].next_copy;
@@ -64,21 +61,16 @@ static void reset_temp(TCGArg temp)
temps[temp].mask = -1;
}
-/* Reset all temporaries, given that there are NB_TEMPS of them. */
-static void reset_all_temps(int nb_temps)
-{
- bitmap_zero(temps_used.l, nb_temps);
-}
-
/* Initialize and activate a temporary. */
-static void init_temp_info(TCGArg temp)
+static void init_temp_info(struct tcg_temp_info *temps,
+ unsigned long *temps_used, TCGArg temp)
{
- if (!test_bit(temp, temps_used.l)) {
+ if (!test_bit(temp, temps_used)) {
temps[temp].next_copy = temp;
temps[temp].prev_copy = temp;
temps[temp].is_const = false;
temps[temp].mask = -1;
- set_bit(temp, temps_used.l);
+ set_bit(temp, temps_used);
}
}
@@ -116,7 +108,8 @@ static TCGOpcode op_to_movi(TCGOpcode op)
}
}
-static TCGArg find_better_copy(TCGContext *s, TCGArg temp)
+static TCGArg find_better_copy(TCGContext *s, const struct tcg_temp_info *temps,
+ TCGArg temp)
{
TCGArg i;
@@ -145,7 +138,8 @@ static TCGArg find_better_copy(TCGContext *s, TCGArg temp)
return temp;
}
-static bool temps_are_copies(TCGArg arg1, TCGArg arg2)
+static bool temps_are_copies(const struct tcg_temp_info *temps, TCGArg arg1,
+ TCGArg arg2)
{
TCGArg i;
@@ -153,7 +147,7 @@ static bool temps_are_copies(TCGArg arg1, TCGArg arg2)
return true;
}
- if (!temp_is_copy(arg1) || !temp_is_copy(arg2)) {
+ if (!temp_is_copy(temps, arg1) || !temp_is_copy(temps, arg2)) {
return false;
}
@@ -166,15 +160,15 @@ static bool temps_are_copies(TCGArg arg1, TCGArg arg2)
return false;
}
-static void tcg_opt_gen_movi(TCGContext *s, TCGOp *op, TCGArg *args,
- TCGArg dst, TCGArg val)
+static void tcg_opt_gen_movi(TCGContext *s, struct tcg_temp_info *temps,
+ TCGOp *op, TCGArg *args, TCGArg dst, TCGArg val)
{
TCGOpcode new_op = op_to_movi(op->opc);
tcg_target_ulong mask;
op->opc = new_op;
- reset_temp(dst);
+ reset_temp(temps, dst);
temps[dst].is_const = true;
temps[dst].val = val;
mask = val;
@@ -188,10 +182,10 @@ static void tcg_opt_gen_movi(TCGContext *s, TCGOp *op, TCGArg *args,
args[1] = val;
}
-static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg *args,
- TCGArg dst, TCGArg src)
+static void tcg_opt_gen_mov(TCGContext *s, struct tcg_temp_info *temps,
+ TCGOp *op, TCGArg *args, TCGArg dst, TCGArg src)
{
- if (temps_are_copies(dst, src)) {
+ if (temps_are_copies(temps, dst, src)) {
tcg_op_remove(s, op);
return;
}
@@ -201,7 +195,7 @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg *args,
op->opc = new_op;
- reset_temp(dst);
+ reset_temp(temps, dst);
mask = temps[src].mask;
if (TCG_TARGET_REG_BITS > 32 && new_op == INDEX_op_mov_i32) {
/* High bits of the destination are now garbage. */
@@ -463,10 +457,11 @@ static bool do_constant_folding_cond_eq(TCGCond c)
/* Return 2 if the condition can't be simplified, and the result
of the condition (0 or 1) if it can */
-static TCGArg do_constant_folding_cond(TCGOpcode op, TCGArg x,
- TCGArg y, TCGCond c)
+static TCGArg
+do_constant_folding_cond(const struct tcg_temp_info *temps, TCGOpcode op,
+ TCGArg x, TCGArg y, TCGCond c)
{
- if (temp_is_const(x) && temp_is_const(y)) {
+ if (temp_is_const(temps, x) && temp_is_const(temps, y)) {
switch (op_bits(op)) {
case 32:
return do_constant_folding_cond_32(temps[x].val, temps[y].val, c);
@@ -475,9 +470,9 @@ static TCGArg do_constant_folding_cond(TCGOpcode op, TCGArg x,
default:
tcg_abort();
}
- } else if (temps_are_copies(x, y)) {
+ } else if (temps_are_copies(temps, x, y)) {
return do_constant_folding_cond_eq(c);
- } else if (temp_is_const(y) && temps[y].val == 0) {
+ } else if (temp_is_const(temps, y) && temps[y].val == 0) {
switch (c) {
case TCG_COND_LTU:
return 0;
@@ -492,15 +487,17 @@ static TCGArg do_constant_folding_cond(TCGOpcode op, TCGArg x,
/* Return 2 if the condition can't be simplified, and the result
of the condition (0 or 1) if it can */
-static TCGArg do_constant_folding_cond2(TCGArg *p1, TCGArg *p2, TCGCond c)
+static TCGArg
+do_constant_folding_cond2(const struct tcg_temp_info *temps, TCGArg *p1,
+ TCGArg *p2, TCGCond c)
{
TCGArg al = p1[0], ah = p1[1];
TCGArg bl = p2[0], bh = p2[1];
- if (temp_is_const(bl) && temp_is_const(bh)) {
+ if (temp_is_const(temps, bl) && temp_is_const(temps, bh)) {
uint64_t b = ((uint64_t)temps[bh].val << 32) | (uint32_t)temps[bl].val;
- if (temp_is_const(al) && temp_is_const(ah)) {
+ if (temp_is_const(temps, al) && temp_is_const(temps, ah)) {
uint64_t a;
a = ((uint64_t)temps[ah].val << 32) | (uint32_t)temps[al].val;
return do_constant_folding_cond_64(a, b, c);
@@ -516,18 +513,19 @@ static TCGArg do_constant_folding_cond2(TCGArg *p1, TCGArg *p2, TCGCond c)
}
}
}
- if (temps_are_copies(al, bl) && temps_are_copies(ah, bh)) {
+ if (temps_are_copies(temps, al, bl) && temps_are_copies(temps, ah, bh)) {
return do_constant_folding_cond_eq(c);
}
return 2;
}
-static bool swap_commutative(TCGArg dest, TCGArg *p1, TCGArg *p2)
+static bool swap_commutative(const struct tcg_temp_info *temps, TCGArg dest,
+ TCGArg *p1, TCGArg *p2)
{
TCGArg a1 = *p1, a2 = *p2;
int sum = 0;
- sum += temp_is_const(a1);
- sum -= temp_is_const(a2);
+ sum += temp_is_const(temps, a1);
+ sum -= temp_is_const(temps, a2);
/* Prefer the constant in second argument, and then the form
op a, a, b, which is better handled on non-RISC hosts. */
@@ -539,13 +537,14 @@ static bool swap_commutative(TCGArg dest, TCGArg *p1, TCGArg *p2)
return false;
}
-static bool swap_commutative2(TCGArg *p1, TCGArg *p2)
+static bool swap_commutative2(const struct tcg_temp_info *temps, TCGArg *p1,
+ TCGArg *p2)
{
int sum = 0;
- sum += temp_is_const(p1[0]);
- sum += temp_is_const(p1[1]);
- sum -= temp_is_const(p2[0]);
- sum -= temp_is_const(p2[1]);
+ sum += temp_is_const(temps, p1[0]);
+ sum += temp_is_const(temps, p1[1]);
+ sum -= temp_is_const(temps, p2[0]);
+ sum -= temp_is_const(temps, p2[1]);
if (sum > 0) {
TCGArg t;
t = p1[0], p1[0] = p2[0], p2[0] = t;
@@ -558,6 +557,8 @@ static bool swap_commutative2(TCGArg *p1, TCGArg *p2)
/* Propagate constants and copies, fold constant expressions. */
void tcg_optimize(TCGContext *s)
{
+ struct tcg_temp_info *temps;
+ unsigned long *temps_used;
int oi, oi_next, nb_temps, nb_globals;
TCGArg *prev_mb_args = NULL;
@@ -568,7 +569,9 @@ void tcg_optimize(TCGContext *s)
nb_temps = s->nb_temps;
nb_globals = s->nb_globals;
- reset_all_temps(nb_temps);
+ temps = tcg_malloc(nb_temps * sizeof(*temps));
+ temps_used = tcg_malloc(BITS_TO_LONGS(nb_temps) * sizeof(*temps_used));
+ bitmap_zero(temps_used, nb_temps);
for (oi = s->gen_op_buf[0].next; oi != 0; oi = oi_next) {
tcg_target_ulong mask, partmask, affected;
@@ -590,21 +593,21 @@ void tcg_optimize(TCGContext *s)
for (i = 0; i < nb_oargs + nb_iargs; i++) {
tmp = args[i];
if (tmp != TCG_CALL_DUMMY_ARG) {
- init_temp_info(tmp);
+ init_temp_info(temps, temps_used, tmp);
}
}
} else {
nb_oargs = def->nb_oargs;
nb_iargs = def->nb_iargs;
for (i = 0; i < nb_oargs + nb_iargs; i++) {
- init_temp_info(args[i]);
+ init_temp_info(temps, temps_used, args[i]);
}
}
/* Do copy propagation */
for (i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
- if (temp_is_copy(args[i])) {
- args[i] = find_better_copy(s, args[i]);
+ if (temp_is_copy(temps, args[i])) {
+ args[i] = find_better_copy(s, temps, args[i]);
}
}
@@ -620,44 +623,44 @@ void tcg_optimize(TCGContext *s)
CASE_OP_32_64(nor):
CASE_OP_32_64(muluh):
CASE_OP_32_64(mulsh):
- swap_commutative(args[0], &args[1], &args[2]);
+ swap_commutative(temps, args[0], &args[1], &args[2]);
break;
CASE_OP_32_64(brcond):
- if (swap_commutative(-1, &args[0], &args[1])) {
+ if (swap_commutative(temps, -1, &args[0], &args[1])) {
args[2] = tcg_swap_cond(args[2]);
}
break;
CASE_OP_32_64(setcond):
- if (swap_commutative(args[0], &args[1], &args[2])) {
+ if (swap_commutative(temps, args[0], &args[1], &args[2])) {
args[3] = tcg_swap_cond(args[3]);
}
break;
CASE_OP_32_64(movcond):
- if (swap_commutative(-1, &args[1], &args[2])) {
+ if (swap_commutative(temps, -1, &args[1], &args[2])) {
args[5] = tcg_swap_cond(args[5]);
}
/* For movcond, we canonicalize the "false" input reg to match
the destination reg so that the tcg backend can implement
a "move if true" operation. */
- if (swap_commutative(args[0], &args[4], &args[3])) {
+ if (swap_commutative(temps, args[0], &args[4], &args[3])) {
args[5] = tcg_invert_cond(args[5]);
}
break;
CASE_OP_32_64(add2):
- swap_commutative(args[0], &args[2], &args[4]);
- swap_commutative(args[1], &args[3], &args[5]);
+ swap_commutative(temps, args[0], &args[2], &args[4]);
+ swap_commutative(temps, args[1], &args[3], &args[5]);
break;
CASE_OP_32_64(mulu2):
CASE_OP_32_64(muls2):
- swap_commutative(args[0], &args[2], &args[3]);
+ swap_commutative(temps, args[0], &args[2], &args[3]);
break;
case INDEX_op_brcond2_i32:
- if (swap_commutative2(&args[0], &args[2])) {
+ if (swap_commutative2(temps, &args[0], &args[2])) {
args[4] = tcg_swap_cond(args[4]);
}
break;
case INDEX_op_setcond2_i32:
- if (swap_commutative2(&args[1], &args[3])) {
+ if (swap_commutative2(temps, &args[1], &args[3])) {
args[5] = tcg_swap_cond(args[5]);
}
break;
@@ -673,8 +676,8 @@ void tcg_optimize(TCGContext *s)
CASE_OP_32_64(sar):
CASE_OP_32_64(rotl):
CASE_OP_32_64(rotr):
- if (temp_is_const(args[1]) && temps[args[1]].val == 0) {
- tcg_opt_gen_movi(s, op, args, args[0], 0);
+ if (temp_is_const(temps, args[1]) && temps[args[1]].val == 0) {
+ tcg_opt_gen_movi(s, temps, op, args, args[0], 0);
continue;
}
break;
@@ -683,7 +686,7 @@ void tcg_optimize(TCGContext *s)
TCGOpcode neg_op;
bool have_neg;
- if (temp_is_const(args[2])) {
+ if (temp_is_const(temps, args[2])) {
/* Proceed with possible constant folding. */
break;
}
@@ -697,9 +700,9 @@ void tcg_optimize(TCGContext *s)
if (!have_neg) {
break;
}
- if (temp_is_const(args[1]) && temps[args[1]].val == 0) {
+ if (temp_is_const(temps, args[1]) && temps[args[1]].val == 0) {
op->opc = neg_op;
- reset_temp(args[0]);
+ reset_temp(temps, args[0]);
args[1] = args[2];
continue;
}
@@ -707,30 +710,30 @@ void tcg_optimize(TCGContext *s)
break;
CASE_OP_32_64(xor):
CASE_OP_32_64(nand):
- if (!temp_is_const(args[1])
- && temp_is_const(args[2]) && temps[args[2]].val == -1) {
+ if (!temp_is_const(temps, args[1])
+ && temp_is_const(temps, args[2]) && temps[args[2]].val == -1) {
i = 1;
goto try_not;
}
break;
CASE_OP_32_64(nor):
- if (!temp_is_const(args[1])
- && temp_is_const(args[2]) && temps[args[2]].val == 0) {
+ if (!temp_is_const(temps, args[1])
+ && temp_is_const(temps, args[2]) && temps[args[2]].val == 0) {
i = 1;
goto try_not;
}
break;
CASE_OP_32_64(andc):
- if (!temp_is_const(args[2])
- && temp_is_const(args[1]) && temps[args[1]].val == -1) {
+ if (!temp_is_const(temps, args[2])
+ && temp_is_const(temps, args[1]) && temps[args[1]].val == -1) {
i = 2;
goto try_not;
}
break;
CASE_OP_32_64(orc):
CASE_OP_32_64(eqv):
- if (!temp_is_const(args[2])
- && temp_is_const(args[1]) && temps[args[1]].val == 0) {
+ if (!temp_is_const(temps, args[2])
+ && temp_is_const(temps, args[1]) && temps[args[1]].val == 0) {
i = 2;
goto try_not;
}
@@ -751,7 +754,7 @@ void tcg_optimize(TCGContext *s)
break;
}
op->opc = not_op;
- reset_temp(args[0]);
+ reset_temp(temps, args[0]);
args[1] = args[i];
continue;
}
@@ -771,18 +774,18 @@ void tcg_optimize(TCGContext *s)
CASE_OP_32_64(or):
CASE_OP_32_64(xor):
CASE_OP_32_64(andc):
- if (!temp_is_const(args[1])
- && temp_is_const(args[2]) && temps[args[2]].val == 0) {
- tcg_opt_gen_mov(s, op, args, args[0], args[1]);
+ if (!temp_is_const(temps, args[1])
+ && temp_is_const(temps, args[2]) && temps[args[2]].val == 0) {
+ tcg_opt_gen_mov(s, temps, op, args, args[0], args[1]);
continue;
}
break;
CASE_OP_32_64(and):
CASE_OP_32_64(orc):
CASE_OP_32_64(eqv):
- if (!temp_is_const(args[1])
- && temp_is_const(args[2]) && temps[args[2]].val == -1) {
- tcg_opt_gen_mov(s, op, args, args[0], args[1]);
+ if (!temp_is_const(temps, args[1])
+ && temp_is_const(temps, args[2]) && temps[args[2]].val == -1) {
+ tcg_opt_gen_mov(s, temps, op, args, args[0], args[1]);
continue;
}
break;
@@ -819,7 +822,7 @@ void tcg_optimize(TCGContext *s)
CASE_OP_32_64(and):
mask = temps[args[2]].mask;
- if (temp_is_const(args[2])) {
+ if (temp_is_const(temps, args[2])) {
and_const:
affected = temps[args[1]].mask & ~mask;
}
@@ -838,7 +841,7 @@ void tcg_optimize(TCGContext *s)
CASE_OP_32_64(andc):
/* Known-zeros does not imply known-ones. Therefore unless
args[2] is constant, we can't infer anything from it. */
- if (temp_is_const(args[2])) {
+ if (temp_is_const(temps, args[2])) {
mask = ~temps[args[2]].mask;
goto and_const;
}
@@ -847,26 +850,26 @@ void tcg_optimize(TCGContext *s)
break;
case INDEX_op_sar_i32:
- if (temp_is_const(args[2])) {
+ if (temp_is_const(temps, args[2])) {
tmp = temps[args[2]].val & 31;
mask = (int32_t)temps[args[1]].mask >> tmp;
}
break;
case INDEX_op_sar_i64:
- if (temp_is_const(args[2])) {
+ if (temp_is_const(temps, args[2])) {
tmp = temps[args[2]].val & 63;
mask = (int64_t)temps[args[1]].mask >> tmp;
}
break;
case INDEX_op_shr_i32:
- if (temp_is_const(args[2])) {
+ if (temp_is_const(temps, args[2])) {
tmp = temps[args[2]].val & 31;
mask = (uint32_t)temps[args[1]].mask >> tmp;
}
break;
case INDEX_op_shr_i64:
- if (temp_is_const(args[2])) {
+ if (temp_is_const(temps, args[2])) {
tmp = temps[args[2]].val & 63;
mask = (uint64_t)temps[args[1]].mask >> tmp;
}
@@ -880,7 +883,7 @@ void tcg_optimize(TCGContext *s)
break;
CASE_OP_32_64(shl):
- if (temp_is_const(args[2])) {
+ if (temp_is_const(temps, args[2])) {
tmp = temps[args[2]].val & (TCG_TARGET_REG_BITS - 1);
mask = temps[args[1]].mask << tmp;
}
@@ -976,12 +979,12 @@ void tcg_optimize(TCGContext *s)
if (partmask == 0) {
tcg_debug_assert(nb_oargs == 1);
- tcg_opt_gen_movi(s, op, args, args[0], 0);
+ tcg_opt_gen_movi(s, temps, op, args, args[0], 0);
continue;
}
if (affected == 0) {
tcg_debug_assert(nb_oargs == 1);
- tcg_opt_gen_mov(s, op, args, args[0], args[1]);
+ tcg_opt_gen_mov(s, temps, op, args, args[0], args[1]);
continue;
}
@@ -991,8 +994,8 @@ void tcg_optimize(TCGContext *s)
CASE_OP_32_64(mul):
CASE_OP_32_64(muluh):
CASE_OP_32_64(mulsh):
- if ((temp_is_const(args[2]) && temps[args[2]].val == 0)) {
- tcg_opt_gen_movi(s, op, args, args[0], 0);
+ if ((temp_is_const(temps, args[2]) && temps[args[2]].val == 0)) {
+ tcg_opt_gen_movi(s, temps, op, args, args[0], 0);
continue;
}
break;
@@ -1004,8 +1007,8 @@ void tcg_optimize(TCGContext *s)
switch (opc) {
CASE_OP_32_64(or):
CASE_OP_32_64(and):
- if (temps_are_copies(args[1], args[2])) {
- tcg_opt_gen_mov(s, op, args, args[0], args[1]);
+ if (temps_are_copies(temps, args[1], args[2])) {
+ tcg_opt_gen_mov(s, temps, op, args, args[0], args[1]);
continue;
}
break;
@@ -1018,8 +1021,8 @@ void tcg_optimize(TCGContext *s)
CASE_OP_32_64(andc):
CASE_OP_32_64(sub):
CASE_OP_32_64(xor):
- if (temps_are_copies(args[1], args[2])) {
- tcg_opt_gen_movi(s, op, args, args[0], 0);
+ if (temps_are_copies(temps, args[1], args[2])) {
+ tcg_opt_gen_movi(s, temps, op, args, args[0], 0);
continue;
}
break;
@@ -1032,10 +1035,10 @@ void tcg_optimize(TCGContext *s)
allocator where needed and possible. Also detect copies. */
switch (opc) {
CASE_OP_32_64(mov):
- tcg_opt_gen_mov(s, op, args, args[0], args[1]);
+ tcg_opt_gen_mov(s, temps, op, args, args[0], args[1]);
break;
CASE_OP_32_64(movi):
- tcg_opt_gen_movi(s, op, args, args[0], args[1]);
+ tcg_opt_gen_movi(s, temps, op, args, args[0], args[1]);
break;
CASE_OP_32_64(not):
@@ -1051,9 +1054,9 @@ void tcg_optimize(TCGContext *s)
case INDEX_op_extu_i32_i64:
case INDEX_op_extrl_i64_i32:
case INDEX_op_extrh_i64_i32:
- if (temp_is_const(args[1])) {
+ if (temp_is_const(temps, args[1])) {
tmp = do_constant_folding(opc, temps[args[1]].val, 0);
- tcg_opt_gen_movi(s, op, args, args[0], tmp);
+ tcg_opt_gen_movi(s, temps, op, args, args[0], tmp);
break;
}
goto do_default;
@@ -1080,66 +1083,70 @@ void tcg_optimize(TCGContext *s)
CASE_OP_32_64(divu):
CASE_OP_32_64(rem):
CASE_OP_32_64(remu):
- if (temp_is_const(args[1]) && temp_is_const(args[2])) {
+ if (temp_is_const(temps, args[1]) &&
+ temp_is_const(temps, args[2])) {
tmp = do_constant_folding(opc, temps[args[1]].val,
temps[args[2]].val);
- tcg_opt_gen_movi(s, op, args, args[0], tmp);
+ tcg_opt_gen_movi(s, temps, op, args, args[0], tmp);
break;
}
goto do_default;
CASE_OP_32_64(clz):
CASE_OP_32_64(ctz):
- if (temp_is_const(args[1])) {
+ if (temp_is_const(temps, args[1])) {
TCGArg v = temps[args[1]].val;
if (v != 0) {
tmp = do_constant_folding(opc, v, 0);
- tcg_opt_gen_movi(s, op, args, args[0], tmp);
+ tcg_opt_gen_movi(s, temps, op, args, args[0], tmp);
} else {
- tcg_opt_gen_mov(s, op, args, args[0], args[2]);
+ tcg_opt_gen_mov(s, temps, op, args, args[0], args[2]);
}
break;
}
goto do_default;
CASE_OP_32_64(deposit):
- if (temp_is_const(args[1]) && temp_is_const(args[2])) {
+ if (temp_is_const(temps, args[1]) &&
+ temp_is_const(temps, args[2])) {
tmp = deposit64(temps[args[1]].val, args[3], args[4],
temps[args[2]].val);
- tcg_opt_gen_movi(s, op, args, args[0], tmp);
+ tcg_opt_gen_movi(s, temps, op, args, args[0], tmp);
break;
}
goto do_default;
CASE_OP_32_64(extract):
- if (temp_is_const(args[1])) {
+ if (temp_is_const(temps, args[1])) {
tmp = extract64(temps[args[1]].val, args[2], args[3]);
- tcg_opt_gen_movi(s, op, args, args[0], tmp);
+ tcg_opt_gen_movi(s, temps, op, args, args[0], tmp);
break;
}
goto do_default;
CASE_OP_32_64(sextract):
- if (temp_is_const(args[1])) {
+ if (temp_is_const(temps, args[1])) {
tmp = sextract64(temps[args[1]].val, args[2], args[3]);
- tcg_opt_gen_movi(s, op, args, args[0], tmp);
+ tcg_opt_gen_movi(s, temps, op, args, args[0], tmp);
break;
}
goto do_default;
CASE_OP_32_64(setcond):
- tmp = do_constant_folding_cond(opc, args[1], args[2], args[3]);
+ tmp = do_constant_folding_cond(temps, opc, args[1], args[2],
+ args[3]);
if (tmp != 2) {
- tcg_opt_gen_movi(s, op, args, args[0], tmp);
+ tcg_opt_gen_movi(s, temps, op, args, args[0], tmp);
break;
}
goto do_default;
CASE_OP_32_64(brcond):
- tmp = do_constant_folding_cond(opc, args[0], args[1], args[2]);
+ tmp = do_constant_folding_cond(temps, opc, args[0], args[1],
+ args[2]);
if (tmp != 2) {
if (tmp) {
- reset_all_temps(nb_temps);
+ bitmap_zero(temps_used, nb_temps);
op->opc = INDEX_op_br;
args[0] = args[3];
} else {
@@ -1150,12 +1157,14 @@ void tcg_optimize(TCGContext *s)
goto do_default;
CASE_OP_32_64(movcond):
- tmp = do_constant_folding_cond(opc, args[1], args[2], args[5]);
+ tmp = do_constant_folding_cond(temps, opc, args[1], args[2],
+ args[5]);
if (tmp != 2) {
- tcg_opt_gen_mov(s, op, args, args[0], args[4-tmp]);
+ tcg_opt_gen_mov(s, temps, op, args, args[0], args[4 - tmp]);
break;
}
- if (temp_is_const(args[3]) && temp_is_const(args[4])) {
+ if (temp_is_const(temps, args[3]) &&
+ temp_is_const(temps, args[4])) {
tcg_target_ulong tv = temps[args[3]].val;
tcg_target_ulong fv = temps[args[4]].val;
TCGCond cond = args[5];
@@ -1174,8 +1183,10 @@ void tcg_optimize(TCGContext *s)
case INDEX_op_add2_i32:
case INDEX_op_sub2_i32:
- if (temp_is_const(args[2]) && temp_is_const(args[3])
- && temp_is_const(args[4]) && temp_is_const(args[5])) {
+ if (temp_is_const(temps, args[2]) &&
+ temp_is_const(temps, args[3]) &&
+ temp_is_const(temps, args[4]) &&
+ temp_is_const(temps, args[5])) {
uint32_t al = temps[args[2]].val;
uint32_t ah = temps[args[3]].val;
uint32_t bl = temps[args[4]].val;
@@ -1194,8 +1205,8 @@ void tcg_optimize(TCGContext *s)
rl = args[0];
rh = args[1];
- tcg_opt_gen_movi(s, op, args, rl, (int32_t)a);
- tcg_opt_gen_movi(s, op2, args2, rh, (int32_t)(a >> 32));
+ tcg_opt_gen_movi(s, temps, op, args, rl, (int32_t)a);
+ tcg_opt_gen_movi(s, temps, op2, args2, rh, (int32_t)(a >> 32));
/* We've done all we need to do with the movi. Skip it. */
oi_next = op2->next;
@@ -1204,7 +1215,8 @@ void tcg_optimize(TCGContext *s)
goto do_default;
case INDEX_op_mulu2_i32:
- if (temp_is_const(args[2]) && temp_is_const(args[3])) {
+ if (temp_is_const(temps, args[2]) &&
+ temp_is_const(temps, args[3])) {
uint32_t a = temps[args[2]].val;
uint32_t b = temps[args[3]].val;
uint64_t r = (uint64_t)a * b;
@@ -1214,8 +1226,8 @@ void tcg_optimize(TCGContext *s)
rl = args[0];
rh = args[1];
- tcg_opt_gen_movi(s, op, args, rl, (int32_t)r);
- tcg_opt_gen_movi(s, op2, args2, rh, (int32_t)(r >> 32));
+ tcg_opt_gen_movi(s, temps, op, args, rl, (int32_t)r);
+ tcg_opt_gen_movi(s, temps, op2, args2, rh, (int32_t)(r >> 32));
/* We've done all we need to do with the movi. Skip it. */
oi_next = op2->next;
@@ -1224,11 +1236,11 @@ void tcg_optimize(TCGContext *s)
goto do_default;
case INDEX_op_brcond2_i32:
- tmp = do_constant_folding_cond2(&args[0], &args[2], args[4]);
+ tmp = do_constant_folding_cond2(temps, &args[0], &args[2], args[4]);
if (tmp != 2) {
if (tmp) {
do_brcond_true:
- reset_all_temps(nb_temps);
+ bitmap_zero(temps_used, nb_temps);
op->opc = INDEX_op_br;
args[0] = args[5];
} else {
@@ -1236,12 +1248,14 @@ void tcg_optimize(TCGContext *s)
tcg_op_remove(s, op);
}
} else if ((args[4] == TCG_COND_LT || args[4] == TCG_COND_GE)
- && temp_is_const(args[2]) && temps[args[2]].val == 0
- && temp_is_const(args[3]) && temps[args[3]].val == 0) {
+ && temp_is_const(temps, args[2])
+ && temps[args[2]].val == 0
+ && temp_is_const(temps, args[3])
+ && temps[args[3]].val == 0) {
/* Simplify LT/GE comparisons vs zero to a single compare
vs the high word of the input. */
do_brcond_high:
- reset_all_temps(nb_temps);
+ bitmap_zero(temps_used, nb_temps);
op->opc = INDEX_op_brcond_i32;
args[0] = args[1];
args[1] = args[3];
@@ -1250,14 +1264,14 @@ void tcg_optimize(TCGContext *s)
} else if (args[4] == TCG_COND_EQ) {
/* Simplify EQ comparisons where one of the pairs
can be simplified. */
- tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
+ tmp = do_constant_folding_cond(temps, INDEX_op_brcond_i32,
args[0], args[2], TCG_COND_EQ);
if (tmp == 0) {
goto do_brcond_false;
} else if (tmp == 1) {
goto do_brcond_high;
}
- tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
+ tmp = do_constant_folding_cond(temps, INDEX_op_brcond_i32,
args[1], args[3], TCG_COND_EQ);
if (tmp == 0) {
goto do_brcond_false;
@@ -1265,7 +1279,7 @@ void tcg_optimize(TCGContext *s)
goto do_default;
}
do_brcond_low:
- reset_all_temps(nb_temps);
+ bitmap_zero(temps_used, nb_temps);
op->opc = INDEX_op_brcond_i32;
args[1] = args[2];
args[2] = args[4];
@@ -1273,14 +1287,14 @@ void tcg_optimize(TCGContext *s)
} else if (args[4] == TCG_COND_NE) {
/* Simplify NE comparisons where one of the pairs
can be simplified. */
- tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
+ tmp = do_constant_folding_cond(temps, INDEX_op_brcond_i32,
args[0], args[2], TCG_COND_NE);
if (tmp == 0) {
goto do_brcond_high;
} else if (tmp == 1) {
goto do_brcond_true;
}
- tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
+ tmp = do_constant_folding_cond(temps, INDEX_op_brcond_i32,
args[1], args[3], TCG_COND_NE);
if (tmp == 0) {
goto do_brcond_low;
@@ -1294,17 +1308,19 @@ void tcg_optimize(TCGContext *s)
break;
case INDEX_op_setcond2_i32:
- tmp = do_constant_folding_cond2(&args[1], &args[3], args[5]);
+ tmp = do_constant_folding_cond2(temps, &args[1], &args[3], args[5]);
if (tmp != 2) {
do_setcond_const:
- tcg_opt_gen_movi(s, op, args, args[0], tmp);
+ tcg_opt_gen_movi(s, temps, op, args, args[0], tmp);
} else if ((args[5] == TCG_COND_LT || args[5] == TCG_COND_GE)
- && temp_is_const(args[3]) && temps[args[3]].val == 0
- && temp_is_const(args[4]) && temps[args[4]].val == 0) {
+ && temp_is_const(temps, args[3])
+ && temps[args[3]].val == 0
+ && temp_is_const(temps, args[4])
+ && temps[args[4]].val == 0) {
/* Simplify LT/GE comparisons vs zero to a single compare
vs the high word of the input. */
do_setcond_high:
- reset_temp(args[0]);
+ reset_temp(temps, args[0]);
temps[args[0]].mask = 1;
op->opc = INDEX_op_setcond_i32;
args[1] = args[2];
@@ -1313,14 +1329,14 @@ void tcg_optimize(TCGContext *s)
} else if (args[5] == TCG_COND_EQ) {
/* Simplify EQ comparisons where one of the pairs
can be simplified. */
- tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
+ tmp = do_constant_folding_cond(temps, INDEX_op_setcond_i32,
args[1], args[3], TCG_COND_EQ);
if (tmp == 0) {
goto do_setcond_const;
} else if (tmp == 1) {
goto do_setcond_high;
}
- tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
+ tmp = do_constant_folding_cond(temps, INDEX_op_setcond_i32,
args[2], args[4], TCG_COND_EQ);
if (tmp == 0) {
goto do_setcond_high;
@@ -1328,7 +1344,7 @@ void tcg_optimize(TCGContext *s)
goto do_default;
}
do_setcond_low:
- reset_temp(args[0]);
+ reset_temp(temps, args[0]);
temps[args[0]].mask = 1;
op->opc = INDEX_op_setcond_i32;
args[2] = args[3];
@@ -1336,14 +1352,14 @@ void tcg_optimize(TCGContext *s)
} else if (args[5] == TCG_COND_NE) {
/* Simplify NE comparisons where one of the pairs
can be simplified. */
- tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
+ tmp = do_constant_folding_cond(temps, INDEX_op_setcond_i32,
args[1], args[3], TCG_COND_NE);
if (tmp == 0) {
goto do_setcond_high;
} else if (tmp == 1) {
goto do_setcond_const;
}
- tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
+ tmp = do_constant_folding_cond(temps, INDEX_op_setcond_i32,
args[2], args[4], TCG_COND_NE);
if (tmp == 0) {
goto do_setcond_low;
@@ -1360,8 +1376,8 @@ void tcg_optimize(TCGContext *s)
if (!(args[nb_oargs + nb_iargs + 1]
& (TCG_CALL_NO_READ_GLOBALS | TCG_CALL_NO_WRITE_GLOBALS))) {
for (i = 0; i < nb_globals; i++) {
- if (test_bit(i, temps_used.l)) {
- reset_temp(i);
+ if (test_bit(i, temps_used)) {
+ reset_temp(temps, i);
}
}
}
@@ -1375,11 +1391,11 @@ void tcg_optimize(TCGContext *s)
block, otherwise we only trash the output args. "mask" is
the non-zero bits mask for the first output arg. */
if (def->flags & TCG_OPF_BB_END) {
- reset_all_temps(nb_temps);
+ bitmap_zero(temps_used, nb_temps);
} else {
do_reset_output:
for (i = 0; i < nb_oargs; i++) {
- reset_temp(args[i]);
+ reset_temp(temps, args[i]);
/* Save the corresponding known-zero bits mask for the
first output argument (only one supported so far). */
if (i == 0) {
--
2.7.4
* [Qemu-devel] [PATCH v4 42/43] tcg: introduce regions to split code_gen_buffer
From: Emilio G. Cota @ 2017-07-21 5:59 UTC
To: qemu-devel; +Cc: Richard Henderson
This is groundwork for supporting multiple TCG contexts.
The naive solution here is to split code_gen_buffer statically
among the TCG threads; this however results in poor utilization
if translation needs are different across TCG threads.
What we do here is to add an extra layer of indirection, assigning
regions that act just like pages do in virtual memory allocation.
(BTW if you are wondering about the chosen naming, I did not want
to use blocks or pages because those are already heavily used in QEMU).
We use a global lock to serialize allocations as well as statistics
reporting (we now export the size of the used code_gen_buffer with
tcg_code_size()). Note that for the allocator we could just use
a counter and atomic_inc; however, that would complicate the gathering
of tcg_code_size()-like stats. So given that the region operations are
not a fast path, a lock seems the most reasonable choice.
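As a hedged sketch of the handout (example_region_take is a
hypothetical condensation of tcg_region_alloc below; the real one also
adds the filled region's size to agg_size_full), each thread bumps a
shared region index under the lock, and a flush is needed only once
every region has been taken:

    static bool example_region_take(TCGContext *s)
    {
        bool full;

        qemu_mutex_lock(&region.lock);
        full = (region.current == region.n);   /* whole buffer used up? */
        if (!full) {
            tcg_region_assign(s, region.current++);
        }
        qemu_mutex_unlock(&region.lock);
        return full;   /* true => caller must trigger a tb_flush */
    }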
The effectiveness of this approach is clear after seeing some numbers.
I used the bootup+shutdown of debian-arm with '-tb-size 80' as a benchmark.
Note that I'm evaluating this after enabling per-thread TCG (which
is done by a subsequent commit).
* -smp 1, 1 region (entire buffer):
qemu: flush code_size=83885014 nb_tbs=154739 avg_tb_size=357
qemu: flush code_size=83884902 nb_tbs=153136 avg_tb_size=363
qemu: flush code_size=83885014 nb_tbs=152777 avg_tb_size=364
qemu: flush code_size=83884950 nb_tbs=150057 avg_tb_size=373
qemu: flush code_size=83884998 nb_tbs=150234 avg_tb_size=373
qemu: flush code_size=83885014 nb_tbs=154009 avg_tb_size=360
qemu: flush code_size=83885014 nb_tbs=151007 avg_tb_size=370
qemu: flush code_size=83885014 nb_tbs=151816 avg_tb_size=367
That is, 8 flushes.
* -smp 8, 32 regions (80/32 MB per region) [i.e. this patch]:
qemu: flush code_size=76328008 nb_tbs=141040 avg_tb_size=356
qemu: flush code_size=75366534 nb_tbs=138000 avg_tb_size=361
qemu: flush code_size=76864546 nb_tbs=140653 avg_tb_size=361
qemu: flush code_size=76309084 nb_tbs=135945 avg_tb_size=375
qemu: flush code_size=74581856 nb_tbs=132909 avg_tb_size=375
qemu: flush code_size=73927256 nb_tbs=135616 avg_tb_size=360
qemu: flush code_size=78629426 nb_tbs=142896 avg_tb_size=365
qemu: flush code_size=76667052 nb_tbs=138508 avg_tb_size=368
Again, 8 flushes. Note how buffer utilization is not 100%, but it
is close. Smaller region sizes would yield higher utilization,
but we want region allocation to be rare (it acquires a lock), so
we do not want to go too small.
* -smp 8, static partitioning of 8 regions (10 MB per region):
qemu: flush code_size=21936504 nb_tbs=40570 avg_tb_size=354
qemu: flush code_size=11472174 nb_tbs=20633 avg_tb_size=370
qemu: flush code_size=11603976 nb_tbs=21059 avg_tb_size=365
qemu: flush code_size=23254872 nb_tbs=41243 avg_tb_size=377
qemu: flush code_size=28289496 nb_tbs=52057 avg_tb_size=358
qemu: flush code_size=43605160 nb_tbs=78896 avg_tb_size=367
qemu: flush code_size=45166552 nb_tbs=82158 avg_tb_size=364
qemu: flush code_size=63289640 nb_tbs=116494 avg_tb_size=358
qemu: flush code_size=51389960 nb_tbs=93937 avg_tb_size=362
qemu: flush code_size=59665928 nb_tbs=107063 avg_tb_size=372
qemu: flush code_size=38380824 nb_tbs=68597 avg_tb_size=374
qemu: flush code_size=44884568 nb_tbs=79901 avg_tb_size=376
qemu: flush code_size=50782632 nb_tbs=90681 avg_tb_size=374
qemu: flush code_size=39848888 nb_tbs=71433 avg_tb_size=372
qemu: flush code_size=64708840 nb_tbs=119052 avg_tb_size=359
qemu: flush code_size=49830008 nb_tbs=90992 avg_tb_size=362
qemu: flush code_size=68372408 nb_tbs=123442 avg_tb_size=368
qemu: flush code_size=33555560 nb_tbs=59514 avg_tb_size=378
qemu: flush code_size=44748344 nb_tbs=80974 avg_tb_size=367
qemu: flush code_size=37104248 nb_tbs=67609 avg_tb_size=364
That is, 20 flushes. Note how a static partitioning approach uses
the code buffer poorly, leading to many unnecessary flushes.
Signed-off-by: Emilio G. Cota <cota@braap.org>
---
tcg/tcg.h | 6 ++
accel/tcg/translate-all.c | 63 +++++--------
bsd-user/main.c | 1 +
cpus.c | 12 +++
linux-user/main.c | 1 +
tcg/tcg.c | 222 +++++++++++++++++++++++++++++++++++++++++++++-
6 files changed, 260 insertions(+), 45 deletions(-)
diff --git a/tcg/tcg.h b/tcg/tcg.h
index 9c01fe6..7413aa0 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -760,6 +760,12 @@ void *tcg_malloc_internal(TCGContext *s, int size);
void tcg_pool_reset(TCGContext *s);
TranslationBlock *tcg_tb_alloc(TCGContext *s);
+void tcg_region_init(void);
+void tcg_region_reset_all(void);
+
+size_t tcg_code_size(void);
+size_t tcg_code_capacity(void);
+
/* Called with tb_lock held. */
static inline void *tcg_malloc(int size)
{
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
index 1a98f1a..9be584b 100644
--- a/accel/tcg/translate-all.c
+++ b/accel/tcg/translate-all.c
@@ -608,15 +608,13 @@ static inline void *alloc_code_gen_buffer(void)
{
void *buf = static_code_gen_buffer;
void *end = static_code_gen_buffer + sizeof(static_code_gen_buffer);
- size_t full_size, size;
+ size_t size;
/* page-align the beginning and end of the buffer */
buf = QEMU_ALIGN_PTR_UP(buf, qemu_real_host_page_size);
end = QEMU_ALIGN_PTR_DOWN(end, qemu_real_host_page_size);
- /* Reserve a guard page. */
- full_size = end - buf;
- size = full_size - qemu_real_host_page_size;
+ size = end - buf;
/* Honor a command-line option limiting the size of the buffer. */
if (size > tcg_ctx->code_gen_buffer_size) {
@@ -635,9 +633,6 @@ static inline void *alloc_code_gen_buffer(void)
if (qemu_mprotect_rwx(buf, size)) {
abort();
}
- if (qemu_mprotect_none(buf + size, qemu_real_host_page_size)) {
- abort();
- }
qemu_madvise(buf, size, QEMU_MADV_HUGEPAGE);
return buf;
@@ -646,22 +641,16 @@ static inline void *alloc_code_gen_buffer(void)
static inline void *alloc_code_gen_buffer(void)
{
size_t size = tcg_ctx->code_gen_buffer_size;
- void *buf1, *buf2;
-
- /* Perform the allocation in two steps, so that the guard page
- is reserved but uncommitted. */
- buf1 = VirtualAlloc(NULL, size + qemu_real_host_page_size,
- MEM_RESERVE, PAGE_NOACCESS);
- if (buf1 != NULL) {
- buf2 = VirtualAlloc(buf1, size, MEM_COMMIT, PAGE_EXECUTE_READWRITE);
- assert(buf1 == buf2);
- }
+ void *buf;
- return buf1;
+ buf = VirtualAlloc(NULL, size, MEM_RESERVE | MEM_COMMIT,
+ PAGE_EXECUTE_READWRITE);
+ return buf;
}
#else
static inline void *alloc_code_gen_buffer(void)
{
+ int prot = PROT_WRITE | PROT_READ | PROT_EXEC;
int flags = MAP_PRIVATE | MAP_ANONYMOUS;
uintptr_t start = 0;
size_t size = tcg_ctx->code_gen_buffer_size;
@@ -695,8 +684,7 @@ static inline void *alloc_code_gen_buffer(void)
# endif
# endif
- buf = mmap((void *)start, size + qemu_real_host_page_size,
- PROT_NONE, flags, -1, 0);
+ buf = mmap((void *)start, size, prot, flags, -1, 0);
if (buf == MAP_FAILED) {
return NULL;
}
@@ -706,24 +694,23 @@ static inline void *alloc_code_gen_buffer(void)
/* Try again, with the original still mapped, to avoid re-acquiring
that 256mb crossing. This time don't specify an address. */
size_t size2;
- void *buf2 = mmap(NULL, size + qemu_real_host_page_size,
- PROT_NONE, flags, -1, 0);
+ void *buf2 = mmap(NULL, size, prot, flags, -1, 0);
switch ((int)(buf2 != MAP_FAILED)) {
case 1:
if (!cross_256mb(buf2, size)) {
/* Success! Use the new buffer. */
- munmap(buf, size + qemu_real_host_page_size);
+ munmap(buf, size);
break;
}
/* Failure. Work with what we had. */
- munmap(buf2, size + qemu_real_host_page_size);
+ munmap(buf2, size);
/* fallthru */
default:
/* Split the original buffer. Free the smaller half. */
buf2 = split_cross_256mb(buf, size);
size2 = tcg_ctx->code_gen_buffer_size;
if (buf == buf2) {
- munmap(buf + size2 + qemu_real_host_page_size, size - size2);
+ munmap(buf + size2, size - size2);
} else {
munmap(buf, size - size2);
}
@@ -734,10 +721,6 @@ static inline void *alloc_code_gen_buffer(void)
}
#endif
- /* Make the final buffer accessible. The guard page at the end
- will remain inaccessible with PROT_NONE. */
- mprotect(buf, size, PROT_WRITE | PROT_READ | PROT_EXEC);
-
/* Request large pages for the buffer. */
qemu_madvise(buf, size, QEMU_MADV_HUGEPAGE);
@@ -918,13 +901,8 @@ static void do_tb_flush(CPUState *cpu, run_on_cpu_data tb_flush_count)
size_t host_size = 0;
g_tree_foreach(tb_ctx.tb_tree, tb_host_size_iter, &host_size);
- printf("qemu: flush code_size=%td nb_tbs=%zu avg_tb_size=%zu\n",
- tcg_ctx->code_gen_ptr - tcg_ctx->code_gen_buffer, nb_tbs,
- nb_tbs > 0 ? host_size / nb_tbs : 0);
- }
- if ((unsigned long)(tcg_ctx->code_gen_ptr - tcg_ctx->code_gen_buffer)
- > tcg_ctx->code_gen_buffer_size) {
- cpu_abort(cpu, "Internal error: code buffer overflow\n");
+ printf("qemu: flush code_size=%zu nb_tbs=%zu avg_tb_size=%zu\n",
+ tcg_code_size(), nb_tbs, nb_tbs > 0 ? host_size / nb_tbs : 0);
}
CPU_FOREACH(cpu) {
@@ -938,7 +916,7 @@ static void do_tb_flush(CPUState *cpu, run_on_cpu_data tb_flush_count)
qht_reset_size(&tb_ctx.htable, CODE_GEN_HTABLE_SIZE);
page_flush_tb();
- tcg_ctx->code_gen_ptr = tcg_ctx->code_gen_buffer;
+ tcg_region_reset_all();
/* XXX: flush processor icache at this point if cache flush is
expensive */
atomic_mb_set(&tb_ctx.tb_flush_count, tb_ctx.tb_flush_count + 1);
@@ -1279,9 +1257,9 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
cflags |= CF_USE_ICOUNT;
}
+ buffer_overflow:
tb = tb_alloc(pc);
if (unlikely(!tb)) {
- buffer_overflow:
/* flush must be done */
tb_flush(cpu);
mmap_unlock();
@@ -1365,9 +1343,9 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
}
#endif
- tcg_ctx->code_gen_ptr = (void *)
+ atomic_set(&tcg_ctx->code_gen_ptr, (void *)
ROUND_UP((uintptr_t)gen_code_buf + gen_code_size + search_size,
- CODE_GEN_ALIGN);
+ CODE_GEN_ALIGN));
/* init jump list */
assert(((uintptr_t)tb & 3) == 0);
@@ -1909,9 +1887,8 @@ void dump_exec_info(FILE *f, fprintf_function cpu_fprintf)
* otherwise users might think "-tb-size" is not honoured.
* For avg host size we use the precise numbers from tb_tree_stats though.
*/
- cpu_fprintf(f, "gen code size %td/%zd\n",
- tcg_ctx->code_gen_ptr - tcg_ctx->code_gen_buffer,
- tcg_ctx->code_gen_highwater - tcg_ctx->code_gen_buffer);
+ cpu_fprintf(f, "gen code size %zu/%zu\n",
+ tcg_code_size(), tcg_code_capacity());
cpu_fprintf(f, "TB count %zu\n", nb_tbs);
cpu_fprintf(f, "TB avg target size %zu max=%zu bytes\n",
nb_tbs ? tst.target_size / nb_tbs : 0,
diff --git a/bsd-user/main.c b/bsd-user/main.c
index 7a8b29e..aa52c97 100644
--- a/bsd-user/main.c
+++ b/bsd-user/main.c
@@ -979,6 +979,7 @@ int main(int argc, char **argv)
generating the prologue until now so that the prologue can take
the real value of GUEST_BASE into account. */
tcg_prologue_init(tcg_ctx);
+ tcg_region_init();
/* build Task State */
memset(ts, 0, sizeof(TaskState));
diff --git a/cpus.c b/cpus.c
index 9bed61e..6022d40 100644
--- a/cpus.c
+++ b/cpus.c
@@ -1664,6 +1664,18 @@ static void qemu_tcg_init_vcpu(CPUState *cpu)
char thread_name[VCPU_THREAD_NAME_SIZE];
static QemuCond *single_tcg_halt_cond;
static QemuThread *single_tcg_cpu_thread;
+ static int tcg_region_inited;
+
+ /*
+ * Initialize TCG regions--once. Now is a good time, because:
+ * (1) TCG's init context, prologue and target globals have been set up.
+ * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
+ * -accel flag is processed, so the check doesn't work then).
+ */
+ if (!tcg_region_inited) {
+ tcg_region_inited = 1;
+ tcg_region_init();
+ }
if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
cpu->thread = g_malloc0(sizeof(QemuThread));
diff --git a/linux-user/main.c b/linux-user/main.c
index de7d948..aab4433 100644
--- a/linux-user/main.c
+++ b/linux-user/main.c
@@ -4460,6 +4460,7 @@ int main(int argc, char **argv, char **envp)
generating the prologue until now so that the prologue can take
the real value of GUEST_BASE into account. */
tcg_prologue_init(tcg_ctx);
+ tcg_region_init();
#if defined(TARGET_I386)
env->cr[0] = CR0_PG_MASK | CR0_WP_MASK | CR0_PE_MASK;
diff --git a/tcg/tcg.c b/tcg/tcg.c
index cb4ecbd..c77654f 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -120,6 +120,30 @@ static bool tcg_out_tb_finalize(TCGContext *s);
static TCGContext **tcg_ctxs;
static unsigned int n_tcg_ctxs;
+/*
+ * We divide code_gen_buffer into equally-sized "regions" that TCG threads
+ * dynamically allocate from as demand dictates. Given appropriate region
+ * sizing, this minimizes flushes even when some TCG threads generate a lot
+ * more code than others.
+ */
+struct tcg_region_state {
+ QemuMutex lock;
+
+ /* fields set at init time */
+ void *start;
+ void *start_aligned;
+ void *end;
+ size_t n;
+ size_t size; /* size of one region */
+ size_t stride; /* .size + guard size */
+
+ /* fields protected by the lock */
+ size_t current; /* current region index */
+ size_t agg_size_full; /* aggregate size of full regions */
+};
+
+static struct tcg_region_state region;
+
static TCGRegSet tcg_target_available_regs[2];
static TCGRegSet tcg_target_call_clobber_regs;
@@ -257,6 +281,196 @@ TCGLabel *gen_new_label(void)
#include "tcg-target.inc.c"
+static void tcg_region_bounds(size_t curr_region, void **pstart, void **pend)
+{
+ void *start, *end;
+
+ start = region.start_aligned + curr_region * region.stride;
+ end = start + region.size;
+
+ if (curr_region == 0) {
+ start = region.start;
+ }
+ if (curr_region == region.n - 1) {
+ end = region.end;
+ }
+
+ *pstart = start;
+ *pend = end;
+}
+
+static void tcg_region_assign(TCGContext *s, size_t curr_region)
+{
+ void *start, *end;
+
+ tcg_region_bounds(curr_region, &start, &end);
+
+ s->code_gen_buffer = start;
+ s->code_gen_ptr = start;
+ s->code_gen_buffer_size = end - start;
+ s->code_gen_highwater = end - TCG_HIGHWATER;
+}
+
+static bool tcg_region_alloc__locked(TCGContext *s)
+{
+ if (region.current == region.n) {
+ return true;
+ }
+ tcg_region_assign(s, region.current);
+ region.current++;
+ return false;
+}
+
+/*
+ * Request a new region once the one in use has filled up.
+ * Returns true on error.
+ */
+static bool tcg_region_alloc(TCGContext *s)
+{
+ bool err;
+ /* read the region size now; alloc__locked will overwrite it on success */
+ size_t size_full = s->code_gen_buffer_size;
+
+ qemu_mutex_lock(&region.lock);
+ err = tcg_region_alloc__locked(s);
+ if (!err) {
+ region.agg_size_full += size_full - TCG_HIGHWATER;
+ }
+ qemu_mutex_unlock(&region.lock);
+ return err;
+}
+
+/*
+ * Perform a context's first region allocation.
+ * This function does _not_ increment region.agg_size_full.
+ */
+static inline bool tcg_region_initial_alloc__locked(TCGContext *s)
+{
+ return tcg_region_alloc__locked(s);
+}
+
+/* Call from a safe-work context */
+void tcg_region_reset_all(void)
+{
+ unsigned int i;
+
+ qemu_mutex_lock(&region.lock);
+ region.current = 0;
+ region.agg_size_full = 0;
+
+ for (i = 0; i < n_tcg_ctxs; i++) {
+ bool err = tcg_region_initial_alloc__locked(tcg_ctxs[i]);
+
+ g_assert(!err);
+ }
+ qemu_mutex_unlock(&region.lock);
+}
+
+/*
+ * Initializes region partitioning.
+ *
+ * Called at init time from the parent thread (i.e. the one calling
+ * tcg_context_init), after the target's TCG globals have been set.
+ */
+void tcg_region_init(void)
+{
+ void *buf = tcg_init_ctx.code_gen_buffer;
+ void *aligned;
+ size_t size = tcg_init_ctx.code_gen_buffer_size;
+ size_t page_size = qemu_real_host_page_size;
+ size_t region_size;
+ size_t n_regions;
+ size_t i;
+
+ /* We do not yet support multiple TCG contexts, so use one region for now */
+ n_regions = 1;
+
+ /* The first region will be 'aligned - buf' bytes larger than the others */
+ aligned = QEMU_ALIGN_PTR_UP(buf, page_size);
+ g_assert(aligned < tcg_init_ctx.code_gen_buffer + size);
+ /*
+ * Make region_size a multiple of page_size, using aligned as the start.
+ * As a result of this we might end up with a few extra pages at the end of
+ * the buffer; we will assign those to the last region.
+ */
+ region_size = (size - (aligned - buf)) / n_regions;
+ region_size = QEMU_ALIGN_DOWN(region_size, page_size);
+
+ /* A region must have at least 2 pages; one code, one guard */
+ g_assert(region_size >= 2 * page_size);
+
+ /* init the region struct */
+ qemu_mutex_init(&region.lock);
+ region.n = n_regions;
+ region.size = region_size - page_size;
+ region.stride = region_size;
+ region.start = buf;
+ region.start_aligned = aligned;
+ /* page-align the end, since its last page will be a guard page */
+ region.end = QEMU_ALIGN_PTR_DOWN(buf + size, page_size);
+ /* account for that last guard page */
+ region.end -= page_size;
+
+ /* set guard pages */
+ for (i = 0; i < region.n; i++) {
+ void *start, *end;
+ int rc;
+
+ tcg_region_bounds(i, &start, &end);
+ rc = qemu_mprotect_none(end, page_size);
+ g_assert(!rc);
+ }
+
+ /* We do not yet support multiple TCG contexts so allocate the region now */
+ {
+ bool err = tcg_region_initial_alloc__locked(tcg_ctx);
+
+ g_assert(!err);
+ }
+}
+
+/*
+ * Returns the size (in bytes) of all translated code (i.e. from all regions)
+ * currently in the cache.
+ * See also: tcg_code_capacity()
+ * Do not confuse with tcg_current_code_size(); that one applies to a single
+ * TCG context.
+ */
+size_t tcg_code_size(void)
+{
+ unsigned int i;
+ size_t total;
+
+ qemu_mutex_lock(&region.lock);
+ total = region.agg_size_full;
+ for (i = 0; i < n_tcg_ctxs; i++) {
+ const TCGContext *s = tcg_ctxs[i];
+ size_t size;
+
+ size = atomic_read(&s->code_gen_ptr) - s->code_gen_buffer;
+ g_assert(size <= s->code_gen_buffer_size);
+ total += size;
+ }
+ qemu_mutex_unlock(&region.lock);
+ return total;
+}
+
+/*
+ * Returns the code capacity (in bytes) of the entire cache, i.e. including all
+ * regions.
+ * See also: tcg_code_size()
+ */
+size_t tcg_code_capacity(void)
+{
+ size_t guard_size, capacity;
+
+ /* no need for synchronization; these variables are set at init time */
+ guard_size = region.stride - region.size;
+ capacity = region.end + guard_size - region.start;
+ capacity -= region.n * (guard_size + TCG_HIGHWATER);
+ return capacity;
+}
+
/* pool based memory allocation */
void *tcg_malloc_internal(TCGContext *s, int size)
{
@@ -400,13 +614,17 @@ TranslationBlock *tcg_tb_alloc(TCGContext *s)
TranslationBlock *tb;
void *next;
+ retry:
tb = (void *)ROUND_UP((uintptr_t)s->code_gen_ptr, align);
next = (void *)ROUND_UP((uintptr_t)(tb + 1), align);
if (unlikely(next > s->code_gen_highwater)) {
- return NULL;
+ if (tcg_region_alloc(s)) {
+ return NULL;
+ }
+ goto retry;
}
- s->code_gen_ptr = next;
+ atomic_set(&s->code_gen_ptr, next);
return tb;
}
--
2.7.4