* [Qemu-devel] [PATCH v2 01/13] exec-all: add tb_from_jmp_cache
2017-04-25 7:53 [Qemu-devel] [PATCH v2 00/13] TCG optimizations for 2.10 Emilio G. Cota
@ 2017-04-25 7:53 ` Emilio G. Cota
2017-04-25 7:53 ` [Qemu-devel] [PATCH v2 02/13] exec-all: inline tb_from_jmp_cache Emilio G. Cota
` (11 subsequent siblings)
12 siblings, 0 replies; 25+ messages in thread
From: Emilio G. Cota @ 2017-04-25 7:53 UTC (permalink / raw)
To: qemu-devel
Cc: Paolo Bonzini, Peter Crosthwaite, Richard Henderson,
Peter Maydell, Eduardo Habkost, Andrzej Zaborowski,
Aurelien Jarno, Alexander Graf, Stefan Weil, qemu-arm,
alex.bennee, Pranith Kumar
This paves the way for upcoming changes.
Signed-off-by: Emilio G. Cota <cota@braap.org>
---
cpu-exec.c | 19 +++++++++++++++++++
include/exec/exec-all.h | 2 +-
2 files changed, 20 insertions(+), 1 deletion(-)
diff --git a/cpu-exec.c b/cpu-exec.c
index 63a56d0..b4adf16 100644
--- a/cpu-exec.c
+++ b/cpu-exec.c
@@ -309,6 +309,25 @@ static bool tb_cmp(const void *p, const void *d)
return false;
}
+TranslationBlock *tb_from_jmp_cache(CPUArchState *env, target_ulong vaddr)
+{
+ CPUState *cpu = ENV_GET_CPU(env);
+ TranslationBlock *tb;
+ target_ulong cs_base, pc;
+ uint32_t flags;
+
+ if (unlikely(atomic_read(&cpu->exit_request))) {
+ return NULL;
+ }
+ cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
+ tb = atomic_rcu_read(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(vaddr)]);
+ if (likely(tb && tb->pc == vaddr && tb->cs_base == cs_base &&
+ tb->flags == flags)) {
+ return tb;
+ }
+ return NULL;
+}
+
static TranslationBlock *tb_htable_lookup(CPUState *cpu,
target_ulong pc,
target_ulong cs_base,
diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
index bcde1e6..18b80bc 100644
--- a/include/exec/exec-all.h
+++ b/include/exec/exec-all.h
@@ -56,7 +56,6 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
target_ulong pc, target_ulong cs_base,
uint32_t flags,
int cflags);
-
void QEMU_NORETURN cpu_loop_exit(CPUState *cpu);
void QEMU_NORETURN cpu_loop_exit_restore(CPUState *cpu, uintptr_t pc);
void QEMU_NORETURN cpu_loop_exit_atomic(CPUState *cpu, uintptr_t pc);
@@ -368,6 +367,7 @@ struct TranslationBlock {
void tb_free(TranslationBlock *tb);
void tb_flush(CPUState *cpu);
void tb_phys_invalidate(TranslationBlock *tb, tb_page_addr_t page_addr);
+TranslationBlock *tb_from_jmp_cache(CPUArchState *env, target_ulong vaddr);
#if defined(USE_DIRECT_JUMP)
--
2.7.4
^ permalink raw reply related [flat|nested] 25+ messages in thread
* [Qemu-devel] [PATCH v2 02/13] exec-all: inline tb_from_jmp_cache
2017-04-25 7:53 [Qemu-devel] [PATCH v2 00/13] TCG optimizations for 2.10 Emilio G. Cota
2017-04-25 7:53 ` [Qemu-devel] [PATCH v2 01/13] exec-all: add tb_from_jmp_cache Emilio G. Cota
@ 2017-04-25 7:53 ` Emilio G. Cota
2017-04-25 11:00 ` Richard Henderson
2017-04-25 11:15 ` Richard Henderson
2017-04-25 7:53 ` [Qemu-devel] [PATCH v2 03/13] tcg: enforce 64-byte alignment of TCGContext Emilio G. Cota
` (10 subsequent siblings)
12 siblings, 2 replies; 25+ messages in thread
From: Emilio G. Cota @ 2017-04-25 7:53 UTC (permalink / raw)
To: qemu-devel
Cc: Paolo Bonzini, Peter Crosthwaite, Richard Henderson,
Peter Maydell, Eduardo Habkost, Andrzej Zaborowski,
Aurelien Jarno, Alexander Graf, Stefan Weil, qemu-arm,
alex.bennee, Pranith Kumar
The inline improves performance, as shown in upcoming commits' logs.
This commit is kept separate to ease review, since the inclusion
of tb-hash.h might be controversial. The problem here, which was
introduced before this commit, is that tb_hash_func() depends on
tb_page_addr_t: this defeats the original purpose of tb-hash.h,
which was to be self-contained and CPU-agnostic.
Signed-off-by: Emilio G. Cota <cota@braap.org>
---
cpu-exec.c | 19 -------------------
include/exec/exec-all.h | 24 +++++++++++++++++++++++-
2 files changed, 23 insertions(+), 20 deletions(-)
diff --git a/cpu-exec.c b/cpu-exec.c
index b4adf16..63a56d0 100644
--- a/cpu-exec.c
+++ b/cpu-exec.c
@@ -309,25 +309,6 @@ static bool tb_cmp(const void *p, const void *d)
return false;
}
-TranslationBlock *tb_from_jmp_cache(CPUArchState *env, target_ulong vaddr)
-{
- CPUState *cpu = ENV_GET_CPU(env);
- TranslationBlock *tb;
- target_ulong cs_base, pc;
- uint32_t flags;
-
- if (unlikely(atomic_read(&cpu->exit_request))) {
- return NULL;
- }
- cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
- tb = atomic_rcu_read(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(vaddr)]);
- if (likely(tb && tb->pc == vaddr && tb->cs_base == cs_base &&
- tb->flags == flags)) {
- return tb;
- }
- return NULL;
-}
-
static TranslationBlock *tb_htable_lookup(CPUState *cpu,
target_ulong pc,
target_ulong cs_base,
diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
index 18b80bc..bd76987 100644
--- a/include/exec/exec-all.h
+++ b/include/exec/exec-all.h
@@ -367,7 +367,29 @@ struct TranslationBlock {
void tb_free(TranslationBlock *tb);
void tb_flush(CPUState *cpu);
void tb_phys_invalidate(TranslationBlock *tb, tb_page_addr_t page_addr);
-TranslationBlock *tb_from_jmp_cache(CPUArchState *env, target_ulong vaddr);
+
+/* tb_hash_func() in tb-hash.h needs tb_page_addr_t, defined above */
+#include "tb-hash.h"
+
+static inline
+TranslationBlock *tb_from_jmp_cache(CPUArchState *env, target_ulong vaddr)
+{
+ CPUState *cpu = ENV_GET_CPU(env);
+ TranslationBlock *tb;
+ target_ulong cs_base, pc;
+ uint32_t flags;
+
+ if (unlikely(atomic_read(&cpu->exit_request))) {
+ return NULL;
+ }
+ cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
+ tb = atomic_rcu_read(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(vaddr)]);
+ if (likely(tb && tb->pc == vaddr && tb->cs_base == cs_base &&
+ tb->flags == flags)) {
+ return tb;
+ }
+ return NULL;
+}
#if defined(USE_DIRECT_JUMP)
--
2.7.4
^ permalink raw reply related [flat|nested] 25+ messages in thread
* Re: [Qemu-devel] [PATCH v2 02/13] exec-all: inline tb_from_jmp_cache
2017-04-25 7:53 ` [Qemu-devel] [PATCH v2 02/13] exec-all: inline tb_from_jmp_cache Emilio G. Cota
@ 2017-04-25 11:00 ` Richard Henderson
2017-04-25 11:15 ` Richard Henderson
1 sibling, 0 replies; 25+ messages in thread
From: Richard Henderson @ 2017-04-25 11:00 UTC (permalink / raw)
To: Emilio G. Cota, qemu-devel
Cc: Paolo Bonzini, Peter Crosthwaite, Peter Maydell, Eduardo Habkost,
Andrzej Zaborowski, Aurelien Jarno, Alexander Graf, Stefan Weil,
qemu-arm, alex.bennee, Pranith Kumar
On 04/25/2017 09:53 AM, Emilio G. Cota wrote:
> The inline improves performance, as shown in upcoming commits' logs.
>
> This commit is kept separate to ease review, since the inclusion
> of tb-hash.h might be controversial. The problem here, which was
> introduced before this commit, is that tb_hash_func() depends on
> tb_page_addr_t: this defeats the original purpose of tb-hash.h,
> which was to be self-contained and CPU-agnostic.
>
> Signed-off-by: Emilio G. Cota<cota@braap.org>
> ---
> cpu-exec.c | 19 -------------------
> include/exec/exec-all.h | 24 +++++++++++++++++++++++-
> 2 files changed, 23 insertions(+), 20 deletions(-)
Is there a reason we shouldn't just inline this code directly into
HELPER(lookup_tb_ptr)? I think that would save a bit of churn, and I can't
think of any other reason we'd want to use this function.
r~
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [Qemu-devel] [PATCH v2 02/13] exec-all: inline tb_from_jmp_cache
2017-04-25 7:53 ` [Qemu-devel] [PATCH v2 02/13] exec-all: inline tb_from_jmp_cache Emilio G. Cota
2017-04-25 11:00 ` Richard Henderson
@ 2017-04-25 11:15 ` Richard Henderson
1 sibling, 0 replies; 25+ messages in thread
From: Richard Henderson @ 2017-04-25 11:15 UTC (permalink / raw)
To: Emilio G. Cota, qemu-devel
Cc: Paolo Bonzini, Peter Crosthwaite, Peter Maydell, Eduardo Habkost,
Andrzej Zaborowski, Aurelien Jarno, Alexander Graf, Stefan Weil,
qemu-arm, alex.bennee, Pranith Kumar
On 04/25/2017 09:53 AM, Emilio G. Cota wrote:
> +/* tb_hash_func() in tb-hash.h needs tb_page_addr_t, defined above */
> +#include "tb-hash.h"
> +
This causes an include loop (I think), and quite a few targets fail to build.
Are you using --target-list in your testing?
r~
^ permalink raw reply [flat|nested] 25+ messages in thread
* [Qemu-devel] [PATCH v2 03/13] tcg: enforce 64-byte alignment of TCGContext
2017-04-25 7:53 [Qemu-devel] [PATCH v2 00/13] TCG optimizations for 2.10 Emilio G. Cota
2017-04-25 7:53 ` [Qemu-devel] [PATCH v2 01/13] exec-all: add tb_from_jmp_cache Emilio G. Cota
2017-04-25 7:53 ` [Qemu-devel] [PATCH v2 02/13] exec-all: inline tb_from_jmp_cache Emilio G. Cota
@ 2017-04-25 7:53 ` Emilio G. Cota
2017-04-25 11:01 ` Richard Henderson
2017-04-25 7:53 ` [Qemu-devel] [PATCH v2 04/13] tcg: keep TCGContext's read-mostly fields in a separate cache line Emilio G. Cota
` (9 subsequent siblings)
12 siblings, 1 reply; 25+ messages in thread
From: Emilio G. Cota @ 2017-04-25 7:53 UTC (permalink / raw)
To: qemu-devel
Cc: Paolo Bonzini, Peter Crosthwaite, Richard Henderson,
Peter Maydell, Eduardo Habkost, Andrzej Zaborowski,
Aurelien Jarno, Alexander Graf, Stefan Weil, qemu-arm,
alex.bennee, Pranith Kumar
This will allow us to prevent cache line false sharing in TCGContext.
Before:
$ objdump -t build/x86_64-linux-user/qemu-x86_64 | grep tcg_ctx
00000000003ea820 g O .bss 00000000000152d8 tcg_ctx
After:
$ objdump -t build/x86_64-linux-user/qemu-x86_64 | grep tcg_ctx
00000000003ea880 g O .bss 0000000000015300 tcg_ctx
Signed-off-by: Emilio G. Cota <cota@braap.org>
---
tcg/tcg.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tcg/tcg.h b/tcg/tcg.h
index 6c216bb..5fdbfe3 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -727,7 +727,7 @@ struct TCGContext {
uint16_t gen_insn_end_off[TCG_MAX_INSNS];
target_ulong gen_insn_data[TCG_MAX_INSNS][TARGET_INSN_START_WORDS];
-};
+} QEMU_ALIGNED(64);
extern TCGContext tcg_ctx;
extern bool parallel_cpus;
--
2.7.4
^ permalink raw reply related [flat|nested] 25+ messages in thread
* Re: [Qemu-devel] [PATCH v2 03/13] tcg: enforce 64-byte alignment of TCGContext
2017-04-25 7:53 ` [Qemu-devel] [PATCH v2 03/13] tcg: enforce 64-byte alignment of TCGContext Emilio G. Cota
@ 2017-04-25 11:01 ` Richard Henderson
0 siblings, 0 replies; 25+ messages in thread
From: Richard Henderson @ 2017-04-25 11:01 UTC (permalink / raw)
To: Emilio G. Cota, qemu-devel
Cc: Paolo Bonzini, Peter Crosthwaite, Peter Maydell, Eduardo Habkost,
Andrzej Zaborowski, Aurelien Jarno, Alexander Graf, Stefan Weil,
qemu-arm, alex.bennee, Pranith Kumar
On 04/25/2017 09:53 AM, Emilio G. Cota wrote:
> This will allow us to prevent cache line false sharing in TCGContext.
>
> Before:
> $ objdump -t build/x86_64-linux-user/qemu-x86_64 | grep tcg_ctx
> 00000000003ea820 g O .bss 00000000000152d8 tcg_ctx
>
> After:
> $ objdump -t build/x86_64-linux-user/qemu-x86_64 | grep tcg_ctx
> 00000000003ea880 g O .bss 0000000000015300 tcg_ctx
>
> Signed-off-by: Emilio G. Cota <cota@braap.org>
> ---
> tcg/tcg.h | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/tcg/tcg.h b/tcg/tcg.h
> index 6c216bb..5fdbfe3 100644
> --- a/tcg/tcg.h
> +++ b/tcg/tcg.h
> @@ -727,7 +727,7 @@ struct TCGContext {
>
> uint16_t gen_insn_end_off[TCG_MAX_INSNS];
> target_ulong gen_insn_data[TCG_MAX_INSNS][TARGET_INSN_START_WORDS];
> -};
> +} QEMU_ALIGNED(64);
Let's drop the alignment and structure re-arrangement for now and focus on the
task of goto_ptr.
r~
^ permalink raw reply [flat|nested] 25+ messages in thread
* [Qemu-devel] [PATCH v2 04/13] tcg: keep TCGContext's read-mostly fields in a separate cache line
2017-04-25 7:53 [Qemu-devel] [PATCH v2 00/13] TCG optimizations for 2.10 Emilio G. Cota
` (2 preceding siblings ...)
2017-04-25 7:53 ` [Qemu-devel] [PATCH v2 03/13] tcg: enforce 64-byte alignment of TCGContext Emilio G. Cota
@ 2017-04-25 7:53 ` Emilio G. Cota
2017-04-25 7:53 ` [Qemu-devel] [PATCH v2 05/13] tcg-runtime: add lookup_tb_ptr helper Emilio G. Cota
` (8 subsequent siblings)
12 siblings, 0 replies; 25+ messages in thread
From: Emilio G. Cota @ 2017-04-25 7:53 UTC (permalink / raw)
To: qemu-devel
Cc: Paolo Bonzini, Peter Crosthwaite, Richard Henderson,
Peter Maydell, Eduardo Habkost, Andrzej Zaborowski,
Aurelien Jarno, Alexander Graf, Stefan Weil, qemu-arm,
alex.bennee, Pranith Kumar
Upcoming changes will require reading from TCGContext from a
parallel fast path. Prepare for this by keeping the struct's
read-mostly fields in a separate cache line, thereby preventing
false cache line sharing.
Signed-off-by: Emilio G. Cota <cota@braap.org>
---
tcg/tcg.h | 23 ++++++++++++++---------
1 file changed, 14 insertions(+), 9 deletions(-)
diff --git a/tcg/tcg.h b/tcg/tcg.h
index 5fdbfe3..b26f0ef 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -642,6 +642,20 @@ QEMU_BUILD_BUG_ON(OPPARAM_BUF_SIZE > (1 << 14));
QEMU_BUILD_BUG_ON(sizeof(TCGOp) > 8);
struct TCGContext {
+ /* Read-mostly fields go here to prevent false sharing */
+ struct {
+ GHashTable *helpers;
+
+ void *code_gen_prologue;
+ void *code_gen_buffer;
+ size_t code_gen_buffer_size;
+
+ /* Threshold to flush the translated code buffer. */
+ void *code_gen_highwater;
+
+ int code_gen_max_blocks;
+ } QEMU_ALIGNED(64);
+
uint8_t *pool_cur, *pool_end;
TCGPool *pool_first, *pool_current, *pool_first_large;
int nb_labels;
@@ -663,8 +677,6 @@ struct TCGContext {
tcg_insn_unit *code_ptr;
- GHashTable *helpers;
-
#ifdef CONFIG_PROFILER
/* profiling info */
int64_t tb_count1;
@@ -697,15 +709,8 @@ struct TCGContext {
here, because there's too much arithmetic throughout that relies
on addition and subtraction working on bytes. Rely on the GCC
extension that allows arithmetic on void*. */
- int code_gen_max_blocks;
- void *code_gen_prologue;
- void *code_gen_buffer;
- size_t code_gen_buffer_size;
void *code_gen_ptr;
- /* Threshold to flush the translated code buffer. */
- void *code_gen_highwater;
-
TBContext tb_ctx;
/* Track which vCPU triggers events */
--
2.7.4
^ permalink raw reply related [flat|nested] 25+ messages in thread
* [Qemu-devel] [PATCH v2 05/13] tcg-runtime: add lookup_tb_ptr helper
2017-04-25 7:53 [Qemu-devel] [PATCH v2 00/13] TCG optimizations for 2.10 Emilio G. Cota
` (3 preceding siblings ...)
2017-04-25 7:53 ` [Qemu-devel] [PATCH v2 04/13] tcg: keep TCGContext's read-mostly fields in a separate cache line Emilio G. Cota
@ 2017-04-25 7:53 ` Emilio G. Cota
2017-04-25 11:02 ` Richard Henderson
2017-04-25 7:53 ` [Qemu-devel] [PATCH v2 06/13] tcg: add goto_ptr opcode Emilio G. Cota
` (7 subsequent siblings)
12 siblings, 1 reply; 25+ messages in thread
From: Emilio G. Cota @ 2017-04-25 7:53 UTC (permalink / raw)
To: qemu-devel
Cc: Paolo Bonzini, Peter Crosthwaite, Richard Henderson,
Peter Maydell, Eduardo Habkost, Andrzej Zaborowski,
Aurelien Jarno, Alexander Graf, Stefan Weil, qemu-arm,
alex.bennee, Pranith Kumar
Signed-off-by: Emilio G. Cota <cota@braap.org>
---
tcg-runtime.c | 7 +++++++
tcg/tcg-runtime.h | 2 ++
tcg/tcg.h | 1 +
3 files changed, 10 insertions(+)
diff --git a/tcg-runtime.c b/tcg-runtime.c
index 4c60c96..f291184 100644
--- a/tcg-runtime.c
+++ b/tcg-runtime.c
@@ -141,6 +141,13 @@ uint64_t HELPER(ctpop_i64)(uint64_t arg)
return ctpop64(arg);
}
+void *HELPER(lookup_tb_ptr)(CPUArchState *env, target_ulong addr)
+{
+ TranslationBlock *tb = tb_from_jmp_cache(env, addr);
+
+ return tb ? tb->tc_ptr : tcg_ctx.code_gen_epilogue;
+}
+
void HELPER(exit_atomic)(CPUArchState *env)
{
cpu_loop_exit_atomic(ENV_GET_CPU(env), GETPC());
diff --git a/tcg/tcg-runtime.h b/tcg/tcg-runtime.h
index 114ea6f..c41d38a 100644
--- a/tcg/tcg-runtime.h
+++ b/tcg/tcg-runtime.h
@@ -24,6 +24,8 @@ DEF_HELPER_FLAGS_1(clrsb_i64, TCG_CALL_NO_RWG_SE, i64, i64)
DEF_HELPER_FLAGS_1(ctpop_i32, TCG_CALL_NO_RWG_SE, i32, i32)
DEF_HELPER_FLAGS_1(ctpop_i64, TCG_CALL_NO_RWG_SE, i64, i64)
+DEF_HELPER_FLAGS_2(lookup_tb_ptr, TCG_CALL_NO_WG_SE, ptr, env, tl)
+
DEF_HELPER_FLAGS_1(exit_atomic, TCG_CALL_NO_WG, noreturn, env)
#ifdef CONFIG_SOFTMMU
diff --git a/tcg/tcg.h b/tcg/tcg.h
index b26f0ef..625e2aa 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -647,6 +647,7 @@ struct TCGContext {
GHashTable *helpers;
void *code_gen_prologue;
+ void *code_gen_epilogue;
void *code_gen_buffer;
size_t code_gen_buffer_size;
--
2.7.4
^ permalink raw reply related [flat|nested] 25+ messages in thread
* Re: [Qemu-devel] [PATCH v2 05/13] tcg-runtime: add lookup_tb_ptr helper
2017-04-25 7:53 ` [Qemu-devel] [PATCH v2 05/13] tcg-runtime: add lookup_tb_ptr helper Emilio G. Cota
@ 2017-04-25 11:02 ` Richard Henderson
0 siblings, 0 replies; 25+ messages in thread
From: Richard Henderson @ 2017-04-25 11:02 UTC (permalink / raw)
To: Emilio G. Cota, qemu-devel
Cc: Paolo Bonzini, Peter Crosthwaite, Peter Maydell, Eduardo Habkost,
Andrzej Zaborowski, Aurelien Jarno, Alexander Graf, Stefan Weil,
qemu-arm, alex.bennee, Pranith Kumar
On 04/25/2017 09:53 AM, Emilio G. Cota wrote:
> Signed-off-by: Emilio G. Cota <cota@braap.org>
> ---
> tcg-runtime.c | 7 +++++++
> tcg/tcg-runtime.h | 2 ++
> tcg/tcg.h | 1 +
> 3 files changed, 10 insertions(+)
Modulo what I mentioned earlier about maybe directly inlining tb_from_jmp_cache,
Reviewed-by: Richard Henderson <rth@twiddle.net>
r~
^ permalink raw reply [flat|nested] 25+ messages in thread
* [Qemu-devel] [PATCH v2 06/13] tcg: add goto_ptr opcode
2017-04-25 7:53 [Qemu-devel] [PATCH v2 00/13] TCG optimizations for 2.10 Emilio G. Cota
` (4 preceding siblings ...)
2017-04-25 7:53 ` [Qemu-devel] [PATCH v2 05/13] tcg-runtime: add lookup_tb_ptr helper Emilio G. Cota
@ 2017-04-25 7:53 ` Emilio G. Cota
2017-04-25 11:05 ` Richard Henderson
2017-04-25 12:09 ` Richard Henderson
2017-04-25 7:53 ` [Qemu-devel] [PATCH v2 07/13] tcg/i386: implement goto_ptr op Emilio G. Cota
` (6 subsequent siblings)
12 siblings, 2 replies; 25+ messages in thread
From: Emilio G. Cota @ 2017-04-25 7:53 UTC (permalink / raw)
To: qemu-devel
Cc: Paolo Bonzini, Peter Crosthwaite, Richard Henderson,
Peter Maydell, Eduardo Habkost, Andrzej Zaborowski,
Aurelien Jarno, Alexander Graf, Stefan Weil, qemu-arm,
alex.bennee, Pranith Kumar
Signed-off-by: Emilio G. Cota <cota@braap.org>
---
tcg/README | 11 +++++++++++
tcg/aarch64/tcg-target.h | 1 +
tcg/arm/tcg-target.h | 1 +
tcg/i386/tcg-target.h | 1 +
tcg/ia64/tcg-target.h | 1 +
tcg/mips/tcg-target.h | 1 +
tcg/ppc/tcg-target.h | 1 +
tcg/s390/tcg-target.h | 1 +
tcg/sparc/tcg-target.h | 1 +
tcg/tcg-op.c | 9 +++++++++
tcg/tcg-op.h | 9 +++++++++
tcg/tcg-opc.h | 1 +
tcg/tcg.c | 1 +
tcg/tci/tcg-target.h | 1 +
14 files changed, 40 insertions(+)
diff --git a/tcg/README b/tcg/README
index a9858c2..9cfd422 100644
--- a/tcg/README
+++ b/tcg/README
@@ -477,6 +477,17 @@ current TB was linked to this TB. Otherwise execute the next
instructions. Only indices 0 and 1 are valid and tcg_gen_goto_tb may be issued
at most once with each slot index per TB.
+* goto_ptr ptr
+
+Jump to a host address given by host pointer 'ptr'. Typically ptr is obtained
+from the lookup_tb_ptr TCG helper. The return value of this helper depends on
+whether the TB is currently valid: if it is, the corresponding host address
+is returned; if it is not valid, the helper returns the address of the TCG
+epilogue, which restores state to go back to the exec loop.
+
+Implementing goto_ptr is optional for TCG backends. When not implemented,
+calling it is equivalent to calling exit_tb(0).
+
* qemu_ld_i32/i64 t0, t1, flags, memidx
* qemu_st_i32/i64 t0, t1, flags, memidx
diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h
index 1a5ea23..b82eac4 100644
--- a/tcg/aarch64/tcg-target.h
+++ b/tcg/aarch64/tcg-target.h
@@ -77,6 +77,7 @@ typedef enum {
#define TCG_TARGET_HAS_mulsh_i32 0
#define TCG_TARGET_HAS_extrl_i64_i32 0
#define TCG_TARGET_HAS_extrh_i64_i32 0
+#define TCG_TARGET_HAS_goto_ptr 0
#define TCG_TARGET_HAS_div_i64 1
#define TCG_TARGET_HAS_rem_i64 1
diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h
index 09a19c6..2f3ecfd 100644
--- a/tcg/arm/tcg-target.h
+++ b/tcg/arm/tcg-target.h
@@ -123,6 +123,7 @@ extern bool use_idiv_instructions;
#define TCG_TARGET_HAS_mulsh_i32 0
#define TCG_TARGET_HAS_div_i32 use_idiv_instructions
#define TCG_TARGET_HAS_rem_i32 0
+#define TCG_TARGET_HAS_goto_ptr 0
enum {
TCG_AREG0 = TCG_REG_R6,
diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index 4275787..59d9835 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -107,6 +107,7 @@ extern bool have_popcnt;
#define TCG_TARGET_HAS_muls2_i32 1
#define TCG_TARGET_HAS_muluh_i32 0
#define TCG_TARGET_HAS_mulsh_i32 0
+#define TCG_TARGET_HAS_goto_ptr 0
#if TCG_TARGET_REG_BITS == 64
#define TCG_TARGET_HAS_extrl_i64_i32 0
diff --git a/tcg/ia64/tcg-target.h b/tcg/ia64/tcg-target.h
index 42aea03..901bb75 100644
--- a/tcg/ia64/tcg-target.h
+++ b/tcg/ia64/tcg-target.h
@@ -173,6 +173,7 @@ typedef enum {
#define TCG_TARGET_HAS_mulsh_i64 0
#define TCG_TARGET_HAS_extrl_i64_i32 0
#define TCG_TARGET_HAS_extrh_i64_i32 0
+#define TCG_TARGET_HAS_goto_ptr 0
#define TCG_TARGET_deposit_i32_valid(ofs, len) ((len) <= 16)
#define TCG_TARGET_deposit_i64_valid(ofs, len) ((len) <= 16)
diff --git a/tcg/mips/tcg-target.h b/tcg/mips/tcg-target.h
index f46d64a..e3240cf 100644
--- a/tcg/mips/tcg-target.h
+++ b/tcg/mips/tcg-target.h
@@ -130,6 +130,7 @@ extern bool use_mips32r2_instructions;
#define TCG_TARGET_HAS_muluh_i32 1
#define TCG_TARGET_HAS_mulsh_i32 1
#define TCG_TARGET_HAS_bswap32_i32 1
+#define TCG_TARGET_HAS_goto_ptr 0
#if TCG_TARGET_REG_BITS == 64
#define TCG_TARGET_HAS_add2_i32 0
diff --git a/tcg/ppc/tcg-target.h b/tcg/ppc/tcg-target.h
index abd8b3d..a9aa974 100644
--- a/tcg/ppc/tcg-target.h
+++ b/tcg/ppc/tcg-target.h
@@ -82,6 +82,7 @@ extern bool have_isa_3_00;
#define TCG_TARGET_HAS_muls2_i32 0
#define TCG_TARGET_HAS_muluh_i32 1
#define TCG_TARGET_HAS_mulsh_i32 1
+#define TCG_TARGET_HAS_goto_ptr 0
#if TCG_TARGET_REG_BITS == 64
#define TCG_TARGET_HAS_add2_i32 0
diff --git a/tcg/s390/tcg-target.h b/tcg/s390/tcg-target.h
index cbdd2a6..6b7bcfb 100644
--- a/tcg/s390/tcg-target.h
+++ b/tcg/s390/tcg-target.h
@@ -92,6 +92,7 @@ extern uint64_t s390_facilities;
#define TCG_TARGET_HAS_mulsh_i32 0
#define TCG_TARGET_HAS_extrl_i64_i32 0
#define TCG_TARGET_HAS_extrh_i64_i32 0
+#define TCG_TARGET_HAS_goto_ptr 0
#define TCG_TARGET_HAS_div2_i64 1
#define TCG_TARGET_HAS_rot_i64 1
diff --git a/tcg/sparc/tcg-target.h b/tcg/sparc/tcg-target.h
index b8b74f9..9348ddd 100644
--- a/tcg/sparc/tcg-target.h
+++ b/tcg/sparc/tcg-target.h
@@ -123,6 +123,7 @@ extern bool use_vis3_instructions;
#define TCG_TARGET_HAS_muls2_i32 1
#define TCG_TARGET_HAS_muluh_i32 0
#define TCG_TARGET_HAS_mulsh_i32 0
+#define TCG_TARGET_HAS_goto_ptr 0
#define TCG_TARGET_HAS_extrl_i64_i32 1
#define TCG_TARGET_HAS_extrh_i64_i32 1
diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c
index 95a39b7..e8a140b 100644
--- a/tcg/tcg-op.c
+++ b/tcg/tcg-op.c
@@ -2587,6 +2587,15 @@ void tcg_gen_goto_tb(unsigned idx)
tcg_gen_op1i(INDEX_op_goto_tb, idx);
}
+void tcg_gen_goto_ptr(TCGv_ptr ptr)
+{
+ if (TCG_TARGET_HAS_goto_ptr) {
+ tcg_gen_op1i(INDEX_op_goto_ptr, GET_TCGV_PTR(ptr));
+ } else {
+ tcg_gen_exit_tb(0);
+ }
+}
+
static inline TCGMemOp tcg_canonicalize_memop(TCGMemOp op, bool is64, bool st)
{
/* Trigger the asserts within as early as possible. */
diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h
index c68e300..d65727e 100644
--- a/tcg/tcg-op.h
+++ b/tcg/tcg-op.h
@@ -796,6 +796,15 @@ static inline void tcg_gen_exit_tb(uintptr_t val)
*/
void tcg_gen_goto_tb(unsigned idx);
+/**
+ * tcg_gen_goto_ptr() - output a jump to a host address
+ * @ptr: pointer with the target host address
+ *
+ * Implementing this operation is optional. If the TCG backend does not support
+ * it, this call is equivalent to tcg_gen_exit_tb() with 0 as the argument.
+ */
+void tcg_gen_goto_ptr(TCGv_ptr ptr);
+
#if TARGET_LONG_BITS == 32
#define tcg_temp_new() tcg_temp_new_i32()
#define tcg_global_reg_new tcg_global_reg_new_i32
diff --git a/tcg/tcg-opc.h b/tcg/tcg-opc.h
index f06f894..c64b994 100644
--- a/tcg/tcg-opc.h
+++ b/tcg/tcg-opc.h
@@ -193,6 +193,7 @@ DEF(insn_start, 0, 0, TLADDR_ARGS * TARGET_INSN_START_WORDS,
TCG_OPF_NOT_PRESENT)
DEF(exit_tb, 0, 0, 1, TCG_OPF_BB_END)
DEF(goto_tb, 0, 0, 1, TCG_OPF_BB_END)
+DEF(goto_ptr, 0, 1, 0, TCG_OPF_BB_END)
DEF(qemu_ld_i32, 1, TLADDR_ARGS, 1,
TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS)
diff --git a/tcg/tcg.c b/tcg/tcg.c
index cb898f1..0ea57c0 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -1138,6 +1138,7 @@ void tcg_dump_ops(TCGContext *s)
}
switch (c) {
case INDEX_op_set_label:
+ case INDEX_op_goto_tb:
case INDEX_op_br:
case INDEX_op_brcond_i32:
case INDEX_op_brcond_i64:
diff --git a/tcg/tci/tcg-target.h b/tcg/tci/tcg-target.h
index 838bf3a..0696328 100644
--- a/tcg/tci/tcg-target.h
+++ b/tcg/tci/tcg-target.h
@@ -85,6 +85,7 @@
#define TCG_TARGET_HAS_muls2_i32 0
#define TCG_TARGET_HAS_muluh_i32 0
#define TCG_TARGET_HAS_mulsh_i32 0
+#define TCG_TARGET_HAS_goto_ptr 0
#if TCG_TARGET_REG_BITS == 64
#define TCG_TARGET_HAS_extrl_i64_i32 0
--
2.7.4
^ permalink raw reply related [flat|nested] 25+ messages in thread
* Re: [Qemu-devel] [PATCH v2 06/13] tcg: add goto_ptr opcode
2017-04-25 7:53 ` [Qemu-devel] [PATCH v2 06/13] tcg: add goto_ptr opcode Emilio G. Cota
@ 2017-04-25 11:05 ` Richard Henderson
2017-04-25 12:09 ` Richard Henderson
1 sibling, 0 replies; 25+ messages in thread
From: Richard Henderson @ 2017-04-25 11:05 UTC (permalink / raw)
To: Emilio G. Cota, qemu-devel
Cc: Paolo Bonzini, Peter Crosthwaite, Peter Maydell, Eduardo Habkost,
Andrzej Zaborowski, Aurelien Jarno, Alexander Graf, Stefan Weil,
qemu-arm, alex.bennee, Pranith Kumar
On 04/25/2017 09:53 AM, Emilio G. Cota wrote:
> @@ -1138,6 +1138,7 @@ void tcg_dump_ops(TCGContext *s)
> }
> switch (c) {
> case INDEX_op_set_label:
> + case INDEX_op_goto_tb:
> case INDEX_op_br:
> case INDEX_op_brcond_i32:
> case INDEX_op_brcond_i64:
This is wrong, and causes crashes when dumping. Nor should goto_ptr be here,
so I don't know what you were after.
r~
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [Qemu-devel] [PATCH v2 06/13] tcg: add goto_ptr opcode
2017-04-25 7:53 ` [Qemu-devel] [PATCH v2 06/13] tcg: add goto_ptr opcode Emilio G. Cota
2017-04-25 11:05 ` Richard Henderson
@ 2017-04-25 12:09 ` Richard Henderson
1 sibling, 0 replies; 25+ messages in thread
From: Richard Henderson @ 2017-04-25 12:09 UTC (permalink / raw)
To: Emilio G. Cota, qemu-devel
Cc: Paolo Bonzini, Peter Crosthwaite, Peter Maydell, Eduardo Habkost,
Andrzej Zaborowski, Aurelien Jarno, Alexander Graf, Stefan Weil,
qemu-arm, alex.bennee, Pranith Kumar
On 04/25/2017 09:53 AM, Emilio G. Cota wrote:
> +void tcg_gen_goto_ptr(TCGv_ptr ptr)
> +{
> + if (TCG_TARGET_HAS_goto_ptr) {
> + tcg_gen_op1i(INDEX_op_goto_ptr, GET_TCGV_PTR(ptr));
> + } else {
> + tcg_gen_exit_tb(0);
> + }
> +}
> +
I think this function should look more like
/*
 * Emit a TB lookup for @target_addr followed by a goto_ptr to the host
 * pointer that the lookup_tb_ptr helper produces.
 *
 * When the backend does not implement goto_ptr (TCG_TARGET_HAS_goto_ptr
 * is 0), emit exit_tb(0) instead.
 */
void tcg_gen_lookup_and_goto_ptr(TCGv target_addr)
{
    if (TCG_TARGET_HAS_goto_ptr) {
        TCGv_ptr ptr = tcg_temp_new_ptr();

        /* The helper stores the host address to jump to in ptr. */
        gen_helper_lookup_tb_ptr(ptr, tcg_ctx.tcg_env, target_addr);
        tcg_gen_op1i(INDEX_op_goto_ptr, GET_TCGV_PTR(ptr));
        tcg_temp_free_ptr(ptr);
    } else {
        tcg_gen_exit_tb(0);
    }
}
since there's not really any point in all targets being exposed to the
implementation detail of lookup_tb_ptr.
r~
^ permalink raw reply [flat|nested] 25+ messages in thread
* [Qemu-devel] [PATCH v2 07/13] tcg/i386: implement goto_ptr op
2017-04-25 7:53 [Qemu-devel] [PATCH v2 00/13] TCG optimizations for 2.10 Emilio G. Cota
` (5 preceding siblings ...)
2017-04-25 7:53 ` [Qemu-devel] [PATCH v2 06/13] tcg: add goto_ptr opcode Emilio G. Cota
@ 2017-04-25 7:53 ` Emilio G. Cota
2017-04-25 11:24 ` Richard Henderson
2017-04-25 11:32 ` Richard Henderson
2017-04-25 7:53 ` [Qemu-devel] [PATCH v2 08/13] target/arm: optimize cross-page block chaining in softmmu Emilio G. Cota
` (5 subsequent siblings)
12 siblings, 2 replies; 25+ messages in thread
From: Emilio G. Cota @ 2017-04-25 7:53 UTC (permalink / raw)
To: qemu-devel
Cc: Paolo Bonzini, Peter Crosthwaite, Richard Henderson,
Peter Maydell, Eduardo Habkost, Andrzej Zaborowski,
Aurelien Jarno, Alexander Graf, Stefan Weil, qemu-arm,
alex.bennee, Pranith Kumar
Signed-off-by: Emilio G. Cota <cota@braap.org>
---
tcg/i386/tcg-target.h | 2 +-
tcg/i386/tcg-target.inc.c | 13 +++++++++++++
2 files changed, 14 insertions(+), 1 deletion(-)
diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index 59d9835..73a15f7 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -107,7 +107,7 @@ extern bool have_popcnt;
#define TCG_TARGET_HAS_muls2_i32 1
#define TCG_TARGET_HAS_muluh_i32 0
#define TCG_TARGET_HAS_mulsh_i32 0
-#define TCG_TARGET_HAS_goto_ptr 0
+#define TCG_TARGET_HAS_goto_ptr 1
#if TCG_TARGET_REG_BITS == 64
#define TCG_TARGET_HAS_extrl_i64_i32 0
diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
index 5918008..f6fb03e 100644
--- a/tcg/i386/tcg-target.inc.c
+++ b/tcg/i386/tcg-target.inc.c
@@ -1906,6 +1906,14 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
}
s->tb_jmp_reset_offset[a0] = tcg_current_code_size(s);
break;
+ case INDEX_op_goto_ptr:
+ /* save target address into new register */
+ tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_EDX, a0);
+ /* set return value to 0 */
+ tgen_arithr(s, ARITH_XOR, TCG_REG_EAX, TCG_REG_EAX);
+ /* jmp to the target address (could be epilogue) */
+ tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_EDX);
+ break;
case INDEX_op_br:
tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0);
break;
@@ -2277,6 +2285,7 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
{
+ static const TCGTargetOpDef ri = { .args_ct_str = { "ri" } };
static const TCGTargetOpDef ri_r = { .args_ct_str = { "ri", "r" } };
static const TCGTargetOpDef re_r = { .args_ct_str = { "re", "r" } };
static const TCGTargetOpDef qi_r = { .args_ct_str = { "qi", "r" } };
@@ -2324,6 +2333,9 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
case INDEX_op_st_i64:
return &re_r;
+ case INDEX_op_goto_ptr:
+ return &ri;
+
case INDEX_op_add_i32:
case INDEX_op_add_i64:
return &r_r_re;
@@ -2569,6 +2581,7 @@ static void tcg_target_qemu_prologue(TCGContext *s)
/* TB epilogue */
tb_ret_addr = s->code_ptr;
+ s->code_gen_epilogue = s->code_ptr;
tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);
--
2.7.4
^ permalink raw reply related [flat|nested] 25+ messages in thread
* Re: [Qemu-devel] [PATCH v2 07/13] tcg/i386: implement goto_ptr op
2017-04-25 7:53 ` [Qemu-devel] [PATCH v2 07/13] tcg/i386: implement goto_ptr op Emilio G. Cota
@ 2017-04-25 11:24 ` Richard Henderson
2017-04-25 11:32 ` Richard Henderson
1 sibling, 0 replies; 25+ messages in thread
From: Richard Henderson @ 2017-04-25 11:24 UTC (permalink / raw)
To: Emilio G. Cota, qemu-devel
Cc: Paolo Bonzini, Peter Crosthwaite, Peter Maydell, Eduardo Habkost,
Andrzej Zaborowski, Aurelien Jarno, Alexander Graf, Stefan Weil,
qemu-arm, alex.bennee, Pranith Kumar
On 04/25/2017 09:53 AM, Emilio G. Cota wrote:
> + case INDEX_op_goto_ptr:
> + /* save target address into new register */
> + tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_EDX, a0);
> + /* set return value to 0 */
> + tgen_arithr(s, ARITH_XOR, TCG_REG_EAX, TCG_REG_EAX);
> + /* jmp to the target address (could be epilogue) */
> + tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_EDX);
> + break;
I've just thought of an improvement to be made here -- move the xor after the
jump (and therefore just before the "normal" epilogue) like so.
r~
diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
index f6fb03e..f636557 100644
--- a/tcg/i386/tcg-target.inc.c
+++ b/tcg/i386/tcg-target.inc.c
@@ -1907,12 +1907,8 @@ static inline void tcg_out_op
s->tb_jmp_reset_offset[a0] = tcg_current_code_size(s);
break;
case INDEX_op_goto_ptr:
- /* save target address into new register */
- tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_EDX, a0);
- /* set return value to 0 */
- tgen_arithr(s, ARITH_XOR, TCG_REG_EAX, TCG_REG_EAX);
- /* jmp to the target address (could be epilogue) */
- tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_EDX);
+ /* jmp to the given host address (could be epilogue) */
+ tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0);
break;
case INDEX_op_br:
tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0);
@@ -2579,10 +2575,13 @@ static void tcg_target_qemu_prologue
tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
#endif
- /* TB epilogue */
- tb_ret_addr = s->code_ptr;
+ /* Return path for goto_ptr. Set return value to 0, a-la exit_tb,
+ and fall through to the rest of the epilogue. */
s->code_gen_epilogue = s->code_ptr;
+ tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);
+ /* TB epilogue */
+ tb_ret_addr = s->code_ptr;
tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);
for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
^ permalink raw reply related [flat|nested] 25+ messages in thread
* Re: [Qemu-devel] [PATCH v2 07/13] tcg/i386: implement goto_ptr op
2017-04-25 7:53 ` [Qemu-devel] [PATCH v2 07/13] tcg/i386: implement goto_ptr op Emilio G. Cota
2017-04-25 11:24 ` Richard Henderson
@ 2017-04-25 11:32 ` Richard Henderson
1 sibling, 0 replies; 25+ messages in thread
From: Richard Henderson @ 2017-04-25 11:32 UTC (permalink / raw)
To: Emilio G. Cota, qemu-devel
Cc: Paolo Bonzini, Peter Crosthwaite, Peter Maydell, Eduardo Habkost,
Andrzej Zaborowski, Aurelien Jarno, Alexander Graf, Stefan Weil,
qemu-arm, alex.bennee, Pranith Kumar
On 04/25/2017 09:53 AM, Emilio G. Cota wrote:
> static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
> {
> + static const TCGTargetOpDef ri = { .args_ct_str = { "ri" } };
> static const TCGTargetOpDef ri_r = { .args_ct_str = { "ri", "r" } };
> static const TCGTargetOpDef re_r = { .args_ct_str = { "re", "r" } };
> static const TCGTargetOpDef qi_r = { .args_ct_str = { "qi", "r" } };
> @@ -2324,6 +2333,9 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
> case INDEX_op_st_i64:
> return &re_r;
>
> + case INDEX_op_goto_ptr:
> + return &ri;
> +
This is incorrect. You only handle register inputs (i.e. just "r") in your
implementation. Indeed, that's also the only thing that makes sense.
r~
^ permalink raw reply [flat|nested] 25+ messages in thread
* [Qemu-devel] [PATCH v2 08/13] target/arm: optimize cross-page block chaining in softmmu
2017-04-25 7:53 [Qemu-devel] [PATCH v2 00/13] TCG optimizations for 2.10 Emilio G. Cota
` (6 preceding siblings ...)
2017-04-25 7:53 ` [Qemu-devel] [PATCH v2 07/13] tcg/i386: implement goto_ptr op Emilio G. Cota
@ 2017-04-25 7:53 ` Emilio G. Cota
2017-04-25 11:11 ` Richard Henderson
2017-04-25 7:53 ` [Qemu-devel] [PATCH v2 09/13] target/arm: optimize indirect branches with TCG's goto_ptr Emilio G. Cota
` (4 subsequent siblings)
12 siblings, 1 reply; 25+ messages in thread
From: Emilio G. Cota @ 2017-04-25 7:53 UTC (permalink / raw)
To: qemu-devel
Cc: Paolo Bonzini, Peter Crosthwaite, Richard Henderson,
Peter Maydell, Eduardo Habkost, Andrzej Zaborowski,
Aurelien Jarno, Alexander Graf, Stefan Weil, qemu-arm,
alex.bennee, Pranith Kumar
Instead of unconditionally exiting to the exec loop, use the
lookup_tb_ptr helper to jump to the target if it is valid.
As long as the hit rate in tb_jmp_cache remains high, this
will improve performance.
Perf impact: see the next commit's log.
Signed-off-by: Emilio G. Cota <cota@braap.org>
---
target/arm/translate.c | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/target/arm/translate.c b/target/arm/translate.c
index e32e38c..574cf70 100644
--- a/target/arm/translate.c
+++ b/target/arm/translate.c
@@ -4085,8 +4085,12 @@ static inline void gen_goto_tb(DisasContext *s, int n, target_ulong dest)
gen_set_pc_im(s, dest);
tcg_gen_exit_tb((uintptr_t)s->tb + n);
} else {
+ TCGv_ptr ptr = tcg_temp_new_ptr();
+
gen_set_pc_im(s, dest);
- tcg_gen_exit_tb(0);
+ gen_helper_lookup_tb_ptr(ptr, cpu_env, cpu_R[15]);
+ tcg_gen_goto_ptr(ptr);
+ tcg_temp_free_ptr(ptr);
}
}
--
2.7.4
^ permalink raw reply related [flat|nested] 25+ messages in thread
* Re: [Qemu-devel] [PATCH v2 08/13] target/arm: optimize cross-page block chaining in softmmu
2017-04-25 7:53 ` [Qemu-devel] [PATCH v2 08/13] target/arm: optimize cross-page block chaining in softmmu Emilio G. Cota
@ 2017-04-25 11:11 ` Richard Henderson
0 siblings, 0 replies; 25+ messages in thread
From: Richard Henderson @ 2017-04-25 11:11 UTC (permalink / raw)
To: Emilio G. Cota, qemu-devel
Cc: Paolo Bonzini, Peter Crosthwaite, Peter Maydell, Eduardo Habkost,
Andrzej Zaborowski, Aurelien Jarno, Alexander Graf, Stefan Weil,
qemu-arm, alex.bennee, Pranith Kumar
On 04/25/2017 09:53 AM, Emilio G. Cota wrote:
> Instead of unconditionally exiting to the exec loop, use the
> lookup_tb_ptr helper to jump to the target if it is valid.
> As long as the hit rate in tb_jmp_cache remains high, this
> will improve performance.
>
> Perf impact: see the next commit's log.
>
> Signed-off-by: Emilio G. Cota <cota@braap.org>
> ---
> target/arm/translate.c | 6 +++++-
> 1 file changed, 5 insertions(+), 1 deletion(-)
>
> diff --git a/target/arm/translate.c b/target/arm/translate.c
> index e32e38c..574cf70 100644
> --- a/target/arm/translate.c
> +++ b/target/arm/translate.c
> @@ -4085,8 +4085,12 @@ static inline void gen_goto_tb(DisasContext *s, int n, target_ulong dest)
> gen_set_pc_im(s, dest);
> tcg_gen_exit_tb((uintptr_t)s->tb + n);
> } else {
> + TCGv_ptr ptr = tcg_temp_new_ptr();
> +
> gen_set_pc_im(s, dest);
> - tcg_gen_exit_tb(0);
> + gen_helper_lookup_tb_ptr(ptr, cpu_env, cpu_R[15]);
> + tcg_gen_goto_ptr(ptr);
> + tcg_temp_free_ptr(ptr);
> }
This does not compile for aarch64. You need to tcg_gen_extu_i32_tl first.
r~
^ permalink raw reply [flat|nested] 25+ messages in thread
* [Qemu-devel] [PATCH v2 09/13] target/arm: optimize indirect branches with TCG's goto_ptr
2017-04-25 7:53 [Qemu-devel] [PATCH v2 00/13] TCG optimizations for 2.10 Emilio G. Cota
` (7 preceding siblings ...)
2017-04-25 7:53 ` [Qemu-devel] [PATCH v2 08/13] target/arm: optimize cross-page block chaining in softmmu Emilio G. Cota
@ 2017-04-25 7:53 ` Emilio G. Cota
2017-04-25 11:12 ` Richard Henderson
2017-04-25 7:53 ` [Qemu-devel] [PATCH v2 10/13] target/i386: introduce gen_jr() helper to jump to register Emilio G. Cota
` (3 subsequent siblings)
12 siblings, 1 reply; 25+ messages in thread
From: Emilio G. Cota @ 2017-04-25 7:53 UTC (permalink / raw)
To: qemu-devel
Cc: Paolo Bonzini, Peter Crosthwaite, Richard Henderson,
Peter Maydell, Eduardo Habkost, Andrzej Zaborowski,
Aurelien Jarno, Alexander Graf, Stefan Weil, qemu-arm,
alex.bennee, Pranith Kumar
Speed up indirect branches by directly jumping to the target
if it is valid, i.e. if it is found in tb_jmp_cache.
Softmmu measurements: (see later commit for user-mode results)
Note: baseline (i.e. speedup == 1x) is QEMU v2.9.0.
- Impact on Boot time
| setup | ARM debian boot+shutdown time | stddev |
|---------------+-------------------------------+--------|
| v2.9.0 | 10.35 | 0.07 |
| +cross+inline | 10.32 | 0.03 |
| +jr+inline | 10.59 | 0.20 |
- NBench, arm-softmmu (debian jessie guest). Host: Intel i7-4790K @ 4.00GHz
1.25x +-+-------------------------------------------------------------------------------------------------------------+-+
| +++ | |
| cross+inline #### | |
1.2x +cross+jr+inline.........................................#++#......####.........................................+-+
| +++# # #| # |
| +++ +++ **** # #| # +++ |
| | #### * * # #++# | |
1.15x +-+................................****++#............*..*..#......#..#.....####................................+-+
| *++* # * * # # # # |# |
| * * # +++ * * # # # #++# |
1.1x +-+................................*..*..#.......|....*..*..#......#..#.....#..#................................+-+
| * * # #### * * # # # # # #### |
| +++ * * # #++# * * # # # # # # # |
| #### * * # # # * * # # # # # # # |
1.05x +-+..........................#++#..*..*..#......#..#..*..*..#...+++#..#.....#..#................+++......#..#...+-+
| # # * * # # # * * # ***** # # # +++#### ***** # |
| ++++++ +++ +++ # # * * # +++# # * * # * | * # # # **** # *+++* # |
1x +-++-+++++####++****###++++-+#++#+-*++*++#-+++++#-+#++*++*++#++*+-+*++#+-+++#++#-+*****###++*++*++#++*+-+*++#+-++-+
| *****++# *++*++# | # # * * # | # # * * # * * # **** # *+++*++# * * # * * # |
| *+++* # * * # ***** # * * # ***** # * * # * * # *++* # * * # * * # * * # |
| * * # * * # *+++* # * * # * | * # * * # * * # * * # * * # * * # * * # |
0.95x +-+...*...*..#..*..*..#..*...*..#..*..*..#..*+++*..#..*..*..#..*...*..#..*..*..#..*...*..#..*..*..#..*...*..#...+-+
| * * # * * # * * # * * # * * # * * # * * # * * # * * # * * # * * # |
| * * # * * # * * # * * # * * # * * # * * # * * # * * # * * # * * # |
0.9x +-+---*****###--****###--*****###--****###--*****###--****###--*****###--****###--*****###--****###--*****###---+-+
ASSIGNMENT BITFIELD FOURIER FP EMULATION HUFFMAN IDEA LU DECOMPOSITION NEURAL NET NUMERIC SORT STRING SORT hmean
png: http://imgur.com/528aS76
NB. 'cross' represents the previous commit.
Signed-off-by: Emilio G. Cota <cota@braap.org>
---
target/arm/translate.c | 14 ++++++++++++++
1 file changed, 14 insertions(+)
diff --git a/target/arm/translate.c b/target/arm/translate.c
index 574cf70..d5296b1 100644
--- a/target/arm/translate.c
+++ b/target/arm/translate.c
@@ -65,6 +65,7 @@ static TCGv_i32 cpu_R[16];
TCGv_i32 cpu_CF, cpu_NF, cpu_VF, cpu_ZF;
TCGv_i64 cpu_exclusive_addr;
TCGv_i64 cpu_exclusive_val;
+static bool gen_jr;
/* FIXME: These should be removed. */
static TCGv_i32 cpu_F0s, cpu_F1s;
@@ -221,6 +222,7 @@ static void store_reg(DisasContext *s, int reg, TCGv_i32 var)
*/
tcg_gen_andi_i32(var, var, s->thumb ? ~1 : ~3);
s->is_jmp = DISAS_JUMP;
+ gen_jr = true;
}
tcg_gen_mov_i32(cpu_R[reg], var);
tcg_temp_free_i32(var);
@@ -893,6 +895,7 @@ static inline void gen_bx_im(DisasContext *s, uint32_t addr)
tcg_temp_free_i32(tmp);
}
tcg_gen_movi_i32(cpu_R[15], addr & ~1);
+ gen_jr = true;
}
/* Set PC and Thumb state from var. var is marked as dead. */
@@ -902,6 +905,7 @@ static inline void gen_bx(DisasContext *s, TCGv_i32 var)
tcg_gen_andi_i32(cpu_R[15], var, ~1);
tcg_gen_andi_i32(var, var, 1);
store_cpu_field(var, thumb);
+ gen_jr = true;
}
/* Variant of store_reg which uses branch&exchange logic when storing
@@ -12034,6 +12038,16 @@ void gen_intermediate_code(CPUARMState *env, TranslationBlock *tb)
gen_set_pc_im(dc, dc->pc);
/* fall through */
case DISAS_JUMP:
+ if (gen_jr) {
+ TCGv_ptr ptr = tcg_temp_new_ptr();
+
+ gen_jr = false;
+ gen_helper_lookup_tb_ptr(ptr, cpu_env, cpu_R[15]);
+ tcg_gen_goto_ptr(ptr);
+ tcg_temp_free_ptr(ptr);
+ break;
+ }
+ /* fall through */
default:
/* indicate that the hash table must be used to find the next TB */
tcg_gen_exit_tb(0);
--
2.7.4
^ permalink raw reply related [flat|nested] 25+ messages in thread
* Re: [Qemu-devel] [PATCH v2 09/13] target/arm: optimize indirect branches with TCG's goto_ptr
2017-04-25 7:53 ` [Qemu-devel] [PATCH v2 09/13] target/arm: optimize indirect branches with TCG's goto_ptr Emilio G. Cota
@ 2017-04-25 11:12 ` Richard Henderson
0 siblings, 0 replies; 25+ messages in thread
From: Richard Henderson @ 2017-04-25 11:12 UTC (permalink / raw)
To: Emilio G. Cota, qemu-devel
Cc: Paolo Bonzini, Peter Crosthwaite, Peter Maydell, Eduardo Habkost,
Andrzej Zaborowski, Aurelien Jarno, Alexander Graf, Stefan Weil,
qemu-arm, alex.bennee, Pranith Kumar
On 04/25/2017 09:53 AM, Emilio G. Cota wrote:
> +
> + gen_jr = false;
> + gen_helper_lookup_tb_ptr(ptr, cpu_env, cpu_R[15]);
> + tcg_gen_goto_ptr(ptr);
> + tcg_temp_free_ptr(ptr);
> + break;
Likewise doesn't compile for aarch64.
r~
^ permalink raw reply [flat|nested] 25+ messages in thread
* [Qemu-devel] [PATCH v2 10/13] target/i386: introduce gen_jr() helper to jump to register
2017-04-25 7:53 [Qemu-devel] [PATCH v2 00/13] TCG optimizations for 2.10 Emilio G. Cota
` (8 preceding siblings ...)
2017-04-25 7:53 ` [Qemu-devel] [PATCH v2 09/13] target/arm: optimize indirect branches with TCG's goto_ptr Emilio G. Cota
@ 2017-04-25 7:53 ` Emilio G. Cota
2017-04-25 7:53 ` [Qemu-devel] [PATCH v2 11/13] target/i386: optimize cross-page direct jumps in softmmu Emilio G. Cota
` (2 subsequent siblings)
12 siblings, 0 replies; 25+ messages in thread
From: Emilio G. Cota @ 2017-04-25 7:53 UTC (permalink / raw)
To: qemu-devel
Cc: Paolo Bonzini, Peter Crosthwaite, Richard Henderson,
Peter Maydell, Eduardo Habkost, Andrzej Zaborowski,
Aurelien Jarno, Alexander Graf, Stefan Weil, qemu-arm,
alex.bennee, Pranith Kumar
This helper will be used by subsequent commits.
Signed-off-by: Emilio G. Cota <cota@braap.org>
---
target/i386/translate.c | 28 +++++++++++++++++++++++-----
1 file changed, 23 insertions(+), 5 deletions(-)
diff --git a/target/i386/translate.c b/target/i386/translate.c
index 1d1372f..445082b 100644
--- a/target/i386/translate.c
+++ b/target/i386/translate.c
@@ -141,6 +141,7 @@ typedef struct DisasContext {
} DisasContext;
static void gen_eob(DisasContext *s);
+static void gen_jr(DisasContext *s, TCGv dest);
static void gen_jmp(DisasContext *s, target_ulong eip);
static void gen_jmp_tb(DisasContext *s, target_ulong eip, int tb_num);
static void gen_op(DisasContext *s1, int op, TCGMemOp ot, int d);
@@ -2509,7 +2510,8 @@ static void gen_bnd_jmp(DisasContext *s)
If INHIBIT, set HF_INHIBIT_IRQ_MASK if it isn't already set.
If RECHECK_TF, emit a rechecking helper for #DB, ignoring the state of
S->TF. This is used by the syscall/sysret insns. */
-static void gen_eob_worker(DisasContext *s, bool inhibit, bool recheck_tf)
+static void
+gen_eob_worker(DisasContext *s, bool inhibit, bool recheck_tf, TCGv jr)
{
gen_update_cc_op(s);
@@ -2530,6 +2532,16 @@ static void gen_eob_worker(DisasContext *s, bool inhibit, bool recheck_tf)
tcg_gen_exit_tb(0);
} else if (s->tf) {
gen_helper_single_step(cpu_env);
+ } else if (jr) {
+ TCGv vaddr = tcg_temp_new();
+ TCGv_ptr ptr = tcg_temp_new_ptr();
+
+ tcg_gen_ld_tl(vaddr, cpu_env, offsetof(CPUX86State, segs[R_CS].base));
+ tcg_gen_add_tl(vaddr, vaddr, jr);
+ gen_helper_lookup_tb_ptr(ptr, cpu_env, vaddr);
+ tcg_temp_free(vaddr);
+ tcg_gen_goto_ptr(ptr);
+ tcg_temp_free_ptr(ptr);
} else {
tcg_gen_exit_tb(0);
}
@@ -2540,13 +2552,19 @@ static void gen_eob_worker(DisasContext *s, bool inhibit, bool recheck_tf)
If INHIBIT, set HF_INHIBIT_IRQ_MASK if it isn't already set. */
static void gen_eob_inhibit_irq(DisasContext *s, bool inhibit)
{
- gen_eob_worker(s, inhibit, false);
+ gen_eob_worker(s, inhibit, false, NULL);
}
/* End of block, resetting the inhibit irq flag. */
static void gen_eob(DisasContext *s)
{
- gen_eob_worker(s, false, false);
+ gen_eob_worker(s, false, false, NULL);
+}
+
+/* Jump to register */
+static void gen_jr(DisasContext *s, TCGv dest)
+{
+ gen_eob_worker(s, false, false, dest);
}
/* generate a jump to eip. No segment change must happen before as a
@@ -7131,7 +7149,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
/* TF handling for the syscall insn is different. The TF bit is checked
after the syscall insn completes. This allows #DB to not be
generated after one has entered CPL0 if TF is set in FMASK. */
- gen_eob_worker(s, false, true);
+ gen_eob_worker(s, false, true, NULL);
break;
case 0x107: /* sysret */
if (!s->pe) {
@@ -7146,7 +7164,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
checked after the sysret insn completes. This allows #DB to be
generated "as if" the syscall insn in userspace has just
completed. */
- gen_eob_worker(s, false, true);
+ gen_eob_worker(s, false, true, NULL);
}
break;
#endif
--
2.7.4
^ permalink raw reply related [flat|nested] 25+ messages in thread
* [Qemu-devel] [PATCH v2 11/13] target/i386: optimize cross-page direct jumps in softmmu
2017-04-25 7:53 [Qemu-devel] [PATCH v2 00/13] TCG optimizations for 2.10 Emilio G. Cota
` (9 preceding siblings ...)
2017-04-25 7:53 ` [Qemu-devel] [PATCH v2 10/13] target/i386: introduce gen_jr() helper to jump to register Emilio G. Cota
@ 2017-04-25 7:53 ` Emilio G. Cota
2017-04-25 7:53 ` [Qemu-devel] [PATCH v2 12/13] target/i386: optimize indirect branches Emilio G. Cota
2017-04-25 7:53 ` [Qemu-devel] [PATCH v2 13/13] tb-hash: improve tb_jmp_cache hash function in user mode Emilio G. Cota
12 siblings, 0 replies; 25+ messages in thread
From: Emilio G. Cota @ 2017-04-25 7:53 UTC (permalink / raw)
To: qemu-devel
Cc: Paolo Bonzini, Peter Crosthwaite, Richard Henderson,
Peter Maydell, Eduardo Habkost, Andrzej Zaborowski,
Aurelien Jarno, Alexander Graf, Stefan Weil, qemu-arm,
alex.bennee, Pranith Kumar
Instead of unconditionally exiting to the exec loop, use the
gen_jr helper to jump to the target if it is valid.
As long as the hit rate in tb_jmp_cache remains high, this
change improves performance.
Perf impact: see the next commit's log.
Signed-off-by: Emilio G. Cota <cota@braap.org>
---
target/i386/translate.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/target/i386/translate.c b/target/i386/translate.c
index 445082b..9982a2d 100644
--- a/target/i386/translate.c
+++ b/target/i386/translate.c
@@ -2154,9 +2154,9 @@ static inline void gen_goto_tb(DisasContext *s, int tb_num, target_ulong eip)
gen_jmp_im(eip);
tcg_gen_exit_tb((uintptr_t)s->tb + tb_num);
} else {
- /* jump to another page: currently not optimized */
+ /* jump to another page */
gen_jmp_im(eip);
- gen_eob(s);
+ gen_jr(s, cpu_tmp0);
}
}
--
2.7.4
^ permalink raw reply related [flat|nested] 25+ messages in thread
* [Qemu-devel] [PATCH v2 12/13] target/i386: optimize indirect branches
2017-04-25 7:53 [Qemu-devel] [PATCH v2 00/13] TCG optimizations for 2.10 Emilio G. Cota
` (10 preceding siblings ...)
2017-04-25 7:53 ` [Qemu-devel] [PATCH v2 11/13] target/i386: optimize cross-page direct jumps in softmmu Emilio G. Cota
@ 2017-04-25 7:53 ` Emilio G. Cota
2017-04-25 7:53 ` [Qemu-devel] [PATCH v2 13/13] tb-hash: improve tb_jmp_cache hash function in user mode Emilio G. Cota
12 siblings, 0 replies; 25+ messages in thread
From: Emilio G. Cota @ 2017-04-25 7:53 UTC (permalink / raw)
To: qemu-devel
Cc: Paolo Bonzini, Peter Crosthwaite, Richard Henderson,
Peter Maydell, Eduardo Habkost, Andrzej Zaborowski,
Aurelien Jarno, Alexander Graf, Stefan Weil, qemu-arm,
alex.bennee, Pranith Kumar
The appended patch minimizes exits to the exec loop for indirect branches.
By using the gen_jr helper, we can remain in TCG mode as long as
the indirect branch target is found in tb_jmp_cache.
This should improve performance for workloads that have a high
hit rate in tb_jmp_cache.
Softmmu Measurements: (see user-mode measurements in later commit)
Note: baseline (i.e. speedup == 1x) is QEMU v2.9.0.
- SPECint06 (test set), x86_64-softmmu (Ubuntu 16.04 guest). Host: Intel i7-4790K @ 4.00GHz
2.2x +-+--------------------------------------------------------------------------------------------------------------+-+
| +++ |
| cross+inline | |
2x +cross+jr+inline................................................................+++.|............................+-+
| | | |
| | | |
| | | |
1.8x +-+..............................................................................|..|............................+-+
| |#### |
| |# |# |
1.6x +-+............................................................................****.|#...........................+-+
| * |* |# |
| * |* |# |
| * |* |# |
1.4x +-+.......................................................................+++..*.|*.|#...........................+-+
| +++ | * |*++# +++ |
| +++ | #### * |* # +++ | |
1.2x +-+......................###.............+++............|.+++.............#++#.*++*..#...........|..|............+-+
| +++# # +++ | | | ++# # * * # +++ ****## #### |
| ++#### **** # +++#### #### *** | **** # * * # ++#### *| *|# ****++# |
| ****++# ++#### * * # **** # ++#| # ++#### *|*### ****## * * # * * # *** |# *++*+# *++* # |
1x +-++-*++*++#++***+-#++*++*+#++*+-*++#+****++#++***++#+-*+*++#-+*++*+#++*++*-+#+*++*-+#++*+*++#++*-+*+#++*++*++#-++-+
| * * # * * # * * # * * # *++* # * * # *+* |# * * # * * # * * # * * # * * # * * # |
| * * # * * # * * # * * # * * # * * # * *++# * * # * * # * * # * * # * * # * * # |
0.8x +-+--****###--***###--****##--****###-****###--***###--***###--****##--****###-****###--***###--****##--****###--+-+
astar bzip2 gcc gobmk h264ref hmmer libquantum mcf omnetpp perlbench sjeng xalancbmk hmean
png: http://imgur.com/aSXm0qh
NB. 'cross' represents the previous commit.
Signed-off-by: Emilio G. Cota <cota@braap.org>
---
target/i386/translate.c | 14 ++++++++------
1 file changed, 8 insertions(+), 6 deletions(-)
diff --git a/target/i386/translate.c b/target/i386/translate.c
index 9982a2d..0b4e1e1 100644
--- a/target/i386/translate.c
+++ b/target/i386/translate.c
@@ -4991,7 +4991,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
gen_push_v(s, cpu_T1);
gen_op_jmp_v(cpu_T0);
gen_bnd_jmp(s);
- gen_eob(s);
+ gen_jr(s, cpu_T0);
break;
case 3: /* lcall Ev */
gen_op_ld_v(s, ot, cpu_T1, cpu_A0);
@@ -5009,7 +5009,8 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
tcg_const_i32(dflag - 1),
tcg_const_i32(s->pc - s->cs_base));
}
- gen_eob(s);
+ tcg_gen_ld_tl(cpu_tmp4, cpu_env, offsetof(CPUX86State, eip));
+ gen_jr(s, cpu_tmp4);
break;
case 4: /* jmp Ev */
if (dflag == MO_16) {
@@ -5017,7 +5018,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
}
gen_op_jmp_v(cpu_T0);
gen_bnd_jmp(s);
- gen_eob(s);
+ gen_jr(s, cpu_T0);
break;
case 5: /* ljmp Ev */
gen_op_ld_v(s, ot, cpu_T1, cpu_A0);
@@ -5032,7 +5033,8 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
gen_op_movl_seg_T0_vm(R_CS);
gen_op_jmp_v(cpu_T1);
}
- gen_eob(s);
+ tcg_gen_ld_tl(cpu_tmp4, cpu_env, offsetof(CPUX86State, eip));
+ gen_jr(s, cpu_tmp4);
break;
case 6: /* push Ev */
gen_push_v(s, cpu_T0);
@@ -6412,7 +6414,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
/* Note that gen_pop_T0 uses a zero-extending load. */
gen_op_jmp_v(cpu_T0);
gen_bnd_jmp(s);
- gen_eob(s);
+ gen_jr(s, cpu_T0);
break;
case 0xc3: /* ret */
ot = gen_pop_T0(s);
@@ -6420,7 +6422,7 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
/* Note that gen_pop_T0 uses a zero-extending load. */
gen_op_jmp_v(cpu_T0);
gen_bnd_jmp(s);
- gen_eob(s);
+ gen_jr(s, cpu_T0);
break;
case 0xca: /* lret im */
val = cpu_ldsw_code(env, s->pc);
--
2.7.4
^ permalink raw reply related [flat|nested] 25+ messages in thread
* [Qemu-devel] [PATCH v2 13/13] tb-hash: improve tb_jmp_cache hash function in user mode
2017-04-25 7:53 [Qemu-devel] [PATCH v2 00/13] TCG optimizations for 2.10 Emilio G. Cota
` (11 preceding siblings ...)
2017-04-25 7:53 ` [Qemu-devel] [PATCH v2 12/13] target/i386: optimize indirect branches Emilio G. Cota
@ 2017-04-25 7:53 ` Emilio G. Cota
2017-04-25 11:19 ` Richard Henderson
12 siblings, 1 reply; 25+ messages in thread
From: Emilio G. Cota @ 2017-04-25 7:53 UTC (permalink / raw)
To: qemu-devel
Cc: Paolo Bonzini, Peter Crosthwaite, Richard Henderson,
Peter Maydell, Eduardo Habkost, Andrzej Zaborowski,
Aurelien Jarno, Alexander Graf, Stefan Weil, qemu-arm,
alex.bennee, Pranith Kumar
Optimizations to cross-page chaining and indirect branches make
performance more sensitive to the hit rate of tb_jmp_cache.
The constraint of reserving some bits for the page number
lowers the achievable quality of the hashing function.
However, user-mode does not have this requirement. Thus,
with this change we use for user-mode a hashing function that
is both faster and of better quality than the previous one.
Measurements:
Note: baseline (i.e. speedup == 1x) is QEMU v2.9.0.
- SPECint06 (test set), x86_64-linux-user. Host: Intel i7-6700K @ 4.00GHz
2x +-+--------------------------------------------------------------------------------------------------------------+-+
| +++++ |
| jr+noinline | | |
| jr+inline ++%%@ |
1.8x +-+jr+hash+noinline +..............................................|%%@...................................+-+
|jr+multhash+inline |%%@+ |
| jr+hash+inline +$$$%@ |
| ++##|$%@ +++ |
1.6x +-+....................................................................|##|$%@....................+%%%...........+-+
| @@+ **#+$%@ $$+% |
| $$$%@+ +**#+$%@ ++++ ++$$+%@ |
| ++++ $ $%@ **# $%@ +$$%@@+++$$ %@ |
1.4x +-+.....................+%%%@..........##+$%@..........................**#.$%@...........+$$%.@***$$.%@..........+-+
| ++$$+%@ ##+$%@ **# $%@ $$% @* *#$+%@ |
| ***#$ %@ +**# $%@ **# $%@ +###$% @* *#$ %@ |
| *+*#$ %@ +%%@+**# $%@ **# $%@ **+#$% @*+*#$ %@ +%%%@+ |
1.2x +-+..................*.*#$.%@***#$$%@+**#.$%@..........................**#.$%@.........**.#$%.@*.*#$.%@***#$+%@+.+-+
| +++ * *#$ %@* *# $%@ **# $%@ +++++++ **# $%@ +++%%@@** #$% @* *#$ %@*+*#$ %@ |
| ++###$%+ * *#$ %@* *# $%@ **# $%@ **##$%@@ **# $%@+**#$$%+@** #$% @* *#$ %@* *#$ %@ |
| +**+#$%@@ ++$$@@@* *#$ %@* *# $%@ **# $%@ ** #$% @+###++@@++++%%%+ **# $%@ **# $% @** #$% @* *#$ %@* *#$ %@ |
1x +-++-**+#$%-@**##$%+@*+*#$+%@*+*#+$%@+**#+$%@+**+#$%+@**+#$+@@***#$+%@+**#+$%@+**#+$%+@**+#$%+@*+*#$+%@*-*#$+%@-++-+
| ** #$% @** #$% @* *#$ %@* *# $%@ **# $%@ ** #$% @** #$%%@* *#$ %@ **# $%@ **# $% @** #$% @* *#$ %@* *#$ %@ |
| ** #$% @** #$% @* *#$ %@* *# $%@ **# $%@ ** #$% @** #$+%@* *#$ %@ **# $%@ **# $% @** #$% @* *#$ %@* *#$ %@ |
| ** #$% @** #$% @* *#$ %@* *# $%@ **# $%@ ** #$% @** #$ %@* *#$ %@ **# $%@ **# $% @** #$% @* *#$ %@* *#$ %@ |
0.8x +-+--**##$%@@**##$%@@***#$%%@***#$$%@-**#$$%@-**##$%@@**##$%%@***#$%%@-**#$$%@-**#$$%@@**##$%@@***#$%%@***#$%%@--+-+
astar bzip2 gcc gobmk h264ref hmmer libquantum mcf omnetpp perlbench sjeng xalancbmk hmean
png: http://imgur.com/1ZJGjzV
Here I also tried the hash function suggested by Paolo ("multhash"):
return ((uint64_t) (pc * 2654435761) >> 32) & (TB_JMP_CACHE_SIZE - 1);
As you can see it is just as good as the other new function ("hash"),
but I kept "hash" because with it all benchmarks have speedup > 1.
- SPECint06 (train set), x86_64-linux-user. Host: Intel i7-6700K @ 4.00GHz
2.6x +-+--------------------------------------------------------------------------------------------------------------+-+
| |
| jr+inline |
2.4x +jr+inline+hash....................................................................................###...........+-+
| # # |
| # # |
2.2x +-+................................................................................................#.#...........+-+
| # # |
| # # |
2x +-+................................................................................................#.#...........+-+
| # # |
| **** # |
1.8x +-+.............................................................................................*..*.#...........+-+
| +++ #### * * # |
| #### ****++# * * # |
1.6x +-+......................................+++...........................****..#.*++*..#..........*..*.#...........+-+
| #### *++* # * * # +++ * * # |
| +++ ++#++# * * # * * # #### * * # |
1.4x +-+...................+++###..........****..#..........................*..*..#.*..*..#....#..#..*..*.#...........+-+
| ****+# * * # * * # * * # *** # * * # #### |
| *++* # +++ * * # * * # * * # *+* # * * # ****++# |
1.2x +-+...................*..*.#..****###.*..*..#..........................*..*..#.*..*..#..*.*..#..*..*.#..*..*..#..+-+
| ****### +++ * * # * * # * * # * * # * * # * * # * * # * * # |
| * *++# ***### * * # * * # * * # * * # * * # * * # * * # * * # |
1x +-+--****###--***###--****##--****###-****###--***###--***###--****##--****###-****###--***###--****##--****###--+-+
astar bzip2 gcc gobmk h264ref hmmer libquantum mcf omnetpp perlbench sjeng xalancbmk hmean
png: http://imgur.com/1D2VFze
- NBench, x86_64-linux-user. Host: Intel i7-6700K @ 4.00GHz
1.1x +-+-------------------------------------------------------------------------------------------------------------+-+
| |
| jr+inline |
1.08x +jr+hash+noinline +..............................+++.....................................................+-+
| jr+hash+inline | |
| +++| |
| | |+++ |
1.06x +-+....................................................|.|.|....................................................+-+
| |###| +++++ |
| |#|#| ###$$$ |
1.04x +-+.........................+++....+++.+++.............|#|#$$$..............................++#|#++$............+-+
| |+++ |+++| ****|#| $ +++ |#+# $ |
| | | | | | * |*+#| $ |+++ **** # $ |
| +++ +++ | | ****| | * |* #++$ | |+++ * |* # $ |
1.02x +-+....|..................|####$$.*.|*|$$$$.++++++++.*.|*.#..$..........****|.|............*++*.#..$.++++++++...+-+
| ***+++ |# |#|$ * |*##| $ | | | * |* # $ * |*| | +++ * * # $ ***###$$ |
| *|* |+++ +++ +++ *** |#|$ * |*|#| $ ***###$$ *++* # $ +++ * |*##$$$ ####++ * * # $ *+*++# $ |
1x +-++-+*+*###+++****-$$$$+*+*++#+$+*++*+#++$+*+*++#+$+*++*-#++$+++-++$$$+*++*+#++$+***++#$$+*++*-#++$+*+*++#+$+-++-+
| * *++#$$ *++*|$++$ *|*++# $ * *+#++$ *+*++#|$ * * # $ *** |$+$ * *|#| $ *+* #+$ * * # $ * * # $ |
| * * #+$ * *## $ *+* # $ * * # $ * * #+$ * * # $ *+*### $ * *|#++$ * * # $ * * # $ * * # $ |
| * * # $ * *|# $ * * # $ * * # $ * * # $ * * # $ * *++# $ * *+# $ * * # $ * * # $ * * # $ |
0.98x +-+...*.*..#.$.*..*+#..$.*.*..#.$.*..*.#..$.*.*..#.$.*..*.#..$.*.*..#.$.*..*.#..$.*.*..#.$.*..*.#..$.*.*..#.$...+-+
| * * # $ * * # $ * * # $ * * # $ * * # $ * * # $ * * # $ * * # $ * * # $ * * # $ * * # $ |
| * * # $ * * # $ * * # $ * * # $ * * # $ * * # $ * * # $ * * # $ * * # $ * * # $ * * # $ |
0.96x +-+---***###$$-****##$$$-***###$$-****##$$$-***###$$-****##$$$-***###$$-****##$$$-***###$$-****##$$$-***###$$---+-+
ASSIGNMENT BITFIELD FOURIER FP EMULATION HUFFMAN IDEA LU DECOMPOSITION NEURAL NET NUMERIC SORT STRING SORT hmean
png: http://imgur.com/xK9YfOB
- NBench, arm-linux-user. Host: Intel i7-4790K @ 4.00GHz
1.3x +-+-------------------------------------------------------------------------------------------------------------+-+
| #### +++ |
| jr+inline #++# #### |
1.25x +jr+hash+inline..............#..#...........................................#++#................................+-+
| # # # # |
| # # # # |
| # # # # |
1.2x +-+..........................#..#..................................####.....#..#................................+-+
| # # +++#++# # # |
| # # ***** # # # |
1.15x +-+..........................#..#..............................*+++*..#.....#..#................................+-+
| # # * * # **** # |
| # # * * # *++* # |
| # # * * # * * # |
1.1x +-+..........................#..#...............+++............*...*..#..*..*..#................................+-+
| # # +++#### * * # * * # #### |
| # # ***** # * * # * * # # # |
1.05x +-+..........................#..#...........*...*..#...........*...*..#..*..*..#...............####......#..#...+-+
| # # +++ * * # * * # * * # #++# ***** # |
| +++# # ****### * * # ****### * * # * * # +++# # * * # |
| ++++++ ****### ***** # *++*++# * * # *++*++# * * # * * # ++++++ **** # * * # |
1x +-++-+*****###++*++*++#++*+-+*++#+-*++*++#-+*+++*-+#++*++*++#++*+-+*++#+-*++*++#-+*****###++*++*++#++*+-+*++#+-++-+
| *+++*++# * * # * * # * * # * * # * * # * * # * * # * * # * * # * * # |
| * * # * * # * * # * * # * * # * * # * * # * * # * * # * * # * * # |
0.95x +-+---*****###--****###--*****###--****###--*****###--****###--*****###--****###--*****###--****###--*****###---+-+
ASSIGNMENT BITFIELD FOURIER FP EMULATION HUFFMAN IDEA LU DECOMPOSITION NEURAL NET NUMERIC SORT STRING SORT hmean
png: http://imgur.com/uhIEOA1
Signed-off-by: Emilio G. Cota <cota@braap.org>
---
include/exec/tb-hash.h | 12 ++++++++++++
1 file changed, 12 insertions(+)
diff --git a/include/exec/tb-hash.h b/include/exec/tb-hash.h
index 2c27490..b1fe2d0 100644
--- a/include/exec/tb-hash.h
+++ b/include/exec/tb-hash.h
@@ -22,6 +22,8 @@
#include "exec/tb-hash-xx.h"
+#ifdef CONFIG_SOFTMMU
+
/* Only the bottom TB_JMP_PAGE_BITS of the jump cache hash bits vary for
addresses on the same page. The top bits are the same. This allows
TLB invalidation to quickly clear a subset of the hash table. */
@@ -45,6 +47,16 @@ static inline unsigned int tb_jmp_cache_hash_func(target_ulong pc)
| (tmp & TB_JMP_ADDR_MASK));
}
+#else
+
+/* In user-mode we can get better hashing because we do not have a TLB */
+static inline unsigned int tb_jmp_cache_hash_func(target_ulong pc)
+{
+ return (pc ^ (pc >> TB_JMP_CACHE_BITS)) & (TB_JMP_CACHE_SIZE - 1);
+}
+
+#endif /* CONFIG_SOFTMMU */
+
static inline
uint32_t tb_hash_func(tb_page_addr_t phys_pc, target_ulong pc, uint32_t flags)
{
--
2.7.4
^ permalink raw reply related [flat|nested] 25+ messages in thread
* Re: [Qemu-devel] [PATCH v2 13/13] tb-hash: improve tb_jmp_cache hash function in user mode
2017-04-25 7:53 ` [Qemu-devel] [PATCH v2 13/13] tb-hash: improve tb_jmp_cache hash function in user mode Emilio G. Cota
@ 2017-04-25 11:19 ` Richard Henderson
0 siblings, 0 replies; 25+ messages in thread
From: Richard Henderson @ 2017-04-25 11:19 UTC (permalink / raw)
To: Emilio G. Cota, qemu-devel
Cc: Paolo Bonzini, Peter Crosthwaite, Peter Maydell, Eduardo Habkost,
Andrzej Zaborowski, Aurelien Jarno, Alexander Graf, Stefan Weil,
qemu-arm, alex.bennee, Pranith Kumar
On 04/25/2017 09:53 AM, Emilio G. Cota wrote:
> Optimizations to cross-page chaining and indirect branches make
> performance more sensitive to the hit rate of tb_jmp_cache.
> The constraint of reserving some bits for the page number
> lowers the achievable quality of the hashing function.
Reviewed-by: Richard Henderson <rth@twiddle.net>
r~
^ permalink raw reply [flat|nested] 25+ messages in thread