* [Qemu-devel] [RFC][PATCH v4 2/3] tcg: Add declarations and templates of extended MMU helpers
2012-07-25 7:35 [Qemu-devel] [RFC][PATCH v4 0/3] tcg: enhance code generation quality for qemu_ld/st IRs Yeongkyoon Lee
2012-07-25 7:35 ` [Qemu-devel] [RFC][PATCH v4 1/3] configure: Add CONFIG_QEMU_LDST_OPTIMIZATION for TCG qemu_ld/st optimization Yeongkyoon Lee
@ 2012-07-25 7:35 ` Yeongkyoon Lee
2012-07-25 7:35 ` [Qemu-devel] [RFC][PATCH v4 3/3] tcg: Optimize qemu_ld/st by generating slow paths at the end of a block Yeongkyoon Lee
2 siblings, 0 replies; 13+ messages in thread
From: Yeongkyoon Lee @ 2012-07-25 7:35 UTC (permalink / raw)
To: qemu-devel
Cc: peter.maydell, Yeongkyoon Lee, sw, blauwirbel, laurent.desnogues,
rth
Add declarations and templates of extended MMU helpers.
An extended helper takes one additional argument: the host address of the
generated code that accesses guest memory. This address differs from the
return address of the helper call, because the helper call sites are located
at the end of the generated code block.
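For illustration only (not part of the patch): a minimal sketch of the
difference, with the TLB-refill details collapsed into a made-up slow_ldl()
stand-in; the real templates in softmmu_template.h handle I/O, unaligned
accesses and TLB refill.

    /* Existing helper: the faulting code address is the call site,
       recovered implicitly via GETPC(). */
    uint32_t __ldl_mmu(target_ulong addr, int mmu_idx)
    {
        uintptr_t retaddr = GETPC();   /* call site == the qemu_ld code */
        return slow_ldl(addr, mmu_idx, retaddr);
    }

    /* Extended helper: the call now happens at the end of the block, so
       the address of the qemu_ld code must be passed in explicitly. */
    uint32_t ext_ldl_mmu(target_ulong addr, int mmu_idx, uintptr_t ra)
    {
        uintptr_t retaddr = ra;        /* host address of the qemu_ld code */
        return slow_ldl(addr, mmu_idx, retaddr);
    }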
Signed-off-by: Yeongkyoon Lee <yeongkyoon.lee@samsung.com>
---
softmmu_defs.h | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++++
softmmu_header.h | 25 ++++++++++++++++++++++++
softmmu_template.h | 48 ++++++++++++++++++++++++++++++++------
3 files changed, 129 insertions(+), 8 deletions(-)
diff --git a/softmmu_defs.h b/softmmu_defs.h
index 8d59f9d..505f6ba 100644
--- a/softmmu_defs.h
+++ b/softmmu_defs.h
@@ -10,6 +10,8 @@
#define SOFTMMU_DEFS_H
#ifndef CONFIG_TCG_PASS_AREG0
+
+#ifndef CONFIG_QEMU_LDST_OPTIMIZATION
uint8_t __ldb_mmu(target_ulong addr, int mmu_idx);
void __stb_mmu(target_ulong addr, uint8_t val, int mmu_idx);
uint16_t __ldw_mmu(target_ulong addr, int mmu_idx);
@@ -28,6 +30,30 @@ void __stl_cmmu(target_ulong addr, uint32_t val, int mmu_idx);
uint64_t __ldq_cmmu(target_ulong addr, int mmu_idx);
void __stq_cmmu(target_ulong addr, uint64_t val, int mmu_idx);
#else
+/* Extended versions of MMU helpers for qemu_ld/st optimization.
+ The additional argument is a host code address accessing guest memory */
+uint8_t ext_ldb_mmu(target_ulong addr, int mmu_idx, uintptr_t ra);
+void ext_stb_mmu(target_ulong addr, uint8_t val, int mmu_idx, uintptr_t ra);
+uint16_t ext_ldw_mmu(target_ulong addr, int mmu_idx, uintptr_t ra);
+void ext_stw_mmu(target_ulong addr, uint16_t val, int mmu_idx, uintptr_t ra);
+uint32_t ext_ldl_mmu(target_ulong addr, int mmu_idx, uintptr_t ra);
+void ext_stl_mmu(target_ulong addr, uint32_t val, int mmu_idx, uintptr_t ra);
+uint64_t ext_ldq_mmu(target_ulong addr, int mmu_idx, uintptr_t ra);
+void ext_stq_mmu(target_ulong addr, uint64_t val, int mmu_idx, uintptr_t ra);
+
+uint8_t ext_ldb_cmmu(target_ulong addr, int mmu_idx, uintptr_t ra);
+void ext_stb_cmmu(target_ulong addr, uint8_t val, int mmu_idx, uintptr_t ra);
+uint16_t ext_ldw_cmmu(target_ulong addr, int mmu_idx, uintptr_t ra);
+void ext_stw_cmmu(target_ulong addr, uint16_t val, int mmu_idx, uintptr_t ra);
+uint32_t ext_ldl_cmmu(target_ulong addr, int mmu_idx, uintptr_t ra);
+void ext_stl_cmmu(target_ulong addr, uint32_t val, int mmu_idx, uintptr_t ra);
+uint64_t ext_ldq_cmmu(target_ulong addr, int mmu_idx, uintptr_t ra);
+void ext_stq_cmmu(target_ulong addr, uint64_t val, int mmu_idx, uintptr_t ra);
+#endif /* !CONFIG_QEMU_LDST_OPTIMIZATION */
+
+#else
+
+#ifndef CONFIG_QEMU_LDST_OPTIMIZATION
uint8_t helper_ldb_mmu(CPUArchState *env, target_ulong addr, int mmu_idx);
void helper_stb_mmu(CPUArchState *env, target_ulong addr, uint8_t val,
int mmu_idx);
@@ -53,6 +79,44 @@ void helper_stl_cmmu(CPUArchState *env, target_ulong addr, uint32_t val,
uint64_t helper_ldq_cmmu(CPUArchState *env, target_ulong addr, int mmu_idx);
void helper_stq_cmmu(CPUArchState *env, target_ulong addr, uint64_t val,
int mmu_idx);
+#else
+/* Extended versions of MMU helpers for qemu_ld/st optimization.
+ The additional argument is a host code address accessing guest memory */
+uint8_t ext_helper_ldb_mmu(CPUArchState *env, target_ulong addr, int mmu_idx,
+ uintptr_t ra);
+void ext_helper_stb_mmu(CPUArchState *env, target_ulong addr, uint8_t val,
+ int mmu_idx, uintptr_t ra);
+uint16_t ext_helper_ldw_mmu(CPUArchState *env, target_ulong addr, int mmu_idx,
+ uintptr_t ra);
+void ext_helper_stw_mmu(CPUArchState *env, target_ulong addr, uint16_t val,
+ int mmu_idx, uintptr_t ra);
+uint32_t ext_helper_ldl_mmu(CPUArchState *env, target_ulong addr, int mmu_idx,
+ uintptr_t ra);
+void ext_helper_stl_mmu(CPUArchState *env, target_ulong addr, uint32_t val,
+ int mmu_idx, uintptr_t ra);
+uint64_t ext_helper_ldq_mmu(CPUArchState *env, target_ulong addr, int mmu_idx,
+ uintptr_t ra);
+void ext_helper_stq_mmu(CPUArchState *env, target_ulong addr, uint64_t val,
+ int mmu_idx, uintptr_t ra);
+
+uint8_t ext_helper_ldb_cmmu(CPUArchState *env, target_ulong addr, int mmu_idx,
+ uintptr_t ra);
+void ext_helper_stb_cmmu(CPUArchState *env, target_ulong addr, uint8_t val,
+ int mmu_idx, uintptr_t ra);
+uint16_t ext_helper_ldw_cmmu(CPUArchState *env, target_ulong addr, int mmu_idx,
+ uintptr_t ra);
+void ext_helper_stw_cmmu(CPUArchState *env, target_ulong addr, uint16_t val,
+ int mmu_idx, uintptr_t ra);
+uint32_t ext_helper_ldl_cmmu(CPUArchState *env, target_ulong addr, int mmu_idx,
+ uintptr_t ra);
+void ext_helper_stl_cmmu(CPUArchState *env, target_ulong addr, uint32_t val,
+ int mmu_idx, uintptr_t ra);
+uint64_t ext_helper_ldq_cmmu(CPUArchState *env, target_ulong addr, int mmu_idx,
+ uintptr_t ra);
+void ext_helper_stq_cmmu(CPUArchState *env, target_ulong addr, uint64_t val,
+ int mmu_idx, uintptr_t ra);
+#endif /* !CONFIG_QEMU_LDST_OPTIMIZATION */
+
#endif
#endif
diff --git a/softmmu_header.h b/softmmu_header.h
index cf1aa38..13bec62 100644
--- a/softmmu_header.h
+++ b/softmmu_header.h
@@ -82,12 +82,20 @@
#define ENV_PARAM
#define ENV_VAR
#define CPU_PREFIX
+#ifdef CONFIG_QEMU_LDST_OPTIMIZATION
+#define HELPER_PREFIX ext_
+#else
#define HELPER_PREFIX __
+#endif /* CONFIG_QEMU_LDST_OPTIMIZATION */
#else
#define ENV_PARAM CPUArchState *env,
#define ENV_VAR env,
#define CPU_PREFIX cpu_
+#ifdef CONFIG_QEMU_LDST_OPTIMIZATION
+#define HELPER_PREFIX ext_helper_
+#else
#define HELPER_PREFIX helper_
+#endif /* CONFIG_QEMU_LDST_OPTIMIZATION */
#endif
/* generic load/store macros */
@@ -106,9 +114,14 @@ glue(glue(glue(CPU_PREFIX, ld), USUFFIX), MEMSUFFIX)(ENV_PARAM
mmu_idx = CPU_MMU_INDEX;
if (unlikely(env->tlb_table[mmu_idx][page_index].ADDR_READ !=
(addr & (TARGET_PAGE_MASK | (DATA_SIZE - 1))))) {
+#ifdef CONFIG_QEMU_LDST_OPTIMIZATION
+ res = glue(glue(glue(HELPER_PREFIX, ld), SUFFIX),
+ MMUSUFFIX)(ENV_VAR addr, mmu_idx, (uintptr_t)NULL);
+#else
res = glue(glue(glue(HELPER_PREFIX, ld), SUFFIX), MMUSUFFIX)(ENV_VAR
addr,
mmu_idx);
+#endif /* CONFIG_QEMU_LDST_OPTIMIZATION */
} else {
uintptr_t hostaddr = addr + env->tlb_table[mmu_idx][page_index].addend;
res = glue(glue(ld, USUFFIX), _raw)(hostaddr);
@@ -130,8 +143,14 @@ glue(glue(glue(CPU_PREFIX, lds), SUFFIX), MEMSUFFIX)(ENV_PARAM
mmu_idx = CPU_MMU_INDEX;
if (unlikely(env->tlb_table[mmu_idx][page_index].ADDR_READ !=
(addr & (TARGET_PAGE_MASK | (DATA_SIZE - 1))))) {
+#ifdef CONFIG_QEMU_LDST_OPTIMIZATION
+ res = (DATA_STYPE)glue(glue(glue(HELPER_PREFIX, ld), SUFFIX),
+ MMUSUFFIX)(ENV_VAR addr, mmu_idx,
+ (uintptr_t)NULL);
+#else
res = (DATA_STYPE)glue(glue(glue(HELPER_PREFIX, ld), SUFFIX),
MMUSUFFIX)(ENV_VAR addr, mmu_idx);
+#endif /* CONFIG_QEMU_LDST_OPTIMIZATION */
} else {
uintptr_t hostaddr = addr + env->tlb_table[mmu_idx][page_index].addend;
res = glue(glue(lds, SUFFIX), _raw)(hostaddr);
@@ -157,8 +176,14 @@ glue(glue(glue(CPU_PREFIX, st), SUFFIX), MEMSUFFIX)(ENV_PARAM target_ulong ptr,
mmu_idx = CPU_MMU_INDEX;
if (unlikely(env->tlb_table[mmu_idx][page_index].addr_write !=
(addr & (TARGET_PAGE_MASK | (DATA_SIZE - 1))))) {
+#ifdef CONFIG_QEMU_LDST_OPTIMIZATION
+ glue(glue(glue(HELPER_PREFIX, st), SUFFIX), MMUSUFFIX)(ENV_VAR addr, v,
+ mmu_idx,
+ (uintptr_t)NULL);
+#else
glue(glue(glue(HELPER_PREFIX, st), SUFFIX), MMUSUFFIX)(ENV_VAR addr, v,
mmu_idx);
+#endif /* CONFIG_QEMU_LDST_OPTIMIZATION */
} else {
uintptr_t hostaddr = addr + env->tlb_table[mmu_idx][page_index].addend;
glue(glue(st, SUFFIX), _raw)(hostaddr, v);
diff --git a/softmmu_template.h b/softmmu_template.h
index b8bd700..82f2710 100644
--- a/softmmu_template.h
+++ b/softmmu_template.h
@@ -66,6 +66,21 @@
#define HELPER_PREFIX helper_
#endif
+#ifdef CONFIG_QEMU_LDST_OPTIMIZATION
+#undef HELPER_PREFIX
+/* Redefine helper prefix */
+#ifndef CONFIG_TCG_PASS_AREG0
+#define HELPER_PREFIX ext_
+#else
+#define HELPER_PREFIX ext_helper_
+#endif
+/* An extended MMU helper takes one more argument which is
+ a host address of generated code accessing guest memory */
+#define GET_RET_ADDR() ra
+#else
+#define GET_RET_ADDR() GETPC()
+#endif /* CONFIG_QEMU_LDST_OPTIMIZATION */
+
static DATA_TYPE glue(glue(slow_ld, SUFFIX), MMUSUFFIX)(ENV_PARAM
target_ulong addr,
int mmu_idx,
@@ -103,10 +118,18 @@ static inline DATA_TYPE glue(io_read, SUFFIX)(ENV_PARAM
}
/* handle all cases except unaligned access which span two pages */
+#ifdef CONFIG_QEMU_LDST_OPTIMIZATION
+DATA_TYPE
+glue(glue(glue(HELPER_PREFIX, ld), SUFFIX), MMUSUFFIX)(ENV_PARAM
+ target_ulong addr,
+ int mmu_idx,
+ uintptr_t ra)
+#else
DATA_TYPE
glue(glue(glue(HELPER_PREFIX, ld), SUFFIX), MMUSUFFIX)(ENV_PARAM
target_ulong addr,
int mmu_idx)
+#endif /* CONFIG_QEMU_LDST_OPTIMIZATION */
{
DATA_TYPE res;
int index;
@@ -124,13 +147,13 @@ glue(glue(glue(HELPER_PREFIX, ld), SUFFIX), MMUSUFFIX)(ENV_PARAM
/* IO access */
if ((addr & (DATA_SIZE - 1)) != 0)
goto do_unaligned_access;
- retaddr = GETPC();
+ retaddr = GET_RET_ADDR();
ioaddr = env->iotlb[mmu_idx][index];
res = glue(io_read, SUFFIX)(ENV_VAR ioaddr, addr, retaddr);
} else if (((addr & ~TARGET_PAGE_MASK) + DATA_SIZE - 1) >= TARGET_PAGE_SIZE) {
/* slow unaligned access (it spans two pages or IO) */
do_unaligned_access:
- retaddr = GETPC();
+ retaddr = GET_RET_ADDR();
#ifdef ALIGNED_ONLY
do_unaligned_access(ENV_VAR addr, READ_ACCESS_TYPE, mmu_idx, retaddr);
#endif
@@ -141,7 +164,7 @@ glue(glue(glue(HELPER_PREFIX, ld), SUFFIX), MMUSUFFIX)(ENV_PARAM
uintptr_t addend;
#ifdef ALIGNED_ONLY
if ((addr & (DATA_SIZE - 1)) != 0) {
- retaddr = GETPC();
+ retaddr = GET_RET_ADDR();
do_unaligned_access(ENV_VAR addr, READ_ACCESS_TYPE, mmu_idx, retaddr);
}
#endif
@@ -151,7 +174,7 @@ glue(glue(glue(HELPER_PREFIX, ld), SUFFIX), MMUSUFFIX)(ENV_PARAM
}
} else {
/* the page is not in the TLB : fill it */
- retaddr = GETPC();
+ retaddr = GET_RET_ADDR();
#ifdef ALIGNED_ONLY
if ((addr & (DATA_SIZE - 1)) != 0)
do_unaligned_access(ENV_VAR addr, READ_ACCESS_TYPE, mmu_idx, retaddr);
@@ -253,10 +276,18 @@ static inline void glue(io_write, SUFFIX)(ENV_PARAM
#endif /* SHIFT > 2 */
}
+#ifdef CONFIG_QEMU_LDST_OPTIMIZATION
+void glue(glue(glue(HELPER_PREFIX, st), SUFFIX), MMUSUFFIX)(ENV_PARAM
+ target_ulong addr,
+ DATA_TYPE val,
+ int mmu_idx,
+ uintptr_t ra)
+#else
void glue(glue(glue(HELPER_PREFIX, st), SUFFIX), MMUSUFFIX)(ENV_PARAM
target_ulong addr,
DATA_TYPE val,
int mmu_idx)
+#endif /* CONFIG_QEMU_LDST_OPTIMIZATION */
{
target_phys_addr_t ioaddr;
target_ulong tlb_addr;
@@ -271,12 +302,12 @@ void glue(glue(glue(HELPER_PREFIX, st), SUFFIX), MMUSUFFIX)(ENV_PARAM
/* IO access */
if ((addr & (DATA_SIZE - 1)) != 0)
goto do_unaligned_access;
- retaddr = GETPC();
+ retaddr = GET_RET_ADDR();
ioaddr = env->iotlb[mmu_idx][index];
glue(io_write, SUFFIX)(ENV_VAR ioaddr, val, addr, retaddr);
} else if (((addr & ~TARGET_PAGE_MASK) + DATA_SIZE - 1) >= TARGET_PAGE_SIZE) {
do_unaligned_access:
- retaddr = GETPC();
+ retaddr = GET_RET_ADDR();
#ifdef ALIGNED_ONLY
do_unaligned_access(ENV_VAR addr, 1, mmu_idx, retaddr);
#endif
@@ -287,7 +318,7 @@ void glue(glue(glue(HELPER_PREFIX, st), SUFFIX), MMUSUFFIX)(ENV_PARAM
uintptr_t addend;
#ifdef ALIGNED_ONLY
if ((addr & (DATA_SIZE - 1)) != 0) {
- retaddr = GETPC();
+ retaddr = GET_RET_ADDR();
do_unaligned_access(ENV_VAR addr, 1, mmu_idx, retaddr);
}
#endif
@@ -297,7 +328,7 @@ void glue(glue(glue(HELPER_PREFIX, st), SUFFIX), MMUSUFFIX)(ENV_PARAM
}
} else {
/* the page is not in the TLB : fill it */
- retaddr = GETPC();
+ retaddr = GET_RET_ADDR();
#ifdef ALIGNED_ONLY
if ((addr & (DATA_SIZE - 1)) != 0)
do_unaligned_access(ENV_VAR addr, 1, mmu_idx, retaddr);
@@ -370,3 +401,4 @@ static void glue(glue(slow_st, SUFFIX), MMUSUFFIX)(ENV_PARAM
#undef ENV_VAR
#undef CPU_PREFIX
#undef HELPER_PREFIX
+#undef GET_RET_ADDR
--
1.7.4.1
* [Qemu-devel] [RFC][PATCH v4 3/3] tcg: Optimize qemu_ld/st by generating slow paths at the end of a block
2012-07-25 7:35 [Qemu-devel] [RFC][PATCH v4 0/3] tcg: enhance code generation quality for qemu_ld/st IRs Yeongkyoon Lee
2012-07-25 7:35 ` [Qemu-devel] [RFC][PATCH v4 1/3] configure: Add CONFIG_QEMU_LDST_OPTIMIZATION for TCG qemu_ld/st optimization Yeongkyoon Lee
2012-07-25 7:35 ` [Qemu-devel] [RFC][PATCH v4 2/3] tcg: Add declarations and templates of extended MMU helpers Yeongkyoon Lee
@ 2012-07-25 7:35 ` Yeongkyoon Lee
2012-07-25 14:00 ` Richard Henderson
2 siblings, 1 reply; 13+ messages in thread
From: Yeongkyoon Lee @ 2012-07-25 7:35 UTC (permalink / raw)
To: qemu-devel
Cc: peter.maydell, Yeongkyoon Lee, sw, blauwirbel, laurent.desnogues,
rth
Add optimized TCG qemu_ld/st generation, which places the code for TLB-miss
cases (the slow paths) at the end of the block, after all other IRs have been
generated.
Currently, this optimization supports only i386 and x86_64 hosts.
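Conceptually (an illustrative sketch, not the patch code; LdstLabel,
patch_jcc32, emit_helper_call and emit_jmp are made-up names standing in for
the TCGLabelQemuLdst machinery below): each qemu_ld/st emits only the TLB
check and the inline fast path, records a label, and all slow paths are
generated in one pass once the block is finished.

    /* Recorded at qemu_ld/st translation time. */
    typedef struct {
        int is_store;
        uint8_t *label_ptr;   /* 32-bit jne displacement to patch later */
        uint8_t *raddr;       /* host address to resume at after the helper */
        int mem_index;
        /* ... data/address register indexes ... */
    } LdstLabel;

    /* Run after the last IR of the block has been translated. */
    static void emit_slow_paths(TCGContext *s, LdstLabel *labels, int n)
    {
        int i;
        for (i = 0; i < n; i++) {
            LdstLabel *l = &labels[i];
            patch_jcc32(l->label_ptr, s->code_ptr); /* resolve the jne */
            emit_helper_call(s, l);                 /* extra arg: l->raddr */
            emit_jmp(s, l->raddr);                  /* back to the fast path */
        }
    }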
Signed-off-by: Yeongkyoon Lee <yeongkyoon.lee@samsung.com>
---
tcg/i386/tcg-target.c | 475 +++++++++++++++++++++++++++++++------------------
tcg/tcg.c | 12 ++
tcg/tcg.h | 35 ++++
3 files changed, 353 insertions(+), 169 deletions(-)
diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
index da17bba..20c6ba5 100644
--- a/tcg/i386/tcg-target.c
+++ b/tcg/i386/tcg-target.c
@@ -966,43 +966,53 @@ static void tcg_out_jmp(TCGContext *s, tcg_target_long dest)
#include "../../softmmu_defs.h"
#ifdef CONFIG_TCG_PASS_AREG0
-/* helper signature: helper_ld_mmu(CPUState *env, target_ulong addr,
- int mmu_idx) */
+/* extended helper signature: ext_helper_ld_mmu(CPUState *env,
+ target_ulong addr, int mmu_idx, uintptr_t raddr) */
static const void *qemu_ld_helpers[4] = {
- helper_ldb_mmu,
- helper_ldw_mmu,
- helper_ldl_mmu,
- helper_ldq_mmu,
+ ext_helper_ldb_mmu,
+ ext_helper_ldw_mmu,
+ ext_helper_ldl_mmu,
+ ext_helper_ldq_mmu,
};
-/* helper signature: helper_st_mmu(CPUState *env, target_ulong addr,
- uintxx_t val, int mmu_idx) */
+/* extended helper signature: ext_helper_st_mmu(CPUState *env,
+ target_ulong addr, uintxx_t val, int mmu_idx, uintptr_t raddr) */
static const void *qemu_st_helpers[4] = {
- helper_stb_mmu,
- helper_stw_mmu,
- helper_stl_mmu,
- helper_stq_mmu,
+ ext_helper_stb_mmu,
+ ext_helper_stw_mmu,
+ ext_helper_stl_mmu,
+ ext_helper_stq_mmu,
};
#else
-/* legacy helper signature: __ld_mmu(target_ulong addr, int
- mmu_idx) */
+/* extended legacy helper signature: ext_ld_mmu(target_ulong addr,
+ int mmu_idx, uintptr_t raddr) */
static void *qemu_ld_helpers[4] = {
- __ldb_mmu,
- __ldw_mmu,
- __ldl_mmu,
- __ldq_mmu,
+ ext_ldb_mmu,
+ ext_ldw_mmu,
+ ext_ldl_mmu,
+ ext_ldq_mmu,
};
-/* legacy helper signature: __st_mmu(target_ulong addr, uintxx_t val,
- int mmu_idx) */
+/* extended legacy helper signature: ext_st_mmu(target_ulong addr,
+ uintxx_t val, int mmu_idx, uintptr_t raddr) */
static void *qemu_st_helpers[4] = {
- __stb_mmu,
- __stw_mmu,
- __stl_mmu,
- __stq_mmu,
+ ext_stb_mmu,
+ ext_stw_mmu,
+ ext_stl_mmu,
+ ext_stq_mmu,
};
#endif
+static void add_qemu_ldst_label(TCGContext *s,
+ int opc_ext,
+ int data_reg,
+ int data_reg2,
+ int addrlo_reg,
+ int addrhi_reg,
+ int mem_index,
+ uint8_t *raddr,
+ uint8_t **label_ptr);
+
/* Perform the TLB load and compare.
Inputs:
@@ -1061,19 +1071,21 @@ static inline void tcg_out_tlb_load(TCGContext *s, int addrlo_idx,
tcg_out_mov(s, type, r0, addrlo);
- /* jne label1 */
- tcg_out8(s, OPC_JCC_short + JCC_JNE);
+ /* jne slow_path */
+ /* XXX: How to avoid using OPC_JCC_long for peephole optimization? */
+ tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
label_ptr[0] = s->code_ptr;
- s->code_ptr++;
+ s->code_ptr += 4;
if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
/* cmp 4(r1), addrhi */
tcg_out_modrm_offset(s, OPC_CMP_GvEv, args[addrlo_idx+1], r1, 4);
- /* jne label1 */
- tcg_out8(s, OPC_JCC_short + JCC_JNE);
+ /* jne slow_path */
+ /* XXX: How to avoid using OPC_JCC_long for peephole optimization? */
+ tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
label_ptr[1] = s->code_ptr;
- s->code_ptr++;
+ s->code_ptr += 4;
}
/* TLB Hit. */
@@ -1171,12 +1183,7 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
int addrlo_idx;
#if defined(CONFIG_SOFTMMU)
int mem_index, s_bits;
-#if TCG_TARGET_REG_BITS == 64
- int arg_idx;
-#else
- int stack_adjust;
-#endif
- uint8_t *label_ptr[3];
+ uint8_t *label_ptr[2];
#endif
data_reg = args[0];
@@ -1197,101 +1204,16 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
tcg_out_qemu_ld_direct(s, data_reg, data_reg2,
tcg_target_call_iarg_regs[0], 0, opc);
- /* jmp label2 */
- tcg_out8(s, OPC_JMP_short);
- label_ptr[2] = s->code_ptr;
- s->code_ptr++;
-
- /* TLB Miss. */
-
- /* label1: */
- *label_ptr[0] = s->code_ptr - label_ptr[0] - 1;
- if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
- *label_ptr[1] = s->code_ptr - label_ptr[1] - 1;
- }
-
- /* XXX: move that code at the end of the TB */
-#if TCG_TARGET_REG_BITS == 32
- tcg_out_pushi(s, mem_index);
- stack_adjust = 4;
- if (TARGET_LONG_BITS == 64) {
- tcg_out_push(s, args[addrlo_idx + 1]);
- stack_adjust += 4;
- }
- tcg_out_push(s, args[addrlo_idx]);
- stack_adjust += 4;
-#ifdef CONFIG_TCG_PASS_AREG0
- tcg_out_push(s, TCG_AREG0);
- stack_adjust += 4;
-#endif
-#else
- /* The first argument is already loaded with addrlo. */
- arg_idx = 1;
- tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[arg_idx],
- mem_index);
-#ifdef CONFIG_TCG_PASS_AREG0
- /* XXX/FIXME: suboptimal */
- tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[3],
- tcg_target_call_iarg_regs[2]);
- tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[2],
- tcg_target_call_iarg_regs[1]);
- tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[1],
- tcg_target_call_iarg_regs[0]);
- tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[0],
- TCG_AREG0);
-#endif
-#endif
-
- tcg_out_calli(s, (tcg_target_long)qemu_ld_helpers[s_bits]);
-
-#if TCG_TARGET_REG_BITS == 32
- if (stack_adjust == (TCG_TARGET_REG_BITS / 8)) {
- /* Pop and discard. This is 2 bytes smaller than the add. */
- tcg_out_pop(s, TCG_REG_ECX);
- } else if (stack_adjust != 0) {
- tcg_out_addi(s, TCG_REG_CALL_STACK, stack_adjust);
- }
-#endif
-
- switch(opc) {
- case 0 | 4:
- tcg_out_ext8s(s, data_reg, TCG_REG_EAX, P_REXW);
- break;
- case 1 | 4:
- tcg_out_ext16s(s, data_reg, TCG_REG_EAX, P_REXW);
- break;
- case 0:
- tcg_out_ext8u(s, data_reg, TCG_REG_EAX);
- break;
- case 1:
- tcg_out_ext16u(s, data_reg, TCG_REG_EAX);
- break;
- case 2:
- tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
- break;
-#if TCG_TARGET_REG_BITS == 64
- case 2 | 4:
- tcg_out_ext32s(s, data_reg, TCG_REG_EAX);
- break;
-#endif
- case 3:
- if (TCG_TARGET_REG_BITS == 64) {
- tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_RAX);
- } else if (data_reg == TCG_REG_EDX) {
- /* xchg %edx, %eax */
- tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0);
- tcg_out_mov(s, TCG_TYPE_I32, data_reg2, TCG_REG_EAX);
- } else {
- tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
- tcg_out_mov(s, TCG_TYPE_I32, data_reg2, TCG_REG_EDX);
- }
- break;
- default:
- tcg_abort();
- }
-
- /* label2: */
- *label_ptr[2] = s->code_ptr - label_ptr[2] - 1;
+ /* Record the current context of a load into ldst label */
+ add_qemu_ldst_label(s,
+ opc,
+ data_reg,
+ data_reg2,
+ args[addrlo_idx],
+ args[addrlo_idx + 1],
+ mem_index,
+ s->code_ptr,
+ label_ptr);
#else
{
int32_t offset = GUEST_BASE;
@@ -1385,8 +1307,7 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
int addrlo_idx;
#if defined(CONFIG_SOFTMMU)
int mem_index, s_bits;
- int stack_adjust;
- uint8_t *label_ptr[3];
+ uint8_t *label_ptr[2];
#endif
data_reg = args[0];
@@ -1407,34 +1328,242 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
tcg_out_qemu_st_direct(s, data_reg, data_reg2,
tcg_target_call_iarg_regs[0], 0, opc);
- /* jmp label2 */
- tcg_out8(s, OPC_JMP_short);
- label_ptr[2] = s->code_ptr;
- s->code_ptr++;
+ /* Record the current context of a store into ldst label */
+ add_qemu_ldst_label(s,
+ opc | HL_ST_MASK,
+ data_reg,
+ data_reg2,
+ args[addrlo_idx],
+ args[addrlo_idx + 1],
+ mem_index,
+ s->code_ptr,
+ label_ptr);
+#else
+ {
+ int32_t offset = GUEST_BASE;
+ int base = args[addrlo_idx];
+
+ if (TCG_TARGET_REG_BITS == 64) {
+ /* ??? We assume all operations have left us with register
+ contents that are zero extended. So far this appears to
+ be true. If we want to enforce this, we can either do
+ an explicit zero-extension here, or (if GUEST_BASE == 0)
+ use the ADDR32 prefix. For now, do nothing. */
+
+ if (offset != GUEST_BASE) {
+ tcg_out_movi(s, TCG_TYPE_I64,
+ tcg_target_call_iarg_regs[0], GUEST_BASE);
+ tgen_arithr(s, ARITH_ADD + P_REXW,
+ tcg_target_call_iarg_regs[0], base);
+ base = tcg_target_call_iarg_regs[0];
+ offset = 0;
+ }
+ }
+
+ tcg_out_qemu_st_direct(s, data_reg, data_reg2, base, offset, opc);
+ }
+#endif
+}
+
+#if defined(CONFIG_SOFTMMU)
+/*
+ * Record the context of a call to the out of line helper code for the slow path
+ * for a load or store, so that we can later generate the correct helper code
+ */
+static void add_qemu_ldst_label(TCGContext *s,
+ int opc_ext,
+ int data_reg,
+ int data_reg2,
+ int addrlo_reg,
+ int addrhi_reg,
+ int mem_index,
+ uint8_t *raddr,
+ uint8_t **label_ptr)
+{
+ int idx;
+ TCGLabelQemuLdst *label;
- /* TLB Miss. */
+ if (s->nb_qemu_ldst_labels >= TCG_MAX_QEMU_LDST) {
+ tcg_abort();
+ }
- /* label1: */
- *label_ptr[0] = s->code_ptr - label_ptr[0] - 1;
+ idx = s->nb_qemu_ldst_labels++;
+ label = (TCGLabelQemuLdst *)&s->qemu_ldst_labels[idx];
+ label->opc_ext = opc_ext;
+ label->datalo_reg = data_reg;
+ label->datahi_reg = data_reg2;
+ label->addrlo_reg = addrlo_reg;
+ label->addrhi_reg = addrhi_reg;
+ label->mem_index = mem_index;
+ label->raddr = raddr;
+ label->label_ptr[0] = label_ptr[0];
if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
- *label_ptr[1] = s->code_ptr - label_ptr[1] - 1;
+ label->label_ptr[1] = label_ptr[1];
}
+}
- /* XXX: move that code at the end of the TB */
+/*
+ * Generate code for the slow path for a load at the end of block
+ */
+static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *label)
+{
+ int s_bits;
+ int opc = label->opc_ext & HL_OPC_MASK;
+ int mem_index = label->mem_index;
#if TCG_TARGET_REG_BITS == 32
- tcg_out_pushi(s, mem_index);
+ int stack_adjust;
+ int addrlo_reg = label->addrlo_reg;
+ int addrhi_reg = label->addrhi_reg;
+#endif
+ int data_reg = label->datalo_reg;
+ int data_reg2 = label->datahi_reg;
+ uint8_t *raddr = label->raddr;
+ uint8_t **label_ptr = &label->label_ptr[0];
+
+ s_bits = opc & 3;
+
+ /* resolve label address */
+ *(uint32_t *)label_ptr[0] = (uint32_t)(s->code_ptr - label_ptr[0] - 4);
+ if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
+ *(uint32_t *)label_ptr[1] = (uint32_t)(s->code_ptr - label_ptr[1] - 4);
+ }
+
+ /* extended legacy helper signature (w/o CONFIG_TCG_PASS_AREG0):
+ ext_ld_mmu(target_ulong addr, int mmu_idx, uintptr_t raddr) */
+#if TCG_TARGET_REG_BITS == 32
+ tcg_out_pushi(s, (tcg_target_ulong)(raddr - 1)); /* return address */
stack_adjust = 4;
+ tcg_out_pushi(s, mem_index); /* mmu index */
+ stack_adjust += 4;
+ if (TARGET_LONG_BITS == 64) {
+ tcg_out_push(s, addrhi_reg);
+ stack_adjust += 4;
+ }
+ tcg_out_push(s, addrlo_reg); /* guest addr */
+ stack_adjust += 4;
+#ifdef CONFIG_TCG_PASS_AREG0
+ /* extended helper signature (w/ CONFIG_TCG_PASS_AREG0):
+ ext_helper_ld_mmu(CPUState *env, target_ulong addr, int mmu_idx,
+ uintptr_t raddr) */
+ tcg_out_push(s, TCG_AREG0);
+ stack_adjust += 4;
+#endif
+#else
+ /* The first argument is already loaded with addrlo. */
+ tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[1],
+ mem_index);
+ tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2],
+ (tcg_target_ulong)(raddr - 1)); /* return address */
+#ifdef CONFIG_TCG_PASS_AREG0
+ /* XXX/FIXME: suboptimal */
+ tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[3],
+ tcg_target_call_iarg_regs[2]);
+ tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[2],
+ tcg_target_call_iarg_regs[1]);
+ tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[1],
+ tcg_target_call_iarg_regs[0]);
+ tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[0],
+ TCG_AREG0);
+#endif
+#endif
+
+ tcg_out_calli(s, (tcg_target_long)qemu_ld_helpers[s_bits]);
+
+#if TCG_TARGET_REG_BITS == 32
+ if (stack_adjust == (TCG_TARGET_REG_BITS / 8)) {
+ /* Pop and discard. This is 2 bytes smaller than the add. */
+ tcg_out_pop(s, TCG_REG_ECX);
+ } else if (stack_adjust != 0) {
+ tcg_out_addi(s, TCG_REG_CALL_STACK, stack_adjust);
+ }
+#endif
+
+ switch (opc) {
+ case 0 | 4:
+ tcg_out_ext8s(s, data_reg, TCG_REG_EAX, P_REXW);
+ break;
+ case 1 | 4:
+ tcg_out_ext16s(s, data_reg, TCG_REG_EAX, P_REXW);
+ break;
+ case 0:
+ tcg_out_ext8u(s, data_reg, TCG_REG_EAX);
+ break;
+ case 1:
+ tcg_out_ext16u(s, data_reg, TCG_REG_EAX);
+ break;
+ case 2:
+ tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
+ break;
+#if TCG_TARGET_REG_BITS == 64
+ case 2 | 4:
+ tcg_out_ext32s(s, data_reg, TCG_REG_EAX);
+ break;
+#endif
+ case 3:
+ if (TCG_TARGET_REG_BITS == 64) {
+ tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_RAX);
+ } else if (data_reg == TCG_REG_EDX) {
+ /* xchg %edx, %eax */
+ tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0);
+ tcg_out_mov(s, TCG_TYPE_I32, data_reg2, TCG_REG_EAX);
+ } else {
+ tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
+ tcg_out_mov(s, TCG_TYPE_I32, data_reg2, TCG_REG_EDX);
+ }
+ break;
+ default:
+ tcg_abort();
+ }
+
+ /* Jump back to the original code accessing guest memory */
+ tcg_out_jmp(s, (tcg_target_long) raddr);
+}
+
+/*
+ * Generate code for the slow path for a store at the end of block
+ */
+static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *label)
+{
+ int s_bits;
+ int stack_adjust;
+ int opc = label->opc_ext & HL_OPC_MASK;
+ int mem_index = label->mem_index;
+ int data_reg = label->datalo_reg;
+#if TCG_TARGET_REG_BITS == 32
+ int data_reg2 = label->datahi_reg;
+ int addrlo_reg = label->addrlo_reg;
+ int addrhi_reg = label->addrhi_reg;
+#endif
+ uint8_t *raddr = label->raddr;
+ uint8_t **label_ptr = &label->label_ptr[0];
+
+ s_bits = opc & 3;
+
+ /* resolve label address */
+ *(uint32_t *)label_ptr[0] = (uint32_t)(s->code_ptr - label_ptr[0] - 4);
+ if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
+ *(uint32_t *)label_ptr[1] = (uint32_t)(s->code_ptr - label_ptr[1] - 4);
+ }
+
+ /* extended legacy helper signature (w/o CONFIG_TCG_PASS_AREG0):
+ ext_st_mmu(target_ulong addr, uintxx_t val, int mmu_idx,
+ uintptr_t raddr) */
+#if TCG_TARGET_REG_BITS == 32
+ tcg_out_pushi(s, (tcg_target_ulong)(raddr - 1)); /* return address */
+ stack_adjust = 4;
+ tcg_out_pushi(s, mem_index); /* mmu index */
+ stack_adjust += 4;
if (opc == 3) {
tcg_out_push(s, data_reg2);
stack_adjust += 4;
}
- tcg_out_push(s, data_reg);
+ tcg_out_push(s, data_reg); /* guest data */
stack_adjust += 4;
if (TARGET_LONG_BITS == 64) {
- tcg_out_push(s, args[addrlo_idx + 1]);
+ tcg_out_push(s, addrhi_reg);
stack_adjust += 4;
}
- tcg_out_push(s, args[addrlo_idx]);
+ tcg_out_push(s, addrlo_reg); /* guest addr */
stack_adjust += 4;
#ifdef CONFIG_TCG_PASS_AREG0
tcg_out_push(s, TCG_AREG0);
@@ -1444,9 +1573,23 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
tcg_out_mov(s, (opc == 3 ? TCG_TYPE_I64 : TCG_TYPE_I32),
tcg_target_call_iarg_regs[1], data_reg);
tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2], mem_index);
+#if defined(_WIN64)
+ tcg_out_pushi(s, (tcg_target_ulong)(raddr - 1)); /* return address */
+ stack_adjust += 8;
+#else
+ tcg_out_movi(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[3],
+ (tcg_target_ulong)(raddr - 1)); /* return address */
stack_adjust = 0;
+#endif
#ifdef CONFIG_TCG_PASS_AREG0
+ /* extended helper signature (w/ CONFIG_TCG_PASS_AREG0):
+ ext_helper_st_mmu(CPUState *env, target_ulong addr, uintxx_t val,
+ int mmu_idx, uintptr_t raddr) */
/* XXX/FIXME: suboptimal */
+#if !defined(_WIN64)
+ tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[4],
+ tcg_target_call_iarg_regs[3]);
+#endif
tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[3],
tcg_target_call_iarg_regs[2]);
tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[2],
@@ -1467,34 +1610,28 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
tcg_out_addi(s, TCG_REG_CALL_STACK, stack_adjust);
}
- /* label2: */
- *label_ptr[2] = s->code_ptr - label_ptr[2] - 1;
-#else
- {
- int32_t offset = GUEST_BASE;
- int base = args[addrlo_idx];
+ /* Jump back to the original code accessing guest memory */
+ tcg_out_jmp(s, (tcg_target_long) raddr);
+}
- if (TCG_TARGET_REG_BITS == 64) {
- /* ??? We assume all operations have left us with register
- contents that are zero extended. So far this appears to
- be true. If we want to enforce this, we can either do
- an explicit zero-extension here, or (if GUEST_BASE == 0)
- use the ADDR32 prefix. For now, do nothing. */
+/*
+ * Generate all of the slow paths of qemu_ld/st at the end of block
+ */
+void tcg_out_qemu_ldst_slow_path(TCGContext *s)
+{
+ int i;
+ TCGLabelQemuLdst *label;
- if (offset != GUEST_BASE) {
- tcg_out_movi(s, TCG_TYPE_I64,
- tcg_target_call_iarg_regs[0], GUEST_BASE);
- tgen_arithr(s, ARITH_ADD + P_REXW,
- tcg_target_call_iarg_regs[0], base);
- base = tcg_target_call_iarg_regs[0];
- offset = 0;
+ for (i = 0; i < s->nb_qemu_ldst_labels; i++) {
+ label = (TCGLabelQemuLdst *)&s->qemu_ldst_labels[i];
+ if (IS_QEMU_LD_LABEL(label)) {
+ tcg_out_qemu_ld_slow_path(s, label);
+ } else {
+ tcg_out_qemu_st_slow_path(s, label);
}
}
-
- tcg_out_qemu_st_direct(s, data_reg, data_reg2, base, offset, opc);
- }
-#endif
}
+#endif /* CONFIG_SOFTMMU */
static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
const TCGArg *args, const int *const_args)
diff --git a/tcg/tcg.c b/tcg/tcg.c
index 8386b70..346197f 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -301,6 +301,13 @@ void tcg_func_start(TCGContext *s)
gen_opc_ptr = gen_opc_buf;
gen_opparam_ptr = gen_opparam_buf;
+#if defined(CONFIG_QEMU_LDST_OPTIMIZATION) && defined(CONFIG_SOFTMMU)
+ /* Initialize qemu_ld/st labels to assist code generation of TLB miss
+ cases at the end of TB */
+ s->qemu_ldst_labels = tcg_malloc(sizeof(TCGLabelQemuLdst) *
+ TCG_MAX_QEMU_LDST);
+ s->nb_qemu_ldst_labels = 0;
+#endif /* CONFIG_QEMU_LDST_OPTIMIZATION && CONFIG_SOFTMMU */
}
static inline void tcg_temp_alloc(TCGContext *s, int n)
@@ -2169,6 +2176,11 @@ static inline int tcg_gen_code_common(TCGContext *s, uint8_t *gen_code_buf,
#endif
}
the_end:
+#if defined(CONFIG_QEMU_LDST_OPTIMIZATION) && defined(CONFIG_SOFTMMU)
+ /* Generate slow paths of qemu_ld/st IRs which call MMU helpers at
+ the end of block */
+ tcg_out_qemu_ldst_slow_path(s);
+#endif /* CONFIG_QEMU_LDST_OPTIMIZATION && CONFIG_SOFTMMU */
return -1;
}
diff --git a/tcg/tcg.h b/tcg/tcg.h
index d710694..a8454f8 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -187,6 +187,29 @@ typedef tcg_target_ulong TCGArg;
are aliases for target_ulong and host pointer sized values respectively.
*/
+#if defined(CONFIG_QEMU_LDST_OPTIMIZATION) && defined(CONFIG_SOFTMMU)
+/* Macros/structures for qemu_ld/st IR code optimization:
+ TCG_MAX_QEMU_LDST is defined to be the same as OPC_BUF_SIZE in exec-all.h. */
+#define TCG_MAX_QEMU_LDST 640
+#define HL_LDST_SHIFT 4
+#define HL_LDST_MASK (1 << HL_LDST_SHIFT)
+#define HL_ST_MASK HL_LDST_MASK
+#define HL_OPC_MASK (HL_LDST_MASK - 1)
+#define IS_QEMU_LD_LABEL(L) (!((L)->opc_ext & HL_LDST_MASK))
+#define IS_QEMU_ST_LABEL(L) ((L)->opc_ext & HL_LDST_MASK)
+
+typedef struct TCGLabelQemuLdst {
+ int opc_ext; /* | 27bit(reserved) | 1bit(ld/st) | 4bit(opc) | */
+ int addrlo_reg; /* reg index for low word of guest virtual addr */
+ int addrhi_reg; /* reg index for high word of guest virtual addr */
+ int datalo_reg; /* reg index for low word to be loaded or stored */
+ int datahi_reg; /* reg index for high word to be loaded or stored */
+ int mem_index; /* soft MMU memory index */
+ uint8_t *raddr; /* gen code addr corresponding to qemu_ld/st IR */
+ uint8_t *label_ptr[2]; /* label pointers to be updated */
+} TCGLabelQemuLdst;
+#endif /* CONFIG_QEMU_LDST_OPTIMIZATION && CONFIG_SOFTMMU */
+
#ifdef CONFIG_DEBUG_TCG
#define DEBUG_TCGV 1
#endif
@@ -389,6 +412,13 @@ struct TCGContext {
#ifdef CONFIG_DEBUG_TCG
int temps_in_use;
#endif
+
+#if defined(CONFIG_QEMU_LDST_OPTIMIZATION) && defined(CONFIG_SOFTMMU)
+ /* labels info for qemu_ld/st IRs
+ The labels help to generate TLB miss case codes at the end of TB */
+ TCGLabelQemuLdst *qemu_ldst_labels;
+ int nb_qemu_ldst_labels;
+#endif /* CONFIG_QEMU_LDST_OPTIMIZATION && CONFIG_SOFTMMU */
};
extern TCGContext tcg_ctx;
@@ -588,3 +618,8 @@ extern uint8_t code_gen_prologue[];
#endif
void tcg_register_jit(void *buf, size_t buf_size);
+
+#if defined(CONFIG_QEMU_LDST_OPTIMIZATION) && defined(CONFIG_SOFTMMU)
+/* Generate all of the slow paths of qemu_ld/st at the end of block */
+void tcg_out_qemu_ldst_slow_path(TCGContext *s);
+#endif /* CONFIG_QEMU_LDST_OPTIMIZATION && CONFIG_SOFTMMU */
--
1.7.4.1