From: YeongKyoon Lee <yeongkyoon.lee@samsung.com>
To: qemu-devel <qemu-devel@nongnu.org>
Cc: Blue Swirl <blauwirbel@gmail.com>,
Evgeny Voevodin <e.voevodin@samsung.com>,
Wei-Ren Chen <chenwj@iis.sinica.edu.tw>
Subject: [Qemu-devel] [RFC][PATCH 3/4] tcg: add optimized TCG qemu_ld/st generation
Date: Wed, 04 Jul 2012 06:01:06 +0000 (GMT) [thread overview]
Message-ID: <25206019.11171341381666158.JavaMail.weblogic@epml28> (raw)
Add optimized TCG qemu_ld/st generation which generates the code for TLB miss case handling at the end of TB after generating other IRs.
Signed-off-by: Yeongkyoon Lee
---
tcg/i386/tcg-target.c | 328 +++++++++++++++++++++++++++++++++++++++++++++++++
tcg/tcg.c | 12 ++
tcg/tcg.h | 35 +++++
3 files changed, 375 insertions(+), 0 deletions(-)
diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
index da17bba..3f2f640 100644
--- a/tcg/i386/tcg-target.c
+++ b/tcg/i386/tcg-target.c
@@ -984,6 +984,8 @@ static const void *qemu_st_helpers[4] = {
helper_stq_mmu,
};
#else
+
+#ifndef CONFIG_QEMU_LDST_OPTIMIZATION
/* legacy helper signature: __ld_mmu(target_ulong addr, int
mmu_idx) */
static void *qemu_ld_helpers[4] = {
@@ -1001,6 +1003,35 @@ static void *qemu_st_helpers[4] = {
__stl_mmu,
__stq_mmu,
};
+#else
+/* extended legacy helper signature: __ext_ld_mmu(target_ulong addr, int
+ mmu_idx, uintptr raddr) */
+static void *qemu_ld_helpers[4] = {
+ __ext_ldb_mmu,
+ __ext_ldw_mmu,
+ __ext_ldl_mmu,
+ __ext_ldq_mmu,
+};
+
+/* extended legacy helper signature: __ext_st_mmu(target_ulong addr, uintxx_t val,
+ int mmu_idx) */
+static void *qemu_st_helpers[4] = {
+ __ext_stb_mmu,
+ __ext_stw_mmu,
+ __ext_stl_mmu,
+ __ext_stq_mmu,
+};
+
+static void add_qemu_ldst_label(TCGContext *s,
+ int opc_ext,
+ int data_reg,
+ int data_reg2,
+ int addrlo_reg,
+ int addrhi_reg,
+ int mem_index,
+ uint8_t *raddr,
+ uint8_t **label_ptr);
+#endif /* !CONFIG_QEMU_LDST_OPTIMIZATION */
#endif
/* Perform the TLB load and compare.
@@ -1061,19 +1092,36 @@ static inline void tcg_out_tlb_load(TCGContext *s, int addrlo_idx,
tcg_out_mov(s, type, r0, addrlo);
+#ifdef CONFIG_QEMU_LDST_OPTIMIZATION
+ /* jne slow_path */
+ tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
+ if (!label_ptr) {
+ tcg_abort();
+ }
+ label_ptr[0] = s->code_ptr;
+ s->code_ptr += 4;
+#else
/* jne label1 */
tcg_out8(s, OPC_JCC_short + JCC_JNE);
label_ptr[0] = s->code_ptr;
s->code_ptr++;
+#endif
if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
/* cmp 4(r1), addrhi */
tcg_out_modrm_offset(s, OPC_CMP_GvEv, args[addrlo_idx+1], r1, 4);
+#ifdef CONFIG_QEMU_LDST_OPTIMIZATION
+ /* jne slow_path */
+ tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
+ label_ptr[1] = s->code_ptr;
+ s->code_ptr += 4;
+#else
/* jne label1 */
tcg_out8(s, OPC_JCC_short + JCC_JNE);
label_ptr[1] = s->code_ptr;
s->code_ptr++;
+#endif
}
/* TLB Hit. */
@@ -1171,11 +1219,13 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
int addrlo_idx;
#if defined(CONFIG_SOFTMMU)
int mem_index, s_bits;
+#if !defined(CONFIG_QEMU_LDST_OPTIMIZATION)
#if TCG_TARGET_REG_BITS == 64
int arg_idx;
#else
int stack_adjust;
#endif
+#endif /* !CONFIG_QEMU_LDST_OPTIMIZATION */
uint8_t *label_ptr[3];
#endif
@@ -1197,6 +1247,18 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
tcg_out_qemu_ld_direct(s, data_reg, data_reg2,
tcg_target_call_iarg_regs[0], 0, opc);
+#if defined(CONFIG_QEMU_LDST_OPTIMIZATION)
+ /* helper stub will be jumped back here */
+ add_qemu_ldst_label(s,
+ opc,
+ data_reg,
+ data_reg2,
+ args[addrlo_idx],
+ args[addrlo_idx + 1],
+ mem_index,
+ s->code_ptr,
+ label_ptr);
+#else
/* jmp label2 */
tcg_out8(s, OPC_JMP_short);
label_ptr[2] = s->code_ptr;
@@ -1292,6 +1354,7 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
/* label2: */
*label_ptr[2] = s->code_ptr - label_ptr[2] - 1;
+#endif /* CONFIG_QEMU_LDST_OPTIMIZATION */
#else
{
int32_t offset = GUEST_BASE;
@@ -1385,7 +1448,9 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
int addrlo_idx;
#if defined(CONFIG_SOFTMMU)
int mem_index, s_bits;
+#if !defined(CONFIG_QEMU_LDST_OPTIMIZATION)
int stack_adjust;
+#endif
uint8_t *label_ptr[3];
#endif
@@ -1407,6 +1472,18 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
tcg_out_qemu_st_direct(s, data_reg, data_reg2,
tcg_target_call_iarg_regs[0], 0, opc);
+#if defined(CONFIG_QEMU_LDST_OPTIMIZATION)
+ /* helper stub will be jumped back here */
+ add_qemu_ldst_label(s,
+ opc | HL_ST_MASK,
+ data_reg,
+ data_reg2,
+ args[addrlo_idx],
+ args[addrlo_idx + 1],
+ mem_index,
+ s->code_ptr,
+ label_ptr);
+#else
/* jmp label2 */
tcg_out8(s, OPC_JMP_short);
label_ptr[2] = s->code_ptr;
@@ -1469,6 +1546,7 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
/* label2: */
*label_ptr[2] = s->code_ptr - label_ptr[2] - 1;
+#endif /* CONFIG_QEMU_LDST_OPTIMIZATION */
#else
{
int32_t offset = GUEST_BASE;
@@ -1496,6 +1574,256 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
#endif
}
+#if defined(CONFIG_QEMU_LDST_OPTIMIZATION)
+/* optimization to reduce jump overheads for qemu_ld/st IRs */
+
+/*
+ * qemu_ld/st code generator call add_qemu_ldst_label,
+ * so that slow case(TLB miss or I/O rw) is handled at the end of TB
+ */
+static void add_qemu_ldst_label(TCGContext *s,
+ int opc_ext,
+ int data_reg,
+ int data_reg2,
+ int addrlo_reg,
+ int addrhi_reg,
+ int mem_index,
+ uint8_t *raddr,
+ uint8_t **label_ptr)
+{
+ int idx;
+ TCGLabelQemuLdst *label;
+
+ if (s->nb_qemu_ldst_labels >= TCG_MAX_QEMU_LDST)
+ tcg_abort();
+
+ idx = s->nb_qemu_ldst_labels++;
+ label = (TCGLabelQemuLdst *)&s->qemu_ldst_labels[idx];
+ label->opc_ext = opc_ext;
+ label->datalo_reg = data_reg;
+ label->datahi_reg = data_reg2;
+ label->addrlo_reg = addrlo_reg;
+ label->addrhi_reg = addrhi_reg;
+ label->mem_index = mem_index;
+ label->raddr = raddr;
+ if (!label_ptr) {
+ tcg_abort();
+ }
+ label->label_ptr[0] = label_ptr[0];
+ label->label_ptr[1] = label_ptr[1];
+}
+
+/* generates slow case of qemu_ld at the end of TB */
+static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *label)
+{
+ int s_bits;
+ int opc = label->opc_ext & HL_OPC_MASK;
+ int mem_index = label->mem_index;
+#if TCG_TARGET_REG_BITS == 64
+ int arg_idx;
+#else
+ int stack_adjust;
+ int addrlo_reg = label->addrlo_reg;
+ int addrhi_reg = label->addrhi_reg;
+#endif
+ int data_reg = label->datalo_reg;
+ int data_reg2 = label->datahi_reg;
+ uint8_t *raddr = label->raddr;
+ uint8_t **label_ptr = &label->label_ptr[0];
+
+ s_bits = opc & 3;
+
+ /* resolve label address */
+ *(uint32_t *)label_ptr[0] = (uint32_t)(s->code_ptr - label_ptr[0] - 4);
+ if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
+ *(uint32_t *)label_ptr[1] = (uint32_t)(s->code_ptr - label_ptr[1] - 4);
+ }
+
+ /* extended helper signature: __ext_ld_mmu(target_ulong addr, int mmu_idx,
+ uintptr_t raddr) */
+#if TCG_TARGET_REG_BITS == 32
+ tcg_out_pushi(s, (uintptr_t)(raddr - 1)); /* return address */
+ stack_adjust = 4;
+ tcg_out_pushi(s, mem_index); /* mmu index */
+ stack_adjust += 4;
+ if (TARGET_LONG_BITS == 64) {
+ tcg_out_push(s, addrhi_reg);
+ stack_adjust += 4;
+ }
+ tcg_out_push(s, addrlo_reg); /* guest addr */
+ stack_adjust += 4;
+#ifdef CONFIG_TCG_PASS_AREG0
+ tcg_out_push(s, TCG_AREG0);
+ stack_adjust += 4;
+#endif
+#else
+ /* The first argument is already loaded with addrlo. */
+ arg_idx = 1;
+ tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[arg_idx++],
+ mem_index);
+ tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[arg_idx++],
+ (uintptr_t)(raddr - 1));
+#ifdef CONFIG_TCG_PASS_AREG0
+ /* XXX/FIXME: suboptimal */
+ tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[3],
+ tcg_target_call_iarg_regs[2]);
+ tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[2],
+ tcg_target_call_iarg_regs[1]);
+ tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[1],
+ tcg_target_call_iarg_regs[0]);
+ tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[0],
+ TCG_AREG0);
+#endif
+#endif
+
+ tcg_out_calli(s, (tcg_target_long)qemu_ld_helpers[s_bits]);
+
+#if TCG_TARGET_REG_BITS == 32
+ if (stack_adjust == (TCG_TARGET_REG_BITS / 8)) {
+ /* Pop and discard. This is 2 bytes smaller than the add. */
+ tcg_out_pop(s, TCG_REG_ECX);
+ } else if (stack_adjust != 0) {
+ tcg_out_addi(s, TCG_REG_CALL_STACK, stack_adjust);
+ }
+#endif
+
+ switch(opc) {
+ case 0 | 4:
+ tcg_out_ext8s(s, data_reg, TCG_REG_EAX, P_REXW);
+ break;
+ case 1 | 4:
+ tcg_out_ext16s(s, data_reg, TCG_REG_EAX, P_REXW);
+ break;
+ case 0:
+ tcg_out_ext8u(s, data_reg, TCG_REG_EAX);
+ break;
+ case 1:
+ tcg_out_ext16u(s, data_reg, TCG_REG_EAX);
+ break;
+ case 2:
+ tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
+ break;
+#if TCG_TARGET_REG_BITS == 64
+ case 2 | 4:
+ tcg_out_ext32s(s, data_reg, TCG_REG_EAX);
+ break;
+#endif
+ case 3:
+ if (TCG_TARGET_REG_BITS == 64) {
+ tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_RAX);
+ } else if (data_reg == TCG_REG_EDX) {
+ /* xchg %edx, %eax */
+ tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0);
+ tcg_out_mov(s, TCG_TYPE_I32, data_reg2, TCG_REG_EAX);
+ } else {
+ tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
+ tcg_out_mov(s, TCG_TYPE_I32, data_reg2, TCG_REG_EDX);
+ }
+ break;
+ default:
+ tcg_abort();
+ }
+
+ /* jump back to original code */
+ tcg_out_jmp(s, (tcg_target_long) raddr);
+}
+
+/* generates slow case of qemu_st at the end of TB */
+static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *label)
+{
+ int s_bits;
+ int stack_adjust;
+ int opc = label->opc_ext & HL_OPC_MASK;
+ int mem_index = label->mem_index;
+ int data_reg = label->datalo_reg;
+#if TCG_TARGET_REG_BITS == 32
+ int data_reg2 = label->datahi_reg;
+ int addrlo_reg = label->addrlo_reg;
+ int addrhi_reg = label->addrhi_reg;
+#endif
+ uint8_t *raddr = label->raddr;
+ uint8_t **label_ptr = &label->label_ptr[0];
+
+ s_bits = opc & 3;
+
+ /* resolve label address */
+ *(uint32_t *)label_ptr[0] = (uint32_t)(s->code_ptr - label_ptr[0] - 4);
+ if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
+ *(uint32_t *)label_ptr[1] = (uint32_t)(s->code_ptr - label_ptr[1] - 4);
+ }
+
+ /* extended helper signature: __ext_st_mmu(target_ulong addr, uintxx_t val,
+ int mmu_idx, uintptr_t raddr) */
+#if TCG_TARGET_REG_BITS == 32
+ tcg_out_pushi(s, (uintptr_t)(raddr - 1)); /* return address */
+ stack_adjust = 4;
+ tcg_out_pushi(s, mem_index); /* mmu index */
+ stack_adjust += 4;
+ if (opc == 3) {
+ tcg_out_push(s, data_reg2);
+ stack_adjust += 4;
+ }
+ tcg_out_push(s, data_reg); /* guest data */
+ stack_adjust += 4;
+ if (TARGET_LONG_BITS == 64) {
+ tcg_out_push(s, addrhi_reg);
+ stack_adjust += 4;
+ }
+ tcg_out_push(s, addrlo_reg); /* guest addr */
+ stack_adjust += 4;
+#ifdef CONFIG_TCG_PASS_AREG0
+ tcg_out_push(s, TCG_AREG0);
+ stack_adjust += 4;
+#endif
+#else
+ tcg_out_mov(s, (opc == 3 ? TCG_TYPE_I64 : TCG_TYPE_I32),
+ tcg_target_call_iarg_regs[1], data_reg);
+ tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2], mem_index);
+ tcg_out_movi(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[3], (uintptr_t)(raddr - 1));
+ stack_adjust = 0;
+#ifdef CONFIG_TCG_PASS_AREG0
+ /* XXX/FIXME: suboptimal */
+ tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[3],
+ tcg_target_call_iarg_regs[2]);
+ tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[2],
+ tcg_target_call_iarg_regs[1]);
+ tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[1],
+ tcg_target_call_iarg_regs[0]);
+ tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[0],
+ TCG_AREG0);
+#endif
+#endif
+
+ tcg_out_calli(s, (tcg_target_long)qemu_st_helpers[s_bits]);
+
+ if (stack_adjust == (TCG_TARGET_REG_BITS / 8)) {
+ /* Pop and discard. This is 2 bytes smaller than the add. */
+ tcg_out_pop(s, TCG_REG_ECX);
+ } else if (stack_adjust != 0) {
+ tcg_out_addi(s, TCG_REG_CALL_STACK, stack_adjust);
+ }
+
+ /* jump back to original code */
+ tcg_out_jmp(s, (tcg_target_long) raddr);
+}
+
+/* generates all of the slow cases of qemu_ld/st at the end of TB */
+void tcg_out_qemu_ldst_slow_path(TCGContext *s)
+{
+ int i;
+ TCGLabelQemuLdst *label;
+
+ for (i = 0; i < s->nb_qemu_ldst_labels; i++) {
+ label = (TCGLabelQemuLdst *)&s->qemu_ldst_labels[i];
+ if (IS_QEMU_LD_LABEL(label)) {
+ tcg_out_qemu_ld_slow_path(s, label);
+ } else {
+ tcg_out_qemu_st_slow_path(s, label);
+ }
+ }
+}
+#endif /* CONFIG_QEMU_LDST_OPTIMIZATION */
+
static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
const TCGArg *args, const int *const_args)
{
diff --git a/tcg/tcg.c b/tcg/tcg.c
index 8386b70..c33cb96 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -301,6 +301,14 @@ void tcg_func_start(TCGContext *s)
gen_opc_ptr = gen_opc_buf;
gen_opparam_ptr = gen_opparam_buf;
+#if defined(CONFIG_QEMU_LDST_OPTIMIZATION)
+ /* initialize qemu_ld/st labels which help to generate TLB miss case codes at the end of TB */
+ s->qemu_ldst_labels = tcg_malloc(sizeof(TCGLabelQemuLdst) * TCG_MAX_QEMU_LDST);
+ if (!s->qemu_ldst_labels) {
+ tcg_abort();
+ }
+ s->nb_qemu_ldst_labels = 0;
+#endif
}
static inline void tcg_temp_alloc(TCGContext *s, int n)
@@ -2169,6 +2177,10 @@ static inline int tcg_gen_code_common(TCGContext *s, uint8_t *gen_code_buf,
#endif
}
the_end:
+#if defined(CONFIG_QEMU_LDST_OPTIMIZATION)
+ /* Generate MMU call helpers at the end of block (currently only for qemu_ld/st) */
+ tcg_out_qemu_ldst_slow_path(s);
+#endif
return -1;
}
diff --git a/tcg/tcg.h b/tcg/tcg.h
index d710694..e52d3a4 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -187,6 +187,29 @@ typedef tcg_target_ulong TCGArg;
are aliases for target_ulong and host pointer sized values respectively.
*/
+#if defined(CONFIG_QEMU_LDST_OPTIMIZATION)
+/* Macros and structures for qemu_ld/st IR code optimization:
+ It looks good for TCG_MAX_HELPER_LABELS to be half of OPC_BUF_SIZE in exec-all.h. */
+#define TCG_MAX_QEMU_LDST 320
+#define HL_LDST_SHIFT 4
+#define HL_LDST_MASK (1 << HL_LDST_SHIFT)
+#define HL_ST_MASK HL_LDST_MASK
+#define HL_OPC_MASK (HL_LDST_MASK - 1)
+#define IS_QEMU_LD_LABEL(L) (!((L)->opc_ext & HL_LDST_MASK))
+#define IS_QEMU_ST_LABEL(L) ((L)->opc_ext & HL_LDST_MASK)
+
+typedef struct TCGLabelQemuLdst {
+ int opc_ext; /* | 27bit (reserved) | 1bit (ld/st flag) | 4bit (opc) | */
+ int addrlo_reg; /* reg index for the low word of guest virtual address */
+ int addrhi_reg; /* reg index for the high word of guest virtual address */
+ int datalo_reg; /* reg index for the low word to be loaded or to be stored */
+ int datahi_reg; /* reg index for the high word to be loaded or to be stored */
+ int mem_index; /* soft MMU memory index */
+ uint8_t *raddr; /* return address (located end of TB) */
+ uint8_t *label_ptr[2]; /* label pointers to be updated */
+} TCGLabelQemuLdst;
+#endif /* CONFIG_QEMU_LDST_OPTIMIZATION */
+
#ifdef CONFIG_DEBUG_TCG
#define DEBUG_TCGV 1
#endif
@@ -389,6 +412,13 @@ struct TCGContext {
#ifdef CONFIG_DEBUG_TCG
int temps_in_use;
#endif
+
+#if defined(CONFIG_QEMU_LDST_OPTIMIZATION)
+ /* labels info for qemu_ld/st IRs
+ The labels help to generate TLB miss case codes at the end of TB */
+ TCGLabelQemuLdst *qemu_ldst_labels;
+ int nb_qemu_ldst_labels;
+#endif
};
extern TCGContext tcg_ctx;
@@ -588,3 +618,8 @@ extern uint8_t code_gen_prologue[];
#endif
void tcg_register_jit(void *buf, size_t buf_size);
+
+#if defined(CONFIG_QEMU_LDST_OPTIMIZATION)
+/* qemu_ld/st generation at the end of TB */
+void tcg_out_qemu_ldst_slow_path(TCGContext *s);
+#endif
__________________________________
Principal Engineer
VM Team
Yeongkyoon Lee
S-Core Co., Ltd.
D.L.: +82-31-696-7249
M.P.: +82-10-9965-1265
__________________________________
reply other threads:[~2012-07-04 6:01 UTC|newest]
Thread overview: [no followups] expand[flat|nested] mbox.gz Atom feed
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=25206019.11171341381666158.JavaMail.weblogic@epml28 \
--to=yeongkyoon.lee@samsung.com \
--cc=blauwirbel@gmail.com \
--cc=chenwj@iis.sinica.edu.tw \
--cc=e.voevodin@samsung.com \
--cc=qemu-devel@nongnu.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.