* [Qemu-devel] [PATCH 0/2] tcg/i386 guest_base improvements
@ 2016-06-04 7:54 Richard Henderson
2016-06-04 7:54 ` [Qemu-devel] [PATCH 1/2] tcg/i386: Reserve register for guest_base if a segment isn't available Richard Henderson
` (2 more replies)
0 siblings, 3 replies; 4+ messages in thread
From: Richard Henderson @ 2016-06-04 7:54 UTC (permalink / raw)
To: qemu-devel
The first change does two things: (1) improve bsd-user so that it
doesn't continually reload guest_base into a temp register and
(2) extract the bulk of the guest_base logic to a routine that
is run once at startup.
The second change adds segmentation support to 32-bit linux. There,
if we're using a guest base, we can save 3 bytes per memory op by
using a segment override. In addition, if we're using a reserved_va,
we can set up the segment such that guest memory references are
constrained by the segment.
Comments?
r~
Richard Henderson (2):
tcg/i386: Reserve register for guest_base if a segment isn't available
tcg/i386: Use segment for 32-bit guest base on linux
tcg/i386/tcg-target.inc.c | 181 +++++++++++++++++++++++++---------------------
1 file changed, 100 insertions(+), 81 deletions(-)
--
2.5.5
^ permalink raw reply [flat|nested] 4+ messages in thread
* [Qemu-devel] [PATCH 1/2] tcg/i386: Reserve register for guest_base if a segment isn't available
2016-06-04 7:54 [Qemu-devel] [PATCH 0/2] tcg/i386 guest_base improvements Richard Henderson
@ 2016-06-04 7:54 ` Richard Henderson
2016-06-04 7:54 ` [Qemu-devel] [PATCH 2/2] tcg/i386: Use segment for 32-bit guest base on linux Richard Henderson
2017-01-30 10:40 ` [Qemu-devel] [PATCH 0/2] tcg/i386 guest_base improvements Alex Bennée
2 siblings, 0 replies; 4+ messages in thread
From: Richard Henderson @ 2016-06-04 7:54 UTC (permalink / raw)
To: qemu-devel
This saves 2 insns and 10 bytes from the implementation of
each memory operation.
Signed-off-by: Richard Henderson <rth@twiddle.net>
---
tcg/i386/tcg-target.inc.c | 143 +++++++++++++++++++++-------------------------
1 file changed, 66 insertions(+), 77 deletions(-)
diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
index 317484c..11cbb3c 100644
--- a/tcg/i386/tcg-target.inc.c
+++ b/tcg/i386/tcg-target.inc.c
@@ -1441,22 +1441,43 @@ static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
tcg_out_push(s, retaddr);
tcg_out_jmp(s, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
}
-#elif defined(__x86_64__) && defined(__linux__)
-# include <asm/prctl.h>
-# include <sys/prctl.h>
-
+#elif TCG_TARGET_REG_BITS == 64
+# ifdef __linux__
+# include <asm/prctl.h>
+# include <sys/prctl.h>
int arch_prctl(int code, unsigned long addr);
+# endif
+static int32_t guest_base_ofs;
static int guest_base_flags;
-static inline void setup_guest_base_seg(void)
+static int guest_base_reg = -1;
+static void setup_guest_base(TCGContext *s)
{
+ if (guest_base == 0) {
+ if (TARGET_LONG_BITS == 32) {
+ guest_base_flags = P_ADDR32;
+ }
+ return;
+ }
+# ifdef __linux__
if (arch_prctl(ARCH_SET_GS, guest_base) == 0) {
- guest_base_flags = P_GS;
+ guest_base_flags = (TARGET_LONG_BITS == 32 ? P_GS | P_ADDR32 : P_GS);
+ return;
+ }
+# endif
+ if (guest_base == (int32_t)guest_base) {
+ guest_base_ofs = guest_base;
+ } else {
+ guest_base_reg = TCG_REG_EBP;
+ tcg_regset_set_reg(s->reserved_regs, guest_base_reg);
+ tcg_out_movi(s, TCG_TYPE_PTR, guest_base_reg, guest_base);
}
}
#else
-# define guest_base_flags 0
-static inline void setup_guest_base_seg(void) { }
+# define guest_base_flags 0
+# define guest_base_reg -1
+# define guest_base_ofs guest_base
+# define setup_guest_base(s)
#endif /* SOFTMMU */
static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
@@ -1595,42 +1616,26 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
s->code_ptr, label_ptr);
#else
{
- int32_t offset = guest_base;
TCGReg base = addrlo;
- int index = -1;
- int seg = 0;
+ int flags = guest_base_flags;
/* For a 32-bit guest, the high 32 bits may contain garbage.
- We can do this with the ADDR32 prefix if we're not using
+ We do this with the ADDR32 prefix if we're not using
a guest base, or when using segmentation. Otherwise we
- need to zero-extend manually. */
- if (guest_base == 0 || guest_base_flags) {
- seg = guest_base_flags;
- offset = 0;
- if (TCG_TARGET_REG_BITS > TARGET_LONG_BITS) {
- seg |= P_ADDR32;
- }
- } else if (TCG_TARGET_REG_BITS == 64) {
- if (TARGET_LONG_BITS == 32) {
- tcg_out_ext32u(s, TCG_REG_L0, base);
- base = TCG_REG_L0;
- }
- if (offset != guest_base) {
- tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_L1, guest_base);
- index = TCG_REG_L1;
- offset = 0;
- }
+ need to zero-extend manually. See setup_guest_base. */
+ if (flags == 0 && TCG_TARGET_REG_BITS > TARGET_LONG_BITS) {
+ tcg_out_ext32u(s, TCG_REG_L0, base);
+ base = TCG_REG_L0;
}
-
- tcg_out_qemu_ld_direct(s, datalo, datahi,
- base, index, offset, seg, opc);
+ tcg_out_qemu_ld_direct(s, datalo, datahi, base, guest_base_reg,
+ guest_base_ofs, flags, opc);
}
#endif
}
static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
- TCGReg base, intptr_t ofs, int seg,
- TCGMemOp memop)
+ TCGReg base, int index, intptr_t ofs,
+ int seg, TCGMemOp memop)
{
/* ??? Ideally we wouldn't need a scratch register. For user-only,
we could perform the bswap twice to restore the original value
@@ -1654,8 +1659,8 @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
datalo = scratch;
}
- tcg_out_modrm_offset(s, OPC_MOVB_EvGv + P_REXB_R + seg,
- datalo, base, ofs);
+ tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + seg, datalo,
+ base, index, 0, ofs);
break;
case MO_16:
if (bswap) {
@@ -1663,7 +1668,8 @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
tcg_out_rolw_8(s, scratch);
datalo = scratch;
}
- tcg_out_modrm_offset(s, movop + P_DATA16 + seg, datalo, base, ofs);
+ tcg_out_modrm_sib_offset(s, movop + P_DATA16 + seg, datalo,
+ base, index, 0, ofs);
break;
case MO_32:
if (bswap) {
@@ -1671,7 +1677,7 @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
tcg_out_bswap32(s, scratch);
datalo = scratch;
}
- tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
+ tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs);
break;
case MO_64:
if (TCG_TARGET_REG_BITS == 64) {
@@ -1680,22 +1686,27 @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
tcg_out_bswap64(s, scratch);
datalo = scratch;
}
- tcg_out_modrm_offset(s, movop + P_REXW + seg, datalo, base, ofs);
+ tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo,
+ base, index, 0, ofs);
} else if (bswap) {
tcg_out_mov(s, TCG_TYPE_I32, scratch, datahi);
tcg_out_bswap32(s, scratch);
- tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, scratch, base, ofs);
+ tcg_out_modrm_sib_offset(s, OPC_MOVL_EvGv + seg, scratch,
+ base, index, 0, ofs);
tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
tcg_out_bswap32(s, scratch);
- tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, scratch, base, ofs+4);
+ tcg_out_modrm_sib_offset(s, OPC_MOVL_EvGv + seg, scratch,
+ base, index, 0, ofs+4);
} else {
if (real_bswap) {
int t = datalo;
datalo = datahi;
datahi = t;
}
- tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
- tcg_out_modrm_offset(s, movop + seg, datahi, base, ofs+4);
+ tcg_out_modrm_sib_offset(s, movop + seg, datalo,
+ base, index, 0, ofs);
+ tcg_out_modrm_sib_offset(s, movop + seg, datahi,
+ base, index, 0, ofs+4);
}
break;
default:
@@ -1728,43 +1739,23 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
label_ptr, offsetof(CPUTLBEntry, addr_write));
/* TLB Hit. */
- tcg_out_qemu_st_direct(s, datalo, datahi, TCG_REG_L1, 0, 0, opc);
+ tcg_out_qemu_st_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, opc);
/* Record the current context of a store into ldst label */
add_qemu_ldst_label(s, false, oi, datalo, datahi, addrlo, addrhi,
s->code_ptr, label_ptr);
#else
{
- int32_t offset = guest_base;
TCGReg base = addrlo;
- int seg = 0;
+ int flags = guest_base_flags;
/* See comment in tcg_out_qemu_ld re zero-extension of addrlo. */
- if (guest_base == 0 || guest_base_flags) {
- seg = guest_base_flags;
- offset = 0;
- if (TCG_TARGET_REG_BITS > TARGET_LONG_BITS) {
- seg |= P_ADDR32;
- }
- } else if (TCG_TARGET_REG_BITS == 64) {
- /* ??? Note that we can't use the same SIB addressing scheme
- as for loads, since we require L0 free for bswap. */
- if (offset != guest_base) {
- if (TARGET_LONG_BITS == 32) {
- tcg_out_ext32u(s, TCG_REG_L0, base);
- base = TCG_REG_L0;
- }
- tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_L1, guest_base);
- tgen_arithr(s, ARITH_ADD + P_REXW, TCG_REG_L1, base);
- base = TCG_REG_L1;
- offset = 0;
- } else if (TARGET_LONG_BITS == 32) {
- tcg_out_ext32u(s, TCG_REG_L1, base);
- base = TCG_REG_L1;
- }
+ if (flags == 0 && TCG_TARGET_REG_BITS > TARGET_LONG_BITS) {
+ tcg_out_ext32u(s, TCG_REG_L1, base);
+ base = TCG_REG_L1;
}
-
- tcg_out_qemu_st_direct(s, datalo, datahi, base, offset, seg, opc);
+ tcg_out_qemu_st_direct(s, datalo, datahi, base, guest_base_reg,
+ guest_base_ofs, flags, opc);
}
#endif
}
@@ -2326,6 +2317,11 @@ static void tcg_target_qemu_prologue(TCGContext *s)
#else
tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
+
+# if !defined(CONFIG_SOFTMMU)
+ setup_guest_base(s);
+# endif
+
/* jmp *tb. */
tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
#endif
@@ -2339,13 +2335,6 @@ static void tcg_target_qemu_prologue(TCGContext *s)
tcg_out_pop(s, tcg_target_callee_save_regs[i]);
}
tcg_out_opc(s, OPC_RET, 0, 0, 0);
-
-#if !defined(CONFIG_SOFTMMU)
- /* Try to set up a segment register to point to guest_base. */
- if (guest_base) {
- setup_guest_base_seg();
- }
-#endif
}
static void tcg_target_init(TCGContext *s)
--
2.5.5
^ permalink raw reply related [flat|nested] 4+ messages in thread
* [Qemu-devel] [PATCH 2/2] tcg/i386: Use segment for 32-bit guest base on linux
2016-06-04 7:54 [Qemu-devel] [PATCH 0/2] tcg/i386 guest_base improvements Richard Henderson
2016-06-04 7:54 ` [Qemu-devel] [PATCH 1/2] tcg/i386: Reserve register for guest_base if a segment isn't available Richard Henderson
@ 2016-06-04 7:54 ` Richard Henderson
2017-01-30 10:40 ` [Qemu-devel] [PATCH 0/2] tcg/i386 guest_base improvements Alex Bennée
2 siblings, 0 replies; 4+ messages in thread
From: Richard Henderson @ 2016-06-04 7:54 UTC (permalink / raw)
To: qemu-devel; +Cc: Richard Henderson
From: Richard Henderson <rth@smalltime.twiddle.net>
This saves 3 bytes per memory operation.
Signed-off-by: Richard Henderson <rth@smalltime.twiddle.net>
---
tcg/i386/tcg-target.inc.c | 44 +++++++++++++++++++++++++++++++++++++-------
1 file changed, 37 insertions(+), 7 deletions(-)
diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
index 11cbb3c..d8c2f6d 100644
--- a/tcg/i386/tcg-target.inc.c
+++ b/tcg/i386/tcg-target.inc.c
@@ -290,14 +290,13 @@ static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
# define P_REXW 0x1000 /* Set REX.W = 1 */
# define P_REXB_R 0x2000 /* REG field as byte register */
# define P_REXB_RM 0x4000 /* R/M field as byte register */
-# define P_GS 0x8000 /* gs segment override */
#else
# define P_ADDR32 0
# define P_REXW 0
# define P_REXB_R 0
# define P_REXB_RM 0
-# define P_GS 0
#endif
+#define P_SEG 0x8000 /* fs/gs segment override */
#define P_SIMDF3 0x10000 /* 0xf3 opcode prefix */
#define P_SIMDF2 0x20000 /* 0xf2 opcode prefix */
@@ -420,8 +419,8 @@ static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
{
int rex;
- if (opc & P_GS) {
- tcg_out8(s, 0x65);
+ if (opc & P_SEG) {
+ tcg_out8(s, 0x65); /* %gs */
}
if (opc & P_DATA16) {
/* We should never be asking for both 16 and 64-bit operation. */
@@ -462,6 +461,9 @@ static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
#else
static void tcg_out_opc(TCGContext *s, int opc)
{
+ if (opc & P_SEG) {
+ tcg_out8(s, 0x64); /* %fs */
+ }
if (opc & P_DATA16) {
tcg_out8(s, 0x66);
}
@@ -1461,7 +1463,7 @@ static void setup_guest_base(TCGContext *s)
}
# ifdef __linux__
if (arch_prctl(ARCH_SET_GS, guest_base) == 0) {
- guest_base_flags = (TARGET_LONG_BITS == 32 ? P_GS | P_ADDR32 : P_GS);
+ guest_base_flags = P_SEG + (TARGET_LONG_BITS == 32) * P_ADDR32;
return;
}
# endif
@@ -1473,6 +1475,33 @@ static void setup_guest_base(TCGContext *s)
tcg_out_movi(s, TCG_TYPE_PTR, guest_base_reg, guest_base);
}
}
+#elif defined(__linux__)
+# include <asm/ldt.h>
+# include <sys/syscall.h>
+
+static int32_t guest_base_ofs;
+static int guest_base_flags;
+#define guest_base_reg -1
+static void setup_guest_base(TCGContext *s)
+{
+ if (guest_base != 0) {
+ struct user_desc desc = {
+ .entry_number = -1,
+ .base_addr = guest_base,
+ .limit = 0xfffff,
+ .seg_32bit = 1,
+ .limit_in_pages = 1,
+ .useable = 1,
+ };
+ if (syscall(SYS_set_thread_area, &desc) == 0) {
+ int seg = desc.entry_number * 8 + 3;
+ asm volatile("movl %0,%%fs" : : "r"(seg));
+ guest_base_flags = P_SEG;
+ return;
+ }
+ }
+ guest_base_ofs = guest_base;
+}
#else
# define guest_base_flags 0
# define guest_base_reg -1
@@ -2310,6 +2339,9 @@ static void tcg_target_qemu_prologue(TCGContext *s)
tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
(ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
+# if !defined(CONFIG_SOFTMMU)
+ setup_guest_base(s);
+# endif
/* jmp *tb. */
tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
(ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
@@ -2317,11 +2349,9 @@ static void tcg_target_qemu_prologue(TCGContext *s)
#else
tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
-
# if !defined(CONFIG_SOFTMMU)
setup_guest_base(s);
# endif
-
/* jmp *tb. */
tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
#endif
--
2.5.5
^ permalink raw reply related [flat|nested] 4+ messages in thread
* Re: [Qemu-devel] [PATCH 0/2] tcg/i386 guest_base improvements
2016-06-04 7:54 [Qemu-devel] [PATCH 0/2] tcg/i386 guest_base improvements Richard Henderson
2016-06-04 7:54 ` [Qemu-devel] [PATCH 1/2] tcg/i386: Reserve register for guest_base if a segment isn't available Richard Henderson
2016-06-04 7:54 ` [Qemu-devel] [PATCH 2/2] tcg/i386: Use segment for 32-bit guest base on linux Richard Henderson
@ 2017-01-30 10:40 ` Alex Bennée
2 siblings, 0 replies; 4+ messages in thread
From: Alex Bennée @ 2017-01-30 10:40 UTC (permalink / raw)
To: Richard Henderson; +Cc: qemu-devel
Richard Henderson <rth@twiddle.net> writes:
> The first change does two things: (1) improve bsd-user so that it
> doesn't continually reload guest_base into a temp register and
> (2) extract the bulk of the guest_base logic to a routine that
> is run once at startup.
>
> The second change adds segmentation support to 32-bit linux. There,
> if we're using a guest base, we can save 3 bytes per memory op by
> using a segment override. In addition, if we're using a reserved_va,
> we can set up the segment such that guest memory references are
> constrained by the segment.
>
> Comments?
I'm not sure how to best review this given it's fairly low-level
x86 stuff. Do you have any numbers to show how this improves things?
>
>
> r~
>
>
> Richard Henderson (2):
> tcg/i386: Reserve register for guest_base if a segment isn't available
> tcg/i386: Use segment for 32-bit guest base on linux
>
> tcg/i386/tcg-target.inc.c | 181 +++++++++++++++++++++++++---------------------
> 1 file changed, 100 insertions(+), 81 deletions(-)
--
Alex Bennée
^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2017-01-30 10:40 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-06-04 7:54 [Qemu-devel] [PATCH 0/2] tcg/i386 guest_base improvements Richard Henderson
2016-06-04 7:54 ` [Qemu-devel] [PATCH 1/2] tcg/i386: Reserve register for guest_base if a segment isn't available Richard Henderson
2016-06-04 7:54 ` [Qemu-devel] [PATCH 2/2] tcg/i386: Use segment for 32-bit guest base on linux Richard Henderson
2017-01-30 10:40 ` [Qemu-devel] [PATCH 0/2] tcg/i386 guest_base improvements Alex Bennée
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.