qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed
* [Qemu-devel] [PATCH 0/2] tcg/i386 guest_base improvements
@ 2016-06-04  7:54 Richard Henderson
  2016-06-04  7:54 ` [Qemu-devel] [PATCH 1/2] tcg/i386: Reserve register for guest_base if a segment isn't available Richard Henderson
                   ` (2 more replies)
  0 siblings, 3 replies; 4+ messages in thread
From: Richard Henderson @ 2016-06-04  7:54 UTC (permalink / raw)
  To: qemu-devel

The first change does two things: (1) improve bsd-user so that it
doesn't continually reload guest_base into a temp register and
(2) extract the bulk of the guest_base logic to a routine that
is run once at startup.

The second change adds segmentation support to 32-bit linux.  There,
if we're using a guest base, we can save 3 bytes per memory op by
using a segment override.  In addition, if we're using a reserved_va,
we can set up the segment such that guest memory references are
constrained by the segment.

Comments?


r~


Richard Henderson (2):
  tcg/i386: Reserve register for guest_base if a segment isn't available
  tcg/i386: Use segment for 32-bit guest base on linux

 tcg/i386/tcg-target.inc.c | 181 +++++++++++++++++++++++++---------------------
 1 file changed, 100 insertions(+), 81 deletions(-)

-- 
2.5.5

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [Qemu-devel] [PATCH 1/2] tcg/i386: Reserve register for guest_base if a segment isn't available
  2016-06-04  7:54 [Qemu-devel] [PATCH 0/2] tcg/i386 guest_base improvements Richard Henderson
@ 2016-06-04  7:54 ` Richard Henderson
  2016-06-04  7:54 ` [Qemu-devel] [PATCH 2/2] tcg/i386: Use segment for 32-bit guest base on linux Richard Henderson
  2017-01-30 10:40 ` [Qemu-devel] [PATCH 0/2] tcg/i386 guest_base improvements Alex Bennée
  2 siblings, 0 replies; 4+ messages in thread
From: Richard Henderson @ 2016-06-04  7:54 UTC (permalink / raw)
  To: qemu-devel

This saves 2 insns and 10 bytes from the implementation of
each memory operation.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 tcg/i386/tcg-target.inc.c | 143 +++++++++++++++++++++-------------------------
 1 file changed, 66 insertions(+), 77 deletions(-)

diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
index 317484c..11cbb3c 100644
--- a/tcg/i386/tcg-target.inc.c
+++ b/tcg/i386/tcg-target.inc.c
@@ -1441,22 +1441,43 @@ static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
     tcg_out_push(s, retaddr);
     tcg_out_jmp(s, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
 }
-#elif defined(__x86_64__) && defined(__linux__)
-# include <asm/prctl.h>
-# include <sys/prctl.h>
-
+#elif TCG_TARGET_REG_BITS == 64
+# ifdef __linux__
+#  include <asm/prctl.h>
+#  include <sys/prctl.h>
 int arch_prctl(int code, unsigned long addr);
+# endif
 
+static int32_t guest_base_ofs;
 static int guest_base_flags;
-static inline void setup_guest_base_seg(void)
+static int guest_base_reg = -1;
+static void setup_guest_base(TCGContext *s)
 {
+    if (guest_base == 0) {
+        if (TARGET_LONG_BITS == 32) {
+            guest_base_flags = P_ADDR32;
+        }
+        return;
+    }
+# ifdef __linux__
     if (arch_prctl(ARCH_SET_GS, guest_base) == 0) {
-        guest_base_flags = P_GS;
+        guest_base_flags = (TARGET_LONG_BITS == 32 ? P_GS | P_ADDR32 : P_GS);
+        return;
+    }
+# endif
+    if (guest_base == (int32_t)guest_base) {
+        guest_base_ofs = guest_base;
+    } else {
+        guest_base_reg = TCG_REG_EBP;
+        tcg_regset_set_reg(s->reserved_regs, guest_base_reg);
+        tcg_out_movi(s, TCG_TYPE_PTR, guest_base_reg, guest_base);
     }
 }
 #else
-# define guest_base_flags 0
-static inline void setup_guest_base_seg(void) { }
+# define guest_base_flags  0
+# define guest_base_reg    -1
+# define guest_base_ofs    guest_base
+# define setup_guest_base(s)
 #endif /* SOFTMMU */
 
 static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
@@ -1595,42 +1616,26 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
                         s->code_ptr, label_ptr);
 #else
     {
-        int32_t offset = guest_base;
         TCGReg base = addrlo;
-        int index = -1;
-        int seg = 0;
+        int flags = guest_base_flags;
 
         /* For a 32-bit guest, the high 32 bits may contain garbage.
-           We can do this with the ADDR32 prefix if we're not using
+           We do this with the ADDR32 prefix if we're not using
            a guest base, or when using segmentation.  Otherwise we
-           need to zero-extend manually.  */
-        if (guest_base == 0 || guest_base_flags) {
-            seg = guest_base_flags;
-            offset = 0;
-            if (TCG_TARGET_REG_BITS > TARGET_LONG_BITS) {
-                seg |= P_ADDR32;
-            }
-        } else if (TCG_TARGET_REG_BITS == 64) {
-            if (TARGET_LONG_BITS == 32) {
-                tcg_out_ext32u(s, TCG_REG_L0, base);
-                base = TCG_REG_L0;
-            }
-            if (offset != guest_base) {
-                tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_L1, guest_base);
-                index = TCG_REG_L1;
-                offset = 0;
-            }
+           need to zero-extend manually.  See setup_guest_base.  */
+        if (flags == 0 && TCG_TARGET_REG_BITS > TARGET_LONG_BITS) {
+            tcg_out_ext32u(s, TCG_REG_L0, base);
+            base = TCG_REG_L0;
         }
-
-        tcg_out_qemu_ld_direct(s, datalo, datahi,
-                               base, index, offset, seg, opc);
+        tcg_out_qemu_ld_direct(s, datalo, datahi, base, guest_base_reg,
+                               guest_base_ofs, flags, opc);
     }
 #endif
 }
 
 static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
-                                   TCGReg base, intptr_t ofs, int seg,
-                                   TCGMemOp memop)
+                                   TCGReg base, int index, intptr_t ofs,
+                                   int seg, TCGMemOp memop)
 {
     /* ??? Ideally we wouldn't need a scratch register.  For user-only,
        we could perform the bswap twice to restore the original value
@@ -1654,8 +1659,8 @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
             tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
             datalo = scratch;
         }
-        tcg_out_modrm_offset(s, OPC_MOVB_EvGv + P_REXB_R + seg,
-                             datalo, base, ofs);
+        tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + seg, datalo,
+                                 base, index, 0, ofs);
         break;
     case MO_16:
         if (bswap) {
@@ -1663,7 +1668,8 @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
             tcg_out_rolw_8(s, scratch);
             datalo = scratch;
         }
-        tcg_out_modrm_offset(s, movop + P_DATA16 + seg, datalo, base, ofs);
+        tcg_out_modrm_sib_offset(s, movop + P_DATA16 + seg, datalo,
+                                 base, index, 0, ofs);
         break;
     case MO_32:
         if (bswap) {
@@ -1671,7 +1677,7 @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
             tcg_out_bswap32(s, scratch);
             datalo = scratch;
         }
-        tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
+        tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs);
         break;
     case MO_64:
         if (TCG_TARGET_REG_BITS == 64) {
@@ -1680,22 +1686,27 @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
                 tcg_out_bswap64(s, scratch);
                 datalo = scratch;
             }
-            tcg_out_modrm_offset(s, movop + P_REXW + seg, datalo, base, ofs);
+            tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo,
+                                     base, index, 0, ofs);
         } else if (bswap) {
             tcg_out_mov(s, TCG_TYPE_I32, scratch, datahi);
             tcg_out_bswap32(s, scratch);
-            tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, scratch, base, ofs);
+            tcg_out_modrm_sib_offset(s, OPC_MOVL_EvGv + seg, scratch,
+                                     base, index, 0, ofs);
             tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
             tcg_out_bswap32(s, scratch);
-            tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, scratch, base, ofs+4);
+            tcg_out_modrm_sib_offset(s, OPC_MOVL_EvGv + seg, scratch,
+                                     base, index, 0, ofs+4);
         } else {
             if (real_bswap) {
                 int t = datalo;
                 datalo = datahi;
                 datahi = t;
             }
-            tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
-            tcg_out_modrm_offset(s, movop + seg, datahi, base, ofs+4);
+            tcg_out_modrm_sib_offset(s, movop + seg, datalo,
+                                     base, index, 0, ofs);
+            tcg_out_modrm_sib_offset(s, movop + seg, datahi,
+                                     base, index, 0, ofs+4);
         }
         break;
     default:
@@ -1728,43 +1739,23 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
                      label_ptr, offsetof(CPUTLBEntry, addr_write));
 
     /* TLB Hit.  */
-    tcg_out_qemu_st_direct(s, datalo, datahi, TCG_REG_L1, 0, 0, opc);
+    tcg_out_qemu_st_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, opc);
 
     /* Record the current context of a store into ldst label */
     add_qemu_ldst_label(s, false, oi, datalo, datahi, addrlo, addrhi,
                         s->code_ptr, label_ptr);
 #else
     {
-        int32_t offset = guest_base;
         TCGReg base = addrlo;
-        int seg = 0;
+        int flags = guest_base_flags;
 
         /* See comment in tcg_out_qemu_ld re zero-extension of addrlo.  */
-        if (guest_base == 0 || guest_base_flags) {
-            seg = guest_base_flags;
-            offset = 0;
-            if (TCG_TARGET_REG_BITS > TARGET_LONG_BITS) {
-                seg |= P_ADDR32;
-            }
-        } else if (TCG_TARGET_REG_BITS == 64) {
-            /* ??? Note that we can't use the same SIB addressing scheme
-               as for loads, since we require L0 free for bswap.  */
-            if (offset != guest_base) {
-                if (TARGET_LONG_BITS == 32) {
-                    tcg_out_ext32u(s, TCG_REG_L0, base);
-                    base = TCG_REG_L0;
-                }
-                tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_L1, guest_base);
-                tgen_arithr(s, ARITH_ADD + P_REXW, TCG_REG_L1, base);
-                base = TCG_REG_L1;
-                offset = 0;
-            } else if (TARGET_LONG_BITS == 32) {
-                tcg_out_ext32u(s, TCG_REG_L1, base);
-                base = TCG_REG_L1;
-            }
+        if (flags == 0 && TCG_TARGET_REG_BITS > TARGET_LONG_BITS) {
+            tcg_out_ext32u(s, TCG_REG_L1, base);
+            base = TCG_REG_L1;
         }
-
-        tcg_out_qemu_st_direct(s, datalo, datahi, base, offset, seg, opc);
+        tcg_out_qemu_st_direct(s, datalo, datahi, base, guest_base_reg,
+                               guest_base_ofs, flags, opc);
     }
 #endif
 }
@@ -2326,6 +2317,11 @@ static void tcg_target_qemu_prologue(TCGContext *s)
 #else
     tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
     tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
+
+# if !defined(CONFIG_SOFTMMU)
+    setup_guest_base(s);
+# endif
+
     /* jmp *tb.  */
     tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
 #endif
@@ -2339,13 +2335,6 @@ static void tcg_target_qemu_prologue(TCGContext *s)
         tcg_out_pop(s, tcg_target_callee_save_regs[i]);
     }
     tcg_out_opc(s, OPC_RET, 0, 0, 0);
-
-#if !defined(CONFIG_SOFTMMU)
-    /* Try to set up a segment register to point to guest_base.  */
-    if (guest_base) {
-        setup_guest_base_seg();
-    }
-#endif
 }
 
 static void tcg_target_init(TCGContext *s)
-- 
2.5.5

^ permalink raw reply related	[flat|nested] 4+ messages in thread

* [Qemu-devel] [PATCH 2/2] tcg/i386: Use segment for 32-bit guest base on linux
  2016-06-04  7:54 [Qemu-devel] [PATCH 0/2] tcg/i386 guest_base improvements Richard Henderson
  2016-06-04  7:54 ` [Qemu-devel] [PATCH 1/2] tcg/i386: Reserve register for guest_base if a segment isn't available Richard Henderson
@ 2016-06-04  7:54 ` Richard Henderson
  2017-01-30 10:40 ` [Qemu-devel] [PATCH 0/2] tcg/i386 guest_base improvements Alex Bennée
  2 siblings, 0 replies; 4+ messages in thread
From: Richard Henderson @ 2016-06-04  7:54 UTC (permalink / raw)
  To: qemu-devel; +Cc: Richard Henderson

From: Richard Henderson <rth@smalltime.twiddle.net>

This saves 3 bytes per memory operation.

Signed-off-by: Richard Henderson <rth@smalltime.twiddle.net>
---
 tcg/i386/tcg-target.inc.c | 44 +++++++++++++++++++++++++++++++++++++-------
 1 file changed, 37 insertions(+), 7 deletions(-)

diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
index 11cbb3c..d8c2f6d 100644
--- a/tcg/i386/tcg-target.inc.c
+++ b/tcg/i386/tcg-target.inc.c
@@ -290,14 +290,13 @@ static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
 # define P_REXW         0x1000          /* Set REX.W = 1 */
 # define P_REXB_R       0x2000          /* REG field as byte register */
 # define P_REXB_RM      0x4000          /* R/M field as byte register */
-# define P_GS           0x8000          /* gs segment override */
 #else
 # define P_ADDR32	0
 # define P_REXW		0
 # define P_REXB_R	0
 # define P_REXB_RM	0
-# define P_GS           0
 #endif
+#define P_SEG           0x8000          /* fs/gs segment override */
 #define P_SIMDF3        0x10000         /* 0xf3 opcode prefix */
 #define P_SIMDF2        0x20000         /* 0xf2 opcode prefix */
 
@@ -420,8 +419,8 @@ static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
 {
     int rex;
 
-    if (opc & P_GS) {
-        tcg_out8(s, 0x65);
+    if (opc & P_SEG) {
+        tcg_out8(s, 0x65); /* %gs */
     }
     if (opc & P_DATA16) {
         /* We should never be asking for both 16 and 64-bit operation.  */
@@ -462,6 +461,9 @@ static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
 #else
 static void tcg_out_opc(TCGContext *s, int opc)
 {
+    if (opc & P_SEG) {
+        tcg_out8(s, 0x64); /* %fs */
+    }
     if (opc & P_DATA16) {
         tcg_out8(s, 0x66);
     }
@@ -1461,7 +1463,7 @@ static void setup_guest_base(TCGContext *s)
     }
 # ifdef __linux__
     if (arch_prctl(ARCH_SET_GS, guest_base) == 0) {
-        guest_base_flags = (TARGET_LONG_BITS == 32 ? P_GS | P_ADDR32 : P_GS);
+        guest_base_flags = P_SEG + (TARGET_LONG_BITS == 32) * P_ADDR32;
         return;
     }
 # endif
@@ -1473,6 +1475,33 @@ static void setup_guest_base(TCGContext *s)
         tcg_out_movi(s, TCG_TYPE_PTR, guest_base_reg, guest_base);
     }
 }
+#elif defined(__linux__)
+# include <asm/ldt.h>
+# include <sys/syscall.h>
+
+static int32_t guest_base_ofs;
+static int guest_base_flags;
+#define guest_base_reg    -1
+static void setup_guest_base(TCGContext *s)
+{
+    if (guest_base != 0) {
+        struct user_desc desc = {
+            .entry_number = -1,
+            .base_addr = guest_base,
+            .limit = 0xfffff,
+            .seg_32bit = 1,
+            .limit_in_pages = 1,
+            .useable = 1,
+        };
+        if (syscall(SYS_set_thread_area, &desc) == 0) {
+	    int seg = desc.entry_number * 8 + 3;
+            asm volatile("movl %0,%%fs" : : "r"(seg));
+	    guest_base_flags = P_SEG;
+            return;
+	}
+    }
+    guest_base_ofs = guest_base;
+}
 #else
 # define guest_base_flags  0
 # define guest_base_reg    -1
@@ -2310,6 +2339,9 @@ static void tcg_target_qemu_prologue(TCGContext *s)
     tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
                (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
     tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
+# if !defined(CONFIG_SOFTMMU)
+    setup_guest_base(s);
+# endif
     /* jmp *tb.  */
     tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
 		         (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
@@ -2317,11 +2349,9 @@ static void tcg_target_qemu_prologue(TCGContext *s)
 #else
     tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
     tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
-
 # if !defined(CONFIG_SOFTMMU)
     setup_guest_base(s);
 # endif
-
     /* jmp *tb.  */
     tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
 #endif
-- 
2.5.5

^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [Qemu-devel] [PATCH 0/2] tcg/i386 guest_base improvements
  2016-06-04  7:54 [Qemu-devel] [PATCH 0/2] tcg/i386 guest_base improvements Richard Henderson
  2016-06-04  7:54 ` [Qemu-devel] [PATCH 1/2] tcg/i386: Reserve register for guest_base if a segment isn't available Richard Henderson
  2016-06-04  7:54 ` [Qemu-devel] [PATCH 2/2] tcg/i386: Use segment for 32-bit guest base on linux Richard Henderson
@ 2017-01-30 10:40 ` Alex Bennée
  2 siblings, 0 replies; 4+ messages in thread
From: Alex Bennée @ 2017-01-30 10:40 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel


Richard Henderson <rth@twiddle.net> writes:

> The first change does two things: (1) improve bsd-user so that it
> doesn't continually reload guest_base into a temp register and
> (2) extract the bulk of the guest_base logic to a routine that
> is run once at startup.
>
> The second change adds segmentation support to 32-bit linux.  There,
> if we're using a guest base, we can save 3 bytes per memory op by
> using a segment override.  In addition, if we're using a reserved_va,
> we can set up the segment such that guest memory references are
> constrained by the segment.
>
> Comments?

I'm not sure how to best review this given it's fairly low-level
x86 stuff. Do you have any numbers to show how this improves things?

>
>
> r~
>
>
> Richard Henderson (2):
>   tcg/i386: Reserve register for guest_base if a segment isn't available
>   tcg/i386: Use segment for 32-bit guest base on linux
>
>  tcg/i386/tcg-target.inc.c | 181 +++++++++++++++++++++++++---------------------
>  1 file changed, 100 insertions(+), 81 deletions(-)


--
Alex Bennée

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2017-01-30 10:40 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-06-04  7:54 [Qemu-devel] [PATCH 0/2] tcg/i386 guest_base improvements Richard Henderson
2016-06-04  7:54 ` [Qemu-devel] [PATCH 1/2] tcg/i386: Reserve register for guest_base if a segment isn't available Richard Henderson
2016-06-04  7:54 ` [Qemu-devel] [PATCH 2/2] tcg/i386: Use segment for 32-bit guest base on linux Richard Henderson
2017-01-30 10:40 ` [Qemu-devel] [PATCH 0/2] tcg/i386 guest_base improvements Alex Bennée

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).