* [Qemu-devel] [PATCH] tcg-i386: Use MOVBE if available
  From: Richard Henderson @ 2013-12-20 23:00 UTC
  To: qemu-devel

As present on Atom and Haswell processors.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 disas/i386.c          |   8 ++--
 tcg/i386/tcg-target.c | 127 ++++++++++++++++++++++++++++++++++----------------
 2 files changed, 91 insertions(+), 44 deletions(-)

Here's to "stress testing" a Haswell laptop before Santa delivers it. ;-)


r~


diff --git a/disas/i386.c b/disas/i386.c
index 47f1f2e..beb33f0 100644
--- a/disas/i386.c
+++ b/disas/i386.c
@@ -2632,16 +2632,16 @@ static const struct dis386 prefix_user_table[][4] = {

   /* PREGRP87 */
   {
-    { "(bad)",  { XX } },
-    { "(bad)",  { XX } },
+    { "movbe",  { Gv, M } },
+    { "movbe",  { Gw, M } },
     { "(bad)",  { XX } },
     { "crc32",  { Gdq, { CRC32_Fixup, b_mode } } },
   },

   /* PREGRP88 */
   {
-    { "(bad)",  { XX } },
-    { "(bad)",  { XX } },
+    { "movbe",  { M, Gv } },
+    { "movbe",  { M, Gw } },
     { "(bad)",  { XX } },
     { "crc32",  { Gdq, { CRC32_Fixup, v_mode } } },
   },
diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
index 7ac8e45..e76f3f3 100644
--- a/tcg/i386/tcg-target.c
+++ b/tcg/i386/tcg-target.c
@@ -99,18 +99,28 @@ static const int tcg_target_call_oarg_regs[] = {
 # define TCG_REG_L1 TCG_REG_EDX
 #endif

+#ifdef CONFIG_CPUID_H
+#include <cpuid.h>
+#endif
+
 /* For 32-bit, we are going to attempt to determine at runtime whether cmov
    is available.  However, the host compiler must supply <cpuid.h>, as we're
    not going to go so far as our own inline assembly.  */
 #if TCG_TARGET_REG_BITS == 64
 # define have_cmov 1
 #elif defined(CONFIG_CPUID_H)
-#include <cpuid.h>
 static bool have_cmov;
 #else
 # define have_cmov 0
 #endif

+/* There is no pre-processor definition for MOVBE to fall back on.  */
+#ifdef CONFIG_CPUID_H
+static bool have_movbe;
+#else
+#define have_movbe 0
+#endif
+
 static uint8_t *tb_ret_addr;

 static void patch_reloc(uint8_t *code_ptr, int type,
@@ -254,6 +264,7 @@ static inline int tcg_target_const_match(tcg_target_long val,
 # define P_REXB_RM 0
 # define P_GS 0
 #endif
+#define P_EXT38 0x8000  /* 0x0f 0x38 opcode prefix */

 #define OPC_ARITH_EvIz  (0x81)
 #define OPC_ARITH_EvIb  (0x83)
@@ -279,6 +290,8 @@ static inline int tcg_target_const_match(tcg_target_long val,
 #define OPC_MOVB_EvIz   (0xc6)
 #define OPC_MOVL_EvIz   (0xc7)
 #define OPC_MOVL_Iv     (0xb8)
+#define OPC_MOVBE_GyMy  (0xf0 | P_EXT | P_EXT38)
+#define OPC_MOVBE_MyGy  (0xf1 | P_EXT | P_EXT38)
 #define OPC_MOVSBL      (0xbe | P_EXT)
 #define OPC_MOVSWL      (0xbf | P_EXT)
 #define OPC_MOVSLQ      (0x63 | P_REXW)
@@ -400,6 +413,9 @@ static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)

     if (opc & P_EXT) {
         tcg_out8(s, 0x0f);
+        if (opc & P_EXT38) {
+            tcg_out8(s, 0x38);
+        }
     }
     tcg_out8(s, opc);
 }
@@ -411,6 +427,9 @@ static void tcg_out_opc(TCGContext *s, int opc)
     }
     if (opc & P_EXT) {
         tcg_out8(s, 0x0f);
+        if (opc & P_EXT38) {
+            tcg_out8(s, 0x38);
+        }
     }
     tcg_out8(s, opc);
 }
@@ -1336,7 +1355,14 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
                                    TCGReg base, intptr_t ofs, int seg,
                                    TCGMemOp memop)
 {
-    const TCGMemOp bswap = memop & MO_BSWAP;
+    const TCGMemOp real_bswap = memop & MO_BSWAP;
+    TCGMemOp bswap = real_bswap;
+    int movop = OPC_MOVL_GvEv;
+
+    if (real_bswap && have_movbe) {
+        bswap = 0;
+        movop = OPC_MOVBE_GyMy;
+    }

     switch (memop & MO_SSIZE) {
     case MO_UB:
@@ -1346,32 +1372,45 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
         tcg_out_modrm_offset(s, OPC_MOVSBL + P_REXW + seg, datalo, base, ofs);
         break;
     case MO_UW:
-        tcg_out_modrm_offset(s, OPC_MOVZWL + seg, datalo, base, ofs);
-        if (bswap) {
-            tcg_out_rolw_8(s, datalo);
+        if (real_bswap && have_movbe) {
+            tcg_out_modrm_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
+                                 datalo, base, ofs);
+            tcg_out_ext16u(s, datalo, datalo);
+        } else {
+            tcg_out_modrm_offset(s, OPC_MOVZWL + seg, datalo, base, ofs);
+            if (bswap) {
+                tcg_out_rolw_8(s, datalo);
+            }
         }
         break;
     case MO_SW:
-        if (bswap) {
-            tcg_out_modrm_offset(s, OPC_MOVZWL + seg, datalo, base, ofs);
-            tcg_out_rolw_8(s, datalo);
-            tcg_out_modrm(s, OPC_MOVSWL + P_REXW, datalo, datalo);
+        if (real_bswap) {
+            if (have_movbe) {
+                tcg_out_modrm_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
+                                     datalo, base, ofs);
+            } else {
+                tcg_out_modrm_offset(s, OPC_MOVZWL + seg, datalo, base, ofs);
+                tcg_out_rolw_8(s, datalo);
+            }
+            tcg_out_ext16s(s, datalo, datalo, P_REXW);
         } else {
             tcg_out_modrm_offset(s, OPC_MOVSWL + P_REXW + seg,
                                  datalo, base, ofs);
         }
         break;
     case MO_UL:
-        tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg, datalo, base, ofs);
+        tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
         if (bswap) {
             tcg_out_bswap32(s, datalo);
         }
         break;
 #if TCG_TARGET_REG_BITS == 64
     case MO_SL:
-        if (bswap) {
-            tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg, datalo, base, ofs);
-            tcg_out_bswap32(s, datalo);
+        if (real_bswap) {
+            tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
+            if (bswap) {
+                tcg_out_bswap32(s, datalo);
+            }
             tcg_out_ext32s(s, datalo, datalo);
         } else {
             tcg_out_modrm_offset(s, OPC_MOVSLQ + seg, datalo, base, ofs);
@@ -1380,27 +1419,22 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
 #endif
     case MO_Q:
         if (TCG_TARGET_REG_BITS == 64) {
-            tcg_out_modrm_offset(s, OPC_MOVL_GvEv + P_REXW + seg,
-                                 datalo, base, ofs);
+            tcg_out_modrm_offset(s, movop + P_REXW + seg, datalo, base, ofs);
             if (bswap) {
                 tcg_out_bswap64(s, datalo);
             }
         } else {
-            if (bswap) {
+            if (real_bswap) {
                 int t = datalo;
                 datalo = datahi;
                 datahi = t;
             }
             if (base != datalo) {
-                tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg,
-                                     datalo, base, ofs);
-                tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg,
-                                     datahi, base, ofs + 4);
+                tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
+                tcg_out_modrm_offset(s, movop + seg, datahi, base, ofs + 4);
             } else {
-                tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg,
-                                     datahi, base, ofs + 4);
-                tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg,
-                                     datalo, base, ofs);
+                tcg_out_modrm_offset(s, movop + seg, datahi, base, ofs + 4);
+                tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
             }
             if (bswap) {
                 tcg_out_bswap32(s, datalo);
@@ -1476,13 +1510,19 @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
                                    TCGReg base, intptr_t ofs, int seg,
                                    TCGMemOp memop)
 {
-    const TCGMemOp bswap = memop & MO_BSWAP;
-
     /* ??? Ideally we wouldn't need a scratch register.  For user-only,
        we could perform the bswap twice to restore the original value
        instead of moving to the scratch.  But as it is, the L constraint
        means that TCG_REG_L0 is definitely free here.  */
     const TCGReg scratch = TCG_REG_L0;
+    const TCGMemOp real_bswap = memop & MO_BSWAP;
+    TCGMemOp bswap = real_bswap;
+    int movop = OPC_MOVL_EvGv;
+
+    if (real_bswap && have_movbe) {
+        bswap = 0;
+        movop = OPC_MOVBE_MyGy;
+    }

     switch (memop & MO_SIZE) {
     case MO_8:
@@ -1501,8 +1541,7 @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
             tcg_out_rolw_8(s, scratch);
             datalo = scratch;
         }
-        tcg_out_modrm_offset(s, OPC_MOVL_EvGv + P_DATA16 + seg,
-                             datalo, base, ofs);
+        tcg_out_modrm_offset(s, movop + P_DATA16 + seg, datalo, base, ofs);
         break;
     case MO_32:
         if (bswap) {
@@ -1510,7 +1549,7 @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
             tcg_out_bswap32(s, scratch);
             datalo = scratch;
         }
-        tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, datalo, base, ofs);
+        tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
         break;
     case MO_64:
         if (TCG_TARGET_REG_BITS == 64) {
@@ -1519,8 +1558,7 @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
                 tcg_out_bswap64(s, scratch);
                 datalo = scratch;
             }
-            tcg_out_modrm_offset(s, OPC_MOVL_EvGv + P_REXW + seg,
-                                 datalo, base, ofs);
+            tcg_out_modrm_offset(s, movop + P_REXW + seg, datalo, base, ofs);
         } else if (bswap) {
             tcg_out_mov(s, TCG_TYPE_I32, scratch, datahi);
             tcg_out_bswap32(s, scratch);
@@ -1529,8 +1567,13 @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
             tcg_out_bswap32(s, scratch);
             tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, scratch, base, ofs+4);
         } else {
-            tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, datalo, base, ofs);
-            tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, datahi, base, ofs+4);
+            if (real_bswap) {
+                int t = datalo;
+                datalo = datahi;
+                datahi = t;
+            }
+            tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
+            tcg_out_modrm_offset(s, movop + seg, datahi, base, ofs+4);
         }
         break;
     default:
@@ -2157,15 +2200,19 @@ static void tcg_target_qemu_prologue(TCGContext *s)

 static void tcg_target_init(TCGContext *s)
 {
-    /* For 32-bit, 99% certainty that we're running on hardware that supports
-       cmov, but we still need to check.  In case cmov is not available, we'll
-       use a small forward branch.  */
+    unsigned a, b, c, d;
+
+    if (__get_cpuid(1, &a, &b, &c, &d)) {
 #ifndef have_cmov
-    {
-        unsigned a, b, c, d;
-        have_cmov = (__get_cpuid(1, &a, &b, &c, &d) && (d & bit_CMOV));
-    }
+        /* For 32-bit, 99% certainty that we're running on hardware that
+           supports cmov, but we still need to check.  In case cmov is not
+           available, we'll use a small forward branch.  */
+        have_cmov = (d & bit_CMOV) != 0;
 #endif
+#ifndef have_movbe
+        have_movbe = (c & bit_MOVBE) != 0;
+#endif
+    }

     if (TCG_TARGET_REG_BITS == 64) {
         tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_I32], 0, 0xffff);
-- 
1.8.4.2
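For readers unfamiliar with the instruction: MOVBE is a load or store
that byte-swaps the data in flight, so a byte-swapped guest access on a
little-endian host becomes one instruction instead of a plain MOV
followed by BSWAP.  A minimal C sketch of the semantics the patch relies
on (the helper name is illustrative, not from the patch):

    #include <stdint.h>
    #include <string.h>

    /* What "movbe (mem), %eax" computes: a 32-bit load whose bytes are
       reversed on the way into the register.  */
    static uint32_t load32_byteswapped(const void *mem)
    {
        uint32_t v;
        memcpy(&v, mem, sizeof(v));     /* the plain MOV part */
        return __builtin_bswap32(v);    /* the swap MOVBE folds in */
    }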
* Re: [Qemu-devel] [PATCH] tcg-i386: Use MOVBE if available
  From: Paolo Bonzini @ 2013-12-21 14:08 UTC
  To: Richard Henderson; +Cc: qemu-devel

On 21/12/2013 00:00, Richard Henderson wrote:
> +        if (real_bswap && have_movbe) {
> +            tcg_out_modrm_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
> +                                 datalo, base, ofs);
> +            tcg_out_ext16u(s, datalo, datalo);

Do partial register stalls still exist on Atom and Haswell?  I don't
remember exactly what you had to do to prevent them, but IIRC you first
moved zero to the register and then overwrote the low 16 bits.

Paolo
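The idiom Paolo is recalling, sketched in C terms (the helper name is
hypothetical; the backend would emit the equivalent instructions, not C):

    /* Avoid a partial-register stall when only the low 16 bits are
       written: first write the full register with a zero idiom, then
       merge the halfword.  In instruction terms:
           xorl   %eax,%eax    ; full-width write, breaks the dependency
           movbew (mem),%ax    ; 16-bit MOVBE writes only the low half  */
    uint32_t r = 0;
    r |= load16_byteswapped(mem);   /* hypothetical 16-bit helper */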
* Re: [Qemu-devel] [PATCH] tcg-i386: Use MOVBE if available
  From: Aurelien Jarno @ 2013-12-22 12:24 UTC
  To: Paolo Bonzini; +Cc: qemu-devel, Richard Henderson

On Sat, Dec 21, 2013 at 03:08:21PM +0100, Paolo Bonzini wrote:
> On 21/12/2013 00:00, Richard Henderson wrote:
> > +        if (real_bswap && have_movbe) {
> > +            tcg_out_modrm_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
> > +                                 datalo, base, ofs);
> > +            tcg_out_ext16u(s, datalo, datalo);
>
> Do partial register stalls still exist on Atom and Haswell?  I don't
> remember exactly what you had to do to prevent them, but IIRC you first
> moved zero to the register and then overwrote the low 16 bits.

Note that for an unsigned 16-bit load you can do either movzw + bswap
or movbe + movzw.

-- 
Aurelien Jarno                          GPG: 1024D/F1BCDB73
aurelien@aurel32.net                 http://www.aurel32.net
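Both of those sequences compute the same value; a sketch of the common
semantics (the helper name is illustrative):

    #include <stdint.h>
    #include <string.h>

    /* Zero-extending big-endian 16-bit load.
       Sequence A (no MOVBE): movzwl (mem),%eax ; rolw $8,%ax
       Sequence B (MOVBE):    movbew (mem),%ax  ; movzwl %ax,%eax  */
    static uint32_t ld16_be(const void *mem)
    {
        uint16_t v;
        memcpy(&v, mem, sizeof(v));
        return (uint16_t)((v >> 8) | (v << 8));
    }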
* Re: [Qemu-devel] [PATCH] tcg-i386: Use MOVBE if available
  From: Paolo Bonzini @ 2013-12-22 13:39 UTC
  To: Aurelien Jarno; +Cc: qemu-devel, Richard Henderson

On 22/12/2013 13:24, Aurelien Jarno wrote:
> On Sat, Dec 21, 2013 at 03:08:21PM +0100, Paolo Bonzini wrote:
>> On 21/12/2013 00:00, Richard Henderson wrote:
>>> +        if (real_bswap && have_movbe) {
>>> +            tcg_out_modrm_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
>>> +                                 datalo, base, ofs);
>>> +            tcg_out_ext16u(s, datalo, datalo);
>>
>> Do partial register stalls still exist on Atom and Haswell?  I don't
>> remember exactly what you had to do to prevent them, but IIRC you first
>> moved zero to the register and then overwrote the low 16 bits.
>
> Note that for an unsigned 16-bit load you can do either movzw + bswap
> or movbe + movzw.

Yeah, I was asking whether xor + movbe would be faster.  Benchmarking
could tell, but in any case xor + movbe is likely the smallest code you
can produce.

Paolo
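Counting bytes with a simple base-register operand roughly bears that
out (opcode lengths taken from the Intel SDM opcode maps; worth
re-verifying before relying on them):

    /* movzwl (%rsi),%eax ; rolw $8,%ax       -- 3 + 4 = 7 bytes, partial write
       movbew (%rsi),%ax  ; movzwl %ax,%eax   -- 5 + 3 = 8 bytes
       xorl %eax,%eax     ; movbew (%rsi),%ax -- 2 + 5 = 7 bytes, full writes  */

So xor + movbe ties the rolw variant in size while writing only full
registers.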
* Re: [Qemu-devel] [PATCH] tcg-i386: Use MOVBE if available
  From: Richard Henderson @ 2013-12-22 16:38 UTC
  To: Aurelien Jarno, Paolo Bonzini; +Cc: qemu-devel

On 12/22/2013 04:24 AM, Aurelien Jarno wrote:
> On Sat, Dec 21, 2013 at 03:08:21PM +0100, Paolo Bonzini wrote:
>> On 21/12/2013 00:00, Richard Henderson wrote:
>>> +        if (real_bswap && have_movbe) {
>>> +            tcg_out_modrm_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
>>> +                                 datalo, base, ofs);
>>> +            tcg_out_ext16u(s, datalo, datalo);
>>
>> Do partial register stalls still exist on Atom and Haswell?  I don't
>> remember exactly what you had to do to prevent them, but IIRC you first
>> moved zero to the register and then overwrote the low 16 bits.
>
> Note that for an unsigned 16-bit load you can do either movzw + bswap
> or movbe + movzw.

From the July 2013 Intel Optimization Reference Manual, "Delay of
partial register stall is small in ... Intel Core and NetBurst
microarchitectures".  And for Atom, "partial register access does not
cause additional delay".

While I agree with Paolo that xor + movbe is probably technically the
best, one has to check for output register overlap and have a fallback.
Thus I think we can just discard that idea.

As for movzw + bswap, that forces a partial register stall on a
subsequent 32-bit access to the value, while movbe + movzw does not.
In the latter case we refer to the unmerged portion of the register in
the movzw.  But the optimization note suggests that it shouldn't matter
much either way.


r~
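A sketch of what the overlap check Richard mentions could look like in
the backend (not code from the patch; tgen_arithr/ARITH_XOR are assumed
to be the zero-idiom helpers used elsewhere in tcg-target.c):

    if (datalo != base) {
        /* Zero idiom first: a full-register write, so the 16-bit MOVBE
           below causes no partial-register stall.  */
        tgen_arithr(s, ARITH_XOR, datalo, datalo);
        tcg_out_modrm_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
                             datalo, base, ofs);
    } else {
        /* Destination overlaps the address base: fall back to
           zero-extending after the load instead.  */
        tcg_out_modrm_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
                             datalo, base, ofs);
        tcg_out_ext16u(s, datalo, datalo);
    }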
* Re: [Qemu-devel] [PATCH] tcg-i386: Use MOVBE if available
  From: Aurelien Jarno @ 2013-12-22 20:03 UTC
  To: Richard Henderson; +Cc: qemu-devel

On Fri, Dec 20, 2013 at 03:00:12PM -0800, Richard Henderson wrote:
> As present on Atom and Haswell processors.
>
> Signed-off-by: Richard Henderson <rth@twiddle.net>
> ---
>  disas/i386.c          |   8 ++--
>  tcg/i386/tcg-target.c | 127 ++++++++++++++++++++++++++++++++++----------------
>  2 files changed, 91 insertions(+), 44 deletions(-)
>
> diff --git a/disas/i386.c b/disas/i386.c
> index 47f1f2e..beb33f0 100644
> --- a/disas/i386.c
> +++ b/disas/i386.c
> @@ -2632,16 +2632,16 @@ static const struct dis386 prefix_user_table[][4] = {
>
>    /* PREGRP87 */
>    {
> -    { "(bad)",  { XX } },
> -    { "(bad)",  { XX } },
> +    { "movbe",  { Gv, M } },
> +    { "movbe",  { Gw, M } },

This is not correct; this disassembles movbe with the REPZ prefix.

>      { "(bad)",  { XX } },

You want it there; that slot is for the DATA prefix.

>      { "crc32",  { Gdq, { CRC32_Fixup, b_mode } } },
>    },
>
>    /* PREGRP88 */
>    {
> -    { "(bad)",  { XX } },
> -    { "(bad)",  { XX } },
> +    { "movbe",  { M, Gv } },
> +    { "movbe",  { M, Gw } },
>      { "(bad)",  { XX } },

Ditto.

>      { "crc32",  { Gdq, { CRC32_Fixup, v_mode } } },
>    },
> diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
> index 7ac8e45..e76f3f3 100644
> --- a/tcg/i386/tcg-target.c
> +++ b/tcg/i386/tcg-target.c
[...]
> +/* There is no pre-processor definition for MOVBE to fall back on.  */
> +#ifdef CONFIG_CPUID_H

As you already remarked, you should check for bit_MOVBE, as it was only
introduced in GCC 4.6.

> +static bool have_movbe;
> +#else
> +#define have_movbe 0

Very minor nitpick, but you probably want to indent the define, like
above for cmov.

> +#endif
[...]
> @@ -1336,7 +1355,14 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
>  {
> -    const TCGMemOp bswap = memop & MO_BSWAP;
> +    const TCGMemOp real_bswap = memop & MO_BSWAP;
> +    TCGMemOp bswap = real_bswap;
> +    int movop = OPC_MOVL_GvEv;
> +
> +    if (real_bswap && have_movbe) {
> +        bswap = 0;
> +        movop = OPC_MOVBE_GyMy;
> +    }

Defining movop is clever; that said, the name real_bswap doesn't sound
very self-describing.  Moreover, you still need to check for have_movbe
below...

>      case MO_UW:
> -        tcg_out_modrm_offset(s, OPC_MOVZWL + seg, datalo, base, ofs);
> -        if (bswap) {
> -            tcg_out_rolw_8(s, datalo);
> +        if (real_bswap && have_movbe) {
> +            tcg_out_modrm_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
> +                                 datalo, base, ofs);
> +            tcg_out_ext16u(s, datalo, datalo);
> +        } else {
> +            tcg_out_modrm_offset(s, OPC_MOVZWL + seg, datalo, base, ofs);
> +            if (bswap) {
> +                tcg_out_rolw_8(s, datalo);
> +            }
>          }

What about keeping the bswap definition like before and doing something
like:

    if (bswap && have_movbe) {
        tcg_out_modrm_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
                             datalo, base, ofs);
        tcg_out_ext16u(s, datalo, datalo);
    } else {
        tcg_out_modrm_offset(s, OPC_MOVZWL + seg, datalo, base, ofs);
        if (bswap) {
            tcg_out_rolw_8(s, datalo);
        }
    }

[...]
>      case MO_UL:
> -        tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg, datalo, base, ofs);
> +        tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
>          if (bswap) {
>              tcg_out_bswap32(s, datalo);
>          }

And here:

    tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
    if (bswap && !have_movbe) {
        tcg_out_bswap32(s, datalo);
    }

>          break;
>  #if TCG_TARGET_REG_BITS == 64
>      case MO_SL:
> -        if (bswap) {
> -            tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg, datalo, base, ofs);
> -            tcg_out_bswap32(s, datalo);
> +        if (real_bswap) {
> +            tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
> +            if (bswap) {
> +                tcg_out_bswap32(s, datalo);
> +            }
>              tcg_out_ext32s(s, datalo, datalo);
>          } else {

And here:

    if (bswap) {
        tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
        if (!have_movbe) {
            tcg_out_bswap32(s, datalo);
        }
        tcg_out_ext32s(s, datalo, datalo);
    } else {

>              tcg_out_modrm_offset(s, OPC_MOVSLQ + seg, datalo, base, ofs);
[...]
> @@ -1476,13 +1510,19 @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
>      const TCGReg scratch = TCG_REG_L0;
> +    const TCGMemOp real_bswap = memop & MO_BSWAP;
> +    TCGMemOp bswap = real_bswap;
> +    int movop = OPC_MOVL_EvGv;
> +
> +    if (real_bswap && have_movbe) {
> +        bswap = 0;
> +        movop = OPC_MOVBE_MyGy;
> +    }

Ditto here.

[...]
> @@ -2157,15 +2200,19 @@ static void tcg_target_qemu_prologue(TCGContext *s)
>
>  static void tcg_target_init(TCGContext *s)
>  {
> -    /* For 32-bit, 99% certainty that we're running on hardware that supports
> -       cmov, but we still need to check.  In case cmov is not available, we'll
> -       use a small forward branch.  */
> +    unsigned a, b, c, d;
> +
> +    if (__get_cpuid(1, &a, &b, &c, &d)) {
>  #ifndef have_cmov
> -    {
> -        unsigned a, b, c, d;
> -        have_cmov = (__get_cpuid(1, &a, &b, &c, &d) && (d & bit_CMOV));
> -    }
> +        /* For 32-bit, 99% certainty that we're running on hardware that
> +           supports cmov, but we still need to check.  In case cmov is not
> +           available, we'll use a small forward branch.  */
> +        have_cmov = (d & bit_CMOV) != 0;
>  #endif
> +#ifndef have_movbe
> +        have_movbe = (c & bit_MOVBE) != 0;
> +#endif
> +    }

Your code doesn't check the result of __get_cpuid anymore to initialize
have_cmov or have_movbe.  While the C standard mandates that static
variables are initialized to 0, it might be a good idea to do so
explicitly.

>      if (TCG_TARGET_REG_BITS == 64) {
>          tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_I32], 0, 0xffff);

Otherwise looks good.

-- 
Aurelien Jarno                          GPG: 1024D/F1BCDB73
aurelien@aurel32.net                 http://www.aurel32.net
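Folding those review points together, the detection could end up looking
something like this (a sketch, not the committed version; the bit_MOVBE
guard matters because <cpuid.h> only defines it from GCC 4.6 onward):

    unsigned a, b, c, d;
    /* Check the cpuid result explicitly; if it fails, both flags are
       left at their zero-initialized default of false.  */
    int ok = __get_cpuid(1, &a, &b, &c, &d);

    #ifndef have_cmov
        have_cmov = ok && (d & bit_CMOV);
    #endif
    #if defined(bit_MOVBE) && !defined(have_movbe)
        have_movbe = ok && (c & bit_MOVBE);
    #endif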