* [Qemu-devel] [PATCH] tcg-i386: Use MOVBE if available
  From: Richard Henderson @ 2013-12-20 23:00 UTC
  To: qemu-devel

As present on Atom and Haswell processors.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 disas/i386.c          |   8 ++--
 tcg/i386/tcg-target.c | 127 ++++++++++++++++++++++++++++++++++----------------
 2 files changed, 91 insertions(+), 44 deletions(-)

Here's to "stress testing" a Haswell laptop before Santa delivers it. ;-)


r~


diff --git a/disas/i386.c b/disas/i386.c
index 47f1f2e..beb33f0 100644
--- a/disas/i386.c
+++ b/disas/i386.c
@@ -2632,16 +2632,16 @@ static const struct dis386 prefix_user_table[][4] = {

   /* PREGRP87 */
   {
-    { "(bad)",  { XX } },
-    { "(bad)",  { XX } },
+    { "movbe",  { Gv, M } },
+    { "movbe",  { Gw, M } },
     { "(bad)",  { XX } },
     { "crc32",  { Gdq, { CRC32_Fixup, b_mode } } },
   },

   /* PREGRP88 */
   {
-    { "(bad)",  { XX } },
-    { "(bad)",  { XX } },
+    { "movbe",  { M, Gv } },
+    { "movbe",  { M, Gw } },
     { "(bad)",  { XX } },
     { "crc32",  { Gdq, { CRC32_Fixup, v_mode } } },
   },
diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
index 7ac8e45..e76f3f3 100644
--- a/tcg/i386/tcg-target.c
+++ b/tcg/i386/tcg-target.c
@@ -99,18 +99,28 @@ static const int tcg_target_call_oarg_regs[] = {
 # define TCG_REG_L1 TCG_REG_EDX
 #endif

+#ifdef CONFIG_CPUID_H
+#include <cpuid.h>
+#endif
+
 /* For 32-bit, we are going to attempt to determine at runtime whether cmov
    is available.  However, the host compiler must supply <cpuid.h>, as we're
    not going to go so far as our own inline assembly.  */
 #if TCG_TARGET_REG_BITS == 64
 # define have_cmov 1
 #elif defined(CONFIG_CPUID_H)
-#include <cpuid.h>
 static bool have_cmov;
 #else
 # define have_cmov 0
 #endif

+/* There is no pre-processor definition for MOVBE to fall back on.  */
+#ifdef CONFIG_CPUID_H
+static bool have_movbe;
+#else
+#define have_movbe 0
+#endif
+
 static uint8_t *tb_ret_addr;

 static void patch_reloc(uint8_t *code_ptr, int type,
@@ -254,6 +264,7 @@ static inline int tcg_target_const_match(tcg_target_long val,
 # define P_REXB_RM 0
 # define P_GS 0
 #endif
+#define P_EXT38 0x8000  /* 0x0f 0x38 opcode prefix */

 #define OPC_ARITH_EvIz  (0x81)
 #define OPC_ARITH_EvIb  (0x83)
@@ -279,6 +290,8 @@ static inline int tcg_target_const_match(tcg_target_long val,
 #define OPC_MOVB_EvIz   (0xc6)
 #define OPC_MOVL_EvIz   (0xc7)
 #define OPC_MOVL_Iv     (0xb8)
+#define OPC_MOVBE_GyMy  (0xf0 | P_EXT | P_EXT38)
+#define OPC_MOVBE_MyGy  (0xf1 | P_EXT | P_EXT38)
 #define OPC_MOVSBL      (0xbe | P_EXT)
 #define OPC_MOVSWL      (0xbf | P_EXT)
 #define OPC_MOVSLQ      (0x63 | P_REXW)
@@ -400,6 +413,9 @@ static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)

     if (opc & P_EXT) {
         tcg_out8(s, 0x0f);
+        if (opc & P_EXT38) {
+            tcg_out8(s, 0x38);
+        }
     }
     tcg_out8(s, opc);
 }
@@ -411,6 +427,9 @@ static void tcg_out_opc(TCGContext *s, int opc)
     }
     if (opc & P_EXT) {
         tcg_out8(s, 0x0f);
+        if (opc & P_EXT38) {
+            tcg_out8(s, 0x38);
+        }
     }
     tcg_out8(s, opc);
 }
@@ -1336,7 +1355,14 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
                                    TCGReg base, intptr_t ofs, int seg,
                                    TCGMemOp memop)
 {
-    const TCGMemOp bswap = memop & MO_BSWAP;
+    const TCGMemOp real_bswap = memop & MO_BSWAP;
+    TCGMemOp bswap = real_bswap;
+    int movop = OPC_MOVL_GvEv;
+
+    if (real_bswap && have_movbe) {
+        bswap = 0;
+        movop = OPC_MOVBE_GyMy;
+    }

     switch (memop & MO_SSIZE) {
     case MO_UB:
@@ -1346,32 +1372,45 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
         tcg_out_modrm_offset(s, OPC_MOVSBL + P_REXW + seg, datalo, base, ofs);
         break;
     case MO_UW:
-        tcg_out_modrm_offset(s, OPC_MOVZWL + seg, datalo, base, ofs);
-        if (bswap) {
-            tcg_out_rolw_8(s, datalo);
+        if (real_bswap && have_movbe) {
+            tcg_out_modrm_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
+                                 datalo, base, ofs);
+            tcg_out_ext16u(s, datalo, datalo);
+        } else {
+            tcg_out_modrm_offset(s, OPC_MOVZWL + seg, datalo, base, ofs);
+            if (bswap) {
+                tcg_out_rolw_8(s, datalo);
+            }
         }
         break;
     case MO_SW:
-        if (bswap) {
-            tcg_out_modrm_offset(s, OPC_MOVZWL + seg, datalo, base, ofs);
-            tcg_out_rolw_8(s, datalo);
-            tcg_out_modrm(s, OPC_MOVSWL + P_REXW, datalo, datalo);
+        if (real_bswap) {
+            if (have_movbe) {
+                tcg_out_modrm_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
+                                     datalo, base, ofs);
+            } else {
+                tcg_out_modrm_offset(s, OPC_MOVZWL + seg, datalo, base, ofs);
+                tcg_out_rolw_8(s, datalo);
+            }
+            tcg_out_ext16s(s, datalo, datalo, P_REXW);
         } else {
             tcg_out_modrm_offset(s, OPC_MOVSWL + P_REXW + seg,
                                  datalo, base, ofs);
         }
         break;
     case MO_UL:
-        tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg, datalo, base, ofs);
+        tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
         if (bswap) {
             tcg_out_bswap32(s, datalo);
         }
         break;
 #if TCG_TARGET_REG_BITS == 64
     case MO_SL:
-        if (bswap) {
-            tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg, datalo, base, ofs);
-            tcg_out_bswap32(s, datalo);
+        if (real_bswap) {
+            tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
+            if (bswap) {
+                tcg_out_bswap32(s, datalo);
+            }
             tcg_out_ext32s(s, datalo, datalo);
         } else {
             tcg_out_modrm_offset(s, OPC_MOVSLQ + seg, datalo, base, ofs);
@@ -1380,27 +1419,22 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
 #endif
     case MO_Q:
         if (TCG_TARGET_REG_BITS == 64) {
-            tcg_out_modrm_offset(s, OPC_MOVL_GvEv + P_REXW + seg,
-                                 datalo, base, ofs);
+            tcg_out_modrm_offset(s, movop + P_REXW + seg, datalo, base, ofs);
             if (bswap) {
                 tcg_out_bswap64(s, datalo);
             }
         } else {
-            if (bswap) {
+            if (real_bswap) {
                 int t = datalo;
                 datalo = datahi;
                 datahi = t;
             }
             if (base != datalo) {
-                tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg,
-                                     datalo, base, ofs);
-                tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg,
-                                     datahi, base, ofs + 4);
+                tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
+                tcg_out_modrm_offset(s, movop + seg, datahi, base, ofs + 4);
             } else {
-                tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg,
-                                     datahi, base, ofs + 4);
-                tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg,
-                                     datalo, base, ofs);
+                tcg_out_modrm_offset(s, movop + seg, datahi, base, ofs + 4);
+                tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
             }
             if (bswap) {
                 tcg_out_bswap32(s, datalo);
@@ -1476,13 +1510,19 @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
                                    TCGReg base, intptr_t ofs, int seg,
                                    TCGMemOp memop)
 {
-    const TCGMemOp bswap = memop & MO_BSWAP;
-
     /* ??? Ideally we wouldn't need a scratch register.  For user-only,
        we could perform the bswap twice to restore the original value
        instead of moving to the scratch.  But as it is, the L constraint
        means that TCG_REG_L0 is definitely free here.  */
     const TCGReg scratch = TCG_REG_L0;
+    const TCGMemOp real_bswap = memop & MO_BSWAP;
+    TCGMemOp bswap = real_bswap;
+    int movop = OPC_MOVL_EvGv;
+
+    if (real_bswap && have_movbe) {
+        bswap = 0;
+        movop = OPC_MOVBE_MyGy;
+    }

     switch (memop & MO_SIZE) {
     case MO_8:
@@ -1501,8 +1541,7 @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
             tcg_out_rolw_8(s, scratch);
             datalo = scratch;
         }
-        tcg_out_modrm_offset(s, OPC_MOVL_EvGv + P_DATA16 + seg,
-                             datalo, base, ofs);
+        tcg_out_modrm_offset(s, movop + P_DATA16 + seg, datalo, base, ofs);
         break;
     case MO_32:
         if (bswap) {
@@ -1510,7 +1549,7 @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
             tcg_out_bswap32(s, scratch);
             datalo = scratch;
         }
-        tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, datalo, base, ofs);
+        tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
         break;
     case MO_64:
         if (TCG_TARGET_REG_BITS == 64) {
@@ -1519,8 +1558,7 @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
                 tcg_out_bswap64(s, scratch);
                 datalo = scratch;
             }
-            tcg_out_modrm_offset(s, OPC_MOVL_EvGv + P_REXW + seg,
-                                 datalo, base, ofs);
+            tcg_out_modrm_offset(s, movop + P_REXW + seg, datalo, base, ofs);
         } else if (bswap) {
             tcg_out_mov(s, TCG_TYPE_I32, scratch, datahi);
             tcg_out_bswap32(s, scratch);
@@ -1529,8 +1567,13 @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
             tcg_out_bswap32(s, scratch);
             tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, scratch, base, ofs+4);
         } else {
-            tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, datalo, base, ofs);
-            tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, datahi, base, ofs+4);
+            if (real_bswap) {
+                int t = datalo;
+                datalo = datahi;
+                datahi = t;
+            }
+            tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
+            tcg_out_modrm_offset(s, movop + seg, datahi, base, ofs+4);
         }
         break;
     default:
@@ -2157,15 +2200,19 @@ static void tcg_target_qemu_prologue(TCGContext *s)

 static void tcg_target_init(TCGContext *s)
 {
-    /* For 32-bit, 99% certainty that we're running on hardware that supports
-       cmov, but we still need to check.  In case cmov is not available, we'll
-       use a small forward branch.  */
+    unsigned a, b, c, d;
+
+    if (__get_cpuid(1, &a, &b, &c, &d)) {
 #ifndef have_cmov
-    {
-        unsigned a, b, c, d;
-        have_cmov = (__get_cpuid(1, &a, &b, &c, &d) && (d & bit_CMOV));
-    }
+        /* For 32-bit, 99% certainty that we're running on hardware that
+           supports cmov, but we still need to check.  In case cmov is not
+           available, we'll use a small forward branch.  */
+        have_cmov = (d & bit_CMOV) != 0;
 #endif
+#ifndef have_movbe
+        have_movbe = (c & bit_MOVBE) != 0;
+#endif
+    }

     if (TCG_TARGET_REG_BITS == 64) {
         tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_I32], 0, 0xffff);
-- 
1.8.4.2
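For readers unfamiliar with the instruction: MOVBE is a load or store
that byte-swaps the data in flight, so a byte-swapped guest access on a
little-endian host becomes one instruction instead of a plain MOV
followed by BSWAP.  A minimal C sketch of the semantics the patch relies
on (the helper name is illustrative, not from the patch):

    #include <stdint.h>
    #include <string.h>

    /* What "movbe (mem), %eax" computes: a 32-bit load whose bytes are
       reversed on the way into the register.  */
    static uint32_t load32_byteswapped(const void *mem)
    {
        uint32_t v;
        memcpy(&v, mem, sizeof(v));     /* the plain MOV part */
        return __builtin_bswap32(v);    /* the swap MOVBE folds in */
    }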
* Re: [Qemu-devel] [PATCH] tcg-i386: Use MOVBE if available
  From: Paolo Bonzini @ 2013-12-21 14:08 UTC
  To: Richard Henderson; +Cc: qemu-devel

On 21/12/2013 00:00, Richard Henderson wrote:
> +        if (real_bswap && have_movbe) {
> +            tcg_out_modrm_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
> +                                 datalo, base, ofs);
> +            tcg_out_ext16u(s, datalo, datalo);

Do partial register stalls still exist on Atom and Haswell?  I don't
remember exactly what you had to do to prevent them, but IIRC you first
moved zero to the register and then overwrote the low 16 bits.

Paolo
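The idiom Paolo is recalling, sketched in C terms (the helper name is
hypothetical; the backend would emit the equivalent instructions, not C):

    /* Avoid a partial-register stall when only the low 16 bits are
       written: first write the full register with a zero idiom, then
       merge the halfword.  In instruction terms:
           xorl   %eax,%eax    ; full-width write, breaks the dependency
           movbew (mem),%ax    ; 16-bit MOVBE writes only the low half  */
    uint32_t r = 0;
    r |= load16_byteswapped(mem);   /* hypothetical 16-bit helper */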
* Re: [Qemu-devel] [PATCH] tcg-i386: Use MOVBE if available
  From: Aurelien Jarno @ 2013-12-22 12:24 UTC
  To: Paolo Bonzini; +Cc: qemu-devel, Richard Henderson

On Sat, Dec 21, 2013 at 03:08:21PM +0100, Paolo Bonzini wrote:
> On 21/12/2013 00:00, Richard Henderson wrote:
> > +        if (real_bswap && have_movbe) {
> > +            tcg_out_modrm_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
> > +                                 datalo, base, ofs);
> > +            tcg_out_ext16u(s, datalo, datalo);
>
> Do partial register stalls still exist on Atom and Haswell?  I don't
> remember exactly what you had to do to prevent them, but IIRC you first
> moved zero to the register and then overwrote the low 16 bits.

Note that for an unsigned 16-bit load you can do either movzw + bswap
or movbe + movzw.

-- 
Aurelien Jarno                          GPG: 1024D/F1BCDB73
aurelien@aurel32.net                 http://www.aurel32.net
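Both of those sequences compute the same value; a sketch of the common
semantics (the helper name is illustrative):

    #include <stdint.h>
    #include <string.h>

    /* Zero-extending big-endian 16-bit load.
       Sequence A (no MOVBE): movzwl (mem),%eax ; rolw $8,%ax
       Sequence B (MOVBE):    movbew (mem),%ax  ; movzwl %ax,%eax  */
    static uint32_t ld16_be(const void *mem)
    {
        uint16_t v;
        memcpy(&v, mem, sizeof(v));
        return (uint16_t)((v >> 8) | (v << 8));
    }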
* Re: [Qemu-devel] [PATCH] tcg-i386: Use MOVBE if available
  From: Paolo Bonzini @ 2013-12-22 13:39 UTC
  To: Aurelien Jarno; +Cc: qemu-devel, Richard Henderson

On 22/12/2013 13:24, Aurelien Jarno wrote:
> On Sat, Dec 21, 2013 at 03:08:21PM +0100, Paolo Bonzini wrote:
>> On 21/12/2013 00:00, Richard Henderson wrote:
>>> +        if (real_bswap && have_movbe) {
>>> +            tcg_out_modrm_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
>>> +                                 datalo, base, ofs);
>>> +            tcg_out_ext16u(s, datalo, datalo);
>>
>> Do partial register stalls still exist on Atom and Haswell?  I don't
>> remember exactly what you had to do to prevent them, but IIRC you first
>> moved zero to the register and then overwrote the low 16 bits.
>
> Note that for an unsigned 16-bit load you can do either movzw + bswap
> or movbe + movzw.

Yeah, I was asking whether xor + movbe would be faster.  Benchmarking
could tell, but in any case xor + movbe is likely the smallest code you
can produce.

Paolo
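Counting bytes with a simple base-register operand roughly bears that
out (opcode lengths taken from the Intel SDM opcode maps; worth
re-verifying before relying on them):

    /* movzwl (%rsi),%eax ; rolw $8,%ax       -- 3 + 4 = 7 bytes, partial write
       movbew (%rsi),%ax  ; movzwl %ax,%eax   -- 5 + 3 = 8 bytes
       xorl %eax,%eax     ; movbew (%rsi),%ax -- 2 + 5 = 7 bytes, full writes  */

So xor + movbe ties the rolw variant in size while writing only full
registers.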
* Re: [Qemu-devel] [PATCH] tcg-i386: Use MOVBE if available
  From: Richard Henderson @ 2013-12-22 16:38 UTC
  To: Aurelien Jarno, Paolo Bonzini; +Cc: qemu-devel

On 12/22/2013 04:24 AM, Aurelien Jarno wrote:
> On Sat, Dec 21, 2013 at 03:08:21PM +0100, Paolo Bonzini wrote:
>> On 21/12/2013 00:00, Richard Henderson wrote:
>>> +        if (real_bswap && have_movbe) {
>>> +            tcg_out_modrm_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
>>> +                                 datalo, base, ofs);
>>> +            tcg_out_ext16u(s, datalo, datalo);
>>
>> Do partial register stalls still exist on Atom and Haswell?  I don't
>> remember exactly what you had to do to prevent them, but IIRC you first
>> moved zero to the register and then overwrote the low 16 bits.
>
> Note that for an unsigned 16-bit load you can do either movzw + bswap
> or movbe + movzw.

From the July 2013 Intel Optimization Reference Manual, "Delay of
partial register stall is small in ... Intel Core and NetBurst
microarchitectures".  And for Atom, "partial register access does not
cause additional delay".

While I agree with Paolo that xor + movbe is probably technically the
best, one has to check for output register overlap and have a fallback.
Thus I think we can just discard that idea.

As for movzw + bswap, that forces a partial register stall on a
subsequent 32-bit access to the value, while movbe + movzw does not.
In the latter case we refer to the unmerged portion of the register in
the movzw.  But the optimization note suggests that it shouldn't matter
much either way.


r~
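A sketch of what the overlap check Richard mentions could look like in
the backend (not code from the patch; tgen_arithr/ARITH_XOR are assumed
to be the zero-idiom helpers used elsewhere in tcg-target.c):

    if (datalo != base) {
        /* Zero idiom first: a full-register write, so the 16-bit MOVBE
           below causes no partial-register stall.  */
        tgen_arithr(s, ARITH_XOR, datalo, datalo);
        tcg_out_modrm_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
                             datalo, base, ofs);
    } else {
        /* Destination overlaps the address base: fall back to
           zero-extending after the load instead.  */
        tcg_out_modrm_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
                             datalo, base, ofs);
        tcg_out_ext16u(s, datalo, datalo);
    }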
* Re: [Qemu-devel] [PATCH] tcg-i386: Use MOVBE if available
  From: Aurelien Jarno @ 2013-12-22 20:03 UTC
  To: Richard Henderson; +Cc: qemu-devel

On Fri, Dec 20, 2013 at 03:00:12PM -0800, Richard Henderson wrote:
> As present on Atom and Haswell processors.
>
> Signed-off-by: Richard Henderson <rth@twiddle.net>
> ---
>  disas/i386.c          |   8 ++--
>  tcg/i386/tcg-target.c | 127 ++++++++++++++++++++++++++++++++++----------------
>  2 files changed, 91 insertions(+), 44 deletions(-)
>
> diff --git a/disas/i386.c b/disas/i386.c
> index 47f1f2e..beb33f0 100644
> --- a/disas/i386.c
> +++ b/disas/i386.c
> @@ -2632,16 +2632,16 @@ static const struct dis386 prefix_user_table[][4] = {
>
>    /* PREGRP87 */
>    {
> -    { "(bad)",  { XX } },
> -    { "(bad)",  { XX } },
> +    { "movbe",  { Gv, M } },
> +    { "movbe",  { Gw, M } },

This is not correct; this disassembles movbe with the REPZ prefix.

>      { "(bad)",  { XX } },

You want it there; that slot is for the DATA prefix.

>      { "crc32",  { Gdq, { CRC32_Fixup, b_mode } } },
>    },
>
>    /* PREGRP88 */
>    {
> -    { "(bad)",  { XX } },
> -    { "(bad)",  { XX } },
> +    { "movbe",  { M, Gv } },
> +    { "movbe",  { M, Gw } },
>      { "(bad)",  { XX } },

Ditto.

>      { "crc32",  { Gdq, { CRC32_Fixup, v_mode } } },
>    },
> diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
> index 7ac8e45..e76f3f3 100644
> --- a/tcg/i386/tcg-target.c
> +++ b/tcg/i386/tcg-target.c
[...]
> +/* There is no pre-processor definition for MOVBE to fall back on.  */
> +#ifdef CONFIG_CPUID_H

As you already remarked, you should check for bit_MOVBE, as it was only
introduced in GCC 4.6.

> +static bool have_movbe;
> +#else
> +#define have_movbe 0

Very minor nitpick, but you probably want to indent the define, like
above for cmov.

> +#endif
[...]
> @@ -1336,7 +1355,14 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
>  {
> -    const TCGMemOp bswap = memop & MO_BSWAP;
> +    const TCGMemOp real_bswap = memop & MO_BSWAP;
> +    TCGMemOp bswap = real_bswap;
> +    int movop = OPC_MOVL_GvEv;
> +
> +    if (real_bswap && have_movbe) {
> +        bswap = 0;
> +        movop = OPC_MOVBE_GyMy;
> +    }

Defining movop is clever; that said, the name real_bswap doesn't sound
very self-describing.  Moreover, you still need to check for have_movbe
below...

>      case MO_UW:
> -        tcg_out_modrm_offset(s, OPC_MOVZWL + seg, datalo, base, ofs);
> -        if (bswap) {
> -            tcg_out_rolw_8(s, datalo);
> +        if (real_bswap && have_movbe) {
> +            tcg_out_modrm_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
> +                                 datalo, base, ofs);
> +            tcg_out_ext16u(s, datalo, datalo);
> +        } else {
> +            tcg_out_modrm_offset(s, OPC_MOVZWL + seg, datalo, base, ofs);
> +            if (bswap) {
> +                tcg_out_rolw_8(s, datalo);
> +            }
>          }

What about keeping the bswap definition like before and doing something
like:

    if (bswap && have_movbe) {
        tcg_out_modrm_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
                             datalo, base, ofs);
        tcg_out_ext16u(s, datalo, datalo);
    } else {
        tcg_out_modrm_offset(s, OPC_MOVZWL + seg, datalo, base, ofs);
        if (bswap) {
            tcg_out_rolw_8(s, datalo);
        }
    }

[...]
>      case MO_UL:
> -        tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg, datalo, base, ofs);
> +        tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
>          if (bswap) {
>              tcg_out_bswap32(s, datalo);
>          }

And here:

    tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
    if (bswap && !have_movbe) {
        tcg_out_bswap32(s, datalo);
    }

>          break;
>  #if TCG_TARGET_REG_BITS == 64
>      case MO_SL:
> -        if (bswap) {
> -            tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg, datalo, base, ofs);
> -            tcg_out_bswap32(s, datalo);
> +        if (real_bswap) {
> +            tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
> +            if (bswap) {
> +                tcg_out_bswap32(s, datalo);
> +            }
>              tcg_out_ext32s(s, datalo, datalo);
>          } else {

And here:

    if (bswap) {
        tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
        if (!have_movbe) {
            tcg_out_bswap32(s, datalo);
        }
        tcg_out_ext32s(s, datalo, datalo);
    } else {

>              tcg_out_modrm_offset(s, OPC_MOVSLQ + seg, datalo, base, ofs);
[...]
> @@ -1476,13 +1510,19 @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
>      const TCGReg scratch = TCG_REG_L0;
> +    const TCGMemOp real_bswap = memop & MO_BSWAP;
> +    TCGMemOp bswap = real_bswap;
> +    int movop = OPC_MOVL_EvGv;
> +
> +    if (real_bswap && have_movbe) {
> +        bswap = 0;
> +        movop = OPC_MOVBE_MyGy;
> +    }

Ditto here.

[...]
> @@ -2157,15 +2200,19 @@ static void tcg_target_qemu_prologue(TCGContext *s)
>
>  static void tcg_target_init(TCGContext *s)
>  {
> -    /* For 32-bit, 99% certainty that we're running on hardware that supports
> -       cmov, but we still need to check.  In case cmov is not available, we'll
> -       use a small forward branch.  */
> +    unsigned a, b, c, d;
> +
> +    if (__get_cpuid(1, &a, &b, &c, &d)) {
>  #ifndef have_cmov
> -    {
> -        unsigned a, b, c, d;
> -        have_cmov = (__get_cpuid(1, &a, &b, &c, &d) && (d & bit_CMOV));
> -    }
> +        /* For 32-bit, 99% certainty that we're running on hardware that
> +           supports cmov, but we still need to check.  In case cmov is not
> +           available, we'll use a small forward branch.  */
> +        have_cmov = (d & bit_CMOV) != 0;
>  #endif
> +#ifndef have_movbe
> +        have_movbe = (c & bit_MOVBE) != 0;
> +#endif
> +    }

Your code doesn't check the result of __get_cpuid anymore to initialize
have_cmov or have_movbe.  While the C standard mandates that static
variables are initialized to 0, it might be a good idea to do so
explicitly.

>      if (TCG_TARGET_REG_BITS == 64) {
>          tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_I32], 0, 0xffff);

Otherwise looks good.

-- 
Aurelien Jarno                          GPG: 1024D/F1BCDB73
aurelien@aurel32.net                 http://www.aurel32.net
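Folding those review points together, the detection could end up looking
something like this (a sketch, not the committed version; the bit_MOVBE
guard matters because <cpuid.h> only defines it from GCC 4.6 onward):

    unsigned a, b, c, d;
    /* Check the cpuid result explicitly; if it fails, both flags are
       left at their zero-initialized default of false.  */
    int ok = __get_cpuid(1, &a, &b, &c, &d);

    #ifndef have_cmov
        have_cmov = ok && (d & bit_CMOV);
    #endif
    #if defined(bit_MOVBE) && !defined(have_movbe)
        have_movbe = ok && (c & bit_MOVBE);
    #endif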