Re: [Qemu-devel] [PATCH 05/21] tcg-i386: Tidy bswap operations.

qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed

From: Aurelien Jarno <aurelien@aurel32.net>
To: Richard Henderson <rth@twiddle.net>
Cc: qemu-devel@nongnu.org
Subject: Re: [Qemu-devel] [PATCH 05/21] tcg-i386: Tidy bswap operations.
Date: Mon, 19 Apr 2010 00:13:02 +0200	[thread overview]
Message-ID: <20100418221302.GA26784@volta.aurel32.net> (raw)
In-Reply-To: <e36e7f72dbd7724145773f759814a3c0a184c667.1271277329.git.rth@twiddle.net>

On Tue, Apr 13, 2010 at 04:33:59PM -0700, Richard Henderson wrote:
> Define OPC_BSWAP.  Factor opcode emission to separate functions.
> Use bswap+shift to implement 16-bit swap instead of a rolw; this
> gets the proper zero-extension required by INDEX_op_bswap16_i32.

This is not required by INDEX_op_bswap16_i32. What is needed is that the
value in the input register has the 16 upper bits set to 0. Considering
that, the rolw instruction is faster than bswap + shift.

> Signed-off-by: Richard Henderson <rth@twiddle.net>
> ---
>  tcg/i386/tcg-target.c |   53 +++++++++++++++++++++++++------------------------
>  1 files changed, 27 insertions(+), 26 deletions(-)
> 
> diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
> index 75b9915..0bafd00 100644
> --- a/tcg/i386/tcg-target.c
> +++ b/tcg/i386/tcg-target.c
> @@ -163,6 +163,7 @@ static inline int tcg_target_const_match(tcg_target_long val,
>  
>  #define P_EXT   0x100 /* 0x0f opcode prefix */
>  
> +#define OPC_BSWAP	(0xc8 | P_EXT)
>  #define OPC_MOVZBL	(0xb6 | P_EXT)
>  #define OPC_MOVZWL	(0xb7 | P_EXT)
>  #define OPC_MOVSBL	(0xbe | P_EXT)

> @@ -339,6 +340,22 @@ static inline void tcg_out_ext16s(TCGContext *s, int dest, int src)
>      tcg_out_modrm(s, OPC_MOVSWL, dest, src);
>  }
>  
> +static inline void tcg_out_bswap32(TCGContext *s, int reg)
> +{
> +    tcg_out_opc(s, OPC_BSWAP + reg);
> +}
> +
> +static inline void tcg_out_bswap16(TCGContext *s, int reg, int sign)
> +{
> +    /* This swap+shift combination guarantees that the high part contains
> +       the sign or zero extension required.  It also doesn't suffer the
> +       problem of partial register stalls that using rolw does.  */
> +    tcg_out_bswap32(s, reg);
> +    /* shr $16, dest */
> +    tcg_out_modrm(s, 0xc1, (sign ? SHIFT_SAR : SHIFT_SHR), reg);
> +    tcg_out8(s, 16);
> +}
> +
>  static inline void tgen_arithi(TCGContext *s, int c, int r0, int32_t val, int cf)
>  {
>      if (!cf && ((c == ARITH_ADD && val == 1) || (c == ARITH_SUB && val == -1))) {
> @@ -745,31 +762,21 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
>          /* movzwl */
>          tcg_out_modrm_offset(s, OPC_MOVZWL, data_reg, r0, GUEST_BASE);
>          if (bswap) {
> -            /* rolw $8, data_reg */
> -            tcg_out8(s, 0x66); 
> -            tcg_out_modrm(s, 0xc1, 0, data_reg);
> -            tcg_out8(s, 8);
> +            tcg_out_bswap16(s, data_reg, 0);
>          }
>          break;
>      case 1 | 4:
>          /* movswl */
>          tcg_out_modrm_offset(s, OPC_MOVSWL, data_reg, r0, GUEST_BASE);
>          if (bswap) {
> -            /* rolw $8, data_reg */
> -            tcg_out8(s, 0x66); 
> -            tcg_out_modrm(s, 0xc1, 0, data_reg);
> -            tcg_out8(s, 8);
> -
> -            /* movswl data_reg, data_reg */
> -            tcg_out_modrm(s, OPC_MOVSWL, data_reg, data_reg);
> +            tcg_out_bswap16(s, data_reg, 1);
>          }
>          break;
>      case 2:
>          /* movl (r0), data_reg */
>          tcg_out_modrm_offset(s, 0x8b, data_reg, r0, GUEST_BASE);
>          if (bswap) {
> -            /* bswap */
> -            tcg_out_opc(s, (0xc8 + data_reg) | P_EXT);
> +            tcg_out_bswap32(s, data_reg);
>          }
>          break;
>      case 3:
> @@ -786,11 +793,10 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
>              tcg_out_modrm_offset(s, 0x8b, data_reg2, r0, GUEST_BASE + 4);
>          } else {
>              tcg_out_modrm_offset(s, 0x8b, data_reg, r0, GUEST_BASE + 4);
> -            tcg_out_opc(s, (0xc8 + data_reg) | P_EXT);
> +            tcg_out_bswap32(s, data_reg);
>  
>              tcg_out_modrm_offset(s, 0x8b, data_reg2, r0, GUEST_BASE);
> -            /* bswap */
> -            tcg_out_opc(s, (0xc8 + data_reg2) | P_EXT);
> +            tcg_out_bswap32(s, data_reg2);
>          }
>          break;
>      default:
> @@ -982,8 +988,7 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
>      case 2:
>          if (bswap) {
>              tcg_out_mov(s, r1, data_reg);
> -            /* bswap data_reg */
> -            tcg_out_opc(s, (0xc8 + r1) | P_EXT);
> +            tcg_out_bswap32(s, r1);
>              data_reg = r1;
>          }
>          /* movl */
> @@ -992,12 +997,10 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
>      case 3:
>          if (bswap) {
>              tcg_out_mov(s, r1, data_reg2);
> -            /* bswap data_reg */
> -            tcg_out_opc(s, (0xc8 + r1) | P_EXT);
> +            tcg_out_bswap32(s, r1);
>              tcg_out_modrm_offset(s, 0x89, r1, r0, GUEST_BASE);
>              tcg_out_mov(s, r1, data_reg);
> -            /* bswap data_reg */
> -            tcg_out_opc(s, (0xc8 + r1) | P_EXT);
> +            tcg_out_bswap32(s, r1);
>              tcg_out_modrm_offset(s, 0x89, r1, r0, GUEST_BASE + 4);
>          } else {
>              tcg_out_modrm_offset(s, 0x89, data_reg, r0, GUEST_BASE);
> @@ -1195,12 +1198,10 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
>          break;
>  
>      case INDEX_op_bswap16_i32:
> -        tcg_out8(s, 0x66);
> -        tcg_out_modrm(s, 0xc1, SHIFT_ROL, args[0]);
> -        tcg_out8(s, 8);
> +        tcg_out_bswap16(s, args[0], 0);
>          break;
>      case INDEX_op_bswap32_i32:
> -        tcg_out_opc(s, (0xc8 + args[0]) | P_EXT);
> +        tcg_out_bswap32(s, args[0]);
>          break;
>  
>      case INDEX_op_neg_i32:
> -- 
> 1.6.2.5
> 
> 
> 
> 

-- 
Aurelien Jarno                          GPG: 1024D/F1BCDB73
aurelien@aurel32.net                 http://www.aurel32.net

next prev parent reply	other threads:[~2010-04-18 22:13 UTC|newest]

Thread overview: 26+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2010-04-14 20:35 [Qemu-devel] [PATCH 00/21] tcg-i386 cleanup and improvement Richard Henderson
2010-04-13 22:23 ` [Qemu-devel] [PATCH 01/21] tcg-i386: Allocate call-saved registers first Richard Henderson
2010-04-13 22:26 ` [Qemu-devel] [PATCH 02/21] tcg-i386: Tidy initialization of tcg_target_call_clobber_regs Richard Henderson
2010-04-13 22:59 ` [Qemu-devel] [PATCH 03/21] tcg-i386: Tidy ext8u and ext16u operations Richard Henderson
2010-04-13 23:13 ` [Qemu-devel] [PATCH 04/21] tcg-i386: Tidy ext8s and ext16s operations Richard Henderson
2010-04-13 23:33 ` [Qemu-devel] [PATCH 05/21] tcg-i386: Tidy bswap operations Richard Henderson
2010-04-18 22:13   ` Aurelien Jarno [this message]
2010-04-19 13:56     ` Richard Henderson
2010-04-19 16:05       ` malc
2010-04-19 19:19         ` Richard Henderson
2010-04-13 23:44 ` [Qemu-devel] [PATCH 06/21] tcg-i386: Tidy shift operations Richard Henderson
2010-04-14 14:58 ` [Qemu-devel] [PATCH 07/21] tcg-i386: Tidy move operations Richard Henderson
2010-04-14 15:06 ` [Qemu-devel] [PATCH 08/21] tcg-i386: Eliminate extra move from qemu_ld64 Richard Henderson
2010-04-14 15:26 ` [Qemu-devel] [PATCH 09/21] tcg-i386: Tidy jumps Richard Henderson
2010-04-14 15:38 ` [Qemu-devel] [PATCH 10/21] tcg-i386: Tidy immediate arithmetic operations Richard Henderson
2010-04-14 17:16 ` [Qemu-devel] [PATCH 11/21] tcg-i386: Tidy non-immediate " Richard Henderson
2010-04-14 17:20 ` [Qemu-devel] [PATCH 12/21] tcg-i386: Tidy movi Richard Henderson
2010-04-14 17:59 ` [Qemu-devel] [PATCH 13/21] tcg-i386: Tidy push/pop Richard Henderson
2010-04-14 18:02 ` [Qemu-devel] [PATCH 14/21] tcg-i386: Tidy calls Richard Henderson
2010-04-14 18:04 ` [Qemu-devel] [PATCH 15/21] tcg-i386: Tidy ret Richard Henderson
2010-04-14 18:07 ` [Qemu-devel] [PATCH 16/21] tcg-i386: Tidy setcc Richard Henderson
2010-04-14 18:22 ` [Qemu-devel] [PATCH 17/21] tcg-i386: Tidy unary arithmetic Richard Henderson
2010-04-14 18:29 ` [Qemu-devel] [PATCH 18/21] tcg-i386: Tidy multiply Richard Henderson
2010-04-14 18:32 ` [Qemu-devel] [PATCH 19/21] tcg-i386: Tidy xchg Richard Henderson
2010-04-14 19:08 ` [Qemu-devel] [PATCH 20/21] tcg-i386: Tidy lea Richard Henderson
2010-04-14 20:29 ` [Qemu-devel] [PATCH 21/21] tcg-i386: Use lea for three-operand add Richard Henderson

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20100418221302.GA26784@volta.aurel32.net \
    --to=aurelien@aurel32.net \
    --cc=qemu-devel@nongnu.org \
    --cc=rth@twiddle.net \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).