From: Aurelien Jarno <aurelien@aurel32.net>
To: Richard Henderson <rth@twiddle.net>
Cc: qemu-devel@nongnu.org
Subject: Re: [Qemu-devel] [PATCH 7/7] tcg-i386: Perform tail call to qemu_ret_ld*_mmu
Date: Thu, 29 Aug 2013 19:06:22 +0200
Message-ID: <20130829170622.GP5908@ohm.aurel32.net>
In-Reply-To: <1377639991-16028-8-git-send-email-rth@twiddle.net>
On Tue, Aug 27, 2013 at 02:46:31PM -0700, Richard Henderson wrote:
> This does require the fast path always load to the function return
> value register, but apparently the loaded value usually needs to be
> spilled back to its memory slot anyway so the change in register
> does not really change much.
My suggestion was to do that only for the stores, not for the loads.
Forcing the loads to target a single call-clobbered register means we
are going to generate a lot more code, both to spill the value
currently held in that register and to reload it later.
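For concreteness, the change in question is the "r" -> "a" switch in
the constraint tables at the bottom of the patch, which pins every
qemu_ld output to the accumulator instead of leaving the choice to the
register allocator:

  /* Before: the output may land in any register ("r").  */
  { INDEX_op_qemu_ld32, { "r", "L" } },
  /* After: the output is pinned to EAX/RAX ("a").       */
  { INDEX_op_qemu_ld32, { "a", "L" } },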
You might argue that if the value is a global, it has to be saved back
to memory anyway, but spilling it instead of saving it back means it
has to be reloaded later. Here is an example from the first few TBs
(translation blocks) of SeaBIOS, with the current code on the left and
the patched code on the right:
0x7fd21c18d241: mov %ebp,%edi                  0x7fcade58f241: mov %ebp,%edi
0x7fd21c18d243: mov %ebp,%esi                  0x7fcade58f243: mov %ebp,%esi
0x7fd21c18d245: shr $0x7,%edi                  0x7fcade58f245: shr $0x7,%edi
0x7fd21c18d248: and $0xfffff001,%esi           0x7fcade58f248: and $0xfffff001,%esi
0x7fd21c18d24e: and $0x1fe0,%edi               0x7fcade58f24e: and $0x1fe0,%edi
0x7fd21c18d254: lea 0x380(%r14,%rdi,1),%rdi    0x7fcade58f254: lea 0x380(%r14,%rdi,1),%rdi
0x7fd21c18d25c: cmp (%rdi),%esi                0x7fcade58f25c: cmp (%rdi),%esi
0x7fd21c18d25e: mov %ebp,%esi                  0x7fcade58f25e: mov %ebp,%esi
0x7fd21c18d260: jne 0x7fd21c18d384             0x7fcade58f260: jne 0x7fcade58f3a0
0x7fd21c18d266: add 0x10(%rdi),%rsi            0x7fcade58f266: add 0x10(%rdi),%rsi
0x7fd21c18d26a: movzwl (%rsi),%ebx             0x7fcade58f26a: movzwl (%rsi),%eax
0x7fd21c18d26d: add $0x2,%ebp                  0x7fcade58f26d: add $0x2,%ebp
                                               0x7fcade58f270: mov %eax,0x80(%rsp)
Here %eax needs to be spilled: its value is still live, but the next
qemu_ld operation must also deliver its result in %eax.
0x7fd21c18d270: mov %ebp,%edi                  0x7fcade58f277: mov %ebp,%edi
0x7fd21c18d272: mov %ebp,%esi                  0x7fcade58f279: mov %ebp,%esi
0x7fd21c18d274: shr $0x7,%edi                  0x7fcade58f27b: shr $0x7,%edi
0x7fd21c18d277: and $0xfffff003,%esi           0x7fcade58f27e: and $0xfffff003,%esi
0x7fd21c18d27d: and $0x1fe0,%edi               0x7fcade58f284: and $0x1fe0,%edi
0x7fd21c18d283: lea 0x380(%r14,%rdi,1),%rdi    0x7fcade58f28a: lea 0x380(%r14,%rdi,1),%rdi
0x7fd21c18d28b: cmp (%rdi),%esi                0x7fcade58f292: cmp (%rdi),%esi
0x7fd21c18d28d: mov %ebp,%esi                  0x7fcade58f294: mov %ebp,%esi
0x7fd21c18d28f: jne 0x7fd21c18d39d             0x7fcade58f296: jne 0x7fcade58f3b2
0x7fd21c18d295: add 0x10(%rdi),%rsi            0x7fcade58f29c: add 0x10(%rdi),%rsi
0x7fd21c18d299: mov (%rsi),%ebp                0x7fcade58f2a0: mov (%rsi),%eax
0x7fd21c18d29b: and $0xffffff,%ebp             0x7fcade58f2a2: and $0xffffff,%eax
0x7fd21c18d2a1: mov %ebp,0xd8(%r14)            0x7fcade58f2a8: mov %eax,0xd8(%r14)
                                               0x7fcade58f2b6: mov %ebp,0xdc(%r14)
And here it has to be reloaded from memory.
0x7fd21c18d2af: mov 0x58(%r14),%ebp            0x7fcade58f2af: mov 0x80(%rsp),%ebp
0x7fd21c18d2a8: mov %ebx,0xdc(%r14)            0x7fcade58f2bd: mov 0x58(%r14),%ebp
Overall it even makes the TB bigger. Since the fast path is what really
matters and is executed far more often than the slow path, we should
not pessimize the fast path just to get a smaller slow path.
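
To make the suggestion concrete, here is a rough sketch (untested, and
ignoring the 32-bit-host and 64-bit-data cases) of the split I have in
mind: keep the push + jmp tail call for the store slow path, where
there is no return value to place, and keep a plain call for the load
slow path so the result can be moved into whatever register the
allocator picked. With the signed helpers from patch 6/7, the old
extension switch collapses to a single mov:

  /* Store slow path: nothing comes back, so the tail call is free.  */
  tcg_out_push(s, retaddr);
  tcg_out_jmp(s, (tcg_target_long)qemu_st_helpers[s_bits]);

  /* Load slow path: plain call, then place the already-extended
     result wherever the allocator wants it ("r" constraint kept).   */
  tcg_out_calli(s, (tcg_target_long)qemu_ld_helpers[opc]);
  tcg_out_mov(s, TCG_TYPE_I32, l->datalo_reg, TCG_REG_EAX);
  tcg_out_jmp(s, (tcg_target_long)l->raddr);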
> Signed-off-by: Richard Henderson <rth@twiddle.net>
> ---
> tcg/i386/tcg-target.c | 107 ++++++++++++++++++--------------------------------
> 1 file changed, 39 insertions(+), 68 deletions(-)
>
> diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
> index 5aee0fa..b1d05b8 100644
> --- a/tcg/i386/tcg-target.c
> +++ b/tcg/i386/tcg-target.c
> @@ -1025,11 +1025,20 @@ static void tcg_out_jmp(TCGContext *s, tcg_target_long dest)
> /* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
> * int mmu_idx, uintptr_t ra)
> */
> -static const void * const qemu_ld_helpers[4] = {
> +static const void * const qemu_ld_helpers[8] = {
> helper_ret_ldub_mmu,
> helper_ret_lduw_mmu,
> helper_ret_ldul_mmu,
> helper_ret_ldq_mmu,
> +
> + helper_ret_ldsb_mmu,
> + helper_ret_ldsw_mmu,
> +#if TCG_TARGET_REG_BITS == 64
> + helper_ret_ldsl_mmu,
> +#else
> + helper_ret_ldul_mmu,
> +#endif
> + helper_ret_ldq_mmu,
> };
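
A side note on the indexing this enables: the old code further down
extracts s_bits with `opc & 3` and tests bit 2 for signedness (the
`0 | 4` case labels), so with the signed variants appended the table
can simply be indexed by the full opc. My reading of the layout:

  /* opc layout, as implied by the old switch below (my annotation):
     bits 0..1: log2(access size); bit 2: sign-extend the result.  */
  int s_bits = opc & 3;            /* 0=8, 1=16, 2=32, 3=64 bits   */
  int is_signed = opc & 4;         /* selects entries 4..7         */
  const void *helper = qemu_ld_helpers[opc];   /* opc in [0, 7]    */
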
>
> /* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr,
> @@ -1473,9 +1482,8 @@ static void add_qemu_ldst_label(TCGContext *s,
> static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
> {
> int opc = l->opc;
> - int s_bits = opc & 3;
> - TCGReg data_reg;
> uint8_t **label_ptr = &l->label_ptr[0];
> + TCGReg retaddr;
>
> /* resolve label address */
> *(uint32_t *)label_ptr[0] = (uint32_t)(s->code_ptr - label_ptr[0] - 4);
> @@ -1500,58 +1508,21 @@ static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
> tcg_out_sti(s, TCG_TYPE_I32, TCG_REG_ESP, ofs, l->mem_index);
> ofs += 4;
>
> - tcg_out_sti(s, TCG_TYPE_I32, TCG_REG_ESP, ofs, (uintptr_t)l->raddr);
> + retaddr = TCG_REG_EAX;
> + tcg_out_movi(s, TCG_TYPE_I32, retaddr, (uintptr_t)l->raddr);
> + tcg_out_st(s, TCG_TYPE_I32, retaddr, TCG_REG_ESP, ofs);
> } else {
> tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
> /* The second argument is already loaded with addrlo. */
> tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2],
> l->mem_index);
> - tcg_out_movi(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[3],
> - (uintptr_t)l->raddr);
> - }
> -
> - tcg_out_calli(s, (tcg_target_long)qemu_ld_helpers[s_bits]);
> -
> - data_reg = l->datalo_reg;
> - switch(opc) {
> - case 0 | 4:
> - tcg_out_ext8s(s, data_reg, TCG_REG_EAX, P_REXW);
> - break;
> - case 1 | 4:
> - tcg_out_ext16s(s, data_reg, TCG_REG_EAX, P_REXW);
> - break;
> - case 0:
> - tcg_out_ext8u(s, data_reg, TCG_REG_EAX);
> - break;
> - case 1:
> - tcg_out_ext16u(s, data_reg, TCG_REG_EAX);
> - break;
> - case 2:
> - tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
> - break;
> -#if TCG_TARGET_REG_BITS == 64
> - case 2 | 4:
> - tcg_out_ext32s(s, data_reg, TCG_REG_EAX);
> - break;
> -#endif
> - case 3:
> - if (TCG_TARGET_REG_BITS == 64) {
> - tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_RAX);
> - } else if (data_reg == TCG_REG_EDX) {
> - /* xchg %edx, %eax */
> - tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0);
> - tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EAX);
> - } else {
> - tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
> - tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EDX);
> - }
> - break;
> - default:
> - tcg_abort();
> + retaddr = tcg_target_call_iarg_regs[3];
> + tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
> }
>
> - /* Jump to the code corresponding to next IR of qemu_st */
> - tcg_out_jmp(s, (tcg_target_long)l->raddr);
> + /* "Tail call" to the helper, with the return address back inline. */
> + tcg_out_push(s, retaddr);
> + tcg_out_jmp(s, (tcg_target_long)qemu_ld_helpers[opc]);
> }
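
For what it's worth, the mechanism itself is neat: as I read this hunk,
the emitted slow path now reduces to two instructions, and the helper's
own ret returns straight into the TB with the result already extended
in EAX/RAX, which is exactly why the ld outputs get pinned to "a"
below. My annotation of what gets emitted:

  /* Slow path after this patch (annotated reconstruction):
   *
   *     push  $l->raddr            ; fake the call's return address
   *     jmp   helper_ret_ldX_mmu   ; the helper's ret pops l->raddr
   *
   * The helper leaves the sign- or zero-extended result in EAX/RAX
   * and "returns" directly to l->raddr inside the TB.
   */
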
>
> /*
> @@ -2125,38 +2096,38 @@ static const TCGTargetOpDef x86_op_defs[] = {
> #endif
>
> #if TCG_TARGET_REG_BITS == 64
> - { INDEX_op_qemu_ld8u, { "r", "L" } },
> - { INDEX_op_qemu_ld8s, { "r", "L" } },
> - { INDEX_op_qemu_ld16u, { "r", "L" } },
> - { INDEX_op_qemu_ld16s, { "r", "L" } },
> - { INDEX_op_qemu_ld32, { "r", "L" } },
> - { INDEX_op_qemu_ld32u, { "r", "L" } },
> - { INDEX_op_qemu_ld32s, { "r", "L" } },
> - { INDEX_op_qemu_ld64, { "r", "L" } },
> + { INDEX_op_qemu_ld8u, { "a", "L" } },
> + { INDEX_op_qemu_ld8s, { "a", "L" } },
> + { INDEX_op_qemu_ld16u, { "a", "L" } },
> + { INDEX_op_qemu_ld16s, { "a", "L" } },
> + { INDEX_op_qemu_ld32, { "a", "L" } },
> + { INDEX_op_qemu_ld32u, { "a", "L" } },
> + { INDEX_op_qemu_ld32s, { "a", "L" } },
> + { INDEX_op_qemu_ld64, { "a", "L" } },
>
> { INDEX_op_qemu_st8, { "L", "L" } },
> { INDEX_op_qemu_st16, { "L", "L" } },
> { INDEX_op_qemu_st32, { "L", "L" } },
> { INDEX_op_qemu_st64, { "L", "L" } },
> #elif TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
> - { INDEX_op_qemu_ld8u, { "r", "L" } },
> - { INDEX_op_qemu_ld8s, { "r", "L" } },
> - { INDEX_op_qemu_ld16u, { "r", "L" } },
> - { INDEX_op_qemu_ld16s, { "r", "L" } },
> - { INDEX_op_qemu_ld32, { "r", "L" } },
> - { INDEX_op_qemu_ld64, { "r", "r", "L" } },
> + { INDEX_op_qemu_ld8u, { "a", "L" } },
> + { INDEX_op_qemu_ld8s, { "a", "L" } },
> + { INDEX_op_qemu_ld16u, { "a", "L" } },
> + { INDEX_op_qemu_ld16s, { "a", "L" } },
> + { INDEX_op_qemu_ld32, { "a", "L" } },
> + { INDEX_op_qemu_ld64, { "a", "d", "L" } },
>
> { INDEX_op_qemu_st8, { "cb", "L" } },
> { INDEX_op_qemu_st16, { "L", "L" } },
> { INDEX_op_qemu_st32, { "L", "L" } },
> { INDEX_op_qemu_st64, { "L", "L", "L" } },
> #else
> - { INDEX_op_qemu_ld8u, { "r", "L", "L" } },
> - { INDEX_op_qemu_ld8s, { "r", "L", "L" } },
> - { INDEX_op_qemu_ld16u, { "r", "L", "L" } },
> - { INDEX_op_qemu_ld16s, { "r", "L", "L" } },
> - { INDEX_op_qemu_ld32, { "r", "L", "L" } },
> - { INDEX_op_qemu_ld64, { "r", "r", "L", "L" } },
> + { INDEX_op_qemu_ld8u, { "a", "L", "L" } },
> + { INDEX_op_qemu_ld8s, { "a", "L", "L" } },
> + { INDEX_op_qemu_ld16u, { "a", "L", "L" } },
> + { INDEX_op_qemu_ld16s, { "a", "L", "L" } },
> + { INDEX_op_qemu_ld32, { "a", "L", "L" } },
> + { INDEX_op_qemu_ld64, { "a", "d", "L", "L" } },
>
> { INDEX_op_qemu_st8, { "cb", "L", "L" } },
> { INDEX_op_qemu_st16, { "L", "L", "L" } },
> --
> 1.8.1.4
--
Aurelien Jarno                          GPG: 1024D/F1BCDB73
aurelien@aurel32.net                 http://www.aurel32.net