Re: [Qemu-devel] [PATCH v6 16/20] tcg-arm: Improve scheduling of tcg_out_tlb_read

qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed

From: Aurelien Jarno <aurelien@aurel32.net>
To: Richard Henderson <rth@twiddle.net>
Cc: qemu-devel@nongnu.org
Subject: Re: [Qemu-devel] [PATCH v6 16/20] tcg-arm: Improve scheduling of tcg_out_tlb_read
Date: Wed, 24 Apr 2013 09:43:03 +0200	[thread overview]
Message-ID: <20130424074303.GF5000@ohm.aurel32.net> (raw)
In-Reply-To: <1366750012-25015-17-git-send-email-rth@twiddle.net>

On Tue, Apr 23, 2013 at 01:46:48PM -0700, Richard Henderson wrote:
> The schedule was fully serial, with no possibility for dual issue.
> The old schedule had a minimal issue of 7 cycles; the new schedule
> has a minimal issue of 5 cycles.
> 
> Signed-off-by: Richard Henderson <rth@twiddle.net>
> ---
>  tcg/arm/tcg-target.c | 110 ++++++++++++++++++++++++++-------------------------
>  1 file changed, 57 insertions(+), 53 deletions(-)
> 
> diff --git a/tcg/arm/tcg-target.c b/tcg/arm/tcg-target.c
> index a96471c..375c1e1 100644
> --- a/tcg/arm/tcg-target.c
> +++ b/tcg/arm/tcg-target.c
> @@ -182,18 +182,12 @@ static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str)
>          ct->ct |= TCG_CT_REG;
>          tcg_regset_set32(ct->u.regs, 0, (1 << TCG_TARGET_NB_REGS) - 1);
>  #ifdef CONFIG_SOFTMMU
> -        /* r0 and r1 will be overwritten when reading the tlb entry,
> +        /* r0-r2 will be overwritten when reading the tlb entry,
>             so don't use these. */
>          tcg_regset_reset_reg(ct->u.regs, TCG_REG_R0);
>          tcg_regset_reset_reg(ct->u.regs, TCG_REG_R1);
> -#if TARGET_LONG_BITS == 64
> -        /* If we're passing env to the helper as r0 and need a regpair
> -         * for the address then r2 will be overwritten as we're setting
> -         * up the args to the helper.
> -         */
>          tcg_regset_reset_reg(ct->u.regs, TCG_REG_R2);
>  #endif
> -#endif
>          break;
>      case 'L':
>          ct->ct |= TCG_CT_REG;
> @@ -207,30 +201,16 @@ static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str)
>  
>      /* qemu_st address & data_reg */
>      case 's':
> -        ct->ct |= TCG_CT_REG;
> -        tcg_regset_set32(ct->u.regs, 0, (1 << TCG_TARGET_NB_REGS) - 1);
> -        /* r0 and r1 will be overwritten when reading the tlb entry
> -           (softmmu only) and doing the byte swapping, so don't
> -           use these. */
> -        tcg_regset_reset_reg(ct->u.regs, TCG_REG_R0);
> -        tcg_regset_reset_reg(ct->u.regs, TCG_REG_R1);
> -#if defined(CONFIG_SOFTMMU) && (TARGET_LONG_BITS == 64)
> -        /* Avoid clashes with registers being used for helper args */
> -        tcg_regset_reset_reg(ct->u.regs, TCG_REG_R2);
> -        tcg_regset_reset_reg(ct->u.regs, TCG_REG_R3);
> -#endif
> -        break;
>      /* qemu_st64 data_reg2 */
>      case 'S':
>          ct->ct |= TCG_CT_REG;
>          tcg_regset_set32(ct->u.regs, 0, (1 << TCG_TARGET_NB_REGS) - 1);
> -        /* r0 and r1 will be overwritten when reading the tlb entry
> -            (softmmu only) and doing the byte swapping, so don't
> -            use these. */
> +        /* r0-r2 will be overwritten when reading the tlb entry (softmmu only)
> +           and r0-r1 doing the byte swapping, so don't use these. */
>          tcg_regset_reset_reg(ct->u.regs, TCG_REG_R0);
>          tcg_regset_reset_reg(ct->u.regs, TCG_REG_R1);
> -#ifdef CONFIG_SOFTMMU
> -        /* r2 is still needed to load data_reg, so don't use it. */
> +#if defined(CONFIG_SOFTMMU)
> +        /* Avoid clashes with registers being used for helper args */
>          tcg_regset_reset_reg(ct->u.regs, TCG_REG_R2);
>  #if TARGET_LONG_BITS == 64
>          /* Avoid clashes with registers being used for helper args */
> @@ -347,6 +327,8 @@ typedef enum {
>      INSN_LDRSB_REG = 0x001000d0,
>      INSN_STRB_IMM  = 0x04400000,
>      INSN_STRB_REG  = 0x06400000,
> +
> +    INSN_LDRD_IMM  = 0x004000d0,
>  } ARMInsn;
>  
>  #define SHIFT_IMM_LSL(im)	(((im) << 7) | 0x00)
> @@ -805,15 +787,6 @@ static inline void tcg_out_ld32_12(TCGContext *s, int cond, TCGReg rt,
>      tcg_out_memop_12(s, cond, INSN_LDR_IMM, rt, rn, imm12, 1, 0);
>  }
>  
> -/* Offset pre-increment with base writeback.  */
> -static inline void tcg_out_ld32_12wb(TCGContext *s, int cond, TCGReg rt,
> -                                     TCGReg rn, int imm12)
> -{
> -    /* ldr with writeback and both register equals is UNPREDICTABLE */
> -    assert(rd != rn);
> -    tcg_out_memop_12(s, cond, INSN_LDR_IMM, rt, rn, imm12, 1, 1);
> -}
> -
>  static inline void tcg_out_st32_12(TCGContext *s, int cond, TCGReg rt,
>                                     TCGReg rn, int imm12)
>  {
> @@ -1150,47 +1123,78 @@ static TCGReg tcg_out_arg_reg64(TCGContext *s, TCGReg argreg,
>  
>  #define TLB_SHIFT	(CPU_TLB_ENTRY_BITS + CPU_TLB_BITS)
>  
> -/* Load and compare a TLB entry, leaving the flags set.  Leaves R0 pointing
> +/* Load and compare a TLB entry, leaving the flags set.  Leaves R2 pointing
>     to the tlb entry.  Clobbers R1 and TMP.  */
>  
>  static void tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
>                               int s_bits, int tlb_offset)
>  {
> +    TCGReg base = TCG_AREG0;
> +
>      /* Should generate something like the following:
> -     *  shr r8, addr_reg, #TARGET_PAGE_BITS
> -     *  and r0, r8, #(CPU_TLB_SIZE - 1)   @ Assumption: CPU_TLB_BITS <= 8
> -     *  add r0, env, r0 lsl #CPU_TLB_ENTRY_BITS
> +     * pre-v7:
> +     *   shr    tmp, addr_reg, #TARGET_PAGE_BITS                  (1)
> +     *   add    r2, env, #off & 0xff00
> +     *   and    r0, tmp, #(CPU_TLB_SIZE - 1)                      (2)
> +     *   add    r2, r2, r0, lsl #CPU_TLB_ENTRY_BITS               (3)
> +     *   ldr    r0, [r2, #off & 0xff]!                            (4)
> +     *   tst    addr_reg, #s_mask
> +     *   cmpeq  r0, tmp, lsl #TARGET_PAGE_BITS                    (5)
> +     *
> +     * v7 (not implemented yet):
> +     *   ubfx   r2, addr_reg, #TARGET_PAGE_BITS, #CPU_TLB_BITS    (1)
> +     *   movw   tmp, #~TARGET_PAGE_MASK & ~s_mask
> +     *   movw   r0, #off
> +     *   add    r2, env, r2, lsl #CPU_TLB_ENTRY_BITS              (2)
> +     *   bic    tmp, addr_reg, tmp
> +     *   ldr    r0, [r2, r0]!                                     (3)
> +     *   cmp    r0, tmp                                           (4)
>       */
>  #  if CPU_TLB_BITS > 8
>  #   error
>  #  endif
>      tcg_out_dat_reg(s, COND_AL, ARITH_MOV, TCG_REG_TMP,
>                      0, addrlo, SHIFT_IMM_LSR(TARGET_PAGE_BITS));
> +
> +    /* We assume that the offset is contained within 16 bits.  */
> +    assert((tlb_offset & ~0xffff) == 0);
> +    if (tlb_offset > 0xff) {
> +        tcg_out_dat_imm(s, COND_AL, ARITH_ADD, TCG_REG_R2, base,
> +                        (24 << 7) | (tlb_offset >> 8));
> +        tlb_offset &= 0xff;
> +        base = TCG_REG_R2;
> +    }
> +
>      tcg_out_dat_imm(s, COND_AL, ARITH_AND,
>                      TCG_REG_R0, TCG_REG_TMP, CPU_TLB_SIZE - 1);
> -    tcg_out_dat_reg(s, COND_AL, ARITH_ADD, TCG_REG_R0, TCG_AREG0,
> +    tcg_out_dat_reg(s, COND_AL, ARITH_ADD, TCG_REG_R2, base,
>                      TCG_REG_R0, SHIFT_IMM_LSL(CPU_TLB_ENTRY_BITS));
>  
> -    /* We assume that the offset is contained within 20 bits.  */
> -    assert((tlb_offset & ~0xfffff) == 0);
> -    if (tlb_offset > 0xfff) {
> -        tcg_out_dat_imm(s, COND_AL, ARITH_ADD, TCG_REG_R0, TCG_REG_R0,
> -                        0xa00 | (tlb_offset >> 12));
> -        tlb_offset &= 0xfff;
> +    /* Load the tlb comparator.  Use ldrd if needed and available,
> +       but due to how the pointer needs setting up, ldm isn't useful.
> +       Base arm5 doesn't have ldrd, but armv5te does.  */
> +    if (use_armv6_instructions && TARGET_LONG_BITS == 64) {
> +        tcg_out_memop_8(s, COND_AL, INSN_LDRD_IMM, TCG_REG_R0,
> +                        TCG_REG_R2, tlb_offset, 1, 1);
> +    } else {
> +        tcg_out_memop_12(s, COND_AL, INSN_LDR_IMM, TCG_REG_R0,
> +                         TCG_REG_R2, tlb_offset, 1, 1);
> +        if (TARGET_LONG_BITS == 64) {
> +            tcg_out_memop_12(s, COND_AL, INSN_LDR_IMM, TCG_REG_R0,
> +                             TCG_REG_R2, 4, 1, 0);
> +        }
>      }
> -    tcg_out_ld32_12wb(s, COND_AL, TCG_REG_R1, TCG_REG_R0, tlb_offset);
> -    tcg_out_dat_reg(s, COND_AL, ARITH_CMP, 0, TCG_REG_R1,
> -                    TCG_REG_TMP, SHIFT_IMM_LSL(TARGET_PAGE_BITS));
>  
>      /* Check alignment.  */
>      if (s_bits) {
> -        tcg_out_dat_imm(s, COND_EQ, ARITH_TST,
> +        tcg_out_dat_imm(s, COND_AL, ARITH_TST,
>                          0, addrlo, (1 << s_bits) - 1);
>      }
>  
> +    tcg_out_dat_reg(s, (s_bits ? COND_EQ : COND_AL), ARITH_CMP, 0,
> +                    TCG_REG_R0, TCG_REG_TMP, SHIFT_IMM_LSL(TARGET_PAGE_BITS));
> +
>      if (TARGET_LONG_BITS == 64) {
> -        /* XXX: possibly we could use a block data load in the first access. */
> -        tcg_out_ld32_12(s, COND_EQ, TCG_REG_R1, TCG_REG_R0, 4);
>          tcg_out_dat_reg(s, COND_EQ, ARITH_CMP, 0,
>                          TCG_REG_R1, addrhi, SHIFT_IMM_LSL(0));
>      }
> @@ -1223,7 +1227,7 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, int opc)
>      tcg_out_tlb_read(s, addr_reg, addr_reg2, s_bits,
>                       offsetof(CPUArchState, tlb_table[mem_index][0].addr_read));
>  
> -    tcg_out_ld32_12(s, COND_EQ, TCG_REG_R1, TCG_REG_R0,
> +    tcg_out_ld32_12(s, COND_EQ, TCG_REG_R1, TCG_REG_R2,
>                      offsetof(CPUTLBEntry, addend)
>                      - offsetof(CPUTLBEntry, addr_read));
>  
> @@ -1395,7 +1399,7 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, int opc)
>                       offsetof(CPUArchState,
>                                tlb_table[mem_index][0].addr_write));
>  
> -    tcg_out_ld32_12(s, COND_EQ, TCG_REG_R1, TCG_REG_R0,
> +    tcg_out_ld32_12(s, COND_EQ, TCG_REG_R1, TCG_REG_R2,
>                      offsetof(CPUTLBEntry, addend)
>                      - offsetof(CPUTLBEntry, addr_write));
>  

Reviewed-by: Aurelien Jarno <aurelien@aurel32.net>


-- 
Aurelien Jarno                          GPG: 1024D/F1BCDB73
aurelien@aurel32.net                 http://www.aurel32.net

next prev parent reply	other threads:[~2013-04-24  7:43 UTC|newest]

Thread overview: 30+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2013-04-23 20:46 [Qemu-devel] [PATCH v6 00/20] tcg-arm improvments Richard Henderson
2013-04-23 20:46 ` [Qemu-devel] [PATCH v6 01/20] tcg-arm: Fix local stack frame Richard Henderson
2013-04-24  7:42   ` Aurelien Jarno
2013-04-23 20:46 ` [Qemu-devel] [PATCH v6 02/20] tcg: Log the contents of the prologue with -d out_asm Richard Henderson
2013-04-26  5:27   ` Aurelien Jarno
2013-04-23 20:46 ` [Qemu-devel] [PATCH v6 03/20] tcg-arm: Use bic to implement and with constant Richard Henderson
2013-04-23 20:46 ` [Qemu-devel] [PATCH v6 04/20] tcg-arm: Handle negated constant arguments to and/sub Richard Henderson
2013-04-23 20:46 ` [Qemu-devel] [PATCH v6 05/20] tcg-arm: Allow constant first argument to sub Richard Henderson
2013-04-23 20:46 ` [Qemu-devel] [PATCH v6 06/20] tcg-arm: Use tcg_out_dat_rIN for compares Richard Henderson
2013-04-23 20:46 ` [Qemu-devel] [PATCH v6 07/20] tcg-arm: Handle constant arguments to add2/sub2 Richard Henderson
2013-04-23 20:46 ` [Qemu-devel] [PATCH v6 08/20] tcg-arm: Improve constant generation Richard Henderson
2013-04-23 20:46 ` [Qemu-devel] [PATCH v6 09/20] tcg-arm: Implement deposit for armv7 Richard Henderson
2013-04-24  7:42   ` Aurelien Jarno
2013-04-23 20:46 ` [Qemu-devel] [PATCH v6 10/20] tcg-arm: Implement division instructions Richard Henderson
2013-04-23 20:46 ` [Qemu-devel] [PATCH v6 11/20] tcg-arm: Use TCG_REG_TMP name for the tcg temporary Richard Henderson
2013-04-23 20:46 ` [Qemu-devel] [PATCH v6 12/20] tcg-arm: Use R12 " Richard Henderson
2013-04-23 20:46 ` [Qemu-devel] [PATCH v6 13/20] tcg-arm: Cleanup multiply subroutines Richard Henderson
2013-04-23 20:46 ` [Qemu-devel] [PATCH v6 14/20] tcg-arm: Cleanup most primitive load store subroutines Richard Henderson
2013-04-23 20:46 ` [Qemu-devel] [PATCH v6 15/20] tcg-arm: Split out tcg_out_tlb_read Richard Henderson
2013-04-23 20:46 ` [Qemu-devel] [PATCH v6 16/20] tcg-arm: Improve scheduling of tcg_out_tlb_read Richard Henderson
2013-04-24  7:43   ` Aurelien Jarno [this message]
2013-04-23 20:46 ` [Qemu-devel] [PATCH v6 17/20] tcg-arm: Delete the 'S' constraint Richard Henderson
2013-04-24  7:43   ` Aurelien Jarno
2013-04-23 20:46 ` [Qemu-devel] [PATCH v6 18/20] tcg-arm: Use movi32 + blx for calls on v7 Richard Henderson
2013-04-23 20:46 ` [Qemu-devel] [PATCH v6 19/20] tcg-arm: Convert to CONFIG_QEMU_LDST_OPTIMIZATION Richard Henderson
2013-04-24  7:43   ` Aurelien Jarno
2013-04-23 20:46 ` [Qemu-devel] [PATCH v6 20/20] tcg-arm: Remove long jump from tcg_out_goto_label Richard Henderson
2013-04-24  7:43   ` Aurelien Jarno
2013-04-26 10:08 ` [Qemu-devel] [PATCH v6 00/20] tcg-arm improvments Peter Maydell
2013-04-27  0:20 ` Aurelien Jarno

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20130424074303.GF5000@ohm.aurel32.net \
    --to=aurelien@aurel32.net \
    --cc=qemu-devel@nongnu.org \
    --cc=rth@twiddle.net \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).