From: Aurelien Jarno <aurelien@aurel32.net>
To: Richard Henderson <rth@twiddle.net>
Cc: qemu-devel@nongnu.org
Subject: Re: [Qemu-devel] [PATCH v6 16/20] tcg-arm: Improve scheduling of tcg_out_tlb_read
Date: Wed, 24 Apr 2013 09:43:03 +0200 [thread overview]
Message-ID: <20130424074303.GF5000@ohm.aurel32.net> (raw)
In-Reply-To: <1366750012-25015-17-git-send-email-rth@twiddle.net>
On Tue, Apr 23, 2013 at 01:46:48PM -0700, Richard Henderson wrote:
> The schedule was fully serial, with no possibility for dual issue.
> The old schedule had a minimal issue of 7 cycles; the new schedule
> has a minimal issue of 5 cycles.
>
> Signed-off-by: Richard Henderson <rth@twiddle.net>
> ---
> tcg/arm/tcg-target.c | 110 ++++++++++++++++++++++++++-------------------------
> 1 file changed, 57 insertions(+), 53 deletions(-)
>
> diff --git a/tcg/arm/tcg-target.c b/tcg/arm/tcg-target.c
> index a96471c..375c1e1 100644
> --- a/tcg/arm/tcg-target.c
> +++ b/tcg/arm/tcg-target.c
> @@ -182,18 +182,12 @@ static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str)
> ct->ct |= TCG_CT_REG;
> tcg_regset_set32(ct->u.regs, 0, (1 << TCG_TARGET_NB_REGS) - 1);
> #ifdef CONFIG_SOFTMMU
> - /* r0 and r1 will be overwritten when reading the tlb entry,
> + /* r0-r2 will be overwritten when reading the tlb entry,
> so don't use these. */
> tcg_regset_reset_reg(ct->u.regs, TCG_REG_R0);
> tcg_regset_reset_reg(ct->u.regs, TCG_REG_R1);
> -#if TARGET_LONG_BITS == 64
> - /* If we're passing env to the helper as r0 and need a regpair
> - * for the address then r2 will be overwritten as we're setting
> - * up the args to the helper.
> - */
> tcg_regset_reset_reg(ct->u.regs, TCG_REG_R2);
> #endif
> -#endif
> break;
> case 'L':
> ct->ct |= TCG_CT_REG;
> @@ -207,30 +201,16 @@ static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str)
>
> /* qemu_st address & data_reg */
> case 's':
> - ct->ct |= TCG_CT_REG;
> - tcg_regset_set32(ct->u.regs, 0, (1 << TCG_TARGET_NB_REGS) - 1);
> - /* r0 and r1 will be overwritten when reading the tlb entry
> - (softmmu only) and doing the byte swapping, so don't
> - use these. */
> - tcg_regset_reset_reg(ct->u.regs, TCG_REG_R0);
> - tcg_regset_reset_reg(ct->u.regs, TCG_REG_R1);
> -#if defined(CONFIG_SOFTMMU) && (TARGET_LONG_BITS == 64)
> - /* Avoid clashes with registers being used for helper args */
> - tcg_regset_reset_reg(ct->u.regs, TCG_REG_R2);
> - tcg_regset_reset_reg(ct->u.regs, TCG_REG_R3);
> -#endif
> - break;
> /* qemu_st64 data_reg2 */
> case 'S':
> ct->ct |= TCG_CT_REG;
> tcg_regset_set32(ct->u.regs, 0, (1 << TCG_TARGET_NB_REGS) - 1);
> - /* r0 and r1 will be overwritten when reading the tlb entry
> - (softmmu only) and doing the byte swapping, so don't
> - use these. */
> + /* r0-r2 will be overwritten when reading the tlb entry (softmmu only)
> + and r0-r1 doing the byte swapping, so don't use these. */
> tcg_regset_reset_reg(ct->u.regs, TCG_REG_R0);
> tcg_regset_reset_reg(ct->u.regs, TCG_REG_R1);
> -#ifdef CONFIG_SOFTMMU
> - /* r2 is still needed to load data_reg, so don't use it. */
> +#if defined(CONFIG_SOFTMMU)
> + /* Avoid clashes with registers being used for helper args */
> tcg_regset_reset_reg(ct->u.regs, TCG_REG_R2);
> #if TARGET_LONG_BITS == 64
> /* Avoid clashes with registers being used for helper args */
> @@ -347,6 +327,8 @@ typedef enum {
> INSN_LDRSB_REG = 0x001000d0,
> INSN_STRB_IMM = 0x04400000,
> INSN_STRB_REG = 0x06400000,
> +
> + INSN_LDRD_IMM = 0x004000d0,
> } ARMInsn;
>
> #define SHIFT_IMM_LSL(im) (((im) << 7) | 0x00)
> @@ -805,15 +787,6 @@ static inline void tcg_out_ld32_12(TCGContext *s, int cond, TCGReg rt,
> tcg_out_memop_12(s, cond, INSN_LDR_IMM, rt, rn, imm12, 1, 0);
> }
>
> -/* Offset pre-increment with base writeback. */
> -static inline void tcg_out_ld32_12wb(TCGContext *s, int cond, TCGReg rt,
> - TCGReg rn, int imm12)
> -{
> - /* ldr with writeback and both register equals is UNPREDICTABLE */
> - assert(rd != rn);
> - tcg_out_memop_12(s, cond, INSN_LDR_IMM, rt, rn, imm12, 1, 1);
> -}
> -
> static inline void tcg_out_st32_12(TCGContext *s, int cond, TCGReg rt,
> TCGReg rn, int imm12)
> {
> @@ -1150,47 +1123,78 @@ static TCGReg tcg_out_arg_reg64(TCGContext *s, TCGReg argreg,
>
> #define TLB_SHIFT (CPU_TLB_ENTRY_BITS + CPU_TLB_BITS)
>
> -/* Load and compare a TLB entry, leaving the flags set. Leaves R0 pointing
> +/* Load and compare a TLB entry, leaving the flags set. Leaves R2 pointing
> to the tlb entry. Clobbers R1 and TMP. */
>
> static void tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
> int s_bits, int tlb_offset)
> {
> + TCGReg base = TCG_AREG0;
> +
> /* Should generate something like the following:
> - * shr r8, addr_reg, #TARGET_PAGE_BITS
> - * and r0, r8, #(CPU_TLB_SIZE - 1) @ Assumption: CPU_TLB_BITS <= 8
> - * add r0, env, r0 lsl #CPU_TLB_ENTRY_BITS
> + * pre-v7:
> + * shr tmp, addr_reg, #TARGET_PAGE_BITS (1)
> + * add r2, env, #off & 0xff00
> + * and r0, tmp, #(CPU_TLB_SIZE - 1) (2)
> + * add r2, r2, r0, lsl #CPU_TLB_ENTRY_BITS (3)
> + * ldr r0, [r2, #off & 0xff]! (4)
> + * tst addr_reg, #s_mask
> + * cmpeq r0, tmp, lsl #TARGET_PAGE_BITS (5)
> + *
> + * v7 (not implemented yet):
> + * ubfx r2, addr_reg, #TARGET_PAGE_BITS, #CPU_TLB_BITS (1)
> + * movw tmp, #~TARGET_PAGE_MASK & ~s_mask
> + * movw r0, #off
> + * add r2, env, r2, lsl #CPU_TLB_ENTRY_BITS (2)
> + * bic tmp, addr_reg, tmp
> + * ldr r0, [r2, r0]! (3)
> + * cmp r0, tmp (4)
> */
> # if CPU_TLB_BITS > 8
> # error
> # endif
> tcg_out_dat_reg(s, COND_AL, ARITH_MOV, TCG_REG_TMP,
> 0, addrlo, SHIFT_IMM_LSR(TARGET_PAGE_BITS));
> +
> + /* We assume that the offset is contained within 16 bits. */
> + assert((tlb_offset & ~0xffff) == 0);
> + if (tlb_offset > 0xff) {
> + tcg_out_dat_imm(s, COND_AL, ARITH_ADD, TCG_REG_R2, base,
> + (24 << 7) | (tlb_offset >> 8));
> + tlb_offset &= 0xff;
> + base = TCG_REG_R2;
> + }
> +
> tcg_out_dat_imm(s, COND_AL, ARITH_AND,
> TCG_REG_R0, TCG_REG_TMP, CPU_TLB_SIZE - 1);
> - tcg_out_dat_reg(s, COND_AL, ARITH_ADD, TCG_REG_R0, TCG_AREG0,
> + tcg_out_dat_reg(s, COND_AL, ARITH_ADD, TCG_REG_R2, base,
> TCG_REG_R0, SHIFT_IMM_LSL(CPU_TLB_ENTRY_BITS));
>
> - /* We assume that the offset is contained within 20 bits. */
> - assert((tlb_offset & ~0xfffff) == 0);
> - if (tlb_offset > 0xfff) {
> - tcg_out_dat_imm(s, COND_AL, ARITH_ADD, TCG_REG_R0, TCG_REG_R0,
> - 0xa00 | (tlb_offset >> 12));
> - tlb_offset &= 0xfff;
> + /* Load the tlb comparator. Use ldrd if needed and available,
> + but due to how the pointer needs setting up, ldm isn't useful.
> + Base arm5 doesn't have ldrd, but armv5te does. */
> + if (use_armv6_instructions && TARGET_LONG_BITS == 64) {
> + tcg_out_memop_8(s, COND_AL, INSN_LDRD_IMM, TCG_REG_R0,
> + TCG_REG_R2, tlb_offset, 1, 1);
> + } else {
> + tcg_out_memop_12(s, COND_AL, INSN_LDR_IMM, TCG_REG_R0,
> + TCG_REG_R2, tlb_offset, 1, 1);
> + if (TARGET_LONG_BITS == 64) {
> + tcg_out_memop_12(s, COND_AL, INSN_LDR_IMM, TCG_REG_R0,
> + TCG_REG_R2, 4, 1, 0);
> + }
> }
> - tcg_out_ld32_12wb(s, COND_AL, TCG_REG_R1, TCG_REG_R0, tlb_offset);
> - tcg_out_dat_reg(s, COND_AL, ARITH_CMP, 0, TCG_REG_R1,
> - TCG_REG_TMP, SHIFT_IMM_LSL(TARGET_PAGE_BITS));
>
> /* Check alignment. */
> if (s_bits) {
> - tcg_out_dat_imm(s, COND_EQ, ARITH_TST,
> + tcg_out_dat_imm(s, COND_AL, ARITH_TST,
> 0, addrlo, (1 << s_bits) - 1);
> }
>
> + tcg_out_dat_reg(s, (s_bits ? COND_EQ : COND_AL), ARITH_CMP, 0,
> + TCG_REG_R0, TCG_REG_TMP, SHIFT_IMM_LSL(TARGET_PAGE_BITS));
> +
> if (TARGET_LONG_BITS == 64) {
> - /* XXX: possibly we could use a block data load in the first access. */
> - tcg_out_ld32_12(s, COND_EQ, TCG_REG_R1, TCG_REG_R0, 4);
> tcg_out_dat_reg(s, COND_EQ, ARITH_CMP, 0,
> TCG_REG_R1, addrhi, SHIFT_IMM_LSL(0));
> }
> @@ -1223,7 +1227,7 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, int opc)
> tcg_out_tlb_read(s, addr_reg, addr_reg2, s_bits,
> offsetof(CPUArchState, tlb_table[mem_index][0].addr_read));
>
> - tcg_out_ld32_12(s, COND_EQ, TCG_REG_R1, TCG_REG_R0,
> + tcg_out_ld32_12(s, COND_EQ, TCG_REG_R1, TCG_REG_R2,
> offsetof(CPUTLBEntry, addend)
> - offsetof(CPUTLBEntry, addr_read));
>
> @@ -1395,7 +1399,7 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, int opc)
> offsetof(CPUArchState,
> tlb_table[mem_index][0].addr_write));
>
> - tcg_out_ld32_12(s, COND_EQ, TCG_REG_R1, TCG_REG_R0,
> + tcg_out_ld32_12(s, COND_EQ, TCG_REG_R1, TCG_REG_R2,
> offsetof(CPUTLBEntry, addend)
> - offsetof(CPUTLBEntry, addr_write));
>
Reviewed-by: Aurelien Jarno <aurelien@aurel32.net>
--
Aurelien Jarno GPG: 1024D/F1BCDB73
aurelien@aurel32.net http://www.aurel32.net
next prev parent reply other threads:[~2013-04-24 7:43 UTC|newest]
Thread overview: 30+ messages / expand[flat|nested] mbox.gz Atom feed top
2013-04-23 20:46 [Qemu-devel] [PATCH v6 00/20] tcg-arm improvments Richard Henderson
2013-04-23 20:46 ` [Qemu-devel] [PATCH v6 01/20] tcg-arm: Fix local stack frame Richard Henderson
2013-04-24 7:42 ` Aurelien Jarno
2013-04-23 20:46 ` [Qemu-devel] [PATCH v6 02/20] tcg: Log the contents of the prologue with -d out_asm Richard Henderson
2013-04-26 5:27 ` Aurelien Jarno
2013-04-23 20:46 ` [Qemu-devel] [PATCH v6 03/20] tcg-arm: Use bic to implement and with constant Richard Henderson
2013-04-23 20:46 ` [Qemu-devel] [PATCH v6 04/20] tcg-arm: Handle negated constant arguments to and/sub Richard Henderson
2013-04-23 20:46 ` [Qemu-devel] [PATCH v6 05/20] tcg-arm: Allow constant first argument to sub Richard Henderson
2013-04-23 20:46 ` [Qemu-devel] [PATCH v6 06/20] tcg-arm: Use tcg_out_dat_rIN for compares Richard Henderson
2013-04-23 20:46 ` [Qemu-devel] [PATCH v6 07/20] tcg-arm: Handle constant arguments to add2/sub2 Richard Henderson
2013-04-23 20:46 ` [Qemu-devel] [PATCH v6 08/20] tcg-arm: Improve constant generation Richard Henderson
2013-04-23 20:46 ` [Qemu-devel] [PATCH v6 09/20] tcg-arm: Implement deposit for armv7 Richard Henderson
2013-04-24 7:42 ` Aurelien Jarno
2013-04-23 20:46 ` [Qemu-devel] [PATCH v6 10/20] tcg-arm: Implement division instructions Richard Henderson
2013-04-23 20:46 ` [Qemu-devel] [PATCH v6 11/20] tcg-arm: Use TCG_REG_TMP name for the tcg temporary Richard Henderson
2013-04-23 20:46 ` [Qemu-devel] [PATCH v6 12/20] tcg-arm: Use R12 " Richard Henderson
2013-04-23 20:46 ` [Qemu-devel] [PATCH v6 13/20] tcg-arm: Cleanup multiply subroutines Richard Henderson
2013-04-23 20:46 ` [Qemu-devel] [PATCH v6 14/20] tcg-arm: Cleanup most primitive load store subroutines Richard Henderson
2013-04-23 20:46 ` [Qemu-devel] [PATCH v6 15/20] tcg-arm: Split out tcg_out_tlb_read Richard Henderson
2013-04-23 20:46 ` [Qemu-devel] [PATCH v6 16/20] tcg-arm: Improve scheduling of tcg_out_tlb_read Richard Henderson
2013-04-24 7:43 ` Aurelien Jarno [this message]
2013-04-23 20:46 ` [Qemu-devel] [PATCH v6 17/20] tcg-arm: Delete the 'S' constraint Richard Henderson
2013-04-24 7:43 ` Aurelien Jarno
2013-04-23 20:46 ` [Qemu-devel] [PATCH v6 18/20] tcg-arm: Use movi32 + blx for calls on v7 Richard Henderson
2013-04-23 20:46 ` [Qemu-devel] [PATCH v6 19/20] tcg-arm: Convert to CONFIG_QEMU_LDST_OPTIMIZATION Richard Henderson
2013-04-24 7:43 ` Aurelien Jarno
2013-04-23 20:46 ` [Qemu-devel] [PATCH v6 20/20] tcg-arm: Remove long jump from tcg_out_goto_label Richard Henderson
2013-04-24 7:43 ` Aurelien Jarno
2013-04-26 10:08 ` [Qemu-devel] [PATCH v6 00/20] tcg-arm improvments Peter Maydell
2013-04-27 0:20 ` Aurelien Jarno
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20130424074303.GF5000@ohm.aurel32.net \
--to=aurelien@aurel32.net \
--cc=qemu-devel@nongnu.org \
--cc=rth@twiddle.net \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.