From: Richard Henderson <rth@twiddle.net>
To: qemu-devel@nongnu.org
Cc: aurelien@aurel32.net
Subject: [Qemu-devel] [PATCH 5/5] tcg-arm: Better pipeline for softmmu tlb access
Date: Wed, 28 Aug 2013 15:33:33 -0700 [thread overview]
Message-ID: <1377729213-2704-6-git-send-email-rth@twiddle.net> (raw)
In-Reply-To: <1377729213-2704-1-git-send-email-rth@twiddle.net>
Moves the load for the tlb addend earlier, to better hide the load latency.
Avoids the writeback from the comparator load, since we know how
to adjust the offset between the two loads.
: e2862c03 add r2, r6, #768 ; 0x300
: e20c00ff and r0, ip, #255 ; 0xff
: e0822280 add r2, r2, r0, lsl #5
-: e1e209d8 ldrd r0, [r2, #152]!
+: e1c209d8 ldrd r0, [r2, #152]
: e31b0007 tst fp, #7 ; 0x7
+: e59220a8 ldr r2, [r2, #168]
: 0150068c cmpeq r0, ip, lsl #13
: 01510007 cmpeq r1, r7
-: e5921010 ldr r1, [r2, #16]
-: 018b40f1 strdeq r4, [fp, r1]
+: 018b40f2 strdeq r4, [fp, r2]
: 1b0000cb blne 0x75e7e6e4
Signed-off-by: Richard Henderson <rth@twiddle.net>
---
tcg/arm/tcg-target.c | 97 +++++++++++++++++++++++-----------------------------
1 file changed, 42 insertions(+), 55 deletions(-)
diff --git a/tcg/arm/tcg-target.c b/tcg/arm/tcg-target.c
index f1e547f..6d03d6b 100644
--- a/tcg/arm/tcg-target.c
+++ b/tcg/arm/tcg-target.c
@@ -1179,13 +1179,18 @@ QEMU_BUILD_BUG_ON(CPU_TLB_BITS > 8);
QEMU_BUILD_BUG_ON(offsetof(CPUArchState, tlb_table[NB_MMU_MODES - 1][1])
> 0xffff);
-/* Load and compare a TLB entry, leaving the flags set. Leaves R2 pointing
- to the tlb entry. Clobbers R1 and TMP. */
+/* Load and compare a TLB entry, leaving the flags set. Leaves R2
+ containing the tlb addend. Clobbers R0, R1 and TMP. */
static void tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
- int s_bits, int tlb_offset)
+ int s_bits, int mem_index, bool is_load)
{
TCGReg base = TCG_AREG0;
+ int tlb_offset =
+ (is_load
+ ? offsetof(CPUArchState, tlb_table[mem_index][0].addr_read)
+ : offsetof(CPUArchState, tlb_table[mem_index][0].addr_write));
+ int add_offset = offsetof(CPUArchState, tlb_table[mem_index][0].addend);
/* Should generate something like the following:
* pre-v7:
@@ -1193,18 +1198,10 @@ static void tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
* add r2, env, #off & 0xff00
* and r0, tmp, #(CPU_TLB_SIZE - 1) (2)
* add r2, r2, r0, lsl #CPU_TLB_ENTRY_BITS (3)
- * ldr r0, [r2, #off & 0xff]! (4)
+ * ldr r0, [r2, #off & 0xff] (4)
* tst addr_reg, #s_mask
- * cmpeq r0, tmp, lsl #TARGET_PAGE_BITS (5)
- *
- * v7 (not implemented yet):
- * ubfx r2, addr_reg, #TARGET_PAGE_BITS, #CPU_TLB_BITS (1)
- * movw tmp, #~TARGET_PAGE_MASK & ~s_mask
- * movw r0, #off
- * add r2, env, r2, lsl #CPU_TLB_ENTRY_BITS (2)
- * bic tmp, addr_reg, tmp
- * ldr r0, [r2, r0]! (3)
- * cmp r0, tmp (4)
+ * ldr r2, [r2, #addoff] (5)
+ * cmpeq r0, tmp, lsl #TARGET_PAGE_BITS
*/
tcg_out_dat_reg(s, COND_AL, ARITH_MOV, TCG_REG_TMP,
0, addrlo, SHIFT_IMM_LSR(TARGET_PAGE_BITS));
@@ -1213,7 +1210,6 @@ static void tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
if (tlb_offset > 0xff) {
tcg_out_dat_imm(s, COND_AL, ARITH_ADD, TCG_REG_R2, base,
(24 << 7) | (tlb_offset >> 8));
- tlb_offset &= 0xff;
base = TCG_REG_R2;
}
@@ -1226,14 +1222,12 @@ static void tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
but due to how the pointer needs setting up, ldm isn't useful.
Base arm5 doesn't have ldrd, but armv5te does. */
if (use_armv6_instructions && TARGET_LONG_BITS == 64) {
- tcg_out_memop_8(s, COND_AL, INSN_LDRD_IMM, TCG_REG_R0,
- TCG_REG_R2, tlb_offset, 1, 1);
+ tcg_out_ldrd_8(s, COND_AL, TCG_REG_R0, TCG_REG_R2, tlb_offset & 0xff);
} else {
- tcg_out_memop_12(s, COND_AL, INSN_LDR_IMM, TCG_REG_R0,
- TCG_REG_R2, tlb_offset, 1, 1);
+ tcg_out_ld32_12(s, COND_AL, TCG_REG_R0, TCG_REG_R2, tlb_offset & 0xff);
if (TARGET_LONG_BITS == 64) {
- tcg_out_memop_12(s, COND_AL, INSN_LDR_IMM, TCG_REG_R1,
- TCG_REG_R2, 4, 1, 0);
+ tcg_out_ld32_12(s, COND_AL, TCG_REG_R1, TCG_REG_R2,
+ (tlb_offset & 0xff) + 4);
}
}
@@ -1243,6 +1237,10 @@ static void tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
0, addrlo, (1 << s_bits) - 1);
}
+ /* Load the tlb addend. */
+ tcg_out_ld32_12(s, COND_AL, TCG_REG_R2, TCG_REG_R2,
+ add_offset - (tlb_offset & 0xff00));
+
tcg_out_dat_reg(s, (s_bits ? COND_EQ : COND_AL), ARITH_CMP, 0,
TCG_REG_R0, TCG_REG_TMP, SHIFT_IMM_LSL(TARGET_PAGE_BITS));
@@ -1360,53 +1358,48 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, int opc)
mem_index = *args;
s_bits = opc & 3;
- tcg_out_tlb_read(s, addr_reg, addr_reg2, s_bits,
- offsetof(CPUArchState, tlb_table[mem_index][0].addr_read));
-
- tcg_out_ld32_12(s, COND_AL, TCG_REG_R1, TCG_REG_R2,
- offsetof(CPUTLBEntry, addend)
- - offsetof(CPUTLBEntry, addr_read));
+ tcg_out_tlb_read(s, addr_reg, addr_reg2, s_bits, mem_index, 1);
switch (opc) {
case 0:
- tcg_out_ld8_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
+ tcg_out_ld8_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R2);
break;
case 0 | 4:
- tcg_out_ld8s_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
+ tcg_out_ld8s_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R2);
break;
case 1:
- tcg_out_ld16u_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
+ tcg_out_ld16u_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R2);
if (bswap) {
tcg_out_bswap16(s, COND_EQ, data_reg, data_reg);
}
break;
case 1 | 4:
if (bswap) {
- tcg_out_ld16u_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
+ tcg_out_ld16u_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R2);
tcg_out_bswap16s(s, COND_EQ, data_reg, data_reg);
} else {
- tcg_out_ld16s_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
+ tcg_out_ld16s_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R2);
}
break;
case 2:
default:
- tcg_out_ld32_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
+ tcg_out_ld32_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R2);
if (bswap) {
tcg_out_bswap32(s, COND_EQ, data_reg, data_reg);
}
break;
case 3:
if (bswap) {
- tcg_out_ld32_rwb(s, COND_EQ, data_reg2, TCG_REG_R1, addr_reg);
- tcg_out_ld32_12(s, COND_EQ, data_reg, TCG_REG_R1, 4);
+ tcg_out_ld32_rwb(s, COND_EQ, data_reg2, TCG_REG_R2, addr_reg);
+ tcg_out_ld32_12(s, COND_EQ, data_reg, TCG_REG_R2, 4);
tcg_out_bswap32(s, COND_EQ, data_reg2, data_reg2);
tcg_out_bswap32(s, COND_EQ, data_reg, data_reg);
} else if (use_armv6_instructions
&& (data_reg & 1) == 0 && data_reg2 == data_reg + 1) {
- tcg_out_ldrd_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
+ tcg_out_ldrd_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R2);
} else {
- tcg_out_ld32_rwb(s, COND_EQ, data_reg, TCG_REG_R1, addr_reg);
- tcg_out_ld32_12(s, COND_EQ, data_reg2, TCG_REG_R1, 4);
+ tcg_out_ld32_rwb(s, COND_EQ, data_reg, TCG_REG_R2, addr_reg);
+ tcg_out_ld32_12(s, COND_EQ, data_reg2, TCG_REG_R2, 4);
}
break;
}
@@ -1506,47 +1499,41 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, int opc)
mem_index = *args;
s_bits = opc & 3;
- tcg_out_tlb_read(s, addr_reg, addr_reg2, s_bits,
- offsetof(CPUArchState,
- tlb_table[mem_index][0].addr_write));
-
- tcg_out_ld32_12(s, COND_AL, TCG_REG_R1, TCG_REG_R2,
- offsetof(CPUTLBEntry, addend)
- - offsetof(CPUTLBEntry, addr_write));
+ tcg_out_tlb_read(s, addr_reg, addr_reg2, s_bits, mem_index, 0);
switch (opc) {
case 0:
- tcg_out_st8_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
+ tcg_out_st8_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R2);
break;
case 1:
if (bswap) {
tcg_out_bswap16st(s, COND_EQ, TCG_REG_R0, data_reg);
- tcg_out_st16_r(s, COND_EQ, TCG_REG_R0, addr_reg, TCG_REG_R1);
+ tcg_out_st16_r(s, COND_EQ, TCG_REG_R0, addr_reg, TCG_REG_R2);
} else {
- tcg_out_st16_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
+ tcg_out_st16_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R2);
}
break;
case 2:
default:
if (bswap) {
tcg_out_bswap32(s, COND_EQ, TCG_REG_R0, data_reg);
- tcg_out_st32_r(s, COND_EQ, TCG_REG_R0, addr_reg, TCG_REG_R1);
+ tcg_out_st32_r(s, COND_EQ, TCG_REG_R0, addr_reg, TCG_REG_R2);
} else {
- tcg_out_st32_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
+ tcg_out_st32_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R2);
}
break;
case 3:
if (bswap) {
tcg_out_bswap32(s, COND_EQ, TCG_REG_R0, data_reg2);
- tcg_out_st32_rwb(s, COND_EQ, TCG_REG_R0, TCG_REG_R1, addr_reg);
+ tcg_out_st32_rwb(s, COND_EQ, TCG_REG_R0, TCG_REG_R2, addr_reg);
tcg_out_bswap32(s, COND_EQ, TCG_REG_R0, data_reg);
- tcg_out_st32_12(s, COND_EQ, TCG_REG_R0, TCG_REG_R1, 4);
+ tcg_out_st32_12(s, COND_EQ, TCG_REG_R0, TCG_REG_R2, 4);
} else if (use_armv6_instructions
&& (data_reg & 1) == 0 && data_reg2 == data_reg + 1) {
- tcg_out_strd_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
+ tcg_out_strd_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R2);
} else {
- tcg_out_st32_rwb(s, COND_EQ, data_reg, TCG_REG_R1, addr_reg);
- tcg_out_st32_12(s, COND_EQ, data_reg2, TCG_REG_R1, 4);
+ tcg_out_st32_rwb(s, COND_EQ, data_reg, TCG_REG_R2, addr_reg);
+ tcg_out_st32_12(s, COND_EQ, data_reg2, TCG_REG_R2, 4);
}
break;
}
--
1.8.1.4
prev parent reply other threads:[~2013-08-28 22:34 UTC|newest]
Thread overview: 6+ messages / expand[flat|nested] mbox.gz Atom feed top
2013-08-28 22:33 [Qemu-devel] [PATCH 0/5] tcg-arm ldst improvements Richard Henderson
2013-08-28 22:33 ` [Qemu-devel] [PATCH 1/5] tcg-arm: Use ldrd/strd for appropriate qemu_ld/st64 Richard Henderson
2013-08-28 22:33 ` [Qemu-devel] [PATCH 2/5] tcg-arm: Rearrange slow-path qemu_ld/st Richard Henderson
2013-08-28 22:33 ` [Qemu-devel] [PATCH 3/5] tcg-arm: Use strd for tcg_out_arg_reg64 Richard Henderson
2013-08-28 22:33 ` [Qemu-devel] [PATCH 4/5] tcg-arm: Use QEMU_BUILD_BUG_ON to verify constraints on tlb Richard Henderson
2013-08-28 22:33 ` Richard Henderson [this message]
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1377729213-2704-6-git-send-email-rth@twiddle.net \
--to=rth@twiddle.net \
--cc=aurelien@aurel32.net \
--cc=qemu-devel@nongnu.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for NNTP newsgroup(s).