qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed
* [Qemu-devel] [PATCH 0/5] tcg-arm ldst improvements
@ 2013-08-28 22:33 Richard Henderson
  2013-08-28 22:33 ` [Qemu-devel] [PATCH 1/5] tcg-arm: Use ldrd/strd for appropriate qemu_ld/st64 Richard Henderson
                   ` (4 more replies)
  0 siblings, 5 replies; 6+ messages in thread
From: Richard Henderson @ 2013-08-28 22:33 UTC (permalink / raw)
  To: qemu-devel; +Cc: aurelien

This is based on the "Further tcg ldst improvements" patch set.

The first patch is unchanged from the original "tcg ldst" patch set.

The second patch is modified to avoid defining local helpers in
the tcg/arm backend.  This is trivial now that we can use the real
return address, so the extra parameter can just be stored from R14.

The last three patches are new.  Stuff I noticed while examining
the disassembly dumps.

For convenience, the first patch set is

  git://github.com/rth7680/qemu.git tcg-ool-2

and this one is

  git://github.com/rth7680/qemu.git tcg-ool-arm-2


r~

Richard Henderson (5):
  tcg-arm: Use ldrd/strd for appropriate qemu_ld/st64
  tcg-arm: Rearrange slow-path qemu_ld/st
  tcg-arm: Use strd for tcg_out_arg_reg64
  tcg-arm: Use QEMU_BUILD_BUG_ON to verify constraints on tlb
  tcg-arm: Better pipeline for softmmu tlb access

 include/exec/exec-all.h |  14 --
 tcg/arm/tcg-target.c    | 397 ++++++++++++++++++++++++------------------------
 2 files changed, 202 insertions(+), 209 deletions(-)

-- 
1.8.1.4

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [Qemu-devel] [PATCH 1/5] tcg-arm: Use ldrd/strd for appropriate qemu_ld/st64
  2013-08-28 22:33 [Qemu-devel] [PATCH 0/5] tcg-arm ldst improvements Richard Henderson
@ 2013-08-28 22:33 ` Richard Henderson
  2013-08-28 22:33 ` [Qemu-devel] [PATCH 2/5] tcg-arm: Rearrange slow-path qemu_ld/st Richard Henderson
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 6+ messages in thread
From: Richard Henderson @ 2013-08-28 22:33 UTC (permalink / raw)
  To: qemu-devel; +Cc: aurelien

Reviewed-by: Aurelien Jarno <aurelien@aurel32.net>
Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 tcg/arm/tcg-target.c | 48 +++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 43 insertions(+), 5 deletions(-)

diff --git a/tcg/arm/tcg-target.c b/tcg/arm/tcg-target.c
index 6d084b3..9176930 100644
--- a/tcg/arm/tcg-target.c
+++ b/tcg/arm/tcg-target.c
@@ -320,6 +320,9 @@ typedef enum {
     INSN_STRB_REG  = 0x06400000,
 
     INSN_LDRD_IMM  = 0x004000d0,
+    INSN_LDRD_REG  = 0x000000d0,
+    INSN_STRD_IMM  = 0x004000f0,
+    INSN_STRD_REG  = 0x000000f0,
 } ARMInsn;
 
 #define SHIFT_IMM_LSL(im)	(((im) << 7) | 0x00)
@@ -810,6 +813,30 @@ static inline void tcg_out_st32_r(TCGContext *s, int cond, TCGReg rt,
     tcg_out_memop_r(s, cond, INSN_STR_REG, rt, rn, rm, 1, 1, 0);
 }
 
+static inline void tcg_out_ldrd_8(TCGContext *s, int cond, TCGReg rt,
+                                   TCGReg rn, int imm8)
+{
+    tcg_out_memop_8(s, cond, INSN_LDRD_IMM, rt, rn, imm8, 1, 0);
+}
+
+static inline void tcg_out_ldrd_r(TCGContext *s, int cond, TCGReg rt,
+                                  TCGReg rn, TCGReg rm)
+{
+    tcg_out_memop_r(s, cond, INSN_LDRD_REG, rt, rn, rm, 1, 1, 0);
+}
+
+static inline void tcg_out_strd_8(TCGContext *s, int cond, TCGReg rt,
+                                   TCGReg rn, int imm8)
+{
+    tcg_out_memop_8(s, cond, INSN_STRD_IMM, rt, rn, imm8, 1, 0);
+}
+
+static inline void tcg_out_strd_r(TCGContext *s, int cond, TCGReg rt,
+                                  TCGReg rn, TCGReg rm)
+{
+    tcg_out_memop_r(s, cond, INSN_STRD_REG, rt, rn, rm, 1, 1, 0);
+}
+
 /* Register pre-increment with base writeback.  */
 static inline void tcg_out_ld32_rwb(TCGContext *s, int cond, TCGReg rt,
                                     TCGReg rn, TCGReg rm)
@@ -1397,6 +1424,9 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, int opc)
             tcg_out_ld32_12(s, COND_AL, data_reg, TCG_REG_R1, 4);
             tcg_out_bswap32(s, COND_AL, data_reg2, data_reg2);
             tcg_out_bswap32(s, COND_AL, data_reg, data_reg);
+        } else if (use_armv6_instructions
+                   && (data_reg & 1) == 0 && data_reg2 == data_reg + 1) {
+            tcg_out_ldrd_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1);
         } else {
             tcg_out_ld32_rwb(s, COND_AL, data_reg, TCG_REG_R1, addr_reg);
             tcg_out_ld32_12(s, COND_AL, data_reg2, TCG_REG_R1, 4);
@@ -1450,9 +1480,13 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, int opc)
         }
         break;
     case 3:
-        /* TODO: use block load -
-         * check that data_reg2 > data_reg or the other way */
-        if (data_reg == addr_reg) {
+        if (use_armv6_instructions && !bswap
+            && (data_reg & 1) == 0 && data_reg2 == data_reg + 1) {
+            tcg_out_ldrd_8(s, COND_AL, data_reg, addr_reg, 0);
+        } else if (use_armv6_instructions && bswap
+                   && (data_reg2 & 1) == 0 && data_reg == data_reg2 + 1) {
+            tcg_out_ldrd_8(s, COND_AL, data_reg2, addr_reg, 0);
+        } else if (data_reg == addr_reg) {
             tcg_out_ld32_12(s, COND_AL, data_reg2, addr_reg, bswap ? 0 : 4);
             tcg_out_ld32_12(s, COND_AL, data_reg, addr_reg, bswap ? 4 : 0);
         } else {
@@ -1529,6 +1563,9 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, int opc)
             tcg_out_st32_rwb(s, COND_AL, TCG_REG_R0, TCG_REG_R1, addr_reg);
             tcg_out_bswap32(s, COND_AL, TCG_REG_R0, data_reg);
             tcg_out_st32_12(s, COND_AL, TCG_REG_R0, TCG_REG_R1, 4);
+        } else if (use_armv6_instructions
+                   && (data_reg & 1) == 0 && data_reg2 == data_reg + 1) {
+            tcg_out_strd_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1);
         } else {
             tcg_out_st32_rwb(s, COND_AL, data_reg, TCG_REG_R1, addr_reg);
             tcg_out_st32_12(s, COND_AL, data_reg2, TCG_REG_R1, 4);
@@ -1576,13 +1613,14 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, int opc)
         }
         break;
     case 3:
-        /* TODO: use block store -
-         * check that data_reg2 > data_reg or the other way */
         if (bswap) {
             tcg_out_bswap32(s, COND_AL, TCG_REG_R0, data_reg2);
             tcg_out_st32_12(s, COND_AL, TCG_REG_R0, addr_reg, 0);
             tcg_out_bswap32(s, COND_AL, TCG_REG_R0, data_reg);
             tcg_out_st32_12(s, COND_AL, TCG_REG_R0, addr_reg, 4);
+        } else if (use_armv6_instructions
+                   && (data_reg & 1) == 0 && data_reg2 == data_reg + 1) {
+            tcg_out_strd_8(s, COND_AL, data_reg, addr_reg, 0);
         } else {
             tcg_out_st32_12(s, COND_AL, data_reg, addr_reg, 0);
             tcg_out_st32_12(s, COND_AL, data_reg2, addr_reg, 4);
-- 
1.8.1.4

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [Qemu-devel] [PATCH 2/5] tcg-arm: Rearrange slow-path qemu_ld/st
  2013-08-28 22:33 [Qemu-devel] [PATCH 0/5] tcg-arm ldst improvements Richard Henderson
  2013-08-28 22:33 ` [Qemu-devel] [PATCH 1/5] tcg-arm: Use ldrd/strd for appropriate qemu_ld/st64 Richard Henderson
@ 2013-08-28 22:33 ` Richard Henderson
  2013-08-28 22:33 ` [Qemu-devel] [PATCH 3/5] tcg-arm: Use strd for tcg_out_arg_reg64 Richard Henderson
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 6+ messages in thread
From: Richard Henderson @ 2013-08-28 22:33 UTC (permalink / raw)
  To: qemu-devel; +Cc: aurelien

Instead of using a branch-call-branch sequence, arrange for a
call-branch sequence, using the ARM's conditional call insn.
This reduces the size of the slow-path within the TB, and makes
the GETPC_EXT implementation identical for TCG and not-TCG.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 include/exec/exec-all.h |  14 ---
 tcg/arm/tcg-target.c    | 270 +++++++++++++++++++++---------------------------
 2 files changed, 120 insertions(+), 164 deletions(-)

diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
index ffee83f..60c025c 100644
--- a/include/exec/exec-all.h
+++ b/include/exec/exec-all.h
@@ -326,20 +326,6 @@ extern uintptr_t tci_tb_ptr;
 #if defined(CONFIG_QEMU_LDST_OPTIMIZATION) && defined(CONFIG_SOFTMMU)
 # if defined (_ARCH_PPC) && !defined (_ARCH_PPC64)
 #  define GETRA_LDST(RA)   (*(int32_t *)((RA) - 4))
-# elif defined(__arm__)
-/* We define two insns between the return address and the branch back to
-   straight-line.  Find and decode that branch insn.  */
-#  define GETRA_LDST(RA)   tcg_getra_ldst(RA)
-static inline uintptr_t tcg_getra_ldst(uintptr_t ra)
-{
-    int32_t b;
-    ra += 8;                    /* skip the two insns */
-    b = *(int32_t *)ra;         /* load the branch insn */
-    b = (b << 8) >> (8 - 2);    /* extract the displacement */
-    ra += 8;                    /* branches are relative to pc+8 */
-    ra += b;                    /* apply the displacement */
-    return ra;
-}
 # elif defined(__aarch64__)
 #  define GETRA_LDST(RA)  tcg_getra_ldst(RA)
 static inline uintptr_t tcg_getra_ldst(uintptr_t ra)
diff --git a/tcg/arm/tcg-target.c b/tcg/arm/tcg-target.c
index 9176930..5f71f24 100644
--- a/tcg/arm/tcg-target.c
+++ b/tcg/arm/tcg-target.c
@@ -166,29 +166,37 @@ static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str)
         break;
 
     case 'r':
+#ifndef CONFIG_SOFTMMU
+    case 'a':
+    case 'b':
+#endif
         ct->ct |= TCG_CT_REG;
         tcg_regset_set32(ct->u.regs, 0, (1 << TCG_TARGET_NB_REGS) - 1);
         break;
 
+#ifdef CONFIG_SOFTMMU
+    /* qemu_ld data registers -- softmmu uses the return registers only.  */
+    case 'a':
+        ct->ct |= TCG_CT_REG;
+        tcg_regset_set32(ct->u.regs, 0, 1);
+        break;
+    case 'b':
+        ct->ct |= TCG_CT_REG;
+        tcg_regset_set32(ct->u.regs, 0, 2);
+        break;
+#endif
+
     /* qemu_ld address */
     case 'l':
         ct->ct |= TCG_CT_REG;
         tcg_regset_set32(ct->u.regs, 0, (1 << TCG_TARGET_NB_REGS) - 1);
 #ifdef CONFIG_SOFTMMU
-        /* r0-r2 will be overwritten when reading the tlb entry,
+        /* r0-r2,lr will be overwritten when reading the tlb entry,
            so don't use these. */
         tcg_regset_reset_reg(ct->u.regs, TCG_REG_R0);
         tcg_regset_reset_reg(ct->u.regs, TCG_REG_R1);
         tcg_regset_reset_reg(ct->u.regs, TCG_REG_R2);
-#endif
-        break;
-    case 'L':
-        ct->ct |= TCG_CT_REG;
-        tcg_regset_set32(ct->u.regs, 0, (1 << TCG_TARGET_NB_REGS) - 1);
-#ifdef CONFIG_SOFTMMU
-        /* r1 is still needed to load data_reg or data_reg2,
-           so don't use it. */
-        tcg_regset_reset_reg(ct->u.regs, TCG_REG_R1);
+        tcg_regset_reset_reg(ct->u.regs, TCG_REG_R14);
 #endif
         break;
 
@@ -207,6 +215,7 @@ static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str)
         /* Avoid clashes with registers being used for helper args */
         tcg_regset_reset_reg(ct->u.regs, TCG_REG_R3);
 #endif
+        tcg_regset_reset_reg(ct->u.regs, TCG_REG_R14);
 #endif
         break;
 
@@ -382,13 +391,17 @@ static inline void tcg_out_b_noaddr(TCGContext *s, int cond)
     /* We pay attention here to not modify the branch target by skipping
        the corresponding bytes. This ensure that caches and memory are
        kept coherent during retranslation. */
-#ifdef HOST_WORDS_BIGENDIAN
-    tcg_out8(s, (cond << 4) | 0x0a);
-    s->code_ptr += 3;
-#else
     s->code_ptr += 3;
     tcg_out8(s, (cond << 4) | 0x0a);
-#endif
+}
+
+static inline void tcg_out_bl_noaddr(TCGContext *s, int cond)
+{
+    /* We pay attention here to not modify the branch target by skipping
+       the corresponding bytes. This ensure that caches and memory are
+       kept coherent during retranslation. */
+    s->code_ptr += 3;
+    tcg_out8(s, (cond << 4) | 0x0b);
 }
 
 static inline void tcg_out_bl(TCGContext *s, int cond, int32_t offset)
@@ -1002,34 +1015,27 @@ static inline void tcg_out_st8(TCGContext *s, int cond,
         tcg_out_st8_12(s, cond, rd, rn, offset);
 }
 
-/* The _goto case is normally between TBs within the same code buffer,
- * and with the code buffer limited to 16MB we shouldn't need the long
- * case.
- *
- * .... except to the prologue that is in its own buffer.
+/* The _goto case is normally between TBs within the same code buffer, and
+ * with the code buffer limited to 16MB we wouldn't need the long case.
+ * But we also use it for the tail-call to the qemu_ld/st helpers, which does.
  */
 static inline void tcg_out_goto(TCGContext *s, int cond, uint32_t addr)
 {
-    int32_t val;
+    int32_t disp = addr - (tcg_target_long) s->code_ptr;
 
-    if (addr & 1) {
-        /* goto to a Thumb destination isn't supported */
-        tcg_abort();
+    if ((addr & 1) == 0 && disp - 8 < 0x01fffffd && disp - 8 > -0x01fffffd) {
+        tcg_out_b(s, cond, disp);
+        return;
     }
 
-    val = addr - (tcg_target_long) s->code_ptr;
-    if (val - 8 < 0x01fffffd && val - 8 > -0x01fffffd)
-        tcg_out_b(s, cond, val);
-    else {
-        if (cond == COND_AL) {
-            tcg_out_ld32_12(s, COND_AL, TCG_REG_PC, TCG_REG_PC, -4);
-            tcg_out32(s, addr);
-        } else {
-            tcg_out_movi32(s, cond, TCG_REG_TMP, val - 8);
-            tcg_out_dat_reg(s, cond, ARITH_ADD,
-                            TCG_REG_PC, TCG_REG_PC,
-                            TCG_REG_TMP, SHIFT_IMM_LSL(0));
+    tcg_out_movi32(s, cond, TCG_REG_TMP, addr);
+    if (use_armv5t_instructions) {
+        tcg_out_bx(s, cond, TCG_REG_TMP);
+    } else {
+        if (addr & 1) {
+            tcg_abort();
         }
+        tcg_out_mov_reg(s, cond, TCG_REG_PC, TCG_REG_TMP);
     }
 }
 
@@ -1084,23 +1090,29 @@ static inline void tcg_out_goto_label(TCGContext *s, int cond, int label_index)
 }
 
 #ifdef CONFIG_SOFTMMU
-
-/* helper signature: helper_ld_mmu(CPUState *env, target_ulong addr,
-   int mmu_idx) */
-static const void * const qemu_ld_helpers[4] = {
-    helper_ldb_mmu,
-    helper_ldw_mmu,
-    helper_ldl_mmu,
-    helper_ldq_mmu,
+/* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
+ *                                     int mmu_idx, uintptr_t ra)
+ */
+static const void * const qemu_ld_helpers[8] = {
+    helper_ret_ldub_mmu,
+    helper_ret_lduw_mmu,
+    helper_ret_ldul_mmu,
+    helper_ret_ldq_mmu,
+
+    helper_ret_ldsb_mmu,
+    helper_ret_ldsw_mmu,
+    helper_ret_ldul_mmu,
+    helper_ret_ldq_mmu,
 };
 
-/* helper signature: helper_st_mmu(CPUState *env, target_ulong addr,
-   uintxx_t val, int mmu_idx) */
+/* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr,
+ *                                     uintxx_t val, int mmu_idx, uintptr_t ra)
+ */
 static const void * const qemu_st_helpers[4] = {
-    helper_stb_mmu,
-    helper_stw_mmu,
-    helper_stl_mmu,
-    helper_stq_mmu,
+    helper_ret_stb_mmu,
+    helper_ret_stw_mmu,
+    helper_ret_stl_mmu,
+    helper_ret_stq_mmu,
 };
 
 /* Helper routines for marshalling helper function arguments into
@@ -1258,8 +1270,7 @@ static void add_qemu_ldst_label(TCGContext *s, int is_ld, int opc,
 
 static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
 {
-    TCGReg argreg, data_reg, data_reg2;
-    uint8_t *start;
+    TCGReg argreg;
 
     reloc_pc24(lb->label_ptr[0], (tcg_target_long)s->code_ptr);
 
@@ -1270,49 +1281,10 @@ static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
         argreg = tcg_out_arg_reg32(s, argreg, lb->addrlo_reg);
     }
     argreg = tcg_out_arg_imm32(s, argreg, lb->mem_index);
-    tcg_out_call(s, (tcg_target_long) qemu_ld_helpers[lb->opc & 3]);
+    argreg = tcg_out_arg_reg32(s, argreg, TCG_REG_R14);
 
-    data_reg = lb->datalo_reg;
-    data_reg2 = lb->datahi_reg;
-
-    start = s->code_ptr;
-    switch (lb->opc) {
-    case 0 | 4:
-        tcg_out_ext8s(s, COND_AL, data_reg, TCG_REG_R0);
-        break;
-    case 1 | 4:
-        tcg_out_ext16s(s, COND_AL, data_reg, TCG_REG_R0);
-        break;
-    case 0:
-    case 1:
-    case 2:
-    default:
-        tcg_out_mov_reg(s, COND_AL, data_reg, TCG_REG_R0);
-        break;
-    case 3:
-        tcg_out_mov_reg(s, COND_AL, data_reg, TCG_REG_R0);
-        tcg_out_mov_reg(s, COND_AL, data_reg2, TCG_REG_R1);
-        break;
-    }
-
-    /* For GETPC_LDST in exec-all.h, we architect exactly 2 insns between
-       the call and the branch back to straight-line code.  Note that the
-       moves above could be elided by register allocation, nor do we know
-       which code alternative we chose for extension.  */
-    switch (s->code_ptr - start) {
-    case 0:
-        tcg_out_nop(s);
-        /* FALLTHRU */
-    case 4:
-        tcg_out_nop(s);
-        /* FALLTHRU */
-    case 8:
-        break;
-    default:
-        abort();
-    }
-
-    tcg_out_goto(s, COND_AL, (tcg_target_long)lb->raddr);
+    /* Tail-call to the helper, which will return to the fast path.  */
+    tcg_out_goto(s, COND_AL, (tcg_target_long) qemu_ld_helpers[lb->opc]);
 }
 
 static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
@@ -1321,8 +1293,7 @@ static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
 
     reloc_pc24(lb->label_ptr[0], (tcg_target_long)s->code_ptr);
 
-    argreg = TCG_REG_R0;
-    argreg = tcg_out_arg_reg32(s, argreg, TCG_AREG0);
+    argreg = tcg_out_arg_reg32(s, TCG_REG_R0, TCG_AREG0);
     if (TARGET_LONG_BITS == 64) {
         argreg = tcg_out_arg_reg64(s, argreg, lb->addrlo_reg, lb->addrhi_reg);
     } else {
@@ -1347,13 +1318,10 @@ static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
     }
 
     argreg = tcg_out_arg_imm32(s, argreg, lb->mem_index);
-    tcg_out_call(s, (tcg_target_long) qemu_st_helpers[lb->opc & 3]);
+    argreg = tcg_out_arg_reg32(s, argreg, TCG_REG_R14);
 
-    /* For GETPC_LDST in exec-all.h, we architect exactly 2 insns between
-       the call and the branch back to straight-line code.  */
-    tcg_out_nop(s);
-    tcg_out_nop(s);
-    tcg_out_goto(s, COND_AL, (tcg_target_long)lb->raddr);
+    /* Tail-call to the helper, which will return to the fast path.  */
+    tcg_out_goto(s, COND_AL, (tcg_target_long) qemu_st_helpers[lb->opc & 3]);
 }
 #endif /* SOFTMMU */
 
@@ -1383,57 +1351,58 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, int opc)
     tcg_out_tlb_read(s, addr_reg, addr_reg2, s_bits,
                      offsetof(CPUArchState, tlb_table[mem_index][0].addr_read));
 
-    label_ptr = s->code_ptr;
-    tcg_out_b_noaddr(s, COND_NE);
-
     tcg_out_ld32_12(s, COND_AL, TCG_REG_R1, TCG_REG_R2,
                     offsetof(CPUTLBEntry, addend)
                     - offsetof(CPUTLBEntry, addr_read));
 
     switch (opc) {
     case 0:
-        tcg_out_ld8_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1);
+        tcg_out_ld8_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
         break;
     case 0 | 4:
-        tcg_out_ld8s_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1);
+        tcg_out_ld8s_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
         break;
     case 1:
-        tcg_out_ld16u_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1);
+        tcg_out_ld16u_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
         if (bswap) {
-            tcg_out_bswap16(s, COND_AL, data_reg, data_reg);
+            tcg_out_bswap16(s, COND_EQ, data_reg, data_reg);
         }
         break;
     case 1 | 4:
         if (bswap) {
-            tcg_out_ld16u_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1);
-            tcg_out_bswap16s(s, COND_AL, data_reg, data_reg);
+            tcg_out_ld16u_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
+            tcg_out_bswap16s(s, COND_EQ, data_reg, data_reg);
         } else {
-            tcg_out_ld16s_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1);
+            tcg_out_ld16s_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
         }
         break;
     case 2:
     default:
-        tcg_out_ld32_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1);
+        tcg_out_ld32_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
         if (bswap) {
-            tcg_out_bswap32(s, COND_AL, data_reg, data_reg);
+            tcg_out_bswap32(s, COND_EQ, data_reg, data_reg);
         }
         break;
     case 3:
         if (bswap) {
-            tcg_out_ld32_rwb(s, COND_AL, data_reg2, TCG_REG_R1, addr_reg);
-            tcg_out_ld32_12(s, COND_AL, data_reg, TCG_REG_R1, 4);
-            tcg_out_bswap32(s, COND_AL, data_reg2, data_reg2);
-            tcg_out_bswap32(s, COND_AL, data_reg, data_reg);
+            tcg_out_ld32_rwb(s, COND_EQ, data_reg2, TCG_REG_R1, addr_reg);
+            tcg_out_ld32_12(s, COND_EQ, data_reg, TCG_REG_R1, 4);
+            tcg_out_bswap32(s, COND_EQ, data_reg2, data_reg2);
+            tcg_out_bswap32(s, COND_EQ, data_reg, data_reg);
         } else if (use_armv6_instructions
                    && (data_reg & 1) == 0 && data_reg2 == data_reg + 1) {
-            tcg_out_ldrd_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1);
+            tcg_out_ldrd_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
         } else {
-            tcg_out_ld32_rwb(s, COND_AL, data_reg, TCG_REG_R1, addr_reg);
-            tcg_out_ld32_12(s, COND_AL, data_reg2, TCG_REG_R1, 4);
+            tcg_out_ld32_rwb(s, COND_EQ, data_reg, TCG_REG_R1, addr_reg);
+            tcg_out_ld32_12(s, COND_EQ, data_reg2, TCG_REG_R1, 4);
         }
         break;
     }
 
+    /* The conditional call must come last, as we're going to return here.  */
+    label_ptr = s->code_ptr;
+    tcg_out_bl_noaddr(s, COND_NE);
+
     add_qemu_ldst_label(s, 1, opc, data_reg, data_reg2, addr_reg, addr_reg2,
                         mem_index, s->code_ptr, label_ptr);
 #else /* !CONFIG_SOFTMMU */
@@ -1529,50 +1498,51 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, int opc)
                      offsetof(CPUArchState,
                               tlb_table[mem_index][0].addr_write));
 
-    label_ptr = s->code_ptr;
-    tcg_out_b_noaddr(s, COND_NE);
-
     tcg_out_ld32_12(s, COND_AL, TCG_REG_R1, TCG_REG_R2,
                     offsetof(CPUTLBEntry, addend)
                     - offsetof(CPUTLBEntry, addr_write));
 
     switch (opc) {
     case 0:
-        tcg_out_st8_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1);
+        tcg_out_st8_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
         break;
     case 1:
         if (bswap) {
-            tcg_out_bswap16st(s, COND_AL, TCG_REG_R0, data_reg);
-            tcg_out_st16_r(s, COND_AL, TCG_REG_R0, addr_reg, TCG_REG_R1);
+            tcg_out_bswap16st(s, COND_EQ, TCG_REG_R0, data_reg);
+            tcg_out_st16_r(s, COND_EQ, TCG_REG_R0, addr_reg, TCG_REG_R1);
         } else {
-            tcg_out_st16_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1);
+            tcg_out_st16_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
         }
         break;
     case 2:
     default:
         if (bswap) {
-            tcg_out_bswap32(s, COND_AL, TCG_REG_R0, data_reg);
-            tcg_out_st32_r(s, COND_AL, TCG_REG_R0, addr_reg, TCG_REG_R1);
+            tcg_out_bswap32(s, COND_EQ, TCG_REG_R0, data_reg);
+            tcg_out_st32_r(s, COND_EQ, TCG_REG_R0, addr_reg, TCG_REG_R1);
         } else {
-            tcg_out_st32_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1);
+            tcg_out_st32_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
         }
         break;
     case 3:
         if (bswap) {
-            tcg_out_bswap32(s, COND_AL, TCG_REG_R0, data_reg2);
-            tcg_out_st32_rwb(s, COND_AL, TCG_REG_R0, TCG_REG_R1, addr_reg);
-            tcg_out_bswap32(s, COND_AL, TCG_REG_R0, data_reg);
-            tcg_out_st32_12(s, COND_AL, TCG_REG_R0, TCG_REG_R1, 4);
+            tcg_out_bswap32(s, COND_EQ, TCG_REG_R0, data_reg2);
+            tcg_out_st32_rwb(s, COND_EQ, TCG_REG_R0, TCG_REG_R1, addr_reg);
+            tcg_out_bswap32(s, COND_EQ, TCG_REG_R0, data_reg);
+            tcg_out_st32_12(s, COND_EQ, TCG_REG_R0, TCG_REG_R1, 4);
         } else if (use_armv6_instructions
                    && (data_reg & 1) == 0 && data_reg2 == data_reg + 1) {
-            tcg_out_strd_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1);
+            tcg_out_strd_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
         } else {
-            tcg_out_st32_rwb(s, COND_AL, data_reg, TCG_REG_R1, addr_reg);
-            tcg_out_st32_12(s, COND_AL, data_reg2, TCG_REG_R1, 4);
+            tcg_out_st32_rwb(s, COND_EQ, data_reg, TCG_REG_R1, addr_reg);
+            tcg_out_st32_12(s, COND_EQ, data_reg2, TCG_REG_R1, 4);
         }
         break;
     }
 
+    /* The conditional call must come last, as we're going to return here.  */
+    label_ptr = s->code_ptr;
+    tcg_out_bl_noaddr(s, COND_NE);
+
     add_qemu_ldst_label(s, 0, opc, data_reg, data_reg2, addr_reg, addr_reg2,
                         mem_index, s->code_ptr, label_ptr);
 #else /* !CONFIG_SOFTMMU */
@@ -2024,24 +1994,24 @@ static const TCGTargetOpDef arm_op_defs[] = {
     { INDEX_op_setcond2_i32, { "r", "r", "r", "rIN", "rIN" } },
 
 #if TARGET_LONG_BITS == 32
-    { INDEX_op_qemu_ld8u, { "r", "l" } },
-    { INDEX_op_qemu_ld8s, { "r", "l" } },
-    { INDEX_op_qemu_ld16u, { "r", "l" } },
-    { INDEX_op_qemu_ld16s, { "r", "l" } },
-    { INDEX_op_qemu_ld32, { "r", "l" } },
-    { INDEX_op_qemu_ld64, { "L", "L", "l" } },
+    { INDEX_op_qemu_ld8u, { "a", "l" } },
+    { INDEX_op_qemu_ld8s, { "a", "l" } },
+    { INDEX_op_qemu_ld16u, { "a", "l" } },
+    { INDEX_op_qemu_ld16s, { "a", "l" } },
+    { INDEX_op_qemu_ld32, { "a", "l" } },
+    { INDEX_op_qemu_ld64, { "a", "b", "l" } },
 
     { INDEX_op_qemu_st8, { "s", "s" } },
     { INDEX_op_qemu_st16, { "s", "s" } },
     { INDEX_op_qemu_st32, { "s", "s" } },
     { INDEX_op_qemu_st64, { "s", "s", "s" } },
 #else
-    { INDEX_op_qemu_ld8u, { "r", "l", "l" } },
-    { INDEX_op_qemu_ld8s, { "r", "l", "l" } },
-    { INDEX_op_qemu_ld16u, { "r", "l", "l" } },
-    { INDEX_op_qemu_ld16s, { "r", "l", "l" } },
-    { INDEX_op_qemu_ld32, { "r", "l", "l" } },
-    { INDEX_op_qemu_ld64, { "L", "L", "l", "l" } },
+    { INDEX_op_qemu_ld8u, { "a", "l", "l" } },
+    { INDEX_op_qemu_ld8s, { "a", "l", "l" } },
+    { INDEX_op_qemu_ld16u, { "a", "l", "l" } },
+    { INDEX_op_qemu_ld16s, { "a", "l", "l" } },
+    { INDEX_op_qemu_ld32, { "a", "l", "l" } },
+    { INDEX_op_qemu_ld64, { "a", "b", "l", "l" } },
 
     { INDEX_op_qemu_st8, { "s", "s", "s" } },
     { INDEX_op_qemu_st16, { "s", "s", "s" } },
-- 
1.8.1.4

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [Qemu-devel] [PATCH 3/5] tcg-arm: Use strd for tcg_out_arg_reg64
  2013-08-28 22:33 [Qemu-devel] [PATCH 0/5] tcg-arm ldst improvements Richard Henderson
  2013-08-28 22:33 ` [Qemu-devel] [PATCH 1/5] tcg-arm: Use ldrd/strd for appropriate qemu_ld/st64 Richard Henderson
  2013-08-28 22:33 ` [Qemu-devel] [PATCH 2/5] tcg-arm: Rearrange slow-path qemu_ld/st Richard Henderson
@ 2013-08-28 22:33 ` Richard Henderson
  2013-08-28 22:33 ` [Qemu-devel] [PATCH 4/5] tcg-arm: Use QEMU_BUILD_BUG_ON to verify constraints on tlb Richard Henderson
  2013-08-28 22:33 ` [Qemu-devel] [PATCH 5/5] tcg-arm: Better pipeline for softmmu tlb access Richard Henderson
  4 siblings, 0 replies; 6+ messages in thread
From: Richard Henderson @ 2013-08-28 22:33 UTC (permalink / raw)
  To: qemu-devel; +Cc: aurelien

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 tcg/arm/tcg-target.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/tcg/arm/tcg-target.c b/tcg/arm/tcg-target.c
index 5f71f24..a0a7539 100644
--- a/tcg/arm/tcg-target.c
+++ b/tcg/arm/tcg-target.c
@@ -1156,9 +1156,16 @@ static TCGReg tcg_out_arg_reg64(TCGContext *s, TCGReg argreg,
     if (argreg & 1) {
         argreg++;
     }
-    argreg = tcg_out_arg_reg32(s, argreg, arglo);
-    argreg = tcg_out_arg_reg32(s, argreg, arghi);
-    return argreg;
+    if (use_armv6_instructions && argreg >= 4
+        && (arglo & 1) == 0 && arghi == arglo + 1) {
+        tcg_out_strd_8(s, COND_AL, arglo,
+                       TCG_REG_CALL_STACK, (argreg - 4) * 4);
+        return argreg + 2;
+    } else {
+        argreg = tcg_out_arg_reg32(s, argreg, arglo);
+        argreg = tcg_out_arg_reg32(s, argreg, arghi);
+        return argreg;
+    }
 }
 
 #define TLB_SHIFT	(CPU_TLB_ENTRY_BITS + CPU_TLB_BITS)
-- 
1.8.1.4

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [Qemu-devel] [PATCH 4/5] tcg-arm: Use QEMU_BUILD_BUG_ON to verify constraints on tlb
  2013-08-28 22:33 [Qemu-devel] [PATCH 0/5] tcg-arm ldst improvements Richard Henderson
                   ` (2 preceding siblings ...)
  2013-08-28 22:33 ` [Qemu-devel] [PATCH 3/5] tcg-arm: Use strd for tcg_out_arg_reg64 Richard Henderson
@ 2013-08-28 22:33 ` Richard Henderson
  2013-08-28 22:33 ` [Qemu-devel] [PATCH 5/5] tcg-arm: Better pipeline for softmmu tlb access Richard Henderson
  4 siblings, 0 replies; 6+ messages in thread
From: Richard Henderson @ 2013-08-28 22:33 UTC (permalink / raw)
  To: qemu-devel; +Cc: aurelien

One of the two constraints we already checked via #if, but
the tlb offset distance was only checked at runtime.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 tcg/arm/tcg-target.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/tcg/arm/tcg-target.c b/tcg/arm/tcg-target.c
index a0a7539..f1e547f 100644
--- a/tcg/arm/tcg-target.c
+++ b/tcg/arm/tcg-target.c
@@ -1170,6 +1170,15 @@ static TCGReg tcg_out_arg_reg64(TCGContext *s, TCGReg argreg,
 
 #define TLB_SHIFT	(CPU_TLB_ENTRY_BITS + CPU_TLB_BITS)
 
+/* We're expecting to use an 8-bit immediate and to mask.  */
+QEMU_BUILD_BUG_ON(CPU_TLB_BITS > 8);
+
+/* We're expecting to use an 8-bit immediate add + 8-bit ldrd offset.
+   Using the offset of the second entry in the last tlb table ensures
+   that we can index all of the elements of the first entry.  */
+QEMU_BUILD_BUG_ON(offsetof(CPUArchState, tlb_table[NB_MMU_MODES - 1][1])
+                  > 0xffff);
+
 /* Load and compare a TLB entry, leaving the flags set.  Leaves R2 pointing
    to the tlb entry.  Clobbers R1 and TMP.  */
 
@@ -1197,14 +1206,10 @@ static void tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
      *   ldr    r0, [r2, r0]!                                     (3)
      *   cmp    r0, tmp                                           (4)
      */
-#  if CPU_TLB_BITS > 8
-#   error
-#  endif
     tcg_out_dat_reg(s, COND_AL, ARITH_MOV, TCG_REG_TMP,
                     0, addrlo, SHIFT_IMM_LSR(TARGET_PAGE_BITS));
 
-    /* We assume that the offset is contained within 16 bits.  */
-    assert((tlb_offset & ~0xffff) == 0);
+    /* We checked that the offset is contained within 16 bits above.  */
     if (tlb_offset > 0xff) {
         tcg_out_dat_imm(s, COND_AL, ARITH_ADD, TCG_REG_R2, base,
                         (24 << 7) | (tlb_offset >> 8));
-- 
1.8.1.4

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [Qemu-devel] [PATCH 5/5] tcg-arm: Better pipeline for softmmu tlb access
  2013-08-28 22:33 [Qemu-devel] [PATCH 0/5] tcg-arm ldst improvements Richard Henderson
                   ` (3 preceding siblings ...)
  2013-08-28 22:33 ` [Qemu-devel] [PATCH 4/5] tcg-arm: Use QEMU_BUILD_BUG_ON to verify constraints on tlb Richard Henderson
@ 2013-08-28 22:33 ` Richard Henderson
  4 siblings, 0 replies; 6+ messages in thread
From: Richard Henderson @ 2013-08-28 22:33 UTC (permalink / raw)
  To: qemu-devel; +Cc: aurelien

Moves the load for the tlb addend earlier, to better load latency.
Avoids the writeback from the comparator load, since we know how
to adjust the offset between the two loads.

 :  e2862c03      add   r2, r6, #768    ; 0x300
 :  e20c00ff      and   r0, ip, #255    ; 0xff
 :  e0822280      add   r2, r2, r0, lsl #5
-:  e1e209d8      ldrd  r0, [r2, #152]!
+:  e1c209d8      ldrd  r0, [r2, #152]
 :  e31b0007      tst   fp, #7  ; 0x7
+:  e59220a8      ldr   r2, [r2, #168]
 :  0150068c      cmpeq r0, ip, lsl #13
 :  01510007      cmpeq r1, r7
-:  e5921010      ldr   r1, [r2, #16]
-:  018b40f1      strdeq        r4, [fp, r1]
+:  018b40f2      strdeq        r4, [fp, r2]
 :  1b0000cb      blne  0x75e7e6e4

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 tcg/arm/tcg-target.c | 97 +++++++++++++++++++++++-----------------------------
 1 file changed, 42 insertions(+), 55 deletions(-)

diff --git a/tcg/arm/tcg-target.c b/tcg/arm/tcg-target.c
index f1e547f..6d03d6b 100644
--- a/tcg/arm/tcg-target.c
+++ b/tcg/arm/tcg-target.c
@@ -1179,13 +1179,18 @@ QEMU_BUILD_BUG_ON(CPU_TLB_BITS > 8);
 QEMU_BUILD_BUG_ON(offsetof(CPUArchState, tlb_table[NB_MMU_MODES - 1][1])
                   > 0xffff);
 
-/* Load and compare a TLB entry, leaving the flags set.  Leaves R2 pointing
-   to the tlb entry.  Clobbers R1 and TMP.  */
+/* Load and compare a TLB entry, leaving the flags set.  Leaves R2 
+   containing the tlb addend.  Clobbers R0, R1 and TMP.  */
 
 static void tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
-                             int s_bits, int tlb_offset)
+                             int s_bits, int mem_index, bool is_load)
 {
     TCGReg base = TCG_AREG0;
+    int tlb_offset = 
+        (is_load
+         ? offsetof(CPUArchState, tlb_table[mem_index][0].addr_read)
+         : offsetof(CPUArchState, tlb_table[mem_index][0].addr_write));
+    int add_offset = offsetof(CPUArchState, tlb_table[mem_index][0].addend);
 
     /* Should generate something like the following:
      * pre-v7:
@@ -1193,18 +1198,10 @@ static void tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
      *   add    r2, env, #off & 0xff00
      *   and    r0, tmp, #(CPU_TLB_SIZE - 1)                      (2)
      *   add    r2, r2, r0, lsl #CPU_TLB_ENTRY_BITS               (3)
-     *   ldr    r0, [r2, #off & 0xff]!                            (4)
+     *   ldr    r0, [r2, #off & 0xff]                             (4)
      *   tst    addr_reg, #s_mask
-     *   cmpeq  r0, tmp, lsl #TARGET_PAGE_BITS                    (5)
-     *
-     * v7 (not implemented yet):
-     *   ubfx   r2, addr_reg, #TARGET_PAGE_BITS, #CPU_TLB_BITS    (1)
-     *   movw   tmp, #~TARGET_PAGE_MASK & ~s_mask
-     *   movw   r0, #off
-     *   add    r2, env, r2, lsl #CPU_TLB_ENTRY_BITS              (2)
-     *   bic    tmp, addr_reg, tmp
-     *   ldr    r0, [r2, r0]!                                     (3)
-     *   cmp    r0, tmp                                           (4)
+     *   ldr    r2, [r2, #addoff]                                 (5)
+     *   cmpeq  r0, tmp, lsl #TARGET_PAGE_BITS
      */
     tcg_out_dat_reg(s, COND_AL, ARITH_MOV, TCG_REG_TMP,
                     0, addrlo, SHIFT_IMM_LSR(TARGET_PAGE_BITS));
@@ -1213,7 +1210,6 @@ static void tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
     if (tlb_offset > 0xff) {
         tcg_out_dat_imm(s, COND_AL, ARITH_ADD, TCG_REG_R2, base,
                         (24 << 7) | (tlb_offset >> 8));
-        tlb_offset &= 0xff;
         base = TCG_REG_R2;
     }
 
@@ -1226,14 +1222,12 @@ static void tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
        but due to how the pointer needs setting up, ldm isn't useful.
        Base arm5 doesn't have ldrd, but armv5te does.  */
     if (use_armv6_instructions && TARGET_LONG_BITS == 64) {
-        tcg_out_memop_8(s, COND_AL, INSN_LDRD_IMM, TCG_REG_R0,
-                        TCG_REG_R2, tlb_offset, 1, 1);
+        tcg_out_ldrd_8(s, COND_AL, TCG_REG_R0, TCG_REG_R2, tlb_offset & 0xff);
     } else {
-        tcg_out_memop_12(s, COND_AL, INSN_LDR_IMM, TCG_REG_R0,
-                         TCG_REG_R2, tlb_offset, 1, 1);
+        tcg_out_ld32_12(s, COND_AL, TCG_REG_R0, TCG_REG_R2, tlb_offset & 0xff);
         if (TARGET_LONG_BITS == 64) {
-            tcg_out_memop_12(s, COND_AL, INSN_LDR_IMM, TCG_REG_R1,
-                             TCG_REG_R2, 4, 1, 0);
+            tcg_out_ld32_12(s, COND_AL, TCG_REG_R1, TCG_REG_R2,
+			    (tlb_offset & 0xff) + 4);
         }
     }
 
@@ -1243,6 +1237,10 @@ static void tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
                         0, addrlo, (1 << s_bits) - 1);
     }
 
+    /* Load the tlb addend.  */
+    tcg_out_ld32_12(s, COND_AL, TCG_REG_R2, TCG_REG_R2,
+                    add_offset - (tlb_offset & 0xff00));
+
     tcg_out_dat_reg(s, (s_bits ? COND_EQ : COND_AL), ARITH_CMP, 0,
                     TCG_REG_R0, TCG_REG_TMP, SHIFT_IMM_LSL(TARGET_PAGE_BITS));
 
@@ -1360,53 +1358,48 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, int opc)
     mem_index = *args;
     s_bits = opc & 3;
 
-    tcg_out_tlb_read(s, addr_reg, addr_reg2, s_bits,
-                     offsetof(CPUArchState, tlb_table[mem_index][0].addr_read));
-
-    tcg_out_ld32_12(s, COND_AL, TCG_REG_R1, TCG_REG_R2,
-                    offsetof(CPUTLBEntry, addend)
-                    - offsetof(CPUTLBEntry, addr_read));
+    tcg_out_tlb_read(s, addr_reg, addr_reg2, s_bits, mem_index, 1);
 
     switch (opc) {
     case 0:
-        tcg_out_ld8_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
+        tcg_out_ld8_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R2);
         break;
     case 0 | 4:
-        tcg_out_ld8s_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
+        tcg_out_ld8s_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R2);
         break;
     case 1:
-        tcg_out_ld16u_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
+        tcg_out_ld16u_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R2);
         if (bswap) {
             tcg_out_bswap16(s, COND_EQ, data_reg, data_reg);
         }
         break;
     case 1 | 4:
         if (bswap) {
-            tcg_out_ld16u_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
+            tcg_out_ld16u_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R2);
             tcg_out_bswap16s(s, COND_EQ, data_reg, data_reg);
         } else {
-            tcg_out_ld16s_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
+            tcg_out_ld16s_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R2);
         }
         break;
     case 2:
     default:
-        tcg_out_ld32_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
+        tcg_out_ld32_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R2);
         if (bswap) {
             tcg_out_bswap32(s, COND_EQ, data_reg, data_reg);
         }
         break;
     case 3:
         if (bswap) {
-            tcg_out_ld32_rwb(s, COND_EQ, data_reg2, TCG_REG_R1, addr_reg);
-            tcg_out_ld32_12(s, COND_EQ, data_reg, TCG_REG_R1, 4);
+            tcg_out_ld32_rwb(s, COND_EQ, data_reg2, TCG_REG_R2, addr_reg);
+            tcg_out_ld32_12(s, COND_EQ, data_reg, TCG_REG_R2, 4);
             tcg_out_bswap32(s, COND_EQ, data_reg2, data_reg2);
             tcg_out_bswap32(s, COND_EQ, data_reg, data_reg);
         } else if (use_armv6_instructions
                    && (data_reg & 1) == 0 && data_reg2 == data_reg + 1) {
-            tcg_out_ldrd_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
+            tcg_out_ldrd_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R2);
         } else {
-            tcg_out_ld32_rwb(s, COND_EQ, data_reg, TCG_REG_R1, addr_reg);
-            tcg_out_ld32_12(s, COND_EQ, data_reg2, TCG_REG_R1, 4);
+            tcg_out_ld32_rwb(s, COND_EQ, data_reg, TCG_REG_R2, addr_reg);
+            tcg_out_ld32_12(s, COND_EQ, data_reg2, TCG_REG_R2, 4);
         }
         break;
     }
@@ -1506,47 +1499,41 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, int opc)
     mem_index = *args;
     s_bits = opc & 3;
 
-    tcg_out_tlb_read(s, addr_reg, addr_reg2, s_bits,
-                     offsetof(CPUArchState,
-                              tlb_table[mem_index][0].addr_write));
-
-    tcg_out_ld32_12(s, COND_AL, TCG_REG_R1, TCG_REG_R2,
-                    offsetof(CPUTLBEntry, addend)
-                    - offsetof(CPUTLBEntry, addr_write));
+    tcg_out_tlb_read(s, addr_reg, addr_reg2, s_bits, mem_index, 0);
 
     switch (opc) {
     case 0:
-        tcg_out_st8_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
+        tcg_out_st8_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R2);
         break;
     case 1:
         if (bswap) {
             tcg_out_bswap16st(s, COND_EQ, TCG_REG_R0, data_reg);
-            tcg_out_st16_r(s, COND_EQ, TCG_REG_R0, addr_reg, TCG_REG_R1);
+            tcg_out_st16_r(s, COND_EQ, TCG_REG_R0, addr_reg, TCG_REG_R2);
         } else {
-            tcg_out_st16_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
+            tcg_out_st16_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R2);
         }
         break;
     case 2:
     default:
         if (bswap) {
             tcg_out_bswap32(s, COND_EQ, TCG_REG_R0, data_reg);
-            tcg_out_st32_r(s, COND_EQ, TCG_REG_R0, addr_reg, TCG_REG_R1);
+            tcg_out_st32_r(s, COND_EQ, TCG_REG_R0, addr_reg, TCG_REG_R2);
         } else {
-            tcg_out_st32_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
+            tcg_out_st32_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R2);
         }
         break;
     case 3:
         if (bswap) {
             tcg_out_bswap32(s, COND_EQ, TCG_REG_R0, data_reg2);
-            tcg_out_st32_rwb(s, COND_EQ, TCG_REG_R0, TCG_REG_R1, addr_reg);
+            tcg_out_st32_rwb(s, COND_EQ, TCG_REG_R0, TCG_REG_R2, addr_reg);
             tcg_out_bswap32(s, COND_EQ, TCG_REG_R0, data_reg);
-            tcg_out_st32_12(s, COND_EQ, TCG_REG_R0, TCG_REG_R1, 4);
+            tcg_out_st32_12(s, COND_EQ, TCG_REG_R0, TCG_REG_R2, 4);
         } else if (use_armv6_instructions
                    && (data_reg & 1) == 0 && data_reg2 == data_reg + 1) {
-            tcg_out_strd_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1);
+            tcg_out_strd_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R2);
         } else {
-            tcg_out_st32_rwb(s, COND_EQ, data_reg, TCG_REG_R1, addr_reg);
-            tcg_out_st32_12(s, COND_EQ, data_reg2, TCG_REG_R1, 4);
+            tcg_out_st32_rwb(s, COND_EQ, data_reg, TCG_REG_R2, addr_reg);
+            tcg_out_st32_12(s, COND_EQ, data_reg2, TCG_REG_R2, 4);
         }
         break;
     }
-- 
1.8.1.4

^ permalink raw reply related	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2013-08-28 22:34 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2013-08-28 22:33 [Qemu-devel] [PATCH 0/5] tcg-arm ldst improvements Richard Henderson
2013-08-28 22:33 ` [Qemu-devel] [PATCH 1/5] tcg-arm: Use ldrd/strd for appropriate qemu_ld/st64 Richard Henderson
2013-08-28 22:33 ` [Qemu-devel] [PATCH 2/5] tcg-arm: Rearrange slow-path qemu_ld/st Richard Henderson
2013-08-28 22:33 ` [Qemu-devel] [PATCH 3/5] tcg-arm: Use strd for tcg_out_arg_reg64 Richard Henderson
2013-08-28 22:33 ` [Qemu-devel] [PATCH 4/5] tcg-arm: Use QEMU_BUILD_BUG_ON to verify constraints on tlb Richard Henderson
2013-08-28 22:33 ` [Qemu-devel] [PATCH 5/5] tcg-arm: Better pipeline for softmmu tlb access Richard Henderson

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).