qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed
From: Richard Henderson <rth@twiddle.net>
To: qemu-devel@nongnu.org
Subject: [Qemu-devel] [PATCH for-2.11 21/23] tcg/ppc: Change TCG_REG_RA to TCG_REG_TB
Date: Thu,  3 Aug 2017 22:44:24 -0700	[thread overview]
Message-ID: <20170804054426.10590-22-rth@twiddle.net> (raw)
In-Reply-To: <20170804054426.10590-1-rth@twiddle.net>

At this point the conversion is a wash.  Loading of TB+ofs is
smaller, but the actual return address from exit_tb is larger.
There are a few more insns required to transition between TBs.

But the expectation is that accesses to the constant pool will
on the whole be smaller.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 tcg/ppc/tcg-target.inc.c | 273 +++++++++++++++++++++--------------------------
 1 file changed, 122 insertions(+), 151 deletions(-)

diff --git a/tcg/ppc/tcg-target.inc.c b/tcg/ppc/tcg-target.inc.c
index d772faf7be..bc14d2c9c6 100644
--- a/tcg/ppc/tcg-target.inc.c
+++ b/tcg/ppc/tcg-target.inc.c
@@ -39,29 +39,8 @@
 # define TCG_REG_TMP1   TCG_REG_R12
 #endif
 
-/* For the 64-bit target, we don't like the 5 insn sequence needed to build
-   full 64-bit addresses.  Better to have a base register to which we can
-   apply a 32-bit displacement.
-
-   There are generally three items of interest:
-   (1) helper functions in the main executable,
-   (2) TranslationBlock data structures,
-   (3) the return address in the epilogue.
-
-   For user-only, we USE_STATIC_CODE_GEN_BUFFER, so the code_gen_buffer
-   will be inside the main executable, and thus near enough to make a
-   pointer to the epilogue be within 2GB of all helper functions.
-
-   For softmmu, we'll let the kernel choose the address of code_gen_buffer,
-   and odds are it'll be somewhere close to the main malloc arena, and so
-   a pointer to the epilogue will be within 2GB of the TranslationBlocks.
-
-   For --enable-pie, everything will be kinda near everything else,
-   somewhere in high memory.
-
-   Thus we choose to keep the return address in a call-saved register.  */
-#define TCG_REG_RA     TCG_REG_R31
-#define USE_REG_RA     (TCG_TARGET_REG_BITS == 64)
+#define TCG_REG_TB     TCG_REG_R31
+#define USE_REG_TB     (TCG_TARGET_REG_BITS == 64)
 
 /* Shorthand for size of a pointer.  Avoid promotion to unsigned.  */
 #define SZP  ((int)sizeof(void *))
@@ -614,50 +593,68 @@ static inline void tcg_out_shri64(TCGContext *s, TCGReg dst, TCGReg src, int c)
     tcg_out_rld(s, RLDICL, dst, src, 64 - c, c);
 }
 
-static void tcg_out_movi32(TCGContext *s, TCGReg ret, int32_t arg)
+static void tcg_out_movi_int(TCGContext *s, TCGType type, TCGReg ret,
+                             tcg_target_long arg, bool in_prologue)
 {
-    if (arg == (int16_t) arg) {
+    intptr_t tb_diff;
+    int32_t high;
+
+    tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || type == TCG_TYPE_I32);
+
+    if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I32) {
+        arg = (int32_t)arg;
+    }
+
+    /* Load 16-bit immediates with one insn.  */
+    if (arg == (int16_t)arg) {
         tcg_out32(s, ADDI | TAI(ret, 0, arg));
-    } else {
+        return;
+    }
+
+    /* Load addresses within the TB with one insn.  */
+    tb_diff = arg - (intptr_t)s->code_gen_ptr;
+    if (!in_prologue && USE_REG_TB && tb_diff == (int16_t)tb_diff) {
+        tcg_out32(s, ADDI | TAI(ret, TCG_REG_TB, tb_diff));
+        return;
+    }
+
+    /* Load 32-bit immediates with two insns.  */
+    if (TCG_TARGET_REG_BITS == 32 || arg == (int32_t)arg) {
         tcg_out32(s, ADDIS | TAI(ret, 0, arg >> 16));
         if (arg & 0xffff) {
             tcg_out32(s, ORI | SAI(ret, ret, arg));
         }
+        return;
     }
-}
-
-static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg ret,
-                         tcg_target_long arg)
-{
-    tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || type == TCG_TYPE_I32);
-    if (type == TCG_TYPE_I32 || arg == (int32_t)arg) {
-        tcg_out_movi32(s, ret, arg);
-    } else if (arg == (uint32_t)arg && !(arg & 0x8000)) {
+    if (arg == (uint32_t)arg && !(arg & 0x8000)) {
         tcg_out32(s, ADDI | TAI(ret, 0, arg));
         tcg_out32(s, ORIS | SAI(ret, ret, arg >> 16));
-    } else {
-        int32_t high;
+        return;
+    }
 
-        if (USE_REG_RA) {
-            intptr_t diff = arg - (intptr_t)tb_ret_addr;
-            if (diff == (int32_t)diff) {
-                tcg_out_mem_long(s, ADDI, ADD, ret, TCG_REG_RA, diff);
-                return;
-            }
-        }
+    /* Load addresses within 2GB of TB with 2 (or rarely 3) insns.  */
+    if (!in_prologue && USE_REG_TB && tb_diff == (int32_t)tb_diff) {
+        tcg_out_mem_long(s, ADDI, ADD, ret, TCG_REG_TB, tb_diff);
+        return;
+    }
 
-        high = arg >> 31 >> 1;
-        tcg_out_movi32(s, ret, high);
-        if (high) {
-            tcg_out_shli64(s, ret, ret, 32);
-        }
-        if (arg & 0xffff0000) {
-            tcg_out32(s, ORIS | SAI(ret, ret, arg >> 16));
-        }
-        if (arg & 0xffff) {
-            tcg_out32(s, ORI | SAI(ret, ret, arg));
-        }
+    high = arg >> 31 >> 1;
+    tcg_out_movi(s, TCG_TYPE_I32, ret, high);
+    if (high) {
+        tcg_out_shli64(s, ret, ret, 32);
     }
+    if (arg & 0xffff0000) {
+        tcg_out32(s, ORIS | SAI(ret, ret, arg >> 16));
+    }
+    if (arg & 0xffff) {
+        tcg_out32(s, ORI | SAI(ret, ret, arg));
+    }
+}
+
+static inline void tcg_out_movi(TCGContext *s, TCGType type, TCGReg ret,
+                                tcg_target_long arg)
+{
+    tcg_out_movi_int(s, type, ret, arg, false);
 }
 
 static bool mask_operand(uint32_t c, int *mb, int *me)
@@ -1293,49 +1290,43 @@ static void tcg_out_mb(TCGContext *s, TCGArg a0)
     tcg_out32(s, insn);
 }
 
-#ifdef __powerpc64__
 void tb_target_set_jmp_target(uintptr_t tc_ptr, uintptr_t jmp_addr,
                               uintptr_t addr)
 {
-    tcg_insn_unit i1, i2;
-    uint64_t pair;
-    intptr_t diff = addr - jmp_addr;
-
-    if (in_range_b(diff)) {
-        i1 = B | (diff & 0x3fffffc);
-        i2 = NOP;
-    } else if (USE_REG_RA) {
-        intptr_t lo, hi;
-        diff = addr - (uintptr_t)tb_ret_addr;
-        lo = (int16_t)diff;
-        hi = (int32_t)(diff - lo);
-        tcg_debug_assert(diff == hi + lo);
-        i1 = ADDIS | TAI(TCG_REG_TMP1, TCG_REG_RA, hi >> 16);
-        i2 = ADDI | TAI(TCG_REG_TMP1, TCG_REG_TMP1, lo);
-    } else {
-        tcg_debug_assert(TCG_TARGET_REG_BITS == 32 || addr == (int32_t)addr);
-        i1 = ADDIS | TAI(TCG_REG_TMP1, 0, addr >> 16);
-        i2 = ORI | SAI(TCG_REG_TMP1, TCG_REG_TMP1, addr);
-    }
+    if (TCG_TARGET_REG_BITS == 64) {
+        tcg_insn_unit i1, i2;
+        intptr_t tb_diff = addr - tc_ptr;
+        intptr_t br_diff = addr - (jmp_addr + 4);
+        uint64_t pair;
+
+        /* This does not exercise the range of the branch, but we do
+           still need to be able to load the new value of TCG_REG_TB.
+           But this does still happen quite often.  */
+        if (tb_diff == (int16_t)tb_diff) {
+            i1 = ADDI | TAI(TCG_REG_TB, TCG_REG_TB, tb_diff);
+            i2 = B | (br_diff & 0x3fffffc);
+        } else {
+            intptr_t lo = (int16_t)tb_diff;
+            intptr_t hi = (int32_t)(tb_diff - lo);
+            assert(tb_diff == hi + lo);
+            i1 = ADDIS | TAI(TCG_REG_TB, TCG_REG_TB, hi >> 16);
+            i2 = ADDI | TAI(TCG_REG_TB, TCG_REG_TB, lo);
+        }
 #ifdef HOST_WORDS_BIGENDIAN
-    pair = (uint64_t)i1 << 32 | i2;
+        pair = (uint64_t)i1 << 32 | i2;
 #else
-    pair = (uint64_t)i2 << 32 | i1;
+        pair = (uint64_t)i2 << 32 | i1;
 #endif
 
-    atomic_set((uint64_t *)jmp_addr, pair);
-    flush_icache_range(jmp_addr, jmp_addr + 8);
-}
-#else
-void tb_target_set_jmp_target(uintptr_t tc_ptr, uintptr_t jmp_addr,
-                              uintptr_t addr)
-{
-    intptr_t diff = addr - jmp_addr;
-    tcg_debug_assert(in_range_b(diff));
-    atomic_set((uint32_t *)jmp_addr, B | (diff & 0x3fffffc));
-    flush_icache_range(jmp_addr, jmp_addr + 4);
+        atomic_set((uint64_t *)jmp_addr, pair);
+        flush_icache_range(jmp_addr, jmp_addr + 8);
+    } else {
+        intptr_t diff = addr - jmp_addr;
+        tcg_debug_assert(in_range_b(diff));
+        atomic_set((uint32_t *)jmp_addr, B | (diff & 0x3fffffc));
+        flush_icache_range(jmp_addr, jmp_addr + 4);
+    }
 }
-#endif
 
 static void tcg_out_call(TCGContext *s, tcg_insn_unit *target)
 {
@@ -1897,44 +1888,20 @@ static void tcg_target_qemu_prologue(TCGContext *s)
 
 #ifndef CONFIG_SOFTMMU
     if (guest_base) {
-        tcg_out_movi(s, TCG_TYPE_PTR, TCG_GUEST_BASE_REG, guest_base);
+        tcg_out_movi_int(s, TCG_TYPE_PTR, TCG_GUEST_BASE_REG, guest_base, true);
         tcg_regset_set_reg(s->reserved_regs, TCG_GUEST_BASE_REG);
     }
 #endif
 
     tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
     tcg_out32(s, MTSPR | RS(tcg_target_call_iarg_regs[1]) | CTR);
-
-    if (USE_REG_RA) {
-#ifdef _CALL_AIX
-        /* Make the caller load the value as the TOC into R2.  */
-        tb_ret_addr = s->code_ptr + 2;
-        desc[1] = tb_ret_addr;
-        tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_RA, TCG_REG_R2);
-        tcg_out32(s, BCCTR | BO_ALWAYS);
-#elif defined(_CALL_ELF) && _CALL_ELF == 2
-        /* Compute from the incoming R12 value.  */
-        tb_ret_addr = s->code_ptr + 2;
-        tcg_out32(s, ADDI | TAI(TCG_REG_RA, TCG_REG_R12,
-                                tcg_ptr_byte_diff(tb_ret_addr, s->code_buf)));
-        tcg_out32(s, BCCTR | BO_ALWAYS);
-#else
-        /* Reserve max 5 insns for the constant load.  */
-        tb_ret_addr = s->code_ptr + 6;
-        tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_RA, (intptr_t)tb_ret_addr);
-        tcg_out32(s, BCCTR | BO_ALWAYS);
-        while (s->code_ptr < tb_ret_addr) {
-            tcg_out32(s, NOP);
-        }
-#endif
-    } else {
-        tcg_out32(s, BCCTR | BO_ALWAYS);
-        tb_ret_addr = s->code_ptr;
+    if (USE_REG_TB) {
+        tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_TB, tcg_target_call_iarg_regs[1]);
     }
+    tcg_out32(s, BCCTR | BO_ALWAYS);
 
     /* Epilogue */
-    tcg_debug_assert(tb_ret_addr == s->code_ptr);
-    s->code_gen_epilogue = tb_ret_addr;
+    s->code_gen_epilogue = tb_ret_addr = s->code_ptr;
 
     tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_R0, TCG_REG_R1, FRAME_SIZE+LR_OFFSET);
     for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); ++i) {
@@ -1954,44 +1921,48 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
 
     switch (opc) {
     case INDEX_op_exit_tb:
-        if (USE_REG_RA) {
-            ptrdiff_t disp = tcg_pcrel_diff(s, tb_ret_addr);
-
-            /* Use a direct branch if we can, otherwise use the value in RA.
-               Note that the direct branch is always backward, thus we need
-               to account for the possibility of 5 insns from the movi.  */
-            if (!in_range_b(disp - 20)) {
-                tcg_out32(s, MTSPR | RS(TCG_REG_RA) | CTR);
-                tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R3, args[0]);
-                tcg_out32(s, BCCTR | BO_ALWAYS);
-                break;
-            }
-        }
         tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R3, args[0]);
         tcg_out_b(s, 0, tb_ret_addr);
         break;
     case INDEX_op_goto_tb:
-        tcg_debug_assert(s->tb_jmp_insn_offset);
-        /* Direct jump. */
-#ifdef __powerpc64__
-        /* Ensure the next insns are 8-byte aligned. */
-        if ((uintptr_t)s->code_ptr & 7) {
-            tcg_out32(s, NOP);
-        }
-        s->tb_jmp_insn_offset[args[0]] = tcg_current_code_size(s);
-        /* To be replaced by either a branch+nop or a load into TMP1.  */
-        s->code_ptr += 2;
-        tcg_out32(s, MTSPR | RS(TCG_REG_TMP1) | CTR);
+        if (s->tb_jmp_insn_offset) {
+            /* Direct jump. */
+            if (TCG_TARGET_REG_BITS == 64) {
+                /* Ensure the next insns are 8-byte aligned. */
+                if ((uintptr_t)s->code_ptr & 7) {
+                    tcg_out32(s, NOP);
+                }
+                s->tb_jmp_insn_offset[args[0]] = tcg_current_code_size(s);
+                tcg_out32(s, ADDIS | TAI(TCG_REG_TB, TCG_REG_TB, 0));
+                tcg_out32(s, ADDI | TAI(TCG_REG_TB, TCG_REG_TB, 0));
+            } else {
+                s->tb_jmp_insn_offset[args[0]] = tcg_current_code_size(s);
+                tcg_out32(s, B);
+                s->tb_jmp_reset_offset[args[0]] = tcg_current_code_size(s);
+                break;
+            }
+        } else {
+            /* Indirect jump. */
+            tcg_debug_assert(s->tb_jmp_insn_offset == NULL);
+            tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TB, 0,
+                       (intptr_t)(s->tb_jmp_insn_offset + args[0]));
+        }
+        tcg_out32(s, MTSPR | RS(TCG_REG_TB) | CTR);
         tcg_out32(s, BCCTR | BO_ALWAYS);
-#else
-        /* To be replaced by a branch.  */
-        s->code_ptr++;
-#endif
-        s->tb_jmp_reset_offset[args[0]] = tcg_current_code_size(s);
+        s->tb_jmp_reset_offset[args[0]] = c = tcg_current_code_size(s);
+        if (USE_REG_TB) {
+            /* For the unlinked case, need to reset TCG_REG_TB.  */
+            c = -c;
+            assert(c == (int16_t)c);
+            tcg_out32(s, ADDI | TAI(TCG_REG_TB, TCG_REG_TB, c));
+        }
         break;
     case INDEX_op_goto_ptr:
         tcg_out32(s, MTSPR | RS(args[0]) | CTR);
-        tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R3, 0);
+        if (USE_REG_TB) {
+            tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_TB, args[0]);
+        }
+        tcg_out32(s, ADDI | TAI(TCG_REG_R3, 0, 0));
         tcg_out32(s, BCCTR | BO_ALWAYS);
         break;
     case INDEX_op_br:
@@ -2761,8 +2732,8 @@ static void tcg_target_init(TCGContext *s)
     tcg_regset_set_reg(s->reserved_regs, TCG_REG_R13); /* thread pointer */
 #endif
     tcg_regset_set_reg(s->reserved_regs, TCG_REG_TMP1); /* mem temp */
-    if (USE_REG_RA) {
-        tcg_regset_set_reg(s->reserved_regs, TCG_REG_RA);  /* return addr */
+    if (USE_REG_TB) {
+        tcg_regset_set_reg(s->reserved_regs, TCG_REG_TB);  /* tb->tc_ptr */
     }
 }
 
-- 
2.13.3

  parent reply	other threads:[~2017-08-04  5:44 UTC|newest]

Thread overview: 27+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-08-04  5:44 [Qemu-devel] [PATCH for-2.11 00/23] tcg constant pools Richard Henderson
2017-08-04  5:44 ` [Qemu-devel] [PATCH for-2.11 01/23] tcg: Move USE_DIRECT_JUMP discriminator to tcg/cpu/tcg-target.h Richard Henderson
2017-08-04  5:44 ` [Qemu-devel] [PATCH for-2.11 02/23] tcg: Rearrange ldst label tracking Richard Henderson
2017-08-04 10:33   ` Paolo Bonzini
2017-08-04  5:44 ` [Qemu-devel] [PATCH for-2.11 03/23] tcg: Infrastructure for managing constant pools Richard Henderson
2017-08-04  5:44 ` [Qemu-devel] [PATCH for-2.11 04/23] tcg/i386: Store out-of-range call targets in constant pool Richard Henderson
2017-08-04  5:44 ` [Qemu-devel] [PATCH for-2.11 05/23] tcg/s390: Introduce TCG_REG_TB Richard Henderson
2017-08-04  5:44 ` [Qemu-devel] [PATCH for-2.11 06/23] tcg/s390: Fix sign of patch_reloc addend Richard Henderson
2017-08-04  5:44 ` [Qemu-devel] [PATCH for-2.11 07/23] tcg/s390: Use constant pool for movi Richard Henderson
2017-08-04  5:44 ` [Qemu-devel] [PATCH for-2.11 08/23] tcg/s390: Use constant pool for andi Richard Henderson
2017-08-04  5:44 ` [Qemu-devel] [PATCH for-2.11 09/23] tcg/s390: Use constant pool for ori Richard Henderson
2017-08-04  5:44 ` [Qemu-devel] [PATCH for-2.11 10/23] tcg/s390: Use constant pool for xori Richard Henderson
2017-08-04  5:44 ` [Qemu-devel] [PATCH for-2.11 11/23] tcg/s390: Use constant pool for cmpi Richard Henderson
2017-08-04  5:44 ` [Qemu-devel] [PATCH for-2.11 12/23] tcg/aarch64: Use constant pool for movi Richard Henderson
2017-08-04  5:44 ` [Qemu-devel] [PATCH for-2.11 13/23] tcg/sparc: Introduce TCG_REG_TB Richard Henderson
2017-08-04  5:44 ` [Qemu-devel] [PATCH for-2.11 14/23] tcg/sparc: Use constant pool for movi Richard Henderson
2017-08-04  5:44 ` [Qemu-devel] [PATCH for-2.11 15/23] tcg/arm: Improve tlb load for armv7 Richard Henderson
2017-08-04  5:44 ` [Qemu-devel] [PATCH for-2.11 16/23] tcg/arm: Tighten tlb indexing offset test Richard Henderson
2017-08-04  5:44 ` [Qemu-devel] [PATCH for-2.11 17/23] tcg/arm: Code rearrangement Richard Henderson
2017-08-04  5:44 ` [Qemu-devel] [PATCH for-2.11 18/23] tcg/arm: Extract INSN_NOP Richard Henderson
2017-08-04  5:44 ` [Qemu-devel] [PATCH for-2.11 19/23] tcg/arm: Use constant pool for movi Richard Henderson
2017-08-04  5:44 ` [Qemu-devel] [PATCH for-2.11 20/23] tcg/arm: Use constant pool for call Richard Henderson
2017-08-04  5:44 ` Richard Henderson [this message]
2017-08-04  5:44 ` [Qemu-devel] [PATCH for-2.11 22/23] tcg/ppc: Look for shifted constants Richard Henderson
2017-08-04 16:39   ` Philippe Mathieu-Daudé
2017-08-04 16:58     ` Richard Henderson
2017-08-04  5:44 ` [Qemu-devel] [PATCH for-2.11 23/23] tcg/ppc: Use constant pool for movi Richard Henderson

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20170804054426.10590-22-rth@twiddle.net \
    --to=rth@twiddle.net \
    --cc=qemu-devel@nongnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).