From: Aurelien Jarno <aurelien@aurel32.net>
To: Richard Henderson <rth@twiddle.net>
Cc: peter.maydell@linaro.org, alex.bennee@linaro.org, qemu-devel@nongnu.org
Subject: Re: [Qemu-devel] [PATCH v3 20/25] tcg: Save insn data and use it in cpu_restore_state_from_tb
Date: Fri, 25 Sep 2015 23:10:36 +0200 [thread overview]
Message-ID: <20150925211036.GA20144@aurel32.net> (raw)
In-Reply-To: <1442953507-4074-21-git-send-email-rth@twiddle.net>
On 2015-09-22 13:25, Richard Henderson wrote:
> We can now restore state without retranslation.
>
> Signed-off-by: Richard Henderson <rth@twiddle.net>
> ---
> include/exec/exec-all.h | 1 +
> tcg/tcg.c | 40 ++++++++-----
> tcg/tcg.h | 4 +-
> translate-all.c | 149 +++++++++++++++++++++++++++++++++++-------------
> 4 files changed, 139 insertions(+), 55 deletions(-)
>
> diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
> index 6a69802..402dd87 100644
> --- a/include/exec/exec-all.h
> +++ b/include/exec/exec-all.h
> @@ -199,6 +199,7 @@ struct TranslationBlock {
> #define CF_USE_ICOUNT 0x20000
>
> void *tc_ptr; /* pointer to the translated code */
> + uint8_t *tc_search; /* pointer to search data */
> /* next matching tb for physical address. */
> struct TranslationBlock *phys_hash_next;
> /* original tb when cflags has CF_NOCACHE */
> diff --git a/tcg/tcg.c b/tcg/tcg.c
> index bdb83d9..a0fce5b 100644
> --- a/tcg/tcg.c
> +++ b/tcg/tcg.c
> @@ -2294,7 +2294,7 @@ static inline int tcg_gen_code_common(TCGContext *s,
> tcg_insn_unit *gen_code_buf,
> long search_pc)
> {
> - int i, oi, oi_next;
> + int i, oi, oi_next, num_insns;
>
> #ifdef DEBUG_DISAS
> if (unlikely(qemu_loglevel_mask(CPU_LOG_TB_OP))) {
> @@ -2338,6 +2338,7 @@ static inline int tcg_gen_code_common(TCGContext *s,
>
> tcg_out_tb_init(s);
>
> + num_insns = -1;
> for (oi = s->gen_first_op_idx; oi >= 0; oi = oi_next) {
> TCGOp * const op = &s->gen_op_buf[oi];
> TCGArg * const args = &s->gen_opparam_buf[op->args];
> @@ -2361,6 +2362,10 @@ static inline int tcg_gen_code_common(TCGContext *s,
> tcg_reg_alloc_movi(s, args, dead_args, sync_args);
> break;
> case INDEX_op_insn_start:
> + if (num_insns >= 0) {
> + s->gen_insn_end_off[num_insns] = tcg_current_code_size(s);
> + }
> + num_insns++;
> for (i = 0; i < TARGET_INSN_START_WORDS; ++i) {
> target_ulong a;
> #if TARGET_LONG_BITS > TCG_TARGET_REG_BITS
> @@ -2368,7 +2373,7 @@ static inline int tcg_gen_code_common(TCGContext *s,
> #else
> a = args[i];
> #endif
> - s->gen_opc_data[i] = a;
> + s->gen_insn_data[num_insns][i] = a;
> }
> break;
> case INDEX_op_discard:
> @@ -2400,6 +2405,8 @@ static inline int tcg_gen_code_common(TCGContext *s,
> check_regs(s);
> #endif
> }
> + tcg_debug_assert(num_insns >= 0);
> + s->gen_insn_end_off[num_insns] = tcg_current_code_size(s);
>
> /* Generate TB finalization at the end of block */
> tcg_out_tb_finalize(s);
> @@ -2448,24 +2455,26 @@ int tcg_gen_code_search_pc(TCGContext *s, tcg_insn_unit *gen_code_buf,
> void tcg_dump_info(FILE *f, fprintf_function cpu_fprintf)
> {
> TCGContext *s = &tcg_ctx;
> - int64_t tot;
> + int64_t tb_count = s->tb_count;
> + int64_t tb_div_count = tb_count ? tb_count : 1;
> + int64_t tot = s->interm_time + s->code_time;
>
> - tot = s->interm_time + s->code_time;
> cpu_fprintf(f, "JIT cycles %" PRId64 " (%0.3f s at 2.4 GHz)\n",
> tot, tot / 2.4e9);
> cpu_fprintf(f, "translated TBs %" PRId64 " (aborted=%" PRId64 " %0.1f%%)\n",
> - s->tb_count,
> - s->tb_count1 - s->tb_count,
> - s->tb_count1 ? (double)(s->tb_count1 - s->tb_count) / s->tb_count1 * 100.0 : 0);
> + tb_count, s->tb_count1 - tb_count,
> + (double)(s->tb_count1 - s->tb_count)
> + / (s->tb_count1 ? s->tb_count1 : 1) * 100.0);
> cpu_fprintf(f, "avg ops/TB %0.1f max=%d\n",
> - s->tb_count ? (double)s->op_count / s->tb_count : 0, s->op_count_max);
> + (double)s->op_count / tb_div_count, s->op_count_max);
> cpu_fprintf(f, "deleted ops/TB %0.2f\n",
> - s->tb_count ?
> - (double)s->del_op_count / s->tb_count : 0);
> + (double)s->del_op_count / tb_div_count);
> cpu_fprintf(f, "avg temps/TB %0.2f max=%d\n",
> - s->tb_count ?
> - (double)s->temp_count / s->tb_count : 0,
> - s->temp_count_max);
> + (double)s->temp_count / tb_div_count, s->temp_count_max);
> + cpu_fprintf(f, "avg host code/TB %0.1f\n",
> + (double)s->code_out_len / tb_div_count);
> + cpu_fprintf(f, "avg search data/TB %0.1f\n",
> + (double)s->search_out_len / tb_div_count);
>
> cpu_fprintf(f, "cycles/op %0.1f\n",
> s->op_count ? (double)tot / s->op_count : 0);
> @@ -2473,8 +2482,11 @@ void tcg_dump_info(FILE *f, fprintf_function cpu_fprintf)
> s->code_in_len ? (double)tot / s->code_in_len : 0);
> cpu_fprintf(f, "cycles/out byte %0.1f\n",
> s->code_out_len ? (double)tot / s->code_out_len : 0);
> - if (tot == 0)
> + cpu_fprintf(f, "cycles/search byte %0.1f\n",
> + s->search_out_len ? (double)tot / s->search_out_len : 0);
> + if (tot == 0) {
> tot = 1;
> + }
> cpu_fprintf(f, " gen_interm time %0.1f%%\n",
> (double)s->interm_time / tot * 100.0);
> cpu_fprintf(f, " gen_code time %0.1f%%\n",
> diff --git a/tcg/tcg.h b/tcg/tcg.h
> index 8fd1252..df499c6 100644
> --- a/tcg/tcg.h
> +++ b/tcg/tcg.h
> @@ -532,6 +532,7 @@ struct TCGContext {
> int64_t del_op_count;
> int64_t code_in_len;
> int64_t code_out_len;
> + int64_t search_out_len;
> int64_t interm_time;
> int64_t code_time;
> int64_t la_time;
> @@ -581,7 +582,8 @@ struct TCGContext {
> uint16_t gen_opc_icount[OPC_BUF_SIZE];
> uint8_t gen_opc_instr_start[OPC_BUF_SIZE];
>
> - target_ulong gen_opc_data[TARGET_INSN_START_WORDS];
> + uint16_t gen_insn_end_off[TCG_MAX_INSNS];
> + target_ulong gen_insn_data[TCG_MAX_INSNS][TARGET_INSN_START_WORDS];
> };
>
> extern TCGContext tcg_ctx;
> diff --git a/translate-all.c b/translate-all.c
> index 9f801ae..f6b8148 100644
> --- a/translate-all.c
> +++ b/translate-all.c
> @@ -168,61 +168,127 @@ void cpu_gen_init(void)
> tcg_context_init(&tcg_ctx);
> }
>
> +/* Encode VAL as a signed leb128 sequence at P.
> + Return P incremented past the encoded value. */
> +static uint8_t *encode_sleb128(uint8_t *p, target_long val)
> +{
> + int more, byte;
> +
> + do {
> + byte = val & 0x7f;
> + val >>= 7;
> + more = !((val == 0 && (byte & 0x40) == 0)
> + || (val == -1 && (byte & 0x40) != 0));
> + if (more)
> + byte |= 0x80;
You are missing braces here.
> + *p++ = byte;
> + } while (more);
> +
> + return p;
> +}
> +
> +/* Decode a signed leb128 sequence at *PP; increment *PP past the
> + decoded value. Return the decoded value. */
> +static target_long decode_sleb128(uint8_t **pp)
> +{
> + uint8_t *p = *pp;
> + target_long val = 0;
> + int byte, shift = 0;
> +
> + do {
> + byte = *p++;
> + val |= (target_ulong)(byte & 0x7f) << shift;
> + shift += 7;
> + } while (byte & 0x80);
> + if (shift < TARGET_LONG_BITS && (byte & 0x40)) {
> + val |= -(target_ulong)1 << shift;
> + }
> +
> + *pp = p;
> + return val;
> +}
> +
> +/* Encode the data collected about the instructions while compiling TB.
> + Place the data at BLOCK, and return the number of bytes consumed.
> +
> + The logical table consisits of TARGET_INSN_START_WORDS target_ulong's,
> + which come from the target's insn_start data, followed by a uintptr_t
> + which comes from the host pc of the end of the code implementing the insn.
> +
> + Each line of the table is encoded as sleb128 deltas from the previous
> + line. The seed for the first line is { tb->pc, 0..., tb->tc_ptr }.
> + That is, the first column is seeded with the guest pc, the last column
> + with the host pc, and the middle columns with zeros. */
> +
> +static int encode_search(TranslationBlock *tb, uint8_t *block)
> +{
> + uint8_t *p = block;
> + int i, j, n;
> +
> + tb->tc_search = block;
> +
> + for (i = 0, n = tb->icount; i < n; ++i) {
> + target_ulong prev;
> +
> + for (j = 0; j < TARGET_INSN_START_WORDS; ++j) {
> + if (i == 0) {
> + prev = (j == 0 ? tb->pc : 0);
> + } else {
> + prev = tcg_ctx.gen_insn_data[i - 1][j];
> + }
> + p = encode_sleb128(p, tcg_ctx.gen_insn_data[i][j] - prev);
> + }
> + prev = (i == 0 ? 0 : tcg_ctx.gen_insn_end_off[i - 1]);
> + p = encode_sleb128(p, tcg_ctx.gen_insn_end_off[i] - prev);
> + }
> +
> + return p - block;
> +}
> +
Given we save both the host and the guest PC in this structure, one
obvious optimization would be to skip saving data for host instructions
which cannot generate an exception. That means that all the TCG ops in
this instruction do not generate exceptions either. We can easily test
that for all TCG instructions except call by looking at the
TCG_OPF_SIDE_EFFECTS flag. For the call op, we have to look at the
TCG_CALL_NO_SIDE_EFFECTS flag, even if it doesn't necessarily mean the
helper might generate an exception.
That should save significant space on load/store architectures. That
said, we can probably do that at a later time.
> /* The cpu state corresponding to 'searched_pc' is restored. */
> static int cpu_restore_state_from_tb(CPUState *cpu, TranslationBlock *tb,
> uintptr_t searched_pc)
> {
> + target_ulong data[TARGET_INSN_START_WORDS] = { tb->pc };
> + uintptr_t host_pc = (uintptr_t)tb->tc_ptr;
> CPUArchState *env = cpu->env_ptr;
> - TCGContext *s = &tcg_ctx;
> - int j;
> - uintptr_t tc_ptr;
> + uint8_t *p = tb->tc_search;
> + int i, j, num_insns = tb->icount;
> #ifdef CONFIG_PROFILER
> - int64_t ti;
> + int64_t ti = profile_getclock();
> #endif
>
> -#ifdef CONFIG_PROFILER
> - ti = profile_getclock();
> -#endif
> - tcg_func_start(s);
> + if (searched_pc < host_pc) {
> + return -1;
> + }
>
> - gen_intermediate_code_pc(env, tb);
> + /* Reconstruct the stored insn data while looking for the point at
> + which the end of the insn exceeds the searched_pc. */
> + for (i = 0; i < num_insns; ++i) {
> + for (j = 0; j < TARGET_INSN_START_WORDS; ++j) {
> + data[j] += decode_sleb128(&p);
> + }
> + host_pc += decode_sleb128(&p);
> + if (host_pc > searched_pc) {
> + goto found;
> + }
> + }
> + return -1;
>
> + found:
> if (tb->cflags & CF_USE_ICOUNT) {
> assert(use_icount);
> /* Reset the cycle counter to the start of the block. */
> - cpu->icount_decr.u16.low += tb->icount;
> + cpu->icount_decr.u16.low += num_insns;
> /* Clear the IO flag. */
> cpu->can_do_io = 0;
> }
> -
> - /* find opc index corresponding to search_pc */
> - tc_ptr = (uintptr_t)tb->tc_ptr;
> - if (searched_pc < tc_ptr)
> - return -1;
> -
> - s->tb_next_offset = tb->tb_next_offset;
> -#ifdef USE_DIRECT_JUMP
> - s->tb_jmp_offset = tb->tb_jmp_offset;
> - s->tb_next = NULL;
> -#else
> - s->tb_jmp_offset = NULL;
> - s->tb_next = tb->tb_next;
> -#endif
> - j = tcg_gen_code_search_pc(s, (tcg_insn_unit *)tc_ptr,
> - searched_pc - tc_ptr);
> - if (j < 0)
> - return -1;
> - /* now find start of instruction before */
> - while (s->gen_opc_instr_start[j] == 0) {
> - j--;
> - }
> - cpu->icount_decr.u16.low -= s->gen_opc_icount[j];
> -
> - restore_state_to_opc(env, tb, s->gen_opc_data);
> + cpu->icount_decr.u16.low -= i;
> + restore_state_to_opc(env, tb, data);
>
> #ifdef CONFIG_PROFILER
> - s->restore_time += profile_getclock() - ti;
> - s->restore_count++;
> + tcg_ctx.restore_time += profile_getclock() - ti;
> + tcg_ctx.restore_count++;
> #endif
> return 0;
> }
> @@ -969,7 +1035,7 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
> tb_page_addr_t phys_pc, phys_page2;
> target_ulong virt_page2;
> tcg_insn_unit *gen_code_buf;
> - int gen_code_size;
> + int gen_code_size, search_size;
> #ifdef CONFIG_PROFILER
> int64_t ti;
> #endif
> @@ -1025,11 +1091,13 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
> #endif
>
> gen_code_size = tcg_gen_code(&tcg_ctx, gen_code_buf);
> + search_size = encode_search(tb, (void *)gen_code_buf + gen_code_size);
>
> #ifdef CONFIG_PROFILER
> tcg_ctx.code_time += profile_getclock();
> tcg_ctx.code_in_len += tb->size;
> tcg_ctx.code_out_len += gen_code_size;
> + tcg_ctx.search_out_len += search_size;
> #endif
>
> #ifdef DEBUG_DISAS
> @@ -1041,8 +1109,9 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
> }
> #endif
>
> - tcg_ctx.code_gen_ptr = (void *)(((uintptr_t)gen_code_buf +
> - gen_code_size + CODE_GEN_ALIGN - 1) & ~(CODE_GEN_ALIGN - 1));
> + tcg_ctx.code_gen_ptr = (void *)
> + ROUND_UP((uintptr_t)gen_code_buf + gen_code_size + search_size,
> + CODE_GEN_ALIGN);
>
> /* check next page if needed */
> virt_page2 = (pc + tb->size - 1) & TARGET_PAGE_MASK;
If you fix the coding style issue I mentioned above, you get:
Reviewed-by: Aurelien Jarno <aurelien@aurel32.net>
--
Aurelien Jarno GPG: 4096R/1DDD8C9B
aurelien@aurel32.net http://www.aurel32.net
next prev parent reply other threads:[~2015-09-25 21:10 UTC|newest]
Thread overview: 53+ messages / expand[flat|nested] mbox.gz Atom feed top
2015-09-22 20:24 [Qemu-devel] [PATCH v3 00/25] Do away with TB retranslation Richard Henderson
2015-09-22 20:24 ` [Qemu-devel] [PATCH v3 01/25] tcg: Rename debug_insn_start to insn_start Richard Henderson
2015-09-22 20:24 ` [Qemu-devel] [PATCH v3 02/25] target-*: Unconditionally emit tcg_gen_insn_start Richard Henderson
2015-09-22 20:24 ` [Qemu-devel] [PATCH v3 03/25] target-*: Increment num_insns immediately after tcg_gen_insn_start Richard Henderson
2015-09-22 20:24 ` [Qemu-devel] [PATCH v3 04/25] target-*: Introduce and use cpu_breakpoint_test Richard Henderson
2015-09-23 19:19 ` Peter Maydell
2015-09-22 20:24 ` [Qemu-devel] [PATCH v3 05/25] tcg: Allow extra data to be attached to insn_start Richard Henderson
2015-09-23 14:55 ` Kevin O'Connor
2015-09-23 16:37 ` Richard Henderson
2015-09-23 16:38 ` Richard Henderson
2015-09-22 20:24 ` [Qemu-devel] [PATCH v3 06/25] target-arm: Add condexec state " Richard Henderson
2015-09-22 20:24 ` [Qemu-devel] [PATCH v3 07/25] target-i386: Add cc_op " Richard Henderson
2015-09-22 20:24 ` [Qemu-devel] [PATCH v3 08/25] target-mips: Add delayed branch " Richard Henderson
2015-09-22 20:24 ` [Qemu-devel] [PATCH v3 09/25] target-s390x: Add cc_op " Richard Henderson
2015-09-22 20:24 ` [Qemu-devel] [PATCH v3 10/25] target-sh4: Add flags " Richard Henderson
2015-09-22 20:24 ` [Qemu-devel] [PATCH v3 11/25] target-cris: Mirror gen_opc_pc into insn_start Richard Henderson
2015-09-22 20:24 ` [Qemu-devel] [PATCH v3 12/25] target-sparc: Tidy gen_branch_a interface Richard Henderson
2015-09-22 21:23 ` Aurelien Jarno
2015-09-24 19:42 ` Aurelien Jarno
2015-09-22 20:24 ` [Qemu-devel] [PATCH v3 13/25] target-sparc: Split out gen_branch_n Richard Henderson
2015-09-24 19:42 ` Aurelien Jarno
2015-09-22 20:24 ` [Qemu-devel] [PATCH v3 14/25] target-sparc: Remove gen_opc_jump_pc Richard Henderson
2015-09-24 19:42 ` Aurelien Jarno
2015-09-22 20:24 ` [Qemu-devel] [PATCH v3 15/25] target-sparc: Add npc state to insn_start Richard Henderson
2015-09-24 19:42 ` Aurelien Jarno
2015-09-22 20:24 ` [Qemu-devel] [PATCH v3 16/25] tcg: Merge cpu_gen_code into tb_gen_code Richard Henderson
2015-09-24 19:48 ` Aurelien Jarno
2015-09-22 20:24 ` [Qemu-devel] [PATCH v3 17/25] target-*: Drop cpu_gen_code define Richard Henderson
2015-09-24 19:49 ` Aurelien Jarno
2015-09-22 20:25 ` [Qemu-devel] [PATCH v3 18/25] tcg: Add TCG_MAX_INSNS Richard Henderson
2015-09-24 20:02 ` Aurelien Jarno
2015-09-24 20:43 ` Richard Henderson
2015-09-22 20:25 ` [Qemu-devel] [PATCH v3 19/25] tcg: Pass data argument to restore_state_to_opc Richard Henderson
2015-09-24 20:11 ` Aurelien Jarno
2015-09-22 20:25 ` [Qemu-devel] [PATCH v3 20/25] tcg: Save insn data and use it in cpu_restore_state_from_tb Richard Henderson
2015-09-23 19:20 ` Peter Maydell
2015-09-25 21:10 ` Aurelien Jarno [this message]
2015-09-25 23:05 ` Richard Henderson
2015-09-22 20:25 ` [Qemu-devel] [PATCH v3 21/25] tcg: Remove gen_intermediate_code_pc Richard Henderson
2015-09-25 21:11 ` Aurelien Jarno
2015-09-22 20:25 ` [Qemu-devel] [PATCH v3 22/25] tcg: Remove tcg_gen_code_search_pc Richard Henderson
2015-09-25 21:11 ` Aurelien Jarno
2015-09-22 20:25 ` [Qemu-devel] [PATCH v3 23/25] tcg: Emit prologue to the beginning of code_gen_buffer Richard Henderson
2015-09-23 19:28 ` Peter Maydell
2015-09-23 19:39 ` Richard Henderson
2015-09-22 20:25 ` [Qemu-devel] [PATCH v3 24/25] tcg: Allocate a guard page after code_gen_buffer Richard Henderson
2015-09-23 19:39 ` Peter Maydell
2015-09-23 20:00 ` Richard Henderson
2015-09-23 20:37 ` Peter Maydell
2015-09-23 22:12 ` Richard Henderson
2015-09-22 20:25 ` [Qemu-devel] [PATCH v3 25/25] tcg: Check for overflow via highwater mark Richard Henderson
2015-09-23 19:42 ` Peter Maydell
2015-09-23 20:01 ` Richard Henderson
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20150925211036.GA20144@aurel32.net \
--to=aurelien@aurel32.net \
--cc=alex.bennee@linaro.org \
--cc=peter.maydell@linaro.org \
--cc=qemu-devel@nongnu.org \
--cc=rth@twiddle.net \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.