qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed
From: Filip Navara <filip.navara@gmail.com>
To: Laurent Desnogues <laurent.desnogues@gmail.com>
Cc: Blue Swirl <blauwirbel@gmail.com>,
	Anthony Liguori <aliguori@us.ibm.com>,
	qemu-devel@nongnu.org, Avi Kivity <avi@redhat.com>
Subject: Re: OT: TCG SSA, speed, misc (was Re: [Qemu-devel] Re: [PATCH 08/11] QMP: Port balloon command)
Date: Mon, 29 Jun 2009 01:19:49 +0200	[thread overview]
Message-ID: <5b31733c0906281619k6a4bbf54s46de7d07b0395b2e@mail.gmail.com> (raw)
In-Reply-To: <761ea48b0906281424p5966022erbcb20143c06fd6b3@mail.gmail.com>

[-- Attachment #1: Type: text/plain, Size: 1623 bytes --]

On Sun, Jun 28, 2009 at 11:24 PM, Laurent
Desnogues<laurent.desnogues@gmail.com> wrote:
> On Sun, Jun 28, 2009 at 8:19 PM, Filip Navara<filip.navara@gmail.com> wrote:
>> Doing a profiling run on several ARM demo programs showed that most of
>> the generated code was doing load/store operations to the machine
>> registers (in CPU_env). Sample run of FreeRTOS looked like this (OP
>> counts):
>>
>> movi_i32 1603
>> ld_i32 1305
>> st_i32 1174
>> add_i32 530
>> ...
>>
>> If there could be done something that would allow the guest registers
>> to be stored in host registers, even if for a temporary amount of time
>> it would certainly help the guests that I'm dealing with.
>
> TCG does a good job for register allocation.
>
> The problem you have here is that the ARM translator
> isn't using tcg_global_mem_new_i32 for ARM registers.

Interesting, thanks for the tip. I have been trying to achieve the
same effect using tcg_global_reg_new_i32; no wonder it felt so hard.
:)

> Here's an example of number of ops I see when using
> tcg_global_mem_new_i32:
>
> exit_tb 4991
> add_i32 7945
> st_i32 8257
> movi_i32 26812
> mov_i32 38369
>
> And with the trunk:
>
> exit_tb 4957
> add_i32 8165
> st_i32 20281
> ld_i32 21926
> movi_i32 25083
>
>
> Laurent
>

Attached is a proof-of-concept ARM patch that uses
tcg_global_mem_new_i32. I haven't had much time to test it yet, but on
a synthetic benchmark it improved performance by 13 DMIPS, to a total
of 216 DMIPS, which is roughly a 6% improvement. On an x86 host the
register allocation still looks very pathetic; I will post a follow-up
soon.

Best regards,
Filip Navara

[-- Attachment #2: 0001-First-try-at-using-tcg_global_mem_new_i32.patch.txt --]
[-- Type: text/plain, Size: 3869 bytes --]

From 4feddee0e7e02e1daab764dbbf9d694277b1e00a Mon Sep 17 00:00:00 2001
From: Filip Navara <filip.navara@gmail.com>
Date: Mon, 29 Jun 2009 01:13:42 +0200
Subject: [PATCH] First try at using tcg_global_mem_new_i32.

---
 target-arm/translate.c |   40 +++++++++++++++++++++++-----------------
 1 files changed, 23 insertions(+), 17 deletions(-)

diff --git a/target-arm/translate.c b/target-arm/translate.c
index 62c9eff..9a39536 100644
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@@ -77,6 +77,7 @@ typedef struct DisasContext {
 static TCGv_ptr cpu_env;
 /* We reuse the same 64-bit temporaries for efficiency.  */
 static TCGv_i64 cpu_V0, cpu_V1, cpu_M0;
+static TCGv_i32 cpu_R[16];
 
 /* FIXME:  These should be removed.  */
 static TCGv cpu_T[2];
@@ -86,14 +87,26 @@ static TCGv_i64 cpu_F0d, cpu_F1d;
 #define ICOUNT_TEMP cpu_T[0]
 #include "gen-icount.h"
 
+static const char *regnames[] =
+    { "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "pc" };
+
 /* initialize TCG globals.  */
 void arm_translate_init(void)
 {
+    int i;
+
     cpu_env = tcg_global_reg_new_ptr(TCG_AREG0, "env");
 
     cpu_T[0] = tcg_global_reg_new_i32(TCG_AREG1, "T0");
     cpu_T[1] = tcg_global_reg_new_i32(TCG_AREG2, "T1");
 
+    for (i = 0; i < 16; i++) {
+        cpu_R[i] = tcg_global_mem_new_i32(TCG_AREG0,
+                                          offsetof(CPUState, regs[i]),
+                                          regnames[i]);
+    }
+
 #define GEN_HELPER 2
 #include "helpers.h"
 }
@@ -168,7 +181,7 @@ static void load_reg_var(DisasContext *s, TCGv var, int reg)
             addr = (long)s->pc + 4;
         tcg_gen_movi_i32(var, addr);
     } else {
-        tcg_gen_ld_i32(var, cpu_env, offsetof(CPUState, regs[reg]));
+        tcg_gen_mov_i32(var, cpu_R[reg]);
     }
 }
 
@@ -188,7 +201,7 @@ static void store_reg(DisasContext *s, int reg, TCGv var)
         tcg_gen_andi_i32(var, var, ~1);
         s->is_jmp = DISAS_JUMP;
     }
-    tcg_gen_st_i32(var, cpu_env, offsetof(CPUState, regs[reg]));
+    tcg_gen_mov_i32(cpu_R[reg], var);
     dead_tmp(var);
 }
 
@@ -790,27 +803,22 @@ static inline void gen_bx_im(DisasContext *s, uint32_t addr)
     TCGv tmp;
 
     s->is_jmp = DISAS_UPDATE;
-    tmp = new_tmp();
     if (s->thumb != (addr & 1)) {
+        tmp = new_tmp();
         tcg_gen_movi_i32(tmp, addr & 1);
         tcg_gen_st_i32(tmp, cpu_env, offsetof(CPUState, thumb));
+        dead_tmp(tmp);
     }
-    tcg_gen_movi_i32(tmp, addr & ~1);
-    tcg_gen_st_i32(tmp, cpu_env, offsetof(CPUState, regs[15]));
-    dead_tmp(tmp);
+    tcg_gen_mov_i32(cpu_R[15], addr & ~1);
 }
 
 /* Set PC and Thumb state from var.  var is marked as dead.  */
 static inline void gen_bx(DisasContext *s, TCGv var)
 {
-    TCGv tmp;
-
     s->is_jmp = DISAS_UPDATE;
-    tmp = new_tmp();
-    tcg_gen_andi_i32(tmp, var, 1);
-    store_cpu_field(tmp, thumb);
-    tcg_gen_andi_i32(var, var, ~1);
-    store_cpu_field(var, regs[15]);
+    tcg_gen_andi_i32(cpu_R[15], var, ~1);
+    tcg_gen_andi_i32(var, var, 1);
+    store_cpu_field(var, thumb);
 }
 
 /* Variant of store_reg which uses branch&exchange logic when storing
@@ -889,9 +897,7 @@ static inline void gen_movl_T2_reg(DisasContext *s, int reg)
 
 static inline void gen_set_pc_im(uint32_t val)
 {
-    TCGv tmp = new_tmp();
-    tcg_gen_movi_i32(tmp, val);
-    store_cpu_field(tmp, regs[15]);
+    tcg_gen_movi_i32(cpu_R[15], val);
 }
 
 static inline void gen_movl_reg_TN(DisasContext *s, int reg, int t)
@@ -903,7 +909,7 @@ static inline void gen_movl_reg_TN(DisasContext *s, int reg, int t)
     } else {
         tmp = cpu_T[t];
     }
-    tcg_gen_st_i32(tmp, cpu_env, offsetof(CPUState, regs[reg]));
+    tcg_gen_mov_i32(cpu_R[reg], tmp);
     if (reg == 15) {
         dead_tmp(tmp);
         s->is_jmp = DISAS_JUMP;
-- 
1.6.3.msysgit.0


  reply	other threads:[~2009-06-28 23:19 UTC|newest]

Thread overview: 5+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2009-06-28 18:19 OT: TCG SSA, speed, misc (was Re: [Qemu-devel] Re: [PATCH 08/11] QMP: Port balloon command) Filip Navara
2009-06-28 21:24 ` Laurent Desnogues
2009-06-28 23:19   ` Filip Navara [this message]
2009-06-28 23:35     ` Filip Navara
2009-06-29  6:39       ` Laurent Desnogues

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=5b31733c0906281619k6a4bbf54s46de7d07b0395b2e@mail.gmail.com \
    --to=filip.navara@gmail.com \
    --cc=aliguori@us.ibm.com \
    --cc=avi@redhat.com \
    --cc=blauwirbel@gmail.com \
    --cc=laurent.desnogues@gmail.com \
    --cc=qemu-devel@nongnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).