qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed
* [Qemu-devel] [PATCH 0/2] tcg-ppc: use new return-argument ld/st helpers
@ 2013-09-05  8:22 Paolo Bonzini
  2013-09-05  8:22 ` [Qemu-devel] [PATCH 1/2] tcg-ppc: fix qemu_ld/qemu_st for AIX ABI Paolo Bonzini
                   ` (2 more replies)
  0 siblings, 3 replies; 6+ messages in thread
From: Paolo Bonzini @ 2013-09-05  8:22 UTC (permalink / raw)
  To: qemu-devel; +Cc: qemu-ppc, aurelien, rth

Last month I revived my old PowerBook, and here are the resulting patches
to use the new return-argument ld/st helpers.  I have a few more tcg-ppc
patches but they have a much smaller performance impact so I'll wait
till I have some more free time before posting.  But the impact of the
new helpers is huge, and AIUI Richard wants to get rid of GETPC_LDST
so here are these.

Paolo Bonzini (2):
  tcg-ppc: fix qemu_ld/qemu_st for AIX ABI
  tcg-ppc: use new return-argument ld/st helpers

 include/exec/exec-all.h |  4 +---
 tcg/ppc/tcg-target.c    | 56 ++++++++++++++++++++++++-------------------------
 2 files changed, 29 insertions(+), 31 deletions(-)

-- 
1.8.3.1

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [Qemu-devel] [PATCH 1/2] tcg-ppc: fix qemu_ld/qemu_st for AIX ABI
  2013-09-05  8:22 [Qemu-devel] [PATCH 0/2] tcg-ppc: use new return-argument ld/st helpers Paolo Bonzini
@ 2013-09-05  8:22 ` Paolo Bonzini
  2013-09-05  8:22 ` [Qemu-devel] [PATCH 2/2] tcg-ppc: use new return-argument ld/st helpers Paolo Bonzini
  2013-09-05  9:46 ` [Qemu-devel] [Qemu-ppc] [PATCH 0/2] " Alexander Graf
  2 siblings, 0 replies; 6+ messages in thread
From: Paolo Bonzini @ 2013-09-05  8:22 UTC (permalink / raw)
  To: qemu-devel; +Cc: qemu-ppc, aurelien, rth

For the AIX ABI, the function pointer and small area pointer need
to be loaded in the trampoline.  The trampoline itself, by contrast,
is called with a normal BL instruction.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tcg/ppc/tcg-target.c | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/tcg/ppc/tcg-target.c b/tcg/ppc/tcg-target.c
index 2595556..204ffbe 100644
--- a/tcg/ppc/tcg-target.c
+++ b/tcg/ppc/tcg-target.c
@@ -490,7 +490,8 @@ static void tcg_out_b (TCGContext *s, int mask, tcg_target_long target)
     }
 }
 
-static void tcg_out_call (TCGContext *s, tcg_target_long arg, int const_arg)
+static void tcg_out_call (TCGContext *s, tcg_target_long arg, int const_arg,
+                          int lk)
 {
 #ifdef _CALL_AIX
     int reg;
@@ -504,14 +505,14 @@ static void tcg_out_call (TCGContext *s, tcg_target_long arg, int const_arg)
     tcg_out32 (s, LWZ | RT (0) | RA (reg));
     tcg_out32 (s, MTSPR | RA (0) | CTR);
     tcg_out32 (s, LWZ | RT (2) | RA (reg) | 4);
-    tcg_out32 (s, BCCTR | BO_ALWAYS | LK);
+    tcg_out32 (s, BCCTR | BO_ALWAYS | lk);
 #else
     if (const_arg) {
-        tcg_out_b (s, LK, arg);
+        tcg_out_b (s, lk, arg);
     }
     else {
         tcg_out32 (s, MTSPR | RS (arg) | LR);
-        tcg_out32 (s, BCLR | BO_ALWAYS | LK);
+        tcg_out32 (s, BCLR | BO_ALWAYS | lk);
     }
 #endif
 }
@@ -860,7 +861,7 @@ static void tcg_out_qemu_ld_slow_path (TCGContext *s, TCGLabelQemuLdst *label)
     tcg_out_mov (s, TCG_TYPE_I32, ir++, addr_reg);
 #endif
     tcg_out_movi (s, TCG_TYPE_I32, ir, mem_index);
-    tcg_out_call (s, (tcg_target_long) ld_trampolines[s_bits], 1);
+    tcg_out_b (s, LK, (tcg_target_long) ld_trampolines[s_bits]);
     tcg_out32 (s, (tcg_target_long) raddr);
     switch (opc) {
     case 0|4:
@@ -954,7 +955,7 @@ static void tcg_out_qemu_st_slow_path (TCGContext *s, TCGLabelQemuLdst *label)
     ir++;
 
     tcg_out_movi (s, TCG_TYPE_I32, ir, mem_index);
-    tcg_out_call (s, (tcg_target_long) st_trampolines[opc], 1);
+    tcg_out_b (s, LK, (tcg_target_long) st_trampolines[opc]);
     tcg_out32 (s, (tcg_target_long) raddr);
     tcg_out_b (s, 0, (tcg_target_long) raddr);
 }
@@ -984,7 +985,7 @@ static void emit_ldst_trampoline (TCGContext *s, const void *ptr)
     tcg_out32 (s, ADDI | RT (3) | RA (3) | 4);
     tcg_out32 (s, MTSPR | RS (3) | LR);
     tcg_out_mov (s, TCG_TYPE_I32, 3, TCG_AREG0);
-    tcg_out_b (s, 0, (tcg_target_long) ptr);
+    tcg_out_call (s, (tcg_target_long) ptr, 1, 0);
 }
 #endif
 
@@ -1493,7 +1494,7 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
         }
         break;
     case INDEX_op_call:
-        tcg_out_call (s, args[0], const_args[0]);
+        tcg_out_call (s, args[0], const_args[0], LK);
         break;
     case INDEX_op_movi_i32:
         tcg_out_movi(s, TCG_TYPE_I32, args[0], args[1]);
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [Qemu-devel] [PATCH 2/2] tcg-ppc: use new return-argument ld/st helpers
  2013-09-05  8:22 [Qemu-devel] [PATCH 0/2] tcg-ppc: use new return-argument ld/st helpers Paolo Bonzini
  2013-09-05  8:22 ` [Qemu-devel] [PATCH 1/2] tcg-ppc: fix qemu_ld/qemu_st for AIX ABI Paolo Bonzini
@ 2013-09-05  8:22 ` Paolo Bonzini
  2013-09-05 15:17   ` Richard Henderson
  2013-09-05  9:46 ` [Qemu-devel] [Qemu-ppc] [PATCH 0/2] " Alexander Graf
  2 siblings, 1 reply; 6+ messages in thread
From: Paolo Bonzini @ 2013-09-05  8:22 UTC (permalink / raw)
  To: qemu-devel; +Cc: qemu-ppc, aurelien, rth

These use a 32-bit load-of-immediate to save a mflr+addi+mtlr sequence.
Tested with a Windows 98 guest (pretty much the most recent thing I
could run on my PPC machine) and kvm-unit-tests's sieve.flat.  The
speed up for sieve.flat is as high as 10% for qemu-system-i386, 25%
(no kidding) for qemu-system-x86_64 on my PowerBook G4.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/exec/exec-all.h |  4 +---
 tcg/ppc/tcg-target.c    | 41 ++++++++++++++++++++---------------------
 2 files changed, 21 insertions(+), 24 deletions(-)

diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
index beb4149..a81e805 100644
--- a/include/exec/exec-all.h
+++ b/include/exec/exec-all.h
@@ -324,9 +324,7 @@ extern uintptr_t tci_tb_ptr;
    In some implementations, we pass the "logical" return address manually;
    in others, we must infer the logical return from the true return.  */
 #if defined(CONFIG_QEMU_LDST_OPTIMIZATION) && defined(CONFIG_SOFTMMU)
-# if defined (_ARCH_PPC) && !defined (_ARCH_PPC64)
-#  define GETRA_LDST(RA)   (*(int32_t *)((RA) - 4))
-# elif defined(__arm__)
+# if defined(__arm__)
 /* We define two insns between the return address and the branch back to
    straight-line.  Find and decode that branch insn.  */
 #  define GETRA_LDST(RA)   tcg_getra_ldst(RA)
diff --git a/tcg/ppc/tcg-target.c b/tcg/ppc/tcg-target.c
index 204ffbe..24a8621 100644
--- a/tcg/ppc/tcg-target.c
+++ b/tcg/ppc/tcg-target.c
@@ -550,22 +550,24 @@ static void add_qemu_ldst_label (TCGContext *s,
     label->label_ptr[0] = label_ptr;
 }
 
-/* helper signature: helper_ld_mmu(CPUState *env, target_ulong addr,
-   int mmu_idx) */
+/* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
+ *                                     int mmu_idx, uintptr_t ra)
+ */
 static const void * const qemu_ld_helpers[4] = {
-    helper_ldb_mmu,
-    helper_ldw_mmu,
-    helper_ldl_mmu,
-    helper_ldq_mmu,
+    helper_ret_ldub_mmu,
+    helper_ret_lduw_mmu,
+    helper_ret_ldul_mmu,
+    helper_ret_ldq_mmu,
 };
 
-/* helper signature: helper_st_mmu(CPUState *env, target_ulong addr,
-   uintxx_t val, int mmu_idx) */
+/* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr,
+ *                                     uintxx_t val, int mmu_idx, uintptr_t ra)
+ */
 static const void * const qemu_st_helpers[4] = {
-    helper_stb_mmu,
-    helper_stw_mmu,
-    helper_stl_mmu,
-    helper_stq_mmu,
+    helper_ret_stb_mmu,
+    helper_ret_stw_mmu,
+    helper_ret_stl_mmu,
+    helper_ret_stq_mmu,
 };
 
 static void *ld_trampolines[4];
@@ -860,9 +862,9 @@ static void tcg_out_qemu_ld_slow_path (TCGContext *s, TCGLabelQemuLdst *label)
     tcg_out_mov (s, TCG_TYPE_I32, ir++, label->addrhi_reg);
     tcg_out_mov (s, TCG_TYPE_I32, ir++, addr_reg);
 #endif
-    tcg_out_movi (s, TCG_TYPE_I32, ir, mem_index);
+    tcg_out_movi (s, TCG_TYPE_I32, ir++, mem_index);
+    tcg_out_movi (s, TCG_TYPE_I32, ir, (tcg_target_long) raddr);
     tcg_out_b (s, LK, (tcg_target_long) ld_trampolines[s_bits]);
-    tcg_out32 (s, (tcg_target_long) raddr);
     switch (opc) {
     case 0|4:
         tcg_out32 (s, EXTSB | RA (data_reg) | RS (3));
@@ -954,10 +956,10 @@ static void tcg_out_qemu_st_slow_path (TCGContext *s, TCGLabelQemuLdst *label)
     }
     ir++;
 
-    tcg_out_movi (s, TCG_TYPE_I32, ir, mem_index);
-    tcg_out_b (s, LK, (tcg_target_long) st_trampolines[opc]);
-    tcg_out32 (s, (tcg_target_long) raddr);
-    tcg_out_b (s, 0, (tcg_target_long) raddr);
+    tcg_out_movi (s, TCG_TYPE_I32, ir++, mem_index);
+    tcg_out_movi (s, TCG_TYPE_I32, ir, (tcg_target_long) raddr);
+    tcg_out32 (s, MTSPR | RS (ir) | LR);
+    tcg_out_b (s, 0, (tcg_target_long) st_trampolines[opc]);
 }
 
 void tcg_out_tb_finalize(TCGContext *s)
@@ -981,9 +983,6 @@ void tcg_out_tb_finalize(TCGContext *s)
 #ifdef CONFIG_SOFTMMU
 static void emit_ldst_trampoline (TCGContext *s, const void *ptr)
 {
-    tcg_out32 (s, MFSPR | RT (3) | LR);
-    tcg_out32 (s, ADDI | RT (3) | RA (3) | 4);
-    tcg_out32 (s, MTSPR | RS (3) | LR);
     tcg_out_mov (s, TCG_TYPE_I32, 3, TCG_AREG0);
     tcg_out_call (s, (tcg_target_long) ptr, 1, 0);
 }
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [Qemu-devel] [Qemu-ppc] [PATCH 0/2] tcg-ppc: use new return-argument ld/st helpers
  2013-09-05  8:22 [Qemu-devel] [PATCH 0/2] tcg-ppc: use new return-argument ld/st helpers Paolo Bonzini
  2013-09-05  8:22 ` [Qemu-devel] [PATCH 1/2] tcg-ppc: fix qemu_ld/qemu_st for AIX ABI Paolo Bonzini
  2013-09-05  8:22 ` [Qemu-devel] [PATCH 2/2] tcg-ppc: use new return-argument ld/st helpers Paolo Bonzini
@ 2013-09-05  9:46 ` Alexander Graf
  2 siblings, 0 replies; 6+ messages in thread
From: Alexander Graf @ 2013-09-05  9:46 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: qemu-ppc, qemu-devel, rth


On 05.09.2013, at 10:22, Paolo Bonzini wrote:

> Last month I revived my old PowerBook, and here are the resulting patches
> to use the new return-argument ld/st helpers.  I have a few more tcg-ppc
> patches but they have a much smaller performance impact so I'll wait
> till I have some more free time before posting.  But the impact of the
> new helpers is huge, and AIUI Richard wants to get rid of GETPC_LDST
> so here are these.

Reviewed-by: Alexander Graf <agraf@suse.de>


Alex

> 
> Paolo Bonzini (2):
>  tcg-ppc: fix qemu_ld/qemu_st for AIX ABI
>  tcg-ppc: use new return-argument ld/st helpers
> 
> include/exec/exec-all.h |  4 +---
> tcg/ppc/tcg-target.c    | 56 ++++++++++++++++++++++++-------------------------
> 2 files changed, 29 insertions(+), 31 deletions(-)
> 
> -- 
> 1.8.3.1
> 
> 

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [Qemu-devel] [PATCH 2/2] tcg-ppc: use new return-argument ld/st helpers
  2013-09-05  8:22 ` [Qemu-devel] [PATCH 2/2] tcg-ppc: use new return-argument ld/st helpers Paolo Bonzini
@ 2013-09-05 15:17   ` Richard Henderson
  2013-09-05 15:41     ` Paolo Bonzini
  0 siblings, 1 reply; 6+ messages in thread
From: Richard Henderson @ 2013-09-05 15:17 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: qemu-ppc, qemu-devel, aurelien

On 09/05/2013 01:22 AM, Paolo Bonzini wrote:
> These use a 32-bit load-of-immediate to save a mflr+addi+mtlr sequence.
> Tested with a Windows 98 guest (pretty much the most recent thing I
> could run on my PPC machine) and kvm-unit-tests's sieve.flat.  The
> speed up for sieve.flat is as high as 10% for qemu-system-i386, 25%
> (no kidding) for qemu-system-x86_64 on my PowerBook G4.

See also the series beginning at

http://lists.nongnu.org/archive/html/qemu-devel/2013-09/msg00025.html

The major difference is that I use a conditional call out of the fast
path, which lets me later just use one mflr to pass the parameter.  I
also, perhaps foolishly, got rid of the trampolines.  E.g.

0xf57a1838:  rlwinm  r3,r15,24,20,27
0xf57a183c:  rlwinm  r0,r15,0,30,19
0xf57a1840:  add     r3,r3,r27
0xf57a1844:  lwz     r4,6436(r3)
0xf57a1848:  cmpw    cr7,r0,r4
0xf57a184c:  lwz     r3,6444(r3)
0xf57a1850:  bnel-   cr7,0xf57a1910
0xf57a1854:  stwx    r16,r3,r15
...
0xf57a1910:  mr      r3,r27
0xf57a1914:  mr      r4,r15
0xf57a1918:  mr      r5,r16
0xf57a191c:  li      r6,1
0xf57a1920:  mflr    r7
0xf57a1924:  lis     r0,4120
0xf57a1928:  ori     r0,r0,45040
0xf57a192c:  mtctr   r0
0xf57a1930:  bctrl
0xf57a1934:  b       0xf57a1858


I don't see anything technically wrong with your patch.  But I'd be
interested to compare vs mine.


r~

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [Qemu-devel] [PATCH 2/2] tcg-ppc: use new return-argument ld/st helpers
  2013-09-05 15:17   ` Richard Henderson
@ 2013-09-05 15:41     ` Paolo Bonzini
  0 siblings, 0 replies; 6+ messages in thread
From: Paolo Bonzini @ 2013-09-05 15:41 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-ppc, qemu-devel, aurelien

Il 05/09/2013 17:17, Richard Henderson ha scritto:
> On 09/05/2013 01:22 AM, Paolo Bonzini wrote:
>> These use a 32-bit load-of-immediate to save a mflr+addi+mtlr sequence.
>> Tested with a Windows 98 guest (pretty much the most recent thing I
>> could run on my PPC machine) and kvm-unit-tests's sieve.flat.  The
>> speed up for sieve.flat is as high as 10% for qemu-system-i386, 25%
>> (no kidding) for qemu-system-x86_64 on my PowerBook G4.
> 
> See also the series beginning at
> 
> http://lists.nongnu.org/archive/html/qemu-devel/2013-09/msg00025.html
> 
> The major difference is that I use a conditional call out of the fast
> path, which lets me later just use one mflr to pass the parameter.  I
> also, perhaps foolishly, got rid of the trampolines.  E.g.
> 
> 0xf57a1838:  rlwinm  r3,r15,24,20,27
> 0xf57a183c:  rlwinm  r0,r15,0,30,19
> 0xf57a1840:  add     r3,r3,r27
> 0xf57a1844:  lwz     r4,6436(r3)
> 0xf57a1848:  cmpw    cr7,r0,r4
> 0xf57a184c:  lwz     r3,6444(r3)
> 0xf57a1850:  bnel-   cr7,0xf57a1910
> 0xf57a1854:  stwx    r16,r3,r15
> ...
> 0xf57a1910:  mr      r3,r27
> 0xf57a1914:  mr      r4,r15
> 0xf57a1918:  mr      r5,r16
> 0xf57a191c:  li      r6,1
> 0xf57a1920:  mflr    r7
> 0xf57a1924:  lis     r0,4120
> 0xf57a1928:  ori     r0,r0,45040
> 0xf57a192c:  mtctr   r0
> 0xf57a1930:  bctrl
> 0xf57a1934:  b       0xf57a1858
> 
> I don't see anything technically wrong with your patch.  But I'd be
> interested to compare vs mine.

Sure, I'll give it a try tomorrow or in the weekend.

The G4 in my computer must simply hate the mflr/add/mtlr sequence in the
trampoline; there's no other explanation for such a huge performance
improvement.  So even though I suspect that there won't be much
difference between our patches it's good to check what's better in case
your sequences are triggering something as bad.  The bnel/mflr is a nice
trick to save one instruction, though!

Regarding removal of the trampolines, the extra icache cost should be a
wash now that they are half the size, but I'd still prefer it to be a
separate patch.

Paolo

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2013-09-05 15:41 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-09-05  8:22 [Qemu-devel] [PATCH 0/2] tcg-ppc: use new return-argument ld/st helpers Paolo Bonzini
2013-09-05  8:22 ` [Qemu-devel] [PATCH 1/2] tcg-ppc: fix qemu_ld/qemu_st for AIX ABI Paolo Bonzini
2013-09-05  8:22 ` [Qemu-devel] [PATCH 2/2] tcg-ppc: use new return-argument ld/st helpers Paolo Bonzini
2013-09-05 15:17   ` Richard Henderson
2013-09-05 15:41     ` Paolo Bonzini
2013-09-05  9:46 ` [Qemu-devel] [Qemu-ppc] [PATCH 0/2] " Alexander Graf

This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for NNTP newsgroup(s).