From mboxrd@z Thu Jan  1 00:00:00 1970
Received: from mailman by lists.gnu.org with tmda-scanned (Exim 4.43)
	id 1MypoB-00051R-6k
	for qemu-devel@nongnu.org; Fri, 16 Oct 2009 12:37:39 -0400
Received: from exim by lists.gnu.org with spam-scanned (Exim 4.43)
	id 1Mypo6-0004uq-Ka
	for qemu-devel@nongnu.org; Fri, 16 Oct 2009 12:37:38 -0400
Received: from [199.232.76.173] (port=35805 helo=monty-python.gnu.org)
	by lists.gnu.org with esmtp (Exim 4.43) id 1Mypo6-0004ub-5e
	for qemu-devel@nongnu.org; Fri, 16 Oct 2009 12:37:34 -0400
Received: from cantor2.suse.de ([195.135.220.15]:45318 helo=mx2.suse.de)
	by monty-python.gnu.org with esmtp (Exim 4.60)
	(envelope-from <uli@suse.de>) id 1Mypo5-0003FY-Ex
	for qemu-devel@nongnu.org; Fri, 16 Oct 2009 12:37:33 -0400
From: Ulrich Hecht <uli@suse.de>
Subject: Re: [Qemu-devel] [PATCH 1/9] TCG "sync" op
Date: Fri, 16 Oct 2009 18:37:31 +0200
References: <1255696735-21396-1-git-send-email-uli@suse.de>
	<1255696735-21396-2-git-send-email-uli@suse.de>
	<20091016155221.GF4127@hall.aurel32.net>
In-Reply-To: <20091016155221.GF4127@hall.aurel32.net>
MIME-Version: 1.0
Content-Type: text/plain;
  charset="iso-8859-15"
Content-Disposition: inline
Message-Id: <200910161837.31850.uli@suse.de>
Content-Transfer-Encoding: quoted-printable
List-Id: qemu-devel.nongnu.org
List-Unsubscribe: <http://lists.nongnu.org/mailman/listinfo/qemu-devel>,
	<mailto:qemu-devel-request@nongnu.org?subject=unsubscribe>
List-Archive: <http://lists.gnu.org/pipermail/qemu-devel>
List-Post: <mailto:qemu-devel@nongnu.org>
List-Help: <mailto:qemu-devel-request@nongnu.org?subject=help>
List-Subscribe: <http://lists.nongnu.org/mailman/listinfo/qemu-devel>,
	<mailto:qemu-devel-request@nongnu.org?subject=subscribe>
To: Aurelien Jarno <aurelien@aurel32.net>
Cc: riku.voipio@iki.fi, qemu-devel@nongnu.org, agraf@suse.de

On Friday 16 October 2009, Aurelien Jarno wrote:
> IMHO, the correct way to do it is to use the following code, assuming
> you want to use 64-bit TCG regs to hold 32-bit values (that's
> something that is not really clear in your next patch):
>
> - for register load:
> | static TCGv load_reg(int reg)
> | {
> |    TCGv r =3D tcg_temp_new_i64();
> |    tcg_gen_ext32u_i64(r, tcgregs[reg]);
> |    return r;
> | }
> |
> | static void store_reg32(int reg, TCGv v)
> | {
> |    tcg_gen_ext32u_i64(v, v); /* may be optional */
> |    tcg_gen_andi_i64(tcgregs[reg], tcgregs[reg],
> | 0xffffffff00000000ULL); tcg_gen_or_i64(tcgregs[reg], tcgregs[reg],
> | v);
> |=A0}

This is _extremely_ detrimental to performance. The point of the sync op=20
is that in most cases it's a nop because registers are usually used with=20
the same bitness again and again. With your approach, the zero=20
extension/masking is done every time a register is accessed as 32=20
bits, which is the most common case. Compare the translation of the=20
following sequence of instructions:

IN: _dl_aux_init
0x0000000080044ff6:  lhi        %r4,0
0x0000000080044ffa:  lhi        %r5,0
0x0000000080044ffe:  lhi        %r0,0

with sync:

OP:
 mov_i32 loc0,global_cc
 movi_i32 tmp1,$0x0
 sync_i64 R4
 mov_i32 r4,tmp1
 movi_i32 tmp1,$0x0
 sync_i64 R5
 mov_i32 r5,tmp1
 movi_i32 tmp1,$0x0
 sync_i64 R0
 mov_i32 r0,tmp1
 mov_i32 global_cc,loc0
 movi_i64 tmp2,$0x80045002
 st_i64 tmp2,env,$0x158
 exit_tb $0x0

OUT: [size=3D61]
0x6019a030:  mov    0x160(%r14),%ebp
0x6019a037:  mov    %rbp,%rbx
0x6019a03a:  mov    $0x80045002,%r12d
0x6019a040:  mov    %r12,0x158(%r14)
0x6019a047:  mov    %ebp,0xd1a0(%r14)
0x6019a04e:  mov    %ebx,0x160(%r14)
0x6019a055:  xor    %ebp,%ebp
0x6019a057:  mov    %ebp,(%r14)
0x6019a05a:  xor    %ebp,%ebp
0x6019a05c:  mov    %ebp,0x20(%r14)
0x6019a060:  xor    %ebp,%ebp
0x6019a062:  mov    %ebp,0x28(%r14)
0x6019a066:  xor    %eax,%eax
0x6019a068:  jmpq   0x621dc8ce


with zero extension and masking:

OP:
 mov_i32 loc0,global_cc
 movi_i32 tmp1,$0x0
 ext32u_i64 tmp1,tmp1
 movi_i64 tmp2,$0xffffffff00000000
 and_i64 R4,R4,tmp2
 or_i64 R4,R4,tmp1
 movi_i32 tmp1,$0x0
 ext32u_i64 tmp1,tmp1
 movi_i64 tmp2,$0xffffffff00000000
 and_i64 R5,R5,tmp2
 or_i64 R5,R5,tmp1
 movi_i32 tmp1,$0x0
 ext32u_i64 tmp1,tmp1
 movi_i64 tmp2,$0xffffffff00000000
 and_i64 R0,R0,tmp2
 or_i64 R0,R0,tmp1
 mov_i32 global_cc,loc0
 movi_i64 tmp2,$0x80045002
 st_i64 tmp2,env,$0x158
 exit_tb $0x0

OUT: [size=3D126]
0x6019af10:  mov    0x160(%r14),%ebp
0x6019af17:  xor    %ebx,%ebx
0x6019af19:  mov    %ebx,%ebx
0x6019af1b:  mov    0x20(%r14),%r12
0x6019af1f:  mov    $0xffffffff00000000,%r13
0x6019af29:  and    %r13,%r12
0x6019af2c:  or     %rbx,%r12
0x6019af2f:  xor    %ebx,%ebx
0x6019af31:  mov    %ebx,%ebx
0x6019af33:  mov    0x28(%r14),%r13
0x6019af37:  mov    $0xffffffff00000000,%r15
0x6019af41:  and    %r15,%r13
0x6019af44:  or     %rbx,%r13
0x6019af47:  xor    %ebx,%ebx
0x6019af49:  mov    %ebx,%ebx
0x6019af4b:  mov    (%r14),%r15
0x6019af4e:  mov    $0xffffffff00000000,%r10
0x6019af58:  and    %r10,%r15
0x6019af5b:  or     %rbx,%r15
0x6019af5e:  mov    %rbp,%rbx
0x6019af61:  mov    $0x80045002,%r10d
0x6019af67:  mov    %r10,0x158(%r14)
0x6019af6e:  mov    %ebp,0xd1a0(%r14)
0x6019af75:  mov    %ebx,0x160(%r14)
0x6019af7c:  mov    %r15,(%r14)
0x6019af7f:  mov    %r12,0x20(%r14)
0x6019af83:  mov    %r13,0x28(%r14)
0x6019af87:  xor    %eax,%eax
0x6019af89:  jmpq   0x621dd78e

It's more than twice the size and has ten memory accesses instead of=20
seven.

CU
Uli

--=20
SUSE LINUX Products GmbH, GF: Markus Rex, HRB 16746 (AG N=FCrnberg)