From mboxrd@z Thu Jan  1 00:00:00 1970
Received: from mailman by lists.gnu.org with tmda-scanned (Exim 4.43)
	id 1KfNIs-0007cE-IG
	for qemu-devel@nongnu.org; Mon, 15 Sep 2008 19:16:22 -0400
Received: from exim by lists.gnu.org with spam-scanned (Exim 4.43)
	id 1KfNIs-0007bs-2X
	for qemu-devel@nongnu.org; Mon, 15 Sep 2008 19:16:22 -0400
Received: from [199.232.76.173] (port=52220 helo=monty-python.gnu.org)
	by lists.gnu.org with esmtp (Exim 4.43) id 1KfNIr-0007bo-Kp
	for qemu-devel@nongnu.org; Mon, 15 Sep 2008 19:16:21 -0400
Received: from rv-out-0708.google.com ([209.85.198.243]:21451)
	by monty-python.gnu.org with esmtp (Exim 4.60)
	(envelope-from <balrogg@gmail.com>) id 1KfNIq-0005Ac-Vh
	for qemu-devel@nongnu.org; Mon, 15 Sep 2008 19:16:21 -0400
Received: by rv-out-0708.google.com with SMTP id f25so2291970rvb.22
	for <qemu-devel@nongnu.org>; Mon, 15 Sep 2008 16:16:19 -0700 (PDT)
Message-ID: <fb249edb0809151616u3fdc07f6qfa619fb19a6dd317@mail.gmail.com>
Date: Tue, 16 Sep 2008 01:16:19 +0200
From: "andrzej zaborowski" <balrogg@gmail.com>
Subject: Re: [Qemu-devel] TCG native 32->64 concatenation
In-Reply-To: <200809071753.27384.paul@codesourcery.com>
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 7bit
Content-Disposition: inline
References: <200809071753.27384.paul@codesourcery.com>
Reply-To: qemu-devel@nongnu.org
List-Id: qemu-devel.nongnu.org
List-Unsubscribe: <http://lists.nongnu.org/mailman/listinfo/qemu-devel>,
	<mailto:qemu-devel-request@nongnu.org?subject=unsubscribe>
List-Archive: <http://lists.gnu.org/pipermail/qemu-devel>
List-Post: <mailto:qemu-devel@nongnu.org>
List-Help: <mailto:qemu-devel-request@nongnu.org?subject=help>
List-Subscribe: <http://lists.nongnu.org/mailman/listinfo/qemu-devel>,
	<mailto:qemu-devel-request@nongnu.org?subject=subscribe>
To: qemu-devel@nongnu.org

2008/9/7 Paul Brook <paul@codesourcery.com>:
> The patch below adds a new concat_i32_i64 TCG op.  This allows a pair of
> 32-bit values to be efficiently combined to form a 64-bit value.  I've
> converted all the cases I could find to use this, and tested the arm code on
> both 32 and 64-bit hosts.
>
> This touches bits of code that I can't easily test well, so I'd appreciate
> another pair of eyes looking over it before I commit.
>
> Signed-off-by: Paul Brook <paul@codesourcery.com>
>
> Index: target-sh4/translate.c
> ===================================================================
> --- target-sh4/translate.c      (revision 5178)
> +++ target-sh4/translate.c      (working copy)
> @@ -393,15 +393,12 @@ static inline void gen_load_fpr32(TCGv t
>  static inline void gen_load_fpr64(TCGv t, int reg)
>  {
>     TCGv tmp1 = tcg_temp_new(TCG_TYPE_I32);
> -    TCGv tmp2 = tcg_temp_new(TCG_TYPE_I64);
> +    TCGv tmp2 = tcg_temp_new(TCG_TYPE_I32);
>
>     tcg_gen_ld_i32(tmp1, cpu_env, offsetof(CPUState, fregs[reg]));
> -    tcg_gen_extu_i32_i64(t, tmp1);
> -    tcg_gen_shli_i64(t, t, 32);
> -    tcg_gen_ld_i32(tmp1, cpu_env, offsetof(CPUState, fregs[reg + 1]));
> -    tcg_gen_extu_i32_i64(tmp2, tmp1);
> +    tcg_gen_ld_i32(tmp2, cpu_env, offsetof(CPUState, fregs[reg + 1]));
> +    tcg_gen_concat_i32_i64(t, tmp2, tmp1);
>     tcg_temp_free(tmp1);
> -    tcg_gen_or_i64(t, t, tmp2);
>     tcg_temp_free(tmp2);
>  }
>
> Index: target-ppc/translate.c
> ===================================================================
> --- target-ppc/translate.c      (revision 5178)
> +++ target-ppc/translate.c      (working copy)
> @@ -5308,12 +5308,7 @@ static always_inline void gen_load_gpr64
>  #if defined(TARGET_PPC64)
>     tcg_gen_mov_i64(t, cpu_gpr[reg]);
>  #else
> -    tcg_gen_extu_i32_i64(t, cpu_gprh[reg]);
> -    tcg_gen_shli_i64(t, t, 32);
> -    TCGv tmp = tcg_temp_local_new(TCG_TYPE_I64);
> -    tcg_gen_extu_i32_i64(tmp, cpu_gpr[reg]);
> -    tcg_gen_or_i64(t, t, tmp);
> -    tcg_temp_free(tmp);
> +    tcg_gen_concat_i32_i64(t, cpu_gpr[reg], cpu_gprh[reg]);
>  #endif
>  }
>
> Index: target-mips/translate.c
> ===================================================================
> --- target-mips/translate.c     (revision 5178)
> +++ target-mips/translate.c     (working copy)
> @@ -666,14 +666,11 @@ static inline void gen_load_fpr64 (Disas
>         tcg_gen_ld_i64(t, current_fpu, 8 * reg);
>     } else {
>         TCGv r_tmp1 = tcg_temp_new(TCG_TYPE_I32);
> -        TCGv r_tmp2 = tcg_temp_new(TCG_TYPE_I64);
> +        TCGv r_tmp2 = tcg_temp_new(TCG_TYPE_I32);
>
>         tcg_gen_ld_i32(r_tmp1, current_fpu, 8 * (reg | 1) + 4 *
> FP_ENDIAN_IDX);
> -        tcg_gen_extu_i32_i64(t, r_tmp1);
> -        tcg_gen_shli_i64(t, t, 32);
> -        tcg_gen_ld_i32(r_tmp1, current_fpu, 8 * (reg & ~1) + 4 *
> FP_ENDIAN_IDX);
> -        tcg_gen_extu_i32_i64(r_tmp2, r_tmp1);
> -        tcg_gen_or_i64(t, t, r_tmp2);
> +        tcg_gen_ld_i32(r_tmp2, current_fpu, 8 * (reg & ~1) + 4 *
> FP_ENDIAN_IDX);
> +        tcg_gen_concat_i32_i64(t, r_tmp2, r_tmp1);
>         tcg_temp_free(r_tmp1);
>         tcg_temp_free(r_tmp2);
>     }
> @@ -6531,22 +6528,17 @@ static void gen_farith (DisasContext *ct
>     case FOP(38, 16):
>         check_cp1_64bitmode(ctx);
>         {
> -            TCGv fp64_0 = tcg_temp_new(TCG_TYPE_I64);
> -            TCGv fp64_1 = tcg_temp_new(TCG_TYPE_I64);
> +            TCGv fp64 = tcg_temp_new(TCG_TYPE_I64);
>             TCGv fp32_0 = tcg_temp_new(TCG_TYPE_I32);
>             TCGv fp32_1 = tcg_temp_new(TCG_TYPE_I32);
>
>             gen_load_fpr32(fp32_0, fs);
>             gen_load_fpr32(fp32_1, ft);
> -            tcg_gen_extu_i32_i64(fp64_0, fp32_0);
> -            tcg_gen_extu_i32_i64(fp64_1, fp32_1);
> -            tcg_temp_free(fp32_0);
> +            tcg_gen_concat_i32_i64(fp64, fp32_0, fp32_1);
>             tcg_temp_free(fp32_1);
> -            tcg_gen_shli_i64(fp64_1, fp64_1, 32);
> -            tcg_gen_or_i64(fp64_0, fp64_0, fp64_1);
> -            tcg_temp_free(fp64_1);
> -            gen_store_fpr64(ctx, fp64_0, fd);
> -            tcg_temp_free(fp64_0);
> +            tcg_temp_free(fp32_0);
> +            gen_store_fpr64(ctx, fp64, fd);
> +            tcg_temp_free(fp64);
>         }
>         opn = "cvt.ps.s";
>         break;
> Index: tcg/tcg-op.h
> ===================================================================
> --- tcg/tcg-op.h        (revision 5178)
> +++ tcg/tcg-op.h        (working copy)
> @@ -1395,6 +1395,23 @@ static inline void tcg_gen_discard_i64(T
>  }
>  #endif
>
> +static inline void tcg_gen_concat_i32_i64(TCGv dest, TCGv low, TCGv high)
> +{
> +#if TCG_TARGET_REG_BITS == 32
> +    tcg_gen_mov_i32(dest, low);
> +    tcg_gen_mov_i32(TCGV_HIGH(dest), high);
> +#else
> +    TCGv tmp = tcg_temp_new (TCG_TYPE_I64);
> +    /* This extension is only needed for type correctness.
> +       We may be able to do better given target specific information.  */
> +    tcg_gen_extu_i32_i64(tmp, high);
> +    tcg_gen_shli_i64(tmp, tmp, 32);
> +    tcg_gen_extu_i32_i64(dest, low);
> +    tcg_gen_or_i64(dest, dest, tmp);
> +    tcg_temp_free(tmp);
> +#endif
> +}
> +
>  /***************************************/
>  /* QEMU specific operations. Their type depend on the QEMU CPU
>    type. */
> Index: tcg/README
> ===================================================================
> --- tcg/README  (revision 5178)
> +++ tcg/README  (working copy)
> @@ -265,6 +265,10 @@ Convert t1 (32 bit) to t0 (64 bit) and d
>  * trunc_i64_i32 t0, t1
>  Truncate t1 (64 bit) to t0 (32 bit)
>
> +* concat_i32_i64 t0, t1, t2
> +Construct t0 (64-bit) taking the low half from t1 (32 bit) and the high half
> +from t2 (32 bit).
> +
>  ********* Load/Store
>
>  * ld_i32/i64 t0, t1, offset
> Index: target-arm/translate.c
> ===================================================================
> --- target-arm/translate.c      (revision 5178)
> +++ target-arm/translate.c      (working copy)
> @@ -1447,10 +1447,7 @@ static void gen_iwmmxt_movl_T0_T1_wRn(in
>
>  static void gen_iwmmxt_movl_wRn_T0_T1(int rn)
>  {
> -    tcg_gen_extu_i32_i64(cpu_V0, cpu_T[0]);
> -    tcg_gen_extu_i32_i64(cpu_V1, cpu_T[0]);
> -    tcg_gen_shli_i64(cpu_V1, cpu_V1, 32);
> -    tcg_gen_or_i64(cpu_V0, cpu_V0, cpu_V1);
> +    tcg_gen_concat_i32_i64(cpu_V0, cpu_T[0], cpu_T[0]);

Oh, I think this was supposed to use T0 and T1 instead of duplicating
T0, so changing this to

+    tcg_gen_concat_i32_i64(cpu_V0, cpu_T[0], cpu_T[1]);

would fix an old bug.

Cheers