All of lore.kernel.org
 help / color / mirror / Atom feed
From: Aurelien Jarno <aurelien@aurel32.net>
To: Richard Henderson <rth@twiddle.net>
Cc: qemu-devel@nongnu.org
Subject: Re: [Qemu-devel] [PATCH 5/7] target-mips: Use TCG registers for the FPU.
Date: Tue, 18 Sep 2012 18:39:15 +0200	[thread overview]
Message-ID: <20120918163915.GA22929@ohm.aurel32.net> (raw)
In-Reply-To: <1347917713-23343-6-git-send-email-rth@twiddle.net>

On Mon, Sep 17, 2012 at 02:35:11PM -0700, Richard Henderson wrote:
> With normal FP, this doesn't have much effect on the generated code,
> because most of the FP operations are not CONST/PURE, and so we spill
> registers in about the same frequency as the explicit load/stores.
> 
> But with Loongson multimedia instructions, which are all integral and
> whose helpers are in fact CONST+PURE, this greatly improves the code.
> 
> Rather than over-use the deposit operation, we create TCG registers for
> both the 64-bit FPU register as a whole and the two 32-bit halves.  We
> only ever reference the whole register or the two half registers in any
> one TB, so there's no problem with aliasing.
> 
> Signed-off-by: Richard Henderson <rth@twiddle.net>
> ---
>  target-mips/translate.c | 141 +++++++++++++++++++++++++++++++++---------------
>  1 file changed, 97 insertions(+), 44 deletions(-)
> 
> diff --git a/target-mips/translate.c b/target-mips/translate.c
> index b4301e9..df92cec 100644
> --- a/target-mips/translate.c
> +++ b/target-mips/translate.c
> @@ -479,6 +479,12 @@ static TCGv cpu_dspctrl, btarget, bcond;
>  static TCGv_i32 hflags;
>  static TCGv_i32 fpu_fcr0, fpu_fcr31;
>  
> +/* FPU registers.  These alias, but we'll only use one or the other
> +   in any one TB based on MIPS_HFLAG_F64.  */
> +static TCGv_i32 fpu_f32[32];
> +static TCGv_i32 fpu_fh32[32];
> +static TCGv_i64 fpu_f64[32];
> +
>  static uint32_t gen_opc_hflags[OPC_BUF_SIZE];
>  
>  #include "gen-icount.h"
> @@ -545,26 +551,45 @@ enum {
>      BS_EXCP     = 3, /* We reached an exception condition */
>  };
>  
> -static const char *regnames[] =
> -    { "r0", "at", "v0", "v1", "a0", "a1", "a2", "a3",
> -      "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7",
> -      "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7",
> -      "t8", "t9", "k0", "k1", "gp", "sp", "s8", "ra", };
> +static const char * const regnames[] = {
> +    "r0", "at", "v0", "v1", "a0", "a1", "a2", "a3",
> +    "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7",
> +    "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7",
> +    "t8", "t9", "k0", "k1", "gp", "sp", "s8", "ra",
> +};
> +
> +static const char * const regnames_HI[] = {
> +    "HI0", "HI1", "HI2", "HI3",
> +};
>  
> -static const char *regnames_HI[] =
> -    { "HI0", "HI1", "HI2", "HI3", };
> +static const char * const regnames_LO[] = {
> +    "LO0", "LO1", "LO2", "LO3",
> +};
>  
> -static const char *regnames_LO[] =
> -    { "LO0", "LO1", "LO2", "LO3", };
> +static const char * const regnames_ACX[] = {
> +    "ACX0", "ACX1", "ACX2", "ACX3",
> +};
>  
> -static const char *regnames_ACX[] =
> -    { "ACX0", "ACX1", "ACX2", "ACX3", };
> +static const char * const fregnames[] = {
> +    "f0",  "f1",  "f2",  "f3",  "f4",  "f5",  "f6",  "f7",
> +    "f8",  "f9",  "f10", "f11", "f12", "f13", "f14", "f15",
> +    "f16", "f17", "f18", "f19", "f20", "f21", "f22", "f23",
> +    "f24", "f25", "f26", "f27", "f28", "f29", "f30", "f31",
> +};
>  
> -static const char *fregnames[] =
> -    { "f0",  "f1",  "f2",  "f3",  "f4",  "f5",  "f6",  "f7",
> -      "f8",  "f9",  "f10", "f11", "f12", "f13", "f14", "f15",
> -      "f16", "f17", "f18", "f19", "f20", "f21", "f22", "f23",
> -      "f24", "f25", "f26", "f27", "f28", "f29", "f30", "f31", };
> +static const char * const flregnames[] = {
> +    "fl0",  "fl1",  "fl2",  "fl3",  "fl4",  "fl5",  "fl6",  "fl7",
> +    "fl8",  "fl9",  "fl10", "fl11", "fl12", "fl13", "fl14", "fl15",
> +    "fl16", "fl17", "fl18", "fl19", "fl20", "fl21", "fl22", "fl23",
> +    "fl24", "fl25", "fl26", "fl27", "fl28", "fl29", "fl30", "fl31",
> +};
> +
> +static const char * const fhregnames[] = {
> +    "fh0",  "fh1",  "fh2",  "fh3",  "fh4",  "fh5",  "fh6",  "fh7",
> +    "fh8",  "fh9",  "fh10", "fh11", "fh12", "fh13", "fh14", "fh15",
> +    "fh16", "fh17", "fh18", "fh19", "fh20", "fh21", "fh22", "fh23",
> +    "fh24", "fh25", "fh26", "fh27", "fh28", "fh29", "fh30", "fh31",
> +};
>
>  #ifdef MIPS_DEBUG_DISAS
>  #define MIPS_DEBUG(fmt, ...)                                                  \
> @@ -662,55 +687,70 @@ static inline void gen_store_srsgpr (int from, int to)
>  }
>  
>  /* Floating point register moves. */
> -static inline void gen_load_fpr32(DisasContext *ctx, TCGv_i32 t, int reg)
> +static void gen_load_fpr32(DisasContext *ctx, TCGv_i32 t, int reg)
>  {
> -    tcg_gen_ld_i32(t, cpu_env, offsetof(CPUMIPSState, active_fpu.fpr[reg].w[FP_ENDIAN_IDX]));
> +    if (ctx->hflags & MIPS_HFLAG_F64) {
> +        tcg_gen_trunc_i64_i32(t, fpu_f64[reg]);
> +    } else {
> +        tcg_gen_mov_i32(t, fpu_f32[reg]);
> +    }
>  }
>  
> -static inline void gen_store_fpr32(DisasContext *ctx, TCGv_i32 t, int reg)
> +static void gen_store_fpr32(DisasContext *ctx, TCGv_i32 t, int reg)
>  {
> -    tcg_gen_st_i32(t, cpu_env, offsetof(CPUMIPSState, active_fpu.fpr[reg].w[FP_ENDIAN_IDX]));
> +    if (ctx->hflags & MIPS_HFLAG_F64) {
> +        TCGv_i64 t64 = tcg_temp_new_i64();
> +        tcg_gen_ext_i32_i64(t64, t);
> +        tcg_gen_deposit_i64(fpu_f64[reg], fpu_f64[reg], t64, 0, 32);
> +        tcg_temp_free_i64(t64);
> +    } else {
> +        tcg_gen_mov_i32(fpu_f32[reg], t);
> +    }
>  }
>  
> -static inline void gen_load_fpr32h(DisasContext *ctx, TCGv_i32 t, int reg)
> +static void gen_load_fpr32h(DisasContext *ctx, TCGv_i32 t, int reg)
>  {
> -    tcg_gen_ld_i32(t, cpu_env, offsetof(CPUMIPSState, active_fpu.fpr[reg].w[!FP_ENDIAN_IDX]));
> +    if (ctx->hflags & MIPS_HFLAG_F64) {
> +        TCGv_i64 t64 = tcg_temp_new_i64();
> +        tcg_gen_shri_i64(t64, fpu_f64[reg], 32);
> +        tcg_gen_trunc_i64_i32(t, t64);
> +        tcg_temp_free_i64(t64);
> +    } else {
> +        tcg_gen_mov_i32(t, fpu_fh32[reg]);
> +    }
>  }
>  
> -static inline void gen_store_fpr32h(DisasContext *ctx, TCGv_i32 t, int reg)
> +static void gen_store_fpr32h(DisasContext *ctx, TCGv_i32 t, int reg)
>  {
> -    tcg_gen_st_i32(t, cpu_env, offsetof(CPUMIPSState, active_fpu.fpr[reg].w[!FP_ENDIAN_IDX]));
> +    if (ctx->hflags & MIPS_HFLAG_F64) {
> +        TCGv_i64 t64 = tcg_temp_new_i64();
> +        tcg_gen_ext_i32_i64(t64, t);
> +        tcg_gen_deposit_i64(fpu_f64[reg], fpu_f64[reg], t64, 32, 32);
> +        tcg_temp_free_i64(t64);
> +    } else {
> +        tcg_gen_mov_i32(fpu_fh32[reg], t);
> +    }
>  }
>  
> -static inline void gen_load_fpr64(DisasContext *ctx, TCGv_i64 t, int reg)
> +static void gen_load_fpr64(DisasContext *ctx, TCGv_i64 t, int reg)
>  {
>      if (ctx->hflags & MIPS_HFLAG_F64) {
> -        tcg_gen_ld_i64(t, cpu_env, offsetof(CPUMIPSState, active_fpu.fpr[reg].d));
> +        tcg_gen_mov_i64(t, fpu_f64[reg]);
>      } else {
> -        TCGv_i32 t0 = tcg_temp_new_i32();
> -        TCGv_i32 t1 = tcg_temp_new_i32();
> -        gen_load_fpr32(ctx, t0, reg & ~1);
> -        gen_load_fpr32(ctx, t1, reg | 1);
> -        tcg_gen_concat_i32_i64(t, t0, t1);
> -        tcg_temp_free_i32(t0);
> -        tcg_temp_free_i32(t1);
> +        tcg_gen_concat_i32_i64(t, fpu_f32[reg & ~1], fpu_f32[reg | 1]);
>      }
>  }
>  
> -static inline void gen_store_fpr64(DisasContext *ctx, TCGv_i64 t, int reg)
> +static void gen_store_fpr64(DisasContext *ctx, TCGv_i64 t, int reg)
>  {
>      if (ctx->hflags & MIPS_HFLAG_F64) {
> -        tcg_gen_st_i64(t, cpu_env, offsetof(CPUMIPSState, active_fpu.fpr[reg].d));
> +        tcg_gen_mov_i64(fpu_f64[reg], t);
>      } else {
> -        TCGv_i64 t0 = tcg_temp_new_i64();
> -        TCGv_i32 t1 = tcg_temp_new_i32();
> -        tcg_gen_trunc_i64_i32(t1, t);
> -        gen_store_fpr32(ctx, t1, reg & ~1);
> -        tcg_gen_shri_i64(t0, t, 32);
> -        tcg_gen_trunc_i64_i32(t1, t0);
> -        gen_store_fpr32(ctx, t1, reg | 1);
> -        tcg_temp_free_i32(t1);
> -        tcg_temp_free_i64(t0);
> +        TCGv_i64 t64 = tcg_temp_new_i64();
> +        tcg_gen_shri_i64(t64, t, 32);
> +        tcg_gen_trunc_i64_i32(fpu_f32[reg | 1], t64);
> +        tcg_temp_free_i64(t64);
> +        tcg_gen_trunc_i64_i32(fpu_f32[reg & ~1], t);
>      }
>  }
>  
> @@ -12694,6 +12734,19 @@ static void mips_tcg_init(void)
>                                         offsetof(CPUMIPSState, active_fpu.fcr31),
>                                         "fcr31");
>  
> +    for (i = 0; i < 32; i++) {
> +        int off = offsetof(CPUMIPSState, active_fpu.fpr[i].w[FP_ENDIAN_IDX]);
> +        fpu_f32[i] = tcg_global_mem_new_i32(TCG_AREG0, off, flregnames[i]);
> +    }
> +    for (i = 0; i < 32; i++) {
> +        int off = offsetof(CPUMIPSState, active_fpu.fpr[i].w[!FP_ENDIAN_IDX]);
> +        fpu_fh32[i] = tcg_global_mem_new_i32(TCG_AREG0, off, fhregnames[i]);
> +    }
> +    for (i = 0; i < 32; i++) {
> +        int off = offsetof(CPUMIPSState, active_fpu.fpr[i].w[FP_ENDIAN_IDX]);

This should be fpr[i].d.

> +        fpu_f64[i] = tcg_global_mem_new_i64(TCG_AREG0, off, fregnames[i]);
> +    }
> +

Adding so many globals (i.e. multiplying by 4) has a cost that is greater
than the gains. Remember the register allocator is doing a loop on all
globals at the end of a basic block or when calling a non-CONST
helper/op. While the generated code looks nicer, this slows down the
guest by roughly 12% (measured on boot time).

I am currently working on an optimization of the liveness/register
allocator which among other things, partly mitigates that (I hope to get
the patches ready for posting in a week or so). That said, the slowdown
is still around 3%. I think we should go for only mapping the fp
registers as 64-bit registers, and use trunc/shift/deposit to read/write
them. Of course the generated code doesn't look so nice, but what is
important is that the overall execution is faster, not slower.

>      /* register helpers */
>  #define GEN_HELPER 2
>  #include "helper.h"
> -- 
> 1.7.11.4
> 

-- 
Aurelien Jarno                          GPG: 1024D/F1BCDB73
aurelien@aurel32.net                 http://www.aurel32.net

  reply	other threads:[~2012-09-18 16:39 UTC|newest]

Thread overview: 14+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2012-09-17 21:35 [Qemu-devel] [PATCH v2 0/7] target-mips improvements Richard Henderson
2012-09-17 21:35 ` [Qemu-devel] [PATCH 1/7] target-mips: Set opn in gen_ldst_multiple Richard Henderson
2012-09-18 16:38   ` Aurelien Jarno
2012-09-17 21:35 ` [Qemu-devel] [PATCH 2/7] target-mips: Fix MIPS_DEBUG Richard Henderson
2012-09-18 16:38   ` Aurelien Jarno
2012-09-17 21:35 ` [Qemu-devel] [PATCH 3/7] target-mips: Always evaluate debugging macro arguments Richard Henderson
2012-09-18 16:38   ` Aurelien Jarno
2012-09-17 21:35 ` [Qemu-devel] [PATCH 4/7] target-mips: Pass DisasContext to fpr32 load/store routines Richard Henderson
2012-09-18 16:39   ` Aurelien Jarno
2012-09-17 21:35 ` [Qemu-devel] [PATCH 5/7] target-mips: Use TCG registers for the FPU Richard Henderson
2012-09-18 16:39   ` Aurelien Jarno [this message]
2012-09-17 21:35 ` [Qemu-devel] [PATCH 6/7] target-mips: Add accessors for the two 32-bit halves of a 64-bit FPR Richard Henderson
2012-09-17 21:35 ` [Qemu-devel] [PATCH 7/7] target-mips: Implement Loongson Multimedia Instructions Richard Henderson
2012-09-18 16:39   ` Aurelien Jarno

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20120918163915.GA22929@ohm.aurel32.net \
    --to=aurelien@aurel32.net \
    --cc=qemu-devel@nongnu.org \
    --cc=rth@twiddle.net \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.