All of lore.kernel.org
 help / color / mirror / Atom feed
From: Alexander Graf <agraf@suse.de>
To: Paolo Bonzini <pbonzini@redhat.com>, qemu-devel@nongnu.org
Cc: dgibson@redhat.com, qemu-ppc@nongnu.org, tommusta@gmail.com
Subject: Re: [Qemu-devel] [Qemu-ppc] [PATCH 02/17] ppc: avoid excessive TLB flushing
Date: Fri, 05 Sep 2014 09:10:01 +0200	[thread overview]
Message-ID: <540961C9.9000705@suse.de> (raw)
In-Reply-To: <1409246113-6519-3-git-send-email-pbonzini@redhat.com>



On 28.08.14 19:14, Paolo Bonzini wrote:
> PowerPC TCG flushes the TLB on every IR/DR change, which basically
> means on every user<->kernel context switch.  Use the 6-element
> TLB array as a cache, where each MMU index is mapped to a different
> state of the IR/DR/PR/HV bits.
> 
> This brings the number of TLB flushes down from ~900000 to ~50000
> for starting up the Debian installer, which is in line with x86
> and gives a ~10% performance improvement.
> 
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>  cputlb.c                    | 19 +++++++++++++++++
>  hw/ppc/spapr_hcall.c        |  6 +++++-
>  include/exec/exec-all.h     |  5 +++++
>  target-ppc/cpu.h            |  4 +++-
>  target-ppc/excp_helper.c    |  6 +-----
>  target-ppc/helper_regs.h    | 52 +++++++++++++++++++++++++++++++--------------
>  target-ppc/translate_init.c |  5 +++++
>  7 files changed, 74 insertions(+), 23 deletions(-)
> 
> diff --git a/cputlb.c b/cputlb.c
> index afd3705..17e1b03 100644
> --- a/cputlb.c
> +++ b/cputlb.c
> @@ -67,6 +67,25 @@ void tlb_flush(CPUState *cpu, int flush_global)
>      tlb_flush_count++;
>  }
>  
> +void tlb_flush_idx(CPUState *cpu, int mmu_idx)
> +{
> +    CPUArchState *env = cpu->env_ptr;
> +
> +#if defined(DEBUG_TLB)
> +    printf("tlb_flush_idx %d:\n", mmu_idx);
> +#endif
> +    /* must reset current TB so that interrupts cannot modify the
> +       links while we are modifying them */
> +    cpu->current_tb = NULL;
> +
> +    memset(env->tlb_table[mmu_idx], -1, sizeof(env->tlb_table[mmu_idx]));
> +    memset(cpu->tb_jmp_cache, 0, sizeof(cpu->tb_jmp_cache));
> +
> +    env->tlb_flush_addr = -1;
> +    env->tlb_flush_mask = 0;
> +    tlb_flush_count++;
> +}
> +
>  static inline void tlb_flush_entry(CPUTLBEntry *tlb_entry, target_ulong addr)
>  {
>      if (addr == (tlb_entry->addr_read &
> diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
> index 467858c..b95961c 100644
> --- a/hw/ppc/spapr_hcall.c
> +++ b/hw/ppc/spapr_hcall.c
> @@ -556,13 +556,17 @@ static target_ulong h_cede(PowerPCCPU *cpu, sPAPREnvironment *spapr,
>  {
>      CPUPPCState *env = &cpu->env;
>      CPUState *cs = CPU(cpu);
> +    bool flush;
>  
>      env->msr |= (1ULL << MSR_EE);
> -    hreg_compute_hflags(env);
> +    flush = hreg_compute_hflags(env);
>      if (!cpu_has_work(cs)) {
>          cs->halted = 1;
>          cs->exception_index = EXCP_HLT;
>          cs->exit_request = 1;
> +    } else if (flush) {
> +        cs->interrupt_request |= CPU_INTERRUPT_EXITTB;
> +        cs->exit_request = 1;

Can this ever happen?

>      }
>      return H_SUCCESS;
>  }
> diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
> index 5e5d86e..629a550 100644
> --- a/include/exec/exec-all.h
> +++ b/include/exec/exec-all.h
> @@ -100,6 +100,7 @@ void tcg_cpu_address_space_init(CPUState *cpu, AddressSpace *as);
>  /* cputlb.c */
>  void tlb_flush_page(CPUState *cpu, target_ulong addr);
>  void tlb_flush(CPUState *cpu, int flush_global);
> +void tlb_flush_idx(CPUState *cpu, int mmu_idx);
>  void tlb_set_page(CPUState *cpu, target_ulong vaddr,
>                    hwaddr paddr, int prot,
>                    int mmu_idx, target_ulong size);
> @@ -112,6 +113,10 @@ static inline void tlb_flush_page(CPUState *cpu, target_ulong addr)
>  static inline void tlb_flush(CPUState *cpu, int flush_global)
>  {
>  }
> +
> +static inline void tlb_flush_idx(CPUState *cpu, int mmu_idx)
> +{
> +}
>  #endif
>  
>  #define CODE_GEN_ALIGN           16 /* must be >= of the size of a icache line */
> diff --git a/target-ppc/cpu.h b/target-ppc/cpu.h
> index b64c652..c1cb27f 100644
> --- a/target-ppc/cpu.h
> +++ b/target-ppc/cpu.h
> @@ -922,7 +922,7 @@ struct ppc_segment_page_sizes {
>  
>  /*****************************************************************************/
>  /* The whole PowerPC CPU context */
> -#define NB_MMU_MODES 3
> +#define NB_MMU_MODES 6
>  
>  #define PPC_CPU_OPCODES_LEN 0x40
>  
> @@ -1085,6 +1085,8 @@ struct CPUPPCState {
>      target_ulong hflags;      /* hflags is a MSR & HFLAGS_MASK         */
>      target_ulong hflags_nmsr; /* specific hflags, not coming from MSR */
>      int mmu_idx;         /* precomputed MMU index to speed up mem accesses */
> +    uint32_t mmu_msr[NB_MMU_MODES];  /* ir/dr/hv/pr values for TLBs */
> +    int mmu_fifo;  /* for replacement in mmu_msr */
>  
>      /* Power management */
>      int (*check_pow)(CPUPPCState *env);
> diff --git a/target-ppc/excp_helper.c b/target-ppc/excp_helper.c
> index be71590..bf25d44 100644
> --- a/target-ppc/excp_helper.c
> +++ b/target-ppc/excp_helper.c
> @@ -623,9 +623,6 @@ static inline void powerpc_excp(PowerPCCPU *cpu, int excp_model, int excp)
>  
>      if (env->spr[SPR_LPCR] & LPCR_AIL) {
>          new_msr |= (1 << MSR_IR) | (1 << MSR_DR);
> -    } else if (msr & ((1 << MSR_IR) | (1 << MSR_DR))) {
> -        /* If we disactivated any translation, flush TLBs */
> -        tlb_flush(cs, 1);
>      }
>  
>  #ifdef TARGET_PPC64
> @@ -678,8 +675,7 @@ static inline void powerpc_excp(PowerPCCPU *cpu, int excp_model, int excp)
>      if ((env->mmu_model == POWERPC_MMU_BOOKE) ||
>          (env->mmu_model == POWERPC_MMU_BOOKE206)) {
>          /* XXX: The BookE changes address space when switching modes,
> -                we should probably implement that as different MMU indexes,
> -                but for the moment we do it the slow way and flush all.  */
> +                TODO: still needed?!?  */
>          tlb_flush(cs, 1);
>      }
>  }
> diff --git a/target-ppc/helper_regs.h b/target-ppc/helper_regs.h
> index 271fddf..291f9c1 100644
> --- a/target-ppc/helper_regs.h
> +++ b/target-ppc/helper_regs.h
> @@ -39,17 +39,38 @@ static inline void hreg_swap_gpr_tgpr(CPUPPCState *env)
>      env->tgpr[3] = tmp;
>  }
>  
> -static inline void hreg_compute_mem_idx(CPUPPCState *env)
> +static inline bool hreg_compute_mem_idx(CPUPPCState *env)
>  {
> -    /* Precompute MMU index */
> -    if (msr_pr == 0 && msr_hv != 0) {
> -        env->mmu_idx = 2;
> -    } else {
> -        env->mmu_idx = 1 - msr_pr;
> +    CPUState *cs = CPU(ppc_env_get_cpu(env));
> +    int msr = env->msr;
> +    int i;
> +
> +    if (!tcg_enabled()) {
> +        return false;
> +    }
> +
> +    msr &= (1 << MSR_IR) | (1 << MSR_DR) | (1 << MSR_PR) | MSR_HVB;
> +    if (msr_pr == 1) {
> +        msr &= ~MSR_HVB;
>      }
> +
> +    for (i = 0; i < NB_MMU_MODES; i++) {
> +        if (env->mmu_msr[i] == msr) {
> +            env->mmu_idx = i;
> +            return false;
> +        }
> +    }
> +
> +    /* Use a new index with FIFO replacement.  */
> +    i = (env->mmu_fifo == NB_MMU_MODES - 1 ? 0 : env->mmu_fifo + 1);
> +    env->mmu_fifo = i;
> +    env->mmu_msr[i] = msr;
> +    env->mmu_idx = i;
> +    tlb_flush_idx(cs, i);
> +    return true;
>  }

Ok, so this basically changes the semantics of mmu_idx from a static
array with predefined meanings to a dynamic array with runtime changing
semantics.

The first thing that comes to mind here is why we're not just extending
the existing array? After all, we have 4 bits -> 16 states minus one for
PR+HV. Can our existing logic not deal with this?

Second thing I'm failing to grasp still is that in the previous patch
you're changing ctx.mem_idx into to different static semantics. But that
mem_idx gets passed to our ld/st helpers which again boils down to the
mem_idx above, no? So aren't we accessing random unrelated mmu contexts now?

There's a good chance I'm not fully grasping something here :).


Alex

  parent reply	other threads:[~2014-09-05  7:10 UTC|newest]

Thread overview: 50+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2014-08-28 17:14 [Qemu-devel] [RFT/RFH PATCH 00/16] PPC speedup patches for TCG Paolo Bonzini
2014-08-28 17:14 ` [Qemu-devel] [PATCH 01/17] ppc: do not look at the MMU index Paolo Bonzini
2014-08-28 17:14 ` [Qemu-devel] [PATCH 02/17] ppc: avoid excessive TLB flushing Paolo Bonzini
2014-08-28 17:30   ` Peter Maydell
2014-08-28 19:35     ` Paolo Bonzini
2014-09-05  6:00       ` David Gibson
2014-09-05  7:10   ` Alexander Graf [this message]
2014-09-05 12:11     ` [Qemu-devel] [Qemu-ppc] " Paolo Bonzini
2014-09-09 16:42       ` Paolo Bonzini
2014-09-09 20:51         ` Alexander Graf
2014-08-28 17:14 ` [Qemu-devel] [PATCH 03/17] ppc: fix monitor access to CR Paolo Bonzini
2014-09-03 18:21   ` Tom Musta
2014-09-05  7:10     ` [Qemu-devel] [Qemu-ppc] " Alexander Graf
2014-08-28 17:15 ` [Qemu-devel] [PATCH 04/17] ppc: use ARRAY_SIZE in gdbstub.c Paolo Bonzini
2014-09-03 18:21   ` Tom Musta
2014-08-28 17:15 ` [Qemu-devel] [PATCH 05/17] ppc: use CRF_* in fpu_helper.c Paolo Bonzini
2014-09-03 18:21   ` Tom Musta
2014-08-28 17:15 ` [Qemu-devel] [PATCH 06/17] ppc: use CRF_* in int_helper.c Paolo Bonzini
2014-09-03 18:28   ` Tom Musta
2014-09-05  7:12     ` [Qemu-devel] [Qemu-ppc] " Alexander Graf
2014-08-28 17:15 ` [Qemu-devel] [PATCH 07/17] ppc: fix result of DLMZB when no zero bytes are found Paolo Bonzini
2014-09-03 18:28   ` Tom Musta
2014-09-05  7:26     ` [Qemu-devel] [Qemu-ppc] " Alexander Graf
2014-08-28 17:15 ` [Qemu-devel] [PATCH 08/17] ppc: introduce helpers for mfocrf/mtocrf Paolo Bonzini
2014-09-03 18:28   ` Tom Musta
2014-08-28 17:15 ` [Qemu-devel] [PATCH 09/17] ppc: reorganize gen_compute_fprf Paolo Bonzini
2014-09-03 18:29   ` Tom Musta
2014-08-28 17:15 ` [Qemu-devel] [PATCH 10/17] ppc: introduce gen_op_mfcr/gen_op_mtcr Paolo Bonzini
2014-09-03 18:58   ` Tom Musta
2014-08-28 17:15 ` [Qemu-devel] [PATCH 11/17] ppc: rename gen_set_cr6_from_fpscr Paolo Bonzini
2014-09-03 19:41   ` Tom Musta
2014-09-05  7:27     ` [Qemu-devel] [Qemu-ppc] " Alexander Graf
2014-08-28 17:15 ` [Qemu-devel] [PATCH 12/17] ppc: use movcond for isel Paolo Bonzini
2014-08-29 18:30   ` Richard Henderson
2014-09-03 19:41   ` Tom Musta
2014-09-15 13:39     ` Paolo Bonzini
2014-08-28 17:15 ` [Qemu-devel] [PATCH 13/17] ppc: compute mask from BI using right shift Paolo Bonzini
2014-09-03 20:59   ` Tom Musta
2014-09-05  7:29     ` [Qemu-devel] [Qemu-ppc] " Alexander Graf
2014-08-28 17:15 ` [Qemu-devel] [PATCH 14/17] ppc: introduce ppc_get_crf and ppc_set_crf Paolo Bonzini
2014-09-04 18:26   ` Tom Musta
2014-08-28 17:15 ` [Qemu-devel] [PATCH 15/17] ppc: store CR registers in 32 1-bit registers Paolo Bonzini
2014-09-04 18:27   ` Tom Musta
2014-09-09 15:44     ` Paolo Bonzini
2014-09-09 16:41       ` Paolo Bonzini
2014-09-09 16:03     ` Richard Henderson
2014-09-09 16:26       ` Paolo Bonzini
2014-08-28 17:15 ` [Qemu-devel] [PATCH 16/17] ppc: inline ppc_get_crf/ppc_set_crf when clearer Paolo Bonzini
2014-08-28 17:15 ` [Qemu-devel] [PATCH 17/17] ppc: dump all 32 CR bits Paolo Bonzini
2014-08-28 18:05 ` [Qemu-devel] [RFT/RFH PATCH 00/16] PPC speedup patches for TCG Tom Musta

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=540961C9.9000705@suse.de \
    --to=agraf@suse.de \
    --cc=dgibson@redhat.com \
    --cc=pbonzini@redhat.com \
    --cc=qemu-devel@nongnu.org \
    --cc=qemu-ppc@nongnu.org \
    --cc=tommusta@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.