Re: [PATCH v3 4/6] KVM: MMU: fast invalidate all mmio sptes

linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

From: Gleb Natapov <gleb@redhat.com>
To: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Cc: avi.kivity@gmail.com, mtosatti@redhat.com, pbonzini@redhat.com,
	linux-kernel@vger.kernel.org, kvm@vger.kernel.org
Subject: Re: [PATCH v3 4/6] KVM: MMU: fast invalidate all mmio sptes
Date: Thu, 27 Jun 2013 11:29:00 +0300	[thread overview]
Message-ID: <20130627082900.GD18508@redhat.com> (raw)
In-Reply-To: <1370595088-3315-5-git-send-email-xiaoguangrong@linux.vnet.ibm.com>

On Fri, Jun 07, 2013 at 04:51:26PM +0800, Xiao Guangrong wrote:
> This patch tries to introduce a very simple and scale way to invalidate
> all mmio sptes - it need not walk any shadow pages and hold mmu-lock
> 
> KVM maintains a global mmio valid generation-number which is stored in
> kvm->memslots.generation and every mmio spte stores the current global
> generation-number into his available bits when it is created
> 
> When KVM need zap all mmio sptes, it just simply increase the global
> generation-number. When guests do mmio access, KVM intercepts a MMIO #PF
> then it walks the shadow page table and get the mmio spte. If the
> generation-number on the spte does not equal the global generation-number,
> it will go to the normal #PF handler to update the mmio spte
> 
> Since 19 bits are used to store generation-number on mmio spte, we zap all
> mmio sptes when the number is round
> 
So this commit makes Fedora 9 32 bit reboot during boot, Fedora 9 64
fails too, but I haven't checked what happens exactly.

> Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
> ---
>  arch/x86/include/asm/kvm_host.h |  2 +-
>  arch/x86/kvm/mmu.c              | 54 +++++++++++++++++++++++++++++++++++------
>  arch/x86/kvm/mmu.h              |  5 +++-
>  arch/x86/kvm/paging_tmpl.h      |  7 ++++--
>  arch/x86/kvm/vmx.c              |  4 +++
>  arch/x86/kvm/x86.c              |  3 +--
>  6 files changed, 61 insertions(+), 14 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 1f98c1b..90d05ed 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -773,7 +773,7 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
>  				     struct kvm_memory_slot *slot,
>  				     gfn_t gfn_offset, unsigned long mask);
>  void kvm_mmu_zap_all(struct kvm *kvm);
> -void kvm_mmu_zap_mmio_sptes(struct kvm *kvm);
> +void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm);
>  unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
>  void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
>  
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 044d8c0..bdc95bc 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -205,9 +205,11 @@ EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
>  #define MMIO_SPTE_GEN_LOW_SHIFT		3
>  #define MMIO_SPTE_GEN_HIGH_SHIFT	52
>  
> +#define MMIO_GEN_SHIFT			19
>  #define MMIO_GEN_LOW_SHIFT		9
>  #define MMIO_GEN_LOW_MASK		((1 << MMIO_GEN_LOW_SHIFT) - 1)
> -#define MMIO_MAX_GEN			((1 << 19) - 1)
> +#define MMIO_GEN_MASK			((1 << MMIO_GEN_SHIFT) - 1)
> +#define MMIO_MAX_GEN			((1 << MMIO_GEN_SHIFT) - 1)
>  
>  static u64 generation_mmio_spte_mask(unsigned int gen)
>  {
> @@ -231,17 +233,23 @@ static unsigned int get_mmio_spte_generation(u64 spte)
>  	return gen;
>  }
>  
> +static unsigned int kvm_current_mmio_generation(struct kvm *kvm)
> +{
> +	return kvm_memslots(kvm)->generation & MMIO_GEN_MASK;
> +}
> +
>  static void mark_mmio_spte(struct kvm *kvm, u64 *sptep, u64 gfn,
>  			   unsigned access)
>  {
>  	struct kvm_mmu_page *sp =  page_header(__pa(sptep));
> -	u64 mask = generation_mmio_spte_mask(0);
> +	unsigned int gen = kvm_current_mmio_generation(kvm);
> +	u64 mask = generation_mmio_spte_mask(gen);
>  
>  	access &= ACC_WRITE_MASK | ACC_USER_MASK;
>  	mask |= shadow_mmio_mask | access | gfn << PAGE_SHIFT;
>  	sp->mmio_cached = true;
>  
> -	trace_mark_mmio_spte(sptep, gfn, access, 0);
> +	trace_mark_mmio_spte(sptep, gfn, access, gen);
>  	mmu_spte_set(sptep, mask);
>  }
>  
> @@ -271,6 +279,12 @@ static bool set_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn,
>  	return false;
>  }
>  
> +static bool check_mmio_spte(struct kvm *kvm, u64 spte)
> +{
> +	return likely(get_mmio_spte_generation(spte) ==
> +			kvm_current_mmio_generation(kvm));
> +}
> +
>  static inline u64 rsvd_bits(int s, int e)
>  {
>  	return ((1ULL << (e - s + 1)) - 1) << s;
> @@ -3235,6 +3249,9 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)
>  		gfn_t gfn = get_mmio_spte_gfn(spte);
>  		unsigned access = get_mmio_spte_access(spte);
>  
> +		if (!check_mmio_spte(vcpu->kvm, spte))
> +			return RET_MMIO_PF_INVALID;
> +
>  		if (direct)
>  			addr = 0;
>  
> @@ -3276,8 +3293,12 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
>  
>  	pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
>  
> -	if (unlikely(error_code & PFERR_RSVD_MASK))
> -		return handle_mmio_page_fault(vcpu, gva, error_code, true);
> +	if (unlikely(error_code & PFERR_RSVD_MASK)) {
> +		r = handle_mmio_page_fault(vcpu, gva, error_code, true);
> +
> +		if (likely(r != RET_MMIO_PF_INVALID))
> +			return r;
> +	}
>  
>  	r = mmu_topup_memory_caches(vcpu);
>  	if (r)
> @@ -3353,8 +3374,12 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
>  	ASSERT(vcpu);
>  	ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
>  
> -	if (unlikely(error_code & PFERR_RSVD_MASK))
> -		return handle_mmio_page_fault(vcpu, gpa, error_code, true);
> +	if (unlikely(error_code & PFERR_RSVD_MASK)) {
> +		r = handle_mmio_page_fault(vcpu, gpa, error_code, true);
> +
> +		if (likely(r != RET_MMIO_PF_INVALID))
> +			return r;
> +	}
>  
>  	r = mmu_topup_memory_caches(vcpu);
>  	if (r)
> @@ -4327,7 +4352,7 @@ void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm)
>  	spin_unlock(&kvm->mmu_lock);
>  }
>  
> -void kvm_mmu_zap_mmio_sptes(struct kvm *kvm)
> +static void kvm_mmu_zap_mmio_sptes(struct kvm *kvm)
>  {
>  	struct kvm_mmu_page *sp, *node;
>  	LIST_HEAD(invalid_list);
> @@ -4350,6 +4375,19 @@ static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
>  	return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
>  }
>  
> +void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm)
> +{
> +	/*
> +	 * The very rare case: if the generation-number is round,
> +	 * zap all shadow pages.
> +	 *
> +	 * The max value is MMIO_MAX_GEN - 1 since it is not called
> +	 * when mark memslot invalid.
> +	 */
> +	if (unlikely(kvm_current_mmio_generation(kvm) >= (MMIO_MAX_GEN - 1)))
> +		kvm_mmu_zap_mmio_sptes(kvm);
> +}
> +
>  static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
>  {
>  	struct kvm *kvm;
> diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
> index ba6a19c..5b59c57 100644
> --- a/arch/x86/kvm/mmu.h
> +++ b/arch/x86/kvm/mmu.h
> @@ -56,12 +56,15 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask);
>  /*
>   * Return values of handle_mmio_page_fault_common:
>   * RET_MMIO_PF_EMULATE: it is a real mmio page fault, emulate the instruction
> - *			 directly.
> + *			directly.
> + * RET_MMIO_PF_INVALID: invalid spte is detected then let the real page
> + *			fault path update the mmio spte.
>   * RET_MMIO_PF_RETRY: let CPU fault again on the address.
>   * RET_MMIO_PF_BUG: bug is detected.
>   */
>  enum {
>  	RET_MMIO_PF_EMULATE = 1,
> +	RET_MMIO_PF_INVALID = 2,
>  	RET_MMIO_PF_RETRY = 0,
>  	RET_MMIO_PF_BUG = -1
>  };
> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
> index fb50fa6..7769699 100644
> --- a/arch/x86/kvm/paging_tmpl.h
> +++ b/arch/x86/kvm/paging_tmpl.h
> @@ -552,9 +552,12 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
>  
>  	pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
>  
> -	if (unlikely(error_code & PFERR_RSVD_MASK))
> -		return handle_mmio_page_fault(vcpu, addr, error_code,
> +	if (unlikely(error_code & PFERR_RSVD_MASK)) {
> +		r = handle_mmio_page_fault(vcpu, addr, error_code,
>  					      mmu_is_nested(vcpu));
> +		if (likely(r != RET_MMIO_PF_INVALID))
> +			return r;
> +	};
>  
>  	r = mmu_topup_memory_caches(vcpu);
>  	if (r)
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 85c8d51..f4a5b3f 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -5369,6 +5369,10 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
>  	if (likely(ret == RET_MMIO_PF_EMULATE))
>  		return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) ==
>  					      EMULATE_DONE;
> +
> +	if (unlikely(ret == RET_MMIO_PF_INVALID))
> +		return kvm_mmu_page_fault(vcpu, gpa, 0, NULL, 0);
> +
>  	if (unlikely(ret == RET_MMIO_PF_RETRY))
>  		return 1;
>  
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 54059ba..9e4afa7 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -7067,8 +7067,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
>  	 * If memory slot is created, or moved, we need to clear all
>  	 * mmio sptes.
>  	 */
> -	if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE))
> -		kvm_mmu_zap_mmio_sptes(kvm);
> +	kvm_mmu_invalidate_mmio_sptes(kvm);
>  }
>  
>  void kvm_arch_flush_shadow_all(struct kvm *kvm)
> -- 
> 1.8.1.4

--
			Gleb.

next prev parent reply	other threads:[~2013-06-27  8:29 UTC|newest]

Thread overview: 32+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2013-06-07  8:51 [PATCH v3 0/6] KVM: MMU: fast invalidate all mmio sptes Xiao Guangrong
2013-06-07  8:51 ` [PATCH v3 1/6] KVM: MMU: retain more available bits on mmio spte Xiao Guangrong
2013-06-07  8:51 ` [PATCH v3 2/6] KVM: MMU: store generation-number into " Xiao Guangrong
2013-06-07  8:51 ` [PATCH v3 3/6] KVM: MMU: make return value of mmio page fault handler more readable Xiao Guangrong
2013-06-10  7:57   ` Gleb Natapov
2013-06-10  8:45     ` Xiao Guangrong
2013-06-10 13:16     ` Takuya Yoshikawa
2013-06-11  9:18       ` Gleb Natapov
2013-06-07  8:51 ` [PATCH v3 4/6] KVM: MMU: fast invalidate all mmio sptes Xiao Guangrong
2013-06-27  8:29   ` Gleb Natapov [this message]
2013-06-27  9:01     ` Gleb Natapov
2013-06-27  9:14       ` Gleb Natapov
2013-06-27  9:21         ` Gleb Natapov
2013-06-27  9:50           ` Xiao Guangrong
2013-06-27 10:19             ` Gleb Natapov
2013-06-27 11:05               ` Xiao Guangrong
2013-06-27 11:10                 ` Gleb Natapov
2013-06-07  8:51 ` [PATCH v3 5/6] KVM: MMU: add tracepoint for check_mmio_spte Xiao Guangrong
2013-06-07  8:51 ` [PATCH v3 6/6] KVM: MMU: init kvm generation close to mmio wrap-around value Xiao Guangrong
2013-06-10  7:56 ` [PATCH v3 0/6] KVM: MMU: fast invalidate all mmio sptes Gleb Natapov
2013-06-10  8:39   ` Xiao Guangrong
2013-06-10 13:43     ` Takuya Yoshikawa
2013-06-10 17:03       ` Gleb Natapov
2013-06-19 11:08         ` Paolo Bonzini
2013-06-19 11:27           ` Xiao Guangrong
2013-06-14  0:08 ` Marcelo Tosatti
2013-06-15  2:22   ` Takuya Yoshikawa
2013-06-17 11:59     ` Xiao Guangrong
2013-06-18 22:21       ` Marcelo Tosatti
2013-06-18 14:26 ` Paolo Bonzini
2013-06-19  2:47   ` Xiao Guangrong
2013-06-19 17:40 ` Paolo Bonzini

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20130627082900.GD18508@redhat.com \
    --to=gleb@redhat.com \
    --cc=avi.kivity@gmail.com \
    --cc=kvm@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mtosatti@redhat.com \
    --cc=pbonzini@redhat.com \
    --cc=xiaoguangrong@linux.vnet.ibm.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).