public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: Marcelo Tosatti <mtosatti@redhat.com>
To: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Cc: Avi Kivity <avi@redhat.com>, LKML <linux-kernel@vger.kernel.org>,
	KVM <kvm@vger.kernel.org>
Subject: Re: [PATCH v2 14/16] KVM: MMU: fast path of handling guest page fault
Date: Wed, 18 Apr 2012 20:08:37 -0300	[thread overview]
Message-ID: <20120418230837.GA9842@amt.cnet> (raw)
In-Reply-To: <4F8E3ACF.4000200@linux.vnet.ibm.com>

On Wed, Apr 18, 2012 at 11:53:51AM +0800, Xiao Guangrong wrote:
> On 04/18/2012 09:47 AM, Marcelo Tosatti wrote:
> 
> > On Fri, Apr 13, 2012 at 06:16:33PM +0800, Xiao Guangrong wrote:
> >> If the present bit of page fault error code is set, it indicates
> >> the shadow page is populated on all levels, it means what we do is
> >> only modify the access bit which can be done out of mmu-lock
> >>
> >> Currently, in order to simplify the code, we only fix the page fault
> >> caused by write-protect on the fast path
> >>
> >> Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
> >> ---
> >>  arch/x86/kvm/mmu.c         |  205 ++++++++++++++++++++++++++++++++++++++++----
> >>  arch/x86/kvm/paging_tmpl.h |    3 +
> >>  2 files changed, 192 insertions(+), 16 deletions(-)
> >>
> >> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> >> index efa5d59..fc91667 100644
> >> --- a/arch/x86/kvm/mmu.c
> >> +++ b/arch/x86/kvm/mmu.c
> >> @@ -446,6 +446,13 @@ static bool __check_direct_spte_mmio_pf(u64 spte)
> >>  }
> >>  #endif
> >>
> >> +static bool spte_wp_by_dirty_log(u64 spte)
> >> +{
> >> +	WARN_ON(is_writable_pte(spte));
> >> +
> >> +	return (spte & SPTE_ALLOW_WRITE) && !(spte & SPTE_WRITE_PROTECT);
> >> +}
> >> +
> >>  static bool spte_has_volatile_bits(u64 spte)
> >>  {
> >>  	if (!shadow_accessed_mask)
> >> @@ -454,9 +461,18 @@ static bool spte_has_volatile_bits(u64 spte)
> >>  	if (!is_shadow_present_pte(spte))
> >>  		return false;
> >>
> >> -	if ((spte & shadow_accessed_mask) &&
> >> -	      (!is_writable_pte(spte) || (spte & shadow_dirty_mask)))
> >> -		return false;
> >> +	if (spte & shadow_accessed_mask) {
> >> +		if (is_writable_pte(spte))
> >> +			return !(spte & shadow_dirty_mask);
> >> +
> >> +		/*
> >> +		 * If the spte is write-protected by dirty-log, it may
> >> +		 * be marked writable on fast page fault path, then CPU
> >> +		 * can modify the Dirty bit.
> >> +		 */
> >> +		if (!spte_wp_by_dirty_log(spte))
> >> +			return false;
> >> +	}
> >>
> >>  	return true;
> >>  }
> >> @@ -1109,26 +1125,18 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
> >>  		rmap_remove(kvm, sptep);
> >>  }
> >>
> >> -static bool spte_wp_by_dirty_log(u64 spte)
> >> -{
> >> -	WARN_ON(is_writable_pte(spte));
> >> -
> >> -	return (spte & SPTE_ALLOW_WRITE) && !(spte & SPTE_WRITE_PROTECT);
> >> -}
> >> -
> >>  static void spte_write_protect(struct kvm *kvm, u64 *sptep, bool large,
> >>  			       bool *flush, bool page_table_protect)
> >>  {
> >>  	u64 spte = *sptep;
> >>
> >>  	if (is_writable_pte(spte)) {
> >> -		*flush |= true;
> >> -
> >>  		if (large) {
> >>  			pgprintk("rmap_write_protect(large): spte %p %llx\n",
> >>  				 spte, *spte);
> >>  			BUG_ON(!is_large_pte(spte));
> >>
> >> +			*flush |= true;
> >>  			drop_spte(kvm, sptep);
> >>  			--kvm->stat.lpages;
> >>  			return;
> >> @@ -1137,6 +1145,9 @@ static void spte_write_protect(struct kvm *kvm, u64 *sptep, bool large,
> >>  		goto reset_spte;
> >>  	}
> >>
> >> +	/* We need flush tlbs in this case: the fast page fault path
> >> +	 * can mark the spte writable after we read the sptep.
> >> +	 */
> >>  	if (page_table_protect && spte_wp_by_dirty_log(spte))
> >>  		goto reset_spte;
> >>
> >> @@ -1144,6 +1155,8 @@ static void spte_write_protect(struct kvm *kvm, u64 *sptep, bool large,
> >>
> >>  reset_spte:
> >>  	rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
> >> +
> >> +	*flush |= true;
> >>  	spte = spte & ~PT_WRITABLE_MASK;
> >>  	if (page_table_protect)
> >>  		spte |= SPTE_WRITE_PROTECT;
> >> @@ -2767,18 +2780,172 @@ exit:
> >>  	return ret;
> >>  }
> >>
> >> +static bool page_fault_can_be_fast(struct kvm_vcpu *vcpu, gfn_t gfn,
> >> +				   u32 error_code)
> >> +{
> >> +	unsigned long *rmap;
> >> +
> >> +	/*
> >> +	 * #PF can be fast only if the shadow page table is present and it
> >> +	 * is caused by write-protect, that means we just need change the
> >> +	 * W bit of the spte which can be done out of mmu-lock.
> >> +	 */
> >> +	if (!(error_code & PFERR_PRESENT_MASK) ||
> >> +	      !(error_code & PFERR_WRITE_MASK))
> >> +		return false;
> >> +
> >> +	rmap = gfn_to_rmap(vcpu->kvm, gfn, PT_PAGE_TABLE_LEVEL);
> >> +
> >> +	/* Quickly check the page can be writable. */
> >> +	if (test_bit(PTE_LIST_WP_BIT, ACCESS_ONCE(rmap)))
> >> +		return false;
> >> +
> >> +	return true;
> >> +}
> >> +
> >> +static bool
> >> +fast_pf_fix_indirect_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
> >> +			  u64 *sptep, u64 spte, gfn_t gfn)
> >> +{
> >> +	pfn_t pfn;
> >> +	bool ret = false;
> >> +
> >> +	/*
> >> +	 * For the indirect spte, it is hard to get a stable gfn since
> >> +	 * we just use a cmpxchg to avoid all the races which is not
> >> +	 * enough to avoid the ABA problem: the host can arbitrarily
> >> +	 * change spte and the mapping from gfn to pfn.
> >> +	 *
> >> +	 * What we do is call gfn_to_pfn_atomic to bind the gfn and the
> >> +	 * pfn because after the call:
> >> +	 * - we have held the refcount of pfn that means the pfn can not
> >> +	 *   be freed and be reused for another gfn.
> >> +	 * - the pfn is writable that means it can not be shared by different
> >> +	 *   gfn.
> >> +	 */
> >> +	pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);
> > 
> > Please document what can happen in parallel whenever you manipulate
> > sptes without mmu_lock held, convincing the reader that this is safe.
> > 
> 
> 
> OK, I will document it in locking.txt
> 
> I am not good at documenting, How about the below description?
> 
> What we use to avoid all the races is spte.SPTE_ALLOW_WRITE and spte.SPTE_WRITE_PROTECT:
> SPTE_ALLOW_WRITE means the gfn is writable on both guest and host, and a clear
> SPTE_WRITE_PROTECT means this gfn is not write-protected for shadow page write protection.
> 
> On fast page fault path, we will atomically set the spte.W bit if
> spte.SPTE_ALLOW_WRITE = 1 and spte.SPTE_WRITE_PROTECT = 0, this is safe because any
> change to these bits can be detected by cmpxchg.
> 
> But we need to carefully check the mapping between gfn and pfn since we can only ensure the pfn
> is not changed during cmpxchg. This is an ABA problem, for example, the case below can happen:
> 
> At the beginning:
> gpte = gfn1
> gfn1 is mapped to pfn1 on host
> spte is the shadow page table entry corresponding with gpte and
> spte = pfn1
> 
>    VCPU 0                                           VCPU 1
> on page fault path:
> 
>    old_spte = *spte;
>                                                  pfn1 is swapped out:
>                                                              spte = 0;
>                                                  pfn1 is re-alloced for gfn2
>                                                  gpte is changed by host, and pointing to gfn2:
>                                                              spte = pfn1;
> 
>    if (cmpxchg(spte, old_spte, old_spte+W)
> 	mark_page_dirty(vcpu->kvm, gfn1)
>   	OOPS!!!
>    we dirty-log for gfn1, that means gfn2 is lost in dirty-bitmap.
> 
> For direct sp, we can easily avoid it since the spte of direct sp is fixed to gfn.
> For indirect sp, before we do cmpxchg, we call gfn_to_pfn_atomic to pin gfn to pfn,
> because after gfn_to_pfn_atomic:
> - we have held the refcount of pfn that means the pfn can not
>   be freed and be reused for another gfn.
> - the pfn is writable that means it can not be shared by different
>   gfn by KSM.
> Then, we can ensure the dirty bitmap is correctly set for a gfn.

This is one possible scenario, OK. If you can list all possibilities,
better.

We need to check every possible case carefully.

> > Same with current users of walk_shadow_page_lockless_begin.
> 
> 
> After walk_shadow_page_lockless_begin, it is safe since reader_counter has been
> increased:
> 
> static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
> {
> 	rcu_read_lock();
> 	atomic_inc(&vcpu->kvm->arch.reader_counter);
> 
> 	/* Increase the counter before walking shadow page table */
> 	smp_mb__after_atomic_inc();
> }
> 
> Is it not enough?

It is, i don't know what i was talking about.


  reply	other threads:[~2012-04-18 23:12 UTC|newest]

Thread overview: 49+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2012-04-13 10:05 [PATCH v2 00/16] KVM: MMU: fast page fault Xiao Guangrong
2012-04-13 10:09 ` [PATCH v2 01/16] KVM: MMU: cleanup __direct_map Xiao Guangrong
2012-04-13 10:10 ` [PATCH v2 02/16] KVM: MMU: introduce mmu_spte_establish Xiao Guangrong
2012-04-13 10:10 ` [PATCH v2 03/16] KVM: MMU: properly assert spte on rmap walking path Xiao Guangrong
2012-04-14  2:15   ` Takuya Yoshikawa
2012-04-16  3:26     ` Xiao Guangrong
2012-04-13 10:11 ` [PATCH v2 04/16] KVM: MMU: return bool in __rmap_write_protect Xiao Guangrong
2012-04-14  2:00   ` Takuya Yoshikawa
2012-04-15 11:25     ` Avi Kivity
2012-04-16 14:14       ` Takuya Yoshikawa
2012-04-16 14:28         ` Avi Kivity
2012-04-16 15:54           ` Takuya Yoshikawa
2012-04-13 10:11 ` [PATCH v2 05/16] KVM: MMU: abstract spte write-protect Xiao Guangrong
2012-04-14  2:26   ` Takuya Yoshikawa
2012-04-16  3:27     ` Xiao Guangrong
2012-04-13 10:12 ` [PATCH v2 06/16] KVM: VMX: export PFEC.P bit on ept Xiao Guangrong
2012-04-13 10:12 ` [PATCH v2 07/16] KVM: MMU: introduce for_each_pte_list_spte Xiao Guangrong
2012-04-14  2:44   ` Takuya Yoshikawa
2012-04-16  3:36     ` Xiao Guangrong
2012-04-17 14:47       ` Takuya Yoshikawa
2012-04-18  4:01         ` Xiao Guangrong
2012-04-21  1:01           ` Takuya Yoshikawa
2012-04-21  4:36             ` Xiao Guangrong
2012-04-18 10:03         ` Xiao Guangrong
2012-04-21  1:03           ` Takuya Yoshikawa
2012-04-13 10:13 ` [PATCH v2 08/16] KVM: MMU: store more bits in rmap Xiao Guangrong
2012-04-13 10:13 ` [PATCH v2 09/16] KVM: MMU: fast mmu_need_write_protect path for hard mmu Xiao Guangrong
2012-04-13 10:14 ` [PATCH v2 10/16] KVM: MMU: fask check whether page is writable Xiao Guangrong
2012-04-14  3:01   ` Takuya Yoshikawa
2012-04-16  3:38     ` Xiao Guangrong
2012-04-15 15:16   ` Avi Kivity
2012-04-16  3:25     ` Xiao Guangrong
2012-04-16 10:02       ` Avi Kivity
2012-04-16 10:20         ` Xiao Guangrong
2012-04-16 11:47           ` Avi Kivity
2012-04-17  3:55             ` Xiao Guangrong
2012-04-17  7:41               ` Avi Kivity
2012-04-17 12:10                 ` Xiao Guangrong
2012-04-13 10:14 ` [PATCH v2 11/16] KVM: MMU: introduce SPTE_ALLOW_WRITE bit Xiao Guangrong
2012-04-13 10:15 ` [PATCH v2 12/16] KVM: MMU: introduce SPTE_WRITE_PROTECT bit Xiao Guangrong
2012-04-13 10:15 ` [PATCH v2 13/16] KVM: MMU: break sptes write-protect if gfn is writable Xiao Guangrong
2012-04-13 10:16 ` [PATCH v2 14/16] KVM: MMU: fast path of handling guest page fault Xiao Guangrong
2012-04-18  1:47   ` Marcelo Tosatti
2012-04-18  3:53     ` Xiao Guangrong
2012-04-18 23:08       ` Marcelo Tosatti [this message]
2012-04-13 10:17 ` [PATCH v2 15/16] KVM: MMU: trace fast " Xiao Guangrong
2012-04-13 10:17 ` [PATCH v2 16/16] KVM: MMU: fix kvm_mmu_pagetable_walk tracepoint Xiao Guangrong
2012-04-14  3:37 ` [PATCH v2 00/16] KVM: MMU: fast page fault Takuya Yoshikawa
2012-04-16  3:50   ` Xiao Guangrong

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20120418230837.GA9842@amt.cnet \
    --to=mtosatti@redhat.com \
    --cc=avi@redhat.com \
    --cc=kvm@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=xiaoguangrong@linux.vnet.ibm.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox