Re: [PATCH v2 14/16] KVM: MMU: fast path of handling guest page fault

public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed

From: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
To: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Avi Kivity <avi@redhat.com>, LKML <linux-kernel@vger.kernel.org>,
	KVM <kvm@vger.kernel.org>
Subject: Re: [PATCH v2 14/16] KVM: MMU: fast path of handling guest page fault
Date: Wed, 18 Apr 2012 11:53:51 +0800	[thread overview]
Message-ID: <4F8E3ACF.4000200@linux.vnet.ibm.com> (raw)
In-Reply-To: <20120418014757.GA25133@amt.cnet>

On 04/18/2012 09:47 AM, Marcelo Tosatti wrote:

> On Fri, Apr 13, 2012 at 06:16:33PM +0800, Xiao Guangrong wrote:
> >> If the present bit of page fault error code is set, it indicates
>> the shadow page is populated on all levels, it means what we do is
>> only modify the access bit which can be done out of mmu-lock
>>
>> Currently, in order to simplify the code, we only fix the page fault
>> caused by write-protect on the fast path
>>
>> Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
>> ---
>>  arch/x86/kvm/mmu.c         |  205 ++++++++++++++++++++++++++++++++++++++++----
>>  arch/x86/kvm/paging_tmpl.h |    3 +
>>  2 files changed, 192 insertions(+), 16 deletions(-)
>>
>> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
>> index efa5d59..fc91667 100644
>> --- a/arch/x86/kvm/mmu.c
>> +++ b/arch/x86/kvm/mmu.c
>> @@ -446,6 +446,13 @@ static bool __check_direct_spte_mmio_pf(u64 spte)
>>  }
>>  #endif
>>
>> +static bool spte_wp_by_dirty_log(u64 spte)
>> +{
>> +	WARN_ON(is_writable_pte(spte));
>> +
>> +	return (spte & SPTE_ALLOW_WRITE) && !(spte & SPTE_WRITE_PROTECT);
>> +}
>> +
>>  static bool spte_has_volatile_bits(u64 spte)
>>  {
>>  	if (!shadow_accessed_mask)
>> @@ -454,9 +461,18 @@ static bool spte_has_volatile_bits(u64 spte)
>>  	if (!is_shadow_present_pte(spte))
>>  		return false;
>>
>> -	if ((spte & shadow_accessed_mask) &&
>> -	      (!is_writable_pte(spte) || (spte & shadow_dirty_mask)))
>> -		return false;
>> +	if (spte & shadow_accessed_mask) {
>> +		if (is_writable_pte(spte))
>> +			return !(spte & shadow_dirty_mask);
>> +
>> +		/*
>> +		 * If the spte is write-protected by dirty-log, it may
>> +		 * be marked writable on fast page fault path, then CPU
>> +		 * can modify the Dirty bit.
>> +		 */
>> +		if (!spte_wp_by_dirty_log(spte))
>> +			return false;
>> +	}
>>
>>  	return true;
>>  }
>> @@ -1109,26 +1125,18 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
>>  		rmap_remove(kvm, sptep);
>>  }
>>
>> -static bool spte_wp_by_dirty_log(u64 spte)
>> -{
>> -	WARN_ON(is_writable_pte(spte));
>> -
>> -	return (spte & SPTE_ALLOW_WRITE) && !(spte & SPTE_WRITE_PROTECT);
>> -}
>> -
>>  static void spte_write_protect(struct kvm *kvm, u64 *sptep, bool large,
>>  			       bool *flush, bool page_table_protect)
>>  {
>>  	u64 spte = *sptep;
>>
>>  	if (is_writable_pte(spte)) {
>> -		*flush |= true;
>> -
>>  		if (large) {
>>  			pgprintk("rmap_write_protect(large): spte %p %llx\n",
>>  				 spte, *spte);
>>  			BUG_ON(!is_large_pte(spte));
>>
>> +			*flush |= true;
>>  			drop_spte(kvm, sptep);
>>  			--kvm->stat.lpages;
>>  			return;
>> @@ -1137,6 +1145,9 @@ static void spte_write_protect(struct kvm *kvm, u64 *sptep, bool large,
>>  		goto reset_spte;
>>  	}
>>
>> +	/* We need flush tlbs in this case: the fast page fault path
>> +	 * can mark the spte writable after we read the sptep.
>> +	 */
>>  	if (page_table_protect && spte_wp_by_dirty_log(spte))
>>  		goto reset_spte;
>>
>> @@ -1144,6 +1155,8 @@ static void spte_write_protect(struct kvm *kvm, u64 *sptep, bool large,
>>
>>  reset_spte:
>>  	rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
>> +
>> +	*flush |= true;
>>  	spte = spte & ~PT_WRITABLE_MASK;
>>  	if (page_table_protect)
>>  		spte |= SPTE_WRITE_PROTECT;
>> @@ -2767,18 +2780,172 @@ exit:
>>  	return ret;
>>  }
>>
>> +static bool page_fault_can_be_fast(struct kvm_vcpu *vcpu, gfn_t gfn,
>> +				   u32 error_code)
>> +{
>> +	unsigned long *rmap;
>> +
>> +	/*
>> +	 * #PF can be fast only if the shadow page table is present and it
>> +	 * is caused by write-protect, that means we just need change the
>> +	 * W bit of the spte which can be done out of mmu-lock.
>> +	 */
>> +	if (!(error_code & PFERR_PRESENT_MASK) ||
>> +	      !(error_code & PFERR_WRITE_MASK))
>> +		return false;
>> +
>> +	rmap = gfn_to_rmap(vcpu->kvm, gfn, PT_PAGE_TABLE_LEVEL);
>> +
>> +	/* Quickly check the page can be writable. */
>> +	if (test_bit(PTE_LIST_WP_BIT, ACCESS_ONCE(rmap)))
>> +		return false;
>> +
>> +	return true;
>> +}
>> +
>> +static bool
>> +fast_pf_fix_indirect_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
>> +			  u64 *sptep, u64 spte, gfn_t gfn)
>> +{
>> +	pfn_t pfn;
>> +	bool ret = false;
>> +
>> +	/*
>> +	 * For the indirect spte, it is hard to get a stable gfn since
>> +	 * we just use a cmpxchg to avoid all the races which is not
>> +	 * enough to avoid the ABA problem: the host can arbitrarily
>> +	 * change spte and the mapping from gfn to pfh.
>> +	 *
>> +	 * What we do is call gfn_to_pfn_atomic to bind the gfn and the
>> +	 * pfn because after the call:
>> +	 * - we have held the refcount of pfn that means the pfn can not
>> +	 *   be freed and be reused for another gfn.
>> +	 * - the pfn is writable that means it can not be shared by different
>> +	 *   gfn.
>> +	 */
>> +	pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);
> 
> Please document what can happen in parallel whenever you manipulate
> sptes without mmu_lock held, convincing the reader that this is safe.
> 


OK, I will document it in locking.txt.

I am not good at documenting. How about the description below?

What we use to avoid all the races are the spte.SPTE_ALLOW_WRITE and spte.SPTE_WRITE_PROTECT
bits: SPTE_ALLOW_WRITE means the gfn is writable on both guest and host, and
SPTE_WRITE_PROTECT means the gfn is write-protected for shadow page write protection.

On the fast page fault path, we will atomically set the spte.W bit if
spte.SPTE_ALLOW_WRITE = 1 and spte.SPTE_WRITE_PROTECT = 0. This is safe because any
change to these bits can be detected by cmpxchg.

But we need to carefully check the mapping from gfn to pfn, since we can only ensure the pfn
is not changed during cmpxchg. This is an ABA problem; for example, the case below can happen:

At the beginning:
gpte = gfn1
gfn1 is mapped to pfn1 on host
spte is the shadow page table entry corresponding with gpte and
spte = pfn1

   VCPU 0                                           VCPU 1
on page fault path:

   old_spte = *spte;
                                                 pfn1 is swapped out:
                                                             spte = 0;
                                                 pfn1 is re-alloced for gfn2
                                                 gpte is changed by host, and pointing to gfn2:
                                                             spte = pfn1;

   if (cmpxchg(spte, old_spte, old_spte+W))
	mark_page_dirty(vcpu->kvm, gfn1)
  	OOPS!!!
   We dirty-log gfn1, which means the write to gfn2 is lost in the dirty bitmap.

For a direct sp, we can easily avoid this since the spte of a direct sp is fixed to its gfn.
For an indirect sp, before we do the cmpxchg, we call gfn_to_pfn_atomic to pin the gfn to
the pfn, because after gfn_to_pfn_atomic:
- we hold a refcount on the pfn, which means the pfn cannot
  be freed and reused for another gfn.
- the pfn is writable, which means it cannot be shared between different
  gfns by KSM.
Then we can ensure the dirty bitmap is correctly set for the gfn.

>> +
>> +/*
>> + * Return value:
>> + * - true: let the vcpu to access on the same address again.
>> + * - false: let the real page fault path to fix it.
>> + */
>> +static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
>> +			    int level, u32 error_code)
>> +{
>> +	struct kvm_shadow_walk_iterator iterator;
>> +	struct kvm_mmu_page *sp;
>> +	bool ret = false;
>> +	u64 spte = 0ull;
>> +
>> +	if (!page_fault_can_be_fast(vcpu, gfn, error_code))
>> +		return false;
> 
> What prevents kvm_mmu_commit_zap_page from nukeing the "shadow_page"
> here, again? At this point the faulting spte could be zero (!present), 
> and you have not yet increased reader_counter.
> 


Hmm, do you mean page_fault_can_be_fast? We do not use any spte in this function, so
kvm_mmu_commit_zap_page can happily zap the shadow page.

> Same with current users of walk_shadow_page_lockless_begin.


After walk_shadow_page_lockless_begin, it is safe since reader_counter has been
increased:

static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
{
	rcu_read_lock();
	atomic_inc(&vcpu->kvm->arch.reader_counter);

	/* Increase the counter before walking shadow page table */
	smp_mb__after_atomic_inc();
}

Is that not enough?

 
> I agree with Takuya, please reduce the size of the patchset to only to
> what is strictly necessary, it appears many of the first patches are
> not.


I will post a simple patchset soon, of course, after collecting your comments
in this mail. ;)

next prev parent reply	other threads:[~2012-04-18  4:13 UTC|newest]

Thread overview: 49+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2012-04-13 10:05 [PATCH v2 00/16] KVM: MMU: fast page fault Xiao Guangrong
2012-04-13 10:09 ` [PATCH v2 01/16] KVM: MMU: cleanup __direct_map Xiao Guangrong
2012-04-13 10:10 ` [PATCH v2 02/16] KVM: MMU: introduce mmu_spte_establish Xiao Guangrong
2012-04-13 10:10 ` [PATCH v2 03/16] KVM: MMU: properly assert spte on rmap walking path Xiao Guangrong
2012-04-14  2:15   ` Takuya Yoshikawa
2012-04-16  3:26     ` Xiao Guangrong
2012-04-13 10:11 ` [PATCH v2 04/16] KVM: MMU: return bool in __rmap_write_protect Xiao Guangrong
2012-04-14  2:00   ` Takuya Yoshikawa
2012-04-15 11:25     ` Avi Kivity
2012-04-16 14:14       ` Takuya Yoshikawa
2012-04-16 14:28         ` Avi Kivity
2012-04-16 15:54           ` Takuya Yoshikawa
2012-04-13 10:11 ` [PATCH v2 05/16] KVM: MMU: abstract spte write-protect Xiao Guangrong
2012-04-14  2:26   ` Takuya Yoshikawa
2012-04-16  3:27     ` Xiao Guangrong
2012-04-13 10:12 ` [PATCH v2 06/16] KVM: VMX: export PFEC.P bit on ept Xiao Guangrong
2012-04-13 10:12 ` [PATCH v2 07/16] KVM: MMU: introduce for_each_pte_list_spte Xiao Guangrong
2012-04-14  2:44   ` Takuya Yoshikawa
2012-04-16  3:36     ` Xiao Guangrong
2012-04-17 14:47       ` Takuya Yoshikawa
2012-04-18  4:01         ` Xiao Guangrong
2012-04-21  1:01           ` Takuya Yoshikawa
2012-04-21  4:36             ` Xiao Guangrong
2012-04-18 10:03         ` Xiao Guangrong
2012-04-21  1:03           ` Takuya Yoshikawa
2012-04-13 10:13 ` [PATCH v2 08/16] KVM: MMU: store more bits in rmap Xiao Guangrong
2012-04-13 10:13 ` [PATCH v2 09/16] KVM: MMU: fast mmu_need_write_protect path for hard mmu Xiao Guangrong
2012-04-13 10:14 ` [PATCH v2 10/16] KVM: MMU: fask check whether page is writable Xiao Guangrong
2012-04-14  3:01   ` Takuya Yoshikawa
2012-04-16  3:38     ` Xiao Guangrong
2012-04-15 15:16   ` Avi Kivity
2012-04-16  3:25     ` Xiao Guangrong
2012-04-16 10:02       ` Avi Kivity
2012-04-16 10:20         ` Xiao Guangrong
2012-04-16 11:47           ` Avi Kivity
2012-04-17  3:55             ` Xiao Guangrong
2012-04-17  7:41               ` Avi Kivity
2012-04-17 12:10                 ` Xiao Guangrong
2012-04-13 10:14 ` [PATCH v2 11/16] KVM: MMU: introduce SPTE_ALLOW_WRITE bit Xiao Guangrong
2012-04-13 10:15 ` [PATCH v2 12/16] KVM: MMU: introduce SPTE_WRITE_PROTECT bit Xiao Guangrong
2012-04-13 10:15 ` [PATCH v2 13/16] KVM: MMU: break sptes write-protect if gfn is writable Xiao Guangrong
2012-04-13 10:16 ` [PATCH v2 14/16] KVM: MMU: fast path of handling guest page fault Xiao Guangrong
2012-04-18  1:47   ` Marcelo Tosatti
2012-04-18  3:53     ` Xiao Guangrong [this message]
2012-04-18 23:08       ` Marcelo Tosatti
2012-04-13 10:17 ` [PATCH v2 15/16] KVM: MMU: trace fast " Xiao Guangrong
2012-04-13 10:17 ` [PATCH v2 16/16] KVM: MMU: fix kvm_mmu_pagetable_walk tracepoint Xiao Guangrong
2012-04-14  3:37 ` [PATCH v2 00/16] KVM: MMU: fast page fault Takuya Yoshikawa
2012-04-16  3:50   ` Xiao Guangrong

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=4F8E3ACF.4000200@linux.vnet.ibm.com \
    --to=xiaoguangrong@linux.vnet.ibm.com \
    --cc=avi@redhat.com \
    --cc=kvm@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mtosatti@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox