From: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
To: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Cc: Avi Kivity <avi@redhat.com>,
	Marcelo Tosatti <mtosatti@redhat.com>,
	LKML <linux-kernel@vger.kernel.org>, KVM <kvm@vger.kernel.org>
Subject: [PATCH 11/13] KVM: MMU: fast path of handling guest page fault
Date: Thu, 29 Mar 2012 17:27:04 +0800	[thread overview]
Message-ID: <4F742AE8.9020201@linux.vnet.ibm.com> (raw)
In-Reply-To: <4F742951.7080003@linux.vnet.ibm.com>

If the present bit of the page fault error code is set, the shadow page
table is populated on all levels; all we need to do is modify the access
bits, and that can be done out of mmu-lock.

The trick in this patch is avoiding the race between the fast page fault
path and the write-protect path. Write-protect is a read-check-modify
path: read the spte, check the W bit, then clear the W bit. What we do
is store an identification in the spte; if the write-protect path meets
it, it modifies the spte even though the spte is read-only. See the
comments in the code for more information.
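
The ordering between the two paths can be sketched as below (a
simplified illustration only, not the code added by this patch;
spte_has_vcpu_id() and spte_clear_id_and_wp() are made-up stand-ins
for the write-protect side, the other names appear in this series):

	/* fast page fault path (lockless, inside the RCU read side) */
	spte = mark_vcpu_id_spte(sptep, spte, vcpu->vcpu_id);
	smp_mb();	/* publish the id before reading the rmap bit */
	if (!(ACCESS_ONCE(*rmap) & PTE_LIST_WRITE_PROTECT)) {
		/* new_spte is built by set_spte() with the extra access bits */
		cmpxchg(sptep, spte, new_spte);	/* fails if the spte changed */
	}

	/* write-protect path (under mmu-lock) */
	*rmap |= PTE_LIST_WRITE_PROTECT;
	smp_mb();	/* publish the bit before reading the spte */
	if (is_writable_pte(*sptep) || spte_has_vcpu_id(*sptep))
		spte_clear_id_and_wp(sptep);	/* makes the cmpxchg above fail */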

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
---
 arch/x86/kvm/mmu.c         |  265 +++++++++++++++++++++++++++++++++++++++++++-
 arch/x86/kvm/paging_tmpl.h |   41 +++++++
 2 files changed, 302 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index a7f7aea..4a01be4 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2767,18 +2767,267 @@ exit:
 	return ret;
 }

+static u32 page_fault_expected_access(u32 error_code)
+{
+	u32 access = 0;
+
+	if (error_code & PFERR_WRITE_MASK)
+		access |= ACC_WRITE_MASK;
+
+	if (error_code & PFERR_USER_MASK)
+		access |= ACC_USER_MASK;
+
+	if (error_code & PFERR_FETCH_MASK)
+		access |= ACC_EXEC_MASK;
+
+	return access;
+}
+
+static u32 spte_access(u64 spte)
+{
+	u32 access;
+
+	access = spte & PT_WRITABLE_MASK;
+
+	if (spte & shadow_user_mask)
+		access |= ACC_USER_MASK;
+
+	if (shadow_x_mask) {
+		if (spte & shadow_x_mask)
+			access |= ACC_EXEC_MASK;
+
+		return access;
+	}
+
+	if (!(spte & shadow_nx_mask))
+		access |= ACC_EXEC_MASK;
+
+	return access;
+}
+
+static bool spte_satisfied(u64 spte, u32 access)
+{
+	return (spte_access(spte) & access) == access;
+}
+
+static bool page_fault_can_be_fast(struct kvm_vcpu *vcpu, gfn_t gfn,
+				   u32 error_code)
+{
+	unsigned long *rmap;
+	bool write = error_code & PFERR_WRITE_MASK;
+
+	/*
+	 * #PF can be fast only if the shadow page table is present, which
+	 * means we just need to change the access bits (e.g. R/W, U/S...),
+	 * and that can be done out of mmu-lock.
+	 */
+	if (!(error_code & PFERR_PRESENT_MASK))
+		return false;
+
+	if (unlikely(vcpu->vcpu_id > max_vcpu_spte()))
+		return false;
+
+	rmap = gfn_to_rmap(vcpu->kvm, gfn, PT_PAGE_TABLE_LEVEL);
+
+	/* Quickly check whether the page can be writable. */
+	if (write && (ACCESS_ONCE(*rmap) & PTE_LIST_WRITE_PROTECT))
+		return false;
+
+	return true;
+}
+
+typedef bool (*fast_pf_fetch_spte)(struct kvm_vcpu *vcpu, u64 *sptep,
+				   u64 *new_spte, gfn_t gfn, u32 expect_access,
+				   u64 spte);
+
+static bool
+fast_pf_fetch_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 *new_spte,
+			  gfn_t gfn, u32 expect_access, u64 spte)
+{
+	struct kvm_mmu_page *sp = page_header(__pa(sptep));
+
+	WARN_ON(!sp->role.direct);
+
+	if (kvm_mmu_page_get_gfn(sp, sptep - sp->spt) != gfn)
+		return false;
+
+	set_spte(vcpu, new_spte, sp->role.access,
+		 expect_access & ACC_USER_MASK, expect_access & ACC_WRITE_MASK,
+		 sp->role.level, gfn, spte_to_pfn(spte), false, false,
+		 spte & SPTE_HOST_WRITEABLE, true);
+
+	return true;
+}
+
+static bool
+fast_page_fault_fix_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 spte,
+			 gfn_t gfn, u32 expect_access,
+			 fast_pf_fetch_spte fn)
+{
+	u64 new_spte = 0ull;
+	int vcpu_id = vcpu->vcpu_id;
+
+	spte = mark_vcpu_id_spte(sptep, spte, vcpu_id);
+
+	/*
+	 * Storing the vcpu id into the spte must happen before reading
+	 * the PTE_LIST_WRITE_PROTECT bit.
+	 */
+	smp_mb();
+
+	/*
+	 * In most cases, cmpxchg is enough to set the access bits, but we
+	 * should pay more attention to the page write-protect path, which
+	 * is a read-check-modify path: read the spte, check the W bit, then
+	 * clear the W bit. In order to avoid marking the spte writable
+	 * after/during page write-protect, we use the trick below:
+	 *
+	 *      fast page fault path:
+	 *            lock RCU
+	 *            set identification in the spte
+	 *            smp_mb()
+	 *            if (!rmap.PTE_LIST_WRITE_PROTECT)
+	 *                 cmpxchg + w - vcpu-id
+	 *            unlock RCU
+	 *
+	 *      write protect path:
+	 *            lock mmu-lock
+	 *            set rmap.PTE_LIST_WRITE_PROTECT
+	 *                 smp_mb()
+	 *            if (spte.w || spte has identification)
+	 *                 clear w bit and identification
+	 *            unlock mmu-lock
+	 *
+	 * Setting the identification in the spte notifies the
+	 * write-protect path that it must modify the spte, so the
+	 * cmpxchg can see the change.
+	 *
+	 * Setting the identification is also a trick: it only sets the
+	 * last bit of the spte, which neither changes the mapping nor
+	 * loses cpu status bits.
+	 *
+	 * The identification must be unique to avoid the race below:
+	 *
+	 *      VCPU 0                VCPU 1            VCPU 2
+	 *      lock RCU
+	 *   spte + identification
+	 *   check conditions
+	 *                       do write-protect, clear
+	 *                          identification
+	 *                                              lock RCU
+	 *                                        set identification
+	 *     cmpxchg + w - identification
+	 *        OOPS!!!
+	 *
+	 * We choose the vcpu id as the unique value.
+	 */
+
+	new_spte = 0ull;
+	if (!fn(vcpu, sptep, &new_spte, gfn, expect_access, spte))
+		return false;
+
+	if (!spte_satisfied(new_spte, expect_access))
+		return false;
+
+	/*
+	 * We cannot remap a spte from writable to read-only out of
+	 * mmu-lock, since that needs a TLB flush to sync guest page
+	 * write-protect.
+	 * See the comment in set_spte().
+	 */
+	if (unlikely(is_writable_pte(spte) && !is_writable_pte(new_spte)))
+		return false;
+
+	cmpxchg(sptep, spte, new_spte);
+
+	return true;
+}
+
+/*
+ * Return value:
+ * - true: let the vcpu access the same address again.
+ * - false: let the real page fault path fix it.
+ */
+static bool
+fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, int level,
+		u32 error_code, fast_pf_fetch_spte fn)
+{
+	struct kvm_shadow_walk_iterator iterator;
+	struct kvm_mmu_page *sp;
+	u32 expected_access;
+	bool ret = false;
+	u64 spte = 0ull;
+
+	if (!page_fault_can_be_fast(vcpu, gfn, error_code))
+		return false;
+
+	walk_shadow_page_lockless_begin(vcpu);
+	for_each_shadow_entry_lockless(vcpu, gva, iterator, spte)
+		if (!is_shadow_present_pte(spte) || iterator.level < level)
+			break;
+
+	/*
+	 * If the mapping has been changed, let the vcpu fault on the
+	 * same address again.
+	 */
+	if (!is_rmap_spte(spte)) {
+		ret = true;
+		goto exit;
+	}
+
+	/*
+	 * Check if it is a spurious fault caused by a lazily flushed TLB.
+	 *
+	 * No need to check the access of upper level table entries since
+	 * they are always ACC_ALL.
+	 */
+	expected_access = page_fault_expected_access(error_code);
+	if (spte_satisfied(spte, expected_access)) {
+		ret = true;
+		goto exit;
+	}
+
+	sp = page_header(__pa(iterator.sptep));
+	if (sp->role.level != level || !is_last_spte(spte, level))
+		goto exit;
+
+	/*
+	 * If the page fault is caused by a write but the host does not
+	 * allow writing the page, we need to COW the host page.
+	 */
+	if ((error_code & PFERR_WRITE_MASK) && !(spte & SPTE_HOST_WRITEABLE))
+		goto exit;
+
+	/*
+	 * Do not expand the access of the sp.
+	 *
+	 * Checking sp->role.access here is safe since it is never
+	 * changed after the sp is linked into the shadow page table.
+	 */
+	if ((sp->role.access & expected_access) != expected_access)
+		goto exit;
+
+	ret = fast_page_fault_fix_spte(vcpu, iterator.sptep, spte, gfn,
+				       expected_access, fn);
+
+exit:
+	walk_shadow_page_lockless_end(vcpu);
+
+	return ret;
+}
+
 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
 			 gva_t gva, pfn_t *pfn, bool write, bool *writable);

-static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
-			 bool prefault)
+static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
+			 gfn_t gfn, bool prefault)
 {
 	int r;
 	int level;
 	int force_pt_level;
 	pfn_t pfn;
 	unsigned long mmu_seq;
-	bool map_writable;
+	bool map_writable, write = error_code & PFERR_WRITE_MASK;

 	force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
 	if (likely(!force_pt_level)) {
@@ -2795,6 +3044,10 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
 	} else
 		level = PT_PAGE_TABLE_LEVEL;

+	if (fast_page_fault(vcpu, v, gfn, level, error_code,
+			    fast_pf_fetch_direct_spte))
+		return 0;
+
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	smp_rmb();

@@ -3195,7 +3448,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
 	gfn = gva >> PAGE_SHIFT;

 	return nonpaging_map(vcpu, gva & PAGE_MASK,
-			     error_code & PFERR_WRITE_MASK, gfn, prefault);
+			     error_code, gfn, prefault);
 }

 static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
@@ -3275,6 +3528,10 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 	} else
 		level = PT_PAGE_TABLE_LEVEL;

+	if (fast_page_fault(vcpu, gpa, gfn, level, error_code,
+			    fast_pf_fetch_direct_spte))
+		return 0;
+
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	smp_rmb();

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index e2af5a5..e1694e8 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -568,6 +568,43 @@ static gpa_t FNAME(get_sp_gpa)(struct kvm_mmu_page *sp)
 	return gfn_to_gpa(sp->gfn) + offset;
 }

+static bool
+FNAME(fast_pf_fetch_indirect_spte)(struct kvm_vcpu *vcpu, u64 *sptep,
+				   u64 *new_spte, gfn_t gfn,
+				   u32 expect_access, u64 spte)
+
+{
+	struct kvm_mmu_page *sp = page_header(__pa(sptep));
+	pt_element_t gpte;
+	gpa_t pte_gpa;
+	unsigned pte_access;
+
+	if (sp->role.direct)
+		return fast_pf_fetch_direct_spte(vcpu, sptep, new_spte,
+						 gfn, expect_access, spte);
+
+	pte_gpa = FNAME(get_sp_gpa)(sp);
+	pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
+
+	if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
+				      sizeof(pt_element_t)))
+		return false;
+
+	if (FNAME(invalid_gpte)(vcpu, gpte))
+		return false;
+
+	if (gpte_to_gfn(gpte) != gfn)
+		return false;
+
+	pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte, true);
+	set_spte(vcpu, new_spte, pte_access, expect_access & ACC_USER_MASK,
+		 expect_access & ACC_WRITE_MASK, sp->role.level, gfn,
+		 spte_to_pfn(spte), false, false,
+		 spte & SPTE_HOST_WRITEABLE, true);
+
+	return true;
+}
+
 /*
  * Page fault handler.  There are several causes for a page fault:
  *   - there is no shadow pte for the guest pte
@@ -632,6 +669,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 		walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
 	}

+	if (fast_page_fault(vcpu, addr, walker.gfn, level,
+			    error_code, FNAME(fast_pf_fetch_indirect_spte)))
+		return 0;
+
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	smp_rmb();

-- 
1.7.7.6


Thread overview: 83+ messages
2012-03-29  9:20 [PATCH 00/13] KVM: MMU: fast page fault Xiao Guangrong
2012-03-29  9:20 ` [PATCH 01/13] KVM: MMU: properly assert spte on rmap_next path Xiao Guangrong
2012-03-29  9:21 ` [PATCH 02/13] KVM: MMU: abstract spte write-protect Xiao Guangrong
2012-03-29 11:11   ` Avi Kivity
2012-03-29 11:51     ` Xiao Guangrong
2012-03-29  9:22 ` [PATCH 03/13] KVM: MMU: split FNAME(prefetch_invalid_gpte) Xiao Guangrong
2012-03-29 13:00   ` Avi Kivity
2012-03-30  3:51     ` Xiao Guangrong
2012-03-29  9:22 ` [PATCH 04/13] KVM: MMU: introduce FNAME(get_sp_gpa) Xiao Guangrong
2012-03-29 13:07   ` Avi Kivity
2012-03-30  5:01     ` Xiao Guangrong
2012-04-01 12:42       ` Avi Kivity
2012-03-29  9:23 ` [PATCH 05/13] KVM: MMU: reset shadow_mmio_mask Xiao Guangrong
2012-03-29 13:10   ` Avi Kivity
2012-03-29 15:28     ` Avi Kivity
2012-03-29 16:24       ` Avi Kivity
2012-03-29  9:23 ` [PATCH 06/13] KVM: VMX: export PFEC.P bit on ept Xiao Guangrong
2012-03-29  9:24 ` [PATCH 07/13] KVM: MMU: store more bits in rmap Xiao Guangrong
2012-03-29  9:25 ` [PATCH 08/13] KVM: MMU: fask check whether page is writable Xiao Guangrong
2012-03-29 15:49   ` Avi Kivity
2012-03-30  5:10     ` Xiao Guangrong
2012-04-01 15:52   ` Avi Kivity
2012-04-05 17:54     ` Xiao Guangrong
2012-04-12 23:08       ` Marcelo Tosatti
2012-04-13 10:26         ` Xiao Guangrong
2012-03-29  9:25 ` [PATCH 09/13] KVM: MMU: get expected spte out of mmu-lock Xiao Guangrong
2012-04-01 15:53   ` Avi Kivity
2012-04-05 18:25     ` Xiao Guangrong
2012-04-09 12:28       ` Avi Kivity
2012-04-09 13:16         ` Takuya Yoshikawa
2012-04-09 13:21           ` Avi Kivity
2012-03-29  9:26 ` [PATCH 10/13] KVM: MMU: store vcpu id in spte to notify page write-protect path Xiao Guangrong
2012-03-29  9:27 ` Xiao Guangrong [this message]
2012-03-31 12:24   ` [PATCH 11/13] KVM: MMU: fast path of handling guest page fault Xiao Guangrong
2012-04-01 16:23   ` Avi Kivity
2012-04-03 13:04     ` Avi Kivity
2012-04-05 19:39     ` Xiao Guangrong
2012-03-29  9:27 ` [PATCH 12/13] KVM: MMU: trace fast " Xiao Guangrong
2012-03-29  9:28 ` [PATCH 13/13] KVM: MMU: fix kvm_mmu_pagetable_walk tracepoint Xiao Guangrong
2012-03-29 10:18 ` [PATCH 00/13] KVM: MMU: fast page fault Avi Kivity
2012-03-29 11:40   ` Xiao Guangrong
2012-03-29 12:57     ` Avi Kivity
2012-03-30  9:18       ` Xiao Guangrong
2012-03-31 13:12         ` Xiao Guangrong
2012-04-01 12:58         ` Avi Kivity
2012-04-05 21:57           ` Xiao Guangrong
2012-04-06  5:24             ` Xiao Guangrong
2012-04-09 13:20               ` Avi Kivity
2012-04-09 13:59                 ` Xiao Guangrong
2012-04-09 13:12 ` Avi Kivity
2012-04-09 13:55   ` Xiao Guangrong
2012-04-09 14:01     ` Xiao Guangrong
2012-04-09 14:25     ` Avi Kivity
2012-04-09 17:58   ` Marcelo Tosatti
2012-04-09 18:13     ` Xiao Guangrong
2012-04-09 19:31       ` Marcelo Tosatti
2012-04-09 18:26     ` Xiao Guangrong
2012-04-09 19:46       ` Marcelo Tosatti
2012-04-10  3:06         ` Xiao Guangrong
2012-04-10 10:04         ` Avi Kivity
2012-04-11  1:47           ` Marcelo Tosatti
2012-04-11  9:15             ` Avi Kivity
2012-04-10 10:39         ` Avi Kivity
2012-04-10 11:40           ` Takuya Yoshikawa
2012-04-10 11:58             ` Xiao Guangrong
2012-04-11 12:15               ` Takuya Yoshikawa
2012-04-11 12:38                 ` Xiao Guangrong
2012-04-11 14:14                   ` Takuya Yoshikawa
2012-04-11 14:21                     ` Avi Kivity
2012-04-11 22:26                       ` Takuya Yoshikawa
2012-04-13 14:25                     ` Takuya Yoshikawa
2012-04-15  9:32                       ` Avi Kivity
2012-04-16 15:49                         ` Takuya Yoshikawa
2012-04-16 16:02                           ` Avi Kivity
2012-04-17  6:26                           ` Xiao Guangrong
2012-04-17  7:51                             ` Avi Kivity
2012-04-17 12:37                               ` Takuya Yoshikawa
2012-04-17 12:41                                 ` Avi Kivity
2012-04-17 14:54                                   ` Takuya Yoshikawa
2012-04-17 14:56                                     ` Avi Kivity
2012-04-18 13:42                                       ` Takuya Yoshikawa
2012-04-17  6:16                         ` Xiao Guangrong
2012-04-10 10:10       ` Avi Kivity
