[PATCH v2 4/5] kvm: x86: mmu: Lockless access tracking for Intel CPUs without EPT A bits.

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Junaid Shahid <junaids@google.com>
To: kvm@vger.kernel.org
Cc: andreslc@google.com, pfeiner@google.com, pbonzini@redhat.com,
	guangrong.xiao@linux.intel.com
Subject: [PATCH v2 4/5] kvm: x86: mmu: Lockless access tracking for Intel CPUs without EPT A bits.
Date: Tue,  8 Nov 2016 15:00:29 -0800	[thread overview]
Message-ID: <1478646030-101103-5-git-send-email-junaids@google.com> (raw)
In-Reply-To: <1478646030-101103-1-git-send-email-junaids@google.com>

This change implements lockless access tracking for Intel CPUs without EPT
A bits. When clear_flush_young() is called, the pages are marked as accessed
and their PTEs are marked as not-present (but not completely cleared). When
an EPT Violation is later generated as a result of the VM accessing those
pages, the PTEs are restored to their original values.

Signed-off-by: Junaid Shahid <junaids@google.com>
---
 arch/x86/include/asm/vmx.h |  39 ++++++
 arch/x86/kvm/mmu.c         | 314 ++++++++++++++++++++++++++++++++++-----------
 arch/x86/kvm/mmu.h         |   2 +
 arch/x86/kvm/vmx.c         |  20 ++-
 4 files changed, 301 insertions(+), 74 deletions(-)

diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 60991fb..3d63098 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -434,6 +434,45 @@ enum vmcs_field {
 #define VMX_EPT_IPAT_BIT    			(1ull << 6)
 #define VMX_EPT_ACCESS_BIT				(1ull << 8)
 #define VMX_EPT_DIRTY_BIT				(1ull << 9)
+#define VMX_EPT_RWX_MASK                        (VMX_EPT_READABLE_MASK |       \
+						 VMX_EPT_WRITABLE_MASK |       \
+						 VMX_EPT_EXECUTABLE_MASK)
+#define VMX_EPT_MT_MASK				(7ull << VMX_EPT_MT_EPTE_SHIFT)
+
+/* The mask to use to trigger an EPT Misconfiguration in order to track MMIO */
+#define VMX_EPT_MISCONFIG_WX_VALUE		(VMX_EPT_WRITABLE_MASK |       \
+						 VMX_EPT_EXECUTABLE_MASK)
+
+/*
+ * The shift to use for saving the original RWX value when marking the PTE as
+ * not-present for tracking purposes.
+ */
+#define VMX_EPT_RWX_SAVE_SHIFT			52
+
+/*
+ * The shift/mask for determining the type of tracking (if any) being used for a
+ * not-present PTE. Currently, only two bits are used, but more can be added.
+ *
+ * NOTE: Bit 63 is an architecturally ignored bit (and hence can be used for our
+ *       purpose) when the EPT PTE is in a misconfigured state. However, it is
+ *       not necessarily an ignored bit otherwise (even in a not-present state).
+ *       Since the existing MMIO code already uses this bit and since KVM
+ *       doesn't currently use #VEs (where this bit comes into play), we can
+ *       continue to use it for storing the type. But to be on the safe side,
+ *       we should not set it to 1 in those TRACK_TYPEs where the tracking is
+ *       done via EPT Violations instead of EPT Misconfigurations.
+ */
+#define VMX_EPT_TRACK_TYPE_SHIFT		62
+#define VMX_EPT_TRACK_TYPE_MASK			(3ull <<                       \
+						 VMX_EPT_TRACK_TYPE_SHIFT)
+
+/* Sets only bit 62 as the tracking is done by EPT Violations. See note above */
+#define VMX_EPT_TRACK_ACCESS			(1ull <<                       \
+						 VMX_EPT_TRACK_TYPE_SHIFT)
+/* Sets bits 62 and 63. See note above */
+#define VMX_EPT_TRACK_MMIO			(3ull <<                       \
+						 VMX_EPT_TRACK_TYPE_SHIFT)
+
 
 #define VMX_EPT_IDENTITY_PAGETABLE_ADDR		0xfffbc000ul
 
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index a22a8a2..8ea1618 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -37,6 +37,7 @@
 #include <linux/srcu.h>
 #include <linux/slab.h>
 #include <linux/uaccess.h>
+#include <linux/kern_levels.h>
 
 #include <asm/page.h>
 #include <asm/cmpxchg.h>
@@ -177,6 +178,10 @@ static u64 __read_mostly shadow_accessed_mask;
 static u64 __read_mostly shadow_dirty_mask;
 static u64 __read_mostly shadow_mmio_mask;
 static u64 __read_mostly shadow_present_mask;
+static u64 __read_mostly shadow_acc_track_mask;
+static u64 __read_mostly shadow_acc_track_value;
+static u64 __read_mostly shadow_acc_track_saved_bits_mask;
+static u64 __read_mostly shadow_acc_track_saved_bits_shift;
 
 static void mmu_spte_set(u64 *sptep, u64 spte);
 static void mmu_free_roots(struct kvm_vcpu *vcpu);
@@ -187,6 +192,26 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask)
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
 
+void kvm_mmu_set_access_track_masks(u64 acc_track_mask, u64 acc_track_value,
+				    u64 saved_bits_mask, u64 saved_bits_shift)
+{
+	shadow_acc_track_mask = acc_track_mask;
+	shadow_acc_track_value = acc_track_value;
+	shadow_acc_track_saved_bits_mask = saved_bits_mask;
+	shadow_acc_track_saved_bits_shift = saved_bits_shift;
+
+	BUG_ON((~acc_track_mask & acc_track_value) != 0);
+	BUG_ON((~acc_track_mask & saved_bits_mask) != 0);
+	BUG_ON(shadow_accessed_mask != 0);
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_set_access_track_masks);
+
+static inline bool is_access_track_spte(u64 spte)
+{
+	return shadow_acc_track_mask != 0 &&
+	       (spte & shadow_acc_track_mask) == shadow_acc_track_value;
+}
+
 /*
  * the low bit of the generation number is always presumed to be zero.
  * This disables mmio caching during memslot updates.  The concept is
@@ -292,9 +317,25 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
 	shadow_nx_mask = nx_mask;
 	shadow_x_mask = x_mask;
 	shadow_present_mask = p_mask;
+	BUG_ON(shadow_accessed_mask != 0 && shadow_acc_track_mask != 0);
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
 
+void kvm_mmu_clear_all_pte_masks(void)
+{
+	shadow_user_mask = 0;
+	shadow_accessed_mask = 0;
+	shadow_dirty_mask = 0;
+	shadow_nx_mask = 0;
+	shadow_x_mask = 0;
+	shadow_mmio_mask = 0;
+	shadow_present_mask = 0;
+	shadow_acc_track_mask = 0;
+	shadow_acc_track_value = 0;
+	shadow_acc_track_saved_bits_mask = 0;
+	shadow_acc_track_saved_bits_shift = 0;
+}
+
 static int is_cpuid_PSE36(void)
 {
 	return 1;
@@ -307,7 +348,8 @@ static int is_nx(struct kvm_vcpu *vcpu)
 
 static int is_shadow_present_pte(u64 pte)
 {
-	return (pte & 0xFFFFFFFFull) && !is_mmio_spte(pte);
+	return ((pte & 0xFFFFFFFFull) && !is_mmio_spte(pte)) ||
+	       is_access_track_spte(pte);
 }
 
 static int is_large_pte(u64 pte)
@@ -490,6 +532,9 @@ static bool spte_has_volatile_bits(u64 spte)
 	if (spte_can_locklessly_be_made_writable(spte))
 		return true;
 
+	if (is_access_track_spte(spte))
+		return true;
+
 	if (!shadow_accessed_mask)
 		return false;
 
@@ -533,17 +578,21 @@ static void mmu_spte_set(u64 *sptep, u64 new_spte)
  * will find a read-only spte, even though the writable spte
  * might be cached on a CPU's TLB, the return value indicates this
  * case.
+ *
+ * Returns true if the TLB needs to be flushed
  */
 static bool mmu_spte_update(u64 *sptep, u64 new_spte)
 {
 	u64 old_spte = *sptep;
-	bool ret = false;
+	bool flush = false;
+	bool writable_cleared;
+	bool acc_track_enabled;
 
 	WARN_ON(!is_shadow_present_pte(new_spte));
 
 	if (!is_shadow_present_pte(old_spte)) {
 		mmu_spte_set(sptep, new_spte);
-		return ret;
+		return flush;
 	}
 
 	if (!spte_has_volatile_bits(old_spte))
@@ -551,24 +600,16 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
 	else
 		old_spte = __update_clear_spte_slow(sptep, new_spte);
 
+	BUG_ON(spte_to_pfn(old_spte) != spte_to_pfn(new_spte));
+
 	/*
 	 * For the spte updated out of mmu-lock is safe, since
 	 * we always atomically update it, see the comments in
 	 * spte_has_volatile_bits().
 	 */
 	if (spte_can_locklessly_be_made_writable(old_spte) &&
-	      !is_writable_pte(new_spte))
-		ret = true;
-
-	if (!shadow_accessed_mask) {
-		/*
-		 * We don't set page dirty when dropping non-writable spte.
-		 * So do it now if the new spte is becoming non-writable.
-		 */
-		if (ret)
-			kvm_set_pfn_dirty(spte_to_pfn(old_spte));
-		return ret;
-	}
+	    !is_writable_pte(new_spte))
+		flush = true;
 
 	/*
 	 * Flush TLB when accessed/dirty bits are changed in the page tables,
@@ -576,20 +617,34 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
 	 */
 	if (spte_is_bit_changed(old_spte, new_spte,
                                 shadow_accessed_mask | shadow_dirty_mask))
-		ret = true;
+		flush = true;
 
-	if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask))
+	writable_cleared = is_writable_pte(old_spte) &&
+			   !is_writable_pte(new_spte);
+	acc_track_enabled = !is_access_track_spte(old_spte) &&
+			    is_access_track_spte(new_spte);
+
+	if (writable_cleared || acc_track_enabled)
+		flush = true;
+
+	if (shadow_accessed_mask ?
+	    spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask) :
+	    acc_track_enabled)
 		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
-	if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask))
+
+	if (shadow_dirty_mask ?
+	    spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask) :
+	    writable_cleared)
 		kvm_set_pfn_dirty(spte_to_pfn(old_spte));
 
-	return ret;
+	return flush;
 }
 
 /*
  * Rules for using mmu_spte_clear_track_bits:
  * It sets the sptep from present to nonpresent, and track the
  * state bits, it is used to clear the last level sptep.
+ * Returns non-zero if the PTE was previously valid.
  */
 static int mmu_spte_clear_track_bits(u64 *sptep)
 {
@@ -604,6 +659,13 @@ static int mmu_spte_clear_track_bits(u64 *sptep)
 	if (!is_shadow_present_pte(old_spte))
 		return 0;
 
+	/*
+	 * For access tracking SPTEs, the pfn was already marked accessed/dirty
+	 * when the SPTE was marked for access tracking, so nothing to do here.
+	 */
+	if (is_access_track_spte(old_spte))
+		return 1;
+
 	pfn = spte_to_pfn(old_spte);
 
 	/*
@@ -618,6 +680,7 @@ static int mmu_spte_clear_track_bits(u64 *sptep)
 	if (old_spte & (shadow_dirty_mask ? shadow_dirty_mask :
 					    PT_WRITABLE_MASK))
 		kvm_set_pfn_dirty(pfn);
+
 	return 1;
 }
 
@@ -636,6 +699,52 @@ static u64 mmu_spte_get_lockless(u64 *sptep)
 	return __get_spte_lockless(sptep);
 }
 
+static u64 mark_spte_for_access_track(u64 spte)
+{
+	if (shadow_acc_track_mask == 0)
+		return spte;
+
+	/*
+	 * Verify that the write-protection that we do below will be fixable
+	 * via the fast page fault path. Currently, that is always the case, at
+	 * least when using EPT (which is when access tracking would be used).
+	 */
+	WARN_ONCE((spte & PT_WRITABLE_MASK) &&
+		  !spte_can_locklessly_be_made_writable(spte),
+		  "Writable SPTE is not locklessly dirty-trackable\n");
+
+	/*
+	 * Any PTE marked for access tracking should also be marked for dirty
+	 * tracking (by being non-writable)
+	 */
+	spte &= ~PT_WRITABLE_MASK;
+
+	spte &= ~(shadow_acc_track_saved_bits_mask <<
+		  shadow_acc_track_saved_bits_shift);
+	spte |= (spte & shadow_acc_track_saved_bits_mask) <<
+		shadow_acc_track_saved_bits_shift;
+	spte &= ~shadow_acc_track_mask;
+	spte |= shadow_acc_track_value;
+
+	return spte;
+}
+
+/* Returns true if the TLB needs to be flushed */
+static bool mmu_spte_enable_access_track(u64 *sptep)
+{
+	u64 spte = mmu_spte_get_lockless(sptep);
+
+	if (is_access_track_spte(spte))
+		return false;
+
+	/* Access tracking should not be enabled if CPU supports A/D bits */
+	BUG_ON(shadow_accessed_mask != 0);
+
+	spte = mark_spte_for_access_track(spte);
+
+	return mmu_spte_update(sptep, spte);
+}
+
 static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
 {
 	/*
@@ -1403,6 +1512,25 @@ static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
 	return kvm_zap_rmapp(kvm, rmap_head);
 }
 
+static int kvm_acc_track_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+			       struct kvm_memory_slot *slot, gfn_t gfn,
+			       int level, unsigned long data)
+{
+	u64 *sptep;
+	struct rmap_iterator iter;
+	int need_tlb_flush = 0;
+
+	for_each_rmap_spte(rmap_head, &iter, sptep) {
+
+		rmap_printk("kvm_acc_track_rmapp: spte %p %llx gfn %llx (%d)\n",
+			    sptep, *sptep, gfn, level);
+
+		need_tlb_flush |= mmu_spte_enable_access_track(sptep);
+	}
+
+	return need_tlb_flush;
+}
+
 static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
 			     struct kvm_memory_slot *slot, gfn_t gfn, int level,
 			     unsigned long data)
@@ -1419,8 +1547,9 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
 
 restart:
 	for_each_rmap_spte(rmap_head, &iter, sptep) {
+
 		rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n",
-			     sptep, *sptep, gfn, level);
+			    sptep, *sptep, gfn, level);
 
 		need_flush = 1;
 
@@ -1435,6 +1564,8 @@ restart:
 			new_spte &= ~SPTE_HOST_WRITEABLE;
 			new_spte &= ~shadow_accessed_mask;
 
+			new_spte = mark_spte_for_access_track(new_spte);
+
 			mmu_spte_clear_track_bits(sptep);
 			mmu_spte_set(sptep, new_spte);
 		}
@@ -1615,24 +1746,14 @@ static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
 {
 	u64 *sptep;
 	struct rmap_iterator iter;
-	int young = 0;
-
-	/*
-	 * If there's no access bit in the secondary pte set by the
-	 * hardware it's up to gup-fast/gup to set the access bit in
-	 * the primary pte or in the page structure.
-	 */
-	if (!shadow_accessed_mask)
-		goto out;
 
 	for_each_rmap_spte(rmap_head, &iter, sptep) {
-		if (*sptep & shadow_accessed_mask) {
-			young = 1;
-			break;
-		}
+		if ((*sptep & shadow_accessed_mask) ||
+		    (!shadow_accessed_mask && !is_access_track_spte(*sptep)))
+			return 1;
 	}
-out:
-	return young;
+
+	return 0;
 }
 
 #define RMAP_RECYCLE_THRESHOLD 1000
@@ -1669,7 +1790,9 @@ int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
 		 */
 		kvm->mmu_notifier_seq++;
 		return kvm_handle_hva_range(kvm, start, end, 0,
-					    kvm_unmap_rmapp);
+					    shadow_acc_track_mask != 0
+					    ? kvm_acc_track_rmapp
+					    : kvm_unmap_rmapp);
 	}
 
 	return kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp);
@@ -2591,6 +2714,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		spte |= shadow_dirty_mask;
 	}
 
+	if (speculative)
+		spte = mark_spte_for_access_track(spte);
+
 set_pte:
 	if (mmu_spte_update(sptep, spte))
 		kvm_flush_remote_tlbs(vcpu->kvm);
@@ -2644,7 +2770,7 @@ static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
 	pgprintk("%s: setting spte %llx\n", __func__, *sptep);
 	pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
 		 is_large_pte(*sptep)? "2MB" : "4kB",
-		 *sptep & PT_PRESENT_MASK ?"RW":"R", gfn,
+		 *sptep & PT_WRITABLE_MASK ? "RW" : "R", gfn,
 		 *sptep, sptep);
 	if (!was_rmapped && is_large_pte(*sptep))
 		++vcpu->kvm->stat.lpages;
@@ -2877,16 +3003,27 @@ static bool page_fault_can_be_fast(u32 error_code)
 	if (unlikely(error_code & PFERR_RSVD_MASK))
 		return false;
 
-	/*
-	 * #PF can be fast only if the shadow page table is present and it
-	 * is caused by write-protect, that means we just need change the
-	 * W bit of the spte which can be done out of mmu-lock.
-	 */
-	if (!(error_code & PFERR_PRESENT_MASK) ||
-	      !(error_code & PFERR_WRITE_MASK))
+	/* See if the page fault is due to an NX violation */
+	if (unlikely(((error_code & (PFERR_FETCH_MASK | PFERR_PRESENT_MASK))
+		      == (PFERR_FETCH_MASK | PFERR_PRESENT_MASK))))
 		return false;
 
-	return true;
+	/*
+	 * #PF can be fast if:
+	 * 1. The shadow page table entry is not present, which could mean that
+	 *    the fault is potentially caused by access tracking (if enabled).
+	 * 2. The shadow page table entry is present and the fault is caused
+	 *    by write-protect, which means we just need to change the W bit
+	 *    of the spte, which can be done out of mmu-lock.
+	 *
+	 * However, if Access Tracking is disabled, then the first condition
+	 * above cannot be handled by the fast path. So if access tracking is
+	 * disabled, we return true only if the second condition is met.
+	 */
+
+	return shadow_acc_track_mask != 0 ||
+	       ((error_code & (PFERR_WRITE_MASK | PFERR_PRESENT_MASK))
+		== (PFERR_WRITE_MASK | PFERR_PRESENT_MASK));
 }
 
 /*
@@ -2895,17 +3032,24 @@ static bool page_fault_can_be_fast(u32 error_code)
  */
 static bool
 fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
-			u64 *sptep, u64 spte)
+			u64 *sptep, u64 old_spte,
+			bool remove_write_prot, bool remove_acc_track)
 {
 	gfn_t gfn;
+	u64 new_spte = old_spte;
 
 	WARN_ON(!sp->role.direct);
 
-	/*
-	 * The gfn of direct spte is stable since it is calculated
-	 * by sp->gfn.
-	 */
-	gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
+	if (remove_acc_track) {
+		u64 saved_bits = old_spte & (shadow_acc_track_saved_bits_mask <<
+					     shadow_acc_track_saved_bits_shift);
+
+		new_spte &= ~shadow_acc_track_mask;
+		new_spte |= saved_bits >> shadow_acc_track_saved_bits_shift;
+	}
+
+	if (remove_write_prot)
+		new_spte |= PT_WRITABLE_MASK;
 
 	/*
 	 * Theoretically we could also set dirty bit (and flush TLB) here in
@@ -2919,10 +3063,17 @@ fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 	 *
 	 * Compare with set_spte where instead shadow_dirty_mask is set.
 	 */
-	if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) != spte)
+	if (cmpxchg64(sptep, old_spte, new_spte) != old_spte)
 		return false;
 
-	kvm_vcpu_mark_page_dirty(vcpu, gfn);
+	if (remove_write_prot) {
+		/*
+		 * The gfn of direct spte is stable since it is
+		 * calculated by sp->gfn.
+		 */
+		gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
+		kvm_vcpu_mark_page_dirty(vcpu, gfn);
+	}
 
 	return true;
 }
@@ -2937,7 +3088,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
 {
 	struct kvm_shadow_walk_iterator iterator;
 	struct kvm_mmu_page *sp;
-	bool ret = false;
+	bool fault_handled = false;
 	u64 spte = 0ull;
 	uint retry_count = 0;
 
@@ -2953,36 +3104,43 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
 			break;
 
 	do {
-		/*
-		 * If the mapping has been changed, let the vcpu fault on the
-		 * same address again.
-		 */
-		if (!is_shadow_present_pte(spte)) {
-			ret = true;
-			break;
-		}
+		bool remove_write_prot = (error_code & PFERR_WRITE_MASK) &&
+					 !(spte & PT_WRITABLE_MASK);
+		bool remove_acc_track;
+		bool valid_exec_access = (error_code & PFERR_FETCH_MASK) &&
+					 (spte & shadow_x_mask);
 
 		sp = page_header(__pa(iterator.sptep));
 		if (!is_last_spte(spte, sp->role.level))
 			break;
 
 		/*
-		 * Check if it is a spurious fault caused by TLB lazily flushed.
+		 * Check whether the memory access that caused the fault would
+		 * still cause it if it were to be performed right now. If not,
+		 * then this is a spurious fault caused by a lazily flushed TLB,
+		 * or some other CPU has already fixed the PTE after the
+		 * current CPU took the fault.
+		 *
+		 * If Write-Only mappings ever become supported, then the
+		 * condition below would need to be changed appropriately.
 		 *
 		 * Need not check the access of upper level table entries since
 		 * they are always ACC_ALL.
 		 */
-		if (is_writable_pte(spte)) {
-			ret = true;
+		if (((spte & PT_PRESENT_MASK) && !remove_write_prot) ||
+		    valid_exec_access) {
+			fault_handled = true;
 			break;
 		}
 
+		remove_acc_track = is_access_track_spte(spte);
+
 		/*
-		 * Currently, to simplify the code, only the spte
-		 * write-protected by dirty-log can be fast fixed.
+		 * Currently, to simplify the code, write-protection can be
+		 * removed in the fast path only if the SPTE was write-protected
+		 * for dirty-logging.
 		 */
-		if (!spte_can_locklessly_be_made_writable(spte))
-			break;
+		remove_write_prot &= spte_can_locklessly_be_made_writable(spte);
 
 		/*
 		 * Do not fix write-permission on the large spte since we only
@@ -2998,13 +3156,20 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
 		if (sp->role.level > PT_PAGE_TABLE_LEVEL)
 			break;
 
+		/* Verify that the fault can be handled in the fast path */
+		if (!remove_acc_track && !remove_write_prot)
+			break;
+
 		/*
 		 * Currently, fast page fault only works for direct mapping
 		 * since the gfn is not stable for indirect shadow page. See
 		 * Documentation/virtual/kvm/locking.txt to get more detail.
 		 */
-		ret = fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte);
-		if (ret)
+		fault_handled = fast_pf_fix_direct_spte(vcpu, sp,
+							iterator.sptep, spte,
+							remove_write_prot,
+							remove_acc_track);
+		if (fault_handled)
 			break;
 
 		if (++retry_count > 4) {
@@ -3018,10 +3183,10 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
 	} while (true);
 
 	trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep,
-			      spte, ret);
+			      spte, fault_handled);
 	walk_shadow_page_lockless_end(vcpu);
 
-	return ret;
+	return fault_handled;
 }
 
 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
@@ -4300,6 +4465,7 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
 	vcpu->arch.mmu.update_pte(vcpu, sp, spte, new);
 }
 
+/* This is only supposed to be used for non-EPT mappings */
 static bool need_remote_flush(u64 old, u64 new)
 {
 	if (!is_shadow_present_pte(old))
@@ -5067,6 +5233,8 @@ static void mmu_destroy_caches(void)
 
 int kvm_mmu_module_init(void)
 {
+	kvm_mmu_clear_all_pte_masks();
+
 	pte_list_desc_cache = kmem_cache_create("pte_list_desc",
 					    sizeof(struct pte_list_desc),
 					    0, 0, NULL);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index ddc56e9..dfd3056 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -52,6 +52,8 @@ static inline u64 rsvd_bits(int s, int e)
 }
 
 void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask);
+void kvm_mmu_set_access_track_masks(u64 acc_track_mask, u64 acc_track_value,
+				    u64 saved_bits_mask, u64 saved_bits_shift);
 
 void
 reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 88e3b02..363517e 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5019,7 +5019,22 @@ static void ept_set_mmio_spte_mask(void)
 	 * Also, magic bits (0x3ull << 62) is set to quickly identify mmio
 	 * spte.
 	 */
-	kvm_mmu_set_mmio_spte_mask((0x3ull << 62) | 0x6ull);
+	kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE |
+				   VMX_EPT_TRACK_MMIO);
+}
+
+static void ept_set_acc_track_spte_mask(void)
+{
+	/*
+	 * For access track PTEs we use a non-present PTE to trigger an EPT
+	 * Violation. The original RWX value is saved in some unused bits in
+	 * the PTE and restored when the violation is fixed.
+	 */
+	kvm_mmu_set_access_track_masks(VMX_EPT_RWX_MASK |
+				       VMX_EPT_TRACK_TYPE_MASK,
+				       VMX_EPT_TRACK_ACCESS,
+				       VMX_EPT_RWX_MASK,
+				       VMX_EPT_RWX_SAVE_SHIFT);
 }
 
 #define VMX_XSS_EXIT_BITMAP 0
@@ -6551,6 +6566,9 @@ static __init int hardware_setup(void)
 				      0ull : VMX_EPT_READABLE_MASK);
 		ept_set_mmio_spte_mask();
 		kvm_enable_tdp();
+
+		if (!enable_ept_ad_bits)
+			ept_set_acc_track_spte_mask();
 	} else
 		kvm_disable_tdp();
 
-- 
2.8.0.rc3.226.g39d4020

next prev parent reply	other threads:[~2016-11-08 23:00 UTC|newest]

Thread overview: 56+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-10-27  2:19 [PATCH 0/4] Lockless Access Tracking for Intel CPUs without EPT A bits Junaid Shahid
2016-10-27  2:19 ` [PATCH 1/4] kvm: x86: mmu: Use symbolic constants for EPT Violation Exit Qualifications Junaid Shahid
2016-11-02 18:03   ` Paolo Bonzini
2016-11-02 21:40     ` Junaid Shahid
2016-10-27  2:19 ` [PATCH 2/4] kvm: x86: mmu: Rename spte_is_locklessly_modifiable() Junaid Shahid
2016-10-27  2:19 ` [PATCH 3/4] kvm: x86: mmu: Fast Page Fault path retries Junaid Shahid
2016-10-27  2:19 ` [PATCH 4/4] kvm: x86: mmu: Lockless access tracking for Intel CPUs without EPT A bits Junaid Shahid
2016-11-02 18:01   ` Paolo Bonzini
2016-11-02 21:42     ` Junaid Shahid
2016-11-08 23:00 ` [PATCH v2 0/5] Lockless Access Tracking " Junaid Shahid
2016-11-08 23:00   ` [PATCH v2 1/5] kvm: x86: mmu: Use symbolic constants for EPT Violation Exit Qualifications Junaid Shahid
2016-11-21 13:06     ` Paolo Bonzini
2016-11-08 23:00   ` [PATCH v2 2/5] kvm: x86: mmu: Rename spte_is_locklessly_modifiable() Junaid Shahid
2016-11-21 13:07     ` Paolo Bonzini
2016-11-08 23:00   ` [PATCH v2 3/5] kvm: x86: mmu: Fast Page Fault path retries Junaid Shahid
2016-11-21 13:13     ` Paolo Bonzini
2016-11-08 23:00   ` Junaid Shahid [this message]
2016-11-21 14:42     ` [PATCH v2 4/5] kvm: x86: mmu: Lockless access tracking for Intel CPUs without EPT A bits Paolo Bonzini
2016-11-24  3:50       ` Junaid Shahid
2016-11-25  9:45         ` Paolo Bonzini
2016-11-29  2:43           ` Junaid Shahid
2016-11-29  8:09             ` Paolo Bonzini
2016-11-30  0:59               ` Junaid Shahid
2016-11-30 11:09                 ` Paolo Bonzini
2016-12-01 22:54       ` Junaid Shahid
2016-12-02  8:33         ` Paolo Bonzini
2016-12-05 22:57           ` Junaid Shahid
2016-11-08 23:00   ` [PATCH v2 5/5] kvm: x86: mmu: Update documentation for fast page fault mechanism Junaid Shahid
2016-12-07  0:46 ` [PATCH v3 0/8] Lockless Access Tracking for Intel CPUs without EPT A bits Junaid Shahid
2016-12-07  0:46   ` [PATCH v3 1/8] kvm: x86: mmu: Use symbolic constants for EPT Violation Exit Qualifications Junaid Shahid
2016-12-15  6:50     ` Xiao Guangrong
2016-12-15 23:06       ` Junaid Shahid
2016-12-07  0:46   ` [PATCH v3 2/8] kvm: x86: mmu: Rename spte_is_locklessly_modifiable() Junaid Shahid
2016-12-15  6:51     ` Xiao Guangrong
2016-12-07  0:46   ` [PATCH v3 3/8] kvm: x86: mmu: Fast Page Fault path retries Junaid Shahid
2016-12-15  7:20     ` Xiao Guangrong
2016-12-15 23:36       ` Junaid Shahid
2016-12-16 13:13         ` Xiao Guangrong
2016-12-17  0:36           ` Junaid Shahid
2016-12-07  0:46   ` [PATCH v3 4/8] kvm: x86: mmu: Refactor accessed/dirty checks in mmu_spte_update/clear Junaid Shahid
2016-12-07  0:46   ` [PATCH v3 5/8] kvm: x86: mmu: Introduce a no-tracking version of mmu_spte_update Junaid Shahid
2016-12-07  0:46   ` [PATCH v3 6/8] kvm: x86: mmu: Do not use bit 63 for tracking special SPTEs Junaid Shahid
2016-12-07  0:46   ` [PATCH v3 7/8] kvm: x86: mmu: Lockless access tracking for Intel CPUs without EPT A bits Junaid Shahid
2016-12-14 16:28     ` Paolo Bonzini
2016-12-14 22:36       ` Junaid Shahid
2016-12-14 23:35         ` Paolo Bonzini
2016-12-16 13:04     ` Xiao Guangrong
2016-12-16 15:23       ` Paolo Bonzini
2016-12-17  0:01         ` Junaid Shahid
2016-12-21  9:49         ` Xiao Guangrong
2016-12-21 18:00           ` Paolo Bonzini
2016-12-17  2:04       ` Junaid Shahid
2016-12-17 14:19         ` Paolo Bonzini
2016-12-20  3:36           ` Junaid Shahid
2016-12-20  9:01             ` Paolo Bonzini
2016-12-07  0:46   ` [PATCH v3 8/8] kvm: x86: mmu: Update documentation for fast page fault mechanism Junaid Shahid

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:60991fb dfblob:3d63098 dfblob:a22a8a2 dfblob:8ea1618
dfblob:ddc56e9 dfblob:dfd3056 dfblob:88e3b02 dfblob:363517e )
 OR (
bs:"[PATCH v2 4/5] kvm: x86: mmu: Lockless access tracking for Intel CPUs without EPT A bits." )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1478646030-101103-5-git-send-email-junaids@google.com \
    --to=junaids@google.com \
    --cc=andreslc@google.com \
    --cc=guangrong.xiao@linux.intel.com \
    --cc=kvm@vger.kernel.org \
    --cc=pbonzini@redhat.com \
    --cc=pfeiner@google.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.