From: Xiao Guangrong
Subject: Re: [PATCH v3 7/8] kvm: x86: mmu: Lockless access tracking for Intel CPUs without EPT A bits.
Date: Fri, 16 Dec 2016 21:04:56 +0800
Message-ID: <93b5692a-0f76-a31d-46f3-b85d19298d92@linux.intel.com>
References: <1481071577-40250-1-git-send-email-junaids@google.com> <1481071577-40250-8-git-send-email-junaids@google.com>
Cc: andreslc@google.com, pfeiner@google.com, pbonzini@redhat.com
To: Junaid Shahid , kvm@vger.kernel.org
In-Reply-To: <1481071577-40250-8-git-send-email-junaids@google.com>

On 12/07/2016 08:46 AM, Junaid Shahid wrote:
> This change implements lockless access tracking for Intel CPUs without EPT
> A bits. This is achieved by marking the PTEs as not-present (but not
> completely clearing them) when clear_flush_young() is called after marking
> the pages as accessed. When an EPT Violation is generated as a result of
> the VM accessing those pages, the PTEs are restored to their original values.
>
> Signed-off-by: Junaid Shahid
> ---
>  arch/x86/include/asm/kvm_host.h |   3 +-
>  arch/x86/include/asm/vmx.h      |   9 +-
>  arch/x86/kvm/mmu.c              | 274 +++++++++++++++++++++++++++++++---------
>  arch/x86/kvm/vmx.c              |  26 ++--
>  arch/x86/kvm/x86.c              |   2 +-
>  5 files changed, 237 insertions(+), 77 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 5a10eb7..da1d4b9 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1056,7 +1056,8 @@ void kvm_mmu_setup(struct kvm_vcpu *vcpu);
>  void kvm_mmu_init_vm(struct kvm *kvm);
>  void kvm_mmu_uninit_vm(struct kvm *kvm);
>  void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
> -		u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask);
> +		u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
> +		u64 acc_track_mask);

Actually, this is the mask that acc-track _clears_ rather than the one it _sets_;
maybe suppress_by_acc_track_mask would be a better name.

>
>  void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
>  void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
> diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
> index 45ee6d9..9d228a8 100644
> --- a/arch/x86/include/asm/vmx.h
> +++ b/arch/x86/include/asm/vmx.h
> @@ -438,11 +438,14 @@ enum vmcs_field {
>  #define VMX_EPT_IPAT_BIT	(1ull << 6)
>  #define VMX_EPT_ACCESS_BIT	(1ull << 8)
>  #define VMX_EPT_DIRTY_BIT	(1ull << 9)
> +#define VMX_EPT_RWX_MASK	(VMX_EPT_READABLE_MASK |	\
> +				 VMX_EPT_WRITABLE_MASK |	\
> +				 VMX_EPT_EXECUTABLE_MASK)
> +#define VMX_EPT_MT_MASK	(7ull << VMX_EPT_MT_EPTE_SHIFT)

I see no place that uses this mask; it can be dropped.
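
Coming back to the acc-track naming point above: to double-check that I am reading the
encoding correctly, here is a stand-alone sketch of the park/restore round trip the
changelog describes. It is illustration only (it builds as a plain user-space program);
the helper names and the bit positions below are invented for the example and are not
taken from the patch:

#include <assert.h>
#include <stdint.h>

#define EPT_READABLE		0x1ull
#define EPT_WRITABLE		0x2ull
#define EPT_EXECUTABLE		0x4ull
#define EPT_RWX_MASK		(EPT_READABLE | EPT_WRITABLE | EPT_EXECUTABLE)
#define SPECIAL_BIT		(1ull << 62)	/* stands in for SPTE_SPECIAL_MASK */
#define SAVED_BITS_MASK		(EPT_READABLE | EPT_EXECUTABLE)
#define SAVED_BITS_SHIFT	52		/* stands in for the second avail-bits area */

/* Bits cleared when a PTE is parked, and the tag that marks it as parked. */
#define ACC_TRACK_MASK		(SPECIAL_BIT | EPT_RWX_MASK)
#define ACC_TRACK_VALUE		SPECIAL_BIT

/* Park the PTE: stash R/X in the avail bits, clear RWX, set the tag. */
static uint64_t mark_for_access_track(uint64_t spte)
{
	spte |= (spte & SAVED_BITS_MASK) << SAVED_BITS_SHIFT;
	spte &= ~ACC_TRACK_MASK;
	spte |= ACC_TRACK_VALUE;
	return spte;
}

/*
 * Restore the PTE on the EPT violation: bring R/X back and drop the tag.
 * W is deliberately not restored; it comes back through dirty tracking.
 */
static uint64_t restore_from_access_track(uint64_t spte)
{
	uint64_t saved = (spte >> SAVED_BITS_SHIFT) & SAVED_BITS_MASK;

	spte &= ~(ACC_TRACK_MASK | (SAVED_BITS_MASK << SAVED_BITS_SHIFT));
	spte |= saved;
	return spte;
}

int main(void)
{
	uint64_t pfn_bits = 0x12345000ull;
	uint64_t spte = pfn_bits | EPT_RWX_MASK;
	uint64_t parked = mark_for_access_track(spte);

	assert((parked & EPT_RWX_MASK) == 0);	/* any access now faults */
	assert(parked & SPECIAL_BIT);		/* recognizable as acc-track */
	/* R and X come back; W stays clear until a write fault. */
	assert(restore_from_access_track(parked) ==
	       (pfn_bits | EPT_READABLE | EPT_EXECUTABLE));
	return 0;
}

If that is the intended encoding, then the parameter really describes the bits that get
suppressed, which is why suppress_by_acc_track_mask reads more naturally to me.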
> > /* The mask to use to trigger an EPT Misconfiguration in order to track MMIO */ > -#define VMX_EPT_MISCONFIG_WX_VALUE (VMX_EPT_WRITABLE_MASK | \ > - VMX_EPT_EXECUTABLE_MASK) > - > +#define VMX_EPT_MISCONFIG_WX_VALUE (VMX_EPT_WRITABLE_MASK | \ > + VMX_EPT_EXECUTABLE_MASK) > > #define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul > > diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c > index 3f66fd3..6ba6220 100644 > --- a/arch/x86/kvm/mmu.c > +++ b/arch/x86/kvm/mmu.c > @@ -37,6 +37,7 @@ > #include > #include > #include > +#include > > #include > #include > @@ -129,6 +130,10 @@ module_param(dbg, bool, 0644); > #define ACC_USER_MASK PT_USER_MASK > #define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) > > +/* The mask for the R/X bits in EPT PTEs */ > +#define PT64_EPT_READABLE_MASK 0x1ull > +#define PT64_EPT_EXECUTABLE_MASK 0x4ull > + Can we move this EPT specific stuff out of mmu.c? > #include > > #define CREATE_TRACE_POINTS > @@ -178,6 +183,25 @@ static u64 __read_mostly shadow_dirty_mask; > static u64 __read_mostly shadow_mmio_mask; > static u64 __read_mostly shadow_present_mask; > > +/* > + * The mask/value to distinguish a PTE that has been marked not-present for > + * access tracking purposes. > + * The mask would be either 0 if access tracking is disabled, or > + * SPTE_SPECIAL_MASK|VMX_EPT_RWX_MASK if access tracking is enabled. > + */ > +static u64 __read_mostly shadow_acc_track_mask; > +static const u64 shadow_acc_track_value = SPTE_SPECIAL_MASK; > + > +/* > + * The mask/shift to use for saving the original R/X bits when marking the PTE > + * as not-present for access tracking purposes. We do not save the W bit as the > + * PTEs being access tracked also need to be dirty tracked, so the W bit will be > + * restored only when a write is attempted to the page. > + */ > +static const u64 shadow_acc_track_saved_bits_mask = PT64_EPT_READABLE_MASK | > + PT64_EPT_EXECUTABLE_MASK; > +static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIFT; > + > static void mmu_spte_set(u64 *sptep, u64 spte); > static void mmu_free_roots(struct kvm_vcpu *vcpu); > > @@ -187,6 +211,12 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask) > } > EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); > > +static inline bool is_access_track_spte(u64 spte) > +{ > + return shadow_acc_track_mask != 0 && > + (spte & shadow_acc_track_mask) == shadow_acc_track_value; > +} spte & SPECIAL_MASK && !is_mmio(spte) is more clearer. > + > /* > * the low bit of the generation number is always presumed to be zero. > * This disables mmio caching during memslot updates. 
The concept is > @@ -284,7 +314,8 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte) > } > > void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, > - u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask) > + u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask, > + u64 acc_track_mask) > { > shadow_user_mask = user_mask; > shadow_accessed_mask = accessed_mask; > @@ -292,9 +323,23 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, > shadow_nx_mask = nx_mask; > shadow_x_mask = x_mask; > shadow_present_mask = p_mask; > + shadow_acc_track_mask = acc_track_mask; > + WARN_ON(shadow_accessed_mask != 0 && shadow_acc_track_mask != 0); > } > EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); > > +void kvm_mmu_clear_all_pte_masks(void) > +{ > + shadow_user_mask = 0; > + shadow_accessed_mask = 0; > + shadow_dirty_mask = 0; > + shadow_nx_mask = 0; > + shadow_x_mask = 0; > + shadow_mmio_mask = 0; > + shadow_present_mask = 0; > + shadow_acc_track_mask = 0; > +} > + Hmmm... why is it needed? Static values are always init-ed to zero... > static int is_cpuid_PSE36(void) > { > return 1; > @@ -307,7 +352,7 @@ static int is_nx(struct kvm_vcpu *vcpu) > > static int is_shadow_present_pte(u64 pte) > { > - return (pte & 0xFFFFFFFFull) && !is_mmio_spte(pte); > + return (pte != 0) && !is_mmio_spte(pte); > } > > static int is_large_pte(u64 pte) > @@ -490,23 +535,20 @@ static bool spte_has_volatile_bits(u64 spte) > if (spte_can_locklessly_be_made_writable(spte)) > return true; > > - if (!shadow_accessed_mask) > - return false; > - > if (!is_shadow_present_pte(spte)) > return false; > > - if ((spte & shadow_accessed_mask) && > - (!is_writable_pte(spte) || (spte & shadow_dirty_mask))) > - return false; > + if (!shadow_accessed_mask) > + return is_access_track_spte(spte); > > - return true; > + return (spte & shadow_accessed_mask) == 0 || > + (is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0); > } > > static bool is_accessed_spte(u64 spte) > { > return shadow_accessed_mask ? spte & shadow_accessed_mask > - : true; > + : !is_access_track_spte(spte); > } > > static bool is_dirty_spte(u64 spte) > @@ -650,6 +692,65 @@ static u64 mmu_spte_get_lockless(u64 *sptep) > return __get_spte_lockless(sptep); > } > > +static u64 mark_spte_for_access_track(u64 spte) > +{ > + if (shadow_accessed_mask != 0) > + return spte & ~shadow_accessed_mask; > + > + if (shadow_acc_track_mask == 0 || is_access_track_spte(spte)) > + return spte; > + > + /* > + * Verify that the write-protection that we do below will be fixable > + * via the fast page fault path. Currently, that is always the case, at > + * least when using EPT (which is when access tracking would be used). > + */ > + WARN_ONCE((spte & PT_WRITABLE_MASK) && > + !spte_can_locklessly_be_made_writable(spte), > + "kvm: Writable SPTE is not locklessly dirty-trackable\n"); This code is right but i can not understand the comment here... :( > + > + WARN_ONCE(spte & (shadow_acc_track_saved_bits_mask << > + shadow_acc_track_saved_bits_shift), > + "kvm: Access Tracking saved bit locations are not zero\n"); > + > + spte |= (spte & shadow_acc_track_saved_bits_mask) << > + shadow_acc_track_saved_bits_shift; > + spte &= ~shadow_acc_track_mask; > + spte |= shadow_acc_track_value; > + > + return spte; > +} > + > +/* Returns the Accessed status of the PTE and resets it at the same time. 
*/ > +static bool mmu_spte_age(u64 *sptep) > +{ > + u64 spte = mmu_spte_get_lockless(sptep); > + > + if (spte & shadow_accessed_mask) { > + clear_bit((ffs(shadow_accessed_mask) - 1), > + (unsigned long *)sptep); > + return true; > + } > + > + if (shadow_accessed_mask == 0) { > + if (is_access_track_spte(spte)) > + return false; > + > + /* > + * Capture the dirty status of the page, so that it doesn't get > + * lost when the SPTE is marked for access tracking. > + */ > + if (is_writable_pte(spte)) > + kvm_set_pfn_dirty(spte_to_pfn(spte)); > + > + spte = mark_spte_for_access_track(spte); > + mmu_spte_update_no_track(sptep, spte); > + return true; > + } > + > + return false; > +} > + > static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu) > { > /* > @@ -1434,7 +1535,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, > restart: > for_each_rmap_spte(rmap_head, &iter, sptep) { > rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n", > - sptep, *sptep, gfn, level); > + sptep, *sptep, gfn, level); > > need_flush = 1; > > @@ -1447,7 +1548,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, > > new_spte &= ~PT_WRITABLE_MASK; > new_spte &= ~SPTE_HOST_WRITEABLE; > - new_spte &= ~shadow_accessed_mask; > + > + new_spte = mark_spte_for_access_track(new_spte); > > mmu_spte_clear_track_bits(sptep); > mmu_spte_set(sptep, new_spte); > @@ -1609,15 +1711,8 @@ static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, > struct rmap_iterator uninitialized_var(iter); > int young = 0; > > - BUG_ON(!shadow_accessed_mask); > - > - for_each_rmap_spte(rmap_head, &iter, sptep) { > - if (*sptep & shadow_accessed_mask) { > - young = 1; > - clear_bit((ffs(shadow_accessed_mask) - 1), > - (unsigned long *)sptep); > - } > - } > + for_each_rmap_spte(rmap_head, &iter, sptep) > + young |= mmu_spte_age(sptep); > > trace_kvm_age_page(gfn, level, slot, young); > return young; > @@ -1631,11 +1726,11 @@ static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, > struct rmap_iterator iter; > > /* > - * If there's no access bit in the secondary pte set by the > - * hardware it's up to gup-fast/gup to set the access bit in > - * the primary pte or in the page structure. > + * If there's no access bit in the secondary pte set by the hardware and > + * fast access tracking is also not enabled, it's up to gup-fast/gup to > + * set the access bit in the primary pte or in the page structure. > */ > - if (!shadow_accessed_mask) > + if (!shadow_accessed_mask && !shadow_acc_track_mask) > goto out; > > for_each_rmap_spte(rmap_head, &iter, sptep) > @@ -1670,7 +1765,7 @@ int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) > * This has some overhead, but not as much as the cost of swapping > * out actively used pages or breaking up actively used hugepages. 
> */ > - if (!shadow_accessed_mask) > + if (!shadow_accessed_mask && !shadow_acc_track_mask) > return kvm_handle_hva_range(kvm, start, end, 0, > kvm_unmap_rmapp); > > @@ -2593,6 +2688,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, > spte |= shadow_dirty_mask; > } > > + if (speculative) > + spte = mark_spte_for_access_track(spte); > + > set_pte: > if (mmu_spte_update(sptep, spte)) > kvm_flush_remote_tlbs(vcpu->kvm); > @@ -2646,7 +2744,7 @@ static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, > pgprintk("%s: setting spte %llx\n", __func__, *sptep); > pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n", > is_large_pte(*sptep)? "2MB" : "4kB", > - *sptep & PT_PRESENT_MASK ?"RW":"R", gfn, > + *sptep & PT_WRITABLE_MASK ? "RW" : "R", gfn, > *sptep, sptep); > if (!was_rmapped && is_large_pte(*sptep)) > ++vcpu->kvm->stat.lpages; > @@ -2879,16 +2977,28 @@ static bool page_fault_can_be_fast(u32 error_code) > if (unlikely(error_code & PFERR_RSVD_MASK)) > return false; > > - /* > - * #PF can be fast only if the shadow page table is present and it > - * is caused by write-protect, that means we just need change the > - * W bit of the spte which can be done out of mmu-lock. > - */ > - if (!(error_code & PFERR_PRESENT_MASK) || > - !(error_code & PFERR_WRITE_MASK)) > + /* See if the page fault is due to an NX violation */ > + if (unlikely(((error_code & (PFERR_FETCH_MASK | PFERR_PRESENT_MASK)) > + == (PFERR_FETCH_MASK | PFERR_PRESENT_MASK)))) > return false; > > - return true; > + /* > + * #PF can be fast if: > + * 1. The shadow page table entry is not present, which could mean that > + * the fault is potentially caused by access tracking (if enabled). > + * 2. The shadow page table entry is present and the fault > + * is caused by write-protect, that means we just need change the W > + * bit of the spte which can be done out of mmu-lock. > + * > + * However, if access tracking is disabled we know that a non-present > + * page must be a genuine page fault where we have to create a new SPTE. > + * So, if access tracking is disabled, we return true only for write > + * accesses to a present page. > + */ > + > + return shadow_acc_track_mask != 0 || > + ((error_code & (PFERR_WRITE_MASK | PFERR_PRESENT_MASK)) > + == (PFERR_WRITE_MASK | PFERR_PRESENT_MASK)); acc-track can not fix a WRITE-access, this should be: !(error_code & (PFERR_WRITE_MASK)) && shadow_acc_track_mask != 0 || ... > } > > /* > @@ -2897,17 +3007,26 @@ static bool page_fault_can_be_fast(u32 error_code) > */ > static bool > fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, > - u64 *sptep, u64 spte) > + u64 *sptep, u64 old_spte, > + bool remove_write_prot, bool remove_acc_track) > { > gfn_t gfn; > + u64 new_spte = old_spte; > > WARN_ON(!sp->role.direct); > > - /* > - * The gfn of direct spte is stable since it is calculated > - * by sp->gfn. 
> - */ > - gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); > + if (remove_acc_track) { > + u64 saved_bits = (old_spte >> shadow_acc_track_saved_bits_shift) > + & shadow_acc_track_saved_bits_mask; > + > + new_spte &= ~shadow_acc_track_mask; > + new_spte &= ~(shadow_acc_track_saved_bits_mask << > + shadow_acc_track_saved_bits_shift); > + new_spte |= saved_bits; > + } > + > + if (remove_write_prot) > + new_spte |= PT_WRITABLE_MASK; > > /* > * Theoretically we could also set dirty bit (and flush TLB) here in > @@ -2921,10 +3040,17 @@ fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, > * > * Compare with set_spte where instead shadow_dirty_mask is set. > */ > - if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) != spte) > + if (cmpxchg64(sptep, old_spte, new_spte) != old_spte) > return false; > > - kvm_vcpu_mark_page_dirty(vcpu, gfn); > + if (remove_write_prot) { > + /* > + * The gfn of direct spte is stable since it is > + * calculated by sp->gfn. > + */ > + gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); > + kvm_vcpu_mark_page_dirty(vcpu, gfn); > + } > > return true; > } > @@ -2955,35 +3081,55 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, > break; > > do { > - /* > - * If the mapping has been changed, let the vcpu fault on the > - * same address again. > - */ > - if (!is_shadow_present_pte(spte)) { > - fault_handled = true; > - break; > - } > + bool remove_write_prot = false; > + bool remove_acc_track; > > sp = page_header(__pa(iterator.sptep)); > if (!is_last_spte(spte, sp->role.level)) > break; > > /* > - * Check if it is a spurious fault caused by TLB lazily flushed. > + * Check whether the memory access that caused the fault would > + * still cause it if it were to be performed right now. If not, > + * then this is a spurious fault caused by TLB lazily flushed, > + * or some other CPU has already fixed the PTE after the > + * current CPU took the fault. > * > * Need not check the access of upper level table entries since > * they are always ACC_ALL. > */ > - if (is_writable_pte(spte)) { > - fault_handled = true; > - break; > + > + if (error_code & PFERR_FETCH_MASK) { > + if ((spte & (shadow_x_mask | shadow_nx_mask)) > + == shadow_x_mask) { > + fault_handled = true; > + break; > + } > + } else if (error_code & PFERR_WRITE_MASK) { > + if (is_writable_pte(spte)) { > + fault_handled = true; > + break; > + } > + > + /* > + * Currently, to simplify the code, write-protection can > + * be removed in the fast path only if the SPTE was > + * write-protected for dirty-logging. > + */ > + remove_write_prot = > + spte_can_locklessly_be_made_writable(spte); > + } else { > + /* Fault was on Read access */ > + if (spte & PT_PRESENT_MASK) { > + fault_handled = true; > + break; > + } > } > > - /* > - * Currently, to simplify the code, only the spte > - * write-protected by dirty-log can be fast fixed. > - */ > - if (!spte_can_locklessly_be_made_writable(spte)) > + remove_acc_track = is_access_track_spte(spte); > + Why not check cached R/X permission can satisfy R/X access before goto atomic path? > + /* Verify that the fault can be handled in the fast path */ > + if (!remove_acc_track && !remove_write_prot) > break; > > /* > @@ -2997,7 +3143,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, > * > * See the comments in kvm_arch_commit_memory_region(). 
> */ > - if (sp->role.level > PT_PAGE_TABLE_LEVEL) > + if (sp->role.level > PT_PAGE_TABLE_LEVEL && remove_write_prot) > break; > > /* > @@ -3006,7 +3152,9 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, > * Documentation/virtual/kvm/locking.txt to get more detail. > */ > fault_handled = fast_pf_fix_direct_spte(vcpu, sp, > - iterator.sptep, spte); > + iterator.sptep, spte, > + remove_write_prot, > + remove_acc_track); > if (fault_handled) > break; > > @@ -5095,6 +5243,8 @@ static void mmu_destroy_caches(void) > > int kvm_mmu_module_init(void) > { > + kvm_mmu_clear_all_pte_masks(); > + > pte_list_desc_cache = kmem_cache_create("pte_list_desc", > sizeof(struct pte_list_desc), > 0, 0, NULL); > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c > index 6a01e755..50fc078 100644 > --- a/arch/x86/kvm/vmx.c > +++ b/arch/x86/kvm/vmx.c > @@ -6318,6 +6318,19 @@ static void wakeup_handler(void) > spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); > } > > +void vmx_enable_tdp(void) > +{ > + kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK, > + enable_ept_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull, > + enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull, > + 0ull, VMX_EPT_EXECUTABLE_MASK, > + cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK, > + enable_ept_ad_bits ? 0ull : SPTE_SPECIAL_MASK | VMX_EPT_RWX_MASK); I think commonly set SPTE_SPECIAL_MASK (move set SPTE_SPECIAL_MASK to mmu.c) for mmio-mask and acc-track-mask can make the code more clearer... Thanks!