From: Ben Gardon <bgardon@google.com>
To: kvm@vger.kernel.org
Cc: Paolo Bonzini <pbonzini@redhat.com>,
Peter Feiner <pfeiner@google.com>,
Peter Shier <pshier@google.com>,
Junaid Shahid <junaids@google.com>,
Jim Mattson <jmattson@google.com>,
Ben Gardon <bgardon@google.com>
Subject: [RFC PATCH 16/28] kvm: mmu: Add direct MMU page fault handler
Date: Thu, 26 Sep 2019 16:18:12 -0700 [thread overview]
Message-ID: <20190926231824.149014-17-bgardon@google.com> (raw)
In-Reply-To: <20190926231824.149014-1-bgardon@google.com>
Adds handler functions to replace __direct_map in handling direct page
faults. These functions, unlike __direct_map, can handle page faults on
multiple VCPUs simultaneously.
Signed-off-by: Ben Gardon <bgardon@google.com>
---
arch/x86/kvm/mmu.c | 192 ++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 179 insertions(+), 13 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index f0696658b527c..f3a26a32c8174 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1117,6 +1117,24 @@ static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu)
return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache);
}
+/*
+ * Return an unused object to the specified cache by appending it to
+ * mc->objects. If the object's memory was modified after it was allocated
+ * from the cache, it should be zeroed before being returned.
+ */
+static void mmu_memory_cache_return(struct kvm_mmu_memory_cache *mc,
+ void *obj)
+{
+ /*
+ * Since this object was allocated from the cache, the cache should
+ * have at least one free slot to put the object back into.
+ */
+ BUG_ON(mc->nobjs >= ARRAY_SIZE(mc->objects)); /* no free slot left */
+
+ mc->objects[mc->nobjs] = obj; /* store obj at index nobjs ... */
+ mc->nobjs++; /* ... then bump the object count */
+}
+
static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
{
kmem_cache_free(pte_list_desc_cache, pte_list_desc);
@@ -2426,6 +2444,21 @@ static bool direct_walk_iterator_set_pte(struct direct_walk_iterator *iter,
return r;
}
+static u64 generate_nonleaf_pte(u64 *child_pt, bool ad_disabled)
+{ /* Build a PTE holding __pa(child_pt) plus the mask bits ORed in below. */
+ u64 pte;
+
+ pte = __pa(child_pt) | shadow_present_mask | PT_WRITABLE_MASK |
+ shadow_user_mask | shadow_x_mask | shadow_me_mask;
+
+ if (ad_disabled) /* picks acc-track value vs. accessed mask */
+ pte |= shadow_acc_track_value;
+ else
+ pte |= shadow_accessed_mask;
+
+ return pte;
+}
+
/**
* kvm_mmu_write_protect_pt_masked - write protect selected PT level pages
* @kvm: kvm instance
@@ -3432,13 +3465,7 @@ static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
- spte = __pa(sp->spt) | shadow_present_mask | PT_WRITABLE_MASK |
- shadow_user_mask | shadow_x_mask | shadow_me_mask;
-
- if (sp_ad_disabled(sp))
- spte |= shadow_acc_track_value;
- else
- spte |= shadow_accessed_mask;
+ spte = generate_nonleaf_pte(sp->spt, sp_ad_disabled(sp));
mmu_spte_set(sptep, spte);
@@ -4071,6 +4098,126 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
return ret;
}
+static int direct_page_fault_handle_target_level(struct kvm_vcpu *vcpu,
+ int write, int map_writable, struct direct_walk_iterator *iter,
+ kvm_pfn_t pfn, bool prefault)
+{ /* Generate and try to set the PTE for this fault at iter's current entry. */
+ u64 new_pte;
+ int ret = 0; /* stays 0 unless emulation is found to be needed below */
+ int generate_pte_ret = 0; /* generate_pte() result; 0 on the noslot path */
+
+ if (unlikely(is_noslot_pfn(pfn)))
+ new_pte = generate_mmio_pte(vcpu, iter->pte_gfn_start, ACC_ALL);
+ else {
+ generate_pte_ret = generate_pte(vcpu, ACC_ALL, iter->level,
+ iter->pte_gfn_start, pfn,
+ iter->old_pte, prefault, false,
+ map_writable, false, &new_pte);
+ /* Failed to construct a PTE. Retry the page fault. */
+ if (!new_pte)
+ return RET_PF_RETRY;
+ }
+
+ /*
+ * If the page fault was caused by a write but the page is write
+ * protected, emulation is needed. If the emulation was skipped,
+ * the vcpu would have the same fault again.
+ */
+ if ((generate_pte_ret & SET_SPTE_WRITE_PROTECTED_PT) && write)
+ ret = RET_PF_EMULATE;
+
+ /* If the new PTE is an MMIO PTE, the MMIO will need to be emulated. */
+ if (unlikely(is_mmio_spte(new_pte)))
+ ret = RET_PF_EMULATE;
+
+ /*
+ * If this would not change the PTE then some other thread must have
+ * already fixed the page fault and there's no need to proceed.
+ */
+ if (iter->old_pte == new_pte)
+ return ret;
+
+ /*
+ * If this warning were to trigger, it would indicate that there was a
+ * missing MMU notifier or this thread raced with some notifier
+ * handler. The page fault handler should never change a present, leaf
+ * PTE to point to a different PFN. A notifier handler should have
+ * zapped the PTE before the main MM's page table was changed.
+ */
+ WARN_ON(is_present_direct_pte(iter->old_pte) &&
+ is_present_direct_pte(new_pte) &&
+ is_last_spte(iter->old_pte, iter->level) &&
+ is_last_spte(new_pte, iter->level) &&
+ spte_to_pfn(iter->old_pte) != spte_to_pfn(new_pte));
+
+ /*
+ * If the page fault handler lost the race to set the PTE, retry the
+ * page fault.
+ */
+ if (!direct_walk_iterator_set_pte(iter, new_pte))
+ return RET_PF_RETRY;
+
+ /*
+ * Update some stats for this page fault, if the page
+ * fault was not speculative.
+ */
+ if (!prefault)
+ vcpu->stat.pf_fixed++;
+
+ return ret;
+
+}
+
+static int handle_direct_page_fault(struct kvm_vcpu *vcpu,
+ unsigned long mmu_seq, int write, int map_writable, int level,
+ gpa_t gpa, gfn_t gfn, kvm_pfn_t pfn, bool prefault)
+{ /* Walk the PTEs covering gpa down to level; mmu_seq and gfn are unused here. */
+ struct direct_walk_iterator iter;
+ struct kvm_mmu_memory_cache *pf_pt_cache = &vcpu->arch.mmu_page_cache;
+ u64 *child_pt;
+ u64 new_pte;
+ int ret = RET_PF_RETRY; /* returned as-is if the target level is never reached */
+
+ direct_walk_iterator_setup_walk(&iter, vcpu->kvm,
+ kvm_arch_vcpu_memslots_id(vcpu), gpa >> PAGE_SHIFT,
+ (gpa >> PAGE_SHIFT) + 1, MMU_READ_LOCK);
+ while (direct_walk_iterator_next_pte(&iter)) {
+ if (iter.level == level) {
+ ret = direct_page_fault_handle_target_level(vcpu,
+ write, map_writable, &iter, pfn,
+ prefault);
+
+ break;
+ } else if (!is_present_direct_pte(iter.old_pte) ||
+ is_large_pte(iter.old_pte)) {
+ /*
+ * The leaf PTE for this fault must be mapped at a
+ * lower level, so a non-leaf PTE must be inserted into
+ * the paging structure. If the assignment below
+ * succeeds, it will add the non-leaf PTE and a new
+ * page of page table memory. Then the iterator can
+ * traverse into that new page. If the atomic compare/
+ * exchange fails, the iterator will repeat the current
+ * PTE, so the only thing this function must do
+ * differently is return the page table memory to the
+ * vCPU's page cache (pf_pt_cache).
+ */
+ child_pt = mmu_memory_cache_alloc(pf_pt_cache);
+ new_pte = generate_nonleaf_pte(child_pt, false);
+
+ if (!direct_walk_iterator_set_pte(&iter, new_pte))
+ mmu_memory_cache_return(pf_pt_cache, child_pt);
+ }
+ }
+ direct_walk_iterator_end_traversal(&iter);
+
+ /* If emulating, request a TLB flush for this vcpu. */
+ if (ret == RET_PF_EMULATE)
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+
+ return ret;
+}
+
static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
{
send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, PAGE_SHIFT, tsk);
@@ -5014,7 +5161,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
gfn_t gfn = gpa >> PAGE_SHIFT;
unsigned long mmu_seq;
int write = error_code & PFERR_WRITE_MASK;
- bool map_writable;
+ bool map_writable = false;
MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa));
@@ -5035,8 +5182,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
}
- if (fast_page_fault(vcpu, gpa, level, error_code))
- return RET_PF_RETRY;
+ if (!vcpu->kvm->arch.direct_mmu_enabled)
+ if (fast_page_fault(vcpu, gpa, level, error_code))
+ return RET_PF_RETRY;
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
@@ -5048,17 +5196,31 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
return r;
r = RET_PF_RETRY;
- write_lock(&vcpu->kvm->mmu_lock);
+ if (vcpu->kvm->arch.direct_mmu_enabled)
+ read_lock(&vcpu->kvm->mmu_lock);
+ else
+ write_lock(&vcpu->kvm->mmu_lock);
+
if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
goto out_unlock;
if (make_mmu_pages_available(vcpu) < 0)
goto out_unlock;
if (likely(!force_pt_level))
transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
- r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, prefault);
+
+ if (vcpu->kvm->arch.direct_mmu_enabled)
+ r = handle_direct_page_fault(vcpu, mmu_seq, write, map_writable,
+ level, gpa, gfn, pfn, prefault);
+ else
+ r = __direct_map(vcpu, gpa, write, map_writable, level, pfn,
+ prefault);
out_unlock:
- write_unlock(&vcpu->kvm->mmu_lock);
+ if (vcpu->kvm->arch.direct_mmu_enabled)
+ read_unlock(&vcpu->kvm->mmu_lock);
+ else
+ write_unlock(&vcpu->kvm->mmu_lock);
+
kvm_release_pfn_clean(pfn);
return r;
}
@@ -6242,6 +6404,10 @@ static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
{
LIST_HEAD(invalid_list);
+ if (vcpu->arch.mmu->direct_map && vcpu->kvm->arch.direct_mmu_enabled)
+ /* Reclaim is a todo. */
+ return true;
+
if (likely(kvm_mmu_available_pages(vcpu->kvm) >= KVM_MIN_FREE_MMU_PAGES))
return 0;
--
2.23.0.444.g18eeb5a265-goog
next prev parent reply other threads:[~2019-09-26 23:19 UTC|newest]
Thread overview: 57+ messages / expand[flat|nested] mbox.gz Atom feed top
2019-09-26 23:17 [RFC PATCH 00/28] kvm: mmu: Rework the x86 TDP direct mapped case Ben Gardon
2019-09-26 23:17 ` [RFC PATCH 01/28] kvm: mmu: Separate generating and setting mmio ptes Ben Gardon
2019-11-27 18:15 ` Sean Christopherson
2019-09-26 23:17 ` [RFC PATCH 02/28] kvm: mmu: Separate pte generation from set_spte Ben Gardon
2019-11-27 18:25 ` Sean Christopherson
2019-09-26 23:17 ` [RFC PATCH 03/28] kvm: mmu: Zero page cache memory at allocation time Ben Gardon
2019-11-27 18:32 ` Sean Christopherson
2019-09-26 23:18 ` [RFC PATCH 04/28] kvm: mmu: Update the lpages stat atomically Ben Gardon
2019-11-27 18:39 ` Sean Christopherson
2019-12-06 20:10 ` Ben Gardon
2019-09-26 23:18 ` [RFC PATCH 05/28] sched: Add cond_resched_rwlock Ben Gardon
2019-11-27 18:42 ` Sean Christopherson
2019-12-06 20:12 ` Ben Gardon
2019-09-26 23:18 ` [RFC PATCH 06/28] kvm: mmu: Replace mmu_lock with a read/write lock Ben Gardon
2019-11-27 18:47 ` Sean Christopherson
2019-12-02 22:45 ` Sean Christopherson
2019-09-26 23:18 ` [RFC PATCH 07/28] kvm: mmu: Add functions for handling changed PTEs Ben Gardon
2019-11-27 19:04 ` Sean Christopherson
2019-09-26 23:18 ` [RFC PATCH 08/28] kvm: mmu: Init / Uninit the direct MMU Ben Gardon
2019-12-02 23:40 ` Sean Christopherson
2019-12-06 20:25 ` Ben Gardon
2019-09-26 23:18 ` [RFC PATCH 09/28] kvm: mmu: Free direct MMU page table memory in an RCU callback Ben Gardon
2019-09-26 23:18 ` [RFC PATCH 10/28] kvm: mmu: Flush TLBs before freeing direct MMU page table memory Ben Gardon
2019-12-02 23:46 ` Sean Christopherson
2019-12-06 20:31 ` Ben Gardon
2019-09-26 23:18 ` [RFC PATCH 11/28] kvm: mmu: Optimize for freeing direct MMU PTs on teardown Ben Gardon
2019-12-02 23:54 ` Sean Christopherson
2019-09-26 23:18 ` [RFC PATCH 12/28] kvm: mmu: Set tlbs_dirty atomically Ben Gardon
2019-12-03 0:13 ` Sean Christopherson
2019-09-26 23:18 ` [RFC PATCH 13/28] kvm: mmu: Add an iterator for concurrent paging structure walks Ben Gardon
2019-12-03 2:15 ` Sean Christopherson
2019-12-18 18:25 ` Ben Gardon
2019-12-18 19:14 ` Sean Christopherson
2019-09-26 23:18 ` [RFC PATCH 14/28] kvm: mmu: Batch updates to the direct mmu disconnected list Ben Gardon
2019-09-26 23:18 ` [RFC PATCH 15/28] kvm: mmu: Support invalidate_zap_all_pages Ben Gardon
2019-09-26 23:18 ` Ben Gardon [this message]
2020-01-08 17:20 ` [RFC PATCH 16/28] kvm: mmu: Add direct MMU page fault handler Peter Xu
2020-01-08 18:15 ` Ben Gardon
2020-01-08 19:00 ` Peter Xu
2019-09-26 23:18 ` [RFC PATCH 17/28] kvm: mmu: Add direct MMU fast " Ben Gardon
2019-09-26 23:18 ` [RFC PATCH 18/28] kvm: mmu: Add an hva range iterator for memslot GFNs Ben Gardon
2019-09-26 23:18 ` [RFC PATCH 19/28] kvm: mmu: Make address space ID a property of memslots Ben Gardon
2019-09-26 23:18 ` [RFC PATCH 20/28] kvm: mmu: Implement the invalidation MMU notifiers for the direct MMU Ben Gardon
2019-09-26 23:18 ` [RFC PATCH 21/28] kvm: mmu: Integrate the direct mmu with the changed pte notifier Ben Gardon
2019-09-26 23:18 ` [RFC PATCH 22/28] kvm: mmu: Implement access tracking for the direct MMU Ben Gardon
2019-09-26 23:18 ` [RFC PATCH 23/28] kvm: mmu: Make mark_page_dirty_in_slot usable from outside kvm_main Ben Gardon
2019-09-26 23:18 ` [RFC PATCH 24/28] kvm: mmu: Support dirty logging in the direct MMU Ben Gardon
2019-09-26 23:18 ` [RFC PATCH 25/28] kvm: mmu: Support kvm_zap_gfn_range " Ben Gardon
2019-09-26 23:18 ` [RFC PATCH 26/28] kvm: mmu: Integrate direct MMU with nesting Ben Gardon
2019-09-26 23:18 ` [RFC PATCH 27/28] kvm: mmu: Lazily allocate rmap when direct MMU is enabled Ben Gardon
2019-09-26 23:18 ` [RFC PATCH 28/28] kvm: mmu: Support MMIO in the direct MMU Ben Gardon
2019-10-17 18:50 ` [RFC PATCH 00/28] kvm: mmu: Rework the x86 TDP direct mapped case Sean Christopherson
2019-10-18 13:42 ` Paolo Bonzini
2019-11-27 19:09 ` Sean Christopherson
2019-12-06 19:55 ` Ben Gardon
2019-12-06 19:57 ` Sean Christopherson
2019-12-06 20:42 ` Ben Gardon
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20190926231824.149014-17-bgardon@google.com \
--to=bgardon@google.com \
--cc=jmattson@google.com \
--cc=junaids@google.com \
--cc=kvm@vger.kernel.org \
--cc=pbonzini@redhat.com \
--cc=pfeiner@google.com \
--cc=pshier@google.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.