From: Gleb Natapov <gleb@kernel.org>
To: mtosatti@redhat.com
Cc: kvm@vger.kernel.org, ak@linux.intel.com, pbonzini@redhat.com,
xiaoguangrong@linux.vnet.ibm.com, avi@cloudius-systems.com
Subject: Re: [patch 2/5] KVM: MMU: allow pinning spte translations (TDP-only)
Date: Thu, 19 Jun 2014 10:21:16 +0300 [thread overview]
Message-ID: <20140619072116.GC10948@minantech.com> (raw)
In-Reply-To: <20140618231521.569025131@amt.cnet>
On Wed, Jun 18, 2014 at 08:12:05PM -0300, mtosatti@redhat.com wrote:
> Allow vcpus to pin spte translations by:
>
> 1) Creating a per-vcpu list of pinned ranges.
What if the memory slot containing a pinned range goes away?
> 2) On mmu reload request:
> - Fault ranges.
> - Mark sptes with a pinned bit.
Should also be marked "dirty", as per the SDM:
"The three DS save area sections should be allocated from a non-paged pool, and marked accessed and dirty."
Some comments below.
> - Mark shadow pages as pinned.
>
> 3) Then modify the following actions:
> - Page age => skip spte flush.
> - MMU notifiers => force mmu reload request (which kicks cpu out of
> guest mode).
> - GET_DIRTY_LOG => force mmu reload request.
> - SLAB shrinker => skip shadow page deletion.
>
> TDP-only.
>
> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
>
> ---
> arch/x86/include/asm/kvm_host.h | 14 ++
> arch/x86/kvm/mmu.c | 202 ++++++++++++++++++++++++++++++++++++++--
> arch/x86/kvm/mmu.h | 5
> arch/x86/kvm/mmutrace.h | 23 ++++
> arch/x86/kvm/paging_tmpl.h | 2
> arch/x86/kvm/x86.c | 4
> 6 files changed, 241 insertions(+), 9 deletions(-)
>
> Index: kvm.pinned-sptes/arch/x86/include/asm/kvm_host.h
> ===================================================================
> --- kvm.pinned-sptes.orig/arch/x86/include/asm/kvm_host.h 2014-06-18 17:28:17.549456614 -0300
> +++ kvm.pinned-sptes/arch/x86/include/asm/kvm_host.h 2014-06-18 17:28:24.338435658 -0300
> @@ -221,6 +221,8 @@
> /* hold the gfn of each spte inside spt */
> gfn_t *gfns;
> bool unsync;
> + bool pinned;
> +
> int root_count; /* Currently serving as active root */
> unsigned int unsync_children;
> unsigned long parent_ptes; /* Reverse mapping for parent_pte */
> @@ -337,6 +339,14 @@
> KVM_DEBUGREG_WONT_EXIT = 2,
> };
>
> +struct kvm_pinned_page_range {
> + gfn_t base_gfn;
> + unsigned long npages;
> + struct list_head link;
> +};
> +
> +#define KVM_MAX_PER_VCPU_PINNED_RANGE 10
> +
> struct kvm_vcpu_arch {
> /*
> * rip and regs accesses must go through
> @@ -392,6 +402,10 @@
> struct kvm_mmu_memory_cache mmu_page_cache;
> struct kvm_mmu_memory_cache mmu_page_header_cache;
>
> + struct list_head pinned_mmu_pages;
> + struct mutex pinned_mmu_mutex;
> + unsigned int nr_pinned_ranges;
> +
> struct fpu guest_fpu;
> u64 xcr0;
> u64 guest_supported_xcr0;
> Index: kvm.pinned-sptes/arch/x86/kvm/mmu.c
> ===================================================================
> --- kvm.pinned-sptes.orig/arch/x86/kvm/mmu.c 2014-06-18 17:28:17.550456611 -0300
> +++ kvm.pinned-sptes/arch/x86/kvm/mmu.c 2014-06-18 17:28:24.339435654 -0300
> @@ -148,6 +148,9 @@
>
> #define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
> #define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
> +#define SPTE_PINNED (1ULL << (PT64_SECOND_AVAIL_BITS_SHIFT))
> +
> +#define SPTE_PINNED_BIT PT64_SECOND_AVAIL_BITS_SHIFT
>
> #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
>
> @@ -327,6 +330,11 @@
> return pte & PT_PRESENT_MASK && !is_mmio_spte(pte);
> }
>
> +static int is_pinned_spte(u64 spte)
> +{
> + return spte & SPTE_PINNED && is_shadow_present_pte(spte);
> +}
> +
> static int is_large_pte(u64 pte)
> {
> return pte & PT_PAGE_SIZE_MASK;
> @@ -2818,7 +2826,7 @@
> * - false: let the real page fault path to fix it.
> */
> static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
> - u32 error_code)
> + u32 error_code, bool pin)
> {
> struct kvm_shadow_walk_iterator iterator;
> struct kvm_mmu_page *sp;
> @@ -2828,6 +2836,9 @@
> if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
> return false;
>
> + if (pin)
> + return false;
> +
> if (!page_fault_can_be_fast(error_code))
> return false;
>
> @@ -2895,9 +2906,55 @@
> }
>
> static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
> - gva_t gva, pfn_t *pfn, bool write, bool *writable);
> + gva_t gva, pfn_t *pfn, bool write, bool *writable,
> + bool pin);
> static void make_mmu_pages_available(struct kvm_vcpu *vcpu);
>
> +
> +static int get_sptep_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes[4])
> +{
> + struct kvm_shadow_walk_iterator iterator;
> + int nr_sptes = 0;
> +
> + if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
> + return nr_sptes;
> +
> + for_each_shadow_entry(vcpu, addr, iterator) {
> + sptes[iterator.level-1] = iterator.sptep;
> + nr_sptes++;
> + if (!is_shadow_present_pte(*iterator.sptep))
> + break;
> + }
> +
> + return nr_sptes;
> +}
> +
> +static bool direct_pin_sptes(struct kvm_vcpu *vcpu, gfn_t gfn)
> +{
> + u64 *sptes[4];
> + int r, i, level;
> +
> + r = get_sptep_hierarchy(vcpu, gfn << PAGE_SHIFT, sptes);
> + if (!r)
> + return false;
> +
> + level = 5 - r;
> + if (!is_last_spte(*sptes[r-1], level))
> + return false;
> + if (!is_shadow_present_pte(*sptes[r-1]))
> + return false;
> +
> + for (i = 0; i < r; i++) {
> + u64 *sptep = sptes[i];
> + struct kvm_mmu_page *sp = page_header(__pa(sptep));
> +
> + sp->pinned = true;
> + set_bit(SPTE_PINNED_BIT, (unsigned long *)sptep);
> + }
> +
> + return true;
> +}
> +
> static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
> gfn_t gfn, bool prefault, bool pin, bool *pinned)
> {
> @@ -2923,13 +2980,14 @@
> } else
> level = PT_PAGE_TABLE_LEVEL;
>
> - if (fast_page_fault(vcpu, v, level, error_code))
> + if (fast_page_fault(vcpu, v, level, error_code, pin))
> return 0;
>
> mmu_seq = vcpu->kvm->mmu_notifier_seq;
> smp_rmb();
>
> - if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
> + if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable,
> + pin))
> return 0;
>
> if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r))
> @@ -2943,6 +3001,8 @@
> transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
> r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn,
> prefault);
> + if (pin)
> + *pinned = direct_pin_sptes(vcpu, gfn);
> spin_unlock(&vcpu->kvm->mmu_lock);
>
>
> @@ -3349,7 +3409,8 @@
> }
>
> static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
> - gva_t gva, pfn_t *pfn, bool write, bool *writable)
> + gva_t gva, pfn_t *pfn, bool write, bool *writable,
> + bool pin)
> {
> bool async;
>
> @@ -3358,7 +3419,7 @@
> if (!async)
> return false; /* *pfn has correct page already */
>
> - if (!prefault && can_do_async_pf(vcpu)) {
> + if (!prefault && !pin && can_do_async_pf(vcpu)) {
> trace_kvm_try_async_get_page(gva, gfn);
> if (kvm_find_async_pf_gfn(vcpu, gfn)) {
> trace_kvm_async_pf_doublefault(gva, gfn);
> @@ -3406,13 +3467,14 @@
> } else
> level = PT_PAGE_TABLE_LEVEL;
>
> - if (fast_page_fault(vcpu, gpa, level, error_code))
> + if (fast_page_fault(vcpu, gpa, level, error_code, pin))
> return 0;
>
> mmu_seq = vcpu->kvm->mmu_notifier_seq;
> smp_rmb();
>
> - if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
> + if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable,
> + pin))
> return 0;
>
> if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r))
> @@ -3426,6 +3488,8 @@
> transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
> r = __direct_map(vcpu, gpa, write, map_writable,
> level, gfn, pfn, prefault);
> + if (pin)
> + *pinned = direct_pin_sptes(vcpu, gfn);
> spin_unlock(&vcpu->kvm->mmu_lock);
>
> return r;
> @@ -3903,6 +3967,127 @@
> }
> EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
>
> +int kvm_mmu_register_pinned_range(struct kvm_vcpu *vcpu,
> + gfn_t base_gfn, unsigned long npages)
> +{
> + struct kvm_pinned_page_range *p;
> +
> + mutex_lock(&vcpu->arch.pinned_mmu_mutex);
> + list_for_each_entry(p, &vcpu->arch.pinned_mmu_pages, link) {
> + if (p->base_gfn == base_gfn && p->npages == npages) {
> + mutex_unlock(&vcpu->arch.pinned_mmu_mutex);
> + return -EEXIST;
> + }
> + }
> + mutex_unlock(&vcpu->arch.pinned_mmu_mutex);
> +
> + if (vcpu->arch.nr_pinned_ranges >=
> + KVM_MAX_PER_VCPU_PINNED_RANGE)
> + return -ENOSPC;
Shouldn't we refuse to register a pinned range if !TDP?
> +
> + p = kzalloc(sizeof(struct kvm_pinned_page_range), GFP_KERNEL);
> + if (!p)
> + return -ENOMEM;
> +
> + vcpu->arch.nr_pinned_ranges++;
> +
> + trace_kvm_mmu_register_pinned_range(vcpu->vcpu_id, base_gfn, npages);
> +
> + INIT_LIST_HEAD(&p->link);
> + p->base_gfn = base_gfn;
> + p->npages = npages;
> + mutex_lock(&vcpu->arch.pinned_mmu_mutex);
> + list_add(&p->link, &vcpu->arch.pinned_mmu_pages);
> + mutex_unlock(&vcpu->arch.pinned_mmu_mutex);
> + kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
> +
> + return 0;
> +}
> +
> +int kvm_mmu_unregister_pinned_range(struct kvm_vcpu *vcpu,
> + gfn_t base_gfn, unsigned long npages)
> +{
> + struct kvm_pinned_page_range *p;
> +
> + mutex_lock(&vcpu->arch.pinned_mmu_mutex);
> + list_for_each_entry(p, &vcpu->arch.pinned_mmu_pages, link) {
> + if (p->base_gfn == base_gfn && p->npages == npages) {
> + list_del(&p->link);
> + vcpu->arch.nr_pinned_ranges--;
> + mutex_unlock(&vcpu->arch.pinned_mmu_mutex);
> + kfree(p);
> + return 0;
> + }
> + }
> +
> + mutex_unlock(&vcpu->arch.pinned_mmu_mutex);
> + return -ENOENT;
> +}
> +
> +void kvm_mmu_free_pinned_ranges(struct kvm_vcpu *vcpu)
> +{
> + struct kvm_pinned_page_range *p, *p2;
> +
> + mutex_lock(&vcpu->arch.pinned_mmu_mutex);
> + list_for_each_entry_safe(p, p2, &vcpu->arch.pinned_mmu_pages, link) {
> + list_del(&p->link);
> + kfree(p);
> + }
> + mutex_unlock(&vcpu->arch.pinned_mmu_mutex);
> +}
> +
> +/*
> + * Pin KVM MMU page translations. This guarantees, for valid
> + * addresses registered by kvm_mmu_register_pinned_range (valid address
> + * meaning address which posses sufficient information for fault to
> + * be resolved), valid translations exist while in guest mode and
> + * therefore no VM-exits due to faults will occur.
> + *
> + * Failure to instantiate pages will abort guest entry.
> + *
> + * Page frames should be pinned with get_page in advance.
> + *
> + * Pinning is not guaranteed while executing as L2 guest.
> + *
> + */
> +
> +static void kvm_mmu_pin_pages(struct kvm_vcpu *vcpu)
> +{
> + struct kvm_pinned_page_range *p;
> +
> + if (is_guest_mode(vcpu))
> + return;
> +
> + if (!vcpu->arch.mmu.direct_map)
> + return;
> +
> + ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
> +
> + mutex_lock(&vcpu->arch.pinned_mmu_mutex);
> + list_for_each_entry(p, &vcpu->arch.pinned_mmu_pages, link) {
> + gfn_t gfn_offset;
> +
> + for (gfn_offset = 0; gfn_offset < p->npages; gfn_offset++) {
> + gfn_t gfn = p->base_gfn + gfn_offset;
> + int r;
> + bool pinned = false;
> +
> + r = vcpu->arch.mmu.page_fault(vcpu, gfn << PAGE_SHIFT,
> + PFERR_WRITE_MASK, false,
> + true, &pinned);
> + /* MMU notifier sequence window: retry */
> + if (!r && !pinned)
> + kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
> + if (r) {
> + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
I do not think a triple fault is appropriate here. The reasons for a triple fault are
documented in the SDM, and this is not one of them. What about an error exit to user space?
> + break;
> + }
> +
> + }
> + }
> + mutex_unlock(&vcpu->arch.pinned_mmu_mutex);
> +}
> +
> int kvm_mmu_load(struct kvm_vcpu *vcpu)
> {
> int r;
> @@ -3916,6 +4101,7 @@
> goto out;
> /* set_cr3() should ensure TLB has been flushed */
> vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
> + kvm_mmu_pin_pages(vcpu);
> out:
> return r;
> }
> Index: kvm.pinned-sptes/arch/x86/kvm/mmu.h
> ===================================================================
> --- kvm.pinned-sptes.orig/arch/x86/kvm/mmu.h 2014-06-18 17:27:47.582549238 -0300
> +++ kvm.pinned-sptes/arch/x86/kvm/mmu.h 2014-06-18 17:28:24.339435654 -0300
> @@ -178,4 +178,9 @@
> }
>
> void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm);
> +int kvm_mmu_register_pinned_range(struct kvm_vcpu *vcpu,
> + gfn_t base_gfn, unsigned long npages);
> +int kvm_mmu_unregister_pinned_range(struct kvm_vcpu *vcpu,
> + gfn_t base_gfn, unsigned long npages);
> +void kvm_mmu_free_pinned_ranges(struct kvm_vcpu *vcpu);
> #endif
> Index: kvm.pinned-sptes/arch/x86/kvm/x86.c
> ===================================================================
> --- kvm.pinned-sptes.orig/arch/x86/kvm/x86.c 2014-06-18 17:28:17.552456605 -0300
> +++ kvm.pinned-sptes/arch/x86/kvm/x86.c 2014-06-18 17:28:24.340435651 -0300
> @@ -7049,6 +7049,8 @@
>
> kvm_async_pf_hash_reset(vcpu);
> kvm_pmu_init(vcpu);
> + INIT_LIST_HEAD(&vcpu->arch.pinned_mmu_pages);
> + mutex_init(&vcpu->arch.pinned_mmu_mutex);
>
> return 0;
> fail_free_wbinvd_dirty_mask:
> @@ -7069,6 +7071,7 @@
> {
> int idx;
>
> + kvm_mmu_free_pinned_ranges(vcpu);
> kvm_pmu_destroy(vcpu);
> kfree(vcpu->arch.mce_banks);
> kvm_free_lapic(vcpu);
> @@ -7113,6 +7116,7 @@
> int r;
> r = vcpu_load(vcpu);
> BUG_ON(r);
> + kvm_mmu_free_pinned_ranges(vcpu);
> kvm_mmu_unload(vcpu);
> vcpu_put(vcpu);
> }
> Index: kvm.pinned-sptes/arch/x86/kvm/paging_tmpl.h
> ===================================================================
> --- kvm.pinned-sptes.orig/arch/x86/kvm/paging_tmpl.h 2014-06-18 17:28:17.550456611 -0300
> +++ kvm.pinned-sptes/arch/x86/kvm/paging_tmpl.h 2014-06-18 17:28:24.340435651 -0300
> @@ -747,7 +747,7 @@
> smp_rmb();
>
> if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault,
> - &map_writable))
> + &map_writable, false))
> return 0;
>
> if (handle_abnormal_pfn(vcpu, mmu_is_nested(vcpu) ? 0 : addr,
> Index: kvm.pinned-sptes/arch/x86/kvm/mmutrace.h
> ===================================================================
> --- kvm.pinned-sptes.orig/arch/x86/kvm/mmutrace.h 2014-06-18 17:27:47.583549234 -0300
> +++ kvm.pinned-sptes/arch/x86/kvm/mmutrace.h 2014-06-18 17:28:24.340435651 -0300
> @@ -322,6 +322,29 @@
> __entry->kvm_gen == __entry->spte_gen
> )
> );
> +
> +TRACE_EVENT(
> + kvm_mmu_register_pinned_range,
> + TP_PROTO(unsigned int vcpu_id, gfn_t gfn, unsigned long npages),
> + TP_ARGS(vcpu_id, gfn, npages),
> +
> + TP_STRUCT__entry(
> + __field( unsigned int, vcpu_id )
> + __field( gfn_t, gfn )
> + __field( unsigned long, npages )
> + ),
> +
> + TP_fast_assign(
> + __entry->vcpu_id = vcpu_id;
> + __entry->gfn = gfn;
> + __entry->npages = npages;
> + ),
> +
> + TP_printk("vcpu_id %u gfn %llx npages %lx",
> + __entry->vcpu_id,
> + __entry->gfn,
> + __entry->npages)
> +);
> #endif /* _TRACE_KVMMMU_H */
>
> #undef TRACE_INCLUDE_PATH
>
>
--
Gleb.
next prev parent reply other threads:[~2014-06-19 7:21 UTC|newest]
Thread overview: 27+ messages / expand[flat|nested] mbox.gz Atom feed top
2014-06-18 23:12 [patch 0/5] KVM: support for pinning sptes mtosatti
2014-06-18 23:12 ` [patch 1/5] KVM: x86: add pinned parameter to page_fault methods mtosatti
2014-06-18 23:12 ` [patch 2/5] KVM: MMU: allow pinning spte translations (TDP-only) mtosatti
2014-06-19 7:21 ` Gleb Natapov [this message]
2014-06-19 19:22 ` Marcelo Tosatti
2014-06-20 10:09 ` Gleb Natapov
2014-06-30 20:46 ` Marcelo Tosatti
2014-06-30 22:00 ` Andi Kleen
2014-06-19 8:01 ` Avi Kivity
2014-06-19 14:06 ` Andi Kleen
2014-06-19 18:26 ` Marcelo Tosatti
2014-06-22 13:35 ` Avi Kivity
2014-07-09 13:25 ` Marcelo Tosatti
2014-07-02 0:58 ` Nadav Amit
2014-06-18 23:12 ` [patch 3/5] KVM: MMU: notifiers support for pinned sptes mtosatti
2014-06-19 6:48 ` Gleb Natapov
2014-06-19 18:28 ` Marcelo Tosatti
2014-06-20 10:11 ` Gleb Natapov
2014-06-18 23:12 ` [patch 4/5] KVM: MMU: reload request from GET_DIRTY_LOG path mtosatti
2014-06-19 8:17 ` Gleb Natapov
2014-06-19 18:40 ` Marcelo Tosatti
2014-06-20 10:46 ` Gleb Natapov
2014-06-30 20:59 ` Marcelo Tosatti
2014-07-01 6:27 ` Gleb Natapov
2014-07-01 17:50 ` Marcelo Tosatti
2014-06-18 23:12 ` [patch 5/5] KVM: MMU: pinned sps are not candidates for deletion mtosatti
2014-06-19 1:44 ` [patch 0/5] KVM: support for pinning sptes Andi Kleen
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20140619072116.GC10948@minantech.com \
--to=gleb@kernel.org \
--cc=ak@linux.intel.com \
--cc=avi@cloudius-systems.com \
--cc=kvm@vger.kernel.org \
--cc=mtosatti@redhat.com \
--cc=pbonzini@redhat.com \
--cc=xiaoguangrong@linux.vnet.ibm.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.