From mboxrd@z Thu Jan  1 00:00:00 1970
From: Marcelo Tosatti
Subject: [patch 09/10] KVM: MMU: out of sync shadow core v2
Date: Thu, 18 Sep 2008 18:27:58 -0300
Message-ID: <20080918213337.148804603@localhost.localdomain>
References: <20080918212749.800177179@localhost.localdomain>
Cc: kvm@vger.kernel.org, "David S. Ahern" , Marcelo Tosatti
To: Avi Kivity
Return-path:
Received: from mx1.redhat.com ([66.187.233.31]:59372 "EHLO mx1.redhat.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1756562AbYIRVt6 (ORCPT ); Thu, 18 Sep 2008 17:49:58 -0400
Content-Disposition: inline; filename=kvm-oos-core
Sender: kvm-owner@vger.kernel.org
List-ID:

Allow guest pagetables to go out of sync.

Signed-off-by: Marcelo Tosatti

Index: kvm/arch/x86/kvm/mmu.c
===================================================================
--- kvm.orig/arch/x86/kvm/mmu.c
+++ kvm/arch/x86/kvm/mmu.c
@@ -148,6 +148,7 @@ struct kvm_shadow_walk {
 };
 
 typedef int (*mmu_parent_walk_fn) (struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp);
+typedef int (*mmu_unsync_fn) (struct kvm_mmu_page *sp, void *priv);
 
 static struct kmem_cache *pte_chain_cache;
 static struct kmem_cache *rmap_desc_cache;
@@ -942,6 +943,39 @@ static void nonpaging_invlpg(struct kvm_
 {
 }
 
+static int mmu_unsync_walk(struct kvm_mmu_page *parent, mmu_unsync_fn fn,
+                           void *priv)
+{
+        int i, ret;
+        struct kvm_mmu_page *sp = parent;
+
+        while (parent->unsync_children) {
+                for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+                        u64 ent = sp->spt[i];
+
+                        if (is_shadow_present_pte(ent)) {
+                                struct kvm_mmu_page *child;
+                                child = page_header(ent & PT64_BASE_ADDR_MASK);
+
+                                if (child->unsync_children) {
+                                        sp = child;
+                                        break;
+                                }
+                                if (child->unsync) {
+                                        ret = fn(child, priv);
+                                        if (ret)
+                                                return ret;
+                                }
+                        }
+                }
+                if (i == PT64_ENT_PER_PAGE) {
+                        sp->unsync_children = 0;
+                        sp = parent;
+                }
+        }
+        return 0;
+}
+
 static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
 {
         unsigned index;
@@ -962,6 +996,47 @@ static struct kvm_mmu_page *kvm_mmu_look
         return NULL;
 }
 
+static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+        WARN_ON(!sp->unsync);
+        sp->unsync = 0;
+        --kvm->stat.mmu_unsync;
+}
+
+static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp);
+
+static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+        if (sp->role.glevels != vcpu->arch.mmu.root_level) {
+                kvm_mmu_zap_page(vcpu->kvm, sp);
+                return 1;
+        }
+
+        rmap_write_protect(vcpu->kvm, sp->gfn);
+        if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
+                kvm_mmu_zap_page(vcpu->kvm, sp);
+                return 1;
+        }
+
+        kvm_mmu_flush_tlb(vcpu);
+        kvm_unlink_unsync_page(vcpu->kvm, sp);
+        return 0;
+}
+
+static int mmu_sync_fn(struct kvm_mmu_page *sp, void *priv)
+{
+        struct kvm_vcpu *vcpu = priv;
+
+        kvm_sync_page(vcpu, sp);
+        return (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock));
+}
+
+static void mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+        while (mmu_unsync_walk(sp, mmu_sync_fn, vcpu))
+                cond_resched_lock(&vcpu->kvm->mmu_lock);
+}
+
 static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
                                              gfn_t gfn,
                                              gva_t gaddr,
@@ -975,7 +1050,7 @@ static struct kvm_mmu_page *kvm_mmu_get_
         unsigned quadrant;
         struct hlist_head *bucket;
         struct kvm_mmu_page *sp;
-        struct hlist_node *node;
+        struct hlist_node *node, *tmp;
 
         role.word = 0;
         role.glevels = vcpu->arch.mmu.root_level;
@@ -991,8 +1066,18 @@ static struct kvm_mmu_page *kvm_mmu_get_
                  gfn, role.word);
         index = kvm_page_table_hashfn(gfn);
         bucket = &vcpu->kvm->arch.mmu_page_hash[index];
-        hlist_for_each_entry(sp, node, bucket, hash_link)
-                if (sp->gfn == gfn && sp->role.word == role.word) {
+        hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link)
+                if (sp->gfn == gfn) {
+                        if (sp->unsync)
+                                if (kvm_sync_page(vcpu, sp))
+                                        continue;
+
+                        if (sp->role.word != role.word)
+                                continue;
+
+                        if (sp->unsync_children)
+                                vcpu->arch.mmu.need_root_sync = 1;
+
                         mmu_page_add_parent_pte(vcpu, sp, parent_pte);
                         pgprintk("%s: found\n", __func__);
                         return sp;
@@ -1112,14 +1197,45 @@ static void kvm_mmu_unlink_parents(struc
         }
 }
 
+struct mmu_zap_walk {
+        struct kvm *kvm;
+        int zapped;
+};
+
+static int mmu_zap_fn(struct kvm_mmu_page *sp, void *private)
+{
+        struct mmu_zap_walk *zap_walk = private;
+
+        kvm_mmu_zap_page(zap_walk->kvm, sp);
+        zap_walk->zapped = 1;
+        return 0;
+}
+
+static int mmu_zap_unsync_children(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+        struct mmu_zap_walk mmu_zap_walk = {
+                .kvm = kvm,
+                .zapped = 0,
+        };
+
+        if (sp->role.level == PT_PAGE_TABLE_LEVEL)
+                return 0;
+        mmu_unsync_walk(sp, mmu_zap_fn, &mmu_zap_walk);
+        return mmu_zap_walk.zapped;
+}
+
 static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
+        int ret;
         ++kvm->stat.mmu_shadow_zapped;
+        ret = mmu_zap_unsync_children(kvm, sp);
         kvm_mmu_page_unlink_children(kvm, sp);
         kvm_mmu_unlink_parents(kvm, sp);
         kvm_flush_remote_tlbs(kvm);
         if (!sp->role.invalid && !sp->role.metaphysical)
                 unaccount_shadowed(kvm, sp->gfn);
+        if (sp->unsync)
+                kvm_unlink_unsync_page(kvm, sp);
         if (!sp->root_count) {
                 hlist_del(&sp->hash_link);
                 kvm_mmu_free_page(kvm, sp);
@@ -1129,7 +1245,7 @@ static int kvm_mmu_zap_page(struct kvm *
                 kvm_reload_remote_mmus(kvm);
         }
         kvm_mmu_reset_last_pte_updated(kvm);
-        return 0;
+        return ret;
 }
 
 /*
@@ -1221,10 +1337,57 @@ struct page *gva_to_page(struct kvm_vcpu
         return page;
 }
 
+static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+        sp->unsync_children = 1;
+        return 1;
+}
+
+static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+        unsigned index;
+        struct hlist_head *bucket;
+        struct kvm_mmu_page *s;
+        struct hlist_node *node, *n;
+
+        index = kvm_page_table_hashfn(sp->gfn);
+        bucket = &vcpu->kvm->arch.mmu_page_hash[index];
+        /* don't unsync if pagetable is shadowed with multiple roles */
+        hlist_for_each_entry_safe(s, node, n, bucket, hash_link) {
+                if (s->gfn != sp->gfn || s->role.metaphysical)
+                        continue;
+                if (s->role.word != sp->role.word)
+                        return 1;
+        }
+        mmu_parent_walk(vcpu, sp, unsync_walk_fn);
+        ++vcpu->kvm->stat.mmu_unsync;
+        sp->unsync = 1;
+        return 0;
+}
+
+static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
+                                  bool can_unsync)
+{
+        struct kvm_mmu_page *shadow;
+
+        shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
+        if (shadow) {
+                if (shadow->role.level != PT_PAGE_TABLE_LEVEL)
+                        return 1;
+                if (shadow->unsync)
+                        return 0;
+                if (can_unsync)
+                        return kvm_unsync_page(vcpu, shadow);
+                return 1;
+        }
+        return 0;
+}
+
 static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
                     unsigned pte_access, int user_fault,
                     int write_fault, int dirty, int largepage,
-                    gfn_t gfn, pfn_t pfn, bool speculative)
+                    gfn_t gfn, pfn_t pfn, bool speculative,
+                    bool can_unsync)
 {
         u64 spte;
         int ret = 0;
@@ -1251,7 +1414,6 @@ static int set_spte(struct kvm_vcp
 
         if ((pte_access & ACC_WRITE_MASK)
             || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
-                struct kvm_mmu_page *shadow;
 
                 if (largepage && has_wrprotected_page(vcpu->kvm, gfn)) {
                         ret = 1;
@@ -1261,8 +1423,7 @@ static int set_spte(struct kvm_vcp
 
                 spte |= PT_WRITABLE_MASK;
 
-                shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
-                if (shadow) {
+                if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
                         pgprintk("%s: found shadow page for %lx, marking ro\n",
                                  __func__, gfn);
                         ret = 1;
@@ -1280,7 +1441,6 @@ set_pte:
         return ret;
 }
 
-
 static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
                          unsigned pt_access, unsigned pte_access,
                          int user_fault, int write_fault, int dirty,
@@ -1318,7 +1478,7 @@ static void mmu_set_spte(struct kvm_vcpu
                 }
         }
         if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault,
                      dirty, largepage, gfn, pfn, speculative)) {
+                     dirty, largepage, gfn, pfn, speculative, true)) {
                 if (write_fault)
                         *ptwrite = 1;
                 if (was_writeble)
@@ -1539,10 +1699,6 @@ static void mmu_alloc_roots(struct kvm_v
         vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
 }
 
-static void mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
-{
-}
-
 static void mmu_sync_roots(struct kvm_vcpu *vcpu)
 {
         int i;
Index: kvm/include/asm-x86/kvm_host.h
===================================================================
--- kvm.orig/include/asm-x86/kvm_host.h
+++ kvm/include/asm-x86/kvm_host.h
@@ -195,6 +195,8 @@ struct kvm_mmu_page {
          */
         int multimapped;         /* More than one parent_pte? */
         int root_count;          /* Currently serving as active root */
+        bool unsync;
+        bool unsync_children;
         union {
                 u64 *parent_pte;               /* !multimapped */
                 struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */
@@ -226,6 +228,7 @@ struct kvm_mmu {
         hpa_t root_hpa;
         int root_level;
         int shadow_root_level;
+        bool need_root_sync;
 
         u64 *pae_root;
 };
@@ -371,6 +374,7 @@ struct kvm_vm_stat {
         u32 mmu_flooded;
         u32 mmu_recycled;
         u32 mmu_cache_miss;
+        u32 mmu_unsync;
         u32 remote_tlb_flush;
         u32 lpages;
 };
Index: kvm/arch/x86/kvm/x86.c
===================================================================
--- kvm.orig/arch/x86/kvm/x86.c
+++ kvm/arch/x86/kvm/x86.c
@@ -101,6 +101,7 @@ struct kvm_stats_debugfs_item debugfs_en
         { "mmu_flooded", VM_STAT(mmu_flooded) },
         { "mmu_recycled", VM_STAT(mmu_recycled) },
         { "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
+        { "mmu_unsync", VM_STAT(mmu_unsync) },
         { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
         { "largepages", VM_STAT(lpages) },
         { NULL }
Index: kvm/arch/x86/kvm/paging_tmpl.h
===================================================================
--- kvm.orig/arch/x86/kvm/paging_tmpl.h
+++ kvm/arch/x86/kvm/paging_tmpl.h
@@ -449,6 +449,11 @@ static int FNAME(page_fault)(struct kvm_
         kvm_mmu_audit(vcpu, "post page fault (fixed)");
         spin_unlock(&vcpu->kvm->mmu_lock);
 
+        if (vcpu->arch.mmu.need_root_sync) {
+                kvm_mmu_sync_roots(vcpu);
+                vcpu->arch.mmu.need_root_sync = 0;
+        }
+
         return write_pt;
 
 out_unlock:
@@ -576,7 +581,7 @@ static int FNAME(sync_page)(struct kvm_v
                 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
                 set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
                          is_dirty_pte(gpte), 0, gfn,
-                         spte_to_pfn(sp->spt[i]), true);
+                         spte_to_pfn(sp->spt[i]), true, false);
         }
 }
 
--
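
[Editorial note] The policy that decides whether a newly-writable gfn must stay write-protected or may instead be marked out of sync is mmu_need_write_protect() above, called from set_spte(). The following is a minimal, self-contained sketch of that decision in plain C: struct shadow_page and need_write_protect() are illustrative stand-ins for the kernel's kvm_mmu_page and mmu_need_write_protect(), and the bookkeeping done by kvm_unsync_page() (multi-role hash-bucket check, parent walk, stats) is collapsed into a single flag for brevity.

/*
 * Illustrative sketch only -- not the kernel code. struct shadow_page and
 * need_write_protect() stand in for kvm_mmu_page and mmu_need_write_protect();
 * the kvm_unsync_page() bookkeeping is reduced to setting one flag.
 */
#include <stdbool.h>
#include <stdio.h>

struct shadow_page {
        int level;              /* 1 == last-level (leaf) page table */
        bool unsync;            /* guest copy may differ from the shadow */
};

/* Returns true if the spte for this gfn must remain write-protected. */
static bool need_write_protect(struct shadow_page *shadow, bool can_unsync)
{
        if (!shadow)
                return false;           /* gfn does not shadow a pagetable */
        if (shadow->level != 1)
                return true;            /* only leaf pagetables may go unsync */
        if (shadow->unsync)
                return false;           /* already out of sync, writes allowed */
        if (can_unsync) {
                shadow->unsync = true;  /* let the guest write directly ... */
                return false;           /* ... and resync lazily later */
        }
        return true;
}

int main(void)
{
        struct shadow_page pt = { .level = 1, .unsync = false };

        /* First write to a shadowed leaf pagetable: allowed, page goes unsync. */
        printf("write-protect? %d\n", need_write_protect(&pt, true));
        printf("unsync now?    %d\n", pt.unsync);
        return 0;
}

The payoff of the unsync path is that subsequent guest writes to that pagetable hit a writable spte and are not trapped; the shadow copy is brought back in sync later through kvm_sync_page()/mmu_sync_children(), e.g. when the page is found again in kvm_mmu_get_page() or when need_root_sync triggers kvm_mmu_sync_roots() after a page fault.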