From: Marcelo Tosatti
To: Avi Kivity
Cc: kvm@vger.kernel.org
Subject: [patch 09/13] KVM: MMU: out of sync shadow core
Date: Sat, 06 Sep 2008 15:48:31 -0300
Message-ID: <20080906192431.211131067@localhost.localdomain>
References: <20080906184822.560099087@localhost.localdomain>
Content-Disposition: inline; filename=kvm-oos-core

Allow leaf shadow pagetables that are either global, or that have a
single root and a single role per gfn, to go out of sync.

Global unsync pages are kept in a per-VM array and synced on cr4/cr0
writes.  Non-global unsync pages are linked off their root shadow page
and synced on cr3/cr4/cr0 writes.

Some of this logic is simplistic and could be smarter (page_multimapped,
and the full root sync when higher-level pagetables are shared).
Unsyncing non-leaf pagetables might also be worthwhile, but is more
complicated.
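For illustration, the global side of this bookkeeping boils down to a small
ring of remembered pages.  The standalone C sketch below only models the
behaviour described above under assumed, non-kernel names (struct vm,
unsync_global_page, sync_global_pages, a 7-entry array); it is not the code
this patch adds.

/*
 * Standalone sketch (not kernel code): unsynced global shadow pages are
 * remembered in a small fixed-size per-VM array with round-robin
 * replacement, and every remembered page is resynced in one sweep, as
 * would happen on a cr4/cr0 write.  Names and sizes are illustrative.
 */
#include <stdio.h>

#define OOS_GLOBAL_PAGES 7

struct shadow_page {
	unsigned long gfn;
	int unsync;
};

struct vm {
	struct shadow_page *oos_global_pages[OOS_GLOBAL_PAGES];
	unsigned oos_global_idx;
};

/* Stand-in for the real sync: copy the guest ptes back into the shadow. */
static void sync_page(struct shadow_page *sp)
{
	printf("syncing shadow page for gfn %lx\n", sp->gfn);
	sp->unsync = 0;
}

/* Mark a global shadow page unsync; the slot's previous occupant is synced. */
static void unsync_global_page(struct vm *vm, struct shadow_page *sp)
{
	unsigned idx = vm->oos_global_idx;

	if (vm->oos_global_pages[idx])
		sync_page(vm->oos_global_pages[idx]);

	sp->unsync = 1;
	vm->oos_global_pages[idx] = sp;

	if (++vm->oos_global_idx >= OOS_GLOBAL_PAGES)
		vm->oos_global_idx = 0;
}

/* What a cr4/cr0 write would trigger: resync every remembered page. */
static void sync_global_pages(struct vm *vm)
{
	unsigned i;

	for (i = 0; i < OOS_GLOBAL_PAGES; i++) {
		if (vm->oos_global_pages[i]) {
			sync_page(vm->oos_global_pages[i]);
			vm->oos_global_pages[i] = NULL;
		}
	}
}

int main(void)
{
	struct vm vm = { { NULL }, 0 };
	struct shadow_page pages[10] = { { 0, 0 } };
	unsigned long i;

	for (i = 0; i < 10; i++) {
		pages[i].gfn = 0x1000 + i;
		unsync_global_page(&vm, &pages[i]);	/* wraps after 7 slots */
	}
	sync_global_pages(&vm);		/* as on a cr4/cr0 write */
	return 0;
}

The round-robin eviction keeps the array bounded: unsyncing an eighth page
forces the oldest remembered page to be synced first.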
Index: kvm/arch/x86/kvm/mmu.c
===================================================================
--- kvm.orig/arch/x86/kvm/mmu.c
+++ kvm/arch/x86/kvm/mmu.c
@@ -64,6 +64,17 @@ static void kvm_mmu_audit(struct kvm_vcp
 #define rmap_printk(x...) do { } while (0)
 #endif
 
+#define OOS_DEBUG
+#if defined (OOS_DEBUG)
+#define OOS_ASSERT(x)						\
+	if (!(x)) {						\
+		printk(KERN_WARNING "assertion failed %s:%d: %s\n",	\
+		       __FILE__, __LINE__, #x);			\
+		dump_stack();					\
+	}
+#else
+#define OOS_ASSERT(x) do { } while (0)
+#endif
 
 #if defined(MMU_DEBUG) || defined(AUDIT)
 static int dbg = 0;
@@ -773,6 +784,8 @@ static struct kvm_mmu_page *kvm_mmu_allo
 	sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp);
 	sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
 	sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
+	INIT_LIST_HEAD(&sp->oos_link);
+	INIT_LIST_HEAD(&sp->unsync_pages);
 	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
 	list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
 	ASSERT(is_empty_shadow_page(sp->spt));
@@ -896,6 +909,31 @@ static struct kvm_mmu_page *kvm_mmu_look
 	return NULL;
 }
 
+static struct kvm_mmu_page *kvm_mmu_lookup_page_root(struct kvm_vcpu *vcpu,
+						      gfn_t gfn)
+{
+	unsigned index;
+	struct hlist_head *bucket;
+	struct kvm_mmu_page *sp;
+	struct hlist_node *node;
+	struct kvm *kvm = vcpu->kvm;
+	int level = vcpu->arch.mmu.root_level;
+	if (!is_long_mode(vcpu) && is_pae(vcpu))
+		level--;
+
+	pgprintk("%s: looking for gfn %lx\n", __func__, gfn);
+	index = kvm_page_table_hashfn(gfn);
+	bucket = &kvm->arch.mmu_page_hash[index];
+	hlist_for_each_entry(sp, node, bucket, hash_link)
+		if (sp->gfn == gfn && !sp->role.metaphysical
+		    && !sp->role.invalid && sp->role.level == level) {
+			pgprintk("%s: found role %x\n",
+				 __func__, sp->role.word);
+			return sp;
+		}
+	return NULL;
+}
+
 static void kvm_sync_writeble(struct kvm_vcpu *vcpu, u64 *spte,
 			      int gpte_rw, gfn_t gfn)
 {
@@ -913,12 +951,48 @@ static void kvm_sync_writeble(struct kvm
 	return;
 }
 
-static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp);
+static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+	if (kvm_page_global(sp)) {
+		int i;
+		/* FIXME: save index into sp->flags */
+		for (i = 0; i < ARRAY_SIZE(kvm->arch.oos_global_pages); i++)
+			if (kvm->arch.oos_global_pages[i] == sp) {
+				kvm->arch.oos_global_pages[i] = NULL;
+				break;
+			}
+		OOS_ASSERT(i < ARRAY_SIZE(kvm->arch.oos_global_pages));
+		--kvm->stat.mmu_unsync_global;
+	} else {
+		list_del(&sp->oos_link);
+		--kvm->stat.mmu_unsync;
+	}
+}
+
+static void kvm_mmu_page_unlink_children(struct kvm *kvm,
+					 struct kvm_mmu_page *sp);
+static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp);
+
+static void kvm_mmu_zap_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+	if (unlikely(kvm_page_inuse(sp))) {
+		kvm_mmu_page_unlink_children(kvm, sp);
+		kvm_flush_remote_tlbs(kvm);
+		kvm_unlink_unsync_page(kvm, sp);
+		kvm_clear_pg_unsync(sp);
+	} else
+		kvm_mmu_zap_page(kvm, sp);
+}
 
 static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 {
 	int ret;
 
+	if (sp->role.glevels != vcpu->arch.mmu.root_level) {
+		kvm_mmu_zap_unsync_page(vcpu->kvm, sp);
+		return -EINVAL;
+	}
+
 	rmap_write_protect(vcpu->kvm, sp->gfn);
 	ret = vcpu->arch.mmu.sync_page(vcpu, sp);
 	if (ret <= 0)
@@ -926,13 +1000,87 @@ static int kvm_sync_page(struct kvm_vcpu
 		 * mappings (only originally writeble ones
 		 * of course).
 		 */
-		kvm_mmu_zap_page(vcpu->kvm, sp);
-	else
+		kvm_mmu_zap_unsync_page(vcpu->kvm, sp);
+	else {
+		kvm_unlink_unsync_page(vcpu->kvm, sp);
 		kvm_clear_pg_unsync(sp);
+	}
 	return ret;
 }
 
+static int mmu_unsync_global_page(struct kvm_vcpu *vcpu,
+				  struct kvm_mmu_page *sp)
+
+{
+	struct kvm *kvm = vcpu->kvm;
+	unsigned idx = kvm->arch.oos_global_idx;
+
+	if (kvm->arch.oos_global_pages[idx])
+		kvm_sync_page(vcpu, kvm->arch.oos_global_pages[idx]);
+
+	kvm_set_pg_unsync(sp);
+	kvm->arch.oos_global_pages[idx] = sp;
+
+	kvm->arch.oos_global_idx++;
+	if (kvm->arch.oos_global_idx >= ARRAY_SIZE(kvm->arch.oos_global_pages))
+		kvm->arch.oos_global_idx = 0;
+
+	++kvm->stat.mmu_unsync_global;
+	return 0;
+}
+
+static int page_multimapped(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+	while (!sp->multimapped) {
+		if (!sp->parent_pte)
+			return 0;
+		sp = page_header(__pa(sp->parent_pte));
+	}
+	return 1;
+}
+
+static int mmu_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+	struct kvm_mmu_page *root_sp;
+
+	if (page_multimapped(vcpu, sp))
+		return 1;
+
+	root_sp = kvm_mmu_lookup_page_root(vcpu, sp->root_gfn);
+	if (!root_sp)
+		return 1;
+
+	kvm_set_pg_unsync(sp);
+	list_add(&sp->oos_link, &root_sp->unsync_pages);
+	++vcpu->kvm->stat.mmu_unsync;
+	return 0;
+}
+
+static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+	if (kvm_page_global(sp))
+		return mmu_unsync_global_page(vcpu, sp);
+	else
+		return mmu_unsync_page(vcpu, sp);
+}
+
+static int set_shared_mmu_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+	struct kvm_mmu_page *root_sp;
+	int ret = 0;
+
+	if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
+		if (kvm_page_unsync(sp))
+			kvm_sync_page(vcpu, sp);
+	} else if (sp->root_gfn != -1) {
+		root_sp = kvm_mmu_lookup_page_root(vcpu, sp->root_gfn);
+	}
+
+	sp->root_gfn = -1;
+	return ret;
+}
+
 static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 					     gfn_t root_gfn,
 					     gfn_t gfn,
@@ -947,7 +1095,8 @@ static struct kvm_mmu_page *kvm_mmu_get_
 	unsigned quadrant;
 	struct hlist_head *bucket;
 	struct kvm_mmu_page *sp;
-	struct hlist_node *node;
+	struct hlist_node *node, *tmp;
+	int unsyncable = 1;
 
 	role.word = 0;
 	role.glevels = vcpu->arch.mmu.root_level;
@@ -963,8 +1112,24 @@ static struct kvm_mmu_page *kvm_mmu_get_
 		 gfn, role.word);
 	index = kvm_page_table_hashfn(gfn);
 	bucket = &vcpu->kvm->arch.mmu_page_hash[index];
-	hlist_for_each_entry(sp, node, bucket, hash_link)
-		if (sp->gfn == gfn && sp->role.word == role.word) {
+	hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link)
+		if (sp->gfn == gfn) {
+			/*
+			 * If a pagetable becomes referenced by more than one
+			 * root, or has multiple roles, unsync it and disable
+			 * oos. For higher level pgtables the entire tree
+			 * has to be synced.
+			 */
+			if (sp->root_gfn != root_gfn) {
+				kvm_set_pg_inuse(sp);
+				if (set_shared_mmu_page(vcpu, sp))
+					tmp = bucket->first;
+				kvm_clear_pg_inuse(sp);
+				unsyncable = 0;
+			}
+			if (sp->role.word != role.word)
+				continue;
+
 			mmu_page_add_parent_pte(vcpu, sp, parent_pte);
 			pgprintk("%s: found\n", __func__);
 			return sp;
@@ -975,6 +1140,9 @@ static struct kvm_mmu_page *kvm_mmu_get_
 		return sp;
 	pgprintk("%s: adding gfn %lx role %x\n", __func__, gfn, role.word);
 	sp->gfn = gfn;
+	if (!unsyncable)
+		root_gfn = -1;
+	sp->root_gfn = root_gfn;
 	sp->role = role;
 	hlist_add_head(&sp->hash_link, bucket);
 	if (!metaphysical)
@@ -1084,14 +1252,35 @@ static void kvm_mmu_unlink_parents(struc
 	}
 }
 
-static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+static void kvm_drop_unsync_children(struct kvm *kvm,
+				     struct kvm_mmu_page *root_sp)
+{
+	struct kvm_mmu_page *sp, *n;
+
+	list_for_each_entry_safe(sp, n, &root_sp->unsync_pages, oos_link) {
+		OOS_ASSERT(kvm_page_unsync(sp));
+		OOS_ASSERT(!kvm_page_global(sp));
+		OOS_ASSERT(sp->role.level == PT_PAGE_TABLE_LEVEL);
+		OOS_ASSERT(list_empty(&sp->unsync_pages));
+		kvm_mmu_zap_page(kvm, sp);
+	}
+}
+
+static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
+	int ret = 0;
 	++kvm->stat.mmu_shadow_zapped;
 	kvm_mmu_page_unlink_children(kvm, sp);
 	kvm_mmu_unlink_parents(kvm, sp);
 	kvm_flush_remote_tlbs(kvm);
 	if (!sp->role.invalid && !sp->role.metaphysical)
 		unaccount_shadowed(kvm, sp->gfn);
+	if (kvm_test_clear_pg_unsync(sp))
+		kvm_unlink_unsync_page(kvm, sp);
+	if (!list_empty(&sp->unsync_pages)) {
+		kvm_drop_unsync_children(kvm, sp);
+		ret = 1;
+	}
 	if (!sp->root_count) {
 		hlist_del(&sp->hash_link);
 		kvm_mmu_free_page(kvm, sp);
@@ -1101,6 +1290,7 @@ static void kvm_mmu_zap_page(struct kvm
 		kvm_reload_remote_mmus(kvm);
 	}
 	kvm_mmu_reset_last_pte_updated(kvm);
+	return ret;
 }
 
 /*
@@ -1153,8 +1343,9 @@ static int kvm_mmu_unprotect_page(struct
 		if (sp->gfn == gfn && !sp->role.metaphysical) {
 			pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
 				 sp->role.word);
-			kvm_mmu_zap_page(kvm, sp);
 			r = 1;
+			if (kvm_mmu_zap_page(kvm, sp))
+				n = bucket->first;
 		}
 	return r;
 }
@@ -1191,6 +1382,25 @@ struct page *gva_to_page(struct kvm_vcpu
 	return page;
 }
 
+static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
+				  int write_fault)
+{
+	struct kvm_mmu_page *shadow;
+
+	shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
+	if (shadow) {
+		if (!write_fault)
+			return 1;
+		if (shadow->role.level != PT_PAGE_TABLE_LEVEL)
+			return 1;
+		if (shadow->root_gfn == -1)
+			return 1;
+		if (!kvm_page_unsync(shadow))
+			return kvm_unsync_page(vcpu, shadow);
+	}
+	return 0;
+}
+
 static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 			 unsigned pt_access, unsigned pte_access,
 			 int user_fault, int write_fault, int dirty,
@@ -1207,8 +1417,13 @@ static void mmu_set_spte(struct kvm_vcpu
 		 __func__, *shadow_pte, pt_access,
 		 write_fault, user_fault, gfn);
 
-	if (!global)
+	kvm_set_pg_inuse(sp);
+
+	if (!global && kvm_page_global(sp)) {
+		if (kvm_page_unsync(sp))
+			kvm_sync_page(vcpu, sp);
 		kvm_clear_pg_global(sp);
+	}
 
 	if (is_rmap_pte(*shadow_pte)) {
 		/*
@@ -1256,12 +1471,19 @@ static void mmu_set_spte(struct kvm_vcpu
 	if ((pte_access & ACC_WRITE_MASK)
 	    || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
-		struct kvm_mmu_page *shadow;
+		/*
+		 * Do not create write protected large translations.
+		 */
+		if (largepage && has_wrprotected_page(vcpu->kvm, gfn)) {
+			spte = shadow_trap_nonpresent_pte;
+			was_writeble = 0;
+			*ptwrite = 0;
+			goto set_shadow;
+		}
 
 		spte |= PT_WRITABLE_MASK;
 
-		shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
-		if (shadow) {
+		if (mmu_need_write_protect(vcpu, gfn, write_fault)) {
 			pgprintk("%s: found shadow page for %lx, marking ro\n",
 				 __func__, gfn);
 			pte_access &= ~ACC_WRITE_MASK;
@@ -1272,16 +1494,9 @@ static void mmu_set_spte(struct kvm_vcpu
 			if (write_fault)
 				*ptwrite = 1;
 		}
-		/*
-		 * Do not create write protected large translations.
-		 */
-		if (largepage && has_wrprotected_page(vcpu->kvm, gfn)) {
-			spte = shadow_trap_nonpresent_pte;
-			was_writeble = 0;
-			*ptwrite = 0;
-		}
 	}
 
+set_shadow:
 	if (pte_access & ACC_WRITE_MASK)
 		mark_page_dirty(vcpu->kvm, gfn);
 
@@ -1309,6 +1524,7 @@ static void mmu_set_spte(struct kvm_vcpu
 		vcpu->arch.last_pte_updated = shadow_pte;
 		vcpu->arch.last_pte_gfn = gfn;
 	}
+	kvm_clear_pg_inuse(sp);
 }
 
 static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
@@ -1950,7 +2166,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *
 			 */
 			pgprintk("misaligned: gpa %llx bytes %d role %x\n",
				 gpa, bytes, sp->role.word);
-			kvm_mmu_zap_page(vcpu->kvm, sp);
+			if (kvm_mmu_zap_page(vcpu->kvm, sp))
+				n = bucket->first;
 			++vcpu->kvm->stat.mmu_flooded;
 			continue;
 		}
@@ -2174,7 +2391,9 @@ void kvm_mmu_zap_all(struct kvm *kvm)
 
 	spin_lock(&kvm->mmu_lock);
 	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
-		kvm_mmu_zap_page(kvm, sp);
+		if (kvm_mmu_zap_page(kvm, sp))
+			node = container_of(kvm->arch.active_mmu_pages.next,
+					    struct kvm_mmu_page, link);
 	spin_unlock(&kvm->mmu_lock);
 
 	kvm_flush_remote_tlbs(kvm);
Index: kvm/include/asm-x86/kvm_host.h
===================================================================
--- kvm.orig/include/asm-x86/kvm_host.h
+++ kvm/include/asm-x86/kvm_host.h
@@ -179,6 +179,10 @@ union kvm_mmu_page_role {
 struct kvm_mmu_page {
 	struct list_head link;
 	struct hlist_node hash_link;
+	/* FIXME: one list_head is enough */
+	struct list_head unsync_pages;
+	struct list_head oos_link;
+	gfn_t root_gfn; /* root this pagetable belongs to, -1 if multimapped */
 
 	/*
 	 * The following two entries are used to key the shadow page in the
@@ -362,6 +366,8 @@ struct kvm_arch{
 	unsigned int n_requested_mmu_pages;
 	unsigned int n_alloc_mmu_pages;
 	struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
+	struct kvm_mmu_page *oos_global_pages[7];
+	unsigned oos_global_idx;
 	/*
 	 * Hash table of struct kvm_mmu_page.
 	 */
@@ -390,6 +396,8 @@ struct kvm_vm_stat {
 	u32 mmu_flooded;
 	u32 mmu_recycled;
 	u32 mmu_cache_miss;
+	u32 mmu_unsync;
+	u32 mmu_unsync_global;
 	u32 remote_tlb_flush;
 	u32 lpages;
 };
@@ -763,6 +771,7 @@ int kvm_age_hva(struct kvm *kvm, unsigne
 enum kvm_page_flags {
 	KVM_PG_global,
 	KVM_PG_unsync,
+	KVM_PG_inuse,
 };
 
 #define KVMPGFLAG(name)						\
@@ -777,5 +786,6 @@ static inline int kvm_test_clear_pg_##na
 
 KVMPGFLAG(global);
 KVMPGFLAG(unsync);
+KVMPGFLAG(inuse);
 
 #endif
Index: kvm/arch/x86/kvm/x86.c
===================================================================
--- kvm.orig/arch/x86/kvm/x86.c
+++ kvm/arch/x86/kvm/x86.c
@@ -100,6 +100,8 @@ struct kvm_stats_debugfs_item debugfs_en
 	{ "mmu_flooded", VM_STAT(mmu_flooded) },
 	{ "mmu_recycled", VM_STAT(mmu_recycled) },
 	{ "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
+	{ "mmu_unsync", VM_STAT(mmu_unsync) },
+	{ "mmu_unsync_global", VM_STAT(mmu_unsync_global) },
 	{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
 	{ "largepages", VM_STAT(lpages) },
 	{ NULL }
Index: kvm/arch/x86/kvm/paging_tmpl.h
===================================================================
--- kvm.orig/arch/x86/kvm/paging_tmpl.h
+++ kvm/arch/x86/kvm/paging_tmpl.h
@@ -81,6 +81,7 @@ struct shadow_walker {
 	int write_fault;
 	int largepage;
 	int *ptwrite;
+	int multiroot;
 	pfn_t pfn;
 	u64 *sptep;
 };
@@ -294,7 +295,7 @@ static int FNAME(shadow_walk_entry)(stru
 	struct kvm_mmu_page *shadow_page;
 	u64 spte;
 	int metaphysical;
-	gfn_t table_gfn;
+	gfn_t table_gfn, root_gfn;
 	int r;
 	pt_element_t curr_pte;
 
@@ -324,9 +325,11 @@ static int FNAME(shadow_walk_entry)(stru
 		metaphysical = 0;
 		table_gfn = gw->table_gfn[level - 2];
 	}
-	shadow_page = kvm_mmu_get_page(vcpu, gw->table_gfn[gw->root_level - 1],
-				       table_gfn, (gva_t)addr, level-1,
-				       metaphysical, access, sptep);
+	root_gfn = gw->table_gfn[gw->root_level - 1];
+	if (sw->multiroot)
+		root_gfn = -1;
+	shadow_page = kvm_mmu_get_page(vcpu, root_gfn, table_gfn, (gva_t)addr,
+				       level-1, metaphysical, access, sptep);
 	if (!metaphysical) {
 		r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 2],
 					  &curr_pte, sizeof(curr_pte));
@@ -336,6 +339,8 @@ static int FNAME(shadow_walk_entry)(stru
 			return 1;
 		}
 	}
+	if (shadow_page->root_gfn == -1)
+		sw->multiroot = 1;
 
 	spte = __pa(shadow_page->spt) | PT_PRESENT_MASK | PT_ACCESSED_MASK
 		| PT_WRITABLE_MASK | PT_USER_MASK;
@@ -355,6 +360,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu
 		.write_fault = write_fault,
 		.largepage = largepage,
 		.ptwrite = ptwrite,
+		.multiroot = 0,
 		.pfn = pfn,
 	};
 
--