From mboxrd@z Thu Jan 1 00:00:00 1970
From: Marcelo Tosatti
Subject: Re: large page support for kvm
Date: Thu, 14 Feb 2008 21:17:39 -0200
Message-ID: <20080214231739.GA7787@dmt>
References: <479F604C.20107@qumranet.com> <20080130184035.GS6960@amd.com>
	<47A16054.6080201@qumranet.com> <20080211154901.GA11936@dmt>
	<47B1894A.1030208@qumranet.com> <20080213001519.GA32134@dmt>
	<47B2921F.1040905@qumranet.com>
Mime-Version: 1.0
Content-Type: multipart/mixed; boundary="nFreZHaLTZJo0R7j"
Content-Disposition: inline
In-Reply-To: <47B2921F.1040905@qumranet.com>
To: Avi Kivity
Cc: kvm-devel
Sender: kvm-devel-bounces@lists.sourceforge.net
Errors-To: kvm-devel-bounces@lists.sourceforge.net
List-Id: kvm.vger.kernel.org


--nFreZHaLTZJo0R7j
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline

On Wed, Feb 13, 2008 at 08:45:51AM +0200, Avi Kivity wrote:
> >gfn_to_page() needs to grab the struct page corresponding to the large
> >page, not the offset struct page for the faulting 4k address within
> >the large frame. Since gfn_to_page can sleep, there is no way to do
> >that in the mapping logic which happens under mmu_lock protection.
> >We don't want to grab the large page frame "struct page" unless the
> >is_largepage_backed() checks are successful.
> >
> >The checks could be done in page_fault() if walker->level == 2, before
> >gfn_to_page()... But I don't see much difference of that and doing
> >it inside walk_addr(). What do you say?
> >
> >
> 
> I'd like to keep walk_addr() independent of the rest of the mmu (i.e.
> walk_addr is 100% guest oriented). Also, the issue you point out is
> shared by direct_map which doesn't call walk_addr().
> 
> An unrelated issue (pointed out by Jun Nakajima) is that this kills
> dirty log tracking (needed for migration). It could be solved simply by
> not using large page backing if dirty log tracking is enabled for that slot.

Ok, fixed your comments and a bug where a root page was shadowed in the
large area being mapped. access.flat is happy.

Joerg, can you give this a try on an NPT-enabled system (needs the
attached qemu-largepage-hack.patch).

Thanks

Index: kvm.largepages/arch/x86/kvm/mmu.c
===================================================================
--- kvm.largepages.orig/arch/x86/kvm/mmu.c
+++ kvm.largepages/arch/x86/kvm/mmu.c
@@ -27,6 +27,7 @@
 #include
 #include
 #include
+#include <linux/hugetlb.h>
 #include
 #include
 
@@ -211,6 +212,11 @@ static int is_shadow_present_pte(u64 pte
 		&& pte != shadow_notrap_nonpresent_pte;
 }
 
+static int is_large_pte(u64 pte)
+{
+	return pte & PT_PAGE_SIZE_MASK;
+}
+
 static int is_writeble_pte(unsigned long pte)
 {
 	return pte & PT_WRITABLE_MASK;
@@ -350,17 +356,120 @@ static void mmu_free_rmap_desc(struct kv
 	kfree(rd);
 }
 
+static int hpage_align_diff(unsigned long gfn)
+{
+	return ((gfn+KVM_PAGES_PER_HPAGE-1) & ~(KVM_PAGES_PER_HPAGE-1)) - gfn;
+}
+
+/*
+ * Return the pointer to the largepage write count for a given
+ * gfn, handling slots that are not large page aligned.
+ */
+static int *slot_largepage_idx(gfn_t gfn, struct kvm_memory_slot *slot)
+{
+	unsigned long idx;
+
+	idx = (gfn - slot->base_gfn) + hpage_align_diff(slot->base_gfn);
+	idx /= KVM_PAGES_PER_HPAGE;
+	return &slot->lpage_info[idx].write_count;
+}
+
+static void account_shadowed(struct kvm *kvm, gfn_t gfn)
+{
+	int *write_count;
+
+	write_count = slot_largepage_idx(gfn, gfn_to_memslot(kvm, gfn));
+	*write_count += 1;
+	WARN_ON(*write_count > KVM_PAGES_PER_HPAGE);
+}
+
+static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
+{
+	int *write_count;
+
+	write_count = slot_largepage_idx(gfn, gfn_to_memslot(kvm, gfn));
+	*write_count -= 1;
+	WARN_ON(*write_count < 0);
+}
+
+static int has_wrprotected_page(struct kvm *kvm, gfn_t gfn)
+{
+	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
+	int *largepage_idx;
+
+	if (slot) {
+		largepage_idx = slot_largepage_idx(gfn, slot);
+		return *largepage_idx;
+	}
+
+	return 1;
+}
+
+static int host_largepage_backed(struct kvm *kvm, gfn_t gfn)
+{
+	struct vm_area_struct *vma;
+	unsigned long addr;
+
+	addr = gfn_to_hva(kvm, gfn);
+	if (kvm_is_error_hva(addr))
+		return 0;
+
+	vma = find_vma(current->mm, addr);
+	if (vma && is_vm_hugetlb_page(vma))
+		return 1;
+
+	return 0;
+}
+
+static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn)
+{
+	struct kvm_memory_slot *slot;
+
+	if (has_wrprotected_page(vcpu->kvm, large_gfn))
+		return 0;
+
+	if (!host_largepage_backed(vcpu->kvm, large_gfn))
+		return 0;
+
+	slot = gfn_to_memslot(vcpu->kvm, large_gfn);
+	if (slot && slot->dirty_bitmap)
+		return 0;
+
+	/* guest has 4M pages, host 2M */
+	if (!is_pae(vcpu) && HPAGE_SHIFT == 21)
+		return 0;
+
+	return 1;
+}
+
+static int is_physical_memory(struct kvm *kvm, gfn_t gfn)
+{
+	unsigned long addr;
+
+	addr = gfn_to_hva(kvm, gfn);
+	if (kvm_is_error_hva(addr))
+		return 0;
+
+	return 1;
+}
+
 /*
  * Take gfn and return the reverse mapping to it.
  * Note: gfn must be unaliased before this function get called
  */
-static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn)
+static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int lpage)
 {
 	struct kvm_memory_slot *slot;
+	unsigned long idx;
 
 	slot = gfn_to_memslot(kvm, gfn);
-	return &slot->rmap[gfn - slot->base_gfn];
+	if (!lpage)
+		return &slot->rmap[gfn - slot->base_gfn];
+
+	idx = gfn - slot->base_gfn + hpage_align_diff(slot->base_gfn);
+	idx /= KVM_PAGES_PER_HPAGE;
+	return &slot->lpage_info[idx].rmap_pde;
 }
 
 /*
@@ -372,7 +481,7 @@ static unsigned long *gfn_to_rmap(struct
  * If rmapp bit zero is one, (then rmap & ~1) points to a struct kvm_rmap_desc
  * containing more mappings.
  */
-static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
+static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage)
 {
 	struct kvm_mmu_page *sp;
 	struct kvm_rmap_desc *desc;
@@ -384,7 +493,7 @@ static void rmap_add(struct kvm_vcpu *vc
 	gfn = unalias_gfn(vcpu->kvm, gfn);
 	sp = page_header(__pa(spte));
 	sp->gfns[spte - sp->spt] = gfn;
-	rmapp = gfn_to_rmap(vcpu->kvm, gfn);
+	rmapp = gfn_to_rmap(vcpu->kvm, gfn, lpage);
 	if (!*rmapp) {
 		rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
 		*rmapp = (unsigned long)spte;
@@ -450,7 +559,7 @@ static void rmap_remove(struct kvm *kvm,
 		kvm_release_page_dirty(page);
 	else
 		kvm_release_page_clean(page);
-	rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt]);
+	rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], is_large_pte(*spte));
 	if (!*rmapp) {
 		printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
 		BUG();
@@ -516,7 +625,7 @@ static void rmap_write_protect(struct kv
 	int write_protected = 0;
 
 	gfn = unalias_gfn(kvm, gfn);
-	rmapp = gfn_to_rmap(kvm, gfn);
+	rmapp = gfn_to_rmap(kvm, gfn, 0);
 
 	spte = rmap_next(kvm, rmapp, NULL);
 	while (spte) {
@@ -529,8 +638,27 @@ static void rmap_write_protect(struct kv
 		}
 		spte = rmap_next(kvm, rmapp, spte);
 	}
+	/* check for huge page mappings */
+	rmapp = gfn_to_rmap(kvm, gfn, 1);
+	spte = rmap_next(kvm, rmapp, NULL);
+	while (spte) {
+		BUG_ON(!spte);
+		BUG_ON(!(*spte & PT_PRESENT_MASK));
+		BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
+		pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
+		if (is_writeble_pte(*spte)) {
+			rmap_remove(kvm, spte);
+			--kvm->stat.lpages;
+			set_shadow_pte(spte, shadow_trap_nonpresent_pte);
+			write_protected = 1;
+		}
+		spte = rmap_next(kvm, rmapp, spte);
+	}
+
 	if (write_protected)
 		kvm_flush_remote_tlbs(kvm);
+
+	account_shadowed(kvm, gfn);
 }
 
 #ifdef MMU_DEBUG
@@ -750,11 +878,17 @@ static void kvm_mmu_page_unlink_children
 	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
 		ent = pt[i];
 
+		if (is_shadow_present_pte(ent)) {
+			if (!is_large_pte(ent)) {
+				ent &= PT64_BASE_ADDR_MASK;
+				mmu_page_remove_parent_pte(page_header(ent),
+							   &pt[i]);
+			} else {
+				--kvm->stat.lpages;
+				rmap_remove(kvm, &pt[i]);
+			}
+		}
 		pt[i] = shadow_trap_nonpresent_pte;
-		if (!is_shadow_present_pte(ent))
-			continue;
-		ent &= PT64_BASE_ADDR_MASK;
-		mmu_page_remove_parent_pte(page_header(ent), &pt[i]);
 	}
 	kvm_flush_remote_tlbs(kvm);
 }
@@ -794,6 +928,8 @@ static void kvm_mmu_zap_page(struct kvm
 	}
 	kvm_mmu_page_unlink_children(kvm, sp);
 	if (!sp->root_count) {
+		if (!sp->role.metaphysical)
+			unaccount_shadowed(kvm, sp->gfn);
 		hlist_del(&sp->hash_link);
 		kvm_mmu_free_page(kvm, sp);
 	} else
@@ -894,12 +1030,28 @@ struct page *gva_to_page(struct kvm_vcpu
 static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
			 unsigned pt_access, unsigned pte_access,
			 int user_fault, int write_fault, int dirty,
-			 int *ptwrite, gfn_t gfn, struct page *page)
+			 int *ptwrite, int largepage, gfn_t gfn,
+			 struct page *page)
 {
 	u64 spte;
 	int was_rmapped = is_rmap_pte(*shadow_pte);
 	int was_writeble = is_writeble_pte(*shadow_pte);
 
+	/*
+	 * If we overwrite a PTE page pointer with a 2MB PMD, unlink
+	 * the parent of the now unreachable PTE.
+	 */
+	if (largepage) {
+		if (was_rmapped && !is_large_pte(*shadow_pte)) {
+			struct kvm_mmu_page *child;
+			u64 pte = *shadow_pte;
+
+			child = page_header(pte & PT64_BASE_ADDR_MASK);
+			mmu_page_remove_parent_pte(child, shadow_pte);
+		}
+		was_rmapped = is_large_pte(*shadow_pte);
+	}
+
 	pgprintk("%s: spte %llx access %x write_fault %d"
		 " user_fault %d gfn %lx\n",
		 __FUNCTION__, *shadow_pte, pt_access,
@@ -919,6 +1071,8 @@ static void mmu_set_spte(struct kvm_vcpu
 	spte |= PT_PRESENT_MASK;
 	if (pte_access & ACC_USER_MASK)
 		spte |= PT_USER_MASK;
+	if (largepage)
+		spte |= PT_PAGE_SIZE_MASK;
 
 	spte |= page_to_phys(page);
 
@@ -933,7 +1087,8 @@ static void mmu_set_spte(struct kvm_vcpu
 		}
 
 		shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
-		if (shadow) {
+		if (shadow ||
+		   (largepage && has_wrprotected_page(vcpu->kvm, gfn))) {
 			pgprintk("%s: found shadow page for %lx, marking ro\n",
				 __FUNCTION__, gfn);
 			pte_access &= ~ACC_WRITE_MASK;
@@ -941,6 +1096,18 @@ static void mmu_set_spte(struct kvm_vcpu
 				spte &= ~PT_WRITABLE_MASK;
 				kvm_x86_ops->tlb_flush(vcpu);
 			}
+			/*
+			 * Largepage creation is susceptible to an upper-level
+			 * table being shadowed and write-protected in the
+			 * area being mapped. If that is the case, invalidate
+			 * the entry and let the instruction fault again
+			 * and use 4K mappings.
+			 */
+			if (largepage) {
+				spte = shadow_trap_nonpresent_pte;
+				kvm_x86_ops->tlb_flush(vcpu);
+				goto unshadowed;
+			}
 			if (write_fault)
 				*ptwrite = 1;
 		}
@@ -952,10 +1119,17 @@ unshadowed:
 		mark_page_dirty(vcpu->kvm, gfn);
 
 	pgprintk("%s: setting spte %llx\n", __FUNCTION__, spte);
+	pgprintk("instantiating %s PTE (%s) at %d (%llx)\n",
+		 (spte&PT_PAGE_SIZE_MASK)? "2MB" : "4kB",
+		 (spte&PT_WRITABLE_MASK)?"RW":"R", gfn, spte);
 	set_shadow_pte(shadow_pte, spte);
+	if (!was_rmapped && (spte & PT_PAGE_SIZE_MASK)
+	    && (spte & PT_PRESENT_MASK))
+		++vcpu->kvm->stat.lpages;
+
 	page_header_update_slot(vcpu->kvm, shadow_pte, gfn);
 	if (!was_rmapped) {
-		rmap_add(vcpu, shadow_pte, gfn);
+		rmap_add(vcpu, shadow_pte, gfn, largepage);
 		if (!is_rmap_pte(*shadow_pte))
 			kvm_release_page_clean(page);
 	} else {
@@ -973,7 +1147,8 @@ static void nonpaging_new_cr3(struct kvm
 }
 
 static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
-			gfn_t gfn, struct page *page, int level)
+			int largepage, gfn_t gfn, struct page *page,
+			int level)
 {
 	hpa_t table_addr = vcpu->arch.mmu.root_hpa;
 	int pt_write = 0;
@@ -987,7 +1162,13 @@ static int __direct_map(struct kvm_vcpu
 
 		if (level == 1) {
 			mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
				     0, write, 1, &pt_write, gfn, page);
-				     0, write, 1, &pt_write, gfn, page);
+				     0, write, 1, &pt_write, 0, gfn, page);
+			return pt_write;
+		}
+
+		if (largepage && level == 2) {
+			mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
+				     0, write, 1, &pt_write, 1, gfn, page);
 			return pt_write;
 		}
 
@@ -1017,12 +1198,19 @@ static int __direct_map(struct kvm_vcpu
 static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
 {
 	int r;
+	int largepage = 0;
 	struct page *page;
 
 	down_read(&vcpu->kvm->slots_lock);
 	down_read(&current->mm->mmap_sem);
+	if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))
+	    && is_physical_memory(vcpu->kvm, gfn)) {
+		gfn &= ~(KVM_PAGES_PER_HPAGE-1);
+		largepage = 1;
+	}
+
 	page = gfn_to_page(vcpu->kvm, gfn);
 	up_read(&current->mm->mmap_sem);
 
@@ -1035,7 +1223,8 @@ static int nonpaging_map(struct kvm_vcpu
 
 	spin_lock(&vcpu->kvm->mmu_lock);
 	kvm_mmu_free_some_pages(vcpu);
-	r = __direct_map(vcpu, v, write, gfn, page, PT32E_ROOT_LEVEL);
+	r = __direct_map(vcpu, v, write, largepage, gfn, page,
+			 PT32E_ROOT_LEVEL);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 
 	up_read(&vcpu->kvm->slots_lock);
@@ -1166,6 +1355,8 @@ static int tdp_page_fault(struct kvm_vcp
 {
 	struct page *page;
 	int r;
+	int largepage = 0;
+	gfn_t gfn = gpa >> PAGE_SHIFT;
 
 	ASSERT(vcpu);
 	ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
@@ -1175,7 +1366,12 @@ static int tdp_page_fault(struct kvm_vcp
 		return r;
 
 	down_read(&current->mm->mmap_sem);
-	page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+	if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))
+	    && is_physical_memory(vcpu->kvm, gfn)) {
+		gfn &= ~(KVM_PAGES_PER_HPAGE-1);
+		largepage = 1;
+	}
+	page = gfn_to_page(vcpu->kvm, gfn);
 	if (is_error_page(page)) {
 		kvm_release_page_clean(page);
 		up_read(&current->mm->mmap_sem);
@@ -1184,7 +1380,7 @@ static int tdp_page_fault(struct kvm_vcp
 	spin_lock(&vcpu->kvm->mmu_lock);
 	kvm_mmu_free_some_pages(vcpu);
 	r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
-			 gpa >> PAGE_SHIFT, page, TDP_ROOT_LEVEL);
+			 largepage, gfn, page, TDP_ROOT_LEVEL);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 	up_read(&current->mm->mmap_sem);
 
@@ -1383,7 +1579,8 @@ static void mmu_pte_write_zap_pte(struct
 
 	pte = *spte;
 	if (is_shadow_present_pte(pte)) {
-		if (sp->role.level == PT_PAGE_TABLE_LEVEL)
+		if (sp->role.level == PT_PAGE_TABLE_LEVEL ||
+		    is_large_pte(pte))
 			rmap_remove(vcpu->kvm, spte);
 		else {
 			child = page_header(pte & PT64_BASE_ADDR_MASK);
@@ -1391,6 +1588,8 @@ static void mmu_pte_write_zap_pte(struct
 		}
 	}
 	set_shadow_pte(spte, shadow_trap_nonpresent_pte);
+	if (is_large_pte(pte))
+		--vcpu->kvm->stat.lpages;
 }
 
 static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
@@ -1398,7 +1597,8 @@ static void mmu_pte_write_new_pte(struct
				  u64 *spte,
				  const void *new)
 {
-	if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
+	if ((sp->role.level != PT_PAGE_TABLE_LEVEL)
+	    && !vcpu->arch.update_pte.largepage) {
 		++vcpu->kvm->stat.mmu_pde_zapped;
 		return;
 	}
@@ -1446,6 +1646,8 @@ static void mmu_guess_page_from_pte_writ
 	u64 gpte = 0;
 	struct page *page;
 
+	vcpu->arch.update_pte.largepage = 0;
+
 	if (bytes != 4 && bytes != 8)
 		return;
 
@@ -1474,6 +1676,10 @@ static void mmu_guess_page_from_pte_writ
 	gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
 
 	down_read(&current->mm->mmap_sem);
+	if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) {
+		gfn &= ~(KVM_PAGES_PER_HPAGE-1);
+		vcpu->arch.update_pte.largepage = 1;
+	}
 	page = gfn_to_page(vcpu->kvm, gfn);
 	up_read(&current->mm->mmap_sem);
 
Index: kvm.largepages/arch/x86/kvm/paging_tmpl.h
===================================================================
--- kvm.largepages.orig/arch/x86/kvm/paging_tmpl.h
+++ kvm.largepages/arch/x86/kvm/paging_tmpl.h
@@ -248,6 +248,7 @@ static void FNAME(update_pte)(struct kvm
 	pt_element_t gpte;
 	unsigned pte_access;
 	struct page *npage;
+	int largepage = vcpu->arch.update_pte.largepage;
 
 	gpte = *(const pt_element_t *)pte;
 	if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
@@ -264,7 +265,8 @@ static void FNAME(update_pte)(struct kvm
 		return;
 	get_page(npage);
 	mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
-		     gpte & PT_DIRTY_MASK, NULL, gpte_to_gfn(gpte), npage);
+		     gpte & PT_DIRTY_MASK, NULL, largepage, gpte_to_gfn(gpte),
+		     npage);
 }
 
 /*
@@ -272,8 +274,8 @@ static void FNAME(update_pte)(struct kvm
  */
 static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
			 struct guest_walker *walker,
-			 int user_fault, int write_fault, int *ptwrite,
-			 struct page *page)
+			 int user_fault, int write_fault, int largepage,
+			 int *ptwrite, struct page *page)
 {
 	hpa_t shadow_addr;
 	int level;
@@ -302,6 +304,10 @@ static u64 *FNAME(fetch)(struct kvm_vcpu
 		shadow_ent = ((u64 *)__va(shadow_addr)) + index;
 		if (level == PT_PAGE_TABLE_LEVEL)
 			break;
+
+		if (largepage && level == PT_DIRECTORY_LEVEL)
+			break;
+
 		if (is_shadow_present_pte(*shadow_ent)) {
 			shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK;
 			continue;
 		}
@@ -340,7 +346,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu
 	mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access,
		     user_fault, write_fault,
		     walker->ptes[walker->level-1] & PT_DIRTY_MASK,
-		     ptwrite, walker->gfn, page);
+		     ptwrite, largepage, walker->gfn, page);
 
 	return shadow_ent;
 }
@@ -370,6 +376,7 @@ static int FNAME(page_fault)(struct kvm_
 	int write_pt = 0;
 	int r;
 	struct page *page;
+	int largepage = 0;
 
 	pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code);
 	kvm_mmu_audit(vcpu, "pre page fault");
@@ -397,6 +404,15 @@ static int FNAME(page_fault)(struct kvm_
 	}
 
 	down_read(&current->mm->mmap_sem);
+	if (walker.level == PT_DIRECTORY_LEVEL) {
+		gfn_t large_gfn;
+		large_gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE-1);
+		if (is_largepage_backed(vcpu, large_gfn)
+		    && is_physical_memory(vcpu->kvm, walker.gfn)) {
+			walker.gfn = large_gfn;
+			largepage = 1;
+		}
+	}
 	page = gfn_to_page(vcpu->kvm, walker.gfn);
 	up_read(&current->mm->mmap_sem);
 
@@ -411,7 +427,7 @@ static int FNAME(page_fault)(struct kvm_
 	spin_lock(&vcpu->kvm->mmu_lock);
 	kvm_mmu_free_some_pages(vcpu);
 	shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
-				  &write_pt, page);
+				  largepage, &write_pt, page);
 	pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__,
		 shadow_pte, *shadow_pte, write_pt);
 
Index: kvm.largepages/arch/x86/kvm/x86.c
===================================================================
--- kvm.largepages.orig/arch/x86/kvm/x86.c
+++ kvm.largepages/arch/x86/kvm/x86.c
@@ -86,6 +86,7 @@ struct kvm_stats_debugfs_item debugfs_en
 	{ "mmu_recycled", VM_STAT(mmu_recycled) },
 	{ "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
 	{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
+	{ "lpages", VM_STAT(lpages) },
 	{ NULL }
 };
 
Index: kvm.largepages/include/asm-x86/kvm_host.h
===================================================================
--- kvm.largepages.orig/include/asm-x86/kvm_host.h
+++ kvm.largepages/include/asm-x86/kvm_host.h
@@ -38,6 +38,13 @@
 #define INVALID_PAGE (~(hpa_t)0)
 #define UNMAPPED_GVA (~(gpa_t)0)
 
+/* shadow tables are PAE even on non-PAE hosts */
+#define KVM_HPAGE_SHIFT 21
+#define KVM_HPAGE_SIZE (1UL << KVM_HPAGE_SHIFT)
+#define KVM_HPAGE_MASK (~(KVM_HPAGE_SIZE - 1))
+
+#define KVM_PAGES_PER_HPAGE (KVM_HPAGE_SIZE / PAGE_SIZE)
+
 #define DE_VECTOR 0
 #define UD_VECTOR 6
 #define NM_VECTOR 7
@@ -228,6 +235,7 @@ struct kvm_vcpu_arch {
 	struct {
 		gfn_t gfn;	/* presumed gfn during guest pte update */
 		struct page *page;	/* page corresponding to that gfn */
+		int largepage;
 	} update_pte;
 
 	struct i387_fxsave_struct host_fx_image;
@@ -298,6 +306,7 @@ struct kvm_vm_stat {
 	u32 mmu_recycled;
 	u32 mmu_cache_miss;
 	u32 remote_tlb_flush;
+	u32 lpages;
 };
 
 struct kvm_vcpu_stat {
Index: kvm.largepages/include/linux/kvm_host.h
===================================================================
--- kvm.largepages.orig/include/linux/kvm_host.h
+++ kvm.largepages/include/linux/kvm_host.h
@@ -102,6 +102,10 @@ struct kvm_memory_slot {
 	unsigned long flags;
 	unsigned long *rmap;
 	unsigned long *dirty_bitmap;
+	struct {
+		unsigned long rmap_pde;
+		int write_count;
+	} *lpage_info;
 	unsigned long userspace_addr;
 	int user_alloc;
 };
@@ -168,6 +172,7 @@ int kvm_arch_set_memory_region(struct kv
				int user_alloc);
 gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn);
 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
+unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn);
 void kvm_release_page_clean(struct page *page);
 void kvm_release_page_dirty(struct page *page);
 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
Index: kvm.largepages/virt/kvm/kvm_main.c
===================================================================
--- kvm.largepages.orig/virt/kvm/kvm_main.c
+++ kvm.largepages/virt/kvm/kvm_main.c
@@ -189,9 +189,13 @@ static void kvm_free_physmem_slot(struct
 	if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
 		vfree(free->dirty_bitmap);
 
+	if (!dont || free->lpage_info != dont->lpage_info)
+		vfree(free->lpage_info);
+
 	free->npages = 0;
 	free->dirty_bitmap = NULL;
 	free->rmap = NULL;
+	free->lpage_info = NULL;
 }
 
 void kvm_free_physmem(struct kvm *kvm)
@@ -301,6 +305,22 @@ int __kvm_set_memory_region(struct kvm *
 		new.user_alloc = user_alloc;
 		new.userspace_addr = mem->userspace_addr;
 	}
+	if (npages && !new.lpage_info) {
+		int largepages = npages / KVM_PAGES_PER_HPAGE;
+		if (npages % KVM_PAGES_PER_HPAGE)
+			largepages++;
+		new.lpage_info = vmalloc(largepages * sizeof(*new.lpage_info));
+
+		if (!new.lpage_info)
+			goto out_free;
+
+		memset(new.lpage_info, 0, largepages * sizeof(*new.lpage_info));
+		/* large page crosses memslot boundary */
+		if (npages % KVM_PAGES_PER_HPAGE) {
+			new.lpage_info[0].write_count = 1;
+			new.lpage_info[largepages-1].write_count = 1;
+		}
+	}
 
 	/* Allocate page dirty bitmap if needed */
 	if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
@@ -444,7 +464,7 @@ int kvm_is_visible_gfn(struct kvm *kvm,
 }
 EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
 
-static unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
+unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
 {
 	struct kvm_memory_slot *slot;
 
@@ -454,6 +474,7 @@ static unsigned long gfn_to_hva(struct k
 		return bad_hva();
 	return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE);
 }
+EXPORT_SYMBOL(gfn_to_hva);
 
 /*
  * Requires current->mm->mmap_sem to be held

--nFreZHaLTZJo0R7j
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename="qemu-largepage-hack.patch"

Index: kvm-userspace/qemu/vl.c
===================================================================
--- kvm-userspace.orig/qemu/vl.c
+++ kvm-userspace/qemu/vl.c
@@ -8501,6 +8501,31 @@ void qemu_get_launch_info(int *argc, cha
     *opt_incoming = incoming;
 }
 
+#define HPAGE_SIZE 2*1024*1024
+
+void *alloc_huge_area(unsigned long memory)
+{
+    void *area;
+    int fd;
+    char path[] = "/mnt/kvm.XXXXXX";
+
+    mkstemp(path);
+    fd = open(path, O_RDWR);
+    if (fd < 0) {
+        perror("open");
+        exit(0);
+    }
+    memory = (memory+HPAGE_SIZE-1) & ~(HPAGE_SIZE-1);
+
+    area = mmap(0, memory, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
+    if (area == MAP_FAILED) {
+        perror("mmap");
+        exit(0);
+    }
+
+    return area;
+}
+
 int main(int argc, char **argv)
 {
 #ifdef CONFIG_GDBSTUB
@@ -9330,9 +9355,9 @@ int main(int argc, char **argv)
 
     ret = kvm_qemu_check_extension(KVM_CAP_USER_MEMORY);
     if (ret) {
-        printf("allocating %d MB\n", phys_ram_size/1024/1024);
-        phys_ram_base = qemu_vmalloc(phys_ram_size);
-        if (!phys_ram_base) {
+        //phys_ram_base = qemu_vmalloc(phys_ram_size);
+        phys_ram_base = alloc_huge_area(phys_ram_size);
+        if (!phys_ram_base) {
             fprintf(stderr, "Could not allocate physical memory\n");
             exit(1);
         }

--nFreZHaLTZJo0R7j
Content-Type: text/plain; charset="us-ascii"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
Content-Disposition: inline

_______________________________________________
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel

--nFreZHaLTZJo0R7j--