From mboxrd@z Thu Jan 1 00:00:00 1970 From: Avi Kivity Subject: Re: [kvm-devel] performance with guests running 2.4 kernels (specifically RHEL3) Date: Tue, 20 May 2008 17:19:23 +0300 Message-ID: <4832DDEB.4000100@qumranet.com> References: <48054518.3000104@cisco.com> <4805BCF1.6040605@qumranet.com> <4807BD53.6020304@cisco.com> <48085485.3090205@qumranet.com> <480C188F.3020101@cisco.com> <480C5C39.4040300@qumranet.com> <480E492B.3060500@cisco.com> <480EEDA0.3080209@qumranet.com> <480F546C.2030608@cisco.com> <481215DE.3000302@cisco.com> <20080428181550.GA3965@dmt> <4816617F.3080403@cisco.com> <4817F30C.6050308@cisco.com> <48184228.2020701@qumranet.com> <481876A9.1010806@cisco.com> <48187903.2070409@qumranet.com> <4826E744.1080107@qumranet.com> <4826F668.6030305@qumranet.com> <48290FC2.4070505@cisco.com> <48294272.5020801@qumranet.com> <482B4D29.7010202@cisco.com> <482C1633.5070302@qumranet.com> <482E5F9C.6000207@cisco.com> <482FCEE1.5040306@qumranet.com> <4830F90A.1020809@cisco.com> <4830FE8D.6010006@cisco.com> <48318E64.8090706@qumranet.com> Mime-Version: 1.0 Content-Type: multipart/mixed; boundary="------------040601030809090006090604" Cc: kvm@vger.kernel.org To: "David S. Ahern" Return-path: Received: from bzq-179-150-194.static.bezeqint.net ([212.179.150.194]:29894 "EHLO il.qumranet.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1753903AbYETOT0 (ORCPT ); Tue, 20 May 2008 10:19:26 -0400 In-Reply-To: <48318E64.8090706@qumranet.com> Sender: kvm-owner@vger.kernel.org List-ID: This is a multi-part message in MIME format. 
--------------040601030809090006090604 Content-Type: text/plain; charset=ISO-8859-1; format=flowed Content-Transfer-Encoding: 7bit Avi Kivity wrote: > > There are (at least) three options available: > - detect and special-case this scenario > - change the flood detector to be per page table instead of per vcpu > - change the flood detector to look at a list of recently used page > tables instead of the last page table > > I'm having a hard time trying to pick between the second and third > options. > The answer turns out to be "yes", so here's a patch that adds a pte access history table for each shadowed guest page-table. Let me know if it helps. Benchmarking a variety of workloads on all guests supported by kvm is left as an exercise for the reader, but I suspect the patch will either improve things all around, or can be modified to do so. -- error compiling committee.c: too many arguments to function --------------040601030809090006090604 Content-Type: text/x-patch; name="per-page-pte-history.patch" Content-Transfer-Encoding: 7bit Content-Disposition: inline; filename="per-page-pte-history.patch" diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 154727d..1a3d01a 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1130,7 +1130,8 @@ unshadowed: if (speculative) { vcpu->arch.last_pte_updated = shadow_pte; vcpu->arch.last_pte_gfn = gfn; - } + } else + page_header(__pa(shadow_pte))->pte_history_len = 0; } static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) @@ -1616,13 +1617,6 @@ static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, u64 old, u64 new) kvm_mmu_flush_tlb(vcpu); } -static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu) -{ - u64 *spte = vcpu->arch.last_pte_updated; - - return !!(spte && (*spte & shadow_accessed_mask)); -} - static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, int bytes) { @@ -1679,13 +1673,49 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, 
static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn) { u64 *spte = vcpu->arch.last_pte_updated; + struct kvm_mmu_page *page; + + if (spte && vcpu->arch.last_pte_gfn == gfn) { + page = page_header(__pa(spte)); + page->pte_history_len = 0; + pgprintk("clearing page history, gfn %x ent %lx\n", + page->gfn, spte - page->spt); + } +} + +static bool kvm_mmu_page_flooded(struct kvm_mmu_page *page) +{ + int i, j, ent, len; - if (spte - && vcpu->arch.last_pte_gfn == gfn - && shadow_accessed_mask - && !(*spte & shadow_accessed_mask) - && is_shadow_present_pte(*spte)) - set_bit(PT_ACCESSED_SHIFT, spte); + len = page->pte_history_len; + for (i = len; i != 0; --i) { + ent = page->pte_history[i - 1]; + if (test_bit(PT_ACCESSED_SHIFT, &page->spt[ent])) { + for (j = i; j < len; ++j) + page->pte_history[j-i] = page->pte_history[j]; + page->pte_history_len = len - i; + return false; + } + } + if (page->pte_history_len < KVM_MAX_PTE_HISTORY) + return false; + return true; +} + +static void kvm_mmu_log_pte_history(struct kvm_mmu_page *page, u64 *spte) +{ + int i; + unsigned ent = spte - page->spt; + + if (page->pte_history_len > 0 + && page->pte_history[page->pte_history_len - 1] == ent) + return; + if (page->pte_history_len == KVM_MAX_PTE_HISTORY) { + for (i = 1; i < KVM_MAX_PTE_HISTORY; ++i) + page->pte_history[i-1] = page->pte_history[i]; + --page->pte_history_len; + } + page->pte_history[page->pte_history_len++] = ent; } void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, @@ -1704,7 +1734,6 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned misaligned; unsigned quadrant; int level; - int flooded = 0; int npte; int r; @@ -1715,16 +1744,6 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, kvm_mmu_free_some_pages(vcpu); ++vcpu->kvm->stat.mmu_pte_write; kvm_mmu_audit(vcpu, "pre pte write"); - if (gfn == vcpu->arch.last_pt_write_gfn - && !last_updated_pte_accessed(vcpu)) { - ++vcpu->arch.last_pt_write_count; - if 
(vcpu->arch.last_pt_write_count >= 3) - flooded = 1; - } else { - vcpu->arch.last_pt_write_gfn = gfn; - vcpu->arch.last_pt_write_count = 1; - vcpu->arch.last_pte_updated = NULL; - } index = kvm_page_table_hashfn(gfn); bucket = &vcpu->kvm->arch.mmu_page_hash[index]; hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) { @@ -1733,7 +1752,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8; misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); misaligned |= bytes < 4; - if (misaligned || flooded) { + if (misaligned || kvm_mmu_page_flooded(sp)) { /* * Misaligned accesses are too much trouble to fix * up; also, they usually indicate a page is not used @@ -1785,6 +1804,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, mmu_pte_write_zap_pte(vcpu, sp, spte); if (new) mmu_pte_write_new_pte(vcpu, sp, spte, new); + kvm_mmu_log_pte_history(sp, spte); mmu_pte_write_flush_tlb(vcpu, entry, *spte); ++spte; } diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h index a71f3aa..cbe550e 100644 --- a/include/asm-x86/kvm_host.h +++ b/include/asm-x86/kvm_host.h @@ -78,6 +78,7 @@ #define KVM_MIN_FREE_MMU_PAGES 5 #define KVM_REFILL_PAGES 25 #define KVM_MAX_CPUID_ENTRIES 40 +#define KVM_MAX_PTE_HISTORY 4 extern spinlock_t kvm_lock; extern struct list_head vm_list; @@ -189,6 +190,9 @@ struct kvm_mmu_page { u64 *parent_pte; /* !multimapped */ struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */ }; + + u16 pte_history_len; + u16 pte_history[KVM_MAX_PTE_HISTORY]; }; /* --------------040601030809090006090604--