From mboxrd@z Thu Jan 1 00:00:00 1970 From: Avi Kivity Subject: Re: [kvm-devel] performance with guests running 2.4 kernels (specifically RHEL3) Date: Tue, 20 May 2008 17:19:23 +0300 Message-ID: <4832DDEB.4000100@qumranet.com> References: <48054518.3000104@cisco.com> <4805BCF1.6040605@qumranet.com> <4807BD53.6020304@cisco.com> <48085485.3090205@qumranet.com> <480C188F.3020101@cisco.com> <480C5C39.4040300@qumranet.com> <480E492B.3060500@cisco.com> <480EEDA0.3080209@qumranet.com> <480F546C.2030608@cisco.com> <481215DE.3000302@cisco.com> <20080428181550.GA3965@dmt> <4816617F.3080403@cisco.com> <4817F30C.6050308@cisco.com> <48184228.2020701@qumranet.com> <481876A9.1010806@cisco.com> <48187903.2070409@qumranet.com> <4826E744.1080107@qumranet.com> <4826F668.6030305@qumranet.com> <48290FC2.4070505@cisco.com> <48294272.5020801@qumranet.com> <482B4D29.7010202@cisco.com> <482C1633.5070302@qumranet.com> <482E5F9C.6000207@cisco.com> <482FCEE1.5040306@qumranet.com> <4830F90A.1020809@cisco.com> <4830FE8D.6010006@cisco.com> <48318E64.8090706@qumranet.com> Mime-Version: 1.0 Content-Type: multipart/mixed; boundary="------------040601030809090006090604" Cc: kvm@vger.kernel.org To: "David S. Ahern" Return-path: Received: from bzq-179-150-194.static.bezeqint.net ([212.179.150.194]:29894 "EHLO il.qumranet.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1753903AbYETOT0 (ORCPT ); Tue, 20 May 2008 10:19:26 -0400 In-Reply-To: <48318E64.8090706@qumranet.com> Sender: kvm-owner@vger.kernel.org List-ID: This is a multi-part message in MIME format. 
--------------040601030809090006090604 Content-Type: text/plain; charset=ISO-8859-1; format=flowed Content-Transfer-Encoding: 7bit Avi Kivity wrote: > > There are (at least) three options available: > - detect and special-case this scenario > - change the flood detector to be per page table instead of per vcpu > - change the flood detector to look at a list of recently used page > tables instead of the last page table > > I'm having a hard time trying to pick between the second and third > options. > The answer turns out to be "yes", so here's a patch that adds a pte access history table for each shadowed guest page-table. Let me know if it helps. Benchmarking a variety of workloads on all guests supported by kvm is left as an exercise for the reader, but I suspect the patch will either improve things all around, or can be modified to do so. -- error compiling committee.c: too many arguments to function --------------040601030809090006090604 Content-Type: text/x-patch; name="per-page-pte-history.patch" Content-Transfer-Encoding: 7bit Content-Disposition: inline; filename="per-page-pte-history.patch" diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 154727d..1a3d01a 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1130,7 +1130,8 @@ unshadowed: if (speculative) { vcpu->arch.last_pte_updated = shadow_pte; vcpu->arch.last_pte_gfn = gfn; - } + } else + page_header(__pa(shadow_pte))->pte_history_len = 0; } static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) @@ -1616,13 +1617,6 @@ static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, u64 old, u64 new) kvm_mmu_flush_tlb(vcpu); } -static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu) -{ - u64 *spte = vcpu->arch.last_pte_updated; - - return !!(spte && (*spte & shadow_accessed_mask)); -} - static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, int bytes) { @@ -1679,13 +1673,49 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, 
static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn) { u64 *spte = vcpu->arch.last_pte_updated; + struct kvm_mmu_page *page; + + if (spte && vcpu->arch.last_pte_gfn == gfn) { + page = page_header(__pa(spte)); + page->pte_history_len = 0; + pgprintk("clearing page history, gfn %x ent %lx\n", + page->gfn, spte - page->spt); + } +} + +static bool kvm_mmu_page_flooded(struct kvm_mmu_page *page) +{ + int i, j, ent, len; - if (spte - && vcpu->arch.last_pte_gfn == gfn - && shadow_accessed_mask - && !(*spte & shadow_accessed_mask) - && is_shadow_present_pte(*spte)) - set_bit(PT_ACCESSED_SHIFT, spte); + len = page->pte_history_len; + for (i = len; i != 0; --i) { + ent = page->pte_history[i - 1]; + if (test_bit(PT_ACCESSED_SHIFT, &page->spt[ent])) { + for (j = i; j < len; ++j) + page->pte_history[j-i] = page->pte_history[j]; + page->pte_history_len = len - i; + return false; + } + } + if (page->pte_history_len < KVM_MAX_PTE_HISTORY) + return false; + return true; +} + +static void kvm_mmu_log_pte_history(struct kvm_mmu_page *page, u64 *spte) +{ + int i; + unsigned ent = spte - page->spt; + + if (page->pte_history_len > 0 + && page->pte_history[page->pte_history_len - 1] == ent) + return; + if (page->pte_history_len == KVM_MAX_PTE_HISTORY) { + for (i = 1; i < KVM_MAX_PTE_HISTORY; ++i) + page->pte_history[i-1] = page->pte_history[i]; + --page->pte_history_len; + } + page->pte_history[page->pte_history_len++] = ent; } void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, @@ -1704,7 +1734,6 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned misaligned; unsigned quadrant; int level; - int flooded = 0; int npte; int r; @@ -1715,16 +1744,6 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, kvm_mmu_free_some_pages(vcpu); ++vcpu->kvm->stat.mmu_pte_write; kvm_mmu_audit(vcpu, "pre pte write"); - if (gfn == vcpu->arch.last_pt_write_gfn - && !last_updated_pte_accessed(vcpu)) { - ++vcpu->arch.last_pt_write_count; - if 
(vcpu->arch.last_pt_write_count >= 3) - flooded = 1; - } else { - vcpu->arch.last_pt_write_gfn = gfn; - vcpu->arch.last_pt_write_count = 1; - vcpu->arch.last_pte_updated = NULL; - } index = kvm_page_table_hashfn(gfn); bucket = &vcpu->kvm->arch.mmu_page_hash[index]; hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) { @@ -1733,7 +1752,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8; misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); misaligned |= bytes < 4; - if (misaligned || flooded) { + if (misaligned || kvm_mmu_page_flooded(sp)) { /* * Misaligned accesses are too much trouble to fix * up; also, they usually indicate a page is not used @@ -1785,6 +1804,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, mmu_pte_write_zap_pte(vcpu, sp, spte); if (new) mmu_pte_write_new_pte(vcpu, sp, spte, new); + kvm_mmu_log_pte_history(sp, spte); mmu_pte_write_flush_tlb(vcpu, entry, *spte); ++spte; } diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h index a71f3aa..cbe550e 100644 --- a/include/asm-x86/kvm_host.h +++ b/include/asm-x86/kvm_host.h @@ -78,6 +78,7 @@ #define KVM_MIN_FREE_MMU_PAGES 5 #define KVM_REFILL_PAGES 25 #define KVM_MAX_CPUID_ENTRIES 40 +#define KVM_MAX_PTE_HISTORY 4 extern spinlock_t kvm_lock; extern struct list_head vm_list; @@ -189,6 +190,9 @@ struct kvm_mmu_page { u64 *parent_pte; /* !multimapped */ struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */ }; + + u16 pte_history_len; + u16 pte_history[KVM_MAX_PTE_HISTORY]; }; /* --------------040601030809090006090604--