public inbox for kvm@vger.kernel.org
* [PATCH 7/8] KVM: MMU: VMX cr3 cache support
@ 2008-03-02 16:31 Avi Kivity
From: Avi Kivity @ 2008-03-02 16:31 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm-devel, Marcelo Tosatti

From: Marcelo Tosatti <mtosatti@redhat.com>

Add support for the cr3 cache feature on Intel VMX CPUs. This avoids a
vmexit on context switch whenever the new cr3 value is already cached
in one of the entries (currently four are present).
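
To illustrate the mechanism (a simplified sketch only, reusing the
names of kvm_cr3_cache_sync() from the vmx.c hunk below; the real
resync logic is in the patch itself):

	/*
	 * After a vmexit, GUEST_CR3 may name any of the cached roots,
	 * because the guest can switch cr3 without exiting.  Find the
	 * slot it switched to and bring the vcpu state back in sync.
	 * Returns 0 if the value was cached, -1 for the slow path.
	 */
	static int cr3_cache_lookup(struct kvm_vcpu *vcpu, u64 host_cr3)
	{
		int j;

		for (j = 0; j < vcpu->arch.cr3_cache_limit; j++) {
			struct kvm_cr3_cache_entry *e;

			e = &vcpu->arch.cr3_cache->entry[j];
			if (e->host_cr3 != host_cr3)
				continue;
			vcpu->arch.cr3 = e->guest_cr3;	/* guest view */
			vcpu->arch.cr3_cache_idx = j;	/* active slot */
			return 0;
		}
		return -1;
	}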

This is especially important for Xenner, where each guest syscall
involves a cr3 switch.
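
For reference, a guest would enable the feature roughly along these
lines (hypothetical guest-side code, not part of this patch; Xenner's
actual sequence may differ):

	#include <linux/errno.h>
	#include <linux/gfp.h>
	#include <asm/kvm_para.h>	/* kvm_arch_para_features() */
	#include <asm/msr.h>		/* wrmsrl() */

	static struct kvm_cr3_cache *cr3_cache;

	static int kvm_cr3_cache_init(void)
	{
		unsigned long page;

		if (!(kvm_arch_para_features() &
		      (1 << KVM_FEATURE_CR3_CACHE)))
			return -ENODEV;

		page = get_zeroed_page(GFP_KERNEL);
		if (!page)
			return -ENOMEM;

		/* The host expects a page-aligned guest physical address. */
		wrmsrl(KVM_MSR_SET_CR3_CACHE, __pa(page));

		/* The host fills in ->max_idx (number of usable slots). */
		cr3_cache = (struct kvm_cr3_cache *)page;
		return 0;
	}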

v1->v2:
- handle the race where the cache is cleared while the guest is in the
middle of kvm_write_cr3, by injecting a #GP and trapping it so that
the guest falls back to the hypercall variant (suggested by Avi).

v2->v3:
- one ioctl per paravirt feature

v3->v4:
- disable if tdp enabled

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/mmu.c         |  196 +++++++++++++++++++++++++++++++-------------
 arch/x86/kvm/mmu.h         |    3 +-
 arch/x86/kvm/paging_tmpl.h |    4 +-
 arch/x86/kvm/svm.c         |    6 ++
 arch/x86/kvm/vmx.c         |  152 +++++++++++++++++++++++++++++++++-
 arch/x86/kvm/x86.c         |    9 ++-
 include/asm-x86/kvm_host.h |    9 ++-
 include/asm-x86/kvm_para.h |   21 +++++
 include/linux/kvm.h        |    1 +
 9 files changed, 332 insertions(+), 69 deletions(-)
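
For completeness, userspace can probe for the feature with the usual
capability check (illustrative snippet only; KVM_CHECK_EXTENSION is
the existing mechanism, KVM_CAP_CR3_CACHE is added by this patch):

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	int main(void)
	{
		int kvm = open("/dev/kvm", O_RDWR);

		if (kvm < 0)
			return 1;
		/* Nonzero only without TDP (EPT/NPT) and on VMX hosts. */
		if (ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_CR3_CACHE) > 0)
			printf("cr3 cache supported\n");
		return 0;
	}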

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 14de7dc..11bca62 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -272,6 +272,16 @@ static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
 	return 0;
 }
 
+static void kvm_cr3_cache_clear(struct kvm_vcpu *vcpu)
+{
+	struct kvm_cr3_cache *cache;
+
+	if (!vcpu->arch.cr3_cache)
+		return;
+	cache = vcpu->arch.cr3_cache;
+	memset(cache->entry, 0, sizeof(cache->entry));
+}
+
 static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
 {
 	while (mc->nobjs)
@@ -1127,7 +1137,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
 			   int largepage, gfn_t gfn, struct page *page,
 			   int level)
 {
-	hpa_t table_addr = vcpu->arch.mmu.root_hpa;
+	hpa_t table_addr = vcpu->arch.mmu.root_hpa[vcpu->arch.cr3_cache_idx];
 	int pt_write = 0;
 
 	for (; ; level--) {
@@ -1219,53 +1229,75 @@ static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
 
 static void mmu_free_roots(struct kvm_vcpu *vcpu)
 {
-	int i;
+	int i, j;
 	struct kvm_mmu_page *sp;
 
-	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
-		return;
+	/*
+	 * Skip to the next cr3 filter entry and free it (if it's occupied).
+	 */
+	vcpu->arch.cr3_cache_idx++;
+	if (unlikely(vcpu->arch.cr3_cache_idx >= vcpu->arch.cr3_cache_limit))
+		vcpu->arch.cr3_cache_idx = 0;
+
+	j = vcpu->arch.cr3_cache_idx;
+	/*
+	 * Clear the guest-visible entry.
+	 */
+	if (vcpu->arch.cr3_cache) {
+		vcpu->arch.cr3_cache->entry[j].guest_cr3 = 0;
+		vcpu->arch.cr3_cache->entry[j].host_cr3 = 0;
+	}
 	spin_lock(&vcpu->kvm->mmu_lock);
 #ifdef CONFIG_X86_64
 	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
-		hpa_t root = vcpu->arch.mmu.root_hpa;
+		hpa_t root = vcpu->arch.mmu.root_hpa[j];
+
+		if (!VALID_PAGE(root)) {
+			spin_unlock(&vcpu->kvm->mmu_lock);
+			return;
+		}
 
 		sp = page_header(root);
 		--sp->root_count;
 		if (!sp->root_count && sp->role.invalid)
 			kvm_mmu_zap_page(vcpu->kvm, sp);
-		vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+		vcpu->arch.mmu.root_hpa[j] = INVALID_PAGE;
 		spin_unlock(&vcpu->kvm->mmu_lock);
 		return;
 	}
 #endif
-	for (i = 0; i < 4; ++i) {
-		hpa_t root = vcpu->arch.mmu.pae_root[i];
-
-		if (root) {
-			root &= PT64_BASE_ADDR_MASK;
-			sp = page_header(root);
-			--sp->root_count;
-			if (!sp->root_count && sp->role.invalid)
-				kvm_mmu_zap_page(vcpu->kvm, sp);
+	ASSERT(vcpu->arch.mmu.pae_root[j]);
+	if (VALID_PAGE(vcpu->arch.mmu.pae_root[j][0])) {
+		for (i = 0; i < 4; ++i) {
+			hpa_t root = vcpu->arch.mmu.pae_root[j][i];
+
+			if (root) {
+				root &= PT64_BASE_ADDR_MASK;
+				sp = page_header(root);
+				--sp->root_count;
+				if (!sp->root_count && sp->role.invalid)
+					kvm_mmu_zap_page(vcpu->kvm, sp);
+			}
+			vcpu->arch.mmu.pae_root[j][i] = INVALID_PAGE;
 		}
-		vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
 	}
 	spin_unlock(&vcpu->kvm->mmu_lock);
-	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+	vcpu->arch.mmu.root_hpa[j] = INVALID_PAGE;
 }
 
 static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
 {
-	int i;
+	int i, j;
 	gfn_t root_gfn;
 	struct kvm_mmu_page *sp;
 	int metaphysical = 0;
 
 	root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;
+	j = vcpu->arch.cr3_cache_idx;
 
 #ifdef CONFIG_X86_64
 	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
-		hpa_t root = vcpu->arch.mmu.root_hpa;
+		hpa_t root = vcpu->arch.mmu.root_hpa[j];
 
 		ASSERT(!VALID_PAGE(root));
 		if (tdp_enabled)
@@ -1275,7 +1307,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
 				      ACC_ALL, NULL);
 		root = __pa(sp->spt);
 		++sp->root_count;
-		vcpu->arch.mmu.root_hpa = root;
+		vcpu->arch.mmu.root_hpa[j] = root;
 		return;
 	}
 #endif
@@ -1283,7 +1315,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
 	if (tdp_enabled)
 		metaphysical = 1;
 	for (i = 0; i < 4; ++i) {
-		hpa_t root = vcpu->arch.mmu.pae_root[i];
+		hpa_t root = vcpu->arch.mmu.pae_root[j][i];
 
 		ASSERT(!VALID_PAGE(root));
 		if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
@@ -1299,9 +1331,9 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
 				      ACC_ALL, NULL);
 		root = __pa(sp->spt);
 		++sp->root_count;
-		vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
+		vcpu->arch.mmu.pae_root[j][i] = root | PT_PRESENT_MASK;
 	}
-	vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
+	vcpu->arch.mmu.root_hpa[j] = __pa(vcpu->arch.mmu.pae_root[j]);
 }
 
 static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
@@ -1321,7 +1353,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
 		return r;
 
 	ASSERT(vcpu);
-	ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
+	ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa[vcpu->arch.cr3_cache_idx]));
 
 	gfn = gva >> PAGE_SHIFT;
 
@@ -1367,12 +1399,19 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
 
 static void nonpaging_free(struct kvm_vcpu *vcpu)
 {
-	mmu_free_roots(vcpu);
+	int j;
+
+	/*
+	 * This will cycle through all existing roots and free them.
+	 */
+	for (j = 0; j < KVM_CR3_CACHE_SIZE; j++)
+		mmu_free_roots(vcpu);
 }
 
 static int nonpaging_init_context(struct kvm_vcpu *vcpu)
 {
 	struct kvm_mmu *context = &vcpu->arch.mmu;
+	int i;
 
 	context->new_cr3 = nonpaging_new_cr3;
 	context->page_fault = nonpaging_page_fault;
@@ -1381,7 +1420,8 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu)
 	context->prefetch_page = nonpaging_prefetch_page;
 	context->root_level = 0;
 	context->shadow_root_level = PT32E_ROOT_LEVEL;
-	context->root_hpa = INVALID_PAGE;
+	for (i = 0; i < KVM_CR3_CACHE_SIZE; i++)
+		context->root_hpa[i] = INVALID_PAGE;
 	return 0;
 }
 
@@ -1420,6 +1460,7 @@ static void paging_free(struct kvm_vcpu *vcpu)
 static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
 {
 	struct kvm_mmu *context = &vcpu->arch.mmu;
+	int i;
 
 	ASSERT(is_pae(vcpu));
 	context->new_cr3 = paging_new_cr3;
@@ -1429,7 +1470,8 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
 	context->free = paging_free;
 	context->root_level = level;
 	context->shadow_root_level = level;
-	context->root_hpa = INVALID_PAGE;
+	for (i = 0; i < KVM_CR3_CACHE_SIZE; i++)
+		context->root_hpa[i] = INVALID_PAGE;
 	return 0;
 }
 
@@ -1441,6 +1483,7 @@ static int paging64_init_context(struct kvm_vcpu *vcpu)
 static int paging32_init_context(struct kvm_vcpu *vcpu)
 {
 	struct kvm_mmu *context = &vcpu->arch.mmu;
+	int i;
 
 	context->new_cr3 = paging_new_cr3;
 	context->page_fault = paging32_page_fault;
@@ -1449,7 +1492,8 @@ static int paging32_init_context(struct kvm_vcpu *vcpu)
 	context->prefetch_page = paging32_prefetch_page;
 	context->root_level = PT32_ROOT_LEVEL;
 	context->shadow_root_level = PT32E_ROOT_LEVEL;
-	context->root_hpa = INVALID_PAGE;
+	for (i = 0; i < KVM_CR3_CACHE_SIZE; i++)
+		context->root_hpa[i] = INVALID_PAGE;
 	return 0;
 }
 
@@ -1461,13 +1505,15 @@ static int paging32E_init_context(struct kvm_vcpu *vcpu)
 static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 {
 	struct kvm_mmu *context = &vcpu->arch.mmu;
+	int i;
 
 	context->new_cr3 = nonpaging_new_cr3;
 	context->page_fault = tdp_page_fault;
 	context->free = nonpaging_free;
 	context->prefetch_page = nonpaging_prefetch_page;
 	context->shadow_root_level = TDP_ROOT_LEVEL;
-	context->root_hpa = INVALID_PAGE;
+	for (i = 0; i < KVM_CR3_CACHE_SIZE; i++)
+		context->root_hpa[i] = INVALID_PAGE;
 
 	if (!is_paging(vcpu)) {
 		context->gva_to_gpa = nonpaging_gva_to_gpa;
@@ -1489,7 +1535,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
 {
 	ASSERT(vcpu);
-	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa[vcpu->arch.cr3_cache_idx]));
 
 	if (!is_paging(vcpu))
 		return nonpaging_init_context(vcpu);
@@ -1511,11 +1557,14 @@ static int init_kvm_mmu(struct kvm_vcpu *vcpu)
 
 static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
 {
+	int j;
 	ASSERT(vcpu);
-	if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) {
-		vcpu->arch.mmu.free(vcpu);
-		vcpu->arch.mmu.root_hpa = INVALID_PAGE;
-	}
+
+	for (j = 0; j < KVM_CR3_CACHE_SIZE; j++)
+		if (VALID_PAGE(vcpu->arch.mmu.root_hpa[j])) {
+			vcpu->arch.mmu.free(vcpu);
+			vcpu->arch.mmu.root_hpa[j] = INVALID_PAGE;
+		}
 }
 
 int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
@@ -1528,6 +1577,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
 int kvm_mmu_load(struct kvm_vcpu *vcpu)
 {
 	int r;
+	int j = vcpu->arch.cr3_cache_idx;
 
 	r = mmu_topup_memory_caches(vcpu);
 	if (r)
@@ -1536,8 +1586,8 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
 	kvm_mmu_free_some_pages(vcpu);
 	mmu_alloc_roots(vcpu);
 	spin_unlock(&vcpu->kvm->mmu_lock);
-	kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
-	kvm_mmu_flush_tlb(vcpu);
+	/* setting CR3 will flush the TLB */
+	kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa[j]);
 out:
 	return r;
 }
@@ -1545,7 +1595,11 @@ EXPORT_SYMBOL_GPL(kvm_mmu_load);
 
 void kvm_mmu_unload(struct kvm_vcpu *vcpu)
 {
-	mmu_free_roots(vcpu);
+	int j;
+
+	kvm_cr3_cache_clear(vcpu);
+	for (j = 0; j < KVM_CR3_CACHE_SIZE; j++)
+		mmu_free_roots(vcpu);
 }
 
 static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
@@ -1727,6 +1781,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 			pgprintk("misaligned: gpa %llx bytes %d role %x\n",
 				 gpa, bytes, sp->role.word);
 			kvm_mmu_zap_page(vcpu->kvm, sp);
+			kvm_cr3_cache_clear(vcpu);
 			++vcpu->kvm->stat.mmu_flooded;
 			continue;
 		}
@@ -1788,6 +1843,8 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
 
 	spin_lock(&vcpu->kvm->mmu_lock);
 	r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+	if (r)
+		kvm_cr3_cache_clear(vcpu);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 	return r;
 }
@@ -1800,6 +1857,7 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
 		sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
 				  struct kvm_mmu_page, link);
 		kvm_mmu_zap_page(vcpu->kvm, sp);
+		kvm_cr3_cache_clear(vcpu);
 		++vcpu->kvm->stat.mmu_recycled;
 	}
 }
@@ -1850,19 +1908,24 @@ EXPORT_SYMBOL_GPL(kvm_enable_tdp);
 static void free_mmu_pages(struct kvm_vcpu *vcpu)
 {
 	struct kvm_mmu_page *sp;
+	int j;
 
 	while (!list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
 		sp = container_of(vcpu->kvm->arch.active_mmu_pages.next,
 				  struct kvm_mmu_page, link);
 		kvm_mmu_zap_page(vcpu->kvm, sp);
 	}
-	free_page((unsigned long)vcpu->arch.mmu.pae_root);
+	for (j = 0; j < KVM_CR3_CACHE_SIZE; j++) {
+		ASSERT(vcpu->arch.mmu.pae_root[j]);
+		free_page((unsigned long)vcpu->arch.mmu.pae_root[j]);
+		vcpu->arch.mmu.pae_root[j] = NULL;
+	}
 }
 
 static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
 {
 	struct page *page;
-	int i;
+	int i, j;
 
 	ASSERT(vcpu);
 
@@ -1872,17 +1935,23 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
 	else
 		vcpu->kvm->arch.n_free_mmu_pages =
 					vcpu->kvm->arch.n_alloc_mmu_pages;
-	/*
-	 * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
-	 * Therefore we need to allocate shadow page tables in the first
-	 * 4GB of memory, which happens to fit the DMA32 zone.
-	 */
-	page = alloc_page(GFP_KERNEL | __GFP_DMA32);
-	if (!page)
-		goto error_1;
-	vcpu->arch.mmu.pae_root = page_address(page);
-	for (i = 0; i < 4; ++i)
-		vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
+
+	for (j = 0; j < KVM_CR3_CACHE_SIZE; j++) {
+		/*
+		 * When emulating 32-bit mode, cr3 is only 32 bits even on
+		 * x86_64. Therefore we need to allocate shadow page tables
+		 * in the first 4GB of memory, which happens to fit the DMA32
+		 * zone.
+		 */
+		page = alloc_page(GFP_KERNEL | __GFP_DMA32);
+		if (!page)
+			goto error_1;
+
+		ASSERT(!vcpu->arch.mmu.pae_root[j]);
+		vcpu->arch.mmu.pae_root[j] = page_address(page);
+		for (i = 0; i < 4; ++i)
+			vcpu->arch.mmu.pae_root[j][i] = INVALID_PAGE;
+	}
 
 	return 0;
 
@@ -1894,7 +1963,7 @@ error_1:
 int kvm_mmu_create(struct kvm_vcpu *vcpu)
 {
 	ASSERT(vcpu);
-	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa[vcpu->arch.cr3_cache_idx]));
 
 	return alloc_mmu_pages(vcpu);
 }
@@ -1902,7 +1971,7 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
 int kvm_mmu_setup(struct kvm_vcpu *vcpu)
 {
 	ASSERT(vcpu);
-	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa[vcpu->arch.cr3_cache_idx]));
 
 	return init_kvm_mmu(vcpu);
 }
@@ -2091,6 +2160,15 @@ static int kvm_pv_mmu_op_one(struct kvm_vcpu *vcpu,
 			return 0;
 		return kvm_pv_mmu_release_pt(vcpu, rpt->pt_phys);
 	}
+	case KVM_MMU_OP_SET_CR3: {
+		struct kvm_mmu_op_set_cr3 *scr3;
+
+		scr3 = pv_mmu_read_buffer(buffer, sizeof *scr3);
+		if (!scr3)
+			return 0;
+		kvm_set_cr3(vcpu, scr3->cr3);
+		return 1;
+	}
 	default: return 0;
 	}
 }
@@ -2188,15 +2266,18 @@ static void audit_mappings(struct kvm_vcpu *vcpu)
 {
-	unsigned i;
+	unsigned i, j;
 
-	if (vcpu->arch.mmu.root_level == 4)
-		audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4);
-	else
+	if (vcpu->arch.mmu.root_level == 4) {
+		j = vcpu->arch.cr3_cache_idx;
+		audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa[j], 0, 4);
+		return;
+	}
+	for (j = 0; j < KVM_CR3_CACHE_SIZE; j++) {
 		for (i = 0; i < 4; ++i)
-			if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK)
+			if (vcpu->arch.mmu.pae_root[j][i] & PT_PRESENT_MASK)
 				audit_mappings_page(vcpu,
-						    vcpu->arch.mmu.pae_root[i],
-						    i << 30,
-						    2);
+						  vcpu->arch.mmu.pae_root[j][i],
+						  i << 30, 2);
+	}
 }
 
 static int count_rmaps(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index e64e9f5..77f6882 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -17,7 +17,9 @@ static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
 
 static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
 {
-	if (likely(vcpu->arch.mmu.root_hpa != INVALID_PAGE))
+	int idx = vcpu->arch.cr3_cache_idx;
+
+	if (likely(vcpu->arch.mmu.root_hpa[idx] != INVALID_PAGE))
 		return 0;
 
 	return kvm_mmu_load(vcpu);
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 17f9d16..3163c31 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -285,10 +285,10 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 	if (!is_present_pte(walker->ptes[walker->level - 1]))
 		return NULL;
 
-	shadow_addr = vcpu->arch.mmu.root_hpa;
+	shadow_addr = vcpu->arch.mmu.root_hpa[vcpu->arch.cr3_cache_idx];
 	level = vcpu->arch.mmu.shadow_root_level;
 	if (level == PT32E_ROOT_LEVEL) {
-		shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
+		shadow_addr = vcpu->arch.mmu.pae_root[vcpu->arch.cr3_cache_idx][(addr >> 30) & 3];
 		shadow_addr &= PT64_BASE_ADDR_MASK;
 		--level;
 	}
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 28ad3c4..7b774b0 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1801,6 +1801,11 @@ static bool svm_cpu_has_accelerated_tpr(void)
 	return false;
 }
 
+static int cpu_has_cr3_cache(void)
+{
+	return 0;
+}
+
 static struct kvm_x86_ops svm_x86_ops = {
 	.cpu_has_kvm_support = has_svm,
 	.disabled_by_bios = is_disabled,
@@ -1810,6 +1815,7 @@ static struct kvm_x86_ops svm_x86_ops = {
 	.hardware_enable = svm_hardware_enable,
 	.hardware_disable = svm_hardware_disable,
 	.cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
+	.cpu_has_cr3_cache = cpu_has_cr3_cache,
 
 	.vcpu_create = svm_create_vcpu,
 	.vcpu_free = svm_free_vcpu,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 46e0e58..44b1ae0 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -215,6 +215,11 @@ static inline int cpu_has_vmx_vpid(void)
 	return (vmcs_config.cpu_based_2nd_exec_ctrl &
 		SECONDARY_EXEC_ENABLE_VPID);
 }
+
+static inline int cpu_has_cr3_cache(void)
+{
+	return 1;
+}
 
 static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
 {
@@ -785,6 +789,30 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 	return 0;
 }
 
+static int vmx_cr3_cache_msr(struct kvm_vcpu *vcpu, u64 data)
+{
+	struct page *page;
+	hva_t cr3_cache_hva;
+
+	if (data != PAGE_ALIGN(data) || vcpu->arch.cr3_cache)
+		return -EINVAL;
+
+	down_read(&current->mm->mmap_sem);
+	page = gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
+	up_read(&current->mm->mmap_sem);
+
+	if (is_error_page(page)) {
+		kvm_release_page_clean(page);
+		return -EINVAL;
+	}
+
+	cr3_cache_hva = (hva_t)__va(page_to_phys(page));
+	vcpu->arch.cr3_cache = (void *)cr3_cache_hva;
+	vcpu->arch.cr3_cache->max_idx = vcpu->arch.cr3_cache_limit;
+
+	return 0;
+}
+
 /*
  * Writes msr value into the appropriate "register".
  * Returns 0 on success, non-0 otherwise.
@@ -824,6 +852,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 	case MSR_IA32_TIME_STAMP_COUNTER:
 		guest_write_tsc(data);
 		break;
+	case KVM_MSR_SET_CR3_CACHE:
+		ret = vmx_cr3_cache_msr(vcpu, data);
+		break;
 	default:
 		msr = find_msr_entry(vmx, msr_index);
 		if (msr) {
@@ -1322,10 +1353,23 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 
 static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 {
+	struct kvm_cr3_cache *cache;
+	int idx;
+
 	vmx_flush_tlb(vcpu);
 	vmcs_writel(GUEST_CR3, cr3);
 	if (vcpu->arch.cr0 & X86_CR0_PE)
 		vmx_fpu_deactivate(vcpu);
+
+	if (!vcpu->arch.cr3_cache)
+		return;
+
+	idx = vcpu->arch.cr3_cache_idx;
+	cache = vcpu->arch.cr3_cache;
+
+	cache->entry[idx].host_cr3 = cr3;
+	cache->entry[idx].guest_cr3 = vcpu->arch.cr3;
+	vmcs_writel(CR3_TARGET_VALUE0 + idx * 2, cr3);
 }
 
 static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
@@ -1505,6 +1549,40 @@ out:
 	up_read(&current->mm->mmap_sem);
 	return ret;
 }
+
+/*
+ * Set up the hardware cr3 target-value cache.
+ */
+static void vmcs_setup_cr3_cache(struct kvm_vcpu *vcpu)
+{
+	unsigned int cr3_target_values, i;
+	u64 msr_val;
+
+	rdmsrl(MSR_IA32_VMX_MISC, msr_val);
+
+	printk("MSR_IA32_VMX_MISC: %016Lx\n", msr_val);
+
+	/*
+	 * 9 bits of "CR3 target values":
+	 */
+	cr3_target_values = (msr_val >> 16) & ((1 << 10) - 1);
+	printk(" cr3 target values: %d\n", cr3_target_values);
+	if (cr3_target_values > KVM_CR3_CACHE_SIZE) {
+		printk("KVM: limiting cr3 cache size from %d to %d\n",
+			cr3_target_values, KVM_CR3_CACHE_SIZE);
+		cr3_target_values = KVM_CR3_CACHE_SIZE;
+	}
+
+	vcpu->arch.cr3_cache_idx = 0;
+	vcpu->arch.cr3_cache_limit = cr3_target_values;
+	/*
+	 * Initialize. TODO: set this to guest physical memory.
+	 */
+	for (i = 0; i < cr3_target_values; i++)
+		vmcs_writel(CR3_TARGET_VALUE0 + i * 2, -1UL);
+
+	vmcs_write32(CR3_TARGET_COUNT, cr3_target_values);
+}
 
 static void seg_setup(int seg)
 {
@@ -1601,7 +1678,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 
 	vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf);
 	vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf);
-	vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */
+	vmcs_setup_cr3_cache(&vmx->vcpu);
 
 	vmcs_writel(HOST_CR0, read_cr0());  /* 22.2.3 */
 	vmcs_writel(HOST_CR4, read_cr4());  /* 22.2.3, 22.2.5 */
@@ -2032,9 +2109,13 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 			skip_emulated_instruction(vcpu);
 			return 1;
 		case 3:
-			vcpu_load_rsp_rip(vcpu);
-			kvm_set_cr3(vcpu, vcpu->arch.regs[reg]);
-			skip_emulated_instruction(vcpu);
+			if (!vcpu->arch.cr3_cache) {
+				vcpu_load_rsp_rip(vcpu);
+				kvm_set_cr3(vcpu, vcpu->arch.regs[reg]);
+				skip_emulated_instruction(vcpu);
+			} else {
+				kvm_inject_gp(vcpu, 0);
+			}
 			return 1;
 		case 4:
 			vcpu_load_rsp_rip(vcpu);
@@ -2395,6 +2475,57 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx)
 		| vmx->rmode.irq.vector;
 }
 
+static void kvm_cr3_cache_sync(struct kvm_vcpu *vcpu)
+{
+	void *guest_cr3_hva;
+	hpa_t guest_cr3_hpa;
+	struct kvm_cr3_cache *cache;
+	int j;
+	int idx = vcpu->arch.cr3_cache_idx;
+
+	if (!vcpu->arch.cr3_cache)
+		return;
+
+	guest_cr3_hpa = vmcs_readl(GUEST_CR3);
+	/*
+	 * Are they in sync already?
+	 */
+	if (guest_cr3_hpa == vcpu->arch.mmu.root_hpa[idx])
+		return;
+
+	cache = vcpu->arch.cr3_cache;
+#ifdef CONFIG_X86_64
+	if (vcpu->arch.mmu.shadow_root_level == 4) {
+		for (j = 0; j < vcpu->arch.cr3_cache_limit; j++) {
+			hpa_t root = cache->entry[j].host_cr3;
+			if (root != guest_cr3_hpa)
+				continue;
+			vcpu->arch.cr3 = cache->entry[j].guest_cr3;
+			vcpu->arch.cr3_cache_idx = j;
+			vcpu->arch.mmu.root_hpa[j] = cache->entry[j].host_cr3;
+			++vcpu->stat.cr3_cache_synced;
+			return;
+		}
+		WARN_ON(j == vcpu->arch.cr3_cache_limit);
+		return;
+	}
+#endif
+
+	guest_cr3_hva = __va(guest_cr3_hpa);
+	for (j = 0; j < vcpu->arch.cr3_cache_limit; j++) {
+		u64 *root = vcpu->arch.mmu.pae_root[j];
+		WARN_ON(!root);
+		if (root != guest_cr3_hva)
+			continue;
+		vcpu->arch.cr3 = cache->entry[j].guest_cr3;
+		vcpu->arch.cr3_cache_idx = j;
+		vcpu->arch.mmu.root_hpa[j] = __pa(vcpu->arch.mmu.pae_root[j]);
+		++vcpu->stat.cr3_cache_synced;
+		return;
+	}
+	WARN_ON(j == vcpu->arch.cr3_cache_limit);
+}
+
 static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -2405,6 +2535,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	 */
 	vmcs_writel(HOST_CR0, read_cr0());
 
+	WARN_ON(vmcs_readl(GUEST_CR3) != vcpu->arch.mmu.root_hpa[vcpu->arch.cr3_cache_idx]);
+
 	asm(
 		/* Store host registers */
 #ifdef CONFIG_X86_64
@@ -2519,6 +2651,12 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		, "ebx", "edi", "rsi"
 #endif
 	      );
+	/*
+	 * Figure out whether vcpu->cr3 needs updating because
+	 * the guest made use of the cr3 cache.
+	 */
+	kvm_cr3_cache_sync(vcpu);
+	WARN_ON(vmcs_readl(GUEST_CR3) != vcpu->arch.mmu.root_hpa[vcpu->arch.cr3_cache_idx]);
 
 	vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
 	if (vmx->rmode.irq.pending)
@@ -2551,11 +2689,16 @@ static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
 static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct page *page = NULL;
 
 	spin_lock(&vmx_vpid_lock);
 	if (vmx->vpid != 0)
 		__clear_bit(vmx->vpid, vmx_vpid_bitmap);
 	spin_unlock(&vmx_vpid_lock);
+	if (vcpu->arch.cr3_cache) {
+		page = virt_to_page(vcpu->arch.cr3_cache);
+		kvm_release_page_dirty(page);
+	}
 	vmx_free_vmcs(vcpu);
 	kfree(vmx->host_msrs);
 	kfree(vmx->guest_msrs);
@@ -2643,6 +2786,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.hardware_enable = hardware_enable,
 	.hardware_disable = hardware_disable,
 	.cpu_has_accelerated_tpr = cpu_has_vmx_virtualize_apic_accesses,
+	.cpu_has_cr3_cache = cpu_has_cr3_cache,
 
 	.vcpu_create = vmx_create_vcpu,
 	.vcpu_free = vmx_free_vcpu,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 92a51d3..19cceb2 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -80,6 +80,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "fpu_reload", VCPU_STAT(fpu_reload) },
 	{ "insn_emulation", VCPU_STAT(insn_emulation) },
 	{ "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
+	{ "cr3_cached_synced", VCPU_STAT(cr3_cache_synced) },
 	{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
 	{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
 	{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
@@ -820,6 +821,9 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_PV_MMU:
 		r = !tdp_enabled;
 		break;
+	case KVM_CAP_CR3_CACHE:
+		r = !tdp_enabled && kvm_x86_ops->cpu_has_cr3_cache();
+		break;
 	default:
 		r = 0;
 		break;
@@ -3298,12 +3302,13 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 {
 	struct page *page;
 	struct kvm *kvm;
-	int r;
+	int r, i;
 
 	BUG_ON(vcpu->kvm == NULL);
 	kvm = vcpu->kvm;
 
-	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+	for (i = 0; i < KVM_CR3_CACHE_SIZE; i++)
+		vcpu->arch.mmu.root_hpa[i] = INVALID_PAGE;
 	if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0)
 		vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
 	else
diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
index d20cabc..f3ca4f6 100644
--- a/include/asm-x86/kvm_host.h
+++ b/include/asm-x86/kvm_host.h
@@ -188,11 +188,11 @@ struct kvm_mmu {
 	gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva);
 	void (*prefetch_page)(struct kvm_vcpu *vcpu,
 			      struct kvm_mmu_page *page);
-	hpa_t root_hpa;
+	hpa_t root_hpa[KVM_CR3_CACHE_SIZE];
 	int root_level;
 	int shadow_root_level;
 
-	u64 *pae_root;
+	u64 *pae_root[KVM_CR3_CACHE_SIZE];
 };
 
 struct kvm_vcpu_arch {
@@ -206,6 +206,9 @@ struct kvm_vcpu_arch {
 	unsigned long cr0;
 	unsigned long cr2;
 	unsigned long cr3;
+	struct kvm_cr3_cache *cr3_cache;
+	unsigned int cr3_cache_idx;
+	unsigned int cr3_cache_limit;
 	unsigned long cr4;
 	unsigned long cr8;
 	u64 pdptrs[4]; /* pae */
@@ -338,6 +341,7 @@ struct kvm_vcpu_stat {
 	u32 insn_emulation;
 	u32 insn_emulation_fail;
 	u32 hypercalls;
+	u32 cr3_cache_synced;
 };
 
 struct descriptor_table {
@@ -354,6 +358,7 @@ struct kvm_x86_ops {
 	int (*hardware_setup)(void);               /* __init */
 	void (*hardware_unsetup)(void);            /* __exit */
 	bool (*cpu_has_accelerated_tpr)(void);
+	int (*cpu_has_cr3_cache)(void);
 
 	/* Create, but do not attach this VCPU */
 	struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id);
diff --git a/include/asm-x86/kvm_para.h b/include/asm-x86/kvm_para.h
index 5098459..67f2ad2 100644
--- a/include/asm-x86/kvm_para.h
+++ b/include/asm-x86/kvm_para.h
@@ -13,9 +13,11 @@
 #define KVM_FEATURE_CLOCKSOURCE		0
 #define KVM_FEATURE_NOP_IO_DELAY	1
 #define KVM_FEATURE_MMU_OP		2
+#define KVM_FEATURE_CR3_CACHE		3
 
 #define MSR_KVM_WALL_CLOCK  0x11
 #define MSR_KVM_SYSTEM_TIME 0x12
+#define KVM_MSR_SET_CR3_CACHE 0x13
 
 #define KVM_MAX_MMU_OP_BATCH           32
 
@@ -23,6 +26,7 @@
 #define KVM_MMU_OP_WRITE_PTE            1
 #define KVM_MMU_OP_FLUSH_TLB	        2
 #define KVM_MMU_OP_RELEASE_PT	        3
+#define KVM_MMU_OP_SET_CR3              4
 
 /* Payload for KVM_HC_MMU_OP */
 struct kvm_mmu_op_header {
@@ -45,6 +49,11 @@ struct kvm_mmu_op_release_pt {
 	__u64 pt_phys;
 };
 
+struct kvm_mmu_op_set_cr3 {
+	struct kvm_mmu_op_header header;
+	__u64 cr3;
+};
+
 #ifdef __KERNEL__
 #include <asm/processor.h>
 
@@ -157,4 +166,16 @@ static inline unsigned int kvm_arch_para_features(void)
 
 #endif
 
+#define KVM_CR3_CACHE_SIZE 4
+
+struct kvm_cr3_cache_entry {
+	__u64 guest_cr3;
+	__u64 host_cr3;
+};
+
+struct kvm_cr3_cache {
+	struct kvm_cr3_cache_entry entry[KVM_CR3_CACHE_SIZE];
+	__u32 max_idx;
+};
+
 #endif
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 074a107..2aebd29 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -238,6 +238,7 @@ struct kvm_vapic_addr {
 #define KVM_CAP_NR_MEMSLOTS 10   /* returns max memory slots per vm */
 #define KVM_CAP_NOP_IO_DELAY 11
 #define KVM_CAP_PV_MMU 12
+#define KVM_CAP_CR3_CACHE 13
 
 /*
  * ioctls for VM fds
-- 
1.5.4.2

