* [patch 1/5] KVM: add basic paravirt support
[not found] <20080216220924.733723618@redhat.com>
@ 2008-02-16 22:09 ` Marcelo Tosatti
2008-02-16 22:09 ` [patch 2/5] KVM: hypercall based pte updates and TLB flushes Marcelo Tosatti
` (7 subsequent siblings)
8 siblings, 0 replies; 21+ messages in thread
From: Marcelo Tosatti @ 2008-02-16 22:09 UTC (permalink / raw)
To: Avi Kivity; +Cc: kvm-devel, Marcelo Tosatti
[-- Attachment #1: kvm-paravirt-core --]
[-- Type: text/plain, Size: 7007 bytes --]
Add basic KVM paravirt support. Avoid vm-exits on IO delays.
Add KVM_GET_PARA_FEATURES ioctl so paravirt features can be reported via
cpuid.
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Anthony Liguori <aliguori@us.ibm.com>
Index: kvm.paravirt/arch/x86/Kconfig
===================================================================
--- kvm.paravirt.orig/arch/x86/Kconfig
+++ kvm.paravirt/arch/x86/Kconfig
@@ -372,6 +372,14 @@ config VMI
at the moment), by linking the kernel to a GPL-ed ROM module
provided by the hypervisor.
+config KVM_GUEST
+ bool "KVM Guest support"
+ select PARAVIRT
+ depends on !(X86_VISWS || X86_VOYAGER)
+ help
+ This option enables various optimizations for running under the KVM
+ hypervisor.
+
source "arch/x86/lguest/Kconfig"
config PARAVIRT
Index: kvm.paravirt/arch/x86/kernel/Makefile
===================================================================
--- kvm.paravirt.orig/arch/x86/kernel/Makefile
+++ kvm.paravirt/arch/x86/kernel/Makefile
@@ -69,6 +69,7 @@ obj-$(CONFIG_DEBUG_RODATA_TEST) += test_
obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o
obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o
+obj-$(CONFIG_KVM_GUEST) += kvm.o
obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o
ifdef CONFIG_INPUT_PCSPKR
Index: kvm.paravirt/arch/x86/kernel/kvm.c
===================================================================
--- /dev/null
+++ kvm.paravirt/arch/x86/kernel/kvm.c
@@ -0,0 +1,52 @@
+/*
+ * KVM paravirt_ops implementation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * Copyright (C) 2007, Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ * Copyright IBM Corporation, 2007
+ * Authors: Anthony Liguori <aliguori@us.ibm.com>
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/kvm_para.h>
+#include <linux/cpu.h>
+#include <linux/mm.h>
+
+/*
+ * No need for any "IO delay" on KVM
+ */
+static void kvm_io_delay(void)
+{
+}
+
+static void paravirt_ops_setup(void)
+{
+ pv_info.name = "KVM";
+ pv_info.paravirt_enabled = 1;
+
+ if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY))
+ pv_cpu_ops.io_delay = kvm_io_delay;
+
+}
+
+void __init kvm_guest_init(void)
+{
+ if (!kvm_para_available())
+ return;
+
+ paravirt_ops_setup();
+}
Index: kvm.paravirt/arch/x86/kernel/setup_32.c
===================================================================
--- kvm.paravirt.orig/arch/x86/kernel/setup_32.c
+++ kvm.paravirt/arch/x86/kernel/setup_32.c
@@ -46,6 +46,7 @@
#include <linux/pfn.h>
#include <linux/pci.h>
#include <linux/init_ohci1394_dma.h>
+#include <linux/kvm_para.h>
#include <video/edid.h>
@@ -779,6 +780,7 @@ void __init setup_arch(char **cmdline_p)
*/
vmi_init();
#endif
+ kvm_guest_init();
/*
* NOTE: before this point _nobody_ is allowed to allocate
Index: kvm.paravirt/arch/x86/kernel/setup_64.c
===================================================================
--- kvm.paravirt.orig/arch/x86/kernel/setup_64.c
+++ kvm.paravirt/arch/x86/kernel/setup_64.c
@@ -41,6 +41,7 @@
#include <linux/ctype.h>
#include <linux/uaccess.h>
#include <linux/init_ohci1394_dma.h>
+#include <linux/kvm_para.h>
#include <asm/mtrr.h>
#include <asm/uaccess.h>
@@ -447,6 +448,8 @@ void __init setup_arch(char **cmdline_p)
init_apic_mappings();
ioapic_init_mappings();
+ kvm_guest_init();
+
/*
* We trust e820 completely. No explicit ROM probing in memory.
*/
Index: kvm.paravirt/arch/x86/kvm/x86.c
===================================================================
--- kvm.paravirt.orig/arch/x86/kvm/x86.c
+++ kvm.paravirt/arch/x86/kvm/x86.c
@@ -696,6 +696,7 @@ int kvm_dev_ioctl_check_extension(long e
case KVM_CAP_USER_MEMORY:
case KVM_CAP_SET_TSS_ADDR:
case KVM_CAP_EXT_CPUID:
+ case KVM_CAP_PARA_FEATURES:
r = 1;
break;
case KVM_CAP_VAPIC:
@@ -761,6 +762,15 @@ long kvm_arch_dev_ioctl(struct file *fil
r = 0;
break;
}
+ case KVM_GET_PARA_FEATURES: {
+ __u32 para_features = KVM_PARA_FEATURES;
+
+ r = -EFAULT;
+ if (copy_to_user(argp, &para_features, sizeof para_features))
+ goto out;
+ r = 0;
+ break;
+ }
default:
r = -EINVAL;
}
Index: kvm.paravirt/include/asm-x86/kvm_para.h
===================================================================
--- kvm.paravirt.orig/include/asm-x86/kvm_para.h
+++ kvm.paravirt/include/asm-x86/kvm_para.h
@@ -5,6 +5,7 @@
* should be used to determine that a VM is running under KVM.
*/
#define KVM_CPUID_SIGNATURE 0x40000000
+#define KVM_FEATURE_NOP_IO_DELAY 0
/* This CPUID returns a feature bitmap in eax. Before enabling a particular
* paravirtualization, the appropriate feature bit should be checked.
@@ -14,6 +15,8 @@
#ifdef __KERNEL__
#include <asm/processor.h>
+#define KVM_PARA_FEATURES (1UL << KVM_FEATURE_NOP_IO_DELAY)
+
/* This instruction is vmcall. On non-VT architectures, it will generate a
* trap that we will then rewrite to the appropriate instruction.
*/
Index: kvm.paravirt/include/linux/kvm.h
===================================================================
--- kvm.paravirt.orig/include/linux/kvm.h
+++ kvm.paravirt/include/linux/kvm.h
@@ -222,6 +222,7 @@ struct kvm_vapic_addr {
*/
#define KVM_GET_VCPU_MMAP_SIZE _IO(KVMIO, 0x04) /* in bytes */
#define KVM_GET_SUPPORTED_CPUID _IOWR(KVMIO, 0x05, struct kvm_cpuid2)
+#define KVM_GET_PARA_FEATURES _IOR(KVMIO, 0x05, __u32)
/*
* Extension capability list.
@@ -233,6 +234,7 @@ struct kvm_vapic_addr {
#define KVM_CAP_SET_TSS_ADDR 4
#define KVM_CAP_VAPIC 6
#define KVM_CAP_EXT_CPUID 7
+#define KVM_CAP_PARA_FEATURES 8
/*
* ioctls for VM fds
Index: kvm.paravirt/include/linux/kvm_para.h
===================================================================
--- kvm.paravirt.orig/include/linux/kvm_para.h
+++ kvm.paravirt/include/linux/kvm_para.h
@@ -20,6 +20,12 @@
#include <asm/kvm_para.h>
#ifdef __KERNEL__
+#ifdef CONFIG_KVM_GUEST
+void __init kvm_guest_init(void);
+#else
+#define kvm_guest_init() do { } while (0)
+#endif
+
static inline int kvm_para_has_feature(unsigned int feature)
{
if (kvm_arch_para_features() & (1UL << feature))
--
-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
^ permalink raw reply [flat|nested] 21+ messages in thread* [patch 2/5] KVM: hypercall based pte updates and TLB flushes
[not found] <20080216220924.733723618@redhat.com>
2008-02-16 22:09 ` [patch 1/5] KVM: add basic paravirt support Marcelo Tosatti
@ 2008-02-16 22:09 ` Marcelo Tosatti
2008-02-16 22:09 ` [patch 3/5] KVM: hypercall batching Marcelo Tosatti
` (6 subsequent siblings)
8 siblings, 0 replies; 21+ messages in thread
From: Marcelo Tosatti @ 2008-02-16 22:09 UTC (permalink / raw)
To: Avi Kivity; +Cc: kvm-devel, Marcelo Tosatti
[-- Attachment #1: kvm-mmu-write --]
[-- Type: text/plain, Size: 9077 bytes --]
Hypercall based pte updates are faster than faults, and also allow use
of the lazy MMU mode to batch operations.
Don't report the feature if two dimensional paging is enabled.
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Anthony Liguori <aliguori@us.ibm.com>
Index: kvm.paravirt/arch/x86/kernel/kvm.c
===================================================================
--- kvm.paravirt.orig/arch/x86/kernel/kvm.c
+++ kvm.paravirt/arch/x86/kernel/kvm.c
@@ -33,6 +33,91 @@ static void kvm_io_delay(void)
{
}
+static void kvm_mmu_write(void *dest, const void *src, size_t size)
+{
+ const uint8_t *p = src;
+ unsigned long a0 = *(unsigned long *)p;
+ unsigned long a1 = 0;
+
+ size >>= 2;
+#ifdef CONFIG_X86_32
+ if (size == 2)
+ a1 = *(u32 *)&p[4];
+#endif
+ kvm_hypercall4(KVM_HYPERCALL_MMU_WRITE, (unsigned long)dest, size, a0,
+ a1);
+}
+
+/*
+ * We only need to hook operations that are MMU writes. We hook these so that
+ * we can use lazy MMU mode to batch these operations. We could probably
+ * improve the performance of the host code if we used some of the information
+ * here to simplify processing of batched writes.
+ */
+static void kvm_set_pte(pte_t *ptep, pte_t pte)
+{
+ kvm_mmu_write(ptep, &pte, sizeof(pte));
+}
+
+static void kvm_set_pte_at(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte)
+{
+ kvm_mmu_write(ptep, &pte, sizeof(pte));
+}
+
+static void kvm_set_pmd(pmd_t *pmdp, pmd_t pmd)
+{
+ kvm_mmu_write(pmdp, &pmd, sizeof(pmd));
+}
+
+#if PAGETABLE_LEVELS >= 3
+#ifdef CONFIG_X86_PAE
+static void kvm_set_pte_atomic(pte_t *ptep, pte_t pte)
+{
+ kvm_mmu_write(ptep, &pte, sizeof(pte));
+}
+
+static void kvm_set_pte_present(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte)
+{
+ kvm_mmu_write(ptep, &pte, sizeof(pte));
+}
+
+static void kvm_pte_clear(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep)
+{
+ pte_t pte = __pte(0);
+ kvm_mmu_write(ptep, &pte, sizeof(pte));
+}
+
+static void kvm_pmd_clear(pmd_t *pmdp)
+{
+ pmd_t pmd = __pmd(0);
+ kvm_mmu_write(pmdp, &pmd, sizeof(pmd));
+}
+#endif
+
+static void kvm_set_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+ kvm_mmu_write(pgdp, &pgd, sizeof(pgd));
+}
+
+static void kvm_set_pud(pud_t *pudp, pud_t pud)
+{
+ kvm_mmu_write(pudp, &pud, sizeof(pud));
+}
+#endif /* PAGETABLE_LEVELS >= 3 */
+
+static void kvm_flush_tlb(void)
+{
+ kvm_hypercall0(KVM_HYPERCALL_FLUSH_TLB);
+}
+
+static void kvm_release_pt(u32 pfn)
+{
+ kvm_hypercall1(KVM_HYPERCALL_RELEASE_PT, pfn << PAGE_SHIFT);
+}
+
static void paravirt_ops_setup(void)
{
pv_info.name = "KVM";
@@ -41,6 +126,24 @@ static void paravirt_ops_setup(void)
if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY))
pv_cpu_ops.io_delay = kvm_io_delay;
+ if (kvm_para_has_feature(KVM_FEATURE_MMU_WRITE)) {
+ pv_mmu_ops.set_pte = kvm_set_pte;
+ pv_mmu_ops.set_pte_at = kvm_set_pte_at;
+ pv_mmu_ops.set_pmd = kvm_set_pmd;
+#if PAGETABLE_LEVELS >= 3
+#ifdef CONFIG_X86_PAE
+ pv_mmu_ops.set_pte_atomic = kvm_set_pte_atomic;
+ pv_mmu_ops.set_pte_present = kvm_set_pte_present;
+ pv_mmu_ops.pte_clear = kvm_pte_clear;
+ pv_mmu_ops.pmd_clear = kvm_pmd_clear;
+#endif
+ pv_mmu_ops.set_pud = kvm_set_pud;
+ pv_mmu_ops.set_pgd = kvm_set_pgd;
+#endif
+ pv_mmu_ops.flush_tlb_user = kvm_flush_tlb;
+ pv_mmu_ops.release_pt = kvm_release_pt;
+ pv_mmu_ops.release_pd = kvm_release_pt;
+ }
}
void __init kvm_guest_init(void)
Index: kvm.paravirt/arch/x86/kvm/mmu.c
===================================================================
--- kvm.paravirt.orig/arch/x86/kvm/mmu.c
+++ kvm.paravirt/arch/x86/kvm/mmu.c
@@ -39,7 +39,7 @@
* 2. while doing 1. it walks guest-physical to host-physical
* If the hardware supports that we don't need to do shadow paging.
*/
-static bool tdp_enabled = false;
+bool tdp_enabled = false;
#undef MMU_DEBUG
@@ -288,7 +288,7 @@ static void mmu_free_memory_cache_page(s
free_page((unsigned long)mc->objects[--mc->nobjs]);
}
-static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
+int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
{
int r;
@@ -857,7 +857,7 @@ static int kvm_mmu_unprotect_page(struct
return r;
}
-static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
+void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
{
struct kvm_mmu_page *sp;
Index: kvm.paravirt/arch/x86/kvm/mmu.h
===================================================================
--- kvm.paravirt.orig/arch/x86/kvm/mmu.h
+++ kvm.paravirt/arch/x86/kvm/mmu.h
@@ -47,4 +47,7 @@ static inline int is_paging(struct kvm_v
return vcpu->arch.cr0 & X86_CR0_PG;
}
+void mmu_unshadow(struct kvm *kvm, gfn_t gfn);
+int mmu_topup_memory_caches(struct kvm_vcpu *vcpu);
+
#endif
Index: kvm.paravirt/arch/x86/kvm/x86.c
===================================================================
--- kvm.paravirt.orig/arch/x86/kvm/x86.c
+++ kvm.paravirt/arch/x86/kvm/x86.c
@@ -764,6 +764,8 @@ long kvm_arch_dev_ioctl(struct file *fil
}
case KVM_GET_PARA_FEATURES: {
__u32 para_features = KVM_PARA_FEATURES;
+ if (tdp_enabled)
+ para_features &= ~(1UL << KVM_FEATURE_MMU_WRITE);
r = -EFAULT;
+ if (copy_to_user(argp, &para_features, sizeof para_features))
@@ -2269,6 +2271,52 @@ int kvm_emulate_halt(struct kvm_vcpu *vc
}
EXPORT_SYMBOL_GPL(kvm_emulate_halt);
+static int kvm_hypercall_mmu_write(struct kvm_vcpu *vcpu, gva_t addr,
+ unsigned long size, unsigned long a0,
+ unsigned long a1)
+{
+ gpa_t gpa;
+ u64 value;
+
+ if (mmu_topup_memory_caches(vcpu))
+ return -KVM_EFAULT;
+
+ down_read(&vcpu->kvm->slots_lock);
+ gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+ up_read(&vcpu->kvm->slots_lock);
+
+ if (gpa == UNMAPPED_GVA)
+ return -KVM_EFAULT;
+ if (size == 1) {
+ if (!emulator_write_phys(vcpu, gpa, &a0, sizeof(a0)))
+ return -KVM_EFAULT;
+ } else if (size == 2) {
+ if (!is_long_mode(vcpu) && is_pae(vcpu))
+ value = (u64)a1 << 32 | a0;
+ else
+ value = a0;
+ if (!emulator_write_phys(vcpu, gpa, &value, sizeof(value)))
+ return -KVM_EFAULT;
+ } else
+ return -KVM_E2BIG;
+
+ return 0;
+}
+
+static int kvm_hypercall_flush_tlb(struct kvm_vcpu *vcpu)
+{
+ kvm_x86_ops->tlb_flush(vcpu);
+ return 0;
+}
+
+static int kvm_hypercall_release_pt(struct kvm_vcpu *vcpu, gpa_t addr)
+{
+ spin_lock(&vcpu->kvm->mmu_lock);
+ mmu_unshadow(vcpu->kvm, addr >> PAGE_SHIFT);
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ return 0;
+}
+
int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
{
unsigned long nr, a0, a1, a2, a3, ret;
@@ -2293,6 +2341,15 @@ int kvm_emulate_hypercall(struct kvm_vcp
case KVM_HC_VAPIC_POLL_IRQ:
ret = 0;
break;
+ case KVM_HYPERCALL_MMU_WRITE:
+ ret = kvm_hypercall_mmu_write(vcpu, a0, a1, a2, a3);
+ break;
+ case KVM_HYPERCALL_FLUSH_TLB:
+ ret = kvm_hypercall_flush_tlb(vcpu);
+ break;
+ case KVM_HYPERCALL_RELEASE_PT:
+ ret = kvm_hypercall_release_pt(vcpu, a0);
+ break;
default:
ret = -KVM_ENOSYS;
break;
Index: kvm.paravirt/include/asm-x86/kvm_para.h
===================================================================
--- kvm.paravirt.orig/include/asm-x86/kvm_para.h
+++ kvm.paravirt/include/asm-x86/kvm_para.h
@@ -6,6 +6,7 @@
*/
#define KVM_CPUID_SIGNATURE 0x40000000
#define KVM_FEATURE_NOP_IO_DELAY 0
+#define KVM_FEATURE_MMU_WRITE 1
/* This CPUID returns a feature bitmap in eax. Before enabling a particular
* paravirtualization, the appropriate feature bit should be checked.
@@ -15,7 +16,8 @@
#ifdef __KERNEL__
#include <asm/processor.h>
-#define KVM_PARA_FEATURES (1UL << KVM_FEATURE_NOP_IO_DELAY)
+#define KVM_PARA_FEATURES ((1UL << KVM_FEATURE_NOP_IO_DELAY) | \
+ (1UL << KVM_FEATURE_MMU_WRITE))
/* This instruction is vmcall. On non-VT architectures, it will generate a
* trap that we will then rewrite to the appropriate instruction.
Index: kvm.paravirt/include/linux/kvm_para.h
===================================================================
--- kvm.paravirt.orig/include/linux/kvm_para.h
+++ kvm.paravirt/include/linux/kvm_para.h
@@ -11,8 +11,13 @@
/* Return values for hypercalls */
#define KVM_ENOSYS 1000
+#define KVM_EFAULT EFAULT
+#define KVM_E2BIG E2BIG
-#define KVM_HC_VAPIC_POLL_IRQ 1
+#define KVM_HC_VAPIC_POLL_IRQ 1
+#define KVM_HYPERCALL_MMU_WRITE 2
+#define KVM_HYPERCALL_FLUSH_TLB 3
+#define KVM_HYPERCALL_RELEASE_PT 4
/*
* hypercalls use architecture specific
Index: kvm.paravirt/include/asm-x86/kvm_host.h
===================================================================
--- kvm.paravirt.orig/include/asm-x86/kvm_host.h
+++ kvm.paravirt/include/asm-x86/kvm_host.h
@@ -412,6 +412,8 @@ void kvm_mmu_change_mmu_pages(struct kvm
int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
+extern bool tdp_enabled;
+
enum emulation_result {
EMULATE_DONE, /* no further processing */
EMULATE_DO_MMIO, /* kvm_run filled with mmio request */
--
-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
^ permalink raw reply [flat|nested] 21+ messages in thread* [patch 3/5] KVM: hypercall batching
[not found] <20080216220924.733723618@redhat.com>
2008-02-16 22:09 ` [patch 1/5] KVM: add basic paravirt support Marcelo Tosatti
2008-02-16 22:09 ` [patch 2/5] KVM: hypercall based pte updates and TLB flushes Marcelo Tosatti
@ 2008-02-16 22:09 ` Marcelo Tosatti
2008-02-16 22:09 ` [patch 4/5] KVM: ignore zapped root pagetables Marcelo Tosatti
` (5 subsequent siblings)
8 siblings, 0 replies; 21+ messages in thread
From: Marcelo Tosatti @ 2008-02-16 22:09 UTC (permalink / raw)
To: Avi Kivity; +Cc: kvm-devel, Marcelo Tosatti
[-- Attachment #1: kvm-multicall --]
[-- Type: text/plain, Size: 8798 bytes --]
Batch pte updates and tlb flushes in lazy MMU mode.
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Anthony Liguori <aliguori@us.ibm.com>
Index: kvm.paravirt/arch/x86/kernel/kvm.c
===================================================================
--- kvm.paravirt.orig/arch/x86/kernel/kvm.c
+++ kvm.paravirt/arch/x86/kernel/kvm.c
@@ -25,6 +25,74 @@
#include <linux/kvm_para.h>
#include <linux/cpu.h>
#include <linux/mm.h>
+#include <linux/hardirq.h>
+
+#define MAX_MULTICALL_NR (PAGE_SIZE / sizeof(struct kvm_multicall_entry))
+
+struct kvm_para_state {
+ struct kvm_multicall_entry queue[MAX_MULTICALL_NR];
+ int queue_index;
+ enum paravirt_lazy_mode mode;
+};
+
+static DEFINE_PER_CPU(struct kvm_para_state, para_state);
+
+static int can_defer_hypercall(struct kvm_para_state *state, unsigned int nr)
+{
+ if (state->mode == PARAVIRT_LAZY_MMU) {
+ switch (nr) {
+ case KVM_HYPERCALL_MMU_WRITE:
+ case KVM_HYPERCALL_FLUSH_TLB:
+ return 1;
+ }
+ }
+ return 0;
+}
+
+static void hypercall_queue_flush(struct kvm_para_state *state)
+{
+ if (state->queue_index) {
+ kvm_hypercall2(KVM_HYPERCALL_MULTICALL, __pa(&state->queue),
+ state->queue_index);
+ state->queue_index = 0;
+ }
+}
+
+static void kvm_hypercall_defer(struct kvm_para_state *state,
+ unsigned int nr,
+ unsigned long a0, unsigned long a1,
+ unsigned long a2, unsigned long a3)
+{
+ struct kvm_multicall_entry *entry;
+
+ BUG_ON(preemptible());
+
+ if (state->queue_index == MAX_MULTICALL_NR)
+ hypercall_queue_flush(state);
+
+ entry = &state->queue[state->queue_index++];
+ entry->nr = nr;
+ entry->a0 = a0;
+ entry->a1 = a1;
+ entry->a2 = a2;
+ entry->a3 = a3;
+}
+
+static long kvm_hypercall(unsigned int nr, unsigned long a0,
+ unsigned long a1, unsigned long a2,
+ unsigned long a3)
+{
+ struct kvm_para_state *state = &get_cpu_var(para_state);
+ long ret = 0;
+
+ if (can_defer_hypercall(state, nr))
+ kvm_hypercall_defer(state, nr, a0, a1, a2, a3);
+ else
+ ret = kvm_hypercall4(nr, a0, a1, a2, a3);
+
+ put_cpu_var(para_state);
+ return ret;
+}
/*
* No need for any "IO delay" on KVM
@@ -44,7 +112,7 @@ static void kvm_mmu_write(void *dest, co
if (size == 2)
a1 = *(u32 *)&p[4];
#endif
- kvm_hypercall4(KVM_HYPERCALL_MMU_WRITE, (unsigned long)dest, size, a0,
+ kvm_hypercall(KVM_HYPERCALL_MMU_WRITE, (unsigned long)dest, size, a0,
a1);
}
@@ -110,12 +178,31 @@ static void kvm_set_pud(pud_t *pudp, pud
static void kvm_flush_tlb(void)
{
- kvm_hypercall0(KVM_HYPERCALL_FLUSH_TLB);
+ kvm_hypercall(KVM_HYPERCALL_FLUSH_TLB, 0, 0, 0, 0);
}
static void kvm_release_pt(u32 pfn)
{
- kvm_hypercall1(KVM_HYPERCALL_RELEASE_PT, pfn << PAGE_SHIFT);
+ kvm_hypercall(KVM_HYPERCALL_RELEASE_PT, pfn << PAGE_SHIFT, 0, 0, 0);
+}
+
+static void kvm_enter_lazy_mmu(void)
+{
+ struct kvm_para_state *state
+ = &per_cpu(para_state, smp_processor_id());
+
+ paravirt_enter_lazy_mmu();
+ state->mode = paravirt_get_lazy_mode();
+}
+
+static void kvm_leave_lazy_mmu(void)
+{
+ struct kvm_para_state *state
+ = &per_cpu(para_state, smp_processor_id());
+
+ hypercall_queue_flush(state);
+ paravirt_leave_lazy(paravirt_get_lazy_mode());
+ state->mode = paravirt_get_lazy_mode();
}
static void paravirt_ops_setup(void)
@@ -144,6 +231,11 @@ static void paravirt_ops_setup(void)
pv_mmu_ops.release_pt = kvm_release_pt;
pv_mmu_ops.release_pd = kvm_release_pt;
}
+
+ if (kvm_para_has_feature(KVM_FEATURE_MULTICALL)) {
+ pv_mmu_ops.lazy_mode.enter = kvm_enter_lazy_mmu;
+ pv_mmu_ops.lazy_mode.leave = kvm_leave_lazy_mmu;
+ }
}
void __init kvm_guest_init(void)
Index: kvm.paravirt/arch/x86/kvm/x86.c
===================================================================
--- kvm.paravirt.orig/arch/x86/kvm/x86.c
+++ kvm.paravirt/arch/x86/kvm/x86.c
@@ -78,6 +78,8 @@ struct kvm_stats_debugfs_item debugfs_en
{ "fpu_reload", VCPU_STAT(fpu_reload) },
{ "insn_emulation", VCPU_STAT(insn_emulation) },
{ "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
+ { "multicall", VCPU_STAT(multicall) },
+ { "multicall_nr", VCPU_STAT(multicall_nr) },
{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
@@ -764,8 +766,10 @@ long kvm_arch_dev_ioctl(struct file *fil
}
case KVM_GET_PARA_FEATURES: {
__u32 para_features = KVM_PARA_FEATURES;
- if (tdp_enabled)
+ if (tdp_enabled) {
para_features &= ~(1UL << KVM_FEATURE_MMU_WRITE);
+ para_features &= ~(1UL << KVM_FEATURE_MULTICALL);
+ }
r = -EFAULT;
+ if (copy_to_user(argp, &para_features, sizeof para_features))
@@ -2317,6 +2321,52 @@ static int kvm_hypercall_release_pt(stru
return 0;
}
+static int dispatch_hypercall(struct kvm_vcpu *vcpu, unsigned long nr,
+ unsigned long a0, unsigned long a1,
+ unsigned long a2, unsigned long a3)
+{
+ switch (nr) {
+ case KVM_HC_VAPIC_POLL_IRQ:
+ return 0;
+ case KVM_HYPERCALL_MMU_WRITE:
+ return kvm_hypercall_mmu_write(vcpu, a0, a1, a2, a3);
+ case KVM_HYPERCALL_FLUSH_TLB:
+ return kvm_hypercall_flush_tlb(vcpu);
+ case KVM_HYPERCALL_RELEASE_PT:
+ return kvm_hypercall_release_pt(vcpu, a0);
+ }
+
+ return -KVM_ENOSYS;
+}
+
+static int kvm_hypercall_multicall(struct kvm_vcpu *vcpu, gpa_t addr, u32 nents)
+{
+ int i, result = 0;
+
+ ++vcpu->stat.multicall;
+ vcpu->stat.multicall_nr += nents;
+
+ for (i = 0; i < nents; i++) {
+ struct kvm_multicall_entry mc;
+ int ret;
+
+ down_read(&vcpu->kvm->slots_lock);
+ ret = kvm_read_guest(vcpu->kvm, addr, &mc, sizeof(mc));
+ up_read(&vcpu->kvm->slots_lock);
+ if (ret)
+ return -KVM_EFAULT;
+
+ ret = dispatch_hypercall(vcpu, mc.nr, mc.a0, mc.a1, mc.a2,
+ mc.a3);
+ if (ret)
+ result = ret;
+ addr += sizeof(mc);
+ }
+ if (result < 0)
+ return -KVM_EINVAL;
+ return result;
+}
+
int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
{
unsigned long nr, a0, a1, a2, a3, ret;
@@ -2337,23 +2387,11 @@ int kvm_emulate_hypercall(struct kvm_vcp
a3 &= 0xFFFFFFFF;
}
- switch (nr) {
- case KVM_HC_VAPIC_POLL_IRQ:
- ret = 0;
- break;
- case KVM_HYPERCALL_MMU_WRITE:
- ret = kvm_hypercall_mmu_write(vcpu, a0, a1, a2, a3);
- break;
- case KVM_HYPERCALL_FLUSH_TLB:
- ret = kvm_hypercall_flush_tlb(vcpu);
- break;
- case KVM_HYPERCALL_RELEASE_PT:
- ret = kvm_hypercall_release_pt(vcpu, a0);
- break;
- default:
- ret = -KVM_ENOSYS;
- break;
- }
+ if (nr == KVM_HYPERCALL_MULTICALL)
+ ret = kvm_hypercall_multicall(vcpu, a0, a1);
+ else
+ ret = dispatch_hypercall(vcpu, nr, a0, a1, a2, a3);
+
vcpu->arch.regs[VCPU_REGS_RAX] = ret;
kvm_x86_ops->decache_regs(vcpu);
return 0;
Index: kvm.paravirt/include/asm-x86/kvm_host.h
===================================================================
--- kvm.paravirt.orig/include/asm-x86/kvm_host.h
+++ kvm.paravirt/include/asm-x86/kvm_host.h
@@ -320,6 +320,8 @@ struct kvm_vcpu_stat {
u32 fpu_reload;
u32 insn_emulation;
u32 insn_emulation_fail;
+ u32 multicall;
+ u32 multicall_nr;
};
struct descriptor_table {
Index: kvm.paravirt/include/asm-x86/kvm_para.h
===================================================================
--- kvm.paravirt.orig/include/asm-x86/kvm_para.h
+++ kvm.paravirt/include/asm-x86/kvm_para.h
@@ -7,6 +7,7 @@
#define KVM_CPUID_SIGNATURE 0x40000000
#define KVM_FEATURE_NOP_IO_DELAY 0
#define KVM_FEATURE_MMU_WRITE 1
+#define KVM_FEATURE_MULTICALL 2
/* This CPUID returns a feature bitmap in eax. Before enabling a particular
* paravirtualization, the appropriate feature bit should be checked.
@@ -17,7 +18,17 @@
#include <asm/processor.h>
#define KVM_PARA_FEATURES ((1UL << KVM_FEATURE_NOP_IO_DELAY) | \
- (1UL << KVM_FEATURE_MMU_WRITE))
+ (1UL << KVM_FEATURE_MMU_WRITE) | \
+ (1UL << KVM_FEATURE_MULTICALL))
+
+struct kvm_multicall_entry
+{
+ u64 nr;
+ u64 a0;
+ u64 a1;
+ u64 a2;
+ u64 a3;
+};
/* This instruction is vmcall. On non-VT architectures, it will generate a
* trap that we will then rewrite to the appropriate instruction.
Index: kvm.paravirt/include/linux/kvm_para.h
===================================================================
--- kvm.paravirt.orig/include/linux/kvm_para.h
+++ kvm.paravirt/include/linux/kvm_para.h
@@ -13,11 +13,13 @@
#define KVM_ENOSYS 1000
#define KVM_EFAULT EFAULT
#define KVM_E2BIG E2BIG
+#define KVM_EINVAL EINVAL
#define KVM_HC_VAPIC_POLL_IRQ 1
#define KVM_HYPERCALL_MMU_WRITE 2
#define KVM_HYPERCALL_FLUSH_TLB 3
#define KVM_HYPERCALL_RELEASE_PT 4
+#define KVM_HYPERCALL_MULTICALL 5
/*
* hypercalls use architecture specific
--
-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
^ permalink raw reply [flat|nested] 21+ messages in thread* [patch 4/5] KVM: ignore zapped root pagetables
[not found] <20080216220924.733723618@redhat.com>
` (2 preceding siblings ...)
2008-02-16 22:09 ` [patch 3/5] KVM: hypercall batching Marcelo Tosatti
@ 2008-02-16 22:09 ` Marcelo Tosatti
2008-02-16 22:09 ` [patch 5/5] KVM: VMX cr3 cache support Marcelo Tosatti
` (4 subsequent siblings)
8 siblings, 0 replies; 21+ messages in thread
From: Marcelo Tosatti @ 2008-02-16 22:09 UTC (permalink / raw)
To: Avi Kivity; +Cc: kvm-devel, Marcelo Tosatti
[-- Attachment #1: mmu-invalid-root --]
[-- Type: text/plain, Size: 1879 bytes --]
Mark zapped root pagetables as invalid and ignore such pages during lookup.
This is a problem with the cr3-target feature, where a zapped root table fools
the faulting code into creating a read-only mapping. The result is a lockup
if the instruction can't be emulated.
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Anthony Liguori <aliguori@us.ibm.com>
Index: kvm.paravirt/arch/x86/kvm/mmu.c
===================================================================
--- kvm.paravirt.orig/arch/x86/kvm/mmu.c
+++ kvm.paravirt/arch/x86/kvm/mmu.c
@@ -668,7 +668,8 @@ static struct kvm_mmu_page *kvm_mmu_look
index = kvm_page_table_hashfn(gfn);
bucket = &kvm->arch.mmu_page_hash[index];
hlist_for_each_entry(sp, node, bucket, hash_link)
- if (sp->gfn == gfn && !sp->role.metaphysical) {
+ if (sp->gfn == gfn && !sp->role.metaphysical
+ && !sp->role.invalid) {
pgprintk("%s: found role %x\n",
__FUNCTION__, sp->role.word);
return sp;
@@ -796,8 +797,10 @@ static void kvm_mmu_zap_page(struct kvm
if (!sp->root_count) {
hlist_del(&sp->hash_link);
kvm_mmu_free_page(kvm, sp);
- } else
+ } else {
list_move(&sp->link, &kvm->arch.active_mmu_pages);
+ sp->role.invalid = 1;
+ }
kvm_mmu_reset_last_pte_updated(kvm);
}
Index: kvm.paravirt/include/asm-x86/kvm_host.h
===================================================================
--- kvm.paravirt.orig/include/asm-x86/kvm_host.h
+++ kvm.paravirt/include/asm-x86/kvm_host.h
@@ -140,6 +140,7 @@ union kvm_mmu_page_role {
unsigned pad_for_nice_hex_output : 6;
unsigned metaphysical : 1;
unsigned access : 3;
+ unsigned invalid : 1;
};
};
--
-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
^ permalink raw reply [flat|nested] 21+ messages in thread* [patch 5/5] KVM: VMX cr3 cache support
[not found] <20080216220924.733723618@redhat.com>
` (3 preceding siblings ...)
2008-02-16 22:09 ` [patch 4/5] KVM: ignore zapped root pagetables Marcelo Tosatti
@ 2008-02-16 22:09 ` Marcelo Tosatti
2008-02-16 23:37 ` [patch 0/5] KVM paravirt MMU updates and cr3 caching Anthony Liguori
` (3 subsequent siblings)
8 siblings, 0 replies; 21+ messages in thread
From: Marcelo Tosatti @ 2008-02-16 22:09 UTC (permalink / raw)
To: Avi Kivity; +Cc: kvm-devel, Marcelo Tosatti
[-- Attachment #1: cr3-cache --]
[-- Type: text/plain, Size: 30996 bytes --]
Add support for the cr3 cache feature on Intel VMX CPU's. This avoids
vmexits on context switch if the cr3 value is cached in one of the
entries (currently 4 are present).
This is especially important for Xenner, where each guest syscalls
involves a cr3 switch.
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Anthony Liguori <aliguori@us.ibm.com>
Index: kvm.paravirt/arch/x86/kernel/kvm.c
===================================================================
--- kvm.paravirt.orig/arch/x86/kernel/kvm.c
+++ kvm.paravirt/arch/x86/kernel/kvm.c
@@ -26,14 +26,16 @@
#include <linux/cpu.h>
#include <linux/mm.h>
#include <linux/hardirq.h>
+#include <asm/tlbflush.h>
#define MAX_MULTICALL_NR (PAGE_SIZE / sizeof(struct kvm_multicall_entry))
struct kvm_para_state {
+ struct kvm_cr3_cache cr3_cache;
struct kvm_multicall_entry queue[MAX_MULTICALL_NR];
int queue_index;
enum paravirt_lazy_mode mode;
-};
+} __attribute__ ((aligned(PAGE_SIZE)));
static DEFINE_PER_CPU(struct kvm_para_state, para_state);
@@ -101,6 +103,98 @@ static void kvm_io_delay(void)
{
}
+static void kvm_new_cr3(unsigned long cr3)
+{
+ kvm_hypercall1(KVM_HYPERCALL_SET_CR3, cr3);
+}
+
+/*
+ * Special, register-to-cr3 instruction based hypercall API
+ * variant to the KVM host. This utilizes the cr3 filter capability
+ * of the hardware - if this works out then no VM exit happens,
+ * if a VM exit happens then KVM will get the virtual address too.
+ */
+static void kvm_write_cr3(unsigned long guest_cr3)
+{
+ struct kvm_para_state *para_state = &get_cpu_var(para_state);
+ struct kvm_cr3_cache *cache = ¶_state->cr3_cache;
+ int idx;
+
+ /*
+ * Check the cache (maintained by the host) for a matching
+ * guest_cr3 => host_cr3 mapping. Use it if found:
+ */
+ for (idx = 0; idx < cache->max_idx; idx++) {
+ if (cache->entry[idx].guest_cr3 == guest_cr3) {
+ /*
+ * Cache-hit: we load the cached host-CR3 value.
+ * This never causes any VM exit. (if it does then the
+ * hypervisor could do nothing with this instruction
+ * and the guest OS would be aborted)
+ */
+ native_write_cr3(cache->entry[idx].host_cr3);
+ goto out;
+ }
+ }
+
+ /*
+ * Cache-miss. Tell the host the new cr3 via hypercall (to avoid
+ * aliasing problems with a cached host_cr3 == guest_cr3).
+ */
+ kvm_new_cr3(guest_cr3);
+out:
+ put_cpu_var(para_state);
+}
+
+/*
+ * Avoid the VM exit upon cr3 load by using the cached
+ * ->active_mm->pgd value:
+ */
+static void kvm_flush_tlb_user(void)
+{
+ kvm_write_cr3(__pa(current->active_mm->pgd));
+}
+
+/*
+ * Disable global pages, do a flush, then enable global pages:
+ */
+static fastcall void kvm_flush_tlb_kernel(void)
+{
+ unsigned long orig_cr4 = read_cr4();
+
+ write_cr4(orig_cr4 & ~X86_CR4_PGE);
+ kvm_flush_tlb_user();
+ write_cr4(orig_cr4);
+}
+
+static void register_cr3_cache(void *cache)
+{
+ struct kvm_para_state *state;
+
+ state = &per_cpu(para_state, raw_smp_processor_id());
+ wrmsrl(KVM_MSR_SET_CR3_CACHE, __pa(&state->cr3_cache));
+}
+
+static unsigned __init kvm_patch(u8 type, u16 clobbers, void *ibuf,
+ unsigned long addr, unsigned len)
+{
+ switch (type) {
+ case PARAVIRT_PATCH(pv_mmu_ops.write_cr3):
+ return paravirt_patch_default(type, clobbers, ibuf, addr, len);
+ default:
+ return native_patch(type, clobbers, ibuf, addr, len);
+ }
+}
+
+static void __init setup_guest_cr3_cache(void)
+{
+ on_each_cpu(register_cr3_cache, NULL, 0, 1);
+
+ pv_mmu_ops.write_cr3 = kvm_write_cr3;
+ pv_mmu_ops.flush_tlb_user = kvm_flush_tlb_user;
+ pv_mmu_ops.flush_tlb_kernel = kvm_flush_tlb_kernel;
+}
+
static void kvm_mmu_write(void *dest, const void *src, size_t size)
{
const uint8_t *p = src;
@@ -117,6 +211,28 @@ static void kvm_mmu_write(void *dest, co
}
/*
+ * CR3 cache initialization uses on_each_cpu(), so it can't
+ * happen at kvm_guest_init time.
+ */
+int __init kvm_cr3_cache_init(void)
+{
+ unsigned long flags;
+
+ if (!kvm_para_available())
+ return -ENOSYS;
+
+ if (kvm_para_has_feature(KVM_FEATURE_CR3_CACHE)) {
+ setup_guest_cr3_cache();
+ local_irq_save(flags);
+ apply_paravirt(__parainstructions, __parainstructions_end);
+ local_irq_restore(flags);
+ }
+
+ return 0;
+}
+module_init(kvm_cr3_cache_init);
+
+/*
* We only need to hook operations that are MMU writes. We hook these so that
* we can use lazy MMU mode to batch these operations. We could probably
* improve the performance of the host code if we used some of the information
@@ -236,6 +352,9 @@ static void paravirt_ops_setup(void)
pv_mmu_ops.lazy_mode.enter = kvm_enter_lazy_mmu;
pv_mmu_ops.lazy_mode.leave = kvm_leave_lazy_mmu;
}
+
+ if (kvm_para_has_feature(KVM_FEATURE_CR3_CACHE))
+ pv_init_ops.patch = kvm_patch;
}
void __init kvm_guest_init(void)
Index: kvm.paravirt/arch/x86/kvm/mmu.c
===================================================================
--- kvm.paravirt.orig/arch/x86/kvm/mmu.c
+++ kvm.paravirt/arch/x86/kvm/mmu.c
@@ -258,6 +258,16 @@ static int mmu_topup_memory_cache(struct
}
return 0;
}
+static void kvm_cr3_cache_clear(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cr3_cache *cache;
+
+ if (!vcpu->arch.cr3_cache)
+ return;
+ cache = vcpu->arch.cr3_cache;
+ memset(cache->entry, 0, sizeof(cache->entry));
+}
+
static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
{
@@ -978,7 +988,7 @@ static void nonpaging_new_cr3(struct kvm
static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
gfn_t gfn, struct page *page, int level)
{
- hpa_t table_addr = vcpu->arch.mmu.root_hpa;
+ hpa_t table_addr = vcpu->arch.mmu.root_hpa[vcpu->arch.cr3_cache_idx];
int pt_write = 0;
for (; ; level--) {
@@ -1058,49 +1068,71 @@ static void nonpaging_prefetch_page(stru
static void mmu_free_roots(struct kvm_vcpu *vcpu)
{
- int i;
+ int i, j;
struct kvm_mmu_page *sp;
- if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
- return;
+ /*
+ * Skip to the next cr3 filter entry and free it (if it's occupied).
+ */
+ vcpu->arch.cr3_cache_idx++;
+ if (unlikely(vcpu->arch.cr3_cache_idx >= vcpu->arch.cr3_cache_limit))
+ vcpu->arch.cr3_cache_idx = 0;
+
+ j = vcpu->arch.cr3_cache_idx;
+ /*
+ * Clear the guest-visible entry.
+ */
+ if (vcpu->arch.cr3_cache) {
+ vcpu->arch.cr3_cache->entry[j].guest_cr3 = 0;
+ vcpu->arch.cr3_cache->entry[j].host_cr3 = 0;
+ }
spin_lock(&vcpu->kvm->mmu_lock);
#ifdef CONFIG_X86_64
if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
- hpa_t root = vcpu->arch.mmu.root_hpa;
+ hpa_t root = vcpu->arch.mmu.root_hpa[j];
+
+ if (!VALID_PAGE(root)) {
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ return;
+ }
sp = page_header(root);
--sp->root_count;
- vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+ vcpu->arch.mmu.root_hpa[j] = INVALID_PAGE;
spin_unlock(&vcpu->kvm->mmu_lock);
return;
}
#endif
- for (i = 0; i < 4; ++i) {
- hpa_t root = vcpu->arch.mmu.pae_root[i];
-
- if (root) {
- root &= PT64_BASE_ADDR_MASK;
- sp = page_header(root);
- --sp->root_count;
+ ASSERT(vcpu->arch.mmu.pae_root[j]);
+ if (VALID_PAGE(vcpu->arch.mmu.pae_root[j][0])) {
+ for (i = 0; i < 4; ++i) {
+ hpa_t root = vcpu->arch.mmu.pae_root[j][i];
+
+ if (root) {
+ root &= PT64_BASE_ADDR_MASK;
+ sp = page_header(root);
+ --sp->root_count;
+ }
+ vcpu->arch.mmu.pae_root[j][i] = INVALID_PAGE;
}
- vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
}
spin_unlock(&vcpu->kvm->mmu_lock);
- vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+ vcpu->arch.mmu.root_hpa[j] = INVALID_PAGE;
}
static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
{
- int i;
+ int i, j;
gfn_t root_gfn;
struct kvm_mmu_page *sp;
int metaphysical = 0;
root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;
+ j = vcpu->arch.cr3_cache_idx;
#ifdef CONFIG_X86_64
if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
- hpa_t root = vcpu->arch.mmu.root_hpa;
+ hpa_t root = vcpu->arch.mmu.root_hpa[j];
ASSERT(!VALID_PAGE(root));
if (tdp_enabled)
@@ -1110,7 +1142,7 @@ static void mmu_alloc_roots(struct kvm_v
ACC_ALL, NULL, NULL);
root = __pa(sp->spt);
++sp->root_count;
- vcpu->arch.mmu.root_hpa = root;
+ vcpu->arch.mmu.root_hpa[j] = root;
return;
}
#endif
@@ -1118,7 +1150,7 @@ static void mmu_alloc_roots(struct kvm_v
if (tdp_enabled)
metaphysical = 1;
for (i = 0; i < 4; ++i) {
- hpa_t root = vcpu->arch.mmu.pae_root[i];
+ hpa_t root = vcpu->arch.mmu.pae_root[j][i];
ASSERT(!VALID_PAGE(root));
if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
@@ -1134,9 +1166,9 @@ static void mmu_alloc_roots(struct kvm_v
ACC_ALL, NULL, NULL);
root = __pa(sp->spt);
++sp->root_count;
- vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
+ vcpu->arch.mmu.pae_root[j][i] = root | PT_PRESENT_MASK;
}
- vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
+ vcpu->arch.mmu.root_hpa[j] = __pa(vcpu->arch.mmu.pae_root[j]);
}
static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
@@ -1156,7 +1188,7 @@ static int nonpaging_page_fault(struct k
return r;
ASSERT(vcpu);
- ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
+ ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa[j]));
gfn = gva >> PAGE_SHIFT;
@@ -1196,12 +1228,19 @@ static int tdp_page_fault(struct kvm_vcp
static void nonpaging_free(struct kvm_vcpu *vcpu)
{
- mmu_free_roots(vcpu);
+ int j;
+
+ /*
+ * This will cycle through all existing roots and free them.
+ */
+ for (j = 0; j < KVM_CR3_CACHE_SIZE; j++)
+ mmu_free_roots(vcpu);
}
static int nonpaging_init_context(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *context = &vcpu->arch.mmu;
+ int i;
context->new_cr3 = nonpaging_new_cr3;
context->page_fault = nonpaging_page_fault;
@@ -1210,7 +1249,8 @@ static int nonpaging_init_context(struct
context->prefetch_page = nonpaging_prefetch_page;
context->root_level = 0;
context->shadow_root_level = PT32E_ROOT_LEVEL;
- context->root_hpa = INVALID_PAGE;
+ for (i = 0; i < KVM_CR3_CACHE_SIZE; i++)
+ context->root_hpa[i] = INVALID_PAGE;
return 0;
}
@@ -1249,6 +1289,7 @@ static void paging_free(struct kvm_vcpu
static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
{
struct kvm_mmu *context = &vcpu->arch.mmu;
+ int i;
ASSERT(is_pae(vcpu));
context->new_cr3 = paging_new_cr3;
@@ -1258,7 +1299,8 @@ static int paging64_init_context_common(
context->free = paging_free;
context->root_level = level;
context->shadow_root_level = level;
- context->root_hpa = INVALID_PAGE;
+ for (i = 0; i < KVM_CR3_CACHE_SIZE; i++)
+ context->root_hpa[i] = INVALID_PAGE;
return 0;
}
@@ -1270,6 +1312,7 @@ static int paging64_init_context(struct
static int paging32_init_context(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *context = &vcpu->arch.mmu;
+ int i;
context->new_cr3 = paging_new_cr3;
context->page_fault = paging32_page_fault;
@@ -1278,7 +1321,8 @@ static int paging32_init_context(struct
context->prefetch_page = paging32_prefetch_page;
context->root_level = PT32_ROOT_LEVEL;
context->shadow_root_level = PT32E_ROOT_LEVEL;
- context->root_hpa = INVALID_PAGE;
+ for (i = 0; i < KVM_CR3_CACHE_SIZE; i++)
+ context->root_hpa[i] = INVALID_PAGE;
return 0;
}
@@ -1290,13 +1334,15 @@ static int paging32E_init_context(struct
static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *context = &vcpu->arch.mmu;
+ int i;
context->new_cr3 = nonpaging_new_cr3;
context->page_fault = tdp_page_fault;
context->free = nonpaging_free;
context->prefetch_page = nonpaging_prefetch_page;
context->shadow_root_level = TDP_ROOT_LEVEL;
- context->root_hpa = INVALID_PAGE;
+ for (i = 0; i < KVM_CR3_CACHE_SIZE; i++)
+ context->root_hpa[i] = INVALID_PAGE;
if (!is_paging(vcpu)) {
context->gva_to_gpa = nonpaging_gva_to_gpa;
@@ -1318,7 +1364,7 @@ static int init_kvm_tdp_mmu(struct kvm_v
static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
{
ASSERT(vcpu);
- ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+ ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa[vcpu->arch.cr3_cache_idx]));
if (!is_paging(vcpu))
return nonpaging_init_context(vcpu);
@@ -1340,11 +1386,14 @@ static int init_kvm_mmu(struct kvm_vcpu
static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
{
+ int j;
ASSERT(vcpu);
- if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) {
- vcpu->arch.mmu.free(vcpu);
- vcpu->arch.mmu.root_hpa = INVALID_PAGE;
- }
+
+ for(j = 0; j < KVM_CR3_CACHE_SIZE; j++)
+ if (VALID_PAGE(vcpu->arch.mmu.root_hpa[j])) {
+ vcpu->arch.mmu.free(vcpu);
+ vcpu->arch.mmu.root_hpa[j] = INVALID_PAGE;
+ }
}
int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
@@ -1357,6 +1406,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_reset_context)
int kvm_mmu_load(struct kvm_vcpu *vcpu)
{
int r;
+ int j = vcpu->arch.cr3_cache_idx;
r = mmu_topup_memory_caches(vcpu);
if (r)
@@ -1365,8 +1415,8 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
kvm_mmu_free_some_pages(vcpu);
mmu_alloc_roots(vcpu);
spin_unlock(&vcpu->kvm->mmu_lock);
- kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
- kvm_mmu_flush_tlb(vcpu);
+ /* setting CR3 will flush the TLB */
+ kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa[j]);
out:
return r;
}
@@ -1374,7 +1424,9 @@ EXPORT_SYMBOL_GPL(kvm_mmu_load);
void kvm_mmu_unload(struct kvm_vcpu *vcpu)
{
- mmu_free_roots(vcpu);
+ int j;
+ for (j = 0; j < KVM_CR3_CACHE_SIZE; j++)
+ mmu_free_roots(vcpu);
}
static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
@@ -1546,6 +1598,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *
pgprintk("misaligned: gpa %llx bytes %d role %x\n",
gpa, bytes, sp->role.word);
kvm_mmu_zap_page(vcpu->kvm, sp);
+ kvm_cr3_cache_clear(vcpu);
++vcpu->kvm->stat.mmu_flooded;
continue;
}
@@ -1607,6 +1660,8 @@ int kvm_mmu_unprotect_page_virt(struct k
spin_lock(&vcpu->kvm->mmu_lock);
r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+ if (r)
+ kvm_cr3_cache_clear(vcpu);
spin_unlock(&vcpu->kvm->mmu_lock);
return r;
}
@@ -1619,6 +1674,7 @@ void __kvm_mmu_free_some_pages(struct kv
sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
struct kvm_mmu_page, link);
kvm_mmu_zap_page(vcpu->kvm, sp);
+ kvm_cr3_cache_clear(vcpu);
++vcpu->kvm->stat.mmu_recycled;
}
}
@@ -1669,19 +1725,24 @@ EXPORT_SYMBOL_GPL(kvm_enable_tdp);
static void free_mmu_pages(struct kvm_vcpu *vcpu)
{
struct kvm_mmu_page *sp;
+ int j;
while (!list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
sp = container_of(vcpu->kvm->arch.active_mmu_pages.next,
struct kvm_mmu_page, link);
kvm_mmu_zap_page(vcpu->kvm, sp);
}
- free_page((unsigned long)vcpu->arch.mmu.pae_root);
+ for (j = 0; j < KVM_CR3_CACHE_SIZE; j++) {
+ ASSERT(vcpu->arch.mmu.pae_root[j]);
+ free_page((unsigned long)vcpu->arch.mmu.pae_root[j]);
+ vcpu->arch.mmu.pae_root[j] = NULL;
+ }
}
static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
{
struct page *page;
- int i;
+ int i, j;
ASSERT(vcpu);
@@ -1691,17 +1752,23 @@ static int alloc_mmu_pages(struct kvm_vc
else
vcpu->kvm->arch.n_free_mmu_pages =
vcpu->kvm->arch.n_alloc_mmu_pages;
- /*
- * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
- * Therefore we need to allocate shadow page tables in the first
- * 4GB of memory, which happens to fit the DMA32 zone.
- */
- page = alloc_page(GFP_KERNEL | __GFP_DMA32);
- if (!page)
- goto error_1;
- vcpu->arch.mmu.pae_root = page_address(page);
- for (i = 0; i < 4; ++i)
- vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
+
+ for (j = 0; j < KVM_CR3_CACHE_SIZE; j++) {
+ /*
+ * When emulating 32-bit mode, cr3 is only 32 bits even on
+ * x86_64. Therefore we need to allocate shadow page tables
+ * in the first 4GB of memory, which happens to fit the DMA32
+ * zone.
+ */
+ page = alloc_page(GFP_KERNEL | __GFP_DMA32);
+ if (!page)
+ goto error_1;
+
+ ASSERT(!vcpu->arch.mmu.pae_root[j]);
+ vcpu->arch.mmu.pae_root[j] = page_address(page);
+ for (i = 0; i < 4; ++i)
+ vcpu->arch.mmu.pae_root[j][i] = INVALID_PAGE;
+ }
return 0;
@@ -1713,7 +1780,7 @@ error_1:
int kvm_mmu_create(struct kvm_vcpu *vcpu)
{
ASSERT(vcpu);
- ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+ ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa[vcpu->arch.cr3_cache_idx]));
return alloc_mmu_pages(vcpu);
}
@@ -1721,7 +1788,7 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu
int kvm_mmu_setup(struct kvm_vcpu *vcpu)
{
ASSERT(vcpu);
- ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+ ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa[vcpu->arch.cr3_cache_idx]));
return init_kvm_mmu(vcpu);
}
@@ -1881,15 +1948,16 @@ static void audit_mappings(struct kvm_vc
{
unsigned i;
- if (vcpu->arch.mmu.root_level == 4)
+ if (vcpu->arch.mmu.root_level == 4) {
audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4);
- else
+ return;
+ }
+ for (j = 0; j < KVM_CR3_CACHE_SIZE; j++) {
for (i = 0; i < 4; ++i)
- if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK)
+ if (vcpu->arch.mmu.pae_root[j][i] & PT_PRESENT_MASK)
audit_mappings_page(vcpu,
- vcpu->arch.mmu.pae_root[i],
- i << 30,
- 2);
+ vcpu->arch.mmu.pae_root[j][i], i << 30, 2);
+ }
}
static int count_rmaps(struct kvm_vcpu *vcpu)
Index: kvm.paravirt/arch/x86/kvm/mmu.h
===================================================================
--- kvm.paravirt.orig/arch/x86/kvm/mmu.h
+++ kvm.paravirt/arch/x86/kvm/mmu.h
@@ -17,7 +17,8 @@ static inline void kvm_mmu_free_some_pag
static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
{
- if (likely(vcpu->arch.mmu.root_hpa != INVALID_PAGE))
+ int idx = vcpu->arch.cr3_cache_idx;
+ if (likely(vcpu->arch.mmu.root_hpa[idx] != INVALID_PAGE))
return 0;
return kvm_mmu_load(vcpu);
Index: kvm.paravirt/arch/x86/kvm/paging_tmpl.h
===================================================================
--- kvm.paravirt.orig/arch/x86/kvm/paging_tmpl.h
+++ kvm.paravirt/arch/x86/kvm/paging_tmpl.h
@@ -283,10 +283,10 @@ static u64 *FNAME(fetch)(struct kvm_vcpu
if (!is_present_pte(walker->ptes[walker->level - 1]))
return NULL;
- shadow_addr = vcpu->arch.mmu.root_hpa;
+ shadow_addr = vcpu->arch.mmu.root_hpa[vcpu->arch.cr3_cache_idx];
level = vcpu->arch.mmu.shadow_root_level;
if (level == PT32E_ROOT_LEVEL) {
- shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
+ shadow_addr = vcpu->arch.mmu.pae_root[vcpu->arch.cr3_cache_idx][(addr >> 30) & 3];
shadow_addr &= PT64_BASE_ADDR_MASK;
--level;
}
Index: kvm.paravirt/arch/x86/kvm/vmx.c
===================================================================
--- kvm.paravirt.orig/arch/x86/kvm/vmx.c
+++ kvm.paravirt/arch/x86/kvm/vmx.c
@@ -216,6 +216,10 @@ static inline int cpu_has_vmx_vpid(void)
return (vmcs_config.cpu_based_2nd_exec_ctrl &
SECONDARY_EXEC_ENABLE_VPID);
}
+static inline bool cpu_has_cr3_cache(void)
+{
+ return true;
+}
static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
{
@@ -785,6 +789,30 @@ static int vmx_get_msr(struct kvm_vcpu *
return 0;
}
+int vmx_cr3_cache_msr(struct kvm_vcpu *vcpu, u64 data)
+{
+ struct page *page;
+ hva_t cr3_cache_hva;
+
+ if (data != PAGE_ALIGN(data) || vcpu->arch.cr3_cache)
+ return -EINVAL;
+
+ down_read(&current->mm->mmap_sem);
+ page = gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
+ up_read(&current->mm->mmap_sem);
+
+ if (is_error_page(page)) {
+ kvm_release_page_clean(page);
+ return -EINVAL;
+ }
+
+ cr3_cache_hva = (hva_t)__va(page_to_phys(page));
+ vcpu->arch.cr3_cache = (void *)cr3_cache_hva;
+ vcpu->arch.cr3_cache->max_idx = vcpu->arch.cr3_cache_limit;
+
+ return 0;
+}
+
/*
* Writes msr value into into the appropriate "register".
* Returns 0 on success, non-0 otherwise.
@@ -824,6 +852,9 @@ static int vmx_set_msr(struct kvm_vcpu *
case MSR_IA32_TIME_STAMP_COUNTER:
guest_write_tsc(data);
break;
+ case KVM_MSR_SET_CR3_CACHE:
+ ret = vmx_cr3_cache_msr(vcpu, data);
+ break;
default:
msr = find_msr_entry(vmx, msr_index);
if (msr) {
@@ -1322,10 +1353,23 @@ static void vmx_set_cr0(struct kvm_vcpu
static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
+ struct kvm_cr3_cache *cache;
+ int idx;
+
vmx_flush_tlb(vcpu);
vmcs_writel(GUEST_CR3, cr3);
if (vcpu->arch.cr0 & X86_CR0_PE)
vmx_fpu_deactivate(vcpu);
+
+ if (!vcpu->arch.cr3_cache)
+ return;
+
+ idx = vcpu->arch.cr3_cache_idx;
+ cache = vcpu->arch.cr3_cache;
+
+ cache->entry[idx].host_cr3 = cr3;
+ cache->entry[idx].guest_cr3 = vcpu->arch.cr3;
+ vmcs_writel(CR3_TARGET_VALUE0 + idx*2, cr3);
}
static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
@@ -1503,6 +1547,39 @@ out:
up_read(&current->mm->mmap_sem);
return ret;
}
+/*
+ * Set up the cr3 validity hardware cache.
+ */
+static void vmcs_setup_cr3_cache(struct kvm_vcpu *vcpu)
+{
+ unsigned int cr3_target_values, i;
+ u64 msr_val;
+
+ rdmsrl(MSR_IA32_VMX_MISC, msr_val);
+
+ printk("MSR_IA32_VMX_MISC: %016Lx\n", msr_val);
+
+ /*
+ * 9 bits of "CR3 target values":
+ */
+ cr3_target_values = (msr_val >> 16) & ((1 << 10) - 1);
+ printk(" cr3 target values: %d\n", cr3_target_values);
+ if (cr3_target_values > KVM_CR3_CACHE_SIZE) {
+ printk("KVM: limiting cr3 cache size from %d to %d\n",
+ cr3_target_values, KVM_CR3_CACHE_SIZE);
+ cr3_target_values = KVM_CR3_CACHE_SIZE;
+ }
+
+ vcpu->arch.cr3_cache_idx = 0;
+ vcpu->arch.cr3_cache_limit = cr3_target_values;
+ /*
+ * Initialize. TODO: set this to guest physical memory.
+ */
+ for (i = 0; i < cr3_target_values; i++)
+ vmcs_writel(CR3_TARGET_VALUE0 + i*2, -1UL);
+
+ vmcs_write32(CR3_TARGET_COUNT, cr3_target_values);
+}
static void seg_setup(int seg)
{
@@ -1599,7 +1676,7 @@ static int vmx_vcpu_setup(struct vcpu_vm
vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf);
vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf);
- vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
+ vmcs_setup_cr3_cache(&vmx->vcpu);
vmcs_writel(HOST_CR0, read_cr0()); /* 22.2.3 */
vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */
@@ -2396,6 +2473,56 @@ static void fixup_rmode_irq(struct vcpu_
| vmx->rmode.irq.vector;
}
+static void kvm_cr3_cache_sync(struct kvm_vcpu *vcpu)
+{
+ void *guest_cr3_hva;
+ hpa_t guest_cr3_hpa;
+ struct kvm_cr3_cache *cache;
+ int j;
+ int idx = vcpu->arch.cr3_cache_idx;
+
+ if (!vcpu->arch.cr3_cache)
+ return;
+
+ guest_cr3_hpa = vmcs_readl(GUEST_CR3);
+ /*
+ * Are they in sync already?
+ */
+ if (guest_cr3_hpa == vcpu->arch.mmu.root_hpa[idx])
+ return;
+
+ cache = vcpu->arch.cr3_cache;
+#ifdef CONFIG_X86_64
+ if (vcpu->arch.mmu.shadow_root_level == 4) {
+ for (j = 0; j < vcpu->arch.cr3_cache_limit; j++) {
+ hpa_t root = cache->entry[j].host_cr3;
+ if (root != guest_cr3_hpa)
+ continue;
+ vcpu->arch.cr3 = cache->entry[j].guest_cr3;
+ vcpu->arch.cr3_cache_idx = j;
+ vcpu->arch.mmu.root_hpa[j] = cache->entry[j].host_cr3;
+ ++vcpu->stat.cr3_cache_synced;
+ return;
+ }
+ WARN_ON(j == KVM_CR3_CACHE_SIZE-1);
+ }
+#endif
+
+ guest_cr3_hva = __va(guest_cr3_hpa);
+ for (j = 0; j < vcpu->arch.cr3_cache_limit; j++) {
+ u64 *root = vcpu->arch.mmu.pae_root[j];
+ WARN_ON(!root);
+ if (root != guest_cr3_hva)
+ continue;
+ vcpu->arch.cr3 = cache->entry[j].guest_cr3;
+ vcpu->arch.cr3_cache_idx = j;
+ vcpu->arch.mmu.root_hpa[j] = __pa(vcpu->arch.mmu.pae_root[j]);
+ ++vcpu->stat.cr3_cache_synced;
+ return;
+ }
+ WARN_ON(j == KVM_CR3_CACHE_SIZE-1);
+}
+
static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -2406,6 +2533,8 @@ static void vmx_vcpu_run(struct kvm_vcpu
*/
vmcs_writel(HOST_CR0, read_cr0());
+ WARN_ON(vmcs_readl(GUEST_CR3) != vcpu->arch.mmu.root_hpa[vcpu->arch.cr3_cache_idx]);
+
asm(
/* Store host registers */
#ifdef CONFIG_X86_64
@@ -2520,6 +2649,12 @@ static void vmx_vcpu_run(struct kvm_vcpu
, "ebx", "edi", "rsi"
#endif
);
+ /*
+ * Figure out whether vcpu->cr3 needs updating because
+ * the guest made use of the cr3 cache.
+ */
+ kvm_cr3_cache_sync(vcpu);
+ WARN_ON(vmcs_readl(GUEST_CR3) != vcpu->arch.mmu.root_hpa[vcpu->arch.cr3_cache_idx]);
vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
if (vmx->rmode.irq.pending)
@@ -2552,11 +2687,16 @@ static void vmx_free_vmcs(struct kvm_vcp
static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct page *page = NULL;
spin_lock(&vmx_vpid_lock);
if (vmx->vpid != 0)
__clear_bit(vmx->vpid, vmx_vpid_bitmap);
spin_unlock(&vmx_vpid_lock);
+ if (vcpu->arch.cr3_cache) {
+ page = virt_to_page(vcpu->arch.cr3_cache);
+ kvm_release_page_dirty(page);
+ }
vmx_free_vmcs(vcpu);
kfree(vmx->host_msrs);
kfree(vmx->guest_msrs);
@@ -2641,6 +2781,7 @@ static struct kvm_x86_ops vmx_x86_ops =
.hardware_enable = hardware_enable,
.hardware_disable = hardware_disable,
.cpu_has_accelerated_tpr = cpu_has_vmx_virtualize_apic_accesses,
+ .cpu_has_cr3_cache = cpu_has_cr3_cache,
.vcpu_create = vmx_create_vcpu,
.vcpu_free = vmx_free_vcpu,
Index: kvm.paravirt/arch/x86/kvm/x86.c
===================================================================
--- kvm.paravirt.orig/arch/x86/kvm/x86.c
+++ kvm.paravirt/arch/x86/kvm/x86.c
@@ -80,6 +80,7 @@ struct kvm_stats_debugfs_item debugfs_en
{ "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
{ "multicall", VCPU_STAT(multicall) },
{ "multicall_nr", VCPU_STAT(multicall_nr) },
+ { "cr3_cache_synced", VCPU_STAT(cr3_cache_synced) },
{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
@@ -766,10 +767,13 @@ long kvm_arch_dev_ioctl(struct file *fil
}
case KVM_GET_PARA_FEATURES: {
__u32 para_features = KVM_PARA_FEATURES;
+
if (tdp_enabled) {
para_features &= ~(1UL << KVM_FEATURE_MMU_WRITE);
para_features &= ~(1UL << KVM_FEATURE_MULTICALL);
}
+ if (!kvm_x86_ops->cpu_has_cr3_cache())
+ para_features &= ~(1UL << KVM_FEATURE_CR3_CACHE);
r = -EFAULT;
if (copy_to_user(argp, &para_features, sizeof para_features))
@@ -2321,6 +2325,12 @@ static int kvm_hypercall_release_pt(stru
return 0;
}
+static int kvm_hypercall_set_cr3(struct kvm_vcpu *vcpu, gpa_t cr3)
+{
+ set_cr3(vcpu, cr3);
+ return 0;
+}
+
static int dispatch_hypercall(struct kvm_vcpu *vcpu, unsigned long nr,
unsigned long a0, unsigned long a1,
unsigned long a2, unsigned long a3)
@@ -2334,6 +2344,8 @@ static int dispatch_hypercall(struct kvm
return kvm_hypercall_flush_tlb(vcpu);
case KVM_HYPERCALL_RELEASE_PT:
return kvm_hypercall_release_pt(vcpu, a0);
+ case KVM_HYPERCALL_SET_CR3:
+ return kvm_hypercall_set_cr3(vcpu, a0);
}
return -KVM_ENOSYS;
@@ -3245,12 +3257,13 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *
{
struct page *page;
struct kvm *kvm;
- int r;
+ int r, i;
BUG_ON(vcpu->kvm == NULL);
kvm = vcpu->kvm;
- vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+ for (i = 0; i < KVM_CR3_CACHE_SIZE; i++)
+ vcpu->arch.mmu.root_hpa[i] = INVALID_PAGE;
if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0)
vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
else
Index: kvm.paravirt/include/asm-x86/kvm_host.h
===================================================================
--- kvm.paravirt.orig/include/asm-x86/kvm_host.h
+++ kvm.paravirt/include/asm-x86/kvm_host.h
@@ -181,11 +181,11 @@ struct kvm_mmu {
gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva);
void (*prefetch_page)(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *page);
- hpa_t root_hpa;
+ hpa_t root_hpa[KVM_CR3_CACHE_SIZE];
int root_level;
int shadow_root_level;
- u64 *pae_root;
+ u64 *pae_root[KVM_CR3_CACHE_SIZE];
};
struct kvm_vcpu_arch {
@@ -199,6 +199,9 @@ struct kvm_vcpu_arch {
unsigned long cr0;
unsigned long cr2;
unsigned long cr3;
+ struct kvm_cr3_cache *cr3_cache;
+ unsigned int cr3_cache_idx;
+ unsigned int cr3_cache_limit;
unsigned long cr4;
unsigned long cr8;
u64 pdptrs[4]; /* pae */
@@ -323,6 +326,7 @@ struct kvm_vcpu_stat {
u32 insn_emulation_fail;
u32 multicall;
u32 multicall_nr;
+ u32 cr3_cache_synced;
};
struct descriptor_table {
@@ -339,6 +343,7 @@ struct kvm_x86_ops {
int (*hardware_setup)(void); /* __init */
void (*hardware_unsetup)(void); /* __exit */
bool (*cpu_has_accelerated_tpr)(void);
+ bool (*cpu_has_cr3_cache)(void);
/* Create, but do not attach this VCPU */
struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id);
Index: kvm.paravirt/include/asm-x86/kvm_para.h
===================================================================
--- kvm.paravirt.orig/include/asm-x86/kvm_para.h
+++ kvm.paravirt/include/asm-x86/kvm_para.h
@@ -8,6 +8,7 @@
#define KVM_FEATURE_NOP_IO_DELAY 0
#define KVM_FEATURE_MMU_WRITE 1
#define KVM_FEATURE_MULTICALL 2
+#define KVM_FEATURE_CR3_CACHE 3
/* This CPUID returns a feature bitmap in eax. Before enabling a particular
* paravirtualization, the appropriate feature bit should be checked.
@@ -19,7 +20,10 @@
#define KVM_PARA_FEATURES ((1UL << KVM_FEATURE_NOP_IO_DELAY) | \
(1UL << KVM_FEATURE_MMU_WRITE) | \
- (1UL << KVM_FEATURE_MULTICALL))
+ (1UL << KVM_FEATURE_MULTICALL) | \
+ (1UL << KVM_FEATURE_CR3_CACHE))
+
+#define KVM_MSR_SET_CR3_CACHE 0x87655678
struct kvm_multicall_entry
{
@@ -118,4 +122,16 @@ static inline unsigned int kvm_arch_para
#endif
+#define KVM_CR3_CACHE_SIZE 4
+
+struct kvm_cr3_cache_entry {
+ __u64 guest_cr3;
+ __u64 host_cr3;
+};
+
+struct kvm_cr3_cache {
+ struct kvm_cr3_cache_entry entry[KVM_CR3_CACHE_SIZE];
+ __u32 max_idx;
+};
+
#endif
Index: kvm.paravirt/include/linux/kvm_para.h
===================================================================
--- kvm.paravirt.orig/include/linux/kvm_para.h
+++ kvm.paravirt/include/linux/kvm_para.h
@@ -20,6 +20,7 @@
#define KVM_HYPERCALL_FLUSH_TLB 3
#define KVM_HYPERCALL_RELEASE_PT 4
#define KVM_HYPERCALL_MULTICALL 5
+#define KVM_HYPERCALL_SET_CR3 6
/*
* hypercalls use architecture specific
Index: kvm.paravirt/arch/x86/kvm/svm.c
===================================================================
--- kvm.paravirt.orig/arch/x86/kvm/svm.c
+++ kvm.paravirt/arch/x86/kvm/svm.c
@@ -1801,6 +1801,11 @@ static bool svm_cpu_has_accelerated_tpr(
return false;
}
+static bool cpu_has_cr3_cache(void)
+{
+ return false;
+}
+
static struct kvm_x86_ops svm_x86_ops = {
.cpu_has_kvm_support = has_svm,
.disabled_by_bios = is_disabled,
@@ -1810,6 +1815,7 @@ static struct kvm_x86_ops svm_x86_ops =
.hardware_enable = svm_hardware_enable,
.hardware_disable = svm_hardware_disable,
.cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
+ .cpu_has_cr3_cache = cpu_has_cr3_cache,
.vcpu_create = svm_create_vcpu,
.vcpu_free = svm_free_vcpu,
--
-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
^ permalink raw reply [flat|nested] 21+ messages in thread* Re: [patch 0/5] KVM paravirt MMU updates and cr3 caching
[not found] <20080216220924.733723618@redhat.com>
` (4 preceding siblings ...)
2008-02-16 22:09 ` [patch 5/5] KVM: VMX cr3 cache support Marcelo Tosatti
@ 2008-02-16 23:37 ` Anthony Liguori
2008-02-17 2:24 ` Marcelo Tosatti
[not found] ` <20080216221220.843135254@redhat.com>
` (2 subsequent siblings)
8 siblings, 1 reply; 21+ messages in thread
From: Anthony Liguori @ 2008-02-16 23:37 UTC (permalink / raw)
To: Marcelo Tosatti; +Cc: kvm-devel, Avi Kivity
Marcelo Tosatti wrote:
> The following patchset, based on earlier work by Anthony and Ingo, adds
> paravirt_ops support for KVM guests enabling hypercall based pte updates,
> hypercall batching and cr3 caching.
>
Could you post performance results for each optimization? I'm
particularly curious if the hypercall batching is very useful.
Regards,
Anthony Liguori
-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
^ permalink raw reply [flat|nested] 21+ messages in thread* Re: [patch 0/5] KVM paravirt MMU updates and cr3 caching
2008-02-16 23:37 ` [patch 0/5] KVM paravirt MMU updates and cr3 caching Anthony Liguori
@ 2008-02-17 2:24 ` Marcelo Tosatti
0 siblings, 0 replies; 21+ messages in thread
From: Marcelo Tosatti @ 2008-02-17 2:24 UTC (permalink / raw)
To: Anthony Liguori; +Cc: kvm-devel, Avi Kivity
On Sat, Feb 16, 2008 at 05:37:00PM -0600, Anthony Liguori wrote:
> Marcelo Tosatti wrote:
> >The following patchset, based on earlier work by Anthony and Ingo, adds
> >paravirt_ops support for KVM guests enabling hypercall based pte updates,
> >hypercall batching and cr3 caching.
> >
>
> Could you post performance results for each optimization? I'm
> particularly curious if the hypercall batching is very useful.
Batched hypercall pte updates give 8.5% performance improvement on
kernel compile:
http://www.mail-archive.com/kvm-devel@lists.sourceforge.net/msg12395.html
I can get separate results tomorrow or Monday, but I'm sure batching
plays a significant role. For the kernel compile test, there is an
average of 5 pte updates per batched hypercall.
-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
^ permalink raw reply [flat|nested] 21+ messages in thread
[parent not found: <20080216221220.843135254@redhat.com>]
* Re: [patch 2/5] KVM: hypercall based pte updates and TLB flushes
[not found] ` <20080216221220.843135254@redhat.com>
@ 2008-02-17 8:28 ` Avi Kivity
2008-02-17 13:13 ` Avi Kivity
2008-02-17 8:32 ` Avi Kivity
1 sibling, 1 reply; 21+ messages in thread
From: Avi Kivity @ 2008-02-17 8:28 UTC (permalink / raw)
To: Marcelo Tosatti; +Cc: kvm-devel
Marcelo Tosatti wrote:
> Hypercall based pte updates are faster than faults, and also allow use
> of the lazy MMU mode to batch operations.
>
> Don't report the feature if two dimensional paging is enabled.
>
> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> +/*
> + * We only need to hook operations that are MMU writes. We hook these so that
> + * we can use lazy MMU mode to batch these operations. We could probably
> + * improve the performance of the host code if we used some of the information
> + * here to simplify processing of batched writes.
> + */
>
One option is, if the guest promises never to write to a page table
directly, is to avoid write protecting guest page tables. I think the
shadow code can handle it (since the gfn/spte relationship is maintained
by shadow code, and doesn't require reading the guest page tables), but
am not sure.
--
Any sufficiently difficult bug is indistinguishable from a feature.
-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
^ permalink raw reply [flat|nested] 21+ messages in thread* Re: [patch 2/5] KVM: hypercall based pte updates and TLB flushes
2008-02-17 8:28 ` [patch 2/5] KVM: hypercall based pte updates and TLB flushes Avi Kivity
@ 2008-02-17 13:13 ` Avi Kivity
2008-02-17 14:51 ` Marcelo Tosatti
0 siblings, 1 reply; 21+ messages in thread
From: Avi Kivity @ 2008-02-17 13:13 UTC (permalink / raw)
To: Marcelo Tosatti; +Cc: kvm-devel
Avi Kivity wrote:
> Marcelo Tosatti wrote:
>> Hypercall based pte updates are faster than faults, and also allow use
>> of the lazy MMU mode to batch operations.
>>
>> Don't report the feature if two dimensional paging is enabled.
>>
>> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
>> +/*
>> + * We only need to hook operations that are MMU writes. We hook
>> these so that
>> + * we can use lazy MMU mode to batch these operations. We could
>> probably
>> + * improve the performance of the host code if we used some of the
>> information
>> + * here to simplify processing of batched writes.
>> + */
>>
>
> One option is, if the guest promises never to write to a page table
> directly, is to avoid write protecting guest page tables. I think the
> shadow code can handle it (since the gfn/spte relationship is
> maintained by shadow code, and doesn't require reading the guest page
> tables), but am not sure.
>
In addition to reducing mmu work for write protection, this allows more
efficient use of large pages.
--
error compiling committee.c: too many arguments to function
-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [patch 2/5] KVM: hypercall based pte updates and TLB flushes
2008-02-17 13:13 ` Avi Kivity
@ 2008-02-17 14:51 ` Marcelo Tosatti
2008-02-17 14:57 ` Avi Kivity
0 siblings, 1 reply; 21+ messages in thread
From: Marcelo Tosatti @ 2008-02-17 14:51 UTC (permalink / raw)
To: Avi Kivity; +Cc: kvm-devel
On Sun, Feb 17, 2008 at 03:13:47PM +0200, Avi Kivity wrote:
> Avi Kivity wrote:
> >Marcelo Tosatti wrote:
> >>Hypercall based pte updates are faster than faults, and also allow use
> >>of the lazy MMU mode to batch operations.
> >>
> >>Don't report the feature if two dimensional paging is enabled.
> >>
> >>Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> >>+/*
> >>+ * We only need to hook operations that are MMU writes. We hook
> >>these so that
> >>+ * we can use lazy MMU mode to batch these operations. We could
> >>probably
> >>+ * improve the performance of the host code if we used some of the
> >>information
> >>+ * here to simplify processing of batched writes.
> >>+ */
> >>
> >
> >One option is, if the guest promises never to write to a page table
> >directly, is to avoid write protecting guest page tables. I think the
> >shadow code can handle it (since the gfn/spte relationship is
> >maintained by shadow code, and doesn't require reading the guest page
> >tables), but am not sure.
> >
>
> In addition to reducing mmu work for write protection, this allows more
> efficient use of large pages.
Yes, and gets rid of the remote TLB flushing.
Issue is the paravirt_ops code in Linux does not cover all pte updates
(bit updates, ptep_get_and_clear, etc).
The plan is to get the basic infrastructure merged into KVM first (which
is a significant improvement already) and then later have paravirt_ops
cover all updates, disabling write protection.
-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [patch 2/5] KVM: hypercall based pte updates and TLB flushes
2008-02-17 14:51 ` Marcelo Tosatti
@ 2008-02-17 14:57 ` Avi Kivity
2008-02-18 5:00 ` Marcelo Tosatti
0 siblings, 1 reply; 21+ messages in thread
From: Avi Kivity @ 2008-02-17 14:57 UTC (permalink / raw)
To: Marcelo Tosatti; +Cc: kvm-devel
Marcelo Tosatti wrote:
> Issue is the paravirt_ops code in Linux does not cover all pte updates
> (bit updates, ptep_get_and_clear, etc).
>
> The plan is to get the basic infrastructure merged into KVM first (which
> is a significant improvement already) and then later have paravirt_ops
> cover all updates, disabling write protection.
>
Okay, sounds good.
Do you know if anyone is working on extending pv_ops as you describe?
--
error compiling committee.c: too many arguments to function
-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [patch 2/5] KVM: hypercall based pte updates and TLB flushes
2008-02-17 14:57 ` Avi Kivity
@ 2008-02-18 5:00 ` Marcelo Tosatti
0 siblings, 0 replies; 21+ messages in thread
From: Marcelo Tosatti @ 2008-02-18 5:00 UTC (permalink / raw)
To: Avi Kivity; +Cc: Marcelo Tosatti, kvm-devel
On Sun, Feb 17, 2008 at 04:57:36PM +0200, Avi Kivity wrote:
> Marcelo Tosatti wrote:
> > Issue is the paravirt_ops code in Linux does not cover all pte updates
> > (bit updates, ptep_get_and_clear, etc).
> >
> > The plan is to get the basic infrastructure merged into KVM first (which
> > is a significant improvement already) and then later have paravirt_ops
> > cover all updates, disabling write protection.
> >
>
> Okay, sounds good.
>
> Do you know if anyone is working on extending pv_ops as you describe?
Not that I'm aware of.
I was planning to do it. The first part of that is a mask based bit
update hypercall (ie. a non-broken version of the patch I posted earlier
which Jeremy replied to).
-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [patch 2/5] KVM: hypercall based pte updates and TLB flushes
[not found] ` <20080216221220.843135254@redhat.com>
2008-02-17 8:28 ` [patch 2/5] KVM: hypercall based pte updates and TLB flushes Avi Kivity
@ 2008-02-17 8:32 ` Avi Kivity
1 sibling, 0 replies; 21+ messages in thread
From: Avi Kivity @ 2008-02-17 8:32 UTC (permalink / raw)
To: Marcelo Tosatti; +Cc: kvm-devel
Marcelo Tosatti wrote:
> Hypercall based pte updates are faster than faults, and also allow use
> of the lazy MMU mode to batch operations.
>
> Don't report the feature if two dimensional paging is enabled.
>
> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> Cc: Anthony Liguori <aliguori@us.ibm.com>
>
> +static int kvm_hypercall_mmu_write(struct kvm_vcpu *vcpu, gva_t addr,
> + unsigned long size, unsigned long a0,
> + unsigned long a1)
> +{
> + gpa_t gpa;
> + u64 value;
> +
> + if (mmu_topup_memory_caches(vcpu))
> + return -KVM_EFAULT;
> +
> + down_read(&vcpu->kvm->slots_lock);
> + gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
> + up_read(&vcpu->kvm->slots_lock);
> +
> + if (gpa == UNMAPPED_GVA)
> + return -KVM_EFAULT;
> + if (size == 1) {
> + if (!emulator_write_phys(vcpu, gpa, &a0, sizeof(a0)))
> + return -KVM_EFAULT;
> + } else if (size == 2) {
> + if (!is_long_mode(vcpu) && is_pae(vcpu))
> + value = (u64)a1 << 32 | a0;
> + else
> + value = a0;
> + if (!emulator_write_phys(vcpu, gpa, &value, sizeof(value)))
> + return -KVM_EFAULT;
>
The size logic can be simplified if it is redefined to mean "size in
guest longs". Actually, it can be inferred from the guest mode, and a1
is only required iff the guest is a 32-bit pae.
--
Any sufficiently difficult bug is indistinguishable from a feature.
-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
^ permalink raw reply [flat|nested] 21+ messages in thread
[parent not found: <20080216221220.924823582@redhat.com>]
* Re: [patch 3/5] KVM: hypercall batching
[not found] ` <20080216221220.924823582@redhat.com>
@ 2008-02-17 8:40 ` Avi Kivity
2008-02-18 16:47 ` Marcelo Tosatti
2008-02-17 18:40 ` Hollis Blanchard
1 sibling, 1 reply; 21+ messages in thread
From: Avi Kivity @ 2008-02-17 8:40 UTC (permalink / raw)
To: Marcelo Tosatti; +Cc: kvm-devel
Marcelo Tosatti wrote:
> Batch pte updates and tlb flushes in lazy MMU mode.
>
>
I am slightly uneasy about generic hypercall batching. An alternative
way to do it would be to define a kvm_mmu_op structure (with an embedded
opcode) and have a single hypercall execute a number of these (and not
have any hypercall for executing just one).
> +static int kvm_hypercall_multicall(struct kvm_vcpu *vcpu, gpa_t addr, u32 nents)
> +{
> + int i, result = 0;
> +
> + ++vcpu->stat.multicall;
> + vcpu->stat.multicall_nr += nents;
> +
> + for (i = 0; i < nents; i++) {
> + struct kvm_multicall_entry mc;
> + int ret;
> +
> + down_read(&vcpu->kvm->slots_lock);
> + ret = kvm_read_guest(vcpu->kvm, addr, &mc, sizeof(mc));
> + up_read(&vcpu->kvm->slots_lock);
> + if (ret)
> + return -KVM_EFAULT;
> +
> + ret = dispatch_hypercall(vcpu, mc.nr, mc.a0, mc.a1, mc.a2,
> + mc.a3);
> + if (ret)
> + result = ret;
> + addr += sizeof(mc);
> + }
> + if (result < 0)
> + return -KVM_EINVAL;
> + return result;
> +}
> +
The return code of the hypercalls (if positive) is lost. In the case
that one hypercall failed, which one exactly is also lost.
For mmu ops, it doesn't matter (since if any fails, the guest is
toast). For a generic framework, it may matter. If we keep it generic,
I suggest adding a field for the return code of each hypercall, and have
the multicall return the number of executed hypercalls.
--
Any sufficiently difficult bug is indistinguishable from a feature.
-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
^ permalink raw reply [flat|nested] 21+ messages in thread* Re: [patch 3/5] KVM: hypercall batching
2008-02-17 8:40 ` [patch 3/5] KVM: hypercall batching Avi Kivity
@ 2008-02-18 16:47 ` Marcelo Tosatti
0 siblings, 0 replies; 21+ messages in thread
From: Marcelo Tosatti @ 2008-02-18 16:47 UTC (permalink / raw)
To: Avi Kivity; +Cc: kvm-devel
On Sun, Feb 17, 2008 at 10:40:56AM +0200, Avi Kivity wrote:
> Marcelo Tosatti wrote:
> >Batch pte updates and tlb flushes in lazy MMU mode.
> >
> >
>
> I am slightly uneasy about generic hypercall batching. An alternative
> way to do it would be to define a kvm_mmu_op structure (with an embedded
> opcode) and have a single hypercall execute a number of these (and not
> have any hypercall for executing just one).
>
> >+static int kvm_hypercall_multicall(struct kvm_vcpu *vcpu, gpa_t addr, u32
> >nents)
> >+{
> >+ int i, result = 0;
> >+
> >+ ++vcpu->stat.multicall;
> >+ vcpu->stat.multicall_nr += nents;
> >+
> >+ for (i = 0; i < nents; i++) {
> >+ struct kvm_multicall_entry mc;
> >+ int ret;
> >+
> >+ down_read(&vcpu->kvm->slots_lock);
> >+ ret = kvm_read_guest(vcpu->kvm, addr, &mc, sizeof(mc));
> >+ up_read(&vcpu->kvm->slots_lock);
> >+ if (ret)
> >+ return -KVM_EFAULT;
> >+
> >+ ret = dispatch_hypercall(vcpu, mc.nr, mc.a0, mc.a1, mc.a2,
> >+ mc.a3);
> >+ if (ret)
> >+ result = ret;
> >+ addr += sizeof(mc);
> >+ }
> >+ if (result < 0)
> >+ return -KVM_EINVAL;
> >+ return result;
> >+}
> >+
>
> The return code of the hypercalls (if positive) is lost. In the case
> that one hypercall failed, which one exactly is also lost.
>
> For mmu ops, it doesn't matter (since if any fails, the guest is
> toast). For a generic framework, it may matter. If we keep it generic,
> I suggest adding a field for the return code of each hypercall, and have
> the multicall return the number of executed hypercalls.
We need a "generic" mechanism which allows different operations in the
same multicall. For example mixed bit pte updates and normal whole pte
updates.
I've added an error code to each multicall entry and changed the
hypercall itself to return the number of processed entries as you
suggested.
-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [patch 3/5] KVM: hypercall batching
[not found] ` <20080216221220.924823582@redhat.com>
2008-02-17 8:40 ` [patch 3/5] KVM: hypercall batching Avi Kivity
@ 2008-02-17 18:40 ` Hollis Blanchard
2008-02-18 8:06 ` Avi Kivity
1 sibling, 1 reply; 21+ messages in thread
From: Hollis Blanchard @ 2008-02-17 18:40 UTC (permalink / raw)
To: Marcelo Tosatti; +Cc: kvm-devel, Avi Kivity
On Sat, 2008-02-16 at 17:09 -0500, Marcelo Tosatti wrote:
> plain text document attachment (kvm-multicall)
> Batch pte updates and tlb flushes in lazy MMU mode.
>
> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> Cc: Anthony Liguori <aliguori@us.ibm.com>
>
> Index: kvm.paravirt/arch/x86/kernel/kvm.c
> ===================================================================
> --- kvm.paravirt.orig/arch/x86/kernel/kvm.c
> +++ kvm.paravirt/arch/x86/kernel/kvm.c
> @@ -25,6 +25,74 @@
> #include <linux/kvm_para.h>
> #include <linux/cpu.h>
> #include <linux/mm.h>
> +#include <linux/hardirq.h>
> +
> +#define MAX_MULTICALL_NR (PAGE_SIZE / sizeof(struct kvm_multicall_entry))
> +
> +struct kvm_para_state {
> + struct kvm_multicall_entry queue[MAX_MULTICALL_NR];
> + int queue_index;
> + enum paravirt_lazy_mode mode;
> +};
> +
> +static DEFINE_PER_CPU(struct kvm_para_state, para_state);
AFAICS there is no guarantee about page-alignment here...
> +static int kvm_hypercall_multicall(struct kvm_vcpu *vcpu, gpa_t addr, u32 nents)
> +{
> + int i, result = 0;
> +
> + ++vcpu->stat.multicall;
> + vcpu->stat.multicall_nr += nents;
> +
> + for (i = 0; i < nents; i++) {
> + struct kvm_multicall_entry mc;
> + int ret;
> +
> + down_read(&vcpu->kvm->slots_lock);
> + ret = kvm_read_guest(vcpu->kvm, addr, &mc, sizeof(mc));
> + up_read(&vcpu->kvm->slots_lock);
> + if (ret)
> + return -KVM_EFAULT;
> +
> + ret = dispatch_hypercall(vcpu, mc.nr, mc.a0, mc.a1, mc.a2,
> + mc.a3);
> + if (ret)
> + result = ret;
> + addr += sizeof(mc);
> + }
> + if (result < 0)
> + return -KVM_EINVAL;
> + return result;
> +}
... but here you're assuming that 'queue' is physically contiguous,
which is not necessarily true once you cross a page boundary.
--
Hollis Blanchard
IBM Linux Technology Center
-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
^ permalink raw reply [flat|nested] 21+ messages in thread* Re: [patch 3/5] KVM: hypercall batching
2008-02-17 18:40 ` Hollis Blanchard
@ 2008-02-18 8:06 ` Avi Kivity
2008-02-18 8:43 ` Christian Borntraeger
[not found] ` <1203361276.3428.6.camel@basalt>
0 siblings, 2 replies; 21+ messages in thread
From: Avi Kivity @ 2008-02-18 8:06 UTC (permalink / raw)
To: Hollis Blanchard; +Cc: kvm-devel, Marcelo Tosatti
Hollis Blanchard wrote:
> On Sat, 2008-02-16 at 17:09 -0500, Marcelo Tosatti wrote:
>
>> plain text document attachment (kvm-multicall)
>> Batch pte updates and tlb flushes in lazy MMU mode.
>>
>> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
>> Cc: Anthony Liguori <aliguori@us.ibm.com>
>>
>> Index: kvm.paravirt/arch/x86/kernel/kvm.c
>> ===================================================================
>> --- kvm.paravirt.orig/arch/x86/kernel/kvm.c
>> +++ kvm.paravirt/arch/x86/kernel/kvm.c
>> @@ -25,6 +25,74 @@
>> #include <linux/kvm_para.h>
>> #include <linux/cpu.h>
>> #include <linux/mm.h>
>> +#include <linux/hardirq.h>
>> +
>> +#define MAX_MULTICALL_NR (PAGE_SIZE / sizeof(struct kvm_multicall_entry))
>> +
>> +struct kvm_para_state {
>> + struct kvm_multicall_entry queue[MAX_MULTICALL_NR];
>> + int queue_index;
>> + enum paravirt_lazy_mode mode;
>> +};
>> +
>> +static DEFINE_PER_CPU(struct kvm_para_state, para_state);
>>
>
> AFAICS there is no guarantee about page-alignment here...
>
>
Right.
>> +static int kvm_hypercall_multicall(struct kvm_vcpu *vcpu, gpa_t addr, u32 nents)
>> +{
>> + int i, result = 0;
>> +
>> + ++vcpu->stat.multicall;
>> + vcpu->stat.multicall_nr += nents;
>> +
>> + for (i = 0; i < nents; i++) {
>> + struct kvm_multicall_entry mc;
>> + int ret;
>> +
>> + down_read(&vcpu->kvm->slots_lock);
>> + ret = kvm_read_guest(vcpu->kvm, addr, &mc, sizeof(mc));
>> + up_read(&vcpu->kvm->slots_lock);
>> + if (ret)
>> + return -KVM_EFAULT;
>> +
>> + ret = dispatch_hypercall(vcpu, mc.nr, mc.a0, mc.a1, mc.a2,
>> + mc.a3);
>> + if (ret)
>> + result = ret;
>> + addr += sizeof(mc);
>> + }
>> + if (result < 0)
>> + return -KVM_EINVAL;
>> + return result;
>> +}
>>
>
> ... but here you're assuming that 'queue' is physically contiguous,
> which is not necessarily true once you cross a page boundary.
>
Kernel data is physically contiguous (true for per-cpu data as well?),
so there's no issue here.
--
Do not meddle in the internals of kernels, for they are subtle and quick to panic.
-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
^ permalink raw reply [flat|nested] 21+ messages in thread* Re: [patch 3/5] KVM: hypercall batching
2008-02-18 8:06 ` Avi Kivity
@ 2008-02-18 8:43 ` Christian Borntraeger
2008-02-18 8:47 ` Avi Kivity
[not found] ` <1203361276.3428.6.camel@basalt>
1 sibling, 1 reply; 21+ messages in thread
From: Christian Borntraeger @ 2008-02-18 8:43 UTC (permalink / raw)
To: kvm-devel; +Cc: Marcelo Tosatti, Hollis Blanchard, Avi Kivity
Am Montag, 18. Februar 2008 schrieb Avi Kivity:
> > AFAICS there is no guarantee about page-alignment here...
> Kernel data is physically contiguous (true for per-cpu data as well?),
> so no there's issue here.
Modules are loaded into vmalloc space, no? I think, if kvm is built as
module, static module variables are not guaranteed to be contiguous.
Christian
-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
^ permalink raw reply [flat|nested] 21+ messages in thread* Re: [patch 3/5] KVM: hypercall batching
2008-02-18 8:43 ` Christian Borntraeger
@ 2008-02-18 8:47 ` Avi Kivity
0 siblings, 0 replies; 21+ messages in thread
From: Avi Kivity @ 2008-02-18 8:47 UTC (permalink / raw)
To: Christian Borntraeger; +Cc: kvm-devel, Marcelo Tosatti, Hollis Blanchard
Christian Borntraeger wrote:
> Am Montag, 18. Februar 2008 schrieb Avi Kivity:
>
>>> AFAICS there is no guarantee about page-alignment here...
>>>
>
>
>> Kernel data is physically contiguous (true for per-cpu data as well?),
>> so there's no issue here.
>>
>
> Modules are loaded into vmalloc space, no? I think, if kvm is built as
> module, static module variables are not guaranteed to be contiguous.
>
It's not a module (the code is part of pv-ops for kvm, not the kvm host
itself).
--
Do not meddle in the internals of kernels, for they are subtle and quick to panic.
-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
^ permalink raw reply [flat|nested] 21+ messages in thread
[parent not found: <1203361276.3428.6.camel@basalt>]
* Re: [patch 3/5] KVM: hypercall batching
[not found] ` <1203361276.3428.6.camel@basalt>
@ 2008-02-19 8:30 ` Avi Kivity
0 siblings, 0 replies; 21+ messages in thread
From: Avi Kivity @ 2008-02-19 8:30 UTC (permalink / raw)
To: Hollis Blanchard; +Cc: kvm-devel, Marcelo Tosatti
Hollis Blanchard wrote:
>> Kernel data is physically contiguous (true for per-cpu data as well?),
>> so there's no issue here.
>>
>
> So this is an addition to the ABI, that the data must be physically
> contiguous. That's a pretty subtle implicit requirement, and it's easy
> to resolve the issue by requiring page-aligned data in the guest in the
> first place.
>
> Please don't forget that there are other OSes that could use this
> interface as well...
>
I think it's fairly straightforward that a (phys_addr, length) API
implies physically contiguous memory. If the guest doesn't have
physically contiguous memory, it can page align itself.
But you do raise an important issue, that our ABIs (host/user and
host/guest) are seriously underdocumented.
--
error compiling committee.c: too many arguments to function
-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
^ permalink raw reply [flat|nested] 21+ messages in thread
[parent not found: <20080216221221.002948712@redhat.com>]
* Re: [patch 4/5] KVM: ignore zapped root pagetables
[not found] ` <20080216221221.002948712@redhat.com>
@ 2008-02-17 8:52 ` Avi Kivity
0 siblings, 0 replies; 21+ messages in thread
From: Avi Kivity @ 2008-02-17 8:52 UTC (permalink / raw)
To: Marcelo Tosatti; +Cc: kvm-devel
Marcelo Tosatti wrote:
> Mark zapped root pagetables as invalid and ignore such pages during lookup.
>
> This is a problem with the cr3-target feature, where a zapped root table fools
> the faulting code into creating a read-only mapping. The result is a lockup
> if the instruction can't be emulated.
>
> @@ -796,8 +797,10 @@ static void kvm_mmu_zap_page(struct kvm
> if (!sp->root_count) {
> hlist_del(&sp->hash_link);
> kvm_mmu_free_page(kvm, sp);
> - } else
> + } else {
> list_move(&sp->link, &kvm->arch.active_mmu_pages);
> + sp->role.invalid = 1;
> + }
> kvm_mmu_reset_last_pte_updated(kvm);
There's an smp issue here. You're marking a shadow page as invalid, but
it may be currently in use by another vcpu. So the shadow page and the
guest page may be out of sync.
A fix is to send an IPI to all vcpus in such a situation, and request
them to unload the mmu.
Also, we can't rely on memory pressure to flush out the invalid shadow
pages, because for many workloads the shadow cache is large enough (the
"mmu_recycled" counter never increments). So a check for (root_count ==
0 && role.invalid) when decrementing root_count can help to zap those pages.
--
Any sufficiently difficult bug is indistinguishable from a feature.
-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
^ permalink raw reply [flat|nested] 21+ messages in thread