kvm.vger.kernel.org archive mirror
* Nested VMX support v4
@ 2009-12-10 18:38 oritw
  2009-12-10 18:38 ` [PATCH 1/7] Nested VMX patch 1 implements vmon and vmoff oritw
  2009-12-17 13:49 ` Nested VMX support v4 Avi Kivity
  0 siblings, 2 replies; 24+ messages in thread
From: oritw @ 2009-12-10 18:38 UTC (permalink / raw)
  To: avi; +Cc: kvm, oritw, benami, abelg, muli, aliguori, mdday

Avi,                                                           
We have addressed all of the comments, please apply.           

The following patches implement nested VMX support. The patches enable a guest
to use the VMX APIs in order to run its own nested guest (i.e., enable running
other hypervisors which use VMX under KVM). The current patches support running
Linux under a nested KVM using shadow page tables (with bypass_guest_pf
disabled). Reworking EPT support to mesh cleanly with the current shadow paging
design, per Avi's comments, is a work in progress.
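
For orientation: every VMX instruction executed by the L1 guest traps to L0,
and the series replaces the unconditional #UD handler (handle_vmx_insn) for
those exit reasons with real emulation handlers. A rough sketch of the end
result (illustrative only; the actual table is kvm_vmx_exit_handlers[] in
arch/x86/kvm/vmx.c, updated incrementally by the patches below, and the array
name here is made up):

static int (*nested_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
	[EXIT_REASON_VMON]    = handle_vmon,     /* patch 1 */
	[EXIT_REASON_VMOFF]   = handle_vmoff,    /* patch 1 */
	[EXIT_REASON_VMCLEAR] = handle_vmclear,  /* patch 2 */
	[EXIT_REASON_VMPTRLD] = handle_vmptrld,  /* patch 3 */
	[EXIT_REASON_VMPTRST] = handle_vmptrst,  /* patch 3 */
	[EXIT_REASON_VMREAD]  = handle_vmread,   /* patch 4 */
	[EXIT_REASON_VMWRITE] = handle_vmwrite,  /* patch 4 */
	/* VMLAUNCH/VMRESUME emulation follows in later patches of the series */
};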

The current patches support multiple nested hypervisors, which can run
multiple guests. Only 64-bit nested hypervisors are supported. SMP is
supported. Additional patches for running Windows under nested KVM, and
Linux under nested VMware Server, are currently being tested in the lab and
will be sent as a follow-on patchset.

These patches were written by:
     Orit Wasserman, oritw <at> il.ibm.com
     Ben-Ami Yassour, benami <at> il.ibm.com
     Abel Gordon, abelg <at> il.ibm.com
     Muli Ben-Yehuda, muli <at> il.ibm.com

With contributions by:
     Anthony Liguori, aliguori <at> us.ibm.com
     Mike Day, mdday <at> us.ibm.com

This work was inspired by the nested SVM support by Alexander Graf and Joerg
Roedel.

Changes since v3:
               Added support for 32-bit nested guests
               Added support for multiple nested guests
               Added support for multiple nested hypervisors
               Implemented VMX instruction decoding (operand decoding sketched below)
               Implemented CR0.TS handling for nested guests
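
The VMX instruction decoding listed above amounts to parsing the VM exit's
VMX-instruction-information field to locate an instruction's memory operand.
A simplified sketch of that decoding, based on the SDM's field layout (compare
get_vmx_mem_address() in patch 2; the helper name below is illustrative and
truncation to the 16/32-bit address sizes is omitted for brevity):

static gva_t decode_vmx_mem_operand(struct kvm_vcpu *vcpu,
				    unsigned long exit_qualification,
				    u32 info)
{
	int scaling   = info & 3;		/* bits 1:0, scale = 1 << scaling */
	int seg_reg   = (info >> 15) & 7;	/* bits 17:15, segment register */
	int index_reg = (info >> 18) & 0xf;	/* bits 21:18, index register */
	int base_reg  = (info >> 23) & 0xf;	/* bits 26:23, base register */
	gva_t addr;

	if (info & (1u << 10))			/* bit 10: register operand, no memory address */
		return 0;

	addr = vmx_get_segment_base(vcpu, seg_reg);
	if (!(info & (1u << 27)))		/* bit 27 clear: base register valid */
		addr += kvm_register_read(vcpu, base_reg);
	if (!(info & (1u << 22)))		/* bit 22 clear: index register valid */
		addr += kvm_register_read(vcpu, index_reg) << scaling;

	return addr + exit_qualification;	/* exit qualification holds the displacement */
}

Each of the vmclear/vmptrld/vmptrst/vmread/vmwrite handlers uses the decoded
address together with kvm_read_guest_virt()/kvm_write_guest_virt() to fetch or
store the operand.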


* [PATCH 1/7] Nested VMX patch 1 implements vmon and vmoff
  2009-12-10 18:38 Nested VMX support v4 oritw
@ 2009-12-10 18:38 ` oritw
  2009-12-10 18:38   ` [PATCH 2/7] Nested VMX patch 2 implements vmclear oritw
                     ` (2 more replies)
  2009-12-17 13:49 ` Nested VMX support v4 Avi Kivity
  1 sibling, 3 replies; 24+ messages in thread
From: oritw @ 2009-12-10 18:38 UTC (permalink / raw)
  To: avi; +Cc: kvm, oritw, benami, abelg, muli, aliguori, mdday

From: Orit Wasserman <oritw@il.ibm.com>

---
 arch/x86/kvm/svm.c |    3 -
 arch/x86/kvm/vmx.c |  265 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 arch/x86/kvm/x86.c |   11 ++-
 arch/x86/kvm/x86.h |    2 +
 4 files changed, 274 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 3de0b37..3f63cdd 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -121,9 +121,6 @@ static int npt = 1;
 
 module_param(npt, int, S_IRUGO);
 
-static int nested = 1;
-module_param(nested, int, S_IRUGO);
-
 static void svm_flush_tlb(struct kvm_vcpu *vcpu);
 static void svm_complete_interrupts(struct vcpu_svm *svm);
 
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 9a0a2cf..2726a6c 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -92,6 +92,16 @@ struct shared_msr_entry {
 	u64 mask;
 };
 
+struct __attribute__ ((__packed__)) level_state {
+};
+
+struct nested_vmx {
+	/* Has the level1 guest done vmxon? */
+	bool vmxon;
+	/* Level 1 state for switching to level 2 and back */
+	struct level_state *l1_state;
+};
+
 struct vcpu_vmx {
 	struct kvm_vcpu       vcpu;
 	struct list_head      local_vcpus_link;
@@ -136,6 +146,9 @@ struct vcpu_vmx {
 	ktime_t entry_time;
 	s64 vnmi_blocked_time;
 	u32 exit_reason;
+
+	/* Nested vmx */
+	struct nested_vmx nested;
 };
 
 static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
@@ -201,6 +214,7 @@ static struct kvm_vmx_segment_field {
 static u64 host_efer;
 
 static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
+static int create_l1_state(struct kvm_vcpu *vcpu);
 
 /*
  * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it
@@ -961,6 +975,95 @@ static void guest_write_tsc(u64 guest_tsc, u64 host_tsc)
 }
 
 /*
+ * Handles msr read for nested virtualization
+ */
+static int nested_vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index,
+			      u64 *pdata)
+{
+	u64 vmx_msr = 0;
+
+	switch (msr_index) {
+	case MSR_IA32_FEATURE_CONTROL:
+		*pdata = 0;
+		break;
+	case MSR_IA32_VMX_BASIC:
+		*pdata = 0;
+		rdmsrl(MSR_IA32_VMX_BASIC, vmx_msr);
+		*pdata = (vmx_msr & 0x00ffffcfffffffff);
+		break;
+	case MSR_IA32_VMX_PINBASED_CTLS:
+		rdmsrl(MSR_IA32_VMX_PINBASED_CTLS, vmx_msr);
+		*pdata = (PIN_BASED_EXT_INTR_MASK & vmcs_config.pin_based_exec_ctrl) |
+			(PIN_BASED_NMI_EXITING & vmcs_config.pin_based_exec_ctrl) |
+			(PIN_BASED_VIRTUAL_NMIS & vmcs_config.pin_based_exec_ctrl);
+		break;
+	case MSR_IA32_VMX_PROCBASED_CTLS:
+	{
+		u32 vmx_msr_high, vmx_msr_low;
+		u32 control = CPU_BASED_HLT_EXITING |
+#ifdef CONFIG_X86_64
+			CPU_BASED_CR8_LOAD_EXITING |
+			CPU_BASED_CR8_STORE_EXITING |
+#endif
+			CPU_BASED_CR3_LOAD_EXITING |
+			CPU_BASED_CR3_STORE_EXITING |
+			CPU_BASED_USE_IO_BITMAPS |
+			CPU_BASED_MOV_DR_EXITING |
+			CPU_BASED_USE_TSC_OFFSETING |
+			CPU_BASED_INVLPG_EXITING |
+			CPU_BASED_TPR_SHADOW |
+			CPU_BASED_USE_MSR_BITMAPS |
+			CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
+
+		rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, vmx_msr_low, vmx_msr_high);
+
+		control &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
+		control |= vmx_msr_low;  /* bit == 1 in low word  ==> must be one  */
+
+		*pdata = (CPU_BASED_HLT_EXITING & control) |
+#ifdef CONFIG_X86_64
+			(CPU_BASED_CR8_LOAD_EXITING & control) |
+			(CPU_BASED_CR8_STORE_EXITING & control) |
+#endif
+			(CPU_BASED_CR3_LOAD_EXITING & control) |
+			(CPU_BASED_CR3_STORE_EXITING & control) |
+			(CPU_BASED_USE_IO_BITMAPS & control) |
+			(CPU_BASED_MOV_DR_EXITING & control) |
+			(CPU_BASED_USE_TSC_OFFSETING & control) |
+			(CPU_BASED_INVLPG_EXITING & control) ;
+
+		if (cpu_has_secondary_exec_ctrls())
+			*pdata |= CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
+
+		if (vm_need_tpr_shadow(vcpu->kvm))
+			*pdata |= CPU_BASED_TPR_SHADOW;
+		break;
+	}
+	case MSR_IA32_VMX_EXIT_CTLS:
+		*pdata = 0;
+#ifdef CONFIG_X86_64
+		*pdata |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
+#endif
+		break;
+	case MSR_IA32_VMX_ENTRY_CTLS:
+		*pdata = 0;
+		break;
+	case MSR_IA32_VMX_PROCBASED_CTLS2:
+		*pdata = 0;
+		if (vm_need_virtualize_apic_accesses(vcpu->kvm))
+			*pdata |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+		break;
+	case MSR_IA32_VMX_EPT_VPID_CAP:
+		*pdata = 0;
+		break;
+	default:
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
  * Reads an msr value (of 'msr_index') into 'pdata'.
  * Returns 0 on success, non-0 otherwise.
  * Assumes vcpu_load() was already called.
@@ -1004,6 +1107,9 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 		break;
 	default:
 		vmx_load_host_state(to_vmx(vcpu));
+		if (nested &&
+		    !nested_vmx_get_msr(vcpu, msr_index, &data))
+			break;
 		msr = find_msr_entry(to_vmx(vcpu), msr_index);
 		if (msr) {
 			vmx_load_host_state(to_vmx(vcpu));
@@ -1018,6 +1124,27 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 }
 
 /*
+ * Writes msr value for nested virtualization
+ * Returns 0 on success, non-0 otherwise.
+ */
+static int nested_vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
+{
+	switch (msr_index) {
+	case MSR_IA32_FEATURE_CONTROL:
+		if ((data & (FEATURE_CONTROL_LOCKED |
+			     FEATURE_CONTROL_VMXON_ENABLED))
+		    != (FEATURE_CONTROL_LOCKED |
+			FEATURE_CONTROL_VMXON_ENABLED))
+			return 1;
+		break;
+	default:
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
  * Writes msr value into into the appropriate "register".
  * Returns 0 on success, non-0 otherwise.
  * Assumes vcpu_load() was already called.
@@ -1067,6 +1194,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 		}
 		/* Otherwise falls through to kvm_set_msr_common */
 	default:
+		if (nested &&
+		    !nested_vmx_set_msr(vcpu, msr_index, data))
+			break;
 		msr = find_msr_entry(vmx, msr_index);
 		if (msr) {
 			vmx_load_host_state(vmx);
@@ -1163,6 +1293,31 @@ static void vmclear_local_vcpus(void)
 		__vcpu_clear(vmx);
 }
 
+static struct level_state *create_state(void)
+{
+	struct level_state *state = NULL;
+
+	state = kzalloc(sizeof(struct level_state), GFP_KERNEL);
+	if (!state) {
+		printk(KERN_INFO "Error create level state\n");
+		return NULL;
+	}
+	return state;
+}
+
+static int create_l1_state(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	if (!vmx->nested.l1_state) {
+		vmx->nested.l1_state = create_state();
+		if (!vmx->nested.l1_state)
+			return -ENOMEM;
+	} else
+		return 0;
+
+	return 0;
+}
 
 /* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot()
  * tricks.
@@ -1333,6 +1488,18 @@ static void free_vmcs(struct vmcs *vmcs)
 	free_pages((unsigned long)vmcs, vmcs_config.order);
 }
 
+static void free_l1_state(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	if (!vmx->nested.l1_state)
+		return;
+
+	kfree(vmx->nested.l1_state);
+	vmx->nested.l1_state = NULL;
+}
+
+
 static void free_kvm_area(void)
 {
 	int cpu;
@@ -3146,12 +3313,105 @@ static int handle_vmcall(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
+/*
+ * Check to see if vcpu can execute vmx command
+ * Inject the corresponding exception
+ */
+static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
+{
+	struct kvm_segment cs;
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
+
+	if (!vmx->nested.vmxon) {
+		pr_debug("%s: vmx not on\n", __func__);
+		kvm_queue_exception(vcpu, UD_VECTOR);
+		return 0;
+	}
+
+	if ((vmx_get_rflags(vcpu) & X86_EFLAGS_VM) ||
+	    (is_long_mode(vcpu) && !cs.l)) {
+		pr_debug("%s: invalid mode cs.l %d is_long mode %d\n",
+			 __func__, cs.l, is_long_mode(vcpu));
+		kvm_queue_exception(vcpu, UD_VECTOR);
+		return 0;
+	}
+
+	if (vmx_get_cpl(vcpu)) {
+		kvm_inject_gp(vcpu, 0);
+		return 0;
+	}
+
+	return 1;
+}
+
 static int handle_vmx_insn(struct kvm_vcpu *vcpu)
 {
 	kvm_queue_exception(vcpu, UD_VECTOR);
 	return 1;
 }
 
+static int handle_vmoff(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	if (!nested_vmx_check_permission(vcpu))
+		return 1;
+
+	vmx->nested.vmxon = 0;
+
+	free_l1_state(vcpu);
+
+	skip_emulated_instruction(vcpu);
+	return 1;
+}
+
+static int handle_vmon(struct kvm_vcpu *vcpu)
+{
+	struct kvm_segment cs;
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	if (!nested) {
+		pr_debug("%s: nested vmx not enabled\n", __func__);
+		kvm_queue_exception(vcpu, UD_VECTOR);
+		return 1;
+	}
+
+	vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
+
+	if (!(vcpu->arch.cr4 & X86_CR4_VMXE) ||
+	    !(vcpu->arch.cr0 & X86_CR0_PE) ||
+	    (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
+		kvm_queue_exception(vcpu, UD_VECTOR);
+		printk(KERN_INFO "%s invalid register state\n", __func__);
+		return 1;
+	}
+
+	if (is_long_mode(vcpu) && !cs.l) {
+		kvm_queue_exception(vcpu, UD_VECTOR);
+		printk(KERN_INFO "%s invalid register state\n", __func__);
+		return 1;
+	}
+
+	if (vmx_get_cpl(vcpu)) {
+		printk(KERN_INFO "%s no permission\n", __func__);
+		kvm_inject_gp(vcpu, 0);
+		return 1;
+	}
+
+	if (create_l1_state(vcpu)) {
+		printk(KERN_ERR "%s create_l1_state failed\n", __func__);
+		kvm_queue_exception(vcpu, UD_VECTOR);
+		return 1;
+	}
+
+	vmx->nested.vmxon = 1;
+
+	skip_emulated_instruction(vcpu);
+	return 1;
+}
+
 static int handle_invlpg(struct kvm_vcpu *vcpu)
 {
 	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
@@ -3442,8 +3702,8 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 	[EXIT_REASON_VMREAD]                  = handle_vmx_insn,
 	[EXIT_REASON_VMRESUME]                = handle_vmx_insn,
 	[EXIT_REASON_VMWRITE]                 = handle_vmx_insn,
-	[EXIT_REASON_VMOFF]                   = handle_vmx_insn,
-	[EXIT_REASON_VMON]                    = handle_vmx_insn,
+	[EXIT_REASON_VMOFF]                   = handle_vmoff,
+	[EXIT_REASON_VMON]                    = handle_vmon,
 	[EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,
 	[EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
 	[EXIT_REASON_WBINVD]                  = handle_wbinvd,
@@ -3823,6 +4083,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
 	if (vmx->vpid != 0)
 		__clear_bit(vmx->vpid, vmx_vpid_bitmap);
 	spin_unlock(&vmx_vpid_lock);
+	free_l1_state(vcpu);
 	vmx_free_vmcs(vcpu);
 	kfree(vmx->guest_msrs);
 	kvm_vcpu_uninit(vcpu);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index dd15d7a..b698952 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -88,6 +88,10 @@ EXPORT_SYMBOL_GPL(kvm_x86_ops);
 int ignore_msrs = 0;
 module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);
 
+int nested = 1;
+EXPORT_SYMBOL_GPL(nested);
+module_param(nested, int, S_IRUGO);
+
 #define KVM_NR_SHARED_MSRS 16
 
 struct kvm_shared_msrs_global {
@@ -505,7 +509,7 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 		return;
 	}
 
-	if (cr4 & X86_CR4_VMXE) {
+	if (cr4 & X86_CR4_VMXE && !nested) {
 		printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
 		kvm_inject_gp(vcpu, 0);
 		return;
@@ -615,7 +619,10 @@ static u32 msrs_to_save[] = {
 #ifdef CONFIG_X86_64
 	MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
 #endif
-	MSR_IA32_TSC, MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
+	MSR_IA32_TSC, MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
+	MSR_IA32_FEATURE_CONTROL,  MSR_IA32_VMX_BASIC, MSR_IA32_VMX_PINBASED_CTLS,
+	MSR_IA32_VMX_PROCBASED_CTLS, MSR_IA32_VMX_EXIT_CTLS, MSR_IA32_VMX_ENTRY_CTLS,
+	MSR_IA32_VMX_PROCBASED_CTLS2, MSR_IA32_VMX_EPT_VPID_CAP, MSR_IA32_FEATURE_CONTROL
 };
 
 static unsigned num_msrs_to_save;
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 5eadea5..57204cb 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -35,4 +35,6 @@ static inline bool kvm_exception_is_soft(unsigned int nr)
 struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
                                              u32 function, u32 index);
 
+extern int nested;
+
 #endif
-- 
1.6.0.4



* [PATCH 2/7] Nested VMX patch 2 implements vmclear
  2009-12-10 18:38 ` [PATCH 1/7] Nested VMX patch 1 implements vmon and vmoff oritw
@ 2009-12-10 18:38   ` oritw
  2009-12-10 18:38     ` [PATCH 3/7] Nested VMX patch 3 implements vmptrld and vmptrst oritw
                       ` (2 more replies)
  2009-12-16 13:34   ` [PATCH 1/7] Nested VMX patch 1 implements vmon and vmoff Avi Kivity
  2009-12-20 14:20   ` Gleb Natapov
  2 siblings, 3 replies; 24+ messages in thread
From: oritw @ 2009-12-10 18:38 UTC (permalink / raw)
  To: avi; +Cc: kvm, oritw, benami, abelg, muli, aliguori, mdday

From: Orit Wasserman <oritw@il.ibm.com>

---
 arch/x86/kvm/vmx.c |  235 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 arch/x86/kvm/x86.c |    5 +-
 arch/x86/kvm/x86.h |    3 +
 3 files changed, 240 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 2726a6c..a7ffd5e 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -93,13 +93,39 @@ struct shared_msr_entry {
 };
 
 struct __attribute__ ((__packed__)) level_state {
+	/* Has the level1 guest done vmclear? */
+	bool vmclear;
+};
+
+/*
+ * This structure is mapped to guest memory.
+ * It is packed in order to preserve the binary content
+ * after live migration.
+ * If there are changes in the content or layout, the revision_id must be updated.
+ */
+struct __attribute__ ((__packed__)) nested_vmcs_page {
+	u32 revision_id;
+	u32 abort;
+	struct level_state l2_state;
+};
+
+struct nested_vmcs_list {
+	struct list_head list;
+	gpa_t vmcs_addr;
+	struct vmcs *l2_vmcs;
 };
 
 struct nested_vmx {
 	/* Has the level1 guest done vmxon? */
 	bool vmxon;
+	/* What is the location of the current vmcs l1 keeps for l2 */
+	gpa_t current_vmptr;
 	/* Level 1 state for switching to level 2 and back */
 	struct level_state *l1_state;
+	/* list of vmcs for each l2 guest created by l1 */
+	struct list_head l2_vmcs_list;
+	/* l2 page corresponding to the current vmcs set by l1 */
+	struct nested_vmcs_page *current_l2_page;
 };
 
 struct vcpu_vmx {
@@ -156,6 +182,76 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
 	return container_of(vcpu, struct vcpu_vmx, vcpu);
 }
 
+static struct page *nested_get_page(struct kvm_vcpu *vcpu,
+				    u64 vmcs_addr)
+{
+	struct page *vmcs_page = NULL;
+
+	down_read(&current->mm->mmap_sem);
+	vmcs_page = gfn_to_page(vcpu->kvm, vmcs_addr >> PAGE_SHIFT);
+	up_read(&current->mm->mmap_sem);
+
+	if (is_error_page(vmcs_page)) {
+		printk(KERN_ERR "%s error allocating page 0x%llx\n",
+		       __func__, vmcs_addr);
+		kvm_release_page_clean(vmcs_page);
+		return NULL;
+	}
+
+	return vmcs_page;
+
+}
+
+static int nested_map_current(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct page *vmcs_page =
+		nested_get_page(vcpu, vmx->nested.current_vmptr);
+	struct nested_vmcs_page *mapped_page;
+
+	if (vmcs_page == NULL) {
+		printk(KERN_INFO "%s: failure in nested_get_page\n", __func__);
+		return 0;
+	}
+
+	if (vmx->nested.current_l2_page) {
+		printk(KERN_INFO "%s: shadow vmcs already mapped\n", __func__);
+		WARN_ON(1);
+		return 0;
+	}
+
+	mapped_page = kmap_atomic(vmcs_page, KM_USER0);
+
+	if (!mapped_page) {
+		printk(KERN_INFO "%s: error in kmap_atomic\n", __func__);
+		return 0;
+	}
+
+	vmx->nested.current_l2_page = mapped_page;
+
+	return 1;
+}
+
+static void nested_unmap_current(struct kvm_vcpu *vcpu)
+{
+	struct page *page;
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	if (!vmx->nested.current_l2_page) {
+		printk(KERN_INFO "Shadow vmcs already unmapped\n");
+		WARN_ON(1);
+		return;
+	}
+
+	page = kmap_atomic_to_page(vmx->nested.current_l2_page);
+
+	kunmap_atomic(vmx->nested.current_l2_page, KM_USER0);
+
+	kvm_release_page_dirty(page);
+
+	vmx->nested.current_l2_page = NULL;
+}
+
 static int init_rmode(struct kvm *kvm);
 static u64 construct_eptp(unsigned long root_hpa);
 
@@ -1144,6 +1240,35 @@ static int nested_vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 	return 0;
 }
 
+static int read_guest_vmcs_gpa(struct kvm_vcpu *vcpu, gva_t gva, u64 *gentry)
+{
+	int r = 0;
+	uint size;
+
+	*gentry = 0;
+
+	if (is_long_mode(vcpu))
+		size = sizeof(u64);
+	else
+		size = sizeof(u32);
+
+	r = kvm_read_guest_virt(gva, gentry,
+				size, vcpu);
+	if (r) {
+		printk(KERN_ERR "%s cannot read guest vmcs addr %lx : %d\n",
+		       __func__, vcpu->arch.regs[VCPU_REGS_RAX], r);
+		return r;
+	}
+
+	if (!IS_ALIGNED(*gentry, PAGE_SIZE)) {
+		printk(KERN_DEBUG "%s addr %llx not aligned\n",
+		       __func__, *gentry);
+		return 1;
+	}
+
+	return 0;
+}
+
 /*
  * Writes msr value into into the appropriate "register".
  * Returns 0 on success, non-0 otherwise.
@@ -1316,6 +1441,7 @@ static int create_l1_state(struct kvm_vcpu *vcpu)
 	} else
 		return 0;
 
+	INIT_LIST_HEAD(&(vmx->nested.l2_vmcs_list));
 	return 0;
 }
 
@@ -1488,15 +1614,35 @@ static void free_vmcs(struct vmcs *vmcs)
 	free_pages((unsigned long)vmcs, vmcs_config.order);
 }
 
+static void nested_free_current_vmcs(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct nested_vmcs_list *list_item, *n;
+
+	list_for_each_entry_safe(list_item, n, &vmx->nested.l2_vmcs_list, list)
+		if (list_item->vmcs_addr == vmx->nested.current_vmptr) {
+			free_vmcs(list_item->l2_vmcs);
+			list_del(&(list_item->list));
+			return;
+		}
+}
+
 static void free_l1_state(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct nested_vmcs_list *list_item, *n;
 
 	if (!vmx->nested.l1_state)
 		return;
 
 	kfree(vmx->nested.l1_state);
 	vmx->nested.l1_state = NULL;
+
+	list_for_each_entry_safe(list_item, n, &vmx->nested.l2_vmcs_list,
+				 list) {
+		free_vmcs(list_item->l2_vmcs);
+		list_del(&(list_item->list));
+	}
 }
 
 
@@ -3352,6 +3498,93 @@ static int handle_vmx_insn(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
+static void clear_rflags_cf_zf(struct kvm_vcpu *vcpu)
+{
+	unsigned long rflags;
+	rflags = vmx_get_rflags(vcpu);
+	rflags &= ~(X86_EFLAGS_CF | X86_EFLAGS_ZF);
+	vmx_set_rflags(vcpu, rflags);
+}
+
+/*
+ * Decode the memory address (operand) of a vmx instruction according to Table 23-12/23-11
+ * For additional information regarding offset calculation see 3.7.5
+ */
+static gva_t get_vmx_mem_address(struct kvm_vcpu *vcpu,
+				 unsigned long exit_qualification,
+				 u32 vmx_instruction_info)
+{
+	int  scaling        = vmx_instruction_info & 3;             /* bits 0:1 scaling */
+	int  addr_size      = (vmx_instruction_info >> 7) & 7;      /* bits 7:9 address size, 0=16bit, 1=32bit, 2=64bit */
+	bool is_reg         = vmx_instruction_info & (1u << 10);    /* bit  10  1=register operand, 0= memory */
+	int  seg_reg        = (vmx_instruction_info >> 15) & 7;     /* bits 15:17 segment register */
+	int  index_reg      = (vmx_instruction_info >> 18) & 0xf;   /* bits 18:21 index register */
+	bool index_is_valid = !(vmx_instruction_info & (1u << 22)); /* bit  22 index register validity, 0=valid, 1=invalid */
+	int  base_reg       = (vmx_instruction_info >> 23) & 0xf;   /* bits 23:26 base register */
+	bool base_is_valid  = !(vmx_instruction_info & (1u << 27)); /* bit  27 base register validity, 0=valid, 1=invalid */
+	gva_t addr;
+
+	if (is_reg)
+		return 0;
+
+	switch (addr_size) {
+	case 1:
+		exit_qualification &= 0xffffffff; /* 32 high bits are undefined according to the spec, page 23-7 */
+		break;
+	case 2:
+		break;
+	default:
+		return 0;
+	}
+
+	/* Addr = segment_base + offset */
+	/* offset = Base + [Index * Scale] + Displacement, see Figure 3-11 */
+	addr = vmx_get_segment_base(vcpu, seg_reg);
+	if (base_is_valid)
+		addr += kvm_register_read(vcpu, base_reg);
+	if (index_is_valid)
+		addr += kvm_register_read(vcpu, index_reg)*scaling;
+	addr += exit_qualification; /* exit qualification holds the displacement, spec page 23-7 */
+
+	return addr;
+}
+
+static int handle_vmclear(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct level_state *l2_state;
+	gpa_t guest_vmcs_addr;
+	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+	u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+	gva_t vmcs_gva;
+
+	if (!nested_vmx_check_permission(vcpu))
+		return 1;
+
+	vmcs_gva = get_vmx_mem_address(vcpu, exit_qualification,
+				       vmx_instruction_info);
+
+	if (read_guest_vmcs_gpa(vcpu, vmcs_gva, &guest_vmcs_addr))
+		return 1;
+
+	vmx->nested.current_vmptr = guest_vmcs_addr;
+	if (!nested_map_current(vcpu))
+		return 1;
+
+	l2_state = &(to_vmx(vcpu)->nested.current_l2_page->l2_state);
+	l2_state->vmclear = 1;
+	nested_free_current_vmcs(vcpu);
+
+	vmx->nested.current_vmptr = -1ull;
+
+	nested_unmap_current(vcpu);
+
+	skip_emulated_instruction(vcpu);
+	clear_rflags_cf_zf(vcpu);
+
+	return 1;
+}
+
 static int handle_vmoff(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -3695,7 +3928,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 	[EXIT_REASON_HLT]                     = handle_halt,
 	[EXIT_REASON_INVLPG]		      = handle_invlpg,
 	[EXIT_REASON_VMCALL]                  = handle_vmcall,
-	[EXIT_REASON_VMCLEAR]	              = handle_vmx_insn,
+	[EXIT_REASON_VMCLEAR]	              = handle_vmclear,
 	[EXIT_REASON_VMLAUNCH]                = handle_vmx_insn,
 	[EXIT_REASON_VMPTRLD]                 = handle_vmx_insn,
 	[EXIT_REASON_VMPTRST]                 = handle_vmx_insn,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b698952..e5acf22 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2773,8 +2773,8 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
 	return kvm_io_bus_read(&vcpu->kvm->mmio_bus, addr, len, v);
 }
 
-static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
-			       struct kvm_vcpu *vcpu)
+int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
+			struct kvm_vcpu *vcpu)
 {
 	void *data = val;
 	int r = X86EMUL_CONTINUE;
@@ -2802,6 +2802,7 @@ static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
 out:
 	return r;
 }
+EXPORT_SYMBOL_GPL(kvm_read_guest_virt);
 
 static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes,
 				struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 57204cb..2d7b2dc 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -35,6 +35,9 @@ static inline bool kvm_exception_is_soft(unsigned int nr)
 struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
                                              u32 function, u32 index);
 
+int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
+			struct kvm_vcpu *vcpu);
+
 extern int nested;
 
 #endif
-- 
1.6.0.4



* [PATCH 3/7] Nested VMX patch 3 implements vmptrld and vmptrst
  2009-12-10 18:38   ` [PATCH 2/7] Nested VMX patch 2 implements vmclear oritw
@ 2009-12-10 18:38     ` oritw
  2009-12-10 18:38       ` [PATCH 4/7] Nested VMX patch 4 implements vmread and vmwrite oritw
  2009-12-16 14:32       ` [PATCH 3/7] Nested VMX patch 3 implements vmptrld and vmptrst Avi Kivity
  2009-12-16 13:59     ` [PATCH 2/7] Nested VMX patch 2 implements vmclear Avi Kivity
  2009-12-28 14:57     ` Gleb Natapov
  2 siblings, 2 replies; 24+ messages in thread
From: oritw @ 2009-12-10 18:38 UTC (permalink / raw)
  To: avi; +Cc: kvm, oritw, benami, abelg, muli, aliguori, mdday

From: Orit Wasserman <oritw@il.ibm.com>

---
 arch/x86/kvm/vmx.c |  292 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 arch/x86/kvm/x86.c |    6 +-
 arch/x86/kvm/x86.h |    3 +
 3 files changed, 289 insertions(+), 12 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index a7ffd5e..46a4f3a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -92,9 +92,142 @@ struct shared_msr_entry {
 	u64 mask;
 };
 
+struct __attribute__ ((__packed__)) shadow_vmcs {
+	u16 virtual_processor_id;
+	u16 guest_es_selector;
+	u16 guest_cs_selector;
+	u16 guest_ss_selector;
+	u16 guest_ds_selector;
+	u16 guest_fs_selector;
+	u16 guest_gs_selector;
+	u16 guest_ldtr_selector;
+	u16 guest_tr_selector;
+	u16 host_es_selector;
+	u16 host_cs_selector;
+	u16 host_ss_selector;
+	u16 host_ds_selector;
+	u16 host_fs_selector;
+	u16 host_gs_selector;
+	u16 host_tr_selector;
+	u64 io_bitmap_a;
+	u64 io_bitmap_b;
+	u64 msr_bitmap;
+	u64 vm_exit_msr_store_addr;
+	u64 vm_exit_msr_load_addr;
+	u64 vm_entry_msr_load_addr;
+	u64 tsc_offset;
+	u64 virtual_apic_page_addr;
+	u64 apic_access_addr;
+	u64 ept_pointer;
+	u64 guest_physical_address;
+	u64 vmcs_link_pointer;
+	u64 guest_ia32_debugctl;
+	u64 guest_ia32_pat;
+	u64 guest_pdptr0;
+	u64 guest_pdptr1;
+	u64 guest_pdptr2;
+	u64 guest_pdptr3;
+	u64 host_ia32_pat;
+	u32 pin_based_vm_exec_control;
+	u32 cpu_based_vm_exec_control;
+	u32 exception_bitmap;
+	u32 page_fault_error_code_mask;
+	u32 page_fault_error_code_match;
+	u32 cr3_target_count;
+	u32 vm_exit_controls;
+	u32 vm_exit_msr_store_count;
+	u32 vm_exit_msr_load_count;
+	u32 vm_entry_controls;
+	u32 vm_entry_msr_load_count;
+	u32 vm_entry_intr_info_field;
+	u32 vm_entry_exception_error_code;
+	u32 vm_entry_instruction_len;
+	u32 tpr_threshold;
+	u32 secondary_vm_exec_control;
+	u32 vm_instruction_error;
+	u32 vm_exit_reason;
+	u32 vm_exit_intr_info;
+	u32 vm_exit_intr_error_code;
+	u32 idt_vectoring_info_field;
+	u32 idt_vectoring_error_code;
+	u32 vm_exit_instruction_len;
+	u32 vmx_instruction_info;
+	u32 guest_es_limit;
+	u32 guest_cs_limit;
+	u32 guest_ss_limit;
+	u32 guest_ds_limit;
+	u32 guest_fs_limit;
+	u32 guest_gs_limit;
+	u32 guest_ldtr_limit;
+	u32 guest_tr_limit;
+	u32 guest_gdtr_limit;
+	u32 guest_idtr_limit;
+	u32 guest_es_ar_bytes;
+	u32 guest_cs_ar_bytes;
+	u32 guest_ss_ar_bytes;
+	u32 guest_ds_ar_bytes;
+	u32 guest_fs_ar_bytes;
+	u32 guest_gs_ar_bytes;
+	u32 guest_ldtr_ar_bytes;
+	u32 guest_tr_ar_bytes;
+	u32 guest_interruptibility_info;
+	u32 guest_activity_state;
+	u32 guest_sysenter_cs;
+	u32 host_ia32_sysenter_cs;
+	unsigned long cr0_guest_host_mask;
+	unsigned long cr4_guest_host_mask;
+	unsigned long cr0_read_shadow;
+	unsigned long cr4_read_shadow;
+	unsigned long cr3_target_value0;
+	unsigned long cr3_target_value1;
+	unsigned long cr3_target_value2;
+	unsigned long cr3_target_value3;
+	unsigned long exit_qualification;
+	unsigned long guest_linear_address;
+	unsigned long guest_cr0;
+	unsigned long guest_cr3;
+	unsigned long guest_cr4;
+	unsigned long guest_es_base;
+	unsigned long guest_cs_base;
+	unsigned long guest_ss_base;
+	unsigned long guest_ds_base;
+	unsigned long guest_fs_base;
+	unsigned long guest_gs_base;
+	unsigned long guest_ldtr_base;
+	unsigned long guest_tr_base;
+	unsigned long guest_gdtr_base;
+	unsigned long guest_idtr_base;
+	unsigned long guest_dr7;
+	unsigned long guest_rsp;
+	unsigned long guest_rip;
+	unsigned long guest_rflags;
+	unsigned long guest_pending_dbg_exceptions;
+	unsigned long guest_sysenter_esp;
+	unsigned long guest_sysenter_eip;
+	unsigned long host_cr0;
+	unsigned long host_cr3;
+	unsigned long host_cr4;
+	unsigned long host_fs_base;
+	unsigned long host_gs_base;
+	unsigned long host_tr_base;
+	unsigned long host_gdtr_base;
+	unsigned long host_idtr_base;
+	unsigned long host_ia32_sysenter_esp;
+	unsigned long host_ia32_sysenter_eip;
+	unsigned long host_rsp;
+	unsigned long host_rip;
+};
+
+
 struct __attribute__ ((__packed__)) level_state {
 	/* Has the level1 guest done vmclear? */
 	bool vmclear;
+
+	u64 io_bitmap_a;
+	u64 io_bitmap_b;
+	u64 msr_bitmap;
+
+	bool first_launch;
 };
 
 /*
@@ -122,6 +255,8 @@ struct nested_vmx {
 	gpa_t current_vmptr;
 	/* Level 1 state for switching to level 2 and back */
 	struct level_state *l1_state;
+	/* Level 1 shadow vmcs for switching to level 2 and back */
+	struct shadow_vmcs *l1_shadow_vmcs;
 	/* list of vmcs for each l2 guest created by l1 */
 	struct list_head l2_vmcs_list;
 	/* l2 page corresponding to the current vmcs set by l1 */
@@ -187,10 +322,7 @@ static struct page *nested_get_page(struct kvm_vcpu *vcpu,
 {
 	struct page *vmcs_page = NULL;
 
-	down_read(&current->mm->mmap_sem);
 	vmcs_page = gfn_to_page(vcpu->kvm, vmcs_addr >> PAGE_SHIFT);
-	up_read(&current->mm->mmap_sem);
-
 	if (is_error_page(vmcs_page)) {
 		printk(KERN_ERR "%s error allocating page 0x%llx\n",
 		       __func__, vmcs_addr);
@@ -832,13 +964,14 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 
 	if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
 		u8 error;
-
 		per_cpu(current_vmcs, cpu) = vmx->vmcs;
+
 		asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
 			      : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
 			      : "cc");
+
 		if (error)
-			printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
+			printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n",
 			       vmx->vmcs, phys_addr);
 	}
 
@@ -1240,6 +1373,7 @@ static int nested_vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 	return 0;
 }
 
+
 static int read_guest_vmcs_gpa(struct kvm_vcpu *vcpu, gva_t gva, u64 *gentry)
 {
 	int r = 0;
@@ -1430,6 +1564,18 @@ static struct level_state *create_state(void)
 	return state;
 }
 
+static struct vmcs *nested_get_current_vmcs(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct nested_vmcs_list *list_item, *n;
+
+	list_for_each_entry_safe(list_item, n, &vmx->nested.l2_vmcs_list, list)
+		if (list_item->vmcs_addr == vmx->nested.current_vmptr)
+			return list_item->l2_vmcs;
+
+	return NULL;
+}
+
 static int create_l1_state(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -1441,10 +1587,75 @@ static int create_l1_state(struct kvm_vcpu *vcpu)
 	} else
 		return 0;
 
+	vmx->nested.l1_shadow_vmcs = kzalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!vmx->nested.l1_shadow_vmcs) {
+		printk(KERN_INFO "%s could not allocate memory for l1_shadow vmcs\n",
+		       __func__);
+		kfree(vmx->nested.l1_state);
+		return -ENOMEM;
+	}
+
 	INIT_LIST_HEAD(&(vmx->nested.l2_vmcs_list));
 	return 0;
 }
 
+static struct vmcs *alloc_vmcs(void);
+int create_l2_state(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct vmcs *l2_vmcs;
+
+	if (!nested_map_current(vcpu)) {
+		printk(KERN_ERR "%s error mapping  level 2 page", __func__);
+		return -ENOMEM;
+	}
+
+	l2_vmcs = nested_get_current_vmcs(vcpu);
+	if (!l2_vmcs) {
+		struct nested_vmcs_list *new_l2_guest =
+			(struct nested_vmcs_list *)
+			kmalloc(sizeof(struct nested_vmcs_list), GFP_KERNEL);
+
+		if (!new_l2_guest) {
+			printk(KERN_ERR "%s error could not allocate memory for a new l2 guest list item",
+			       __func__);
+			nested_unmap_current(vcpu);
+			return -ENOMEM;
+		}
+
+		l2_vmcs = alloc_vmcs();
+
+		if (!l2_vmcs) {
+			printk(KERN_ERR "%s error could not allocate memory for l2_vmcs",
+			       __func__);
+			kfree(new_l2_guest);
+			nested_unmap_current(vcpu);
+			return -ENOMEM;
+		}
+
+		new_l2_guest->vmcs_addr = vmx->nested.current_vmptr;
+		new_l2_guest->l2_vmcs   = l2_vmcs;
+		list_add(&(new_l2_guest->list), &(vmx->nested.l2_vmcs_list));
+	}
+
+	if (cpu_has_vmx_msr_bitmap())
+		vmx->nested.current_l2_page->l2_state.msr_bitmap =
+			vmcs_read64(MSR_BITMAP);
+	else
+		vmx->nested.current_l2_page->l2_state.msr_bitmap = 0;
+
+	vmx->nested.current_l2_page->l2_state.io_bitmap_a =
+		vmcs_read64(IO_BITMAP_A);
+	vmx->nested.current_l2_page->l2_state.io_bitmap_b =
+		vmcs_read64(IO_BITMAP_B);
+
+	vmx->nested.current_l2_page->l2_state.first_launch = true;
+
+	nested_unmap_current(vcpu);
+
+	return 0;
+}
+
 /* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot()
  * tricks.
  */
@@ -1623,6 +1834,7 @@ static void nested_free_current_vmcs(struct kvm_vcpu *vcpu)
 		if (list_item->vmcs_addr == vmx->nested.current_vmptr) {
 			free_vmcs(list_item->l2_vmcs);
 			list_del(&(list_item->list));
+			kfree(list_item);
 			return;
 		}
 }
@@ -1637,11 +1849,14 @@ static void free_l1_state(struct kvm_vcpu *vcpu)
 
 	kfree(vmx->nested.l1_state);
 	vmx->nested.l1_state = NULL;
+	kfree(vmx->nested.l1_shadow_vmcs);
+	vmx->nested.l1_shadow_vmcs = NULL;
 
 	list_for_each_entry_safe(list_item, n, &vmx->nested.l2_vmcs_list,
 				 list) {
 		free_vmcs(list_item->l2_vmcs);
 		list_del(&(list_item->list));
+		kfree(list_item);
 	}
 }
 
@@ -3604,6 +3819,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
 {
 	struct kvm_segment cs;
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	int r = 0;
 
 	if (!nested) {
 		pr_debug("%s: nested vmx not enabled\n", __func__);
@@ -3633,8 +3849,9 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
 		return 1;
 	}
 
-	if (create_l1_state(vcpu)) {
-		printk(KERN_ERR "%s create_l1_state failed\n", __func__);
+	r = create_l1_state(vcpu);
+	if (r) {
+		printk(KERN_ERR "%s create_l1_state failed: %d\n", __func__, r);
 		kvm_queue_exception(vcpu, UD_VECTOR);
 		return 1;
 	}
@@ -3645,6 +3862,63 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
+static int handle_vmptrld(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	u64 guest_vmcs_addr;
+	gva_t vmcs_gva;
+	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+	u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+	int r = 0;
+
+	if (!nested_vmx_check_permission(vcpu))
+		return 1;
+
+	vmcs_gva = get_vmx_mem_address(vcpu, exit_qualification,
+				       vmx_instruction_info);
+
+	if (read_guest_vmcs_gpa(vcpu, vmcs_gva, &guest_vmcs_addr))
+		return 1;
+
+	if (vmx->nested.current_vmptr != guest_vmcs_addr) {
+		vmx->nested.current_vmptr = guest_vmcs_addr;
+		r = create_l2_state(vcpu);
+		if (r) {
+			printk(KERN_ERR "%s create_l2_state failed: %d\n",
+			       __func__, r);
+			return 1;
+		}
+	}
+
+	clear_rflags_cf_zf(vcpu);
+	skip_emulated_instruction(vcpu);
+	return 1;
+}
+
+static int handle_vmptrst(struct kvm_vcpu *vcpu)
+{
+	int r = 0;
+	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+	u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+	gva_t vmcs_gva;
+
+	if (!nested_vmx_check_permission(vcpu))
+		return 1;
+	vmcs_gva = get_vmx_mem_address(vcpu, exit_qualification,
+				       vmx_instruction_info);
+
+	r = kvm_write_guest_virt(vmcs_gva,
+				 (void *)&to_vmx(vcpu)->nested.current_vmptr,
+				 sizeof(u64), vcpu);
+	if (r) {
+		printk(KERN_INFO "%s failed to write vmptr\n", __func__);
+		return 1;
+	}
+	clear_rflags_cf_zf(vcpu);
+	skip_emulated_instruction(vcpu);
+	return 1;
+}
+
 static int handle_invlpg(struct kvm_vcpu *vcpu)
 {
 	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
@@ -3930,8 +4204,8 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 	[EXIT_REASON_VMCALL]                  = handle_vmcall,
 	[EXIT_REASON_VMCLEAR]	              = handle_vmclear,
 	[EXIT_REASON_VMLAUNCH]                = handle_vmx_insn,
-	[EXIT_REASON_VMPTRLD]                 = handle_vmx_insn,
-	[EXIT_REASON_VMPTRST]                 = handle_vmx_insn,
+	[EXIT_REASON_VMPTRLD]                 = handle_vmptrld,
+	[EXIT_REASON_VMPTRST]                 = handle_vmptrst,
 	[EXIT_REASON_VMREAD]                  = handle_vmx_insn,
 	[EXIT_REASON_VMRESUME]                = handle_vmx_insn,
 	[EXIT_REASON_VMWRITE]                 = handle_vmx_insn,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e5acf22..e990405 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2804,8 +2804,8 @@ out:
 }
 EXPORT_SYMBOL_GPL(kvm_read_guest_virt);
 
-static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes,
-				struct kvm_vcpu *vcpu)
+int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes,
+			 struct kvm_vcpu *vcpu)
 {
 	void *data = val;
 	int r = X86EMUL_CONTINUE;
@@ -2833,7 +2833,7 @@ static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes,
 out:
 	return r;
 }
-
+EXPORT_SYMBOL_GPL(kvm_write_guest_virt);
 
 static int emulator_read_emulated(unsigned long addr,
 				  void *val,
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 2d7b2dc..b49b55a 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -38,6 +38,9 @@ struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
 int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
 			struct kvm_vcpu *vcpu);
 
+int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes,
+			 struct kvm_vcpu *vcpu);
+
 extern int nested;
 
 #endif
-- 
1.6.0.4



* [PATCH 4/7] Nested VMX patch 4 implements vmread and vmwrite
  2009-12-10 18:38     ` [PATCH 3/7] Nested VMX patch 3 implements vmptrld and vmptrst oritw
@ 2009-12-10 18:38       ` oritw
  2009-12-10 18:38         ` [PATCH 5/7] Nested VMX patch 5 Simplify fpu handling oritw
  2009-12-16 14:44         ` [PATCH 4/7] Nested VMX patch 4 implements vmread and vmwrite Avi Kivity
  2009-12-16 14:32       ` [PATCH 3/7] Nested VMX patch 3 implements vmptrld and vmptrst Avi Kivity
  1 sibling, 2 replies; 24+ messages in thread
From: oritw @ 2009-12-10 18:38 UTC (permalink / raw)
  To: avi; +Cc: kvm, oritw, benami, abelg, muli, aliguori, mdday

From: Orit Wasserman <oritw@il.ibm.com>

---
 arch/x86/kvm/vmx.c |  670 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 files changed, 660 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 46a4f3a..8745d44 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -239,6 +239,7 @@ struct __attribute__ ((__packed__)) level_state {
 struct __attribute__ ((__packed__)) nested_vmcs_page {
 	u32 revision_id;
 	u32 abort;
+	struct shadow_vmcs shadow_vmcs;
 	struct level_state l2_state;
 };
 
@@ -263,6 +264,55 @@ struct nested_vmx {
 	struct nested_vmcs_page *current_l2_page;
 };
 
+enum vmcs_field_type {
+	VMCS_FIELD_TYPE_U16 = 0,
+	VMCS_FIELD_TYPE_U64 = 1,
+	VMCS_FIELD_TYPE_U32 = 2,
+	VMCS_FIELD_TYPE_ULONG = 3
+};
+
+#define VMCS_FIELD_LENGTH_OFFSET 13
+#define VMCS_FIELD_LENGTH_MASK 0x6000
+
+/*
+  Returns VMCS Field type
+*/
+static inline int vmcs_field_type(unsigned long field)
+{
+	/* For 32 bit L1 when it using the HIGH field */
+	if (0x1 & field)
+		return VMCS_FIELD_TYPE_U32;
+
+	return (VMCS_FIELD_LENGTH_MASK & field) >> 13;
+}
+
+/*
+  Returns VMCS field size in bits
+*/
+static inline int vmcs_field_size(int field_type, struct kvm_vcpu *vcpu)
+{
+	switch (field_type) {
+	case VMCS_FIELD_TYPE_U16:
+		return 2;
+	case VMCS_FIELD_TYPE_U32:
+		return 4;
+	case VMCS_FIELD_TYPE_U64:
+		return 8;
+	case VMCS_FIELD_TYPE_ULONG:
+#ifdef CONFIG_X86_64
+		if (is_long_mode(vcpu))
+			return 8;
+		else
+			return 4;
+#else
+		return 4;
+#endif
+	}
+
+	printk(KERN_INFO "WARNING: invalid field type %d \n", field_type);
+	return 0;
+}
+
 struct vcpu_vmx {
 	struct kvm_vcpu       vcpu;
 	struct list_head      local_vcpus_link;
@@ -317,6 +367,411 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
 	return container_of(vcpu, struct vcpu_vmx, vcpu);
 }
 
+static inline struct shadow_vmcs *get_shadow_vmcs(struct kvm_vcpu *vcpu)
+{
+	WARN_ON(!to_vmx(vcpu)->nested.current_l2_page);
+	return &(to_vmx(vcpu)->nested.current_l2_page->shadow_vmcs);
+}
+
+#define SHADOW_VMCS_OFFSET(x) offsetof(struct shadow_vmcs, x)
+
+static unsigned short vmcs_field_to_offset_table[HOST_RIP+1] = {
+
+	[VIRTUAL_PROCESSOR_ID] =
+		SHADOW_VMCS_OFFSET(virtual_processor_id),
+	[GUEST_ES_SELECTOR] =
+		SHADOW_VMCS_OFFSET(guest_es_selector),
+	[GUEST_CS_SELECTOR] =
+		SHADOW_VMCS_OFFSET(guest_cs_selector),
+	[GUEST_SS_SELECTOR] =
+		SHADOW_VMCS_OFFSET(guest_ss_selector),
+	[GUEST_DS_SELECTOR] =
+		SHADOW_VMCS_OFFSET(guest_ds_selector),
+	[GUEST_FS_SELECTOR] =
+		SHADOW_VMCS_OFFSET(guest_fs_selector),
+	[GUEST_GS_SELECTOR] =
+		SHADOW_VMCS_OFFSET(guest_gs_selector),
+	[GUEST_LDTR_SELECTOR] =
+		SHADOW_VMCS_OFFSET(guest_ldtr_selector),
+	[GUEST_TR_SELECTOR] =
+		SHADOW_VMCS_OFFSET(guest_tr_selector),
+	[HOST_ES_SELECTOR] =
+		SHADOW_VMCS_OFFSET(host_es_selector),
+	[HOST_CS_SELECTOR] =
+		SHADOW_VMCS_OFFSET(host_cs_selector),
+	[HOST_SS_SELECTOR] =
+		SHADOW_VMCS_OFFSET(host_ss_selector),
+	[HOST_DS_SELECTOR] =
+		SHADOW_VMCS_OFFSET(host_ds_selector),
+	[HOST_FS_SELECTOR] =
+		SHADOW_VMCS_OFFSET(host_fs_selector),
+	[HOST_GS_SELECTOR] =
+		SHADOW_VMCS_OFFSET(host_gs_selector),
+	[HOST_TR_SELECTOR] =
+		SHADOW_VMCS_OFFSET(host_tr_selector),
+	[IO_BITMAP_A] =
+		SHADOW_VMCS_OFFSET(io_bitmap_a),
+	[IO_BITMAP_A_HIGH] =
+		SHADOW_VMCS_OFFSET(io_bitmap_a)+4,
+	[IO_BITMAP_B] =
+		SHADOW_VMCS_OFFSET(io_bitmap_b),
+	[IO_BITMAP_B_HIGH] =
+		SHADOW_VMCS_OFFSET(io_bitmap_b)+4,
+	[MSR_BITMAP] =
+		SHADOW_VMCS_OFFSET(msr_bitmap),
+	[MSR_BITMAP_HIGH] =
+		SHADOW_VMCS_OFFSET(msr_bitmap)+4,
+	[VM_EXIT_MSR_STORE_ADDR] =
+		SHADOW_VMCS_OFFSET(vm_exit_msr_store_addr),
+	[VM_EXIT_MSR_STORE_ADDR_HIGH] =
+		SHADOW_VMCS_OFFSET(vm_exit_msr_store_addr)+4,
+	[VM_EXIT_MSR_LOAD_ADDR] =
+		SHADOW_VMCS_OFFSET(vm_exit_msr_load_addr),
+	[VM_EXIT_MSR_LOAD_ADDR_HIGH] =
+		SHADOW_VMCS_OFFSET(vm_exit_msr_load_addr)+4,
+	[VM_ENTRY_MSR_LOAD_ADDR] =
+		SHADOW_VMCS_OFFSET(vm_entry_msr_load_addr),
+	[VM_ENTRY_MSR_LOAD_ADDR_HIGH] =
+		SHADOW_VMCS_OFFSET(vm_entry_msr_load_addr)+4,
+	[TSC_OFFSET] =
+		SHADOW_VMCS_OFFSET(tsc_offset),
+	[TSC_OFFSET_HIGH] =
+		SHADOW_VMCS_OFFSET(tsc_offset)+4,
+	[VIRTUAL_APIC_PAGE_ADDR] =
+		SHADOW_VMCS_OFFSET(virtual_apic_page_addr),
+	[VIRTUAL_APIC_PAGE_ADDR_HIGH] =
+		SHADOW_VMCS_OFFSET(virtual_apic_page_addr)+4,
+	[APIC_ACCESS_ADDR] =
+		SHADOW_VMCS_OFFSET(apic_access_addr),
+	[APIC_ACCESS_ADDR_HIGH] =
+		SHADOW_VMCS_OFFSET(apic_access_addr)+4,
+	[EPT_POINTER] =
+		SHADOW_VMCS_OFFSET(ept_pointer),
+	[EPT_POINTER_HIGH] =
+		SHADOW_VMCS_OFFSET(ept_pointer)+4,
+	[GUEST_PHYSICAL_ADDRESS] =
+		SHADOW_VMCS_OFFSET(guest_physical_address),
+	[GUEST_PHYSICAL_ADDRESS_HIGH] =
+		SHADOW_VMCS_OFFSET(guest_physical_address)+4,
+	[VMCS_LINK_POINTER] =
+		SHADOW_VMCS_OFFSET(vmcs_link_pointer),
+	[VMCS_LINK_POINTER_HIGH] =
+		SHADOW_VMCS_OFFSET(vmcs_link_pointer)+4,
+	[GUEST_IA32_DEBUGCTL] =
+		SHADOW_VMCS_OFFSET(guest_ia32_debugctl),
+	[GUEST_IA32_DEBUGCTL_HIGH] =
+		SHADOW_VMCS_OFFSET(guest_ia32_debugctl)+4,
+	[GUEST_IA32_PAT] =
+		SHADOW_VMCS_OFFSET(guest_ia32_pat),
+	[GUEST_IA32_PAT_HIGH] =
+		SHADOW_VMCS_OFFSET(guest_ia32_pat)+4,
+	[GUEST_PDPTR0] =
+		SHADOW_VMCS_OFFSET(guest_pdptr0),
+	[GUEST_PDPTR0_HIGH] =
+		SHADOW_VMCS_OFFSET(guest_pdptr0)+4,
+	[GUEST_PDPTR1] =
+		SHADOW_VMCS_OFFSET(guest_pdptr1),
+	[GUEST_PDPTR1_HIGH] =
+		SHADOW_VMCS_OFFSET(guest_pdptr1)+4,
+	[GUEST_PDPTR2] =
+		SHADOW_VMCS_OFFSET(guest_pdptr2),
+	[GUEST_PDPTR2_HIGH] =
+		SHADOW_VMCS_OFFSET(guest_pdptr2)+4,
+	[GUEST_PDPTR3] =
+		SHADOW_VMCS_OFFSET(guest_pdptr3),
+	[GUEST_PDPTR3_HIGH] =
+		SHADOW_VMCS_OFFSET(guest_pdptr3)+4,
+	[HOST_IA32_PAT] =
+		SHADOW_VMCS_OFFSET(host_ia32_pat),
+	[HOST_IA32_PAT_HIGH] =
+		SHADOW_VMCS_OFFSET(host_ia32_pat)+4,
+	[PIN_BASED_VM_EXEC_CONTROL] =
+		SHADOW_VMCS_OFFSET(pin_based_vm_exec_control),
+	[CPU_BASED_VM_EXEC_CONTROL] =
+		SHADOW_VMCS_OFFSET(cpu_based_vm_exec_control),
+	[EXCEPTION_BITMAP] =
+		SHADOW_VMCS_OFFSET(exception_bitmap),
+	[PAGE_FAULT_ERROR_CODE_MASK] =
+		SHADOW_VMCS_OFFSET(page_fault_error_code_mask),
+	[PAGE_FAULT_ERROR_CODE_MATCH] =
+		SHADOW_VMCS_OFFSET(page_fault_error_code_match),
+	[CR3_TARGET_COUNT] =
+		SHADOW_VMCS_OFFSET(cr3_target_count),
+	[VM_EXIT_CONTROLS] =
+		SHADOW_VMCS_OFFSET(vm_exit_controls),
+	[VM_EXIT_MSR_STORE_COUNT] =
+		SHADOW_VMCS_OFFSET(vm_exit_msr_store_count),
+	[VM_EXIT_MSR_LOAD_COUNT] =
+		SHADOW_VMCS_OFFSET(vm_exit_msr_load_count),
+	[VM_ENTRY_CONTROLS] =
+		SHADOW_VMCS_OFFSET(vm_entry_controls),
+	[VM_ENTRY_MSR_LOAD_COUNT] =
+		SHADOW_VMCS_OFFSET(vm_entry_msr_load_count),
+	[VM_ENTRY_INTR_INFO_FIELD] =
+		SHADOW_VMCS_OFFSET(vm_entry_intr_info_field),
+	[VM_ENTRY_EXCEPTION_ERROR_CODE] =
+		SHADOW_VMCS_OFFSET(vm_entry_exception_error_code),
+	[VM_ENTRY_INSTRUCTION_LEN] =
+		SHADOW_VMCS_OFFSET(vm_entry_instruction_len),
+	[TPR_THRESHOLD] =
+		SHADOW_VMCS_OFFSET(tpr_threshold),
+	[SECONDARY_VM_EXEC_CONTROL] =
+		SHADOW_VMCS_OFFSET(secondary_vm_exec_control),
+	[VM_INSTRUCTION_ERROR] =
+		SHADOW_VMCS_OFFSET(vm_instruction_error),
+	[VM_EXIT_REASON] =
+		SHADOW_VMCS_OFFSET(vm_exit_reason),
+	[VM_EXIT_INTR_INFO] =
+		SHADOW_VMCS_OFFSET(vm_exit_intr_info),
+	[VM_EXIT_INTR_ERROR_CODE] =
+		SHADOW_VMCS_OFFSET(vm_exit_intr_error_code),
+	[IDT_VECTORING_INFO_FIELD] =
+		SHADOW_VMCS_OFFSET(idt_vectoring_info_field),
+	[IDT_VECTORING_ERROR_CODE] =
+		SHADOW_VMCS_OFFSET(idt_vectoring_error_code),
+	[VM_EXIT_INSTRUCTION_LEN] =
+		SHADOW_VMCS_OFFSET(vm_exit_instruction_len),
+	[VMX_INSTRUCTION_INFO] =
+		SHADOW_VMCS_OFFSET(vmx_instruction_info),
+	[GUEST_ES_LIMIT] =
+		SHADOW_VMCS_OFFSET(guest_es_limit),
+	[GUEST_CS_LIMIT] =
+		SHADOW_VMCS_OFFSET(guest_cs_limit),
+	[GUEST_SS_LIMIT] =
+		SHADOW_VMCS_OFFSET(guest_ss_limit),
+	[GUEST_DS_LIMIT] =
+		SHADOW_VMCS_OFFSET(guest_ds_limit),
+	[GUEST_FS_LIMIT] =
+		SHADOW_VMCS_OFFSET(guest_fs_limit),
+	[GUEST_GS_LIMIT] =
+		SHADOW_VMCS_OFFSET(guest_gs_limit),
+	[GUEST_LDTR_LIMIT] =
+		SHADOW_VMCS_OFFSET(guest_ldtr_limit),
+	[GUEST_TR_LIMIT] =
+		SHADOW_VMCS_OFFSET(guest_tr_limit),
+	[GUEST_GDTR_LIMIT] =
+		SHADOW_VMCS_OFFSET(guest_gdtr_limit),
+	[GUEST_IDTR_LIMIT] =
+		SHADOW_VMCS_OFFSET(guest_idtr_limit),
+	[GUEST_ES_AR_BYTES] =
+		SHADOW_VMCS_OFFSET(guest_es_ar_bytes),
+	[GUEST_CS_AR_BYTES] =
+		SHADOW_VMCS_OFFSET(guest_cs_ar_bytes),
+	[GUEST_SS_AR_BYTES] =
+		SHADOW_VMCS_OFFSET(guest_ss_ar_bytes),
+	[GUEST_DS_AR_BYTES] =
+		SHADOW_VMCS_OFFSET(guest_ds_ar_bytes),
+	[GUEST_FS_AR_BYTES] =
+		SHADOW_VMCS_OFFSET(guest_fs_ar_bytes),
+	[GUEST_GS_AR_BYTES] =
+		SHADOW_VMCS_OFFSET(guest_gs_ar_bytes),
+	[GUEST_LDTR_AR_BYTES] =
+		SHADOW_VMCS_OFFSET(guest_ldtr_ar_bytes),
+	[GUEST_TR_AR_BYTES] =
+		SHADOW_VMCS_OFFSET(guest_tr_ar_bytes),
+	[GUEST_INTERRUPTIBILITY_INFO] =
+		SHADOW_VMCS_OFFSET(guest_interruptibility_info),
+	[GUEST_ACTIVITY_STATE] =
+		SHADOW_VMCS_OFFSET(guest_activity_state),
+	[GUEST_SYSENTER_CS] =
+		SHADOW_VMCS_OFFSET(guest_sysenter_cs),
+	[HOST_IA32_SYSENTER_CS] =
+		SHADOW_VMCS_OFFSET(host_ia32_sysenter_cs),
+	[CR0_GUEST_HOST_MASK] =
+		SHADOW_VMCS_OFFSET(cr0_guest_host_mask),
+	[CR4_GUEST_HOST_MASK] =
+		SHADOW_VMCS_OFFSET(cr4_guest_host_mask),
+	[CR0_READ_SHADOW] =
+		SHADOW_VMCS_OFFSET(cr0_read_shadow),
+	[CR4_READ_SHADOW] =
+		SHADOW_VMCS_OFFSET(cr4_read_shadow),
+	[CR3_TARGET_VALUE0] =
+		SHADOW_VMCS_OFFSET(cr3_target_value0),
+	[CR3_TARGET_VALUE1] =
+		SHADOW_VMCS_OFFSET(cr3_target_value1),
+	[CR3_TARGET_VALUE2] =
+		SHADOW_VMCS_OFFSET(cr3_target_value2),
+	[CR3_TARGET_VALUE3] =
+		SHADOW_VMCS_OFFSET(cr3_target_value3),
+	[EXIT_QUALIFICATION] =
+		SHADOW_VMCS_OFFSET(exit_qualification),
+	[GUEST_LINEAR_ADDRESS] =
+		SHADOW_VMCS_OFFSET(guest_linear_address),
+	[GUEST_CR0] =
+		SHADOW_VMCS_OFFSET(guest_cr0),
+	[GUEST_CR3] =
+		SHADOW_VMCS_OFFSET(guest_cr3),
+	[GUEST_CR4] =
+		SHADOW_VMCS_OFFSET(guest_cr4),
+	[GUEST_ES_BASE] =
+		SHADOW_VMCS_OFFSET(guest_es_base),
+	[GUEST_CS_BASE] =
+		SHADOW_VMCS_OFFSET(guest_cs_base),
+	[GUEST_SS_BASE] =
+		SHADOW_VMCS_OFFSET(guest_ss_base),
+	[GUEST_DS_BASE] =
+		SHADOW_VMCS_OFFSET(guest_ds_base),
+	[GUEST_FS_BASE] =
+		SHADOW_VMCS_OFFSET(guest_fs_base),
+	[GUEST_GS_BASE] =
+		SHADOW_VMCS_OFFSET(guest_gs_base),
+	[GUEST_LDTR_BASE] =
+		SHADOW_VMCS_OFFSET(guest_ldtr_base),
+	[GUEST_TR_BASE] =
+		SHADOW_VMCS_OFFSET(guest_tr_base),
+	[GUEST_GDTR_BASE] =
+		SHADOW_VMCS_OFFSET(guest_gdtr_base),
+	[GUEST_IDTR_BASE] =
+		SHADOW_VMCS_OFFSET(guest_idtr_base),
+	[GUEST_DR7] =
+		SHADOW_VMCS_OFFSET(guest_dr7),
+	[GUEST_RSP] =
+		SHADOW_VMCS_OFFSET(guest_rsp),
+	[GUEST_RIP] =
+		SHADOW_VMCS_OFFSET(guest_rip),
+	[GUEST_RFLAGS] =
+		SHADOW_VMCS_OFFSET(guest_rflags),
+	[GUEST_PENDING_DBG_EXCEPTIONS] =
+		SHADOW_VMCS_OFFSET(guest_pending_dbg_exceptions),
+	[GUEST_SYSENTER_ESP] =
+		SHADOW_VMCS_OFFSET(guest_sysenter_esp),
+	[GUEST_SYSENTER_EIP] =
+		SHADOW_VMCS_OFFSET(guest_sysenter_eip),
+	[HOST_CR0] =
+		SHADOW_VMCS_OFFSET(host_cr0),
+	[HOST_CR3] =
+		SHADOW_VMCS_OFFSET(host_cr3),
+	[HOST_CR4] =
+		SHADOW_VMCS_OFFSET(host_cr4),
+	[HOST_FS_BASE] =
+		SHADOW_VMCS_OFFSET(host_fs_base),
+	[HOST_GS_BASE] =
+		SHADOW_VMCS_OFFSET(host_gs_base),
+	[HOST_TR_BASE] =
+		SHADOW_VMCS_OFFSET(host_tr_base),
+	[HOST_GDTR_BASE] =
+		SHADOW_VMCS_OFFSET(host_gdtr_base),
+	[HOST_IDTR_BASE] =
+		SHADOW_VMCS_OFFSET(host_idtr_base),
+	[HOST_IA32_SYSENTER_ESP] =
+		SHADOW_VMCS_OFFSET(host_ia32_sysenter_esp),
+	[HOST_IA32_SYSENTER_EIP] =
+		SHADOW_VMCS_OFFSET(host_ia32_sysenter_eip),
+	[HOST_RSP] =
+		SHADOW_VMCS_OFFSET(host_rsp),
+	[HOST_RIP] =
+		SHADOW_VMCS_OFFSET(host_rip),
+};
+
+static inline unsigned short vmcs_field_to_offset(unsigned long field)
+{
+
+	if (field > HOST_RIP || vmcs_field_to_offset_table[field] == 0) {
+		printk(KERN_ERR "invalid vmcs encoding 0x%lx\n", field);
+		return -1;
+	}
+
+	return vmcs_field_to_offset_table[field];
+}
+
+static inline unsigned long nested_vmcs_readl(struct kvm_vcpu *vcpu,
+					      unsigned long field)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	unsigned long *entry;
+
+	if (!vmx->nested.current_l2_page) {
+		printk(KERN_ERR "%s invalid nested vmcs\n", __func__);
+		return -1;
+	}
+
+	entry = (unsigned long *)((char *)(get_shadow_vmcs(vcpu)) +
+				 vmcs_field_to_offset(field));
+	return *entry;
+}
+
+static inline u16 nested_vmcs_read16(struct kvm_vcpu *vcpu,
+				     unsigned long field)
+{
+	return nested_vmcs_readl(vcpu, field);
+}
+
+static inline u32 nested_vmcs_read32(struct kvm_vcpu *vcpu, unsigned long field)
+{
+	return nested_vmcs_readl(vcpu, field);
+}
+
+static inline u64 nested_vmcs_read64(struct kvm_vcpu *vcpu, unsigned long field)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	u64 *entry;
+	if (!vmx->nested.current_l2_page) {
+		printk(KERN_ERR "%s invalid nested vmcs\n", __func__);
+		return -1;
+	}
+
+	entry = (u64 *)((char *)(get_shadow_vmcs(vcpu)) +
+				 vmcs_field_to_offset(field));
+	return *entry;
+}
+
+static inline void nested_vmcs_writel(struct kvm_vcpu *vcpu,
+				      unsigned long field, unsigned long value)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	unsigned long entry =
+		(unsigned long)(get_shadow_vmcs(vcpu));
+
+	if (!vmx->nested.current_l2_page) {
+		printk(KERN_ERR "%s invalid nested vmcs\n", __func__);
+		return;
+	}
+
+	entry += vmcs_field_to_offset(field);
+	*(unsigned long *)entry = value;
+}
+
+static inline void nested_vmcs_write16(struct kvm_vcpu *vcpu,
+				       unsigned long field, u16 value)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	unsigned long entry =
+		(unsigned long)(get_shadow_vmcs(vcpu));
+
+	if (!vmx->nested.current_l2_page) {
+		printk(KERN_ERR "%s invalid nested vmcs\n", __func__);
+		return;
+	}
+	entry += vmcs_field_to_offset(field);
+	*(u16 *)entry = value;
+}
+
+static inline void nested_vmcs_write32(struct kvm_vcpu *vcpu,
+				       unsigned long field, u32 value)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	unsigned long entry =
+		(unsigned long)(get_shadow_vmcs(vcpu));
+
+	if (!vmx->nested.current_l2_page) {
+		printk(KERN_ERR "%s invalid nested vmcs\n", __func__);
+		return;
+	}
+	entry += vmcs_field_to_offset(field);
+	*(u32 *)entry = value;
+}
+
+static inline void nested_vmcs_write64(struct kvm_vcpu *vcpu,
+				       unsigned long field, u64 value)
+{
+#ifdef CONFIG_X86_64
+	nested_vmcs_writel(vcpu, field, value);
+#else /* nested: 32 bit not actually tested */
+	nested_vmcs_writel(vcpu, field, value);
+	nested_vmcs_writel(vcpu, field+1, value >> 32);
+#endif
+}
+
 static struct page *nested_get_page(struct kvm_vcpu *vcpu,
 				    u64 vmcs_addr)
 {
@@ -354,11 +809,6 @@ static int nested_map_current(struct kvm_vcpu *vcpu)
 
 	mapped_page = kmap_atomic(vmcs_page, KM_USER0);
 
-	if (!mapped_page) {
-		printk(KERN_INFO "%s: error in kmap_atomic\n", __func__);
-		return 0;
-	}
-
 	vmx->nested.current_l2_page = mapped_page;
 
 	return 1;
@@ -1390,7 +1840,7 @@ static int read_guest_vmcs_gpa(struct kvm_vcpu *vcpu, gva_t gva, u64 *gentry)
 				size, vcpu);
 	if (r) {
 		printk(KERN_ERR "%s cannot read guest vmcs addr %lx : %d\n",
-		       __func__, vcpu->arch.regs[VCPU_REGS_RAX], r);
+		       __func__, gva, r);
 		return r;
 	}
 
@@ -3764,6 +4214,26 @@ static gva_t get_vmx_mem_address(struct kvm_vcpu *vcpu,
 	return addr;
 }
 
+static void set_rflags_to_vmx_fail_invalid(struct kvm_vcpu *vcpu)
+{
+	unsigned long rflags;
+	rflags = vmx_get_rflags(vcpu);
+	rflags |= X86_EFLAGS_CF;
+	rflags &= ~X86_EFLAGS_PF & ~X86_EFLAGS_AF & ~X86_EFLAGS_ZF &
+		~X86_EFLAGS_SF & ~X86_EFLAGS_OF;
+	vmx_set_rflags(vcpu, rflags);
+}
+
+static void set_rflags_to_vmx_fail_valid(struct kvm_vcpu *vcpu)
+{
+	unsigned long rflags;
+	rflags = vmx_get_rflags(vcpu);
+	rflags |= X86_EFLAGS_ZF;
+	rflags &= ~X86_EFLAGS_PF & ~X86_EFLAGS_AF & ~X86_EFLAGS_CF &
+		~X86_EFLAGS_SF & ~X86_EFLAGS_OF;
+	vmx_set_rflags(vcpu, rflags);
+  }
+
 static int handle_vmclear(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -3800,6 +4270,181 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
+static int handle_vmread_reg(struct kvm_vcpu *vcpu, int reg,
+			     unsigned long field)
+{
+	u64 field_value;
+
+	switch (vmcs_field_type(field)) {
+	case VMCS_FIELD_TYPE_U16:
+		field_value = nested_vmcs_read16(vcpu, field);
+		break;
+	case VMCS_FIELD_TYPE_U32:
+		field_value = nested_vmcs_read32(vcpu, field);
+		break;
+	case VMCS_FIELD_TYPE_U64:
+		field_value = nested_vmcs_read64(vcpu, field);
+#ifdef CONFIG_X86_64
+		if (!is_long_mode(vcpu)) {
+			kvm_register_write(vcpu, reg+1, field_value >> 32);
+			field_value = (u32)field_value;
+		}
+#endif
+		break;
+	case VMCS_FIELD_TYPE_ULONG:
+		field_value = nested_vmcs_readl(vcpu, field);
+#ifdef CONFIG_X86_64
+		if (!is_long_mode(vcpu)) {
+			kvm_register_write(vcpu, reg+1, field_value >> 32);
+			field_value = (u32)field_value;
+		}
+#endif
+		break;
+	default:
+		printk(KERN_INFO "%s invalid field\n", __func__);
+		return 0;
+	}
+
+	kvm_register_write(vcpu, reg, field_value);
+	return 1;
+}
+
+static int handle_vmread_mem(struct kvm_vcpu *vcpu, gva_t gva,
+			     unsigned long field)
+{
+	u64 field_value;
+
+	switch (vmcs_field_type(field)) {
+	case VMCS_FIELD_TYPE_U16:
+		field_value = nested_vmcs_read16(vcpu, field);
+		break;
+	case VMCS_FIELD_TYPE_U32:
+		field_value = nested_vmcs_read32(vcpu, field);
+		break;
+	case VMCS_FIELD_TYPE_U64:
+		field_value = nested_vmcs_read64(vcpu, field);
+		break;
+	case VMCS_FIELD_TYPE_ULONG:
+		field_value = nested_vmcs_readl(vcpu, field);
+		break;
+	default:
+		printk(KERN_INFO "%s invalid field\n", __func__);
+		return 0;
+	}
+
+	kvm_write_guest_virt(gva, &field_value,
+			     vmcs_field_size(vmcs_field_type(field), vcpu),
+			     vcpu);
+	return 1;
+}
+
+static int handle_vmread(struct kvm_vcpu *vcpu)
+{
+	unsigned long field;
+	int reg;
+	int is_reg;
+	unsigned long exit_qualification   = vmcs_readl(EXIT_QUALIFICATION);
+	u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+	gva_t gva = 0;
+	int read_succeed;
+
+	if (!nested_vmx_check_permission(vcpu))
+		return 1;
+
+	if (!nested_map_current(vcpu)) {
+		printk(KERN_INFO "%s invalid shadow vmcs\n", __func__);
+		set_rflags_to_vmx_fail_invalid(vcpu);
+		return 1;
+	}
+
+	/*
+	 * Decode the instruction info: bit 10 selects a register (1) or
+	 * memory (0) destination, bits 31:28 give the register holding the
+	 * VMCS field encoding.
+	 */
+	is_reg = vmx_instruction_info & (1u << 10);
+	field = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf);
+
+	if (is_reg) {
+		reg   = (vmx_instruction_info >> 3) & 0xf;   /* bits 6:3 */
+		read_succeed = handle_vmread_reg(vcpu, reg, field);
+	} else {
+		gva = get_vmx_mem_address(vcpu, exit_qualification,
+					  vmx_instruction_info);
+		read_succeed = handle_vmread_mem(vcpu, gva, field);
+	}
+
+	if (read_succeed) {
+		clear_rflags_cf_zf(vcpu);
+		skip_emulated_instruction(vcpu);
+	} else {
+		set_rflags_to_vmx_fail_valid(vcpu);
+		vmcs_write32(VM_INSTRUCTION_ERROR, 12);
+	}
+
+	nested_unmap_current(vcpu);
+	return 1;
+}
+
+
+static int handle_vmwrite(struct kvm_vcpu *vcpu)
+{
+	unsigned long field;
+	u64 field_value = 0;
+	int reg;
+	int is_reg;
+	gva_t gva;
+	int field_type;
+	unsigned long exit_qualification   = vmcs_readl(EXIT_QUALIFICATION);
+	u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+
+	if (!nested_vmx_check_permission(vcpu))
+		return 1;
+
+	if (!nested_map_current(vcpu)) {
+		printk(KERN_INFO "%s invalid shadow vmcs\n", __func__);
+		set_rflags_to_vmx_fail_invalid(vcpu);
+		return 1;
+	}
+
+	/*
+	 * Decode the instruction info: bit 10 selects a register (1) or
+	 * memory (0) source, bits 6:3 give the source register and
+	 * bits 31:28 the register holding the VMCS field encoding.
+	 */
+	is_reg = vmx_instruction_info & (1u << 10);
+	reg   = (vmx_instruction_info >> 3) & 0xf;
+	field = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf);
+	field_type = vmcs_field_type(field);
+
+	if (is_reg)
+		field_value = kvm_register_read(vcpu, reg);
+	else {
+		gva  = get_vmx_mem_address(vcpu, exit_qualification, vmx_instruction_info);
+		kvm_read_guest_virt(gva, &field_value, vmcs_field_size(field_type, vcpu), vcpu);
+	}
+
+	switch (field_type) {
+	case VMCS_FIELD_TYPE_U16:
+		nested_vmcs_write16(vcpu, field, field_value);
+		break;
+	case VMCS_FIELD_TYPE_U32:
+		nested_vmcs_write32(vcpu, field, field_value);
+		break;
+	case VMCS_FIELD_TYPE_U64:
+		nested_vmcs_write64(vcpu, field, field_value);
+		break;
+	case VMCS_FIELD_TYPE_ULONG:
+		nested_vmcs_writel(vcpu, field, field_value);
+		break;
+	default:
+		printk(KERN_INFO "%s invalid field\n", __func__);
+		set_rflags_to_vmx_fail_valid(vcpu);
+		vmcs_write32(VM_INSTRUCTION_ERROR, 12);
+		nested_unmap_current(vcpu);
+		return 1;
+	}
+
+	clear_rflags_cf_zf(vcpu);
+	skip_emulated_instruction(vcpu);
+	nested_unmap_current(vcpu);
+	return 1;
+}
+
 static int handle_vmoff(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -3901,15 +4546,20 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
 	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 	u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
 	gva_t vmcs_gva;
-
+	uint size;
 	if (!nested_vmx_check_permission(vcpu))
 		return 1;
 	vmcs_gva = get_vmx_mem_address(vcpu, exit_qualification,
 				       vmx_instruction_info);
 
+	if (is_long_mode(vcpu))
+		size = sizeof(u64);
+	else
+		size = sizeof(u32);
+
 	r = kvm_write_guest_virt(vmcs_gva,
 				 (void *)&to_vmx(vcpu)->nested.current_vmptr,
-				 sizeof(u64), vcpu);
+				 size, vcpu);
 	if (r) {
 		printk(KERN_INFO "%s failed to write vmptr\n", __func__);
 		return 1;
@@ -4206,9 +4856,9 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 	[EXIT_REASON_VMLAUNCH]                = handle_vmx_insn,
 	[EXIT_REASON_VMPTRLD]                 = handle_vmptrld,
 	[EXIT_REASON_VMPTRST]                 = handle_vmptrst,
-	[EXIT_REASON_VMREAD]                  = handle_vmx_insn,
+	[EXIT_REASON_VMREAD]                  = handle_vmread,
 	[EXIT_REASON_VMRESUME]                = handle_vmx_insn,
-	[EXIT_REASON_VMWRITE]                 = handle_vmx_insn,
+	[EXIT_REASON_VMWRITE]                 = handle_vmwrite,
 	[EXIT_REASON_VMOFF]                   = handle_vmoff,
 	[EXIT_REASON_VMON]                    = handle_vmon,
 	[EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,
-- 
1.6.0.4


^ permalink raw reply related	[flat|nested] 24+ messages in thread
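
The VMREAD/VMWRITE handlers above work by decoding the operands out of the
VMX_INSTRUCTION_INFO exit field: bit 10 selects a register or memory operand,
bits 6:3 name the operand register and bits 31:28 name the register that holds
the VMCS field encoding. A minimal standalone sketch of that decoding follows;
the struct and function names (vmx_insn_operands, decode_vmx_insn_info) are
illustrative and not part of the patch.

	struct vmx_insn_operands {
		bool	is_reg;		/* bit 10: 1 = register, 0 = memory operand */
		int	operand_reg;	/* bits 6:3, only meaningful when is_reg */
		int	field_reg;	/* bits 31:28, register holding the field encoding */
	};

	static struct vmx_insn_operands decode_vmx_insn_info(u32 info)
	{
		struct vmx_insn_operands op;

		op.is_reg      = info & (1u << 10);
		op.operand_reg = (info >> 3) & 0xf;
		op.field_reg   = (info >> 28) & 0xf;
		return op;
	}

On success the handlers clear CF and ZF (VMsucceed); on failure they either set
CF (VMfailInvalid, no current VMCS) or set ZF (VMfailValid), matching the
set_rflags_to_vmx_fail_invalid()/set_rflags_to_vmx_fail_valid() helpers.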

* [PATCH 5/7] Nested VMX patch 5 Simplify fpu handling
  2009-12-10 18:38       ` [PATCH 4/7] Nested VMX patch 4 implements vmread and vmwrite oritw
@ 2009-12-10 18:38         ` oritw
  2009-12-10 18:38           ` [PATCH 6/7] Nested VMX patch 6 implements vmlaunch and vmresume oritw
  2009-12-17  9:10           ` [PATCH 5/7] Nested VMX patch 5 Simplify fpu handling Avi Kivity
  2009-12-16 14:44         ` [PATCH 4/7] Nested VMX patch 4 implements vmread and vmwrite Avi Kivity
  1 sibling, 2 replies; 24+ messages in thread
From: oritw @ 2009-12-10 18:38 UTC (permalink / raw)
  To: avi; +Cc: kvm, oritw, benami, abelg, muli, aliguori, mdday

From: Orit Wasserman <oritw@il.ibm.com>

---
 arch/x86/kvm/vmx.c |   27 +++++++++++++++++----------
 1 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 8745d44..de1f596 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1244,8 +1244,6 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
 	u32 eb;
 
 	eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR);
-	if (!vcpu->fpu_active)
-		eb |= 1u << NM_VECTOR;
 	/*
 	 * Unconditionally intercept #DB so we can maintain dr6 without
 	 * reading it every exit.
@@ -1463,10 +1461,6 @@ static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
 	if (vcpu->fpu_active)
 		return;
 	vcpu->fpu_active = 1;
-	vmcs_clear_bits(GUEST_CR0, X86_CR0_TS);
-	if (vcpu->arch.cr0 & X86_CR0_TS)
-		vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
-	update_exception_bitmap(vcpu);
 }
 
 static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
@@ -1474,8 +1468,6 @@ static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
 	if (!vcpu->fpu_active)
 		return;
 	vcpu->fpu_active = 0;
-	vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
-	update_exception_bitmap(vcpu);
 }
 
 static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
@@ -2715,8 +2707,10 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 
 	vmx_flush_tlb(vcpu);
 	vmcs_writel(GUEST_CR3, guest_cr3);
-	if (vcpu->arch.cr0 & X86_CR0_PE)
-		vmx_fpu_deactivate(vcpu);
+	if (vcpu->arch.cr0 & X86_CR0_PE) {
+		if (guest_cr3 != vmcs_readl(GUEST_CR3))
+			vmx_fpu_deactivate(vcpu);
+	}
 }
 
 static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
@@ -5208,6 +5202,19 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	if (vcpu->arch.switch_db_regs)
 		get_debugreg(vcpu->arch.dr6, 6);
 
+	if (vcpu->fpu_active) {
+		if (vmcs_readl(CR0_READ_SHADOW) & X86_CR0_TS)
+			vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
+		else
+			vmcs_clear_bits(GUEST_CR0, X86_CR0_TS);
+		vmcs_write32(EXCEPTION_BITMAP,
+			     vmcs_read32(EXCEPTION_BITMAP) & ~(1u << NM_VECTOR));
+	} else {
+		vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
+		vmcs_write32(EXCEPTION_BITMAP,
+			     vmcs_read32(EXCEPTION_BITMAP) |  (1u << NM_VECTOR));
+	}
+
 	vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
 	if (vmx->rmode.irq.pending)
 		fixup_rmode_irq(vmx);
-- 
1.6.0.4


^ permalink raw reply related	[flat|nested] 24+ messages in thread
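
The simplification above moves the CR0.TS/#NM bookkeeping out of
vmx_fpu_activate()/vmx_fpu_deactivate() and redoes it once per guest entry in
vmx_vcpu_run(): while the FPU is active, GUEST_CR0.TS mirrors the TS bit the
guest last wrote (kept in CR0_READ_SHADOW) and #NM is not intercepted; while it
is inactive, TS is forced on and #NM is intercepted so the FPU can be loaded
lazily. A sketch of that per-entry fixup as a helper (the helper name is
illustrative, the patch open-codes this in vmx_vcpu_run()):

	static void vmx_sync_fpu_state(struct kvm_vcpu *vcpu)
	{
		u32 eb = vmcs_read32(EXCEPTION_BITMAP);

		if (vcpu->fpu_active) {
			/* let the guest see the TS value it last wrote */
			if (vmcs_readl(CR0_READ_SHADOW) & X86_CR0_TS)
				vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
			else
				vmcs_clear_bits(GUEST_CR0, X86_CR0_TS);
			eb &= ~(1u << NM_VECTOR);	/* no need to trap #NM */
		} else {
			/* FPU not loaded: force TS and trap #NM */
			vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
			eb |= 1u << NM_VECTOR;
		}
		vmcs_write32(EXCEPTION_BITMAP, eb);
	}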

* [PATCH 6/7] Nested VMX patch 6 implements vmlaunch and vmresume
  2009-12-10 18:38         ` [PATCH 5/7] Nested VMX patch 5 Simplify fpu handling oritw
@ 2009-12-10 18:38           ` oritw
  2009-12-10 18:38             ` [PATCH 7/7] Nested VMX patch 7 handling of nested guest exits oritw
  2009-12-17 10:10             ` [PATCH 6/7] Nested VMX patch 6 implements vmlaunch and vmresume Avi Kivity
  2009-12-17  9:10           ` [PATCH 5/7] Nested VMX patch 5 Simplify fpu handling Avi Kivity
  1 sibling, 2 replies; 24+ messages in thread
From: oritw @ 2009-12-10 18:38 UTC (permalink / raw)
  To: avi; +Cc: kvm, oritw, benami, abelg, muli, aliguori, mdday

From: Orit Wasserman <oritw@il.ibm.com>

---
 arch/x86/kvm/vmx.c |  890 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 files changed, 873 insertions(+), 17 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index de1f596..0d36b49 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -223,10 +223,16 @@ struct __attribute__ ((__packed__)) level_state {
 	/* Has the level1 guest done vmclear? */
 	bool vmclear;
 
+	u64 shadow_efer;
+	unsigned long cr3;
+	unsigned long cr4;
+
 	u64 io_bitmap_a;
 	u64 io_bitmap_b;
 	u64 msr_bitmap;
 
+	int cpu;
+	int launched;
 	bool first_launch;
 };
 
@@ -254,10 +260,14 @@ struct nested_vmx {
 	bool vmxon;
 	/* What is the location of the current vmcs l1 keeps for l2 */
 	gpa_t current_vmptr;
+	/* Are we running a nested guest? */
+	bool nested_mode;
 	/* Level 1 state for switching to level 2 and back */
 	struct level_state *l1_state;
 	/* Level 1 shadow vmcs for switching to level 2 and back */
 	struct shadow_vmcs *l1_shadow_vmcs;
+	/* Level 1 vmcs loaded into the processor */
+	struct vmcs *l1_vmcs;
 	/* list of vmcs for each l2 guest created by l1 */
 	struct list_head l2_vmcs_list;
 	/* l2 page corresponding to the current vmcs set by l1 */
@@ -287,7 +297,7 @@ static inline int vmcs_field_type(unsigned long field)
 }
 
 /*
-  Returncs VMCS field size in bits
+  Returns VMCS field size in bits
 */
 static inline int vmcs_field_size(int field_type, struct kvm_vcpu *vcpu)
 {
@@ -313,6 +323,10 @@ static inline int vmcs_field_size(int field_type, struct kvm_vcpu *vcpu)
 	return 0;
 }
 
+#define NESTED_VM_EXIT_CONTROLS_MASK (~(VM_EXIT_LOAD_IA32_PAT | \
+					VM_EXIT_SAVE_IA32_PAT))
+#define NESTED_VM_ENTRY_CONTROLS_MASK (~(VM_ENTRY_LOAD_IA32_PAT | \
+					 VM_ENTRY_IA32E_MODE))
 struct vcpu_vmx {
 	struct kvm_vcpu       vcpu;
 	struct list_head      local_vcpus_link;
@@ -892,7 +906,11 @@ static struct kvm_vmx_segment_field {
 static u64 host_efer;
 
 static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
+
+static int nested_vmx_check_permission(struct kvm_vcpu *vcpu);
 static int create_l1_state(struct kvm_vcpu *vcpu);
+static int create_l2_state(struct kvm_vcpu *vcpu);
+static int launch_guest(struct kvm_vcpu *vcpu, bool launch);
 
 /*
  * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it
@@ -993,6 +1011,18 @@ static inline bool cpu_has_vmx_ept_2m_page(void)
 	return !!(vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT);
 }
 
+static inline int is_exception(u32 intr_info)
+{
+	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
+		== (INTR_TYPE_HARD_EXCEPTION | INTR_INFO_VALID_MASK);
+}
+
+static inline int is_nmi(u32 intr_info)
+{
+	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
+		== (INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK);
+}
+
 static inline int cpu_has_vmx_invept_individual_addr(void)
 {
 	return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT);
@@ -1049,6 +1079,51 @@ static inline bool report_flexpriority(void)
 	return flexpriority_enabled;
 }
 
+static inline int nested_cpu_has_vmx_tpr_shadow(struct  kvm_vcpu *vcpu)
+{
+	return cpu_has_vmx_tpr_shadow() &&
+		get_shadow_vmcs(vcpu)->cpu_based_vm_exec_control &
+		CPU_BASED_TPR_SHADOW;
+}
+
+static inline int nested_cpu_has_secondary_exec_ctrls(struct kvm_vcpu *vcpu)
+{
+	return cpu_has_secondary_exec_ctrls() &&
+		get_shadow_vmcs(vcpu)->cpu_based_vm_exec_control &
+		CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
+}
+
+static inline bool nested_vm_need_virtualize_apic_accesses(struct kvm_vcpu
+							   *vcpu)
+{
+	return get_shadow_vmcs(vcpu)->secondary_vm_exec_control &
+		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+}
+
+static inline int nested_cpu_has_vmx_ept(struct kvm_vcpu *vcpu)
+{
+	return get_shadow_vmcs(vcpu)->
+		secondary_vm_exec_control & SECONDARY_EXEC_ENABLE_EPT;
+}
+
+static inline int nested_cpu_has_vmx_vpid(struct kvm_vcpu *vcpu)
+{
+	return get_shadow_vmcs(vcpu)->secondary_vm_exec_control &
+		SECONDARY_EXEC_ENABLE_VPID;
+}
+
+static inline int nested_cpu_has_vmx_pat(struct kvm_vcpu *vcpu)
+{
+	return get_shadow_vmcs(vcpu)->vm_entry_controls &
+		VM_ENTRY_LOAD_IA32_PAT;
+}
+
+static inline int nested_cpu_has_vmx_msr_bitmap(struct kvm_vcpu *vcpu)
+{
+	return get_shadow_vmcs(vcpu)->cpu_based_vm_exec_control &
+		CPU_BASED_USE_MSR_BITMAPS;
+}
+
 static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
 {
 	int i;
@@ -1390,6 +1465,8 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx)
 	preempt_enable();
 }
 
+static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu);
+
 /*
  * Switches to specified vcpu, until a matching vcpu_put(), but assumes
  * vcpu mutex is already taken.
@@ -1994,6 +2071,206 @@ static void vmclear_local_vcpus(void)
 		__vcpu_clear(vmx);
 }
 
+
+void prepare_vmcs_12(struct kvm_vcpu *vcpu)
+{
+	struct shadow_vmcs *l2_shadow_vmcs =
+		get_shadow_vmcs(vcpu);
+
+	l2_shadow_vmcs->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
+	l2_shadow_vmcs->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
+	l2_shadow_vmcs->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
+	l2_shadow_vmcs->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
+	l2_shadow_vmcs->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
+	l2_shadow_vmcs->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
+	l2_shadow_vmcs->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
+	l2_shadow_vmcs->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
+
+	l2_shadow_vmcs->tsc_offset = vmcs_read64(TSC_OFFSET);
+	l2_shadow_vmcs->guest_physical_address =
+		vmcs_read64(GUEST_PHYSICAL_ADDRESS);
+	l2_shadow_vmcs->vmcs_link_pointer = vmcs_read64(VMCS_LINK_POINTER);
+	l2_shadow_vmcs->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
+	if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
+		l2_shadow_vmcs->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
+	l2_shadow_vmcs->cr3_target_count = vmcs_read32(CR3_TARGET_COUNT);
+	l2_shadow_vmcs->vm_entry_intr_info_field =
+		vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
+	l2_shadow_vmcs->vm_entry_exception_error_code =
+		vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE);
+	l2_shadow_vmcs->vm_entry_instruction_len =
+		vmcs_read32(VM_ENTRY_INSTRUCTION_LEN);
+	l2_shadow_vmcs->vm_instruction_error =
+		vmcs_read32(VM_INSTRUCTION_ERROR);
+	l2_shadow_vmcs->vm_exit_reason  = vmcs_read32(VM_EXIT_REASON);
+	l2_shadow_vmcs->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+	l2_shadow_vmcs->vm_exit_intr_error_code =
+		vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+	l2_shadow_vmcs->idt_vectoring_info_field =
+		vmcs_read32(IDT_VECTORING_INFO_FIELD);
+	l2_shadow_vmcs->idt_vectoring_error_code =
+		vmcs_read32(IDT_VECTORING_ERROR_CODE);
+	l2_shadow_vmcs->vm_exit_instruction_len =
+		vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+	l2_shadow_vmcs->vmx_instruction_info =
+		vmcs_read32(VMX_INSTRUCTION_INFO);
+	l2_shadow_vmcs->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
+	l2_shadow_vmcs->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
+	l2_shadow_vmcs->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
+	l2_shadow_vmcs->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
+	l2_shadow_vmcs->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
+	l2_shadow_vmcs->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
+	l2_shadow_vmcs->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
+	l2_shadow_vmcs->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
+	l2_shadow_vmcs->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
+	l2_shadow_vmcs->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
+	l2_shadow_vmcs->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
+	l2_shadow_vmcs->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
+	l2_shadow_vmcs->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
+	l2_shadow_vmcs->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
+	l2_shadow_vmcs->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
+	l2_shadow_vmcs->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
+	l2_shadow_vmcs->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
+	l2_shadow_vmcs->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
+	l2_shadow_vmcs->guest_interruptibility_info =
+		vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
+	l2_shadow_vmcs->guest_activity_state =
+		vmcs_read32(GUEST_ACTIVITY_STATE);
+	l2_shadow_vmcs->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
+
+	l2_shadow_vmcs->cr4_read_shadow = vmcs_readl(CR4_READ_SHADOW);
+	l2_shadow_vmcs->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+	l2_shadow_vmcs->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
+
+	if (l2_shadow_vmcs->cr0_guest_host_mask & X86_CR0_TS)
+		l2_shadow_vmcs->guest_cr0 = vmcs_readl(GUEST_CR0);
+	else
+		/*
+		 * CR0_GUEST_HOST_MASK[TS] is clear, so l1 expects to see TS
+		 * as if l2 had really written it to CR0: take TS from the
+		 * read shadow and the rest from GUEST_CR0.
+		 */
+		l2_shadow_vmcs->guest_cr0 =
+			(vmcs_readl(GUEST_CR0) & ~X86_CR0_TS) |
+			(vmcs_readl(CR0_READ_SHADOW) & X86_CR0_TS);
+
+	l2_shadow_vmcs->guest_cr4 = vmcs_readl(GUEST_CR4);
+	l2_shadow_vmcs->guest_es_base = vmcs_readl(GUEST_ES_BASE);
+	l2_shadow_vmcs->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
+	l2_shadow_vmcs->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
+	l2_shadow_vmcs->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
+	l2_shadow_vmcs->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
+	l2_shadow_vmcs->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
+	l2_shadow_vmcs->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
+	l2_shadow_vmcs->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
+	l2_shadow_vmcs->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
+	l2_shadow_vmcs->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
+	l2_shadow_vmcs->guest_dr7 = vmcs_readl(GUEST_DR7);
+	l2_shadow_vmcs->guest_rsp = vmcs_readl(GUEST_RSP);
+	l2_shadow_vmcs->guest_rip = vmcs_readl(GUEST_RIP);
+	l2_shadow_vmcs->guest_rflags = vmcs_readl(GUEST_RFLAGS);
+	l2_shadow_vmcs->guest_pending_dbg_exceptions =
+		vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
+	l2_shadow_vmcs->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
+	l2_shadow_vmcs->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
+}
+
+int load_vmcs_common(struct shadow_vmcs *src)
+{
+	vmcs_write16(GUEST_ES_SELECTOR, src->guest_es_selector);
+	vmcs_write16(GUEST_CS_SELECTOR, src->guest_cs_selector);
+	vmcs_write16(GUEST_SS_SELECTOR, src->guest_ss_selector);
+	vmcs_write16(GUEST_DS_SELECTOR, src->guest_ds_selector);
+	vmcs_write16(GUEST_FS_SELECTOR, src->guest_fs_selector);
+	vmcs_write16(GUEST_GS_SELECTOR, src->guest_gs_selector);
+	vmcs_write16(GUEST_LDTR_SELECTOR, src->guest_ldtr_selector);
+	vmcs_write16(GUEST_TR_SELECTOR, src->guest_tr_selector);
+
+	vmcs_write64(GUEST_IA32_DEBUGCTL, src->guest_ia32_debugctl);
+
+	if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
+		vmcs_write64(GUEST_IA32_PAT, src->guest_ia32_pat);
+
+	if (src->vm_entry_msr_load_count < 512)
+		vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, src->vm_entry_msr_load_count);
+
+	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, src->vm_entry_intr_info_field);
+	vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
+		     src->vm_entry_exception_error_code);
+	vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, src->vm_entry_instruction_len);
+
+	vmcs_write32(GUEST_ES_LIMIT, src->guest_es_limit);
+	vmcs_write32(GUEST_CS_LIMIT, src->guest_cs_limit);
+	vmcs_write32(GUEST_SS_LIMIT, src->guest_ss_limit);
+	vmcs_write32(GUEST_DS_LIMIT, src->guest_ds_limit);
+	vmcs_write32(GUEST_FS_LIMIT, src->guest_fs_limit);
+	vmcs_write32(GUEST_GS_LIMIT, src->guest_gs_limit);
+	vmcs_write32(GUEST_LDTR_LIMIT, src->guest_ldtr_limit);
+	vmcs_write32(GUEST_TR_LIMIT, src->guest_tr_limit);
+	vmcs_write32(GUEST_GDTR_LIMIT, src->guest_gdtr_limit);
+	vmcs_write32(GUEST_IDTR_LIMIT, src->guest_idtr_limit);
+	vmcs_write32(GUEST_ES_AR_BYTES, src->guest_es_ar_bytes);
+	vmcs_write32(GUEST_CS_AR_BYTES, src->guest_cs_ar_bytes);
+	vmcs_write32(GUEST_SS_AR_BYTES, src->guest_ss_ar_bytes);
+	vmcs_write32(GUEST_DS_AR_BYTES, src->guest_ds_ar_bytes);
+	vmcs_write32(GUEST_FS_AR_BYTES, src->guest_fs_ar_bytes);
+	vmcs_write32(GUEST_GS_AR_BYTES, src->guest_gs_ar_bytes);
+	vmcs_write32(GUEST_LDTR_AR_BYTES, src->guest_ldtr_ar_bytes);
+	vmcs_write32(GUEST_TR_AR_BYTES, src->guest_tr_ar_bytes);
+	vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
+		     src->guest_interruptibility_info);
+	vmcs_write32(GUEST_ACTIVITY_STATE, src->guest_activity_state);
+	vmcs_write32(GUEST_SYSENTER_CS, src->guest_sysenter_cs);
+
+	vmcs_writel(GUEST_ES_BASE, src->guest_es_base);
+	vmcs_writel(GUEST_CS_BASE, src->guest_cs_base);
+	vmcs_writel(GUEST_SS_BASE, src->guest_ss_base);
+	vmcs_writel(GUEST_DS_BASE, src->guest_ds_base);
+	vmcs_writel(GUEST_FS_BASE, src->guest_fs_base);
+	vmcs_writel(GUEST_GS_BASE, src->guest_gs_base);
+	vmcs_writel(GUEST_LDTR_BASE, src->guest_ldtr_base);
+	vmcs_writel(GUEST_TR_BASE, src->guest_tr_base);
+	vmcs_writel(GUEST_GDTR_BASE, src->guest_gdtr_base);
+	vmcs_writel(GUEST_IDTR_BASE, src->guest_idtr_base);
+	vmcs_writel(GUEST_DR7, src->guest_dr7);
+	vmcs_writel(GUEST_RSP, src->guest_rsp);
+	vmcs_writel(GUEST_RIP, src->guest_rip);
+	vmcs_writel(GUEST_RFLAGS, src->guest_rflags);
+	vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
+		    src->guest_pending_dbg_exceptions);
+	vmcs_writel(GUEST_SYSENTER_ESP, src->guest_sysenter_esp);
+	vmcs_writel(GUEST_SYSENTER_EIP, src->guest_sysenter_eip);
+
+	return 0;
+}
+
+int load_vmcs_host_state(struct shadow_vmcs *src)
+{
+	vmcs_write16(HOST_ES_SELECTOR, src->host_es_selector);
+	vmcs_write16(HOST_CS_SELECTOR, src->host_cs_selector);
+	vmcs_write16(HOST_SS_SELECTOR, src->host_ss_selector);
+	vmcs_write16(HOST_DS_SELECTOR, src->host_ds_selector);
+	vmcs_write16(HOST_FS_SELECTOR, src->host_fs_selector);
+	vmcs_write16(HOST_GS_SELECTOR, src->host_gs_selector);
+	vmcs_write16(HOST_TR_SELECTOR, src->host_tr_selector);
+
+	vmcs_write64(TSC_OFFSET, src->tsc_offset);
+
+	if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT)
+		vmcs_write64(HOST_IA32_PAT, src->host_ia32_pat);
+
+	vmcs_write32(HOST_IA32_SYSENTER_CS, src->host_ia32_sysenter_cs);
+
+	vmcs_writel(HOST_CR0, src->host_cr0);
+	vmcs_writel(HOST_CR3, src->host_cr3);
+	vmcs_writel(HOST_CR4, src->host_cr4);
+	vmcs_writel(HOST_FS_BASE, src->host_fs_base);
+	vmcs_writel(HOST_GS_BASE, src->host_gs_base);
+	vmcs_writel(HOST_TR_BASE, src->host_tr_base);
+	vmcs_writel(HOST_GDTR_BASE, src->host_gdtr_base);
+	vmcs_writel(HOST_IDTR_BASE, src->host_idtr_base);
+	vmcs_writel(HOST_RSP, src->host_rsp);
+	vmcs_writel(HOST_RIP, src->host_rip);
+	vmcs_writel(HOST_IA32_SYSENTER_ESP, src->host_ia32_sysenter_esp);
+	vmcs_writel(HOST_IA32_SYSENTER_EIP, src->host_ia32_sysenter_eip);
+
+	return 0;
+}
+
 static struct level_state *create_state(void)
 {
 	struct level_state *state = NULL;
@@ -2301,8 +2578,6 @@ static void free_l1_state(struct kvm_vcpu *vcpu)
 		kfree(list_item);
 	}
 }
-
-
 static void free_kvm_area(void)
 {
 	int cpu;
@@ -3574,6 +3849,10 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
+	if (vmx->nested.nested_mode)
+		return;
+
 	if (!cpu_has_virtual_nmis()) {
 		/*
 		 * Tracking the NMI-blocked state in software is built upon
@@ -3759,7 +4038,12 @@ static int handle_exception(struct kvm_vcpu *vcpu)
 		return 1;  /* already handled by vmx_vcpu_run() */
 
 	if (is_no_device(intr_info)) {
+		/* if l0 handled an fpu operation for l2 it's because l1 is
+		   not interested (exception bitmap 12 does not include NM_VECTOR)
+		   enable fpu and resume l2 (avoid switching to l1)
+		*/
 		vmx_fpu_activate(vcpu);
+
 		return 1;
 	}
 
@@ -4151,12 +4435,6 @@ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
-static int handle_vmx_insn(struct kvm_vcpu *vcpu)
-{
-	kvm_queue_exception(vcpu, UD_VECTOR);
-	return 1;
-}
-
 static void clear_rflags_cf_zf(struct kvm_vcpu *vcpu)
 {
 	unsigned long rflags;
@@ -4264,6 +4542,17 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
+static int handle_vmlaunch(struct kvm_vcpu *vcpu)
+{
+	return launch_guest(vcpu, true);
+}
+
+static int handle_vmresume(struct kvm_vcpu *vcpu)
+{
+	return launch_guest(vcpu, false);
+}
+
 static int handle_vmread_reg(struct kvm_vcpu *vcpu, int reg,
 			     unsigned long field)
 {
@@ -4504,7 +4793,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
 static int handle_vmptrld(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	u64 guest_vmcs_addr;
+	gpa_t guest_vmcs_addr;
 	gva_t vmcs_gva;
 	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 	u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
@@ -4847,11 +5136,11 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 	[EXIT_REASON_INVLPG]		      = handle_invlpg,
 	[EXIT_REASON_VMCALL]                  = handle_vmcall,
 	[EXIT_REASON_VMCLEAR]	              = handle_vmclear,
-	[EXIT_REASON_VMLAUNCH]                = handle_vmx_insn,
+	[EXIT_REASON_VMLAUNCH]                = handle_vmlaunch,
 	[EXIT_REASON_VMPTRLD]                 = handle_vmptrld,
 	[EXIT_REASON_VMPTRST]                 = handle_vmptrst,
 	[EXIT_REASON_VMREAD]                  = handle_vmread,
-	[EXIT_REASON_VMRESUME]                = handle_vmx_insn,
+	[EXIT_REASON_VMRESUME]                = handle_vmresume,
 	[EXIT_REASON_VMWRITE]                 = handle_vmwrite,
 	[EXIT_REASON_VMOFF]                   = handle_vmoff,
 	[EXIT_REASON_VMON]                    = handle_vmon,
@@ -4895,7 +5184,6 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 			= vmcs_read32(VM_INSTRUCTION_ERROR);
 		return 0;
 	}
-
 	if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
 			(exit_reason != EXIT_REASON_EXCEPTION_NMI &&
 			exit_reason != EXIT_REASON_EPT_VIOLATION &&
@@ -4903,8 +5191,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 		printk(KERN_WARNING "%s: unexpected, valid vectoring info "
 		       "(0x%x) and exit reason is 0x%x\n",
 		       __func__, vectoring_info, exit_reason);
-
-	if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) {
+	if (!vmx->nested.nested_mode &&
+	    unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) {
 		if (vmx_interrupt_allowed(vcpu)) {
 			vmx->soft_vnmi_blocked = 0;
 		} else if (vmx->vnmi_blocked_time > 1000000000LL &&
@@ -4951,10 +5238,13 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 	int type;
 	bool idtv_info_valid;
 
-	exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
-
 	vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
 
+	if (vmx->nested.nested_mode)
+		return;
+
+	exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+
 	/* Handle machine checks before interrupts are enabled */
 	if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY)
 	    || (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI
@@ -5068,6 +5358,7 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx)
 static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	u32 nested_exception_bitmap = 0;
 
 	/* Record the guest's net vcpu time for enforced NMI injections. */
 	if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
@@ -5099,6 +5390,37 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	if (vcpu->arch.switch_db_regs)
 		set_debugreg(vcpu->arch.dr6, 6);
 
+	if (vcpu->fpu_active) {
+		if (vmcs_readl(CR0_READ_SHADOW) & X86_CR0_TS)
+			vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
+		else
+			vmcs_clear_bits(GUEST_CR0, X86_CR0_TS);
+
+		if (vmx->nested.nested_mode) {
+			if (!nested_map_current(vcpu)) {
+				vmx->fail = 1;
+				return;
+			}
+
+			nested_exception_bitmap =
+				get_shadow_vmcs(vcpu)->exception_bitmap;
+
+			nested_unmap_current(vcpu);
+		}
+
+		if (vmx->nested.nested_mode &&
+		    (nested_exception_bitmap & (1u << NM_VECTOR)))
+			vmcs_write32(EXCEPTION_BITMAP,
+				     vmcs_read32(EXCEPTION_BITMAP) |  (1u << NM_VECTOR));
+		else
+			vmcs_write32(EXCEPTION_BITMAP,
+				     vmcs_read32(EXCEPTION_BITMAP) &  ~(1u << NM_VECTOR));
+	} else {
+		vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
+		vmcs_write32(EXCEPTION_BITMAP,
+			     vmcs_read32(EXCEPTION_BITMAP) |  (1u << NM_VECTOR));
+	}
+
 	asm(
 		/* Store host registers */
 		"push %%"R"dx; push %%"R"bp;"
@@ -5216,6 +5538,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	}
 
 	vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
+
 	if (vmx->rmode.irq.pending)
 		fixup_rmode_irq(vmx);
 
@@ -5300,6 +5623,11 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 			goto free_vmcs;
 	}
 
+	vmx->nested.current_vmptr = -1ull;
+
+	vmx->nested.l1_state = NULL;
+	vmx->nested.current_l2_page = NULL;
+
 	return &vmx->vcpu;
 
 free_vmcs:
@@ -5388,6 +5716,534 @@ static bool vmx_gb_page_enable(void)
 	return false;
 }
 
+void save_vmcs(struct shadow_vmcs *dst)
+{
+	dst->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
+	dst->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
+	dst->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
+	dst->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
+	dst->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
+	dst->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
+	dst->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
+	dst->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
+	dst->host_es_selector = vmcs_read16(HOST_ES_SELECTOR);
+	dst->host_cs_selector = vmcs_read16(HOST_CS_SELECTOR);
+	dst->host_ss_selector = vmcs_read16(HOST_SS_SELECTOR);
+	dst->host_ds_selector = vmcs_read16(HOST_DS_SELECTOR);
+	dst->host_fs_selector = vmcs_read16(HOST_FS_SELECTOR);
+	dst->host_gs_selector = vmcs_read16(HOST_GS_SELECTOR);
+	dst->host_tr_selector = vmcs_read16(HOST_TR_SELECTOR);
+	dst->io_bitmap_a = vmcs_read64(IO_BITMAP_A);
+	dst->io_bitmap_b = vmcs_read64(IO_BITMAP_B);
+	if (cpu_has_vmx_msr_bitmap())
+		dst->msr_bitmap = vmcs_read64(MSR_BITMAP);
+
+	dst->vm_exit_msr_store_addr = vmcs_read64(VM_EXIT_MSR_STORE_ADDR);
+	dst->vm_exit_msr_load_addr = vmcs_read64(VM_EXIT_MSR_LOAD_ADDR);
+	dst->vm_entry_msr_load_addr = vmcs_read64(VM_ENTRY_MSR_LOAD_ADDR);
+	dst->tsc_offset = vmcs_read64(TSC_OFFSET);
+	dst->virtual_apic_page_addr = vmcs_read64(VIRTUAL_APIC_PAGE_ADDR);
+	dst->apic_access_addr = vmcs_read64(APIC_ACCESS_ADDR);
+	if (enable_ept)
+		dst->ept_pointer = vmcs_read64(EPT_POINTER);
+
+	dst->guest_physical_address = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
+	dst->vmcs_link_pointer = vmcs_read64(VMCS_LINK_POINTER);
+	dst->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
+	if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
+		dst->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
+	if (enable_ept) {
+		dst->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
+		dst->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
+		dst->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
+		dst->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
+	}
+	dst->pin_based_vm_exec_control = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
+	dst->cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+	dst->exception_bitmap = vmcs_read32(EXCEPTION_BITMAP);
+	dst->page_fault_error_code_mask =
+		vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK);
+	dst->page_fault_error_code_match =
+		vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH);
+	dst->cr3_target_count = vmcs_read32(CR3_TARGET_COUNT);
+	dst->vm_exit_controls = vmcs_read32(VM_EXIT_CONTROLS);
+	dst->vm_exit_msr_store_count = vmcs_read32(VM_EXIT_MSR_STORE_COUNT);
+	dst->vm_exit_msr_load_count = vmcs_read32(VM_EXIT_MSR_LOAD_COUNT);
+	dst->vm_entry_controls = vmcs_read32(VM_ENTRY_CONTROLS);
+	dst->vm_entry_msr_load_count = vmcs_read32(VM_ENTRY_MSR_LOAD_COUNT);
+	dst->vm_entry_intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
+	dst->vm_entry_exception_error_code =
+		vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE);
+	dst->vm_entry_instruction_len = vmcs_read32(VM_ENTRY_INSTRUCTION_LEN);
+	dst->tpr_threshold = vmcs_read32(TPR_THRESHOLD);
+	dst->secondary_vm_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+	if (enable_vpid && dst->secondary_vm_exec_control &
+	    SECONDARY_EXEC_ENABLE_VPID)
+		dst->virtual_processor_id = vmcs_read16(VIRTUAL_PROCESSOR_ID);
+	dst->vm_instruction_error = vmcs_read32(VM_INSTRUCTION_ERROR);
+	dst->vm_exit_reason  = vmcs_read32(VM_EXIT_REASON);
+	dst->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+	dst->vm_exit_intr_error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+	dst->idt_vectoring_info_field = vmcs_read32(IDT_VECTORING_INFO_FIELD);
+	dst->idt_vectoring_error_code = vmcs_read32(IDT_VECTORING_ERROR_CODE);
+	dst->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+	dst->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+	dst->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
+	dst->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
+	dst->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
+	dst->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
+	dst->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
+	dst->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
+	dst->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
+	dst->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
+	dst->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
+	dst->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
+	dst->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
+	dst->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
+	dst->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
+	dst->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
+	dst->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
+	dst->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
+	dst->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
+	dst->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
+	dst->guest_interruptibility_info =
+		vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
+	dst->guest_activity_state = vmcs_read32(GUEST_ACTIVITY_STATE);
+	dst->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
+	dst->host_ia32_sysenter_cs = vmcs_read32(HOST_IA32_SYSENTER_CS);
+	dst->cr0_guest_host_mask = vmcs_readl(CR0_GUEST_HOST_MASK);
+	dst->cr4_guest_host_mask = vmcs_readl(CR4_GUEST_HOST_MASK);
+	dst->cr0_read_shadow = vmcs_readl(CR0_READ_SHADOW);
+	dst->cr4_read_shadow = vmcs_readl(CR4_READ_SHADOW);
+	dst->cr3_target_value0 = vmcs_readl(CR3_TARGET_VALUE0);
+	dst->cr3_target_value1 = vmcs_readl(CR3_TARGET_VALUE1);
+	dst->cr3_target_value2 = vmcs_readl(CR3_TARGET_VALUE2);
+	dst->cr3_target_value3 = vmcs_readl(CR3_TARGET_VALUE3);
+	dst->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+	dst->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
+	dst->guest_cr0 = vmcs_readl(GUEST_CR0);
+	dst->guest_cr3 = vmcs_readl(GUEST_CR3);
+	dst->guest_cr4 = vmcs_readl(GUEST_CR4);
+	dst->guest_es_base = vmcs_readl(GUEST_ES_BASE);
+	dst->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
+	dst->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
+	dst->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
+	dst->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
+	dst->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
+	dst->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
+	dst->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
+	dst->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
+	dst->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
+	dst->guest_dr7 = vmcs_readl(GUEST_DR7);
+	dst->guest_rsp = vmcs_readl(GUEST_RSP);
+	dst->guest_rip = vmcs_readl(GUEST_RIP);
+	dst->guest_rflags = vmcs_readl(GUEST_RFLAGS);
+	dst->guest_pending_dbg_exceptions =
+		vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
+	dst->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
+	dst->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
+	dst->host_cr0 = vmcs_readl(HOST_CR0);
+	dst->host_cr3 = vmcs_readl(HOST_CR3);
+	dst->host_cr4 = vmcs_readl(HOST_CR4);
+	dst->host_fs_base = vmcs_readl(HOST_FS_BASE);
+	dst->host_gs_base = vmcs_readl(HOST_GS_BASE);
+	dst->host_tr_base = vmcs_readl(HOST_TR_BASE);
+	dst->host_gdtr_base = vmcs_readl(HOST_GDTR_BASE);
+	dst->host_idtr_base = vmcs_readl(HOST_IDTR_BASE);
+	dst->host_ia32_sysenter_esp = vmcs_readl(HOST_IA32_SYSENTER_ESP);
+	dst->host_ia32_sysenter_eip = vmcs_readl(HOST_IA32_SYSENTER_EIP);
+	dst->host_rsp = vmcs_readl(HOST_RSP);
+	dst->host_rip = vmcs_readl(HOST_RIP);
+	if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT)
+		dst->host_ia32_pat = vmcs_read64(HOST_IA32_PAT);
+}
+
+int prepare_vmcs_02(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct shadow_vmcs *src, *l1_shadow_vmcs = vmx->nested.l1_shadow_vmcs;
+	struct level_state *l2_state;
+	u32 exec_control;
+
+	src = get_shadow_vmcs(vcpu);
+	if (!src) {
+		nested_unmap_current(vcpu);
+		printk(KERN_INFO "%s: Error no shadow vmcs\n", __func__);
+		return 1;
+	}
+
+	load_vmcs_common(src);
+
+	l2_state = &(vmx->nested.current_l2_page->l2_state);
+
+	if (l2_state->first_launch) {
+
+		vmcs_write64(VMCS_LINK_POINTER, src->vmcs_link_pointer);
+
+		if (l2_state->io_bitmap_a)
+			vmcs_write64(IO_BITMAP_A, l2_state->io_bitmap_a);
+
+		if (l2_state->io_bitmap_b)
+			vmcs_write64(IO_BITMAP_B, l2_state->io_bitmap_b);
+
+		if (l2_state->msr_bitmap)
+			vmcs_write64(MSR_BITMAP, l2_state->msr_bitmap);
+
+		if (src->vm_entry_msr_load_count > 0) {
+			struct page *page;
+
+			page = nested_get_page(vcpu,
+					       src->vm_entry_msr_load_addr);
+			if (!page)
+				return 1;
+
+			vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, page_to_phys(page));
+
+			kvm_release_page_clean(page);
+		}
+
+		if (nested_cpu_has_vmx_tpr_shadow(vcpu)) {
+			struct page *page;
+
+			page = nested_get_page(vcpu,
+					       src->virtual_apic_page_addr);
+			if (!page)
+				return 1;
+
+			vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, page_to_phys(page));
+
+			kvm_release_page_clean(page);
+		}
+
+		if (nested_vm_need_virtualize_apic_accesses(vcpu)) {
+			struct page *page =
+				nested_get_page(vcpu, src->apic_access_addr);
+			if (!page)
+				return 1;
+
+			vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(page));
+			kvm_release_page_clean(page);
+		}
+
+		vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
+			     (l1_shadow_vmcs->pin_based_vm_exec_control |
+			      src->pin_based_vm_exec_control));
+
+		vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
+			     (l1_shadow_vmcs->page_fault_error_code_mask &
+			      src->page_fault_error_code_mask));
+
+		vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
+			     (l1_shadow_vmcs->page_fault_error_code_match &
+			      src->page_fault_error_code_match));
+
+		if (cpu_has_secondary_exec_ctrls()) {
+
+			exec_control =
+				l1_shadow_vmcs->secondary_vm_exec_control;
+
+			if (nested_cpu_has_secondary_exec_ctrls(vcpu)) {
+
+				exec_control |= src->secondary_vm_exec_control;
+
+				if (!vm_need_virtualize_apic_accesses(vcpu->kvm) ||
+				    !nested_vm_need_virtualize_apic_accesses(vcpu))
+					exec_control &=
+						~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+			}
+
+			vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
+		}
+
+		load_vmcs_host_state(l1_shadow_vmcs);
+
+		l2_state->first_launch = false;
+	}
+
+	if (vm_need_tpr_shadow(vcpu->kvm) &&
+	    nested_cpu_has_vmx_tpr_shadow(vcpu))
+		vmcs_write32(TPR_THRESHOLD, src->tpr_threshold);
+
+	if (enable_ept) {
+		if (!nested_cpu_has_vmx_ept(vcpu)) {
+			vmcs_write64(EPT_POINTER,
+				     l1_shadow_vmcs->ept_pointer);
+			vmcs_write64(GUEST_PDPTR0,
+				     l1_shadow_vmcs->guest_pdptr0);
+			vmcs_write64(GUEST_PDPTR1,
+				     l1_shadow_vmcs->guest_pdptr1);
+			vmcs_write64(GUEST_PDPTR2,
+				     l1_shadow_vmcs->guest_pdptr2);
+			vmcs_write64(GUEST_PDPTR3,
+				     l1_shadow_vmcs->guest_pdptr3);
+		}
+	}
+
+	exec_control = l1_shadow_vmcs->cpu_based_vm_exec_control;
+	exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
+	exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
+	exec_control &= ~CPU_BASED_TPR_SHADOW;
+	exec_control |= src->cpu_based_vm_exec_control;
+
+	if (!vm_need_tpr_shadow(vcpu->kvm) ||
+	    src->virtual_apic_page_addr == 0) {
+		exec_control &= ~CPU_BASED_TPR_SHADOW;
+#ifdef CONFIG_X86_64
+		exec_control |= CPU_BASED_CR8_STORE_EXITING |
+			CPU_BASED_CR8_LOAD_EXITING;
+#endif
+	} else if (exec_control & CPU_BASED_TPR_SHADOW) {
+
+#ifdef CONFIG_X86_64
+		exec_control &= ~CPU_BASED_CR8_STORE_EXITING;
+		exec_control &= ~CPU_BASED_CR8_LOAD_EXITING;
+#endif
+	}
+
+	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
+
+	vmcs_write32(EXCEPTION_BITMAP,
+		     (l1_shadow_vmcs->exception_bitmap |
+		      src->exception_bitmap));
+
+	vmcs_write32(VM_EXIT_CONTROLS,
+		     ((l1_shadow_vmcs->vm_exit_controls &
+		       NESTED_VM_EXIT_CONTROLS_MASK) | src->vm_exit_controls));
+
+	vmcs_write32(VM_ENTRY_CONTROLS,
+		     (l1_shadow_vmcs->vm_entry_controls &
+		      NESTED_VM_ENTRY_CONTROLS_MASK) | src->vm_entry_controls);
+
+	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, src->vm_entry_msr_load_count);
+
+	vmcs_writel(CR0_GUEST_HOST_MASK,
+		     src->cr0_guest_host_mask | X86_CR0_TS);
+	vmcs_writel(CR4_GUEST_HOST_MASK,
+		    (l1_shadow_vmcs->cr4_guest_host_mask  &
+		     src->cr4_guest_host_mask));
+
+	return 0;
+}
+
+int switch_back_vmcs(struct kvm_vcpu *vcpu)
+{
+	struct shadow_vmcs *src = to_vmx(vcpu)->nested.l1_shadow_vmcs;
+
+	if (enable_vpid && src->virtual_processor_id != 0)
+		vmcs_write16(VIRTUAL_PROCESSOR_ID, src->virtual_processor_id);
+
+	vmcs_write64(IO_BITMAP_A, src->io_bitmap_a);
+	vmcs_write64(IO_BITMAP_B, src->io_bitmap_b);
+
+	if (cpu_has_vmx_msr_bitmap())
+		vmcs_write64(MSR_BITMAP, src->msr_bitmap);
+
+	vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, src->virtual_apic_page_addr);
+
+	if (vm_need_virtualize_apic_accesses(vcpu->kvm))
+		vmcs_write64(APIC_ACCESS_ADDR,
+			     src->apic_access_addr);
+
+	if (enable_ept) {
+		vmcs_write64(EPT_POINTER, src->ept_pointer);
+		vmcs_write64(GUEST_PDPTR0, src->guest_pdptr0);
+		vmcs_write64(GUEST_PDPTR1, src->guest_pdptr1);
+		vmcs_write64(GUEST_PDPTR2, src->guest_pdptr2);
+		vmcs_write64(GUEST_PDPTR3, src->guest_pdptr3);
+	}
+
+	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, src->pin_based_vm_exec_control);
+	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, src->cpu_based_vm_exec_control);
+	vmcs_write32(EXCEPTION_BITMAP, src->exception_bitmap);
+	vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
+		     src->page_fault_error_code_mask);
+	vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
+		     src->page_fault_error_code_match);
+	vmcs_write32(VM_EXIT_CONTROLS, src->vm_exit_controls);
+	vmcs_write32(VM_ENTRY_CONTROLS, src->vm_entry_controls);
+	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, src->vm_entry_msr_load_count);
+
+	if (cpu_has_secondary_exec_ctrls())
+		vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
+			     src->secondary_vm_exec_control);
+
+	load_vmcs_common(src);
+
+	load_vmcs_host_state(to_vmx(vcpu)->nested.l1_shadow_vmcs);
+
+	return 0;
+}
+
+void sync_cached_regs_to_vmcs(struct kvm_vcpu *vcpu)
+{
+	unsigned long mask;
+
+	if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
+		vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
+	if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
+		vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
+
+	mask = ~((1 << VCPU_REGS_RSP) | (1 << VCPU_REGS_RIP));
+
+	if (vcpu->arch.regs_dirty & mask) {
+		printk(KERN_INFO "WARNING: dirty cached registers regs_dirty 0x%x mask 0x%lx\n",
+		       vcpu->arch.regs_dirty, mask);
+		WARN_ON(1);
+	}
+
+	vcpu->arch.regs_dirty = 0;
+}
+
+static int nested_vmx_run(struct kvm_vcpu *vcpu)
+{
+	/* verify that l1 has done vmptrld for l2 earlier */
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	int r = 0;
+	struct level_state *l2_state;
+
+	vmx->nested.nested_mode = 1;
+
+	sync_cached_regs_to_vmcs(vcpu);
+
+	save_vmcs(vmx->nested.l1_shadow_vmcs);
+
+	vmx->nested.l1_state->shadow_efer = vcpu->arch.shadow_efer;
+	if (!enable_ept)
+		vmx->nested.l1_state->cr3 = vcpu->arch.cr3;
+	vmx->nested.l1_state->cr4 = vcpu->arch.cr4;
+
+	if (!nested_map_current(vcpu)) {
+		set_rflags_to_vmx_fail_valid(vcpu);
+		return 1;
+	}
+
+	l2_state = &(vmx->nested.current_l2_page->l2_state);
+
+	if (cpu_has_vmx_msr_bitmap())
+		vmx->nested.l1_state->msr_bitmap = vmcs_read64(MSR_BITMAP);
+	else
+		vmx->nested.l1_state->msr_bitmap = 0;
+
+	vmx->nested.l1_state->io_bitmap_a = vmcs_read64(IO_BITMAP_A);
+	vmx->nested.l1_state->io_bitmap_b = vmcs_read64(IO_BITMAP_B);
+	vmx->nested.l1_vmcs = vmx->vmcs;
+	vmx->nested.l1_state->cpu = vcpu->cpu;
+	vmx->nested.l1_state->launched = vmx->launched;
+
+	vmx->vmcs = nested_get_current_vmcs(vcpu);
+	if (!vmx->vmcs) {
+		printk(KERN_ERR "Missing VMCS\n");
+		set_rflags_to_vmx_fail_valid(vcpu);
+		return 1;
+	}
+
+	vcpu->cpu = l2_state->cpu;
+	vmx->launched = l2_state->launched;
+
+	if (l2_state->vmclear || !vmx->launched) {
+		vmcs_clear(vmx->vmcs);
+		vmx->launched = 0;
+		l2_state->vmclear = 0;
+	}
+
+	vmx_vcpu_load(vcpu, get_cpu());
+	put_cpu();
+
+	prepare_vmcs_02(vcpu);
+
+	if (get_shadow_vmcs(vcpu)->vm_entry_controls &
+	    VM_ENTRY_IA32E_MODE) {
+		if (!((vcpu->arch.shadow_efer & EFER_LMA) &&
+		      (vcpu->arch.shadow_efer & EFER_LME)))
+			vcpu->arch.shadow_efer |= (EFER_LMA | EFER_LME);
+	} else {
+		if ((vcpu->arch.shadow_efer & EFER_LMA) ||
+		    (vcpu->arch.shadow_efer & EFER_LME))
+			vcpu->arch.shadow_efer = 0;
+	}
+
+	vmx_set_cr0(vcpu,
+		    (get_shadow_vmcs(vcpu)->guest_cr0 & ~get_shadow_vmcs(vcpu)->cr0_guest_host_mask) |
+		    (get_shadow_vmcs(vcpu)->cr0_read_shadow & get_shadow_vmcs(vcpu)->cr0_guest_host_mask));
+
+	vmx_set_cr4(vcpu, get_shadow_vmcs(vcpu)->guest_cr4);
+	vmcs_writel(CR4_READ_SHADOW,
+		    get_shadow_vmcs(vcpu)->cr4_read_shadow);
+
+	vcpu->arch.cr0 |= X86_CR0_PG;
+
+	if (enable_ept && !nested_cpu_has_vmx_ept(vcpu)) {
+		vmcs_writel(GUEST_CR3, get_shadow_vmcs(vcpu)->guest_cr3);
+		vmx->vcpu.arch.cr3 = get_shadow_vmcs(vcpu)->guest_cr3;
+	} else {
+		kvm_set_cr3(vcpu, get_shadow_vmcs(vcpu)->guest_cr3);
+		kvm_mmu_reset_context(vcpu);
+
+		nested_unmap_current(vcpu);
+
+		r = kvm_mmu_load(vcpu);
+		if (unlikely(r)) {
+			printk(KERN_ERR "Error in kvm_mmu_load r %d\n", r);
+			set_rflags_to_vmx_fail_valid(vcpu);
+			/* switch back to L1 */
+			vmx->nested.nested_mode = 0;
+			vmx->vmcs = vmx->nested.l1_vmcs;
+			vcpu->cpu = vmx->nested.l1_state->cpu;
+			vmx->launched = vmx->nested.l1_state->launched;
+
+			vmx_vcpu_load(vcpu, get_cpu());
+			put_cpu();
+
+			return 1;
+		}
+
+		nested_map_current(vcpu);
+	}
+
+	kvm_register_write(vcpu, VCPU_REGS_RSP,
+			   get_shadow_vmcs(vcpu)->guest_rsp);
+	kvm_register_write(vcpu, VCPU_REGS_RIP,
+			   get_shadow_vmcs(vcpu)->guest_rip);
+
+	vmcs_write32(EXCEPTION_BITMAP,
+		     (vmx->nested.l1_shadow_vmcs->exception_bitmap |
+		      get_shadow_vmcs(vcpu)->exception_bitmap));
+
+	nested_unmap_current(vcpu);
+
+	return 1;
+}
+
+static int launch_guest(struct kvm_vcpu *vcpu, bool launch)
+{
+	if (!nested_vmx_check_permission(vcpu))
+		return 1;
+
+	if (!nested_map_current(vcpu))
+		return 1;
+
+	if (to_vmx(vcpu)->nested.current_l2_page->l2_state.vmclear != launch) {
+		set_rflags_to_vmx_fail_valid(vcpu);
+		nested_unmap_current(vcpu);
+		return 1;
+	}
+
+	nested_unmap_current(vcpu);
+
+	skip_emulated_instruction(vcpu);
+
+	nested_vmx_run(vcpu);
+
+	return 1;
+}
+
 static struct kvm_x86_ops vmx_x86_ops = {
 	.cpu_has_kvm_support = cpu_has_kvm_support,
 	.disabled_by_bios = vmx_disabled_by_bios,
-- 
1.6.0.4


^ permalink raw reply related	[flat|nested] 24+ messages in thread
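
On VMLAUNCH/VMRESUME, nested_vmx_run() above saves the l1 state (save_vmcs()),
switches to the vmcs l1 prepared for l2 and then builds the hardware state as a
merge of the l1 shadow vmcs and that l2 shadow vmcs (prepare_vmcs_02()). The
CR0 value handed to vmx_set_cr0() follows the usual guest/host mask rule: bits
covered by l1's cr0_guest_host_mask are taken from the cr0_read_shadow l1 set
up (what l2 is supposed to see), the rest from l1's guest_cr0 field. A sketch
of just that merge (the helper name is illustrative; the patch open-codes it):

	static unsigned long nested_guest_cr0(struct shadow_vmcs *vmcs12)
	{
		unsigned long mask = vmcs12->cr0_guest_host_mask;

		/* masked bits come from the read shadow, the rest from guest_cr0 */
		return (vmcs12->guest_cr0 & ~mask) |
		       (vmcs12->cr0_read_shadow & mask);
	}

prepare_vmcs_12() applies the inverse for CR0.TS on the way back to l1: if l1
did not ask to trap TS writes, the TS value l0 tracked in CR0_READ_SHADOW is
reported in guest_cr0, as if l2 had written it directly.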

* [PATCH 7/7] Nested VMX patch 7 handling of nested guest exits
  2009-12-10 18:38           ` [PATCH 6/7] Nested VMX patch 6 implements vmlaunch and vmresume oritw
@ 2009-12-10 18:38             ` oritw
  2009-12-17 13:46               ` Avi Kivity
  2009-12-17 10:10             ` [PATCH 6/7] Nested VMX patch 6 implements vmlaunch and vmresume Avi Kivity
  1 sibling, 1 reply; 24+ messages in thread
From: oritw @ 2009-12-10 18:38 UTC (permalink / raw)
  To: avi; +Cc: kvm, oritw, benami, abelg, muli, aliguori, mdday

From: Orit Wasserman <oritw@il.ibm.com>

---
 arch/x86/kvm/vmx.c |  521 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 files changed, 515 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 0d36b49..203f016 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -262,6 +262,10 @@ struct nested_vmx {
 	gpa_t current_vmptr;
 	/* Are we running nested guest */
 	bool nested_mode;
+	/* L1 requested VMLAUNCH or VMRESUME but we didn't run L2 yet */
+	bool nested_run_pending;
+	/* was the IDT-vectoring info valid on the last exit from l2 */
+	bool valid_idt_vectoring_info;
 	/* Level 1 state for switching to level 2 and back */
 	struct level_state *l1_state;
 	/* Level 1 shadow vmcs for switching to level 2 and back */
@@ -908,9 +912,16 @@ static u64 host_efer;
 static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
 
 static int nested_vmx_check_permission(struct kvm_vcpu *vcpu);
+static int nested_vmx_check_exception(struct vcpu_vmx *vmx, unsigned nr,
+				      bool has_error_code, u32 error_code);
+static int nested_vmx_intr(struct kvm_vcpu *vcpu);
 static int create_l1_state(struct kvm_vcpu *vcpu);
 static int create_l2_state(struct kvm_vcpu *vcpu);
 static int launch_guest(struct kvm_vcpu *vcpu, bool launch);
+static int nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu);
+static int nested_vmx_exit_handled(struct kvm_vcpu *vcpu, bool kvm_override);
+static int nested_vmx_vmexit(struct kvm_vcpu *vcpu,
+			     bool is_interrupt);
 
 /*
  * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it
@@ -1467,6 +1478,8 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx)
 
 static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu);
 
+int load_vmcs_host_state(struct shadow_vmcs *src);
+
 /*
  * Switches to specified vcpu, until a matching vcpu_put(), but assumes
  * vcpu mutex is already taken.
@@ -1503,6 +1516,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 	if (vcpu->cpu != cpu) {
 		struct descriptor_table dt;
 		unsigned long sysenter_esp;
+		struct shadow_vmcs *l1_shadow_vmcs = vmx->nested.l1_shadow_vmcs;
 
 		vcpu->cpu = cpu;
 		/*
@@ -1525,6 +1539,22 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 			new_offset = vmcs_read64(TSC_OFFSET) + delta;
 			vmcs_write64(TSC_OFFSET, new_offset);
 		}
+
+		if (l1_shadow_vmcs != NULL) {
+			l1_shadow_vmcs->host_tr_base =
+				vmcs_readl(HOST_TR_BASE);
+			l1_shadow_vmcs->host_gdtr_base =
+				vmcs_readl(HOST_GDTR_BASE);
+			l1_shadow_vmcs->host_ia32_sysenter_esp =
+				vmcs_readl(HOST_IA32_SYSENTER_ESP);
+
+			if (tsc_this < vcpu->arch.host_tsc)
+				l1_shadow_vmcs->tsc_offset =
+					vmcs_read64(TSC_OFFSET);
+
+			if (vmx->nested.nested_mode)
+				load_vmcs_host_state(l1_shadow_vmcs);
+		}
 	}
 }
 
@@ -1611,6 +1641,9 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	u32 intr_info = nr | INTR_INFO_VALID_MASK;
 
+	if (nested_vmx_check_exception(vmx, nr, has_error_code, error_code))
+		return;
+
 	if (has_error_code) {
 		vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
 		intr_info |= INTR_INFO_DELIVER_CODE_MASK;
@@ -2185,9 +2218,6 @@ int load_vmcs_common(struct shadow_vmcs *src)
 	if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
 		vmcs_write64(GUEST_IA32_PAT, src->guest_ia32_pat);
 
-	if (src->vm_entry_msr_load_count < 512)
-		vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, src->vm_entry_msr_load_count);
-
 	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, src->vm_entry_intr_info_field);
 	vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
 		     src->vm_entry_exception_error_code);
@@ -3794,6 +3824,11 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
 {
 	u32 cpu_based_vm_exec_control;
 
+	if (to_vmx(vcpu)->nested.nested_mode) {
+		nested_vmx_intr(vcpu);
+		return;
+	}
+
 	cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
 	cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
 	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
@@ -3922,6 +3957,11 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
 
 static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
 {
+	if (to_vmx(vcpu)->nested.nested_mode) {
+		if (!nested_vmx_intr(vcpu))
+			return 0;
+	}
+
 	return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
 		!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
 			(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
@@ -4042,6 +4082,10 @@ static int handle_exception(struct kvm_vcpu *vcpu)
 		   not interested (exception bitmap 12 does not include NM_VECTOR)
 		   enable fpu and resume l2 (avoid switching to l1)
 		*/
+
+		if (vmx->nested.nested_mode)
+			vmx->nested.nested_run_pending = 1; /* without this, l2 hangs on boot */
+
 		vmx_fpu_activate(vcpu);
 
 		return 1;
@@ -4169,7 +4213,33 @@ static int handle_cr(struct kvm_vcpu *vcpu)
 		trace_kvm_cr_write(cr, val);
 		switch (cr) {
 		case 0:
-			kvm_set_cr0(vcpu, val);
+			if (to_vmx(vcpu)->nested.nested_mode) {
+				/* assume only X86_CR0_TS is handled by l0 */
+				unsigned long new_cr0 = vmcs_readl(GUEST_CR0);
+				unsigned long new_cr0_read_shadow =
+					vmcs_readl(CR0_READ_SHADOW);
+
+				vmx_fpu_deactivate(vcpu);
+
+				if (val & X86_CR0_TS) {
+					new_cr0 |= X86_CR0_TS;
+					new_cr0_read_shadow |= X86_CR0_TS;
+					vcpu->arch.cr0 |= X86_CR0_TS;
+				} else {
+					new_cr0 &= ~X86_CR0_TS;
+					new_cr0_read_shadow &= ~X86_CR0_TS;
+					vcpu->arch.cr0 &= ~X86_CR0_TS;
+				}
+
+				vmcs_writel(GUEST_CR0, new_cr0);
+				vmcs_writel(CR0_READ_SHADOW, new_cr0_read_shadow);
+
+				if (!(val & X86_CR0_TS) || !(val & X86_CR0_PE))
+					vmx_fpu_activate(vcpu);
+
+				to_vmx(vcpu)->nested.nested_run_pending = 1;
+			} else
+				kvm_set_cr0(vcpu, val);
+
 			skip_emulated_instruction(vcpu);
 			return 1;
 		case 3:
@@ -4196,8 +4266,15 @@ static int handle_cr(struct kvm_vcpu *vcpu)
 		break;
 	case 2: /* clts */
 		vmx_fpu_deactivate(vcpu);
-		vcpu->arch.cr0 &= ~X86_CR0_TS;
-		vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
+		if (to_vmx(vcpu)->nested.nested_mode) {
+			vmcs_writel(GUEST_CR0, vmcs_readl(GUEST_CR0) & ~X86_CR0_TS);
+			vmcs_writel(CR0_READ_SHADOW, vmcs_readl(CR0_READ_SHADOW) & ~X86_CR0_TS);
+			vcpu->arch.cr0 &= ~X86_CR0_TS;
+			to_vmx(vcpu)->nested.nested_run_pending = 1;
+		} else {
+			vcpu->arch.cr0 &= ~X86_CR0_TS;
+			vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
+		}
 		vmx_fpu_activate(vcpu);
 		skip_emulated_instruction(vcpu);
 		return 1;
@@ -5173,6 +5250,17 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 	if (vmx->emulation_required && emulate_invalid_guest_state)
 		return handle_invalid_guest_state(vcpu);
 
+	if (exit_reason == EXIT_REASON_VMLAUNCH ||
+	    exit_reason == EXIT_REASON_VMRESUME)
+		vmx->nested.nested_run_pending = 1;
+	else
+		vmx->nested.nested_run_pending = 0;
+
+	if (vmx->nested.nested_mode && nested_vmx_exit_handled(vcpu, true)) {
+		nested_vmx_vmexit(vcpu, false);
+		return 1;
+	}
+
 	/* Access CR3 don't cause VMExit in paging mode, so we need
 	 * to sync with guest real CR3. */
 	if (enable_ept && is_paging(vcpu))
@@ -5347,6 +5435,60 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx)
 		| vmx->rmode.irq.vector;
 }
 
+static int nested_handle_valid_idt(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	int irq;
+	int type;
+	int errCodeValid;
+	u32 idt_vectoring_info;
+	u32 guest_intr;
+	bool nmi_window_open;
+	bool interrupt_window_open;
+
+	if (vmx->nested.valid_idt_vectoring_info) {
+		idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
+		irq  = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
+		type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
+		errCodeValid = idt_vectoring_info &
+			VECTORING_INFO_DELIVER_CODE_MASK;
+
+		guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
+		nmi_window_open =
+			!(guest_intr & (GUEST_INTR_STATE_STI |
+					GUEST_INTR_STATE_MOV_SS |
+					GUEST_INTR_STATE_NMI));
+
+		interrupt_window_open =
+			((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
+			 !(guest_intr & (GUEST_INTR_STATE_STI |
+					 GUEST_INTR_STATE_MOV_SS)));
+
+		if (type == INTR_TYPE_EXT_INTR && !interrupt_window_open) {
+			printk(KERN_INFO "IDT ignored, l2 interrupt window closed!\n");
+			return 0;
+		}
+
+		if (type == INTR_TYPE_NMI_INTR && !nmi_window_open) {
+			printk(KERN_INFO "IDT ignored, l2 nmi window closed!\n");
+			return 0;
+		}
+
+		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
+			irq | type | INTR_INFO_VALID_MASK | errCodeValid);
+
+		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
+			     vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
+
+		if (errCodeValid)
+			vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
+				     vmcs_read32(IDT_VECTORING_ERROR_CODE));
+	}
+
+	return 1;
+}
+
 #ifdef CONFIG_X86_64
 #define R "r"
 #define Q "q"
@@ -5358,8 +5500,17 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx)
 static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	int r;
 	u32 nested_exception_bitmap = 0;
 
+	if (vmx->nested.nested_mode) {
+		r = nested_handle_valid_idt(vcpu);
+		if (!r) {
+			vmx->fail = 1;
+			return;
+		}
+	}
+
 	/* Record the guest's net vcpu time for enforced NMI injections. */
 	if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
 		vmx->entry_time = ktime_get();
@@ -5539,6 +5690,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
 
 	vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
 
+	vmx->nested.valid_idt_vectoring_info = vmx->nested.nested_mode &&
+		(vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
+
 	if (vmx->rmode.irq.pending)
 		fixup_rmode_irq(vmx);
 
@@ -6191,6 +6345,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu)
 		r = kvm_mmu_load(vcpu);
 		if (unlikely(r)) {
 			printk(KERN_ERR "Error in kvm_mmu_load r %d\n", r);
+			nested_vmx_vmexit(vcpu, false);
 			set_rflags_to_vmx_fail_valid(vcpu);
 			/* switch back to L1 */
 			vmx->nested.nested_mode = 0;
@@ -6244,6 +6399,360 @@ static int launch_guest(struct kvm_vcpu *vcpu, bool launch)
 	return 1;
 }
 
+static int nested_vmx_vmexit(struct kvm_vcpu *vcpu,
+			     bool is_interrupt)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct level_state *l2_state;
+	int efer_offset;
+
+	if (!vmx->nested.nested_mode) {
+		printk(KERN_INFO "WARNING: %s called but not in nested mode\n",
+		       __func__);
+		return 0;
+	}
+
+	sync_cached_regs_to_vmcs(vcpu);
+
+	if (!nested_map_current(vcpu)) {
+		printk(KERN_INFO "Error mapping shadow vmcs\n");
+		set_rflags_to_vmx_fail_valid(vcpu);
+		return 1;
+	}
+
+	l2_state = &(vmx->nested.current_l2_page->l2_state);
+	prepare_vmcs_12(vcpu);
+	if (is_interrupt)
+		get_shadow_vmcs(vcpu)->vm_exit_reason =
+			EXIT_REASON_EXTERNAL_INTERRUPT;
+
+	l2_state->launched = vmx->launched;
+	l2_state->cpu = vcpu->cpu;
+
+	nested_unmap_current(vcpu);
+
+	vmx->vmcs = vmx->nested.l1_vmcs;
+	vcpu->cpu = vmx->nested.l1_state->cpu;
+	vmx->launched = vmx->nested.l1_state->launched;
+
+	vmx_vcpu_load(vcpu, get_cpu());
+	put_cpu();
+
+	vcpu->arch.shadow_efer = vmx->nested.l1_state->shadow_efer;
+	if ((vcpu->arch.shadow_efer & EFER_LMA) &&
+	    !(vcpu->arch.shadow_efer & EFER_SCE))
+		vcpu->arch.shadow_efer |= EFER_SCE;
+
+	efer_offset = __find_msr_index(vmx, MSR_EFER);
+	if (update_transition_efer(vmx, efer_offset))
+		wrmsrl(MSR_EFER, vmx->guest_msrs[efer_offset].data);
+
+	vmx_set_cr0(vcpu, vmx->nested.l1_shadow_vmcs->cr0_read_shadow);
+	vmx_set_cr4(vcpu, vmx->nested.l1_state->cr4);
+
+	if (enable_ept) {
+		vcpu->arch.cr3 = vmx->nested.l1_shadow_vmcs->guest_cr3;
+		vmcs_write32(GUEST_CR3, vmx->nested.l1_shadow_vmcs->guest_cr3);
+	} else {
+		kvm_set_cr3(vcpu, vmx->nested.l1_state->cr3);
+	}
+
+	if (!nested_map_current(vcpu)) {
+		printk(KERN_INFO "Error mapping shadow vmcs\n");
+		set_rflags_to_vmx_fail_valid(vcpu);
+		return 1;
+	}
+
+	switch_back_vmcs(vcpu);
+
+	vmx->nested.l1_shadow_vmcs->guest_cr0 = get_shadow_vmcs(vcpu)->host_cr0;
+
+	nested_unmap_current(vcpu);
+
+	kvm_register_write(vcpu, VCPU_REGS_RSP,
+			   vmx->nested.l1_shadow_vmcs->guest_rsp);
+	kvm_register_write(vcpu, VCPU_REGS_RIP,
+			   vmx->nested.l1_shadow_vmcs->guest_rip);
+
+	vmx->nested.nested_mode = 0;
+
+	kvm_mmu_reset_context(vcpu);
+	kvm_mmu_load(vcpu);
+
+	if (unlikely(vmx->fail)) {
+		vmx->fail = 0;
+		set_rflags_to_vmx_fail_valid(vcpu);
+	} else
+		clear_rflags_cf_zf(vcpu);
+
+	return 0;
+}
+
+static int nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu)
+{
+	if (to_vmx(vcpu)->nested.nested_mode) {
+		struct page *msr_page = NULL;
+		u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX];
+		u32 exit_code = vmcs_read32(VM_EXIT_REASON);
+		struct shadow_vmcs *l2svmcs = get_shadow_vmcs(vcpu);
+
+		if (!cpu_has_vmx_msr_bitmap()
+		    || !nested_cpu_has_vmx_msr_bitmap(vcpu))
+			return 1;
+
+		msr_page = nested_get_page(vcpu,
+					   l2svmcs->msr_bitmap);
+
+		if (!msr_page) {
+			printk(KERN_INFO "%s error in nested_get_page\n",
+			       __func__);
+			return 0;
+		}
+
+		switch (exit_code) {
+		case EXIT_REASON_MSR_READ:
+			if (msr_index <= 0x1fff) {
+				if (test_bit(msr_index,
+					     (unsigned long *)(msr_page +
+							       0x000)))
+					return 1;
+			} else if ((msr_index >= 0xc0000000) &&
+				   (msr_index <= 0xc0001fff)) {
+				msr_index &= 0x1fff;
+				if (test_bit(msr_index,
+					     (unsigned long *)(msr_page +
+							       0x400)))
+					return 1;
+			}
+			break;
+		case EXIT_REASON_MSR_WRITE:
+			if (msr_index <= 0x1fff) {
+				if (test_bit(msr_index,
+					     (unsigned long *)(msr_page +
+							       0x800)))
+						return 1;
+			} else if ((msr_index >= 0xc0000000) &&
+				   (msr_index <= 0xc0001fff)) {
+				msr_index &= 0x1fff;
+				if (test_bit(msr_index,
+					     (unsigned long *)(msr_page +
+							       0xc00)))
+					return 1;
+			}
+			break;
+		}
+	}
+
+	return 0;
+}
+
+static int nested_vmx_exit_handled(struct kvm_vcpu *vcpu, bool kvm_override)
+{
+	u32 exit_code = vmcs_read32(VM_EXIT_REASON);
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+	struct shadow_vmcs *l2svmcs;
+
+	int r = 0;
+
+	if (vmx->nested.nested_run_pending)
+		return 0;
+
+	if (unlikely(vmx->fail)) {
+		printk(KERN_INFO "%s failed vm entry %x\n",
+		       __func__, vmcs_read32(VM_INSTRUCTION_ERROR));
+		return 1;
+	}
+
+	if (kvm_override) {
+		switch (exit_code) {
+		case EXIT_REASON_EXTERNAL_INTERRUPT:
+			return 0;
+		case EXIT_REASON_EXCEPTION_NMI:
+			if (!is_exception(intr_info))
+				return 0;
+
+			if (is_page_fault(intr_info) && (!enable_ept))
+				return 0;
+
+			break;
+		case EXIT_REASON_EPT_VIOLATION:
+			if (enable_ept)
+				return 0;
+
+			break;
+		}
+	}
+
+
+	if (!nested_map_current(vcpu))
+		return 0;
+
+	l2svmcs = get_shadow_vmcs(vcpu);
+
+	switch (exit_code) {
+	case EXIT_REASON_INVLPG:
+		if (l2svmcs->cpu_based_vm_exec_control &
+		    CPU_BASED_INVLPG_EXITING)
+			r = 1;
+		break;
+	case EXIT_REASON_MSR_READ:
+	case EXIT_REASON_MSR_WRITE:
+		r = nested_vmx_exit_handled_msr(vcpu);
+		break;
+	case EXIT_REASON_CR_ACCESS: {
+		unsigned long exit_qualification =
+			vmcs_readl(EXIT_QUALIFICATION);
+		int cr = exit_qualification & 15;
+		int reg = (exit_qualification >> 8) & 15;
+		unsigned long val = kvm_register_read(vcpu, reg);
+
+		switch ((exit_qualification >> 4) & 3) {
+		case 0: /* mov to cr */
+			switch (cr) {
+			case 0:
+				if (l2svmcs->cr0_guest_host_mask &
+				    (val ^ l2svmcs->cr0_read_shadow))
+					r = 1;
+				break;
+			case 3:
+				if (l2svmcs->cpu_based_vm_exec_control &
+				    CPU_BASED_CR3_LOAD_EXITING)
+					r = 1;
+				break;
+			case 4:
+				if (l2svmcs->cr4_guest_host_mask &
+				    (l2svmcs->cr4_read_shadow ^ val))
+					r = 1;
+				break;
+			case 8:
+				if (l2svmcs->cpu_based_vm_exec_control &
+				    CPU_BASED_CR8_LOAD_EXITING)
+					r = 1;
+				break;
+			}
+			break;
+		case 2: /* clts */
+			if (l2svmcs->cr0_guest_host_mask & X86_CR0_TS)
+				r = 1;
+			break;
+		case 1: /*mov from cr*/
+			switch (cr) {
+			case 0:
+				r = 1;
+			case 3:
+				if (l2svmcs->cpu_based_vm_exec_control &
+				    CPU_BASED_CR3_STORE_EXITING)
+					r = 1;
+				break;
+			case 4:
+				r = 1;
+				break;
+			case 8:
+				if (l2svmcs->cpu_based_vm_exec_control &
+				    CPU_BASED_CR8_STORE_EXITING)
+					r = 1;
+				break;
+			}
+			break;
+		case 3: /* lmsw */
+			if (l2svmcs->cr0_guest_host_mask &
+			    (val ^ l2svmcs->cr0_read_shadow))
+				r = 1;
+			break;
+		}
+		break;
+	}
+	case EXIT_REASON_DR_ACCESS: {
+		if (l2svmcs->cpu_based_vm_exec_control &
+		    CPU_BASED_MOV_DR_EXITING)
+			r = 1;
+		break;
+	}
+
+	case EXIT_REASON_EXCEPTION_NMI: {
+
+		if (is_external_interrupt(intr_info) &&
+		    (l2svmcs->pin_based_vm_exec_control &
+		     PIN_BASED_EXT_INTR_MASK))
+			r = 1;
+		else if (is_nmi(intr_info) &&
+		    (l2svmcs->pin_based_vm_exec_control &
+		     PIN_BASED_NMI_EXITING))
+			r = 1;
+		else if (is_exception(intr_info) &&
+		    (l2svmcs->exception_bitmap &
+		     (1u << (intr_info & INTR_INFO_VECTOR_MASK))))
+			r = 1;
+		else if (is_page_fault(intr_info))
+			r = 1;
+		break;
+	}
+
+	case EXIT_REASON_EXTERNAL_INTERRUPT:
+		if (l2svmcs->pin_based_vm_exec_control &
+		    PIN_BASED_EXT_INTR_MASK)
+			r = 1;
+		break;
+	default:
+		r = 1;
+	}
+	nested_unmap_current(vcpu);
+
+	return r;
+}
+
+static int nested_vmx_check_exception(struct vcpu_vmx *vmx, unsigned nr,
+				      bool has_error_code, u32 error_code)
+{
+	if (vmx->nested.nested_mode) {
+		if (nested_vmx_exit_handled(&vmx->vcpu, false)) {
+			nested_vmx_vmexit(&vmx->vcpu, false);
+			if (!nested_map_current(&vmx->vcpu))
+				return 1;
+			get_shadow_vmcs(&vmx->vcpu)->vm_exit_reason =
+				EXIT_REASON_EXCEPTION_NMI;
+			get_shadow_vmcs(&vmx->vcpu)->vm_exit_intr_info =
+				(nr | INTR_TYPE_HARD_EXCEPTION
+				 | (has_error_code ?
+				    INTR_INFO_DELIVER_CODE_MASK : 0)
+				 | INTR_INFO_VALID_MASK);
+
+			if (has_error_code)
+				get_shadow_vmcs(&vmx->vcpu)->
+					vm_exit_intr_error_code = error_code;
+			nested_unmap_current(&vmx->vcpu);
+			return 1;
+		}
+	}
+	return 0;
+}
+
+static int nested_vmx_intr(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	if (!nested_map_current(vcpu))
+		return 0;
+
+	if (get_shadow_vmcs(vcpu)->pin_based_vm_exec_control &
+	    PIN_BASED_EXT_INTR_MASK) {
+
+		if (vmx->nested.nested_run_pending) {
+			nested_unmap_current(vcpu);
+			return 0;
+		}
+
+		nested_unmap_current(vcpu);
+		nested_vmx_vmexit(vcpu, true);
+		return 1;
+	}
+
+	nested_unmap_current(vcpu);
+
+	return 0;
+}
+
 static struct kvm_x86_ops vmx_x86_ops = {
 	.cpu_has_kvm_support = cpu_has_kvm_support,
 	.disabled_by_bios = vmx_disabled_by_bios,
-- 
1.6.0.4


^ permalink raw reply related	[flat|nested] 24+ messages in thread

* Re: [PATCH 1/7] Nested VMX patch 1 implements vmon and vmoff
  2009-12-10 18:38 ` [PATCH 1/7] Nested VMX patch 1 implements vmon and vmoff oritw
  2009-12-10 18:38   ` [PATCH 2/7] Nested VMX patch 2 implements vmclear oritw
@ 2009-12-16 13:34   ` Avi Kivity
  2009-12-20 14:20   ` Gleb Natapov
  2 siblings, 0 replies; 24+ messages in thread
From: Avi Kivity @ 2009-12-16 13:34 UTC (permalink / raw)
  To: oritw; +Cc: kvm, benami, abelg, muli, aliguori, mdday

On 12/10/2009 08:38 PM, oritw@il.ibm.com wrote:
> From: Orit Wasserman <oritw@il.ibm.com>
>
>    

Missing changelog entry.  Please use the format common to all kvm patches.


> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> index 3de0b37..3f63cdd 100644
> --- a/arch/x86/kvm/svm.c
> +++ b/arch/x86/kvm/svm.c
> @@ -121,9 +121,6 @@ static int npt = 1;
>
>   module_param(npt, int, S_IRUGO);
>
> -static int nested = 1;
> -module_param(nested, int, S_IRUGO);
> -
>    

Separate moving 'nested' into a different patch.

> +struct __attribute__ ((__packed__)) level_state {
> +};
> +
> +struct nested_vmx {
> +	/* Has the level1 guest done vmxon? */
> +	bool vmxon;
> +	/* Level 1 state for switching to level 2 and back */
> +	struct level_state *l1_state;
>    

If this doesn't grow too large, can keep it as a member instead of a 
pointer.

> +};
> +
>
>   static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
> @@ -201,6 +214,7 @@ static struct kvm_vmx_segment_field {
>   static u64 host_efer;
>
>   static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
> +static int create_l1_state(struct kvm_vcpu *vcpu);
>
>   /*
>    * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it
> @@ -961,6 +975,95 @@ static void guest_write_tsc(u64 guest_tsc, u64 host_tsc)
>   }
>
>   /*
> + * Handles msr read for nested virtualization
> + */
> +static int nested_vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index,
> +			      u64 *pdata)
> +{
> +	u64 vmx_msr = 0;
> +
> +	switch (msr_index) {
> +	case MSR_IA32_FEATURE_CONTROL:
> +		*pdata = 0;
> +		break;
> +	case MSR_IA32_VMX_BASIC:
> +		*pdata = 0;
>    

Not needed.

> +		rdmsrl(MSR_IA32_VMX_BASIC, vmx_msr);
> +		*pdata = (vmx_msr & 0x00ffffcfffffffff);
>    

Please use symbolic constants.

> +		break;
> +	case MSR_IA32_VMX_PINBASED_CTLS:
> +		rdmsrl(MSR_IA32_VMX_PINBASED_CTLS, vmx_msr);
> +		*pdata = (PIN_BASED_EXT_INTR_MASK & vmcs_config.pin_based_exec_ctrl) |
> +			(PIN_BASED_NMI_EXITING & vmcs_config.pin_based_exec_ctrl) |
> +			(PIN_BASED_VIRTUAL_NMIS & vmcs_config.pin_based_exec_ctrl);
>    

Don't understand.  You read vmx_msr and then use vmcs_config?

> +	case MSR_IA32_VMX_PROCBASED_CTLS:
> +	{
> +		u32 vmx_msr_high, vmx_msr_low;
> +		u32 control = CPU_BASED_HLT_EXITING |
> +#ifdef CONFIG_X86_64
> +			CPU_BASED_CR8_LOAD_EXITING |
> +			CPU_BASED_CR8_STORE_EXITING |
> +#endif
> +			CPU_BASED_CR3_LOAD_EXITING |
> +			CPU_BASED_CR3_STORE_EXITING |
> +			CPU_BASED_USE_IO_BITMAPS |
> +			CPU_BASED_MOV_DR_EXITING |
> +			CPU_BASED_USE_TSC_OFFSETING |
> +			CPU_BASED_INVLPG_EXITING |
> +			CPU_BASED_TPR_SHADOW |
> +			CPU_BASED_USE_MSR_BITMAPS |
> +			CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
> +
> +		rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, vmx_msr_low, vmx_msr_high);
> +
> +		control &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
> +		control |= vmx_msr_low;  /* bit == 1 in low word  ==> must be one  */
> +
> +		*pdata = (CPU_BASED_HLT_EXITING & control) |
> +#ifdef CONFIG_X86_64
> +			(CPU_BASED_CR8_LOAD_EXITING & control) |
> +			(CPU_BASED_CR8_STORE_EXITING & control) |
> +#endif
> +			(CPU_BASED_CR3_LOAD_EXITING & control) |
> +			(CPU_BASED_CR3_STORE_EXITING & control) |
> +			(CPU_BASED_USE_IO_BITMAPS & control) |
> +			(CPU_BASED_MOV_DR_EXITING & control) |
> +			(CPU_BASED_USE_TSC_OFFSETING & control) |
> +			(CPU_BASED_INVLPG_EXITING & control);
>    

What about the high word of the msr?  Will it always allow 0?

>
>   /*
> + * Writes msr value for nested virtualization
> + * Returns 0 on success, non-0 otherwise.
> + */
> +static int nested_vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
> +{
> +	switch (msr_index) {
> +	case MSR_IA32_FEATURE_CONTROL:
> +		if ((data & (FEATURE_CONTROL_LOCKED |
> +			     FEATURE_CONTROL_VMXON_ENABLED))
> +		    != (FEATURE_CONTROL_LOCKED |
> +			FEATURE_CONTROL_VMXON_ENABLED))
> +			return 1;
> +		break;
>    

Need to trap if unsupported bits are set.

Need a way for userspace to write these msrs, so that live migration to 
an older kvm can work.  We do the same thing with cpuid - userspace sets 
cpuid to values that are common across the migration cluster.

> +static void free_l1_state(struct kvm_vcpu *vcpu)
> +{
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +
> +	if (!vmx->nested.l1_state)
> +		return;
>    

Check isn't needed, kfree() likes NULLs.

> +
> +	kfree(vmx->nested.l1_state);
> +	vmx->nested.l1_state = NULL;
> +}
> +
> +
>
>    

>
>   struct kvm_shared_msrs_global {
> @@ -505,7 +509,7 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
>   		return;
>   	}
>
> -	if (cr4 & X86_CR4_VMXE) {
> +	if (cr4 & X86_CR4_VMXE && !nested) {
>   		printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
>   		kvm_inject_gp(vcpu, 0);
>   		return;
>    

Some bits are required to be set when VMXE is enabled.

Please split the MSR changes into a separate patch.  Even cr4 is better 
on its own.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 2/7] Nested VMX patch 2 implements vmclear
  2009-12-10 18:38   ` [PATCH 2/7] Nested VMX patch 2 implements vmclear oritw
  2009-12-10 18:38     ` [PATCH 3/7] Nested VMX patch 3 implements vmptrld and vmptrst oritw
@ 2009-12-16 13:59     ` Avi Kivity
  2009-12-28 14:57     ` Gleb Natapov
  2 siblings, 0 replies; 24+ messages in thread
From: Avi Kivity @ 2009-12-16 13:59 UTC (permalink / raw)
  To: oritw; +Cc: kvm, benami, abelg, muli, aliguori, mdday

On 12/10/2009 08:38 PM, oritw@il.ibm.com wrote:
> From: Orit Wasserman <oritw@il.ibm.com>
>
> ---
>   arch/x86/kvm/vmx.c |  235 +++++++++++++++++++++++++++++++++++++++++++++++++++-
>   arch/x86/kvm/x86.c |    5 +-
>   arch/x86/kvm/x86.h |    3 +
>   3 files changed, 240 insertions(+), 3 deletions(-)
>
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 2726a6c..a7ffd5e 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -93,13 +93,39 @@ struct shared_msr_entry {
>   };
>
>   struct __attribute__ ((__packed__)) level_state {
> +	/* Has the level1 guest done vmclear? */
> +	bool vmclear;
> +};
>    

Suggest calling it launch_state and using an enum.  We can have three 
states: uninitialized, clear, and launched.  Not sure if this is really 
required by the spec.
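
Something along these lines, as a rough sketch (the names are only a
suggestion):

enum vmcs_launch_state {
	VMCS_STATE_UNINITIALIZED = 0,	/* never loaded with vmptrld */
	VMCS_STATE_CLEAR,		/* vmclear done, vmlaunch is allowed */
	VMCS_STATE_LAUNCHED,		/* vmlaunch done, only vmresume is allowed */
};

struct __attribute__ ((__packed__)) level_state {
	enum vmcs_launch_state launch_state;
};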

Do we need vmclear in l1_state?

> +struct __attribute__ ((__packed__)) nested_vmcs_page {
> +	u32 revision_id;
> +	u32 abort;
> +	struct level_state l2_state;
> +};
> +
> +struct nested_vmcs_list {
> +	struct list_head list;
>    

'link'

> +	gpa_t vmcs_addr;
> +	struct vmcs *l2_vmcs;
>   };
>
>   struct nested_vmx {
>   	/* Has the level1 guest done vmxon? */
>   	bool vmxon;
> +	/* What is the location of the current vmcs l1 keeps for l2 */
> +	gpa_t current_vmptr;
>   	/* Level 1 state for switching to level 2 and back */
>   	struct level_state *l1_state;
> +	/* list of vmcs for each l2 guest created by l1 */
> +	struct list_head l2_vmcs_list;
> +	/* l2 page corresponding to the current vmcs set by l1 */
> +	struct nested_vmcs_page *current_l2_page;
>   };
>
>   struct vcpu_vmx {
> @@ -156,6 +182,76 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
>   	return container_of(vcpu, struct vcpu_vmx, vcpu);
>   }
>
> +static struct page *nested_get_page(struct kvm_vcpu *vcpu,
> +				    u64 vmcs_addr)
> +{
> +	struct page *vmcs_page = NULL;
> +
> +	down_read(&current->mm->mmap_sem);
> +	vmcs_page = gfn_to_page(vcpu->kvm, vmcs_addr >> PAGE_SHIFT);
> +	up_read(&current->mm->mmap_sem);
>    

gfn_to_page() doesn't need mmap_sem (and may deadlock if you take it).

> +
> +	if (is_error_page(vmcs_page)) {
> +		printk(KERN_ERR "%s error allocating page 0x%llx\n",
> +		       __func__, vmcs_addr);
> +		kvm_release_page_clean(vmcs_page);
> +		return NULL;
> +	}
> +
> +	return vmcs_page;
> +
> +}
> +
> +static int nested_map_current(struct kvm_vcpu *vcpu)
> +{
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +	struct page *vmcs_page =
> +		nested_get_page(vcpu, vmx->nested.current_vmptr);
> +	struct nested_vmcs_page *mapped_page;
> +
> +	if (vmcs_page == NULL) {
> +		printk(KERN_INFO "%s: failure in nested_get_page\n", __func__);
> +		return 0;
> +	}
> +
> +	if (vmx->nested.current_l2_page) {
> +		printk(KERN_INFO "%s: shadow vmcs already mapped\n", __func__);
> +		WARN_ON(1);
> +		return 0;
> +	}
> +
> +	mapped_page = kmap_atomic(vmcs_page, KM_USER0);
> +
> +	if (!mapped_page) {
> +		printk(KERN_INFO "%s: error in kmap_atomic\n", __func__);
> +		return 0;
> +	}
>    

kmap_atomic() can't fail.

> +
> +	vmx->nested.current_l2_page = mapped_page;
> +
> +	return 1;
> +}
> +
> +static void nested_unmap_current(struct kvm_vcpu *vcpu)
> +{
> +	struct page *page;
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +
> +	if (!vmx->nested.current_l2_page) {
> +		printk(KERN_INFO "Shadow vmcs already unmapped\n");
> +		WARN_ON(1);
>    

Use BUG_ON(), since this can't happen unless there's a bug.

> +		return;
> +	}
> +
> +	page = kmap_atomic_to_page(vmx->nested.current_l2_page);
> +
> +	kunmap_atomic(vmx->nested.current_l2_page, KM_USER0);
> +
> +	kvm_release_page_dirty(page);
> +
> +	vmx->nested.current_l2_page = NULL;
> +}
> +
>   static int init_rmode(struct kvm *kvm);
>   static u64 construct_eptp(unsigned long root_hpa);
>
> @@ -1144,6 +1240,35 @@ static int nested_vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
>   	return 0;
>   }
>
> +static int read_guest_vmcs_gpa(struct kvm_vcpu *vcpu, gva_t gva, u64 *gentry)
> +{
> +	int r = 0;
> +	uint size;
> +
> +	*gentry = 0;
> +
> +	if (is_long_mode(vcpu))
> +		size = sizeof(u64);
> +	else
> +		size = sizeof(u32);
>    

I think the gpa is always 64 bit, regardless of the current mode.

> +
> +	r = kvm_read_guest_virt(gva, gentry,
> +				size, vcpu);
> +	if (r) {
> +		printk(KERN_ERR "%s cannot read guest vmcs addr %lx : %d\n",
> +		       __func__, vcpu->arch.regs[VCPU_REGS_RAX], r);
>    

RAX may not be relevant.  Just return, and the user can disassemble the 
instructions and see for themselves.

> +		return r;
> +	}
> +
> +	if (!IS_ALIGNED(*gentry, PAGE_SIZE)) {
> +		printk(KERN_DEBUG "%s addr %llx not aligned\n",
> +		       __func__, *gentry);
> +		return 1;
> +	}
> +
> +	return 0;
> +}
> +
>
> +/*
> + * Decode the memory address (operand) of a vmx instruction according to Table 23-12/23-11
> + * For additional information regarding offset calculation see 3.7.5
> + */
> +static gva_t get_vmx_mem_address(struct kvm_vcpu *vcpu,
> +				 unsigned long exit_qualification,
> +				 u32 vmx_instruction_info)
> +{
> +	int  scaling        = vmx_instruction_info & 3;             /* bits 0:1 scaling */
> +	int  addr_size      = (vmx_instruction_info >> 7) & 7;      /* bits 7:9 address size, 0=16bit, 1=32bit, 2=64bit */
> +	bool is_reg         = vmx_instruction_info & (1u << 10);    /* bit  10  1=register operand, 0=memory */
> +	int  seg_reg        = (vmx_instruction_info >> 15) & 7;     /* bits 15:17 segment register */
> +	int  index_reg      = (vmx_instruction_info >> 18) & 0xf;   /* bits 18:21 index register */
> +	bool index_is_valid = !(vmx_instruction_info & (1u << 22)); /* bit  22 index register validity, 0=valid, 1=invalid */
> +	int  base_reg       = (vmx_instruction_info >> 23) & 0xf;   /* bits 23:26 base register */
> +	bool base_is_valid  = !(vmx_instruction_info & (1u << 27)); /* bit  27 base register validity, 0=valid, 1=invalid */
> +	gva_t addr;
> +
> +	if (is_reg)
> +		return 0;
>    

Should #UD.

> +
> +	switch (addr_size) {
> +	case 1:
> +		exit_qualification &= 0xffffffff; /* 32 high bits are undefined according to the spec, page 23-7 */
> +		break;
> +	case 2:
> +		break;
> +	default:
> +		return 0;
> +	}
> +
> +	/* Addr = segment_base + offset */
> +	/* offset = Base + [Index * Scale] + Displacement, see Figure 3-11 */
> +	addr = vmx_get_segment_base(vcpu, seg_reg);
> +	if (base_is_valid)
> +		addr += kvm_register_read(vcpu, base_reg);
> +	if (index_is_valid)
> +		addr += kvm_register_read(vcpu, index_reg) * scaling;
>    

Shouldn't this be a shift?

Wish we had something like that for emulate.c.
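
Regarding the shift: the two scaling bits encode the shift count
(0/1/2/3 for a scale of 1/2/4/8), so the index contribution would
presumably be (untested):

	if (index_is_valid)
		addr += kvm_register_read(vcpu, index_reg) << scaling;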

> +	addr += exit_qualification; /* exit qualification holds the displacement, spec page 23-7 */
> +
> +	return addr;
> +}
> +
> +static int handle_vmclear(struct kvm_vcpu *vcpu)
> +{
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +	struct level_state *l2_state;
> +	gpa_t guest_vmcs_addr;
> +	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
> +	u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
> +	gva_t vmcs_gva;
> +
> +	if (!nested_vmx_check_permission(vcpu))
> +		return 1;
> +
> +	vmcs_gva = get_vmx_mem_address(vcpu, exit_qualification,
> +				       vmx_instruction_info);
>    

I think you can let get_vmx_mem_address() do the vmread()s, simpler.
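
i.e. something like the sketch below, so callers don't have to read and
pass the two fields around themselves (only the signature changes, the
decoding stays as in the patch):

static gva_t get_vmx_mem_address(struct kvm_vcpu *vcpu)
{
	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
	u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);

	/* ... decoding of scaling/base/index/displacement unchanged ... */
}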

> +
> +	if (read_guest_vmcs_gpa(vcpu, vmcs_gva, &guest_vmcs_addr))
> +		return 1;
> +
> +	vmx->nested.current_vmptr = guest_vmcs_addr;
> +	if (!nested_map_current(vcpu))
> +		return 1;
> +
> +	l2_state = &(to_vmx(vcpu)->nested.current_l2_page->l2_state);
> +	l2_state->vmclear = 1;
> +	nested_free_current_vmcs(vcpu);
>    

Why free?  Isn't the purpose of the list to keep those active?

> +
> +	vmx->nested.current_vmptr = -1ull;
> +
> +	nested_unmap_current(vcpu);
> +
> +	skip_emulated_instruction(vcpu);
> +	clear_rflags_cf_zf(vcpu);
> +
> +	return 1;
> +}
> +
>    

As usual, if you can split some of the infrastructure into separate 
patches, it would help review.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 3/7] Nested VMX patch 3 implements vmptrld and vmptrst
  2009-12-10 18:38     ` [PATCH 3/7] Nested VMX patch 3 implements vmptrld and vmptrst oritw
  2009-12-10 18:38       ` [PATCH 4/7] Nested VMX patch 4 implements vmread and vmwrite oritw
@ 2009-12-16 14:32       ` Avi Kivity
  1 sibling, 0 replies; 24+ messages in thread
From: Avi Kivity @ 2009-12-16 14:32 UTC (permalink / raw)
  To: oritw; +Cc: kvm, benami, abelg, muli, aliguori, mdday

On 12/10/2009 08:38 PM, oritw@il.ibm.com wrote:
>
> +
> +
>   struct __attribute__ ((__packed__)) level_state {
>   	/* Has the level1 guest done vmclear? */
>   	bool vmclear;
> +
> +	u64 io_bitmap_a;
> +	u64 io_bitmap_b;
> +	u64 msr_bitmap;
> +
> +	bool first_launch;
>   };
>    

Please keep things naturally aligned.

>   /*
> @@ -122,6 +255,8 @@ struct nested_vmx {
>   	gpa_t current_vmptr;
>   	/* Level 1 state for switching to level 2 and back */
>   	struct level_state *l1_state;
> +	/* Level 1 shadow vmcs for switching to level 2 and back */
> +	struct shadow_vmcs *l1_shadow_vmcs;
>   	/* list of vmcs for each l2 guest created by l1 */
>   	struct list_head l2_vmcs_list;
>   	/* l2 page corresponding to the current vmcs set by l1 */
> @@ -187,10 +322,7 @@ static struct page *nested_get_page(struct kvm_vcpu *vcpu,
>   {
>   	struct page *vmcs_page = NULL;
>
> -	down_read(&current->mm->mmap_sem);
> -	vmcs_page = gfn_to_page(vcpu->kvm, vmcs_addr >> PAGE_SHIFT);
> -	up_read(&current->mm->mmap_sem);
>    

Fold this into the patch that introduced the problem.

> -
>   	if (is_error_page(vmcs_page)) {
>   		printk(KERN_ERR "%s error allocating page 0x%llx\n",
>   		       __func__, vmcs_addr);
> @@ -832,13 +964,14 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
>
>   	if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
>   		u8 error;
> -
>   		per_cpu(current_vmcs, cpu) = vmx->vmcs;
> +
>    

Please avoid pointless whitespace changes.

>   		asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
>   			      : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
>   			      : "cc");
> +
>   		if (error)
> -			printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
> +			printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n",
>   			       vmx->vmcs, phys_addr);
>    

Fold.

> +
>   static int create_l1_state(struct kvm_vcpu *vcpu)
>   {
>   	struct vcpu_vmx *vmx = to_vmx(vcpu);
> @@ -1441,10 +1587,75 @@ static int create_l1_state(struct kvm_vcpu *vcpu)
>   	} else
>   		return 0;
>
> +	vmx->nested.l1_shadow_vmcs = kzalloc(PAGE_SIZE, GFP_KERNEL);
> +	if (!vmx->nested.l1_shadow_vmcs) {
> +		printk(KERN_INFO "%s could not allocate memory for l1_shadow vmcs\n",
> +		       __func__);
> +		kfree(vmx->nested.l1_state);
> +		return -ENOMEM;
> +	}
> +
>   	INIT_LIST_HEAD(&(vmx->nested.l2_vmcs_list));
>   	return 0;
>   }
>
> +static struct vmcs *alloc_vmcs(void);
> +int create_l2_state(struct kvm_vcpu *vcpu)
> +{
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +	struct vmcs *l2_vmcs;
> +
> +	if (!nested_map_current(vcpu)) {
> +		printk(KERN_ERR "%s error mapping  level 2 page", __func__);
> +		return -ENOMEM;
> +	}
> +
> +	l2_vmcs = nested_get_current_vmcs(vcpu);
> +	if (!l2_vmcs) {
> +		struct nested_vmcs_list *new_l2_guest =
> +			(struct nested_vmcs_list *)
> +			kmalloc(sizeof(struct nested_vmcs_list), GFP_KERNEL);
> +
> +		if (!new_l2_guest) {
> +			printk(KERN_ERR "%s error could not allocate memory for a new l2 guest list item",
> +			       __func__);
> +			nested_unmap_current(vcpu);
> +			return -ENOMEM;
> +		}
>    

Can the list grow without bounds?

> +
> +		l2_vmcs = alloc_vmcs();
> +
> +		if (!l2_vmcs) {
> +			printk(KERN_ERR "%s error could not allocate memory for l2_vmcs",
> +			       __func__);
> +			kfree(new_l2_guest);
> +			nested_unmap_current(vcpu);
> +			return -ENOMEM;
> +		}
> +
> +		new_l2_guest->vmcs_addr = vmx->nested.current_vmptr;
> +		new_l2_guest->l2_vmcs   = l2_vmcs;
> +		list_add(&(new_l2_guest->list), &(vmx->nested.l2_vmcs_list));
> +	}
> +
> +	if (cpu_has_vmx_msr_bitmap())
> +		vmx->nested.current_l2_page->l2_state.msr_bitmap =
> +			vmcs_read64(MSR_BITMAP);
> +	else
> +		vmx->nested.current_l2_page->l2_state.msr_bitmap = 0;
> +
> +	vmx->nested.current_l2_page->l2_state.io_bitmap_a =
> +		vmcs_read64(IO_BITMAP_A);
> +	vmx->nested.current_l2_page->l2_state.io_bitmap_b =
> +		vmcs_read64(IO_BITMAP_B);
>    

Don't understand why these reads are needed.

> +
> +	vmx->nested.current_l2_page->l2_state.first_launch = true;
> +
> +	nested_unmap_current(vcpu);
> +
> +	return 0;
> +}
> +
>   

> @@ -3633,8 +3849,9 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
>   		return 1;
>   	}
>
> -	if (create_l1_state(vcpu)) {
> -		printk(KERN_ERR "%s create_l1_state failed\n", __func__);
> +	r = create_l1_state(vcpu);
> +	if (r) {
> +		printk(KERN_ERR "%s create_l1_state failed: %d\n", __func__, r);
>    

Move this to the original patch.

>   		kvm_queue_exception(vcpu, UD_VECTOR);
>   		return 1;
>   	}
> @@ -3645,6 +3862,63 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
>   	return 1;
>   }
>
> +static int handle_vmptrld(struct kvm_vcpu *vcpu)
> +{
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +	u64 guest_vmcs_addr;
> +	gva_t vmcs_gva;
> +	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
> +	u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
> +	int r = 0;
> +
> +	if (!nested_vmx_check_permission(vcpu))
> +		return 1;
> +
> +	vmcs_gva = get_vmx_mem_address(vcpu, exit_qualification,
> +				       vmx_instruction_info);
> +
> +	if (read_guest_vmcs_gpa(vcpu, vmcs_gva, &guest_vmcs_addr))
> +		return 1;
> +
> +	if (vmx->nested.current_vmptr != guest_vmcs_addr) {
> +		vmx->nested.current_vmptr = guest_vmcs_addr;
> +		r = create_l2_state(vcpu);
> +		if (r) {
> +			printk(KERN_ERR "%s create_l2_state failed: %d\n",
> +			       __func__, r);
> +			return 1;
> +		}
> +	}
> +
> +	clear_rflags_cf_zf(vcpu);
> +	skip_emulated_instruction(vcpu);
> +	return 1;
>    

No set_rflags() on error?

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 4/7] Nested VMX patch 4 implements vmread and vmwrite
  2009-12-10 18:38       ` [PATCH 4/7] Nested VMX patch 4 implements vmread and vmwrite oritw
  2009-12-10 18:38         ` [PATCH 5/7] Nested VMX patch 5 Simplify fpu handling oritw
@ 2009-12-16 14:44         ` Avi Kivity
  1 sibling, 0 replies; 24+ messages in thread
From: Avi Kivity @ 2009-12-16 14:44 UTC (permalink / raw)
  To: oritw; +Cc: kvm, benami, abelg, muli, aliguori, mdday

On 12/10/2009 08:38 PM, oritw@il.ibm.com wrote:
> From: Orit Wasserman <oritw@il.ibm.com>
>
> ---
>   arch/x86/kvm/vmx.c |  670 +++++++++++++++++++++++++++++++++++++++++++++++++++-
>   1 files changed, 660 insertions(+), 10 deletions(-)
>
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 46a4f3a..8745d44 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -239,6 +239,7 @@ struct __attribute__ ((__packed__)) level_state {
>   struct __attribute__ ((__packed__)) nested_vmcs_page {
>   	u32 revision_id;
>   	u32 abort;
> +	struct shadow_vmcs shadow_vmcs;
>   	struct level_state l2_state;
>   };
>
> @@ -263,6 +264,55 @@ struct nested_vmx {
>   	struct nested_vmcs_page *current_l2_page;
>   };
>
> +enum vmcs_field_type {
> +	VMCS_FIELD_TYPE_U16 = 0,
> +	VMCS_FIELD_TYPE_U64 = 1,
> +	VMCS_FIELD_TYPE_U32 = 2,
> +	VMCS_FIELD_TYPE_ULONG = 3
> +};
> +
> +#define VMCS_FIELD_LENGTH_OFFSET 13
> +#define VMCS_FIELD_LENGTH_MASK 0x6000
> +
> +/*
> +  Returns VMCS Field type
> +*/
> +static inline int vmcs_field_type(unsigned long field)
> +{
> +	/* For 32 bit L1 when it is using the HIGH field */
> +	if (0x1 & field)
> +		return VMCS_FIELD_TYPE_U32;
> +
> +	return (VMCS_FIELD_LENGTH_MASK & field) >> 13;
> +}
> +
> +/*
> +  Returncs VMCS field size in bits
> +*/
> +static inline int vmcs_field_size(int field_type, struct kvm_vcpu *vcpu)
> +{
> +	switch (field_type) {
> +	case VMCS_FIELD_TYPE_U16:
> +		return 2;
> +	case VMCS_FIELD_TYPE_U32:
> +		return 4;
> +	case VMCS_FIELD_TYPE_U64:
> +		return 8;
> +	case VMCS_FIELD_TYPE_ULONG:
> +#ifdef CONFIG_X86_64
> +		if (is_long_mode(vcpu))
> +			return 8;
> +		else
>    

Can replace with #endif

> +			return 4;
> +#else
> +		return 4;
> +#endif
>    

... and drop the previous three lines.

> +	}
> +
> +	printk(KERN_INFO "WARNING: invalid field type %d \n", field_type);
> +	return 0;
>    

Can this happen?  The field is only two bits wide.

>
> +static inline struct shadow_vmcs *get_shadow_vmcs(struct kvm_vcpu *vcpu)
> +{
> +	WARN_ON(!to_vmx(vcpu)->nested.current_l2_page);
> +	return &(to_vmx(vcpu)->nested.current_l2_page->shadow_vmcs);
> +}
> +
> +#define SHADOW_VMCS_OFFSET(x) offsetof(struct shadow_vmcs, x)
> +
> +static unsigned short vmcs_field_to_offset_table[HOST_RIP+1] = {
> +
> +	[VIRTUAL_PROCESSOR_ID] =
> +		SHADOW_VMCS_OFFSET(virtual_processor_id),
>    

Keep on one line, you can use a shorter macro name if it helps.  This 
table is just noise.
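
e.g. with a short wrapper macro each entry fits on one line (sketch, the
macro name is arbitrary):

#define FIELD(vmcs_field, member) \
	[vmcs_field] = offsetof(struct shadow_vmcs, member)

static unsigned short vmcs_field_to_offset_table[HOST_RIP+1] = {
	FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
	FIELD(GUEST_ES_SELECTOR, guest_es_selector),
	/* ... */
};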

> +
> +static inline unsigned short vmcs_field_to_offset(unsigned long field)
> +{
> +
> +	if (field > HOST_RIP || vmcs_field_to_offset_table[field] == 0) {
> +		printk(KERN_ERR "invalid vmcs encoding 0x%lx\n", field);
> +		return -1;
>    

This will be converted to 0xffff.

> +	}
> +
> +	return vmcs_field_to_offset_table[field];
> +}
> +
> +static inline unsigned long nested_vmcs_readl(struct kvm_vcpu *vcpu,
> +					      unsigned long field)
> +{
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +	unsigned long *entry;
> +
> +	if (!vmx->nested.current_l2_page) {
> +		printk(KERN_ERR "%s invalid nested vmcs\n", __func__);
> +		return -1;
> +	}
> +
> +	entry = (unsigned long *)((char *)(get_shadow_vmcs(vcpu)) +
> +				 vmcs_field_to_offset(field));
>    

Error check?

> +static inline u64 nested_vmcs_read64(struct kvm_vcpu *vcpu, unsigned long field)
> +{
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +	u64 *entry;
> +	if (!vmx->nested.current_l2_page) {
> +		printk(KERN_ERR "%s invalid nested vmcs\n", __func__);
> +		return -1;
> +	}
> +
> +	entry = (u64 *)((char *)(get_shadow_vmcs(vcpu)) +
> +				 vmcs_field_to_offset(field));
>    

Need to support the 'high' part of 64-bit fields.
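
For the *_HIGH encodings (bit 0 set) the access should go to the upper
half of the same 64-bit shadow field, roughly (untested sketch):

static inline u32 nested_vmcs_read64_high(struct kvm_vcpu *vcpu,
					  unsigned long field)
{
	/* 'field' is the *_HIGH encoding; the full field is field & ~1 */
	return nested_vmcs_read64(vcpu, field & ~1ul) >> 32;
}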

> +	return *entry;
> +}
>
> +
> +static inline void nested_vmcs_write64(struct kvm_vcpu *vcpu,
> +				       unsigned long field, u64 value)
> +{
> +#ifdef CONFIG_X86_64
> +	nested_vmcs_writel(vcpu, field, value);
> +#else /* nested: 32 bit not actually tested */
> +	nested_vmcs_writel(vcpu, field, value);
> +	nested_vmcs_writel(vcpu, field+1, value >> 32);
> +#endif
>    

High field support needed.

>   static struct page *nested_get_page(struct kvm_vcpu *vcpu,
>   				    u64 vmcs_addr)
>   {
> @@ -354,11 +809,6 @@ static int nested_map_current(struct kvm_vcpu *vcpu)
>
>   	mapped_page = kmap_atomic(vmcs_page, KM_USER0);
>
> -	if (!mapped_page) {
> -		printk(KERN_INFO "%s: error in kmap_atomic\n", __func__);
> -		return 0;
> -	}
> -
>    

Fold.

>   	vmx->nested.current_l2_page = mapped_page;
>
>   	return 1;
> @@ -1390,7 +1840,7 @@ static int read_guest_vmcs_gpa(struct kvm_vcpu *vcpu, gva_t gva, u64 *gentry)
>   				size, vcpu);
>   	if (r) {
>   		printk(KERN_ERR "%s cannot read guest vmcs addr %lx : %d\n",
> -		       __func__, vcpu->arch.regs[VCPU_REGS_RAX], r);
> +		       __func__, gva, r);
>    

Fold.

> @@ -3764,6 +4214,26 @@ static gva_t get_vmx_mem_address(struct kvm_vcpu *vcpu,
>   	return addr;
>   }
>
> +static void set_rflags_to_vmx_fail_invalid(struct kvm_vcpu *vcpu)
> +{
> +	unsigned long rflags;
> +	rflags = vmx_get_rflags(vcpu);
> +	rflags |= X86_EFLAGS_CF;
> +	rflags &= ~X86_EFLAGS_PF & ~X86_EFLAGS_AF & ~X86_EFLAGS_ZF &
> +		~X86_EFLAGS_SF & ~X86_EFLAGS_OF;
> +	vmx_set_rflags(vcpu, rflags);
> +}
> +
> +static void set_rflags_to_vmx_fail_valid(struct kvm_vcpu *vcpu)
> +{
> +	unsigned long rflags;
> +	rflags = vmx_get_rflags(vcpu);
> +	rflags |= X86_EFLAGS_ZF;
> +	rflags &= ~X86_EFLAGS_PF & ~X86_EFLAGS_AF & ~X86_EFLAGS_CF &
> +		~X86_EFLAGS_SF & ~X86_EFLAGS_OF;
> +	vmx_set_rflags(vcpu, rflags);
> +  }
> +
>    

These are needed much earlier.

>
> +static int handle_vmread_reg(struct kvm_vcpu *vcpu, int reg,
> +			     unsigned long field)
> +{
> +	u64 field_value;
> +
> +	switch (vmcs_field_type(field)) {
> +	case VMCS_FIELD_TYPE_U16:
> +		field_value = nested_vmcs_read16(vcpu, field);
> +		break;
> +	case VMCS_FIELD_TYPE_U32:
> +		field_value = nested_vmcs_read32(vcpu, field);
> +		break;
> +	case VMCS_FIELD_TYPE_U64:
> +		field_value = nested_vmcs_read64(vcpu, field);
> +#ifdef CONFIG_X86_64
> +		if (!is_long_mode(vcpu)) {
> +			kvm_register_write(vcpu, reg+1, field_value >> 32);
> +			field_value = (u32)field_value;
> +		}
> +#endif
> +		break;
> +	case VMCS_FIELD_TYPE_ULONG:
> +		field_value = nested_vmcs_readl(vcpu, field);
> +#ifdef CONFIG_X86_64
> +		if (!is_long_mode(vcpu)) {
> +			kvm_register_write(vcpu, reg+1, field_value >> 32);
> +			field_value = (u32)field_value;
> +		}
> +#endif
> +		break;
> +	default:
> +		printk(KERN_INFO "%s invalid field\n", __func__);
> +		return 0;
> +	}
> +
> +	kvm_register_write(vcpu, reg, field_value);
> +	return 1;
> +}
> +
> +static int handle_vmread_mem(struct kvm_vcpu *vcpu, gva_t gva,
> +			     unsigned long field)
> +{
> +	u64 field_value;
> +
> +	switch (vmcs_field_type(field)) {
> +	case VMCS_FIELD_TYPE_U16:
> +		field_value = nested_vmcs_read16(vcpu, field);
> +		break;
> +	case VMCS_FIELD_TYPE_U32:
> +		field_value = nested_vmcs_read32(vcpu, field);
> +		break;
> +	case VMCS_FIELD_TYPE_U64:
> +		field_value = nested_vmcs_read64(vcpu, field);
> +		break;
> +	case VMCS_FIELD_TYPE_ULONG:
> +		field_value = nested_vmcs_readl(vcpu, field);
> +		break;
> +	default:
> +		printk(KERN_INFO "%s invalid field\n", __func__);
> +		return 0;
> +	}
> +
> +	kvm_write_guest_virt(gva, &field_value,
> +			     vmcs_field_size(vmcs_field_type(field), vcpu),
> +			     vcpu);
> +	return 1;
> +}
>    

Looks like a lot of code duplication.  You can probably do this with a 
single function, and write either to a register or memory at the end.
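
Roughly like this (sketch, not compiled) -- read the field once, then
store to either destination; the !is_long_mode() splitting of 64-bit
values into two registers would still need the extra write:

static int handle_vmread_common(struct kvm_vcpu *vcpu, unsigned long field,
				bool is_reg, int reg, gva_t gva)
{
	u64 field_value;

	switch (vmcs_field_type(field)) {
	case VMCS_FIELD_TYPE_U16:
		field_value = nested_vmcs_read16(vcpu, field);
		break;
	case VMCS_FIELD_TYPE_U32:
		field_value = nested_vmcs_read32(vcpu, field);
		break;
	case VMCS_FIELD_TYPE_U64:
		field_value = nested_vmcs_read64(vcpu, field);
		break;
	case VMCS_FIELD_TYPE_ULONG:
		field_value = nested_vmcs_readl(vcpu, field);
		break;
	default:
		return 0;
	}

	if (is_reg)
		kvm_register_write(vcpu, reg, field_value);
	else
		kvm_write_guest_virt(gva, &field_value,
				     vmcs_field_size(vmcs_field_type(field), vcpu),
				     vcpu);
	return 1;
}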

> +
> +static int handle_vmread(struct kvm_vcpu *vcpu)
> +{
> +	unsigned long field;
> +	int reg;
> +	int is_reg;
> +	unsigned long exit_qualification   = vmcs_readl(EXIT_QUALIFICATION);
> +	u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
> +	gva_t gva = 0;
> +	int read_succeed;
> +
> +	if (!nested_vmx_check_permission(vcpu))
> +		return 1;
> +
> +	if (!nested_map_current(vcpu)) {
> +		printk(KERN_INFO "%s invalid shadow vmcs\n", __func__);
> +		set_rflags_to_vmx_fail_invalid(vcpu);
> +		return 1;
> +	}
> +
> +	/* decode instruction info to get the field to read and where to store its value */
> +	/* Bit 10, Mem/Reg (0 = memory, 1 = register) */
> +	is_reg = vmx_instruction_info & (1u << 10);  /* bit 10 */
> +	field = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf);  /* bits 31:28 */
> +
> +	if (is_reg) {
> +		reg   = (vmx_instruction_info >> 3) & 0xf;   /* bits 3:6 */
> +		read_succeed = handle_vmread_reg(vcpu, reg, field);
> +	} else {
> +		gva = get_vmx_mem_address(vcpu, exit_qualification,
> +					  vmx_instruction_info);
> +		read_succeed = handle_vmread_mem(vcpu, gva, field);
> +	}
> +
>    

This, too, can go into a separate function instead of being duplicated 
all over.

> +	if (read_succeed) {
> +		clear_rflags_cf_zf(vcpu);
> +		skip_emulated_instruction(vcpu);
> +	} else {
> +		set_rflags_to_vmx_fail_valid(vcpu);
> +		vmcs_write32(VM_INSTRUCTION_ERROR, 12);
> +	}
> +
> +	nested_unmap_current(vcpu);
> +	return 1;
> +}
> +
>
>    

>   static int handle_vmoff(struct kvm_vcpu *vcpu)
>   {
>   	struct vcpu_vmx *vmx = to_vmx(vcpu);
> @@ -3901,15 +4546,20 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
>   	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
>   	u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
>   	gva_t vmcs_gva;
> -
> +	uint size;
>   	if (!nested_vmx_check_permission(vcpu))
>   		return 1;
>   	vmcs_gva = get_vmx_mem_address(vcpu, exit_qualification,
>   				       vmx_instruction_info);
>
> +	if (is_long_mode(vcpu))
> +		size = sizeof(u64);
> +	else
> +		size = sizeof(u32);
>    

I think the vmpointers are always 64-bit.


-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 5/7] Nested VMX patch 5 Simplify fpu handling
  2009-12-10 18:38         ` [PATCH 5/7] Nested VMX patch 5 Simplify fpu handling oritw
  2009-12-10 18:38           ` [PATCH 6/7] Nested VMX patch 6 implements vmlaunch and vmresume oritw
@ 2009-12-17  9:10           ` Avi Kivity
  1 sibling, 0 replies; 24+ messages in thread
From: Avi Kivity @ 2009-12-17  9:10 UTC (permalink / raw)
  To: oritw; +Cc: kvm, benami, abelg, muli, aliguori, mdday

On 12/10/2009 08:38 PM, oritw@il.ibm.com wrote:
> From: Orit Wasserman <oritw@il.ibm.com>
>
>    

What exactly is the simplification?  Is it intended to have a functional 
change?

> ---
>   arch/x86/kvm/vmx.c |   27 +++++++++++++++++----------
>   1 files changed, 17 insertions(+), 10 deletions(-)
>
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 8745d44..de1f596 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -1244,8 +1244,6 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
>   	u32 eb;
>
> -	eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR);
> -	if (!vcpu->fpu_active)
> -		eb |= 1u << NM_VECTOR;
>   	/*
>   	 * Unconditionally intercept #DB so we can maintain dr6 without
>   	 * reading it every exit.
> @@ -1463,10 +1461,6 @@ static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
>   	if (vcpu->fpu_active)
>   		return;
>   	vcpu->fpu_active = 1;
> -	vmcs_clear_bits(GUEST_CR0, X86_CR0_TS);
> -	if (vcpu->arch.cr0 & X86_CR0_TS)
> -		vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
> -	update_exception_bitmap(vcpu);
>   }
>
>   static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
> @@ -1474,8 +1468,6 @@ static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
>   	if (!vcpu->fpu_active)
>   		return;
>   	vcpu->fpu_active = 0;
> -	vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
> -	update_exception_bitmap(vcpu);
>   }
>
>   static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
> @@ -2715,8 +2707,10 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
>
>   	vmx_flush_tlb(vcpu);
>   	vmcs_writel(GUEST_CR3, guest_cr3);
> -	if (vcpu->arch.cr0 & X86_CR0_PE)
> -		vmx_fpu_deactivate(vcpu);
> +	if (vcpu->arch.cr0 & X86_CR0_PE) {
> +		if (guest_cr3 != vmcs_readl(GUEST_CR3))
> +			vmx_fpu_deactivate(vcpu);
> +	}
>    

Why the added cr3 check?  It may make sense, but it isn't a simplification.

>   static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
> @@ -5208,6 +5202,19 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
>   	if (vcpu->arch.switch_db_regs)
>   		get_debugreg(vcpu->arch.dr6, 6);
>
> +	if (vcpu->fpu_active) {
> +		if (vmcs_readl(CR0_READ_SHADOW) & X86_CR0_TS)
> +			vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
> +		else
> +			vmcs_clear_bits(GUEST_CR0, X86_CR0_TS);
> +			vmcs_write32(EXCEPTION_BITMAP,
> +				     vmcs_read32(EXCEPTION_BITMAP) & ~(1u << NM_VECTOR));
> +	} else {
> +		vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
> +		vmcs_write32(EXCEPTION_BITMAP,
> +			     vmcs_read32(EXCEPTION_BITMAP) | (1u << NM_VECTOR));
> +	}
>    

This is executed unconditionally, so the vmreads/vmwrites take place 
every time.  Need to cache the previous fpu_active state and only take 
action if it changed.

Since this is a large piece of code, move it to a function.
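
Something like the sketch below, keeping the last value in vcpu_vmx so
the vmreads/vmwrites only happen on a transition (the fpu_active_cache
member is made up here; TS changes in the read shadow would need the
same treatment):

static void vmx_sync_fpu_state(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (vcpu->fpu_active == vmx->fpu_active_cache)
		return;
	vmx->fpu_active_cache = vcpu->fpu_active;

	if (vcpu->fpu_active) {
		if (vmcs_readl(CR0_READ_SHADOW) & X86_CR0_TS)
			vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
		else
			vmcs_clear_bits(GUEST_CR0, X86_CR0_TS);
		vmcs_write32(EXCEPTION_BITMAP,
			     vmcs_read32(EXCEPTION_BITMAP) & ~(1u << NM_VECTOR));
	} else {
		vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
		vmcs_write32(EXCEPTION_BITMAP,
			     vmcs_read32(EXCEPTION_BITMAP) | (1u << NM_VECTOR));
	}
}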

Please post this as the first patch (or better, separately), so I can 
apply it independently of the rest.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 6/7] Nested VMX patch 6 implements vmlaunch and vmresume
  2009-12-10 18:38           ` [PATCH 6/7] Nested VMX patch 6 implements vmlaunch and vmresume oritw
  2009-12-10 18:38             ` [PATCH 7/7] Nested VMX patch 7 handling of nested guest exits oritw
@ 2009-12-17 10:10             ` Avi Kivity
  1 sibling, 0 replies; 24+ messages in thread
From: Avi Kivity @ 2009-12-17 10:10 UTC (permalink / raw)
  To: oritw; +Cc: kvm, benami, abelg, muli, aliguori, mdday

On 12/10/2009 08:38 PM, oritw@il.ibm.com wrote:
> From: Orit Wasserman <oritw@il.ibm.com>
>
>
> @@ -287,7 +297,7 @@ static inline int vmcs_field_type(unsigned long field)
>   }
>
>   /*
> -  Returncs VMCS field size in bits
> +  Returns VMCS field size in bits
>   */
>    

Fold.

>   static inline int vmcs_field_size(int field_type, struct kvm_vcpu *vcpu)
>   {
> @@ -313,6 +323,10 @@ static inline int vmcs_field_size(int field_type, struct kvm_vcpu *vcpu)
>   	return 0;
>   }
>
> +#define NESTED_VM_EXIT_CONTROLS_MASK (~(VM_EXIT_LOAD_IA32_PAT | \
> +					VM_EXIT_SAVE_IA32_PAT))
> +#define NESTED_VM_ENTRY_CONTROLS_MASK (~(VM_ENTRY_LOAD_IA32_PAT | \
> +

I think a whitelist is better here, so if we add a new feature and 
forget it here, we don't introduce a vulnerability.
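
i.e. invert the sense and enumerate only what L1 is explicitly allowed
to set, e.g. (illustrative only, not the complete set of bits):

#define NESTED_VM_EXIT_CONTROLS_ALLOWED	 (VM_EXIT_HOST_ADDR_SPACE_SIZE | \
					  VM_EXIT_ACK_INTR_ON_EXIT)
#define NESTED_VM_ENTRY_CONTROLS_ALLOWED (VM_ENTRY_IA32E_MODE)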

>
> +static inline int nested_cpu_has_vmx_tpr_shadow(struct kvm_vcpu *vcpu)
> +{
> +	return cpu_has_vmx_tpr_shadow() &&
> +		get_shadow_vmcs(vcpu)->cpu_based_vm_exec_control &
> +		CPU_BASED_TPR_SHADOW;
> +}
>    

bools are better.

> +
> +static inline int nested_cpu_has_secondary_exec_ctrls(struct kvm_vcpu *vcpu)
> +{
> +	return cpu_has_secondary_exec_ctrls() &&
> +		get_shadow_vmcs(vcpu)->cpu_based_vm_exec_control &
> +		CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
> +}
> +
> +static inline bool nested_vm_need_virtualize_apic_accesses(struct kvm_vcpu
> +							   *vcpu)
> +{
> +	return get_shadow_vmcs(vcpu)->secondary_vm_exec_control &
> +		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
> +}
>    

Need to check for secondary controls first.

> +
> +static inline int nested_cpu_has_vmx_ept(struct kvm_vcpu *vcpu)
> +{
> +	return get_shadow_vmcs(vcpu)->
> +		secondary_vm_exec_control & SECONDARY_EXEC_ENABLE_EPT;
> +}
> +
> +static inline int nested_cpu_has_vmx_vpid(struct kvm_vcpu *vcpu)
> +{
> +	return get_shadow_vmcs(vcpu)->secondary_vm_exec_control &
> +		SECONDARY_EXEC_ENABLE_VPID;
> +}
>    

A helper nested_check_2ndary_control() can help reduce duplication.
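
e.g. (sketch, name as suggested above):

static inline bool nested_check_2ndary_control(struct kvm_vcpu *vcpu, u32 bit)
{
	if (!nested_cpu_has_secondary_exec_ctrls(vcpu))
		return false;
	return get_shadow_vmcs(vcpu)->secondary_vm_exec_control & bit;
}

static inline bool nested_cpu_has_vmx_ept(struct kvm_vcpu *vcpu)
{
	return nested_check_2ndary_control(vcpu, SECONDARY_EXEC_ENABLE_EPT);
}

That also takes care of the missing secondary-controls check noted above.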

> +
> +static inline int nested_cpu_has_vmx_pat(struct kvm_vcpu *vcpu)
> +{
> +	return get_shadow_vmcs(vcpu)->vm_entry_controls &
> +		VM_ENTRY_LOAD_IA32_PAT;
> +}
>    

Suggest dropping PAT support for now (it's optional in the spec IIRC, 
and doesn't help much).

>
> +
> +void prepare_vmcs_12(struct kvm_vcpu *vcpu)
> +{
>    

Not sure what this does.  From the name, it appears to prepare a vmcs.  
 From the contents, it appears to read the vmcs and save it into a 
shadow vmcs.

> +	struct shadow_vmcs *l2_shadow_vmcs =
> +		get_shadow_vmcs(vcpu);
> +
> +	l2_shadow_vmcs->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
> +	l2_shadow_vmcs->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
> +	l2_shadow_vmcs->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
> +	l2_shadow_vmcs->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
> +	l2_shadow_vmcs->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
> +	l2_shadow_vmcs->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
> +	l2_shadow_vmcs->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
> +	l2_shadow_vmcs->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
> +
> +	l2_shadow_vmcs->tsc_offset = vmcs_read64(TSC_OFFSET);
> +	l2_shadow_vmcs->guest_physical_address =
> +		vmcs_read64(GUEST_PHYSICAL_ADDRESS);
> +	l2_shadow_vmcs->vmcs_link_pointer = vmcs_read64(VMCS_LINK_POINTER);
> +	l2_shadow_vmcs->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
> +	if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
> +		l2_shadow_vmcs->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
> +	l2_shadow_vmcs->cr3_target_count = vmcs_read32(CR3_TARGET_COUNT);
> +	l2_shadow_vmcs->vm_entry_intr_info_field =
> +		vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
> +	l2_shadow_vmcs->vm_entry_exception_error_code =
> +		vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE);
> +	l2_shadow_vmcs->vm_entry_instruction_len =
> +		vmcs_read32(VM_ENTRY_INSTRUCTION_LEN);
> +	l2_shadow_vmcs->vm_instruction_error =
> +		vmcs_read32(VM_INSTRUCTION_ERROR);
> +	l2_shadow_vmcs->vm_exit_reason  = vmcs_read32(VM_EXIT_REASON);
> +	l2_shadow_vmcs->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
> +	l2_shadow_vmcs->vm_exit_intr_error_code =
> +		vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
> +	l2_shadow_vmcs->idt_vectoring_info_field =
> +		vmcs_read32(IDT_VECTORING_INFO_FIELD);
> +	l2_shadow_vmcs->idt_vectoring_error_code =
> +		vmcs_read32(IDT_VECTORING_ERROR_CODE);
> +	l2_shadow_vmcs->vm_exit_instruction_len =
> +		vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
> +	l2_shadow_vmcs->vmx_instruction_info =
> +		vmcs_read32(VMX_INSTRUCTION_INFO);
> +	l2_shadow_vmcs->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
> +	l2_shadow_vmcs->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
> +	l2_shadow_vmcs->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
> +	l2_shadow_vmcs->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
> +	l2_shadow_vmcs->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
> +	l2_shadow_vmcs->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
> +	l2_shadow_vmcs->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
> +	l2_shadow_vmcs->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
> +	l2_shadow_vmcs->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
> +	l2_shadow_vmcs->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
> +	l2_shadow_vmcs->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
> +	l2_shadow_vmcs->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
> +	l2_shadow_vmcs->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
> +	l2_shadow_vmcs->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
> +	l2_shadow_vmcs->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
> +	l2_shadow_vmcs->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
> +	l2_shadow_vmcs->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
> +	l2_shadow_vmcs->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
> +	l2_shadow_vmcs->guest_interruptibility_info =
> +		vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
> +	l2_shadow_vmcs->guest_activity_state =
> +		vmcs_read32(GUEST_ACTIVITY_STATE);
> +	l2_shadow_vmcs->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
> +
> +	l2_shadow_vmcs->cr4_read_shadow = vmcs_readl(CR4_READ_SHADOW);
> +	l2_shadow_vmcs->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
>    

If EXIT_QUALIFICATION is a physical address, don't you need to translate 
it?  vmx will store a host physical address and we need a guest physical 
address.

> +	l2_shadow_vmcs->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
>    

Not all processors support this.

> +
> +	if (l2_shadow_vmcs->cr0_guest_host_mask & X86_CR0_TS)
> +		l2_shadow_vmcs->guest_cr0 = vmcs_readl(GUEST_CR0);
> +	else /* if CR0_GUEST_HOST_MASK[TS]=0 l1 should think TS was really written to CR0 */
> +		l2_shadow_vmcs->guest_cr0 =
> +			(vmcs_readl(GUEST_CR0) & ~X86_CR0_TS) | (vmcs_readl(CR0_READ_SHADOW) & X86_CR0_TS);
> +
> +	l2_shadow_vmcs->guest_cr4 = vmcs_readl(GUEST_CR4);
>    

GUEST_CR4 will be different from the guest's view of it (for example, 
without EPT CR4.PAE will always be set).

We also use CR4_GUEST_HOST_MASK, I think you need to account for that.

> +	l2_shadow_vmcs->guest_es_base = vmcs_readl(GUEST_ES_BASE);
> +	l2_shadow_vmcs->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
> +	l2_shadow_vmcs->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
> +	l2_shadow_vmcs->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
> +	l2_shadow_vmcs->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
> +	l2_shadow_vmcs->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
> +	l2_shadow_vmcs->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
> +	l2_shadow_vmcs->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
> +	l2_shadow_vmcs->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
> +	l2_shadow_vmcs->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
> +	l2_shadow_vmcs->guest_dr7 = vmcs_readl(GUEST_DR7);
> +	l2_shadow_vmcs->guest_rsp = vmcs_readl(GUEST_RSP);
> +	l2_shadow_vmcs->guest_rip = vmcs_readl(GUEST_RIP);
> +	l2_shadow_vmcs->guest_rflags = vmcs_readl(GUEST_RFLAGS);
> +	l2_shadow_vmcs->guest_pending_dbg_exceptions =
> +		vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
> +	l2_shadow_vmcs->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
> +	l2_shadow_vmcs->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
> +}
> +
> +int load_vmcs_common(struct shadow_vmcs *src)
> +{
> +	vmcs_write16(GUEST_ES_SELECTOR, src->guest_es_selector);
> +	vmcs_write16(GUEST_CS_SELECTOR, src->guest_cs_selector);
> +	vmcs_write16(GUEST_SS_SELECTOR, src->guest_ss_selector);
> +	vmcs_write16(GUEST_DS_SELECTOR, src->guest_ds_selector);
> +	vmcs_write16(GUEST_FS_SELECTOR, src->guest_fs_selector);
> +	vmcs_write16(GUEST_GS_SELECTOR, src->guest_gs_selector);
> +	vmcs_write16(GUEST_LDTR_SELECTOR, src->guest_ldtr_selector);
> +	vmcs_write16(GUEST_TR_SELECTOR, src->guest_tr_selector);
> +
> +	vmcs_write64(GUEST_IA32_DEBUGCTL, src->guest_ia32_debugctl);
> +
> +	if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
> +		vmcs_write64(GUEST_IA32_PAT, src->guest_ia32_pat);
>    

This is dangerous, the guest can cause inconsistent page types and 
processor lockups.

> +
> +	if (src->vm_entry_msr_load_count < 512)
> +		vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, src->vm_entry_msr_load_count);
>    

Ditto.  MSRs have to be validated.  This feature is optional, right?  
Suggest we don't support it for now.

> +	vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
> +		    src->guest_pending_dbg_exceptions);
>    

I think this is a new field.

>   static struct level_state *create_state(void)
>   {
>   	struct level_state *state = NULL;
> @@ -2301,8 +2578,6 @@ static void free_l1_state(struct kvm_vcpu *vcpu)
>   		kfree(list_item);
>   	}
>   }
> -
> -
>    

?

> @@ -3574,6 +3849,10 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
>   {
>   	struct vcpu_vmx *vmx = to_vmx(vcpu);
>
> +	if (vmx->nested.nested_mode) {
> +		return;
> +	}
> +
>    

No braces on single-line if statements in kernel code.

>   	if (!cpu_has_virtual_nmis()) {
>   		/*
>   		 * Tracking the NMI-blocked state in software is built upon
> @@ -3759,7 +4038,12 @@ static int handle_exception(struct kvm_vcpu *vcpu)
>   		return 1;  /* already handled by vmx_vcpu_run() */
>
>   	if (is_no_device(intr_info)) {
> +		/* if l0 handled an fpu operation for l2 it's because l1 is
> +		   not interested (exception bitmap 12 does not include NM_VECTOR)
> +		   enable fpu and resume l2 (avoid switching to l1)
> +		*/
>    

Use standard comment style please.

>   		vmx_fpu_activate(vcpu);
> +
>   		return 1;
>   	}
>
>
>
> @@ -4504,7 +4793,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
>   static int handle_vmptrld(struct kvm_vcpu *vcpu)
>   {
>   	struct vcpu_vmx *vmx = to_vmx(vcpu);
> -	u64 guest_vmcs_addr;
> +	gpa_t guest_vmcs_addr;
>    

Fold.

> @@ -4895,7 +5184,6 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
>   			= vmcs_read32(VM_INSTRUCTION_ERROR);
>   		return 0;
>   	}
> -
>    

?

>   	if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
>   			(exit_reason != EXIT_REASON_EXCEPTION_NMI &&
>   			exit_reason != EXIT_REASON_EPT_VIOLATION &&
> @@ -4903,8 +5191,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
>   		printk(KERN_WARNING "%s: unexpected, valid vectoring info "
>   		       "(0x%x) and exit reason is 0x%x\n",
>   		       __func__, vectoring_info, exit_reason);
> -
> -	if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) {
> +	if (!vmx->nested.nested_mode && unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) {
>    

Why is this different for nested mode?  At least, you have to check if 
the guest is intercepting nmis.

>   		if (vmx_interrupt_allowed(vcpu)) {
>   			vmx->soft_vnmi_blocked = 0;
>   		} else if (vmx->vnmi_blocked_time > 1000000000LL &&
> @@ -4951,10 +5238,13 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
>   	int type;
>   	bool idtv_info_valid;
>
> -	exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
> -
>   	vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
>
> +	if (vmx->nested.nested_mode)
> +		return;
>    

Again, I think you have to check if the guest is intercepting interrupts.

> +
> +	exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
> +
>   	/* Handle machine checks before interrupts are enabled */
>   	if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY)
>   	    || (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI
> @@ -5068,6 +5358,7 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx)
>   static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
>   {
>   	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +	u32 nested_exception_bitmap = 0;
>
>   	/* Record the guest's net vcpu time for enforced NMI injections. */
>   	if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
> @@ -5099,6 +5390,37 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
>   	if (vcpu->arch.switch_db_regs)
>   		set_debugreg(vcpu->arch.dr6, 6);
>
> +	if (vcpu->fpu_active) {
> +		if (vmcs_readl(CR0_READ_SHADOW) & X86_CR0_TS)
> +			vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
> +		else
> +			vmcs_clear_bits(GUEST_CR0, X86_CR0_TS);
> +
> +		if (vmx->nested.nested_mode) {
> +			if (!nested_map_current(vcpu)) {
> +				vmx->fail = 1;
> +				return;
> +			}
> +
> +			nested_exception_bitmap = get_shadow_vmcs(vcpu)->
> +				exception_bitmap;
> +
> +			nested_unmap_current(vcpu);
> +		}
> +
> +		if (vmx->nested.nested_mode &&
> +		    (nested_exception_bitmap & (1u << NM_VECTOR)))
> +			vmcs_write32(EXCEPTION_BITMAP,
> +				     vmcs_read32(EXCEPTION_BITMAP) | (1u << NM_VECTOR));
> +		else
> +			vmcs_write32(EXCEPTION_BITMAP,
> +				     vmcs_read32(EXCEPTION_BITMAP) & ~(1u << NM_VECTOR));
>    

I'd like to see generalized handling of the exception bitmap in 
update_exception_bitmap(), not something ad-hoc.
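Untested sketch of what I have in mind - not the real
update_exception_bitmap(), just the shape of folding the L1 bits into the
one place that already owns the bitmap (it reuses nested_map_current() and
get_shadow_vmcs() from this series):

	static void update_exception_bitmap(struct kvm_vcpu *vcpu)
	{
		u32 eb = vmcs_read32(EXCEPTION_BITMAP);

		if (vcpu->fpu_active)
			eb &= ~(1u << NM_VECTOR);
		else
			eb |= 1u << NM_VECTOR;

		/* merge in whatever L1 wants intercepted while L2 runs */
		if (to_vmx(vcpu)->nested.nested_mode &&
		    nested_map_current(vcpu)) {
			eb |= get_shadow_vmcs(vcpu)->exception_bitmap;
			nested_unmap_current(vcpu);
		}

		vmcs_write32(EXCEPTION_BITMAP, eb);
	}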

> +	} else {
> +		vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
> +		vmcs_write32(EXCEPTION_BITMAP,
> +			     vmcs_read32(EXCEPTION_BITMAP) | (1u << NM_VECTOR));
> +	}
>    

This looks confused wrt the previous patch.

>
> +void save_vmcs(struct shadow_vmcs *dst)
> +{
> +	dst->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
> +	dst->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
> +	dst->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
> +	dst->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
> +	dst->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
> +	dst->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
> +	dst->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
> +	dst->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
> +	dst->host_es_selector = vmcs_read16(HOST_ES_SELECTOR);
> +	dst->host_cs_selector = vmcs_read16(HOST_CS_SELECTOR);
> +	dst->host_ss_selector = vmcs_read16(HOST_SS_SELECTOR);
> +	dst->host_ds_selector = vmcs_read16(HOST_DS_SELECTOR);
> +	dst->host_fs_selector = vmcs_read16(HOST_FS_SELECTOR);
> +	dst->host_gs_selector = vmcs_read16(HOST_GS_SELECTOR);
> +	dst->host_tr_selector = vmcs_read16(HOST_TR_SELECTOR);
> +	dst->io_bitmap_a = vmcs_read64(IO_BITMAP_A);
> +	dst->io_bitmap_b = vmcs_read64(IO_BITMAP_B);
> +	if (cpu_has_vmx_msr_bitmap())
> +		dst->msr_bitmap = vmcs_read64(MSR_BITMAP);
>    

Instead of reading and writing the host parts, which don't change, I'd 
like to see the vmcs host initialization factored out and reused.

> +
> +	dst->vm_exit_msr_store_addr = vmcs_read64(VM_EXIT_MSR_STORE_ADDR);
> +	dst->vm_exit_msr_load_addr = vmcs_read64(VM_EXIT_MSR_LOAD_ADDR);
> +	dst->vm_entry_msr_load_addr = vmcs_read64(VM_ENTRY_MSR_LOAD_ADDR);
>    

We don't use those.

> +}
> +
> +int prepare_vmcs_02(struct kvm_vcpu *vcpu)
> +{
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +	struct shadow_vmcs *src , *l1_shadow_vmcs = vmx->nested.l1_shadow_vmcs;
> +	struct level_state *l2_state;
> +	u32 exec_control;
> +
> +	src = get_shadow_vmcs(vcpu);
> +	if (!src) {
> +		nested_unmap_current(vcpu);
> +		printk(KERN_INFO "%s: Error no shadow vmcs\n", __func__);
> +		return 1;
> +	}
> +
> +	load_vmcs_common(src);
> +
> +	l2_state = &(vmx->nested.current_l2_page->l2_state);
> +
> +	if (l2_state->first_launch) {
> +
> +		vmcs_write64(VMCS_LINK_POINTER, src->vmcs_link_pointer);
>    

Looks wrong.  The address needs translation at least.  Not sure what it 
does?

> +
> +		if (l2_state->io_bitmap_a)
> +			vmcs_write64(IO_BITMAP_A, l2_state->io_bitmap_a);
> +
> +		if (l2_state->io_bitmap_b)
> +			vmcs_write64(IO_BITMAP_B, l2_state->io_bitmap_b);
>    

Address translation?  Also, need to mask with the host's I/O bitmap.

> +
> +		if (l2_state->msr_bitmap)
> +			vmcs_write64(MSR_BITMAP, l2_state->msr_bitmap);
>    

Need to mask with the host's msr bitmap.

> +
> +		if (src->vm_entry_msr_load_count > 0) {
> +			struct page *page;
> +
> +			page = nested_get_page(vcpu,
> +					       src->vm_entry_msr_load_addr);
> +			if (!page)
> +				return 1;
> +
> +			vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, page_to_phys(page));
> +
> +			kvm_release_page_clean(page);
>    

I don't see how we can trust the guest's msr autoload.

> +
> +		vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
> +			     (l1_shadow_vmcs->page_fault_error_code_mask &
> +			      src->page_fault_error_code_mask));
> +
> +		vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
> +			     (l1_shadow_vmcs->page_fault_error_code_match &
> +			      src->page_fault_error_code_match));
> +
>    

I don't think this is right.  If both masks are 1 but host match is 1 
and guest match is 0, we will trap only on the guest's idea of PFEC 
matching.

Also, if the guest has bit 14 clear in EXCEPTION_BITMAP, this needs to 
be done completely differently.

I didn't see where PFEC matching is handled in the exception handler?

> +		if (cpu_has_secondary_exec_ctrls()) {
> +
> +			exec_control =
> +				l1_shadow_vmcs->secondary_vm_exec_control;
> +
> +			if (nested_cpu_has_secondary_exec_ctrls(vcpu)) {
> +
> +				exec_control |= src->secondary_vm_exec_control;
> +
> +				if (!vm_need_virtualize_apic_accesses(vcpu->kvm) ||
> +				    !nested_vm_need_virtualize_apic_accesses(vcpu))
> +					exec_control &=
> +						~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
> +			}
> +
> +			vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
> +		}
> +
> +		load_vmcs_host_state(l1_shadow_vmcs);
> +
> +		l2_state->first_launch = false;
> +	}
>    

How are all those calculations redone if the guest changes one of those 
controls?

> +
> +	if (vm_need_tpr_shadow(vcpu->kvm) &&
> +	    nested_cpu_has_vmx_tpr_shadow(vcpu))
> +		vmcs_write32(TPR_THRESHOLD, src->tpr_threshold);
>    

If the guest doesn't trap interrupts, we need to stay with the host tpr 
threshold.

> +
> +	if (enable_ept) {
> +		if (!nested_cpu_has_vmx_ept(vcpu)) {
> +			vmcs_write64(EPT_POINTER,
> +				     l1_shadow_vmcs->ept_pointer);
> +			vmcs_write64(GUEST_PDPTR0,
> +				     l1_shadow_vmcs->guest_pdptr0);
> +			vmcs_write64(GUEST_PDPTR1,
> +				     l1_shadow_vmcs->guest_pdptr1);
> +			vmcs_write64(GUEST_PDPTR2,
> +				     l1_shadow_vmcs->guest_pdptr2);
> +			vmcs_write64(GUEST_PDPTR3,
> +				     l1_shadow_vmcs->guest_pdptr3);
> +		}
> +	}
>    

This version doesn't support ept, so please drop.

> +
> +	exec_control = l1_shadow_vmcs->cpu_based_vm_exec_control;
> +
> +	exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
> +
> +	exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
> +
> +	exec_control &= ~CPU_BASED_TPR_SHADOW;
> +
> +	exec_control |= src->cpu_based_vm_exec_control;
>    

Need to whitelist good bits.
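i.e. something along the lines of (the mask name is made up, and the exact
set of allowed bits needs to be thought through):

	/* illustrative only: controls we are prepared to let L1 set for L2 */
	#define NESTED_ALLOWED_CPU_BASED_CTLS				\
		(CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |	\
		 CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING | \
		 CPU_BASED_USE_IO_BITMAPS | CPU_BASED_USE_MSR_BITMAPS |	\
		 CPU_BASED_MOV_DR_EXITING | CPU_BASED_USE_TSC_OFFSETING)

		exec_control |= src->cpu_based_vm_exec_control &
				NESTED_ALLOWED_CPU_BASED_CTLS;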

> +void sync_cached_regs_to_vmcs(struct kvm_vcpu *vcpu)
> +{
> +	unsigned long mask;
> +
> +	if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
> +		vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
> +	if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
> +		vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
> +
> +	mask = ~((1 << VCPU_REGS_RSP) | (1 << VCPU_REGS_RIP));
> +
> +	if (vcpu->arch.regs_dirty & mask) {
> +		printk(KERN_INFO "WARNING: dirty cached registers regs_dirty 0x%x mask 0x%lx\n",
> +		       vcpu->arch.regs_dirty, mask);
> +		WARN_ON(1);
> +	}
> +
> +	vcpu->arch.regs_dirty = 0;
> +}
>    

Split into a separate patch (and use in existing code which already does 
this).


-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 7/7] Nested VMX patch 7 handling of nested guest exits
  2009-12-10 18:38             ` [PATCH 7/7] Nested VMX patch 7 handling of nested guest exits oritw
@ 2009-12-17 13:46               ` Avi Kivity
  0 siblings, 0 replies; 24+ messages in thread
From: Avi Kivity @ 2009-12-17 13:46 UTC (permalink / raw)
  To: oritw; +Cc: kvm, benami, abelg, muli, aliguori, mdday

On 12/10/2009 08:38 PM, oritw@il.ibm.com wrote:
> From: Orit Wasserman<oritw@il.ibm.com>
>
>    

(changelog)

> @@ -1525,6 +1539,22 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
>   			new_offset = vmcs_read64(TSC_OFFSET) + delta;
>   			vmcs_write64(TSC_OFFSET, new_offset);
>   		}
> +
> +		if (l1_shadow_vmcs != NULL) {
> +			l1_shadow_vmcs->host_tr_base =
> +				vmcs_readl(HOST_TR_BASE);
> +			l1_shadow_vmcs->host_gdtr_base =
> +				vmcs_readl(HOST_GDTR_BASE);
> +			l1_shadow_vmcs->host_ia32_sysenter_esp =
> +				vmcs_readl(HOST_IA32_SYSENTER_ESP);
> +
> +			if (tsc_this < vcpu->arch.host_tsc)
> +				l1_shadow_vmcs->tsc_offset =
> +					vmcs_read64(TSC_OFFSET);
> +
> +			if (vmx->nested.nested_mode)
> +				load_vmcs_host_state(l1_shadow_vmcs);
> +		}
>    

Please share this code with non-nested vmcs setup.

> @@ -3794,6 +3824,11 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
>   {
>   	u32 cpu_based_vm_exec_control;
>
> +	if (to_vmx(vcpu)->nested.nested_mode) {
> +		nested_vmx_intr(vcpu);
> +		return;
> +	}
>    

I think this happens too late?  enable_irq_window() is called after 
we've given up on injecting the interrupt because interrupts are 
disabled.  But if we're running a guest, we can vmexit and inject the 
interrupt.  This code will only vmexit.

Hm, I see the vmexit code has an in_interrupt case, but I'd like this to 
be more regular: adjust vmx_interrupt_allowed() to allow interrupts if 
in a guest, and vmx_inject_irq() to force the vmexit.  This way 
interrupts have a single code path.
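Roughly (untested; nested_vmx_vmexit() stands for whatever performs the
L2->L1 switch in this series):

	static void vmx_inject_irq(struct kvm_vcpu *vcpu)
	{
		struct vcpu_vmx *vmx = to_vmx(vcpu);

		if (vmx->nested.nested_mode) {
			/*
			 * We only get here because vmx_interrupt_allowed()
			 * said yes while L2 was running: exit to L1 and let
			 * it inject the interrupt itself.
			 */
			nested_vmx_vmexit(vcpu);
			return;
		}

		/* existing non-nested injection path continues here */
	}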

>
>   static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
>   {
> +	if (to_vmx(vcpu)->nested.nested_mode) {
> +		if (!nested_vmx_intr(vcpu))
> +			return 0;
> +	}
>    

... and you do that... so I wonder why the changes to 
enable_irq_window() are needed?

> +
>   	return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
>   		!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
>   			(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
> @@ -4042,6 +4082,10 @@ static int handle_exception(struct kvm_vcpu *vcpu)
>   		   not interested (exception bitmap 12 does not include NM_VECTOR)
>   		   enable fpu and resume l2 (avoid switching to l1)
>   		*/
> +
> +		if (vmx->nested.nested_mode)
> +			vmx->nested.nested_run_pending = 1; /* removing this line cause hung on boot of l2*/
> +
>    

This indicates a hack?

>   		vmx_fpu_activate(vcpu);
>
>   		return 1;
> @@ -4169,7 +4213,33 @@ static int handle_cr(struct kvm_vcpu *vcpu)
>   		trace_kvm_cr_write(cr, val);
>   		switch (cr) {
>   		case 0:
> -			kvm_set_cr0(vcpu, val);
> +			if (to_vmx(vcpu)->nested.nested_mode) {
> +				/* assume only X86_CR0_TS is handled by l0 */
> +				long new_cr0 = vmcs_readl(GUEST_CR0);
> +				long new_cr0_read_shadow = vmcs_readl(CR0_READ_SHADOW);
> +
> +				vmx_fpu_deactivate(vcpu);
>    
> +
> +				if (val & X86_CR0_TS) {
> +					new_cr0 |= X86_CR0_TS;
> +					new_cr0_read_shadow |= X86_CR0_TS;
> +					vcpu->arch.cr0 |= X86_CR0_TS;
> +				} else {
> +					new_cr0 &= ~X86_CR0_TS;
> +					new_cr0_read_shadow &= ~X86_CR0_TS;
> +					vcpu->arch.cr0 &= X86_CR0_TS;
> +				}
> +
> +				vmcs_writel(GUEST_CR0, new_cr0);
> +				vmcs_writel(CR0_READ_SHADOW, new_cr0_read_shadow);
>    

Don't you need to #vmexit if the new cr0 violates the cr0_bits_always_on 
constraint, or if it changes bits in cr0 that the guest intercepts?

> +
> +				if (!(val & X86_CR0_TS) || !(val & X86_CR0_PE))
> +					vmx_fpu_activate(vcpu);
> +
> +				to_vmx(vcpu)->nested.nested_run_pending = 1;
>    

Please split into a function.

> +			} else
> +				kvm_set_cr0(vcpu, val);
> +
>   			skip_emulated_instruction(vcpu);
>   			return 1;
>   		case 3:
> @@ -4196,8 +4266,15 @@ static int handle_cr(struct kvm_vcpu *vcpu)
>   		break;
>   	case 2: /* clts */
>   		vmx_fpu_deactivate(vcpu);
> -		vcpu->arch.cr0 &= ~X86_CR0_TS;
> -		vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
> +		if (to_vmx(vcpu)->nested.nested_mode) {
> +			vmcs_writel(GUEST_CR0, vmcs_readl(GUEST_CR0) & ~X86_CR0_TS);
> +			vmcs_writel(CR0_READ_SHADOW, vmcs_readl(CR0_READ_SHADOW) & ~X86_CR0_TS);
> +			vcpu->arch.cr0 &= ~X86_CR0_TS;
> +			to_vmx(vcpu)->nested.nested_run_pending = 1;
>    

Won't the guest want to intercept this some time?

>   	/* Access CR3 don't cause VMExit in paging mode, so we need
>   	 * to sync with guest real CR3. */
>   	if (enable_ept && is_paging(vcpu))
> @@ -5347,6 +5435,60 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx)
>   		| vmx->rmode.irq.vector;
>   }
>
> +static int nested_handle_valid_idt(struct kvm_vcpu *vcpu)
>    

I asked for this to be renamed.

>   #ifdef CONFIG_X86_64
>   #define R "r"
>   #define Q "q"
> @@ -5358,8 +5500,17 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx)
>   static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
>   {
>   	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +	int r;
>   	u32 nested_exception_bitmap = 0;
>
> +	if (vmx->nested.nested_mode) {
> +		r = nested_handle_valid_idt(vcpu);
>    

This will cause vmread()s, before the launch, of state that is not saved.
This means it is broken on migration or after set_regs().

In general we follow the following pattern:

   read from memory
   vmwrite
   vmlaunch/vmresume
   vmread
   write to memory
   loop

There are exceptions where we allow state to be cached, mostly 
registers.  But we keep accessors for them so that save/restore works.

> +
> +static int nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu)
> +{
> +	if (to_vmx(vcpu)->nested.nested_mode) {
> +		struct page *msr_page = NULL;
> +		u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX];
> +		u32 exit_code = vmcs_read32(VM_EXIT_REASON);
> +		struct shadow_vmcs *l2svmcs = get_shadow_vmcs(vcpu);
> +
> +		if (!cpu_has_vmx_msr_bitmap()
> +		    || !nested_cpu_has_vmx_msr_bitmap(vcpu))
> +			return 1;
> +
> +		msr_page = nested_get_page(vcpu,
> +					   l2svmcs->msr_bitmap);
> +
> +		if (!msr_page) {
> +			printk(KERN_INFO "%s error in nested_get_page\n",
> +			       __func__);
> +			return 0;
> +		}
> +
> +		switch (exit_code) {
> +		case EXIT_REASON_MSR_READ:
> +			if (msr_index <= 0x1fff) {
> +				if (test_bit(msr_index,
> +					     (unsigned long *)(msr_page +
> +							       0x000)))
> +					return 1;
> +			} else if ((msr_index >= 0xc0000000) &&
> +				   (msr_index <= 0xc0001fff)) {
> +				msr_index &= 0x1fff;
> +				if (test_bit(msr_index,
> +					     (unsigned long *)(msr_page +
> +							       0x400)))
> +					return 1;
> +			}
> +			break;
> +		case EXIT_REASON_MSR_WRITE:
> +			if (msr_index <= 0x1fff) {
> +				if (test_bit(msr_index,
> +					     (unsigned long *)(msr_page +
> +							       0x800)))
> +						return 1;
> +			} else if ((msr_index >= 0xc0000000) &&
> +				   (msr_index <= 0xc0001fff)) {
> +				msr_index &= 0x1fff;
> +				if (test_bit(msr_index,
> +					     (unsigned long *)(msr_page +
> +							       0xc00)))
> +					return 1;
> +			}
> +			break;
> +		}
> +	}
> +
>    

Please refactor with a single test_bit, just calculate the offsets 
differently (+400*8 for high msrs, +800*8 for writes).
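Untested sketch of what I mean (the helper name is made up):

	static int nested_msr_bitmap_test(void *bitmap, u32 msr_index, bool write)
	{
		int off = 0;

		if (msr_index >= 0xc0000000 && msr_index <= 0xc0001fff) {
			msr_index &= 0x1fff;
			off += 0x400 * 8;	/* high MSRs: second quarter */
		} else if (msr_index > 0x1fff) {
			return 1;		/* out of range: always exit */
		}

		if (write)
			off += 0x800 * 8;	/* write bitmaps follow the read bitmaps */

		return test_bit(msr_index + off, (unsigned long *)bitmap);
	}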

> +
> +static int nested_vmx_exit_handled(struct kvm_vcpu *vcpu, bool kvm_override)
> +{
> +	u32 exit_code = vmcs_read32(VM_EXIT_REASON);
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +	u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
> +	struct shadow_vmcs *l2svmcs;
> +
> +	int r = 0;
> +
> +	if (vmx->nested.nested_run_pending)
> +		return 0;
> +
> +	if (unlikely(vmx->fail)) {
> +		printk(KERN_INFO "%s failed vm entry %x\n",
> +		       __func__, vmcs_read32(VM_INSTRUCTION_ERROR));
> +		return 1;
> +	}
> +
> +	if (kvm_override) {
>    

What's kvm_override?

> +		switch (exit_code) {
> +		case EXIT_REASON_EXTERNAL_INTERRUPT:
> +			return 0;
> +		case EXIT_REASON_EXCEPTION_NMI:
> +			if (!is_exception(intr_info))
> +				return 0;
> +
> +			if (is_page_fault(intr_info)&&  (!enable_ept))
> +				return 0;
> +
> +			break;
> +		case EXIT_REASON_EPT_VIOLATION:
> +			if (enable_ept)
> +				return 0;
> +
> +			break;
> +		}
> +	}
> +
> +
> +	if (!nested_map_current(vcpu))
> +		return 0;
> +
> +	l2svmcs = get_shadow_vmcs(vcpu);
> +
> +	switch (exit_code) {
> +	case EXIT_REASON_INVLPG:
> +		if (l2svmcs->cpu_based_vm_exec_control &
> +		    CPU_BASED_INVLPG_EXITING)
> +			r = 1;
> +		break;
> +	case EXIT_REASON_MSR_READ:
> +	case EXIT_REASON_MSR_WRITE:
> +		r = nested_vmx_exit_handled_msr(vcpu);
> +		break;
> +	case EXIT_REASON_CR_ACCESS: {
> +		unsigned long exit_qualification =
> +			vmcs_readl(EXIT_QUALIFICATION);
> +		int cr = exit_qualification & 15;
> +		int reg = (exit_qualification >> 8) & 15;
> +		unsigned long val = kvm_register_read(vcpu, reg);
> +
> +		switch ((exit_qualification >> 4) & 3) {
> +		case 0: /* mov to cr */
> +			switch (cr) {
> +			case 0:
> +				if (l2svmcs->cr0_guest_host_mask &
> +				    (val ^ l2svmcs->cr0_read_shadow))
> +					r = 1;
>    
> +				break;
> +			case 3:
> +				if (l2svmcs->cpu_based_vm_exec_control &
> +				    CPU_BASED_CR3_LOAD_EXITING)
> +					r = 1;
> +				break;
> +			case 4:
> +				if (l2svmcs->cr4_guest_host_mask &
> +				    (l2svmcs->cr4_read_shadow ^ val))
> +					r = 1;
> +				break;
> +			case 8:
> +				if (l2svmcs->cpu_based_vm_exec_control &
> +				    CPU_BASED_CR8_LOAD_EXITING)
> +					r = 1;
> +				break;
> +			}
> +			break;
> +		case 2: /* clts */
> +			if (l2svmcs->cr0_guest_host_mask & X86_CR0_TS)
> +				r = 1;
> +			break;
> +		case 1: /*mov from cr*/
> +			switch (cr) {
> +			case 0:
> +				r = 1;
> +			case 3:
> +				if (l2svmcs->cpu_based_vm_exec_control &
> +				    CPU_BASED_CR3_STORE_EXITING)
> +					r = 1;
> +				break;
> +			case 4:
> +				r = 1;
> +				break;
> +			case 8:
> +				if (l2svmcs->cpu_based_vm_exec_control &
> +				    CPU_BASED_CR8_STORE_EXITING)
> +					r = 1;
> +				break;
> +			}
> +			break;
> +		case 3: /* lmsw */
> +			if (l2svmcs->cr0_guest_host_mask &
> +			    (val ^ l2svmcs->cr0_read_shadow))
> +				r = 1;
> +			break;
> +		}
> +		break;
> +	}
> +	case EXIT_REASON_DR_ACCESS: {
> +		if (l2svmcs->cpu_based_vm_exec_control &
> +		    CPU_BASED_MOV_DR_EXITING)
> +			r = 1;
> +		break;
> +	}
> +
> +	case EXIT_REASON_EXCEPTION_NMI: {
> +
> +		if (is_external_interrupt(intr_info) &&
> +		    (l2svmcs->pin_based_vm_exec_control &
> +		     PIN_BASED_EXT_INTR_MASK))
> +			r = 1;
> +		else if (is_nmi(intr_info) &&
> +		    (l2svmcs->pin_based_vm_exec_control &
> +		     PIN_BASED_NMI_EXITING))
> +			r = 1;
> +		else if (is_exception(intr_info) &&
> +		    (l2svmcs->exception_bitmap &
> +		     (1u << (intr_info & INTR_INFO_VECTOR_MASK))))
> +			r = 1;
> +		else if (is_page_fault(intr_info))
> +			r = 1;
> +		break;
> +	}
> +
> +	case EXIT_REASON_EXTERNAL_INTERRUPT:
> +		if (l2svmcs->pin_based_vm_exec_control &
> +		    PIN_BASED_EXT_INTR_MASK)
> +			r = 1;
> +		break;
> +	default:
> +		r = 1;
> +	}
> +	nested_unmap_current(vcpu);
> +
>    

Please move these to the normal handlers so it is possible to follow the 
code.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: Nested VMX support v4
  2009-12-10 18:38 Nested VMX support v4 oritw
  2009-12-10 18:38 ` [PATCH 1/7] Nested VMX patch 1 implements vmon and vmoff oritw
@ 2009-12-17 13:49 ` Avi Kivity
  1 sibling, 0 replies; 24+ messages in thread
From: Avi Kivity @ 2009-12-17 13:49 UTC (permalink / raw)
  To: oritw; +Cc: kvm, benami, abelg, muli, aliguori, mdday

On 12/10/2009 08:38 PM, oritw@il.ibm.com wrote:
> Avi,
> We have addressed all of the comments, please apply.
>
>    

I'm afraid there is still a lot of work remaining.

> This work was inspired by the nested SVM support by Alexander Graf and Joerg
> Roedel.
>
>    

Please try to make this as readable as the svm work.  That means smaller 
patches, detailed changelogs, reduced forward declarations.  I would 
appreciate documentation of the data structures.  I am still confused 
about the triplication in kvm_vcpu, l1_state, and l2_state - there 
shouldn't be three, just two.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 1/7] Nested VMX patch 1 implements vmon and vmoff
  2009-12-10 18:38 ` [PATCH 1/7] Nested VMX patch 1 implements vmon and vmoff oritw
  2009-12-10 18:38   ` [PATCH 2/7] Nested VMX patch 2 implements vmclear oritw
  2009-12-16 13:34   ` [PATCH 1/7] Nested VMX patch 1 implements vmon and vmoff Avi Kivity
@ 2009-12-20 14:20   ` Gleb Natapov
  2009-12-20 14:23     ` Avi Kivity
  2009-12-20 17:08     ` Andi Kleen
  2 siblings, 2 replies; 24+ messages in thread
From: Gleb Natapov @ 2009-12-20 14:20 UTC (permalink / raw)
  To: oritw; +Cc: avi, kvm, benami, abelg, muli, aliguori, mdday

On Thu, Dec 10, 2009 at 08:38:23PM +0200, oritw@il.ibm.com wrote:
> From: Orit Wasserman <oritw@il.ibm.com>
> 
> ---
>  arch/x86/kvm/svm.c |    3 -
>  arch/x86/kvm/vmx.c |  265 +++++++++++++++++++++++++++++++++++++++++++++++++++-
>  arch/x86/kvm/x86.c |   11 ++-
>  arch/x86/kvm/x86.h |    2 +
>  4 files changed, 274 insertions(+), 7 deletions(-)
> 
> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> index 3de0b37..3f63cdd 100644
> --- a/arch/x86/kvm/svm.c
> +++ b/arch/x86/kvm/svm.c
> @@ -121,9 +121,6 @@ static int npt = 1;
>  
>  module_param(npt, int, S_IRUGO);
>  
> -static int nested = 1;
> -module_param(nested, int, S_IRUGO);
> -
>  static void svm_flush_tlb(struct kvm_vcpu *vcpu);
>  static void svm_complete_interrupts(struct vcpu_svm *svm);
>  
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 9a0a2cf..2726a6c 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -92,6 +92,16 @@ struct shared_msr_entry {
>  	u64 mask;
>  };
>  
> +struct __attribute__ ((__packed__)) level_state {
> +};
> +
> +struct nested_vmx {
> +	/* Has the level1 guest done vmxon? */
> +	bool vmxon;
> +	/* Level 1 state for switching to level 2 and back */
> +	struct level_state *l1_state;
> +};
> +
>  struct vcpu_vmx {
>  	struct kvm_vcpu       vcpu;
>  	struct list_head      local_vcpus_link;
> @@ -136,6 +146,9 @@ struct vcpu_vmx {
>  	ktime_t entry_time;
>  	s64 vnmi_blocked_time;
>  	u32 exit_reason;
> +
> +	/* Nested vmx */
> +	struct nested_vmx nested;
>  };
>  
>  static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
> @@ -201,6 +214,7 @@ static struct kvm_vmx_segment_field {
>  static u64 host_efer;
>  
>  static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
> +static int create_l1_state(struct kvm_vcpu *vcpu);
>  
>  /*
>   * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it
> @@ -961,6 +975,95 @@ static void guest_write_tsc(u64 guest_tsc, u64 host_tsc)
>  }
>  
>  /*
> + * Handles msr read for nested virtualization
> + */
> +static int nested_vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index,
> +			      u64 *pdata)
> +{
> +	u64 vmx_msr = 0;
> +
> +	switch (msr_index) {
> +	case MSR_IA32_FEATURE_CONTROL:
> +		*pdata = 0;
> +		break;
> +	case MSR_IA32_VMX_BASIC:
> +		*pdata = 0;
> +		rdmsrl(MSR_IA32_VMX_BASIC, vmx_msr);
> +		*pdata = (vmx_msr & 0x00ffffcfffffffff);
> +		break;
> +	case MSR_IA32_VMX_PINBASED_CTLS:
> +		rdmsrl(MSR_IA32_VMX_PINBASED_CTLS, vmx_msr);
> +		*pdata = (PIN_BASED_EXT_INTR_MASK & vmcs_config.pin_based_exec_ctrl) |
> +			(PIN_BASED_NMI_EXITING & vmcs_config.pin_based_exec_ctrl) |
> +			(PIN_BASED_VIRTUAL_NMIS & vmcs_config.pin_based_exec_ctrl);
> +		break;
> +	case MSR_IA32_VMX_PROCBASED_CTLS:
> +	{
> +		u32 vmx_msr_high, vmx_msr_low;
> +		u32 control = CPU_BASED_HLT_EXITING |
> +#ifdef CONFIG_X86_64
> +			CPU_BASED_CR8_LOAD_EXITING |
> +			CPU_BASED_CR8_STORE_EXITING |
> +#endif
> +			CPU_BASED_CR3_LOAD_EXITING |
> +			CPU_BASED_CR3_STORE_EXITING |
> +			CPU_BASED_USE_IO_BITMAPS |
> +			CPU_BASED_MOV_DR_EXITING |
> +			CPU_BASED_USE_TSC_OFFSETING |
> +			CPU_BASED_INVLPG_EXITING |
> +			CPU_BASED_TPR_SHADOW |
> +			CPU_BASED_USE_MSR_BITMAPS |
> +			CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
> +
> +		rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, vmx_msr_low, vmx_msr_high);
> +
> +		control &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
> +		control |= vmx_msr_low;  /* bit == 1 in low word  ==> must be one  */
> +
> +		*pdata = (CPU_BASED_HLT_EXITING & control) |
> +#ifdef CONFIG_X86_64
> +			(CPU_BASED_CR8_LOAD_EXITING & control) |
> +			(CPU_BASED_CR8_STORE_EXITING & control) |
> +#endif
> +			(CPU_BASED_CR3_LOAD_EXITING & control) |
> +			(CPU_BASED_CR3_STORE_EXITING & control) |
> +			(CPU_BASED_USE_IO_BITMAPS & control) |
> +			(CPU_BASED_MOV_DR_EXITING & control) |
> +			(CPU_BASED_USE_TSC_OFFSETING & control) |
> +			(CPU_BASED_INVLPG_EXITING & control) ;
> +
> +		if (cpu_has_secondary_exec_ctrls())
> +			*pdata |= CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
> +
> +		if (vm_need_tpr_shadow(vcpu->kvm))
> +			*pdata |= CPU_BASED_TPR_SHADOW;
> +		break;
> +	}
> +	case MSR_IA32_VMX_EXIT_CTLS:
> +		*pdata = 0;
> +#ifdef CONFIG_X86_64
> +		*pdata |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
> +#endif
> +		break;
> +	case MSR_IA32_VMX_ENTRY_CTLS:
> +		*pdata = 0;
> +		break;
> +	case MSR_IA32_VMX_PROCBASED_CTLS2:
> +		*pdata = 0;
> +		if (vm_need_virtualize_apic_accesses(vcpu->kvm))
> +			*pdata |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
> +		break;
> +	case MSR_IA32_VMX_EPT_VPID_CAP:
> +		*pdata = 0;
> +		break;
> +	default:
> +		return 1;
> +	}
> +
> +	return 0;
> +}
> +
> +/*
>   * Reads an msr value (of 'msr_index') into 'pdata'.
>   * Returns 0 on success, non-0 otherwise.
>   * Assumes vcpu_load() was already called.
> @@ -1004,6 +1107,9 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
>  		break;
>  	default:
>  		vmx_load_host_state(to_vmx(vcpu));
> +		if (nested &&
> +		    !nested_vmx_get_msr(vcpu, msr_index, &data))
> +			break;
>  		msr = find_msr_entry(to_vmx(vcpu), msr_index);
>  		if (msr) {
>  			vmx_load_host_state(to_vmx(vcpu));
> @@ -1018,6 +1124,27 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
>  }
>  
>  /*
> + * Writes msr value for nested virtualization
> + * Returns 0 on success, non-0 otherwise.
> + */
> +static int nested_vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
> +{
> +	switch (msr_index) {
> +	case MSR_IA32_FEATURE_CONTROL:
> +		if ((data & (FEATURE_CONTROL_LOCKED |
> +			     FEATURE_CONTROL_VMXON_ENABLED))
> +		    != (FEATURE_CONTROL_LOCKED |
> +			FEATURE_CONTROL_VMXON_ENABLED))
> +			return 1;
> +		break;
> +	default:
> +		return 1;
> +	}
> +
> +	return 0;
> +}
> +
> +/*
>   * Writes msr value into into the appropriate "register".
>   * Returns 0 on success, non-0 otherwise.
>   * Assumes vcpu_load() was already called.
> @@ -1067,6 +1194,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
>  		}
>  		/* Otherwise falls through to kvm_set_msr_common */
>  	default:
> +		if (nested &&
> +		    !nested_vmx_set_msr(vcpu, msr_index, data))
> +			break;
>  		msr = find_msr_entry(vmx, msr_index);
>  		if (msr) {
>  			vmx_load_host_state(vmx);
> @@ -1163,6 +1293,31 @@ static void vmclear_local_vcpus(void)
>  		__vcpu_clear(vmx);
>  }
>  
> +static struct level_state *create_state(void)
> +{
> +	struct level_state *state = NULL;
> +
> +	state = kzalloc(sizeof(struct level_state), GFP_KERNEL);
> +	if (!state) {
> +		printk(KERN_INFO "Error create level state\n");
> +		return NULL;
> +	}
> +	return state;
> +}
> +
> +static int create_l1_state(struct kvm_vcpu *vcpu)
> +{
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +
> +	if (!vmx->nested.l1_state) {
> +		vmx->nested.l1_state = create_state();
> +		if (!vmx->nested.l1_state)
> +			return -ENOMEM;
> +	} else
> +		return 0;
> +
Else clause is not needed.
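i.e. just:

	if (!vmx->nested.l1_state) {
		vmx->nested.l1_state = create_state();
		if (!vmx->nested.l1_state)
			return -ENOMEM;
	}

	return 0;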

> +	return 0;
> +}
>  
>  /* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot()
>   * tricks.
> @@ -1333,6 +1488,18 @@ static void free_vmcs(struct vmcs *vmcs)
>  	free_pages((unsigned long)vmcs, vmcs_config.order);
>  }
>  
> +static void free_l1_state(struct kvm_vcpu *vcpu)
> +{
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +
> +	if (!vmx->nested.l1_state)
> +		return;
> +
> +	kfree(vmx->nested.l1_state);
> +	vmx->nested.l1_state = NULL;
> +}
> +
> +
>  static void free_kvm_area(void)
>  {
>  	int cpu;
> @@ -3146,12 +3313,105 @@ static int handle_vmcall(struct kvm_vcpu *vcpu)
>  	return 1;
>  }
>  
> +/*
> + * Check to see if vcpu can execute vmx command
> + * Inject the corresponding exception
> + */
> +static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
> +{
> +	struct kvm_segment cs;
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +
> +	vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
> +
> +	if (!vmx->nested.vmxon) {
> +		pr_debug("%s: vmx not on\n", __func__);
> +		kvm_queue_exception(vcpu, UD_VECTOR);
> +		return 0;
> +	}
> +
> +	if ((vmx_get_rflags(vcpu) & X86_EFLAGS_VM) ||
> +	    (is_long_mode(vcpu) && !cs.l)) {
> +		pr_debug("%s: invalid mode cs.l %d is_long mode %d\n",
> +			 __func__, cs.l, is_long_mode(vcpu));
> +		kvm_queue_exception(vcpu, UD_VECTOR);
> +		return 0;
> +	}
> +
> +	if (vmx_get_cpl(vcpu)) {
> +		kvm_inject_gp(vcpu, 0);
> +		return 0;
> +	}
> +
> +	return 1;
> +}
> +
>  static int handle_vmx_insn(struct kvm_vcpu *vcpu)
>  {
>  	kvm_queue_exception(vcpu, UD_VECTOR);
>  	return 1;
>  }
>  
> +static int handle_vmoff(struct kvm_vcpu *vcpu)
> +{
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +
> +	if (!nested_vmx_check_permission(vcpu))
> +		return 1;
> +
> +	vmx->nested.vmxon = 0;
> +
> +	free_l1_state(vcpu);
> +
> +	skip_emulated_instruction(vcpu);
> +	return 1;
> +}
> +
> +static int handle_vmon(struct kvm_vcpu *vcpu)
> +{
> +	struct kvm_segment cs;
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +
> +	if (!nested) {
> +		pr_debug("%s: nested vmx not enabled\n", __func__);
> +		kvm_queue_exception(vcpu, UD_VECTOR);
> +		return 1;
> +	}
> +
> +	vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
> +
> +	if (!(vcpu->arch.cr4 & X86_CR4_VMXE) ||
> +	    !(vcpu->arch.cr0 & X86_CR0_PE) ||
> +	    (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
> +		kvm_queue_exception(vcpu, UD_VECTOR);
> +		printk(KERN_INFO "%s invalid register state\n", __func__);
> +		return 1;
> +	}
> +
> +	if (is_long_mode(vcpu) && !cs.l) {
> +		kvm_queue_exception(vcpu, UD_VECTOR);
> +		printk(KERN_INFO "%s invalid register state\n", __func__);
> +		return 1;
> +	}
> +
> +	if (vmx_get_cpl(vcpu)) {
> +		printk(KERN_INFO "%s no permission\n", __func__);
> +		kvm_inject_gp(vcpu, 0);
> +		return 1;
> +	}
> +
> +	if (create_l1_state(vcpu)) {
> +		printk(KERN_ERR "%s create_l1_state failed\n", __func__);
> +		kvm_queue_exception(vcpu, UD_VECTOR);
Should we send a UD exception if there is an internal error? Maybe doing
VMfailInvalid would be more appropriate? Also this function doesn't handle
all errors (address alignment, version checking, register operand).

> +		return 1;
> +	}
> +
> +	vmx->nested.vmxon = 1;
> +
> +	skip_emulated_instruction(vcpu);
> +	return 1;
> +}
> +
>  static int handle_invlpg(struct kvm_vcpu *vcpu)
>  {
>  	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
> @@ -3442,8 +3702,8 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
>  	[EXIT_REASON_VMREAD]                  = handle_vmx_insn,
>  	[EXIT_REASON_VMRESUME]                = handle_vmx_insn,
>  	[EXIT_REASON_VMWRITE]                 = handle_vmx_insn,
> -	[EXIT_REASON_VMOFF]                   = handle_vmx_insn,
> -	[EXIT_REASON_VMON]                    = handle_vmx_insn,
> +	[EXIT_REASON_VMOFF]                   = handle_vmoff,
> +	[EXIT_REASON_VMON]                    = handle_vmon,
>  	[EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,
>  	[EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
>  	[EXIT_REASON_WBINVD]                  = handle_wbinvd,
> @@ -3823,6 +4083,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
>  	if (vmx->vpid != 0)
>  		__clear_bit(vmx->vpid, vmx_vpid_bitmap);
>  	spin_unlock(&vmx_vpid_lock);
> +	free_l1_state(vcpu);
>  	vmx_free_vmcs(vcpu);
>  	kfree(vmx->guest_msrs);
>  	kvm_vcpu_uninit(vcpu);
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index dd15d7a..b698952 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -88,6 +88,10 @@ EXPORT_SYMBOL_GPL(kvm_x86_ops);
>  int ignore_msrs = 0;
>  module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);
>  
> +int nested = 1;
> +EXPORT_SYMBOL_GPL(nested);
> +module_param(nested, int, S_IRUGO);
> +
>  #define KVM_NR_SHARED_MSRS 16
>  
>  struct kvm_shared_msrs_global {
> @@ -505,7 +509,7 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
>  		return;
>  	}
>  
> -	if (cr4 & X86_CR4_VMXE) {
> +	if (cr4 & X86_CR4_VMXE && !nested) {
>  		printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
>  		kvm_inject_gp(vcpu, 0);
>  		return;
> @@ -615,7 +619,10 @@ static u32 msrs_to_save[] = {
>  #ifdef CONFIG_X86_64
>  	MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
>  #endif
> -	MSR_IA32_TSC, MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
> +	MSR_IA32_TSC, MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
> +	MSR_IA32_FEATURE_CONTROL,  MSR_IA32_VMX_BASIC, MSR_IA32_VMX_PINBASED_CTLS,
> +	MSR_IA32_VMX_PROCBASED_CTLS, MSR_IA32_VMX_EXIT_CTLS, MSR_IA32_VMX_ENTRY_CTLS,
> +	MSR_IA32_VMX_PROCBASED_CTLS2, MSR_IA32_VMX_EPT_VPID_CAP, MSR_IA32_FEATURE_CONTROL
>  };
>  
>  static unsigned num_msrs_to_save;
> diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
> index 5eadea5..57204cb 100644
> --- a/arch/x86/kvm/x86.h
> +++ b/arch/x86/kvm/x86.h
> @@ -35,4 +35,6 @@ static inline bool kvm_exception_is_soft(unsigned int nr)
>  struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
>                                               u32 function, u32 index);
>  
> +extern int nested;
> +
>  #endif
> -- 
> 1.6.0.4
> 
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
			Gleb.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 1/7] Nested VMX patch 1 implements vmon and vmoff
  2009-12-20 14:20   ` Gleb Natapov
@ 2009-12-20 14:23     ` Avi Kivity
  2009-12-20 14:25       ` Gleb Natapov
  2009-12-20 17:08     ` Andi Kleen
  1 sibling, 1 reply; 24+ messages in thread
From: Avi Kivity @ 2009-12-20 14:23 UTC (permalink / raw)
  To: Gleb Natapov; +Cc: oritw, kvm, benami, abelg, muli, aliguori, mdday

On 12/20/2009 04:20 PM, Gleb Natapov wrote:
> +
>> +	if (create_l1_state(vcpu)) {
>> +		printk(KERN_ERR "%s create_l1_state failed\n", __func__);
>> +		kvm_queue_exception(vcpu, UD_VECTOR);
>>      
> Should we send a UD exception if there is an internal error? Maybe doing
> VMfailInvalid would be more appropriate? Also this function doesn't handle
> all errors (address alignment, version checking, register operand).
>    


Host errors like -ENOMEM should propagate to the return code of ioctl().
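For example (sketch only; create_l1_state() already returns -ENOMEM on
allocation failure):

	int r;

	r = create_l1_state(vcpu);
	if (r) {
		printk(KERN_ERR "%s create_l1_state failed\n", __func__);
		return r;	/* surfaces as the KVM_RUN ioctl return value */
	}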

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 1/7] Nested VMX patch 1 implements vmon and vmoff
  2009-12-20 14:23     ` Avi Kivity
@ 2009-12-20 14:25       ` Gleb Natapov
  0 siblings, 0 replies; 24+ messages in thread
From: Gleb Natapov @ 2009-12-20 14:25 UTC (permalink / raw)
  To: Avi Kivity; +Cc: oritw, kvm, benami, abelg, muli, aliguori, mdday

On Sun, Dec 20, 2009 at 04:23:38PM +0200, Avi Kivity wrote:
> On 12/20/2009 04:20 PM, Gleb Natapov wrote:
> >+
> >>+	if (create_l1_state(vcpu)) {
> >>+		printk(KERN_ERR "%s create_l1_state failed\n", __func__);
> >>+		kvm_queue_exception(vcpu, UD_VECTOR);
> >Should we send a UD exception if there is an internal error? Maybe doing
> >VMfailInvalid would be more appropriate? Also this function doesn't handle
> >all errors (address alignment, version checking, register operand).
> 
> 
> Host errors like -ENOMEM should propagate to the return code of ioctl().
> 
Yes, even better.

--
			Gleb.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 1/7] Nested VMX patch 1 implements vmon and vmoff
  2009-12-20 14:20   ` Gleb Natapov
  2009-12-20 14:23     ` Avi Kivity
@ 2009-12-20 17:08     ` Andi Kleen
  2009-12-20 19:04       ` Avi Kivity
  1 sibling, 1 reply; 24+ messages in thread
From: Andi Kleen @ 2009-12-20 17:08 UTC (permalink / raw)
  To: Gleb Natapov; +Cc: oritw, avi, kvm, benami, abelg, muli, aliguori, mdday

Gleb Natapov <gleb@redhat.com> writes:
>>  
>> +int nested = 1;
>> +EXPORT_SYMBOL_GPL(nested);

Unless this is a lot better tested and audited wouldn't it make more sense
to default it to off?

I don't think it's a big burden to let users set a special knob for this,
but it would be a big problem if there was some kind of jail break 
hidden in there that could be exploited by malicious guests.

Since VMX was not originally designed to be nested that wouldn't surprise me.

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 1/7] Nested VMX patch 1 implements vmon and vmoff
  2009-12-20 17:08     ` Andi Kleen
@ 2009-12-20 19:04       ` Avi Kivity
  2009-12-21 15:52         ` Muli Ben-Yehuda
  0 siblings, 1 reply; 24+ messages in thread
From: Avi Kivity @ 2009-12-20 19:04 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Gleb Natapov, oritw, kvm, benami, abelg, muli, aliguori, mdday

On 12/20/2009 07:08 PM, Andi Kleen wrote:
> Gleb Natapov<gleb@redhat.com>  writes:
>    
>>>
>>> +int nested = 1;
>>> +EXPORT_SYMBOL_GPL(nested);
>>>        
> Unless this is a lot better tested and audited wouldn't it make more sense
> to default it to off?
>
>    

This is actually a move of an existing svm-only variable, which defaults 
to enabled.  Nested svm has been tested for a while.

> I don't think it's a big burden to let users set a special knob for this,
> but it would be a big problem if there was some kind of jail break
> hidden in there that could be exploited by malicious guests.
>    

True.  It makes sense to have different defaults of vmx and svm.
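e.g. keep a per-module knob in each file, something like:

	/* svm.c - nested SVM has seen much more testing */
	static int nested = 1;
	module_param(nested, int, S_IRUGO);

	/* vmx.c - default off until nested VMX matures */
	static int nested = 0;
	module_param(nested, int, S_IRUGO);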

> Since VMX was not originally designed to be nested that wouldn't surprise me.
>    

vmx was designed to correct the non-virtualizability of x86.  It would 
have been criminal to design it without nesting in mind, especially 
given all the prior art.

vmx does support nesting, albeit not very efficiently.

-- 
Do not meddle in the internals of kernels, for they are subtle and quick to panic.


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 1/7] Nested VMX patch 1 implements vmon and vmoff
  2009-12-20 19:04       ` Avi Kivity
@ 2009-12-21 15:52         ` Muli Ben-Yehuda
  2009-12-21 16:00           ` Avi Kivity
  0 siblings, 1 reply; 24+ messages in thread
From: Muli Ben-Yehuda @ 2009-12-21 15:52 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Andi Kleen, Gleb Natapov, Orit Wasserman, kvm, Ben-Ami Yassour1,
	Abel Gordon, aliguori, mdday

On Sun, Dec 20, 2009 at 09:04:49PM +0200, Avi Kivity wrote:

> >Since VMX was not originally designed to be nested that wouldn't
> >surprise me.
> 
> vmx was designed to correct the non-virtualizability of x86.  It
> would have been criminal to design it without nesting in mind,
> especially given all the prior art.
> 
> vmx does support nesting, albeit not very efficiently.

I would say that VMX only supports nesting if you define "supports" as
"does not make it impossible". The fact that VMX operations in
executed in non-root mode are trapped is welcome, but there's so much
more that could be done in hardware to make nesting "better supported"
that I would hesitate to say that the current generation of VMX
supports nesting.

Cheers,
Muli
-- 
Muli Ben-Yehuda | muli@il.ibm.com | +972-4-8281080
Manager, Virtualization and Systems Architecture
Master Inventor, IBM Research -- Haifa
Second Workshop on I/O Virtualization (WIOV '10):
http://sysrun.haifa.il.ibm.com/hrl/wiov2010/

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 1/7] Nested VMX patch 1 implements vmon and vmoff
  2009-12-21 15:52         ` Muli Ben-Yehuda
@ 2009-12-21 16:00           ` Avi Kivity
  0 siblings, 0 replies; 24+ messages in thread
From: Avi Kivity @ 2009-12-21 16:00 UTC (permalink / raw)
  To: Muli Ben-Yehuda
  Cc: Andi Kleen, Gleb Natapov, Orit Wasserman, kvm, Ben-Ami Yassour1,
	Abel Gordon, aliguori, mdday

On 12/21/2009 05:52 PM, Muli Ben-Yehuda wrote:
>
> I would say that VMX only supports nesting if you define "supports" as
> "does not make it impossible". The fact that VMX operations in
> executed in non-root mode are trapped is welcome, but there's so much
> more that could be done in hardware to make nesting "better supported"
> that I would hesitate to say that the current generation of VMX
> supports nesting.
>    

I would phrase it as "there is so much less the hardware can do to make 
nesting better supported" (such as avoid using vmread/vmwrite).  As 
nested svm shows, you don't really need much (nested paging is very 
important though).

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 2/7] Nested VMX patch 2 implements vmclear
  2009-12-10 18:38   ` [PATCH 2/7] Nested VMX patch 2 implements vmclear oritw
  2009-12-10 18:38     ` [PATCH 3/7] Nested VMX patch 3 implements vmptrld and vmptrst oritw
  2009-12-16 13:59     ` [PATCH 2/7] Nested VMX patch 2 implements vmclear Avi Kivity
@ 2009-12-28 14:57     ` Gleb Natapov
  2 siblings, 0 replies; 24+ messages in thread
From: Gleb Natapov @ 2009-12-28 14:57 UTC (permalink / raw)
  To: oritw; +Cc: avi, kvm, benami, abelg, muli, aliguori, mdday

On Thu, Dec 10, 2009 at 08:38:24PM +0200, oritw@il.ibm.com wrote:
> From: Orit Wasserman <oritw@il.ibm.com>
> 
> ---
>  arch/x86/kvm/vmx.c |  235 +++++++++++++++++++++++++++++++++++++++++++++++++++-
>  arch/x86/kvm/x86.c |    5 +-
>  arch/x86/kvm/x86.h |    3 +
>  3 files changed, 240 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 2726a6c..a7ffd5e 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -93,13 +93,39 @@ struct shared_msr_entry {
>  };
>  
>  struct __attribute__ ((__packed__)) level_state {
> +	/* Has the level1 guest done vmclear? */
> +	bool vmclear;
> +};
> +
> +/*
> + * This structure is mapped to guest memory.
> + * It is packed in order to preserve the binary content
> + * after live migration.
> + * If there are changes in the content or layout, the revision_id must be updated.
> + */
> +struct __attribute__ ((__packed__)) nested_vmcs_page {
> +	u32 revision_id;
> +	u32 abort;
> +	struct level_state l2_state;
> +};
> +
> +struct nested_vmcs_list {
> +	struct list_head list;
> +	gpa_t vmcs_addr;
> +	struct vmcs *l2_vmcs;
>  };
>  
>  struct nested_vmx {
>  	/* Has the level1 guest done vmxon? */
>  	bool vmxon;
> +	/* What is the location of the current vmcs l1 keeps for l2 */
> +	gpa_t current_vmptr;
>  	/* Level 1 state for switching to level 2 and back */
>  	struct level_state *l1_state;
> +	/* list of vmcs for each l2 guest created by l1 */
> +	struct list_head l2_vmcs_list;
> +	/* l2 page corresponding to the current vmcs set by l1 */
> +	struct nested_vmcs_page *current_l2_page;
>  };
>  
>  struct vcpu_vmx {
> @@ -156,6 +182,76 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
>  	return container_of(vcpu, struct vcpu_vmx, vcpu);
>  }
>  
> +static struct page *nested_get_page(struct kvm_vcpu *vcpu,
> +				    u64 vmcs_addr)
> +{
> +	struct page *vmcs_page = NULL;
> +
> +	down_read(&current->mm->mmap_sem);
> +	vmcs_page = gfn_to_page(vcpu->kvm, vmcs_addr >> PAGE_SHIFT);
> +	up_read(&current->mm->mmap_sem);
> +
> +	if (is_error_page(vmcs_page)) {
> +		printk(KERN_ERR "%s error allocating page 0x%llx\n",
> +		       __func__, vmcs_addr);
> +		kvm_release_page_clean(vmcs_page);
> +		return NULL;
> +	}
> +
> +	return vmcs_page;
> +
> +}
> +
> +static int nested_map_current(struct kvm_vcpu *vcpu)
> +{
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +	struct page *vmcs_page =
> +		nested_get_page(vcpu, vmx->nested.current_vmptr);
> +	struct nested_vmcs_page *mapped_page;
> +
> +	if (vmcs_page == NULL) {
> +		printk(KERN_INFO "%s: failure in nested_get_page\n", __func__);
> +		return 0;
> +	}
> +
> +	if (vmx->nested.current_l2_page) {
> +		printk(KERN_INFO "%s: shadow vmcs already mapped\n", __func__);
> +		WARN_ON(1);
> +		return 0;
> +	}
> +
> +	mapped_page = kmap_atomic(vmcs_page, KM_USER0);
> +
> +	if (!mapped_page) {
> +		printk(KERN_INFO "%s: error in kmap_atomic\n", __func__);
> +		return 0;
> +	}
> +
> +	vmx->nested.current_l2_page = mapped_page;
> +
> +	return 1;
> +}
> +
> +static void nested_unmap_current(struct kvm_vcpu *vcpu)
> +{
> +	struct page *page;
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +
> +	if (!vmx->nested.current_l2_page) {
> +		printk(KERN_INFO "Shadow vmcs already unmapped\n");
> +		WARN_ON(1);
> +		return;
> +	}
> +
> +	page = kmap_atomic_to_page(vmx->nested.current_l2_page);
> +
> +	kunmap_atomic(vmx->nested.current_l2_page, KM_USER0);
> +
> +	kvm_release_page_dirty(page);
> +
> +	vmx->nested.current_l2_page = NULL;
> +}
> +
>  static int init_rmode(struct kvm *kvm);
>  static u64 construct_eptp(unsigned long root_hpa);
>  
> @@ -1144,6 +1240,35 @@ static int nested_vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
>  	return 0;
>  }
>  
> +static int read_guest_vmcs_gpa(struct kvm_vcpu *vcpu, gva_t gva, u64 *gentry)
> +{
> +	int r = 0;
> +	uint size;
> +
> +	*gentry = 0;
> +
> +	if (is_long_mode(vcpu))
> +		size = sizeof(u64);
> +	else
> +		size = sizeof(u32);
> +
> +	r = kvm_read_guest_virt(gva, gentry,
> +				size, vcpu);
> +	if (r) {
> +		printk(KERN_ERR "%s cannot read guest vmcs addr %lx : %d\n",
> +		       __func__, vcpu->arch.regs[VCPU_REGS_RAX], r);
> +		return r;
> +	}
> +
> +	if (!IS_ALIGNED(*gentry, PAGE_SIZE)) {
> +		printk(KERN_DEBUG "%s addr %llx not aligned\n",
> +		       __func__, *gentry);
> +		return 1;
> +	}
> +
> +	return 0;
> +}
> +
>  /*
>   * Writes msr value into into the appropriate "register".
>   * Returns 0 on success, non-0 otherwise.
> @@ -1316,6 +1441,7 @@ static int create_l1_state(struct kvm_vcpu *vcpu)
>  	} else
>  		return 0;
>  
> +	INIT_LIST_HEAD(&(vmx->nested.l2_vmcs_list));
>  	return 0;
>  }
>  
> @@ -1488,15 +1614,35 @@ static void free_vmcs(struct vmcs *vmcs)
>  	free_pages((unsigned long)vmcs, vmcs_config.order);
>  }
>  
> +static void nested_free_current_vmcs(struct kvm_vcpu *vcpu)
> +{
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +	struct nested_vmcs_list *list_item, *n;
> +
> +	list_for_each_entry_safe(list_item, n, &vmx->nested.l2_vmcs_list, list)
> +		if (list_item->vmcs_addr == vmx->nested.current_vmptr) {
> +			free_vmcs(list_item->l2_vmcs);
> +			list_del(&(list_item->list));
> +			return;
> +		}
> +}
> +
>  static void free_l1_state(struct kvm_vcpu *vcpu)
>  {
>  	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +	struct nested_vmcs_list *list_item, *n;
>  
>  	if (!vmx->nested.l1_state)
>  		return;
>  
>  	kfree(vmx->nested.l1_state);
>  	vmx->nested.l1_state = NULL;
> +
> +	list_for_each_entry_safe(list_item, n, &vmx->nested.l2_vmcs_list,
> +				 list) {
> +		free_vmcs(list_item->l2_vmcs);
> +		list_del(&(list_item->list));
> +	}
>  }
>  
>  
> @@ -3352,6 +3498,93 @@ static int handle_vmx_insn(struct kvm_vcpu *vcpu)
>  	return 1;
>  }
>  
> +static void clear_rflags_cf_zf(struct kvm_vcpu *vcpu)
> +{
> +	unsigned long rflags;
> +	rflags = vmx_get_rflags(vcpu);
> +	rflags &= ~(X86_EFLAGS_CF | X86_EFLAGS_ZF);
> +	vmx_set_rflags(vcpu, rflags);
> +}
> +
> +/*
> + * Decode the memory address (operand) of a vmx instruction according to Table 23-12/23-11
> + * For additional information regarding offset calculation see 3.7.5
> + */
> +static gva_t get_vmx_mem_address(struct kvm_vcpu *vcpu,
> +				 unsigned long exit_qualification,
> +				 u32 vmx_instruction_info)
> +{
> +	int  scaling        = vmx_instruction_info & 3;             /* bits 0:1 scaling */
> +	int  addr_size      = (vmx_instruction_info >> 7) & 7;      /* bits 7:9 address size, 0=16bit, 1=32bit, 2=64bit */
> +	bool is_reg         = vmx_instruction_info & (1u << 10);    /* bit  10  1=register operand, 0= memory */
> +	int  seg_reg        = (vmx_instruction_info >> 15) & 7;     /* bits 15:17 segment register */
> +	int  index_reg      = (vmx_instruction_info >> 18) & 0xf;   /* bits 18:21 index register */
> +	bool index_is_valid = !(vmx_instruction_info & (1u << 22)); /* bit  22 index register validity, 0=valid, 1=invalid */
> +	int  base_reg       = (vmx_instruction_info >> 23) & 0xf;   /* bits 23:26 base register */
> +	bool base_is_valid  = !(vmx_instruction_info & (1u << 27)); /* bit  27 base register validity, 0=valid, 1=invalid */
> +	gva_t addr;
> +
> +	if (is_reg)
> +		return 0;
> +
> +	switch (addr_size) {
> +	case 1:
> +		exit_qualification &= 0xffffffff; /* 32 high bits are undefined according to the spec, page 23-7 */
> +		break;
> +	case 2:
> +		break;
> +	default:
> +		return 0;
> +	}
> +
> +	/* Addr = segment_base + offset */
> +	/* offset = Base + [Index * Scale] + Displacement, see Figure 3-11 */
> +	addr = vmx_get_segment_base(vcpu, seg_reg);
> +	if (base_is_valid)
> +		addr += kvm_register_read(vcpu, base_reg);
> +	if (index_is_valid)
> +		addr += kvm_register_read(vcpu, index_reg)*scaling;
> +	addr += exit_qualification; /* exit qualification holds the displacement, spec page 23-7 */
> +
> +	return addr;
> +}
> +
> +static int handle_vmclear(struct kvm_vcpu *vcpu)
> +{
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +	struct level_state *l2_state;
> +	gpa_t guest_vmcs_addr;
> +	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
> +	u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
> +	gva_t vmcs_gva;
> +
> +	if (!nested_vmx_check_permission(vcpu))
> +		return 1;
> +
> +	vmcs_gva = get_vmx_mem_address(vcpu, exit_qualification,
> +				       vmx_instruction_info);
> +
> +	if (read_guest_vmcs_gpa(vcpu, vmcs_gva, &guest_vmcs_addr))
> +		return 1;
> +
Should check that vmcs address is 4K aligned and given address is not equal
to vmxon pointer.

> +	vmx->nested.current_vmptr = guest_vmcs_addr;
vmclear doesn't change the current vmcs pointer.

> +	if (!nested_map_current(vcpu))
> +		return 1;
> +
> +	l2_state = &(to_vmx(vcpu)->nested.current_l2_page->l2_state);
> +	l2_state->vmclear = 1;
> +	nested_free_current_vmcs(vcpu);
> +
> +	vmx->nested.current_vmptr = -1ull;
> +
vmclear resets the current vmcs pointer to -1 only if it was called
with the current vmcs pointer as an argument.
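i.e. something like:

	if (guest_vmcs_addr == vmx->nested.current_vmptr)
		vmx->nested.current_vmptr = -1ull;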

> +	nested_unmap_current(vcpu);
> +
> +	skip_emulated_instruction(vcpu);
> +	clear_rflags_cf_zf(vcpu);
> +
> +	return 1;
> +}
> +
>  static int handle_vmoff(struct kvm_vcpu *vcpu)
>  {
>  	struct vcpu_vmx *vmx = to_vmx(vcpu);
> @@ -3695,7 +3928,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
>  	[EXIT_REASON_HLT]                     = handle_halt,
>  	[EXIT_REASON_INVLPG]		      = handle_invlpg,
>  	[EXIT_REASON_VMCALL]                  = handle_vmcall,
> -	[EXIT_REASON_VMCLEAR]	              = handle_vmx_insn,
> +	[EXIT_REASON_VMCLEAR]	              = handle_vmclear,
>  	[EXIT_REASON_VMLAUNCH]                = handle_vmx_insn,
>  	[EXIT_REASON_VMPTRLD]                 = handle_vmx_insn,
>  	[EXIT_REASON_VMPTRST]                 = handle_vmx_insn,
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index b698952..e5acf22 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -2773,8 +2773,8 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
>  	return kvm_io_bus_read(&vcpu->kvm->mmio_bus, addr, len, v);
>  }
>  
> -static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
> -			       struct kvm_vcpu *vcpu)
> +int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
> +			struct kvm_vcpu *vcpu)
>  {
>  	void *data = val;
>  	int r = X86EMUL_CONTINUE;
> @@ -2802,6 +2802,7 @@ static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
>  out:
>  	return r;
>  }
> +EXPORT_SYMBOL_GPL(kvm_read_guest_virt);
>  
>  static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes,
>  				struct kvm_vcpu *vcpu)
> diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
> index 57204cb..2d7b2dc 100644
> --- a/arch/x86/kvm/x86.h
> +++ b/arch/x86/kvm/x86.h
> @@ -35,6 +35,9 @@ static inline bool kvm_exception_is_soft(unsigned int nr)
>  struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
>                                               u32 function, u32 index);
>  
> +int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
> +			struct kvm_vcpu *vcpu);
> +
>  extern int nested;
>  
>  #endif
> -- 
> 1.6.0.4
> 
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
			Gleb.

^ permalink raw reply	[flat|nested] 24+ messages in thread

end of thread

Thread overview: 24+ messages
2009-12-10 18:38 Nested VMX support v4 oritw
2009-12-10 18:38 ` [PATCH 1/7] Nested VMX patch 1 implements vmon and vmoff oritw
2009-12-10 18:38   ` [PATCH 2/7] Nested VMX patch 2 implements vmclear oritw
2009-12-10 18:38     ` [PATCH 3/7] Nested VMX patch 3 implements vmptrld and vmptrst oritw
2009-12-10 18:38       ` [PATCH 4/7] Nested VMX patch 4 implements vmread and vmwrite oritw
2009-12-10 18:38         ` [PATCH 5/7] Nested VMX patch 5 Simplify fpu handling oritw
2009-12-10 18:38           ` [PATCH 6/7] Nested VMX patch 6 implements vmlaunch and vmresume oritw
2009-12-10 18:38             ` [PATCH 7/7] Nested VMX patch 7 handling of nested guest exits oritw
2009-12-17 13:46               ` Avi Kivity
2009-12-17 10:10             ` [PATCH 6/7] Nested VMX patch 6 implements vmlaunch and vmresume Avi Kivity
2009-12-17  9:10           ` [PATCH 5/7] Nested VMX patch 5 Simplify fpu handling Avi Kivity
2009-12-16 14:44         ` [PATCH 4/7] Nested VMX patch 4 implements vmread and vmwrite Avi Kivity
2009-12-16 14:32       ` [PATCH 3/7] Nested VMX patch 3 implements vmptrld and vmptrst Avi Kivity
2009-12-16 13:59     ` [PATCH 2/7] Nested VMX patch 2 implements vmclear Avi Kivity
2009-12-28 14:57     ` Gleb Natapov
2009-12-16 13:34   ` [PATCH 1/7] Nested VMX patch 1 implements vmon and vmoff Avi Kivity
2009-12-20 14:20   ` Gleb Natapov
2009-12-20 14:23     ` Avi Kivity
2009-12-20 14:25       ` Gleb Natapov
2009-12-20 17:08     ` Andi Kleen
2009-12-20 19:04       ` Avi Kivity
2009-12-21 15:52         ` Muli Ben-Yehuda
2009-12-21 16:00           ` Avi Kivity
2009-12-17 13:49 ` Nested VMX support v4 Avi Kivity
